arm64: implement {uqshl, sqshl, sqshlu} (scalar, imm) and fix two
bugs in the implementation of the (vector, imm) variants.


git-svn-id: svn://svn.valgrind.org/vex/trunk@2923 8f6e269a-dfd6-0310-a8e1-e2731360e62c
diff --git a/priv/guest_arm64_toIR.c b/priv/guest_arm64_toIR.c
index 28176a0..997565c 100644
--- a/priv/guest_arm64_toIR.c
+++ b/priv/guest_arm64_toIR.c
@@ -6659,16 +6659,20 @@
    if (vex_streq(nm, "sqshlu")) {
       IROp qop = mkVecQSHLNSATS2U(size);
       assign(*res, binop(qop, mkexpr(src), mkU8(shift)));
-      /* This is different from the other two cases, in that
-         saturation can occur even if there is no shift. */
-      /* Saturation has occurred if any of the shifted-out bits, or
-         the top bit of the original value, are nonzero. */
-      UInt rshift = laneBits - 1 - shift;
-      vassert(rshift >= 0 && rshift < laneBits);
-      /* qDiff1 is the shifted out bits, and the top bit of the original
-         value, preceded by zeroes. */
-      assign(*qDiff1, binop(mkVecSHRN(size), mkexpr(src), mkU8(rshift)));
-      assign(*qDiff2, mkexpr(z128));
+      if (shift == 0) {
+         /* If there's no shift, saturation depends on the top bit
+            of the source. */
+         assign(*qDiff1, binop(mkVecSHRN(size), mkexpr(src), mkU8(laneBits-1)));
+         assign(*qDiff2, mkexpr(z128));
+      } else {
+         /* Saturation has occurred if any of the shifted-out bits are
+            nonzero.  We get the shifted-out bits by right-shifting the
+            original value. */
+         UInt rshift = laneBits - shift;
+         vassert(rshift >= 1 && rshift < laneBits);
+         assign(*qDiff1, binop(mkVecSHRN(size), mkexpr(src), mkU8(rshift)));
+         assign(*qDiff2, mkexpr(z128));
+      }
       return;
    }
 
@@ -7544,18 +7548,6 @@
       return True;
    }
 
-   if (bitU == 0 && (immh & 8) == 8 && opcode == BITS5(0,1,0,1,0)) {
-      /* -------- 0,1xxx,01010 SHL d_d_#imm -------- */
-      UInt sh = immhb - 64;
-      vassert(sh >= 0 && sh < 64);
-      putQReg128(dd,
-                 unop(Iop_ZeroHI64ofV128,
-                      sh == 0 ? getQReg128(nn)
-                              : binop(Iop_ShlN64x2, getQReg128(nn), mkU8(sh))));
-      DIP("shl d%u, d%u, #%u\n", dd, nn, sh);
-      return True;
-   }
-
    if (bitU == 1 && (immh & 8) == 8 && opcode == BITS5(0,1,0,0,0)) {
       /* -------- 1,1xxx,01000 SRI d_d_#imm -------- */
       UInt sh = 128 - immhb;
@@ -7576,6 +7568,18 @@
       return True;
    }
 
+   if (bitU == 0 && (immh & 8) == 8 && opcode == BITS5(0,1,0,1,0)) {
+      /* -------- 0,1xxx,01010 SHL d_d_#imm -------- */
+      UInt sh = immhb - 64;
+      vassert(sh >= 0 && sh < 64);
+      putQReg128(dd,
+                 unop(Iop_ZeroHI64ofV128,
+                      sh == 0 ? getQReg128(nn)
+                              : binop(Iop_ShlN64x2, getQReg128(nn), mkU8(sh))));
+      DIP("shl d%u, d%u, #%u\n", dd, nn, sh);
+      return True;
+   }
+
    if (bitU == 1 && (immh & 8) == 8 && opcode == BITS5(0,1,0,1,0)) {
       /* -------- 1,1xxx,01010 SLI d_d_#imm -------- */
       UInt sh = immhb - 64;
@@ -7596,6 +7600,41 @@
       return True;
    }
 
+   if (opcode == BITS5(0,1,1,1,0)
+       || (bitU == 1 && opcode == BITS5(0,1,1,0,0))) {
+      /* -------- 0,01110  SQSHL  #imm -------- */
+      /* -------- 1,01110  UQSHL  #imm -------- */
+      /* -------- 1,01100  SQSHLU #imm -------- */
+      UInt size  = 0;
+      UInt shift = 0;
+      Bool ok    = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
+      if (!ok) return False;
+      vassert(size >= 0 && size <= 3);
+      /* The shift encoding has opposite sign for the leftwards case.
+         Adjust shift to compensate. */
+      UInt lanebits = 8 << size;
+      shift = lanebits - shift;
+      vassert(shift >= 0 && shift < lanebits);
+      const HChar* nm = NULL;
+      /**/ if (bitU == 0 && opcode == BITS5(0,1,1,1,0)) nm = "sqshl";
+      else if (bitU == 1 && opcode == BITS5(0,1,1,1,0)) nm = "uqshl";
+      else if (bitU == 1 && opcode == BITS5(0,1,1,0,0)) nm = "sqshlu";
+      else vassert(0);
+      IRTemp qDiff1 = IRTemp_INVALID;
+      IRTemp qDiff2 = IRTemp_INVALID;
+      IRTemp res = IRTemp_INVALID;
+      IRTemp src = math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, getQReg128(nn));
+      /* This relies on the fact that the zeroed out lanes generate zeroed
+         result lanes and don't saturate, so there's no point in trimming 
+         the resulting res, qDiff1 or qDiff2 values. */
+      math_QSHL_IMM(&res, &qDiff1, &qDiff2, src, size, shift, nm);
+      putQReg128(dd, mkexpr(res));
+      updateQCFLAGwithDifference(qDiff1, qDiff2);
+      const HChar arr = "bhsd"[size];
+      DIP("%s %c%u, %c%u, #%u\n", nm, arr, dd, arr, nn, shift);
+      return True;
+   }
+
    if (opcode == BITS5(1,0,0,1,0) || opcode == BITS5(1,0,0,1,1)
        || (bitU == 1
            && (opcode == BITS5(1,0,0,0,0) || opcode == BITS5(1,0,0,0,1)))) {
@@ -8366,7 +8405,7 @@
       math_QSHL_IMM(&res, &qDiff1, &qDiff2, src, size, shift, nm);
       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
       updateQCFLAGwithDifferenceZHI(qDiff1, qDiff2,
-                                    isQ ? Iop_ZeroHI64ofV128 : Iop_INVALID);
+                                    isQ ? Iop_INVALID : Iop_ZeroHI64ofV128);
       const HChar* arr = nameArr_Q_SZ(bitQ, size);
       DIP("%s %s.%s, %s.%s, #%u\n", nm,
           nameQReg128(dd), arr, nameQReg128(nn), arr, shift);