arm64: route all whole-vector shift/rotate/slice operations
through Iop_SliceV128, so as to give it some testing.  Implement
Iop_SliceV128 in the back end.


git-svn-id: svn://svn.valgrind.org/vex/trunk@2940 8f6e269a-dfd6-0310-a8e1-e2731360e62c
diff --git a/priv/guest_arm64_toIR.c b/priv/guest_arm64_toIR.c
index d0db663..735373e 100644
--- a/priv/guest_arm64_toIR.c
+++ b/priv/guest_arm64_toIR.c
@@ -6844,11 +6844,9 @@
          if (imm4 == 0) {
             assign(res, mkexpr(sLo));
          } else {
-            vassert(imm4 <= 15);
-            assign(res,
-                   binop(Iop_OrV128,
-                         binop(Iop_ShlV128, mkexpr(sHi), mkU8(8 * (16-imm4))),
-                         binop(Iop_ShrV128, mkexpr(sLo), mkU8(8 * imm4))));
+            vassert(imm4 >= 1 && imm4 <= 15);
+            assign(res, triop(Iop_SliceV128,
+                              mkexpr(sHi), mkexpr(sLo), mkU8(imm4)));
          }
          putQReg128(dd, mkexpr(res));
          DIP("ext v%u.16b, v%u.16b, v%u.16b, #%u\n", dd, nn, mm, imm4);
@@ -6857,10 +6855,12 @@
          if (imm4 == 0) {
             assign(res, mkexpr(sLo));
          } else {
-           assign(res,
-                  binop(Iop_ShrV128,
-                        binop(Iop_InterleaveLO64x2, mkexpr(sHi), mkexpr(sLo)),
-                        mkU8(8 * imm4)));
+            vassert(imm4 >= 1 && imm4 <= 7);
+            IRTemp hi64lo64 = newTempV128();
+            assign(hi64lo64, binop(Iop_InterleaveLO64x2,
+                                   mkexpr(sHi), mkexpr(sLo)));
+            assign(res, triop(Iop_SliceV128,
+                              mkexpr(hi64lo64), mkexpr(hi64lo64), mkU8(imm4)));
          }
          putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
          DIP("ext v%u.8b, v%u.8b, v%u.8b, #%u\n", dd, nn, mm, imm4);
@@ -7015,8 +7015,15 @@
       IRTemp preR = newTempV128();
       IRTemp res  = newTempV128();
       if (bitQ == 0 && !isZIP1) {
-         assign(preL, binop(Iop_ShlV128, getQReg128(mm), mkU8(32)));
-         assign(preR, binop(Iop_ShlV128, getQReg128(nn), mkU8(32)));
+         IRTemp z128 = newTempV128();
+         assign(z128, mkV128(0x0000));
+         // preL = Vm shifted left 32 bits
+         // preR = Vn shifted left 32 bits
+         assign(preL, triop(Iop_SliceV128,
+                            getQReg128(mm), mkexpr(z128), mkU8(12)));
+         assign(preR, triop(Iop_SliceV128,
+                            getQReg128(nn), mkexpr(z128), mkU8(12)));
+
       } else {
          assign(preL, getQReg128(mm));
          assign(preR, getQReg128(nn));
diff --git a/priv/host_arm64_isel.c b/priv/host_arm64_isel.c
index 884d2c7..233c275 100644
--- a/priv/host_arm64_isel.c
+++ b/priv/host_arm64_isel.c
@@ -2753,47 +2753,51 @@
             break;
          }
 
-         case Iop_ShlV128:
-         case Iop_ShrV128: {
-            Bool isSHR = e->Iex.Binop.op == Iop_ShrV128;
-            /* This is tricky.  Generate an EXT instruction with zeroes in
-               the high operand (shift right) or low operand (shift left).
-               Note that we can only slice in the EXT instruction at a byte
-               level of granularity, so the shift amount needs careful
-               checking. */
-            IRExpr* argL = e->Iex.Binop.arg1;
-            IRExpr* argR = e->Iex.Binop.arg2;
-            if (argR->tag == Iex_Const && argR->Iex.Const.con->tag == Ico_U8) {
-               UInt amt   = argR->Iex.Const.con->Ico.U8;
-               Bool amtOK = False;
-               switch (amt) {
-                  case 0x08: case 0x10: case 0x18: case 0x20: case 0x28:
-                  case 0x30: case 0x38: case 0x40: case 0x48: case 0x50:
-                  case 0x58: case 0x60: case 0x68: case 0x70: case 0x78:
-                     amtOK = True; break;
-               }
-               /* We could also deal with amt==0 by copying the source to
-                  the destination, but there's no need for that so far. */
-               if (amtOK) {
-                  HReg src  = iselV128Expr(env, argL);
-                  HReg srcZ = newVRegV(env);
-                  addInstr(env, ARM64Instr_VImmQ(srcZ, 0x0000));
-                  UInt immB = amt / 8;
-                  vassert(immB >= 1 && immB <= 15);
-                  HReg dst = newVRegV(env);
-                  if (isSHR) {
-                     addInstr(env, ARM64Instr_VExtV(dst, src/*lo*/, srcZ/*hi*/,
-                                                         immB));
-                  } else {
-                     addInstr(env, ARM64Instr_VExtV(dst, srcZ/*lo*/, src/*hi*/,
-                                                         16 - immB));
-                  }
-                  return dst;
-               }
-            }
-            /* else fall out; this is unhandled */
-            break;
-         }
+         // JRS 01 Sept 2014: these are tested and believed to be correct,
+         // but they are no longer used by the front end, hence commented
+         // out.  They are replaced by Iop_SliceV128, which is more general
+         // and in many cases leads to better code overall.
+         //case Iop_ShlV128:
+         //case Iop_ShrV128: {
+         //   Bool isSHR = e->Iex.Binop.op == Iop_ShrV128;
+         //   /* This is tricky.  Generate an EXT instruction with zeroes in
+         //      the high operand (shift right) or low operand (shift left).
+         //      Note that we can only slice in the EXT instruction at a byte
+         //      level of granularity, so the shift amount needs careful
+         //      checking. */
+         //   IRExpr* argL = e->Iex.Binop.arg1;
+         //   IRExpr* argR = e->Iex.Binop.arg2;
+         //   if (argR->tag == Iex_Const && argR->Iex.Const.con->tag == Ico_U8) {
+         //      UInt amt   = argR->Iex.Const.con->Ico.U8;
+         //      Bool amtOK = False;
+         //      switch (amt) {
+         //         case 0x08: case 0x10: case 0x18: case 0x20: case 0x28:
+         //         case 0x30: case 0x38: case 0x40: case 0x48: case 0x50:
+         //         case 0x58: case 0x60: case 0x68: case 0x70: case 0x78:
+         //            amtOK = True; break;
+         //      }
+         //      /* We could also deal with amt==0 by copying the source to
+         //         the destination, but there's no need for that so far. */
+         //      if (amtOK) {
+         //         HReg src  = iselV128Expr(env, argL);
+         //         HReg srcZ = newVRegV(env);
+         //         addInstr(env, ARM64Instr_VImmQ(srcZ, 0x0000));
+         //         UInt immB = amt / 8;
+         //         vassert(immB >= 1 && immB <= 15);
+         //         HReg dst = newVRegV(env);
+         //         if (isSHR) {
+         //            addInstr(env, ARM64Instr_VExtV(dst, src/*lo*/, srcZ/*hi*/,
+         //                                                immB));
+         //         } else {
+         //            addInstr(env, ARM64Instr_VExtV(dst, srcZ/*lo*/, src/*hi*/,
+         //                                                16 - immB));
+         //         }
+         //         return dst;
+         //      }
+         //   }
+         //   /* else fall out; this is unhandled */
+         //   break;
+         //}
 
          case Iop_PolynomialMull8x8:
          case Iop_Mull32Ux2:
@@ -2857,7 +2861,30 @@
          addInstr(env, ARM64Instr_VBinV(vecbop, dst, argL, argR));
          return dst;
       }
-   }
+
+      if (triop->op == Iop_SliceV128) {
+         /* Note that, compared to ShlV128/ShrV128 just above, the shift
+            amount here is in bytes, not bits. */
+         IRExpr* argHi  = triop->arg1;
+         IRExpr* argLo  = triop->arg2;
+         IRExpr* argAmt = triop->arg3;
+         if (argAmt->tag == Iex_Const && argAmt->Iex.Const.con->tag == Ico_U8) {
+            UInt amt   = argAmt->Iex.Const.con->Ico.U8;
+            Bool amtOK = amt >= 1 && amt <= 15;
+            /* We could also deal with amt==0 by copying argLO to
+               the destination, but there's no need for that so far. */
+            if (amtOK) {
+               HReg srcHi = iselV128Expr(env, argHi);
+               HReg srcLo = iselV128Expr(env, argLo);
+               HReg dst = newVRegV(env);
+               addInstr(env, ARM64Instr_VExtV(dst, srcLo, srcHi, amt));
+               return dst;
+            }
+         }
+         /* else fall out; this is unhandled */
+      }
+
+   } /* if (e->tag == Iex_Triop) */
 
   v128_expr_bad:
    ppIRExpr(e);