arm64: implement remaining SQDMULH and SQRDMULH cases.


git-svn-id: svn://svn.valgrind.org/vex/trunk@2911 8f6e269a-dfd6-0310-a8e1-e2731360e62c
diff --git a/priv/guest_arm64_toIR.c b/priv/guest_arm64_toIR.c
index 4132780..44decc5 100644
--- a/priv/guest_arm64_toIR.c
+++ b/priv/guest_arm64_toIR.c
@@ -6292,7 +6292,7 @@
 }
 
 
-static IRTemp math_ZERO_ALL_EXCEPT_LOWEST_LANE ( IRExpr* srcE, UInt size )
+static IRTemp math_ZERO_ALL_EXCEPT_LOWEST_LANE ( UInt size, IRExpr* srcE )
 {
    vassert(size < 4);
    IRTemp t = newTempV128();
@@ -6412,6 +6412,9 @@
 }
 
 
+/* Generate IR for SQDMULH and SQRDMULH: do a signed widening multiply,
+   double the product, optionally add a rounding constant (R variants),
+   and take the high half. */
 static
 void math_SQDMULH ( /*OUT*/IRTemp* res,
                     /*OUT*/IRTemp* sat1q, /*OUT*/IRTemp* sat1n,
@@ -6469,7 +6472,8 @@
    if (opZHI == Iop_INVALID) {
       assign(diff, binop(Iop_XorV128, mkexpr(qres), mkexpr(nres)));
    } else {
-      vassert(opZHI == Iop_ZeroHI64ofV128 || opZHI == Iop_ZeroHI96ofV128);
+      vassert(opZHI == Iop_ZeroHI64ofV128
+              || opZHI == Iop_ZeroHI96ofV128 || opZHI == Iop_ZeroHI112ofV128);
       assign(diff, unop(opZHI, binop(Iop_XorV128, mkexpr(qres), mkexpr(nres))));
    }
    assign(oldQCFLAG, IRExpr_Get(OFFB_QCFLAG, Ity_V128));
@@ -7491,9 +7495,9 @@
       assign(argL, getQReg128(nn));
       assign(argR, getQReg128(mm));
       assign(qres, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(
-                             binop(qop, mkexpr(argL), mkexpr(argR)), size)));
+                             size, binop(qop, mkexpr(argL), mkexpr(argR)))));
       assign(nres, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(
-                             binop(nop, mkexpr(argL), mkexpr(argR)), size)));
+                             size, binop(nop, mkexpr(argL), mkexpr(argR)))));
       putQReg128(dd, mkexpr(qres));
       updateQCFLAGwithDifference(qres, nres);
       const HChar* nm  = isADD ? (isU ? "uqadd" : "sqadd") 
@@ -7573,6 +7577,28 @@
       return True;
    }
 
+   if (opcode == BITS5(1,0,1,1,0)) {
+      /* -------- 0,xx,10110 SQDMULH s and h variants only -------- */
+      /* -------- 1,xx,10110 SQRDMULH s and h variants only -------- */
+      if (size == X00 || size == X11) return False; // reject b and d sizes
+      Bool isR = bitU == 1; // bitU == 1 selects the rounding form (SQRDMULH)
+      IRTemp res, sat1q, sat1n, vN, vM;
+      res = sat1q = sat1n = vN = vM = IRTemp_INVALID;
+      newTempsV128_2(&vN, &vM);
+      assign(vN, getQReg128(nn));
+      assign(vM, getQReg128(mm));
+      math_SQDMULH(&res, &sat1q, &sat1n, isR, size, vN, vM); // sets res/sat1*
+      putQReg128(dd,
+                 mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(res))));
+      updateQCFLAGwithDifference( // both sat1 values masked the same way as res
+         math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(sat1q)),
+         math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(sat1n)));
+      const HChar  arr = "bhsd"[size]; // only 'h' or 's' can reach here
+      const HChar* nm  = isR ? "sqrdmulh" : "sqdmulh";
+      DIP("%s %c%d, %c%d, %c%d\n", nm, arr, dd, arr, nn, arr, mm);
+      return True;
+   }
+
    if (bitU == 1 && size >= X10 && opcode == BITS5(1,1,0,1,0)) {
       /* -------- 1,1x,11010 FABD d_d_d, s_s_s -------- */
       IRType ity = size == X11 ? Ity_F64 : Ity_F32;
@@ -7621,8 +7647,8 @@
       IRTemp qresFW = IRTemp_INVALID, nresFW = IRTemp_INVALID;
       (isNEG ? math_SQNEG : math_SQABS)( &qresFW, &nresFW,
                                          getQReg128(nn), size );
-      IRTemp qres = math_ZERO_ALL_EXCEPT_LOWEST_LANE(mkexpr(qresFW), size);
-      IRTemp nres = math_ZERO_ALL_EXCEPT_LOWEST_LANE(mkexpr(nresFW), size);
+      IRTemp qres = math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(qresFW));
+      IRTemp nres = math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(nresFW));
       putQReg128(dd, mkexpr(qres));
       updateQCFLAGwithDifference(qres, nres);
       const HChar arr = "bhsd"[size];
@@ -7771,6 +7797,40 @@
       return True;
    }
 
+   if (opcode == BITS4(1,1,0,0) || opcode == BITS4(1,1,0,1)) {
+      /* -------- 0,xx,1100 SQDMULH s and h variants only -------- */
+      /* -------- 0,xx,1101 SQRDMULH s and h variants only -------- */
+      UInt mm  = 32; // invalid until decoded per-size below
+      UInt ix  = 16; // invalid until decoded per-size below
+      switch (size) {
+         case X00:
+            return False; // b case is not allowed
+         case X01:
+            mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
+         case X10:
+            mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
+         case X11:
+            return False; // d case is not allowed
+         default:
+            vassert(0);
+      }
+      vassert(mm < 32 && ix < 16); // both must have been decoded above
+      Bool isR = opcode == BITS4(1,1,0,1); // 1101 is the rounding form
+      IRTemp res, sat1q, sat1n, vN, vM;
+      res = sat1q = sat1n = vN = vM = IRTemp_INVALID;
+      vN = newTempV128();
+      assign(vN, getQReg128(nn));
+      vM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
+      math_SQDMULH(&res, &sat1q, &sat1n, isR, size, vN, vM);
+      IROp opZHI = mkVecZEROHIxxOFV128(size); // same mask for result and Q flag
+      putQReg128(dd, unop(opZHI, mkexpr(res)));
+      updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
+      const HChar* nm  = isR ? "sqrdmulh" : "sqdmulh";
+      HChar ch         = size == X01 ? 'h' : 's';
+      DIP("%s %c%d, %c%d, v%d.%c[%u]\n", nm, ch, dd, ch, nn, mm, ch, ix);
+      return True;
+   }
+
    return False;
 #  undef INSN
 }
@@ -9400,6 +9460,42 @@
       return True;
    }
 
+   if (opcode == BITS4(1,1,0,0) || opcode == BITS4(1,1,0,1)) {
+      /* -------- 0,xx,1100 SQDMULH s and h variants only -------- */
+      /* -------- 0,xx,1101 SQRDMULH s and h variants only -------- */
+      UInt mm  = 32; // invalid until decoded per-size below
+      UInt ix  = 16; // invalid until decoded per-size below
+      switch (size) {
+         case X00:
+            return False; // b case is not allowed
+         case X01:
+            mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
+         case X10:
+            mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
+         case X11:
+            return False; // d case is not allowed
+         default:
+            vassert(0);
+      }
+      vassert(mm < 32 && ix < 16); // both must have been decoded above
+      Bool isR = opcode == BITS4(1,1,0,1); // 1101 is the rounding form
+      IRTemp res, sat1q, sat1n, vN, vM;
+      res = sat1q = sat1n = vN = vM = IRTemp_INVALID;
+      vN = newTempV128();
+      assign(vN, getQReg128(nn));
+      vM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
+      math_SQDMULH(&res, &sat1q, &sat1n, isR, size, vN, vM);
+      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
+      IROp opZHI = bitQ == 0 ? Iop_ZeroHI64ofV128 : Iop_INVALID;
+      updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
+      const HChar* nm  = isR ? "sqrdmulh" : "sqdmulh";
+      const HChar* arr = nameArr_Q_SZ(bitQ, size);
+      HChar ch         = size == X01 ? 'h' : 's';
+      DIP("%s %s.%s, %s.%s, %s.%c[%u]\n", nm,
+          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), ch, ix);
+      return True;
+   }
+
    return False;
 #  undef INSN
 }