arm64: implement:
suqadd, usqadd (scalar)
suqadd, usqadd (vector)
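
For reference, the per-lane operation being modelled is roughly the
following (a sketch in C; the helper names are illustrative only and do
not appear in the patch):

   /* SUQADD: signed saturating add of an unsigned value to a signed
      accumulator (8-bit lanes shown; wider lanes are analogous). */
   static signed char suqadd8 ( signed char d, unsigned char n )
   {
      int sum = (int)d + (int)n;      /* sign-extend d, zero-extend n */
      if (sum > 127) sum = 127;       /* saturate to the signed range */
      return (signed char)sum;
   }

   /* USQADD: unsigned saturating add of a signed value to an unsigned
      accumulator. */
   static unsigned char usqadd8 ( unsigned char d, signed char n )
   {
      int sum = (int)d + (int)n;      /* zero-extend d, sign-extend n */
      if (sum > 255) sum = 255;       /* saturate to the unsigned range */
      if (sum < 0)   sum = 0;
      return (unsigned char)sum;
   }

Both instructions set the QC flag when saturation occurs; the IR models
this by comparing the saturating result against the plain sum.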


git-svn-id: svn://svn.valgrind.org/vex/trunk@2928 8f6e269a-dfd6-0310-a8e1-e2731360e62c
diff --git a/priv/guest_arm64_toIR.c b/priv/guest_arm64_toIR.c
index a49b7b9..837abdd 100644
--- a/priv/guest_arm64_toIR.c
+++ b/priv/guest_arm64_toIR.c
@@ -621,6 +621,22 @@
    return ops[size];
 }
 
+static IROp mkVecQADDEXTSUSATUU ( UInt size ) {
+   const IROp ops[4]
+      = { Iop_QAddExtSUsatUU8x16, Iop_QAddExtSUsatUU16x8,
+          Iop_QAddExtSUsatUU32x4, Iop_QAddExtSUsatUU64x2 };
+   vassert(size < 4);
+   return ops[size];
+}
+
+static IROp mkVecQADDEXTUSSATSS ( UInt size ) {
+   const IROp ops[4]
+      = { Iop_QAddExtUSsatSS8x16, Iop_QAddExtUSsatSS16x8,
+          Iop_QAddExtUSsatSS32x4, Iop_QAddExtUSsatSS64x2 };
+   vassert(size < 4);
+   return ops[size];
+}
+
 static IROp mkVecSUB ( UInt size ) {
    const IROp ops[4]
       = { Iop_Sub8x16, Iop_Sub16x8, Iop_Sub32x4, Iop_Sub64x2 };
@@ -8137,6 +8153,31 @@
    UInt dd     = INSN(4,0);
    vassert(size < 4);
 
+   if (opcode == BITS5(0,0,0,1,1)) {
+      /* -------- 0,xx,00011: SUQADD std4_std4 -------- */
+      /* -------- 1,xx,00011: USQADD std4_std4 -------- */
+      /* These are a bit tricky (to say the least).  See comments on
+         the vector variants (in dis_AdvSIMD_two_reg_misc) below for
+         details. */
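+      /* Strategy (shared with the vector case below): compute both the
+         saturating result and the plain, non-saturating sum, write back
+         the former, and set QC if the two differ. */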
+      Bool   isUSQADD = bitU == 1;
+      IROp   qop  = isUSQADD ? mkVecQADDEXTSUSATUU(size)
+                             : mkVecQADDEXTUSSATSS(size);
+      IROp   nop  = mkVecADD(size);
+      IRTemp argL = newTempV128();
+      IRTemp argR = newTempV128();
+      assign(argL, getQReg128(nn));
+      assign(argR, getQReg128(dd));
+      IRTemp qres = math_ZERO_ALL_EXCEPT_LOWEST_LANE(
+                       size, binop(qop, mkexpr(argL), mkexpr(argR)));
+      IRTemp nres = math_ZERO_ALL_EXCEPT_LOWEST_LANE(
+                       size, binop(nop, mkexpr(argL), mkexpr(argR)));
+      putQReg128(dd, mkexpr(qres));
+      updateQCFLAGwithDifference(qres, nres);
+      const HChar arr = "bhsd"[size];
+      DIP("%s %c%u, %c%u\n", isUSQADD ? "usqadd" : "suqadd", arr, dd, arr, nn);
+      return True;
+   }
+
    if (opcode == BITS5(0,0,1,1,1)) {
       /* -------- 0,xx,00111 SQABS std4_std4 -------- */
       /* -------- 1,xx,00111 SQNEG std4_std4 -------- */
@@ -9747,6 +9788,39 @@
       return True;
    }
 
+   if (opcode == BITS5(0,0,0,1,1)) {
+      /* -------- 0,xx,00011: SUQADD std7_std7 -------- */
+      /* -------- 1,xx,00011: USQADD std7_std7 -------- */
+      if (bitQ == 0 && size == X11) return False; // implied 1d case
+      Bool isUSQADD = bitU == 1;
+      /* The qop selection below is switched (in the US vs SU sense)
+         deliberately: SUQADD corresponds to the ExtUSsatSS variants and
+         USQADD corresponds to the ExtSUsatUU variants.
+         See libvex_ir.h for more details. */
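+      /* Concretely: SUQADD Vd.T, Vn.T computes, per lane,
+         Vd = SignedSat(SInt(Vd) + UInt(Vn)).  With argL = Vn and
+         argR = Vd below, that is QAddExtUSsatSS: unsigned-widen the
+         left arg, signed-widen the right arg, saturate signedly. */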
+      IROp   qop  = isUSQADD ? mkVecQADDEXTSUSATUU(size)
+                             : mkVecQADDEXTUSSATSS(size);
+      IROp   nop  = mkVecADD(size);
+      IRTemp argL = newTempV128();
+      IRTemp argR = newTempV128();
+      IRTemp qres = newTempV128();
+      IRTemp nres = newTempV128();
+      /* Because the two arguments to the addition are implicitly
+         extended differently (one signedly, the other unsignedly), it is
+         important to present them to the primop in the correct order. */
+      assign(argL, getQReg128(nn));
+      assign(argR, getQReg128(dd));
+      assign(qres, math_MAYBE_ZERO_HI64_fromE(
+                      bitQ, binop(qop, mkexpr(argL), mkexpr(argR))));
+      assign(nres, math_MAYBE_ZERO_HI64_fromE(
+                      bitQ, binop(nop, mkexpr(argL), mkexpr(argR))));
+      putQReg128(dd, mkexpr(qres));
+      updateQCFLAGwithDifference(qres, nres);
+      const HChar* arr = nameArr_Q_SZ(bitQ, size);
+      DIP("%s %s.%s, %s.%s\n", isUSQADD ? "usqadd" : "suqadd",
+          nameQReg128(dd), arr, nameQReg128(nn), arr);
+      return True;
+   }
+
    if (opcode == BITS5(0,0,1,0,0)) {
       /* -------- 0,xx,00100: CLS std6_std6 -------- */
       /* -------- 1,xx,00100: CLZ std6_std6 -------- */
diff --git a/priv/host_arm64_defs.c b/priv/host_arm64_defs.c
index b977d19..df9b427 100644
--- a/priv/host_arm64_defs.c
+++ b/priv/host_arm64_defs.c
@@ -702,6 +702,22 @@
    }
 }
 
+static void showARM64VecModifyOp(/*OUT*/const HChar** nm,
+                                 /*OUT*/const HChar** ar,
+                                 ARM64VecModifyOp op ) {
+   switch (op) {
+      case ARM64vecmo_SUQADD64x2:   *nm = "suqadd";    *ar = "2d";   return;
+      case ARM64vecmo_SUQADD32x4:   *nm = "suqadd";    *ar = "4s";   return;
+      case ARM64vecmo_SUQADD16x8:   *nm = "suqadd";    *ar = "8h";   return;
+      case ARM64vecmo_SUQADD8x16:   *nm = "suqadd";    *ar = "16b";  return;
+      case ARM64vecmo_USQADD64x2:   *nm = "usqadd";    *ar = "2d";   return;
+      case ARM64vecmo_USQADD32x4:   *nm = "usqadd";    *ar = "4s";   return;
+      case ARM64vecmo_USQADD16x8:   *nm = "usqadd";    *ar = "8h";   return;
+      case ARM64vecmo_USQADD8x16:   *nm = "usqadd";    *ar = "16b";  return;
+      default: vpanic("showARM64VecModifyOp");
+   }
+}
+
 static void showARM64VecUnaryOp(/*OUT*/const HChar** nm,
                                 /*OUT*/const HChar** ar, ARM64VecUnaryOp op )
 {
@@ -1117,6 +1133,14 @@
    i->ARM64in.VBinV.argR = argR;
    return i;
 }
+ARM64Instr* ARM64Instr_VModifyV ( ARM64VecModifyOp op, HReg mod, HReg arg ) {
+   ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr));
+   i->tag                  = ARM64in_VModifyV;
+   i->ARM64in.VModifyV.op  = op;
+   i->ARM64in.VModifyV.mod = mod;
+   i->ARM64in.VModifyV.arg = arg;
+   return i;
+}
 ARM64Instr* ARM64Instr_VUnaryV ( ARM64VecUnaryOp op, HReg dst, HReg arg ) {
    ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr));
    i->tag                 = ARM64in_VUnaryV;
@@ -1639,6 +1663,17 @@
          vex_printf(".%s", ar);
          return;
       }
+      case ARM64in_VModifyV: {
+         const HChar* nm = "??";
+         const HChar* ar = "??";
+         showARM64VecModifyOp(&nm, &ar, i->ARM64in.VModifyV.op);
+         vex_printf("%s ", nm);
+         ppHRegARM64(i->ARM64in.VModifyV.mod);
+         vex_printf(".%s, ", ar);
+         ppHRegARM64(i->ARM64in.VModifyV.arg);
+         vex_printf(".%s", ar);
+         return;
+      }
       case ARM64in_VUnaryV: {
          const HChar* nm = "??";
          const HChar* ar = "??";
@@ -2000,6 +2035,11 @@
          addHRegUse(u, HRmRead, i->ARM64in.VBinV.argL);
          addHRegUse(u, HRmRead, i->ARM64in.VBinV.argR);
          return;
+      case ARM64in_VModifyV:
+         addHRegUse(u, HRmWrite, i->ARM64in.VModifyV.mod);
+         addHRegUse(u, HRmRead, i->ARM64in.VModifyV.mod);
+         addHRegUse(u, HRmRead, i->ARM64in.VModifyV.arg);
+         return;
       case ARM64in_VUnaryV:
          addHRegUse(u, HRmWrite, i->ARM64in.VUnaryV.dst);
          addHRegUse(u, HRmRead, i->ARM64in.VUnaryV.arg);
@@ -2214,6 +2254,10 @@
          i->ARM64in.VBinV.argL = lookupHRegRemap(m, i->ARM64in.VBinV.argL);
          i->ARM64in.VBinV.argR = lookupHRegRemap(m, i->ARM64in.VBinV.argR);
          return;
+      case ARM64in_VModifyV:
+         i->ARM64in.VModifyV.mod = lookupHRegRemap(m, i->ARM64in.VModifyV.mod);
+         i->ARM64in.VModifyV.arg = lookupHRegRemap(m, i->ARM64in.VModifyV.arg);
+         return;
       case ARM64in_VUnaryV:
          i->ARM64in.VUnaryV.dst = lookupHRegRemap(m, i->ARM64in.VUnaryV.dst);
          i->ARM64in.VUnaryV.arg = lookupHRegRemap(m, i->ARM64in.VUnaryV.arg);
@@ -4493,6 +4537,43 @@
          }
          goto done;
       }
+      case ARM64in_VModifyV: {
+         /* 31        23   20    15     9 4
+            010 01110 sz 1 00000 001110 n d   SUQADD@sz  Vd, Vn
+            011 01110 sz 1 00000 001110 n d   USQADD@sz  Vd, Vn
+         */
+         UInt vD = qregNo(i->ARM64in.VModifyV.mod);
+         UInt vN = qregNo(i->ARM64in.VModifyV.arg);
+         switch (i->ARM64in.VModifyV.op) {
+            case ARM64vecmo_SUQADD64x2:
+               *p++ = X_3_8_5_6_5_5(X010, X01110111, X00000, X001110, vN, vD);
+               break;
+            case ARM64vecmo_SUQADD32x4:
+               *p++ = X_3_8_5_6_5_5(X010, X01110101, X00000, X001110, vN, vD);
+               break;
+            case ARM64vecmo_SUQADD16x8:
+               *p++ = X_3_8_5_6_5_5(X010, X01110011, X00000, X001110, vN, vD);
+               break;
+            case ARM64vecmo_SUQADD8x16:
+               *p++ = X_3_8_5_6_5_5(X010, X01110001, X00000, X001110, vN, vD);
+               break;
+            case ARM64vecmo_USQADD64x2:
+               *p++ = X_3_8_5_6_5_5(X011, X01110111, X00000, X001110, vN, vD);
+               break;
+            case ARM64vecmo_USQADD32x4:
+               *p++ = X_3_8_5_6_5_5(X011, X01110101, X00000, X001110, vN, vD);
+               break;
+            case ARM64vecmo_USQADD16x8:
+               *p++ = X_3_8_5_6_5_5(X011, X01110011, X00000, X001110, vN, vD);
+               break;
+            case ARM64vecmo_USQADD8x16:
+               *p++ = X_3_8_5_6_5_5(X011, X01110001, X00000, X001110, vN, vD);
+               break;
+            default:
+               goto bad;
+         }
+         goto done;
+      }
       case ARM64in_VUnaryV: {
          /* 31        23   20    15     9 4
             010 01110 11 1 00000 111110 n d  FABS Vd.2d,  Vn.2d
diff --git a/priv/host_arm64_defs.h b/priv/host_arm64_defs.h
index 9755b52..e100b0f 100644
--- a/priv/host_arm64_defs.h
+++ b/priv/host_arm64_defs.h
@@ -386,6 +386,16 @@
 
 typedef
    enum {
+      ARM64vecmo_SUQADD64x2=335, ARM64vecmo_SUQADD32x4,
+      ARM64vecmo_SUQADD16x8,     ARM64vecmo_SUQADD8x16,
+      ARM64vecmo_USQADD64x2,     ARM64vecmo_USQADD32x4,
+      ARM64vecmo_USQADD16x8,     ARM64vecmo_USQADD8x16,
+      ARM64vecmo_INVALID
+   }
+   ARM64VecModifyOp;
+
+typedef
+   enum {
       ARM64vecu_FNEG64x2=300, ARM64vecu_FNEG32x4,
       ARM64vecu_FABS64x2,     ARM64vecu_FABS32x4,
       ARM64vecu_NOT,
@@ -482,6 +492,7 @@
       ARM64in_FPSR,
       /* ARM64in_V*V: vector ops on vector registers */
       ARM64in_VBinV,
+      ARM64in_VModifyV,
       ARM64in_VUnaryV,
       ARM64in_VNarrowV,
       ARM64in_VShiftImmV,
@@ -746,6 +757,13 @@
             HReg          argL;
             HReg          argR;
          } VBinV;
+         /* binary vector operation on vector registers.
+            Dst reg is also a src. */
+         struct {
+            ARM64VecModifyOp op;
+            HReg             mod;
+            HReg             arg;
+         } VModifyV;
          /* unary vector operation on vector registers */
          struct {
             ARM64VecUnaryOp op;
@@ -871,6 +889,7 @@
 extern ARM64Instr* ARM64Instr_FPCR    ( Bool toFPCR, HReg iReg );
 extern ARM64Instr* ARM64Instr_FPSR    ( Bool toFPSR, HReg iReg );
 extern ARM64Instr* ARM64Instr_VBinV   ( ARM64VecBinOp op, HReg, HReg, HReg );
+extern ARM64Instr* ARM64Instr_VModifyV ( ARM64VecModifyOp, HReg, HReg );
 extern ARM64Instr* ARM64Instr_VUnaryV ( ARM64VecUnaryOp op, HReg, HReg );
 extern ARM64Instr* ARM64Instr_VNarrowV ( ARM64VecNarrowOp op, UInt dszBlg2,
                                          HReg dst, HReg src );
diff --git a/priv/host_arm64_isel.c b/priv/host_arm64_isel.c
index 0787419..70c8073 100644
--- a/priv/host_arm64_isel.c
+++ b/priv/host_arm64_isel.c
@@ -2296,6 +2296,7 @@
             addInstr(env, ARM64Instr_VQfromXX(res, argL, argR));
             return res;
          }
+         /* -- Cases where we can generate a simple three-reg instruction. -- */
          case Iop_AndV128:
          case Iop_OrV128:
          case Iop_XorV128:
@@ -2471,6 +2472,40 @@
             }
             return res;
          }
+         /* -- These exist only as two-operand (modify-in-place) instructions,
+            so we first copy the second argument into a new register, which
+            the instruction then modifies. -- */
+         case Iop_QAddExtUSsatSS8x16: case Iop_QAddExtUSsatSS16x8:
+         case Iop_QAddExtUSsatSS32x4: case Iop_QAddExtUSsatSS64x2:
+         case Iop_QAddExtSUsatUU8x16: case Iop_QAddExtSUsatUU16x8:
+         case Iop_QAddExtSUsatUU32x4: case Iop_QAddExtSUsatUU64x2:
+         {
+            HReg res  = newVRegV(env);
+            HReg argL = iselV128Expr(env, e->Iex.Binop.arg1);
+            HReg argR = iselV128Expr(env, e->Iex.Binop.arg2);
+            ARM64VecModifyOp op = ARM64vecmo_INVALID;
+            switch (e->Iex.Binop.op) {
+               /* In the following 8 cases, the US - SU switching is intentional.
+                  See the comments in libvex_ir.h for details, and also the
+                  ARM64 front end, where these primops are generated. */
+               case Iop_QAddExtUSsatSS8x16: op = ARM64vecmo_SUQADD8x16; break;
+               case Iop_QAddExtUSsatSS16x8: op = ARM64vecmo_SUQADD16x8; break;
+               case Iop_QAddExtUSsatSS32x4: op = ARM64vecmo_SUQADD32x4; break;
+               case Iop_QAddExtUSsatSS64x2: op = ARM64vecmo_SUQADD64x2; break;
+               case Iop_QAddExtSUsatUU8x16: op = ARM64vecmo_USQADD8x16; break;
+               case Iop_QAddExtSUsatUU16x8: op = ARM64vecmo_USQADD16x8; break;
+               case Iop_QAddExtSUsatUU32x4: op = ARM64vecmo_USQADD32x4; break;
+               case Iop_QAddExtSUsatUU64x2: op = ARM64vecmo_USQADD64x2; break;
+               default: vassert(0);
+            }
+            /* The order of the operands matters: although this is basically
+               an addition, the two operands are extended differently, so each
+               must end up in the correct register of the instruction. */
+            addInstr(env, ARM64Instr_VMov(16, res, argR));
+            addInstr(env, ARM64Instr_VModifyV(op, res, argL));
+            return res;
+         }
+         /* -- Shifts by an immediate. -- */
          case Iop_ShrN64x2: case Iop_ShrN32x4:
          case Iop_ShrN16x8: case Iop_ShrN8x16:
          case Iop_SarN64x2: case Iop_SarN32x4:
@@ -2574,7 +2609,7 @@
             /* else fall out; this is unhandled */
             break;
          }
-
+         /* -- Saturating narrowing by an immediate -- */
          /* uu */
          case Iop_QandQShrNnarrow16Uto8Ux8:
          case Iop_QandQShrNnarrow32Uto16Ux4:
diff --git a/priv/ir_defs.c b/priv/ir_defs.c
index c305c22..c8f90fe 100644
--- a/priv/ir_defs.c
+++ b/priv/ir_defs.c
@@ -740,6 +740,16 @@
       case Iop_QAdd32Sx4: vex_printf("QAdd32Sx4"); return;
       case Iop_QAdd64Ux2: vex_printf("QAdd64Ux2"); return;
       case Iop_QAdd64Sx2: vex_printf("QAdd64Sx2"); return;
+
+      case Iop_QAddExtUSsatSS8x16: vex_printf("QAddExtUSsatSS8x16"); return;
+      case Iop_QAddExtUSsatSS16x8: vex_printf("QAddExtUSsatSS16x8"); return;
+      case Iop_QAddExtUSsatSS32x4: vex_printf("QAddExtUSsatSS32x4"); return;
+      case Iop_QAddExtUSsatSS64x2: vex_printf("QAddExtUSsatSS64x2"); return;
+      case Iop_QAddExtSUsatUU8x16: vex_printf("QAddExtSUsatUU8x16"); return;
+      case Iop_QAddExtSUsatUU16x8: vex_printf("QAddExtSUsatUU16x8"); return;
+      case Iop_QAddExtSUsatUU32x4: vex_printf("QAddExtSUsatUU32x4"); return;
+      case Iop_QAddExtSUsatUU64x2: vex_printf("QAddExtSUsatUU64x2"); return;
+
       case Iop_PwAdd8x16: vex_printf("PwAdd8x16"); return;
       case Iop_PwAdd16x8: vex_printf("PwAdd16x8"); return;
       case Iop_PwAdd32x4: vex_printf("PwAdd32x4"); return;
@@ -2892,6 +2902,10 @@
       case Iop_QAdd32Ux4: case Iop_QAdd64Ux2:
       case Iop_QAdd8Sx16: case Iop_QAdd16Sx8:
       case Iop_QAdd32Sx4: case Iop_QAdd64Sx2:
+      case Iop_QAddExtUSsatSS8x16: case Iop_QAddExtUSsatSS16x8:
+      case Iop_QAddExtUSsatSS32x4: case Iop_QAddExtUSsatSS64x2:
+      case Iop_QAddExtSUsatUU8x16: case Iop_QAddExtSUsatUU16x8:
+      case Iop_QAddExtSUsatUU32x4: case Iop_QAddExtSUsatUU64x2:
       case Iop_PwAdd8x16: case Iop_PwAdd16x8: case Iop_PwAdd32x4:
       case Iop_Sub8x16:   case Iop_Sub16x8:
       case Iop_Sub32x4:   case Iop_Sub64x2:
diff --git a/pub/libvex_ir.h b/pub/libvex_ir.h
index 9f66681..bc2fa46 100644
--- a/pub/libvex_ir.h
+++ b/pub/libvex_ir.h
@@ -1397,10 +1397,20 @@
       /* MISC (vector integer cmp != 0) */
       Iop_CmpNEZ8x16, Iop_CmpNEZ16x8, Iop_CmpNEZ32x4, Iop_CmpNEZ64x2,
 
-      /* ADDITION (normal / unsigned sat / signed sat) */
-      Iop_Add8x16,   Iop_Add16x8,   Iop_Add32x4,   Iop_Add64x2,
-      Iop_QAdd8Ux16, Iop_QAdd16Ux8, Iop_QAdd32Ux4, Iop_QAdd64Ux2,
-      Iop_QAdd8Sx16, Iop_QAdd16Sx8, Iop_QAdd32Sx4, Iop_QAdd64Sx2,
+      /* ADDITION (normal / U->U sat / S->S sat) */
+      Iop_Add8x16,    Iop_Add16x8,    Iop_Add32x4,    Iop_Add64x2,
+      Iop_QAdd8Ux16,  Iop_QAdd16Ux8,  Iop_QAdd32Ux4,  Iop_QAdd64Ux2,
+      Iop_QAdd8Sx16,  Iop_QAdd16Sx8,  Iop_QAdd32Sx4,  Iop_QAdd64Sx2,
+
+      /* ADDITION, ARM64-specific saturating variants. */
+      /* Unsigned widen left arg, signed widen right arg, add, saturate S->S.
+         This corresponds to SUQADD. */
+      Iop_QAddExtUSsatSS8x16, Iop_QAddExtUSsatSS16x8,
+      Iop_QAddExtUSsatSS32x4, Iop_QAddExtUSsatSS64x2,
+      /* Signed widen left arg, unsigned widen right arg, add, saturate U->U.
+         This corresponds to USQADD. */
+      Iop_QAddExtSUsatUU8x16, Iop_QAddExtSUsatUU16x8,
+      Iop_QAddExtSUsatUU32x4, Iop_QAddExtSUsatUU64x2,
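+      /* Per lane (8x16 shown; the wider variants are analogous):
+           QAddExtUSsatSS: res = SignedSat8  ( ZeroExt(argL) + SignExt(argR) )
+           QAddExtSUsatUU: res = UnsignedSat8( SignExt(argL) + ZeroExt(argR) ) */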
 
       /* SUBTRACTION (normal / unsigned sat / signed sat) */
       Iop_Sub8x16,   Iop_Sub16x8,   Iop_Sub32x4,   Iop_Sub64x2,