arm64: implement:
FRECPS  d_d_d, s_s_s 
FRSQRTS d_d_d, s_s_s 
FRECPE  d_d, s_s 
FRSQRTE d_d, s_s 
FRECPX  d_d, s_s 
FRECPS  2d_2d_2d, 4s_4s_4s, 2s_2s_2s 
FRSQRTS 2d_2d_2d, 4s_4s_4s, 2s_2s_2s 
FRECPE  2d_2d, 4s_4s, 2s_2s 
FRSQRTE 2d_2d, 4s_4s, 2s_2s 

git-svn-id: svn:// 8f6e269a-dfd6-0310-a8e1-e2731360e62c
diff --git a/priv/guest_arm64_toIR.c b/priv/guest_arm64_toIR.c
index 8843182..c40d44b 100644
--- a/priv/guest_arm64_toIR.c
+++ b/priv/guest_arm64_toIR.c
@@ -9700,6 +9700,23 @@
       return True;
+   if (bitU == 0 && opcode == BITS5(1,1,1,1,1)) {
+      /* -------- 0,0x,11111: FRECPS  d_d_d, s_s_s -------- */
+      /* -------- 0,1x,11111: FRSQRTS d_d_d, s_s_s -------- */
+      Bool isSQRT = (size & 2) == 2;
+      Bool isD    = (size & 1) == 1;
+      IROp op     = isSQRT ? (isD ? Iop_RSqrtStep64Fx2 : Iop_RSqrtStep32Fx4)
+                           : (isD ? Iop_RecipStep64Fx2 : Iop_RecipStep32Fx4);
+      IRTemp res = newTempV128();
+      assign(res, binop(op, getQReg128(nn), getQReg128(mm)));
+      putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
+                                                             mkexpr(res))));
+      HChar c = isD ? 'd' : 's';
+      DIP("%s %c%u, %c%u, %c%u\n", isSQRT ? "frsqrts" : "frecps",
+          c, dd, c, nn, c, mm);
+      return True;
+   }
    return False;
 #  undef INSN
@@ -9900,7 +9917,37 @@
       return True;
-#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
+   if (size >= X10 && opcode == BITS5(1,1,1,0,1)) {
+      /* -------- 0,1x,11101: FRECPE  d_d, s_s -------- */
+      /* -------- 1,1x,11101: FRSQRTE d_d, s_s -------- */
+      Bool isSQRT = bitU == 1;
+      Bool isD    = (size & 1) == 1;
+      IROp op     = isSQRT ? (isD ? Iop_RSqrtEst64Fx2 : Iop_RSqrtEst32Fx4)
+                           : (isD ? Iop_RecipEst64Fx2 : Iop_RecipEst32Fx4);
+      IRTemp resV = newTempV128();
+      assign(resV, unop(op, getQReg128(nn)));
+      putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
+                                                             mkexpr(resV))));
+      HChar c = isD ? 'd' : 's';
+      DIP("%s %c%u, %c%u\n", isSQRT ? "frsqrte" : "frecpe", c, dd, c, nn);
+      return True;
+   }
+   if (bitU == 0 && size >= X10 && opcode == BITS5(1,1,1,1,1)) {
+      /* -------- 0,1x,11111: FRECPX  d_d, s_s -------- */
+      Bool   isD = (size & 1) == 1;
+      IRType ty  = isD ? Ity_F64 : Ity_F32;
+      IROp   op  = isD ? Iop_RecpExpF64 : Iop_RecpExpF32;
+      IRTemp res = newTemp(ty);
+      IRTemp rm  = mk_get_IR_rounding_mode();
+      assign(res, binop(op, mkexpr(rm), getQRegLane(nn, 0, ty)));
+      putQReg128(dd, mkV128(0x0000));
+      putQRegLane(dd, 0, mkexpr(res));
+      HChar c = isD ? 'd' : 's';
+      DIP("%s %c%u, %c%u\n", "frecpx", c, dd, c, nn);
+      return True;
+   }
    return False;
 #  undef INSN
@@ -11449,6 +11496,23 @@
       return True;
+   if (bitU == 0 && opcode == BITS5(1,1,1,1,1)) {
+      /* -------- 0,0x,11111: FRECPS  2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
+      /* -------- 0,1x,11111: FRSQRTS 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
+      Bool isSQRT = (size & 2) == 2;
+      Bool isD    = (size & 1) == 1;
+      if (bitQ == 0 && isD) return False; // implied 1d case
+      IROp op     = isSQRT ? (isD ? Iop_RSqrtStep64Fx2 : Iop_RSqrtStep32Fx4)
+                           : (isD ? Iop_RecipStep64Fx2 : Iop_RecipStep32Fx4);
+      IRTemp res = newTempV128();
+      assign(res, binop(op, getQReg128(nn), getQReg128(mm)));
+      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
+      const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
+      DIP("%s %s.%s, %s.%s, %s.%s\n", isSQRT ? "frsqrts" : "frecps",
+          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
+      return True;
+   }
    return False;
 #  undef INSN
@@ -11857,7 +11921,6 @@
       return True;
    ix = 0;
    if (opcode == BITS5(1,1,0,0,0) || opcode == BITS5(1,1,0,0,1)) {
       ix = 1 + ((((bitU & 1) << 2) | ((size & 2) << 0)) | ((opcode & 1) << 0));
@@ -11928,8 +11991,6 @@
       return True;
    if (size == X10 && opcode == BITS5(1,1,1,0,0)) {
       /* -------- 0,10,11100: URECPE  4s_4s, 2s_2s -------- */
       /* -------- 1,10,11100: URSQRTE 4s_4s, 2s_2s -------- */
@@ -11983,6 +12044,23 @@
       /* else fall through */
+   if (size >= X10 && opcode == BITS5(1,1,1,0,1)) {
+      /* -------- 0,1x,11101: FRECPE  2d_2d, 4s_4s, 2s_2s -------- */
+      /* -------- 1,1x,11101: FRSQRTE 2d_2d, 4s_4s, 2s_2s -------- */
+      Bool isSQRT = bitU == 1;
+      Bool isD    = (size & 1) == 1;
+      IROp op     = isSQRT ? (isD ? Iop_RSqrtEst64Fx2 : Iop_RSqrtEst32Fx4)
+                           : (isD ? Iop_RecipEst64Fx2 : Iop_RecipEst32Fx4);
+      if (bitQ == 0 && isD) return False; // implied 1d case
+      IRTemp resV = newTempV128();
+      assign(resV, unop(op, getQReg128(nn)));
+      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, resV));
+      const HChar* arr = bitQ == 0 ? "2s" : (size == X11 ? "2d" : "4s");
+      DIP("%s %s.%s, %s.%s\n", isSQRT ? "frsqrte" : "frecpe",
+          nameQReg128(dd), arr, nameQReg128(nn), arr);
+      return True;
+   }
    return False;
 #  undef INSN
diff --git a/priv/host_arm64_defs.c b/priv/host_arm64_defs.c
index 8b5114e..7cc0910 100644
--- a/priv/host_arm64_defs.c
+++ b/priv/host_arm64_defs.c
@@ -559,10 +559,11 @@
 static const HChar* showARM64FpUnaryOp ( ARM64FpUnaryOp op ) {
    switch (op) {
-      case ARM64fpu_NEG:  return "neg  ";
-      case ARM64fpu_ABS:  return "abs  ";
-      case ARM64fpu_SQRT: return "sqrt ";
-      case ARM64fpu_RINT: return "rinti";
+      case ARM64fpu_NEG:   return "neg  ";
+      case ARM64fpu_ABS:   return "abs  ";
+      case ARM64fpu_SQRT:  return "sqrt ";
+      case ARM64fpu_RINT:  return "rinti";
+      case ARM64fpu_RECPX: return "recpx";
       default: vpanic("showARM64FpUnaryOp");
@@ -687,22 +688,26 @@
       case ARM64vecb_UQRSHL32x4:   *nm = "uqrshl";    *ar = "4s";   return;
       case ARM64vecb_UQRSHL16x8:   *nm = "uqrshl";    *ar = "8h";   return;
       case ARM64vecb_UQRSHL8x16:   *nm = "uqrshl";    *ar = "16b";  return;
-      case ARM64vecb_SSHL64x2:     *nm = "sshl";      *ar = "2d";   return;
-      case ARM64vecb_SSHL32x4:     *nm = "sshl";      *ar = "4s";   return;
-      case ARM64vecb_SSHL16x8:     *nm = "sshl";      *ar = "8h";   return;
-      case ARM64vecb_SSHL8x16:     *nm = "sshl";      *ar = "16b";  return;
-      case ARM64vecb_USHL64x2:     *nm = "ushl";      *ar = "2d";   return;
-      case ARM64vecb_USHL32x4:     *nm = "ushl";      *ar = "4s";   return;
-      case ARM64vecb_USHL16x8:     *nm = "ushl";      *ar = "8h";   return;
-      case ARM64vecb_USHL8x16:     *nm = "ushl";      *ar = "16b";  return;
-      case ARM64vecb_SRSHL64x2:    *nm = "srshl";     *ar = "2d";   return;
-      case ARM64vecb_SRSHL32x4:    *nm = "srshl";     *ar = "4s";   return;
-      case ARM64vecb_SRSHL16x8:    *nm = "srshl";     *ar = "8h";   return;
-      case ARM64vecb_SRSHL8x16:    *nm = "srshl";     *ar = "16b";  return;
-      case ARM64vecb_URSHL64x2:    *nm = "urshl";     *ar = "2d";   return;
-      case ARM64vecb_URSHL32x4:    *nm = "urshl";     *ar = "4s";   return;
-      case ARM64vecb_URSHL16x8:    *nm = "urshl";     *ar = "8h";   return;
-      case ARM64vecb_URSHL8x16:    *nm = "urshl";     *ar = "16b";  return;
+      case ARM64vecb_SSHL64x2:     *nm = "sshl  ";    *ar = "2d";   return;
+      case ARM64vecb_SSHL32x4:     *nm = "sshl  ";    *ar = "4s";   return;
+      case ARM64vecb_SSHL16x8:     *nm = "sshl  ";    *ar = "8h";   return;
+      case ARM64vecb_SSHL8x16:     *nm = "sshl  ";    *ar = "16b";  return;
+      case ARM64vecb_USHL64x2:     *nm = "ushl  ";    *ar = "2d";   return;
+      case ARM64vecb_USHL32x4:     *nm = "ushl  ";    *ar = "4s";   return;
+      case ARM64vecb_USHL16x8:     *nm = "ushl  ";    *ar = "8h";   return;
+      case ARM64vecb_USHL8x16:     *nm = "ushl  ";    *ar = "16b";  return;
+      case ARM64vecb_SRSHL64x2:    *nm = "srshl ";    *ar = "2d";   return;
+      case ARM64vecb_SRSHL32x4:    *nm = "srshl ";    *ar = "4s";   return;
+      case ARM64vecb_SRSHL16x8:    *nm = "srshl ";    *ar = "8h";   return;
+      case ARM64vecb_SRSHL8x16:    *nm = "srshl ";    *ar = "16b";  return;
+      case ARM64vecb_URSHL64x2:    *nm = "urshl ";    *ar = "2d";   return;
+      case ARM64vecb_URSHL32x4:    *nm = "urshl ";    *ar = "4s";   return;
+      case ARM64vecb_URSHL16x8:    *nm = "urshl ";    *ar = "8h";   return;
+      case ARM64vecb_URSHL8x16:    *nm = "urshl ";    *ar = "16b";  return;
+      case ARM64vecb_FRECPS64x2:   *nm = "frecps";    *ar = "2d";   return;
+      case ARM64vecb_FRECPS32x4:   *nm = "frecps";    *ar = "4s";   return;
+      case ARM64vecb_FRSQRTS64x2:  *nm = "frsqrts";   *ar = "2d";   return;
+      case ARM64vecb_FRSQRTS32x4:  *nm = "frsqrts";   *ar = "4s";   return;
       default: vpanic("showARM64VecBinOp");
@@ -752,6 +757,10 @@
       case ARM64vecu_REV644S:     *nm = "rev64";   *ar = "4s";  return;
       case ARM64vecu_URECPE32x4:  *nm = "urecpe";  *ar = "4s";  return;
       case ARM64vecu_URSQRTE32x4: *nm = "ursqrte"; *ar = "4s";  return;
+      case ARM64vecu_FRECPE64x2:  *nm = "frecpe";  *ar = "2d";  return;
+      case ARM64vecu_FRECPE32x4:  *nm = "frecpe";  *ar = "4s";  return;
+      case ARM64vecu_FRSQRTE64x2: *nm = "frsqrte"; *ar = "2d";  return;
+      case ARM64vecu_FRSQRTE32x4: *nm = "frsqrte"; *ar = "4s";  return;
       default: vpanic("showARM64VecUnaryOp");
@@ -2601,6 +2610,7 @@
 #define X110010  BITS8(0,0, 1,1,0,0,1,0)
 #define X110100  BITS8(0,0, 1,1,0,1,0,0)
 #define X110101  BITS8(0,0, 1,1,0,1,0,1)
+#define X110110  BITS8(0,0, 1,1,0,1,1,0)
 #define X110111  BITS8(0,0, 1,1,0,1,1,1)
 #define X111000  BITS8(0,0, 1,1,1,0,0,0)
 #define X111001  BITS8(0,0, 1,1,1,0,0,1)
@@ -2642,6 +2652,8 @@
 #define X11011110  BITS8(1,1,0,1,1,1,1,0)
 #define X11110001  BITS8(1,1,1,1,0,0,0,1)
 #define X11110011  BITS8(1,1,1,1,0,0,1,1)
+#define X11110101  BITS8(1,1,1,1,0,1,0,1)
+#define X11110111  BITS8(1,1,1,1,0,1,1,1)
 /* --- 4 fields --- */
@@ -3878,7 +3890,7 @@
             000,11110 01 1,0000 0,0 10000 n d  FMOV Dd, Dn (not handled)
             ------------------- 0,1 ---------  FABS ------
             ------------------- 1,0 ---------  FNEG ------
-            ------------------- 1,1 ---------  FQSRT -----
+            ------------------- 1,1 ---------  FSQRT -----
          UInt dD  = dregNo(i->ARM64in.VUnaryD.dst);
          UInt dN  = dregNo(i->ARM64in.VUnaryD.src);
@@ -3902,6 +3914,13 @@
            *p++ = X_3_8_5_6_5_5(X000, X11110011, X00111, X110000, dN, dD);
            goto done;
+         /* 
+            010, 11110 11 1,0000 1,1111 10 n d  FRECPX Dd, Dn
+         */
+         if (i->ARM64in.VUnaryD.op == ARM64fpu_RECPX) {
+           *p++ = X_3_8_5_6_5_5(X010, X11110111, X00001, X111110, dN, dD);
+           goto done;
+         }
          goto bad;
       case ARM64in_VUnaryS: {
@@ -3909,7 +3928,7 @@
             000,11110 00 1,0000 0,0 10000 n d  FMOV Sd, Sn (not handled)
             ------------------- 0,1 ---------  FABS ------
             ------------------- 1,0 ---------  FNEG ------
-            ------------------- 1,1 ---------  FQSRT -----
+            ------------------- 1,1 ---------  FSQRT -----
          UInt sD  = dregNo(i->ARM64in.VUnaryS.dst);
          UInt sN  = dregNo(i->ARM64in.VUnaryS.src);
@@ -3933,6 +3952,13 @@
            *p++ = X_3_8_5_6_5_5(X000, X11110001, X00111, X110000, sN, sD);
            goto done;
+         /* 
+            010, 11110 10 1,0000 1,1111 10 n d  FRECPX Sd, Sn
+         */
+         if (i->ARM64in.VUnaryS.op == ARM64fpu_RECPX) {
+           *p++ = X_3_8_5_6_5_5(X010, X11110101, X00001, X111110, sN, sD);
+           goto done;
+         }
          goto bad;
       case ARM64in_VBinD: {
@@ -4176,6 +4202,11 @@
             010 01110 sz 1 m  010101 n d   SRSHL@sz  Vd, Vn, Vm
             011 01110 sz 1 m  010001 n d   USHL@sz   Vd, Vn, Vm
             011 01110 sz 1 m  010101 n d   URSHL@sz  Vd, Vn, Vm
+            010 01110 01 1 m  111111 n d   FRECPS  Vd.2d, Vn.2d, Vm.2d
+            010 01110 00 1 m  111111 n d   FRECPS  Vd.4s, Vn.4s, Vm.4s
+            010 01110 11 1 m  111111 n d   FRSQRTS Vd.2d, Vn.2d, Vm.2d
+            010 01110 10 1 m  111111 n d   FRSQRTS Vd.4s, Vn.4s, Vm.4s
          UInt vD = qregNo(i->ARM64in.VBinV.dst);
          UInt vN = qregNo(i->ARM64in.VBinV.argL);
@@ -4616,6 +4647,19 @@
                *p++ = X_3_8_5_6_5_5(X011, X01110001, vM, X010101, vN, vD);
+            case ARM64vecb_FRECPS64x2:
+               *p++ = X_3_8_5_6_5_5(X010, X01110011, vM, X111111, vN, vD);
+               break;
+            case ARM64vecb_FRECPS32x4:
+               *p++ = X_3_8_5_6_5_5(X010, X01110001, vM, X111111, vN, vD);
+               break;
+            case ARM64vecb_FRSQRTS64x2:
+               *p++ = X_3_8_5_6_5_5(X010, X01110111, vM, X111111, vN, vD);
+               break;
+            case ARM64vecb_FRSQRTS32x4:
+               *p++ = X_3_8_5_6_5_5(X010, X01110101, vM, X111111, vN, vD);
+               break;
                goto bad;
@@ -4692,6 +4736,12 @@
             010 01110 10 1 00001 110010 n d  URECPE Vd.4s, Vn.4s
             011 01110 10 1 00001 110010 n d  URSQRTE Vd.4s, Vn.4s
+            010 01110 11 1 00001 110110 n d  FRECPE Vd.2d, Vn.2d
+            010 01110 10 1 00001 110110 n d  FRECPE Vd.4s, Vn.4s
+            011 01110 11 1 00001 110110 n d  FRSQRTE Vd.2d, Vn.2d
+            011 01110 10 1 00001 110110 n d  FRSQRTE Vd.4s, Vn.4s
          UInt vD = qregNo(i->ARM64in.VUnaryV.dst);
          UInt vN = qregNo(i->ARM64in.VUnaryV.arg);
@@ -4771,6 +4821,18 @@
             case ARM64vecu_URSQRTE32x4:
                *p++ = X_3_8_5_6_5_5(X011, X01110101, X00001, X110010, vN, vD);
+            case ARM64vecu_FRECPE64x2:
+               *p++ = X_3_8_5_6_5_5(X010, X01110111, X00001, X110110, vN, vD);
+               break;
+            case ARM64vecu_FRECPE32x4:
+               *p++ = X_3_8_5_6_5_5(X010, X01110101, X00001, X110110, vN, vD);
+               break;
+            case ARM64vecu_FRSQRTE64x2:
+               *p++ = X_3_8_5_6_5_5(X011, X01110111, X00001, X110110, vN, vD);
+               break;
+            case ARM64vecu_FRSQRTE32x4:
+               *p++ = X_3_8_5_6_5_5(X011, X01110101, X00001, X110110, vN, vD);
+               break;
                goto bad;
diff --git a/priv/host_arm64_defs.h b/priv/host_arm64_defs.h
index b25cf20..281cb0e 100644
--- a/priv/host_arm64_defs.h
+++ b/priv/host_arm64_defs.h
@@ -301,6 +301,7 @@
+      ARM64fpu_RECPX,
@@ -383,6 +384,8 @@
       ARM64vecb_SRSHL16x8,   ARM64vecb_SRSHL8x16, 
       ARM64vecb_URSHL64x2,   ARM64vecb_URSHL32x4,
       ARM64vecb_URSHL16x8,   ARM64vecb_URSHL8x16, 
+      ARM64vecb_FRECPS64x2,  ARM64vecb_FRECPS32x4,
+      ARM64vecb_FRSQRTS64x2, ARM64vecb_FRSQRTS32x4,
@@ -413,6 +416,8 @@
       ARM64vecu_REV6416B,     ARM64vecu_REV648H,      ARM64vecu_REV644S,
+      ARM64vecu_FRECPE64x2,   ARM64vecu_FRECPE32x4,
+      ARM64vecu_FRSQRTE64x2,  ARM64vecu_FRSQRTE32x4,
diff --git a/priv/host_arm64_isel.c b/priv/host_arm64_isel.c
index 9baed6b..afde38f 100644
--- a/priv/host_arm64_isel.c
+++ b/priv/host_arm64_isel.c
@@ -366,7 +366,7 @@
 /* Set the FP rounding mode: 'mode' is an I32-typed expression
    denoting a value in the range 0 .. 3, indicating a round mode
    encoded as per type IRRoundingMode -- the first four values only
-   (Irrm_NEAREST, Irrm_NegINF, Irrm_PosINF, Irrm_ZERO).  Set the PPC
+   (Irrm_NEAREST, Irrm_NegINF, Irrm_PosINF, Irrm_ZERO).  Set the ARM64
    FSCR to have the same rounding.
    For speed & simplicity, we're setting the *entire* FPCR here.
@@ -2244,9 +2244,12 @@
          case Iop_Reverse32sIn64_x2:
          case Iop_RecipEst32Ux4:
          case Iop_RSqrtEst32Ux4:
+         case Iop_RecipEst64Fx2: case Iop_RecipEst32Fx4:
+         case Iop_RSqrtEst64Fx2: case Iop_RSqrtEst32Fx4:
-            HReg res = newVRegV(env);
-            HReg arg = iselV128Expr(env, e->Iex.Unop.arg);
+            HReg res   = newVRegV(env);
+            HReg arg   = iselV128Expr(env, e->Iex.Unop.arg);
+            Bool setRM = False;
             ARM64VecUnaryOp op = ARM64vecu_INVALID;
             switch (e->Iex.Unop.op) {
                case Iop_NotV128:           op = ARM64vecu_NOT;         break;
@@ -2274,8 +2277,23 @@
                case Iop_Reverse32sIn64_x2: op = ARM64vecu_REV644S;     break;
                case Iop_RecipEst32Ux4:     op = ARM64vecu_URECPE32x4;  break;
                case Iop_RSqrtEst32Ux4:     op = ARM64vecu_URSQRTE32x4; break;
+               case Iop_RecipEst64Fx2:     setRM = True;
+                                           op = ARM64vecu_FRECPE64x2;  break;
+               case Iop_RecipEst32Fx4:     setRM = True;
+                                           op = ARM64vecu_FRECPE32x4;  break;
+               case Iop_RSqrtEst64Fx2:     setRM = True;
+                                           op = ARM64vecu_FRSQRTE64x2; break;
+               case Iop_RSqrtEst32Fx4:     setRM = True;
+                                           op = ARM64vecu_FRSQRTE32x4; break;
                default: vassert(0);
+            if (setRM) {
+               // This is a bit of a kludge.  We should do rm properly for
+               // these recip-est insns, but that would require changing the
+               // primop's type to take an rmode.
+               set_FPCR_rounding_mode(env, IRExpr_Const(
+                                              IRConst_U32(Irrm_NEAREST)));
+            }
             addInstr(env, ARM64Instr_VUnaryV(op, res, arg));
             return res;
@@ -2407,11 +2425,14 @@
          case Iop_Rsh32Ux4: case Iop_Rsh64Ux2:
          case Iop_Max64Fx2: case Iop_Max32Fx4:
          case Iop_Min64Fx2: case Iop_Min32Fx4:
+         case Iop_RecipStep64Fx2: case Iop_RecipStep32Fx4:
+         case Iop_RSqrtStep64Fx2: case Iop_RSqrtStep32Fx4:
-            HReg res  = newVRegV(env);
-            HReg argL = iselV128Expr(env, e->Iex.Binop.arg1);
-            HReg argR = iselV128Expr(env, e->Iex.Binop.arg2);
-            Bool sw   = False;
+            HReg res   = newVRegV(env);
+            HReg argL  = iselV128Expr(env, e->Iex.Binop.arg1);
+            HReg argR  = iselV128Expr(env, e->Iex.Binop.arg2);
+            Bool sw    = False;
+            Bool setRM = False;
             ARM64VecBinOp op = ARM64vecb_INVALID;
             switch (e->Iex.Binop.op) {
                case Iop_AndV128:    op = ARM64vecb_AND; break;
@@ -2528,8 +2549,23 @@
                case Iop_Max32Fx4:       op = ARM64vecb_FMAX32x4; break;
                case Iop_Min64Fx2:       op = ARM64vecb_FMIN64x2; break;
                case Iop_Min32Fx4:       op = ARM64vecb_FMIN32x4; break;
+               case Iop_RecipStep64Fx2: setRM = True;
+                                        op = ARM64vecb_FRECPS64x2; break;
+               case Iop_RecipStep32Fx4: setRM = True;
+                                        op = ARM64vecb_FRECPS32x4; break;
+               case Iop_RSqrtStep64Fx2: setRM = True;
+                                        op = ARM64vecb_FRSQRTS64x2; break;
+               case Iop_RSqrtStep32Fx4: setRM = True;
+                                        op = ARM64vecb_FRSQRTS32x4; break;
                default: vassert(0);
+            if (setRM) {
+               // This is a bit of a kludge.  We should do rm properly for
+               // these recip-step insns, but that would require changing the
+               // primop's type to take an rmode.
+               set_FPCR_rounding_mode(env, IRExpr_Const(
+                                              IRConst_U32(Irrm_NEAREST)));
+            }
             if (sw) {
                addInstr(env, ARM64Instr_VBinV(op, res, argR, argL));
             } else {
@@ -3034,18 +3070,20 @@
    if (e->tag == Iex_Binop) {
       switch (e->Iex.Binop.op) {
-         case Iop_RoundF64toInt: {
+         case Iop_RoundF64toInt:
+         case Iop_SqrtF64:
+         case Iop_RecpExpF64: {
             HReg src = iselDblExpr(env, e->Iex.Binop.arg2);
             HReg dst = newVRegD(env);
             set_FPCR_rounding_mode(env, e->Iex.Binop.arg1);
-            addInstr(env, ARM64Instr_VUnaryD(ARM64fpu_RINT, dst, src));
-            return dst;
-         }
-         case Iop_SqrtF64: {
-            HReg src = iselDblExpr(env, e->Iex.Binop.arg2);
-            HReg dst = newVRegD(env);
-            set_FPCR_rounding_mode(env, e->Iex.Binop.arg1);
-            addInstr(env, ARM64Instr_VUnaryD(ARM64fpu_SQRT, dst, src));
+            ARM64FpUnaryOp op = ARM64fpu_INVALID;
+            switch (e->Iex.Binop.op) {
+               case Iop_RoundF64toInt: op = ARM64fpu_RINT;  break;
+               case Iop_SqrtF64:       op = ARM64fpu_SQRT;  break;
+               case Iop_RecpExpF64:    op = ARM64fpu_RECPX; break;
+               default: vassert(0);
+            }
+            addInstr(env, ARM64Instr_VUnaryD(op, dst, src));
             return dst;
          case Iop_I64StoF64:
@@ -3195,18 +3233,20 @@
    if (e->tag == Iex_Binop) {
       switch (e->Iex.Binop.op) {
-         case Iop_RoundF32toInt: {
+         case Iop_RoundF32toInt:
+         case Iop_SqrtF32:
+         case Iop_RecpExpF32: {
             HReg src = iselFltExpr(env, e->Iex.Binop.arg2);
             HReg dst = newVRegD(env);
             set_FPCR_rounding_mode(env, e->Iex.Binop.arg1);
-            addInstr(env, ARM64Instr_VUnaryS(ARM64fpu_RINT, dst, src));
-            return dst;
-         }
-         case Iop_SqrtF32: {
-            HReg src = iselFltExpr(env, e->Iex.Binop.arg2);
-            HReg dst = newVRegD(env);
-            set_FPCR_rounding_mode(env, e->Iex.Binop.arg1);
-            addInstr(env, ARM64Instr_VUnaryS(ARM64fpu_SQRT, dst, src));
+            ARM64FpUnaryOp op = ARM64fpu_INVALID;
+            switch (e->Iex.Binop.op) {
+               case Iop_RoundF32toInt: op = ARM64fpu_RINT;  break;
+               case Iop_SqrtF32:       op = ARM64fpu_SQRT;  break;
+               case Iop_RecpExpF32:    op = ARM64fpu_RECPX; break;
+               default: vassert(0);
+            }
+            addInstr(env, ARM64Instr_VUnaryS(op, dst, src));
             return dst;
          case Iop_F64toF32: {
diff --git a/priv/ir_defs.c b/priv/ir_defs.c
index 64e487d..c56095c 100644
--- a/priv/ir_defs.c
+++ b/priv/ir_defs.c
@@ -337,6 +337,9 @@
       case Iop_TruncF64asF32: vex_printf("TruncF64asF32"); return;
+      case Iop_RecpExpF64: vex_printf("RecpExpF64"); return;
+      case Iop_RecpExpF32: vex_printf("RecpExpF32"); return;
       case Iop_QAdd32S: vex_printf("QAdd32S"); return;
       case Iop_QSub32S: vex_printf("QSub32S"); return; 
       case Iop_Add16x2:   vex_printf("Add16x2"); return;
@@ -637,10 +640,15 @@
       case Iop_RecipEst32F0x4: vex_printf("RecipEst32F0x4"); return;
       case Iop_RecipStep32Fx2: vex_printf("RecipStep32Fx2"); return;
       case Iop_RecipStep32Fx4: vex_printf("RecipStep32Fx4"); return;
+      case Iop_RecipEst64Fx2: vex_printf("RecipEst64Fx2"); return;
+      case Iop_RecipStep64Fx2: vex_printf("RecipStep64Fx2"); return;
       case Iop_Abs32Fx4:  vex_printf("Abs32Fx4"); return;
       case Iop_Abs64Fx2:  vex_printf("Abs64Fx2"); return;
       case Iop_RSqrtStep32Fx4:  vex_printf("RSqrtStep32Fx4"); return;
+      case Iop_RSqrtStep64Fx2:  vex_printf("RSqrtStep64Fx2"); return;
       case Iop_RSqrtStep32Fx2:  vex_printf("RSqrtStep32Fx2"); return;
+      case Iop_RSqrtEst64Fx2: vex_printf("RSqrtEst64Fx2"); return;
       case Iop_RSqrtEst32F0x4: vex_printf("RSqrtEst32F0x4"); return;
       case Iop_RSqrtEst32Fx8: vex_printf("RSqrtEst32Fx8"); return;
@@ -2753,10 +2761,12 @@
          UNARY(Ity_F32, Ity_F32);
       case Iop_SqrtF64:
+      case Iop_RecpExpF64:
          BINARY(ity_RMode,Ity_F64, Ity_F64);
       case Iop_SqrtF32:
       case Iop_RoundF32toInt:
+      case Iop_RecpExpF32:
          BINARY(ity_RMode,Ity_F32, Ity_F32);
       case Iop_CmpF32:
@@ -2971,8 +2981,8 @@
       case Iop_InterleaveOddLanes16x8: case Iop_InterleaveEvenLanes16x8:
       case Iop_InterleaveOddLanes32x4: case Iop_InterleaveEvenLanes32x4:
       case Iop_Perm8x16: case Iop_Perm32x4:
-      case Iop_RecipStep32Fx4:
-      case Iop_RSqrtStep32Fx4:
+      case Iop_RecipStep32Fx4: case Iop_RecipStep64Fx2:
+      case Iop_RSqrtStep32Fx4: case Iop_RSqrtStep64Fx2:
       case Iop_CipherV128:
       case Iop_CipherLV128:
       case Iop_NCipherV128:
@@ -2995,6 +3005,7 @@
       case Iop_NotV128:
       case Iop_RecipEst32Fx4: case Iop_RecipEst32F0x4:
+      case Iop_RecipEst64Fx2: case Iop_RSqrtEst64Fx2:
       case Iop_RecipEst32Ux4:
       case Iop_RSqrtEst32F0x4:
       case Iop_Sqrt32Fx4:  case Iop_Sqrt32F0x4:
diff --git a/pub/libvex_ir.h b/pub/libvex_ir.h
index bd97f87..3d2c2b2 100644
--- a/pub/libvex_ir.h
+++ b/pub/libvex_ir.h
@@ -754,6 +754,11 @@
       /* NB: pretty much the same as Iop_F64toF32, except no change 
          of type. */
+      /* --- guest arm64 specifics, not mandated by 754. --- */
+      Iop_RecpExpF64,  /* FRECPX d  :: IRRoundingMode(I32) x F64 -> F64 */
+      Iop_RecpExpF32,  /* FRECPX s  :: IRRoundingMode(I32) x F32 -> F32 */
       /* ------------------ 32-bit SIMD Integer ------------------ */
       /* 32x1 saturating add/sub (ok, well, not really SIMD :) */
@@ -1284,8 +1289,8 @@
       /* Vector Reciprocal Estimate finds an approximate reciprocal of each
-      element in the operand vector, and places the results in the destination
-      vector.  */
+         element in the operand vector, and places the results in the
+         destination vector.  */
       /* Vector Reciprocal Step computes (2.0 - arg1 * arg2).
@@ -1348,6 +1353,12 @@
+      /* see 32Fx4 variants for description */
+      Iop_RecipEst64Fx2,    // unary
+      Iop_RecipStep64Fx2,   // binary
+      Iop_RSqrtEst64Fx2,    // unary
+      Iop_RSqrtStep64Fx2,   // binary
       /* --- 64x2 lowest-lane-only scalar FP --- */
       /* In binary cases, upper half is copied from first operand.  In