arm64: add support for 
FCVT{N,M,A,P,Z}{S,U} d_d, s_s 
FCVTN 4h/8h_4s, 2s/4s_2d 
FCVTL 4s_4h/8h, 2d_2s/4s 
FCVT Sd, Hn 
FCVT Dd, Hn 
FCVT Hd, Sn 
FCVT Hd, Dn 



git-svn-id: svn://svn.valgrind.org/vex/trunk@3111 8f6e269a-dfd6-0310-a8e1-e2731360e62c
diff --git a/priv/guest_arm64_toIR.c b/priv/guest_arm64_toIR.c
index fbd891e..293d65e 100644
--- a/priv/guest_arm64_toIR.c
+++ b/priv/guest_arm64_toIR.c
@@ -1416,7 +1416,7 @@
    UInt laneSzB = 0;
    switch (laneTy) {
       case Ity_I8:                 laneSzB = 1;  break;
-      case Ity_I16:                laneSzB = 2;  break;
+      case Ity_F16: case Ity_I16:  laneSzB = 2;  break;
       case Ity_F32: case Ity_I32:  laneSzB = 4;  break;
       case Ity_F64: case Ity_I64:  laneSzB = 8;  break;
       case Ity_V128:               laneSzB = 16; break;
@@ -1436,7 +1436,7 @@
    Int    off = offsetQRegLane(qregNo, ty, 0);
    switch (ty) {
       case Ity_I8:  case Ity_I16: case Ity_I32: case Ity_I64:
-      case Ity_F32: case Ity_F64: case Ity_V128:
+      case Ity_F16: case Ity_F32: case Ity_F64: case Ity_V128:
          break;
       default:
          vassert(0); // Other cases are probably invalid
@@ -1450,7 +1450,7 @@
    Int off = offsetQRegLane(qregNo, ty, 0);
    switch (ty) {
       case Ity_I8:
-      case Ity_I16:
+      case Ity_F16: case Ity_I16:
       case Ity_I32: case Ity_I64:
       case Ity_F32: case Ity_F64: case Ity_V128:
          break;
@@ -1537,7 +1537,7 @@
    switch (laneTy) {
       case Ity_F64: case Ity_I64:
       case Ity_I32: case Ity_F32:
-      case Ity_I16:
+      case Ity_I16: case Ity_F16:
       case Ity_I8:
          break;
       default:
@@ -1552,7 +1552,7 @@
    Int off = offsetQRegLane(qregNo, laneTy, laneNo);
    switch (laneTy) {
       case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
-      case Ity_F64: case Ity_F32:
+      case Ity_F64: case Ity_F32: case Ity_F16:
          break;
       default:
          vassert(0); // Other cases are ATC
@@ -9917,6 +9917,58 @@
       return True;
    }
 
+   ix = 0; /*INVALID*/
+   switch (opcode) {
+      case BITS5(1,1,0,1,0): ix = ((size & 2) == 2) ? 4 : 1; break;
+      case BITS5(1,1,0,1,1): ix = ((size & 2) == 2) ? 5 : 2; break;
+      case BITS5(1,1,1,0,0): if ((size & 2) == 0) ix = 3; break;
+      default: break;
+   }
+   if (ix > 0) {
+      /* -------- 0,0x,11010 FCVTNS d_d, s_s (ix 1) -------- */
+      /* -------- 0,0x,11011 FCVTMS d_d, s_s (ix 2) -------- */
+      /* -------- 0,0x,11100 FCVTAS d_d, s_s (ix 3) -------- */
+      /* -------- 0,1x,11010 FCVTPS d_d, s_s (ix 4) -------- */
+      /* -------- 0,1x,11011 FCVTZS d_d, s_s (ix 5) -------- */
+      /* -------- 1,0x,11010 FCVTNU d_d, s_s (ix 1) -------- */
+      /* -------- 1,0x,11011 FCVTMU d_d, s_s (ix 2) -------- */
+      /* -------- 1,0x,11100 FCVTAU d_d, s_s (ix 3) -------- */
+      /* -------- 1,1x,11010 FCVTPU d_d, s_s (ix 4) -------- */
+      /* -------- 1,1x,11011 FCVTZU d_d, s_s (ix 5) -------- */
+      Bool           is64 = (size & 1) == 1;
+      IRType         tyF  = is64 ? Ity_F64 : Ity_F32;
+      IRType         tyI  = is64 ? Ity_I64 : Ity_I32;
+      IRRoundingMode irrm = 8; /*impossible*/
+      HChar          ch   = '?';
+      switch (ix) {
+         case 1: ch = 'n'; irrm = Irrm_NEAREST; break;
+         case 2: ch = 'm'; irrm = Irrm_NegINF;  break;
+         case 3: ch = 'a'; irrm = Irrm_NEAREST; break; /* FIXME: kludge -- FCVTA needs ties-away, IR lacks it */
+         case 4: ch = 'p'; irrm = Irrm_PosINF;  break;
+         case 5: ch = 'z'; irrm = Irrm_ZERO;    break;
+         default: vassert(0);
+      }
+      IROp cvt = Iop_INVALID;
+      if (bitU == 1) {
+         cvt = is64 ? Iop_F64toI64U : Iop_F32toI32U;
+      } else {
+         cvt = is64 ? Iop_F64toI64S : Iop_F32toI32S;
+      }
+      IRTemp src = newTemp(tyF);
+      IRTemp res = newTemp(tyI);
+      assign(src, getQRegLane(nn, 0, tyF));
+      assign(res, binop(cvt, mkU32(irrm), mkexpr(src)));
+      putQRegLane(dd, 0, mkexpr(res)); /* bits 31-0 or 63-0 */
+      if (!is64) {
+         putQRegLane(dd, 1, mkU32(0)); /* bits 63-32 */
+      }
+      putQRegLane(dd, 1, mkU64(0));    /* bits 127-64 */
+      HChar sOrD = is64 ? 'd' : 's';
+      DIP("fcvt%c%c %c%u, %c%u\n", ch, bitU == 1 ? 'u' : 's',
+          sOrD, dd, sOrD, nn);
+      return True;
+   }
+
    if (size >= X10 && opcode == BITS5(1,1,1,0,1)) {
       /* -------- 0,1x,11101: FRECPE  d_d, s_s -------- */
       /* -------- 1,1x,11101: FRSQRTE d_d, s_s -------- */
@@ -11906,18 +11958,48 @@
       return True;
    }
 
-   if (bitU == 0 && size == X01 && opcode == BITS5(1,0,1,1,0)) {
-      /* -------- 0,01,10110: FCVTN 2s/4s_2d -------- */
-      IRTemp  rm    = mk_get_IR_rounding_mode();
-      IRExpr* srcLo = getQRegLane(nn, 0, Ity_F64);
-      IRExpr* srcHi = getQRegLane(nn, 1, Ity_F64);
-      putQRegLane(dd, 2 * bitQ + 0, binop(Iop_F64toF32, mkexpr(rm), srcLo));
-      putQRegLane(dd, 2 * bitQ + 1, binop(Iop_F64toF32, mkexpr(rm), srcHi));
+   if (bitU == 0 && size <= X01 && opcode == BITS5(1,0,1,1,0)) {
+      /* -------- 0,0x,10110: FCVTN 4h/8h_4s, 2s/4s_2d -------- */
+      UInt   nLanes = size == X00 ? 4 : 2;
+      IRType srcTy  = size == X00 ? Ity_F32 : Ity_F64;
+      IROp   opCvt  = size == X00 ? Iop_F32toF16 : Iop_F64toF32;
+      IRTemp rm     = mk_get_IR_rounding_mode();
+      IRTemp src[nLanes];
+      for (UInt i = 0; i < nLanes; i++) {
+         src[i] = newTemp(srcTy);
+         assign(src[i], getQRegLane(nn, i, srcTy));
+      }
+      for (UInt i = 0; i < nLanes; i++) {
+         putQRegLane(dd, nLanes * bitQ + i,
+                         binop(opCvt, mkexpr(rm), mkexpr(src[i])));
+      }
       if (bitQ == 0) {
          putQRegLane(dd, 1, mkU64(0));
       }
-      DIP("fcvtn%s %s.%s, %s.2d\n", bitQ ? "2" : "",
-          nameQReg128(dd), bitQ ? "4s" : "2s", nameQReg128(nn));
+      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, 1+size);
+      const HChar* arrWide   = nameArr_Q_SZ(1,    1+size+1);
+      DIP("fcvtn%s %s.%s, %s.%s\n", bitQ ? "2" : "",
+          nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide);
+      return True;
+   }
+
+   if (bitU == 0 && size <= X01 && opcode == BITS5(1,0,1,1,1)) {
+      /* -------- 0,0x,10111: FCVTL 4s_4h/8h, 2d_2s/4s -------- */
+      UInt   nLanes = size == X00 ? 4 : 2;
+      IRType srcTy  = size == X00 ? Ity_F16 : Ity_F32;
+      IROp   opCvt  = size == X00 ? Iop_F16toF32 : Iop_F32toF64;
+      IRTemp src[nLanes];
+      for (UInt i = 0; i < nLanes; i++) {
+         src[i] = newTemp(srcTy);
+         assign(src[i], getQRegLane(nn, nLanes * bitQ + i, srcTy));
+      }
+      for (UInt i = 0; i < nLanes; i++) {
+         putQRegLane(dd, i, unop(opCvt, mkexpr(src[i])));
+      }
+      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, 1+size);
+      const HChar* arrWide   = nameArr_Q_SZ(1,    1+size+1);
+      DIP("fcvtl%s %s.%s, %s.%s\n", bitQ ? "2" : "",
+          nameQReg128(dd), arrWide, nameQReg128(nn), arrNarrow);
       return True;
    }
 
@@ -12628,36 +12710,67 @@
       /* -------- 01,000111: FCVT h_d -------- */
       /* -------- 01,000100: FCVT s_d -------- */
       /* 31        23 21    16 14    9 4
-         000 11110 11 10001 00 10000 n d   FCVT Sd, Hn (unimp)
-         --------- 11 ----- 01 ---------   FCVT Dd, Hn (unimp)
-         --------- 00 ----- 11 ---------   FCVT Hd, Sn (unimp)
+         000 11110 11 10001 00 10000 n d   FCVT Sd, Hn
+         --------- 11 ----- 01 ---------   FCVT Dd, Hn
+         --------- 00 ----- 11 ---------   FCVT Hd, Sn
          --------- 00 ----- 01 ---------   FCVT Dd, Sn
-         --------- 01 ----- 11 ---------   FCVT Hd, Dn (unimp)
+         --------- 01 ----- 11 ---------   FCVT Hd, Dn
          --------- 01 ----- 00 ---------   FCVT Sd, Dn
          Rounding, when dst is smaller than src, is per the FPCR.
       */
       UInt b2322 = ty;
       UInt b1615 = opcode & BITS2(1,1);
-      if (b2322 == BITS2(0,0) && b1615 == BITS2(0,1)) {
-         /* Convert S to D */
-         IRTemp res = newTemp(Ity_F64);
-         assign(res, unop(Iop_F32toF64, getQRegLO(nn, Ity_F32)));
-         putQReg128(dd, mkV128(0x0000));
-         putQRegLO(dd, mkexpr(res));
-         DIP("fcvt %s, %s\n",
-             nameQRegLO(dd, Ity_F64), nameQRegLO(nn, Ity_F32));
-         return True;
-      }
-      if (b2322 == BITS2(0,1) && b1615 == BITS2(0,0)) {
-         /* Convert D to S */
-         IRTemp res = newTemp(Ity_F32);
-         assign(res, binop(Iop_F64toF32, mkexpr(mk_get_IR_rounding_mode()),
-                                         getQRegLO(nn, Ity_F64)));
-         putQReg128(dd, mkV128(0x0000));
-         putQRegLO(dd, mkexpr(res));
-         DIP("fcvt %s, %s\n",
-             nameQRegLO(dd, Ity_F32), nameQRegLO(nn, Ity_F64));
-         return True;
+      switch ((b2322 << 2) | b1615) {
+         case BITS4(0,0,0,1):   // S -> D
+         case BITS4(1,1,0,1): { // H -> D
+            Bool   srcIsH = b2322 == BITS2(1,1);
+            IRType srcTy  = srcIsH ? Ity_F16 : Ity_F32;
+            IRTemp res    = newTemp(Ity_F64);
+            assign(res, unop(srcIsH ? Iop_F16toF64 : Iop_F32toF64,
+                             getQRegLO(nn, srcTy)));
+            putQReg128(dd, mkV128(0x0000));
+            putQRegLO(dd, mkexpr(res));
+            DIP("fcvt %s, %s\n",
+                nameQRegLO(dd, Ity_F64), nameQRegLO(nn, srcTy));
+            return True;
+         }
+         case BITS4(0,1,0,0):   // D -> S
+         case BITS4(0,1,1,1): { // D -> H
+            Bool   dstIsH = b1615 == BITS2(1,1);
+            IRType dstTy  = dstIsH ? Ity_F16 : Ity_F32;
+            IRTemp res    = newTemp(dstTy);
+            assign(res, binop(dstIsH ? Iop_F64toF16 : Iop_F64toF32,
+                              mkexpr(mk_get_IR_rounding_mode()),
+                              getQRegLO(nn, Ity_F64)));
+            putQReg128(dd, mkV128(0x0000));
+            putQRegLO(dd, mkexpr(res));
+            DIP("fcvt %s, %s\n",
+                nameQRegLO(dd, dstTy), nameQRegLO(nn, Ity_F64));
+            return True;
+         }
+         case BITS4(0,0,1,1):   // S -> H
+         case BITS4(1,1,0,0): { // H -> S
+            Bool   toH   = b1615 == BITS2(1,1);
+            IRType srcTy = toH ? Ity_F32 : Ity_F16;
+            IRType dstTy = toH ? Ity_F16 : Ity_F32;
+            IRTemp res = newTemp(dstTy);
+            if (toH) {
+               assign(res, binop(Iop_F32toF16,
+                                 mkexpr(mk_get_IR_rounding_mode()),
+                                 getQRegLO(nn, srcTy)));
+
+            } else {
+               assign(res, unop(Iop_F16toF32,
+                                getQRegLO(nn, srcTy)));
+            }
+            putQReg128(dd, mkV128(0x0000));
+            putQRegLO(dd, mkexpr(res));
+            DIP("fcvt %s, %s\n",
+                nameQRegLO(dd, dstTy), nameQRegLO(nn, srcTy));
+            return True;
+         }
+         default:
+            break;
       }
       /* else unhandled */
       return False;
@@ -13029,7 +13142,6 @@
       ---------------- 01 --------------  FCVTP-------- (round to +inf)
       ---------------- 10 --------------  FCVTM-------- (round to -inf)
       ---------------- 11 --------------  FCVTZ-------- (round to zero)
-
       ---------------- 00 100 ----------  FCVTAS------- (nearest, ties away)
       ---------------- 00 101 ----------  FCVTAU------- (nearest, ties away)
 
diff --git a/priv/host_arm64_defs.c b/priv/host_arm64_defs.c
index f937c4d..fc0984a 100644
--- a/priv/host_arm64_defs.c
+++ b/priv/host_arm64_defs.c
@@ -176,6 +176,11 @@
    vex_printf("(S-reg)");
 }
 
+static void ppHRegARM64asHreg ( HReg reg ) {
+   ppHRegARM64(reg);
+   vex_printf("(H-reg)");
+}
+
 
 /* --------- Condition codes, ARM64 encoding. --------- */
 
@@ -1003,9 +1008,19 @@
    i->tag        = ARM64in_MFence;
    return i;
 }
+ARM64Instr* ARM64Instr_VLdStH ( Bool isLoad, HReg sD, HReg rN, UInt uimm12 ) {
+   ARM64Instr* i = LibVEX_Alloc_inline(sizeof(ARM64Instr));
+   i->tag                   = ARM64in_VLdStH;
+   i->ARM64in.VLdStH.isLoad = isLoad;
+   i->ARM64in.VLdStH.hD     = sD;
+   i->ARM64in.VLdStH.rN     = rN;
+   i->ARM64in.VLdStH.uimm12 = uimm12;
+   vassert(uimm12 < 8192 && 0 == (uimm12 & 1));
+   return i;
+}
 ARM64Instr* ARM64Instr_VLdStS ( Bool isLoad, HReg sD, HReg rN, UInt uimm12 ) {
    ARM64Instr* i = LibVEX_Alloc_inline(sizeof(ARM64Instr));
-   i->tag                  = ARM64in_VLdStS;
+   i->tag                   = ARM64in_VLdStS;
    i->ARM64in.VLdStS.isLoad = isLoad;
    i->ARM64in.VLdStS.sD     = sD;
    i->ARM64in.VLdStS.rN     = rN;
@@ -1015,7 +1030,7 @@
 }
 ARM64Instr* ARM64Instr_VLdStD ( Bool isLoad, HReg dD, HReg rN, UInt uimm12 ) {
    ARM64Instr* i = LibVEX_Alloc_inline(sizeof(ARM64Instr));
-   i->tag                  = ARM64in_VLdStD;
+   i->tag                   = ARM64in_VLdStD;
    i->ARM64in.VLdStD.isLoad = isLoad;
    i->ARM64in.VLdStD.dD     = dD;
    i->ARM64in.VLdStD.rN     = rN;
@@ -1052,12 +1067,28 @@
 }
 ARM64Instr* ARM64Instr_VCvtSD ( Bool sToD, HReg dst, HReg src ) {
    ARM64Instr* i = LibVEX_Alloc_inline(sizeof(ARM64Instr));
-   i->tag               = ARM64in_VCvtSD;
+   i->tag                 = ARM64in_VCvtSD;
    i->ARM64in.VCvtSD.sToD = sToD;
    i->ARM64in.VCvtSD.dst  = dst;
    i->ARM64in.VCvtSD.src  = src;
    return i;
 }
+ARM64Instr* ARM64Instr_VCvtHS ( Bool hToS, HReg dst, HReg src ) {
+   ARM64Instr* i = LibVEX_Alloc_inline(sizeof(ARM64Instr));
+   i->tag                 = ARM64in_VCvtHS;
+   i->ARM64in.VCvtHS.hToS = hToS;
+   i->ARM64in.VCvtHS.dst  = dst;
+   i->ARM64in.VCvtHS.src  = src;
+   return i;
+}
+ARM64Instr* ARM64Instr_VCvtHD ( Bool hToD, HReg dst, HReg src ) {
+   ARM64Instr* i = LibVEX_Alloc_inline(sizeof(ARM64Instr));
+   i->tag                 = ARM64in_VCvtHD;
+   i->ARM64in.VCvtHD.hToD = hToD;
+   i->ARM64in.VCvtHD.dst  = dst;
+   i->ARM64in.VCvtHD.src  = src;
+   return i;
+}
 ARM64Instr* ARM64Instr_VUnaryD ( ARM64FpUnaryOp op, HReg dst, HReg src ) {
    ARM64Instr* i = LibVEX_Alloc_inline(sizeof(ARM64Instr));
    i->tag                 = ARM64in_VUnaryD;
@@ -1534,6 +1565,21 @@
       case ARM64in_MFence:
          vex_printf("(mfence) dsb sy; dmb sy; isb");
          return;
+      case ARM64in_VLdStH:
+         if (i->ARM64in.VLdStH.isLoad) {
+            vex_printf("ldr    ");
+            ppHRegARM64asHreg(i->ARM64in.VLdStH.hD);
+            vex_printf(", %u(", i->ARM64in.VLdStH.uimm12);
+            ppHRegARM64(i->ARM64in.VLdStH.rN);
+            vex_printf(")");
+         } else {
+            vex_printf("str    ");
+            vex_printf("%u(", i->ARM64in.VLdStH.uimm12);
+            ppHRegARM64(i->ARM64in.VLdStH.rN);
+            vex_printf("), ");
+            ppHRegARM64asHreg(i->ARM64in.VLdStH.hD);
+         }
+         return;
       case ARM64in_VLdStS:
          if (i->ARM64in.VLdStS.isLoad) {
             vex_printf("ldr    ");
@@ -1613,6 +1659,30 @@
             ppHRegARM64(i->ARM64in.VCvtSD.src);
          }
          return;
+      case ARM64in_VCvtHS:
+         vex_printf("fcvt%s ", i->ARM64in.VCvtHS.hToS ? "h2s" : "s2h");
+         if (i->ARM64in.VCvtHS.hToS) {
+            ppHRegARM64asSreg(i->ARM64in.VCvtHS.dst);
+            vex_printf(", ");
+            ppHRegARM64asHreg(i->ARM64in.VCvtHS.src);
+         } else {
+            ppHRegARM64asHreg(i->ARM64in.VCvtHS.dst);
+            vex_printf(", ");
+            ppHRegARM64asSreg(i->ARM64in.VCvtHS.src);
+         }
+         return;
+      case ARM64in_VCvtHD:
+         vex_printf("fcvt%s ", i->ARM64in.VCvtHD.hToD ? "h2d" : "d2h");
+         if (i->ARM64in.VCvtHD.hToD) {
+            ppHRegARM64(i->ARM64in.VCvtHD.dst);
+            vex_printf(", ");
+            ppHRegARM64asHreg(i->ARM64in.VCvtHD.src);
+         } else {
+            ppHRegARM64asHreg(i->ARM64in.VCvtHD.dst);
+            vex_printf(", ");
+            ppHRegARM64(i->ARM64in.VCvtHD.src);
+         }
+         return;
       case ARM64in_VUnaryD:
          vex_printf("f%s ", showARM64FpUnaryOp(i->ARM64in.VUnaryD.op));
          ppHRegARM64(i->ARM64in.VUnaryD.dst);
@@ -1986,6 +2056,14 @@
          return;
       case ARM64in_MFence:
          return;
+      case ARM64in_VLdStH:
+         addHRegUse(u, HRmRead, i->ARM64in.VLdStH.rN);
+         if (i->ARM64in.VLdStH.isLoad) {
+            addHRegUse(u, HRmWrite, i->ARM64in.VLdStH.hD);
+         } else {
+            addHRegUse(u, HRmRead, i->ARM64in.VLdStH.hD);
+         }
+         return;
       case ARM64in_VLdStS:
          addHRegUse(u, HRmRead, i->ARM64in.VLdStS.rN);
          if (i->ARM64in.VLdStS.isLoad) {
@@ -2021,6 +2099,14 @@
          addHRegUse(u, HRmWrite, i->ARM64in.VCvtSD.dst);
          addHRegUse(u, HRmRead,  i->ARM64in.VCvtSD.src);
          return;
+      case ARM64in_VCvtHS:
+         addHRegUse(u, HRmWrite, i->ARM64in.VCvtHS.dst);
+         addHRegUse(u, HRmRead,  i->ARM64in.VCvtHS.src);
+         return;
+      case ARM64in_VCvtHD:
+         addHRegUse(u, HRmWrite, i->ARM64in.VCvtHD.dst);
+         addHRegUse(u, HRmRead,  i->ARM64in.VCvtHD.src);
+         return;
       case ARM64in_VUnaryD:
          addHRegUse(u, HRmWrite, i->ARM64in.VUnaryD.dst);
          addHRegUse(u, HRmRead, i->ARM64in.VUnaryD.src);
@@ -2230,6 +2316,10 @@
          return;
       case ARM64in_MFence:
          return;
+      case ARM64in_VLdStH:
+         i->ARM64in.VLdStH.hD = lookupHRegRemap(m, i->ARM64in.VLdStH.hD);
+         i->ARM64in.VLdStH.rN = lookupHRegRemap(m, i->ARM64in.VLdStH.rN);
+         return;
       case ARM64in_VLdStS:
          i->ARM64in.VLdStS.sD = lookupHRegRemap(m, i->ARM64in.VLdStS.sD);
          i->ARM64in.VLdStS.rN = lookupHRegRemap(m, i->ARM64in.VLdStS.rN);
@@ -2254,6 +2344,14 @@
          i->ARM64in.VCvtSD.dst = lookupHRegRemap(m, i->ARM64in.VCvtSD.dst);
          i->ARM64in.VCvtSD.src = lookupHRegRemap(m, i->ARM64in.VCvtSD.src);
          return;
+      case ARM64in_VCvtHS:
+         i->ARM64in.VCvtHS.dst = lookupHRegRemap(m, i->ARM64in.VCvtHS.dst);
+         i->ARM64in.VCvtHS.src = lookupHRegRemap(m, i->ARM64in.VCvtHS.src);
+         return;
+      case ARM64in_VCvtHD:
+         i->ARM64in.VCvtHD.dst = lookupHRegRemap(m, i->ARM64in.VCvtHD.dst);
+         i->ARM64in.VCvtHD.src = lookupHRegRemap(m, i->ARM64in.VCvtHD.src);
+         return;
       case ARM64in_VUnaryD:
          i->ARM64in.VUnaryD.dst = lookupHRegRemap(m, i->ARM64in.VUnaryD.dst);
          i->ARM64in.VUnaryD.src = lookupHRegRemap(m, i->ARM64in.VUnaryD.src);
@@ -2633,6 +2731,7 @@
 #define X11011000  BITS8(1,1,0,1,1,0,0,0)
 #define X11011010  BITS8(1,1,0,1,1,0,1,0)
 #define X11011110  BITS8(1,1,0,1,1,1,1,0)
+#define X11100010  BITS8(1,1,1,0,0,0,1,0)
 #define X11110001  BITS8(1,1,1,1,0,0,0,1)
 #define X11110011  BITS8(1,1,1,1,0,0,1,1)
 #define X11110101  BITS8(1,1,1,1,0,1,0,1)
@@ -3702,6 +3801,23 @@
       //   *p++ = 0xD5033F5F; /* clrex */
       //   goto done;
       //}
+      case ARM64in_VLdStH: {
+         /* 01 111101 01 imm12 n t   LDR Ht, [Xn|SP, #imm12 * 2]
+            01 111101 00 imm12 n t   STR Ht, [Xn|SP, #imm12 * 2]
+         */
+         UInt hD     = dregEnc(i->ARM64in.VLdStH.hD);
+         UInt rN     = iregEnc(i->ARM64in.VLdStH.rN);
+         UInt uimm12 = i->ARM64in.VLdStH.uimm12;
+         Bool isLD   = i->ARM64in.VLdStH.isLoad;
+         vassert(uimm12 < 8192 && 0 == (uimm12 & 1));
+         uimm12 >>= 1;
+         vassert(uimm12 < (1<<12));
+         vassert(hD < 32);
+         vassert(rN < 31);
+         *p++ = X_2_6_2_12_5_5(X01, X111101, isLD ? X01 : X00,
+                               uimm12, rN, hD);
+         goto done;
+      }
       case ARM64in_VLdStS: {
          /* 10 111101 01 imm12 n t   LDR St, [Xn|SP, #imm12 * 4]
             10 111101 00 imm12 n t   STR St, [Xn|SP, #imm12 * 4]
@@ -3852,7 +3968,7 @@
          goto done;
       }
       case ARM64in_VCvtSD: {
-         /* 31        23 21     16  14    9 4
+         /* 31         23 21    16  14    9 4
             000,11110, 00 10001 0,1 10000 n d   FCVT Dd, Sn (S->D)
             ---------- 01 ----- 0,0 ---------   FCVT Sd, Dn (D->S)
             Rounding, when dst is smaller than src, is per the FPCR.
@@ -3866,6 +3982,36 @@
          }
          goto done;
       }
+      case ARM64in_VCvtHS: {
+         /* 31         23 21    16  14    9 4
+            000,11110, 11 10001 0,0 10000 n d   FCVT Sd, Hn (H->S)
+            ---------- 00 ----- 1,1 ---------   FCVT Hd, Sn (S->H)
+            Rounding, when dst is smaller than src, is per the FPCR.
+         */
+         UInt dd = dregEnc(i->ARM64in.VCvtHS.dst);
+         UInt nn = dregEnc(i->ARM64in.VCvtHS.src);
+         if (i->ARM64in.VCvtHS.hToS) {
+            *p++ = X_3_5_8_6_5_5(X000, X11110, X11100010, X010000, nn, dd);
+         } else {
+            *p++ = X_3_5_8_6_5_5(X000, X11110, X00100011, X110000, nn, dd);
+         }
+         goto done;
+      }
+      case ARM64in_VCvtHD: {
+         /* 31         23 21    16  14    9 4
+            000,11110, 11 10001 0,1 10000 n d   FCVT Dd, Hn (H->D)
+            ---------- 01 ----- 1,1 ---------   FCVT Hd, Dn (D->H)
+            Rounding, when dst is smaller than src, is per the FPCR.
+         */
+         UInt dd = dregEnc(i->ARM64in.VCvtHD.dst);
+         UInt nn = dregEnc(i->ARM64in.VCvtHD.src);
+         if (i->ARM64in.VCvtHD.hToD) {
+            *p++ = X_3_5_8_6_5_5(X000, X11110, X11100010, X110000, nn, dd);
+         } else {
+            *p++ = X_3_5_8_6_5_5(X000, X11110, X01100011, X110000, nn, dd);
+         }
+         goto done;
+      }
       case ARM64in_VUnaryD: {
          /* 31        23 21     16 14    9 4
             000,11110 01 1,0000 0,0 10000 n d  FMOV Dd, Dn (not handled)
diff --git a/priv/host_arm64_defs.h b/priv/host_arm64_defs.h
index ab3d917..fce9f83 100644
--- a/priv/host_arm64_defs.h
+++ b/priv/host_arm64_defs.h
@@ -482,12 +482,15 @@
       ARM64in_StrEX,
       ARM64in_MFence,
       /* ARM64in_V*: scalar ops involving vector registers */
-      ARM64in_VLdStS,   /* 32-bit FP load/store, with imm offset  */
-      ARM64in_VLdStD,   /* 64-bit FP load/store, with imm offset  */
-      ARM64in_VLdStQ,
+      ARM64in_VLdStH,   /* ld/st to/from low 16 bits of vec reg, imm offset */
+      ARM64in_VLdStS,   /* ld/st to/from low 32 bits of vec reg, imm offset */
+      ARM64in_VLdStD,   /* ld/st to/from low 64 bits of vec reg, imm offset */
+      ARM64in_VLdStQ,   /* ld/st to/from all 128 bits of vec reg, no offset */
       ARM64in_VCvtI2F,
       ARM64in_VCvtF2I,
-      ARM64in_VCvtSD,
+      ARM64in_VCvtSD,   /* scalar 32 bit FP <--> 64 bit FP */
+      ARM64in_VCvtHS,   /* scalar 16 bit FP <--> 32 bit FP */
+      ARM64in_VCvtHD,   /* scalar 16 bit FP <--> 64 bit FP */
       ARM64in_VUnaryD,
       ARM64in_VUnaryS,
       ARM64in_VBinD,
@@ -670,21 +673,28 @@
          struct {
          } MFence;
          /* --- INSTRUCTIONS INVOLVING VECTOR REGISTERS --- */
-         /* 32-bit Fp load/store */
+         /* ld/st to/from low 16 bits of vec reg, imm offset */
+         struct {
+            Bool isLoad;
+            HReg hD;
+            HReg rN;
+            UInt uimm12;  /* 0 .. 8190 inclusive, 0 % 2 */
+         } VLdStH;
+         /* ld/st to/from low 32 bits of vec reg, imm offset */
          struct {
             Bool isLoad;
             HReg sD;
             HReg rN;
             UInt uimm12;  /* 0 .. 16380 inclusive, 0 % 4 */
          } VLdStS;
-         /* 64-bit Fp load/store */
+         /* ld/st to/from low 64 bits of vec reg, imm offset */
          struct {
             Bool isLoad;
             HReg dD;
             HReg rN;
             UInt uimm12;  /* 0 .. 32760 inclusive, 0 % 8 */
          } VLdStD;
-         /* 128-bit Vector load/store. */
+         /* ld/st to/from all 128 bits of vec reg, no offset */
          struct {
             Bool isLoad;
             HReg rQ; // data
@@ -704,13 +714,24 @@
             UChar      armRM; // ARM encoded RM:
                               // 00=nearest, 01=+inf, 10=-inf, 11=zero
          } VCvtF2I;
-         /* Convert between 32-bit and 64-bit FP values (both
-            ways). (FCVT) */
+         /* Convert between 32-bit and 64-bit FP values (both ways). (FCVT) */
          struct {
             Bool sToD; /* True: F32->F64.  False: F64->F32 */
             HReg dst;
             HReg src;
          } VCvtSD;
+         /* Convert between 16-bit and 32-bit FP values (both ways). (FCVT) */
+         struct {
+            Bool hToS; /* True: F16->F32.  False: F32->F16 */
+            HReg dst;
+            HReg src;
+         } VCvtHS;
+         /* Convert between 16-bit and 64-bit FP values (both ways). (FCVT) */
+         struct {
+            Bool hToD; /* True: F16->F64.  False: F64->F16 */
+            HReg dst;
+            HReg src;
+         } VCvtHD;
          /* 64-bit FP unary */
          struct {
             ARM64FpUnaryOp op;
@@ -887,6 +908,8 @@
 extern ARM64Instr* ARM64Instr_LdrEX   ( Int szB );
 extern ARM64Instr* ARM64Instr_StrEX   ( Int szB );
 extern ARM64Instr* ARM64Instr_MFence  ( void );
+extern ARM64Instr* ARM64Instr_VLdStH  ( Bool isLoad, HReg sD, HReg rN,
+                                        UInt uimm12 /* 0 .. 8190, 0 % 2 */ );
 extern ARM64Instr* ARM64Instr_VLdStS  ( Bool isLoad, HReg sD, HReg rN,
                                         UInt uimm12 /* 0 .. 16380, 0 % 4 */ );
 extern ARM64Instr* ARM64Instr_VLdStD  ( Bool isLoad, HReg dD, HReg rN,
@@ -896,6 +919,8 @@
 extern ARM64Instr* ARM64Instr_VCvtF2I ( ARM64CvtOp how, HReg rD, HReg rS,
                                         UChar armRM );
 extern ARM64Instr* ARM64Instr_VCvtSD  ( Bool sToD, HReg dst, HReg src );
+extern ARM64Instr* ARM64Instr_VCvtHS  ( Bool hToS, HReg dst, HReg src );
+extern ARM64Instr* ARM64Instr_VCvtHD  ( Bool hToD, HReg dst, HReg src );
 extern ARM64Instr* ARM64Instr_VUnaryD ( ARM64FpUnaryOp op, HReg dst, HReg src );
 extern ARM64Instr* ARM64Instr_VUnaryS ( ARM64FpUnaryOp op, HReg dst, HReg src );
 extern ARM64Instr* ARM64Instr_VBinD   ( ARM64FpBinOp op, HReg, HReg, HReg );
diff --git a/priv/host_arm64_isel.c b/priv/host_arm64_isel.c
index 829c39a..a6e507d 100644
--- a/priv/host_arm64_isel.c
+++ b/priv/host_arm64_isel.c
@@ -40,23 +40,6 @@
 #include "host_arm64_defs.h"
 
 
-//ZZ /*---------------------------------------------------------*/
-//ZZ /*--- ARMvfp control word stuff                         ---*/
-//ZZ /*---------------------------------------------------------*/
-//ZZ 
-//ZZ /* Vex-generated code expects to run with the FPU set as follows: all
-//ZZ    exceptions masked, round-to-nearest, non-vector mode, with the NZCV
-//ZZ    flags cleared, and FZ (flush to zero) disabled.  Curiously enough,
-//ZZ    this corresponds to a FPSCR value of zero.
-//ZZ 
-//ZZ    fpscr should therefore be zero on entry to Vex-generated code, and
-//ZZ    should be unchanged at exit.  (Or at least the bottom 28 bits
-//ZZ    should be zero).
-//ZZ */
-//ZZ 
-//ZZ #define DEFAULT_FPSCR 0
-
-
 /*---------------------------------------------------------*/
 /*--- ISelEnv                                           ---*/
 /*---------------------------------------------------------*/
@@ -223,6 +206,9 @@
 static HReg        iselFltExpr_wrk        ( ISelEnv* env, IRExpr* e );
 static HReg        iselFltExpr            ( ISelEnv* env, IRExpr* e );
 
+static HReg        iselF16Expr_wrk        ( ISelEnv* env, IRExpr* e );
+static HReg        iselF16Expr            ( ISelEnv* env, IRExpr* e );
+
 static HReg        iselV128Expr_wrk       ( ISelEnv* env, IRExpr* e );
 static HReg        iselV128Expr           ( ISelEnv* env, IRExpr* e );
 
@@ -1360,6 +1346,16 @@
       return ARM64cc_NE;
    }
 
+   /* --- patterns rooted at: CmpNEZ16 --- */
+
+   if (e->tag == Iex_Unop
+       && e->Iex.Unop.op == Iop_CmpNEZ16) {
+      HReg      r1    = iselIntExpr_R(env, e->Iex.Unop.arg);
+      ARM64RIL* xFFFF = mb_mkARM64RIL_I(0xFFFF);
+      addInstr(env, ARM64Instr_Test(r1, xFFFF));
+      return ARM64cc_NE;
+   }
+
    /* --- patterns rooted at: CmpNEZ64 --- */
 
    if (e->tag == Iex_Unop
@@ -1854,6 +1850,7 @@
             addInstr(env, ARM64Instr_VXfromDorS(dst, src, False/*!fromD*/));
             return dst;
          }
+         case Iop_1Sto16:
          case Iop_1Sto32:
          case Iop_1Sto64: {
             /* As with the iselStmt case for 'tmp:I1 = expr', we could
@@ -3051,6 +3048,12 @@
             addInstr(env, ARM64Instr_VCvtSD(True/*sToD*/, dst, src));
             return dst;
          }
+         case Iop_F16toF64: {
+            HReg src = iselF16Expr(env, e->Iex.Unop.arg);
+            HReg dst = newVRegD(env);
+            addInstr(env, ARM64Instr_VCvtHD(True/*hToD*/, dst, src));
+            return dst;
+         }
          case Iop_I32UtoF64:
          case Iop_I32StoF64: {
             /* Rounding mode is not involved here, since the
@@ -3226,6 +3229,12 @@
             addInstr(env, ARM64Instr_VUnaryS(ARM64fpu_ABS, dst, src));
             return dst;
          }
+         case Iop_F16toF32: {
+            HReg src = iselF16Expr(env, e->Iex.Unop.arg);
+            HReg dst = newVRegD(env);
+            addInstr(env, ARM64Instr_VCvtHS(True/*hToS*/, dst, src));
+            return dst;
+         }
          default:
             break;
       }
@@ -3253,7 +3262,7 @@
             HReg srcD = iselDblExpr(env, e->Iex.Binop.arg2);
             set_FPCR_rounding_mode(env, e->Iex.Binop.arg1);
             HReg dstS = newVRegD(env);
-            addInstr(env, ARM64Instr_VCvtSD(False/*dToS*/, dstS, srcD));
+            addInstr(env, ARM64Instr_VCvtSD(False/*!sToD*/, dstS, srcD));
             return dstS;
          }
          case Iop_I32UtoF32:
@@ -3316,6 +3325,70 @@
 
 
 /*---------------------------------------------------------*/
+/*--- ISEL: Floating point expressions (16 bit)         ---*/
+/*---------------------------------------------------------*/
+
+/* Compute a 16-bit floating point value into a register, the identity
+   of which is returned.  As with iselIntExpr_R, the reg may be either
+   real or virtual; in any case it must not be changed by subsequent
+   code emitted by the caller.  Values are generated into HRcFlt64
+   registers despite the values themselves being Ity_F16s. */
+
+static HReg iselF16Expr ( ISelEnv* env, IRExpr* e )
+{
+   HReg r = iselF16Expr_wrk( env, e );
+#  if 0
+   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
+#  endif
+   vassert(hregClass(r) == HRcFlt64);
+   vassert(hregIsVirtual(r));
+   return r;
+}
+
+/* DO NOT CALL THIS DIRECTLY */
+static HReg iselF16Expr_wrk ( ISelEnv* env, IRExpr* e )
+{
+   IRType ty = typeOfIRExpr(env->type_env,e);
+   vassert(e);
+   vassert(ty == Ity_F16);
+
+   if (e->tag == Iex_Get) {
+      Int offs = e->Iex.Get.offset;
+      if (offs >= 0 && offs < 8192 && 0 == (offs & 1)) {
+         HReg rD = newVRegD(env);
+         HReg rN = get_baseblock_register();
+         addInstr(env, ARM64Instr_VLdStH(True/*isLoad*/, rD, rN, offs));
+         return rD;
+      }
+   }
+
+   if (e->tag == Iex_Binop) {
+      switch (e->Iex.Binop.op) {
+         case Iop_F32toF16: {
+            HReg srcS = iselFltExpr(env, e->Iex.Binop.arg2);
+            set_FPCR_rounding_mode(env, e->Iex.Binop.arg1);
+            HReg dstH = newVRegD(env);
+            addInstr(env, ARM64Instr_VCvtHS(False/*!hToS*/, dstH, srcS));
+            return dstH;
+         }
+         case Iop_F64toF16: {
+            HReg srcD = iselDblExpr(env, e->Iex.Binop.arg2);
+            set_FPCR_rounding_mode(env, e->Iex.Binop.arg1);
+            HReg dstH = newVRegD(env);
+            addInstr(env, ARM64Instr_VCvtHD(False/*!hToD*/, dstH, srcD));
+            return dstH;
+         }
+         default:
+            break;
+      }
+   }
+
+   ppIRExpr(e);
+   vpanic("iselF16Expr_wrk");
+}
+
+
+/*---------------------------------------------------------*/
 /*--- ISEL: Vector expressions (256 bit)                ---*/
 /*---------------------------------------------------------*/
 
@@ -3534,9 +3607,15 @@
          return;
       }
       if (tyd == Ity_F32 && 0 == (offs & 3) && offs < (4<<12)) {
-         HReg dD   = iselFltExpr(env, stmt->Ist.Put.data);
+         HReg sD   = iselFltExpr(env, stmt->Ist.Put.data);
          HReg bbp  = get_baseblock_register();
-         addInstr(env, ARM64Instr_VLdStS(False/*!isLoad*/, dD, bbp, offs));
+         addInstr(env, ARM64Instr_VLdStS(False/*!isLoad*/, sD, bbp, offs));
+         return;
+      }
+      if (tyd == Ity_F16 && 0 == (offs & 1) && offs < (2<<12)) {
+         HReg hD   = iselF16Expr(env, stmt->Ist.Put.data);
+         HReg bbp  = get_baseblock_register();
+         addInstr(env, ARM64Instr_VLdStH(False/*!isLoad*/, hD, bbp, offs));
          return;
       }
 
@@ -3965,6 +4044,7 @@
             hreg   = mkHReg(True, HRcInt64, 0, j++);
             hregHI = mkHReg(True, HRcInt64, 0, j++);
             break;
+         case Ity_F16: // we'll use HRcFlt64 regs for F16 too
          case Ity_F32: // we'll use HRcFlt64 regs for F32 too
          case Ity_F64:
             hreg = mkHReg(True, HRcFlt64, 0, j++);