Add handling of 256-bit vector FP arithmetic ops, so as to support AVX.

Specifically: add V256 analogues of the existing V128 shadow helpers
(mkUifUV256, mkDifDV256, mkImproveANDV256, mkImproveORV256), the
pessimising casts mkPCast64x4 and mkPCast32x8, and the binary64Fx4,
unary64Fx4, binary32Fx8 and unary32Fx8 wrappers; route the 256-bit
arithmetic, logical (And/Or/Xor/Not V256) and data-steering ops
(64x4toV256, V128HLtoV256, V256toV128_{0,1}, V256to64_{0..3}) through
them.  Also add handling for Iop_Perm32x4.
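
For reference, here is a minimal standalone C model of the V-bit rule
that the new binary64Fx4 helper implements (the function below is
illustrative only and is not part of this patch):

   #include <stdint.h>

   /* Illustrative scalar model of one 64-bit lane of binary64Fx4:
      UifU (OrV256) marks a result bit undefined if it is undefined in
      either operand, and the pessimising cast (CmpNEZ64x4) then smears
      any undefinedness across the whole lane, since FP arithmetic can
      let any undefined input bit influence any output bit. */
   static uint64_t shadow64_lane ( uint64_t vbitsX, uint64_t vbitsY )
   {
      uint64_t u = vbitsX | vbitsY;       /* mkUifUV256, per lane   */
      return u == 0 ? 0 : ~(uint64_t)0;   /* mkPCast64x4, per lane  */
   }

The 32Fx8 case is identical but with 32-bit lanes (mkPCast32x8), and
the unary wrappers apply only the pessimising cast.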


git-svn-id: svn://svn.valgrind.org/valgrind/trunk@12673 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/memcheck/mc_translate.c b/memcheck/mc_translate.c
index f952423..ee9c3ed 100644
--- a/memcheck/mc_translate.c
+++ b/memcheck/mc_translate.c
@@ -510,6 +510,12 @@
    return assignNew('V', mce, Ity_V128, binop(Iop_AndV128, a1, a2));
 }
 
+static IRAtom* mkDifDV256 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
+   tl_assert(isShadowAtom(mce,a1));
+   tl_assert(isShadowAtom(mce,a2));
+   return assignNew('V', mce, Ity_V256, binop(Iop_AndV256, a1, a2));
+}
+
 /* --------- Undefined-if-either-undefined --------- */
 
 static IRAtom* mkUifU8 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
@@ -556,6 +562,12 @@
    return assignNew('V', mce, Ity_V128, binop(Iop_OrV128, a1, a2));
 }
 
+static IRAtom* mkUifUV256 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
+   tl_assert(isShadowAtom(mce,a1));
+   tl_assert(isShadowAtom(mce,a2));
+   return assignNew('V', mce, Ity_V256, binop(Iop_OrV256, a1, a2));
+}
+
 static IRAtom* mkUifU ( MCEnv* mce, IRType vty, IRAtom* a1, IRAtom* a2 ) {
    switch (vty) {
       case Ity_I8:   return mkUifU8(mce, a1, a2);
@@ -637,6 +649,14 @@
    return assignNew('V', mce, Ity_V128, binop(Iop_OrV128, data, vbits));
 }
 
+static IRAtom* mkImproveANDV256 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
+{
+   tl_assert(isOriginalAtom(mce, data));
+   tl_assert(isShadowAtom(mce, vbits));
+   tl_assert(sameKindedAtoms(data, vbits));
+   return assignNew('V', mce, Ity_V256, binop(Iop_OrV256, data, vbits));
+}
+
 /* ImproveOR(data, vbits) = ~data OR vbits.  Defined (0) data 1s give
    defined (0); all other -> undefined (1).
 */
@@ -700,6 +720,18 @@
                    vbits) );
 }
 
+static IRAtom* mkImproveORV256 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
+{
+   tl_assert(isOriginalAtom(mce, data));
+   tl_assert(isShadowAtom(mce, vbits));
+   tl_assert(sameKindedAtoms(data, vbits));
+   return assignNew(
+             'V', mce, Ity_V256, 
+             binop(Iop_OrV256, 
+                   assignNew('V', mce, Ity_V256, unop(Iop_NotV256, data)), 
+                   vbits) );
+}
+
 /* --------- Pessimising casts. --------- */
 
 /* The function returns an expression of type DST_TY. If any of the VBITS
@@ -1811,6 +1843,16 @@
    return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ64x2, at));
 }
 
+static IRAtom* mkPCast64x4 ( MCEnv* mce, IRAtom* at )
+{
+   return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ64x4, at));
+}
+
+static IRAtom* mkPCast32x8 ( MCEnv* mce, IRAtom* at )
+{
+   return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ32x8, at));
+}
+
 static IRAtom* mkPCast32x2 ( MCEnv* mce, IRAtom* at )
 {
    return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ32x2, at));
@@ -1985,6 +2027,50 @@
    return at;
 }
 
+/* --- ... and ... 64Fx4 versions of the same ... --- */
+
+static
+IRAtom* binary64Fx4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
+{
+   IRAtom* at;
+   tl_assert(isShadowAtom(mce, vatomX));
+   tl_assert(isShadowAtom(mce, vatomY));
+   at = mkUifUV256(mce, vatomX, vatomY);
+   at = assignNew('V', mce, Ity_V256, mkPCast64x4(mce, at));
+   return at;
+}
+
+static
+IRAtom* unary64Fx4 ( MCEnv* mce, IRAtom* vatomX )
+{
+   IRAtom* at;
+   tl_assert(isShadowAtom(mce, vatomX));
+   at = assignNew('V', mce, Ity_V256, mkPCast64x4(mce, vatomX));
+   return at;
+}
+
+/* --- ... and ... 32Fx8 versions of the same ... --- */
+
+static
+IRAtom* binary32Fx8 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
+{
+   IRAtom* at;
+   tl_assert(isShadowAtom(mce, vatomX));
+   tl_assert(isShadowAtom(mce, vatomY));
+   at = mkUifUV256(mce, vatomX, vatomY);
+   at = assignNew('V', mce, Ity_V256, mkPCast32x8(mce, at));
+   return at;
+}
+
+static
+IRAtom* unary32Fx8 ( MCEnv* mce, IRAtom* vatomX )
+{
+   IRAtom* at;
+   tl_assert(isShadowAtom(mce, vatomX));
+   at = assignNew('V', mce, Ity_V256, mkPCast32x8(mce, vatomX));
+   return at;
+}
+
 /* --- --- Vector saturated narrowing --- --- */
 
 /* We used to do something very clever here, but on closer inspection
@@ -2318,6 +2404,11 @@
          /* I32(rm) x F32 x F32 x F32 -> F32 */
          return mkLazy4(mce, Ity_I32, vatom1, vatom2, vatom3, vatom4);
 
+      /* V256-bit data-steering */
+      case Iop_64x4toV256:
+         return assignNew('V', mce, Ity_V256,
+                          IRExpr_Qop(op, vatom1, vatom2, vatom3, vatom4));
+
       default:
          ppIROp(op);
          VG_(tool_panic)("memcheck:expr2vbits_Qop");
@@ -2577,38 +2668,48 @@
       case Iop_PwMin32Ux2:
       case Iop_PwMax32Fx2:
       case Iop_PwMin32Fx2:
-         return assignNew('V', mce, Ity_I64, binop(Iop_PwMax32Ux2, mkPCast32x2(mce, vatom1),
-                     mkPCast32x2(mce, vatom2)));
+         return assignNew('V', mce, Ity_I64,
+                          binop(Iop_PwMax32Ux2, 
+                                mkPCast32x2(mce, vatom1),
+                                mkPCast32x2(mce, vatom2)));
 
       case Iop_PwMax16Sx4:
       case Iop_PwMax16Ux4:
       case Iop_PwMin16Sx4:
       case Iop_PwMin16Ux4:
-         return assignNew('V', mce, Ity_I64, binop(Iop_PwMax16Ux4, mkPCast16x4(mce, vatom1),
-                     mkPCast16x4(mce, vatom2)));
+         return assignNew('V', mce, Ity_I64,
+                          binop(Iop_PwMax16Ux4,
+                                mkPCast16x4(mce, vatom1),
+                                mkPCast16x4(mce, vatom2)));
 
       case Iop_PwMax8Sx8:
       case Iop_PwMax8Ux8:
       case Iop_PwMin8Sx8:
       case Iop_PwMin8Ux8:
-         return assignNew('V', mce, Ity_I64, binop(Iop_PwMax8Ux8, mkPCast8x8(mce, vatom1),
-                     mkPCast8x8(mce, vatom2)));
+         return assignNew('V', mce, Ity_I64,
+                          binop(Iop_PwMax8Ux8,
+                                mkPCast8x8(mce, vatom1),
+                                mkPCast8x8(mce, vatom2)));
 
       case Iop_PwAdd32x2:
       case Iop_PwAdd32Fx2:
          return mkPCast32x2(mce,
-               assignNew('V', mce, Ity_I64, binop(Iop_PwAdd32x2, mkPCast32x2(mce, vatom1),
-                     mkPCast32x2(mce, vatom2))));
+               assignNew('V', mce, Ity_I64,
+                         binop(Iop_PwAdd32x2,
+                               mkPCast32x2(mce, vatom1),
+                               mkPCast32x2(mce, vatom2))));
 
       case Iop_PwAdd16x4:
          return mkPCast16x4(mce,
-               assignNew('V', mce, Ity_I64, binop(op, mkPCast16x4(mce, vatom1),
-                     mkPCast16x4(mce, vatom2))));
+               assignNew('V', mce, Ity_I64,
+                         binop(op, mkPCast16x4(mce, vatom1),
+                                   mkPCast16x4(mce, vatom2))));
 
       case Iop_PwAdd8x8:
          return mkPCast8x8(mce,
-               assignNew('V', mce, Ity_I64, binop(op, mkPCast8x8(mce, vatom1),
-                     mkPCast8x8(mce, vatom2))));
+               assignNew('V', mce, Ity_I64,
+                         binop(op, mkPCast8x8(mce, vatom1),
+                                   mkPCast8x8(mce, vatom2))));
 
       case Iop_Shl8x8:
       case Iop_Shr8x8:
@@ -2997,13 +3098,19 @@
 
      /* Perm8x16: rearrange values in left arg using steering values
         from right arg.  So rearrange the vbits in the same way but
-        pessimise wrt steering values. */
+        pessimise wrt steering values.  Perm32x4 ditto. */
       case Iop_Perm8x16:
          return mkUifUV128(
                    mce,
                    assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
                    mkPCast8x16(mce, vatom2)
                 );
+      case Iop_Perm32x4:
+         return mkUifUV128(
+                   mce,
+                   assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
+                   mkPCast32x4(mce, vatom2)
+                );
 
      /* These two take the lower half of each 16-bit lane, sign/zero
         extend it to 32, and multiply together, producing a 32x4
@@ -3052,6 +3159,28 @@
       case Iop_64HLto128:
          return assignNew('V', mce, Ity_I128, binop(op, vatom1, vatom2));
 
+      /* V256-bit SIMD */
+
+      case Iop_Add64Fx4:
+      case Iop_Sub64Fx4:
+      case Iop_Mul64Fx4:
+      case Iop_Div64Fx4:
+      case Iop_Max64Fx4:
+      case Iop_Min64Fx4:
+         return binary64Fx4(mce, vatom1, vatom2);
+
+      case Iop_Add32Fx8:
+      case Iop_Sub32Fx8:
+      case Iop_Mul32Fx8:
+      case Iop_Div32Fx8:
+      case Iop_Max32Fx8:
+      case Iop_Min32Fx8:
+         return binary32Fx8(mce, vatom1, vatom2);
+
+      /* V256-bit data-steering */
+      case Iop_V128HLtoV256:
+         return assignNew('V', mce, Ity_V256, binop(op, vatom1, vatom2));
+
       /* Scalar floating point */
 
       case Iop_F32toI64S:
@@ -3119,7 +3248,8 @@
 
       case Iop_F64HLtoF128:
       case Iop_D64HLtoD128:
-         return assignNew('V', mce, Ity_I128, binop(Iop_64HLto128, vatom1, vatom2));
+         return assignNew('V', mce, Ity_I128,
+                          binop(Iop_64HLto128, vatom1, vatom2));
 
       case Iop_F64toI32U:
       case Iop_F64toI32S:
@@ -3171,21 +3301,24 @@
       case Iop_MullU64: {
          IRAtom* vLo64 = mkLeft64(mce, mkUifU64(mce, vatom1,vatom2));
          IRAtom* vHi64 = mkPCastTo(mce, Ity_I64, vLo64);
-         return assignNew('V', mce, Ity_I128, binop(Iop_64HLto128, vHi64, vLo64));
+         return assignNew('V', mce, Ity_I128,
+                          binop(Iop_64HLto128, vHi64, vLo64));
       }
 
       case Iop_MullS32:
       case Iop_MullU32: {
          IRAtom* vLo32 = mkLeft32(mce, mkUifU32(mce, vatom1,vatom2));
          IRAtom* vHi32 = mkPCastTo(mce, Ity_I32, vLo32);
-         return assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, vHi32, vLo32));
+         return assignNew('V', mce, Ity_I64,
+                          binop(Iop_32HLto64, vHi32, vLo32));
       }
 
       case Iop_MullS16:
       case Iop_MullU16: {
          IRAtom* vLo16 = mkLeft16(mce, mkUifU16(mce, vatom1,vatom2));
          IRAtom* vHi16 = mkPCastTo(mce, Ity_I16, vLo16);
-         return assignNew('V', mce, Ity_I32, binop(Iop_16HLto32, vHi16, vLo16));
+         return assignNew('V', mce, Ity_I32,
+                          binop(Iop_16HLto32, vHi16, vLo16));
       }
 
       case Iop_MullS8:
@@ -3305,6 +3438,9 @@
       case Iop_Shl8: case Iop_Shr8:
          return scalarShift( mce, Ity_I8, op, vatom1,vatom2, atom1,atom2 );
 
+      case Iop_AndV256:
+         uifu = mkUifUV256; difd = mkDifDV256; 
+         and_or_ty = Ity_V256; improve = mkImproveANDV256; goto do_And_Or;
       case Iop_AndV128:
          uifu = mkUifUV128; difd = mkDifDV128; 
          and_or_ty = Ity_V128; improve = mkImproveANDV128; goto do_And_Or;
@@ -3321,6 +3457,9 @@
          uifu = mkUifU8; difd = mkDifD8; 
          and_or_ty = Ity_I8; improve = mkImproveAND8; goto do_And_Or;
 
+      case Iop_OrV256:
+         uifu = mkUifUV256; difd = mkDifDV256; 
+         and_or_ty = Ity_V256; improve = mkImproveORV256; goto do_And_Or;
       case Iop_OrV128:
          uifu = mkUifUV128; difd = mkDifDV128; 
          and_or_ty = Ity_V128; improve = mkImproveORV128; goto do_And_Or;
@@ -3356,6 +3495,8 @@
          return mkUifU64(mce, vatom1, vatom2);
       case Iop_XorV128:
          return mkUifUV128(mce, vatom1, vatom2);
+      case Iop_XorV256:
+         return mkUifUV256(mce, vatom1, vatom2);
 
       default:
          ppIROp(op);
@@ -3377,6 +3518,14 @@
       case Iop_Sqrt64F0x2:
          return unary64F0x2(mce, vatom);
 
+      case Iop_Sqrt32Fx8:
+      case Iop_RSqrt32Fx8:
+      case Iop_Recip32Fx8:
+         return unary32Fx8(mce, vatom);
+
+      case Iop_Sqrt64Fx4:
+         return unary64Fx4(mce, vatom);
+
       case Iop_Sqrt32Fx4:
       case Iop_RSqrt32Fx4:
       case Iop_Recip32Fx4:
@@ -3419,6 +3568,7 @@
       case Iop_Reverse64_8x16:
       case Iop_Reverse64_16x8:
       case Iop_Reverse64_32x4:
+      case Iop_V256toV128_1: case Iop_V256toV128_0:
          return assignNew('V', mce, Ity_V128, unop(op, vatom));
 
       case Iop_F128HItoF64:  /* F128 -> high half of F128 */
@@ -3487,6 +3637,8 @@
       case Iop_Reverse64_8x8:
       case Iop_Reverse64_16x4:
       case Iop_Reverse64_32x2:
+      case Iop_V256to64_0: case Iop_V256to64_1:
+      case Iop_V256to64_2: case Iop_V256to64_3:
          return assignNew('V', mce, Ity_I64, unop(op, vatom));
 
       case Iop_I16StoF32:
@@ -3530,6 +3682,7 @@
       case Iop_ReinterpD64asI64:
       case Iop_DPBtoBCD:
       case Iop_BCDtoDPB:
+      case Iop_NotV256:
       case Iop_NotV128:
       case Iop_Not64:
       case Iop_Not32: