Change how FXSAVE and FXRSTOR are implemented, so as to avoid pushing
the XMM register contents themselves through the helper functions; the
XMM registers are now copied to and from memory by IR generated at
translation time instead.  This avoids the Memcheck false positives
reported in #291310.
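
For context, a minimal user-level reproducer in the spirit of #291310
(illustrative only, not part of this commit; assumes gcc/clang inline
asm on amd64).  Before this change, Memcheck could flag the branch on
'fcw' below even though FXSAVE always writes a fully defined x87
control word, because the undefined %xmm0 contents flowed through the
dirty helper and rendered the shadow of the entire in-memory image
undefined:

  #include <stdio.h>
  #include <stdint.h>
  #include <string.h>

  int main(void)
  {
     /* FXSAVE needs a 512-byte, 16-byte-aligned area. */
     struct { uint8_t b[512]; } img __attribute__((aligned(16)));

     /* Load deliberately uninitialised bytes into %xmm0, so that its
        Memcheck shadow is undefined at the FXSAVE. */
     struct { uint8_t b[16]; } junk __attribute__((aligned(16)));
     __asm__ __volatile__("movdqa %0, %%xmm0" : : "m" (junk) : "xmm0");

     __asm__ __volatile__("fxsave %0" : "=m" (img));

     uint16_t fcw;
     memcpy(&fcw, &img.b[0], sizeof fcw);   /* FCW is at offset 0 */
     if (fcw & 0x3f)        /* Memcheck checks definedness here */
        printf("x87 exception masks are set\n");
     return 0;
  }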

git-svn-id: svn://svn.valgrind.org/vex/trunk@2949 8f6e269a-dfd6-0310-a8e1-e2731360e62c
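
As a toy model of the mechanism (illustrative only; this is not
Memcheck source code): Memcheck folds the definedness of all of a
dirty call's declared inputs into a single value and applies it to
everything the call is declared to write, so one undefined %xmm
register used to render the whole 464-byte image undefined.  With the
XMM registers no longer declared as inputs, and the XMM copies done as
ordinary IR loads/stores, the two definedness flows stay separate:

  #include <stdio.h>

  typedef enum { SH_DEFINED, SH_UNDEFINED } Shadow;

  /* One undefined input makes the folded result undefined. */
  static Shadow fold(const Shadow* in, int n)
  {
     int i;
     for (i = 0; i < n; i++)
        if (in[i] == SH_UNDEFINED)
           return SH_UNDEFINED;
     return SH_DEFINED;
  }

  int main(void)
  {
     /* Inputs 0..5: FPTOP, FPREGS, FPTAGS, FPROUND, FC3210, SSEROUND;
        inputs 6..21: XMM0..15 (declared only under the old scheme). */
     Shadow in[22];
     int i;
     for (i = 0; i < 22; i++)
        in[i] = SH_DEFINED;
     in[6] = SH_UNDEFINED;   /* say %xmm0 holds junk */

     printf("old scheme, whole image:  %s\n",
            fold(in, 22) == SH_UNDEFINED ? "undefined" : "defined");
     printf("new scheme, non-XMM part: %s\n",
            fold(in, 6)  == SH_UNDEFINED ? "undefined" : "defined");
     return 0;
  }
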
diff --git a/priv/guest_amd64_defs.h b/priv/guest_amd64_defs.h
index 008638e..c321127 100644
--- a/priv/guest_amd64_defs.h
+++ b/priv/guest_amd64_defs.h
@@ -170,8 +170,10 @@
 
 extern void  amd64g_dirtyhelper_FINIT ( VexGuestAMD64State* );
 
-extern void      amd64g_dirtyhelper_FXSAVE  ( VexGuestAMD64State*, HWord );
-extern VexEmNote amd64g_dirtyhelper_FXRSTOR ( VexGuestAMD64State*, HWord );
+extern void      amd64g_dirtyhelper_FXSAVE_ALL_EXCEPT_XMM
+                    ( VexGuestAMD64State*, HWord );
+extern VexEmNote amd64g_dirtyhelper_FXRSTOR_ALL_EXCEPT_XMM
+                    ( VexGuestAMD64State*, HWord );
 
 extern ULong amd64g_dirtyhelper_RDTSC ( void );
 extern void  amd64g_dirtyhelper_RDTSCP ( VexGuestAMD64State* st );
diff --git a/priv/guest_amd64_helpers.c b/priv/guest_amd64_helpers.c
index 6ca67e9..4ed9fdd 100644
--- a/priv/guest_amd64_helpers.c
+++ b/priv/guest_amd64_helpers.c
@@ -1723,7 +1723,8 @@
 /* CALLED FROM GENERATED CODE */
 /* DIRTY HELPER (reads guest state, writes guest mem) */
 /* NOTE: only handles 32-bit format (no REX.W on the insn) */
-void amd64g_dirtyhelper_FXSAVE ( VexGuestAMD64State* gst, HWord addr )
+void amd64g_dirtyhelper_FXSAVE_ALL_EXCEPT_XMM ( VexGuestAMD64State* gst,
+                                                HWord addr )
 {
    /* Derived from values obtained from
       vendor_id       : AuthenticAMD
@@ -1738,7 +1739,6 @@
    Fpu_State tmp;
    UShort*   addrS = (UShort*)addr;
    UChar*    addrC = (UChar*)addr;
-   U128*     xmm   = (U128*)(addr + 160);
    UInt      mxcsr;
    UShort    fp_tags;
    UInt      summary_tags;
@@ -1803,76 +1803,30 @@
    }
 
    /* That's the first 160 bytes of the image done.  Now only %xmm0
-      .. %xmm15 remain to be copied.  If the host is big-endian, these
-      need to be byte-swapped. */
-   vassert(host_is_little_endian());
-
-#  define COPY_U128(_dst,_src)                       \
-      do { _dst[0] = _src[0]; _dst[1] = _src[1];     \
-           _dst[2] = _src[2]; _dst[3] = _src[3]; }   \
-      while (0)
-
-   COPY_U128( xmm[0],  gst->guest_YMM0 );
-   COPY_U128( xmm[1],  gst->guest_YMM1 );
-   COPY_U128( xmm[2],  gst->guest_YMM2 );
-   COPY_U128( xmm[3],  gst->guest_YMM3 );
-   COPY_U128( xmm[4],  gst->guest_YMM4 );
-   COPY_U128( xmm[5],  gst->guest_YMM5 );
-   COPY_U128( xmm[6],  gst->guest_YMM6 );
-   COPY_U128( xmm[7],  gst->guest_YMM7 );
-   COPY_U128( xmm[8],  gst->guest_YMM8 );
-   COPY_U128( xmm[9],  gst->guest_YMM9 );
-   COPY_U128( xmm[10], gst->guest_YMM10 );
-   COPY_U128( xmm[11], gst->guest_YMM11 );
-   COPY_U128( xmm[12], gst->guest_YMM12 );
-   COPY_U128( xmm[13], gst->guest_YMM13 );
-   COPY_U128( xmm[14], gst->guest_YMM14 );
-   COPY_U128( xmm[15], gst->guest_YMM15 );
-
-#  undef COPY_U128
+      .. %xmm15 remain to be copied, and we let the generated IR do
+      that, so that Memcheck's definedness flow for the XMM registers
+      is kept independent from that of all the other control and
+      status words in the structure.  This avoids the false positives
+      shown in #291310. */
 }
 
 
 /* CALLED FROM GENERATED CODE */
 /* DIRTY HELPER (writes guest state, reads guest mem) */
-VexEmNote amd64g_dirtyhelper_FXRSTOR ( VexGuestAMD64State* gst, HWord addr )
+VexEmNote amd64g_dirtyhelper_FXRSTOR_ALL_EXCEPT_XMM ( VexGuestAMD64State* gst,
+                                                      HWord addr )
 {
    Fpu_State tmp;
    VexEmNote warnX87 = EmNote_NONE;
    VexEmNote warnXMM = EmNote_NONE;
    UShort*   addrS   = (UShort*)addr;
    UChar*    addrC   = (UChar*)addr;
-   U128*     xmm     = (U128*)(addr + 160);
    UShort    fp_tags;
    Int       r, stno, i;
 
-   /* Restore %xmm0 .. %xmm15.  If the host is big-endian, these need
-      to be byte-swapped. */
-   vassert(host_is_little_endian());
-
-#  define COPY_U128(_dst,_src)                       \
-      do { _dst[0] = _src[0]; _dst[1] = _src[1];     \
-           _dst[2] = _src[2]; _dst[3] = _src[3]; }   \
-      while (0)
-
-   COPY_U128( gst->guest_YMM0, xmm[0] );
-   COPY_U128( gst->guest_YMM1, xmm[1] );
-   COPY_U128( gst->guest_YMM2, xmm[2] );
-   COPY_U128( gst->guest_YMM3, xmm[3] );
-   COPY_U128( gst->guest_YMM4, xmm[4] );
-   COPY_U128( gst->guest_YMM5, xmm[5] );
-   COPY_U128( gst->guest_YMM6, xmm[6] );
-   COPY_U128( gst->guest_YMM7, xmm[7] );
-   COPY_U128( gst->guest_YMM8, xmm[8] );
-   COPY_U128( gst->guest_YMM9, xmm[9] );
-   COPY_U128( gst->guest_YMM10, xmm[10] );
-   COPY_U128( gst->guest_YMM11, xmm[11] );
-   COPY_U128( gst->guest_YMM12, xmm[12] );
-   COPY_U128( gst->guest_YMM13, xmm[13] );
-   COPY_U128( gst->guest_YMM14, xmm[14] );
-   COPY_U128( gst->guest_YMM15, xmm[15] );
-
-#  undef COPY_U128
+   /* Don't restore %xmm0 .. %xmm15, for the same reasons that
+      amd64g_dirtyhelper_FXSAVE_ALL_EXCEPT_XMM doesn't save them.  See
+      comment in that function for details. */
 
    /* Copy the x87 registers out of the image, into a temporary
       Fpu_State struct. */
diff --git a/priv/guest_amd64_toIR.c b/priv/guest_amd64_toIR.c
index 78df883..0f85eca 100644
--- a/priv/guest_amd64_toIR.c
+++ b/priv/guest_amd64_toIR.c
@@ -13575,11 +13575,12 @@
          DIP("%sfxsave %s\n", sz==8 ? "rex64/" : "", dis_buf);
 
          /* Uses dirty helper: 
-               void amd64g_do_FXSAVE ( VexGuestAMD64State*, ULong ) */
+              void amd64g_dirtyhelper_FXSAVE_ALL_EXCEPT_XMM
+                 ( VexGuestAMD64State*, ULong ) */
          d = unsafeIRDirty_0_N ( 
                 0/*regparms*/, 
-                "amd64g_dirtyhelper_FXSAVE", 
-                &amd64g_dirtyhelper_FXSAVE,
+                "amd64g_dirtyhelper_FXSAVE_ALL_EXCEPT_XMM",
+                &amd64g_dirtyhelper_FXSAVE_ALL_EXCEPT_XMM,
                 mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
              );
 
@@ -13589,7 +13590,7 @@
          d->mSize = 464; /* according to recent Intel docs */
 
          /* declare we're reading guest state */
-         d->nFxState = 7;
+         d->nFxState = 6;
          vex_bzero(&d->fxState, sizeof(d->fxState));
 
          d->fxState[0].fx     = Ifx_Read;
@@ -13613,24 +13614,25 @@
          d->fxState[4].size   = sizeof(ULong);
 
          d->fxState[5].fx     = Ifx_Read;
-         d->fxState[5].offset = OFFB_YMM0;
-         d->fxState[5].size   = sizeof(U128);
-         /* plus 15 more of the above, spaced out in YMM sized steps */
-         d->fxState[5].nRepeats  = 15; 
-         d->fxState[5].repeatLen = sizeof(U256);
+         d->fxState[5].offset = OFFB_SSEROUND;
+         d->fxState[5].size   = sizeof(ULong);
 
-         d->fxState[6].fx     = Ifx_Read;
-         d->fxState[6].offset = OFFB_SSEROUND;
-         d->fxState[6].size   = sizeof(ULong);
-
-         /* Be paranoid ... this assertion tries to ensure the 16 %ymm
-            images are packed back-to-back.  If not, the settings for
-            d->fxState[5] are wrong. */
-         vassert(32 == sizeof(U256));
-         vassert(OFFB_YMM15 == (OFFB_YMM0 + 15 * 32));
-
+         /* Call the helper.  This creates all parts of the in-memory
+            image except for the XMM[0..15] array, which we store
+            separately, in order that any undefinedness in the XMM
+            registers is tracked separately by Memcheck and does not
+            "infect" the in-memory shadow for the other parts of the
+            image (FPTOP, FPREGS, FPTAGS, FPROUND, FC3210,
+            SSEROUND). */
          stmt( IRStmt_Dirty(d) );
 
+         /* And now the XMMs themselves. */
+         UInt xmm;
+         for (xmm = 0; xmm < 16; xmm++) {
+            storeLE( binop(Iop_Add64, mkexpr(addr), mkU64(160 + xmm * 16)),
+                     getXMMReg(xmm) );
+         }
+
          goto decode_success;
       }
       /* 0F AE /1 = FXRSTOR m512 -- read x87 and SSE state from memory.
@@ -13650,14 +13652,15 @@
          DIP("%sfxrstor %s\n", sz==8 ? "rex64/" : "", dis_buf);
 
          /* Uses dirty helper: 
-               VexEmNote amd64g_do_FXRSTOR ( VexGuestAMD64State*, ULong )
+              VexEmNote amd64g_dirtyhelper_FXRSTOR_ALL_EXCEPT_XMM
+                 ( VexGuestAMD64State*, ULong )
             NOTE:
-               the VexEmNote value is simply ignored
+              the VexEmNote value is simply ignored
          */
          d = unsafeIRDirty_0_N ( 
                 0/*regparms*/, 
-                "amd64g_dirtyhelper_FXRSTOR", 
-                &amd64g_dirtyhelper_FXRSTOR,
+                "amd64g_dirtyhelper_FXRSTOR_ALL_EXCEPT_XMM", 
+                &amd64g_dirtyhelper_FXRSTOR_ALL_EXCEPT_XMM,
                 mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
              );
 
@@ -13667,7 +13670,7 @@
          d->mSize = 464; /* according to recent Intel docs */
 
          /* declare we're writing guest state */
-         d->nFxState = 7;
+         d->nFxState = 6;
          vex_bzero(&d->fxState, sizeof(d->fxState));
 
          d->fxState[0].fx     = Ifx_Write;
@@ -13691,24 +13694,26 @@
          d->fxState[4].size   = sizeof(ULong);
 
          d->fxState[5].fx     = Ifx_Write;
-         d->fxState[5].offset = OFFB_YMM0;
-         d->fxState[5].size   = sizeof(U128);
-         /* plus 15 more of the above, spaced out in YMM sized steps */
-         d->fxState[5].nRepeats  = 15; 
-         d->fxState[5].repeatLen = sizeof(U256);
+         d->fxState[5].offset = OFFB_SSEROUND;
+         d->fxState[5].size   = sizeof(ULong);
 
-         d->fxState[6].fx     = Ifx_Write;
-         d->fxState[6].offset = OFFB_SSEROUND;
-         d->fxState[6].size   = sizeof(ULong);
-
-         /* Be paranoid ... this assertion tries to ensure the 16 %ymm
-            images are packed back-to-back.  If not, the settings for
-            d->fxState[5] are wrong. */
-         vassert(32 == sizeof(U256));
-         vassert(OFFB_YMM15 == (OFFB_YMM0 + 15 * 32));
-
+         /* Call the helper.  This reads all parts of the in-memory
+            image except for the XMM[0..15] array, which we load
+            separately, in order that any undefinedness in the XMM
+            registers is tracked separately by Memcheck and does not
+            "infect" the in-guest-state shadow for the other parts of the
+            image (FPTOP, FPREGS, FPTAGS, FPROUND, FC3210,
+            SSEROUND). */
          stmt( IRStmt_Dirty(d) );
 
+         /* And now the XMMs themselves. */
+         UInt xmm;
+         for (xmm = 0; xmm < 16; xmm++) {
+            putXMMReg(xmm, loadLE(Ity_V128,
+                                  binop(Iop_Add64, mkexpr(addr),
+                                                   mkU64(160 + xmm * 16))));
+         }
+
          goto decode_success;
       }
       break;