* Fix bug 290655 - add support for the AESKEYGENASSIST instruction
  (VEX part).
  Patch implementing the AES instructions (AESKEYGENASSIST, AESIMC,
  AESENC, AESENCLAST, AESDEC, AESDECLAST).



git-svn-id: svn://svn.valgrind.org/vex/trunk@2247 8f6e269a-dfd6-0310-a8e1-e2731360e62c
diff --git a/priv/guest_amd64_defs.h b/priv/guest_amd64_defs.h
index 7d3ed34..55ecfe9 100644
--- a/priv/guest_amd64_defs.h
+++ b/priv/guest_amd64_defs.h
@@ -211,6 +211,55 @@
           HWord edxIN, HWord eaxIN
        );
 
+/* Implementation of intel AES instructions as described in
+   Intel  Advanced Vector Extensions
+          Programming Reference
+          MARCH 2008
+          319433-002.
+
+   CALLED FROM GENERATED CODE: DIRTY HELPER(s).  (But not really,
+   actually it could be a clean helper, but for the fact that we can't
+   pass by value 2 x V128 to a clean helper, nor have one returned.)
+   Reads guest state, writes to guest state, no
+   accesses of memory, is a pure function.
+
+   opc4 contains the 4th byte of opcode. Front-end should only
+   give opcode corresponding to AESENC/AESENCLAST/AESDEC/AESDECLAST/AESIMC.
+   (will assert otherwise).
+
+   gstOffL and gstOffR are the guest state offsets for the two XMM
+   register inputs and/or output.  We never have to deal with the memory
+   case since that is handled by pre-loading the relevant value into the fake
+   XMM16 register.
+
+*/
+extern void amd64g_dirtyhelper_AES ( 
+          VexGuestAMD64State* gst,
+          HWord opc4,
+          HWord gstOffL, HWord gstOffR
+       );
+
+/* Implementation of AESKEYGENASSIST. 
+
+   CALLED FROM GENERATED CODE: DIRTY HELPER(s).  (But not really,
+   actually it could be a clean helper, but for the fact that we can't
+   pass by value 1 x V128 to a clean helper, nor have one returned.)
+   Reads guest state, writes to guest state, no
+   accesses of memory, is a pure function.
+
+   imm8 is the Round Key constant.
+
+   gstOffL and gstOffR are the guest state offsets of the XMM register
+   input and output respectively.  We never have to deal with the memory case since
+   that is handled by pre-loading the relevant value into the fake
+   XMM16 register.
+
+*/
+extern void amd64g_dirtyhelper_AESKEYGENASSIST ( 
+          VexGuestAMD64State* gst,
+          HWord imm8,
+          HWord gstOffL, HWord gstOffR
+       );
 
 //extern void  amd64g_dirtyhelper_CPUID_sse0 ( VexGuestAMD64State* );
 //extern void  amd64g_dirtyhelper_CPUID_sse1 ( VexGuestAMD64State* );
diff --git a/priv/guest_amd64_helpers.c b/priv/guest_amd64_helpers.c
index d554918..dd72b9c 100644
--- a/priv/guest_amd64_helpers.c
+++ b/priv/guest_amd64_helpers.c
@@ -2239,7 +2239,6 @@
                      dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16
                      xtpr pdcm sse4_1 sse4_2 popcnt aes lahf_lm ida
                      arat tpr_shadow vnmi flexpriority ept vpid
-                     MINUS aes (see below)
    bogomips        : 6957.57
    clflush size    : 64
    cache_alignment : 64
@@ -2263,10 +2262,7 @@
          SET_ABCD(0x0000000b, 0x756e6547, 0x6c65746e, 0x49656e69);
          break;
       case 0x00000001:
-         // & ~(1<<25): don't claim to support AES insns.  See
-         // bug 249991.
-         SET_ABCD(0x00020652, 0x00100800, 0x0298e3ff & ~(1<<25),
-                                          0xbfebfbff);
+         SET_ABCD(0x00020652, 0x00100800, 0x0298e3ff, 0xbfebfbff);
          break;
       case 0x00000002:
          SET_ABCD(0x55035a01, 0x00f0b2e3, 0x00000000, 0x09ca212c);
@@ -2921,6 +2917,324 @@
    }
 }
 
+/*---------------------------------------------------------------*/
+/*--- AES primitives and helpers                              ---*/
+/*---------------------------------------------------------------*/
+/* a 16 x 16 matrix */
+static const UChar sbox[256] = {                   // row nr
+   0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, // 1
+   0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
+   0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, // 2
+   0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
+   0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, // 3
+   0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
+   0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, // 4
+   0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
+   0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, // 5
+   0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
+   0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, // 6
+   0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
+   0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, // 7
+   0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
+   0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, // 8
+   0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
+   0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, // 9
+   0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
+   0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, //10
+   0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
+   0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, //11
+   0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
+   0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, //12
+   0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
+   0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, //13
+   0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
+   0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, //14
+   0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
+   0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, //15
+   0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
+   0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, //16
+   0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
+};
+static void SubBytes (V128* v)
+{
+   V128 r;
+   UInt i;
+   for (i = 0; i < 16; i++)
+      r.w8[i] = sbox[v->w8[i]];
+   *v = r;
+}
+
+/* a 16 x 16 matrix */
+static const UChar invsbox[256] = {                // row nr
+   0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, // 1
+   0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,     
+   0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, // 2
+   0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,     
+   0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, // 3
+   0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,     
+   0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, // 4
+   0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,     
+   0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, // 5
+   0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,     
+   0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, // 6
+   0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,     
+   0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, // 7
+   0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,     
+   0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, // 8
+   0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,     
+   0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, // 9
+   0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,     
+   0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, //10
+   0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,     
+   0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, //11
+   0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,     
+   0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, //12
+   0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,     
+   0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, //13
+   0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,     
+   0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, //14
+   0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,     
+   0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, //15
+   0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,     
+   0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, //16
+   0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+};
+static void InvSubBytes (V128* v)
+{
+   V128 r;
+   UInt i;
+   for (i = 0; i < 16; i++)
+      r.w8[i] = invsbox[v->w8[i]];
+   *v = r;
+}
+
+static const UChar ShiftRows_op[16] =
+   {11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0};
+static void ShiftRows (V128* v)
+{
+   V128 r;
+   UInt i;
+   for (i = 0; i < 16; i++)
+      r.w8[i] = v->w8[ShiftRows_op[15-i]];
+   *v = r;
+}
+
+static const UChar InvShiftRows_op[16] = 
+   {3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0};
+static void InvShiftRows (V128* v)
+{
+   V128 r;
+   UInt i;
+   for (i = 0; i < 16; i++)
+      r.w8[i] = v->w8[InvShiftRows_op[15-i]];
+   *v = r;
+}
+
+/* Multiplication of the finite fields elements of AES.
+   See "A Specification for The AES Algorithm Rijndael 
+        (by Joan Daemen & Vincent Rijmen)"
+        Dr. Brian Gladman, v3.1, 3rd March 2001. */
+/* N values so that (hex) xy = 0x03^N.
+   0x00 cannot be used. We put 0xff for this value.*/
+/* a 16 x 16 matrix */
+static const UChar Nxy[256] = {                    // row nr
+   0xff, 0x00, 0x19, 0x01, 0x32, 0x02, 0x1a, 0xc6, // 1
+   0x4b, 0xc7, 0x1b, 0x68, 0x33, 0xee, 0xdf, 0x03,     
+   0x64, 0x04, 0xe0, 0x0e, 0x34, 0x8d, 0x81, 0xef, // 2
+   0x4c, 0x71, 0x08, 0xc8, 0xf8, 0x69, 0x1c, 0xc1,     
+   0x7d, 0xc2, 0x1d, 0xb5, 0xf9, 0xb9, 0x27, 0x6a, // 3
+   0x4d, 0xe4, 0xa6, 0x72, 0x9a, 0xc9, 0x09, 0x78,     
+   0x65, 0x2f, 0x8a, 0x05, 0x21, 0x0f, 0xe1, 0x24, // 4
+   0x12, 0xf0, 0x82, 0x45, 0x35, 0x93, 0xda, 0x8e,     
+   0x96, 0x8f, 0xdb, 0xbd, 0x36, 0xd0, 0xce, 0x94, // 5
+   0x13, 0x5c, 0xd2, 0xf1, 0x40, 0x46, 0x83, 0x38,     
+   0x66, 0xdd, 0xfd, 0x30, 0xbf, 0x06, 0x8b, 0x62, // 6
+   0xb3, 0x25, 0xe2, 0x98, 0x22, 0x88, 0x91, 0x10,     
+   0x7e, 0x6e, 0x48, 0xc3, 0xa3, 0xb6, 0x1e, 0x42, // 7
+   0x3a, 0x6b, 0x28, 0x54, 0xfa, 0x85, 0x3d, 0xba,     
+   0x2b, 0x79, 0x0a, 0x15, 0x9b, 0x9f, 0x5e, 0xca, // 8
+   0x4e, 0xd4, 0xac, 0xe5, 0xf3, 0x73, 0xa7, 0x57,     
+   0xaf, 0x58, 0xa8, 0x50, 0xf4, 0xea, 0xd6, 0x74, // 9
+   0x4f, 0xae, 0xe9, 0xd5, 0xe7, 0xe6, 0xad, 0xe8,     
+   0x2c, 0xd7, 0x75, 0x7a, 0xeb, 0x16, 0x0b, 0xf5, //10
+   0x59, 0xcb, 0x5f, 0xb0, 0x9c, 0xa9, 0x51, 0xa0,     
+   0x7f, 0x0c, 0xf6, 0x6f, 0x17, 0xc4, 0x49, 0xec, //11
+   0xd8, 0x43, 0x1f, 0x2d, 0xa4, 0x76, 0x7b, 0xb7,     
+   0xcc, 0xbb, 0x3e, 0x5a, 0xfb, 0x60, 0xb1, 0x86, //12
+   0x3b, 0x52, 0xa1, 0x6c, 0xaa, 0x55, 0x29, 0x9d,     
+   0x97, 0xb2, 0x87, 0x90, 0x61, 0xbe, 0xdc, 0xfc, //13
+   0xbc, 0x95, 0xcf, 0xcd, 0x37, 0x3f, 0x5b, 0xd1,     
+   0x53, 0x39, 0x84, 0x3c, 0x41, 0xa2, 0x6d, 0x47, //14
+   0x14, 0x2a, 0x9e, 0x5d, 0x56, 0xf2, 0xd3, 0xab,     
+   0x44, 0x11, 0x92, 0xd9, 0x23, 0x20, 0x2e, 0x89, //15
+   0xb4, 0x7c, 0xb8, 0x26, 0x77, 0x99, 0xe3, 0xa5,     
+   0x67, 0x4a, 0xed, 0xde, 0xc5, 0x31, 0xfe, 0x18, //16
+   0x0d, 0x63, 0x8c, 0x80, 0xc0, 0xf7, 0x70, 0x07
+};
+
+/* E values so that E = 0x03^xy. */
+static const UChar Exy[256] = {                    // row nr
+   0x01, 0x03, 0x05, 0x0f, 0x11, 0x33, 0x55, 0xff, // 1
+   0x1a, 0x2e, 0x72, 0x96, 0xa1, 0xf8, 0x13, 0x35,     
+   0x5f, 0xe1, 0x38, 0x48, 0xd8, 0x73, 0x95, 0xa4, // 2
+   0xf7, 0x02, 0x06, 0x0a, 0x1e, 0x22, 0x66, 0xaa,     
+   0xe5, 0x34, 0x5c, 0xe4, 0x37, 0x59, 0xeb, 0x26, // 3
+   0x6a, 0xbe, 0xd9, 0x70, 0x90, 0xab, 0xe6, 0x31,     
+   0x53, 0xf5, 0x04, 0x0c, 0x14, 0x3c, 0x44, 0xcc, // 4
+   0x4f, 0xd1, 0x68, 0xb8, 0xd3, 0x6e, 0xb2, 0xcd,     
+   0x4c, 0xd4, 0x67, 0xa9, 0xe0, 0x3b, 0x4d, 0xd7, // 5
+   0x62, 0xa6, 0xf1, 0x08, 0x18, 0x28, 0x78, 0x88,     
+   0x83, 0x9e, 0xb9, 0xd0, 0x6b, 0xbd, 0xdc, 0x7f, // 6
+   0x81, 0x98, 0xb3, 0xce, 0x49, 0xdb, 0x76, 0x9a,     
+   0xb5, 0xc4, 0x57, 0xf9, 0x10, 0x30, 0x50, 0xf0, // 7
+   0x0b, 0x1d, 0x27, 0x69, 0xbb, 0xd6, 0x61, 0xa3,     
+   0xfe, 0x19, 0x2b, 0x7d, 0x87, 0x92, 0xad, 0xec, // 8
+   0x2f, 0x71, 0x93, 0xae, 0xe9, 0x20, 0x60, 0xa0,     
+   0xfb, 0x16, 0x3a, 0x4e, 0xd2, 0x6d, 0xb7, 0xc2, // 9
+   0x5d, 0xe7, 0x32, 0x56, 0xfa, 0x15, 0x3f, 0x41,     
+   0xc3, 0x5e, 0xe2, 0x3d, 0x47, 0xc9, 0x40, 0xc0, //10
+   0x5b, 0xed, 0x2c, 0x74, 0x9c, 0xbf, 0xda, 0x75,     
+   0x9f, 0xba, 0xd5, 0x64, 0xac, 0xef, 0x2a, 0x7e, //11
+   0x82, 0x9d, 0xbc, 0xdf, 0x7a, 0x8e, 0x89, 0x80,     
+   0x9b, 0xb6, 0xc1, 0x58, 0xe8, 0x23, 0x65, 0xaf, //12
+   0xea, 0x25, 0x6f, 0xb1, 0xc8, 0x43, 0xc5, 0x54,     
+   0xfc, 0x1f, 0x21, 0x63, 0xa5, 0xf4, 0x07, 0x09, //13
+   0x1b, 0x2d, 0x77, 0x99, 0xb0, 0xcb, 0x46, 0xca,     
+   0x45, 0xcf, 0x4a, 0xde, 0x79, 0x8b, 0x86, 0x91, //14
+   0xa8, 0xe3, 0x3e, 0x42, 0xc6, 0x51, 0xf3, 0x0e,     
+   0x12, 0x36, 0x5a, 0xee, 0x29, 0x7b, 0x8d, 0x8c, //15
+   0x8f, 0x8a, 0x85, 0x94, 0xa7, 0xf2, 0x0d, 0x17,     
+   0x39, 0x4b, 0xdd, 0x7c, 0x84, 0x97, 0xa2, 0xfd, //16
+   0x1c, 0x24, 0x6c, 0xb4, 0xc7, 0x52, 0xf6, 0x01};
+
+static inline UChar ff_mul(UChar u1, UChar u2)
+{
+   if ((u1 > 0) && (u2 > 0)) {
+      UInt ui = Nxy[u1] + Nxy[u2];
+      if (ui >= 255)
+         ui = ui - 255;
+      return Exy[ui];
+   } else {
+      return 0;
+   };
+}
+
+static void MixColumns (V128* v)
+{
+   V128 r;
+   Int j;
+#define P(x,row,col) (x)->w8[((row)*4+(col))]
+   for (j = 0; j < 4; j++) {
+      P(&r,j,0) = ff_mul(0x02, P(v,j,0)) ^ ff_mul(0x03, P(v,j,1)) 
+         ^ P(v,j,2) ^ P(v,j,3);
+      P(&r,j,1) = P(v,j,0) ^ ff_mul( 0x02, P(v,j,1) ) 
+         ^ ff_mul(0x03, P(v,j,2) ) ^ P(v,j,3);
+      P(&r,j,2) = P(v,j,0) ^ P(v,j,1) ^ ff_mul( 0x02, P(v,j,2) )
+         ^ ff_mul(0x03, P(v,j,3) );
+      P(&r,j,3) = ff_mul(0x03, P(v,j,0) ) ^ P(v,j,1) ^ P(v,j,2)
+         ^ ff_mul( 0x02, P(v,j,3) );
+   }
+   *v = r;
+#undef P
+}
+
+static void InvMixColumns (V128* v)
+{
+   V128 r;
+   Int j;
+#define P(x,row,col) (x)->w8[((row)*4+(col))]
+   for (j = 0; j < 4; j++) {
+      P(&r,j,0) = ff_mul(0x0e, P(v,j,0) ) ^ ff_mul(0x0b, P(v,j,1) )
+         ^ ff_mul(0x0d,P(v,j,2) ) ^ ff_mul(0x09, P(v,j,3) );
+      P(&r,j,1) = ff_mul(0x09, P(v,j,0) ) ^ ff_mul(0x0e, P(v,j,1) )
+         ^ ff_mul(0x0b,P(v,j,2) ) ^ ff_mul(0x0d, P(v,j,3) );
+      P(&r,j,2) = ff_mul(0x0d, P(v,j,0) ) ^ ff_mul(0x09, P(v,j,1) )
+         ^ ff_mul(0x0e,P(v,j,2) ) ^ ff_mul(0x0b, P(v,j,3) );
+      P(&r,j,3) = ff_mul(0x0b, P(v,j,0) ) ^ ff_mul(0x0d, P(v,j,1) )
+         ^ ff_mul(0x09,P(v,j,2) ) ^ ff_mul(0x0e, P(v,j,3) );
+   }
+   *v = r;
+#undef P
+
+}
+
+/* For description, see definition in guest_amd64_defs.h */
+void amd64g_dirtyhelper_AES ( 
+          VexGuestAMD64State* gst,
+          HWord opc4,
+          HWord gstOffL, HWord gstOffR
+       )
+{
+   // where the args are
+   V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
+   V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
+
+   switch (opc4) {
+      case 0xDC: /* AESENC */
+      case 0xDD: /* AESENCLAST */
+         ShiftRows (argR);
+         SubBytes  (argR);
+         if (opc4 == 0xDC)
+            MixColumns (argR);
+         argR->w64[0] = argR->w64[0] ^ argL->w64[0];
+         argR->w64[1] = argR->w64[1] ^ argL->w64[1];
+         break;
+
+      case 0xDE: /* AESDEC */
+      case 0xDF: /* AESDECLAST */
+         InvShiftRows (argR);
+         InvSubBytes (argR);
+         if (opc4 == 0xDE)
+            InvMixColumns (argR);
+         argR->w64[0] = argR->w64[0] ^ argL->w64[0];
+         argR->w64[1] = argR->w64[1] ^ argL->w64[1];
+         break;
+
+      case 0xDB: /* AESIMC */
+         *argR = *argL;
+         InvMixColumns (argR);
+         break;
+      default: vassert(0);
+   }
+}
+
+static inline UInt RotWord (UInt   w32)
+{
+   return ((w32 >> 8) | (w32 << 24));
+}
+
+static inline UInt SubWord (UInt   w32)
+{
+   UChar *w8;
+   UChar *r8;
+   UInt res;
+   w8 = (UChar*) &w32;
+   r8 = (UChar*) &res;
+   r8[0] = sbox[w8[0]];
+   r8[1] = sbox[w8[1]];
+   r8[2] = sbox[w8[2]];
+   r8[3] = sbox[w8[3]];
+   return res;
+}
+
+/* For description, see definition in guest_amd64_defs.h */
+extern void amd64g_dirtyhelper_AESKEYGENASSIST ( 
+          VexGuestAMD64State* gst,
+          HWord imm8,
+          HWord gstOffL, HWord gstOffR
+       )
+{
+   // where the args are
+   V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
+   V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
+
+   argR->w32[3] = RotWord (SubWord (argL->w32[3])) ^ imm8;
+   argR->w32[2] = SubWord (argL->w32[3]);
+   argR->w32[1] = RotWord (SubWord (argL->w32[1])) ^ imm8;
+   argR->w32[0] = SubWord (argL->w32[1]);
+}
+
+
 
 /*---------------------------------------------------------------*/
 /*--- Helpers for dealing with, and describing,               ---*/
diff --git a/priv/guest_amd64_toIR.c b/priv/guest_amd64_toIR.c
index 0e845b0..9002d05 100644
--- a/priv/guest_amd64_toIR.c
+++ b/priv/guest_amd64_toIR.c
@@ -14942,6 +14942,91 @@
       }
       break;
 
+   case 0xDC:
+   case 0xDD:
+   case 0xDE:
+   case 0xDF:
+   case 0xDB:
+      /* 66 0F 38 DC /r = AESENC xmm1, xmm2/m128
+                  DD /r = AESENCLAST xmm1, xmm2/m128
+                  DE /r = AESDEC xmm1, xmm2/m128
+                  DF /r = AESDECLAST xmm1, xmm2/m128
+
+                  DB /r = AESIMC xmm1, xmm2/m128 */
+      if (have66noF2noF3(pfx) && sz == 2) {
+         UInt  regNoL = 0;
+         UInt  regNoR = 0;
+
+         /* This is a nasty kludge.  We need to pass 2 x V128 to the
+            helper.  Since we can't do that, use a dirty
+            helper to compute the results directly from the XMM regs in
+            the guest state.  That means for the memory case, we need to
+            move the left operand into a pseudo-register (XMM16, let's
+            call it). */
+         modrm = getUChar(delta);
+         if (epartIsReg(modrm)) {
+            regNoL = eregOfRexRM(pfx, modrm);
+            regNoR = gregOfRexRM(pfx, modrm);
+            delta += 1;
+         } else {
+            regNoL = 16; /* use XMM16 as an intermediary */
+            regNoR = gregOfRexRM(pfx, modrm);
+            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
+            /* alignment check needed ???? */
+            stmt( IRStmt_Put( OFFB_XMM16, loadLE(Ity_V128, mkexpr(addr)) ));
+            delta += alen;
+         }
+
+         void*  fn = &amd64g_dirtyhelper_AES;
+         HChar* nm = "amd64g_dirtyhelper_AES";
+
+         /* Round up the arguments.  Note that this is a kludge -- the
+            use of mkU64 rather than mkIRExpr_HWord implies the
+            assumption that the host's word size is 64-bit. */
+         UInt gstOffL = regNoL == 16 ? OFFB_XMM16 : xmmGuestRegOffset(regNoL);
+         UInt gstOffR = xmmGuestRegOffset(regNoR);
+         IRExpr*  opc4         = mkU64(opc);
+         IRExpr*  gstOffLe     = mkU64(gstOffL);
+         IRExpr*  gstOffRe     = mkU64(gstOffR);
+         IRExpr** args
+            = mkIRExprVec_3( opc4, gstOffLe, gstOffRe );
+
+         IRDirty* d    = unsafeIRDirty_0_N( 0/*regparms*/, nm, fn, args );
+         /* It's not really a dirty call, but we can't use the clean
+            helper mechanism here for the very lame reason that we can't
+            pass 2 x V128s by value to a helper, nor get one back.  Hence
+            this roundabout scheme. */
+         d->needsBBP = True;
+         d->nFxState = 2;
+         /* AES{ENC,ENCLAST,DEC,DECLAST} read both registers, and writes
+            the second.
+            AESIMC (0xDB) reads the first register, and writes the second. */
+         d->fxState[0].fx     = Ifx_Read;
+         d->fxState[0].offset = gstOffL;
+         d->fxState[0].size   = sizeof(U128);
+         d->fxState[1].fx     = (opc == 0xDB ? Ifx_Write : Ifx_Modify);
+         d->fxState[1].offset = gstOffR;
+         d->fxState[1].size   = sizeof(U128);
+
+         stmt( IRStmt_Dirty(d) );
+         {
+            HChar* opsuf;
+            switch (opc) {
+               case 0xDC: opsuf = "enc"; break;
+               case 0xDD: opsuf = "enclast"; break;
+               case 0xDE: opsuf = "dec"; break;
+               case 0xDF: opsuf = "declast"; break;
+               case 0xDB: opsuf = "imc"; break;
+               default: vassert(0);
+            }
+            DIP("aes%s %s,%s\n", opsuf, 
+                (regNoL == 16 ? dis_buf : nameXMMReg(regNoL)),
+                nameXMMReg(regNoR));
+         }
+         goto decode_success;
+      }
+      break;
+
    case 0xF0:
    case 0xF1:
       /* F2 0F 38 F0 /r = CRC32 r/m8, r32 (REX.W ok, 66 not ok)
@@ -16179,6 +16264,69 @@
       }
       break;
 
+   case 0xDF:
+      /* 66 0F 3A DF /r ib = AESKEYGENASSIST imm8, xmm2/m128, xmm1 */
+      if (have66noF2noF3(pfx) && sz == 2) {
+         UInt  regNoL = 0;
+         UInt  regNoR = 0;
+         UChar imm    = 0;
+
+         /* This is a nasty kludge.  See AESENC et al. instructions. */
+         modrm = getUChar(delta);
+         if (epartIsReg(modrm)) {
+            regNoL = eregOfRexRM(pfx, modrm);
+            regNoR = gregOfRexRM(pfx, modrm);
+            imm = getUChar(delta+1);
+            delta += 1+1;
+         } else {
+            regNoL = 16; /* use XMM16 as an intermediary */
+            regNoR = gregOfRexRM(pfx, modrm);
+            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
+            /* alignment check ???? . */
+            stmt( IRStmt_Put( OFFB_XMM16, loadLE(Ity_V128, mkexpr(addr)) ));
+            imm = getUChar(delta+alen);
+            delta += alen+1;
+         }
+
+         /* Who ya gonna call?  Presumably not Ghostbusters. */
+         void*  fn = &amd64g_dirtyhelper_AESKEYGENASSIST;
+         HChar* nm = "amd64g_dirtyhelper_AESKEYGENASSIST";
+
+         /* Round up the arguments.  Note that this is a kludge -- the
+            use of mkU64 rather than mkIRExpr_HWord implies the
+            assumption that the host's word size is 64-bit. */
+         UInt gstOffL = regNoL == 16 ? OFFB_XMM16 : xmmGuestRegOffset(regNoL);
+         UInt gstOffR = xmmGuestRegOffset(regNoR);
+
+         IRExpr*  imme          = mkU64(imm & 0xFF);
+         IRExpr*  gstOffLe     = mkU64(gstOffL);
+         IRExpr*  gstOffRe     = mkU64(gstOffR);
+         IRExpr** args
+            = mkIRExprVec_3( imme, gstOffLe, gstOffRe );
+
+         IRDirty* d    = unsafeIRDirty_0_N( 0/*regparms*/, nm, fn, args );
+         /* It's not really a dirty call, but we can't use the clean
+            helper mechanism here for the very lame reason that we can't
+            pass 2 x V128s by value to a helper, nor get one back.  Hence
+            this roundabout scheme. */
+         d->needsBBP = True;
+         d->nFxState = 2;
+         d->fxState[0].fx     = Ifx_Read;
+         d->fxState[0].offset = gstOffL;
+         d->fxState[0].size   = sizeof(U128);
+         d->fxState[1].fx     = Ifx_Write;
+         d->fxState[1].offset = gstOffR;
+         d->fxState[1].size   = sizeof(U128);
+         stmt( IRStmt_Dirty(d) );
+
+         DIP("aeskeygenassist $%x,%s,%s\n", (UInt)imm,
+             (regNoL == 16 ? dis_buf : nameXMMReg(regNoL)),
+             nameXMMReg(regNoR));
+
+         goto decode_success;
+      }
+      break;
+
    default:
       break;