* fix Bug 290655 - Add support for AESKEYGENASSIST instruction
(VEX part)
Patch implementing the AES instructions (AESKEYGENASSIST, AESIMC,
AESENC, AESENCLAST, AESDEC, AESDECLAST).
git-svn-id: svn://svn.valgrind.org/vex/trunk@2247 8f6e269a-dfd6-0310-a8e1-e2731360e62c
diff --git a/priv/guest_amd64_defs.h b/priv/guest_amd64_defs.h
index 7d3ed34..55ecfe9 100644
--- a/priv/guest_amd64_defs.h
+++ b/priv/guest_amd64_defs.h
@@ -211,6 +211,55 @@
HWord edxIN, HWord eaxIN
);
+/* Implementation of intel AES instructions as described in
+ Intel Advanced Vector Extensions
+ Programming Reference
+ MARCH 2008
+ 319433-002.
+
+ CALLED FROM GENERATED CODE: DIRTY HELPER(s). (But not really,
+ actually it could be a clean helper, but for the fact that we can't
+ pass by value 2 x V128 to a clean helper, nor have one returned.)
+   Reads and writes the guest state only; makes no other memory
+   accesses and has no side effects beyond the guest state.
+
+ opc4 contains the 4th byte of opcode. Front-end should only
+ give opcode corresponding to AESENC/AESENCLAST/AESDEC/AESDECLAST/AESIMC.
+ (will assert otherwise).
+
+ gstOffL and gstOffR are the guest state offsets for the two XMM
+ register inputs and/or output. We never have to deal with the memory
+ case since that is handled by pre-loading the relevant value into the fake
+ XMM16 register.
+
+*/
+extern void amd64g_dirtyhelper_AES (
+ VexGuestAMD64State* gst,
+ HWord opc4,
+ HWord gstOffL, HWord gstOffR
+ );
+
+/* Implementation of AESKEYGENASSIST.
+
+ CALLED FROM GENERATED CODE: DIRTY HELPER(s). (But not really,
+ actually it could be a clean helper, but for the fact that we can't
+ pass by value 1 x V128 to a clean helper, nor have one returned.)
+   Reads and writes the guest state only; makes no other memory
+   accesses and has no side effects beyond the guest state.
+
+ imm8 is the Round Key constant.
+
+ gstOffL and gstOffR are the guest state offsets for the two XMM
+ register input and output. We never have to deal with the memory case since
+ that is handled by pre-loading the relevant value into the fake
+ XMM16 register.
+
+*/
+extern void amd64g_dirtyhelper_AESKEYGENASSIST (
+ VexGuestAMD64State* gst,
+ HWord imm8,
+ HWord gstOffL, HWord gstOffR
+ );
//extern void amd64g_dirtyhelper_CPUID_sse0 ( VexGuestAMD64State* );
//extern void amd64g_dirtyhelper_CPUID_sse1 ( VexGuestAMD64State* );
diff --git a/priv/guest_amd64_helpers.c b/priv/guest_amd64_helpers.c
index d554918..dd72b9c 100644
--- a/priv/guest_amd64_helpers.c
+++ b/priv/guest_amd64_helpers.c
@@ -2239,7 +2239,6 @@
dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16
xtpr pdcm sse4_1 sse4_2 popcnt aes lahf_lm ida
arat tpr_shadow vnmi flexpriority ept vpid
- MINUS aes (see below)
bogomips : 6957.57
clflush size : 64
cache_alignment : 64
@@ -2263,10 +2262,7 @@
SET_ABCD(0x0000000b, 0x756e6547, 0x6c65746e, 0x49656e69);
break;
case 0x00000001:
- // & ~(1<<25): don't claim to support AES insns. See
- // bug 249991.
- SET_ABCD(0x00020652, 0x00100800, 0x0298e3ff & ~(1<<25),
- 0xbfebfbff);
+ SET_ABCD(0x00020652, 0x00100800, 0x0298e3ff, 0xbfebfbff);
break;
case 0x00000002:
SET_ABCD(0x55035a01, 0x00f0b2e3, 0x00000000, 0x09ca212c);
@@ -2921,6 +2917,324 @@
}
}
/*---------------------------------------------------------------*/
/*--- AES primitives and helpers                              ---*/
/*---------------------------------------------------------------*/

/* The AES S-box (forward byte substitution table), laid out as a
   16 x 16 matrix and indexed directly by the byte value being
   substituted.  See FIPS-197, section 5.1.1. */
/* a 16 x 16 matrix */
static const UChar sbox[256] = {                   // row nr
   0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, // 1
   0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
   0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, // 2
   0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
   0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, // 3
   0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
   0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, // 4
   0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
   0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, // 5
   0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
   0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, // 6
   0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
   0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, // 7
   0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
   0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, // 8
   0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
   0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, // 9
   0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
   0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, //10
   0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
   0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, //11
   0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
   0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, //12
   0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
   0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, //13
   0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
   0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, //14
   0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
   0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, //15
   0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
   0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, //16
   0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
};

/* SubBytes transformation: replace each of the 16 bytes of *v with
   its S-box image, in place.  A temporary is used so the update is
   atomic with respect to *v. */
static void SubBytes (V128* v)
{
   V128 r;
   UInt i;
   for (i = 0; i < 16; i++)
      r.w8[i] = sbox[v->w8[i]];
   *v = r;
}
+
/* The AES inverse S-box (inverse byte substitution table), the
   functional inverse of sbox[] above: invsbox[sbox[x]] == x for all
   byte values x.  See FIPS-197, section 5.3.2. */
/* a 16 x 16 matrix */
static const UChar invsbox[256] = {                // row nr
   0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, // 1
   0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
   0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, // 2
   0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
   0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, // 3
   0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
   0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, // 4
   0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
   0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, // 5
   0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
   0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, // 6
   0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
   0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, // 7
   0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
   0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, // 8
   0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
   0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, // 9
   0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
   0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, //10
   0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
   0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, //11
   0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
   0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, //12
   0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
   0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, //13
   0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
   0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, //14
   0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
   0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, //15
   0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
   0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, //16
   0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
};

/* InvSubBytes transformation: replace each of the 16 bytes of *v
   with its inverse-S-box image, in place.  Exact inverse of
   SubBytes above. */
static void InvSubBytes (V128* v)
{
   V128 r;
   UInt i;
   for (i = 0; i < 16; i++)
      r.w8[i] = invsbox[v->w8[i]];
   *v = r;
}
+
/* Byte permutation implementing the AES ShiftRows step (FIPS-197
   section 5.1.2): row r of the state is rotated left by r byte
   positions.  NB: the table is consulted with index 15-i below, so
   entry k gives the source byte for destination byte 15-k. */
static const UChar ShiftRows_op[16] =
   {11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0};
static void ShiftRows (V128* v)
{
   V128 r;
   UInt i;
   for (i = 0; i < 16; i++)
      /* reversed indexing: see comment on ShiftRows_op above */
      r.w8[i] = v->w8[ShiftRows_op[15-i]];
   *v = r;
}

/* Byte permutation implementing the AES InvShiftRows step (FIPS-197
   section 5.3.1), the exact inverse of ShiftRows: row r is rotated
   right by r byte positions.  Same reversed [15-i] indexing
   convention as ShiftRows_op. */
static const UChar InvShiftRows_op[16] =
   {3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0};
static void InvShiftRows (V128* v)
{
   V128 r;
   UInt i;
   for (i = 0; i < 16; i++)
      r.w8[i] = v->w8[InvShiftRows_op[15-i]];
   *v = r;
}
+
/* Multiplication of the finite fields elements of AES.
   See "A Specification for The AES Algorithm Rijndael
   (by Joan Daemen & Vincent Rijmen)"
   Dr. Brian Gladman, v3.1, 3rd March 2001.

   Multiplication in GF(2^8) is done via discrete logarithms to the
   base 0x03 (a generator of the multiplicative group): multiply two
   nonzero elements by adding their logs mod 255 and taking the
   antilog. */

/* N values so that (hex) xy = 0x03^N.
   0x00 has no logarithm; its entry is set to 0xff and must never be
   consulted (ff_mul special-cases zero operands). */
/* a 16 x 16 matrix */
static const UChar Nxy[256] = {                    // row nr
   0xff, 0x00, 0x19, 0x01, 0x32, 0x02, 0x1a, 0xc6, // 1
   0x4b, 0xc7, 0x1b, 0x68, 0x33, 0xee, 0xdf, 0x03,
   0x64, 0x04, 0xe0, 0x0e, 0x34, 0x8d, 0x81, 0xef, // 2
   0x4c, 0x71, 0x08, 0xc8, 0xf8, 0x69, 0x1c, 0xc1,
   0x7d, 0xc2, 0x1d, 0xb5, 0xf9, 0xb9, 0x27, 0x6a, // 3
   0x4d, 0xe4, 0xa6, 0x72, 0x9a, 0xc9, 0x09, 0x78,
   0x65, 0x2f, 0x8a, 0x05, 0x21, 0x0f, 0xe1, 0x24, // 4
   0x12, 0xf0, 0x82, 0x45, 0x35, 0x93, 0xda, 0x8e,
   0x96, 0x8f, 0xdb, 0xbd, 0x36, 0xd0, 0xce, 0x94, // 5
   0x13, 0x5c, 0xd2, 0xf1, 0x40, 0x46, 0x83, 0x38,
   0x66, 0xdd, 0xfd, 0x30, 0xbf, 0x06, 0x8b, 0x62, // 6
   0xb3, 0x25, 0xe2, 0x98, 0x22, 0x88, 0x91, 0x10,
   0x7e, 0x6e, 0x48, 0xc3, 0xa3, 0xb6, 0x1e, 0x42, // 7
   0x3a, 0x6b, 0x28, 0x54, 0xfa, 0x85, 0x3d, 0xba,
   0x2b, 0x79, 0x0a, 0x15, 0x9b, 0x9f, 0x5e, 0xca, // 8
   0x4e, 0xd4, 0xac, 0xe5, 0xf3, 0x73, 0xa7, 0x57,
   0xaf, 0x58, 0xa8, 0x50, 0xf4, 0xea, 0xd6, 0x74, // 9
   0x4f, 0xae, 0xe9, 0xd5, 0xe7, 0xe6, 0xad, 0xe8,
   0x2c, 0xd7, 0x75, 0x7a, 0xeb, 0x16, 0x0b, 0xf5, //10
   0x59, 0xcb, 0x5f, 0xb0, 0x9c, 0xa9, 0x51, 0xa0,
   0x7f, 0x0c, 0xf6, 0x6f, 0x17, 0xc4, 0x49, 0xec, //11
   0xd8, 0x43, 0x1f, 0x2d, 0xa4, 0x76, 0x7b, 0xb7,
   0xcc, 0xbb, 0x3e, 0x5a, 0xfb, 0x60, 0xb1, 0x86, //12
   0x3b, 0x52, 0xa1, 0x6c, 0xaa, 0x55, 0x29, 0x9d,
   0x97, 0xb2, 0x87, 0x90, 0x61, 0xbe, 0xdc, 0xfc, //13
   0xbc, 0x95, 0xcf, 0xcd, 0x37, 0x3f, 0x5b, 0xd1,
   0x53, 0x39, 0x84, 0x3c, 0x41, 0xa2, 0x6d, 0x47, //14
   0x14, 0x2a, 0x9e, 0x5d, 0x56, 0xf2, 0xd3, 0xab,
   0x44, 0x11, 0x92, 0xd9, 0x23, 0x20, 0x2e, 0x89, //15
   0xb4, 0x7c, 0xb8, 0x26, 0x77, 0x99, 0xe3, 0xa5,
   0x67, 0x4a, 0xed, 0xde, 0xc5, 0x31, 0xfe, 0x18, //16
   0x0d, 0x63, 0x8c, 0x80, 0xc0, 0xf7, 0x70, 0x07
};

/* E values so that E = 0x03^xy: the antilog (exponential) table,
   inverse of Nxy above. */
static const UChar Exy[256] = {                    // row nr
   0x01, 0x03, 0x05, 0x0f, 0x11, 0x33, 0x55, 0xff, // 1
   0x1a, 0x2e, 0x72, 0x96, 0xa1, 0xf8, 0x13, 0x35,
   0x5f, 0xe1, 0x38, 0x48, 0xd8, 0x73, 0x95, 0xa4, // 2
   0xf7, 0x02, 0x06, 0x0a, 0x1e, 0x22, 0x66, 0xaa,
   0xe5, 0x34, 0x5c, 0xe4, 0x37, 0x59, 0xeb, 0x26, // 3
   0x6a, 0xbe, 0xd9, 0x70, 0x90, 0xab, 0xe6, 0x31,
   0x53, 0xf5, 0x04, 0x0c, 0x14, 0x3c, 0x44, 0xcc, // 4
   0x4f, 0xd1, 0x68, 0xb8, 0xd3, 0x6e, 0xb2, 0xcd,
   0x4c, 0xd4, 0x67, 0xa9, 0xe0, 0x3b, 0x4d, 0xd7, // 5
   0x62, 0xa6, 0xf1, 0x08, 0x18, 0x28, 0x78, 0x88,
   0x83, 0x9e, 0xb9, 0xd0, 0x6b, 0xbd, 0xdc, 0x7f, // 6
   0x81, 0x98, 0xb3, 0xce, 0x49, 0xdb, 0x76, 0x9a,
   0xb5, 0xc4, 0x57, 0xf9, 0x10, 0x30, 0x50, 0xf0, // 7
   0x0b, 0x1d, 0x27, 0x69, 0xbb, 0xd6, 0x61, 0xa3,
   0xfe, 0x19, 0x2b, 0x7d, 0x87, 0x92, 0xad, 0xec, // 8
   0x2f, 0x71, 0x93, 0xae, 0xe9, 0x20, 0x60, 0xa0,
   0xfb, 0x16, 0x3a, 0x4e, 0xd2, 0x6d, 0xb7, 0xc2, // 9
   0x5d, 0xe7, 0x32, 0x56, 0xfa, 0x15, 0x3f, 0x41,
   0xc3, 0x5e, 0xe2, 0x3d, 0x47, 0xc9, 0x40, 0xc0, //10
   0x5b, 0xed, 0x2c, 0x74, 0x9c, 0xbf, 0xda, 0x75,
   0x9f, 0xba, 0xd5, 0x64, 0xac, 0xef, 0x2a, 0x7e, //11
   0x82, 0x9d, 0xbc, 0xdf, 0x7a, 0x8e, 0x89, 0x80,
   0x9b, 0xb6, 0xc1, 0x58, 0xe8, 0x23, 0x65, 0xaf, //12
   0xea, 0x25, 0x6f, 0xb1, 0xc8, 0x43, 0xc5, 0x54,
   0xfc, 0x1f, 0x21, 0x63, 0xa5, 0xf4, 0x07, 0x09, //13
   0x1b, 0x2d, 0x77, 0x99, 0xb0, 0xcb, 0x46, 0xca,
   0x45, 0xcf, 0x4a, 0xde, 0x79, 0x8b, 0x86, 0x91, //14
   0xa8, 0xe3, 0x3e, 0x42, 0xc6, 0x51, 0xf3, 0x0e,
   0x12, 0x36, 0x5a, 0xee, 0x29, 0x7b, 0x8d, 0x8c, //15
   0x8f, 0x8a, 0x85, 0x94, 0xa7, 0xf2, 0x0d, 0x17,
   0x39, 0x4b, 0xdd, 0x7c, 0x84, 0x97, 0xa2, 0xfd, //16
   0x1c, 0x24, 0x6c, 0xb4, 0xc7, 0x52, 0xf6, 0x01
};

/* Multiply u1 by u2 in GF(2^8): zero times anything is zero;
   otherwise add the discrete logs modulo 255 and take the antilog.
   The sum of two log values is at most 2 * 0xfe = 508, so a single
   conditional subtraction of 255 suffices. */
static inline UChar ff_mul (UChar u1, UChar u2)
{
   UInt ui;
   if (u1 == 0 || u2 == 0)
      return 0;
   ui = Nxy[u1] + Nxy[u2];
   if (ui >= 255)
      ui -= 255;
   return Exy[ui];
}
+
/* MixColumns transformation (FIPS-197 section 5.1.3): each 4-byte
   group of the state is multiplied by the fixed polynomial
   {03}x^3 + {01}x^2 + {01}x + {02} over GF(2^8), in place.
   P(x,row,col) addresses byte row*4+col of the V128. */
static void MixColumns (V128* v)
{
   V128 r;
   Int j;
#define P(x,row,col) (x)->w8[((row)*4+(col))]
   for (j = 0; j < 4; j++) {
      P(&r,j,0) = ff_mul(0x02, P(v,j,0)) ^ ff_mul(0x03, P(v,j,1))
                  ^ P(v,j,2) ^ P(v,j,3);
      P(&r,j,1) = P(v,j,0) ^ ff_mul( 0x02, P(v,j,1) )
                  ^ ff_mul(0x03, P(v,j,2) ) ^ P(v,j,3);
      P(&r,j,2) = P(v,j,0) ^ P(v,j,1) ^ ff_mul( 0x02, P(v,j,2) )
                  ^ ff_mul(0x03, P(v,j,3) );
      P(&r,j,3) = ff_mul(0x03, P(v,j,0) ) ^ P(v,j,1) ^ P(v,j,2)
                  ^ ff_mul( 0x02, P(v,j,3) );
   }
   *v = r;
#undef P
}

/* InvMixColumns transformation (FIPS-197 section 5.3.3): exact
   inverse of MixColumns; each 4-byte group is multiplied by
   {0b}x^3 + {0d}x^2 + {09}x + {0e} over GF(2^8), in place. */
static void InvMixColumns (V128* v)
{
   V128 r;
   Int j;
#define P(x,row,col) (x)->w8[((row)*4+(col))]
   for (j = 0; j < 4; j++) {
      P(&r,j,0) = ff_mul(0x0e, P(v,j,0) ) ^ ff_mul(0x0b, P(v,j,1) )
                  ^ ff_mul(0x0d,P(v,j,2) ) ^ ff_mul(0x09, P(v,j,3) );
      P(&r,j,1) = ff_mul(0x09, P(v,j,0) ) ^ ff_mul(0x0e, P(v,j,1) )
                  ^ ff_mul(0x0b,P(v,j,2) ) ^ ff_mul(0x0d, P(v,j,3) );
      P(&r,j,2) = ff_mul(0x0d, P(v,j,0) ) ^ ff_mul(0x09, P(v,j,1) )
                  ^ ff_mul(0x0e,P(v,j,2) ) ^ ff_mul(0x0b, P(v,j,3) );
      P(&r,j,3) = ff_mul(0x0b, P(v,j,0) ) ^ ff_mul(0x0d, P(v,j,1) )
                  ^ ff_mul(0x09,P(v,j,2) ) ^ ff_mul(0x0e, P(v,j,3) );
   }
   *v = r;
#undef P

}
+
/* For description, see definition in guest_amd64_defs.h */
/* Dirty helper implementing AESENC/AESENCLAST/AESDEC/AESDECLAST/
   AESIMC.  opc4 selects the operation; gstOffL/gstOffR locate the
   two XMM operands inside the guest state (argR is also the
   destination).  Asserts on any other opc4 value. */
void amd64g_dirtyhelper_AES (
         VexGuestAMD64State* gst,
         HWord opc4,
         HWord gstOffL, HWord gstOffR
      )
{
   // where the args are
   V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
   V128* argR = (V128*)( ((UChar*)gst) + gstOffR );

   switch (opc4) {
      /* AESENC does one full encryption round on argR (state) with
         argL as the round key; AESENCLAST is the final round, which
         omits MixColumns. */
      case 0xDC: /* AESENC */
      case 0xDD: /* AESENCLAST */
         ShiftRows (argR);
         SubBytes  (argR);
         if (opc4 == 0xDC)
            MixColumns (argR);
         argR->w64[0] = argR->w64[0] ^ argL->w64[0];
         argR->w64[1] = argR->w64[1] ^ argL->w64[1];
         break;

      /* AESDEC/AESDECLAST: one (final) decryption round, using the
         inverse transformations; AESDECLAST omits InvMixColumns. */
      case 0xDE: /* AESDEC */
      case 0xDF: /* AESDECLAST */
         InvShiftRows (argR);
         InvSubBytes (argR);
         if (opc4 == 0xDE)
            InvMixColumns (argR);
         argR->w64[0] = argR->w64[0] ^ argL->w64[0];
         argR->w64[1] = argR->w64[1] ^ argL->w64[1];
         break;

      /* AESIMC: InvMixColumns of argL into argR (key schedule
         transformation for the Equivalent Inverse Cipher). */
      case 0xDB: /* AESIMC */
         *argR = *argL;
         InvMixColumns (argR);
         break;
      default: vassert(0);
   }
}
+
/* AES RotWord: rotate a 32-bit word right by one byte position. */
static inline UInt RotWord (UInt w32)
{
   UInt lo24 = w32 >> 8;
   UInt hi8  = w32 << 24;
   return hi8 | lo24;
}
+
+static inline UInt SubWord (UInt w32)
+{
+ UChar *w8;
+ UChar *r8;
+ UInt res;
+ w8 = (UChar*) &w32;
+ r8 = (UChar*) &res;
+ r8[0] = sbox[w8[0]];
+ r8[1] = sbox[w8[1]];
+ r8[2] = sbox[w8[2]];
+ r8[3] = sbox[w8[3]];
+ return res;
+}
+
+/* For description, see definition in guest_amd64_defs.h */
+extern void amd64g_dirtyhelper_AESKEYGENASSIST (
+ VexGuestAMD64State* gst,
+ HWord imm8,
+ HWord gstOffL, HWord gstOffR
+ )
+{
+ // where the args are
+ V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
+ V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
+
+ argR->w32[3] = RotWord (SubWord (argL->w32[3])) ^ imm8;
+ argR->w32[2] = SubWord (argL->w32[3]);
+ argR->w32[1] = RotWord (SubWord (argL->w32[1])) ^ imm8;
+ argR->w32[0] = SubWord (argL->w32[1]);
+}
+
+
/*---------------------------------------------------------------*/
/*--- Helpers for dealing with, and describing, ---*/
diff --git a/priv/guest_amd64_toIR.c b/priv/guest_amd64_toIR.c
index 0e845b0..9002d05 100644
--- a/priv/guest_amd64_toIR.c
+++ b/priv/guest_amd64_toIR.c
@@ -14942,6 +14942,91 @@
}
break;
+ case 0xDC:
+ case 0xDD:
+ case 0xDE:
+ case 0xDF:
+ case 0xDB:
+ /* 66 0F 38 DC /r = AESENC xmm1, xmm2/m128
+ DD /r = AESENCLAST xmm1, xmm2/m128
+ DE /r = AESDEC xmm1, xmm2/m128
+ DF /r = AESDECLAST xmm1, xmm2/m128
+
+ DB /r = AESIMC xmm1, xmm2/m128 */
+ if (have66noF2noF3(pfx) && sz == 2) {
+ UInt regNoL = 0;
+ UInt regNoR = 0;
+
+ /* This is a nasty kludge. We need to pass 2 x V128 to the
+ helper. Since we can't do that, use a dirty
+ helper to compute the results directly from the XMM regs in
+ the guest state. That means for the memory case, we need to
+ move the left operand into a pseudo-register (XMM16, let's
+ call it). */
+ modrm = getUChar(delta);
+ if (epartIsReg(modrm)) {
+ regNoL = eregOfRexRM(pfx, modrm);
+ regNoR = gregOfRexRM(pfx, modrm);
+ delta += 1;
+ } else {
+ regNoL = 16; /* use XMM16 as an intermediary */
+ regNoR = gregOfRexRM(pfx, modrm);
+ addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
+ /* alignment check needed ???? */
+ stmt( IRStmt_Put( OFFB_XMM16, loadLE(Ity_V128, mkexpr(addr)) ));
+ delta += alen;
+ }
+
+ void* fn = &amd64g_dirtyhelper_AES;
+ HChar* nm = "amd64g_dirtyhelper_AES";
+
+ /* Round up the arguments. Note that this is a kludge -- the
+ use of mkU64 rather than mkIRExpr_HWord implies the
+ assumption that the host's word size is 64-bit. */
+ UInt gstOffL = regNoL == 16 ? OFFB_XMM16 : xmmGuestRegOffset(regNoL);
+ UInt gstOffR = xmmGuestRegOffset(regNoR);
+ IRExpr* opc4 = mkU64(opc);
+ IRExpr* gstOffLe = mkU64(gstOffL);
+ IRExpr* gstOffRe = mkU64(gstOffR);
+ IRExpr** args
+ = mkIRExprVec_3( opc4, gstOffLe, gstOffRe );
+
+ IRDirty* d = unsafeIRDirty_0_N( 0/*regparms*/, nm, fn, args );
+ /* It's not really a dirty call, but we can't use the clean
+ helper mechanism here for the very lame reason that we can't
+ pass 2 x V128s by value to a helper, nor get one back. Hence
+ this roundabout scheme. */
+ d->needsBBP = True;
+ d->nFxState = 2;
+ /* AES{ENC,ENCLAST,DEC,DECLAST} read both registers, and writes
+ the second.
+ AESIMC (0xDB) reads the first register, and writes the second. */
+ d->fxState[0].fx = Ifx_Read;
+ d->fxState[0].offset = gstOffL;
+ d->fxState[0].size = sizeof(U128);
+ d->fxState[1].fx = (opc == 0xDB ? Ifx_Write : Ifx_Modify);
+ d->fxState[1].offset = gstOffR;
+ d->fxState[1].size = sizeof(U128);
+
+ stmt( IRStmt_Dirty(d) );
+ {
+ HChar* opsuf;
+ switch (opc) {
+ case 0xDC: opsuf = "enc"; break;
+ case 0XDD: opsuf = "enclast"; break;
+ case 0xDE: opsuf = "dec"; break;
+ case 0xDF: opsuf = "declast"; break;
+ case 0xDB: opsuf = "imc"; break;
+ default: vassert(0);
+ }
+ DIP("aes%s %s,%s\n", opsuf,
+ (regNoL == 16 ? dis_buf : nameXMMReg(regNoL)),
+ nameXMMReg(regNoR));
+ }
+ goto decode_success;
+ }
+ break;
+
case 0xF0:
case 0xF1:
/* F2 0F 38 F0 /r = CRC32 r/m8, r32 (REX.W ok, 66 not ok)
@@ -16179,6 +16264,69 @@
}
break;
      case 0xdf:
         /* 66 0F 3A DF /r ib = AESKEYGENASSIST imm8, xmm2/m128, xmm1 */
         if (have66noF2noF3(pfx) && sz == 2) {
            UInt  regNoL = 0;
            UInt  regNoR = 0;
            UChar imm    = 0;

            /* This is a nasty kludge.  See AESENC et al. instructions. */
            modrm = getUChar(delta);
            if (epartIsReg(modrm)) {
               regNoL = eregOfRexRM(pfx, modrm);
               regNoR = gregOfRexRM(pfx, modrm);
               /* the imm8 byte follows the modrm byte */
               imm = getUChar(delta+1);
               delta += 1+1;
            } else {
               regNoL = 16; /* use XMM16 as an intermediary */
               regNoR = gregOfRexRM(pfx, modrm);
               addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
               /* alignment check ???? . */
               stmt( IRStmt_Put( OFFB_XMM16, loadLE(Ity_V128, mkexpr(addr)) ));
               /* the imm8 byte follows the addressing-mode bytes */
               imm = getUChar(delta+alen);
               delta += alen+1;
            }

            /* Who ya gonna call?  Presumably not Ghostbusters. */
            void*  fn = &amd64g_dirtyhelper_AESKEYGENASSIST;
            HChar* nm = "amd64g_dirtyhelper_AESKEYGENASSIST";

            /* Round up the arguments.  Note that this is a kludge -- the
               use of mkU64 rather than mkIRExpr_HWord implies the
               assumption that the host's word size is 64-bit. */
            UInt gstOffL = regNoL == 16 ? OFFB_XMM16 : xmmGuestRegOffset(regNoL);
            UInt gstOffR = xmmGuestRegOffset(regNoR);

            IRExpr* imme     = mkU64(imm & 0xFF);
            IRExpr* gstOffLe = mkU64(gstOffL);
            IRExpr* gstOffRe = mkU64(gstOffR);
            IRExpr** args
               = mkIRExprVec_3( imme, gstOffLe, gstOffRe );

            IRDirty* d = unsafeIRDirty_0_N( 0/*regparms*/, nm, fn, args );
            /* It's not really a dirty call, but we can't use the clean
               helper mechanism here for the very lame reason that we can't
               pass 2 x V128s by value to a helper, nor get one back.  Hence
               this roundabout scheme. */
            d->needsBBP = True;
            /* The helper reads the L slot and overwrites the R slot. */
            d->nFxState = 2;
            d->fxState[0].fx     = Ifx_Read;
            d->fxState[0].offset = gstOffL;
            d->fxState[0].size   = sizeof(U128);
            d->fxState[1].fx     = Ifx_Write;
            d->fxState[1].offset = gstOffR;
            d->fxState[1].size   = sizeof(U128);
            stmt( IRStmt_Dirty(d) );

            DIP("aeskeygenassist $%x,%s,%s\n", (UInt)imm,
                (regNoL == 16 ? dis_buf : nameXMMReg(regNoL)),
                nameXMMReg(regNoR));

            goto decode_success;
         }
         break;
+
default:
break;