8073108: Use x86 and SPARC CPU instructions for GHASH acceleration
Reviewed-by: kvn, jrose, phh
diff --git a/src/cpu/ppc/vm/vm_version_ppc.cpp b/src/cpu/ppc/vm/vm_version_ppc.cpp
index bbc5269..3c59c96 100644
--- a/src/cpu/ppc/vm/vm_version_ppc.cpp
+++ b/src/cpu/ppc/vm/vm_version_ppc.cpp
@@ -194,6 +194,11 @@
FLAG_SET_DEFAULT(UseAESIntrinsics, false);
}
+ if (UseGHASHIntrinsics) {
+ warning("GHASH intrinsics are not available on this CPU");
+ FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
+ }
+
if (has_vshasig()) {
if (FLAG_IS_DEFAULT(UseSHA)) {
UseSHA = true;
diff --git a/src/cpu/sparc/vm/assembler_sparc.hpp b/src/cpu/sparc/vm/assembler_sparc.hpp
index dd83b09..55f3387 100644
--- a/src/cpu/sparc/vm/assembler_sparc.hpp
+++ b/src/cpu/sparc/vm/assembler_sparc.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -129,6 +129,7 @@
flog3_op3 = 0x36,
edge_op3 = 0x36,
fsrc_op3 = 0x36,
+ xmulx_op3 = 0x36,
impdep2_op3 = 0x37,
stpartialf_op3 = 0x37,
jmpl_op3 = 0x38,
@@ -220,6 +221,8 @@
mdtox_opf = 0x110,
mstouw_opf = 0x111,
mstosw_opf = 0x113,
+ xmulx_opf = 0x115,
+ xmulxhi_opf = 0x116,
mxtod_opf = 0x118,
mwtos_opf = 0x119,
@@ -1212,6 +1215,9 @@
void movwtos( Register s, FloatRegister d ) { vis3_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::S) | op3(mftoi_op3) | opf(mwtos_opf) | rs2(s)); }
void movxtod( Register s, FloatRegister d ) { vis3_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(mftoi_op3) | opf(mxtod_opf) | rs2(s)); }
+ void xmulx(Register s1, Register s2, Register d) { vis3_only(); emit_int32( op(arith_op) | rd(d) | op3(xmulx_op3) | rs1(s1) | opf(xmulx_opf) | rs2(s2)); } // VIS3 XOR (carry-less) multiply: d = low 64 bits of the 128-bit product s1 x s2
+ void xmulxhi(Register s1, Register s2, Register d) { vis3_only(); emit_int32( op(arith_op) | rd(d) | op3(xmulx_op3) | rs1(s1) | opf(xmulxhi_opf) | rs2(s2)); } // VIS3 XOR (carry-less) multiply: d = high 64 bits of the 128-bit product s1 x s2
+
// Crypto SHA instructions
void sha1() { sha1_only(); emit_int32( op(arith_op) | op3(sha_op3) | opf(sha1_opf)); }
diff --git a/src/cpu/sparc/vm/stubGenerator_sparc.cpp b/src/cpu/sparc/vm/stubGenerator_sparc.cpp
index 6ad0b1a..5106492 100644
--- a/src/cpu/sparc/vm/stubGenerator_sparc.cpp
+++ b/src/cpu/sparc/vm/stubGenerator_sparc.cpp
@@ -4788,6 +4788,130 @@
return start;
}
+ /* GHASH stub for SPARC (uses xmulx/xmulxhi): for each 16-byte block of 'data', xors the block into the 128-bit 'state' and multiplies by 'subkeyH'; 'len' counts blocks. */
+ address generate_ghash_processBlocks() {
+ __ align(CodeEntryAlignment);
+ Label L_ghash_loop, L_aligned, L_main;
+ StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
+ address start = __ pc();
+
+ Register state = I0; // 16 bytes, read below as two 64-bit words and written back at the end
+ Register subkeyH = I1; // 16 bytes, reloaded as two 64-bit words on every iteration
+ Register data = I2; // input bytes; any alignment is accepted (see the unaligned path below)
+ Register len = I3; // number of 16-byte blocks: decremented once per loop iteration
+
+ __ save_frame(0);
+
+ __ ldx(state, 0, O0); // O0:O1 = current state
+ __ ldx(state, 8, O1);
+
+ // Loop label for multiblock operations; on entry O0:O1 hold the running state
+ __ BIND(L_ghash_loop);
+
+ // Check if 'data' is unaligned (G1 = data & 7, the misalignment in bytes)
+ __ andcc(data, 7, G1);
+ __ br(Assembler::zero, false, Assembler::pt, L_aligned);
+ __ delayed()->nop();
+
+ Register left_shift = L1;
+ Register right_shift = L2;
+ Register data_ptr = L3;
+
+ // Get left and right shift values in bits: left = misalignment * 8, right = 64 - left
+ __ sll(G1, LogBitsPerByte, left_shift);
+ __ mov(64, right_shift);
+ __ sub(right_shift, left_shift, right_shift);
+
+ // Align to read 'data': data_ptr = data rounded down to an 8-byte boundary
+ __ sub(data, G1, data_ptr);
+
+ // Load first 8 bytes of 'data' by merging the aligned words at data_ptr+0 and data_ptr+8
+ __ ldx(data_ptr, 0, O4);
+ __ sllx(O4, left_shift, O4);
+ __ ldx(data_ptr, 8, O5);
+ __ srlx(O5, right_shift, G4);
+ __ bset(G4, O4);
+
+ // Load second 8 bytes of 'data' by merging the aligned words at data_ptr+8 and data_ptr+16
+ __ sllx(O5, left_shift, O5);
+ __ ldx(data_ptr, 16, G4);
+ __ srlx(G4, right_shift, G4);
+ __ ba(L_main);
+ __ delayed()->bset(G4, O5); // delay slot: finish the merge into O5 before reaching L_main
+
+ // If 'data' is aligned, load normally
+ __ BIND(L_aligned);
+ __ ldx(data, 0, O4);
+ __ ldx(data, 8, O5);
+
+ __ BIND(L_main); // here O4:O5 = current data block
+ __ ldx(subkeyH, 0, O2); // O2:O3 = subkeyH; reloaded each iteration because O2/O3 are reused as temporaries below
+ __ ldx(subkeyH, 8, O3);
+
+ __ xor3(O0, O4, O0); // state ^= data block
+ __ xor3(O1, O5, O1);
+
+ __ xmulxhi(O0, O3, G3); // eight XOR-multiplies: high and low halves of O0*O2, O0*O3, O1*O2 and O1*O3
+ __ xmulx(O0, O2, O5);
+ __ xmulxhi(O1, O2, G4);
+ __ xmulxhi(O1, O3, G5);
+ __ xmulx(O0, O3, G1);
+ __ xmulx(O1, O3, G2);
+ __ xmulx(O1, O2, O3); // overwrites the low subkeyH word; O3 is not read as subkeyH after this point
+ __ xmulxhi(O0, O2, O4);
+
+ __ mov(0xE1, O0); // O0 = 0xE1 << 56, the multiplier used by the four xmulx/xmulxhi ops further down
+ __ sllx(O0, 56, O0);
+
+ __ xor3(O5, G3, O5);
+ __ xor3(O5, G4, O5);
+ __ xor3(G5, G1, G1);
+ __ xor3(G1, O3, G1);
+ __ srlx(G2, 63, O1);
+ __ srlx(G1, 63, G3);
+ __ sllx(G2, 63, O3);
+ __ sllx(G2, 58, O2);
+ __ xor3(O3, O2, O2);
+
+ __ sllx(G1, 1, G1);
+ __ or3(G1, O1, G1);
+
+ __ xor3(G1, O2, G1);
+
+ __ sllx(G2, 1, G2);
+
+ __ xmulxhi(G1, O0, O1);
+ __ xmulx(G1, O0, O2);
+ __ xmulxhi(G2, O0, O3);
+ __ xmulx(G2, O0, G1);
+
+ __ xor3(O4, O1, O4);
+ __ xor3(O5, O2, O5);
+ __ xor3(O5, O3, O5);
+
+ __ sllx(O4, 1, O2);
+ __ srlx(O5, 63, O3);
+
+ __ or3(O2, O3, O0); // O0 = first word of the new state
+
+ __ sllx(O5, 1, O1);
+ __ srlx(G1, 63, O2);
+ __ or3(O1, O2, O1);
+ __ xor3(O1, G3, O1); // O1 = second word of the new state
+
+ __ deccc(len); // one block consumed
+ __ br(Assembler::notZero, true, Assembler::pt, L_ghash_loop);
+ __ delayed()->add(data, 16, data); // annulled delay slot: advance 'data' only when looping again
+
+ __ stx(O0, I0, 0); // write the final state back (I0 is 'state')
+ __ stx(O1, I0, 8);
+
+ __ ret();
+ __ delayed()->restore();
+
+ return start;
+ }
+
void generate_initial() {
// Generates all stubs and initializes the entry points
@@ -4860,6 +4984,10 @@
StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
}
+ // generate GHASH intrinsics code
+ if (UseGHASHIntrinsics) {
+ StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
+ }
// generate SHA1/SHA256/SHA512 intrinsics code
if (UseSHA1Intrinsics) {
diff --git a/src/cpu/sparc/vm/vm_version_sparc.cpp b/src/cpu/sparc/vm/vm_version_sparc.cpp
index c0cd16a..793dc18 100644
--- a/src/cpu/sparc/vm/vm_version_sparc.cpp
+++ b/src/cpu/sparc/vm/vm_version_sparc.cpp
@@ -319,6 +319,17 @@
}
}
+ // GHASH/GCM intrinsics: enabled by default when VIS3 is present and allowed (UseVIS > 2)
+ if (has_vis3() && (UseVIS > 2)) {
+ if (FLAG_IS_DEFAULT(UseGHASHIntrinsics)) {
+ UseGHASHIntrinsics = true;
+ }
+ } else if (UseGHASHIntrinsics) {
+ if (!FLAG_IS_DEFAULT(UseGHASHIntrinsics)) // warn only when the flag was set explicitly
+ warning("GHASH intrinsics require VIS3 instruction support. Intrinsics will be disabled");
+ FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
+ }
+
// SHA1, SHA256, and SHA512 instructions were added to SPARC T-series at different times
if (has_sha1() || has_sha256() || has_sha512()) {
if (UseVIS > 0) { // SHA intrinsics use VIS1 instructions
diff --git a/src/cpu/x86/vm/assembler_x86.cpp b/src/cpu/x86/vm/assembler_x86.cpp
index 7cbc47d..1759ecd 100644
--- a/src/cpu/x86/vm/assembler_x86.cpp
+++ b/src/cpu/x86/vm/assembler_x86.cpp
@@ -2575,6 +2575,15 @@
emit_int8(shift);
}
+void Assembler::pslldq(XMMRegister dst, int shift) {
+ // Shift the 128-bit value in 'dst' left by 'shift' bytes (PSLLDQ xmm, imm8).
+ NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+ int encode = simd_prefix_and_encode(xmm7, dst, dst, VEX_SIMD_66); // xmm7 is not an operand: it supplies the /7 opcode extension (PSLLDQ is 66 0F 73 /7 ib)
+ emit_int8(0x73); // opcode byte
+ emit_int8((unsigned char)(0xC0 | encode)); // ModRM in register-direct form
+ emit_int8(shift); // byte count, emitted as a single immediate byte
+}
+
void Assembler::ptest(XMMRegister dst, Address src) {
assert(VM_Version::supports_sse4_1(), "");
assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
diff --git a/src/cpu/x86/vm/assembler_x86.hpp b/src/cpu/x86/vm/assembler_x86.hpp
index 341d9e3..5ea0131 100644
--- a/src/cpu/x86/vm/assembler_x86.hpp
+++ b/src/cpu/x86/vm/assembler_x86.hpp
@@ -1527,6 +1527,8 @@
// Shift Right by bytes Logical DoubleQuadword Immediate
void psrldq(XMMRegister dst, int shift);
+ // Shift Left by bytes Logical DoubleQuadword Immediate
+ void pslldq(XMMRegister dst, int shift);
// Logical Compare 128bit
void ptest(XMMRegister dst, XMMRegister src);
diff --git a/src/cpu/x86/vm/stubGenerator_x86_32.cpp b/src/cpu/x86/vm/stubGenerator_x86_32.cpp
index 50a06d7..235cdb7 100644
--- a/src/cpu/x86/vm/stubGenerator_x86_32.cpp
+++ b/src/cpu/x86/vm/stubGenerator_x86_32.cpp
@@ -2719,6 +2719,167 @@
return start;
}
+ // pshufb mask that swaps the two 64-bit halves of a 128-bit value (byte order inside each half is kept)
+ address generate_ghash_long_swap_mask() {
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
+ address start = __ pc();
+ __ emit_data(0x0b0a0908, relocInfo::none, 0); // result bytes 0..3 <- source bytes 8..11
+ __ emit_data(0x0f0e0d0c, relocInfo::none, 0); // result bytes 4..7 <- source bytes 12..15
+ __ emit_data(0x03020100, relocInfo::none, 0); // result bytes 8..11 <- source bytes 0..3
+ __ emit_data(0x07060504, relocInfo::none, 0); // result bytes 12..15 <- source bytes 4..7
+
+ return start;
+ }
+
+ // pshufb mask that reverses all 16 bytes of a 128-bit value (applied to each block of the input byte array)
+ address generate_ghash_byte_swap_mask() {
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
+ address start = __ pc();
+ __ emit_data(0x0c0d0e0f, relocInfo::none, 0); // result bytes 0..3 <- source bytes 15..12
+ __ emit_data(0x08090a0b, relocInfo::none, 0); // result bytes 4..7 <- source bytes 11..8
+ __ emit_data(0x04050607, relocInfo::none, 0); // result bytes 8..11 <- source bytes 7..4
+ __ emit_data(0x00010203, relocInfo::none, 0); // result bytes 12..15 <- source bytes 3..0
+ return start;
+ }
+
+ /* GHASH stub (x86_32, CLMUL): for each of 'blocks' 16-byte blocks of 'data', state = (state ^ block) * subkeyH, reduced to 128 bits. Arguments are read from the stack. */
+ address generate_ghash_processBlocks() {
+ assert(UseGHASHIntrinsics, "need GHASH intrinsics and CLMUL support");
+ __ align(CodeEntryAlignment);
+ Label L_ghash_loop, L_exit;
+ StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
+ address start = __ pc();
+
+ const Register state = rdi; // callee-saved on x86_32: saved/restored around the stub body
+ const Register subkeyH = rsi; // callee-saved on x86_32: saved/restored around the stub body
+ const Register data = rdx;
+ const Register blocks = rcx;
+
+ const Address state_param(rbp, 8+0); // stack arguments, addressed relative to rbp after enter()
+ const Address subkeyH_param(rbp, 8+4);
+ const Address data_param(rbp, 8+8);
+ const Address blocks_param(rbp, 8+12);
+
+ const XMMRegister xmm_temp0 = xmm0;
+ const XMMRegister xmm_temp1 = xmm1;
+ const XMMRegister xmm_temp2 = xmm2;
+ const XMMRegister xmm_temp3 = xmm3;
+ const XMMRegister xmm_temp4 = xmm4;
+ const XMMRegister xmm_temp5 = xmm5;
+ const XMMRegister xmm_temp6 = xmm6;
+ const XMMRegister xmm_temp7 = xmm7;
+
+ __ enter();
+ handleSOERegisters(true); // save the callee-saved registers: rdi and rsi are overwritten just below
+ __ movptr(state, state_param);
+ __ movptr(subkeyH, subkeyH_param);
+ __ movptr(data, data_param);
+ __ movptr(blocks, blocks_param);
+
+ __ movdqu(xmm_temp0, Address(state, 0)); // xmm0 = state, with its two 64-bit halves swapped
+ __ pshufb(xmm_temp0, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
+
+ __ movdqu(xmm_temp1, Address(subkeyH, 0)); // xmm1 = subkeyH, loaded once; it is only read inside the loop
+ __ pshufb(xmm_temp1, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
+
+ __ BIND(L_ghash_loop);
+ __ movdqu(xmm_temp2, Address(data, 0)); // xmm2 = next 16-byte block, byte-reversed
+ __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
+
+ __ pxor(xmm_temp0, xmm_temp2); // state ^= block
+
+ //
+ // Multiply with the hash key
+ //
+ __ movdqu(xmm_temp3, xmm_temp0);
+ __ pclmulqdq(xmm_temp3, xmm_temp1, 0); // xmm3 holds a0*b0
+ __ movdqu(xmm_temp4, xmm_temp0);
+ __ pclmulqdq(xmm_temp4, xmm_temp1, 16); // xmm4 holds a0*b1
+
+ __ movdqu(xmm_temp5, xmm_temp0);
+ __ pclmulqdq(xmm_temp5, xmm_temp1, 1); // xmm5 holds a1*b0
+ __ movdqu(xmm_temp6, xmm_temp0);
+ __ pclmulqdq(xmm_temp6, xmm_temp1, 17); // xmm6 holds a1*b1
+
+ __ pxor(xmm_temp4, xmm_temp5); // xmm4 holds a0*b1 + a1*b0
+
+ __ movdqu(xmm_temp5, xmm_temp4); // move the contents of xmm4 to xmm5
+ __ psrldq(xmm_temp4, 8); // shift xmm4 right by 64 bits
+ __ pslldq(xmm_temp5, 8); // shift xmm5 left by 64 bits
+ __ pxor(xmm_temp3, xmm_temp5);
+ __ pxor(xmm_temp6, xmm_temp4); // Register pair <xmm6:xmm3> holds the result
+ // of the carry-less multiplication of
+ // xmm0 by xmm1.
+
+ // We shift the result of the multiplication by one bit position
+ // to the left to compensate for the fact that the bits are reversed.
+ __ movdqu(xmm_temp7, xmm_temp3);
+ __ movdqu(xmm_temp4, xmm_temp6);
+ __ pslld(xmm_temp3, 1);
+ __ pslld(xmm_temp6, 1);
+ __ psrld(xmm_temp7, 31); // the bit shifted out of each dword ...
+ __ psrld(xmm_temp4, 31);
+ __ movdqu(xmm_temp5, xmm_temp7);
+ __ pslldq(xmm_temp4, 4); // ... is carried into the next higher dword
+ __ pslldq(xmm_temp7, 4);
+ __ psrldq(xmm_temp5, 12); // carry from the top dword of xmm3 into the low dword of xmm6
+ __ por(xmm_temp3, xmm_temp7);
+ __ por(xmm_temp6, xmm_temp4);
+ __ por(xmm_temp6, xmm_temp5);
+
+ //
+ // First phase of the reduction
+ //
+ // Move xmm3 into xmm4, xmm5, xmm7 in order to perform the shifts
+ // independently.
+ __ movdqu(xmm_temp7, xmm_temp3);
+ __ movdqu(xmm_temp4, xmm_temp3);
+ __ movdqu(xmm_temp5, xmm_temp3);
+ __ pslld(xmm_temp7, 31); // packed left shift of each dword by 31
+ __ pslld(xmm_temp4, 30); // packed left shift of each dword by 30
+ __ pslld(xmm_temp5, 25); // packed left shift of each dword by 25
+ __ pxor(xmm_temp7, xmm_temp4); // xor the shifted versions
+ __ pxor(xmm_temp7, xmm_temp5);
+ __ movdqu(xmm_temp4, xmm_temp7);
+ __ pslldq(xmm_temp7, 12);
+ __ psrldq(xmm_temp4, 4);
+ __ pxor(xmm_temp3, xmm_temp7); // first phase of the reduction complete
+
+ //
+ // Second phase of the reduction
+ //
+ // Make 3 copies of xmm3 in xmm2, xmm5, xmm7 for doing these
+ // shift operations.
+ __ movdqu(xmm_temp2, xmm_temp3);
+ __ movdqu(xmm_temp7, xmm_temp3);
+ __ movdqu(xmm_temp5, xmm_temp3);
+ __ psrld(xmm_temp2, 1); // packed right shift of each dword by 1
+ __ psrld(xmm_temp7, 2); // packed right shift of each dword by 2
+ __ psrld(xmm_temp5, 7); // packed right shift of each dword by 7
+ __ pxor(xmm_temp2, xmm_temp7); // xor the shifted versions
+ __ pxor(xmm_temp2, xmm_temp5);
+ __ pxor(xmm_temp2, xmm_temp4);
+ __ pxor(xmm_temp3, xmm_temp2);
+ __ pxor(xmm_temp6, xmm_temp3); // the result is in xmm6
+
+ __ decrement(blocks);
+ __ jcc(Assembler::zero, L_exit);
+ __ movdqu(xmm_temp0, xmm_temp6); // the result becomes the state for the next block
+ __ addptr(data, 16);
+ __ jmp(L_ghash_loop);
+
+ __ BIND(L_exit);
+ // Swap the 64-bit halves back before storing the 16-byte result
+ __ pshufb(xmm_temp6, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
+ __ movdqu(Address(state, 0), xmm_temp6); // store the result
+ handleSOERegisters(false); // restore the callee-saved registers (after the last use of 'state')
+ __ leave();
+ __ ret(0);
+ return start;
+ }
+
/**
* Arguments:
*
@@ -3018,6 +3179,13 @@
StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
}
+ // Generate GHASH intrinsics code
+ if (UseGHASHIntrinsics) {
+ StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
+ StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
+ StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
+ }
+
// Safefetch stubs.
generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
&StubRoutines::_safefetch32_fault_pc,
diff --git a/src/cpu/x86/vm/stubGenerator_x86_64.cpp b/src/cpu/x86/vm/stubGenerator_x86_64.cpp
index 1d38af7..c5811b2 100644
--- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp
+++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp
@@ -3639,6 +3639,175 @@
return start;
}
+
+ // pshufb mask that swaps the two 64-bit halves of a 128-bit value (byte order inside each half is kept)
+ address generate_ghash_long_swap_mask() {
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
+ address start = __ pc();
+ __ emit_data64(0x0f0e0d0c0b0a0908, relocInfo::none ); // result bytes 0..7 <- source bytes 8..15
+ __ emit_data64(0x0706050403020100, relocInfo::none ); // result bytes 8..15 <- source bytes 0..7
+ return start;
+ }
+
+ // pshufb mask that reverses all 16 bytes of a 128-bit value (applied to each block of the input byte array)
+ address generate_ghash_byte_swap_mask() {
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
+ address start = __ pc();
+ __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none ); // result bytes 0..7 <- source bytes 15..8
+ __ emit_data64(0x0001020304050607, relocInfo::none ); // result bytes 8..15 <- source bytes 7..0
+ return start;
+ }
+
+ /* GHASH stub (x86_64, CLMUL): for each of 'blocks' 16-byte blocks of 'data', state = (state ^ block) * subkeyH, reduced to 128 bits. Arguments arrive in c_rarg0..c_rarg3. */
+ address generate_ghash_processBlocks() {
+ __ align(CodeEntryAlignment);
+ Label L_ghash_loop, L_exit;
+ StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
+ address start = __ pc();
+
+ const Register state = c_rarg0;
+ const Register subkeyH = c_rarg1;
+ const Register data = c_rarg2;
+ const Register blocks = c_rarg3;
+
+#ifdef _WIN64
+ const int XMM_REG_LAST = 10;
+#endif
+
+ const XMMRegister xmm_temp0 = xmm0;
+ const XMMRegister xmm_temp1 = xmm1;
+ const XMMRegister xmm_temp2 = xmm2;
+ const XMMRegister xmm_temp3 = xmm3;
+ const XMMRegister xmm_temp4 = xmm4;
+ const XMMRegister xmm_temp5 = xmm5;
+ const XMMRegister xmm_temp6 = xmm6;
+ const XMMRegister xmm_temp7 = xmm7;
+ const XMMRegister xmm_temp8 = xmm8;
+ const XMMRegister xmm_temp9 = xmm9;
+ const XMMRegister xmm_temp10 = xmm10;
+
+ __ enter();
+
+#ifdef _WIN64
+ // save the xmm registers which must be preserved 6-10
+ __ subptr(rsp, -rsp_after_call_off * wordSize);
+ for (int i = 6; i <= XMM_REG_LAST; i++) {
+ __ movdqu(xmm_save(i), as_XMMRegister(i));
+ }
+#endif
+
+ __ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr())); // xmm10 = half-swap mask, kept for the whole stub
+
+ __ movdqu(xmm_temp0, Address(state, 0)); // xmm0 = state, with its two 64-bit halves swapped
+ __ pshufb(xmm_temp0, xmm_temp10);
+ // subkeyH is loop-invariant and xmm1 is only read inside the loop, so load and shuffle it once.
+ __ movdqu(xmm_temp1, Address(subkeyH, 0));
+ __ pshufb(xmm_temp1, xmm_temp10);
+
+ __ BIND(L_ghash_loop);
+ __ movdqu(xmm_temp2, Address(data, 0)); // xmm2 = next 16-byte block, byte-reversed
+ __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
+
+ // state ^= block
+ __ pxor(xmm_temp0, xmm_temp2);
+
+ //
+ // Multiply with the hash key
+ //
+ __ movdqu(xmm_temp3, xmm_temp0);
+ __ pclmulqdq(xmm_temp3, xmm_temp1, 0); // xmm3 holds a0*b0
+ __ movdqu(xmm_temp4, xmm_temp0);
+ __ pclmulqdq(xmm_temp4, xmm_temp1, 16); // xmm4 holds a0*b1
+
+ __ movdqu(xmm_temp5, xmm_temp0);
+ __ pclmulqdq(xmm_temp5, xmm_temp1, 1); // xmm5 holds a1*b0
+ __ movdqu(xmm_temp6, xmm_temp0);
+ __ pclmulqdq(xmm_temp6, xmm_temp1, 17); // xmm6 holds a1*b1
+
+ __ pxor(xmm_temp4, xmm_temp5); // xmm4 holds a0*b1 + a1*b0
+
+ __ movdqu(xmm_temp5, xmm_temp4); // move the contents of xmm4 to xmm5
+ __ psrldq(xmm_temp4, 8); // shift xmm4 right by 64 bits
+ __ pslldq(xmm_temp5, 8); // shift xmm5 left by 64 bits
+ __ pxor(xmm_temp3, xmm_temp5);
+ __ pxor(xmm_temp6, xmm_temp4); // Register pair <xmm6:xmm3> holds the result
+ // of the carry-less multiplication of
+ // xmm0 by xmm1.
+
+ // We shift the result of the multiplication by one bit position
+ // to the left to compensate for the fact that the bits are reversed.
+ __ movdqu(xmm_temp7, xmm_temp3);
+ __ movdqu(xmm_temp8, xmm_temp6);
+ __ pslld(xmm_temp3, 1);
+ __ pslld(xmm_temp6, 1);
+ __ psrld(xmm_temp7, 31); // the bit shifted out of each dword ...
+ __ psrld(xmm_temp8, 31);
+ __ movdqu(xmm_temp9, xmm_temp7);
+ __ pslldq(xmm_temp8, 4); // ... is carried into the next higher dword
+ __ pslldq(xmm_temp7, 4);
+ __ psrldq(xmm_temp9, 12); // carry from the top dword of xmm3 into the low dword of xmm6
+ __ por(xmm_temp3, xmm_temp7);
+ __ por(xmm_temp6, xmm_temp8);
+ __ por(xmm_temp6, xmm_temp9);
+
+ //
+ // First phase of the reduction
+ //
+ // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
+ // independently.
+ __ movdqu(xmm_temp7, xmm_temp3);
+ __ movdqu(xmm_temp8, xmm_temp3);
+ __ movdqu(xmm_temp9, xmm_temp3);
+ __ pslld(xmm_temp7, 31); // packed left shift of each dword by 31
+ __ pslld(xmm_temp8, 30); // packed left shift of each dword by 30
+ __ pslld(xmm_temp9, 25); // packed left shift of each dword by 25
+ __ pxor(xmm_temp7, xmm_temp8); // xor the shifted versions
+ __ pxor(xmm_temp7, xmm_temp9);
+ __ movdqu(xmm_temp8, xmm_temp7);
+ __ pslldq(xmm_temp7, 12);
+ __ psrldq(xmm_temp8, 4);
+ __ pxor(xmm_temp3, xmm_temp7); // first phase of the reduction complete
+
+ //
+ // Second phase of the reduction
+ //
+ // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
+ // shift operations.
+ __ movdqu(xmm_temp2, xmm_temp3);
+ __ movdqu(xmm_temp4, xmm_temp3);
+ __ movdqu(xmm_temp5, xmm_temp3);
+ __ psrld(xmm_temp2, 1); // packed right shift of each dword by 1
+ __ psrld(xmm_temp4, 2); // packed right shift of each dword by 2
+ __ psrld(xmm_temp5, 7); // packed right shift of each dword by 7
+ __ pxor(xmm_temp2, xmm_temp4); // xor the shifted versions
+ __ pxor(xmm_temp2, xmm_temp5);
+ __ pxor(xmm_temp2, xmm_temp8);
+ __ pxor(xmm_temp3, xmm_temp2);
+ __ pxor(xmm_temp6, xmm_temp3); // the result is in xmm6
+
+ __ decrement(blocks);
+ __ jcc(Assembler::zero, L_exit);
+ __ movdqu(xmm_temp0, xmm_temp6); // the result becomes the state for the next block
+ __ addptr(data, 16);
+ __ jmp(L_ghash_loop);
+
+ __ BIND(L_exit);
+ __ pshufb(xmm_temp6, xmm_temp10); // swap the 64-bit halves back before storing
+ __ movdqu(Address(state, 0), xmm_temp6); // store the result
+
+#ifdef _WIN64
+ // restore xmm regs belonging to calling function
+ for (int i = 6; i <= XMM_REG_LAST; i++) {
+ __ movdqu(as_XMMRegister(i), xmm_save(i));
+ }
+#endif
+ __ leave();
+ __ ret(0);
+ return start;
+ }
+
/**
* Arguments:
*
@@ -4077,6 +4246,13 @@
StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
}
+ // Generate GHASH intrinsics code
+ if (UseGHASHIntrinsics) {
+ StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
+ StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
+ StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
+ }
+
// Safefetch stubs.
generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
&StubRoutines::_safefetch32_fault_pc,
diff --git a/src/cpu/x86/vm/stubRoutines_x86.cpp b/src/cpu/x86/vm/stubRoutines_x86.cpp
index 200f2af..9b0d8fc 100644
--- a/src/cpu/x86/vm/stubRoutines_x86.cpp
+++ b/src/cpu/x86/vm/stubRoutines_x86.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -33,6 +33,8 @@
address StubRoutines::x86::_verify_mxcsr_entry = NULL;
address StubRoutines::x86::_key_shuffle_mask_addr = NULL;
+address StubRoutines::x86::_ghash_long_swap_mask_addr = NULL;
+address StubRoutines::x86::_ghash_byte_swap_mask_addr = NULL;
uint64_t StubRoutines::x86::_crc_by128_masks[] =
{
diff --git a/src/cpu/x86/vm/stubRoutines_x86.hpp b/src/cpu/x86/vm/stubRoutines_x86.hpp
index d8e52ab..bb16048 100644
--- a/src/cpu/x86/vm/stubRoutines_x86.hpp
+++ b/src/cpu/x86/vm/stubRoutines_x86.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -36,10 +36,15 @@
// masks and table for CRC32
static uint64_t _crc_by128_masks[];
static juint _crc_table[];
+ // swap mask for ghash
+ static address _ghash_long_swap_mask_addr;
+ static address _ghash_byte_swap_mask_addr;
public:
static address verify_mxcsr_entry() { return _verify_mxcsr_entry; }
static address key_shuffle_mask_addr() { return _key_shuffle_mask_addr; }
static address crc_by128_masks_addr() { return (address)_crc_by128_masks; }
+ static address ghash_long_swap_mask_addr() { return _ghash_long_swap_mask_addr; }
+ static address ghash_byte_swap_mask_addr() { return _ghash_byte_swap_mask_addr; }
#endif // CPU_X86_VM_STUBROUTINES_X86_32_HPP
diff --git a/src/cpu/x86/vm/vm_version_x86.cpp b/src/cpu/x86/vm/vm_version_x86.cpp
index fd0a68d..1f5ae75 100644
--- a/src/cpu/x86/vm/vm_version_x86.cpp
+++ b/src/cpu/x86/vm/vm_version_x86.cpp
@@ -594,6 +594,17 @@
FLAG_SET_DEFAULT(UseAESIntrinsics, false);
}
+ // GHASH/GCM intrinsics: enabled by default when CLMUL is in use and UseSSE is at least 3
+ if (UseCLMUL && (UseSSE > 2)) {
+ if (FLAG_IS_DEFAULT(UseGHASHIntrinsics)) {
+ UseGHASHIntrinsics = true;
+ }
+ } else if (UseGHASHIntrinsics) {
+ if (!FLAG_IS_DEFAULT(UseGHASHIntrinsics)) // warn only when the flag was set explicitly
+ warning("GHASH intrinsic requires CLMUL and SSE3 instructions on this CPU");
+ FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
+ }
+
if (UseSHA) {
warning("SHA instructions are not available on this CPU");
FLAG_SET_DEFAULT(UseSHA, false);
diff --git a/src/share/vm/classfile/vmSymbols.hpp b/src/share/vm/classfile/vmSymbols.hpp
index 7f0e820..46f2e2d 100644
--- a/src/share/vm/classfile/vmSymbols.hpp
+++ b/src/share/vm/classfile/vmSymbols.hpp
@@ -863,6 +863,12 @@
do_name( implCompressMB_name, "implCompressMultiBlock0") \
do_signature(implCompressMB_signature, "([BII)I") \
\
+ /* support for com.sun.crypto.provider.GHASH */ \
+ do_class(com_sun_crypto_provider_ghash, "com/sun/crypto/provider/GHASH") \
+ do_intrinsic(_ghash_processBlocks, com_sun_crypto_provider_ghash, processBlocks_name, ghash_processBlocks_signature, F_S) \
+ do_name(processBlocks_name, "processBlocks") \
+ do_signature(ghash_processBlocks_signature, "([BII[J[J)V") \
+ \
/* support for java.util.zip */ \
do_class(java_util_zip_CRC32, "java/util/zip/CRC32") \
do_intrinsic(_updateCRC32, java_util_zip_CRC32, update_name, int2_int_signature, F_SN) \
diff --git a/src/share/vm/opto/escape.cpp b/src/share/vm/opto/escape.cpp
index ac82173..66e5c33 100644
--- a/src/share/vm/opto/escape.cpp
+++ b/src/share/vm/opto/escape.cpp
@@ -952,6 +952,7 @@
strcmp(call->as_CallLeaf()->_name, "aescrypt_decryptBlock") == 0 ||
strcmp(call->as_CallLeaf()->_name, "cipherBlockChaining_encryptAESCrypt") == 0 ||
strcmp(call->as_CallLeaf()->_name, "cipherBlockChaining_decryptAESCrypt") == 0 ||
+ strcmp(call->as_CallLeaf()->_name, "ghash_processBlocks") == 0 ||
strcmp(call->as_CallLeaf()->_name, "sha1_implCompress") == 0 ||
strcmp(call->as_CallLeaf()->_name, "sha1_implCompressMB") == 0 ||
strcmp(call->as_CallLeaf()->_name, "sha256_implCompress") == 0 ||
diff --git a/src/share/vm/opto/library_call.cpp b/src/share/vm/opto/library_call.cpp
index 4bb5ca8..5c5fec5 100644
--- a/src/share/vm/opto/library_call.cpp
+++ b/src/share/vm/opto/library_call.cpp
@@ -311,6 +311,7 @@
Node* inline_cipherBlockChaining_AESCrypt_predicate(bool decrypting);
Node* get_key_start_from_aescrypt_object(Node* aescrypt_object);
Node* get_original_key_start_from_aescrypt_object(Node* aescrypt_object);
+ bool inline_ghash_processBlocks();
bool inline_sha_implCompress(vmIntrinsics::ID id);
bool inline_digestBase_implCompressMB(int predicate);
bool inline_sha_implCompressMB(Node* digestBaseObj, ciInstanceKlass* instklass_SHA,
@@ -570,6 +571,10 @@
predicates = 3;
break;
+ case vmIntrinsics::_ghash_processBlocks:
+ if (!UseGHASHIntrinsics) return NULL;
+ break;
+
case vmIntrinsics::_updateCRC32:
case vmIntrinsics::_updateBytesCRC32:
case vmIntrinsics::_updateByteBufferCRC32:
@@ -957,6 +962,9 @@
case vmIntrinsics::_montgomerySquare:
return inline_montgomerySquare();
+ case vmIntrinsics::_ghash_processBlocks:
+ return inline_ghash_processBlocks();
+
case vmIntrinsics::_encodeISOArray:
return inline_encodeISOArray();
@@ -6599,6 +6607,35 @@
return _gvn.transform(region);
}
+//------------------------------inline_ghash_processBlocks: replaces GHASH.processBlocks(byte[] data, int ofs, int blocks, long[] state, long[] subkeyH) with a leaf call to the stub; returns false if no stub exists
+bool LibraryCallKit::inline_ghash_processBlocks() {
+ address stubAddr;
+ const char *stubName;
+ assert(UseGHASHIntrinsics, "need GHASH intrinsics support");
+
+ stubAddr = StubRoutines::ghash_processBlocks();
+ stubName = "ghash_processBlocks";
+ if (stubAddr == NULL) return false; // no stub was generated: report the intrinsic as not inlined instead of calling address 0
+ Node* data = argument(0);
+ Node* offset = argument(1);
+ Node* len = argument(2);
+ Node* state = argument(3);
+ Node* subkeyH = argument(4);
+
+ Node* state_start = array_element_address(state, intcon(0), T_LONG); // address of state[0]
+ assert(state_start, "state is NULL");
+ Node* subkeyH_start = array_element_address(subkeyH, intcon(0), T_LONG); // address of subkeyH[0]
+ assert(subkeyH_start, "subkeyH is NULL");
+ Node* data_start = array_element_address(data, offset, T_BYTE); // address of data[offset]
+ assert(data_start, "data is NULL");
+
+ make_runtime_call(RC_LEAF|RC_NO_FP, // the stub returns nothing, so the call node is not kept
+ OptoRuntime::ghash_processBlocks_Type(),
+ stubAddr, stubName, TypePtr::BOTTOM,
+ state_start, subkeyH_start, data_start, len); // stub argument order differs from the Java order
+ return true;
+}
+
//------------------------------inline_sha_implCompress-----------------------
//
// Calculate SHA (i.e., SHA-1) for single-block byte[] array.
diff --git a/src/share/vm/opto/runtime.cpp b/src/share/vm/opto/runtime.cpp
index 57d2f57..4562dbc 100644
--- a/src/share/vm/opto/runtime.cpp
+++ b/src/share/vm/opto/runtime.cpp
@@ -92,7 +92,25 @@
// At command line specify the parameters: -XX:+FullGCALot -XX:FullGCALotStart=100000000
+// Call signature of the GHASH stub: (state*, subkeyH*, data*, int blocks) -> void
+const TypeFunc* OptoRuntime::ghash_processBlocks_Type() {
+ int argcnt = 4;
+ const Type** fields = TypeTuple::fields(argcnt);
+ int argp = TypeFunc::Parms;
+ fields[argp++] = TypePtr::NOTNULL; // state
+ fields[argp++] = TypePtr::NOTNULL; // subkeyH
+ fields[argp++] = TypePtr::NOTNULL; // data
+ fields[argp++] = TypeInt::INT; // blocks
+ assert(argp == TypeFunc::Parms+argcnt, "correct decoding");
+ const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields);
+
+ // no result value: the range tuple is built with zero entries beyond TypeFunc::Parms
+ fields = TypeTuple::fields(1);
+ fields[TypeFunc::Parms+0] = NULL; // void
+ const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields);
+ return TypeFunc::make(domain, range);
+}
// Compiled code entry points
address OptoRuntime::_new_instance_Java = NULL;
diff --git a/src/share/vm/opto/runtime.hpp b/src/share/vm/opto/runtime.hpp
index 99f03f4..58c6bd5 100644
--- a/src/share/vm/opto/runtime.hpp
+++ b/src/share/vm/opto/runtime.hpp
@@ -311,6 +311,8 @@
static const TypeFunc* montgomeryMultiply_Type();
static const TypeFunc* montgomerySquare_Type();
+ static const TypeFunc* ghash_processBlocks_Type();
+
static const TypeFunc* updateBytesCRC32_Type();
// leaf on stack replacement interpreter accessor types
diff --git a/src/share/vm/runtime/globals.hpp b/src/share/vm/runtime/globals.hpp
index 421a3ba..0fdf47c 100644
--- a/src/share/vm/runtime/globals.hpp
+++ b/src/share/vm/runtime/globals.hpp
@@ -602,6 +602,9 @@
product(bool, UseSHA, false, \
"Control whether SHA instructions can be used on SPARC") \
\
+ product(bool, UseGHASHIntrinsics, false, \
+ "Use intrinsics for GHASH versions of crypto") \
+ \
product(uintx, LargePageSizeInBytes, 0, \
"Large page size (0 to let VM choose the page size)") \
\
diff --git a/src/share/vm/runtime/stubRoutines.cpp b/src/share/vm/runtime/stubRoutines.cpp
index b2b3a90..d943248 100644
--- a/src/share/vm/runtime/stubRoutines.cpp
+++ b/src/share/vm/runtime/stubRoutines.cpp
@@ -124,6 +124,7 @@
address StubRoutines::_aescrypt_decryptBlock = NULL;
address StubRoutines::_cipherBlockChaining_encryptAESCrypt = NULL;
address StubRoutines::_cipherBlockChaining_decryptAESCrypt = NULL;
+address StubRoutines::_ghash_processBlocks = NULL;
address StubRoutines::_sha1_implCompress = NULL;
address StubRoutines::_sha1_implCompressMB = NULL;
diff --git a/src/share/vm/runtime/stubRoutines.hpp b/src/share/vm/runtime/stubRoutines.hpp
index 42808a4..0f6641c 100644
--- a/src/share/vm/runtime/stubRoutines.hpp
+++ b/src/share/vm/runtime/stubRoutines.hpp
@@ -197,6 +197,7 @@
static address _aescrypt_decryptBlock;
static address _cipherBlockChaining_encryptAESCrypt;
static address _cipherBlockChaining_decryptAESCrypt;
+ static address _ghash_processBlocks;
static address _sha1_implCompress;
static address _sha1_implCompressMB;
@@ -359,6 +360,7 @@
static address aescrypt_decryptBlock() { return _aescrypt_decryptBlock; }
static address cipherBlockChaining_encryptAESCrypt() { return _cipherBlockChaining_encryptAESCrypt; }
static address cipherBlockChaining_decryptAESCrypt() { return _cipherBlockChaining_decryptAESCrypt; }
+ static address ghash_processBlocks() { return _ghash_processBlocks; }
static address sha1_implCompress() { return _sha1_implCompress; }
static address sha1_implCompressMB() { return _sha1_implCompressMB; }
diff --git a/src/share/vm/runtime/vmStructs.cpp b/src/share/vm/runtime/vmStructs.cpp
index 161a4c4..7f6f848 100644
--- a/src/share/vm/runtime/vmStructs.cpp
+++ b/src/share/vm/runtime/vmStructs.cpp
@@ -810,6 +810,7 @@
static_field(StubRoutines, _aescrypt_decryptBlock, address) \
static_field(StubRoutines, _cipherBlockChaining_encryptAESCrypt, address) \
static_field(StubRoutines, _cipherBlockChaining_decryptAESCrypt, address) \
+ static_field(StubRoutines, _ghash_processBlocks, address) \
static_field(StubRoutines, _updateBytesCRC32, address) \
static_field(StubRoutines, _crc_table_adr, address) \
static_field(StubRoutines, _multiplyToLen, address) \
diff --git a/test/compiler/7184394/TestAESBase.java b/test/compiler/7184394/TestAESBase.java
index 4d32048..c3bca26 100644
--- a/test/compiler/7184394/TestAESBase.java
+++ b/test/compiler/7184394/TestAESBase.java
@@ -29,6 +29,7 @@
import javax.crypto.Cipher;
import javax.crypto.KeyGenerator;
import javax.crypto.SecretKey;
+import javax.crypto.spec.GCMParameterSpec;
import javax.crypto.spec.IvParameterSpec;
import javax.crypto.spec.SecretKeySpec;
import java.security.AlgorithmParameters;
@@ -64,6 +65,10 @@
Cipher dCipher;
AlgorithmParameters algParams;
SecretKey key;
+ GCMParameterSpec gcm_spec;
+ byte[] aad;
+ int tlen = 12;
+ byte[] iv;
static int numThreads = 0;
int threadId;
@@ -102,6 +107,12 @@
int ivLen = (algorithm.equals("AES") ? 16 : algorithm.equals("DES") ? 8 : 0);
IvParameterSpec initVector = new IvParameterSpec(new byte[ivLen]);
cipher.init(Cipher.ENCRYPT_MODE, key, initVector);
+ } else if (mode.equals("GCM")) {
+ iv = new byte[64];
+ random.nextBytes(iv);
+ aad = new byte[5];
+ random.nextBytes(aad);
+ gcm_init();
} else {
algParams = cipher.getParameters();
cipher.init(Cipher.ENCRYPT_MODE, key, algParams);
@@ -188,4 +199,12 @@
}
abstract void childShowCipher();
+
+ void gcm_init() throws Exception { // builds a fresh SunJCE cipher for the GCM test mode and initialises it for encryption
+ tlen = 12; // tag length in bytes (converted to bits on the next line)
+ gcm_spec = new GCMParameterSpec(tlen * 8, iv); // GCMParameterSpec takes the tag length in bits
+ cipher = Cipher.getInstance(algorithm + "/" + mode + "/" + paddingStr, "SunJCE"); // a new Cipher instance on every call
+ cipher.init(Cipher.ENCRYPT_MODE, key, gcm_spec);
+ cipher.update(aad); // NOTE(review): update() passes aad as ordinary input; Cipher.updateAAD() is the API for AAD - confirm intent
+ }
}
diff --git a/test/compiler/7184394/TestAESEncode.java b/test/compiler/7184394/TestAESEncode.java
index f1a35bd..163ebb8 100644
--- a/test/compiler/7184394/TestAESEncode.java
+++ b/test/compiler/7184394/TestAESEncode.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2012, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -32,7 +32,11 @@
@Override
public void run() {
try {
- if (!noReinit) cipher.init(Cipher.ENCRYPT_MODE, key, algParams);
+ if (mode.equals("GCM")) {
+ gcm_init();
+ } else if (!noReinit) {
+ cipher.init(Cipher.ENCRYPT_MODE, key, algParams);
+ }
encode = new byte[encodeLength];
if (testingMisalignment) {
int tempSize = cipher.update(input, encInputOffset, (msgSize - lastChunkSize), encode, encOutputOffset);
diff --git a/test/compiler/7184394/TestAESMain.java b/test/compiler/7184394/TestAESMain.java
index 20929e8..ddd8eea 100644
--- a/test/compiler/7184394/TestAESMain.java
+++ b/test/compiler/7184394/TestAESMain.java
@@ -41,6 +41,13 @@
* @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB -DencInputOffset=1 -DencOutputOffset=1 TestAESMain
* @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 TestAESMain
* @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 -DpaddingStr=NoPadding -DmsgSize=640 TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencOutputOffset=1 TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DdecOutputOffset=1 TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 -DpaddingStr=NoPadding -DmsgSize=640 TestAESMain
*
* @author Tom Deneau
*/