8178800: compiler/c2/PolynomialRoot.java fails on Xeon Phi linux host with UseAVX=3
Summary: upper register bank support added for novl machines that emit EVEX
Reviewed-by: kvn, thartmann
diff --git a/src/cpu/x86/vm/macroAssembler_x86.cpp b/src/cpu/x86/vm/macroAssembler_x86.cpp
index f2c8393..6b7f947 100644
--- a/src/cpu/x86/vm/macroAssembler_x86.cpp
+++ b/src/cpu/x86/vm/macroAssembler_x86.cpp
@@ -4134,28 +4134,33 @@
   if ((dst_enc < 16) && (nds_enc < 16)) {
     vandps(dst, nds, negate_field, vector_len);
   } else if ((src_enc < 16) && (dst_enc < 16)) {
-    movss(src, nds);
+    evmovdqul(src, nds, Assembler::AVX_512bit);
     vandps(dst, src, negate_field, vector_len);
   } else if (src_enc < 16) {
-    movss(src, nds);
+    evmovdqul(src, nds, Assembler::AVX_512bit);
     vandps(src, src, negate_field, vector_len);
-    movss(dst, src);
+    evmovdqul(dst, src, Assembler::AVX_512bit);
   } else if (dst_enc < 16) {
-    movdqu(src, xmm0);
-    movss(xmm0, nds);
+    evmovdqul(src, xmm0, Assembler::AVX_512bit);
+    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
     vandps(dst, xmm0, negate_field, vector_len);
-    movdqu(xmm0, src);
-  } else if (nds_enc < 16) {
-    movdqu(src, xmm0);
-    vandps(xmm0, nds, negate_field, vector_len);
-    movss(dst, xmm0);
-    movdqu(xmm0, src);
+    evmovdqul(xmm0, src, Assembler::AVX_512bit);
   } else {
-    movdqu(src, xmm0);
-    movss(xmm0, nds);
-    vandps(xmm0, xmm0, negate_field, vector_len);
-    movss(dst, xmm0);
-    movdqu(xmm0, src);
+    if (src_enc != dst_enc) {
+      evmovdqul(src, xmm0, Assembler::AVX_512bit);
+      evmovdqul(xmm0, nds, Assembler::AVX_512bit);
+      vandps(xmm0, xmm0, negate_field, vector_len);
+      evmovdqul(dst, xmm0, Assembler::AVX_512bit);
+      evmovdqul(xmm0, src, Assembler::AVX_512bit);
+    } else {
+      subptr(rsp, 64);
+      evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+      evmovdqul(xmm0, nds, Assembler::AVX_512bit);
+      vandps(xmm0, xmm0, negate_field, vector_len);
+      evmovdqul(dst, xmm0, Assembler::AVX_512bit);
+      evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+      addptr(rsp, 64);
+    }
   }
 }
 
@@ -4166,28 +4171,33 @@
   if ((dst_enc < 16) && (nds_enc < 16)) {
     vandpd(dst, nds, negate_field, vector_len);
   } else if ((src_enc < 16) && (dst_enc < 16)) {
-    movsd(src, nds);
+    evmovdqul(src, nds, Assembler::AVX_512bit);
     vandpd(dst, src, negate_field, vector_len);
   } else if (src_enc < 16) {
-    movsd(src, nds);
+    evmovdqul(src, nds, Assembler::AVX_512bit);
     vandpd(src, src, negate_field, vector_len);
-    movsd(dst, src);
+    evmovdqul(dst, src, Assembler::AVX_512bit);
   } else if (dst_enc < 16) {
-    movdqu(src, xmm0);
-    movsd(xmm0, nds);
+    evmovdqul(src, xmm0, Assembler::AVX_512bit);
+    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
     vandpd(dst, xmm0, negate_field, vector_len);
-    movdqu(xmm0, src);
-  } else if (nds_enc < 16) {
-    movdqu(src, xmm0);
-    vandpd(xmm0, nds, negate_field, vector_len);
-    movsd(dst, xmm0);
-    movdqu(xmm0, src);
+    evmovdqul(xmm0, src, Assembler::AVX_512bit);
   } else {
-    movdqu(src, xmm0);
-    movsd(xmm0, nds);
-    vandpd(xmm0, xmm0, negate_field, vector_len);
-    movsd(dst, xmm0);
-    movdqu(xmm0, src);
+    if (src_enc != dst_enc) {
+      evmovdqul(src, xmm0, Assembler::AVX_512bit);
+      evmovdqul(xmm0, nds, Assembler::AVX_512bit);
+      vandpd(xmm0, xmm0, negate_field, vector_len);
+      evmovdqul(dst, xmm0, Assembler::AVX_512bit);
+      evmovdqul(xmm0, src, Assembler::AVX_512bit);
+    } else {
+      subptr(rsp, 64);
+      evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+      evmovdqul(xmm0, nds, Assembler::AVX_512bit);
+      vandpd(xmm0, xmm0, negate_field, vector_len);
+      evmovdqul(dst, xmm0, Assembler::AVX_512bit);
+      evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+      addptr(rsp, 64);
+    }
   }
 }
 
@@ -4934,6 +4944,24 @@
   }
 }
 
+void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) {
+  if (VM_Version::supports_avx512vl()) {
+    Assembler::pshufd(dst, src, mode);
+  } else {
+    int dst_enc = dst->encoding();
+    if (dst_enc < 16) {
+      Assembler::pshufd(dst, src, mode);
+    } else {
+      subptr(rsp, 64);
+      evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+      Assembler::pshufd(xmm0, src, mode);
+      evmovdqul(dst, xmm0, Assembler::AVX_512bit);
+      evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+      addptr(rsp, 64);
+    }
+  }
+}
+
 // This instruction exists within macros, ergo we cannot control its input
 // when emitted through those patterns.
 void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
diff --git a/src/cpu/x86/vm/macroAssembler_x86.hpp b/src/cpu/x86/vm/macroAssembler_x86.hpp
index 39b2e1a..2c2b17c 100644
--- a/src/cpu/x86/vm/macroAssembler_x86.hpp
+++ b/src/cpu/x86/vm/macroAssembler_x86.hpp
@@ -1232,6 +1232,9 @@
   void punpcklbw(XMMRegister dst, XMMRegister src);
   void punpcklbw(XMMRegister dst, Address src) { Assembler::punpcklbw(dst, src); }
 
+  void pshufd(XMMRegister dst, Address src, int mode);
+  void pshufd(XMMRegister dst, XMMRegister src, int mode) { Assembler::pshufd(dst, src, mode); }
+
   void pshuflw(XMMRegister dst, XMMRegister src, int mode);
   void pshuflw(XMMRegister dst, Address src, int mode) { Assembler::pshuflw(dst, src, mode); }