8269528: VectorAPI Long512VectorTest fails on X86 KNL target

Reviewed-by: kvn, sviswanathan
diff --git a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
index 3e61c86..7281501 100644
--- a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
+++ b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
@@ -1462,7 +1462,7 @@
   }
 }
 
-void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt) {
+void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
   if (vlen_in_bytes <= 16) {
     pxor (dst, dst);
     psubb(dst, src);
@@ -1477,10 +1477,12 @@
       default: assert(false, "%s", type2name(elem_bt));
     }
   } else {
+    assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
     int vlen_enc = vector_length_encoding(vlen_in_bytes);
 
     vpxor (dst, dst, dst, vlen_enc);
-    vpsubb(dst, dst, src, vlen_enc);
+    vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
+
     switch (elem_bt) {
       case T_BYTE:   /* nothing to do */            break;
       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
diff --git a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp
index 0a61be3..25a470a 100644
--- a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp
+++ b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp
@@ -142,7 +142,7 @@
   void evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len);
   void evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len);
 
-  void load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt);
+  void load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy); // is_legacy: on the >16-byte path, emit vpsubb at AVX_256bit instead of vlen_enc
   void load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes);
 
   // vector compare
diff --git a/src/hotspot/cpu/x86/x86.ad b/src/hotspot/cpu/x86/x86.ad
index ea50eeb..7e3c7f2 100644
--- a/src/hotspot/cpu/x86/x86.ad
+++ b/src/hotspot/cpu/x86/x86.ad
@@ -7454,7 +7454,8 @@
 
 //------------------------------------- LoadMask --------------------------------------------
 
-instruct loadMask(vec dst, vec src) %{
+instruct loadMask(legVec dst, legVec src) %{
+  predicate(!VM_Version::supports_avx512vlbw());
   match(Set dst (VectorLoadMask src));
   effect(TEMP dst);
   format %{ "vector_loadmask_byte $dst,$src\n\t" %}
@@ -7462,7 +7463,21 @@
     int vlen_in_bytes = vector_length_in_bytes(this);
     BasicType elem_bt = vector_element_basic_type(this);
 
-    __ load_vector_mask($dst$$XMMRegister, $src$$XMMRegister, vlen_in_bytes, elem_bt);
+    __ load_vector_mask($dst$$XMMRegister, $src$$XMMRegister, vlen_in_bytes, elem_bt, true);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct loadMask_evex(vec dst, vec src) %{
+  predicate(VM_Version::supports_avx512vlbw()); // loadMask carries the negated predicate, so only one of the two rules applies
+  match(Set dst (VectorLoadMask src));
+  effect(TEMP dst);
+  format %{ "vector_loadmask_byte $dst,$src\n\t" %}
+  ins_encode %{
+    int vlen_in_bytes = vector_length_in_bytes(this);
+    BasicType elem_bt = vector_element_basic_type(this);
+    // is_legacy == false: load_vector_mask emits vpsubb at the full vlen_enc rather than AVX_256bit.
+    __ load_vector_mask($dst$$XMMRegister, $src$$XMMRegister, vlen_in_bytes, elem_bt, false);
   %}
   ins_pipe( pipe_slow );
 %}