Add support for vex coding scheme in x86 assembler

This patch adds support for emitting the VEX prefix, which is needed
to encode the BMI instructions andn, blsmsk, blsr and blsi
on a CPU that has AVX2.

Test: ./test.py --host --64, test-art-host-gtest
Change-Id: I6b4902caf8560e4406c5053b142686ed28ba5404
Signed-off-by: Shalini Salomi Bodapati <shalini.salomi.bodapati@intel.com>
diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc
index 86f9010..2d1e451 100644
--- a/compiler/utils/x86/assembler_x86.cc
+++ b/compiler/utils/x86/assembler_x86.cc
@@ -59,6 +59,98 @@
   }
 }
 
+uint8_t X86Assembler::EmitVexByteZero(bool is_two_byte) {
+  uint8_t vex_zero = 0;  // Escape byte: 0xC4 selects the 3-byte VEX form, 0xC5 the 2-byte form.
+  if (!is_two_byte) {
+    vex_zero |= 0xC4;
+  } else {
+    vex_zero |= 0xC5;
+  }
+  return vex_zero;
+}
+
+uint8_t X86Assembler::EmitVexByte1(bool r, bool x, bool b, int mmmmm) {
+  // VEX Byte 1: bit 7 = ~R, bit 6 = ~X, bit 5 = ~B, bits 4-0 = m-mmmm.
+  uint8_t vex_prefix = 0;
+  if (!r) {
+    vex_prefix |= 0x80;  // VEX.R (stored inverted)
+  }
+  if (!x) {
+    vex_prefix |= 0x40;  // VEX.X (stored inverted)
+  }
+  if (!b) {
+    vex_prefix |= 0x20;  // VEX.B (stored inverted)
+  }
+
+  // VEX.mmmmm selects the implied leading opcode bytes.
+  switch (mmmmm) {
+  case 1:
+    // implied 0F leading opcode byte
+    vex_prefix |= 0x01;
+    break;
+  case 2:
+    // implied leading 0F 38 opcode bytes
+    vex_prefix |= 0x02;
+    break;
+  case 3:
+    // implied leading 0F 3A opcode bytes
+    vex_prefix |= 0x03;
+    break;
+  default:
+    LOG(FATAL) << "unknown opcode bytes";
+  }
+  return vex_prefix;
+}
+
+uint8_t X86Assembler::EmitVexByte2(bool w, int l, X86ManagedRegister operand, int pp) {
+  uint8_t vex_prefix = 0;
+  // VEX Byte 2: bit 7 = W, bits 6-3 = ~vvvv, bit 2 = L, bits 1-0 = pp.
+  if (w) {
+    vex_prefix |= 0x80;
+  }
+  // VEX.vvvv encodes an extra register operand, stored in inverted (one's complement) form.
+  if (operand.IsXmmRegister()) {
+    XmmRegister vvvv = operand.AsXmmRegister();
+    int inverted_reg = 15-static_cast<int>(vvvv);
+    uint8_t reg = static_cast<uint8_t>(inverted_reg);
+    vex_prefix |= ((reg & 0x0F) << 3);
+  } else if (operand.IsCpuRegister()) {
+    Register vvvv = operand.AsCpuRegister();
+    int inverted_reg = 15 - static_cast<int>(vvvv);
+    uint8_t reg = static_cast<uint8_t>(inverted_reg);
+    vex_prefix |= ((reg & 0x0F) << 3);
+  }
+
+  // VEX.L: vector length; 0 = scalar/128-bit, 1 = 256-bit.
+  if (l == 256) {
+    vex_prefix |= 0x04;
+  }
+
+  // VEX.pp encodes the implied SIMD prefix byte.
+  switch (pp) {
+  case 0:
+    // SIMD Prefix - None
+    vex_prefix |= 0x00;
+    break;
+  case 1:
+    // SIMD Prefix - 66
+    vex_prefix |= 0x01;
+    break;
+  case 2:
+    // SIMD Prefix - F3
+    vex_prefix |= 0x02;
+    break;
+  case 3:
+    // SIMD Prefix - F2
+    vex_prefix |= 0x03;
+    break;
+  default:
+    LOG(FATAL) << "unknown SIMD Prefix";
+  }
+
+  return vex_prefix;
+}
+
 void X86Assembler::call(Register reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xFF);
@@ -179,6 +271,60 @@
   EmitOperand(src, dst);
 }
 
+void X86Assembler::blsi(Register dst, Register src) {  // BLSI (BMI1): VEX.NDD.LZ.0F38.W0 F3 /3
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  uint8_t byte_zero = EmitVexByteZero(/*is_two_byte=*/ false);  // 3-byte VEX escape (0xC4)
+  uint8_t byte_one = EmitVexByte1(/*r=*/ false,
+                                  /*x=*/ false,
+                                  /*b=*/ false,
+                                  /*mmmmm=*/ 2);  // implied leading 0F 38 opcode bytes
+  uint8_t byte_two = EmitVexByte2(/*w=*/ false,
+                                  /*l=*/ 128,  // VEX.LZ (128 -> L bit clear)
+                                  X86ManagedRegister::FromCpuRegister(dst),  // dst is encoded in VEX.vvvv
+                                  /*pp=*/ 0);  // no implied SIMD prefix
+  EmitUint8(byte_zero);
+  EmitUint8(byte_one);
+  EmitUint8(byte_two);
+  EmitUint8(0xF3);  // opcode
+  EmitRegisterOperand(3, src);  // modrm.reg = /3 opcode extension, modrm.rm = src
+}
+
+void X86Assembler::blsmsk(Register dst, Register src) {  // BLSMSK (BMI1): VEX.NDD.LZ.0F38.W0 F3 /2
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  uint8_t byte_zero = EmitVexByteZero(/*is_two_byte=*/ false);  // 3-byte VEX escape (0xC4)
+  uint8_t byte_one = EmitVexByte1(/*r=*/ false,
+                                  /*x=*/ false,
+                                  /*b=*/ false,
+                                  /*mmmmm=*/ 2);  // implied leading 0F 38 opcode bytes
+  uint8_t byte_two = EmitVexByte2(/*w=*/ false,
+                                  /*l=*/ 128,  // VEX.LZ (128 -> L bit clear)
+                                  X86ManagedRegister::FromCpuRegister(dst),  // dst is encoded in VEX.vvvv
+                                  /*pp=*/ 0);  // no implied SIMD prefix
+  EmitUint8(byte_zero);
+  EmitUint8(byte_one);
+  EmitUint8(byte_two);
+  EmitUint8(0xF3);  // opcode
+  EmitRegisterOperand(2, src);  // modrm.reg = /2 opcode extension, modrm.rm = src
+}
+
+void X86Assembler::blsr(Register dst, Register src) {  // BLSR (BMI1): VEX.NDD.LZ.0F38.W0 F3 /1
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  uint8_t byte_zero = EmitVexByteZero(/*is_two_byte=*/ false);  // 3-byte VEX escape (0xC4)
+  uint8_t byte_one = EmitVexByte1(/*r=*/ false,
+                                  /*x=*/ false,
+                                  /*b=*/ false,
+                                  /*mmmmm=*/ 2);  // implied leading 0F 38 opcode bytes
+  uint8_t byte_two = EmitVexByte2(/*w=*/ false,
+                                  /*l=*/ 128,  // VEX.LZ (128 -> L bit clear)
+                                  X86ManagedRegister::FromCpuRegister(dst),  // dst is encoded in VEX.vvvv
+                                  /*pp=*/ 0);  // no implied SIMD prefix
+  EmitUint8(byte_zero);
+  EmitUint8(byte_one);
+  EmitUint8(byte_two);
+  EmitUint8(0xF3);  // opcode
+  EmitRegisterOperand(1, src);  // modrm.reg = /1 opcode extension, modrm.rm = src
+}
+
 void X86Assembler::bswapl(Register dst) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x0F);
@@ -1267,6 +1413,25 @@
   EmitXmmRegisterOperand(dst, src);
 }
 
+void X86Assembler::andn(Register dst, Register src1, Register src2) {  // ANDN (BMI1): VEX.NDS.LZ.0F38.W0 F2 /r
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  uint8_t byte_zero = EmitVexByteZero(/*is_two_byte=*/ false);  // 3-byte VEX escape (0xC4)
+  uint8_t byte_one = EmitVexByte1(/*r=*/ false,
+                                  /*x=*/ false,
+                                  /*b=*/ false,
+                                  /*mmmmm=*/ 2);  // implied leading 0F 38 opcode bytes
+  uint8_t byte_two = EmitVexByte2(/*w=*/ false,
+                                  /*l=*/ 128,  // VEX.LZ (128 -> L bit clear)
+                                  X86ManagedRegister::FromCpuRegister(src1),  // src1 is encoded in VEX.vvvv
+                                  /*pp=*/ 0);  // no implied SIMD prefix
+  EmitUint8(byte_zero);
+  EmitUint8(byte_one);
+  EmitUint8(byte_two);
+  // Opcode field
+  EmitUint8(0xF2);
+  EmitRegisterOperand(dst, src2);  // modrm.reg = dst, modrm.rm = src2
+}
+
 
 void X86Assembler::andnpd(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
diff --git a/compiler/utils/x86/assembler_x86.h b/compiler/utils/x86/assembler_x86.h
index 5ac9236..275e5c1 100644
--- a/compiler/utils/x86/assembler_x86.h
+++ b/compiler/utils/x86/assembler_x86.h
@@ -337,6 +337,10 @@
 
   void movntl(const Address& dst, Register src);
 
+  void blsi(Register dst, Register src);  // no addr variant (for now)
+  void blsmsk(Register dst, Register src);  // no addr variant (for now)
+  void blsr(Register dst, Register src);  // no addr variant (for now)
+
   void bswapl(Register dst);
 
   void bsfl(Register dst, Register src);
@@ -500,6 +504,7 @@
   void andps(XmmRegister dst, const Address& src);
   void pand(XmmRegister dst, XmmRegister src);  // no addr variant (for now)
 
+  void andn(Register dst, Register src1, Register src2);  // no addr variant (for now)
   void andnpd(XmmRegister dst, XmmRegister src);  // no addr variant (for now)
   void andnps(XmmRegister dst, XmmRegister src);
   void pandn(XmmRegister dst, XmmRegister src);
@@ -837,6 +842,11 @@
   void EmitGenericShift(int rm, const Operand& operand, const Immediate& imm);
   void EmitGenericShift(int rm, const Operand& operand, Register shifter);
 
+  // Emit a 3 byte VEX Prefix
+  uint8_t EmitVexByteZero(bool is_two_byte);
+  uint8_t EmitVexByte1(bool r, bool x, bool b, int mmmmm);
+  uint8_t EmitVexByte2(bool w , int l , X86ManagedRegister operand, int pp);
+
   ConstantArea constant_area_;
 
   DISALLOW_COPY_AND_ASSIGN(X86Assembler);
diff --git a/compiler/utils/x86/assembler_x86_test.cc b/compiler/utils/x86/assembler_x86_test.cc
index ad75174..1d8bfe7 100644
--- a/compiler/utils/x86/assembler_x86_test.cc
+++ b/compiler/utils/x86/assembler_x86_test.cc
@@ -349,6 +349,18 @@
   DriverStr(expected, "rep_movsw");
 }
 
+TEST_F(AssemblerX86Test, Blsmsk) {
+  DriverStr(RepeatRR(&x86::X86Assembler::blsmsk, "blsmsk %{reg2}, %{reg1}"), "blsmsk");
+}
+
+TEST_F(AssemblerX86Test, Blsi) {
+  DriverStr(RepeatRR(&x86::X86Assembler::blsi, "blsi %{reg2}, %{reg1}"), "blsi");
+}
+
+TEST_F(AssemblerX86Test, Blsr) {
+  DriverStr(RepeatRR(&x86::X86Assembler::blsr, "blsr %{reg2}, %{reg1}"), "blsr");
+}
+
 TEST_F(AssemblerX86Test, Bsfl) {
   DriverStr(RepeatRR(&x86::X86Assembler::bsfl, "bsfl %{reg2}, %{reg1}"), "bsfl");
 }
@@ -657,6 +669,10 @@
   DriverStr(RepeatFF(&x86::X86Assembler::pand, "pand %{reg2}, %{reg1}"), "pand");
 }
 
+TEST_F(AssemblerX86Test, Andn) {
+  DriverStr(RepeatRRR(&x86::X86Assembler::andn, "andn %{reg3}, %{reg2}, %{reg1}"), "andn");
+}
+
 TEST_F(AssemblerX86Test, AndnPD) {
   DriverStr(RepeatFF(&x86::X86Assembler::andnpd, "andnpd %{reg2}, %{reg1}"), "andnpd");
 }
diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc
index bd31561..ae68fe9 100644
--- a/compiler/utils/x86_64/assembler_x86_64.cc
+++ b/compiler/utils/x86_64/assembler_x86_64.cc
@@ -64,6 +64,99 @@
   }
 }
 
+uint8_t X86_64Assembler::EmitVexByteZero(bool is_two_byte) {
+  uint8_t vex_zero = 0;  // Escape byte: 0xC4 selects the 3-byte VEX form, 0xC5 the 2-byte form.
+  if (!is_two_byte) {
+    vex_zero |= 0xC4;
+  } else {
+    vex_zero |= 0xC5;
+  }
+  return vex_zero;
+}
+
+uint8_t X86_64Assembler::EmitVexByte1(bool r, bool x, bool b, int mmmmm) {
+  // VEX Byte 1: bit 7 = ~R, bit 6 = ~X, bit 5 = ~B, bits 4-0 = m-mmmm.
+  uint8_t vex_prefix = 0;
+  if (!r) {
+    vex_prefix |= 0x80;  // VEX.R (stored inverted)
+  }
+  if (!x) {
+    vex_prefix |= 0x40;  // VEX.X (stored inverted)
+  }
+  if (!b) {
+    vex_prefix |= 0x20;  // VEX.B (stored inverted)
+  }
+
+  // VEX.mmmmm selects the implied leading opcode bytes.
+  switch (mmmmm) {
+  case 1:
+    // implied 0F leading opcode byte
+    vex_prefix |= 0x01;
+    break;
+  case 2:
+    // implied leading 0F 38 opcode bytes
+    vex_prefix |= 0x02;
+    break;
+  case 3:
+    // implied leading 0F 3A opcode bytes
+    vex_prefix |= 0x03;
+    break;
+  default:
+    LOG(FATAL) << "unknown opcode bytes";
+  }
+
+  return vex_prefix;
+}
+
+uint8_t X86_64Assembler::EmitVexByte2(bool w, int l, X86_64ManagedRegister operand, int pp) {
+  // VEX Byte 2: bit 7 = W, bits 6-3 = ~vvvv, bit 2 = L, bits 1-0 = pp.
+  uint8_t vex_prefix = 0;
+  if (w) {
+    vex_prefix |= 0x80;
+  }
+  // VEX.vvvv encodes an extra register operand, stored in inverted (one's complement) form.
+  if (operand.IsXmmRegister()) {
+    XmmRegister vvvv = operand.AsXmmRegister();
+    int inverted_reg = 15-static_cast<int>(vvvv.AsFloatRegister());
+    uint8_t reg = static_cast<uint8_t>(inverted_reg);
+    vex_prefix |= ((reg & 0x0F) << 3);
+  } else if (operand.IsCpuRegister()) {
+    CpuRegister vvvv = operand.AsCpuRegister();
+    int inverted_reg = 15 - static_cast<int>(vvvv.AsRegister());
+    uint8_t reg = static_cast<uint8_t>(inverted_reg);
+    vex_prefix |= ((reg & 0x0F) << 3);
+  }
+
+  // VEX.L: vector length; 0 = scalar/128-bit, 1 = 256-bit.
+  if (l == 256) {
+    vex_prefix |= 0x04;
+  }
+
+  // VEX.pp encodes the implied SIMD prefix byte.
+  switch (pp) {
+  case 0:
+    // SIMD Prefix - None
+    vex_prefix |= 0x00;
+    break;
+  case 1:
+    // SIMD Prefix - 66
+    vex_prefix |= 0x01;
+    break;
+  case 2:
+    // SIMD Prefix - F3
+    vex_prefix |= 0x02;
+    break;
+  case 3:
+    // SIMD Prefix - F2
+    vex_prefix |= 0x03;
+    break;
+  default:
+    LOG(FATAL) << "unknown SIMD Prefix";
+  }
+
+  return vex_prefix;
+}
+
 void X86_64Assembler::call(CpuRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitOptionalRex32(reg);
@@ -1483,6 +1576,25 @@
   EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
+void X86_64Assembler::andn(CpuRegister dst, CpuRegister src1, CpuRegister src2) {  // ANDN (BMI1): VEX.NDS.LZ.0F38.W1 F2 /r
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  uint8_t byte_zero = EmitVexByteZero(/*is_two_byte=*/ false);  // 3-byte VEX escape (0xC4)
+  uint8_t byte_one = EmitVexByte1(dst.NeedsRex(),  // VEX.R extends modrm.reg (stored inverted)
+                                  /*x=*/ false,
+                                  src2.NeedsRex(),  // VEX.B extends modrm.rm (stored inverted)
+                                  /*mmmmm=*/ 2);  // implied leading 0F 38 opcode bytes
+  uint8_t byte_two = EmitVexByte2(/*w=*/ true,  // VEX.W1: 64-bit operand size
+                                  /*l=*/ 128,  // VEX.LZ (128 -> L bit clear)
+                                  X86_64ManagedRegister::FromCpuRegister(src1.AsRegister()),  // src1 is encoded in VEX.vvvv
+                                  /*pp=*/ 0);  // no implied SIMD prefix
+  EmitUint8(byte_zero);
+  EmitUint8(byte_one);
+  EmitUint8(byte_two);
+  // Opcode field
+  EmitUint8(0xF2);
+  EmitRegisterOperand(dst.LowBits(), src2.LowBits());
+}
+
 void X86_64Assembler::andnpd(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x66);
@@ -3260,6 +3372,60 @@
   EmitUint8(0xC0 + dst.LowBits());
 }
 
+void X86_64Assembler::blsi(CpuRegister dst, CpuRegister src) {  // BLSI (BMI1): VEX.NDD.LZ.0F38.W1 F3 /3
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  uint8_t byte_zero = EmitVexByteZero(/*is_two_byte=*/ false);  // 3-byte VEX escape (0xC4)
+  uint8_t byte_one = EmitVexByte1(/*r=*/ false,  // modrm.reg carries the /3 extension; no R extension needed
+                                  /*x=*/ false,
+                                  src.NeedsRex(),  // VEX.B extends modrm.rm (stored inverted)
+                                  /*mmmmm=*/ 2);  // implied leading 0F 38 opcode bytes
+  uint8_t byte_two = EmitVexByte2(/*w=*/ true,  // VEX.W1: 64-bit operand size
+                                  /*l=*/ 128,  // VEX.LZ (128 -> L bit clear)
+                                  X86_64ManagedRegister::FromCpuRegister(dst.AsRegister()),  // dst is encoded in VEX.vvvv
+                                  /*pp=*/ 0);  // no implied SIMD prefix
+  EmitUint8(byte_zero);
+  EmitUint8(byte_one);
+  EmitUint8(byte_two);
+  EmitUint8(0xF3);  // opcode
+  EmitRegisterOperand(3, src.LowBits());  // modrm.reg = /3 opcode extension, modrm.rm = src
+}
+
+void X86_64Assembler::blsmsk(CpuRegister dst, CpuRegister src) {  // BLSMSK (BMI1): VEX.NDD.LZ.0F38.W1 F3 /2
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  uint8_t byte_zero = EmitVexByteZero(/*is_two_byte=*/ false);  // 3-byte VEX escape (0xC4)
+  uint8_t byte_one = EmitVexByte1(/*r=*/ false,  // modrm.reg carries the /2 extension; no R extension needed
+                                  /*x=*/ false,
+                                  src.NeedsRex(),  // VEX.B extends modrm.rm (stored inverted)
+                                  /*mmmmm=*/ 2);  // implied leading 0F 38 opcode bytes
+  uint8_t byte_two = EmitVexByte2(/*w=*/ true,  // VEX.W1: 64-bit operand size
+                                  /*l=*/ 128,  // VEX.LZ (128 -> L bit clear)
+                                  X86_64ManagedRegister::FromCpuRegister(dst.AsRegister()),  // dst is encoded in VEX.vvvv
+                                  /*pp=*/ 0);  // no implied SIMD prefix
+  EmitUint8(byte_zero);
+  EmitUint8(byte_one);
+  EmitUint8(byte_two);
+  EmitUint8(0xF3);  // opcode
+  EmitRegisterOperand(2, src.LowBits());  // modrm.reg = /2 opcode extension, modrm.rm = src
+}
+
+void X86_64Assembler::blsr(CpuRegister dst, CpuRegister src) {  // BLSR (BMI1): VEX.NDD.LZ.0F38.W1 F3 /1
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  uint8_t byte_zero = EmitVexByteZero(/*is_two_byte=*/ false);  // 3-byte VEX escape (0xC4)
+  uint8_t byte_one = EmitVexByte1(/*r=*/ false,  // modrm.reg carries the /1 extension; no R extension needed
+                                  /*x=*/ false,
+                                  src.NeedsRex(),  // VEX.B extends modrm.rm (stored inverted)
+                                  /*mmmmm=*/ 2);  // implied leading 0F 38 opcode bytes
+  uint8_t byte_two = EmitVexByte2(/*w=*/ true,  // VEX.W1: 64-bit operand size
+                                  /*l=*/ 128,  // VEX.LZ (128 -> L bit clear)
+                                  X86_64ManagedRegister::FromCpuRegister(dst.AsRegister()),  // dst is encoded in VEX.vvvv
+                                  /*pp=*/ 0);  // no implied SIMD prefix
+  EmitUint8(byte_zero);
+  EmitUint8(byte_one);
+  EmitUint8(byte_two);
+  EmitUint8(0xF3);  // opcode
+  EmitRegisterOperand(1, src.LowBits());  // modrm.reg = /1 opcode extension, modrm.rm = src
+}
+
 void X86_64Assembler::bswapl(CpuRegister dst) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitOptionalRex(false, false, false, false, dst.NeedsRex());
diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h
index e696635..ff13ea3 100644
--- a/compiler/utils/x86_64/assembler_x86_64.h
+++ b/compiler/utils/x86_64/assembler_x86_64.h
@@ -543,6 +543,7 @@
   void andps(XmmRegister dst, XmmRegister src);  // no addr variant (for now)
   void pand(XmmRegister dst, XmmRegister src);
 
+  void andn(CpuRegister dst, CpuRegister src1, CpuRegister src2);
   void andnpd(XmmRegister dst, XmmRegister src);  // no addr variant (for now)
   void andnps(XmmRegister dst, XmmRegister src);
   void pandn(XmmRegister dst, XmmRegister src);
@@ -796,6 +797,10 @@
   void bsfq(CpuRegister dst, CpuRegister src);
   void bsfq(CpuRegister dst, const Address& src);
 
+  void blsi(CpuRegister dst, CpuRegister src);  // no addr variant (for now)
+  void blsmsk(CpuRegister dst, CpuRegister src);  // no addr variant (for now)
+  void blsr(CpuRegister dst, CpuRegister src);  // no addr variant (for now)
+
   void bsrl(CpuRegister dst, CpuRegister src);
   void bsrl(CpuRegister dst, const Address& src);
   void bsrq(CpuRegister dst, CpuRegister src);
@@ -951,6 +956,11 @@
   void EmitOptionalByteRegNormalizingRex32(CpuRegister dst, CpuRegister src);
   void EmitOptionalByteRegNormalizingRex32(CpuRegister dst, const Operand& operand);
 
+  // Emit a 3 byte VEX Prefix
+  uint8_t EmitVexByteZero(bool is_two_byte);
+  uint8_t EmitVexByte1(bool r, bool x, bool b, int mmmmm);
+  uint8_t EmitVexByte2(bool w , int l , X86_64ManagedRegister operand, int pp);
+
   ConstantArea constant_area_;
 
   DISALLOW_COPY_AND_ASSIGN(X86_64Assembler);
diff --git a/compiler/utils/x86_64/assembler_x86_64_test.cc b/compiler/utils/x86_64/assembler_x86_64_test.cc
index fe42f9b..528e037 100644
--- a/compiler/utils/x86_64/assembler_x86_64_test.cc
+++ b/compiler/utils/x86_64/assembler_x86_64_test.cc
@@ -1414,7 +1414,9 @@
 TEST_F(AssemblerX86_64Test, Pand) {
   DriverStr(RepeatFF(&x86_64::X86_64Assembler::pand, "pand %{reg2}, %{reg1}"), "pand");
 }
-
+TEST_F(AssemblerX86_64Test, Andn) {
+  DriverStr(RepeatRRR(&x86_64::X86_64Assembler::andn, "andn %{reg3}, %{reg2}, %{reg1}"), "andn");
+}
 TEST_F(AssemblerX86_64Test, andnpd) {
   DriverStr(RepeatFF(&x86_64::X86_64Assembler::andnpd, "andnpd %{reg2}, %{reg1}"), "andnpd");
 }
@@ -1785,6 +1787,18 @@
   DriverFn(&ret_and_leave_fn, "retleave");
 }
 
+TEST_F(AssemblerX86_64Test, Blsmsk) {
+  DriverStr(RepeatRR(&x86_64::X86_64Assembler::blsmsk, "blsmsk %{reg2}, %{reg1}"), "blsmsk");
+}
+
+TEST_F(AssemblerX86_64Test, Blsi) {
+  DriverStr(RepeatRR(&x86_64::X86_64Assembler::blsi, "blsi %{reg2}, %{reg1}"), "blsi");
+}
+
+TEST_F(AssemblerX86_64Test, Blsr) {
+  DriverStr(RepeatRR(&x86_64::X86_64Assembler::blsr, "blsr %{reg2}, %{reg1}"), "blsr");
+}
+
 TEST_F(AssemblerX86_64Test, Bswapl) {
   DriverStr(Repeatr(&x86_64::X86_64Assembler::bswapl, "bswap %{reg}"), "bswapl");
 }