Add X86 bsf and rotate instructions

These are for use in new intrinsics.  Bsf (Bit Scan Forward) is used in
{Long,Integer}NumberOfTrailingZeros and the rotates are used in
{Long,Integer}Rotate{Left,Right}.

Change-Id: Icb599d7e1eec4e4ea9e5b4f0b1654c7b8d4de678
Signed-off-by: Mark Mendell <mark.p.mendell@intel.com>
diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc
index e3962b4..04e815a 100644
--- a/compiler/utils/x86/assembler_x86.cc
+++ b/compiler/utils/x86/assembler_x86.cc
@@ -158,6 +158,20 @@
   EmitUint8(0xC8 + dst);
 }
 
+void X86Assembler::bsfl(Register dst, Register src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0xBC);
+  EmitRegisterOperand(dst, src);
+}
+
+void X86Assembler::bsfl(Register dst, const Address& src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0xBC);
+  EmitOperand(dst, src);
+}
+
 void X86Assembler::bsrl(Register dst, Register src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x0F);
@@ -1423,6 +1437,26 @@
 }
 
 
+void X86Assembler::roll(Register reg, const Immediate& imm) {
+  EmitGenericShift(0, Operand(reg), imm);
+}
+
+
+void X86Assembler::roll(Register operand, Register shifter) {
+  EmitGenericShift(0, Operand(operand), shifter);
+}
+
+
+void X86Assembler::rorl(Register reg, const Immediate& imm) {
+  EmitGenericShift(1, Operand(reg), imm);
+}
+
+
+void X86Assembler::rorl(Register operand, Register shifter) {
+  EmitGenericShift(1, Operand(operand), shifter);
+}
+
+
 void X86Assembler::negl(Register reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF7);
diff --git a/compiler/utils/x86/assembler_x86.h b/compiler/utils/x86/assembler_x86.h
index 7d7b3d3..af78663 100644
--- a/compiler/utils/x86/assembler_x86.h
+++ b/compiler/utils/x86/assembler_x86.h
@@ -319,9 +319,16 @@
   void movntl(const Address& dst, Register src);
 
   void bswapl(Register dst);
+  void bsfl(Register dst, Register src);
+  void bsfl(Register dst, const Address& src);
   void bsrl(Register dst, Register src);
   void bsrl(Register dst, const Address& src);
 
+  void rorl(Register reg, const Immediate& imm);
+  void rorl(Register operand, Register shifter);
+  void roll(Register reg, const Immediate& imm);
+  void roll(Register operand, Register shifter);
+
   void movzxb(Register dst, ByteRegister src);
   void movzxb(Register dst, const Address& src);
   void movsxb(Register dst, ByteRegister src);
diff --git a/compiler/utils/x86/assembler_x86_test.cc b/compiler/utils/x86/assembler_x86_test.cc
index 9ac54af..16f9db4 100644
--- a/compiler/utils/x86/assembler_x86_test.cc
+++ b/compiler/utils/x86/assembler_x86_test.cc
@@ -32,6 +32,10 @@
 
 class AssemblerX86Test : public AssemblerTest<x86::X86Assembler, x86::Register,
                                               x86::XmmRegister, x86::Immediate> {
+ public:
+  typedef AssemblerTest<x86::X86Assembler, x86::Register,
+                         x86::XmmRegister, x86::Immediate> Base;
+
  protected:
   std::string GetArchitectureString() OVERRIDE {
     return "x86";
@@ -230,6 +234,19 @@
   DriverStr(expected, "rep_movsw");
 }
 
+TEST_F(AssemblerX86Test, Bsfl) {
+  DriverStr(RepeatRR(&x86::X86Assembler::bsfl, "bsfl %{reg2}, %{reg1}"), "bsfl");
+}
+
+TEST_F(AssemblerX86Test, BsflAddress) {
+  GetAssembler()->bsfl(x86::Register(x86::EDI), x86::Address(
+      x86::Register(x86::EDI), x86::Register(x86::EBX), x86::TIMES_4, 12));
+  const char* expected =
+    "bsfl 0xc(%EDI,%EBX,4), %EDI\n";
+
+  DriverStr(expected, "bsfl_address");
+}
+
 TEST_F(AssemblerX86Test, Bsrl) {
   DriverStr(RepeatRR(&x86::X86Assembler::bsrl, "bsrl %{reg2}, %{reg1}"), "bsrl");
 }
@@ -243,6 +260,52 @@
   DriverStr(expected, "bsrl_address");
 }
 
+// Rorl only allows CL as the shift count.
+std::string rorl_fn(AssemblerX86Test::Base* assembler_test, x86::X86Assembler* assembler) {
+  std::ostringstream str;
+
+  std::vector<x86::Register*> registers = assembler_test->GetRegisters();
+
+  x86::Register shifter(x86::ECX);
+  for (auto reg : registers) {
+    assembler->rorl(*reg, shifter);
+    str << "rorl %cl, %" << assembler_test->GetRegisterName(*reg) << "\n";
+  }
+
+  return str.str();
+}
+
+TEST_F(AssemblerX86Test, RorlReg) {
+  DriverFn(&rorl_fn, "rorl");
+}
+
+TEST_F(AssemblerX86Test, RorlImm) {
+  DriverStr(RepeatRI(&x86::X86Assembler::rorl, 1U, "rorl ${imm}, %{reg}"), "rorli");
+}
+
+// Roll only allows CL as the shift count.
+std::string roll_fn(AssemblerX86Test::Base* assembler_test, x86::X86Assembler* assembler) {
+  std::ostringstream str;
+
+  std::vector<x86::Register*> registers = assembler_test->GetRegisters();
+
+  x86::Register shifter(x86::ECX);
+  for (auto reg : registers) {
+    assembler->roll(*reg, shifter);
+    str << "roll %cl, %" << assembler_test->GetRegisterName(*reg) << "\n";
+  }
+
+  return str.str();
+}
+
+TEST_F(AssemblerX86Test, RollReg) {
+  DriverFn(&roll_fn, "roll");
+}
+
+TEST_F(AssemblerX86Test, RollImm) {
+  DriverStr(RepeatRI(&x86::X86Assembler::roll, 1U, "roll ${imm}, %{reg}"), "rolli");
+}
+
 /////////////////
 // Near labels //
 /////////////////
diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc
index 88ea990..89d7915 100644
--- a/compiler/utils/x86_64/assembler_x86_64.cc
+++ b/compiler/utils/x86_64/assembler_x86_64.cc
@@ -1866,6 +1866,46 @@
 }
 
 
+void X86_64Assembler::roll(CpuRegister reg, const Immediate& imm) {
+  EmitGenericShift(false, 0, reg, imm);
+}
+
+
+void X86_64Assembler::roll(CpuRegister operand, CpuRegister shifter) {
+  EmitGenericShift(false, 0, operand, shifter);
+}
+
+
+void X86_64Assembler::rorl(CpuRegister reg, const Immediate& imm) {
+  EmitGenericShift(false, 1, reg, imm);
+}
+
+
+void X86_64Assembler::rorl(CpuRegister operand, CpuRegister shifter) {
+  EmitGenericShift(false, 1, operand, shifter);
+}
+
+
+void X86_64Assembler::rolq(CpuRegister reg, const Immediate& imm) {
+  EmitGenericShift(true, 0, reg, imm);
+}
+
+
+void X86_64Assembler::rolq(CpuRegister operand, CpuRegister shifter) {
+  EmitGenericShift(true, 0, operand, shifter);
+}
+
+
+void X86_64Assembler::rorq(CpuRegister reg, const Immediate& imm) {
+  EmitGenericShift(true, 1, reg, imm);
+}
+
+
+void X86_64Assembler::rorq(CpuRegister operand, CpuRegister shifter) {
+  EmitGenericShift(true, 1, operand, shifter);
+}
+
+
 void X86_64Assembler::negl(CpuRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitOptionalRex32(reg);
@@ -2140,6 +2180,38 @@
   EmitUint8(0xC8 + dst.LowBits());
 }
 
+void X86_64Assembler::bsfl(CpuRegister dst, CpuRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst, src);
+  EmitUint8(0x0F);
+  EmitUint8(0xBC);
+  EmitRegisterOperand(dst.LowBits(), src.LowBits());
+}
+
+void X86_64Assembler::bsfl(CpuRegister dst, const Address& src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst, src);
+  EmitUint8(0x0F);
+  EmitUint8(0xBC);
+  EmitOperand(dst.LowBits(), src);
+}
+
+void X86_64Assembler::bsfq(CpuRegister dst, CpuRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitRex64(dst, src);
+  EmitUint8(0x0F);
+  EmitUint8(0xBC);
+  EmitRegisterOperand(dst.LowBits(), src.LowBits());
+}
+
+void X86_64Assembler::bsfq(CpuRegister dst, const Address& src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitRex64(dst, src);
+  EmitUint8(0x0F);
+  EmitUint8(0xBC);
+  EmitOperand(dst.LowBits(), src);
+}
+
 void X86_64Assembler::bsrl(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitOptionalRex32(dst, src);
diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h
index c38aba5..c8875e8 100644
--- a/compiler/utils/x86_64/assembler_x86_64.h
+++ b/compiler/utils/x86_64/assembler_x86_64.h
@@ -633,11 +633,26 @@
   void bswapl(CpuRegister dst);
   void bswapq(CpuRegister dst);
 
+  void bsfl(CpuRegister dst, CpuRegister src);
+  void bsfl(CpuRegister dst, const Address& src);
+  void bsfq(CpuRegister dst, CpuRegister src);
+  void bsfq(CpuRegister dst, const Address& src);
+
   void bsrl(CpuRegister dst, CpuRegister src);
   void bsrl(CpuRegister dst, const Address& src);
   void bsrq(CpuRegister dst, CpuRegister src);
   void bsrq(CpuRegister dst, const Address& src);
 
+  void rorl(CpuRegister reg, const Immediate& imm);
+  void rorl(CpuRegister operand, CpuRegister shifter);
+  void roll(CpuRegister reg, const Immediate& imm);
+  void roll(CpuRegister operand, CpuRegister shifter);
+
+  void rorq(CpuRegister reg, const Immediate& imm);
+  void rorq(CpuRegister operand, CpuRegister shifter);
+  void rolq(CpuRegister reg, const Immediate& imm);
+  void rolq(CpuRegister operand, CpuRegister shifter);
+
   void repne_scasw();
   void repe_cmpsw();
   void repe_cmpsl();
diff --git a/compiler/utils/x86_64/assembler_x86_64_test.cc b/compiler/utils/x86_64/assembler_x86_64_test.cc
index 9e64b47..82378f7 100644
--- a/compiler/utils/x86_64/assembler_x86_64_test.cc
+++ b/compiler/utils/x86_64/assembler_x86_64_test.cc
@@ -495,6 +495,98 @@
   DriverStr(RepeatRI(&x86_64::X86_64Assembler::sarq, 1U, "sarq ${imm}, %{reg}"), "sarqi");
 }
 
+// Rorl only allows CL as the shift count.
+std::string rorl_fn(AssemblerX86_64Test::Base* assembler_test, x86_64::X86_64Assembler* assembler) {
+  std::ostringstream str;
+
+  std::vector<x86_64::CpuRegister*> registers = assembler_test->GetRegisters();
+
+  x86_64::CpuRegister shifter(x86_64::RCX);
+  for (auto reg : registers) {
+    assembler->rorl(*reg, shifter);
+    str << "rorl %cl, %" << assembler_test->GetSecondaryRegisterName(*reg) << "\n";
+  }
+
+  return str.str();
+}
+
+TEST_F(AssemblerX86_64Test, RorlReg) {
+  DriverFn(&rorl_fn, "rorl");
+}
+
+TEST_F(AssemblerX86_64Test, RorlImm) {
+  DriverStr(Repeatri(&x86_64::X86_64Assembler::rorl, 1U, "rorl ${imm}, %{reg}"), "rorli");
+}
+
+// Roll only allows CL as the shift count.
+std::string roll_fn(AssemblerX86_64Test::Base* assembler_test, x86_64::X86_64Assembler* assembler) {
+  std::ostringstream str;
+
+  std::vector<x86_64::CpuRegister*> registers = assembler_test->GetRegisters();
+
+  x86_64::CpuRegister shifter(x86_64::RCX);
+  for (auto reg : registers) {
+    assembler->roll(*reg, shifter);
+    str << "roll %cl, %" << assembler_test->GetSecondaryRegisterName(*reg) << "\n";
+  }
+
+  return str.str();
+}
+
+TEST_F(AssemblerX86_64Test, RollReg) {
+  DriverFn(&roll_fn, "roll");
+}
+
+TEST_F(AssemblerX86_64Test, RollImm) {
+  DriverStr(Repeatri(&x86_64::X86_64Assembler::roll, 1U, "roll ${imm}, %{reg}"), "rolli");
+}
+
+// Rorq only allows CL as the shift count.
+std::string rorq_fn(AssemblerX86_64Test::Base* assembler_test, x86_64::X86_64Assembler* assembler) {
+  std::ostringstream str;
+
+  std::vector<x86_64::CpuRegister*> registers = assembler_test->GetRegisters();
+
+  x86_64::CpuRegister shifter(x86_64::RCX);
+  for (auto reg : registers) {
+    assembler->rorq(*reg, shifter);
+    str << "rorq %cl, %" << assembler_test->GetRegisterName(*reg) << "\n";
+  }
+
+  return str.str();
+}
+
+TEST_F(AssemblerX86_64Test, RorqReg) {
+  DriverFn(&rorq_fn, "rorq");
+}
+
+TEST_F(AssemblerX86_64Test, RorqImm) {
+  DriverStr(RepeatRI(&x86_64::X86_64Assembler::rorq, 1U, "rorq ${imm}, %{reg}"), "rorqi");
+}
+
+// Rolq only allows CL as the shift count.
+std::string rolq_fn(AssemblerX86_64Test::Base* assembler_test, x86_64::X86_64Assembler* assembler) {
+  std::ostringstream str;
+
+  std::vector<x86_64::CpuRegister*> registers = assembler_test->GetRegisters();
+
+  x86_64::CpuRegister shifter(x86_64::RCX);
+  for (auto reg : registers) {
+    assembler->rolq(*reg, shifter);
+    str << "rolq %cl, %" << assembler_test->GetRegisterName(*reg) << "\n";
+  }
+
+  return str.str();
+}
+
+TEST_F(AssemblerX86_64Test, RolqReg) {
+  DriverFn(&rolq_fn, "rolq");
+}
+
+TEST_F(AssemblerX86_64Test, RolqImm) {
+  DriverStr(RepeatRI(&x86_64::X86_64Assembler::rolq, 1U, "rolq ${imm}, %{reg}"), "rolqi");
+}
+
 TEST_F(AssemblerX86_64Test, CmpqRegs) {
   DriverStr(RepeatRR(&x86_64::X86_64Assembler::cmpq, "cmpq %{reg2}, %{reg1}"), "cmpq");
 }
@@ -1141,6 +1233,44 @@
   DriverStr(RepeatR(&x86_64::X86_64Assembler::bswapq, "bswap %{reg}"), "bswapq");
 }
 
+TEST_F(AssemblerX86_64Test, Bsfl) {
+  DriverStr(Repeatrr(&x86_64::X86_64Assembler::bsfl, "bsfl %{reg2}, %{reg1}"), "bsfl");
+}
+
+TEST_F(AssemblerX86_64Test, BsflAddress) {
+  GetAssembler()->bsfl(x86_64::CpuRegister(x86_64::R10), x86_64::Address(
+      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::RBX), x86_64::TIMES_4, 12));
+  GetAssembler()->bsfl(x86_64::CpuRegister(x86_64::RDI), x86_64::Address(
+      x86_64::CpuRegister(x86_64::R10), x86_64::CpuRegister(x86_64::RBX), x86_64::TIMES_4, 12));
+  GetAssembler()->bsfl(x86_64::CpuRegister(x86_64::RDI), x86_64::Address(
+      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_4, 12));
+  const char* expected =
+    "bsfl 0xc(%RDI,%RBX,4), %R10d\n"
+    "bsfl 0xc(%R10,%RBX,4), %edi\n"
+    "bsfl 0xc(%RDI,%R9,4), %edi\n";
+
+  DriverStr(expected, "bsfl_address");
+}
+
+TEST_F(AssemblerX86_64Test, Bsfq) {
+  DriverStr(RepeatRR(&x86_64::X86_64Assembler::bsfq, "bsfq %{reg2}, %{reg1}"), "bsfq");
+}
+
+TEST_F(AssemblerX86_64Test, BsfqAddress) {
+  GetAssembler()->bsfq(x86_64::CpuRegister(x86_64::R10), x86_64::Address(
+      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::RBX), x86_64::TIMES_4, 12));
+  GetAssembler()->bsfq(x86_64::CpuRegister(x86_64::RDI), x86_64::Address(
+      x86_64::CpuRegister(x86_64::R10), x86_64::CpuRegister(x86_64::RBX), x86_64::TIMES_4, 12));
+  GetAssembler()->bsfq(x86_64::CpuRegister(x86_64::RDI), x86_64::Address(
+      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_4, 12));
+  const char* expected =
+    "bsfq 0xc(%RDI,%RBX,4), %R10\n"
+    "bsfq 0xc(%R10,%RBX,4), %RDI\n"
+    "bsfq 0xc(%RDI,%R9,4), %RDI\n";
+
+  DriverStr(expected, "bsfq_address");
+}
+
 TEST_F(AssemblerX86_64Test, Bsrl) {
   DriverStr(Repeatrr(&x86_64::X86_64Assembler::bsrl, "bsrl %{reg2}, %{reg1}"), "bsrl");
 }
diff --git a/disassembler/disassembler_x86.cc b/disassembler/disassembler_x86.cc
index d4574f4..d4bef0f 100644
--- a/disassembler/disassembler_x86.cc
+++ b/disassembler/disassembler_x86.cc
@@ -928,6 +928,11 @@
         has_modrm = true;
         load = true;
         break;
+      case 0xBC:
+        opcode1 = "bsf";
+        has_modrm = true;
+        load = true;
+        break;
       case 0xBD:
         opcode1 = "bsr";
         has_modrm = true;