Implement inlined shift long for 32bit

Added support for x86 inlined shift long for 32bit

Change-Id: I6caef60dd7d80227c3057fd6f64b0ecb11025afa
Signed-off-by: Yixin Shou <yixin.shou@intel.com>
diff --git a/compiler/dex/quick/x86/assemble_x86.cc b/compiler/dex/quick/x86/assemble_x86.cc
index 93b7999..6173163 100644
--- a/compiler/dex/quick/x86/assemble_x86.cc
+++ b/compiler/dex/quick/x86/assemble_x86.cc
@@ -263,8 +263,10 @@
 
   { kX86Cmc, kNullary, NO_OPERAND, { 0, 0, 0xF5, 0, 0, 0, 0, 0, false }, "Cmc", "" },
   { kX86Shld32RRI,  kRegRegImmStore, IS_TERTIARY_OP | REG_DEF0_USE01  | SETS_CCODES,            { 0,    0, 0x0F, 0xA4, 0, 0, 0, 1, false }, "Shld32RRI", "!0r,!1r,!2d" },
+  { kX86Shld32RRC,  kShiftRegRegCl,  IS_TERTIARY_OP | REG_DEF0_USE01  | REG_USEC | SETS_CCODES, { 0,    0, 0x0F, 0xA5, 0, 0, 0, 0, false }, "Shld32RRC", "!0r,!1r,cl" },
   { kX86Shld32MRI,  kMemRegImm,      IS_QUAD_OP | REG_USE02 | IS_LOAD | IS_STORE | SETS_CCODES, { 0,    0, 0x0F, 0xA4, 0, 0, 0, 1, false }, "Shld32MRI", "[!0r+!1d],!2r,!3d" },
   { kX86Shrd32RRI,  kRegRegImmStore, IS_TERTIARY_OP | REG_DEF0_USE01  | SETS_CCODES,            { 0,    0, 0x0F, 0xAC, 0, 0, 0, 1, false }, "Shrd32RRI", "!0r,!1r,!2d" },
+  { kX86Shrd32RRC,  kShiftRegRegCl,  IS_TERTIARY_OP | REG_DEF0_USE01  | REG_USEC | SETS_CCODES, { 0,    0, 0x0F, 0xAD, 0, 0, 0, 0, false }, "Shrd32RRC", "!0r,!1r,cl" },
   { kX86Shrd32MRI,  kMemRegImm,      IS_QUAD_OP | REG_USE02 | IS_LOAD | IS_STORE | SETS_CCODES, { 0,    0, 0x0F, 0xAC, 0, 0, 0, 1, false }, "Shrd32MRI", "[!0r+!1d],!2r,!3d" },
   { kX86Shld64RRI,  kRegRegImmStore, IS_TERTIARY_OP | REG_DEF0_USE01  | SETS_CCODES,            { REX_W,    0, 0x0F, 0xA4, 0, 0, 0, 1, false }, "Shld64RRI", "!0r,!1r,!2d" },
   { kX86Shld64MRI,  kMemRegImm,      IS_QUAD_OP | REG_USE02 | IS_LOAD | IS_STORE | SETS_CCODES, { REX_W,    0, 0x0F, 0xA4, 0, 0, 0, 1, false }, "Shld64MRI", "[!0r+!1d],!2r,!3d" },
@@ -591,6 +593,7 @@
     case kShiftRegCl: return true;
     case kRegCond: return true;
     case kRegRegCond: return true;
+    case kShiftRegRegCl: return true;
     case kJmp:
       switch (entry->opcode) {
         case kX86JmpR: return true;
@@ -768,6 +771,9 @@
       DCHECK_EQ(rs_rCX.GetRegNum(), RegStorage::RegNum(lir->operands[4]));
       return ComputeSize(entry, lir->operands[4], lir->operands[1], lir->operands[0],
                          lir->operands[3]);
+    case kShiftRegRegCl:  // lir operands - 0: reg1, 1: reg2, 2: cl
+      DCHECK_EQ(rs_rCX.GetRegNum(), RegStorage::RegNum(lir->operands[2]));
+      return ComputeSize(entry, lir->operands[0], NO_REG, lir->operands[1], 0);
     case kRegCond:  // lir operands - 0: reg, 1: cond
       return ComputeSize(entry, NO_REG, NO_REG, lir->operands[0], 0);
     case kMemCond:  // lir operands - 0: base, 1: disp, 2: cond
@@ -1336,6 +1342,19 @@
   DCHECK_EQ(0, entry->skeleton.immediate_bytes);
 }
 
+void X86Mir2Lir::EmitShiftRegRegCl(const X86EncodingMap* entry, int32_t raw_reg1, int32_t raw_reg2, int32_t raw_cl) {
+  DCHECK_EQ(false, entry->skeleton.r8_form);
+  DCHECK_EQ(rs_rCX.GetRegNum(), RegStorage::RegNum(raw_cl));
+  EmitPrefixAndOpcode(entry, raw_reg1, NO_REG, raw_reg2);
+  uint8_t low_reg1 = LowRegisterBits(raw_reg1);
+  uint8_t low_reg2 = LowRegisterBits(raw_reg2);
+  uint8_t modrm = (3 << 6) | (low_reg1 << 3) | low_reg2;
+  code_buffer_.push_back(modrm);
+  DCHECK_EQ(0, entry->skeleton.modrm_opcode);
+  DCHECK_EQ(0, entry->skeleton.ax_opcode);
+  DCHECK_EQ(0, entry->skeleton.immediate_bytes);
+}
+
 void X86Mir2Lir::EmitShiftMemImm(const X86EncodingMap* entry, int32_t raw_base, int32_t disp,
                                  int32_t imm) {
   DCHECK_EQ(false, entry->skeleton.r8_form);
@@ -1829,6 +1848,9 @@
       case kShiftMemCl:  // lir operands - 0: base, 1:displacement, 2: cl
         EmitShiftMemCl(entry, lir->operands[0], lir->operands[1], lir->operands[2]);
         break;
+      case kShiftRegRegCl:  // lir operands - 0: reg1, 1: reg2, 2: cl
+        EmitShiftRegRegCl(entry, lir->operands[1], lir->operands[0], lir->operands[2]);
+        break;
       case kRegCond:  // lir operands - 0: reg, 1: condition
         EmitRegCond(entry, lir->operands[0], lir->operands[1]);
         break;
diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h
index 1f5b350..7d1e20e 100644
--- a/compiler/dex/quick/x86/codegen_x86.h
+++ b/compiler/dex/quick/x86/codegen_x86.h
@@ -457,6 +457,8 @@
   void EmitShiftRegImm(const X86EncodingMap* entry, int32_t raw_reg, int32_t imm);
   void EmitShiftRegCl(const X86EncodingMap* entry, int32_t raw_reg, int32_t raw_cl);
   void EmitShiftMemCl(const X86EncodingMap* entry, int32_t raw_base, int32_t disp, int32_t raw_cl);
+  void EmitShiftRegRegCl(const X86EncodingMap* entry, int32_t raw_reg1, int32_t raw_reg2,
+                         int32_t raw_cl);
   void EmitShiftMemImm(const X86EncodingMap* entry, int32_t raw_base, int32_t disp, int32_t imm);
   void EmitRegCond(const X86EncodingMap* entry, int32_t raw_reg, int32_t cc);
   void EmitMemCond(const X86EncodingMap* entry, int32_t raw_base, int32_t disp, int32_t cc);
@@ -478,8 +480,10 @@
   void GenConstWide(RegLocation rl_dest, int64_t value);
   void GenMultiplyVectorSignedByte(BasicBlock *bb, MIR *mir);
   void GenShiftByteVector(BasicBlock *bb, MIR *mir);
-  void AndMaskVectorRegister(RegStorage rs_src1, uint32_t m1, uint32_t m2, uint32_t m3, uint32_t m4);
-  void MaskVectorRegister(X86OpCode opcode, RegStorage rs_src1, uint32_t m1, uint32_t m2, uint32_t m3, uint32_t m4);
+  void AndMaskVectorRegister(RegStorage rs_src1, uint32_t m1, uint32_t m2, uint32_t m3,
+                             uint32_t m4);
+  void MaskVectorRegister(X86OpCode opcode, RegStorage rs_src1, uint32_t m1, uint32_t m2,
+                          uint32_t m3, uint32_t m4);
   void AppendOpcodeWithConst(X86OpCode opcode, int reg, MIR* mir);
 
   static bool ProvidesFullMemoryBarrier(X86OpCode opcode);
@@ -551,7 +555,8 @@
   void GenMoveVector(BasicBlock *bb, MIR *mir);
 
   /*
-   * @brief Packed multiply of units in two vector registers: vB = vB .* @note vC using vA to know the type of the vector.
+   * @brief Packed multiply of units in two vector registers: vB = vB .* @note vC using vA to know
+   * the type of the vector.
    * @param bb The basic block in which the MIR is from.
    * @param mir The MIR whose opcode is kMirConstVector.
    * @note vA: TypeSize
@@ -561,7 +566,8 @@
   void GenMultiplyVector(BasicBlock *bb, MIR *mir);
 
   /*
-   * @brief Packed addition of units in two vector registers: vB = vB .+ vC using vA to know the type of the vector.
+   * @brief Packed addition of units in two vector registers: vB = vB .+ vC using vA to know the
+   * type of the vector.
    * @param bb The basic block in which the MIR is from.
    * @param mir The MIR whose opcode is kMirConstVector.
    * @note vA: TypeSize
@@ -571,7 +577,8 @@
   void GenAddVector(BasicBlock *bb, MIR *mir);
 
   /*
-   * @brief Packed subtraction of units in two vector registers: vB = vB .- vC using vA to know the type of the vector.
+   * @brief Packed subtraction of units in two vector registers: vB = vB .- vC using vA to know the
+   * type of the vector.
    * @param bb The basic block in which the MIR is from.
    * @param mir The MIR whose opcode is kMirConstVector.
    * @note vA: TypeSize
@@ -581,7 +588,8 @@
   void GenSubtractVector(BasicBlock *bb, MIR *mir);
 
   /*
-   * @brief Packed shift left of units in two vector registers: vB = vB .<< vC using vA to know the type of the vector.
+   * @brief Packed shift left of units in two vector registers: vB = vB .<< vC using vA to know the
+   * type of the vector.
    * @param bb The basic block in which the MIR is from.
    * @param mir The MIR whose opcode is kMirConstVector.
    * @note vA: TypeSize
@@ -591,7 +599,8 @@
   void GenShiftLeftVector(BasicBlock *bb, MIR *mir);
 
   /*
-   * @brief Packed signed shift right of units in two vector registers: vB = vB .>> vC using vA to know the type of the vector.
+   * @brief Packed signed shift right of units in two vector registers: vB = vB .>> vC using vA to
+   * know the type of the vector.
    * @param bb The basic block in which the MIR is from.
    * @param mir The MIR whose opcode is kMirConstVector.
    * @note vA: TypeSize
@@ -601,7 +610,8 @@
   void GenSignedShiftRightVector(BasicBlock *bb, MIR *mir);
 
   /*
-   * @brief Packed unsigned shift right of units in two vector registers: vB = vB .>>> vC using vA to know the type of the vector.
+   * @brief Packed unsigned shift right of units in two vector registers: vB = vB .>>> vC using vA
+   * to know the type of the vector.
    * @param bb The basic block in which the MIR is from..
    * @param mir The MIR whose opcode is kMirConstVector.
    * @note vA: TypeSize
@@ -611,7 +621,8 @@
   void GenUnsignedShiftRightVector(BasicBlock *bb, MIR *mir);
 
   /*
-   * @brief Packed bitwise and of units in two vector registers: vB = vB .& vC using vA to know the type of the vector.
+   * @brief Packed bitwise and of units in two vector registers: vB = vB .& vC using vA to know the
+   * type of the vector.
    * @note vA: TypeSize
    * @note vB: destination and source
    * @note vC: source
@@ -619,7 +630,8 @@
   void GenAndVector(BasicBlock *bb, MIR *mir);
 
   /*
-   * @brief Packed bitwise or of units in two vector registers: vB = vB .| vC using vA to know the type of the vector.
+   * @brief Packed bitwise or of units in two vector registers: vB = vB .| vC using vA to know the
+   * type of the vector.
    * @param bb The basic block in which the MIR is from.
    * @param mir The MIR whose opcode is kMirConstVector.
    * @note vA: TypeSize
@@ -629,7 +641,8 @@
   void GenOrVector(BasicBlock *bb, MIR *mir);
 
   /*
-   * @brief Packed bitwise xor of units in two vector registers: vB = vB .^ vC using vA to know the type of the vector.
+   * @brief Packed bitwise xor of units in two vector registers: vB = vB .^ vC using vA to know the
+   * type of the vector.
    * @param bb The basic block in which the MIR is from.
    * @param mir The MIR whose opcode is kMirConstVector.
    * @note vA: TypeSize
diff --git a/compiler/dex/quick/x86/int_x86.cc b/compiler/dex/quick/x86/int_x86.cc
index 7ba9570..cc51538 100755
--- a/compiler/dex/quick/x86/int_x86.cc
+++ b/compiler/dex/quick/x86/int_x86.cc
@@ -3140,7 +3140,53 @@
 void X86Mir2Lir::GenShiftOpLong(Instruction::Code opcode, RegLocation rl_dest,
                         RegLocation rl_src1, RegLocation rl_shift) {
   if (!cu_->target64) {
-    Mir2Lir::GenShiftOpLong(opcode, rl_dest, rl_src1, rl_shift);
+    // Long shift operations in 32-bit. Use shld or shrd to create a 32-bit register filled from
+    // the other half, shift the other half, if the shift amount is less than 32 we're done,
+    // otherwise move one register to the other and place zero or sign bits in the other.
+    LIR* branch;
+    FlushAllRegs();
+    LockCallTemps();
+    LoadValueDirectFixed(rl_shift, rs_rCX);
+    RegStorage r_tmp = RegStorage::MakeRegPair(rs_rAX, rs_rDX);
+    LoadValueDirectWideFixed(rl_src1, r_tmp);
+    switch (opcode) {
+      case Instruction::SHL_LONG:
+      case Instruction::SHL_LONG_2ADDR:
+        NewLIR3(kX86Shld32RRC, r_tmp.GetHighReg(), r_tmp.GetLowReg(), rs_rCX.GetReg());
+        NewLIR2(kX86Sal32RC, r_tmp.GetLowReg(), rs_rCX.GetReg());
+        NewLIR2(kX86Test8RI, rs_rCX.GetReg(), 32);
+        branch = NewLIR2(kX86Jcc8, 0, kX86CondZ);
+        OpRegCopy(r_tmp.GetHigh(), r_tmp.GetLow());
+        LoadConstant(r_tmp.GetLow(), 0);
+        branch->target = NewLIR0(kPseudoTargetLabel);
+        break;
+      case Instruction::SHR_LONG:
+      case Instruction::SHR_LONG_2ADDR:
+        NewLIR3(kX86Shrd32RRC, r_tmp.GetLowReg(), r_tmp.GetHighReg(), rs_rCX.GetReg());
+        NewLIR2(kX86Sar32RC, r_tmp.GetHighReg(), rs_rCX.GetReg());
+        NewLIR2(kX86Test8RI, rs_rCX.GetReg(), 32);
+        branch = NewLIR2(kX86Jcc8, 0, kX86CondZ);
+        OpRegCopy(r_tmp.GetLow(), r_tmp.GetHigh());
+        NewLIR2(kX86Sar32RI, r_tmp.GetHighReg(), 31);
+        branch->target = NewLIR0(kPseudoTargetLabel);
+        break;
+      case Instruction::USHR_LONG:
+      case Instruction::USHR_LONG_2ADDR:
+        NewLIR3(kX86Shrd32RRC, r_tmp.GetLowReg(), r_tmp.GetHighReg(),
+               rs_rCX.GetReg());
+        NewLIR2(kX86Shr32RC, r_tmp.GetHighReg(), rs_rCX.GetReg());
+        NewLIR2(kX86Test8RI, rs_rCX.GetReg(), 32);
+        branch = NewLIR2(kX86Jcc8, 0, kX86CondZ);
+        OpRegCopy(r_tmp.GetLow(), r_tmp.GetHigh());
+        LoadConstant(r_tmp.GetHigh(), 0);
+        branch->target = NewLIR0(kPseudoTargetLabel);
+        break;
+      default:
+        LOG(FATAL) << "Unexpected case: " << opcode;
+        return;
+    }
+    RegLocation rl_result = LocCReturnWide();
+    StoreValueWide(rl_dest, rl_result);
     return;
   }
 
diff --git a/compiler/dex/quick/x86/x86_lir.h b/compiler/dex/quick/x86/x86_lir.h
index 500c6b8..9620cd1 100644
--- a/compiler/dex/quick/x86/x86_lir.h
+++ b/compiler/dex/quick/x86/x86_lir.h
@@ -484,8 +484,10 @@
 #undef BinaryShiftOpcode
   kX86Cmc,
   kX86Shld32RRI,
+  kX86Shld32RRC,
   kX86Shld32MRI,
   kX86Shrd32RRI,
+  kX86Shrd32RRC,
   kX86Shrd32MRI,
   kX86Shld64RRI,
   kX86Shld64MRI,
@@ -675,6 +677,7 @@
   kMemRegImm,                               // MRI instruction kinds.
   kShiftRegImm, kShiftMemImm, kShiftArrayImm,  // Shift opcode with immediate.
   kShiftRegCl, kShiftMemCl, kShiftArrayCl,     // Shift opcode with register CL.
+  kShiftRegRegCl,
   // kRegRegReg, kRegRegMem, kRegRegArray,    // RRR, RRM, RRA instruction kinds.
   kRegCond, kMemCond, kArrayCond,          // R, M, A instruction kinds following by a condition.
   kRegRegCond,                             // RR instruction kind followed by a condition.
diff --git a/disassembler/disassembler_x86.cc b/disassembler/disassembler_x86.cc
index 0ca8962..0bf758e 100644
--- a/disassembler/disassembler_x86.cc
+++ b/disassembler/disassembler_x86.cc
@@ -702,12 +702,24 @@
         load = true;
         immediate_bytes = 1;
         break;
+      case 0xA5:
+        opcode << "shld";
+        has_modrm = true;
+        load = true;
+        cx = true;
+        break;
       case 0xAC:
         opcode << "shrd";
         has_modrm = true;
         load = true;
         immediate_bytes = 1;
         break;
+      case 0xAD:
+        opcode << "shrd";
+        has_modrm = true;
+        load = true;
+        cx = true;
+        break;
       case 0xAE:
         if (prefix[0] == 0xF3) {
           prefix[0] = 0;  // clear prefix now it's served its purpose as part of the opcode