x86_64: Support r8-r15, xmm8-xmm15 in assembler

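Add REX prefix support to the x86 assembler so that r8-r15 and xmm8-xmm15 can
be encoded. The extended registers are only defined when TARGET_REX_SUPPORT is
set at build time.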

Change-Id: I82b457ff5085c8192ad873923bd939fbb91022ce
Signed-off-by: Dmitry Petrochenko <dmitry.petrochenko@intel.com>
diff --git a/compiler/dex/quick/x86/assemble_x86.cc b/compiler/dex/quick/x86/assemble_x86.cc
index b8222ef..39a0365 100644
--- a/compiler/dex/quick/x86/assemble_x86.cc
+++ b/compiler/dex/quick/x86/assemble_x86.cc
@@ -478,7 +478,8 @@
   { kX86RepneScasw, kPrefix2Nullary, NO_OPERAND | REG_USEA | REG_USEC | SETS_CCODES, { 0x66, 0xF2, 0xAF, 0, 0, 0, 0, 0 }, "RepNE ScasW", "" },
 };
 
-size_t X86Mir2Lir::ComputeSize(const X86EncodingMap* entry, int base, int displacement, bool has_sib) {
+size_t X86Mir2Lir::ComputeSize(const X86EncodingMap* entry, int base, int displacement,
+                               int reg_r, int reg_x, bool has_sib) {
   size_t size = 0;
   if (entry->skeleton.prefix1 > 0) {
     ++size;
@@ -486,6 +487,10 @@
       ++size;
     }
   }
+  if ((NeedsRex(base) || NeedsRex(reg_r) || NeedsRex(reg_x)) &&
+       entry->skeleton.prefix1 != REX_W && entry->skeleton.prefix2 != REX_W) {
+    ++size;  // REX_R
+  }
   ++size;  // opcode
   if (entry->skeleton.opcode == 0x0F) {
     ++size;
@@ -494,13 +499,13 @@
     }
   }
   ++size;  // modrm
-  if (has_sib || RegStorage::RegNum(base) == rs_rX86_SP.GetRegNum()
+  if (has_sib || LowRegisterBits(RegStorage::RegNum(base)) == rs_rX86_SP.GetRegNum()
       || (Gen64Bit() && entry->skeleton.prefix1 == THREAD_PREFIX)) {
     // SP requires a SIB byte.
     // GS access also needs a SIB byte for absolute adressing in 64-bit mode.
     ++size;
   }
-  if (displacement != 0 || RegStorage::RegNum(base) == rs_rBP.GetRegNum()) {
+  if (displacement != 0 || LowRegisterBits(RegStorage::RegNum(base)) == rs_rBP.GetRegNum()) {
     // BP requires an explicit displacement, even when it's 0.
     if (entry->opcode != kX86Lea32RA) {
       DCHECK_NE(entry->flags & (IS_LOAD | IS_STORE), 0ULL) << entry->name;
@@ -524,33 +529,41 @@
     case kPrefix2Nullary:
       return 3;  // 1 byte of opcode + 2 prefixes
     case kRegOpcode:  // lir operands - 0: reg
-      return ComputeSize(entry, 0, 0, false) - 1;  // substract 1 for modrm
+      // subtract 1 for modrm
+      return ComputeSize(entry, 0, 0, lir->operands[0], NO_REG, false) - 1;
     case kReg:  // lir operands - 0: reg
-      return ComputeSize(entry, 0, 0, false);
+      return ComputeSize(entry, 0, 0, lir->operands[0], NO_REG, false);
     case kMem:  // lir operands - 0: base, 1: disp
-      return ComputeSize(entry, lir->operands[0], lir->operands[1], false);
+      return ComputeSize(entry, lir->operands[0], lir->operands[1], NO_REG, NO_REG, false);
     case kArray:  // lir operands - 0: base, 1: index, 2: scale, 3: disp
-      return ComputeSize(entry, lir->operands[0], lir->operands[3], true);
+      return ComputeSize(entry, lir->operands[0], lir->operands[3],
+                         NO_REG, lir->operands[1], true);
     case kMemReg:  // lir operands - 0: base, 1: disp, 2: reg
-      return ComputeSize(entry, lir->operands[0], lir->operands[1], false);
+      return ComputeSize(entry, lir->operands[0], lir->operands[1],
+                         lir->operands[2], NO_REG, false);
     case kMemRegImm:  // lir operands - 0: base, 1: disp, 2: reg 3: immediate
-      return ComputeSize(entry, lir->operands[0], lir->operands[1], false);
+      return ComputeSize(entry, lir->operands[0], lir->operands[1],
+                         lir->operands[2], NO_REG, false);
     case kArrayReg:  // lir operands - 0: base, 1: index, 2: scale, 3: disp, 4: reg
-      return ComputeSize(entry, lir->operands[0], lir->operands[3], true);
+      return ComputeSize(entry, lir->operands[0], lir->operands[3],
+                         lir->operands[4], lir->operands[1], true);
     case kThreadReg:  // lir operands - 0: disp, 1: reg
-      return ComputeSize(entry, 0, lir->operands[0], false);
-    case kRegReg:
-      return ComputeSize(entry, 0, 0, false);
-    case kRegRegStore:
-      return ComputeSize(entry, 0, 0, false);
+      return ComputeSize(entry, 0, lir->operands[0], lir->operands[1], NO_REG, false);
+    case kRegReg:  // lir operands - 0: reg1, 1: reg2
+      return ComputeSize(entry, 0, 0, lir->operands[0], lir->operands[1], false);
+    case kRegRegStore:  // lir operands - 0: reg2, 1: reg1
+      return ComputeSize(entry, 0, 0, lir->operands[1], lir->operands[0], false);
     case kRegMem:  // lir operands - 0: reg, 1: base, 2: disp
-      return ComputeSize(entry, lir->operands[1], lir->operands[2], false);
+      return ComputeSize(entry, lir->operands[1], lir->operands[2],
+                         lir->operands[0], NO_REG, false);
     case kRegArray:   // lir operands - 0: reg, 1: base, 2: index, 3: scale, 4: disp
-      return ComputeSize(entry, lir->operands[1], lir->operands[4], true);
+      return ComputeSize(entry, lir->operands[1], lir->operands[4],
+                         lir->operands[0], lir->operands[2], true);
     case kRegThread:  // lir operands - 0: reg, 1: disp
-      return ComputeSize(entry, 0, 0x12345678, false);  // displacement size is always 32bit
+      // displacement size is always 32bit
+      return ComputeSize(entry, 0, 0x12345678, lir->operands[0], NO_REG, false);
     case kRegImm: {  // lir operands - 0: reg, 1: immediate
-      size_t size = ComputeSize(entry, 0, 0, false);
+      size_t size = ComputeSize(entry, 0, 0, lir->operands[0], NO_REG, false);
       if (entry->skeleton.ax_opcode == 0) {
         return size;
       } else {
@@ -560,47 +573,58 @@
       }
     }
     case kMemImm:  // lir operands - 0: base, 1: disp, 2: immediate
-      return ComputeSize(entry, lir->operands[0], lir->operands[1], false);
+      return ComputeSize(entry, lir->operands[0], lir->operands[1],
+                         NO_REG, lir->operands[0], false);
     case kArrayImm:  // lir operands - 0: base, 1: index, 2: scale, 3: disp 4: immediate
-      return ComputeSize(entry, lir->operands[0], lir->operands[3], true);
+      return ComputeSize(entry, lir->operands[0], lir->operands[3],
+                         NO_REG, lir->operands[1], true);
     case kThreadImm:  // lir operands - 0: disp, 1: imm
-      return ComputeSize(entry, 0, 0x12345678, false);  // displacement size is always 32bit
+      // displacement size is always 32bit
+      return ComputeSize(entry, 0, 0x12345678, NO_REG, NO_REG, false);
     case kRegRegImm:  // lir operands - 0: reg, 1: reg, 2: imm
     case kRegRegImmRev:
-      return ComputeSize(entry, 0, 0, false);
+      return ComputeSize(entry, 0, 0, lir->operands[0], lir->operands[1], false);
     case kRegMemImm:  // lir operands - 0: reg, 1: base, 2: disp, 3: imm
-      return ComputeSize(entry, lir->operands[1], lir->operands[2], false);
+      return ComputeSize(entry, lir->operands[1], lir->operands[2],
+                         lir->operands[0], NO_REG, false);
     case kRegArrayImm:  // lir operands - 0: reg, 1: base, 2: index, 3: scale, 4: disp, 5: imm
-      return ComputeSize(entry, lir->operands[1], lir->operands[4], true);
+      return ComputeSize(entry, lir->operands[1], lir->operands[4],
+                         lir->operands[0], lir->operands[2], true);
     case kMovRegImm:  // lir operands - 0: reg, 1: immediate
-      return (entry->skeleton.prefix1 != 0?1:0) + 1 + entry->skeleton.immediate_bytes;  // TODO(64): reg1
+      return (entry->skeleton.prefix1 != 0 || NeedsRex(lir->operands[0])?1:0) +
+             1 + entry->skeleton.immediate_bytes;
     case kShiftRegImm:  // lir operands - 0: reg, 1: immediate
       // Shift by immediate one has a shorter opcode.
-      return ComputeSize(entry, 0, 0, false) - (lir->operands[1] == 1 ? 1 : 0);
+      return ComputeSize(entry, 0, 0, lir->operands[0], NO_REG, false) -
+             (lir->operands[1] == 1 ? 1 : 0);
     case kShiftMemImm:  // lir operands - 0: base, 1: disp, 2: immediate
       // Shift by immediate one has a shorter opcode.
-      return ComputeSize(entry, lir->operands[0], lir->operands[1], false) -
+      return ComputeSize(entry, lir->operands[0], lir->operands[1], NO_REG, NO_REG, false) -
              (lir->operands[2] == 1 ? 1 : 0);
     case kShiftArrayImm:  // lir operands - 0: base, 1: index, 2: scale, 3: disp 4: immediate
       // Shift by immediate one has a shorter opcode.
-      return ComputeSize(entry, lir->operands[0], lir->operands[3], true) -
+      return ComputeSize(entry, lir->operands[0], lir->operands[3],
+                         NO_REG, lir->operands[1], true) -
              (lir->operands[4] == 1 ? 1 : 0);
-    case kShiftRegCl:
-      return ComputeSize(entry, 0, 0, false);
+    case kShiftRegCl:  // lir operands - 0: reg, 1: cl
+      return ComputeSize(entry, 0, 0, lir->operands[0], NO_REG, false);
     case kShiftMemCl:  // lir operands - 0: base, 1: disp, 2: cl
-      return ComputeSize(entry, lir->operands[0], lir->operands[1], false);
+      return ComputeSize(entry, lir->operands[0], lir->operands[1], NO_REG, NO_REG, false);
     case kShiftArrayCl:  // lir operands - 0: base, 1: index, 2: scale, 3: disp, 4: reg
-      return ComputeSize(entry, lir->operands[0], lir->operands[3], true);
+      return ComputeSize(entry, lir->operands[0], lir->operands[3],
+                         lir->operands[4], lir->operands[1], true);
     case kRegCond:  // lir operands - 0: reg, 1: cond
-      return ComputeSize(entry, 0, 0, false);
+      return ComputeSize(entry, 0, 0, lir->operands[0], NO_REG, false);
     case kMemCond:  // lir operands - 0: base, 1: disp, 2: cond
-      return ComputeSize(entry, lir->operands[0], lir->operands[1], false);
+      return ComputeSize(entry, lir->operands[0], lir->operands[1], NO_REG, NO_REG, false);
     case kArrayCond:  // lir operands - 0: base, 1: index, 2: scale, 3: disp, 4: cond
-      return ComputeSize(entry, lir->operands[0], lir->operands[3], true);
+      return ComputeSize(entry, lir->operands[0], lir->operands[3],
+                         NO_REG, lir->operands[1], true);
     case kRegRegCond:  // lir operands - 0: reg, 1: reg, 2: cond
-      return ComputeSize(entry, 0, 0, false);
+      return ComputeSize(entry, 0, 0, lir->operands[0], lir->operands[1], false);
     case kRegMemCond:  // lir operands - 0: reg, 1: reg, 2: disp, 3:cond
-      return ComputeSize(entry, lir->operands[1], lir->operands[2], false);
+      return ComputeSize(entry, lir->operands[1], lir->operands[2],
+                         lir->operands[0], lir->operands[1], false);
     case kJcc:
       if (lir->opcode == kX86Jcc8) {
         return 2;  // opcode + rel8
@@ -614,21 +638,28 @@
       } else if (lir->opcode == kX86Jmp32) {
         return 5;  // opcode + rel32
       } else if (lir->opcode == kX86JmpT) {
-        return ComputeSize(entry, 0, 0x12345678, false);  // displacement size is always 32bit
+        // displacement size is always 32bit
+        return ComputeSize(entry, 0, 0x12345678, NO_REG, NO_REG, false);
       } else {
         DCHECK(lir->opcode == kX86JmpR);
-        return 2;  // opcode + modrm
+        if (NeedsRex(lir->operands[0])) {
+          return 3;  // REX.B + opcode + modrm
+        } else {
+          return 2;  // opcode + modrm
+        }
       }
     case kCall:
       switch (lir->opcode) {
         case kX86CallI: return 5;  // opcode 0:disp
         case kX86CallR: return 2;  // opcode modrm
         case kX86CallM:  // lir operands - 0: base, 1: disp
-          return ComputeSize(entry, lir->operands[0], lir->operands[1], false);
+          return ComputeSize(entry, lir->operands[0], lir->operands[1], NO_REG, NO_REG, false);
         case kX86CallA:  // lir operands - 0: base, 1: index, 2: scale, 3: disp
-          return ComputeSize(entry, lir->operands[0], lir->operands[3], true);
+          return ComputeSize(entry, lir->operands[0], lir->operands[3],
+                             NO_REG, lir->operands[1], true);
         case kX86CallT:  // lir operands - 0: disp
-          return ComputeSize(entry, 0, 0x12345678, false);  // displacement size is always 32bit
+          // displacement size is always 32bit
+          return ComputeSize(entry, 0, 0x12345678, NO_REG, NO_REG, false);
         default:
           break;
       }
@@ -636,16 +667,19 @@
     case kPcRel:
       if (entry->opcode == kX86PcRelLoadRA) {
         // lir operands - 0: reg, 1: base, 2: index, 3: scale, 4: table
-        return ComputeSize(entry, lir->operands[1], 0x12345678, true);
+        return ComputeSize(entry, lir->operands[1], 0x12345678,
+                           lir->operands[0], lir->operands[2], true);
       } else {
         DCHECK(entry->opcode == kX86PcRelAdr);
         return 5;  // opcode with reg + 4 byte immediate
       }
-    case kMacro:
+    case kMacro:  // lir operands - 0: reg
       DCHECK_EQ(lir->opcode, static_cast<int>(kX86StartOfMethod));
       return 5 /* call opcode + 4 byte displacement */ + 1 /* pop reg */ +
-          ComputeSize(&X86Mir2Lir::EncodingMap[kX86Sub32RI], 0, 0, false) -
-          (RegStorage::RegNum(lir->operands[0]) == rs_rAX.GetRegNum()  ? 1 : 0);  // shorter ax encoding
+          ComputeSize(&X86Mir2Lir::EncodingMap[kX86Sub32RI], 0, 0,
+                      lir->operands[0], NO_REG, false) -
+          // shorter ax encoding
+          (RegStorage::RegNum(lir->operands[0]) == rs_rAX.GetRegNum()  ? 1 : 0);
     default:
       break;
   }
@@ -654,19 +688,62 @@
 }
 
 void X86Mir2Lir::EmitPrefix(const X86EncodingMap* entry) {
+  EmitPrefix(entry, NO_REG, NO_REG, NO_REG);
+}
+
+void X86Mir2Lir::EmitPrefix(const X86EncodingMap* entry,
+                            uint8_t reg_r, uint8_t reg_x, uint8_t reg_b) {
+  // REX.WRXB
+  // W - 64-bit operand
+  // R - MODRM.reg
+  // X - SIB.index
+  // B - MODRM.rm/SIB.base
+  bool force = false;
+  bool w = (entry->skeleton.prefix1 == REX_W) || (entry->skeleton.prefix2 == REX_W);
+  bool r = NeedsRex(reg_r);
+  bool x = NeedsRex(reg_x);
+  bool b = NeedsRex(reg_b);
+  uint8_t rex = force ? 0x40 : 0;
+  if (w) {
+    rex |= 0x48;  // REX.W000
+  }
+  if (r) {
+    rex |= 0x44;  // REX.0R00
+  }
+  if (x) {
+    rex |= 0x42;  // REX.00X0
+  }
+  if (b) {
+    rex |= 0x41;  // REX.000B
+  }
   if (entry->skeleton.prefix1 != 0) {
     if (Gen64Bit() && entry->skeleton.prefix1 == THREAD_PREFIX) {
       // 64 bit adresses by GS, not FS
       code_buffer_.push_back(THREAD_PREFIX_GS);
     } else {
-      code_buffer_.push_back(entry->skeleton.prefix1);
+      if (entry->skeleton.prefix1 == REX_W) {
+        rex |= entry->skeleton.prefix1;
+        code_buffer_.push_back(rex);
+        rex = 0;
+      } else {
+        code_buffer_.push_back(entry->skeleton.prefix1);
+      }
     }
     if (entry->skeleton.prefix2 != 0) {
-      code_buffer_.push_back(entry->skeleton.prefix2);
+      if (entry->skeleton.prefix2 == REX_W) {
+        rex |= entry->skeleton.prefix2;
+        code_buffer_.push_back(rex);
+        rex = 0;
+      } else {
+        code_buffer_.push_back(entry->skeleton.prefix2);
+      }
     }
   } else {
     DCHECK_EQ(0, entry->skeleton.prefix2);
   }
+  if (rex != 0) {
+    code_buffer_.push_back(rex);
+  }
 }
 
 void X86Mir2Lir::EmitOpcode(const X86EncodingMap* entry) {
@@ -685,7 +762,12 @@
 }
 
 void X86Mir2Lir::EmitPrefixAndOpcode(const X86EncodingMap* entry) {
-  EmitPrefix(entry);
+  EmitPrefixAndOpcode(entry, NO_REG, NO_REG, NO_REG);
+}
+
+void X86Mir2Lir::EmitPrefixAndOpcode(const X86EncodingMap* entry,
+                                     uint8_t reg_r, uint8_t reg_x, uint8_t reg_b) {
+  EmitPrefix(entry, reg_r, reg_x, reg_b);
   EmitOpcode(entry);
 }
 
@@ -766,6 +848,11 @@
       code_buffer_.push_back((imm >> 8) & 0xFF);
       break;
     case 4:
+      if (imm <0) {
+        CHECK_EQ((-imm) & 0x0FFFFFFFFl, -imm);
+      } else {
+        CHECK_EQ(imm & 0x0FFFFFFFFl, imm);
+      }
       code_buffer_.push_back(imm & 0xFF);
       code_buffer_.push_back((imm >> 8) & 0xFF);
       code_buffer_.push_back((imm >> 16) & 0xFF);
@@ -789,7 +876,8 @@
 }
 
 void X86Mir2Lir::EmitOpRegOpcode(const X86EncodingMap* entry, uint8_t reg) {
-  EmitPrefixAndOpcode(entry);
+  EmitPrefixAndOpcode(entry, reg, NO_REG, NO_REG);
+  reg = LowRegisterBits(reg);
   // There's no 3-byte instruction with +rd
   DCHECK(entry->skeleton.opcode != 0x0F ||
          (entry->skeleton.extra_opcode1 != 0x38 && entry->skeleton.extra_opcode1 != 0x3A));
@@ -801,7 +889,8 @@
 }
 
 void X86Mir2Lir::EmitOpReg(const X86EncodingMap* entry, uint8_t reg) {
-  EmitPrefixAndOpcode(entry);
+  EmitPrefixAndOpcode(entry, reg, NO_REG, NO_REG);
+  reg = LowRegisterBits(reg);
   if (RegStorage::RegNum(reg) >= 4) {
     DCHECK(strchr(entry->name, '8') == NULL) << entry->name << " "
         << static_cast<int>(RegStorage::RegNum(reg))
@@ -815,7 +904,8 @@
 }
 
 void X86Mir2Lir::EmitOpMem(const X86EncodingMap* entry, uint8_t base, int disp) {
-  EmitPrefix(entry);
+  EmitPrefix(entry, NO_REG, NO_REG, base);
+  base = LowRegisterBits(base);
   code_buffer_.push_back(entry->skeleton.opcode);
   DCHECK_NE(0x0F, entry->skeleton.opcode);
   DCHECK_EQ(0, entry->skeleton.extra_opcode1);
@@ -827,15 +917,29 @@
 
 void X86Mir2Lir::EmitOpArray(const X86EncodingMap* entry, uint8_t base, uint8_t index,
                              int scale, int disp) {
-  EmitPrefixAndOpcode(entry);
+  EmitPrefixAndOpcode(entry, NO_REG, index, base);
+  index = LowRegisterBits(index);
+  base = LowRegisterBits(base);
   EmitModrmSibDisp(entry->skeleton.modrm_opcode, base, index, scale, disp);
   DCHECK_EQ(0, entry->skeleton.ax_opcode);
   DCHECK_EQ(0, entry->skeleton.immediate_bytes);
 }
 
+// Masks a register value with kRegNumMask32, keeping only its low 3 bits.
+uint8_t X86Mir2Lir::LowRegisterBits(uint8_t reg) {
+  const uint8_t low_bits = reg & kRegNumMask32;  // 3 bits
+  return low_bits;
+}
+
+bool X86Mir2Lir::NeedsRex(uint8_t reg) {  // true when reg's number does not fit in 3 bits
+  return RegStorage::RegNum(reg) > 7;
+}
+
 void X86Mir2Lir::EmitMemReg(const X86EncodingMap* entry,
                        uint8_t base, int disp, uint8_t reg) {
-  EmitPrefixAndOpcode(entry);
+  EmitPrefixAndOpcode(entry, reg, NO_REG, base);
+  reg = LowRegisterBits(reg);
+  base = LowRegisterBits(base);
   if (RegStorage::RegNum(reg) >= 4) {
     DCHECK(strchr(entry->name, '8') == NULL ||
            entry->opcode == kX86Movzx8RM || entry->opcode == kX86Movsx8RM)
@@ -854,9 +958,12 @@
   EmitMemReg(entry, base, disp, reg);
 }
 
-void X86Mir2Lir::EmitRegArray(const X86EncodingMap* entry, uint8_t reg, uint8_t base, uint8_t index,
-                              int scale, int disp) {
-  EmitPrefixAndOpcode(entry);
+void X86Mir2Lir::EmitRegArray(const X86EncodingMap* entry, uint8_t reg, uint8_t base,
+                              uint8_t index, int scale, int disp) {
+  EmitPrefixAndOpcode(entry, reg, index, base);
+  reg = LowRegisterBits(reg);
+  index = LowRegisterBits(index);
+  base = LowRegisterBits(base);
   EmitModrmSibDisp(reg, base, index, scale, disp);
   DCHECK_EQ(0, entry->skeleton.modrm_opcode);
   DCHECK_EQ(0, entry->skeleton.ax_opcode);
@@ -871,7 +978,9 @@
 
 void X86Mir2Lir::EmitArrayImm(const X86EncodingMap* entry, uint8_t base, uint8_t index, int scale,
                               int disp, int32_t imm) {
-  EmitPrefixAndOpcode(entry);
+  EmitPrefixAndOpcode(entry, NO_REG, index, base);
+  index = LowRegisterBits(index);
+  base = LowRegisterBits(base);
   EmitModrmSibDisp(entry->skeleton.modrm_opcode, base, index, scale, disp);
   DCHECK_EQ(0, entry->skeleton.ax_opcode);
   EmitImm(entry, imm);
@@ -879,7 +988,8 @@
 
 void X86Mir2Lir::EmitRegThread(const X86EncodingMap* entry, uint8_t reg, int disp) {
   DCHECK_NE(entry->skeleton.prefix1, 0);
-  EmitPrefixAndOpcode(entry);
+  EmitPrefixAndOpcode(entry, reg, NO_REG, NO_REG);
+  reg = LowRegisterBits(reg);
   if (RegStorage::RegNum(reg) >= 4) {
     DCHECK(strchr(entry->name, '8') == NULL) << entry->name << " "
         << static_cast<int>(RegStorage::RegNum(reg))
@@ -897,7 +1007,9 @@
 }
 
 void X86Mir2Lir::EmitRegReg(const X86EncodingMap* entry, uint8_t reg1, uint8_t reg2) {
-  EmitPrefixAndOpcode(entry);
+  EmitPrefixAndOpcode(entry, reg1, NO_REG, reg2);
+  reg1 = LowRegisterBits(reg1);
+  reg2 = LowRegisterBits(reg2);
   DCHECK_LT(RegStorage::RegNum(reg1), 8);
   DCHECK_LT(RegStorage::RegNum(reg2), 8);
   uint8_t modrm = (3 << 6) | (RegStorage::RegNum(reg1) << 3) | RegStorage::RegNum(reg2);
@@ -909,7 +1021,9 @@
 
 void X86Mir2Lir::EmitRegRegImm(const X86EncodingMap* entry,
                           uint8_t reg1, uint8_t reg2, int32_t imm) {
-  EmitPrefixAndOpcode(entry);
+  EmitPrefixAndOpcode(entry, reg1, NO_REG, reg2);
+  reg1 = LowRegisterBits(reg1);
+  reg2 = LowRegisterBits(reg2);
   DCHECK_LT(RegStorage::RegNum(reg1), 8);
   DCHECK_LT(RegStorage::RegNum(reg2), 8);
   uint8_t modrm = (3 << 6) | (RegStorage::RegNum(reg1) << 3) | RegStorage::RegNum(reg2);
@@ -926,7 +1040,9 @@
 
 void X86Mir2Lir::EmitRegMemImm(const X86EncodingMap* entry,
                                uint8_t reg, uint8_t base, int disp, int32_t imm) {
-  EmitPrefixAndOpcode(entry);
+  EmitPrefixAndOpcode(entry, reg, NO_REG, base);
+  reg = LowRegisterBits(reg);
+  base = LowRegisterBits(base);
   DCHECK(!RegStorage::IsFloat(reg));
   DCHECK_LT(RegStorage::RegNum(reg), 8);
   EmitModrmDisp(reg, base, disp);
@@ -941,10 +1057,11 @@
 }
 
 void X86Mir2Lir::EmitRegImm(const X86EncodingMap* entry, uint8_t reg, int imm) {
-  EmitPrefix(entry);
+  EmitPrefix(entry, NO_REG, NO_REG, reg);
   if (RegStorage::RegNum(reg) == rs_rAX.GetRegNum() && entry->skeleton.ax_opcode != 0) {
     code_buffer_.push_back(entry->skeleton.ax_opcode);
   } else {
+    reg = LowRegisterBits(reg);
     EmitOpcode(entry);
     uint8_t modrm = (3 << 6) | (entry->skeleton.modrm_opcode << 3) | RegStorage::RegNum(reg);
     code_buffer_.push_back(modrm);
@@ -953,7 +1070,8 @@
 }
 
 void X86Mir2Lir::EmitMemImm(const X86EncodingMap* entry, uint8_t base, int disp, int32_t imm) {
-  EmitPrefixAndOpcode(entry);
+  EmitPrefixAndOpcode(entry, NO_REG, NO_REG, base);
+  base = LowRegisterBits(base);
   EmitModrmDisp(entry->skeleton.modrm_opcode, base, disp);
   DCHECK_EQ(0, entry->skeleton.ax_opcode);
   EmitImm(entry, imm);
@@ -971,6 +1089,8 @@
 }
 
 void X86Mir2Lir::EmitMovRegImm(const X86EncodingMap* entry, uint8_t reg, int64_t imm) {
+  EmitPrefix(entry, NO_REG, NO_REG, reg);
+  reg = LowRegisterBits(reg);
   DCHECK_LT(RegStorage::RegNum(reg), 8);
   code_buffer_.push_back(0xB8 + RegStorage::RegNum(reg));
   switch (entry->skeleton.immediate_bytes) {
@@ -991,12 +1111,14 @@
       code_buffer_.push_back((imm >> 56) & 0xFF);
       break;
     default:
-      LOG(FATAL) << "Unsupported immediate size for EmitMovRegImm: " << static_cast<uint32_t>(entry->skeleton.immediate_bytes);
+      LOG(FATAL) << "Unsupported immediate size for EmitMovRegImm: "
+                 << static_cast<uint32_t>(entry->skeleton.immediate_bytes);
   }
 }
 
 void X86Mir2Lir::EmitShiftRegImm(const X86EncodingMap* entry, uint8_t reg, int imm) {
-  EmitPrefix(entry);
+  EmitPrefix(entry, NO_REG, NO_REG, reg);
+  reg = LowRegisterBits(reg);
   if (imm != 1) {
     code_buffer_.push_back(entry->skeleton.opcode);
   } else {
@@ -1023,7 +1145,8 @@
 
 void X86Mir2Lir::EmitShiftRegCl(const X86EncodingMap* entry, uint8_t reg, uint8_t cl) {
   DCHECK_EQ(cl, static_cast<uint8_t>(rs_rCX.GetReg()));
-  EmitPrefix(entry);
+  EmitPrefix(entry, reg, NO_REG, NO_REG);
+  reg = LowRegisterBits(reg);
   code_buffer_.push_back(entry->skeleton.opcode);
   DCHECK_NE(0x0F, entry->skeleton.opcode);
   DCHECK_EQ(0, entry->skeleton.extra_opcode1);
@@ -1038,7 +1161,8 @@
 void X86Mir2Lir::EmitShiftMemCl(const X86EncodingMap* entry, uint8_t base,
                                 int displacement, uint8_t cl) {
   DCHECK_EQ(cl, static_cast<uint8_t>(rs_rCX.GetReg()));
-  EmitPrefix(entry);
+  EmitPrefix(entry, NO_REG, NO_REG, base);
+  base = LowRegisterBits(base);
   code_buffer_.push_back(entry->skeleton.opcode);
   DCHECK_NE(0x0F, entry->skeleton.opcode);
   DCHECK_EQ(0, entry->skeleton.extra_opcode1);
@@ -1051,7 +1175,8 @@
 
 void X86Mir2Lir::EmitShiftMemImm(const X86EncodingMap* entry, uint8_t base,
                                 int displacement, int imm) {
-  EmitPrefix(entry);
+  EmitPrefix(entry, NO_REG, NO_REG, base);
+  base = LowRegisterBits(base);
   if (imm != 1) {
     code_buffer_.push_back(entry->skeleton.opcode);
   } else {
@@ -1070,7 +1195,8 @@
 }
 
 void X86Mir2Lir::EmitRegCond(const X86EncodingMap* entry, uint8_t reg, uint8_t condition) {
-  EmitPrefix(entry);
+  EmitPrefix(entry, reg, NO_REG, NO_REG);
+  reg = LowRegisterBits(reg);
   DCHECK_EQ(0, entry->skeleton.ax_opcode);
   DCHECK_EQ(0x0F, entry->skeleton.opcode);
   code_buffer_.push_back(0x0F);
@@ -1083,7 +1209,8 @@
   DCHECK_EQ(entry->skeleton.immediate_bytes, 0);
 }
 
-void X86Mir2Lir::EmitMemCond(const X86EncodingMap* entry, uint8_t base, int displacement, uint8_t condition) {
+void X86Mir2Lir::EmitMemCond(const X86EncodingMap* entry, uint8_t base, int displacement,
+                             uint8_t condition) {
   if (entry->skeleton.prefix1 != 0) {
     code_buffer_.push_back(entry->skeleton.prefix1);
     if (entry->skeleton.prefix2 != 0) {
@@ -1105,7 +1232,9 @@
 void X86Mir2Lir::EmitRegRegCond(const X86EncodingMap* entry, uint8_t reg1, uint8_t reg2,
                                 uint8_t condition) {
   // Generate prefix and opcode without the condition
-  EmitPrefixAndOpcode(entry);
+  EmitPrefixAndOpcode(entry, reg1, NO_REG, reg2);
+  reg1 = LowRegisterBits(reg1);
+  reg2 = LowRegisterBits(reg2);
 
   // Now add the condition. The last byte of opcode is the one that receives it.
   DCHECK_LE(condition, 0xF);
@@ -1127,9 +1256,12 @@
   code_buffer_.push_back(modrm);
 }
 
-void X86Mir2Lir::EmitRegMemCond(const X86EncodingMap* entry, uint8_t reg1, uint8_t base, int displacement, uint8_t condition) {
+void X86Mir2Lir::EmitRegMemCond(const X86EncodingMap* entry, uint8_t reg1, uint8_t base,
+                                int displacement, uint8_t condition) {
   // Generate prefix and opcode without the condition
-  EmitPrefixAndOpcode(entry);
+  EmitPrefixAndOpcode(entry, reg1, NO_REG, base);
+  reg1 = LowRegisterBits(reg1);
+  base = LowRegisterBits(base);
 
   // Now add the condition. The last byte of opcode is the one that receives it.
   DCHECK_LE(condition, 0xF);
@@ -1162,8 +1294,10 @@
     code_buffer_.push_back(rel & 0xFF);
   } else {
     DCHECK(entry->opcode == kX86JmpR);
-    code_buffer_.push_back(entry->skeleton.opcode);
     uint8_t reg = static_cast<uint8_t>(rel);
+    EmitPrefix(entry, NO_REG, NO_REG, reg);
+    code_buffer_.push_back(entry->skeleton.opcode);
+    reg = LowRegisterBits(reg);
     DCHECK_LT(RegStorage::RegNum(reg), 8);
     uint8_t modrm = (3 << 6) | (entry->skeleton.modrm_opcode << 3) | RegStorage::RegNum(reg);
     code_buffer_.push_back(modrm);
@@ -1188,7 +1322,8 @@
 }
 
 void X86Mir2Lir::EmitCallMem(const X86EncodingMap* entry, uint8_t base, int disp) {
-  EmitPrefixAndOpcode(entry);
+  EmitPrefixAndOpcode(entry, NO_REG, NO_REG, base);
+  base = LowRegisterBits(base);
   EmitModrmDisp(entry->skeleton.modrm_opcode, base, disp);
   DCHECK_EQ(0, entry->skeleton.ax_opcode);
   DCHECK_EQ(0, entry->skeleton.immediate_bytes);
@@ -1229,9 +1364,12 @@
         reinterpret_cast<Mir2Lir::EmbeddedData*>(UnwrapPointer(base_or_table));
     disp = tab_rec->offset;
   }
-  EmitPrefix(entry);
-  DCHECK_LT(RegStorage::RegNum(reg), 8);
   if (entry->opcode == kX86PcRelLoadRA) {
+    EmitPrefix(entry, reg, index, base_or_table);
+    reg = LowRegisterBits(reg);
+    base_or_table = LowRegisterBits(base_or_table);
+    index = LowRegisterBits(index);
+    DCHECK_LT(RegStorage::RegNum(reg), 8);
     code_buffer_.push_back(entry->skeleton.opcode);
     DCHECK_NE(0x0F, entry->skeleton.opcode);
     DCHECK_EQ(0, entry->skeleton.extra_opcode1);
@@ -1246,6 +1384,7 @@
     code_buffer_.push_back(sib);
     DCHECK_EQ(0, entry->skeleton.immediate_bytes);
   } else {
+    DCHECK_LT(RegStorage::RegNum(reg), 8);
     code_buffer_.push_back(entry->skeleton.opcode + RegStorage::RegNum(reg));
   }
   code_buffer_.push_back(disp & 0xFF);
@@ -1258,6 +1397,8 @@
 
 void X86Mir2Lir::EmitMacro(const X86EncodingMap* entry, uint8_t reg, int offset) {
   DCHECK(entry->opcode == kX86StartOfMethod) << entry->name;
+  EmitPrefix(entry, reg, NO_REG, NO_REG);
+  reg = LowRegisterBits(reg);
   code_buffer_.push_back(0xE8);  // call +0
   code_buffer_.push_back(0);
   code_buffer_.push_back(0);
@@ -1532,7 +1673,8 @@
         EmitRegRegCond(entry, lir->operands[0], lir->operands[1], lir->operands[2]);
         break;
       case kRegMemCond:  // lir operands - 0: reg, 1: reg, displacement, 3: condition
-        EmitRegMemCond(entry, lir->operands[0], lir->operands[1], lir->operands[2], lir->operands[3]);
+        EmitRegMemCond(entry, lir->operands[0], lir->operands[1], lir->operands[2],
+                       lir->operands[3]);
         break;
       case kJmp:  // lir operands - 0: rel
         if (entry->opcode == kX86JmpT) {
@@ -1566,7 +1708,7 @@
         EmitPcRel(entry, lir->operands[0], lir->operands[1], lir->operands[2],
                   lir->operands[3], lir->operands[4]);
         break;
-      case kMacro:
+      case kMacro:  // lir operands - 0: reg
         EmitMacro(entry, lir->operands[0], lir->offset);
         break;
       default:
diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h
index fcc846f..d66790d 100644
--- a/compiler/dex/quick/x86/codegen_x86.h
+++ b/compiler/dex/quick/x86/codegen_x86.h
@@ -331,10 +331,16 @@
     std::vector<uint8_t>* ReturnCallFrameInformation();
 
   protected:
-    size_t ComputeSize(const X86EncodingMap* entry, int base, int displacement, bool has_sib);
+    size_t ComputeSize(const X86EncodingMap* entry, int base, int displacement,
+                       int reg_r, int reg_x, bool has_sib);
+    uint8_t LowRegisterBits(uint8_t reg);
+    bool NeedsRex(uint8_t reg);
     void EmitPrefix(const X86EncodingMap* entry);
+    void EmitPrefix(const X86EncodingMap* entry, uint8_t reg_r, uint8_t reg_x, uint8_t reg_b);
     void EmitOpcode(const X86EncodingMap* entry);
     void EmitPrefixAndOpcode(const X86EncodingMap* entry);
+    void EmitPrefixAndOpcode(const X86EncodingMap* entry,
+                             uint8_t reg_r, uint8_t reg_x, uint8_t reg_b);
     void EmitDisp(uint8_t base, int disp);
     void EmitModrmThread(uint8_t reg_or_opcode);
     void EmitModrmDisp(uint8_t reg_or_opcode, uint8_t base, int disp);
diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc
index 19ad2f8..8b34168 100644
--- a/compiler/dex/quick/x86/target_x86.cc
+++ b/compiler/dex/quick/x86/target_x86.cc
@@ -38,7 +38,7 @@
 static const RegStorage core_regs_arr_64q[] = {
     rs_r0q, rs_r1q, rs_r2q, rs_r3q, rs_rX86_SP_64, rs_r5q, rs_r6q, rs_r7q,
 #ifdef TARGET_REX_SUPPORT
-    rs_r8, rs_r9, rs_r10, rs_r11, rs_r12, rs_r13, rs_r14, rs_r15
+    rs_r8q, rs_r9q, rs_r10q, rs_r11q, rs_r12q, rs_r13q, rs_r14q, rs_r15q
 #endif
 };
 static const RegStorage sp_regs_arr_32[] = {
diff --git a/compiler/dex/quick/x86/x86_lir.h b/compiler/dex/quick/x86/x86_lir.h
index 24c64cd..bb8df89 100644
--- a/compiler/dex/quick/x86/x86_lir.h
+++ b/compiler/dex/quick/x86/x86_lir.h
@@ -175,6 +175,16 @@
   fr5  = RegStorage::k32BitSolo | RegStorage::kFloatingPoint | 5,
   fr6  = RegStorage::k32BitSolo | RegStorage::kFloatingPoint | 6,
   fr7  = RegStorage::k32BitSolo | RegStorage::kFloatingPoint | 7,
+#ifdef TARGET_REX_SUPPORT
+  fr8  = RegStorage::k32BitSolo | RegStorage::kFloatingPoint | 8,
+  fr9  = RegStorage::k32BitSolo | RegStorage::kFloatingPoint | 9,
+  fr10 = RegStorage::k32BitSolo | RegStorage::kFloatingPoint | 10,
+  fr11 = RegStorage::k32BitSolo | RegStorage::kFloatingPoint | 11,
+  fr12 = RegStorage::k32BitSolo | RegStorage::kFloatingPoint | 12,
+  fr13 = RegStorage::k32BitSolo | RegStorage::kFloatingPoint | 13,
+  fr14 = RegStorage::k32BitSolo | RegStorage::kFloatingPoint | 14,
+  fr15 = RegStorage::k32BitSolo | RegStorage::kFloatingPoint | 15,
+#endif
 
   // xmm registers, double precision aliases.
   dr0  = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | 0,
@@ -185,8 +195,18 @@
   dr5  = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | 5,
   dr6  = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | 6,
   dr7  = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | 7,
+#ifdef TARGET_REX_SUPPORT
+  dr8  = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | 8,
+  dr9  = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | 9,
+  dr10 = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | 10,
+  dr11 = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | 11,
+  dr12 = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | 12,
+  dr13 = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | 13,
+  dr14 = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | 14,
+  dr15 = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | 15,
+#endif
 
-  // xmm registers aliases.
+  // xmm registers, full 128-bit aliases.
   xr0  = RegStorage::k128BitSolo | 0,
   xr1  = RegStorage::k128BitSolo | 1,
   xr2  = RegStorage::k128BitSolo | 2,
@@ -195,6 +215,16 @@
   xr5  = RegStorage::k128BitSolo | 5,
   xr6  = RegStorage::k128BitSolo | 6,
   xr7  = RegStorage::k128BitSolo | 7,
+#ifdef TARGET_REX_SUPPORT
+  xr8  = RegStorage::k128BitSolo | 8,
+  xr9  = RegStorage::k128BitSolo | 9,
+  xr10 = RegStorage::k128BitSolo | 10,
+  xr11 = RegStorage::k128BitSolo | 11,
+  xr12 = RegStorage::k128BitSolo | 12,
+  xr13 = RegStorage::k128BitSolo | 13,
+  xr14 = RegStorage::k128BitSolo | 14,
+  xr15 = RegStorage::k128BitSolo | 15,
+#endif
 
   // TODO: as needed, add 256, 512 and 1024-bit xmm views.
 };
@@ -251,6 +281,16 @@
 constexpr RegStorage rs_fr5(RegStorage::kValid | fr5);
 constexpr RegStorage rs_fr6(RegStorage::kValid | fr6);
 constexpr RegStorage rs_fr7(RegStorage::kValid | fr7);
+#ifdef TARGET_REX_SUPPORT
+constexpr RegStorage rs_fr8(RegStorage::kValid | fr8);
+constexpr RegStorage rs_fr9(RegStorage::kValid | fr9);
+constexpr RegStorage rs_fr10(RegStorage::kValid | fr10);
+constexpr RegStorage rs_fr11(RegStorage::kValid | fr11);
+constexpr RegStorage rs_fr12(RegStorage::kValid | fr12);
+constexpr RegStorage rs_fr13(RegStorage::kValid | fr13);
+constexpr RegStorage rs_fr14(RegStorage::kValid | fr14);
+constexpr RegStorage rs_fr15(RegStorage::kValid | fr15);
+#endif
 
 constexpr RegStorage rs_dr0(RegStorage::kValid | dr0);
 constexpr RegStorage rs_dr1(RegStorage::kValid | dr1);
@@ -260,6 +300,16 @@
 constexpr RegStorage rs_dr5(RegStorage::kValid | dr5);
 constexpr RegStorage rs_dr6(RegStorage::kValid | dr6);
 constexpr RegStorage rs_dr7(RegStorage::kValid | dr7);
+#ifdef TARGET_REX_SUPPORT
+constexpr RegStorage rs_dr8(RegStorage::kValid | dr8);
+constexpr RegStorage rs_dr9(RegStorage::kValid | dr9);
+constexpr RegStorage rs_dr10(RegStorage::kValid | dr10);
+constexpr RegStorage rs_dr11(RegStorage::kValid | dr11);
+constexpr RegStorage rs_dr12(RegStorage::kValid | dr12);
+constexpr RegStorage rs_dr13(RegStorage::kValid | dr13);
+constexpr RegStorage rs_dr14(RegStorage::kValid | dr14);
+constexpr RegStorage rs_dr15(RegStorage::kValid | dr15);
+#endif
 
 constexpr RegStorage rs_xr0(RegStorage::kValid | xr0);
 constexpr RegStorage rs_xr1(RegStorage::kValid | xr1);
@@ -269,6 +319,16 @@
 constexpr RegStorage rs_xr5(RegStorage::kValid | xr5);
 constexpr RegStorage rs_xr6(RegStorage::kValid | xr6);
 constexpr RegStorage rs_xr7(RegStorage::kValid | xr7);
+#ifdef TARGET_REX_SUPPORT
+constexpr RegStorage rs_xr8(RegStorage::kValid | xr8);
+constexpr RegStorage rs_xr9(RegStorage::kValid | xr9);
+constexpr RegStorage rs_xr10(RegStorage::kValid | xr10);
+constexpr RegStorage rs_xr11(RegStorage::kValid | xr11);
+constexpr RegStorage rs_xr12(RegStorage::kValid | xr12);
+constexpr RegStorage rs_xr13(RegStorage::kValid | xr13);
+constexpr RegStorage rs_xr14(RegStorage::kValid | xr14);
+constexpr RegStorage rs_xr15(RegStorage::kValid | xr15);
+#endif
 
 extern X86NativeRegisterPool rX86_ARG0;
 extern X86NativeRegisterPool rX86_ARG1;
@@ -628,6 +688,15 @@
 // 64 Bit Operand Size
 #define REX_W 0x48
 // Extension of the ModR/M reg field
+#define REX_R 0x44
+// Extension of the SIB index field
+#define REX_X 0x42
+// Extension of the ModR/M r/m field, SIB base field, or Opcode reg field
+#define REX_B 0x41
+// Mask extracting the 3 least-significant bits of a register number (r0..r15)
+#define kRegNumMask32 0x07
+// Value indicating that base or reg is not used
+#define NO_REG 0
 
 #define IS_SIMM8(v) ((-128 <= (v)) && ((v) <= 127))
 #define IS_SIMM16(v) ((-32768 <= (v)) && ((v) <= 32767))