x86_64: Fix intrinsics

The following intrinsics have been ported to x86_64:

- Abs(double/long/int/float) (see the abs(long) sketch below)
- String.indexOf/charAt/compareTo/isEmpty/length
- Float.floatToRawIntBits, Float.intBitsToFloat
- Double.doubleToRawLongBits, Double.longBitsToDouble
- Thread.currentThread
- Unsafe.getInt/Long/Object, Unsafe.putInt/Long/Object
- Math.sqrt, Math.max, Math.min
- Long.reverseBytes (see the bswap sketch below)
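
Long.reverseBytes now lowers to a single 64-bit BSWAP via the new
kX86Bswap64R encoding (REX.W + 0F C8). A minimal C++ sketch of the
semantics, assuming a GCC/Clang toolchain for the builtin:

    #include <cstdint>

    // Reverses the byte order of a 64-bit value; on x86-64 this is
    // a single bswap instruction.
    uint64_t ReverseBytes64(uint64_t x) {
      return __builtin_bswap64(x);
    }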

Math.min and Math.max for longs have been implemented for x86_64.
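
The long flavor reuses the int compare-and-conditional-move scheme with
wide registers (OpCondRegReg(kOpCmov, ...) now selects kX86Cmov64RRC).
A rough C++ equivalent of the emitted sequence, as a sketch only
(InlinedMinMax64 is a hypothetical name):

    #include <cstdint>

    // result starts as src1; a cmp plus cmovcc then conditionally
    // replaces it with src2, so no branch is emitted.
    int64_t InlinedMinMax64(int64_t src1, int64_t src2, bool is_min) {
      int64_t result = src1;
      bool take_src2 = is_min ? (src2 < src1) : (src2 > src1);
      if (take_src2) {
        result = src2;
      }
      return result;
    }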

Commented out until good tests are available (semantics sketched below):
- Memory.peekShort/Int/Long, Memory.pokeShort/Int/Long
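
For reference, the disabled peek/poke paths are plain loads and stores
through a raw address (unaligned access is allowed on x86). In C++
terms, with hypothetical helper names:

    #include <cstdint>

    int32_t PeekInt(intptr_t address) {
      // Corresponds to LoadBaseDisp(rl_address.reg, 0, ..., k32, kNotVolatile).
      return *reinterpret_cast<int32_t*>(address);
    }

    void PokeInt(intptr_t address, int32_t value) {
      // Corresponds to StoreBaseDisp(rl_address.reg, 0, ..., k32, kNotVolatile).
      *reinterpret_cast<int32_t*>(address) = value;
    }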

Turned off on x86-64, as it was reported to have problems:
- Cas
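
The change still adds the 64-bit locked compare-and-exchange encoding
(kX86LockCmpxchg64AR: LOCK + REX.W + 0F B1, with RAX holding the
expected value) for when this path is re-enabled. Its semantics, as a
sketch using a GCC/Clang builtin:

    #include <cstdint>

    // Atomically: if *addr == expected, store desired and return true;
    // otherwise leave *addr unchanged and return false.
    bool Cas64(volatile int64_t* addr, int64_t expected, int64_t desired) {
      return __sync_bool_compare_and_swap(addr, expected, desired);
    }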

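The x86-64 Abs(long) path in gen_invoke.cc uses the standard branch-free
identity abs(x) = (x + (x >> 63)) ^ (x >> 63): the arithmetic shift
yields 0 for non-negative x and -1 for negative x. For x = -5:
x >> 63 = -1, -5 + -1 = -6, and -6 ^ -1 = 5. A C++ sketch:

    #include <cstdint>

    // Mirrors the emitted sar/add/xor sequence; relies on >> being an
    // arithmetic shift for signed values, as it is on x86-64 compilers.
    int64_t Abs64(int64_t x) {
      int64_t sign = x >> 63;  // 0 or -1
      return (x + sign) ^ sign;
    }
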
Change-Id: I934bc9c90fdf953be0d3836a17b6ee4e7c98f244
diff --git a/compiler/dex/quick/gen_invoke.cc b/compiler/dex/quick/gen_invoke.cc
index 660563e..6c670cd 100644
--- a/compiler/dex/quick/gen_invoke.cc
+++ b/compiler/dex/quick/gen_invoke.cc
@@ -1372,11 +1372,11 @@
     return false;
   }
   RegLocation rl_src_i = info->args[0];
+  RegLocation rl_i = (size == k64) ? LoadValueWide(rl_src_i, kCoreReg) : LoadValue(rl_src_i, kCoreReg);
   RegLocation rl_dest = (size == k64) ? InlineTargetWide(info) : InlineTarget(info);  // result reg
   RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
   if (size == k64) {
-    RegLocation rl_i = LoadValueWide(rl_src_i, kCoreReg);
-    if (cu_->instruction_set == kArm64) {
+    if (cu_->instruction_set == kArm64 || cu_->instruction_set == kX86_64) {
       OpRegReg(kOpRev, rl_result.reg, rl_i.reg);
       StoreValueWide(rl_dest, rl_result);
       return true;
@@ -1396,7 +1396,6 @@
   } else {
     DCHECK(size == k32 || size == kSignedHalf);
     OpKind op = (size == k32) ? kOpRev : kOpRevsh;
-    RegLocation rl_i = LoadValue(rl_src_i, kCoreReg);
     OpRegReg(op, rl_result.reg, rl_i.reg);
     StoreValue(rl_dest, rl_result);
   }
@@ -1432,7 +1431,9 @@
   RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
 
   // If on x86 or if we would clobber a register needed later, just copy the source first.
-  if (cu_->instruction_set == kX86 || cu_->instruction_set == kX86_64 || rl_result.reg.GetLowReg() == rl_src.reg.GetHighReg()) {
+  if (cu_->instruction_set != kX86_64 &&
+      (cu_->instruction_set == kX86 ||
+       rl_result.reg.GetLowReg() == rl_src.reg.GetHighReg())) {
     OpRegCopyWide(rl_result.reg, rl_src.reg);
     if (rl_result.reg.GetLowReg() != rl_src.reg.GetLowReg() &&
         rl_result.reg.GetLowReg() != rl_src.reg.GetHighReg() &&
@@ -1445,12 +1446,20 @@
   }
 
   // abs(x) = y<=x>>31, (x+y)^y.
-  RegStorage sign_reg = AllocTemp();
-  OpRegRegImm(kOpAsr, sign_reg, rl_src.reg.GetHigh(), 31);
-  OpRegRegReg(kOpAdd, rl_result.reg.GetLow(), rl_src.reg.GetLow(), sign_reg);
-  OpRegRegReg(kOpAdc, rl_result.reg.GetHigh(), rl_src.reg.GetHigh(), sign_reg);
-  OpRegReg(kOpXor, rl_result.reg.GetLow(), sign_reg);
-  OpRegReg(kOpXor, rl_result.reg.GetHigh(), sign_reg);
+  RegStorage sign_reg;
+  if (cu_->instruction_set == kX86_64) {
+    sign_reg = AllocTempWide();
+    OpRegRegImm(kOpAsr, sign_reg, rl_src.reg, 63);
+    OpRegRegReg(kOpAdd, rl_result.reg, rl_src.reg, sign_reg);
+    OpRegReg(kOpXor, rl_result.reg, sign_reg);
+  } else {
+    sign_reg = AllocTemp();
+    OpRegRegImm(kOpAsr, sign_reg, rl_src.reg.GetHigh(), 31);
+    OpRegRegReg(kOpAdd, rl_result.reg.GetLow(), rl_src.reg.GetLow(), sign_reg);
+    OpRegRegReg(kOpAdc, rl_result.reg.GetHigh(), rl_src.reg.GetHigh(), sign_reg);
+    OpRegReg(kOpXor, rl_result.reg.GetLow(), sign_reg);
+    OpRegReg(kOpXor, rl_result.reg.GetHigh(), sign_reg);
+  }
   FreeTemp(sign_reg);
   StoreValueWide(rl_dest, rl_result);
   return true;
@@ -1533,6 +1542,10 @@
     // TODO - add Mips implementation
     return false;
   }
+  if (cu_->instruction_set == kX86_64) {
+    // TODO - add kX86_64 implementation
+    return false;
+  }
   RegLocation rl_obj = info->args[0];
   RegLocation rl_char = info->args[1];
   if (rl_char.is_const && (mir_graph_->ConstantValue(rl_char) & ~0xFFFF) != 0) {
@@ -1626,7 +1639,7 @@
 
 bool Mir2Lir::GenInlinedCurrentThread(CallInfo* info) {
   RegLocation rl_dest = InlineTarget(info);
-  RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
+  RegLocation rl_result = EvalLoc(rl_dest, kRefReg, true);
 
   switch (cu_->instruction_set) {
     case kArm:
@@ -1759,10 +1772,8 @@
     return;
   }
   DCHECK(cu_->compiler_driver->GetMethodInlinerMap() != nullptr);
-  // TODO: Enable instrinsics for x86_64
-  // Temporary disable intrinsics for x86_64. We will enable them later step by step.
   // Temporary disable intrinsics for Arm64. We will enable them later step by step.
-  if ((cu_->instruction_set != kX86_64) && (cu_->instruction_set != kArm64)) {
+  if (cu_->instruction_set != kArm64) {
     if (cu_->compiler_driver->GetMethodInlinerMap()->GetMethodInliner(cu_->dex_file)
         ->GenIntrinsic(this, info)) {
       return;
diff --git a/compiler/dex/quick/x86/assemble_x86.cc b/compiler/dex/quick/x86/assemble_x86.cc
index 737b8b9..879cf93 100644
--- a/compiler/dex/quick/x86/assemble_x86.cc
+++ b/compiler/dex/quick/x86/assemble_x86.cc
@@ -318,6 +318,7 @@
   { kx86Cdq32Da, kRegOpcode, NO_OPERAND | REG_DEFAD_USEA,                                  { 0,     0, 0x99, 0,    0, 0, 0, 0, false }, "Cdq", "" },
   { kx86Cqo64Da, kRegOpcode, NO_OPERAND | REG_DEFAD_USEA,                                  { REX_W, 0, 0x99, 0,    0, 0, 0, 0, false }, "Cqo", "" },
   { kX86Bswap32R, kRegOpcode, IS_UNARY_OP | REG_DEF0_USE0,                                 { 0,     0, 0x0F, 0xC8, 0, 0, 0, 0, false }, "Bswap32R", "!0r" },
+  { kX86Bswap64R, kRegOpcode, IS_UNARY_OP | REG_DEF0_USE0,                                 { REX_W, 0, 0x0F, 0xC8, 0, 0, 0, 0, false }, "Bswap64R", "!0r" },
   { kX86Push32R,  kRegOpcode, IS_UNARY_OP | REG_USE0 | REG_USE_SP | REG_DEF_SP | IS_STORE, { 0,     0, 0x50, 0,    0, 0, 0, 0, false }, "Push32R",  "!0r" },
   { kX86Pop32R,   kRegOpcode, IS_UNARY_OP | REG_DEF0 | REG_USE_SP | REG_DEF_SP | IS_LOAD,  { 0,     0, 0x58, 0,    0, 0, 0, 0, false }, "Pop32R",   "!0r" },
 
@@ -484,6 +485,7 @@
   { kX86CmpxchgAR, kArrayReg,     IS_STORE | IS_QUIN_OP | REG_USE014 | REG_DEFA_USEA | SETS_CCODES,    { 0,    0, 0x0F, 0xB1, 0, 0, 0, 0, false }, "Cmpxchg", "[!0r+!1r<<!2d+!3d],!4r" },
   { kX86LockCmpxchgMR, kMemReg,   IS_STORE | IS_TERTIARY_OP | REG_USE02 | REG_DEFA_USEA | SETS_CCODES, { 0xF0, 0, 0x0F, 0xB1, 0, 0, 0, 0, false }, "Lock Cmpxchg", "[!0r+!1d],!2r" },
   { kX86LockCmpxchgAR, kArrayReg, IS_STORE | IS_QUIN_OP | REG_USE014 | REG_DEFA_USEA | SETS_CCODES,    { 0xF0, 0, 0x0F, 0xB1, 0, 0, 0, 0, false }, "Lock Cmpxchg", "[!0r+!1r<<!2d+!3d],!4r" },
+  { kX86LockCmpxchg64AR, kArrayReg, IS_STORE | IS_QUIN_OP | REG_USE014 | REG_DEFA_USEA | SETS_CCODES,    { 0xF0, REX_W, 0x0F, 0xB1, 0, 0, 0, 0, false }, "Lock Cmpxchg", "[!0r+!1r<<!2d+!3d],!4r" },
   { kX86LockCmpxchg64M, kMem,     IS_STORE | IS_BINARY_OP | REG_USE0 | REG_DEFAD_USEAD | REG_USEC | REG_USEB | SETS_CCODES, { 0xF0, 0, 0x0F, 0xC7, 0, 1, 0, 0, false }, "Lock Cmpxchg8b", "[!0r+!1d]" },
   { kX86LockCmpxchg64A, kArray,   IS_STORE | IS_QUAD_OP | REG_USE01 | REG_DEFAD_USEAD | REG_USEC | REG_USEB | SETS_CCODES,  { 0xF0, 0, 0x0F, 0xC7, 0, 1, 0, 0, false }, "Lock Cmpxchg8b", "[!0r+!1r<<!2d+!3d]" },
   { kX86XchgMR, kMemReg,          IS_STORE | IS_LOAD | IS_TERTIARY_OP | REG_DEF2 | REG_USE02,          { 0, 0, 0x87, 0, 0, 0, 0, 0, false }, "Xchg", "[!0r+!1d],!2r" },
diff --git a/compiler/dex/quick/x86/fp_x86.cc b/compiler/dex/quick/x86/fp_x86.cc
index 4414d7c..fc65deb 100644
--- a/compiler/dex/quick/x86/fp_x86.cc
+++ b/compiler/dex/quick/x86/fp_x86.cc
@@ -648,6 +648,15 @@
     // Result is unused, the code is dead. Inlining successful, no code generated.
     return true;
   }
+  if (cu_->target64) {
+    rl_src = LoadValueWide(rl_src, kCoreReg);
+    RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
+    OpRegCopyWide(rl_result.reg, rl_src.reg);
+    OpRegImm(kOpLsl, rl_result.reg, 1);
+    OpRegImm(kOpLsr, rl_result.reg, 1);
+    StoreValueWide(rl_dest, rl_result);
+    return true;
+  }
   int v_src_reg = mir_graph_->SRegToVReg(rl_src.s_reg_low);
   int v_dst_reg = mir_graph_->SRegToVReg(rl_dest.s_reg_low);
   rl_src = UpdateLocWide(rl_src);
diff --git a/compiler/dex/quick/x86/int_x86.cc b/compiler/dex/quick/x86/int_x86.cc
index ed4c775..5372512 100644
--- a/compiler/dex/quick/x86/int_x86.cc
+++ b/compiler/dex/quick/x86/int_x86.cc
@@ -716,17 +716,17 @@
 bool X86Mir2Lir::GenInlinedMinMax(CallInfo* info, bool is_min, bool is_long) {
   DCHECK(cu_->instruction_set == kX86 || cu_->instruction_set == kX86_64);
 
-  if (is_long) {
+  if (is_long && cu_->instruction_set == kX86) {
     return false;
   }
 
   // Get the two arguments to the invoke and place them in GP registers.
   RegLocation rl_src1 = info->args[0];
-  RegLocation rl_src2 = info->args[1];
-  rl_src1 = LoadValue(rl_src1, kCoreReg);
-  rl_src2 = LoadValue(rl_src2, kCoreReg);
+  RegLocation rl_src2 = (is_long) ? info->args[2] : info->args[1];
+  rl_src1 = (is_long) ? LoadValueWide(rl_src1, kCoreReg) : LoadValue(rl_src1, kCoreReg);
+  rl_src2 = (is_long) ? LoadValueWide(rl_src2, kCoreReg) : LoadValue(rl_src2, kCoreReg);
 
-  RegLocation rl_dest = InlineTarget(info);
+  RegLocation rl_dest = (is_long) ? InlineTargetWide(info) : InlineTarget(info);
   RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
 
   /*
@@ -752,43 +752,63 @@
     OpCondRegReg(kOpCmov, condition_code, rl_result.reg, rl_src2.reg);
   }
 
-  StoreValue(rl_dest, rl_result);
-  return true;
-}
-
-bool X86Mir2Lir::GenInlinedPeek(CallInfo* info, OpSize size) {
-  RegLocation rl_src_address = info->args[0];  // long address
-  rl_src_address = NarrowRegLoc(rl_src_address);  // ignore high half in info->args[1]
-  RegLocation rl_dest = size == k64 ? InlineTargetWide(info) : InlineTarget(info);
-  RegLocation rl_address = LoadValue(rl_src_address, kCoreReg);
-  RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
-  // Unaligned access is allowed on x86.
-  LoadBaseDisp(rl_address.reg, 0, rl_result.reg, size, kNotVolatile);
-  if (size == k64) {
+  if (is_long) {
     StoreValueWide(rl_dest, rl_result);
   } else {
-    DCHECK(size == kSignedByte || size == kSignedHalf || size == k32);
     StoreValue(rl_dest, rl_result);
   }
   return true;
 }
 
+bool X86Mir2Lir::GenInlinedPeek(CallInfo* info, OpSize size) {
+  return false;
+// Turned off until tests are available in ART.
+//
+//  RegLocation rl_src_address = info->args[0];  // long address
+//  RegLocation rl_address;
+//  if (!cu_->target64) {
+//    rl_src_address = NarrowRegLoc(rl_src_address);  // ignore high half in info->args[1]
+//    rl_address = LoadValue(rl_src_address, kCoreReg);
+//  } else {
+//    rl_address = LoadValueWide(rl_src_address, kCoreReg);
+//  }
+//  RegLocation rl_dest = size == k64 ? InlineTargetWide(info) : InlineTarget(info);
+//  RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
+//  // Unaligned access is allowed on x86.
+//  LoadBaseDisp(rl_address.reg, 0, rl_result.reg, size, kNotVolatile);
+//  if (size == k64) {
+//    StoreValueWide(rl_dest, rl_result);
+//  } else {
+//    DCHECK(size == kSignedByte || size == kSignedHalf || size == k32);
+//    StoreValue(rl_dest, rl_result);
+//  }
+//  return true;
+}
+
 bool X86Mir2Lir::GenInlinedPoke(CallInfo* info, OpSize size) {
-  RegLocation rl_src_address = info->args[0];  // long address
-  rl_src_address = NarrowRegLoc(rl_src_address);  // ignore high half in info->args[1]
-  RegLocation rl_src_value = info->args[2];  // [size] value
-  RegLocation rl_address = LoadValue(rl_src_address, kCoreReg);
-  if (size == k64) {
-    // Unaligned access is allowed on x86.
-    RegLocation rl_value = LoadValueWide(rl_src_value, kCoreReg);
-    StoreBaseDisp(rl_address.reg, 0, rl_value.reg, size, kNotVolatile);
-  } else {
-    DCHECK(size == kSignedByte || size == kSignedHalf || size == k32);
-    // Unaligned access is allowed on x86.
-    RegLocation rl_value = LoadValue(rl_src_value, kCoreReg);
-    StoreBaseDisp(rl_address.reg, 0, rl_value.reg, size, kNotVolatile);
-  }
-  return true;
+  return false;
+// Turned off until tests are available in ART.
+//
+//  RegLocation rl_src_address = info->args[0];  // long address
+//  RegLocation rl_address;
+//  if (!cu_->target64) {
+//    rl_src_address = NarrowRegLoc(rl_src_address);  // ignore high half in info->args[1]
+//    rl_address = LoadValue(rl_src_address, kCoreReg);
+//  } else {
+//    rl_address = LoadValueWide(rl_src_address, kCoreReg);
+//  }
+//  RegLocation rl_src_value = info->args[2];  // [size] value
+//  if (size == k64) {
+//    // Unaligned access is allowed on x86.
+//    RegLocation rl_value = LoadValueWide(rl_src_value, kCoreReg);
+//    StoreBaseDisp(rl_address.reg, 0, rl_value.reg, size, kNotVolatile);
+//  } else {
+//    DCHECK(size == kSignedByte || size == kSignedHalf || size == k32);
+//    // Unaligned access is allowed on x86.
+//    RegLocation rl_value = LoadValue(rl_src_value, kCoreReg);
+//    StoreBaseDisp(rl_address.reg, 0, rl_value.reg, size, kNotVolatile);
+//  }
+//  return true;
 }
 
 void X86Mir2Lir::OpLea(RegStorage r_base, RegStorage reg1, RegStorage reg2, int scale, int offset) {
@@ -811,6 +831,10 @@
 
 bool X86Mir2Lir::GenInlinedCas(CallInfo* info, bool is_long, bool is_object) {
   DCHECK(cu_->instruction_set == kX86 || cu_->instruction_set == kX86_64);
+  if (cu_->instruction_set == kX86_64) {
+    return false;  // TODO: Verify working on x86-64.
+  }
+
   // Unused - RegLocation rl_src_unsafe = info->args[0];
   RegLocation rl_src_obj = info->args[1];  // Object - known non-null
   RegLocation rl_src_offset = info->args[2];  // long low
@@ -820,7 +844,24 @@
   RegLocation rl_src_new_value = info->args[is_long ? 6 : 5];  // int, long or Object
   // If is_long, high half is in info->args[7]
 
-  if (is_long) {
+  if (is_long && cu_->target64) {
+    // RAX must hold expected for CMPXCHG. Neither rl_new_value nor r_ptr may be in RAX.
+    FlushReg(rs_r0);
+    Clobber(rs_r0);
+    LockTemp(rs_r0);
+
+    RegLocation rl_object = LoadValue(rl_src_obj, kRefReg);
+    RegLocation rl_new_value = LoadValueWide(rl_src_new_value, kCoreReg);
+    RegLocation rl_offset = LoadValue(rl_src_offset, kCoreReg);
+    LoadValueDirectWide(rl_src_expected, rs_r0);
+    NewLIR5(kX86LockCmpxchg64AR, rl_object.reg.GetReg(), rl_offset.reg.GetReg(), 0, 0, rl_new_value.reg.GetReg());
+
+    // After a store we need to insert a barrier in case of a potential load. Since the
+    // locked cmpxchg has full barrier semantics, only a scheduling barrier will be generated.
+    GenMemBarrier(kStoreLoad);
+
+    FreeTemp(rs_r0);
+  } else if (is_long) {
     // TODO: avoid unnecessary loads of SI and DI when the values are in registers.
     // TODO: CFI support.
     FlushAllRegs();
diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc
index f80e200..72e47d0 100644
--- a/compiler/dex/quick/x86/target_x86.cc
+++ b/compiler/dex/quick/x86/target_x86.cc
@@ -1035,11 +1035,12 @@
   // ECX: count: number of words to be searched.
   // EDI: String being searched.
   // EDX: temporary during execution.
-  // EBX: temporary during execution.
+  // EBX or R11: temporary during execution (depending on mode).
 
   RegLocation rl_obj = info->args[0];
   RegLocation rl_char = info->args[1];
   RegLocation rl_start;  // Note: only present in III flavor or IndexOf.
+  RegStorage tmpReg = cu_->target64 ? rs_r11 : rs_rBX;
 
   uint32_t char_value =
     rl_char.is_const ? mir_graph_->ConstantValue(rl_char.orig_sreg) : 0;
@@ -1112,9 +1113,9 @@
       rl_start = UpdateLocTyped(rl_start, kCoreReg);
       if (rl_start.location == kLocPhysReg) {
         // Handle "start index < 0" case.
-        OpRegReg(kOpXor, rs_rBX, rs_rBX);
-        OpRegReg(kOpCmp, rl_start.reg, rs_rBX);
-        OpCondRegReg(kOpCmov, kCondLt, rl_start.reg, rs_rBX);
+        OpRegReg(kOpXor, tmpReg, tmpReg);
+        OpRegReg(kOpCmp, rl_start.reg, tmpReg);
+        OpCondRegReg(kOpCmov, kCondLt, rl_start.reg, tmpReg);
 
         // The length of the string should be greater than the start index.
         length_compare = OpCmpBranch(kCondLe, rs_rCX, rl_start.reg, nullptr);
@@ -1126,19 +1127,19 @@
         }
       } else {
         // Load the start index from stack, remembering that we pushed EDI.
-        int displacement = SRegOffset(rl_start.s_reg_low) + sizeof(uint32_t);
+        int displacement = SRegOffset(rl_start.s_reg_low) + (cu_->target64 ? 2 : 1) * sizeof(uint32_t);
         {
           ScopedMemRefType mem_ref_type(this, ResourceMask::kDalvikReg);
-          Load32Disp(rs_rX86_SP, displacement, rs_rBX);
+          Load32Disp(rs_rX86_SP, displacement, tmpReg);
         }
         OpRegReg(kOpXor, rs_rDI, rs_rDI);
-        OpRegReg(kOpCmp, rs_rBX, rs_rDI);
-        OpCondRegReg(kOpCmov, kCondLt, rs_rBX, rs_rDI);
+        OpRegReg(kOpCmp, tmpReg, rs_rDI);
+        OpCondRegReg(kOpCmov, kCondLt, tmpReg, rs_rDI);
 
-        length_compare = OpCmpBranch(kCondLe, rs_rCX, rs_rBX, nullptr);
-        OpRegReg(kOpSub, rs_rCX, rs_rBX);
+        length_compare = OpCmpBranch(kCondLe, rs_rCX, tmpReg, nullptr);
+        OpRegReg(kOpSub, rs_rCX, tmpReg);
         // Put the start index to stack.
-        NewLIR1(kX86Push32R, rs_rBX.GetReg());
+        NewLIR1(kX86Push32R, tmpReg.GetReg());
         is_index_on_stack = true;
       }
     }
@@ -1147,26 +1148,26 @@
 
   // ECX now contains the count in words to be searched.
 
-  // Load the address of the string into EBX.
+  // Load the address of the string into R11 or EBX (depending on mode).
   // The string starts at VALUE(String) + 2 * OFFSET(String) + DATA_OFFSET.
   Load32Disp(rs_rDX, value_offset, rs_rDI);
-  Load32Disp(rs_rDX, offset_offset, rs_rBX);
-  OpLea(rs_rBX, rs_rDI, rs_rBX, 1, data_offset);
+  Load32Disp(rs_rDX, offset_offset, tmpReg);
+  OpLea(tmpReg, rs_rDI, tmpReg, 1, data_offset);
 
   // Now compute into EDI where the search will start.
   if (zero_based || rl_start.is_const) {
     if (start_value == 0) {
-      OpRegCopy(rs_rDI, rs_rBX);
+      OpRegCopy(rs_rDI, tmpReg);
     } else {
-      NewLIR3(kX86Lea32RM, rs_rDI.GetReg(), rs_rBX.GetReg(), 2 * start_value);
+      NewLIR3(kX86Lea32RM, rs_rDI.GetReg(), tmpReg.GetReg(), 2 * start_value);
     }
   } else {
     if (is_index_on_stack == true) {
       // Load the start index from stack.
       NewLIR1(kX86Pop32R, rs_rDX.GetReg());
-      OpLea(rs_rDI, rs_rBX, rs_rDX, 1, 0);
+      OpLea(rs_rDI, tmpReg, rs_rDX, 1, 0);
     } else {
-      OpLea(rs_rDI, rs_rBX, rl_start.reg, 1, 0);
+      OpLea(rs_rDI, tmpReg, rl_start.reg, 1, 0);
     }
   }
 
@@ -1179,7 +1180,7 @@
 
   // yes, we matched.  Compute the index of the result.
   // index = ((curr_ptr - orig_ptr) / 2) - 1.
-  OpRegReg(kOpSub, rs_rDI, rs_rBX);
+  OpRegReg(kOpSub, rs_rDI, tmpReg);
   OpRegImm(kOpAsr, rs_rDI, 1);
   NewLIR3(kX86Lea32RM, rl_return.reg.GetReg(), rs_rDI.GetReg(), -1);
   LIR *all_done = NewLIR1(kX86Jmp8, 0);
diff --git a/compiler/dex/quick/x86/utility_x86.cc b/compiler/dex/quick/x86/utility_x86.cc
index 4770ade..657160f 100644
--- a/compiler/dex/quick/x86/utility_x86.cc
+++ b/compiler/dex/quick/x86/utility_x86.cc
@@ -122,7 +122,7 @@
   switch (op) {
     case kOpNeg: opcode = r_dest_src.Is64Bit() ? kX86Neg64R : kX86Neg32R; break;
     case kOpNot: opcode = r_dest_src.Is64Bit() ? kX86Not64R : kX86Not32R; break;
-    case kOpRev: opcode = kX86Bswap32R; break;
+    case kOpRev: opcode = r_dest_src.Is64Bit() ? kX86Bswap64R : kX86Bswap32R; break;
     case kOpBlx: opcode = kX86CallR; break;
     default:
       LOG(FATAL) << "Bad case in OpReg " << op;
@@ -356,7 +356,9 @@
 LIR* X86Mir2Lir::OpCondRegReg(OpKind op, ConditionCode cc, RegStorage r_dest, RegStorage r_src) {
   // The only conditional reg to reg operation supported is Cmov
   DCHECK_EQ(op, kOpCmov);
-  return NewLIR3(kX86Cmov32RRC, r_dest.GetReg(), r_src.GetReg(), X86ConditionEncoding(cc));
+  DCHECK_EQ(r_dest.Is64Bit(), r_src.Is64Bit());
+  return NewLIR3(r_dest.Is64Bit() ? kX86Cmov64RRC : kX86Cmov32RRC, r_dest.GetReg(),
+                 r_src.GetReg(), X86ConditionEncoding(cc));
 }
 
 LIR* X86Mir2Lir::OpRegMem(OpKind op, RegStorage r_dest, RegStorage r_base, int offset) {
diff --git a/compiler/dex/quick/x86/x86_lir.h b/compiler/dex/quick/x86/x86_lir.h
index a52e842..ff243ce 100644
--- a/compiler/dex/quick/x86/x86_lir.h
+++ b/compiler/dex/quick/x86/x86_lir.h
@@ -506,6 +506,7 @@
   kx86Cdq32Da,
   kx86Cqo64Da,
   kX86Bswap32R,
+  kX86Bswap64R,
   kX86Push32R, kX86Pop32R,
 #undef UnaryOpcode
 #define Binary0fOpCode(opcode) \
@@ -608,7 +609,7 @@
   Binary0fOpCode(kX86Imul32),   // 32bit multiply
   Binary0fOpCode(kX86Imul64),   // 64bit multiply
   kX86CmpxchgRR, kX86CmpxchgMR, kX86CmpxchgAR,  // compare and exchange
-  kX86LockCmpxchgMR, kX86LockCmpxchgAR,  // locked compare and exchange
+  kX86LockCmpxchgMR, kX86LockCmpxchgAR, kX86LockCmpxchg64AR,  // locked compare and exchange
   kX86LockCmpxchg64M, kX86LockCmpxchg64A,  // locked compare and exchange
   kX86XchgMR,  // exchange memory with register (automatically locked)
   Binary0fOpCode(kX86Movzx8),   // zero-extend 8-bit value