Add thin-lock fast path for monitor-enter/exit on x86.

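Monitor-enter now tries to acquire an unheld lock inline: the thread's
thin lock id is loaded and shifted into the owner field, eax is zeroed,
and a lock cmpxchg on the object's monitor word installs the id; only
when the exchange fails do we call artLockObjectFromCode. Monitor-exit
loads the monitor word, subtracts the shifted thin lock id and, if the
result is zero, stores it back to release the lock, otherwise it falls
back to artUnlockObjectFromCode.

Supporting changes: plain and lock-prefixed cmpxchg encodings (RR, MR
and AR forms) are added to the x86 assembler table and LIR opcode list,
the disassembler learns the 0F B1 cmpxchg opcode, and the FP paths that
materialize the 0x80000000 sign-bit constant switch to
loadConstantNoClobber.

For reference, a minimal C++ sketch of the protocol the emitted code
implements (illustrative only, not part of this change; it assumes a
bare owner field with no hash state or recursion count, and
kLockOwnerShift, ThinLockEnter and ThinLockExit are placeholder names):

    #include <atomic>
    #include <cstdint>

    constexpr uint32_t kLockOwnerShift = 3;  // stand-in for LW_LOCK_OWNER_SHIFT

    struct Obj { std::atomic<uint32_t> monitor{0}; };  // simplified lock word

    // monitor-enter fast path: cmpxchg 0 -> (thin_lock_id << shift)
    bool ThinLockEnter(Obj* o, uint32_t thin_lock_id) {
      uint32_t expected = 0;                                // xor eax, eax
      uint32_t desired = thin_lock_id << kLockOwnerShift;   // mov edx, tid; sal edx
      // lock cmpxchg [obj + monitor offset], edx; je done
      return o->monitor.compare_exchange_strong(expected, desired);
      // on failure the compiled code calls artLockObjectFromCode
    }

    // monitor-exit fast path: if the word is exactly our shifted id, store zero
    bool ThinLockExit(Obj* o, uint32_t thin_lock_id) {
      uint32_t word = o->monitor.load();                    // mov ecx, [obj + offset]
      if (word != thin_lock_id << kLockOwnerShift) {        // sub ecx, edx; jne slow
        return false;  // contended/recursive/fat lock: artUnlockObjectFromCode
      }
      o->monitor.store(0);                                  // mov [obj + offset], ecx
      return true;
    }
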
Change-Id: Iba187ae1acde6e341ae510d4b47f59e6984fc354
diff --git a/src/compiler/codegen/x86/Assemble.cc b/src/compiler/codegen/x86/Assemble.cc
index a245660..0c5d3cf 100644
--- a/src/compiler/codegen/x86/Assemble.cc
+++ b/src/compiler/codegen/x86/Assemble.cc
@@ -298,6 +298,14 @@
 
   EXT_0F_ENCODING_MAP(Imul16,  0x66, 0xAF, REG_DEF0 | SETS_CCODES),
   EXT_0F_ENCODING_MAP(Imul32,  0x00, 0xAF, REG_DEF0 | SETS_CCODES),
+
+  { kX86CmpxchgRR, kRegRegStore, IS_BINARY_OP | REG_DEF0 | REG_USE01 | REG_DEFA_USEA | SETS_CCODES, { 0, 0, 0x0F, 0xB1, 0, 0, 0, 0 }, "Cmpxchg", "!0r,!1r" },
+  { kX86CmpxchgMR, kMemReg,   IS_STORE | IS_TERTIARY_OP | REG_USE02 | REG_DEFA_USEA | SETS_CCODES, { 0, 0, 0x0F, 0xB1, 0, 0, 0, 0 }, "Cmpxchg", "[!0r+!1d],!2r" },
+  { kX86CmpxchgAR, kArrayReg, IS_STORE | IS_QUIN_OP | REG_USE014 | REG_DEFA_USEA | SETS_CCODES, { 0, 0, 0x0F, 0xB1, 0, 0, 0, 0 }, "Cmpxchg", "[!0r+!1r<<!2d+!3d],!4r" },
+  { kX86LockCmpxchgRR, kRegRegStore, IS_BINARY_OP | REG_DEF0 | REG_USE01 | REG_DEFA_USEA | SETS_CCODES, { 0xF0, 0, 0x0F, 0xB1, 0, 0, 0, 0 }, "Lock Cmpxchg", "!0r,!1r" },
+  { kX86LockCmpxchgMR, kMemReg,   IS_STORE | IS_TERTIARY_OP | REG_USE02 | REG_DEFA_USEA | SETS_CCODES, { 0xF0, 0, 0x0F, 0xB1, 0, 0, 0, 0 }, "Lock Cmpxchg", "[!0r+!1d],!2r" },
+  { kX86LockCmpxchgAR, kArrayReg, IS_STORE | IS_QUIN_OP | REG_USE014 | REG_DEFA_USEA | SETS_CCODES, { 0xF0, 0, 0x0F, 0xB1, 0, 0, 0, 0 }, "Lock Cmpxchg", "[!0r+!1r<<!2d+!3d],!4r" },
+
   EXT_0F_ENCODING_MAP(Movzx8,  0x00, 0xB6, REG_DEF0),
   EXT_0F_ENCODING_MAP(Movzx16, 0x00, 0xB7, REG_DEF0),
   EXT_0F_ENCODING_MAP(Movsx8,  0x00, 0xBE, REG_DEF0),
diff --git a/src/compiler/codegen/x86/FP/X86FP.cc b/src/compiler/codegen/x86/FP/X86FP.cc
index 6003465..bb648bf 100644
--- a/src/compiler/codegen/x86/FP/X86FP.cc
+++ b/src/compiler/codegen/x86/FP/X86FP.cc
@@ -49,7 +49,7 @@
       rlSrc1 = loadValue(cUnit, rlSrc1, kFPReg);
       rlResult = oatEvalLoc(cUnit, rlDest, kFPReg, true);
       tempReg = oatAllocTemp(cUnit);
-      loadConstant(cUnit, tempReg, 0x80000000);
+      loadConstantNoClobber(cUnit, tempReg, 0x80000000);
       int rDest = rlResult.lowReg;
       int rSrc1 = rlSrc1.lowReg;
       if (rDest == rSrc1) {
@@ -114,7 +114,7 @@
       rlSrc1 = loadValueWide(cUnit, rlSrc1, kFPReg);
       rlResult = oatEvalLoc(cUnit, rlDest, kFPReg, true);
       tempReg = oatAllocTemp(cUnit);
-      loadConstant(cUnit, tempReg, 0x80000000);
+      loadConstantNoClobber(cUnit, tempReg, 0x80000000);
       int rDest = S2D(rlResult.lowReg, rlResult.highReg);
       int rSrc1 = S2D(rlSrc1.lowReg, rlSrc1.highReg);
       if (rDest == rSrc1) {
diff --git a/src/compiler/codegen/x86/X86/Gen.cc b/src/compiler/codegen/x86/X86/Gen.cc
index 894202e..69aaaad 100644
--- a/src/compiler/codegen/x86/X86/Gen.cc
+++ b/src/compiler/codegen/x86/X86/Gen.cc
@@ -211,30 +211,43 @@
 LIR* genNullCheck(CompilationUnit* cUnit, int sReg, int mReg, int optFlags);
 void callRuntimeHelperReg(CompilationUnit* cUnit, int helperOffset, int arg0);
 
-/*
- * TODO: implement fast path to short-circuit thin-lock case
- */
 void genMonitorEnter(CompilationUnit* cUnit, int optFlags, RegLocation rlSrc)
 {
   oatFlushAllRegs(cUnit);
-  loadValueDirectFixed(cUnit, rlSrc, rARG0);  // Get obj
+  loadValueDirectFixed(cUnit, rlSrc, rCX);  // Get obj
   oatLockCallTemps(cUnit);  // Prepare for explicit register usage
-  genNullCheck(cUnit, rlSrc.sRegLow, rARG0, optFlags);
-  // Go expensive route - artLockObjectFromCode(self, obj);
-  callRuntimeHelperReg(cUnit, ENTRYPOINT_OFFSET(pLockObjectFromCode), rARG0);
+  genNullCheck(cUnit, rlSrc.sRegLow, rCX, optFlags);
+  // If lock is unheld, try to grab it quickly with compare and exchange
+  // TODO: copy and clear hash state?
+  newLIR2(cUnit, kX86Mov32RT, rDX, Thread::ThinLockIdOffset().Int32Value());
+  newLIR2(cUnit, kX86Sal32RI, rDX, LW_LOCK_OWNER_SHIFT);
+  newLIR2(cUnit, kX86Xor32RR, rAX, rAX);
+  newLIR3(cUnit, kX86LockCmpxchgMR, rCX, Object::MonitorOffset().Int32Value(), rDX);
+  LIR* branch = newLIR2(cUnit, kX86Jcc8, 0, kX86CondEq);
+  // If lock is held, go the expensive route - artLockObjectFromCode(self, obj);
+  callRuntimeHelperReg(cUnit, ENTRYPOINT_OFFSET(pLockObjectFromCode), rCX);
+  branch->target = newLIR0(cUnit, kPseudoTargetLabel);
 }
 
-/*
- * TODO: implement fast path to short-circuit thin-lock case
- */
 void genMonitorExit(CompilationUnit* cUnit, int optFlags, RegLocation rlSrc)
 {
   oatFlushAllRegs(cUnit);
-  loadValueDirectFixed(cUnit, rlSrc, rARG0);  // Get obj
+  loadValueDirectFixed(cUnit, rlSrc, rAX);  // Get obj
   oatLockCallTemps(cUnit);  // Prepare for explicit register usage
-  genNullCheck(cUnit, rlSrc.sRegLow, rARG0, optFlags);
-  // Go expensive route - UnlockObjectFromCode(obj);
-  callRuntimeHelperReg(cUnit, ENTRYPOINT_OFFSET(pUnlockObjectFromCode), rARG0);
+  genNullCheck(cUnit, rlSrc.sRegLow, rAX, optFlags);
+  // If lock is held by the current thread, clear it to quickly release it
+  // TODO: clear hash state?
+  newLIR2(cUnit, kX86Mov32RT, rDX, Thread::ThinLockIdOffset().Int32Value());
+  newLIR2(cUnit, kX86Sal32RI, rDX, LW_LOCK_OWNER_SHIFT);
+  newLIR3(cUnit, kX86Mov32RM, rCX, rAX, Object::MonitorOffset().Int32Value());
+  opRegReg(cUnit, kOpSub, rCX, rDX);
+  LIR* branch = newLIR2(cUnit, kX86Jcc8, 0, kX86CondNe);
+  newLIR3(cUnit, kX86Mov32MR, rAX, Object::MonitorOffset().Int32Value(), rCX);
+  LIR* branch2 = newLIR1(cUnit, kX86Jmp8, 0);
+  branch->target = newLIR0(cUnit, kPseudoTargetLabel);
+  // Otherwise, go the expensive route - UnlockObjectFromCode(obj);
+  callRuntimeHelperReg(cUnit, ENTRYPOINT_OFFSET(pUnlockObjectFromCode), rAX);
+  branch2->target = newLIR0(cUnit, kPseudoTargetLabel);
 }
 
 /*
diff --git a/src/compiler/codegen/x86/X86LIR.h b/src/compiler/codegen/x86/X86LIR.h
index 5bf4dd9..72c8c03 100644
--- a/src/compiler/codegen/x86/X86LIR.h
+++ b/src/compiler/codegen/x86/X86LIR.h
@@ -445,6 +445,8 @@
   kX86Mfence,                   // memory barrier
   Binary0fOpCode(kX86Imul16),   // 16bit multiply
   Binary0fOpCode(kX86Imul32),   // 32bit multiply
+  kX86CmpxchgRR, kX86CmpxchgMR, kX86CmpxchgAR,  // compare and exchange
+  kX86LockCmpxchgRR, kX86LockCmpxchgMR, kX86LockCmpxchgAR,  // locked compare and exchange
   Binary0fOpCode(kX86Movzx8),   // zero-extend 8-bit value
   Binary0fOpCode(kX86Movzx16),  // zero-extend 16-bit value
   Binary0fOpCode(kX86Movsx8),   // sign-extend 8-bit value
diff --git a/src/disassembler_x86.cc b/src/disassembler_x86.cc
index f70289f..646e978 100644
--- a/src/disassembler_x86.cc
+++ b/src/disassembler_x86.cc
@@ -511,6 +511,7 @@
           no_ops = true;
         }
         break;
+      case 0xB1: opcode << "cmpxchg"; has_modrm = true; store = true; break;
       case 0xB6: opcode << "movzxb"; has_modrm = true; load = true; break;
       case 0xB7: opcode << "movzxw"; has_modrm = true; load = true; break;
       case 0xBE: opcode << "movsxb"; has_modrm = true; load = true; break;