Implement various missing parts of the X86 compiler

Change-Id: I76f08580600befe268328f8cf7102c6146460c5e
diff --git a/src/compiler/codegen/CodegenUtil.cc b/src/compiler/codegen/CodegenUtil.cc
index 20eb47f..00e78ec 100644
--- a/src/compiler/codegen/CodegenUtil.cc
+++ b/src/compiler/codegen/CodegenUtil.cc
@@ -657,6 +657,8 @@
          */
 #if defined(TARGET_ARM)
         int bxOffset = tabRec->anchor->offset + 4;
+#elif defined(TARGET_X86)
+        int bxOffset = 0;
 #else
         int bxOffset = tabRec->anchor->offset;
 #endif
diff --git a/src/compiler/codegen/GenCommon.cc b/src/compiler/codegen/GenCommon.cc
index 3cc594c..444f5f2 100644
--- a/src/compiler/codegen/GenCommon.cc
+++ b/src/compiler/codegen/GenCommon.cc
@@ -43,6 +43,20 @@
 #endif
 }
 
+// Invoke a runtime helper that takes a single argument, passed in rARG0.
+// Non-X86 targets load the helper's address into a scratch register and
+// branch-and-link through it; X86 instead calls indirectly through the
+// thread-relative entrypoint slot (opThreadMem), so no scratch is needed.
+void callRuntimeHelperReg(CompilationUnit* cUnit, int helperOffset, int arg0) {
+#if !defined(TARGET_X86)
+    int rTgt = loadHelper(cUnit, helperOffset);
+#endif
+    opRegCopy(cUnit, rARG0, arg0);
+    oatClobberCalleeSave(cUnit);  // mark registers clobbered across the call
+#if !defined(TARGET_X86)
+    opReg(cUnit, kOpBlx, rTgt);
+    oatFreeTemp(cUnit, rTgt);
+#else
+    opThreadMem(cUnit, kOpBlx, helperOffset);
+#endif
+}
+
 void callRuntimeHelperRegLocation(CompilationUnit* cUnit, int helperOffset,
                                   RegLocation arg0) {
 #if !defined(TARGET_X86)
@@ -431,7 +445,7 @@
             cond = (ConditionCode)0;
             LOG(FATAL) << "Unexpected opcode " << (int)opcode;
     }
-#if defined(TARGET_MIPS)
+#if defined(TARGET_MIPS) || defined(TARGET_X86)
     opCmpImmBranch(cUnit, cond, rlSrc.lowReg, 0, &labelList[bb->taken->id]);
 #else
     opRegImm(cUnit, kOpCmp, rlSrc.lowReg, 0);
@@ -1811,31 +1825,34 @@
                 (int)mir->dalvikInsn.opcode;
     }
     if (!callOut) {
-        rlSrc1 = loadValue(cUnit, rlSrc1, kCoreReg);
         if (unary) {
+            rlSrc1 = loadValue(cUnit, rlSrc1, kCoreReg);
             rlResult = oatEvalLoc(cUnit, rlDest, kCoreReg, true);
             opRegReg(cUnit, op, rlResult.lowReg,
                      rlSrc1.lowReg);
         } else {
-            rlSrc2 = loadValue(cUnit, rlSrc2, kCoreReg);
-#if defined(TARGET_X86)
-            rlResult = oatEvalLoc(cUnit, rlDest, kCoreReg, true);
-            opRegRegReg(cUnit, op, rlResult.lowReg,
-                        rlSrc1.lowReg, rlSrc2.lowReg);
-#else
             if (shiftOp) {
+#if !defined(TARGET_X86)
+                rlSrc2 = loadValue(cUnit, rlSrc2, kCoreReg);
                 int tReg = oatAllocTemp(cUnit);
                 opRegRegImm(cUnit, kOpAnd, tReg, rlSrc2.lowReg, 31);
+#else
+                // X86 doesn't require masking and must use ECX
+                loadValueDirectFixed(cUnit, rlSrc2, rCX);
+                int tReg = rCX;
+#endif
+                rlSrc1 = loadValue(cUnit, rlSrc1, kCoreReg);
                 rlResult = oatEvalLoc(cUnit, rlDest, kCoreReg, true);
                 opRegRegReg(cUnit, op, rlResult.lowReg,
                             rlSrc1.lowReg, tReg);
                 oatFreeTemp(cUnit, tReg);
             } else {
+                rlSrc1 = loadValue(cUnit, rlSrc1, kCoreReg);
+                rlSrc2 = loadValue(cUnit, rlSrc2, kCoreReg);
                 rlResult = oatEvalLoc(cUnit, rlDest, kCoreReg, true);
                 opRegRegReg(cUnit, op, rlResult.lowReg,
                             rlSrc1.lowReg, rlSrc2.lowReg);
             }
-#endif
         }
         storeValue(cUnit, rlDest, rlResult);
     } else {
@@ -2151,12 +2168,8 @@
             break;
         case Instruction::ADD_LONG:
         case Instruction::ADD_LONG_2ADDR:
-#if defined(TARGET_MIPS)
+#if defined(TARGET_MIPS) || defined(TARGET_X86)
             return genAddLong(cUnit, mir, rlDest, rlSrc1, rlSrc2);
-#elif defined(TARGET_X86)
-            callOut = true;
-            retReg = rRET0;
-            funcOffset = ENTRYPOINT_OFFSET(pLadd);
 #else
             firstOp = kOpAdd;
             secondOp = kOpAdc;
@@ -2164,16 +2177,13 @@
 #endif
         case Instruction::SUB_LONG:
         case Instruction::SUB_LONG_2ADDR:
-#if defined(TARGET_MIPS)
+#if defined(TARGET_MIPS) || defined(TARGET_X86)
             return genSubLong(cUnit, mir, rlDest, rlSrc1, rlSrc2);
-#elif defined(TARGET_X86)
-            callOut = true;
-            retReg = rRET0;
-            funcOffset = ENTRYPOINT_OFFSET(pLsub);
-#endif
+#else
             firstOp = kOpSub;
             secondOp = kOpSbc;
             break;
+#endif
         case Instruction::MUL_LONG:
         case Instruction::MUL_LONG_2ADDR:
             callOut = true;
@@ -2199,33 +2209,30 @@
         case Instruction::AND_LONG_2ADDR:
         case Instruction::AND_LONG:
 #if defined(TARGET_X86)
-            callOut = true;
-            retReg = rRET0;
-            funcOffset = ENTRYPOINT_OFFSET(pLand);
-#endif
+            return genAndLong(cUnit, mir, rlDest, rlSrc1, rlSrc2);
+#else
             firstOp = kOpAnd;
             secondOp = kOpAnd;
             break;
+#endif
         case Instruction::OR_LONG:
         case Instruction::OR_LONG_2ADDR:
 #if defined(TARGET_X86)
-            callOut = true;
-            retReg = rRET0;
-            funcOffset = ENTRYPOINT_OFFSET(pLor);
-#endif
+            return genOrLong(cUnit, mir, rlDest, rlSrc1, rlSrc2);
+#else
             firstOp = kOpOr;
             secondOp = kOpOr;
             break;
+#endif
         case Instruction::XOR_LONG:
         case Instruction::XOR_LONG_2ADDR:
 #if defined(TARGET_X86)
-            callOut = true;
-            retReg = rRET0;
-            funcOffset = ENTRYPOINT_OFFSET(pLxor);
-#endif
+            return genXorLong(cUnit, mir, rlDest, rlSrc1, rlSrc2);
+#else
             firstOp = kOpXor;
             secondOp = kOpXor;
             break;
+#endif
         case Instruction::NEG_LONG: {
             return genNegLong(cUnit, mir, rlDest, rlSrc2);
         }
diff --git a/src/compiler/codegen/GenInvoke.cc b/src/compiler/codegen/GenInvoke.cc
index 8a9d1f5..a904419 100644
--- a/src/compiler/codegen/GenInvoke.cc
+++ b/src/compiler/codegen/GenInvoke.cc
@@ -286,15 +286,15 @@
      * This handles the case in which the base method is not fully
      * resolved at compile time, we bail to a runtime helper.
      */
-#if !defined(TARGET_X86)
     if (state == 0) {
+#if !defined(TARGET_X86)
         // Load trampoline target
         loadWordDisp(cUnit, rSELF, trampoline, rINVOKE_TGT);
+#endif
         // Load rARG0 with method index
         loadConstant(cUnit, rARG0, dexIdx);
         return 1;
     }
-#endif
     return -1;
 }
 
@@ -357,11 +357,7 @@
                 uint32_t methodIdx, uintptr_t directCode,
                 uintptr_t directMethod, InvokeType type, bool skipThis)
 {
-#if !defined(TARGET_X86)
     int lastArgReg = rARG3;
-#else
-    int lastArgReg = rARG2;
-#endif
     int nextReg = rARG1;
     int nextArg = 0;
     if (skipThis) {
diff --git a/src/compiler/codegen/MethodCodegenDriver.cc b/src/compiler/codegen/MethodCodegenDriver.cc
index 5ffe3e4..b28df01 100644
--- a/src/compiler/codegen/MethodCodegenDriver.cc
+++ b/src/compiler/codegen/MethodCodegenDriver.cc
@@ -128,11 +128,31 @@
 #if !defined(TARGET_X86)
     opReg(cUnit, kOpBlx, rINVOKE_TGT);
 #else
-    if (fastPath) {
+    if (fastPath && type != kInterface) {
       opMem(cUnit, kOpBlx, rARG0, Method::GetCodeOffset().Int32Value());
     } else {
-      UNIMPLEMENTED(FATAL) << "compute trampoline";
-      opThreadMem(cUnit, kOpBlx, 0);
+      int trampoline = 0;
+      switch (type) {
+        case kInterface:
+          trampoline = fastPath ? ENTRYPOINT_OFFSET(pInvokeInterfaceTrampoline)
+                                : ENTRYPOINT_OFFSET(pInvokeInterfaceTrampolineWithAccessCheck);
+          break;
+        case kDirect:
+          trampoline = ENTRYPOINT_OFFSET(pInvokeDirectTrampolineWithAccessCheck);
+          break;
+        case kStatic:
+          trampoline = ENTRYPOINT_OFFSET(pInvokeStaticTrampolineWithAccessCheck);
+          break;
+        case kSuper:
+          trampoline = ENTRYPOINT_OFFSET(pInvokeSuperTrampolineWithAccessCheck);
+          break;
+        case kVirtual:
+          trampoline = ENTRYPOINT_OFFSET(pInvokeVirtualTrampolineWithAccessCheck);
+          break;
+        default:
+          LOG(FATAL) << "Unexpected invoke type";
+      }
+      opThreadMem(cUnit, kOpBlx, trampoline);
     }
 #endif
 
diff --git a/src/compiler/codegen/x86/ArchFactory.cc b/src/compiler/codegen/x86/ArchFactory.cc
index bd95afb..e668250 100644
--- a/src/compiler/codegen/x86/ArchFactory.cc
+++ b/src/compiler/codegen/x86/ArchFactory.cc
@@ -24,30 +24,94 @@
 
 namespace art {
 
+// 64-bit add: lhs in r1:r0, rhs in r3:r2; add low words, then
+// add-with-carry the high words.  Returns false (no call-out needed).
+bool genAddLong(CompilationUnit* cUnit, MIR* mir, RegLocation rlDest,
+                RegLocation rlSrc1, RegLocation rlSrc2)
+{
+  oatFlushAllRegs(cUnit);
+  oatLockCallTemps(cUnit);  // Prepare for explicit register usage
+  loadValueDirectWideFixed(cUnit, rlSrc1, r0, r1);
+  loadValueDirectWideFixed(cUnit, rlSrc2, r2, r3);  // fix: was rlSrc1 (lhs loaded twice)
+  // Compute (r1:r0) = (r1:r0) + (r3:r2)
+  opRegReg(cUnit, kOpAdd, r0, r2);  // r0 = r0 + r2
+  opRegReg(cUnit, kOpAdc, r1, r3);  // r1 = r1 + r3 + CF
+  RegLocation rlResult = {kLocPhysReg, 1, 0, 0,  0, 0, 1, r0, r1, INVALID_SREG};
+  storeValueWide(cUnit, rlDest, rlResult);
+  return false;
+}
+
+// 64-bit subtract: lhs in r1:r0, rhs in r3:r2; subtract low words, then
+// subtract-with-borrow the high words.  Returns false (no call-out needed).
+bool genSubLong(CompilationUnit* cUnit, MIR* mir, RegLocation rlDest,
+                RegLocation rlSrc1, RegLocation rlSrc2)
+{
+  oatFlushAllRegs(cUnit);
+  oatLockCallTemps(cUnit);  // Prepare for explicit register usage
+  loadValueDirectWideFixed(cUnit, rlSrc1, r0, r1);
+  loadValueDirectWideFixed(cUnit, rlSrc2, r2, r3);  // fix: was rlSrc1 (lhs loaded twice)
+  // Compute (r1:r0) = (r1:r0) - (r3:r2)  (comment previously said '+')
+  opRegReg(cUnit, kOpSub, r0, r2);  // r0 = r0 - r2
+  opRegReg(cUnit, kOpSbc, r1, r3);  // r1 = r1 - r3 - CF
+  RegLocation rlResult = {kLocPhysReg, 1, 0, 0,  0, 0, 1, r0, r1, INVALID_SREG};
+  storeValueWide(cUnit, rlDest, rlResult);
+  return false;
+}
+
+// 64-bit bitwise AND: lhs in r1:r0, rhs in r3:r2; AND each half
+// independently (no carry involved).  Returns false (no call-out needed).
+bool genAndLong(CompilationUnit* cUnit, MIR* mir, RegLocation rlDest,
+                RegLocation rlSrc1, RegLocation rlSrc2)
+{
+  oatFlushAllRegs(cUnit);
+  oatLockCallTemps(cUnit);  // Prepare for explicit register usage
+  loadValueDirectWideFixed(cUnit, rlSrc1, r0, r1);
+  loadValueDirectWideFixed(cUnit, rlSrc2, r2, r3);  // fix: was rlSrc1 (lhs loaded twice)
+  // Compute (r1:r0) = (r1:r0) & (r3:r2)  (stale '+'/'-' comments corrected)
+  opRegReg(cUnit, kOpAnd, r0, r2);  // r0 = r0 & r2
+  opRegReg(cUnit, kOpAnd, r1, r3);  // r1 = r1 & r3
+  RegLocation rlResult = {kLocPhysReg, 1, 0, 0,  0, 0, 1, r0, r1, INVALID_SREG};
+  storeValueWide(cUnit, rlDest, rlResult);
+  return false;
+}
+
+// 64-bit bitwise OR: lhs in r1:r0, rhs in r3:r2; OR each half
+// independently (no carry involved).  Returns false (no call-out needed).
+bool genOrLong(CompilationUnit* cUnit, MIR* mir, RegLocation rlDest,
+                RegLocation rlSrc1, RegLocation rlSrc2)
+{
+  oatFlushAllRegs(cUnit);
+  oatLockCallTemps(cUnit);  // Prepare for explicit register usage
+  loadValueDirectWideFixed(cUnit, rlSrc1, r0, r1);
+  loadValueDirectWideFixed(cUnit, rlSrc2, r2, r3);  // fix: was rlSrc1 (lhs loaded twice)
+  // Compute (r1:r0) = (r1:r0) | (r3:r2)  (stale '+'/'-' comments corrected)
+  opRegReg(cUnit, kOpOr, r0, r2);  // r0 = r0 | r2
+  opRegReg(cUnit, kOpOr, r1, r3);  // r1 = r1 | r3
+  RegLocation rlResult = {kLocPhysReg, 1, 0, 0,  0, 0, 1, r0, r1, INVALID_SREG};
+  storeValueWide(cUnit, rlDest, rlResult);
+  return false;
+}
+
+// 64-bit bitwise XOR: lhs in r1:r0, rhs in r3:r2; XOR each half
+// independently (no carry involved).  Returns false (no call-out needed).
+bool genXorLong(CompilationUnit* cUnit, MIR* mir, RegLocation rlDest,
+                RegLocation rlSrc1, RegLocation rlSrc2)
+{
+  oatFlushAllRegs(cUnit);
+  oatLockCallTemps(cUnit);  // Prepare for explicit register usage
+  loadValueDirectWideFixed(cUnit, rlSrc1, r0, r1);
+  loadValueDirectWideFixed(cUnit, rlSrc2, r2, r3);  // fix: was rlSrc1 (lhs loaded twice)
+  // Compute (r1:r0) = (r1:r0) ^ (r3:r2)  (stale '+'/'-' comments corrected)
+  opRegReg(cUnit, kOpXor, r0, r2);  // r0 = r0 ^ r2
+  opRegReg(cUnit, kOpXor, r1, r3);  // r1 = r1 ^ r3
+  RegLocation rlResult = {kLocPhysReg, 1, 0, 0,  0, 0, 1, r0, r1, INVALID_SREG};
+  storeValueWide(cUnit, rlDest, rlResult);
+  return false;
+}
+
bool genNegLong(CompilationUnit* cUnit, MIR* mir, RegLocation rlDest,
                RegLocation rlSrc)
{
-    UNIMPLEMENTED(WARNING) << "genNegLong";
-#if 0
-    rlSrc = loadValueWide(cUnit, rlSrc, kCoreReg);
-    RegLocation rlResult = oatEvalLoc(cUnit, rlDest, kCoreReg, true);
-    /*
-     *  [v1 v0] =  -[a1 a0]
-     *    negu    v0,a0
-     *    negu    v1,a1
-     *    sltu    t1,r_zero
-     *    subu    v1,v1,t1
-     */
-
-    opRegReg(cUnit, kOpNeg, rlResult.lowReg, rlSrc.lowReg);
-    opRegReg(cUnit, kOpNeg, rlResult.highReg, rlSrc.highReg);
-    int tReg = oatAllocTemp(cUnit);
-    newLIR3(cUnit, kX86Sltu, tReg, r_ZERO, rlResult.lowReg);
-    opRegRegReg(cUnit, kOpSub, rlResult.highReg, rlResult.highReg, tReg);
-    oatFreeTemp(cUnit, tReg);
-    storeValueWide(cUnit, rlDest, rlResult);
-#endif
-    return false;
+  // 64-bit negate of the value pair r1:r0.
+  oatFlushAllRegs(cUnit);
+  oatLockCallTemps(cUnit);  // Prepare for explicit register usage
+  loadValueDirectWideFixed(cUnit, rlSrc, r0, r1);
+  // Compute (r1:r0) = -(r1:r0)
+  // x86 NEG sets CF when the operand was non-zero (per the ISA), so the
+  // ADC folds the borrow out of the low word into r1 before the final
+  // negate, yielding r1 = -(r1 + carry).
+  opRegReg(cUnit, kOpNeg, r0, r0);  // r0 = -r0
+  opRegImm(cUnit, kOpAdc, r1, 0);   // r1 = r1 + CF
+  opRegReg(cUnit, kOpNeg, r1, r1);  // r1 = -r1
+  RegLocation rlResult = {kLocPhysReg, 1, 0, 0,  0, 0, 1, r0, r1, INVALID_SREG};
+  storeValueWide(cUnit, rlDest, rlResult);
+  return false;
}
 
 void genDebuggerUpdate(CompilationUnit* cUnit, int32_t offset);
diff --git a/src/compiler/codegen/x86/ArchUtility.cc b/src/compiler/codegen/x86/ArchUtility.cc
index 6c54e34..d325f5c 100644
--- a/src/compiler/codegen/x86/ArchUtility.cc
+++ b/src/compiler/codegen/x86/ArchUtility.cc
@@ -49,7 +49,7 @@
 
 /*
  * Interpret a format string and build a string no longer than size
- * See format key in Assemble.c.
+ * See format key in Assemble.cc.
  */
 std::string buildInsnString(const char *fmt, LIR *lir, unsigned char* baseAddr) {
   std::string buf;
@@ -79,6 +79,11 @@
           case 'd':
             buf += StringPrintf("%d", operand);
             break;
+          case 'p': {
+            SwitchTable *tabRec = reinterpret_cast<SwitchTable*>(operand);
+            buf += StringPrintf("0x%08x", tabRec->offset);
+            break;
+          }
           case 'r':
             if (FPREG(operand) || DOUBLEREG(operand)) {
               int fp_reg = operand & FP_REG_MASK;
diff --git a/src/compiler/codegen/x86/Assemble.cc b/src/compiler/codegen/x86/Assemble.cc
index d1a8d64..2639057 100644
--- a/src/compiler/codegen/x86/Assemble.cc
+++ b/src/compiler/codegen/x86/Assemble.cc
@@ -26,7 +26,7 @@
 
 X86EncodingMap EncodingMap[kX86Last] = {
   { kX8632BitData, kData,    IS_UNARY_OP,            { 0, 0, 0x00, 0, 0, 0, 0, 4 }, "data",  "0x!0d" },
-  { kX86Bkpt,      kNullary, NO_OPERAND | IS_BRANCH, { 0, 0, 0xCC, 0, 0, 0, 0, 4 }, "int 3", "" },
+  { kX86Bkpt,      kNullary, NO_OPERAND | IS_BRANCH, { 0, 0, 0xCC, 0, 0, 0, 0, 0 }, "int 3", "" },
   { kX86Nop,       kNop,     IS_UNARY_OP,            { 0, 0, 0x90, 0, 0, 0, 0, 0 }, "nop",   "" },
 
 #define ENCODING_MAP(opname, is_store, \
@@ -197,17 +197,16 @@
 { kX86 ## opname ## 32RI, kShiftRegImm,   IS_BINARY_OP   | SETS_CCODES, { 0, 0,    0xC1, 0, 0, modrm_opcode, 0xD1, 1 }, #opname "32RI", "!0r,!1d" }, \
 { kX86 ## opname ## 32MI, kShiftMemImm,   IS_TERTIARY_OP | SETS_CCODES, { 0, 0,    0xC1, 0, 0, modrm_opcode, 0xD1, 1 }, #opname "32MI", "[!0r+!1d],!2r" }, \
 { kX86 ## opname ## 32AI, kShiftArrayImm, IS_QUIN_OP     | SETS_CCODES, { 0, 0,    0xC1, 0, 0, modrm_opcode, 0xD1, 1 }, #opname "32AI", "[!0r+!1r<<!2d+!3d],!4d" }, \
-{ kX86 ## opname ## 32RC, kShiftRegCl,    IS_BINARY_OP   | SETS_CCODES, { 0, 0,    0xD3, 0, 0, modrm_opcode, 0,    1 }, #opname "32RC", "" }, \
-{ kX86 ## opname ## 32MC, kShiftMemCl,    IS_TERTIARY_OP | SETS_CCODES, { 0, 0,    0xD3, 0, 0, modrm_opcode, 0,    1 }, #opname "32MC", "" }, \
-{ kX86 ## opname ## 32AC, kShiftArrayCl,  IS_QUIN_OP     | SETS_CCODES, { 0, 0,    0xD3, 0, 0, modrm_opcode, 0,    1 }, #opname "32AC", "" }
+{ kX86 ## opname ## 32RC, kShiftRegCl,    IS_BINARY_OP   | SETS_CCODES, { 0, 0,    0xD3, 0, 0, modrm_opcode, 0,    0 }, #opname "32RC", "" }, \
+{ kX86 ## opname ## 32MC, kShiftMemCl,    IS_TERTIARY_OP | SETS_CCODES, { 0, 0,    0xD3, 0, 0, modrm_opcode, 0,    0 }, #opname "32MC", "" }, \
+{ kX86 ## opname ## 32AC, kShiftArrayCl,  IS_QUIN_OP     | SETS_CCODES, { 0, 0,    0xD3, 0, 0, modrm_opcode, 0,    0 }, #opname "32AC", "" }
 
   SHIFT_ENCODING_MAP(Rol, 0x0),
   SHIFT_ENCODING_MAP(Ror, 0x1),
   SHIFT_ENCODING_MAP(Rcl, 0x2),
   SHIFT_ENCODING_MAP(Rcr, 0x3),
   SHIFT_ENCODING_MAP(Sal, 0x4),
-  SHIFT_ENCODING_MAP(Shl, 0x5),
-  SHIFT_ENCODING_MAP(Shr, 0x6),
+  SHIFT_ENCODING_MAP(Shr, 0x5),
   SHIFT_ENCODING_MAP(Sar, 0x7),
 #undef SHIFT_ENCODING_MAP
 
@@ -295,11 +294,16 @@
   { kX86Jcc32, kJcc,  IS_BINARY_OP | IS_BRANCH | NEEDS_FIXUP, { 0,             0, 0x0F, 0x80, 0, 0, 0, 0 }, "Jcc32", "!1c !0t" },
   { kX86Jmp8,  kJmp,  IS_UNARY_OP  | IS_BRANCH | NEEDS_FIXUP, { 0,             0, 0xEB, 0,    0, 0, 0, 0 }, "Jmp8",  "!0t" },
   { kX86Jmp32, kJmp,  IS_UNARY_OP  | IS_BRANCH | NEEDS_FIXUP, { 0,             0, 0xE9, 0,    0, 0, 0, 0 }, "Jmp32", "!0t" },
-  { kX86CallR, kCall, IS_UNARY_OP  | IS_BRANCH,               { 0,             0, 0xE8, 0, 0, 0, 0, 0 }, "CallR", "!0r" },
-  { kX86CallM, kCall, IS_BINARY_OP | IS_BRANCH | IS_LOAD,     { 0,             0, 0xFF, 0, 0, 2, 0, 0 }, "CallM", "[!0r+!1d]" },
-  { kX86CallA, kCall, IS_QUAD_OP   | IS_BRANCH | IS_LOAD,     { 0,             0, 0xFF, 0, 0, 2, 0, 0 }, "CallA", "[!0r+!1r<<!2d+!3d]" },
-  { kX86CallT, kCall, IS_UNARY_OP  | IS_BRANCH | IS_LOAD,     { THREAD_PREFIX, 0, 0xFF, 0, 0, 2, 0, 0 }, "CallT", "fs:[!0d]" },
-  { kX86Ret,   kNullary,NO_OPERAND | IS_BRANCH,               { 0,             0, 0xC3, 0, 0, 0, 0, 0 }, "Ret", "" },
+  { kX86JmpR,  kJmp,  IS_UNARY_OP  | IS_BRANCH,               { 0,             0, 0xFF, 0,    0, 4, 0, 0 }, "JmpR",  "!0r" },
+  { kX86CallR, kCall, IS_UNARY_OP  | IS_BRANCH,               { 0,             0, 0xE8, 0,    0, 0, 0, 0 }, "CallR", "!0r" },
+  { kX86CallM, kCall, IS_BINARY_OP | IS_BRANCH | IS_LOAD,     { 0,             0, 0xFF, 0,    0, 2, 0, 0 }, "CallM", "[!0r+!1d]" },
+  { kX86CallA, kCall, IS_QUAD_OP   | IS_BRANCH | IS_LOAD,     { 0,             0, 0xFF, 0,    0, 2, 0, 0 }, "CallA", "[!0r+!1r<<!2d+!3d]" },
+  { kX86CallT, kCall, IS_UNARY_OP  | IS_BRANCH | IS_LOAD,     { THREAD_PREFIX, 0, 0xFF, 0,    0, 2, 0, 0 }, "CallT", "fs:[!0d]" },
+  { kX86Ret,   kNullary,NO_OPERAND | IS_BRANCH,               { 0,             0, 0xC3, 0,    0, 0, 0, 0 }, "Ret", "" },
+
+  { kX86StartOfMethod, kMacro,  IS_UNARY_OP | SETS_CCODES, { 0,0,0,0,0,0,0,0 },           "StartOfMethod", "!0r" },
+  { kX86PcRelLoadRA,   kPcRel,  IS_LOAD | IS_QUIN_OP,      { 0, 0, 0x8B, 0, 0, 0, 0, 0 }, "PcRelLoadRA",   "!0r,[!1r+!2r<<!3d+!4p]" },
+  { kX86PcRelAdr,      kPcRel,  IS_LOAD | IS_BINARY_OP,    { 0, 0, 0xB8, 0, 0, 0, 0, 4 }, "PcRelAdr",      "!0r,!1d" },
 };
 
 static size_t computeSize(X86EncodingMap* entry, int displacement, bool has_sib) {
@@ -323,7 +327,7 @@
   }
   if (displacement != 0) {
     if (entry->opcode != kX86Lea32RA) {
-      DCHECK_NE(entry->flags & (IS_LOAD | IS_STORE), 0);
+      DCHECK_NE(entry->flags & (IS_LOAD | IS_STORE), 0) << entry->name;
     }
     size += IS_SIMM8(displacement) ? 1 : 4;
   }
@@ -428,9 +432,11 @@
     case kJmp:
       if (lir->opcode == kX86Jmp8) {
         return 2;  // opcode + rel8
-      } else {
-        DCHECK(lir->opcode == kX86Jmp32);
+      } else if (lir->opcode == kX86Jmp32) {
         return 5;  // opcode + rel32
+      } else {
+        DCHECK(lir->opcode == kX86JmpR);
+        return 2;  // opcode + modrm
       }
     case kCall:
       switch (lir->opcode) {
@@ -445,6 +451,19 @@
           break;
       }
       break;
+    case kPcRel:
+      if (entry->opcode == kX86PcRelLoadRA) {
+        // lir operands - 0: reg, 1: base, 2: index, 3: scale, 4: table
+        return computeSize(entry, 0x12345678, true);
+      } else {
+        DCHECK(entry->opcode == kX86PcRelAdr);
+        return 5; // opcode with reg + 4 byte immediate
+      }
+    case kMacro:
+      DCHECK_EQ(lir->opcode, static_cast<int>(kX86StartOfMethod));
+      return 5 /* call opcode + 4 byte displacement */ + 1 /* pop reg */ +
+          computeSize(&EncodingMap[kX86Sub32RI], 0, false) -
+          (lir->operands[0] == rAX  ? 1 : 0);  // shorter ax encoding
     default:
       break;
   }
@@ -802,7 +821,7 @@
 }
 
 static void emitShiftRegImm(CompilationUnit* cUnit, const X86EncodingMap* entry,
-                       uint8_t reg, int imm) {
+                            uint8_t reg, int imm) {
   if (entry->skeleton.prefix1 != 0) {
     cUnit->codeBuffer.push_back(entry->skeleton.prefix1);
     if (entry->skeleton.prefix2 != 0) {
@@ -829,7 +848,7 @@
     DCHECK_EQ(0, entry->skeleton.extra_opcode2);
   }
   DCHECK_LT(reg, 8);
-  uint8_t modrm = (0 << 6) | (entry->skeleton.modrm_opcode << 3) | reg;
+  uint8_t modrm = (3 << 6) | (entry->skeleton.modrm_opcode << 3) | reg;
   cUnit->codeBuffer.push_back(modrm);
   if (imm != 1) {
     DCHECK_EQ(entry->skeleton.immediate_bytes, 1);
@@ -838,18 +857,67 @@
   }
 }
 
+// Emit a shift/rotate of 'reg' by the count in CL (opcode 0xD3 with the
+// operation selected by modrm_opcode).  The count register is implicit in
+// the encoding; 'cl' is only validated to be rCX.
+static void emitShiftRegCl(CompilationUnit* cUnit, const X86EncodingMap* entry,
+                           uint8_t reg, uint8_t cl) {
+  DCHECK_EQ(cl, static_cast<uint8_t>(rCX));
+  if (entry->skeleton.prefix1 != 0) {
+    cUnit->codeBuffer.push_back(entry->skeleton.prefix1);
+    if (entry->skeleton.prefix2 != 0) {
+      cUnit->codeBuffer.push_back(entry->skeleton.prefix2);
+    }
+  } else {
+    DCHECK_EQ(0, entry->skeleton.prefix2);
+  }
+  cUnit->codeBuffer.push_back(entry->skeleton.opcode);
+  DCHECK_EQ(0, entry->skeleton.extra_opcode1);
+  DCHECK_EQ(0, entry->skeleton.extra_opcode2);
+  DCHECK_LT(reg, 8);
+  uint8_t modrm = (3 << 6) | (entry->skeleton.modrm_opcode << 3) | reg;  // mod=11: register-direct
+  cUnit->codeBuffer.push_back(modrm);
+  DCHECK_EQ(0, entry->skeleton.ax_opcode);
+  DCHECK_EQ(0, entry->skeleton.immediate_bytes);
+}
+
+// Emit a condition-dependent register op: 0x0F, (0x90 | condition), then a
+// register-direct modrm.  This byte pattern matches the x86 SETcc family
+// (set low byte of 'reg' based on 'condition').
+static void emitRegCond(CompilationUnit* cUnit, const X86EncodingMap* entry,
+                       uint8_t reg, uint8_t condition) {
+  if (entry->skeleton.prefix1 != 0) {
+    cUnit->codeBuffer.push_back(entry->skeleton.prefix1);
+    if (entry->skeleton.prefix2 != 0) {
+      cUnit->codeBuffer.push_back(entry->skeleton.prefix2);
+    }
+  } else {
+    DCHECK_EQ(0, entry->skeleton.prefix2);
+  }
+  DCHECK_EQ(0, entry->skeleton.ax_opcode);
+  DCHECK_EQ(0x0F, entry->skeleton.opcode);
+  cUnit->codeBuffer.push_back(0x0F);
+  DCHECK_EQ(0x90, entry->skeleton.extra_opcode1);
+  cUnit->codeBuffer.push_back(0x90 | condition);  // condition encoded in low nibble
+  DCHECK_EQ(0, entry->skeleton.extra_opcode2);
+  DCHECK_LT(reg, 8);
+  uint8_t modrm = (3 << 6) | (entry->skeleton.modrm_opcode << 3) | reg;
+  cUnit->codeBuffer.push_back(modrm);
+  DCHECK_EQ(entry->skeleton.immediate_bytes, 0);
+}
+
static void emitJmp(CompilationUnit* cUnit, const X86EncodingMap* entry, int rel) {
  if (entry->opcode == kX86Jmp8) {
    DCHECK(IS_SIMM8(rel));
    cUnit->codeBuffer.push_back(0xEB);
    cUnit->codeBuffer.push_back(rel & 0xFF);
-  } else {
-    DCHECK(entry->opcode == kX86Jmp32);
+  } else if (entry->opcode == kX86Jmp32) {
    cUnit->codeBuffer.push_back(0xE9);
    cUnit->codeBuffer.push_back(rel & 0xFF);
    cUnit->codeBuffer.push_back((rel >> 8) & 0xFF);
    cUnit->codeBuffer.push_back((rel >> 16) & 0xFF);
    cUnit->codeBuffer.push_back((rel >> 24) & 0xFF);
+  } else {
+    DCHECK(entry->opcode == kX86JmpR);
+    // Register-indirect jump (FF /4, register-direct modrm).  For kX86JmpR
+    // the 'rel' operand actually carries the register number, not an offset.
+    cUnit->codeBuffer.push_back(entry->skeleton.opcode);
+    uint8_t reg = static_cast<uint8_t>(rel);
+    DCHECK_LT(reg, 8);
+    uint8_t modrm = (3 << 6) | (entry->skeleton.modrm_opcode << 3) | reg;
+    cUnit->codeBuffer.push_back(modrm);
  }
}
 
@@ -932,6 +1000,68 @@
   DCHECK_EQ(0, entry->skeleton.immediate_bytes);
 }
 
+// Emit a reference to a code-relative data table.
+//   kX86PcRelLoadRA: mov reg, [base + index<<scale + disp32], where disp32
+//     is the SwitchTable's offset (patched later; operand 4 is the table).
+//   kX86PcRelAdr: mov reg, imm32 (B8+r) with the FillArrayData table's
+//     offset as the immediate (operand 1 is the table).
+static void emitPcRel(CompilationUnit* cUnit, const X86EncodingMap* entry, uint8_t reg,
+                      int base_or_table, uint8_t index, int scale, int table_or_disp) {
+  int disp;
+  if (entry->opcode == kX86PcRelLoadRA) {
+    // reinterpret_cast for consistency with buildInsnString's '!p' handling
+    // (was a C-style cast).
+    SwitchTable *tabRec = reinterpret_cast<SwitchTable*>(table_or_disp);
+    disp = tabRec->offset;
+  } else {
+    DCHECK(entry->opcode == kX86PcRelAdr);
+    FillArrayData *tabRec = reinterpret_cast<FillArrayData*>(base_or_table);
+    disp = tabRec->offset;
+  }
+  if (entry->skeleton.prefix1 != 0) {
+    cUnit->codeBuffer.push_back(entry->skeleton.prefix1);
+    if (entry->skeleton.prefix2 != 0) {
+      cUnit->codeBuffer.push_back(entry->skeleton.prefix2);
+    }
+  } else {
+    DCHECK_EQ(0, entry->skeleton.prefix2);
+  }
+  if (FPREG(reg)) {
+    reg = reg & FP_REG_MASK;
+  }
+  DCHECK_LT(reg, 8);
+  if (entry->opcode == kX86PcRelLoadRA) {
+    cUnit->codeBuffer.push_back(entry->skeleton.opcode);
+    DCHECK_EQ(0, entry->skeleton.extra_opcode1);
+    DCHECK_EQ(0, entry->skeleton.extra_opcode2);
+    uint8_t modrm = (2 << 6) | (reg << 3) | rSP;  // mod=10 (disp32), rSP selects SIB
+    cUnit->codeBuffer.push_back(modrm);
+    DCHECK_LT(scale, 4);
+    DCHECK_LT(index, 8);
+    DCHECK_LT(base_or_table, 8);
+    uint8_t base = static_cast<uint8_t>(base_or_table);
+    uint8_t sib = (scale << 6) | (index << 3) | base;
+    cUnit->codeBuffer.push_back(sib);
+    DCHECK_EQ(0, entry->skeleton.immediate_bytes);
+  } else {
+    cUnit->codeBuffer.push_back(entry->skeleton.opcode + reg);  // B8+r form
+  }
+  cUnit->codeBuffer.push_back(disp & 0xFF);
+  cUnit->codeBuffer.push_back((disp >> 8) & 0xFF);
+  cUnit->codeBuffer.push_back((disp >> 16) & 0xFF);
+  cUnit->codeBuffer.push_back((disp >> 24) & 0xFF);
+  DCHECK_EQ(0, entry->skeleton.modrm_opcode);
+  DCHECK_EQ(0, entry->skeleton.ax_opcode);
+}
+
+// Expand the kX86StartOfMethod pseudo-op: materialize the runtime address
+// of code offset 0 of the current method into 'reg'.  "call +0" pushes the
+// address of the next instruction; popping it into 'reg' and subtracting
+// (offset of this LIR + 5 bytes for the call) rewinds to the method start.
+static void emitMacro(CompilationUnit* cUnit, const X86EncodingMap* entry,
+                      uint8_t reg, int offset) {
+  DCHECK(entry->opcode == kX86StartOfMethod) << entry->name;
+  cUnit->codeBuffer.push_back(0xE8);  // call +0
+  cUnit->codeBuffer.push_back(0);
+  cUnit->codeBuffer.push_back(0);
+  cUnit->codeBuffer.push_back(0);
+  cUnit->codeBuffer.push_back(0);
+
+  DCHECK_LT(reg, 8);
+  cUnit->codeBuffer.push_back(0x58 + reg);  // pop reg
+
+  emitRegImm(cUnit, &EncodingMap[kX86Sub32RI], reg, offset + 5 /* size of call +0 */);
+}
+
 void emitUnimplemented(CompilationUnit* cUnit, const X86EncodingMap* entry, LIR* lir) {
   UNIMPLEMENTED(WARNING) << "encoding for: " << entry->name;
   for (int i = 0; i < oatGetInsnSize(lir); ++i) {
@@ -949,7 +1079,7 @@
   LIR *lir;
   AssemblerStatus res = kSuccess;  // Assume success
 
-  const bool kVerbosePcFixup = false;
+  const bool kVerbosePcFixup = false;  // debug-only tracing; do not commit a hard-coded method_idx gate
   for (lir = (LIR *) cUnit->firstLIRInsn; lir; lir = NEXT_LIR(lir)) {
     if (lir->opcode < 0) {
       continue;
@@ -982,6 +1112,29 @@
             oatSetupResourceMasks(lir);
             res = kRetryAll;
           }
+          if (kVerbosePcFixup) {
+            LOG(INFO) << "Source:";
+            oatDumpLIRInsn(cUnit, lir, 0);
+            LOG(INFO) << "Target:";
+            oatDumpLIRInsn(cUnit, targetLIR, 0);
+            LOG(INFO) << "Delta " << delta;
+          }
+          lir->operands[0] = delta;
+          break;
+        }
+        case kX86Jcc32: {
+          LIR *targetLIR = lir->target;
+          DCHECK(targetLIR != NULL);
+          intptr_t pc = lir->offset + 6 /* 2 byte opcode + rel32 */;
+          intptr_t target = targetLIR->offset;
+          int delta = target - pc;
+          if (kVerbosePcFixup) {
+            LOG(INFO) << "Source:";
+            oatDumpLIRInsn(cUnit, lir, 0);
+            LOG(INFO) << "Target:";
+            oatDumpLIRInsn(cUnit, targetLIR, 0);
+            LOG(INFO) << "Delta " << delta;
+          }
           lir->operands[0] = delta;
           break;
         }
@@ -1015,6 +1168,15 @@
           lir->operands[0] = delta;
           break;
         }
+        case kX86Jmp32: {
+          LIR *targetLIR = lir->target;
+          DCHECK(targetLIR != NULL);
+          intptr_t pc = lir->offset + 5 /* opcode + rel32 */;
+          intptr_t target = targetLIR->offset;
+          int delta = target - pc;
+          lir->operands[0] = delta;
+          break;
+        }
         default:
           break;
       }
@@ -1028,6 +1190,7 @@
     if (res != kSuccess) {
       continue;
     }
+    CHECK_EQ(static_cast<size_t>(lir->offset), cUnit->codeBuffer.size());
     const X86EncodingMap *entry = &EncodingMap[lir->opcode];
     size_t starting_cbuf_size = cUnit->codeBuffer.size();
     switch (entry->kind) {
@@ -1088,6 +1251,12 @@
       case kShiftRegImm:  // lir operands - 0: reg, 1: immediate
         emitShiftRegImm(cUnit, entry, lir->operands[0], lir->operands[1]);
         break;
+      case kShiftRegCl: // lir operands - 0: reg, 1: cl
+        emitShiftRegCl(cUnit, entry, lir->operands[0], lir->operands[1]);
+        break;
+      case kRegCond:  // lir operands - 0: reg, 1: condition
+        emitRegCond(cUnit, entry, lir->operands[0], lir->operands[1]);
+        break;
       case kJmp:  // lir operands - 0: rel
         emitJmp(cUnit, entry, lir->operands[0]);
         break;
@@ -1107,15 +1276,20 @@
             break;
         }
         break;
+      case kPcRel:  // lir operands - 0: reg, 1: base, 2: index, 3: scale, 4: table
+        emitPcRel(cUnit, entry, lir->operands[0], lir->operands[1], lir->operands[2],
+                  lir->operands[3], lir->operands[4]);
+        break;
+      case kMacro:
+        emitMacro(cUnit, entry, lir->operands[0], lir->offset);
+        break;
       default:
         emitUnimplemented(cUnit, entry, lir);
         break;
     }
-    if (entry->kind != kJcc && entry->kind != kJmp) {
-      CHECK_EQ(static_cast<size_t>(oatGetInsnSize(lir)),
-               cUnit->codeBuffer.size() - starting_cbuf_size)
-          << "Instruction size mismatch for entry: " << EncodingMap[lir->opcode].name;
-    }
+    CHECK_EQ(static_cast<size_t>(oatGetInsnSize(lir)),
+             cUnit->codeBuffer.size() - starting_cbuf_size)
+        << "Instruction size mismatch for entry: " << EncodingMap[lir->opcode].name;
   }
   return res;
 }
diff --git a/src/compiler/codegen/x86/Codegen.h b/src/compiler/codegen/x86/Codegen.h
index 178b986..52ba7c1 100644
--- a/src/compiler/codegen/x86/Codegen.h
+++ b/src/compiler/codegen/x86/Codegen.h
@@ -31,6 +31,12 @@
                 RegLocation rlSrc1, RegLocation rlSrc2);
 bool genSubLong(CompilationUnit* cUnit, MIR* mir, RegLocation rlDest,
                 RegLocation rlSrc1, RegLocation rlSrc2);
+bool genAndLong(CompilationUnit* cUnit, MIR* mir, RegLocation rlDest,
+                RegLocation rlSrc1, RegLocation rlSrc2);
+bool genOrLong(CompilationUnit* cUnit, MIR* mir, RegLocation rlDest,
+                RegLocation rlSrc1, RegLocation rlSrc2);
+bool genXorLong(CompilationUnit* cUnit, MIR* mir, RegLocation rlDest,
+                RegLocation rlSrc1, RegLocation rlSrc2);
 bool genNegLong(CompilationUnit* cUnit, MIR* mir, RegLocation rlDest,
                 RegLocation rlSrc);
 LIR *opRegImm(CompilationUnit* cUnit, OpKind op, int rDestSrc1, int value);
diff --git a/src/compiler/codegen/x86/X86/Factory.cc b/src/compiler/codegen/x86/X86/Factory.cc
index 4987c28..9421744 100644
--- a/src/compiler/codegen/x86/X86/Factory.cc
+++ b/src/compiler/codegen/x86/X86/Factory.cc
@@ -231,6 +231,7 @@
   if (rDest != rSrc1 && rDest != rSrc2) {
     if (op == kOpAdd) { // lea special case, except can't encode rbp as base
       if (rSrc1 == rSrc2) {
+        opRegCopy(cUnit, rDest, rSrc1);
         return opRegImm(cUnit, kOpLsl, rDest, 1);
       } else if (rSrc1 != rBP) {
         return newLIR5(cUnit, kX86Lea32RA, rDest, rSrc1 /* base */,
@@ -285,9 +286,10 @@
     }
   }
   if (rDest != rSrc) {
-    if (op == kOpLsl && value >= 0 && value <= 3) { // lea shift special case
-      return newLIR5(cUnit, kX86Lea32RA, rDest, rSrc /* base */,
-                     r4sib_no_index /* index */, value /* scale */, value /* disp */);
+    if (false && op == kOpLsl && value >= 0 && value <= 3) { // lea shift special case
+      // TODO: fix bug in LEA encoding when disp == 0
+      return newLIR5(cUnit, kX86Lea32RA, rDest,  r5sib_no_base /* base */,
+                     rSrc /* index */, value /* scale */, 0 /* disp */);
     } else if (op == kOpAdd) { // lea add special case
       return newLIR5(cUnit, kX86Lea32RA, rDest, rSrc /* base */,
                      r4sib_no_index /* index */, 0 /* scale */, value /* disp */);
@@ -351,6 +353,7 @@
                                int rIndex, int rDest, int scale, OpSize size)
 {
     UNIMPLEMENTED(WARNING) << "loadBaseIndexed";
+    newLIR0(cUnit, kX86Bkpt);
     return NULL;
 #if 0
     LIR *first = NULL;
@@ -406,6 +409,7 @@
 LIR *loadMultiple(CompilationUnit *cUnit, int rBase, int rMask)
 {
     UNIMPLEMENTED(WARNING) << "loadMultiple";
+    newLIR0(cUnit, kX86Bkpt);
     return NULL;
 #if 0
     int i;
@@ -432,6 +436,7 @@
 LIR *storeMultiple(CompilationUnit *cUnit, int rBase, int rMask)
 {
     UNIMPLEMENTED(WARNING) << "storeMultiple";
+    newLIR0(cUnit, kX86Bkpt);
     return NULL;
 #if 0
     int i;
diff --git a/src/compiler/codegen/x86/X86/Gen.cc b/src/compiler/codegen/x86/X86/Gen.cc
index f957cbc..31939f2 100644
--- a/src/compiler/codegen/x86/X86/Gen.cc
+++ b/src/compiler/codegen/x86/X86/Gen.cc
@@ -79,6 +79,7 @@
 void genSparseSwitch(CompilationUnit* cUnit, MIR* mir, RegLocation rlSrc)
 {
     UNIMPLEMENTED(WARNING) << "genSparseSwitch";
+    newLIR0(cUnit, kX86Bkpt);
     return;
 #if 0
     const u2* table = cUnit->insns + mir->offset + mir->dalvikInsn.vB;
@@ -158,82 +159,55 @@
  *   jr    r_RA
  * done:
  */
-void genPackedSwitch(CompilationUnit* cUnit, MIR* mir, RegLocation rlSrc)
-{
-    UNIMPLEMENTED(WARNING) << "genPackedSwitch";
-#if 0
-    const u2* table = cUnit->insns + mir->offset + mir->dalvikInsn.vB;
-    if (cUnit->printMe) {
-        dumpPackedSwitchTable(table);
-    }
-    // Add the table to the list - we'll process it later
-    SwitchTable *tabRec = (SwitchTable *)oatNew(cUnit, sizeof(SwitchTable),
-                                                true, kAllocData);
-    tabRec->table = table;
-    tabRec->vaddr = mir->offset;
-    int size = table[1];
-    tabRec->targets = (LIR* *)oatNew(cUnit, size * sizeof(LIR*), true,
-                                        kAllocLIR);
-    oatInsertGrowableList(cUnit, &cUnit->switchTables, (intptr_t)tabRec);
+void genPackedSwitch(CompilationUnit* cUnit, MIR* mir, RegLocation rlSrc) {
+  const u2* table = cUnit->insns + mir->offset + mir->dalvikInsn.vB;
+  if (cUnit->printMe) {
+    dumpPackedSwitchTable(table);
+  }
+  // Add the table to the list - we'll process it later
+  SwitchTable *tabRec = (SwitchTable *)oatNew(cUnit, sizeof(SwitchTable),
+                                              true, kAllocData);
+  tabRec->table = table;
+  tabRec->vaddr = mir->offset;
+  int size = table[1];
+  tabRec->targets = (LIR* *)oatNew(cUnit, size * sizeof(LIR*), true,
+                                   kAllocLIR);
+  oatInsertGrowableList(cUnit, &cUnit->switchTables, (intptr_t)tabRec);
 
-    // Get the switch value
-    rlSrc = loadValue(cUnit, rlSrc, kCoreReg);
+  // Get the switch value
+  rlSrc = loadValue(cUnit, rlSrc, kCoreReg);
+  int startOfMethodReg = oatAllocTemp(cUnit);
+  // Materialize a pointer to the switch table
+  //newLIR0(cUnit, kX86Bkpt);
+  newLIR1(cUnit, kX86StartOfMethod, startOfMethodReg);
+  int lowKey = s4FromSwitchData(&table[2]);
+  int keyReg;
+  // Remove the bias, if necessary
+  if (lowKey == 0) {
+    keyReg = rlSrc.lowReg;
+  } else {
+    keyReg = oatAllocTemp(cUnit);
+    opRegRegImm(cUnit, kOpSub, keyReg, rlSrc.lowReg, lowKey);
+  }
+  // Bounds check - if < 0 or >= size continue following switch
+  opRegImm(cUnit, kOpCmp, keyReg, size-1);
+  LIR* branchOver = opCondBranch(cUnit, kCondHi, NULL);
 
-    // Prepare the bias.  If too big, handle 1st stage here
-    int lowKey = s4FromSwitchData(&table[2]);
-    bool largeBias = false;
-    int rKey;
-    if (lowKey == 0) {
-        rKey = rlSrc.lowReg;
-    } else if ((lowKey & 0xffff) != lowKey) {
-        rKey = oatAllocTemp(cUnit);
-        loadConstant(cUnit, rKey, lowKey);
-        largeBias = true;
-    } else {
-        rKey = oatAllocTemp(cUnit);
-    }
+  // Load the displacement from the switch table
+  int dispReg = oatAllocTemp(cUnit);
+  newLIR5(cUnit, kX86PcRelLoadRA, dispReg, startOfMethodReg, keyReg, 2, (intptr_t)tabRec);
+  // Add displacement to start of method
+  opRegReg(cUnit, kOpAdd, startOfMethodReg, dispReg);
+  // ..and go!
+  LIR* switchBranch = newLIR1(cUnit, kX86JmpR, startOfMethodReg);
+  tabRec->anchor = switchBranch;
 
-    // Must prevent code motion for the curr pc pair
-    genBarrier(cUnit);
-    newLIR0(cUnit, kX86CurrPC);  // Really a jal to .+8
-    // Now, fill the branch delay slot with bias strip
-    if (lowKey == 0) {
-        newLIR0(cUnit, kX86Nop);
-    } else {
-        if (largeBias) {
-            opRegRegReg(cUnit, kOpSub, rKey, rlSrc.lowReg, rKey);
-        } else {
-            opRegRegImm(cUnit, kOpSub, rKey, rlSrc.lowReg, lowKey);
-        }
-    }
-    genBarrier(cUnit);  // Scheduling barrier
-
-    // Construct BaseLabel and set up table base register
-    LIR* baseLabel = newLIR0(cUnit, kPseudoTargetLabel);
-    // Remember base label so offsets can be computed later
-    tabRec->anchor = baseLabel;
-
-    // Bounds check - if < 0 or >= size continue following switch
-    LIR* branchOver = opCmpImmBranch(cUnit, kCondHi, rKey, size-1, NULL);
-
-    // Materialize the table base pointer
-    int rBase = oatAllocTemp(cUnit);
-    newLIR4(cUnit, kX86Delta, rBase, 0, (intptr_t)baseLabel, (intptr_t)tabRec);
-
-    // Load the displacement from the switch table
-    int rDisp = oatAllocTemp(cUnit);
-    loadBaseIndexed(cUnit, rBase, rKey, rDisp, 2, kWord);
-
-    // Add to r_AP and go
-    opRegRegReg(cUnit, kOpAdd, r_RA, r_RA, rDisp);
-    opReg(cUnit, kOpBx, r_RA);
-
-    /* branchOver target here */
-    LIR* target = newLIR0(cUnit, kPseudoTargetLabel);
-    branchOver->target = (LIR*)target;
-#endif
+  /* branchOver target here */
+  LIR* target = newLIR0(cUnit, kPseudoTargetLabel);
+  branchOver->target = (LIR*)target;
 }
 
+void callRuntimeHelperRegReg(CompilationUnit* cUnit, int helperOffset, int arg0, int arg1);
 /*
  * Array data table format:
  *  ushort ident = 0x0300   magic value
@@ -246,47 +220,31 @@
  */
 void genFillArrayData(CompilationUnit* cUnit, MIR* mir, RegLocation rlSrc)
 {
-    UNIMPLEMENTED(WARNING) << "genFillArrayData";
-#if 0
-    const u2* table = cUnit->insns + mir->offset + mir->dalvikInsn.vB;
-    // Add the table to the list - we'll process it later
-    FillArrayData *tabRec = (FillArrayData *)
-         oatNew(cUnit, sizeof(FillArrayData), true, kAllocData);
-    tabRec->table = table;
-    tabRec->vaddr = mir->offset;
-    u2 width = tabRec->table[1];
-    u4 size = tabRec->table[2] | (((u4)tabRec->table[3]) << 16);
-    tabRec->size = (size * width) + 8;
+  const u2* table = cUnit->insns + mir->offset + mir->dalvikInsn.vB;
+  // Add the table to the list - we'll process it later
+  FillArrayData *tabRec = (FillArrayData *)oatNew(cUnit, sizeof(FillArrayData), true, kAllocData);
+  tabRec->table = table;
+  tabRec->vaddr = mir->offset;
+  u2 width = tabRec->table[1];
+  u4 size = tabRec->table[2] | (((u4)tabRec->table[3]) << 16);
+  tabRec->size = (size * width) + 8;
 
-    oatInsertGrowableList(cUnit, &cUnit->fillArrayData, (intptr_t)tabRec);
+  oatInsertGrowableList(cUnit, &cUnit->fillArrayData, (intptr_t)tabRec);
 
-    // Making a call - use explicit registers
-    oatFlushAllRegs(cUnit);   /* Everything to home location */
-    oatLockCallTemps(cUnit);
-    loadValueDirectFixed(cUnit, rlSrc, rARG0);
-
-    // Must prevent code motion for the curr pc pair
-    genBarrier(cUnit);
-    newLIR0(cUnit, kX86CurrPC);  // Really a jal to .+8
-    // Now, fill the branch delay slot with the helper load
-    int rTgt = loadHelper(cUnit, OFFSETOF_MEMBER(Thread,
-                          pHandleFillArrayDataFromCode));
-    genBarrier(cUnit);  // Scheduling barrier
-
-    // Construct BaseLabel and set up table base register
-    LIR* baseLabel = newLIR0(cUnit, kPseudoTargetLabel);
-
-    // Materialize a pointer to the fill data image
-    newLIR4(cUnit, kX86Delta, rARG1, 0, (intptr_t)baseLabel, (intptr_t)tabRec);
-
-    // And go...
-    callRuntimeHelper(cUnit, rTgt);  // ( array*, fill_data* )
-#endif
+  // Making a call - use explicit registers
+  oatFlushAllRegs(cUnit);   /* Everything to home location */
+  loadValueDirectFixed(cUnit, rlSrc, rARG0);
+  // Materialize a pointer to the fill data image
+  newLIR1(cUnit, kX86StartOfMethod, rARG2);
+  newLIR2(cUnit, kX86PcRelAdr, rARG1, (intptr_t)tabRec);
+  newLIR2(cUnit, kX86Add32RR, rARG1, rARG2);
+  callRuntimeHelperRegReg(cUnit, ENTRYPOINT_OFFSET(pHandleFillArrayDataFromCode), rARG0, rARG1);
 }
 
 void genNegFloat(CompilationUnit *cUnit, RegLocation rlDest, RegLocation rlSrc)
 {
     UNIMPLEMENTED(WARNING) << "genNegFloat";
+    newLIR0(cUnit, kX86Bkpt);
 #if 0
     RegLocation rlResult;
     rlSrc = loadValue(cUnit, rlSrc, kCoreReg);
@@ -300,6 +258,7 @@
 void genNegDouble(CompilationUnit *cUnit, RegLocation rlDest, RegLocation rlSrc)
 {
     UNIMPLEMENTED(WARNING) << "genNegDouble";
+    newLIR0(cUnit, kX86Bkpt);
 #if 0
     RegLocation rlResult;
     rlSrc = loadValueWide(cUnit, rlSrc, kCoreReg);
@@ -311,21 +270,20 @@
 #endif
 }
 
+LIR* genNullCheck(CompilationUnit* cUnit, int sReg, int mReg, MIR* mir);
+void callRuntimeHelperReg(CompilationUnit* cUnit, int helperOffset, int arg0);
+
 /*
  * TODO: implement fast path to short-circuit thin-lock case
  */
 void genMonitorEnter(CompilationUnit* cUnit, MIR* mir, RegLocation rlSrc)
 {
-    UNIMPLEMENTED(WARNING) << "genMonitorEnter";
-#if 0
     oatFlushAllRegs(cUnit);
     loadValueDirectFixed(cUnit, rlSrc, rARG0);  // Get obj
     oatLockCallTemps(cUnit);  // Prepare for explicit register usage
     genNullCheck(cUnit, rlSrc.sRegLow, rARG0, mir);
     // Go expensive route - artLockObjectFromCode(self, obj);
-    int rTgt = loadHelper(cUnit, OFFSETOF_MEMBER(Thread, pLockObjectFromCode));
-    callRuntimeHelper(cUnit, rTgt);
-#endif
+    callRuntimeHelperReg(cUnit, ENTRYPOINT_OFFSET(pLockObjectFromCode), rARG0);
 }
 
 /*
@@ -333,16 +291,12 @@
  */
 void genMonitorExit(CompilationUnit* cUnit, MIR* mir, RegLocation rlSrc)
 {
-    UNIMPLEMENTED(WARNING) << "genMonitor";
-#if 0
     oatFlushAllRegs(cUnit);
     loadValueDirectFixed(cUnit, rlSrc, rARG0);  // Get obj
     oatLockCallTemps(cUnit);  // Prepare for explicit register usage
     genNullCheck(cUnit, rlSrc.sRegLow, rARG0, mir);
     // Go expensive route - UnlockObjectFromCode(obj);
-    int rTgt = loadHelper(cUnit, OFFSETOF_MEMBER(Thread, pUnlockObjectFromCode));
-    callRuntimeHelper(cUnit, rTgt);
-#endif
+    callRuntimeHelperReg(cUnit, ENTRYPOINT_OFFSET(pUnlockObjectFromCode), rARG0);
 }
 
 /*
@@ -364,26 +318,20 @@
 void genCmpLong(CompilationUnit* cUnit, MIR* mir, RegLocation rlDest,
                 RegLocation rlSrc1, RegLocation rlSrc2)
 {
-    UNIMPLEMENTED(WARNING) << "genCmpLong";
-#if 0
-    rlSrc1 = loadValueWide(cUnit, rlSrc1, kCoreReg);
-    rlSrc2 = loadValueWide(cUnit, rlSrc2, kCoreReg);
-    int t0 = oatAllocTemp(cUnit);
-    int t1 = oatAllocTemp(cUnit);
-    RegLocation rlResult = oatEvalLoc(cUnit, rlDest, kCoreReg, true);
-    newLIR3(cUnit, kX86Slt, t0, rlSrc1.highReg, rlSrc2.highReg);
-    newLIR3(cUnit, kX86Slt, t1, rlSrc2.highReg, rlSrc1.highReg);
-    newLIR3(cUnit, kX86Subu, rlResult.lowReg, t1, t0);
-    LIR* branch = opCmpImmBranch(cUnit, kCondNe, rlResult.lowReg, 0, NULL);
-    newLIR3(cUnit, kX86Sltu, t0, rlSrc1.lowReg, rlSrc2.lowReg);
-    newLIR3(cUnit, kX86Sltu, t1, rlSrc2.lowReg, rlSrc1.lowReg);
-    newLIR3(cUnit, kX86Subu, rlResult.lowReg, t1, t0);
-    oatFreeTemp(cUnit, t0);
-    oatFreeTemp(cUnit, t1);
-    LIR* target = newLIR0(cUnit, kPseudoTargetLabel);
-    branch->target = (LIR*)target;
+    oatFlushAllRegs(cUnit);
+    oatLockCallTemps(cUnit);  // Prepare for explicit register usage
+    loadValueDirectWideFixed(cUnit, rlSrc1, r0, r1);
+    loadValueDirectWideFixed(cUnit, rlSrc2, r2, r3);
+    // Compute (r1:r0) = (r1:r0) - (r3:r2)
+    opRegReg(cUnit, kOpSub, r0, r2);  // r0 = r0 - r2
+    opRegReg(cUnit, kOpSbc, r1, r3);  // r1 = r1 - r3 - CF
+    opRegReg(cUnit, kOpOr, r0, r1);   // r0 = high | low - sets ZF
+    newLIR2(cUnit, kX86Set8R, r0, kX86CondNz);  // r0 = (r1:r0) != (r3:r2) ? 1 : 0
+    newLIR2(cUnit, kX86Movzx8RR, r0, r0);
+    opRegImm(cUnit, kOpAsr, r1, 31);  // r1 = high >> 31
+    opRegReg(cUnit, kOpOr, r0, r1);   // r0 holds result
+    RegLocation rlResult = LOC_C_RETURN;
     storeValue(cUnit, rlDest, rlResult);
-#endif
 }
 
 X86ConditionCode oatX86ConditionEncoding(ConditionCode cond) {
@@ -420,8 +368,12 @@
 LIR* opCmpImmBranch(CompilationUnit* cUnit, ConditionCode cond, int reg,
                     int checkValue, LIR* target)
 {
-  // TODO: when checkValue == 0 and reg is rCX, use the jcxz/nz opcode
-  newLIR2(cUnit, kX86Cmp32RI, reg, checkValue);
+  if (false && (checkValue == 0) && (cond == kCondEq || cond == kCondNe)) {
+    // TODO: when checkValue == 0 and reg is rCX, use the jcxz/nz opcode
+    // newLIR2(cUnit, kX86Test32RR, reg, reg);
+  } else {
+    newLIR2(cUnit, kX86Cmp32RI, reg, checkValue);
+  }
   X86ConditionCode cc = oatX86ConditionEncoding(cond);
   LIR* branch = newLIR2(cUnit, kX86Jcc8, 0 /* lir operand for Jcc offset */ , cc);
   branch->target = target;
@@ -458,10 +410,12 @@
       opRegCopy(cUnit, S2D(destLo, destHi), S2D(srcLo, srcHi));
     } else {
       UNIMPLEMENTED(WARNING);
+      newLIR0(cUnit, kX86Bkpt);
     }
   } else {
     if (srcFP) {
       UNIMPLEMENTED(WARNING);
+      newLIR0(cUnit, kX86Bkpt);
     } else {
       // Handle overlap
       if (srcHi == destLo) {
diff --git a/src/compiler/codegen/x86/X86LIR.h b/src/compiler/codegen/x86/X86LIR.h
index 1fc44b3..85d2565 100644
--- a/src/compiler/codegen/x86/X86LIR.h
+++ b/src/compiler/codegen/x86/X86LIR.h
@@ -194,6 +194,7 @@
   r4sib_no_index = r4sp,
   r5     = 5,
   rBP    = r5,
+  r5sib_no_base = r5,
   r6     = 6,
   rSI    = r6,
   r7     = 7,
@@ -277,7 +278,7 @@
     kX86CondNge = kX86CondL,  // not-greater-equal
 
     kX86CondNl  = 0xD,        // not-less-than
-    kX86CondGe  = kX86CondL,  // not-greater-equal
+    kX86CondGe  = kX86CondNl, // greater-equal
 
     kX86CondLe  = 0xE,        // less-than-equal
     kX86CondNg  = kX86CondLe, // not-greater
@@ -387,7 +388,6 @@
     BinaryShiftOpCode(kX86Rcl),
     BinaryShiftOpCode(kX86Rcr),
     BinaryShiftOpCode(kX86Sal),
-    BinaryShiftOpCode(kX86Shl),
     BinaryShiftOpCode(kX86Shr),
     BinaryShiftOpCode(kX86Sar),
 #undef BinaryShiftOpcode
@@ -447,12 +447,18 @@
 #undef Binary0fOpCode
     kX86Jcc8, kX86Jcc32,  // jCC rel8/32; lir operands - 0: rel, 1: CC, target assigned
     kX86Jmp8, kX86Jmp32,  // jmp rel8/32; lir operands - 0: rel, target assigned
+    kX86JmpR,   // jmp reg; lir operands - 0: reg
     kX86CallR,  // call reg; lir operands - 0: reg
     kX86CallM,  // call [base + disp]; lir operands - 0: base, 1: disp
     kX86CallA,  // call [base + index * scale + disp]
                 // lir operands - 0: base, 1: index, 2: scale, 3: disp
     kX86CallT,  // call fs:[disp]; fs: is equal to Thread::Current(); lir operands - 0: disp
     kX86Ret,    // ret; no lir operands
+    kX86StartOfMethod,    // call 0; pop reg; sub reg, # - generate start of method into reg
+                          // lir operands - 0: reg
+    kX86PcRelLoadRA, // mov reg, [base + index * scale + PC relative displacement]
+                     // lir operands - 0: reg, 1: base, 2: index, 3: scale, 4: table
+    kX86PcRelAdr, // mov reg, PC relative displacement; lir operands - 0: reg, 1: table
     kX86Last
 };
 
@@ -472,6 +478,8 @@
   kRegRegReg, kRegRegMem, kRegRegArray,    // RRR, RRM, RRA instruction kinds.
   kRegCond, kMemCond, kArrayCond,          // R, M, A instruction kinds following by a condition.
   kJmp, kJcc, kCall,           // Branch instruction kinds.
+  kPcRel,                      // Operation with displacement that is PC relative
+  kMacro,                      // An instruction composing multiple others
   kUnimplemented               // Encoding used when an instruction isn't yet implemented.
 };
 
diff --git a/src/compiler/codegen/x86/X86RallocUtil.cc b/src/compiler/codegen/x86/X86RallocUtil.cc
index ba5c063..2971632 100644
--- a/src/compiler/codegen/x86/X86RallocUtil.cc
+++ b/src/compiler/codegen/x86/X86RallocUtil.cc
@@ -96,9 +96,9 @@
 /* Clobber all regs that might be used by an external C call */
 extern void oatClobberCalleeSave(CompilationUnit *cUnit)
 {
-    oatClobber(cUnit, rBP);
-    oatClobber(cUnit, rSI);
-    oatClobber(cUnit, rDI);
+    oatClobber(cUnit, rAX);
+    oatClobber(cUnit, rCX);
+    oatClobber(cUnit, rDX);
 }
 
 extern RegLocation oatGetReturnWideAlt(CompilationUnit* cUnit) {
diff --git a/src/disassembler_x86.cc b/src/disassembler_x86.cc
index 4c8c09a..d7ee80b 100644
--- a/src/disassembler_x86.cc
+++ b/src/disassembler_x86.cc
@@ -57,16 +57,14 @@
   DumpReg0(os, rex, reg_num, byte_operand, size_override);
 }
 
-static void DumpBaseReg(std::ostream& os, uint8_t rex, uint8_t reg,
-                        bool byte_operand, uint8_t size_override) {
+static void DumpBaseReg(std::ostream& os, uint8_t rex, uint8_t reg) {
   size_t reg_num = reg;  // TODO: combine with REX.B on 64bit
-  DumpReg0(os, rex, reg_num, byte_operand, size_override);
+  DumpReg0(os, rex, reg_num, false, 0);
 }
 
-static void DumpIndexReg(std::ostream& os, uint8_t rex, uint8_t reg,
-                         bool byte_operand, uint8_t size_override) {
+static void DumpIndexReg(std::ostream& os, uint8_t rex, uint8_t reg) {
   int reg_num = reg;  // TODO: combine with REX.X on 64bit
-  DumpReg0(os, rex, reg_num, byte_operand, size_override);
+  DumpReg0(os, rex, reg_num, false, 0);
 }
 
 static void DumpSegmentOverride(std::ostream& os, uint8_t segment_prefix) {
@@ -88,7 +86,7 @@
   const char** modrm_opcodes = NULL;
   do {
     switch (*instr) {
-      // Group 1 - lock and repeat prefixes:
+        // Group 1 - lock and repeat prefixes:
       case 0xF0:
       case 0xF2:
       case 0xF3:
@@ -203,6 +201,20 @@
   case 0x0F:  // 2 byte extended opcode
     instr++;
     switch (*instr) {
+      case 0x10: case 0x11:
+        if (prefix[0] == 0xF2) {
+          opcode << "movsd";
+        } else if (prefix[0] == 0xF3) {
+          opcode << "movss";
+        } else if (prefix[2] == 0x66) {
+          opcode << "movupd";
+        } else {
+          opcode << "movups";
+        }
+        has_modrm = true;
+        load = *instr == 0x10;
+        store = !load;
+        break;
       case 0x38:  // 3 byte extended opcode
         opcode << StringPrintf("unknown opcode '0F 38 %02X'", *instr);
         break;
@@ -214,6 +226,16 @@
         opcode << "j" << condition_codes[*instr & 0xF];
         branch_bytes = 4;
         break;
+      case 0x90: case 0x91: case 0x92: case 0x93: case 0x94: case 0x95: case 0x96: case 0x97:
+      case 0x98: case 0x99: case 0x9A: case 0x9B: case 0x9C: case 0x9D: case 0x9E: case 0x9F:
+        opcode << "set" << condition_codes[*instr & 0xF];
+        modrm_opcodes = NULL;
+        reg_is_opcode = true;
+        has_modrm = true;
+        store = true;
+        break;
+      case 0xB6: opcode << "movzxb"; has_modrm = true; load = true; break;
+      case 0xB7: opcode << "movzxw"; has_modrm = true; load = true; break;
       default:
         opcode << StringPrintf("unknown opcode '0F %02X'", *instr);
         break;
@@ -228,6 +250,11 @@
     byte_operand = (*instr & 1) == 0;
     immediate_bytes = *instr == 0x81 ? 4 : 1;
     break;
+  case 0x8D:
+    opcode << "lea";
+    has_modrm = true;
+    load = true;
+    break;
   case 0xB0: case 0xB1: case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6: case 0xB7:
     opcode << "mov";
     immediate_bytes = 1;
@@ -238,7 +265,19 @@
     immediate_bytes = 4;
     reg_in_opcode = true;
     break;
+  case 0xC0: case 0xC1:
+    static const char* shift_opcodes[] =
+        {"rol", "ror", "rcl", "rcr", "shl", "shr", "unknown-shift", "sar"};
+    modrm_opcodes = shift_opcodes;
+    has_modrm = true;
+    reg_is_opcode = true;
+    store = true;
+    immediate_bytes = 1;
+    byte_operand = *instr == 0xC0;
+    break;
   case 0xC3: opcode << "ret"; break;
+  case 0xCC: opcode << "int 3"; break;
+  case 0xE8: opcode << "call"; branch_bytes = 4; break;
   case 0xE9: opcode << "jmp"; branch_bytes = 4; break;
   case 0xEB: opcode << "jmp"; branch_bytes = 1; break;
   case 0xFF:
@@ -276,13 +315,13 @@
       uint8_t base = sib & 7;
       address << "[";
       if (base != 5 || mod != 0) {
-        DumpBaseReg(address, rex, base, byte_operand, prefix[2]);
+        DumpBaseReg(address, rex, base);
         if (index != 4) {
           address << " + ";
         }
       }
       if (index != 4) {
-        DumpIndexReg(address, rex, index, byte_operand, prefix[2]);
+        DumpIndexReg(address, rex, index);
         if (ss != 0) {
           address << StringPrintf(" * %d", 1 << ss);
         }
@@ -299,7 +338,7 @@
       if (mod != 3) {
         address << "[";
       }
-      DumpBaseReg(address, rex, rm, byte_operand, prefix[2]);
+      DumpBaseReg(address, rex, rm);
       if (mod == 1) {
         address << StringPrintf(" + %d", *reinterpret_cast<const int8_t*>(instr));
         instr++;
@@ -312,7 +351,7 @@
       }
     }
 
-    if (reg_is_opcode) {
+    if (reg_is_opcode && modrm_opcodes != NULL) {
       opcode << modrm_opcodes[reg_or_opcode];
     }
     if (load) {
diff --git a/src/oat/runtime/arm/oat_support_entrypoints_arm.cc b/src/oat/runtime/arm/oat_support_entrypoints_arm.cc
index fcff424..bed4fba 100644
--- a/src/oat/runtime/arm/oat_support_entrypoints_arm.cc
+++ b/src/oat/runtime/arm/oat_support_entrypoints_arm.cc
@@ -27,7 +27,7 @@
 extern "C" void* art_check_and_alloc_array_from_code_with_access_check(uint32_t, void*, int32_t);
 
 // Cast entrypoints.
-extern uint32_t IsAssignableFromCode(const Class* klass, const Class* ref_class);
+extern "C" uint32_t artIsAssignableFromCode(const Class* klass, const Class* ref_class);
 extern "C" void art_can_put_array_element_from_code(void*, void*);
 extern "C" void art_check_cast_from_code(void*, void*);
 
@@ -152,7 +152,7 @@
   points->pCheckAndAllocArrayFromCodeWithAccessCheck = art_check_and_alloc_array_from_code_with_access_check;
 
   // Cast
-  points->pInstanceofNonTrivialFromCode = IsAssignableFromCode;
+  points->pInstanceofNonTrivialFromCode = artIsAssignableFromCode;
   points->pCanPutArrayElementFromCode = art_can_put_array_element_from_code;
   points->pCheckCastFromCode = art_check_cast_from_code;
 
diff --git a/src/oat/runtime/support_cast.cc b/src/oat/runtime/support_cast.cc
index 987e764..139239f 100644
--- a/src/oat/runtime/support_cast.cc
+++ b/src/oat/runtime/support_cast.cc
@@ -20,7 +20,7 @@
 namespace art {
 
 // Assignable test for code, won't throw.  Null and equality tests already performed
-uint32_t IsAssignableFromCode(const Class* klass, const Class* ref_class) {
+extern "C" uint32_t artIsAssignableFromCode(const Class* klass, const Class* ref_class) {
   DCHECK(klass != NULL);
   DCHECK(ref_class != NULL);
   return klass->IsAssignableFrom(ref_class) ? 1 : 0;
diff --git a/src/oat/runtime/support_stubs.cc b/src/oat/runtime/support_stubs.cc
index 5f7d635..fb0b5a4 100644
--- a/src/oat/runtime/support_stubs.cc
+++ b/src/oat/runtime/support_stubs.cc
@@ -27,7 +27,7 @@
 // Lazily resolve a method. Called by stub code.
 const void* UnresolvedDirectMethodTrampolineFromCode(Method* called, Method** sp, Thread* thread,
                                                      Runtime::TrampolineType type) {
-  // TODO: this code is specific to ARM
+#if defined(__arm__)
   // On entry the stack pointed by sp is:
   // | argN       |  |
   // | ...        |  |
@@ -43,10 +43,35 @@
   // | R1         |    arg1
   // | R0         |
   // | Method*    |  <- sp
-  uintptr_t* regs = reinterpret_cast<uintptr_t*>(reinterpret_cast<byte*>(sp) + kPointerSize);
   DCHECK_EQ(48U, Runtime::Current()->GetCalleeSaveMethod(Runtime::kRefsAndArgs)->GetFrameSizeInBytes());
   Method** caller_sp = reinterpret_cast<Method**>(reinterpret_cast<byte*>(sp) + 48);
+  uintptr_t* regs = reinterpret_cast<uintptr_t*>(reinterpret_cast<byte*>(sp) + kPointerSize);
   uintptr_t caller_pc = regs[10];
+#elif defined(__i386__)
+  // On entry the stack pointed by sp is:
+  // | argN        |  |
+  // | ...         |  |
+  // | arg4        |  |
+  // | arg3 spill  |  |  Caller's frame
+  // | arg2 spill  |  |
+  // | arg1 spill  |  |
+  // | Method*     | ---
+  // | Return      |
+  // | EBP,ESI,EDI |    callee saves
+  // | EBX         |    arg3
+  // | EDX         |    arg2
+  // | ECX         |    arg1
+  // | EAX/Method* |  <- sp
+  DCHECK_EQ(32U, Runtime::Current()->GetCalleeSaveMethod(Runtime::kRefsAndArgs)->GetFrameSizeInBytes());
+  Method** caller_sp = reinterpret_cast<Method**>(reinterpret_cast<byte*>(sp) + 32);
+  uintptr_t* regs = reinterpret_cast<uintptr_t*>(reinterpret_cast<byte*>(sp));
+  uintptr_t caller_pc = regs[7];
+#else
+  UNIMPLEMENTED(FATAL);
+  Method** caller_sp = NULL;
+  uintptr_t* regs = NULL;
+  uintptr_t caller_pc = 0;
+#endif
   FinishCalleeSaveFrameSetup(thread, sp, Runtime::kRefsAndArgs);
   // Start new JNI local reference state
   JNIEnvExt* env = thread->GetJniEnv();
@@ -88,6 +113,7 @@
     shorty = mh.GetShorty();
     shorty_len = mh.GetShortyLength();
   }
+#if !defined(__i386__)
   // Discover shorty (avoid GCs)
   size_t args_in_regs = 0;
   for (size_t i = 1; i < shorty_len; i++) {
@@ -132,6 +158,7 @@
     }
     cur_arg = cur_arg + (c == 'J' || c == 'D' ? 2 : 1);
   }
+#endif
   // Resolve method filling in dex cache
   if (type == Runtime::kUnknownMethod) {
     called = linker->ResolveMethod(dex_method_idx, caller, !is_virtual);
diff --git a/src/oat/runtime/x86/context_x86.cc b/src/oat/runtime/x86/context_x86.cc
index 35bfd01..2af95bb 100644
--- a/src/oat/runtime/x86/context_x86.cc
+++ b/src/oat/runtime/x86/context_x86.cc
@@ -23,7 +23,7 @@
 
 X86Context::X86Context() {
 #ifndef NDEBUG
-  // Initialize registers with easy to spot debug values
+  // Initialize registers with easy to spot debug values.
   for (int i = 0; i < 8; i++) {
     gprs_[i] = 0xEBAD6070+i;
   }
@@ -37,8 +37,8 @@
   size_t spill_count = __builtin_popcount(core_spills);
   CHECK_EQ(method->GetFpSpillMask(), 0u);
   if (spill_count > 0) {
-    // Lowest number spill is furthest away, walk registers and fill into context
-    int j = 1;
+    // Lowest number spill is furthest away, walk registers and fill into context.
+    int j = 2;  // Offset j to skip return address spill.
     for (int i = 0; i < 8; i++) {
       if (((core_spills >> i) & 1) != 0) {
         gprs_[i] = fr.LoadCalleeSave(spill_count - j);
@@ -50,8 +50,11 @@
 
 void X86Context::DoLongJump() {
 #if defined(__i386__)
-  // Load ESP and EIP
-  gprs_[ESP] -= 4;  // push EIP for return
+  // We push all the registers using memory-memory pushes, we then pop-all to get the registers
+  // set up, we then pop esp which will move us down the stack to the delivery address. At the frame
+  // where the exception will be delivered, we push EIP so that the return will take us to the
+  // correct delivery instruction.
+  gprs_[ESP] -= 4;
   *(reinterpret_cast<uintptr_t*>(gprs_[ESP])) = eip_;
   asm volatile(
       "pushl %4\n\t"
diff --git a/src/oat/runtime/x86/oat_support_entrypoints_x86.cc b/src/oat/runtime/x86/oat_support_entrypoints_x86.cc
index 5d525a9..d2f97eb 100644
--- a/src/oat/runtime/x86/oat_support_entrypoints_x86.cc
+++ b/src/oat/runtime/x86/oat_support_entrypoints_x86.cc
@@ -27,7 +27,7 @@
 extern "C" void* art_check_and_alloc_array_from_code_with_access_check(uint32_t, void*, int32_t);
 
 // Cast entrypoints.
-extern uint32_t IsAssignableFromCode(const Class* klass, const Class* ref_class);
+extern "C" uint32_t art_is_assignable_from_code(const Class* klass, const Class* ref_class);
 extern "C" void art_can_put_array_element_from_code(void*, void*);
 extern "C" void art_check_cast_from_code(void*, void*);
 
@@ -72,11 +72,14 @@
 extern int32_t CmplFloat(float a, float b);
 extern int64_t D2L(double d);
 extern int64_t F2L(float f);
+extern "C" int32_t art_idiv_from_code(int32_t, int32_t);
+extern "C" int32_t art_idivmod_from_code(int32_t, int32_t);
 
 // Intrinsic entrypoints.
-extern "C" int32_t __memcmp16(void*, void*, int32_t);
+extern "C" int32_t art_memcmp16(void*, void*, int32_t);
 extern "C" int32_t art_indexof(void*, uint32_t, uint32_t, uint32_t);
 extern "C" int32_t art_string_compareto(void*, void*);
+extern "C" void* art_memcpy(void*, const void*, size_t);
 
 // Invoke entrypoints.
 const void* UnresolvedDirectMethodTrampolineFromCode(Method*, Method**, Thread*,
@@ -112,7 +115,7 @@
   points->pCheckAndAllocArrayFromCodeWithAccessCheck = art_check_and_alloc_array_from_code_with_access_check;
 
   // Cast
-  points->pInstanceofNonTrivialFromCode = IsAssignableFromCode;
+  points->pInstanceofNonTrivialFromCode = art_is_assignable_from_code;
   points->pCanPutArrayElementFromCode = art_can_put_array_element_from_code;
   points->pCheckCastFromCode = art_check_cast_from_code;
 
@@ -174,8 +177,8 @@
   points->pL2f = NULL;
   points->pD2iz = NULL;
   points->pF2iz = NULL;
-  points->pIdiv = NULL;
-  points->pIdivmod = NULL;
+  points->pIdiv = art_idiv_from_code;
+  points->pIdivmod = art_idivmod_from_code;
   points->pD2l = D2L;
   points->pF2l = F2L;
   points->pLadd = NULL;
@@ -191,9 +194,9 @@
 
   // Intrinsics
   points->pIndexOf = art_indexof;
-  points->pMemcmp16 = __memcmp16;
+  points->pMemcmp16 = art_memcmp16;
   points->pStringCompareTo = art_string_compareto;
-  points->pMemcpy = memcpy;
+  points->pMemcpy = art_memcpy;
 
   // Invocation
   points->pUnresolvedDirectMethodTrampolineFromCode = UnresolvedDirectMethodTrampolineFromCode;
diff --git a/src/oat/runtime/x86/runtime_support_x86.S b/src/oat/runtime/x86/runtime_support_x86.S
index 3333469..943b55c 100644
--- a/src/oat/runtime/x86/runtime_support_x86.S
+++ b/src/oat/runtime/x86/runtime_support_x86.S
@@ -51,7 +51,7 @@
 
     /*
      * Macro that sets up the callee save frame to conform with
-     * Runtime::CreateCalleeSaveMethod(...)
+     * Runtime::CreateCalleeSaveMethod(kSaveAll)
      */
 MACRO0(SETUP_SAVE_ALL_CALLEE_SAVE_FRAME)
     pushl %edi  // Save callee saves (ebx is saved/restored by the upcall)
@@ -60,16 +60,24 @@
     subl  LITERAL(16), %esp  // Grow stack by 4 words, bottom word will hold Method*
 END_MACRO
 
-MACRO0(RESTORE_CALLEE_SAVE_FRAME)
-    addl LITERAL(16), %esp  // Remove padding
-    popl %ebp  // Restore callee saves
-    popl %esi
-    popl %edi
+    /*
+     * Macro that sets up the callee save frame to conform with
+     * Runtime::CreateCalleeSaveMethod(kRefsOnly)
+     */
+MACRO0(SETUP_REF_ONLY_CALLEE_SAVE_FRAME)
+    pushl %edi  // Save callee saves (ebx is saved/restored by the upcall)
+    pushl %esi
+    pushl %ebp
+    subl  LITERAL(16), %esp  // Grow stack by 4 words, bottom word will hold Method*
+END_MACRO
+
+MACRO0(RESTORE_REF_ONLY_CALLEE_SAVE_FRAME)
+    addl LITERAL(28), %esp  // Unwind stack up to return address
 END_MACRO
 
     /*
      * Macro that sets up the callee save frame to conform with
-     * Runtime::CreateCalleeSaveMethod(...)
+     * Runtime::CreateCalleeSaveMethod(kRefsAndArgs)
      */
 MACRO0(SETUP_REF_AND_ARG_CALLEE_SAVE_FRAME)
     pushl %edi  // Save callee saves
@@ -82,7 +90,10 @@
 END_MACRO
 
 MACRO0(RESTORE_REF_AND_ARG_CALLEE_SAVE_FRAME)
-    addl LITERAL(16), %esp  // Remove padding
+    addl LITERAL(4), %esp  // Remove padding
+    popl %ecx  // Restore args except eax
+    popl %edx
+    popl %ebx
     popl %ebp  // Restore callee saves
     popl %esi
     popl %edi
@@ -141,9 +152,9 @@
     // Outgoing argument set up
     pushl %edx                    // pass SP
     pushl %fs:THREAD_SELF_OFFSET  // pass Thread::Current()
-    pushl %eax                    // pass arg1
     pushl %ecx                    // pass arg2
-    call VAR(cxx_name, 1)         // cxx_name(arg2, arg1, Thread*, SP)
+    pushl %eax                    // pass arg1
+    call VAR(cxx_name, 1)         // cxx_name(arg1, arg2, Thread*, SP)
     int3                          // unreached
 END_MACRO
 
@@ -204,7 +215,39 @@
     .globl VAR(c_name, 0)
     ALIGN_FUNCTION_ENTRY
 VAR(c_name, 0):
-    int3
+    // Set up the callee save frame to conform with Runtime::CreateCalleeSaveMethod(kRefsAndArgs)
+    // return address
+    pushl %edi
+    pushl %esi
+    pushl %ebp
+    pushl %ebx
+    pushl %edx
+    pushl %ecx
+    pushl %eax  // <-- callee save Method* to go here
+    movl %esp, %edx  // remember SP
+    // Outgoing argument set up
+    subl LITERAL(12), %esp        // alignment padding
+    pushl %edx                    // pass SP
+    pushl %fs:THREAD_SELF_OFFSET  // pass Thread::Current()
+    pushl 32(%edx)                // pass caller Method*
+    pushl %ecx                    // pass arg2
+    pushl %eax                    // pass arg1
+    call VAR(cxx_name, 1)         // cxx_name(arg1, arg2, arg3, Thread*, SP)
+    movl %edx, %edi               // save code pointer in EDI
+    addl LITERAL(36), %esp        // Pop arguments skip eax
+    popl %ecx                     // Restore args
+    popl %edx
+    popl %ebx
+    popl %ebp  // Restore callee saves.
+    popl %esi
+    // Swap EDI callee save with code pointer.
+    xchgl %edi, (%esp)
+    testl %eax, %eax              // Branch forward if exception pending.
+    jz    1f
+    // Tail call to intended method.
+    ret
+1:
+    DELIVER_PENDING_EXCEPTION
 END_MACRO
 
 INVOKE_TRAMPOLINE art_invoke_interface_trampoline, artInvokeInterfaceTrampoline
@@ -215,6 +258,189 @@
 INVOKE_TRAMPOLINE art_invoke_super_trampoline_with_access_check, artInvokeSuperTrampolineWithAccessCheck
 INVOKE_TRAMPOLINE art_invoke_virtual_trampoline_with_access_check, artInvokeVirtualTrampolineWithAccessCheck
 
+MACRO2(TWO_ARG_ALLOC, c_name, cxx_name)
+    .globl VAR(c_name, 0)
+    ALIGN_FUNCTION_ENTRY
+VAR(c_name, 0):
+    SETUP_REF_ONLY_CALLEE_SAVE_FRAME  // save ref containing registers for GC
+    mov %esp, %edx                // remember SP
+    // Outgoing argument set up
+    pushl %edx                    // pass SP
+    pushl %fs:THREAD_SELF_OFFSET  // pass Thread::Current()
+    pushl %ecx                    // pass arg2
+    pushl %eax                    // pass arg1
+    call VAR(cxx_name, 1)         // cxx_name(arg1, arg2, Thread*, SP)
+    addl LITERAL(16), %esp        // pop arguments
+    RESTORE_REF_ONLY_CALLEE_SAVE_FRAME  // restore frame up to return address
+    testl %eax, %eax               // eax == 0 ?
+    jz  1f
+    ret
+1:
+    DELIVER_PENDING_EXCEPTION
+END_MACRO
+
+MACRO2(THREE_ARG_ALLOC, c_name, cxx_name)
+    .globl VAR(c_name, 0)
+    ALIGN_FUNCTION_ENTRY
+VAR(c_name, 0):
+    SETUP_REF_ONLY_CALLEE_SAVE_FRAME  // save ref containing registers for GC
+    mov %esp, %ebx                // remember SP
+    // Outgoing argument set up
+    subl LITERAL(12), %esp        // alignment padding
+    pushl %ebx                    // pass SP
+    pushl %fs:THREAD_SELF_OFFSET  // pass Thread::Current()
+    pushl %edx                    // pass arg3
+    pushl %ecx                    // pass arg2
+    pushl %eax                    // pass arg1
+    call VAR(cxx_name, 1)         // cxx_name(arg1, arg2, arg3, Thread*, SP)
+    addl LITERAL(32), %esp        // pop arguments
+    RESTORE_REF_ONLY_CALLEE_SAVE_FRAME  // restore frame up to return address
+    testl %eax, %eax               // eax == 0 ?
+    jz  1f
+    ret
+1:
+    DELIVER_PENDING_EXCEPTION
+END_MACRO
+
+TWO_ARG_ALLOC art_alloc_object_from_code, artAllocObjectFromCode
+TWO_ARG_ALLOC art_alloc_object_from_code_with_access_check, artAllocObjectFromCodeWithAccessCheck
+THREE_ARG_ALLOC art_alloc_array_from_code, artAllocArrayFromCode
+THREE_ARG_ALLOC art_alloc_array_from_code_with_access_check, artAllocArrayFromCodeWithAccessCheck
+THREE_ARG_ALLOC art_check_and_alloc_array_from_code, artCheckAndAllocArrayFromCode
+THREE_ARG_ALLOC art_check_and_alloc_array_from_code_with_access_check, artCheckAndAllocArrayFromCodeWithAccessCheck
+
+TWO_ARG_ALLOC art_resolve_string_from_code, artResolveStringFromCode
+TWO_ARG_ALLOC art_initialize_static_storage_from_code, artInitializeStaticStorageFromCode
+
+    .globl art_lock_object_from_code
+    ALIGN_FUNCTION_ENTRY
+art_lock_object_from_code:
+    SETUP_REF_ONLY_CALLEE_SAVE_FRAME  // save ref containing registers for GC
+    mov %esp, %edx                // remember SP
+    // Outgoing argument set up
+    pushl %eax                    // alignment padding
+    pushl %edx                    // pass SP
+    pushl %fs:THREAD_SELF_OFFSET  // pass Thread::Current()
+    pushl %eax                    // pass arg1
+    call artLockObjectFromCode    // (Object*, Thread*, SP)
+    addl LITERAL(16), %esp        // pop arguments
+    RESTORE_REF_ONLY_CALLEE_SAVE_FRAME  // restore frame up to return address
+    ret
+
+    .globl art_unlock_object_from_code
+    ALIGN_FUNCTION_ENTRY
+art_unlock_object_from_code:
+    SETUP_REF_ONLY_CALLEE_SAVE_FRAME  // save ref containing registers for GC
+    mov %esp, %edx                // remember SP
+    // Outgoing argument set up
+    pushl %eax                    // alignment padding
+    pushl %edx                    // pass SP
+    pushl %fs:THREAD_SELF_OFFSET  // pass Thread::Current()
+    pushl %eax                    // pass arg1
+    call artUnlockObjectFromCode  // (Object*, Thread*, SP)
+    addl LITERAL(16), %esp        // pop arguments
+    RESTORE_REF_ONLY_CALLEE_SAVE_FRAME  // restore frame up to return address
+    testl %eax, %eax               // eax == 0 ?
+    jnz 1f
+    ret
+1:
+    DELIVER_PENDING_EXCEPTION
+
+    .globl art_handle_fill_data_from_code
+    ALIGN_FUNCTION_ENTRY
+art_handle_fill_data_from_code:
+    SETUP_REF_ONLY_CALLEE_SAVE_FRAME  // save ref containing registers for GC
+    mov %esp, %edx                // remember SP
+    // Outgoing argument set up
+    pushl %edx                    // pass SP
+    pushl %fs:THREAD_SELF_OFFSET  // pass Thread::Current()
+    pushl %ecx                    // pass arg2
+    pushl %eax                    // pass arg1
+    call artHandleFillArrayDataFromCode  // (Array* array, const uint16_t* table, Thread*, SP)
+    addl LITERAL(16), %esp        // pop arguments
+    RESTORE_REF_ONLY_CALLEE_SAVE_FRAME  // restore frame up to return address
+    testl %eax, %eax               // eax == 0 ?
+    jnz 1f
+    ret
+1:
+    DELIVER_PENDING_EXCEPTION
+
+    .globl art_is_assignable_from_code
+    ALIGN_FUNCTION_ENTRY
+art_is_assignable_from_code:
+    pushl %eax                    // alignment padding
+    pushl %ecx                    // pass arg2
+    pushl %eax                    // pass arg1
+    call artIsAssignableFromCode  // (Class* a, Class* b)
+    addl LITERAL(12), %esp        // pop arguments
+    ret
+
+    .globl art_memcpy
+    ALIGN_FUNCTION_ENTRY
+art_memcpy:
+    pushl %edx                    // pass arg3
+    pushl %ecx                    // pass arg2
+    pushl %eax                    // pass arg1
+    call memcpy                   // (void*, const void*, size_t)
+    addl LITERAL(12), %esp        // pop arguments
+    ret
+
+    .globl art_check_cast_from_code
+    ALIGN_FUNCTION_ENTRY
+art_check_cast_from_code:
+    SETUP_REF_ONLY_CALLEE_SAVE_FRAME  // save ref containing registers for GC
+    mov %esp, %edx                // remember SP
+    // Outgoing argument set up
+    pushl %edx                    // pass SP
+    pushl %fs:THREAD_SELF_OFFSET  // pass Thread::Current()
+    pushl %ecx                    // pass arg2
+    pushl %eax                    // pass arg1
+    call artCheckCastFromCode     // (Class* a, Class* b, Thread*, SP)
+    addl LITERAL(16), %esp        // pop arguments
+    RESTORE_REF_ONLY_CALLEE_SAVE_FRAME  // restore frame up to return address
+    testl %eax, %eax               // eax == 0 ?
+    jnz 1f
+    ret
+1:
+    DELIVER_PENDING_EXCEPTION
+
+    .globl art_idiv_from_code
+    ALIGN_FUNCTION_ENTRY
+art_idiv_from_code:
+    cdq         // edx:eax = sign extend eax
+    idiv %ecx   // (edx,eax) = (edx:eax % ecx, edx:eax / ecx)
+    ret
+
+    .globl art_idivmod_from_code
+    ALIGN_FUNCTION_ENTRY
+art_idivmod_from_code:
+    cdq         // edx:eax = sign extend eax
+    idiv %ecx   // (edx,eax) = (edx:eax % ecx, edx:eax / ecx)
+    movl %eax, %edx
+    ret
+
+    .globl art_can_put_array_element_from_code
+    ALIGN_FUNCTION_ENTRY
+art_can_put_array_element_from_code:
+    test %eax, %eax               // Null is trivially storable
+    jz   1f
+    SETUP_REF_ONLY_CALLEE_SAVE_FRAME  // save ref containing registers for GC
+    mov %esp, %edx                // remember SP
+    // Outgoing argument set up
+    pushl %edx                    // pass SP
+    pushl %fs:THREAD_SELF_OFFSET  // pass Thread::Current()
+    pushl %ecx                    // pass arg2
+    pushl %eax                    // pass arg1
+    call artCanPutArrayElementFromCode  // (Object* element, Class* array_class, Thread*, SP)
+    addl LITERAL(16), %esp        // pop arguments
+    RESTORE_REF_ONLY_CALLEE_SAVE_FRAME  // restore frame up to return address
+    testl %eax, %eax               // eax == 0 ?
+    jnz 2f
+1:
+    ret
+2:
+    DELIVER_PENDING_EXCEPTION
+
 MACRO1(UNIMPLEMENTED,name)
     .globl VAR(name, 0)
     ALIGN_FUNCTION_ENTRY
@@ -226,18 +452,8 @@
 UNIMPLEMENTED art_proxy_invoke_handler
 UNIMPLEMENTED art_update_debugger
 UNIMPLEMENTED art_test_suspend
-UNIMPLEMENTED art_alloc_object_from_code
-UNIMPLEMENTED art_alloc_object_from_code_with_access_check
-UNIMPLEMENTED art_alloc_array_from_code
-UNIMPLEMENTED art_alloc_array_from_code_with_access_check
-UNIMPLEMENTED art_check_and_alloc_array_from_code
-UNIMPLEMENTED art_check_and_alloc_array_from_code_with_access_check
-UNIMPLEMENTED art_can_put_array_element_from_code
-UNIMPLEMENTED art_check_cast_from_code
-UNIMPLEMENTED art_initialize_static_storage_from_code
 UNIMPLEMENTED art_initialize_type_and_verify_access_from_code
 UNIMPLEMENTED art_initialize_type_from_code
-UNIMPLEMENTED art_resolve_string_from_code
 UNIMPLEMENTED art_set32_instance_from_code
 UNIMPLEMENTED art_set64_instance_from_code
 UNIMPLEMENTED art_set_obj_instance_from_code
@@ -250,9 +466,6 @@
 UNIMPLEMENTED art_get32_static_from_code
 UNIMPLEMENTED art_get64_static_from_code
 UNIMPLEMENTED art_get_obj_static_from_code
-UNIMPLEMENTED art_handle_fill_data_from_code
-UNIMPLEMENTED art_lock_object_from_code
-UNIMPLEMENTED art_unlock_object_from_code
 UNIMPLEMENTED art_indexof
-UNIMPLEMENTED __memcmp16
+UNIMPLEMENTED art_memcmp16
 UNIMPLEMENTED art_string_compareto
diff --git a/src/oat/runtime/x86/stub_x86.cc b/src/oat/runtime/x86/stub_x86.cc
index 14e4f23..1dea0a12 100644
--- a/src/oat/runtime/x86/stub_x86.cc
+++ b/src/oat/runtime/x86/stub_x86.cc
@@ -25,11 +25,40 @@
 namespace art {
 namespace x86 {
 
-ByteArray* X86CreateResolutionTrampoline(Runtime::TrampolineType) {
+ByteArray* X86CreateResolutionTrampoline(Runtime::TrampolineType type) {
   UniquePtr<X86Assembler> assembler(static_cast<X86Assembler*>(Assembler::Create(kX86)));
 
-  // TODO: unimplemented
-  __ int3();
+  // Set up the callee save frame to conform with Runtime::CreateCalleeSaveMethod(kRefsAndArgs)
+  // return address
+  __ pushl(EDI);
+  __ pushl(ESI);
+  __ pushl(EBP);
+  __ pushl(EBX);
+  __ pushl(EDX);
+  __ pushl(ECX);
+  __ pushl(EAX);  // <-- callee save Method* to go here
+  __ movl(ECX, ESP);          // save ESP
+  __ pushl(Immediate(type));  // pass is_static
+  __ fs()->pushl(Address::Absolute(Thread::SelfOffset()));  // Thread*
+  __ pushl(ECX);              // pass ESP for Method*
+  __ pushl(EAX);              // pass Method*
+
+  // Call to resolve method.
+  __ Call(ThreadOffset(ENTRYPOINT_OFFSET(pUnresolvedDirectMethodTrampolineFromCode)),
+          X86ManagedRegister::FromCpuRegister(ECX));
+
+  __ movl(EDI, EAX);  // save code pointer in EDI
+  __ addl(ESP, Immediate(16));  // Pop arguments
+  __ popl(EAX);  // Restore args.
+  __ popl(ECX);
+  __ popl(EDX);
+  __ popl(EBX);
+  __ popl(EBP);  // Restore callee saves.
+  __ popl(ESI);
+  // Swap EDI callee save with code pointer
+  __ xchgl(EDI, Address(ESP,0));
+  // Tail call to intended method.
+  __ ret();
 
   assembler->EmitSlowPaths();
   size_t cs = assembler->CodeSize();
@@ -46,6 +75,8 @@
 ByteArray* CreateAbstractMethodErrorStub() {
   UniquePtr<X86Assembler> assembler(static_cast<X86Assembler*>(Assembler::Create(kX86)));
 
+  // Set up the callee save frame to conform with Runtime::CreateCalleeSaveMethod(kSaveAll)
+
   // return address
   __ pushl(EDI);
   __ pushl(ESI);
@@ -66,6 +97,7 @@
 
 #if defined(ART_USE_LLVM_COMPILER)
   // Return to caller who will handle pending exception.
+  // TODO: The callee save set up is unnecessary for LLVM as it uses shadow stacks.
   __ addl(ESP, Immediate(32));
   __ popl(EBP);
   __ popl(ESI);
diff --git a/src/oat/utils/x86/assembler_x86.cc b/src/oat/utils/x86/assembler_x86.cc
index 0862551..569ec09 100644
--- a/src/oat/utils/x86/assembler_x86.cc
+++ b/src/oat/utils/x86/assembler_x86.cc
@@ -758,6 +758,12 @@
   EmitRegisterOperand(dst, src);
 }
 
+void X86Assembler::xchgl(Register reg, const Address& address) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x87);
+  EmitOperand(reg, address);
+}
+
 
 void X86Assembler::cmpl(Register reg, const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
@@ -1172,6 +1178,11 @@
   EmitRegisterOperand(4, reg);
 }
 
+void X86Assembler::jmp(const Address& address) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xFF);
+  EmitOperand(4, address);
+}
 
 void X86Assembler::jmp(Label* label) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
diff --git a/src/oat/utils/x86/assembler_x86.h b/src/oat/utils/x86/assembler_x86.h
index 886e173..58acab2 100644
--- a/src/oat/utils/x86/assembler_x86.h
+++ b/src/oat/utils/x86/assembler_x86.h
@@ -344,6 +344,7 @@
   void fptan();
 
   void xchgl(Register dst, Register src);
+  void xchgl(Register reg, const Address& address);
 
   void cmpl(Register reg, const Immediate& imm);
   void cmpl(Register reg0, Register reg1);
@@ -426,6 +427,7 @@
   void j(Condition condition, Label* label);
 
   void jmp(Register reg);
+  void jmp(const Address& address);
   void jmp(Label* label);
 
   X86Assembler* lock();
diff --git a/src/runtime.cc b/src/runtime.cc
index f434fed..c4a9bd7 100644
--- a/src/runtime.cc
+++ b/src/runtime.cc
@@ -964,8 +964,14 @@
     method->SetCoreSpillMask(core_spills);
     method->SetFpSpillMask(fp_spills);
   } else if (instruction_set == kX86) {
-    method->SetFrameSizeInBytes(32);
-    method->SetCoreSpillMask((1 << art::x86::EBP) | (1 << art::x86::ESI) | (1 << art::x86::EDI));
+    uint32_t ref_spills = (1 << art::x86::EBP) | (1 << art::x86::ESI) | (1 << art::x86::EDI);
+    uint32_t arg_spills = (1 << art::x86::ECX) | (1 << art::x86::EDX) | (1 << art::x86::EBX);
+    uint32_t core_spills = ref_spills | (type == kRefsAndArgs ? arg_spills : 0) |
+                         (1 << art::x86::kNumberOfCpuRegisters);  // fake return address callee save
+    size_t frame_size = RoundUp((__builtin_popcount(core_spills) /* gprs */ +
+                                 1 /* Method* */) * kPointerSize, kStackAlignment);
+    method->SetFrameSizeInBytes(frame_size);
+    method->SetCoreSpillMask(core_spills);
     method->SetFpSpillMask(0);
   } else {
     UNIMPLEMENTED(FATAL);