Experimental x86 Jit trace selection

Experimental support for trace selection for x86 host mode operation.
Not enabled by default.  Turned on by setting WITH_HOST_DALVIK true
and WITH_JIT true.  When enabled, profiles during x86 fast interpreter
operation, selects hot traces and "compiles" traces consisting of jumps
back to the interpreter.

First in a series of experimental x86 support checkins.

Change-Id: I0e423ec58a7bf01f226cb486f55de2841fab1002
diff --git a/vm/compiler/Compiler.c b/vm/compiler/Compiler.c
index 8dd8adc..fe42f4c 100644
--- a/vm/compiler/Compiler.c
+++ b/vm/compiler/Compiler.c
@@ -627,6 +627,11 @@
                         compileOK = dvmCompilerDoWork(&work);
                     }
                     if (aborted || !compileOK) {
+#if 0 // for x86 JIT testing
+                        dvmJitSetCodeAddr(work.pc,
+                                          dvmCompilerGetInterpretTemplate(),
+                                          work.result.instructionSet);
+#endif
                         dvmCompilerArenaReset();
                     } else if (!work.result.discardResult &&
                                work.result.codeAddress) {
diff --git a/vm/compiler/codegen/x86/Assemble.c b/vm/compiler/codegen/x86/Assemble.c
index fbf53ca..31264ce 100644
--- a/vm/compiler/codegen/x86/Assemble.c
+++ b/vm/compiler/codegen/x86/Assemble.c
@@ -34,8 +34,6 @@
 #endif
 
 /*
- * FIXME - redo for x86
- *
  * Translation layout in the code cache.  Note that the codeAddress pointer
  * in JitTable will point directly to the code body (field codeAddress).  The
  * chain cell offset codeAddress - 2, and (if present) executionCount is at
@@ -52,7 +50,7 @@
  *   |  .                            .
  *   |  |                            |
  *   |  +----------------------------+
- *   |  | Chaining Cells             |  -> 12/16 bytes each, must be 4 byte aligned
+ *   |  | Chaining Cells             |  -> 16 bytes each, 8 byte aligned
  *   |  .                            .
  *   |  .                            .
  *   |  |                            |
@@ -66,8 +64,8 @@
  *      |                            |
  *      +----------------------------+
  *      | Literal pool               |  -> 4-byte aligned, variable size
- *      .                            .
- *      .                            .
+ *      .                            .     Note: for x86 literals will
+ *      .                            .     generally appear inline.
  *      |                            |
  *      +----------------------------+
  *
diff --git a/vm/compiler/codegen/x86/CodegenDriver.c b/vm/compiler/codegen/x86/CodegenDriver.c
index 69f637e..4a5d481 100644
--- a/vm/compiler/codegen/x86/CodegenDriver.c
+++ b/vm/compiler/codegen/x86/CodegenDriver.c
@@ -24,10 +24,63 @@
  * applicable directory below this one.
  */
 
+extern X86LIR *loadConstant(CompilationUnit *cUnit, int rDest, int value);
+extern X86LIR *loadWordDisp(CompilationUnit *cUnit, int rBase,
+                            int displacement, int rDest);
+extern void dvmCompilerFlushAllRegs(CompilationUnit *cUnit);
+extern void storeWordDisp(CompilationUnit *cUnit, int rBase,
+                          int displacement, int rSrc);
+extern X86LIR *opReg(CompilationUnit *cUnit, OpKind op, int rDestSrc);
+
 static int opcodeCoverage[kNumPackedOpcodes];
 static intptr_t templateEntryOffsets[TEMPLATE_LAST_MARK];
 
 /*
+ * Bail to the interpreter.  Will not return to this trace.
+ * On entry, rPC must be set correctly.
+ */
+static void genPuntToInterp(CompilationUnit *cUnit, unsigned int offset)
+{
+    dvmCompilerFlushAllRegs(cUnit);  // Flush Dalvik regs before handing off
+    loadConstant(cUnit, rPC, (int)(cUnit->method->insns + offset));
+    loadWordDisp(cUnit, rEBP, 0, rECX);  // rECX <- glue, read from 0(rEBP)
+    loadWordDisp(cUnit, rECX,
+                 offsetof(InterpState, jitToInterpEntries.dvmJitToInterpPunt),
+                 rEAX);  // rEAX <- punt entry address stored in the glue
+    opReg(cUnit, kOpUncondBr, rEAX);  // Branch (not call) through rEAX
+}
+
+static void genInterpSingleStep(CompilationUnit *cUnit, MIR *mir)
+{
+    int flags = dexGetFlagsFromOpcode(mir->dalvikInsn.opcode);
+    int flagsToCheck = kInstrCanBranch | kInstrCanSwitch | kInstrCanReturn |
+                       kInstrCanThrow;
+
+    // A NOP (e.g. an instruction already optimized out) emits no code
+    if (mir->dalvikInsn.opcode == OP_NOP)
+        return;
+
+    // Ugly, but necessary.  Flush all Dalvik regs so Interp can find them
+    dvmCompilerFlushAllRegs(cUnit);
+
+    if ((mir->next == NULL) || (flags & flagsToCheck)) {
+       genPuntToInterp(cUnit, mir->offset);  // No next MIR, or may leave trace
+       return;
+    }
+    int entryAddr = offsetof(InterpState,
+                             jitToInterpEntries.dvmJitToInterpSingleStep);
+    loadWordDisp(cUnit, rEBP, 0, rECX);  // rECX <- glue, read from 0(rEBP)
+    loadWordDisp(cUnit, rECX, entryAddr, rEAX); // rEAX <- single-step entry address
+    /* rPC = Dalvik pc of the instruction being single-stepped */
+    loadConstant(cUnit, rPC, (int) (cUnit->method->insns + mir->offset));
+    /* rECX = Dalvik pc of the following instruction (glue no longer needed) */
+    loadConstant(cUnit, rECX, (int) (cUnit->method->insns + mir->next->offset));
+    /* Pass the next Dalvik pc in the OUT_ARG0 slot of the native stack */
+    storeWordDisp(cUnit, rESP, OUT_ARG0, rECX);
+    opReg(cUnit, kOpCall, rEAX);  // Call, not branch, through rEAX
+}
+
+/*
  * The following are the first-level codegen routines that analyze the format
  * of each bytecode then either dispatch special purpose codegen routines
  * or produce corresponding Thumb instructions directly.
diff --git a/vm/compiler/codegen/x86/X86LIR.h b/vm/compiler/codegen/x86/X86LIR.h
index 62ac447..8acf015 100644
--- a/vm/compiler/codegen/x86/X86LIR.h
+++ b/vm/compiler/codegen/x86/X86LIR.h
@@ -27,7 +27,7 @@
  *     esp is native SP
  *
  * For interpreter:
- *     edx is Dalvik PC (rPC)
+ *     edi is Dalvik PC (rPC)
  *     ebx is rINST
  *
  * For JIT:
@@ -82,8 +82,8 @@
     int nextFPTemp;
     int numCoreRegs;
     RegisterInfo *coreRegs;
-    int numFPRegs;
-    RegisterInfo *FPRegs;
+    int numMMRegs;
+    RegisterInfo *MMRegs;
 } RegisterPool;
 
 typedef enum OpSize {
@@ -99,7 +99,6 @@
 
 typedef enum OpKind {
     kOpMov,
-    kOpMvn,
     kOpCmp,
     kOpLsl,
     kOpLsr,
@@ -114,15 +113,11 @@
     kOpAdc,
     kOpSub,
     kOpSbc,
-    kOpRsub,
     kOpMul,
     kOpDiv,
     kOpRem,
-    kOpBic,
-    kOpCmn,
     kOpTst,
-    kOpBkpt,
-    kOpBlx,
+    kOpCall,
     kOpPush,
     kOpPop,
     kOp2Char,
@@ -132,6 +127,37 @@
     kOpUncondBr,
 } OpKind;
 
+#define FP_REG_OFFSET 8
+
+typedef enum NativeRegisterPool {
+    rEAX = 0,
+    rECX = 1,
+    rEDX = 2,
+    rEBX = 3,
+    rESP = 4,
+    rEBP = 5,
+    rESI = 6,
+    rEDI = 7,
+    rXMM0 = 0 + FP_REG_OFFSET,
+    rXMM1 = 1 + FP_REG_OFFSET,
+    rXMM2 = 2 + FP_REG_OFFSET,
+    rXMM3 = 3 + FP_REG_OFFSET,
+    rXMM4 = 4 + FP_REG_OFFSET,
+    rXMM5 = 5 + FP_REG_OFFSET,
+    rXMM6 = 6 + FP_REG_OFFSET,
+    rXMM7 = 7 + FP_REG_OFFSET,
+} NativeRegisterPool;
+
+#define rPC rEDI
+#define rFP rESI
+#define rINST rEBX
+
+#define OUT_ARG0 0
+#define OUT_ARG1 4
+#define OUT_ARG2 8
+#define OUT_ARG3 12
+#define OUT_ARG4 16
+
 typedef struct X86LIR {
     LIR generic;
     //X86Opcode opcode;
diff --git a/vm/compiler/template/ia32/TEMPLATE_INTERPRET.S b/vm/compiler/template/ia32/TEMPLATE_INTERPRET.S
index 4c98917..68b2d0d 100644
--- a/vm/compiler/template/ia32/TEMPLATE_INTERPRET.S
+++ b/vm/compiler/template/ia32/TEMPLATE_INTERPRET.S
@@ -1,27 +1,30 @@
     /*
-     * TODO: figure out how best to do this on x86, as we don't have
-     * an lr equivalent and probably don't want to push.
+     * This handler is a bit odd - it may be called via chaining or
+     * from static code and is expected to cause control to flow
+     * to the interpreter.  The problem is where to find the Dalvik
+     * PC of the next instruction.  When called via chaining, the dPC
+     * will be located at *rp.  When called from static code, rPC is
+     * valid and rp is a real return pointer (that should be ignored).
+     * The Arm target deals with this by using the link register as
+     * a flag.  If it is zero, we know we were called from static code.
+     * If non-zero, it points to the chain cell containing dPC.
+     * For x86, we'll infer the source by looking where rp points.
+     * If it points to anywhere within the code cache, we'll assume
+     * we got here via chaining.  Otherwise, we'll assume rPC is valid.
      *
-     * This handler transfers control to the interpeter without performing
-     * any lookups.  It may be called either as part of a normal chaining
-     * operation, or from the transition code in header.S.  We distinquish
-     * the two cases by looking at the link register.  If called from a
-     * translation chain, it will point to the chaining Dalvik PC -3.
      * On entry:
-     *    lr - if NULL:
-     *        r1 - the Dalvik PC to begin interpretation.
-     *    else
-     *        [lr, #3] contains Dalvik PC to begin interpretation
-     *    rGLUE - pointer to interpState
-     *    rFP - Dalvik frame pointer
-     *
-     *cmp     lr, #0
-     *ldrne   r1,[lr, #3]
-     *ldr     r2, .LinterpPunt
-     *mov     r0, r1                       @ set Dalvik PC
-     *bx      r2
-     *@ doesn't return
+     *    (TOS)<- return pointer or pointer to dPC
      */
+     movl   rGLUE,%ecx
+     movl   $$.LinterpPunt,%edx
+     pop    %eax
+     cmpl   %eax,offGlue_jitCacheEnd(%ecx)
+     ja     1f
+     cmpl   %eax,offGlue_jitCacheStart(%ecx)
+     jb     1f
+     movl   %eax,rPC
+1:
+     jmp    *(%edx)
 
 .LinterpPunt:
     .long   dvmJitToInterpPunt
diff --git a/vm/compiler/template/ia32/footer.S b/vm/compiler/template/ia32/footer.S
index 1b1a1ae..d11af69 100644
--- a/vm/compiler/template/ia32/footer.S
+++ b/vm/compiler/template/ia32/footer.S
@@ -7,12 +7,12 @@
     .text
     .align  4
 /*
- * FIXME - need a cacheflush for x86
+ * FIXME - verify that we don't need an explicit cache flush
+ * for x86.
  */
     .global cacheflush
 cacheflush:
-    movl  $$0xdeadf0f0, %eax
-    call *%eax
+    ret
 
 
     .global dmvCompilerTemplateEnd
diff --git a/vm/compiler/template/ia32/header.S b/vm/compiler/template/ia32/header.S
index 57f5a5b..a67ba6e 100644
--- a/vm/compiler/template/ia32/header.S
+++ b/vm/compiler/template/ia32/header.S
@@ -16,6 +16,12 @@
 
 #if defined(WITH_JIT)
 
+/* Subset of defines from mterp/x86/header.S */
+#define rGLUE (%ebp)
+#define rPC   %esi
+#define rFP   %edi
+#define rINST %ebx
+
 /*
  * This is a #include, not a %include, because we want the C pre-processor
  * to expand the macros into assembler assignment statements.
diff --git a/vm/compiler/template/out/CompilerTemplateAsm-ia32.S b/vm/compiler/template/out/CompilerTemplateAsm-ia32.S
index 7726e97..1256ee4 100644
--- a/vm/compiler/template/out/CompilerTemplateAsm-ia32.S
+++ b/vm/compiler/template/out/CompilerTemplateAsm-ia32.S
@@ -23,6 +23,12 @@
 
 #if defined(WITH_JIT)
 
+/* Subset of defines from mterp/x86/header.S */
+#define rGLUE (%ebp)
+#define rPC   %esi
+#define rFP   %edi
+#define rINST %ebx
+
 /*
  * This is a #include, not a %include, because we want the C pre-processor
  * to expand the macros into assembler assignment statements.
@@ -51,29 +57,32 @@
 dvmCompiler_TEMPLATE_INTERPRET:
 /* File: ia32/TEMPLATE_INTERPRET.S */
     /*
-     * TODO: figure out how best to do this on x86, as we don't have
-     * an lr equivalent and probably don't want to push.
+     * This handler is a bit odd - it may be called via chaining or
+     * from static code and is expected to cause control to flow
+     * to the interpreter.  The problem is where to find the Dalvik
+     * PC of the next instruction.  When called via chaining, the dPC
+     * will be located at *rp.  When called from static code, rPC is
+     * valid and rp is a real return pointer (that should be ignored).
+     * The Arm target deals with this by using the link register as
+     * a flag.  If it is zero, we know we were called from static code.
+     * If non-zero, it points to the chain cell containing dPC.
+     * For x86, we'll infer the source by looking where rp points.
+     * If it points to anywhere within the code cache, we'll assume
+     * we got here via chaining.  Otherwise, we'll assume rPC is valid.
      *
-     * This handler transfers control to the interpeter without performing
-     * any lookups.  It may be called either as part of a normal chaining
-     * operation, or from the transition code in header.S.  We distinquish
-     * the two cases by looking at the link register.  If called from a
-     * translation chain, it will point to the chaining Dalvik PC -3.
      * On entry:
-     *    lr - if NULL:
-     *        r1 - the Dalvik PC to begin interpretation.
-     *    else
-     *        [lr, #3] contains Dalvik PC to begin interpretation
-     *    rGLUE - pointer to interpState
-     *    rFP - Dalvik frame pointer
-     *
-     *cmp     lr, #0
-     *ldrne   r1,[lr, #3]
-     *ldr     r2, .LinterpPunt
-     *mov     r0, r1                       @ set Dalvik PC
-     *bx      r2
-     *@ doesn't return
+     *    (TOS)<- return pointer or pointer to dPC
      */
+     movl   rGLUE,%ecx
+     movl   $.LinterpPunt,%edx
+     pop    %eax
+     cmpl   %eax,offGlue_jitCacheEnd(%ecx)
+     ja     1f
+     cmpl   %eax,offGlue_jitCacheStart(%ecx)
+     jb     1f
+     movl   %eax,rPC
+1:
+     jmp    *(%edx)
 
 .LinterpPunt:
     .long   dvmJitToInterpPunt
@@ -89,12 +98,12 @@
     .text
     .align  4
 /*
- * FIXME - need a cacheflush for x86
+ * FIXME - verify that we don't need an explicit cache flush
+ * for x86.
  */
     .global cacheflush
 cacheflush:
-    movl  $0xdeadf0f0, %eax
-    call *%eax
+    ret
 
 
     .global dmvCompilerTemplateEnd
diff --git a/vm/interp/InterpDefs.h b/vm/interp/InterpDefs.h
index 505df52..3c0d2e3 100644
--- a/vm/interp/InterpDefs.h
+++ b/vm/interp/InterpDefs.h
@@ -39,7 +39,7 @@
 
 #if defined(WITH_JIT)
 /*
- * There are six entry points from the compiled code to the interpreter:
+ * There are seven entry points from the compiled code to the interpreter:
  * 1) dvmJitToInterpNormal: find if there is a corresponding compilation for
  *    the new dalvik PC. If so, chain the originating compilation with the
  *    target then jump to it.
@@ -161,6 +161,8 @@
      */
     unsigned char**    ppJitProfTable; // Used to refresh pJitProfTable
     int                icRechainCount; // Count down to next rechain request
+    const void*        jitCacheStart;  // Code cache boundaries
+    const void*        jitCacheEnd;
 #endif
 
     bool        debugIsMethodEntry;     // used for method entry event triggers
diff --git a/vm/mterp/Mterp.c b/vm/mterp/Mterp.c
index f4740fe..0cd5a1f 100644
--- a/vm/mterp/Mterp.c
+++ b/vm/mterp/Mterp.c
@@ -81,6 +81,8 @@
     glue->pJitProfTable = gDvmJit.pProfTable;
     glue->ppJitProfTable = &gDvmJit.pProfTable;
     glue->jitThreshold = gDvmJit.threshold;
+    glue->jitCacheStart = gDvmJit.codeCache;
+    glue->jitCacheEnd = (char*)gDvmJit.codeCache + gDvmJit.codeCacheSize;
 #endif
 #if defined(WITH_INLINE_PROFILING)
     /*
diff --git a/vm/mterp/common/asm-constants.h b/vm/mterp/common/asm-constants.h
index aeed88b..e5a8a04 100644
--- a/vm/mterp/common/asm-constants.h
+++ b/vm/mterp/common/asm-constants.h
@@ -107,6 +107,8 @@
 MTERP_OFFSET(offGlue_jitThreshold,      MterpGlue, jitThreshold, 76)
 MTERP_OFFSET(offGlue_ppJitProfTable,    MterpGlue, ppJitProfTable, 80)
 MTERP_OFFSET(offGlue_icRechainCount,    MterpGlue, icRechainCount, 84)
+MTERP_OFFSET(offGlue_jitCacheStart,     MterpGlue, jitCacheStart, 88)
+MTERP_OFFSET(offGlue_jitCacheEnd,       MterpGlue, jitCacheEnd, 92)
 #endif
 /* make sure all JValue union members are stored at the same offset */
 MTERP_OFFSET(offGlue_retval_z,          MterpGlue, retval.z, 8)
diff --git a/vm/mterp/out/InterpAsm-x86-atom.S b/vm/mterp/out/InterpAsm-x86-atom.S
index 136e2f2..d4dd705 100644
--- a/vm/mterp/out/InterpAsm-x86-atom.S
+++ b/vm/mterp/out/InterpAsm-x86-atom.S
@@ -477,6 +477,7 @@
 .LintMax:
 .long   0x7FFFFFFF
 
+
     .global dvmAsmInstructionStart
     .type   dvmAsmInstructionStart, %function
 dvmAsmInstructionStart = .L_OP_NOP
diff --git a/vm/mterp/out/InterpAsm-x86.S b/vm/mterp/out/InterpAsm-x86.S
index b2bcd08..9564d43 100644
--- a/vm/mterp/out/InterpAsm-x86.S
+++ b/vm/mterp/out/InterpAsm-x86.S
@@ -55,7 +55,7 @@
 Mterp notes:
 
 Some key interpreter variables will be assigned to registers.  Note that each
-will also have an associated spill location (mostly used useful for those assigned
+will also have an associated spill location (mostly useful for those assigned
 to callee save registers).
 
   nick     reg   purpose
@@ -69,8 +69,6 @@
    o High order 16 bits of ebx must be zero on entry to handler
    o rPC, rFP, rINSTw/rINSTbl valid on handler entry and exit
    o eax, edx and ecx are scratch, rINSTw/ebx sometimes scratch
-   o rPC is in the caller save set, and will be killed across external calls. Don't
-     forget to SPILL/UNSPILL it around call points
 
 */
 
@@ -119,6 +117,15 @@
 #define SPILL_TMP3(reg) movl reg,TMP_SPILL3(%ebp)
 #define UNSPILL_TMP3(reg) movl TMP_SPILL3(%ebp),reg
 
+#if defined(WITH_JIT)
+.macro GET_JIT_PROF_TABLE _glue _reg
+    movl    offGlue_pJitProfTable(\_glue),\_reg
+.endm
+.macro GET_JIT_THRESHOLD _glue _reg
+    movl    offGlue_jitThreshold(\_glue),\_reg
+.endm
+#endif
+
 /* save/restore the PC and/or FP from the glue struct */
 .macro SAVE_PC_FP_TO_GLUE _reg
     movl     rGLUE,\_reg
@@ -593,6 +600,10 @@
  */
 #include "../common/asm-constants.h"
 
+#if defined(WITH_JIT)
+#include "../common/jit-config.h"
+#endif
+
 
     .global dvmAsmInstructionStart
     .type   dvmAsmInstructionStart, %function
@@ -9129,24 +9140,134 @@
 
 #if defined(WITH_JIT)
 /*
- * Placeholder entries for x86 JIT
+ * JIT-related re-entries into the interpreter.  In general, if the
+ * exit from a translation can at some point be chained, the entry
+ * here requires that control arrived via a call, and that the "rp"
+ * on TOS is actually a pointer to a 32-bit cell containing the Dalvik PC
+ * of the next insn to handle.  If no chaining will happen, the entry
+ * should be reached via a direct jump and rPC set beforehand.
  */
+
     .global dvmJitToInterpPunt
+/*
+ * The compiler will generate a jump to this entry point when it is
+ * having difficulty translating a Dalvik instruction.  We must skip
+ * the code cache lookup & prevent chaining to avoid bouncing between
+ * the interpreter and code cache. rPC must be set on entry.
+ */
 dvmJitToInterpPunt:
+#if defined(WITH_JIT_TUNING)
+    movl   rPC, OUT_ARG0(%esp)
+    call   dvmBumpPunt
+#endif
+    FETCH_INST_R %edx
+    GOTO_NEXT_R %edx
+
     .global dvmJitToInterpSingleStep
+/*
+ * Return to the interpreter to handle a single instruction.
+ * Should be reached via a call.
+ * On entry:
+ *   0(%esp)          <= native return address within trace
+ *   rPC              <= Dalvik PC of this instruction
+ *   OUT_ARG0+4(%esp) <= Dalvik PC of next instruction
+ */
 dvmJitToInterpSingleStep:
+    pop    %eax
+    movl   rGLUE, %ecx
+    movl   OUT_ARG0(%esp), %edx
+    movl   %eax,offGlue_jitResumeNPC(%ecx)
+    movl   %edx,offGlue_jitResumeDPC(%ecx)
+    movl   $kInterpEntryInstr,offGlue_entryPoint(%ecx)
+    movl   $1,rINST     # changeInterp <= true
+    jmp    common_gotoBail
+
     .global dvmJitToInterpNoChainNoProfile
+/*
+ * Return from the translation cache to the interpreter to do method
+ * invocation.  Check if the translation exists for the callee, but don't
+ * chain to it. rPC must be set on entry.
+ */
 dvmJitToInterpNoChainNoProfile:
+#if defined(WITH_JIT_TUNING)
+    call   dvmBumpNoChain
+#endif
+    movl   rPC,OUT_ARG0(%esp)
+    call   dvmJitGetCodeAddr         # is there a translation?
+    movl   rGLUE,%ecx
+    movl   offGlue_self(%ecx), %ecx  # ecx <- glue->self
+    movl   %eax,offThread_inJitCodeCache(%ecx)  # set inJitCodeCache flag
+    cmpl   $0, %eax
+    jz     1f
+    call   *%eax                     # exec translation if we've got one
+    # won't return
+1:
+    FETCH_INST_R %edx
+    GOTO_NEXT_R %edx
+
+/*
+ * Return from the translation cache and immediately request a
+ * translation for the exit target, but don't attempt to chain.
+ * rPC set on entry.
+ */
     .global dvmJitToInterpTraceSelectNoChain
 dvmJitToInterpTraceSelectNoChain:
+#if defined(WITH_JIT_TUNING)
+    call   dvmBumpNoChain
+#endif
+    movl   rPC,OUT_ARG0(%esp)
+    call   dvmJitGetCodeAddr  # is there a translation?
+    movl   rGLUE,%ecx
+    movl   offGlue_self(%ecx),%ecx
+    cmpl   $0,%eax
+    movl   %eax,offThread_inJitCodeCache(%ecx)  # set inJitCodeCache flag
+    jz     1f
+    call   *%eax              # jump to translation
+    # won't return
+
+/* No Translation - request one */
+1:
+    GET_JIT_PROF_TABLE %ecx %eax
+    cmpl   $0, %eax          # JIT enabled?
+    jnz    2f                 # Request one if so
+    FETCH_INST_R %edx         # Continue interpreting if not
+    GOTO_NEXT_R %edx
+2:
+    movl   $kJitTSelectRequestHot,rINST  # ask for trace select
+    jmp    common_selectTrace
+
+/*
+ * Return from the translation cache and immediately request a
+ * translation for the exit target.  Reached via a call, and
+ * (TOS)->rPC.
+ */
     .global dvmJitToInterpTraceSelect
 dvmJitToInterpTraceSelect:
+    pop    rINST           # save chain cell address in callee save reg
+    movl   (rINST),rPC
+    movl   rPC,OUT_ARG0(%esp)
+    call   dvmJitGetCodeAddr  # is there a translation?
+    cmpl   $0,%eax
+    jz     1b                 # no - ask for one
+    movl   %eax,OUT_ARG0(%esp)
+# FIXME - need to adjust rINST to beginning of sequence
+    movl   rINST,OUT_ARG1(%esp)
+    call   dvmJitChain        # Attempt dvmJitChain(codeAddr,chainAddr)
+    cmpl   $0,%eax           # Success?
+    jz     toInterpreter      # didn't chain - interpret
+    call   *%eax
+    # won't return
+
+/*
+ * Placeholder entries for x86 JIT
+ */
     .global dvmJitToInterpBackwardBranch
 dvmJitToInterpBackwardBranch:
     .global dvmJitToInterpNormal
 dvmJitToInterpNormal:
     .global dvmJitToInterpNoChain
 dvmJitToInterpNoChain:
+toInterpreter:
     jmp  common_abort
 #endif
 
@@ -9158,10 +9279,56 @@
  */
 common_backwardBranch:
     movl    rGLUE,%ecx
-    call   common_periodicChecks  # Note: expects rPC to be preserved
+    call   common_periodicChecks  # rPC and ecx/rGLUE preserved
+#if defined(WITH_JIT)
+    GET_JIT_PROF_TABLE %ecx %edx
+    ADVANCE_PC_INDEXED rINST
+    cmpl   $0,%edx
+    FETCH_INST
+    jz    1f                    # Profiling off - continue
+    .global updateProfile
+updateProfile:
+common_updateProfile:
+    # quick & dirty hash
+    movl   rPC, %eax
+    shrl   $12, %eax
+    xorl   rPC, %eax
+    andl   $((1<<JIT_PROF_SIZE_LOG_2)-1),%eax
+    decb   (%edx,%eax)
+    jz     2f
+1:
+    GOTO_NEXT
+2:
+/*
+ * Here, we switch to the debug interpreter to request
+ * trace selection.  First, though, check to see if there
+ * is already a native translation in place (and, if so,
+ * jump to it now).
+ */
+    GET_JIT_THRESHOLD %ecx rINST
+    EXPORT_PC
+    movb   rINSTbl,(%edx,%eax)   # reset counter
+    movl   offGlue_self(%ecx),rINST
+    movl   rPC,OUT_ARG0(%esp)
+    call   dvmJitGetCodeAddr   # already have one?
+    movl   %eax,offThread_inJitCodeCache(rINST)   # set the inJitCodeCache flag
+    cmpl   $0,%eax
+    jz     1f
+    call   *%eax        # FIXME: decide call vs/ jmp!.  No return either way
+1:
+    movl   $kJitTSelectRequest,%eax
+    # On entry, eax<- jitState, rPC valid
+common_selectTrace:
+    movl   rGLUE,%ecx
+    movl   %eax,offGlue_jitState(%ecx)
+    movl   $kInterpEntryInstr,offGlue_entryPoint(%ecx)
+    movl   $1,rINST
+    jmp    common_gotoBail
+#else
     ADVANCE_PC_INDEXED rINST
     FETCH_INST
     GOTO_NEXT
+#endif
 
 
 
diff --git a/vm/mterp/x86/footer.S b/vm/mterp/x86/footer.S
index 6e2c5bd..989585a 100644
--- a/vm/mterp/x86/footer.S
+++ b/vm/mterp/x86/footer.S
@@ -19,24 +19,134 @@
 
 #if defined(WITH_JIT)
 /*
- * Placeholder entries for x86 JIT
+ * JIT-related re-entries into the interpreter.  In general, if the
+ * exit from a translation can at some point be chained, the entry
+ * here requires that control arrived via a call, and that the "rp"
+ * on TOS is actually a pointer to a 32-bit cell containing the Dalvik PC
+ * of the next insn to handle.  If no chaining will happen, the entry
+ * should be reached via a direct jump and rPC set beforehand.
  */
+
     .global dvmJitToInterpPunt
+/*
+ * The compiler will generate a jump to this entry point when it is
+ * having difficulty translating a Dalvik instruction.  We must skip
+ * the code cache lookup & prevent chaining to avoid bouncing between
+ * the interpreter and code cache. rPC must be set on entry.
+ */
 dvmJitToInterpPunt:
+#if defined(WITH_JIT_TUNING)
+    movl   rPC, OUT_ARG0(%esp)
+    call   dvmBumpPunt
+#endif
+    FETCH_INST_R %edx
+    GOTO_NEXT_R %edx
+
     .global dvmJitToInterpSingleStep
+/*
+ * Return to the interpreter to handle a single instruction.
+ * Should be reached via a call.
+ * On entry:
+ *   0(%esp)          <= native return address within trace
+ *   rPC              <= Dalvik PC of this instruction
+ *   OUT_ARG0+4(%esp) <= Dalvik PC of next instruction
+ */
 dvmJitToInterpSingleStep:
+    pop    %eax
+    movl   rGLUE, %ecx
+    movl   OUT_ARG0(%esp), %edx
+    movl   %eax,offGlue_jitResumeNPC(%ecx)
+    movl   %edx,offGlue_jitResumeDPC(%ecx)
+    movl   $$kInterpEntryInstr,offGlue_entryPoint(%ecx)
+    movl   $$1,rINST     # changeInterp <= true
+    jmp    common_gotoBail
+
     .global dvmJitToInterpNoChainNoProfile
+/*
+ * Return from the translation cache to the interpreter to do method
+ * invocation.  Check if the translation exists for the callee, but don't
+ * chain to it. rPC must be set on entry.
+ */
 dvmJitToInterpNoChainNoProfile:
+#if defined(WITH_JIT_TUNING)
+    call   dvmBumpNoChain
+#endif
+    movl   rPC,OUT_ARG0(%esp)
+    call   dvmJitGetCodeAddr         # is there a translation?
+    movl   rGLUE,%ecx
+    movl   offGlue_self(%ecx), %ecx  # ecx <- glue->self
+    movl   %eax,offThread_inJitCodeCache(%ecx)  # set inJitCodeCache flag
+    cmpl   $$0, %eax
+    jz     1f
+    call   *%eax                     # exec translation if we've got one
+    # won't return
+1:
+    FETCH_INST_R %edx
+    GOTO_NEXT_R %edx
+
+/*
+ * Return from the translation cache and immediately request a
+ * translation for the exit target, but don't attempt to chain.
+ * rPC set on entry.
+ */
     .global dvmJitToInterpTraceSelectNoChain
 dvmJitToInterpTraceSelectNoChain:
+#if defined(WITH_JIT_TUNING)
+    call   dvmBumpNoChain
+#endif
+    movl   rPC,OUT_ARG0(%esp)
+    call   dvmJitGetCodeAddr  # is there a translation?
+    movl   rGLUE,%ecx
+    movl   offGlue_self(%ecx),%ecx
+    cmpl   $$0,%eax
+    movl   %eax,offThread_inJitCodeCache(%ecx)  # set inJitCodeCache flag
+    jz     1f
+    call   *%eax              # jump to translation
+    # won't return
+
+/* No Translation - request one */
+1:
+    GET_JIT_PROF_TABLE %ecx %eax
+    cmpl   $$0, %eax          # JIT enabled?
+    jnz    2f                 # Request one if so
+    FETCH_INST_R %edx         # Continue interpreting if not
+    GOTO_NEXT_R %edx
+2:
+    movl   $$kJitTSelectRequestHot,rINST  # ask for trace select
+    jmp    common_selectTrace
+
+/*
+ * Return from the translation cache and immediately request a
+ * translation for the exit target.  Reached via a call, and
+ * (TOS)->rPC.
+ */
     .global dvmJitToInterpTraceSelect
 dvmJitToInterpTraceSelect:
+    pop    rINST           # save chain cell address in callee save reg
+    movl   (rINST),rPC
+    movl   rPC,OUT_ARG0(%esp)
+    call   dvmJitGetCodeAddr  # is there a translation?
+    cmpl   $$0,%eax
+    jz     1b                 # no - ask for one
+    movl   %eax,OUT_ARG0(%esp)
+# FIXME - need to adjust rINST to beginning of sequence
+    movl   rINST,OUT_ARG1(%esp)
+    call   dvmJitChain        # Attempt dvmJitChain(codeAddr,chainAddr)
+    cmpl   $$0,%eax           # Success?
+    jz     toInterpreter      # didn't chain - interpret
+    call   *%eax
+    # won't return
+
+/*
+ * Placeholder entries for x86 JIT
+ */
     .global dvmJitToInterpBackwardBranch
 dvmJitToInterpBackwardBranch:
     .global dvmJitToInterpNormal
 dvmJitToInterpNormal:
     .global dvmJitToInterpNoChain
 dvmJitToInterpNoChain:
+toInterpreter:
     jmp  common_abort
 #endif
 
@@ -48,10 +158,56 @@
  */
 common_backwardBranch:
     movl    rGLUE,%ecx
-    call   common_periodicChecks  # Note: expects rPC to be preserved
+    call   common_periodicChecks  # rPC and ecx/rGLUE preserved
+#if defined(WITH_JIT)
+    GET_JIT_PROF_TABLE %ecx %edx
+    ADVANCE_PC_INDEXED rINST
+    cmpl   $$0,%edx
+    FETCH_INST
+    jz    1f                    # Profiling off - continue
+    .global updateProfile
+updateProfile:
+common_updateProfile:
+    # quick & dirty hash
+    movl   rPC, %eax
+    shrl   $$12, %eax
+    xorl   rPC, %eax
+    andl   $$((1<<JIT_PROF_SIZE_LOG_2)-1),%eax
+    decb   (%edx,%eax)
+    jz     2f
+1:
+    GOTO_NEXT
+2:
+/*
+ * Here, we switch to the debug interpreter to request
+ * trace selection.  First, though, check to see if there
+ * is already a native translation in place (and, if so,
+ * jump to it now).
+ */
+    GET_JIT_THRESHOLD %ecx rINST
+    EXPORT_PC
+    movb   rINSTbl,(%edx,%eax)   # reset counter
+    movl   offGlue_self(%ecx),rINST
+    movl   rPC,OUT_ARG0(%esp)
+    call   dvmJitGetCodeAddr   # already have one?
+    movl   %eax,offThread_inJitCodeCache(rINST)   # set the inJitCodeCache flag
+    cmpl   $$0,%eax
+    jz     1f
+    call   *%eax        # FIXME: decide call vs/ jmp!.  No return either way
+1:
+    movl   $$kJitTSelectRequest,%eax
+    # On entry, eax<- jitState, rPC valid
+common_selectTrace:
+    movl   rGLUE,%ecx
+    movl   %eax,offGlue_jitState(%ecx)
+    movl   $$kInterpEntryInstr,offGlue_entryPoint(%ecx)
+    movl   $$1,rINST
+    jmp    common_gotoBail
+#else
     ADVANCE_PC_INDEXED rINST
     FETCH_INST
     GOTO_NEXT
+#endif
 
 
 
diff --git a/vm/mterp/x86/header.S b/vm/mterp/x86/header.S
index cb2ddf8..dc4fdc2 100644
--- a/vm/mterp/x86/header.S
+++ b/vm/mterp/x86/header.S
@@ -48,7 +48,7 @@
 Mterp notes:
 
 Some key interpreter variables will be assigned to registers.  Note that each
-will also have an associated spill location (mostly used useful for those assigned
+will also have an associated spill location (mostly useful for those assigned
 to callee save registers).
 
   nick     reg   purpose
@@ -62,8 +62,6 @@
    o High order 16 bits of ebx must be zero on entry to handler
    o rPC, rFP, rINSTw/rINSTbl valid on handler entry and exit
    o eax, edx and ecx are scratch, rINSTw/ebx sometimes scratch
-   o rPC is in the caller save set, and will be killed across external calls. Don't
-     forget to SPILL/UNSPILL it around call points
 
 */
 
@@ -112,6 +110,15 @@
 #define SPILL_TMP3(reg) movl reg,TMP_SPILL3(%ebp)
 #define UNSPILL_TMP3(reg) movl TMP_SPILL3(%ebp),reg
 
+#if defined(WITH_JIT)
+.macro GET_JIT_PROF_TABLE _glue _reg
+    movl    offGlue_pJitProfTable(\_glue),\_reg
+.endm
+.macro GET_JIT_THRESHOLD _glue _reg
+    movl    offGlue_jitThreshold(\_glue),\_reg
+.endm
+#endif
+
 /* save/restore the PC and/or FP from the glue struct */
 .macro SAVE_PC_FP_TO_GLUE _reg
     movl     rGLUE,\_reg
@@ -585,3 +592,7 @@
  * to expand the macros into assembler assignment statements.
  */
 #include "../common/asm-constants.h"
+
+#if defined(WITH_JIT)
+#include "../common/jit-config.h"
+#endif