Collect method traces with the fast interpreter and the JIT'ed code.

Insert inline code instead of switching to the debug interpreter in the hope
that the time stamps collected in traceview are closer to the real-world
behavior with minimal profiling overhead.

Because the inline polling still introduces additional overhead (20% ~ 100%),
it is only enabled in the special VM build called "libdvm_traceview.so".
It won't work on the emulator because it is not implemented to collect the
detailed instruction traces.

Here are some performance numbers using the FibonacciSlow microbenchmark
(i.e. recursive workloads / the shorter the faster):

       time: configuration
  8,162,602: profiling off/libdvm.so/JIT off
  2,801,829: profiling off/libdvm.so/JIT on
  9,952,236: profiling off/libdvm_traceview.so/JIT off
  4,465,701: profiling off/libdvm_traceview.so/JIT on
164,786,585: profiling on/libdvm.so/JIT off
164,664,634: profiling on/libdvm.so/JIT on
 11,231,707: profiling on/libdvm_traceview.so/JIT off
  8,427,846: profiling on/libdvm_traceview.so/JIT on

Comparing the 8,427,846 vs 164,664,634 numbers against the true baseline
performance number of 2,801,829, the new libdvm_traceview.so improves the time
skew from 58x to 3x.

Change-Id: I48611a3a4ff9c4950059249e5503c26abd6b138e
diff --git a/vm/Android.mk b/vm/Android.mk
index 98f004c..691813e 100644
--- a/vm/Android.mk
+++ b/vm/Android.mk
@@ -77,7 +77,7 @@
     LOCAL_MODULE := libdvm_sv
     include $(BUILD_SHARED_LIBRARY)
 
-    # Devivation #3
+    # Derivation #3
     # Compile out the JIT
     WITH_JIT := false
     include $(LOCAL_PATH)/ReconfigureDvm.mk
@@ -86,6 +86,13 @@
     LOCAL_MODULE := libdvm_interp
     include $(BUILD_SHARED_LIBRARY)
 
+    # Derivation #4
+    WITH_JIT := true
+    include $(LOCAL_PATH)/ReconfigureDvm.mk
+
+    LOCAL_CFLAGS += $(target_smp_flag) -DWITH_INLINE_PROFILING
+    LOCAL_MODULE := libdvm_traceview
+    include $(BUILD_SHARED_LIBRARY)
 endif
 
 #
diff --git a/vm/Init.c b/vm/Init.c
index 81a385a..1a3d359 100644
--- a/vm/Init.c
+++ b/vm/Init.c
@@ -197,6 +197,9 @@
 #if ANDROID_SMP != 0
         " smp"
 #endif
+#ifdef WITH_INLINE_PROFILING
+        " inline_profiling"
+#endif
     );
 #ifdef DVM_SHOW_EXCEPTION
     dvmFprintf(stderr, " show_exception=%d", DVM_SHOW_EXCEPTION);
diff --git a/vm/Profile.c b/vm/Profile.c
index d5dcc36..957ef0d 100644
--- a/vm/Profile.c
+++ b/vm/Profile.c
@@ -665,7 +665,6 @@
     dvmUnlockMutex(&state->startStopLock);
 }
 
-
 /*
  * We just did something with a method.  Emit a record.
  *
@@ -727,6 +726,48 @@
     *ptr++ = (u1) (clockDiff >> 24);
 }
 
+#if defined(WITH_INLINE_PROFILING)
+#include <interp/InterpDefs.h>
+
+/*
+ * Register the METHOD_TRACE_ENTER action for the fast interpreter and
+ * JIT'ed code.
+ */
+void dvmFastMethodTraceEnter(const Method* method,
+                             const struct InterpState* interpState)
+{
+    if (gDvm.activeProfilers) {
+        dvmMethodTraceAdd(interpState->self, method, METHOD_TRACE_ENTER);
+    }
+}
+
+/*
+ * Register the METHOD_TRACE_EXIT action for the fast interpreter and
+ * JIT'ed code for Java methods. The about-to-return callee method can be
+ * retrieved from interpState->method.
+ */
+void dvmFastJavaMethodTraceExit(const struct InterpState* interpState)
+{
+    if (gDvm.activeProfilers) {
+        dvmMethodTraceAdd(interpState->self, interpState->method,
+                          METHOD_TRACE_EXIT);
+    }
+}
+
+/*
+ * Register the METHOD_TRACE_EXIT action for the fast interpreter and
+ * JIT'ed code for JNI methods. The about-to-return JNI callee method is passed
+ * in explicitly.
+ */
+void dvmFastNativeMethodTraceExit(const Method* method,
+                                  const struct InterpState* interpState)
+{
+    if (gDvm.activeProfilers) {
+        dvmMethodTraceAdd(interpState->self, method, METHOD_TRACE_EXIT);
+    }
+}
+#endif
+
 /*
  * We just did something with a method.  Emit a record by setting a value
  * in a magic memory location.
@@ -858,6 +899,9 @@
  */
 void dvmStartInstructionCounting()
 {
+#if defined(WITH_INLINE_PROFILING)
+    LOGW("Instruction counting not supported with inline profiling");
+#endif
     updateActiveProfilers(1);
     /* in theory we should make this an atomic inc; in practice not important */
     gDvm.instructionCountEnableCount++;
diff --git a/vm/Profile.h b/vm/Profile.h
index 08bbf61..e2be2e0 100644
--- a/vm/Profile.h
+++ b/vm/Profile.h
@@ -163,6 +163,15 @@
 void dvmMethodTraceClassPrepBegin(void);
 void dvmMethodTraceClassPrepEnd(void);
 
+#if defined(WITH_INLINE_PROFILING)
+struct InterpState;     // extern
+void dvmFastMethodTraceEnter(const Method* method,
+                             const struct InterpState* interpState);
+void dvmFastJavaMethodTraceExit(const struct InterpState* interpState);
+void dvmFastNativeMethodTraceExit(const Method*method,
+                                  const struct InterpState* interpState);
+#endif
+
 /*
  * Start/stop alloc counting.
  */
diff --git a/vm/compiler/Compiler.c b/vm/compiler/Compiler.c
index 60f060c..8c26989 100644
--- a/vm/compiler/Compiler.c
+++ b/vm/compiler/Compiler.c
@@ -741,7 +741,11 @@
 
     dvmLockMutex(&gDvmJit.tableLock);
     jitActive = gDvmJit.pProfTable != NULL;
-    jitActivate = !(gDvm.debuggerActive || (gDvm.activeProfilers > 0));
+    bool disableJit = gDvm.debuggerActive;
+#if !defined(WITH_INLINE_PROFILING)
+    disableJit = disableJit || (gDvm.activeProfilers > 0);
+#endif
+    jitActivate = !disableJit;
 
     if (jitActivate && !jitActive) {
         gDvmJit.pProfTable = gDvmJit.pProfTableCopy;
diff --git a/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_CHAIN.S b/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_CHAIN.S
index a137d22..aaadc00 100644
--- a/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_CHAIN.S
+++ b/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_CHAIN.S
@@ -41,5 +41,12 @@
     str     r3, [rGLUE, #offGlue_methodClassDex] @ glue->methodClassDex = ...
     mov     rFP, r1                         @ fp = newFp
     str     rFP, [r2, #offThread_curFrame]  @ self->curFrame = newFp
+#if defined(WITH_INLINE_PROFILING)
+    stmfd   sp!, {r0-r2,lr}             @ preserve clobbered live registers
+    mov     r1, r6
+    @ r0=methodToCall, r1=rGlue
+    LDR_PC_LR ".LdvmFastMethodTraceEnter"
+    ldmfd   sp!, {r0-r2,lr}             @ restore registers
+#endif
 
     bx      lr                              @ return to the callee-chaining cell
diff --git a/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_NATIVE.S b/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_NATIVE.S
index 2557863..eeac2b0 100644
--- a/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_NATIVE.S
+++ b/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_NATIVE.S
@@ -41,9 +41,24 @@
     mov     r2, r0                      @ r2<- methodToCall
     mov     r0, r1                      @ r0<- newFP
     add     r1, rGLUE, #offGlue_retval  @ r1<- &retval
+#if defined(WITH_INLINE_PROFILING)
+    @ r2=methodToCall, r6=rGLUE
+    stmfd   sp!, {r2,r6}                @ to be consumed after JNI return
+    stmfd   sp!, {r0-r3}                @ preserve r0-r3
+    mov     r0, r2
+    mov     r1, r6
+    @ r0=JNIMethod, r1=rGlue
+    LDR_PC_LR ".LdvmFastMethodTraceEnter"
+    ldmfd   sp!, {r0-r3}                @ restore r0-r3
+#endif
 
     blx     r8                          @ off to the native code
 
+#if defined(WITH_INLINE_PROFILING)
+    ldmfd   sp!, {r0-r1}                @ restore r2 and r6
+    @ r0=JNIMethod, r1=rGlue
+    LDR_PC_LR ".LdvmFastNativeMethodTraceExit"
+#endif
     @ native return; r9=self, r10=newSaveArea
     @ equivalent to dvmPopJniLocals
     ldr     r2, [r10, #offStackSaveArea_returnAddr] @ r2 = chaining cell ret
diff --git a/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_NO_OPT.S b/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_NO_OPT.S
index 5be6978..044d0ee 100644
--- a/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_NO_OPT.S
+++ b/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_NO_OPT.S
@@ -46,6 +46,13 @@
     str     r3, [rGLUE, #offGlue_methodClassDex] @ glue->methodClassDex = ...
     mov     rFP, r1                         @ fp = newFp
     str     rFP, [r2, #offThread_curFrame]  @ self->curFrame = newFp
+#if defined(WITH_INLINE_PROFILING)
+    stmfd   sp!, {r0-r3}                    @ preserve r0-r3
+    mov     r1, r6
+    @ r0=methodToCall, r1=rGlue
+    LDR_PC_LR ".LdvmFastMethodTraceEnter"
+    ldmfd   sp!, {r0-r3}                    @ restore r0-r3
+#endif
 
     @ Start executing the callee
 #if defined(WITH_JIT_TUNING)
diff --git a/vm/compiler/template/armv5te/TEMPLATE_RETURN.S b/vm/compiler/template/armv5te/TEMPLATE_RETURN.S
index b7ab971..b2e71ee 100644
--- a/vm/compiler/template/armv5te/TEMPLATE_RETURN.S
+++ b/vm/compiler/template/armv5te/TEMPLATE_RETURN.S
@@ -5,6 +5,13 @@
      * address in the code cache following the invoke instruction. Otherwise
      * return to the special dvmJitToInterpNoChain entry point.
      */
+#if defined(WITH_INLINE_PROFILING)
+    stmfd   sp!, {r0-r2,lr}             @ preserve live registers
+    mov     r0, r6
+    @ r0=rGlue
+    LDR_PC_LR ".LdvmFastJavaMethodTraceExit"
+    ldmfd   sp!, {r0-r2,lr}             @ restore live registers
+#endif
     SAVEAREA_FROM_FP(r0, rFP)           @ r0<- saveArea (old)
     ldr     r10, [r0, #offStackSaveArea_prevFrame] @ r10<- saveArea->prevFrame
     ldr     r8, [rGLUE, #offGlue_pSelfSuspendCount] @ r8<- &suspendCount
diff --git a/vm/compiler/template/armv5te/footer.S b/vm/compiler/template/armv5te/footer.S
index 73fc3d7..a391dbe 100644
--- a/vm/compiler/template/armv5te/footer.S
+++ b/vm/compiler/template/armv5te/footer.S
@@ -22,9 +22,22 @@
     mov     r2, r0                      @ r2<- methodToCall
     mov     r0, r1                      @ r0<- newFP
     add     r1, rGLUE, #offGlue_retval  @ r1<- &retval
+#if defined(WITH_INLINE_PROFILING)
+    @ r2: methodToCall, r6: rGLUE
+    stmfd   sp!, {r2,r6}
+    stmfd   sp!, {r0-r3}
+    mov     r0, r2
+    mov     r1, r6
+    LDR_PC_LR ".LdvmFastMethodTraceEnter"
+    ldmfd   sp!, {r0-r3}
+#endif
 
     LDR_PC_LR "[r2, #offMethod_nativeFunc]"
 
+#if defined(WITH_INLINE_PROFILING)
+    ldmfd   sp!, {r0-r1}
+    LDR_PC_LR ".LdvmFastNativeMethodTraceExit"
+#endif
     @ Refresh Jit's on/off status
     ldr     r3, [rGLUE, #offGlue_ppJitProfTable]
 
@@ -96,6 +109,14 @@
 .LdvmSelfVerificationMemOpDecode:
     .word   dvmSelfVerificationMemOpDecode
 #endif
+#if defined(WITH_INLINE_PROFILING)
+.LdvmFastMethodTraceEnter:
+    .word   dvmFastMethodTraceEnter
+.LdvmFastNativeMethodTraceExit:
+    .word   dvmFastNativeMethodTraceExit
+.LdvmFastJavaMethodTraceExit:
+    .word   dvmFastJavaMethodTraceExit
+#endif
 .L__aeabi_cdcmple:
     .word   __aeabi_cdcmple
 .L__aeabi_cfcmple:
diff --git a/vm/compiler/template/out/CompilerTemplateAsm-armv5te-vfp.S b/vm/compiler/template/out/CompilerTemplateAsm-armv5te-vfp.S
index 60664fa..655bc54 100644
--- a/vm/compiler/template/out/CompilerTemplateAsm-armv5te-vfp.S
+++ b/vm/compiler/template/out/CompilerTemplateAsm-armv5te-vfp.S
@@ -177,6 +177,13 @@
      * address in the code cache following the invoke instruction. Otherwise
      * return to the special dvmJitToInterpNoChain entry point.
      */
+#if defined(WITH_INLINE_PROFILING)
+    stmfd   sp!, {r0-r2,lr}             @ preserve live registers
+    mov     r0, r6
+    @ r0=rGlue
+    LDR_PC_LR ".LdvmFastJavaMethodTraceExit"
+    ldmfd   sp!, {r0-r2,lr}             @ restore live registers
+#endif
     SAVEAREA_FROM_FP(r0, rFP)           @ r0<- saveArea (old)
     ldr     r10, [r0, #offStackSaveArea_prevFrame] @ r10<- saveArea->prevFrame
     ldr     r8, [rGLUE, #offGlue_pSelfSuspendCount] @ r8<- &suspendCount
@@ -274,6 +281,13 @@
     str     r3, [rGLUE, #offGlue_methodClassDex] @ glue->methodClassDex = ...
     mov     rFP, r1                         @ fp = newFp
     str     rFP, [r2, #offThread_curFrame]  @ self->curFrame = newFp
+#if defined(WITH_INLINE_PROFILING)
+    stmfd   sp!, {r0-r3}                    @ preserve r0-r3
+    mov     r1, r6
+    @ r0=methodToCall, r1=rGlue
+    LDR_PC_LR ".LdvmFastMethodTraceEnter"
+    ldmfd   sp!, {r0-r3}                    @ restore r0-r3
+#endif
 
     @ Start executing the callee
 #if defined(WITH_JIT_TUNING)
@@ -329,6 +343,13 @@
     str     r3, [rGLUE, #offGlue_methodClassDex] @ glue->methodClassDex = ...
     mov     rFP, r1                         @ fp = newFp
     str     rFP, [r2, #offThread_curFrame]  @ self->curFrame = newFp
+#if defined(WITH_INLINE_PROFILING)
+    stmfd   sp!, {r0-r2,lr}             @ preserve clobbered live registers
+    mov     r1, r6
+    @ r0=methodToCall, r1=rGlue
+    LDR_PC_LR ".LdvmFastMethodTraceEnter"
+    ldmfd   sp!, {r0-r2,lr}             @ restore registers
+#endif
 
     bx      lr                              @ return to the callee-chaining cell
 
@@ -436,9 +457,24 @@
     mov     r2, r0                      @ r2<- methodToCall
     mov     r0, r1                      @ r0<- newFP
     add     r1, rGLUE, #offGlue_retval  @ r1<- &retval
+#if defined(WITH_INLINE_PROFILING)
+    @ r2=methodToCall, r6=rGLUE
+    stmfd   sp!, {r2,r6}                @ to be consumed after JNI return
+    stmfd   sp!, {r0-r3}                @ preserve r0-r3
+    mov     r0, r2
+    mov     r1, r6
+    @ r0=JNIMethod, r1=rGlue
+    LDR_PC_LR ".LdvmFastMethodTraceEnter"
+    ldmfd   sp!, {r0-r3}                @ restore r0-r3
+#endif
 
     blx     r8                          @ off to the native code
 
+#if defined(WITH_INLINE_PROFILING)
+    ldmfd   sp!, {r0-r1}                @ restore r2 and r6
+    @ r0=JNIMethod, r1=rGlue
+    LDR_PC_LR ".LdvmFastNativeMethodTraceExit"
+#endif
     @ native return; r9=self, r10=newSaveArea
     @ equivalent to dvmPopJniLocals
     ldr     r2, [r10, #offStackSaveArea_returnAddr] @ r2 = chaining cell ret
@@ -1458,9 +1494,22 @@
     mov     r2, r0                      @ r2<- methodToCall
     mov     r0, r1                      @ r0<- newFP
     add     r1, rGLUE, #offGlue_retval  @ r1<- &retval
+#if defined(WITH_INLINE_PROFILING)
+    @ r2: methodToCall, r6: rGLUE
+    stmfd   sp!, {r2,r6}
+    stmfd   sp!, {r0-r3}
+    mov     r0, r2
+    mov     r1, r6
+    LDR_PC_LR ".LdvmFastMethodTraceEnter"
+    ldmfd   sp!, {r0-r3}
+#endif
 
     LDR_PC_LR "[r2, #offMethod_nativeFunc]"
 
+#if defined(WITH_INLINE_PROFILING)
+    ldmfd   sp!, {r0-r1}
+    LDR_PC_LR ".LdvmFastNativeMethodTraceExit"
+#endif
     @ Refresh Jit's on/off status
     ldr     r3, [rGLUE, #offGlue_ppJitProfTable]
 
@@ -1532,6 +1581,14 @@
 .LdvmSelfVerificationMemOpDecode:
     .word   dvmSelfVerificationMemOpDecode
 #endif
+#if defined(WITH_INLINE_PROFILING)
+.LdvmFastMethodTraceEnter:
+    .word   dvmFastMethodTraceEnter
+.LdvmFastNativeMethodTraceExit:
+    .word   dvmFastNativeMethodTraceExit
+.LdvmFastJavaMethodTraceExit:
+    .word   dvmFastJavaMethodTraceExit
+#endif
 .L__aeabi_cdcmple:
     .word   __aeabi_cdcmple
 .L__aeabi_cfcmple:
diff --git a/vm/compiler/template/out/CompilerTemplateAsm-armv5te.S b/vm/compiler/template/out/CompilerTemplateAsm-armv5te.S
index ccdbcca..ff552bb 100644
--- a/vm/compiler/template/out/CompilerTemplateAsm-armv5te.S
+++ b/vm/compiler/template/out/CompilerTemplateAsm-armv5te.S
@@ -177,6 +177,13 @@
      * address in the code cache following the invoke instruction. Otherwise
      * return to the special dvmJitToInterpNoChain entry point.
      */
+#if defined(WITH_INLINE_PROFILING)
+    stmfd   sp!, {r0-r2,lr}             @ preserve live registers
+    mov     r0, r6
+    @ r0=rGlue
+    LDR_PC_LR ".LdvmFastJavaMethodTraceExit"
+    ldmfd   sp!, {r0-r2,lr}             @ restore live registers
+#endif
     SAVEAREA_FROM_FP(r0, rFP)           @ r0<- saveArea (old)
     ldr     r10, [r0, #offStackSaveArea_prevFrame] @ r10<- saveArea->prevFrame
     ldr     r8, [rGLUE, #offGlue_pSelfSuspendCount] @ r8<- &suspendCount
@@ -274,6 +281,13 @@
     str     r3, [rGLUE, #offGlue_methodClassDex] @ glue->methodClassDex = ...
     mov     rFP, r1                         @ fp = newFp
     str     rFP, [r2, #offThread_curFrame]  @ self->curFrame = newFp
+#if defined(WITH_INLINE_PROFILING)
+    stmfd   sp!, {r0-r3}                    @ preserve r0-r3
+    mov     r1, r6
+    @ r0=methodToCall, r1=rGlue
+    LDR_PC_LR ".LdvmFastMethodTraceEnter"
+    ldmfd   sp!, {r0-r3}                    @ restore r0-r3
+#endif
 
     @ Start executing the callee
 #if defined(WITH_JIT_TUNING)
@@ -329,6 +343,13 @@
     str     r3, [rGLUE, #offGlue_methodClassDex] @ glue->methodClassDex = ...
     mov     rFP, r1                         @ fp = newFp
     str     rFP, [r2, #offThread_curFrame]  @ self->curFrame = newFp
+#if defined(WITH_INLINE_PROFILING)
+    stmfd   sp!, {r0-r2,lr}             @ preserve clobbered live registers
+    mov     r1, r6
+    @ r0=methodToCall, r1=rGlue
+    LDR_PC_LR ".LdvmFastMethodTraceEnter"
+    ldmfd   sp!, {r0-r2,lr}             @ restore registers
+#endif
 
     bx      lr                              @ return to the callee-chaining cell
 
@@ -436,9 +457,24 @@
     mov     r2, r0                      @ r2<- methodToCall
     mov     r0, r1                      @ r0<- newFP
     add     r1, rGLUE, #offGlue_retval  @ r1<- &retval
+#if defined(WITH_INLINE_PROFILING)
+    @ r2=methodToCall, r6=rGLUE
+    stmfd   sp!, {r2,r6}                @ to be consumed after JNI return
+    stmfd   sp!, {r0-r3}                @ preserve r0-r3
+    mov     r0, r2
+    mov     r1, r6
+    @ r0=JNIMethod, r1=rGlue
+    LDR_PC_LR ".LdvmFastMethodTraceEnter"
+    ldmfd   sp!, {r0-r3}                @ restore r0-r3
+#endif
 
     blx     r8                          @ off to the native code
 
+#if defined(WITH_INLINE_PROFILING)
+    ldmfd   sp!, {r0-r1}                @ restore r2 and r6
+    @ r0=JNIMethod, r1=rGlue
+    LDR_PC_LR ".LdvmFastNativeMethodTraceExit"
+#endif
     @ native return; r9=self, r10=newSaveArea
     @ equivalent to dvmPopJniLocals
     ldr     r2, [r10, #offStackSaveArea_returnAddr] @ r2 = chaining cell ret
@@ -1181,9 +1217,22 @@
     mov     r2, r0                      @ r2<- methodToCall
     mov     r0, r1                      @ r0<- newFP
     add     r1, rGLUE, #offGlue_retval  @ r1<- &retval
+#if defined(WITH_INLINE_PROFILING)
+    @ r2: methodToCall, r6: rGLUE
+    stmfd   sp!, {r2,r6}
+    stmfd   sp!, {r0-r3}
+    mov     r0, r2
+    mov     r1, r6
+    LDR_PC_LR ".LdvmFastMethodTraceEnter"
+    ldmfd   sp!, {r0-r3}
+#endif
 
     LDR_PC_LR "[r2, #offMethod_nativeFunc]"
 
+#if defined(WITH_INLINE_PROFILING)
+    ldmfd   sp!, {r0-r1}
+    LDR_PC_LR ".LdvmFastNativeMethodTraceExit"
+#endif
     @ Refresh Jit's on/off status
     ldr     r3, [rGLUE, #offGlue_ppJitProfTable]
 
@@ -1255,6 +1304,14 @@
 .LdvmSelfVerificationMemOpDecode:
     .word   dvmSelfVerificationMemOpDecode
 #endif
+#if defined(WITH_INLINE_PROFILING)
+.LdvmFastMethodTraceEnter:
+    .word   dvmFastMethodTraceEnter
+.LdvmFastNativeMethodTraceExit:
+    .word   dvmFastNativeMethodTraceExit
+.LdvmFastJavaMethodTraceExit:
+    .word   dvmFastJavaMethodTraceExit
+#endif
 .L__aeabi_cdcmple:
     .word   __aeabi_cdcmple
 .L__aeabi_cfcmple:
diff --git a/vm/compiler/template/out/CompilerTemplateAsm-armv7-a-neon.S b/vm/compiler/template/out/CompilerTemplateAsm-armv7-a-neon.S
index e520056..34931f8 100644
--- a/vm/compiler/template/out/CompilerTemplateAsm-armv7-a-neon.S
+++ b/vm/compiler/template/out/CompilerTemplateAsm-armv7-a-neon.S
@@ -177,6 +177,13 @@
      * address in the code cache following the invoke instruction. Otherwise
      * return to the special dvmJitToInterpNoChain entry point.
      */
+#if defined(WITH_INLINE_PROFILING)
+    stmfd   sp!, {r0-r2,lr}             @ preserve live registers
+    mov     r0, r6
+    @ r0=rGlue
+    LDR_PC_LR ".LdvmFastJavaMethodTraceExit"
+    ldmfd   sp!, {r0-r2,lr}             @ restore live registers
+#endif
     SAVEAREA_FROM_FP(r0, rFP)           @ r0<- saveArea (old)
     ldr     r10, [r0, #offStackSaveArea_prevFrame] @ r10<- saveArea->prevFrame
     ldr     r8, [rGLUE, #offGlue_pSelfSuspendCount] @ r8<- &suspendCount
@@ -274,6 +281,13 @@
     str     r3, [rGLUE, #offGlue_methodClassDex] @ glue->methodClassDex = ...
     mov     rFP, r1                         @ fp = newFp
     str     rFP, [r2, #offThread_curFrame]  @ self->curFrame = newFp
+#if defined(WITH_INLINE_PROFILING)
+    stmfd   sp!, {r0-r3}                    @ preserve r0-r3
+    mov     r1, r6
+    @ r0=methodToCall, r1=rGlue
+    LDR_PC_LR ".LdvmFastMethodTraceEnter"
+    ldmfd   sp!, {r0-r3}                    @ restore r0-r3
+#endif
 
     @ Start executing the callee
 #if defined(WITH_JIT_TUNING)
@@ -329,6 +343,13 @@
     str     r3, [rGLUE, #offGlue_methodClassDex] @ glue->methodClassDex = ...
     mov     rFP, r1                         @ fp = newFp
     str     rFP, [r2, #offThread_curFrame]  @ self->curFrame = newFp
+#if defined(WITH_INLINE_PROFILING)
+    stmfd   sp!, {r0-r2,lr}             @ preserve clobbered live registers
+    mov     r1, r6
+    @ r0=methodToCall, r1=rGlue
+    LDR_PC_LR ".LdvmFastMethodTraceEnter"
+    ldmfd   sp!, {r0-r2,lr}             @ restore registers
+#endif
 
     bx      lr                              @ return to the callee-chaining cell
 
@@ -436,9 +457,24 @@
     mov     r2, r0                      @ r2<- methodToCall
     mov     r0, r1                      @ r0<- newFP
     add     r1, rGLUE, #offGlue_retval  @ r1<- &retval
+#if defined(WITH_INLINE_PROFILING)
+    @ r2=methodToCall, r6=rGLUE
+    stmfd   sp!, {r2,r6}                @ to be consumed after JNI return
+    stmfd   sp!, {r0-r3}                @ preserve r0-r3
+    mov     r0, r2
+    mov     r1, r6
+    @ r0=JNIMethod, r1=rGlue
+    LDR_PC_LR ".LdvmFastMethodTraceEnter"
+    ldmfd   sp!, {r0-r3}                @ restore r0-r3
+#endif
 
     blx     r8                          @ off to the native code
 
+#if defined(WITH_INLINE_PROFILING)
+    ldmfd   sp!, {r0-r1}                @ restore r2 and r6
+    @ r0=JNIMethod, r1=rGlue
+    LDR_PC_LR ".LdvmFastNativeMethodTraceExit"
+#endif
     @ native return; r9=self, r10=newSaveArea
     @ equivalent to dvmPopJniLocals
     ldr     r2, [r10, #offStackSaveArea_returnAddr] @ r2 = chaining cell ret
@@ -1458,9 +1494,22 @@
     mov     r2, r0                      @ r2<- methodToCall
     mov     r0, r1                      @ r0<- newFP
     add     r1, rGLUE, #offGlue_retval  @ r1<- &retval
+#if defined(WITH_INLINE_PROFILING)
+    @ r2: methodToCall, r6: rGLUE
+    stmfd   sp!, {r2,r6}
+    stmfd   sp!, {r0-r3}
+    mov     r0, r2
+    mov     r1, r6
+    LDR_PC_LR ".LdvmFastMethodTraceEnter"
+    ldmfd   sp!, {r0-r3}
+#endif
 
     LDR_PC_LR "[r2, #offMethod_nativeFunc]"
 
+#if defined(WITH_INLINE_PROFILING)
+    ldmfd   sp!, {r0-r1}
+    LDR_PC_LR ".LdvmFastNativeMethodTraceExit"
+#endif
     @ Refresh Jit's on/off status
     ldr     r3, [rGLUE, #offGlue_ppJitProfTable]
 
@@ -1532,6 +1581,14 @@
 .LdvmSelfVerificationMemOpDecode:
     .word   dvmSelfVerificationMemOpDecode
 #endif
+#if defined(WITH_INLINE_PROFILING)
+.LdvmFastMethodTraceEnter:
+    .word   dvmFastMethodTraceEnter
+.LdvmFastNativeMethodTraceExit:
+    .word   dvmFastNativeMethodTraceExit
+.LdvmFastJavaMethodTraceExit:
+    .word   dvmFastJavaMethodTraceExit
+#endif
 .L__aeabi_cdcmple:
     .word   __aeabi_cdcmple
 .L__aeabi_cfcmple:
diff --git a/vm/compiler/template/out/CompilerTemplateAsm-armv7-a.S b/vm/compiler/template/out/CompilerTemplateAsm-armv7-a.S
index 87a0691..b10beef 100644
--- a/vm/compiler/template/out/CompilerTemplateAsm-armv7-a.S
+++ b/vm/compiler/template/out/CompilerTemplateAsm-armv7-a.S
@@ -177,6 +177,13 @@
      * address in the code cache following the invoke instruction. Otherwise
      * return to the special dvmJitToInterpNoChain entry point.
      */
+#if defined(WITH_INLINE_PROFILING)
+    stmfd   sp!, {r0-r2,lr}             @ preserve live registers
+    mov     r0, r6
+    @ r0=rGlue
+    LDR_PC_LR ".LdvmFastJavaMethodTraceExit"
+    ldmfd   sp!, {r0-r2,lr}             @ restore live registers
+#endif
     SAVEAREA_FROM_FP(r0, rFP)           @ r0<- saveArea (old)
     ldr     r10, [r0, #offStackSaveArea_prevFrame] @ r10<- saveArea->prevFrame
     ldr     r8, [rGLUE, #offGlue_pSelfSuspendCount] @ r8<- &suspendCount
@@ -274,6 +281,13 @@
     str     r3, [rGLUE, #offGlue_methodClassDex] @ glue->methodClassDex = ...
     mov     rFP, r1                         @ fp = newFp
     str     rFP, [r2, #offThread_curFrame]  @ self->curFrame = newFp
+#if defined(WITH_INLINE_PROFILING)
+    stmfd   sp!, {r0-r3}                    @ preserve r0-r3
+    mov     r1, r6
+    @ r0=methodToCall, r1=rGlue
+    LDR_PC_LR ".LdvmFastMethodTraceEnter"
+    ldmfd   sp!, {r0-r3}                    @ restore r0-r3
+#endif
 
     @ Start executing the callee
 #if defined(WITH_JIT_TUNING)
@@ -329,6 +343,13 @@
     str     r3, [rGLUE, #offGlue_methodClassDex] @ glue->methodClassDex = ...
     mov     rFP, r1                         @ fp = newFp
     str     rFP, [r2, #offThread_curFrame]  @ self->curFrame = newFp
+#if defined(WITH_INLINE_PROFILING)
+    stmfd   sp!, {r0-r2,lr}             @ preserve clobbered live registers
+    mov     r1, r6
+    @ r0=methodToCall, r1=rGlue
+    LDR_PC_LR ".LdvmFastMethodTraceEnter"
+    ldmfd   sp!, {r0-r2,lr}             @ restore registers
+#endif
 
     bx      lr                              @ return to the callee-chaining cell
 
@@ -436,9 +457,24 @@
     mov     r2, r0                      @ r2<- methodToCall
     mov     r0, r1                      @ r0<- newFP
     add     r1, rGLUE, #offGlue_retval  @ r1<- &retval
+#if defined(WITH_INLINE_PROFILING)
+    @ r2=methodToCall, r6=rGLUE
+    stmfd   sp!, {r2,r6}                @ to be consumed after JNI return
+    stmfd   sp!, {r0-r3}                @ preserve r0-r3
+    mov     r0, r2
+    mov     r1, r6
+    @ r0=JNIMethod, r1=rGlue
+    LDR_PC_LR ".LdvmFastMethodTraceEnter"
+    ldmfd   sp!, {r0-r3}                @ restore r0-r3
+#endif
 
     blx     r8                          @ off to the native code
 
+#if defined(WITH_INLINE_PROFILING)
+    ldmfd   sp!, {r0-r1}                @ restore r2 and r6
+    @ r0=JNIMethod, r1=rGlue
+    LDR_PC_LR ".LdvmFastNativeMethodTraceExit"
+#endif
     @ native return; r9=self, r10=newSaveArea
     @ equivalent to dvmPopJniLocals
     ldr     r2, [r10, #offStackSaveArea_returnAddr] @ r2 = chaining cell ret
@@ -1458,9 +1494,22 @@
     mov     r2, r0                      @ r2<- methodToCall
     mov     r0, r1                      @ r0<- newFP
     add     r1, rGLUE, #offGlue_retval  @ r1<- &retval
+#if defined(WITH_INLINE_PROFILING)
+    @ r2: methodToCall, r6: rGLUE
+    stmfd   sp!, {r2,r6}
+    stmfd   sp!, {r0-r3}
+    mov     r0, r2
+    mov     r1, r6
+    LDR_PC_LR ".LdvmFastMethodTraceEnter"
+    ldmfd   sp!, {r0-r3}
+#endif
 
     LDR_PC_LR "[r2, #offMethod_nativeFunc]"
 
+#if defined(WITH_INLINE_PROFILING)
+    ldmfd   sp!, {r0-r1}
+    LDR_PC_LR ".LdvmFastNativeMethodTraceExit"
+#endif
     @ Refresh Jit's on/off status
     ldr     r3, [rGLUE, #offGlue_ppJitProfTable]
 
@@ -1532,6 +1581,14 @@
 .LdvmSelfVerificationMemOpDecode:
     .word   dvmSelfVerificationMemOpDecode
 #endif
+#if defined(WITH_INLINE_PROFILING)
+.LdvmFastMethodTraceEnter:
+    .word   dvmFastMethodTraceEnter
+.LdvmFastNativeMethodTraceExit:
+    .word   dvmFastNativeMethodTraceExit
+.LdvmFastJavaMethodTraceExit:
+    .word   dvmFastJavaMethodTraceExit
+#endif
 .L__aeabi_cdcmple:
     .word   __aeabi_cdcmple
 .L__aeabi_cfcmple:
diff --git a/vm/interp/InterpDefs.h b/vm/interp/InterpDefs.h
index a01ee68..ec1175f 100644
--- a/vm/interp/InterpDefs.h
+++ b/vm/interp/InterpDefs.h
@@ -233,7 +233,11 @@
  */
 static inline bool dvmDebuggerOrProfilerActive(void)
 {
-    return gDvm.debuggerActive || gDvm.activeProfilers != 0;
+    bool result = gDvm.debuggerActive;
+#if !defined(WITH_INLINE_PROFILING)
+    result = result || (gDvm.activeProfilers != 0);
+#endif
+    return result;
 }
 
 #if defined(WITH_JIT)
@@ -243,9 +247,11 @@
  */
 static inline bool dvmJitDebuggerOrProfilerActive()
 {
-    return gDvmJit.pProfTable != NULL
-        || gDvm.activeProfilers != 0
-        || gDvm.debuggerActive;
+    bool result = (gDvmJit.pProfTable != NULL) || gDvm.debuggerActive;
+#if !defined(WITH_INLINE_PROFILING)
+    result = result || (gDvm.activeProfilers != 0);
+#endif
+    return result;
 }
 #endif
 
diff --git a/vm/mterp/Mterp.c b/vm/mterp/Mterp.c
index dbf5003..4a3fc34 100644
--- a/vm/mterp/Mterp.c
+++ b/vm/mterp/Mterp.c
@@ -82,6 +82,17 @@
     glue->ppJitProfTable = &gDvmJit.pProfTable;
     glue->jitThreshold = gDvmJit.threshold;
 #endif
+#if defined(WITH_INLINE_PROFILING)
+    /*
+     * If WITH_INLINE_PROFILING is defined, we won't switch to the debug
+     * interpreter when a new method is entered. So we need to register the
+     * METHOD_ENTER action here.
+     */
+    if (glue->debugIsMethodEntry) {
+        glue->debugIsMethodEntry = false;
+        TRACE_METHOD_ENTER(self, glue->method);
+    }
+#endif
     if (gDvm.jdwpConfigured) {
         glue->pDebuggerActive = &gDvm.debuggerActive;
     } else {
diff --git a/vm/mterp/armv5te/footer.S b/vm/mterp/armv5te/footer.S
index 2ba5357..41bcb24 100644
--- a/vm/mterp/armv5te/footer.S
+++ b/vm/mterp/armv5te/footer.S
@@ -446,8 +446,18 @@
     cmp     r1, #0                      @ debugger enabled?
     ldrneb  r1, [r1]                    @ yes, r1<- debuggerActive (boolean)
     ldr     r2, [r2]                    @ r2<- activeProfilers (int)
-    orrne   ip, ip, r1                  @ ip<- suspendCount | debuggerActive
+    orrnes  ip, ip, r1                  @ ip<- suspendCount | debuggerActive
+    /*
+     * Don't switch the interpreter in the libdvm_traceview build even if the
+     * profiler is active.
+     * The code here is opted for less intrusion instead of performance.
+     * That is, *pActiveProfilers is still loaded into r2 even though it is not
+     * used when WITH_INLINE_PROFILING is defined.
+     */
+#if !defined(WITH_INLINE_PROFILING)
     orrs    ip, ip, r2                  @ ip<- suspend|debugger|profiler; set Z
+#endif
+
 
     bxeq    lr                          @ all zero, return
 
@@ -484,16 +494,21 @@
      * Reload the debugger/profiler enable flags.  We're checking to see
      * if either of these got set while we were suspended.
      *
-     * We can't really avoid the #ifdefs here, because the fields don't
-     * exist when the feature is disabled.
+     * If WITH_INLINE_PROFILING is configured, don't check whether the profiler
+     * is enabled or not as the profiling will be done inline.
      */
     ldr     r1, [rGLUE, #offGlue_pDebuggerActive]   @ r1<- &debuggerActive
     cmp     r1, #0                      @ debugger enabled?
     ldrneb  r1, [r1]                    @ yes, r1<- debuggerActive (boolean)
+
+#if !defined(WITH_INLINE_PROFILING)
     ldr     r2, [rGLUE, #offGlue_pActiveProfilers]  @ r2<- &activeProfilers
     ldr     r2, [r2]                    @ r2<- activeProfilers (int)
-
     orrs    r1, r1, r2
+#else
+    cmp     r1, #0                      @ only consult the debuggerActive flag
+#endif
+
     beq     2f
 
 1:  @ debugger/profiler enabled, bail out; glue->entryPoint was set above
@@ -620,6 +635,13 @@
     mov     r9, #0
     str     r9, [r10, #offStackSaveArea_returnAddr]
 #endif
+#if defined(WITH_INLINE_PROFILING)
+    stmfd   sp!, {r0-r3}                @ preserve r0-r3
+    mov     r1, r6
+    @ r0=methodToCall, r1=rGlue
+    bl      dvmFastMethodTraceEnter
+    ldmfd   sp!, {r0-r3}                @ restore r0-r3
+#endif
     str     r0, [r10, #offStackSaveArea_method]
     tst     r3, #ACC_NATIVE
     bne     .LinvokeNative
@@ -689,10 +711,21 @@
 .Lskip:
 #endif
 
+#if defined(WITH_INLINE_PROFILING)
+    @ r2=JNIMethod, r6=rGLUE
+    stmfd   sp!, {r2,r6}
+#endif
+
     @mov     lr, pc                      @ set return addr
     @ldr     pc, [r2, #offMethod_nativeFunc] @ pc<- methodToCall->nativeFunc
     LDR_PC_LR "[r2, #offMethod_nativeFunc]"
 
+#if defined(WITH_INLINE_PROFILING)
+    @ r0=JNIMethod, r1=rGLUE
+    ldmfd   sp!, {r0-r1}
+    bl      dvmFastNativeMethodTraceExit
+#endif
+
 #if defined(WITH_JIT)
     ldr     r3, [rGLUE, #offGlue_ppJitProfTable] @ Refresh Jit's on/off status
 #endif
@@ -765,6 +798,13 @@
     mov     r9, #0
     bl      common_periodicChecks
 
+#if defined(WITH_INLINE_PROFILING)
+    stmfd   sp!, {r0-r3}                @ preserve r0-r3
+    mov     r0, r6
+    @ r0=rGlue
+    bl      dvmFastJavaMethodTraceExit
+    ldmfd   sp!, {r0-r3}                @ restore r0-r3
+#endif
     SAVEAREA_FROM_FP(r0, rFP)           @ r0<- saveArea (old)
     ldr     rFP, [r0, #offStackSaveArea_prevFrame] @ fp = saveArea->prevFrame
     ldr     r9, [r0, #offStackSaveArea_savedPc] @ r9 = saveArea->savedPc
diff --git a/vm/mterp/out/InterpAsm-armv4t.S b/vm/mterp/out/InterpAsm-armv4t.S
index 3593a6e..de12cd9 100644
--- a/vm/mterp/out/InterpAsm-armv4t.S
+++ b/vm/mterp/out/InterpAsm-armv4t.S
@@ -10294,8 +10294,18 @@
     cmp     r1, #0                      @ debugger enabled?
     ldrneb  r1, [r1]                    @ yes, r1<- debuggerActive (boolean)
     ldr     r2, [r2]                    @ r2<- activeProfilers (int)
-    orrne   ip, ip, r1                  @ ip<- suspendCount | debuggerActive
+    orrnes  ip, ip, r1                  @ ip<- suspendCount | debuggerActive
+    /*
+     * Don't switch the interpreter in the libdvm_traceview build even if the
+     * profiler is active.
+     * The code here opts for less intrusion rather than raw performance.
+     * That is, *pActiveProfilers is still loaded into r2 even though it is not
+     * used when WITH_INLINE_PROFILING is defined.
+     */
+#if !defined(WITH_INLINE_PROFILING)
     orrs    ip, ip, r2                  @ ip<- suspend|debugger|profiler; set Z
+#endif
+
 
     bxeq    lr                          @ all zero, return
 
@@ -10332,16 +10342,21 @@
      * Reload the debugger/profiler enable flags.  We're checking to see
      * if either of these got set while we were suspended.
      *
-     * We can't really avoid the #ifdefs here, because the fields don't
-     * exist when the feature is disabled.
+     * If WITH_INLINE_PROFILING is configured, don't check whether the profiler
+     * is enabled or not, as the profiling will be done inline.
      */
     ldr     r1, [rGLUE, #offGlue_pDebuggerActive]   @ r1<- &debuggerActive
     cmp     r1, #0                      @ debugger enabled?
     ldrneb  r1, [r1]                    @ yes, r1<- debuggerActive (boolean)
+
+#if !defined(WITH_INLINE_PROFILING)
     ldr     r2, [rGLUE, #offGlue_pActiveProfilers]  @ r2<- &activeProfilers
     ldr     r2, [r2]                    @ r2<- activeProfilers (int)
-
     orrs    r1, r1, r2
+#else
+    cmp     r1, #0                      @ only consult the debuggerActive flag
+#endif
+
     beq     2f
 
 1:  @ debugger/profiler enabled, bail out; glue->entryPoint was set above
@@ -10468,6 +10483,13 @@
     mov     r9, #0
     str     r9, [r10, #offStackSaveArea_returnAddr]
 #endif
+#if defined(WITH_INLINE_PROFILING)
+    stmfd   sp!, {r0-r3}                @ preserve r0-r3
+    mov     r1, r6
+    @ r0=methodToCall, r1=rGlue
+    bl      dvmFastMethodTraceEnter
+    ldmfd   sp!, {r0-r3}                @ restore r0-r3
+#endif
     str     r0, [r10, #offStackSaveArea_method]
     tst     r3, #ACC_NATIVE
     bne     .LinvokeNative
@@ -10537,10 +10559,21 @@
 .Lskip:
 #endif
 
+#if defined(WITH_INLINE_PROFILING)
+    @ r2=JNIMethod, r6=rGLUE
+    stmfd   sp!, {r2,r6}
+#endif
+
     @mov     lr, pc                      @ set return addr
     @ldr     pc, [r2, #offMethod_nativeFunc] @ pc<- methodToCall->nativeFunc
     LDR_PC_LR "[r2, #offMethod_nativeFunc]"
 
+#if defined(WITH_INLINE_PROFILING)
+    @ r0=JNIMethod, r1=rGLUE
+    ldmfd   sp!, {r0-r1}
+    bl      dvmFastNativeMethodTraceExit
+#endif
+
 #if defined(WITH_JIT)
     ldr     r3, [rGLUE, #offGlue_ppJitProfTable] @ Refresh Jit's on/off status
 #endif
@@ -10613,6 +10646,13 @@
     mov     r9, #0
     bl      common_periodicChecks
 
+#if defined(WITH_INLINE_PROFILING)
+    stmfd   sp!, {r0-r3}                @ preserve r0-r3
+    mov     r0, r6
+    @ r0=rGlue
+    bl      dvmFastJavaMethodTraceExit
+    ldmfd   sp!, {r0-r3}                @ restore r0-r3
+#endif
     SAVEAREA_FROM_FP(r0, rFP)           @ r0<- saveArea (old)
     ldr     rFP, [r0, #offStackSaveArea_prevFrame] @ fp = saveArea->prevFrame
     ldr     r9, [r0, #offStackSaveArea_savedPc] @ r9 = saveArea->savedPc
diff --git a/vm/mterp/out/InterpAsm-armv5te-vfp.S b/vm/mterp/out/InterpAsm-armv5te-vfp.S
index 1ae9636..d144b7d 100644
--- a/vm/mterp/out/InterpAsm-armv5te-vfp.S
+++ b/vm/mterp/out/InterpAsm-armv5te-vfp.S
@@ -9832,8 +9832,18 @@
     cmp     r1, #0                      @ debugger enabled?
     ldrneb  r1, [r1]                    @ yes, r1<- debuggerActive (boolean)
     ldr     r2, [r2]                    @ r2<- activeProfilers (int)
-    orrne   ip, ip, r1                  @ ip<- suspendCount | debuggerActive
+    orrnes  ip, ip, r1                  @ ip<- suspendCount | debuggerActive
+    /*
+     * Don't switch the interpreter in the libdvm_traceview build even if the
+     * profiler is active.
+     * The code here is opted for less intrusion instead of performance.
+     * That is, *pActiveProfilers is still loaded into r2 even though it is not
+     * used when WITH_INLINE_PROFILING is defined.
+     */
+#if !defined(WITH_INLINE_PROFILING)
     orrs    ip, ip, r2                  @ ip<- suspend|debugger|profiler; set Z
+#endif
+
 
     bxeq    lr                          @ all zero, return
 
@@ -9870,16 +9880,21 @@
      * Reload the debugger/profiler enable flags.  We're checking to see
      * if either of these got set while we were suspended.
      *
-     * We can't really avoid the #ifdefs here, because the fields don't
-     * exist when the feature is disabled.
+     * If WITH_INLINE_PROFILING is configured, don't check whether the profiler
+     * is enabled or not, as the profiling will be done inline.
      */
     ldr     r1, [rGLUE, #offGlue_pDebuggerActive]   @ r1<- &debuggerActive
     cmp     r1, #0                      @ debugger enabled?
     ldrneb  r1, [r1]                    @ yes, r1<- debuggerActive (boolean)
+
+#if !defined(WITH_INLINE_PROFILING)
     ldr     r2, [rGLUE, #offGlue_pActiveProfilers]  @ r2<- &activeProfilers
     ldr     r2, [r2]                    @ r2<- activeProfilers (int)
-
     orrs    r1, r1, r2
+#else
+    cmp     r1, #0                      @ only consult the debuggerActive flag
+#endif
+
     beq     2f
 
 1:  @ debugger/profiler enabled, bail out; glue->entryPoint was set above
@@ -10006,6 +10021,13 @@
     mov     r9, #0
     str     r9, [r10, #offStackSaveArea_returnAddr]
 #endif
+#if defined(WITH_INLINE_PROFILING)
+    stmfd   sp!, {r0-r3}                @ preserve r0-r3
+    mov     r1, r6
+    @ r0=methodToCall, r1=rGlue
+    bl      dvmFastMethodTraceEnter
+    ldmfd   sp!, {r0-r3}                @ restore r0-r3
+#endif
     str     r0, [r10, #offStackSaveArea_method]
     tst     r3, #ACC_NATIVE
     bne     .LinvokeNative
@@ -10075,10 +10097,21 @@
 .Lskip:
 #endif
 
+#if defined(WITH_INLINE_PROFILING)
+    @ r2=JNIMethod, r6=rGLUE
+    stmfd   sp!, {r2,r6}
+#endif
+
     @mov     lr, pc                      @ set return addr
     @ldr     pc, [r2, #offMethod_nativeFunc] @ pc<- methodToCall->nativeFunc
     LDR_PC_LR "[r2, #offMethod_nativeFunc]"
 
+#if defined(WITH_INLINE_PROFILING)
+    @ r0=JNIMethod, r1=rGLUE
+    ldmfd   sp!, {r0-r1}
+    bl      dvmFastNativeMethodTraceExit
+#endif
+
 #if defined(WITH_JIT)
     ldr     r3, [rGLUE, #offGlue_ppJitProfTable] @ Refresh Jit's on/off status
 #endif
@@ -10151,6 +10184,13 @@
     mov     r9, #0
     bl      common_periodicChecks
 
+#if defined(WITH_INLINE_PROFILING)
+    stmfd   sp!, {r0-r3}                @ preserve r0-r3
+    mov     r0, r6
+    @ r0=rGlue
+    bl      dvmFastJavaMethodTraceExit
+    ldmfd   sp!, {r0-r3}                @ restore r0-r3
+#endif
     SAVEAREA_FROM_FP(r0, rFP)           @ r0<- saveArea (old)
     ldr     rFP, [r0, #offStackSaveArea_prevFrame] @ fp = saveArea->prevFrame
     ldr     r9, [r0, #offStackSaveArea_savedPc] @ r9 = saveArea->savedPc
diff --git a/vm/mterp/out/InterpAsm-armv5te.S b/vm/mterp/out/InterpAsm-armv5te.S
index 9b083b2..f98897a 100644
--- a/vm/mterp/out/InterpAsm-armv5te.S
+++ b/vm/mterp/out/InterpAsm-armv5te.S
@@ -10290,8 +10290,18 @@
     cmp     r1, #0                      @ debugger enabled?
     ldrneb  r1, [r1]                    @ yes, r1<- debuggerActive (boolean)
     ldr     r2, [r2]                    @ r2<- activeProfilers (int)
-    orrne   ip, ip, r1                  @ ip<- suspendCount | debuggerActive
+    orrnes  ip, ip, r1                  @ ip<- suspendCount | debuggerActive
+    /*
+     * Don't switch the interpreter in the libdvm_traceview build even if the
+     * profiler is active.
+     * The code here is opted for less intrusion instead of performance.
+     * That is, *pActiveProfilers is still loaded into r2 even though it is not
+     * used when WITH_INLINE_PROFILING is defined.
+     */
+#if !defined(WITH_INLINE_PROFILING)
     orrs    ip, ip, r2                  @ ip<- suspend|debugger|profiler; set Z
+#endif
+
 
     bxeq    lr                          @ all zero, return
 
@@ -10328,16 +10338,21 @@
      * Reload the debugger/profiler enable flags.  We're checking to see
      * if either of these got set while we were suspended.
      *
-     * We can't really avoid the #ifdefs here, because the fields don't
-     * exist when the feature is disabled.
+     * If WITH_INLINE_PROFILING is configured, don't check whether the profiler
+     * is enabled or not, as the profiling will be done inline.
      */
     ldr     r1, [rGLUE, #offGlue_pDebuggerActive]   @ r1<- &debuggerActive
     cmp     r1, #0                      @ debugger enabled?
     ldrneb  r1, [r1]                    @ yes, r1<- debuggerActive (boolean)
+
+#if !defined(WITH_INLINE_PROFILING)
     ldr     r2, [rGLUE, #offGlue_pActiveProfilers]  @ r2<- &activeProfilers
     ldr     r2, [r2]                    @ r2<- activeProfilers (int)
-
     orrs    r1, r1, r2
+#else
+    cmp     r1, #0                      @ only consult the debuggerActive flag
+#endif
+
     beq     2f
 
 1:  @ debugger/profiler enabled, bail out; glue->entryPoint was set above
@@ -10464,6 +10479,13 @@
     mov     r9, #0
     str     r9, [r10, #offStackSaveArea_returnAddr]
 #endif
+#if defined(WITH_INLINE_PROFILING)
+    stmfd   sp!, {r0-r3}                @ preserve r0-r3
+    mov     r1, r6
+    @ r0=methodToCall, r1=rGlue
+    bl      dvmFastMethodTraceEnter
+    ldmfd   sp!, {r0-r3}                @ restore r0-r3
+#endif
     str     r0, [r10, #offStackSaveArea_method]
     tst     r3, #ACC_NATIVE
     bne     .LinvokeNative
@@ -10533,10 +10555,21 @@
 .Lskip:
 #endif
 
+#if defined(WITH_INLINE_PROFILING)
+    @ r2=JNIMethod, r6=rGLUE
+    stmfd   sp!, {r2,r6}
+#endif
+
     @mov     lr, pc                      @ set return addr
     @ldr     pc, [r2, #offMethod_nativeFunc] @ pc<- methodToCall->nativeFunc
     LDR_PC_LR "[r2, #offMethod_nativeFunc]"
 
+#if defined(WITH_INLINE_PROFILING)
+    @ r0=JNIMethod, r1=rGLUE
+    ldmfd   sp!, {r0-r1}
+    bl      dvmFastNativeMethodTraceExit
+#endif
+
 #if defined(WITH_JIT)
     ldr     r3, [rGLUE, #offGlue_ppJitProfTable] @ Refresh Jit's on/off status
 #endif
@@ -10609,6 +10642,13 @@
     mov     r9, #0
     bl      common_periodicChecks
 
+#if defined(WITH_INLINE_PROFILING)
+    stmfd   sp!, {r0-r3}                @ preserve r0-r3
+    mov     r0, r6
+    @ r0=rGlue
+    bl      dvmFastJavaMethodTraceExit
+    ldmfd   sp!, {r0-r3}                @ restore r0-r3
+#endif
     SAVEAREA_FROM_FP(r0, rFP)           @ r0<- saveArea (old)
     ldr     rFP, [r0, #offStackSaveArea_prevFrame] @ fp = saveArea->prevFrame
     ldr     r9, [r0, #offStackSaveArea_savedPc] @ r9 = saveArea->savedPc
diff --git a/vm/mterp/out/InterpAsm-armv7-a-neon.S b/vm/mterp/out/InterpAsm-armv7-a-neon.S
index 173b478..635414f 100644
--- a/vm/mterp/out/InterpAsm-armv7-a-neon.S
+++ b/vm/mterp/out/InterpAsm-armv7-a-neon.S
@@ -9766,8 +9766,18 @@
     cmp     r1, #0                      @ debugger enabled?
     ldrneb  r1, [r1]                    @ yes, r1<- debuggerActive (boolean)
     ldr     r2, [r2]                    @ r2<- activeProfilers (int)
-    orrne   ip, ip, r1                  @ ip<- suspendCount | debuggerActive
+    orrnes  ip, ip, r1                  @ ip<- suspendCount | debuggerActive
+    /*
+     * Don't switch the interpreter in the libdvm_traceview build even if the
+     * profiler is active.
+     * The code here opts for less intrusion rather than raw performance.
+     * That is, *pActiveProfilers is still loaded into r2 even though it is not
+     * used when WITH_INLINE_PROFILING is defined.
+     */
+#if !defined(WITH_INLINE_PROFILING)
     orrs    ip, ip, r2                  @ ip<- suspend|debugger|profiler; set Z
+#endif
+
 
     bxeq    lr                          @ all zero, return
 
@@ -9804,16 +9814,21 @@
      * Reload the debugger/profiler enable flags.  We're checking to see
      * if either of these got set while we were suspended.
      *
-     * We can't really avoid the #ifdefs here, because the fields don't
-     * exist when the feature is disabled.
+     * If WITH_INLINE_PROFILING is configured, don't check whether the profiler
+     * is enabled or not, as the profiling will be done inline.
      */
     ldr     r1, [rGLUE, #offGlue_pDebuggerActive]   @ r1<- &debuggerActive
     cmp     r1, #0                      @ debugger enabled?
     ldrneb  r1, [r1]                    @ yes, r1<- debuggerActive (boolean)
+
+#if !defined(WITH_INLINE_PROFILING)
     ldr     r2, [rGLUE, #offGlue_pActiveProfilers]  @ r2<- &activeProfilers
     ldr     r2, [r2]                    @ r2<- activeProfilers (int)
-
     orrs    r1, r1, r2
+#else
+    cmp     r1, #0                      @ only consult the debuggerActive flag
+#endif
+
     beq     2f
 
 1:  @ debugger/profiler enabled, bail out; glue->entryPoint was set above
@@ -9940,6 +9955,13 @@
     mov     r9, #0
     str     r9, [r10, #offStackSaveArea_returnAddr]
 #endif
+#if defined(WITH_INLINE_PROFILING)
+    stmfd   sp!, {r0-r3}                @ preserve r0-r3
+    mov     r1, r6
+    @ r0=methodToCall, r1=rGlue
+    bl      dvmFastMethodTraceEnter
+    ldmfd   sp!, {r0-r3}                @ restore r0-r3
+#endif
     str     r0, [r10, #offStackSaveArea_method]
     tst     r3, #ACC_NATIVE
     bne     .LinvokeNative
@@ -10009,10 +10031,21 @@
 .Lskip:
 #endif
 
+#if defined(WITH_INLINE_PROFILING)
+    @ r2=JNIMethod, r6=rGLUE
+    stmfd   sp!, {r2,r6}
+#endif
+
     @mov     lr, pc                      @ set return addr
     @ldr     pc, [r2, #offMethod_nativeFunc] @ pc<- methodToCall->nativeFunc
     LDR_PC_LR "[r2, #offMethod_nativeFunc]"
 
+#if defined(WITH_INLINE_PROFILING)
+    @ r0=JNIMethod, r1=rGLUE
+    ldmfd   sp!, {r0-r1}
+    bl      dvmFastNativeMethodTraceExit
+#endif
+
 #if defined(WITH_JIT)
     ldr     r3, [rGLUE, #offGlue_ppJitProfTable] @ Refresh Jit's on/off status
 #endif
@@ -10085,6 +10118,13 @@
     mov     r9, #0
     bl      common_periodicChecks
 
+#if defined(WITH_INLINE_PROFILING)
+    stmfd   sp!, {r0-r3}                @ preserve r0-r3
+    mov     r0, r6
+    @ r0=rGlue
+    bl      dvmFastJavaMethodTraceExit
+    ldmfd   sp!, {r0-r3}                @ restore r0-r3
+#endif
     SAVEAREA_FROM_FP(r0, rFP)           @ r0<- saveArea (old)
     ldr     rFP, [r0, #offStackSaveArea_prevFrame] @ fp = saveArea->prevFrame
     ldr     r9, [r0, #offStackSaveArea_savedPc] @ r9 = saveArea->savedPc
diff --git a/vm/mterp/out/InterpAsm-armv7-a.S b/vm/mterp/out/InterpAsm-armv7-a.S
index 15e48a4..694d0b5 100644
--- a/vm/mterp/out/InterpAsm-armv7-a.S
+++ b/vm/mterp/out/InterpAsm-armv7-a.S
@@ -9766,8 +9766,18 @@
     cmp     r1, #0                      @ debugger enabled?
     ldrneb  r1, [r1]                    @ yes, r1<- debuggerActive (boolean)
     ldr     r2, [r2]                    @ r2<- activeProfilers (int)
-    orrne   ip, ip, r1                  @ ip<- suspendCount | debuggerActive
+    orrnes  ip, ip, r1                  @ ip<- suspendCount | debuggerActive
+    /*
+     * Don't switch the interpreter in the libdvm_traceview build even if the
+     * profiler is active.
+     * The code here opts for less intrusion rather than raw performance.
+     * That is, *pActiveProfilers is still loaded into r2 even though it is not
+     * used when WITH_INLINE_PROFILING is defined.
+     */
+#if !defined(WITH_INLINE_PROFILING)
     orrs    ip, ip, r2                  @ ip<- suspend|debugger|profiler; set Z
+#endif
+
 
     bxeq    lr                          @ all zero, return
 
@@ -9804,16 +9814,21 @@
      * Reload the debugger/profiler enable flags.  We're checking to see
      * if either of these got set while we were suspended.
      *
-     * We can't really avoid the #ifdefs here, because the fields don't
-     * exist when the feature is disabled.
+     * If WITH_INLINE_PROFILING is configured, don't check whether the profiler
+     * is enabled or not, as the profiling will be done inline.
      */
     ldr     r1, [rGLUE, #offGlue_pDebuggerActive]   @ r1<- &debuggerActive
     cmp     r1, #0                      @ debugger enabled?
     ldrneb  r1, [r1]                    @ yes, r1<- debuggerActive (boolean)
+
+#if !defined(WITH_INLINE_PROFILING)
     ldr     r2, [rGLUE, #offGlue_pActiveProfilers]  @ r2<- &activeProfilers
     ldr     r2, [r2]                    @ r2<- activeProfilers (int)
-
     orrs    r1, r1, r2
+#else
+    cmp     r1, #0                      @ only consult the debuggerActive flag
+#endif
+
     beq     2f
 
 1:  @ debugger/profiler enabled, bail out; glue->entryPoint was set above
@@ -9940,6 +9955,13 @@
     mov     r9, #0
     str     r9, [r10, #offStackSaveArea_returnAddr]
 #endif
+#if defined(WITH_INLINE_PROFILING)
+    stmfd   sp!, {r0-r3}                @ preserve r0-r3
+    mov     r1, r6
+    @ r0=methodToCall, r1=rGlue
+    bl      dvmFastMethodTraceEnter
+    ldmfd   sp!, {r0-r3}                @ restore r0-r3
+#endif
     str     r0, [r10, #offStackSaveArea_method]
     tst     r3, #ACC_NATIVE
     bne     .LinvokeNative
@@ -10009,10 +10031,21 @@
 .Lskip:
 #endif
 
+#if defined(WITH_INLINE_PROFILING)
+    @ r2=JNIMethod, r6=rGLUE
+    stmfd   sp!, {r2,r6}
+#endif
+
     @mov     lr, pc                      @ set return addr
     @ldr     pc, [r2, #offMethod_nativeFunc] @ pc<- methodToCall->nativeFunc
     LDR_PC_LR "[r2, #offMethod_nativeFunc]"
 
+#if defined(WITH_INLINE_PROFILING)
+    @ r0=JNIMethod, r1=rGLUE
+    ldmfd   sp!, {r0-r1}
+    bl      dvmFastNativeMethodTraceExit
+#endif
+
 #if defined(WITH_JIT)
     ldr     r3, [rGLUE, #offGlue_ppJitProfTable] @ Refresh Jit's on/off status
 #endif
@@ -10085,6 +10118,13 @@
     mov     r9, #0
     bl      common_periodicChecks
 
+#if defined(WITH_INLINE_PROFILING)
+    stmfd   sp!, {r0-r3}                @ preserve r0-r3
+    mov     r0, r6
+    @ r0=rGlue
+    bl      dvmFastJavaMethodTraceExit
+    ldmfd   sp!, {r0-r3}                @ restore r0-r3
+#endif
     SAVEAREA_FROM_FP(r0, rFP)           @ r0<- saveArea (old)
     ldr     rFP, [r0, #offStackSaveArea_prevFrame] @ fp = saveArea->prevFrame
     ldr     r9, [r0, #offStackSaveArea_savedPc] @ r9 = saveArea->savedPc