Automated import from //branches/master/...@142811,142811
diff --git a/vm/mterp/armv5te/footer.S b/vm/mterp/armv5te/footer.S
index 0e5898b..22ad65a 100644
--- a/vm/mterp/armv5te/footer.S
+++ b/vm/mterp/armv5te/footer.S
@@ -120,10 +120,12 @@
     @ (very few methods have > 10 args; could unroll for common cases)
     add     r3, rFP, r1, lsl #2         @ r3<- &fp[CCCC]
     sub     r10, r10, r2, lsl #2        @ r10<- "outs" area, for call args
+    ldrh    r9, [r0, #offMethod_registersSize]  @ r9<- methodToCall->regsSize
 1:  ldr     r1, [r3], #4                @ val = *fp++
     subs    r2, r2, #1                  @ count--
     str     r1, [r10], #4               @ *outs++ = val
     bne     1b                          @ ...while count != 0
+    ldrh    r3, [r0, #offMethod_outsSize]   @ r3<- methodToCall->outsSize
     b       .LinvokeArgsDone
 
 /*
@@ -137,47 +139,50 @@
     @ prepare to copy args to "outs" area of current frame
     movs    r2, rINST, lsr #12          @ r2<- B (arg count) -- test for zero
     SAVEAREA_FROM_FP(r10, rFP)          @ r10<- stack save area
-    beq     .LinvokeArgsDone            @ if no args, skip the rest
-    FETCH(r1, 2)                        @ r1<- GFED
+    FETCH(r1, 2)                        @ r1<- GFED (load here to hide latency)
+    ldrh    r9, [r0, #offMethod_registersSize]  @ r9<- methodToCall->regsSize
+    ldrh    r3, [r0, #offMethod_outsSize]  @ r3<- methodToCall->outsSize
+    beq     .LinvokeArgsDone
 
-    @ r0=methodToCall, r1=GFED, r2=count, r10=outs
+    @ r0=methodToCall, r1=GFED, r3=outSize, r2=count, r9=regSize, r10=outs
 .LinvokeNonRange:
     rsb     r2, r2, #5                  @ r2<- 5-r2
     add     pc, pc, r2, lsl #4          @ computed goto, 4 instrs each
     bl      common_abort                @ (skipped due to ARM prefetch)
 5:  and     ip, rINST, #0x0f00          @ isolate A
-    ldr     r3, [rFP, ip, lsr #6]       @ r3<- vA (shift right 8, left 2)
+    ldr     r2, [rFP, ip, lsr #6]       @ r2<- vA (shift right 8, left 2)
     mov     r0, r0                      @ nop
-    str     r3, [r10, #-4]!             @ *--outs = vA
+    str     r2, [r10, #-4]!             @ *--outs = vA
 4:  and     ip, r1, #0xf000             @ isolate G
-    ldr     r3, [rFP, ip, lsr #10]      @ r3<- vG (shift right 12, left 2)
+    ldr     r2, [rFP, ip, lsr #10]      @ r2<- vG (shift right 12, left 2)
     mov     r0, r0                      @ nop
-    str     r3, [r10, #-4]!             @ *--outs = vG
+    str     r2, [r10, #-4]!             @ *--outs = vG
 3:  and     ip, r1, #0x0f00             @ isolate F
-    ldr     r3, [rFP, ip, lsr #6]       @ r3<- vF
+    ldr     r2, [rFP, ip, lsr #6]       @ r2<- vF
     mov     r0, r0                      @ nop
-    str     r3, [r10, #-4]!             @ *--outs = vF
+    str     r2, [r10, #-4]!             @ *--outs = vF
 2:  and     ip, r1, #0x00f0             @ isolate E
-    ldr     r3, [rFP, ip, lsr #2]       @ r3<- vE
+    ldr     r2, [rFP, ip, lsr #2]       @ r2<- vE
     mov     r0, r0                      @ nop
-    str     r3, [r10, #-4]!             @ *--outs = vE
+    str     r2, [r10, #-4]!             @ *--outs = vE
 1:  and     ip, r1, #0x000f             @ isolate D
-    ldr     r3, [rFP, ip, lsl #2]       @ r3<- vD
+    ldr     r2, [rFP, ip, lsl #2]       @ r2<- vD
     mov     r0, r0                      @ nop
-    str     r3, [r10, #-4]!             @ *--outs = vD
+    str     r2, [r10, #-4]!             @ *--outs = vD
 0:  @ fall through to .LinvokeArgsDone
 
-.LinvokeArgsDone: @ r0=methodToCall
+.LinvokeArgsDone: @ r0=methodToCall, r3=outSize, r9=regSize
+    ldr     r2, [r0, #offMethod_insns]  @ r2<- method->insns
+    ldr     rINST, [r0, #offMethod_clazz]  @ rINST<- method->clazz
     @ find space for the new stack frame, check for overflow
     SAVEAREA_FROM_FP(r1, rFP)           @ r1<- stack save area
-    ldrh    r2, [r0, #offMethod_registersSize]  @ r2<- methodToCall->regsSize
-    ldrh    r3, [r0, #offMethod_outsSize]   @ r3<- methodToCall->outsSize
-    sub     r1, r1, r2, lsl #2          @ r1<- newFp (old savearea - regsSize)
+    sub     r1, r1, r9, lsl #2          @ r1<- newFp (old savearea - regsSize)
     SAVEAREA_FROM_FP(r10, r1)           @ r10<- newSaveArea
 @    bl      common_dumpRegs
     ldr     r9, [rGLUE, #offGlue_interpStackEnd]    @ r9<- interpStackEnd
     sub     r3, r10, r3, lsl #2         @ r3<- bottom (newsave - outsSize)
     cmp     r3, r9                      @ bottom < interpStackEnd?
+    ldr     r3, [r0, #offMethod_accessFlags] @ r3<- methodToCall->accessFlags
     blt     .LstackOverflow             @ yes, this frame will overflow stack
 
     @ set up newSaveArea
@@ -188,8 +193,6 @@
     str     rFP, [r10, #offStackSaveArea_prevFrame]
     str     rPC, [r10, #offStackSaveArea_savedPc]
     str     r0, [r10, #offStackSaveArea_method]
-
-    ldr     r3, [r0, #offMethod_accessFlags] @ r3<- methodToCall->accessFlags
     tst     r3, #ACC_NATIVE
     bne     .LinvokeNative
 
@@ -208,17 +211,18 @@
     ldmfd   sp!, {r0-r3}
     */
 
-    @ Update "glue" values for the new method
-    @ r0=methodToCall, r1=newFp
-    ldr     r3, [r0, #offMethod_clazz]      @ r3<- method->clazz
-    str     r0, [rGLUE, #offGlue_method]    @ glue->method = methodToCall
-    ldr     r3, [r3, #offClassObject_pDvmDex] @ r3<- method->clazz->pDvmDex
-    ldr     rPC, [r0, #offMethod_insns]     @ rPC<- method->insns
-    str     r3, [rGLUE, #offGlue_methodClassDex] @ glue->methodClassDex = ...
+    ldrh    r9, [r2]                        @ r9 <- load INST from new PC
+    ldr     r3, [rINST, #offClassObject_pDvmDex] @ r3<- method->clazz->pDvmDex
+    mov     rPC, r2                         @ publish new rPC
     ldr     r2, [rGLUE, #offGlue_self]      @ r2<- glue->self
-    FETCH_INST()                            @ load rINST from rPC
+
+    @ Update "glue" values for the new method
+    @ r0=methodToCall, r1=newFp, r2=self, r3=newMethodClassDex, r9=newINST
+    str     r0, [rGLUE, #offGlue_method]    @ glue->method = methodToCall
+    str     r3, [rGLUE, #offGlue_methodClassDex] @ glue->methodClassDex = ...
     mov     rFP, r1                         @ fp = newFp
-    GET_INST_OPCODE(ip)                     @ extract opcode from rINST
+    GET_PREFETCHED_OPCODE(ip, r9)           @ extract prefetched opcode from r9
+    mov     rINST, r9                       @ publish new rINST
     str     r1, [r2, #offThread_curFrame]   @ self->curFrame = newFp
     GOTO_OPCODE(ip)                         @ jump to next instruction
 
@@ -313,20 +317,21 @@
 
     SAVEAREA_FROM_FP(r0, rFP)           @ r0<- saveArea (old)
     ldr     rFP, [r0, #offStackSaveArea_prevFrame] @ fp = saveArea->prevFrame
+    ldr     r9, [r0, #offStackSaveArea_savedPc] @ r9 = saveArea->savedPc
     ldr     r2, [rFP, #(offStackSaveArea_method - sizeofStackSaveArea)]
                                         @ r2<- method we're returning to
+    ldr     r3, [rGLUE, #offGlue_self]  @ r3<- glue->self
     cmp     r2, #0                      @ is this a break frame?
+    ldrne   r10, [r2, #offMethod_clazz] @ r10<- method->clazz
     mov     r1, #0                      @ "want switch" = false
     beq     common_gotoBail             @ break frame, bail out completely
 
-    ldr     rPC, [r0, #offStackSaveArea_savedPc] @ pc = saveArea->savedPc
-    ldr     r3, [rGLUE, #offGlue_self]      @ r3<- glue->self
-    str     r2, [rGLUE, #offGlue_method]    @ glue->method = newSave->method
+    PREFETCH_ADVANCE_INST(rINST, r9, 3) @ advance r9, update new rINST
+    str     r2, [rGLUE, #offGlue_method]@ glue->method = newSave->method
+    ldr     r1, [r10, #offClassObject_pDvmDex]   @ r1<- method->clazz->pDvmDex
     str     rFP, [r3, #offThread_curFrame]  @ self->curFrame = fp
-    ldr     r1, [r2, #offMethod_clazz]      @ r1<- method->clazz
-    FETCH_ADVANCE_INST(3)               @ advance rPC, load rINST
-    ldr     r1, [r1, #offClassObject_pDvmDex]   @ r1<- method->clazz->pDvmDex
     GET_INST_OPCODE(ip)                 @ extract opcode from rINST
+    mov     rPC, r9                     @ publish new rPC
     str     r1, [rGLUE, #offGlue_methodClassDex]
     GOTO_OPCODE(ip)                     @ jump to next instruction
 
diff --git a/vm/mterp/armv5te/header.S b/vm/mterp/armv5te/header.S
index 6f9ba97..2e5c6ed 100644
--- a/vm/mterp/armv5te/header.S
+++ b/vm/mterp/armv5te/header.S
@@ -117,6 +117,13 @@
 #define FETCH_ADVANCE_INST(_count) ldrh    rINST, [rPC, #(_count*2)]!
 
 /*
+ * The operation performed here is similar to FETCH_ADVANCE_INST, except the
+ * src and dest registers are parameterized (not hard-wired to rPC and rINST).
+ */
+#define PREFETCH_ADVANCE_INST(_dreg, _sreg, _count) \
+        ldrh    _dreg, [_sreg, #(_count*2)]!
+
+/*
  * Fetch the next instruction from an offset specified by _reg.  Updates
  * rPC to point to the next instruction.  "_reg" must specify the distance
  * in bytes, *not* 16-bit code units, and may be a signed value.
@@ -150,6 +157,11 @@
 #define GET_INST_OPCODE(_reg)   and     _reg, rINST, #255
 
 /*
+ * Put the prefetched instruction's opcode field into the specified register.
+ */
+#define GET_PREFETCHED_OPCODE(_oreg, _ireg)   and     _oreg, _ireg, #255
+
+/*
  * Begin executing the opcode in _reg.  Because this only jumps within the
  * interpreter, we don't have to worry about pre-ARMv5 THUMB interwork.
  */
diff --git a/vm/mterp/out/InterpAsm-armv4.S b/vm/mterp/out/InterpAsm-armv4.S
index f183814..6ab5b7f 100644
--- a/vm/mterp/out/InterpAsm-armv4.S
+++ b/vm/mterp/out/InterpAsm-armv4.S
@@ -124,6 +124,13 @@
 #define FETCH_ADVANCE_INST(_count) ldrh    rINST, [rPC, #(_count*2)]!
 
 /*
+ * The operation performed here is similar to FETCH_ADVANCE_INST, except the
+ * src and dest registers are parameterized (not hard-wired to rPC and rINST).
+ */
+#define PREFETCH_ADVANCE_INST(_dreg, _sreg, _count) \
+        ldrh    _dreg, [_sreg, #(_count*2)]!
+
+/*
  * Fetch the next instruction from an offset specified by _reg.  Updates
  * rPC to point to the next instruction.  "_reg" must specify the distance
  * in bytes, *not* 16-bit code units, and may be a signed value.
@@ -157,6 +164,11 @@
 #define GET_INST_OPCODE(_reg)   and     _reg, rINST, #255
 
 /*
+ * Put the prefetched instruction's opcode field into the specified register.
+ */
+#define GET_PREFETCHED_OPCODE(_oreg, _ireg)   and     _oreg, _ireg, #255
+
+/*
  * Begin executing the opcode in _reg.  Because this only jumps within the
  * interpreter, we don't have to worry about pre-ARMv5 THUMB interwork.
  */
@@ -9406,10 +9418,12 @@
     @ (very few methods have > 10 args; could unroll for common cases)
     add     r3, rFP, r1, lsl #2         @ r3<- &fp[CCCC]
     sub     r10, r10, r2, lsl #2        @ r10<- "outs" area, for call args
+    ldrh    r9, [r0, #offMethod_registersSize]  @ r9<- methodToCall->regsSize
 1:  ldr     r1, [r3], #4                @ val = *fp++
     subs    r2, r2, #1                  @ count--
     str     r1, [r10], #4               @ *outs++ = val
     bne     1b                          @ ...while count != 0
+    ldrh    r3, [r0, #offMethod_outsSize]   @ r3<- methodToCall->outsSize
     b       .LinvokeArgsDone
 
 /*
@@ -9423,47 +9437,50 @@
     @ prepare to copy args to "outs" area of current frame
     movs    r2, rINST, lsr #12          @ r2<- B (arg count) -- test for zero
     SAVEAREA_FROM_FP(r10, rFP)          @ r10<- stack save area
-    beq     .LinvokeArgsDone            @ if no args, skip the rest
-    FETCH(r1, 2)                        @ r1<- GFED
+    FETCH(r1, 2)                        @ r1<- GFED (load here to hide latency)
+    ldrh    r9, [r0, #offMethod_registersSize]  @ r9<- methodToCall->regsSize
+    ldrh    r3, [r0, #offMethod_outsSize]  @ r3<- methodToCall->outsSize
+    beq     .LinvokeArgsDone
 
-    @ r0=methodToCall, r1=GFED, r2=count, r10=outs
+    @ r0=methodToCall, r1=GFED, r3=outSize, r2=count, r9=regSize, r10=outs
 .LinvokeNonRange:
     rsb     r2, r2, #5                  @ r2<- 5-r2
     add     pc, pc, r2, lsl #4          @ computed goto, 4 instrs each
     bl      common_abort                @ (skipped due to ARM prefetch)
 5:  and     ip, rINST, #0x0f00          @ isolate A
-    ldr     r3, [rFP, ip, lsr #6]       @ r3<- vA (shift right 8, left 2)
+    ldr     r2, [rFP, ip, lsr #6]       @ r2<- vA (shift right 8, left 2)
     mov     r0, r0                      @ nop
-    str     r3, [r10, #-4]!             @ *--outs = vA
+    str     r2, [r10, #-4]!             @ *--outs = vA
 4:  and     ip, r1, #0xf000             @ isolate G
-    ldr     r3, [rFP, ip, lsr #10]      @ r3<- vG (shift right 12, left 2)
+    ldr     r2, [rFP, ip, lsr #10]      @ r2<- vG (shift right 12, left 2)
     mov     r0, r0                      @ nop
-    str     r3, [r10, #-4]!             @ *--outs = vG
+    str     r2, [r10, #-4]!             @ *--outs = vG
 3:  and     ip, r1, #0x0f00             @ isolate F
-    ldr     r3, [rFP, ip, lsr #6]       @ r3<- vF
+    ldr     r2, [rFP, ip, lsr #6]       @ r2<- vF
     mov     r0, r0                      @ nop
-    str     r3, [r10, #-4]!             @ *--outs = vF
+    str     r2, [r10, #-4]!             @ *--outs = vF
 2:  and     ip, r1, #0x00f0             @ isolate E
-    ldr     r3, [rFP, ip, lsr #2]       @ r3<- vE
+    ldr     r2, [rFP, ip, lsr #2]       @ r2<- vE
     mov     r0, r0                      @ nop
-    str     r3, [r10, #-4]!             @ *--outs = vE
+    str     r2, [r10, #-4]!             @ *--outs = vE
 1:  and     ip, r1, #0x000f             @ isolate D
-    ldr     r3, [rFP, ip, lsl #2]       @ r3<- vD
+    ldr     r2, [rFP, ip, lsl #2]       @ r2<- vD
     mov     r0, r0                      @ nop
-    str     r3, [r10, #-4]!             @ *--outs = vD
+    str     r2, [r10, #-4]!             @ *--outs = vD
 0:  @ fall through to .LinvokeArgsDone
 
-.LinvokeArgsDone: @ r0=methodToCall
+.LinvokeArgsDone: @ r0=methodToCall, r3=outSize, r9=regSize
+    ldr     r2, [r0, #offMethod_insns]  @ r2<- method->insns
+    ldr     rINST, [r0, #offMethod_clazz]  @ rINST<- method->clazz
     @ find space for the new stack frame, check for overflow
     SAVEAREA_FROM_FP(r1, rFP)           @ r1<- stack save area
-    ldrh    r2, [r0, #offMethod_registersSize]  @ r2<- methodToCall->regsSize
-    ldrh    r3, [r0, #offMethod_outsSize]   @ r3<- methodToCall->outsSize
-    sub     r1, r1, r2, lsl #2          @ r1<- newFp (old savearea - regsSize)
+    sub     r1, r1, r9, lsl #2          @ r1<- newFp (old savearea - regsSize)
     SAVEAREA_FROM_FP(r10, r1)           @ r10<- newSaveArea
 @    bl      common_dumpRegs
     ldr     r9, [rGLUE, #offGlue_interpStackEnd]    @ r9<- interpStackEnd
     sub     r3, r10, r3, lsl #2         @ r3<- bottom (newsave - outsSize)
     cmp     r3, r9                      @ bottom < interpStackEnd?
+    ldr     r3, [r0, #offMethod_accessFlags] @ r3<- methodToCall->accessFlags
     blt     .LstackOverflow             @ yes, this frame will overflow stack
 
     @ set up newSaveArea
@@ -9474,8 +9491,6 @@
     str     rFP, [r10, #offStackSaveArea_prevFrame]
     str     rPC, [r10, #offStackSaveArea_savedPc]
     str     r0, [r10, #offStackSaveArea_method]
-
-    ldr     r3, [r0, #offMethod_accessFlags] @ r3<- methodToCall->accessFlags
     tst     r3, #ACC_NATIVE
     bne     .LinvokeNative
 
@@ -9494,17 +9509,18 @@
     ldmfd   sp!, {r0-r3}
     */
 
-    @ Update "glue" values for the new method
-    @ r0=methodToCall, r1=newFp
-    ldr     r3, [r0, #offMethod_clazz]      @ r3<- method->clazz
-    str     r0, [rGLUE, #offGlue_method]    @ glue->method = methodToCall
-    ldr     r3, [r3, #offClassObject_pDvmDex] @ r3<- method->clazz->pDvmDex
-    ldr     rPC, [r0, #offMethod_insns]     @ rPC<- method->insns
-    str     r3, [rGLUE, #offGlue_methodClassDex] @ glue->methodClassDex = ...
+    ldrh    r9, [r2]                        @ r9 <- load INST from new PC
+    ldr     r3, [rINST, #offClassObject_pDvmDex] @ r3<- method->clazz->pDvmDex
+    mov     rPC, r2                         @ publish new rPC
     ldr     r2, [rGLUE, #offGlue_self]      @ r2<- glue->self
-    FETCH_INST()                            @ load rINST from rPC
+
+    @ Update "glue" values for the new method
+    @ r0=methodToCall, r1=newFp, r2=self, r3=newMethodClassDex, r9=newINST
+    str     r0, [rGLUE, #offGlue_method]    @ glue->method = methodToCall
+    str     r3, [rGLUE, #offGlue_methodClassDex] @ glue->methodClassDex = ...
     mov     rFP, r1                         @ fp = newFp
-    GET_INST_OPCODE(ip)                     @ extract opcode from rINST
+    GET_PREFETCHED_OPCODE(ip, r9)           @ extract prefetched opcode from r9
+    mov     rINST, r9                       @ publish new rINST
     str     r1, [r2, #offThread_curFrame]   @ self->curFrame = newFp
     GOTO_OPCODE(ip)                         @ jump to next instruction
 
@@ -9599,20 +9615,21 @@
 
     SAVEAREA_FROM_FP(r0, rFP)           @ r0<- saveArea (old)
     ldr     rFP, [r0, #offStackSaveArea_prevFrame] @ fp = saveArea->prevFrame
+    ldr     r9, [r0, #offStackSaveArea_savedPc] @ r9 = saveArea->savedPc
     ldr     r2, [rFP, #(offStackSaveArea_method - sizeofStackSaveArea)]
                                         @ r2<- method we're returning to
+    ldr     r3, [rGLUE, #offGlue_self]  @ r3<- glue->self
     cmp     r2, #0                      @ is this a break frame?
+    ldrne   r10, [r2, #offMethod_clazz] @ r10<- method->clazz
     mov     r1, #0                      @ "want switch" = false
     beq     common_gotoBail             @ break frame, bail out completely
 
-    ldr     rPC, [r0, #offStackSaveArea_savedPc] @ pc = saveArea->savedPc
-    ldr     r3, [rGLUE, #offGlue_self]      @ r3<- glue->self
-    str     r2, [rGLUE, #offGlue_method]    @ glue->method = newSave->method
+    PREFETCH_ADVANCE_INST(rINST, r9, 3) @ advance r9, update new rINST
+    str     r2, [rGLUE, #offGlue_method]@ glue->method = newSave->method
+    ldr     r1, [r10, #offClassObject_pDvmDex]   @ r1<- method->clazz->pDvmDex
     str     rFP, [r3, #offThread_curFrame]  @ self->curFrame = fp
-    ldr     r1, [r2, #offMethod_clazz]      @ r1<- method->clazz
-    FETCH_ADVANCE_INST(3)               @ advance rPC, load rINST
-    ldr     r1, [r1, #offClassObject_pDvmDex]   @ r1<- method->clazz->pDvmDex
     GET_INST_OPCODE(ip)                 @ extract opcode from rINST
+    mov     rPC, r9                     @ publish new rPC
     str     r1, [rGLUE, #offGlue_methodClassDex]
     GOTO_OPCODE(ip)                     @ jump to next instruction
 
diff --git a/vm/mterp/out/InterpAsm-armv5te.S b/vm/mterp/out/InterpAsm-armv5te.S
index 26c8860..24708f5 100644
--- a/vm/mterp/out/InterpAsm-armv5te.S
+++ b/vm/mterp/out/InterpAsm-armv5te.S
@@ -124,6 +124,13 @@
 #define FETCH_ADVANCE_INST(_count) ldrh    rINST, [rPC, #(_count*2)]!
 
 /*
+ * The operation performed here is similar to FETCH_ADVANCE_INST, except the
+ * src and dest registers are parameterized (not hard-wired to rPC and rINST).
+ */
+#define PREFETCH_ADVANCE_INST(_dreg, _sreg, _count) \
+        ldrh    _dreg, [_sreg, #(_count*2)]!
+
+/*
  * Fetch the next instruction from an offset specified by _reg.  Updates
  * rPC to point to the next instruction.  "_reg" must specify the distance
  * in bytes, *not* 16-bit code units, and may be a signed value.
@@ -157,6 +164,11 @@
 #define GET_INST_OPCODE(_reg)   and     _reg, rINST, #255
 
 /*
+ * Put the prefetched instruction's opcode field into the specified register.
+ */
+#define GET_PREFETCHED_OPCODE(_oreg, _ireg)   and     _oreg, _ireg, #255
+
+/*
  * Begin executing the opcode in _reg.  Because this only jumps within the
  * interpreter, we don't have to worry about pre-ARMv5 THUMB interwork.
  */
@@ -9400,10 +9412,12 @@
     @ (very few methods have > 10 args; could unroll for common cases)
     add     r3, rFP, r1, lsl #2         @ r3<- &fp[CCCC]
     sub     r10, r10, r2, lsl #2        @ r10<- "outs" area, for call args
+    ldrh    r9, [r0, #offMethod_registersSize]  @ r9<- methodToCall->regsSize
 1:  ldr     r1, [r3], #4                @ val = *fp++
     subs    r2, r2, #1                  @ count--
     str     r1, [r10], #4               @ *outs++ = val
     bne     1b                          @ ...while count != 0
+    ldrh    r3, [r0, #offMethod_outsSize]   @ r3<- methodToCall->outsSize
     b       .LinvokeArgsDone
 
 /*
@@ -9417,47 +9431,50 @@
     @ prepare to copy args to "outs" area of current frame
     movs    r2, rINST, lsr #12          @ r2<- B (arg count) -- test for zero
     SAVEAREA_FROM_FP(r10, rFP)          @ r10<- stack save area
-    beq     .LinvokeArgsDone            @ if no args, skip the rest
-    FETCH(r1, 2)                        @ r1<- GFED
+    FETCH(r1, 2)                        @ r1<- GFED (load here to hide latency)
+    ldrh    r9, [r0, #offMethod_registersSize]  @ r9<- methodToCall->regsSize
+    ldrh    r3, [r0, #offMethod_outsSize]  @ r3<- methodToCall->outsSize
+    beq     .LinvokeArgsDone
 
-    @ r0=methodToCall, r1=GFED, r2=count, r10=outs
+    @ r0=methodToCall, r1=GFED, r3=outSize, r2=count, r9=regSize, r10=outs
 .LinvokeNonRange:
     rsb     r2, r2, #5                  @ r2<- 5-r2
     add     pc, pc, r2, lsl #4          @ computed goto, 4 instrs each
     bl      common_abort                @ (skipped due to ARM prefetch)
 5:  and     ip, rINST, #0x0f00          @ isolate A
-    ldr     r3, [rFP, ip, lsr #6]       @ r3<- vA (shift right 8, left 2)
+    ldr     r2, [rFP, ip, lsr #6]       @ r2<- vA (shift right 8, left 2)
     mov     r0, r0                      @ nop
-    str     r3, [r10, #-4]!             @ *--outs = vA
+    str     r2, [r10, #-4]!             @ *--outs = vA
 4:  and     ip, r1, #0xf000             @ isolate G
-    ldr     r3, [rFP, ip, lsr #10]      @ r3<- vG (shift right 12, left 2)
+    ldr     r2, [rFP, ip, lsr #10]      @ r2<- vG (shift right 12, left 2)
     mov     r0, r0                      @ nop
-    str     r3, [r10, #-4]!             @ *--outs = vG
+    str     r2, [r10, #-4]!             @ *--outs = vG
 3:  and     ip, r1, #0x0f00             @ isolate F
-    ldr     r3, [rFP, ip, lsr #6]       @ r3<- vF
+    ldr     r2, [rFP, ip, lsr #6]       @ r2<- vF
     mov     r0, r0                      @ nop
-    str     r3, [r10, #-4]!             @ *--outs = vF
+    str     r2, [r10, #-4]!             @ *--outs = vF
 2:  and     ip, r1, #0x00f0             @ isolate E
-    ldr     r3, [rFP, ip, lsr #2]       @ r3<- vE
+    ldr     r2, [rFP, ip, lsr #2]       @ r2<- vE
     mov     r0, r0                      @ nop
-    str     r3, [r10, #-4]!             @ *--outs = vE
+    str     r2, [r10, #-4]!             @ *--outs = vE
 1:  and     ip, r1, #0x000f             @ isolate D
-    ldr     r3, [rFP, ip, lsl #2]       @ r3<- vD
+    ldr     r2, [rFP, ip, lsl #2]       @ r2<- vD
     mov     r0, r0                      @ nop
-    str     r3, [r10, #-4]!             @ *--outs = vD
+    str     r2, [r10, #-4]!             @ *--outs = vD
 0:  @ fall through to .LinvokeArgsDone
 
-.LinvokeArgsDone: @ r0=methodToCall
+.LinvokeArgsDone: @ r0=methodToCall, r3=outSize, r9=regSize
+    ldr     r2, [r0, #offMethod_insns]  @ r2<- method->insns
+    ldr     rINST, [r0, #offMethod_clazz]  @ rINST<- method->clazz
     @ find space for the new stack frame, check for overflow
     SAVEAREA_FROM_FP(r1, rFP)           @ r1<- stack save area
-    ldrh    r2, [r0, #offMethod_registersSize]  @ r2<- methodToCall->regsSize
-    ldrh    r3, [r0, #offMethod_outsSize]   @ r3<- methodToCall->outsSize
-    sub     r1, r1, r2, lsl #2          @ r1<- newFp (old savearea - regsSize)
+    sub     r1, r1, r9, lsl #2          @ r1<- newFp (old savearea - regsSize)
     SAVEAREA_FROM_FP(r10, r1)           @ r10<- newSaveArea
 @    bl      common_dumpRegs
     ldr     r9, [rGLUE, #offGlue_interpStackEnd]    @ r9<- interpStackEnd
     sub     r3, r10, r3, lsl #2         @ r3<- bottom (newsave - outsSize)
     cmp     r3, r9                      @ bottom < interpStackEnd?
+    ldr     r3, [r0, #offMethod_accessFlags] @ r3<- methodToCall->accessFlags
     blt     .LstackOverflow             @ yes, this frame will overflow stack
 
     @ set up newSaveArea
@@ -9468,8 +9485,6 @@
     str     rFP, [r10, #offStackSaveArea_prevFrame]
     str     rPC, [r10, #offStackSaveArea_savedPc]
     str     r0, [r10, #offStackSaveArea_method]
-
-    ldr     r3, [r0, #offMethod_accessFlags] @ r3<- methodToCall->accessFlags
     tst     r3, #ACC_NATIVE
     bne     .LinvokeNative
 
@@ -9488,17 +9503,18 @@
     ldmfd   sp!, {r0-r3}
     */
 
-    @ Update "glue" values for the new method
-    @ r0=methodToCall, r1=newFp
-    ldr     r3, [r0, #offMethod_clazz]      @ r3<- method->clazz
-    str     r0, [rGLUE, #offGlue_method]    @ glue->method = methodToCall
-    ldr     r3, [r3, #offClassObject_pDvmDex] @ r3<- method->clazz->pDvmDex
-    ldr     rPC, [r0, #offMethod_insns]     @ rPC<- method->insns
-    str     r3, [rGLUE, #offGlue_methodClassDex] @ glue->methodClassDex = ...
+    ldrh    r9, [r2]                        @ r9 <- load INST from new PC
+    ldr     r3, [rINST, #offClassObject_pDvmDex] @ r3<- method->clazz->pDvmDex
+    mov     rPC, r2                         @ publish new rPC
     ldr     r2, [rGLUE, #offGlue_self]      @ r2<- glue->self
-    FETCH_INST()                            @ load rINST from rPC
+
+    @ Update "glue" values for the new method
+    @ r0=methodToCall, r1=newFp, r2=self, r3=newMethodClassDex, r9=newINST
+    str     r0, [rGLUE, #offGlue_method]    @ glue->method = methodToCall
+    str     r3, [rGLUE, #offGlue_methodClassDex] @ glue->methodClassDex = ...
     mov     rFP, r1                         @ fp = newFp
-    GET_INST_OPCODE(ip)                     @ extract opcode from rINST
+    GET_PREFETCHED_OPCODE(ip, r9)           @ extract prefetched opcode from r9
+    mov     rINST, r9                       @ publish new rINST
     str     r1, [r2, #offThread_curFrame]   @ self->curFrame = newFp
     GOTO_OPCODE(ip)                         @ jump to next instruction
 
@@ -9593,20 +9609,21 @@
 
     SAVEAREA_FROM_FP(r0, rFP)           @ r0<- saveArea (old)
     ldr     rFP, [r0, #offStackSaveArea_prevFrame] @ fp = saveArea->prevFrame
+    ldr     r9, [r0, #offStackSaveArea_savedPc] @ r9 = saveArea->savedPc
     ldr     r2, [rFP, #(offStackSaveArea_method - sizeofStackSaveArea)]
                                         @ r2<- method we're returning to
+    ldr     r3, [rGLUE, #offGlue_self]  @ r3<- glue->self
     cmp     r2, #0                      @ is this a break frame?
+    ldrne   r10, [r2, #offMethod_clazz] @ r10<- method->clazz
     mov     r1, #0                      @ "want switch" = false
     beq     common_gotoBail             @ break frame, bail out completely
 
-    ldr     rPC, [r0, #offStackSaveArea_savedPc] @ pc = saveArea->savedPc
-    ldr     r3, [rGLUE, #offGlue_self]      @ r3<- glue->self
-    str     r2, [rGLUE, #offGlue_method]    @ glue->method = newSave->method
+    PREFETCH_ADVANCE_INST(rINST, r9, 3) @ advance r9, update new rINST
+    str     r2, [rGLUE, #offGlue_method]@ glue->method = newSave->method
+    ldr     r1, [r10, #offClassObject_pDvmDex]   @ r1<- method->clazz->pDvmDex
     str     rFP, [r3, #offThread_curFrame]  @ self->curFrame = fp
-    ldr     r1, [r2, #offMethod_clazz]      @ r1<- method->clazz
-    FETCH_ADVANCE_INST(3)               @ advance rPC, load rINST
-    ldr     r1, [r1, #offClassObject_pDvmDex]   @ r1<- method->clazz->pDvmDex
     GET_INST_OPCODE(ip)                 @ extract opcode from rINST
+    mov     rPC, r9                     @ publish new rPC
     str     r1, [rGLUE, #offGlue_methodClassDex]
     GOTO_OPCODE(ip)                     @ jump to next instruction