ART: Rework Generic JNI, add ARM version

Refactors and optimizes Generic JNI and adds an ARM implementation of the
generic JNI trampoline. This version uses TwoWordReturn to pass the
trampoline results back in registers, avoiding a write to and a load from
the bottom of the alloca.
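
The stubs and the C++ trampolines now agree on the following contract:
  - artQuickGenericJniTrampoline(Thread*, sp) returns a TwoWordReturn with
    the native code pointer in the low word / first return register (0 on
    error) and the bottom of the used alloca area in the high word.
  - artQuickGenericJniEndTrampoline(Thread*, jvalue result, uint64_t result_f)
    no longer takes an SP argument; it reads the top quick frame from the
    Thread instead.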

Change-Id: I3287007c976f79c9fd32d3b3a43f2d1371bf4cd3
diff --git a/compiler/jni/jni_compiler_test.cc b/compiler/jni/jni_compiler_test.cc
index 8f4eddb..25b489b 100644
--- a/compiler/jni/jni_compiler_test.cc
+++ b/compiler/jni/jni_compiler_test.cc
@@ -1284,13 +1284,6 @@
   EXPECT_TRUE(env_->ExceptionCheck() == JNI_TRUE);
 }
 
-template <typename U, typename V> V convert(U in) {
-  DCHECK_LE(sizeof(U), sizeof(V));
-  union { U u; V v; } tmp;
-  tmp.u = in;
-  return tmp.v;
-}
-
 void Java_MyClassNatives_stackArgsIntsFirst(JNIEnv* env, jclass klass, jint i1, jint i2, jint i3,
                                             jint i4, jint i5, jint i6, jint i7, jint i8, jint i9,
                                             jint i10, jfloat f1, jfloat f2, jfloat f3, jfloat f4,
@@ -1307,25 +1300,25 @@
   EXPECT_EQ(i9, 9);
   EXPECT_EQ(i10, 10);
 
-  jint i11 = convert<jfloat, jint>(f1);
+  jint i11 = bit_cast<jfloat, jint>(f1);
   EXPECT_EQ(i11, 11);
-  jint i12 = convert<jfloat, jint>(f2);
+  jint i12 = bit_cast<jfloat, jint>(f2);
   EXPECT_EQ(i12, 12);
-  jint i13 = convert<jfloat, jint>(f3);
+  jint i13 = bit_cast<jfloat, jint>(f3);
   EXPECT_EQ(i13, 13);
-  jint i14 = convert<jfloat, jint>(f4);
+  jint i14 = bit_cast<jfloat, jint>(f4);
   EXPECT_EQ(i14, 14);
-  jint i15 = convert<jfloat, jint>(f5);
+  jint i15 = bit_cast<jfloat, jint>(f5);
   EXPECT_EQ(i15, 15);
-  jint i16 = convert<jfloat, jint>(f6);
+  jint i16 = bit_cast<jfloat, jint>(f6);
   EXPECT_EQ(i16, 16);
-  jint i17 = convert<jfloat, jint>(f7);
+  jint i17 = bit_cast<jfloat, jint>(f7);
   EXPECT_EQ(i17, 17);
-  jint i18 = convert<jfloat, jint>(f8);
+  jint i18 = bit_cast<jfloat, jint>(f8);
   EXPECT_EQ(i18, 18);
-  jint i19 = convert<jfloat, jint>(f9);
+  jint i19 = bit_cast<jfloat, jint>(f9);
   EXPECT_EQ(i19, 19);
-  jint i20 = convert<jfloat, jint>(f10);
+  jint i20 = bit_cast<jfloat, jint>(f10);
   EXPECT_EQ(i20, 20);
 }
 
@@ -1345,16 +1338,16 @@
   jint i9 = 9;
   jint i10 = 10;
 
-  jfloat f1 = convert<jint, jfloat>(11);
-  jfloat f2 = convert<jint, jfloat>(12);
-  jfloat f3 = convert<jint, jfloat>(13);
-  jfloat f4 = convert<jint, jfloat>(14);
-  jfloat f5 = convert<jint, jfloat>(15);
-  jfloat f6 = convert<jint, jfloat>(16);
-  jfloat f7 = convert<jint, jfloat>(17);
-  jfloat f8 = convert<jint, jfloat>(18);
-  jfloat f9 = convert<jint, jfloat>(19);
-  jfloat f10 = convert<jint, jfloat>(20);
+  jfloat f1 = bit_cast<jint, jfloat>(11);
+  jfloat f2 = bit_cast<jint, jfloat>(12);
+  jfloat f3 = bit_cast<jint, jfloat>(13);
+  jfloat f4 = bit_cast<jint, jfloat>(14);
+  jfloat f5 = bit_cast<jint, jfloat>(15);
+  jfloat f6 = bit_cast<jint, jfloat>(16);
+  jfloat f7 = bit_cast<jint, jfloat>(17);
+  jfloat f8 = bit_cast<jint, jfloat>(18);
+  jfloat f9 = bit_cast<jint, jfloat>(19);
+  jfloat f10 = bit_cast<jint, jfloat>(20);
 
   env_->CallStaticVoidMethod(jklass_, jmethod_, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, f1, f2,
                              f3, f4, f5, f6, f7, f8, f9, f10);
@@ -1376,25 +1369,25 @@
   EXPECT_EQ(i9, 9);
   EXPECT_EQ(i10, 10);
 
-  jint i11 = convert<jfloat, jint>(f1);
+  jint i11 = bit_cast<jfloat, jint>(f1);
   EXPECT_EQ(i11, 11);
-  jint i12 = convert<jfloat, jint>(f2);
+  jint i12 = bit_cast<jfloat, jint>(f2);
   EXPECT_EQ(i12, 12);
-  jint i13 = convert<jfloat, jint>(f3);
+  jint i13 = bit_cast<jfloat, jint>(f3);
   EXPECT_EQ(i13, 13);
-  jint i14 = convert<jfloat, jint>(f4);
+  jint i14 = bit_cast<jfloat, jint>(f4);
   EXPECT_EQ(i14, 14);
-  jint i15 = convert<jfloat, jint>(f5);
+  jint i15 = bit_cast<jfloat, jint>(f5);
   EXPECT_EQ(i15, 15);
-  jint i16 = convert<jfloat, jint>(f6);
+  jint i16 = bit_cast<jfloat, jint>(f6);
   EXPECT_EQ(i16, 16);
-  jint i17 = convert<jfloat, jint>(f7);
+  jint i17 = bit_cast<jfloat, jint>(f7);
   EXPECT_EQ(i17, 17);
-  jint i18 = convert<jfloat, jint>(f8);
+  jint i18 = bit_cast<jfloat, jint>(f8);
   EXPECT_EQ(i18, 18);
-  jint i19 = convert<jfloat, jint>(f9);
+  jint i19 = bit_cast<jfloat, jint>(f9);
   EXPECT_EQ(i19, 19);
-  jint i20 = convert<jfloat, jint>(f10);
+  jint i20 = bit_cast<jfloat, jint>(f10);
   EXPECT_EQ(i20, 20);
 }
 
@@ -1414,16 +1407,16 @@
   jint i9 = 9;
   jint i10 = 10;
 
-  jfloat f1 = convert<jint, jfloat>(11);
-  jfloat f2 = convert<jint, jfloat>(12);
-  jfloat f3 = convert<jint, jfloat>(13);
-  jfloat f4 = convert<jint, jfloat>(14);
-  jfloat f5 = convert<jint, jfloat>(15);
-  jfloat f6 = convert<jint, jfloat>(16);
-  jfloat f7 = convert<jint, jfloat>(17);
-  jfloat f8 = convert<jint, jfloat>(18);
-  jfloat f9 = convert<jint, jfloat>(19);
-  jfloat f10 = convert<jint, jfloat>(20);
+  jfloat f1 = bit_cast<jint, jfloat>(11);
+  jfloat f2 = bit_cast<jint, jfloat>(12);
+  jfloat f3 = bit_cast<jint, jfloat>(13);
+  jfloat f4 = bit_cast<jint, jfloat>(14);
+  jfloat f5 = bit_cast<jint, jfloat>(15);
+  jfloat f6 = bit_cast<jint, jfloat>(16);
+  jfloat f7 = bit_cast<jint, jfloat>(17);
+  jfloat f8 = bit_cast<jint, jfloat>(18);
+  jfloat f9 = bit_cast<jint, jfloat>(19);
+  jfloat f10 = bit_cast<jint, jfloat>(20);
 
   env_->CallStaticVoidMethod(jklass_, jmethod_, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, i1, i2, i3,
                              i4, i5, i6, i7, i8, i9, i10);
@@ -1444,25 +1437,25 @@
   EXPECT_EQ(i9, 9);
   EXPECT_EQ(i10, 10);
 
-  jint i11 = convert<jfloat, jint>(f1);
+  jint i11 = bit_cast<jfloat, jint>(f1);
   EXPECT_EQ(i11, 11);
-  jint i12 = convert<jfloat, jint>(f2);
+  jint i12 = bit_cast<jfloat, jint>(f2);
   EXPECT_EQ(i12, 12);
-  jint i13 = convert<jfloat, jint>(f3);
+  jint i13 = bit_cast<jfloat, jint>(f3);
   EXPECT_EQ(i13, 13);
-  jint i14 = convert<jfloat, jint>(f4);
+  jint i14 = bit_cast<jfloat, jint>(f4);
   EXPECT_EQ(i14, 14);
-  jint i15 = convert<jfloat, jint>(f5);
+  jint i15 = bit_cast<jfloat, jint>(f5);
   EXPECT_EQ(i15, 15);
-  jint i16 = convert<jfloat, jint>(f6);
+  jint i16 = bit_cast<jfloat, jint>(f6);
   EXPECT_EQ(i16, 16);
-  jint i17 = convert<jfloat, jint>(f7);
+  jint i17 = bit_cast<jfloat, jint>(f7);
   EXPECT_EQ(i17, 17);
-  jint i18 = convert<jfloat, jint>(f8);
+  jint i18 = bit_cast<jfloat, jint>(f8);
   EXPECT_EQ(i18, 18);
-  jint i19 = convert<jfloat, jint>(f9);
+  jint i19 = bit_cast<jfloat, jint>(f9);
   EXPECT_EQ(i19, 19);
-  jint i20 = convert<jfloat, jint>(f10);
+  jint i20 = bit_cast<jfloat, jint>(f10);
   EXPECT_EQ(i20, 20);
 }
 
@@ -1482,16 +1475,16 @@
   jint i9 = 9;
   jint i10 = 10;
 
-  jfloat f1 = convert<jint, jfloat>(11);
-  jfloat f2 = convert<jint, jfloat>(12);
-  jfloat f3 = convert<jint, jfloat>(13);
-  jfloat f4 = convert<jint, jfloat>(14);
-  jfloat f5 = convert<jint, jfloat>(15);
-  jfloat f6 = convert<jint, jfloat>(16);
-  jfloat f7 = convert<jint, jfloat>(17);
-  jfloat f8 = convert<jint, jfloat>(18);
-  jfloat f9 = convert<jint, jfloat>(19);
-  jfloat f10 = convert<jint, jfloat>(20);
+  jfloat f1 = bit_cast<jint, jfloat>(11);
+  jfloat f2 = bit_cast<jint, jfloat>(12);
+  jfloat f3 = bit_cast<jint, jfloat>(13);
+  jfloat f4 = bit_cast<jint, jfloat>(14);
+  jfloat f5 = bit_cast<jint, jfloat>(15);
+  jfloat f6 = bit_cast<jint, jfloat>(16);
+  jfloat f7 = bit_cast<jint, jfloat>(17);
+  jfloat f8 = bit_cast<jint, jfloat>(18);
+  jfloat f9 = bit_cast<jint, jfloat>(19);
+  jfloat f10 = bit_cast<jint, jfloat>(20);
 
   env_->CallStaticVoidMethod(jklass_, jmethod_, i1, f1, i2, f2, i3, f3, i4, f4, i5, f5, i6, f6, i7,
                              f7, i8, f8, i9, f9, i10, f10);
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index 83a683d..4939610 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -127,7 +127,7 @@
 
     // Ugly compile-time check, but we only have the preprocessor.
 #if (FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE != 40 + 8)
-#error "REFS_AND_ARGS_CALLEE_SAVE_FRAME(ARM64) size not as expected."
+#error "REFS_AND_ARGS_CALLEE_SAVE_FRAME(ARM) size not as expected."
 #endif
 .endm
 
@@ -1007,7 +1007,92 @@
     DELIVER_PENDING_EXCEPTION
 END art_quick_resolution_trampoline
 
-UNIMPLEMENTED art_quick_generic_jni_trampoline
+    /*
+     * Called to do a generic JNI down-call
+     */
+ENTRY art_quick_generic_jni_trampoline
+    SETUP_REF_AND_ARGS_CALLEE_SAVE_FRAME
+    str r0, [sp, #0]  // Store native ArtMethod* to bottom of stack.
+
+    // Save rSELF
+    mov r11, rSELF
+    // Save SP, so we can have static CFI info. r10 is saved in ref_and_args.
+    mov r10, sp
+    .cfi_def_cfa_register r10
+
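+    // Reserve 5K of scratch space below the callee-save frame.
+    // artQuickGenericJniTrampoline lays out the handle scope and the native
+    // call frame in this area and hands back the bottom of the part it used.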
+    sub sp, sp, #5120
+
+    // prepare for artQuickGenericJniTrampoline call
+    // (Thread*,  SP)
+    //    r0      r1   <= C calling convention
+    //  rSELF     r10  <= where they are
+
+    mov r0, rSELF   // Thread*
+    mov r1, r10
+    blx artQuickGenericJniTrampoline  // (Thread*, sp)
+
+    // The C call will have registered the complete save-frame on success.
+    // The result of the call is:
+    // r0: pointer to native code, 0 on error.
+    // r1: pointer to the bottom of the used area of the alloca, can restore stack till there.
+
+    // Check for error = 0.
+    cbz r0, .Lentry_error
+
+    // Release part of the alloca.
+    mov sp, r1
+
+    // Save the code pointer
+    mov r12, r0
+
+    // Load parameters from frame into registers.
+    pop {r0-r3}
+
+    // Softfloat.
+    // TODO: Change to hardfloat when supported.
+
+    blx r12           // native call.
+
+    // result sign extension is handled in C code
+    // prepare for artQuickGenericJniEndTrampoline call
+    // (Thread*, result, result_f)
+    //    r0      r2,r3    stack          <= C calling convention
+    //    r11     r0,r1    r0,r1          <= where they are
+    sub sp, sp, #8 // Stack alignment.
+
+    push {r0-r1}   // Pass result_f (soft-float result bits live in r0,r1).
+    mov r3, r1     // Result high word.
+    mov r2, r0     // Result low word.
+    mov r0, r11    // Thread*.
+
+    blx artQuickGenericJniEndTrampoline
+
+    // Tear down the alloca.
+    mov sp, r10
+    .cfi_def_cfa_register sp
+
+    // Restore self pointer.
+    mov r9, r11
+
+    // Pending exceptions possible.
+    ldr r2, [r9, #THREAD_EXCEPTION_OFFSET]  @ load Thread::Current()->exception_
+    cbnz r2, .Lexception_in_native
+
+    // Tear down the callee-save frame.
+    RESTORE_REF_AND_ARGS_CALLEE_SAVE_FRAME
+
+    bx lr      // ret
+
+.Lentry_error:
+    mov sp, r10
+    .cfi_def_cfa_register sp
+    mov r9, r11
+.Lexception_in_native:
+    RESTORE_REF_AND_ARGS_CALLEE_SAVE_FRAME
+    DELIVER_PENDING_EXCEPTION
+
+END art_quick_generic_jni_trampoline
 
     .extern artQuickToInterpreterBridge
 ENTRY art_quick_to_interpreter_bridge
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index e088751..7907b6e 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -1485,33 +1485,34 @@
     mov x1, xFP
     bl artQuickGenericJniTrampoline  // (Thread*, sp)
 
-    // Get the updated pointer. This is the bottom of the frame _with_ handle scope.
-    ldr xFP, [sp]
-    add x9, sp, #8
+    // The C call will have registered the complete save-frame on success.
+    // The result of the call is:
+    // x0: pointer to native code, 0 on error.
+    // x1: pointer to the bottom of the used area of the alloca, can restore stack till there.
 
-    cmp x0, #0
-    b.mi .Lentry_error      // Check for error, negative value.
+    // Check for error = 0.
+    cbz x0, .Lentry_error
 
-    // release part of the alloca.
-    add x9, x9, x0
+    // Release part of the alloca.
+    mov sp, x1
 
-    // Get the code pointer
-    ldr xIP0, [x9, #0]
+    // Save the code pointer
+    mov xIP0, x0
 
     // Load parameters from frame into registers.
     // TODO Check with artQuickGenericJniTrampoline.
     //      Also, check again APPCS64 - the stack arguments are interleaved.
-    ldp x0, x1, [x9, #8]
-    ldp x2, x3, [x9, #24]
-    ldp x4, x5, [x9, #40]
-    ldp x6, x7, [x9, #56]
+    ldp x0, x1, [sp]
+    ldp x2, x3, [sp, #16]
+    ldp x4, x5, [sp, #32]
+    ldp x6, x7, [sp, #48]
 
-    ldp d0, d1, [x9, #72]
-    ldp d2, d3, [x9, #88]
-    ldp d4, d5, [x9, #104]
-    ldp d6, d7, [x9, #120]
+    ldp d0, d1, [sp, #64]
+    ldp d2, d3, [sp, #80]
+    ldp d4, d5, [sp, #96]
+    ldp d6, d7, [sp, #112]
 
-    add sp, x9, #136
+    add sp, sp, #128
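+    // The register area holds 8 GPR and 8 FPR arguments (64 + 64 bytes);
+    // sp now points at the first stack argument of the native call, if any.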
 
     blr xIP0           // native call.
 
@@ -1520,13 +1521,11 @@
 
     // result sign extension is handled in C code
     // prepare for artQuickGenericJniEndTrampoline call
-    // (Thread*,  SP, result, result_f)
-    //   x0       x1   x2       x3       <= C calling convention
-    mov x5, x0      // Save return value
+    // (Thread*, result, result_f)
+    //    x0       x1       x2        <= C calling convention
+    mov x1, x0      // Result (integer result is in x0).
     mov x0, xSELF   // Thread register
-    mov x1, xFP     // Stack pointer
-    mov x2, x5      // Result (from saved)
-    fmov x3, d0     // d0 will contain floating point result, but needs to go into x3
+    fmov x2, d0     // d0 will contain floating point result, but needs to go into x2
 
     bl artQuickGenericJniEndTrampoline
 
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index 0326f9e..24b9e46 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -1127,8 +1127,7 @@
     // This also stores the native ArtMethod reference at the bottom of the stack.
 
     movl %esp, %ebp                 // save SP at callee-save frame
-    movl %esp, %edi
-    CFI_DEF_CFA_REGISTER(edi)
+    CFI_DEF_CFA_REGISTER(ebp)
     subl LITERAL(5120), %esp
     // prepare for artQuickGenericJniTrampoline call
     // (Thread*,  SP)
@@ -1141,46 +1140,39 @@
     pushl %fs:THREAD_SELF_OFFSET  // Pass Thread::Current().
     SETUP_GOT_NOSAVE              // Clobbers ebx.
     call PLT_SYMBOL(artQuickGenericJniTrampoline)  // (Thread*, sp)
-    // Drop call stack.
-    addl LITERAL(16), %esp
 
-    // At the bottom of the alloca we now have the name pointer to the method=bottom of callee-save
-    // get the adjusted frame pointer
-    popl %ebp
+    // The C call will have registered the complete save-frame on success.
+    // The result of the call is:
+    // eax: pointer to native code, 0 on error.
+    // edx: pointer to the bottom of the used area of the alloca, can restore stack till there.
 
-    // Check for error, negative value.
+    // Check for error = 0.
     test %eax, %eax
-    js .Lentry_error
+    jz .Lentry_error
 
-    // release part of the alloca, get the code pointer
-    addl %eax, %esp
-    popl %eax
+    // Release part of the alloca.
+    movl %edx, %esp
 
     // On x86 there are no registers passed, so nothing to pop here.
 
     // Native call.
     call *%eax
 
-    // Pop native stack, but keep the space that was reserved cookie.
-    movl %ebp, %esp
-    subl LITERAL(16), %esp        // Alignment.
-
     // result sign extension is handled in C code
     // prepare for artQuickGenericJniEndTrampoline call
-    // (Thread*,  SP,  result, result_f)
-    //  (esp)   4(esp)  8(esp)  16(esp)    <= C calling convention
-    //  fs:...    ebp  eax:edx   xmm0      <= where they are
+    // (Thread*, result, result_f)
+    //  (esp)    4(esp)  12(esp)    <= C calling convention
+    //  fs:...  eax:edx   xmm0      <= where they are
 
-    subl LITERAL(8), %esp         // Pass float result.
+    subl LITERAL(20), %esp         // Padding & pass float result.
     movsd %xmm0, (%esp)
     pushl %edx                    // Pass int result.
     pushl %eax
-    pushl %ebp                    // Pass SP (to ArtMethod).
     pushl %fs:THREAD_SELF_OFFSET  // Pass Thread::Current().
     call PLT_SYMBOL(artQuickGenericJniEndTrampoline)
 
     // Tear down the alloca.
-    movl %edi, %esp
+    movl %ebp, %esp
     CFI_DEF_CFA_REGISTER(esp)
 
     // Pending exceptions possible.
@@ -1204,7 +1196,7 @@
     punpckldq %xmm1, %xmm0
     ret
 .Lentry_error:
-    movl %edi, %esp
+    movl %ebp, %esp
     CFI_DEF_CFA_REGISTER(esp)
 .Lexception_in_native:
     RESTORE_REF_AND_ARGS_CALLEE_SAVE_FRAME
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 668fb88..8fa947c 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -1167,11 +1167,9 @@
     movq %xmm5, 56(%rsp)
     movq %xmm6, 64(%rsp)
     movq %xmm7, 72(%rsp)
-    // Store native ArtMethod* to bottom of stack.
-    movq %rdi, 0(%rsp)
-    movq %rsp, %rbp                 // save SP at callee-save frame
-    movq %rsp, %rbx
-    CFI_DEF_CFA_REGISTER(rbx)
+    movq %rdi, 0(%rsp)              // Store native ArtMethod* to bottom of stack.
+    movq %rsp, %rbp                 // save SP at (old) callee-save frame
+    CFI_DEF_CFA_REGISTER(rbp)
     //
     // reserve a lot of space
     //
@@ -1198,17 +1196,17 @@
     movq %rbp, %rsi
     call PLT_SYMBOL(artQuickGenericJniTrampoline)  // (Thread*, sp)
 
-    // At the bottom of the alloca we now have the name pointer to the method=bottom of callee-save
-    // get the adjusted frame pointer
-    popq %rbp
+    // The C call will have registered the complete save-frame on success.
+    // The result of the call is:
+    // %rax: pointer to native code, 0 on error.
+    // %rdx: pointer to the bottom of the used area of the alloca, can restore stack till there.
 
-    // Check for error, negative value.
+    // Check for error = 0.
     test %rax, %rax
-    js .Lentry_error
+    jz .Lentry_error
 
-    // release part of the alloca, get the code pointer
-    addq %rax, %rsp
-    popq %rax
+    // Release part of the alloca.
+    movq %rdx, %rsp
 
     // pop from the register-passing alloca region
     // what's the right layout?
@@ -1228,21 +1226,22 @@
     movq 48(%rsp), %xmm6
     movq 56(%rsp), %xmm7
     addq LITERAL(64), %rsp          // floating-point done
+
     // native call
-    call *%rax                      // Stack should be aligned 16B without the return addr?
+    call *%rax
+
     // result sign extension is handled in C code
     // prepare for artQuickGenericJniEndTrampoline call
-    // (Thread*,  SP, result, result_f)
-    //   rdi      rsi   rdx   rcx       <= C calling convention
-    //  gs:...    rbp   rax   xmm0      <= where they are
+    // (Thread*,  result, result_f)
+    //   rdi      rsi   rdx       <= C calling convention
+    //  gs:...    rax   xmm0      <= where they are
     movq %gs:THREAD_SELF_OFFSET, %rdi
-    movq %rbp, %rsi
-    movq %rax, %rdx
-    movq %xmm0, %rcx
+    movq %rax, %rsi
+    movq %xmm0, %rdx
     call PLT_SYMBOL(artQuickGenericJniEndTrampoline)
 
     // Tear down the alloca.
-    movq %rbx, %rsp
+    movq %rbp, %rsp
     CFI_DEF_CFA_REGISTER(rsp)
 
     // Pending exceptions possible.
@@ -1280,7 +1279,7 @@
     movq %rax, %xmm0
     ret
 .Lentry_error:
-    movq %rbx, %rsp
+    movq %rbp, %rsp
     CFI_DEF_CFA_REGISTER(rsp)
 .Lexception_in_native:
     // TODO: the handle scope contains the this pointer which is used by the debugger for exception
diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
index 7a144b6..6fb9624 100644
--- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
@@ -185,8 +185,8 @@
       case 3: return (5 * GetBytesPerGprSpillLocation(kRuntimeISA));
       case 4: return (6 * GetBytesPerGprSpillLocation(kRuntimeISA));
       default:
-        LOG(FATAL) << "Unexpected GPR index: " << gpr_index;
-        return 0;
+        LOG(FATAL) << "Unexpected GPR index: " << gpr_index;
+        return 0;
     }
   }
 #else
@@ -209,16 +209,15 @@
     return *reinterpret_cast<uintptr_t*>(lr);
   }
 
-  QuickArgumentVisitor(StackReference<mirror::ArtMethod>* sp, bool is_static,
-                       const char* shorty, uint32_t shorty_len)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) :
-      is_static_(is_static), shorty_(shorty), shorty_len_(shorty_len),
-      gpr_args_(reinterpret_cast<byte*>(sp) + kQuickCalleeSaveFrame_RefAndArgs_Gpr1Offset),
-      fpr_args_(reinterpret_cast<byte*>(sp) + kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset),
-      stack_args_(reinterpret_cast<byte*>(sp) + kQuickCalleeSaveFrame_RefAndArgs_FrameSize
-                  + StackArgumentStartFromShorty(is_static, shorty, shorty_len)),
-      gpr_index_(0), fpr_index_(0), stack_index_(0), cur_type_(Primitive::kPrimVoid),
-      is_split_long_or_double_(false) { }
+  QuickArgumentVisitor(StackReference<mirror::ArtMethod>* sp, bool is_static, const char* shorty,
+                       uint32_t shorty_len) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) :
+          is_static_(is_static), shorty_(shorty), shorty_len_(shorty_len),
+          gpr_args_(reinterpret_cast<byte*>(sp) + kQuickCalleeSaveFrame_RefAndArgs_Gpr1Offset),
+          fpr_args_(reinterpret_cast<byte*>(sp) + kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset),
+          stack_args_(reinterpret_cast<byte*>(sp) + kQuickCalleeSaveFrame_RefAndArgs_FrameSize
+                      + StackArgumentStartFromShorty(is_static, shorty, shorty_len)),
+          gpr_index_(0), fpr_index_(0), stack_index_(0), cur_type_(Primitive::kPrimVoid),
+          is_split_long_or_double_(false) {}
 
   virtual ~QuickArgumentVisitor() {}
 
@@ -388,9 +387,12 @@
     }
   }
 
+ protected:
   const bool is_static_;
   const char* const shorty_;
   const uint32_t shorty_len_;
+
+ private:
   byte* const gpr_args_;  // Address of GPR arguments in callee save frame.
   byte* const fpr_args_;  // Address of FPR arguments in callee save frame.
   byte* const stack_args_;  // Address of stack arguments in caller's frame.
@@ -409,7 +411,7 @@
   BuildQuickShadowFrameVisitor(StackReference<mirror::ArtMethod>* sp, bool is_static,
                                const char* shorty, uint32_t shorty_len, ShadowFrame* sf,
                                size_t first_arg_reg) :
-    QuickArgumentVisitor(sp, is_static, shorty, shorty_len), sf_(sf), cur_reg_(first_arg_reg) {}
+      QuickArgumentVisitor(sp, is_static, shorty, shorty_len), sf_(sf), cur_reg_(first_arg_reg) {}
 
   void Visit() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) OVERRIDE;
 
@@ -420,7 +422,7 @@
   DISALLOW_COPY_AND_ASSIGN(BuildQuickShadowFrameVisitor);
 };
 
-void BuildQuickShadowFrameVisitor::Visit()  {
+void BuildQuickShadowFrameVisitor::Visit() {
   Primitive::Type type = GetParamPrimitiveType();
   switch (type) {
     case Primitive::kPrimLong:  // Fall-through.
@@ -465,13 +467,14 @@
     return 0;
   } else {
     DCHECK(!method->IsNative()) << PrettyMethod(method);
-    const char* old_cause = self->StartAssertNoThreadSuspension("Building interpreter shadow frame");
+    const char* old_cause = self->StartAssertNoThreadSuspension(
+        "Building interpreter shadow frame");
     const DexFile::CodeItem* code_item = method->GetCodeItem();
     DCHECK(code_item != nullptr) << PrettyMethod(method);
     uint16_t num_regs = code_item->registers_size_;
     void* memory = alloca(ShadowFrame::ComputeSize(num_regs));
-    ShadowFrame* shadow_frame(ShadowFrame::Create(num_regs, NULL,  // No last shadow coming from quick.
-                                                  method, 0, memory));
+    // No last shadow coming from quick.
+    ShadowFrame* shadow_frame(ShadowFrame::Create(num_regs, nullptr, method, 0, memory));
     size_t first_arg_reg = code_item->registers_size_ - code_item->ins_size_;
     uint32_t shorty_len = 0;
     const char* shorty = method->GetShorty(&shorty_len);
@@ -512,7 +515,7 @@
   BuildQuickArgumentVisitor(StackReference<mirror::ArtMethod>* sp, bool is_static,
                             const char* shorty, uint32_t shorty_len,
                             ScopedObjectAccessUnchecked* soa, std::vector<jvalue>* args) :
-    QuickArgumentVisitor(sp, is_static, shorty, shorty_len), soa_(soa), args_(args) {}
+      QuickArgumentVisitor(sp, is_static, shorty, shorty_len), soa_(soa), args_(args) {}
 
   void Visit() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) OVERRIDE;
 
@@ -584,7 +587,8 @@
   const char* old_cause =
       self->StartAssertNoThreadSuspension("Adding to IRT proxy object arguments");
   // Register the top of the managed stack, making stack crawlable.
-  DCHECK_EQ(sp->AsMirrorPtr(), proxy_method) << PrettyMethod(proxy_method);
+  DCHECK_EQ(sp->AsMirrorPtr(), proxy_method)
+      << PrettyMethod(proxy_method);
   self->SetTopOfStack(sp, 0);
   DCHECK_EQ(proxy_method->GetFrameSizeInBytes(),
             Runtime::Current()->GetCalleeSaveMethod(Runtime::kRefsAndArgs)->GetFrameSizeInBytes())
@@ -600,7 +604,7 @@
   // Placing arguments into args vector and remove the receiver.
   mirror::ArtMethod* non_proxy_method = proxy_method->GetInterfaceMethodIfProxy();
   CHECK(!non_proxy_method->IsStatic()) << PrettyMethod(proxy_method) << " "
-      << PrettyMethod(non_proxy_method);
+                                       << PrettyMethod(non_proxy_method);
   std::vector<jvalue> args;
   uint32_t shorty_len = 0;
   const char* shorty = proxy_method->GetShorty(&shorty_len);
@@ -632,7 +636,7 @@
   RememberForGcArgumentVisitor(StackReference<mirror::ArtMethod>* sp, bool is_static,
                                const char* shorty, uint32_t shorty_len,
                                ScopedObjectAccessUnchecked* soa) :
-    QuickArgumentVisitor(sp, is_static, shorty, shorty_len), soa_(soa) {}
+      QuickArgumentVisitor(sp, is_static, shorty, shorty_len), soa_(soa) {}
 
   void Visit() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) OVERRIDE;
 
@@ -641,7 +645,8 @@
  private:
   ScopedObjectAccessUnchecked* const soa_;
   // References which we must update when exiting in case the GC moved the objects.
-  std::vector<std::pair<jobject, StackReference<mirror::Object>*>> references_;
+  std::vector<std::pair<jobject, StackReference<mirror::Object>*> > references_;
+
   DISALLOW_COPY_AND_ASSIGN(RememberForGcArgumentVisitor);
 };
 
@@ -663,7 +668,6 @@
   }
 }
 
-
 // Lazily resolve a method for quick. Called by stub code.
 extern "C" const void* artQuickResolutionTrampoline(mirror::ArtMethod* called,
                                                     mirror::Object* receiver,
@@ -740,7 +744,6 @@
         is_range = false;
     }
     dex_method_idx = (is_range) ? instr->VRegB_3rc() : instr->VRegB_35c();
-
   } else {
     invoke_type = kStatic;
     dex_file = called->GetDexFile();
@@ -825,8 +828,6 @@
   return code;
 }
 
-
-
 /*
  * This class uses a couple of observations to unite the different calling conventions through
  * a few constants.
@@ -867,7 +868,7 @@
  *                                          entry in the HandleScope (nullptr if necessary).
  *
  */
-template <class T> class BuildGenericJniFrameStateMachine {
+template<class T> class BuildNativeCallFrameStateMachine {
  public:
 #if defined(__arm__)
   // TODO: These are all dummy values!
@@ -912,7 +913,7 @@
 
   static constexpr size_t kRegistersNeededForLong = 2;
   static constexpr size_t kRegistersNeededForDouble = 2;
-  static constexpr bool kMultiRegistersAligned = false;       // x86 not using regs, anyways
+  static constexpr bool kMultiRegistersAligned = false;  // x86 not using regs, anyways
   static constexpr bool kMultiRegistersWidened = false;
   static constexpr bool kAlignLongOnStack = false;
   static constexpr bool kAlignDoubleOnStack = false;
@@ -932,34 +933,34 @@
 #endif
 
  public:
-  explicit BuildGenericJniFrameStateMachine(T* delegate) : gpr_index_(kNumNativeGprArgs),
-                                                           fpr_index_(kNumNativeFprArgs),
-                                                           stack_entries_(0),
-                                                           delegate_(delegate) {
+  explicit BuildNativeCallFrameStateMachine(T* delegate)
+      : gpr_index_(kNumNativeGprArgs),
+        fpr_index_(kNumNativeFprArgs),
+        stack_entries_(0),
+        delegate_(delegate) {
     // For register alignment, we want to assume that counters (gpr_index_, fpr_index_) are even iff
     // the next register is even; counting down is just to make the compiler happy...
     CHECK_EQ(kNumNativeGprArgs % 2, 0U);
     CHECK_EQ(kNumNativeFprArgs % 2, 0U);
   }
 
-  virtual ~BuildGenericJniFrameStateMachine() {}
+  virtual ~BuildNativeCallFrameStateMachine() {}
 
   bool HavePointerGpr() {
     return gpr_index_ > 0;
   }
 
-  void AdvancePointer(void* val) {
+  void AdvancePointer(const void* val) {
     if (HavePointerGpr()) {
       gpr_index_--;
       PushGpr(reinterpret_cast<uintptr_t>(val));
     } else {
-      stack_entries_++;         // TODO: have a field for pointer length as multiple of 32b
+      stack_entries_++;  // TODO: have a field for pointer length as multiple of 32b
       PushStack(reinterpret_cast<uintptr_t>(val));
       gpr_index_ = 0;
     }
   }
 
-
   bool HaveHandleScopeGpr() {
     return gpr_index_ > 0;
   }
@@ -976,7 +977,6 @@
     }
   }
 
-
   bool HaveIntGpr() {
     return gpr_index_ > 0;
   }
@@ -992,7 +992,6 @@
     }
   }
 
-
   bool HaveLongGpr() {
     return gpr_index_ >= kRegistersNeededForLong + (LongGprNeedsPadding() ? 1 : 0);
   }
@@ -1039,30 +1038,22 @@
     }
   }
 
-
   bool HaveFloatFpr() {
     return fpr_index_ > 0;
   }
 
-  template <typename U, typename V> V convert(U in) {
-    CHECK_LE(sizeof(U), sizeof(V));
-    union { U u; V v; } tmp;
-    tmp.u = in;
-    return tmp.v;
-  }
-
   void AdvanceFloat(float val) {
     if (kNativeSoftFloatAbi) {
-      AdvanceInt(convert<float, uint32_t>(val));
+      AdvanceInt(bit_cast<float, uint32_t>(val));
     } else {
       if (HaveFloatFpr()) {
         fpr_index_--;
         if (kRegistersNeededForDouble == 1) {
           if (kMultiRegistersWidened) {
-            PushFpr8(convert<double, uint64_t>(val));
+            PushFpr8(bit_cast<double, uint64_t>(val));
           } else {
             // No widening, just use the bits.
-            PushFpr8(convert<float, uint64_t>(val));
+            PushFpr8(bit_cast<float, uint64_t>(val));
           }
         } else {
           PushFpr4(val);
@@ -1071,16 +1062,17 @@
         stack_entries_++;
         if (kRegistersNeededForDouble == 1 && kMultiRegistersWidened) {
           // Need to widen before storing: Note the "double" in the template instantiation.
-          PushStack(convert<double, uintptr_t>(val));
+          // Note: We need to jump through those hoops to make the compiler happy.
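+          // (bit_cast<double, uintptr_t> would trip its compile-time size
+          // check on 32-bit targets even though this branch is never taken
+          // there, so go through uint64_t and narrow afterwards.)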
+          DCHECK_EQ(sizeof(uintptr_t), sizeof(uint64_t));
+          PushStack(static_cast<uintptr_t>(bit_cast<double, uint64_t>(val)));
         } else {
-          PushStack(convert<float, uintptr_t>(val));
+          PushStack(bit_cast<float, uintptr_t>(val));
         }
         fpr_index_ = 0;
       }
     }
   }
 
-
   bool HaveDoubleFpr() {
     return fpr_index_ >= kRegistersNeededForDouble + (DoubleFprNeedsPadding() ? 1 : 0);
   }
@@ -1162,101 +1154,66 @@
   T* delegate_;             // What Push implementation gets called
 };
 
-class ComputeGenericJniFrameSize FINAL {
+// Computes the sizes of register stacks and call stack area. Handling of references can be extended
+// in subclasses.
+//
+// To handle native pointers, use "L" in the shorty for an object reference, which simulates
+// them with handles.
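+//
+// For example, a call taking an int, a reference, and a long would be walked
+// with the shorty "VILJ" (the leading character is the return type, which
+// Walk() skips).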
+class ComputeNativeCallFrameSize {
  public:
-  ComputeGenericJniFrameSize() : num_handle_scope_references_(0), num_stack_entries_(0) {}
+  ComputeNativeCallFrameSize() : num_stack_entries_(0) {}
+
+  virtual ~ComputeNativeCallFrameSize() {}
 
   uint32_t GetStackSize() {
     return num_stack_entries_ * sizeof(uintptr_t);
   }
 
-  // WARNING: After this, *sp won't be pointing to the method anymore!
-  void ComputeLayout(StackReference<mirror::ArtMethod>** m, bool is_static, const char* shorty,
-                     uint32_t shorty_len, void* sp, HandleScope** table,
-                     uint32_t* handle_scope_entries, uintptr_t** start_stack, uintptr_t** start_gpr,
-                     uint32_t** start_fpr, void** code_return, size_t* overall_size)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    ComputeAll(is_static, shorty, shorty_len);
-
-    mirror::ArtMethod* method = (*m)->AsMirrorPtr();
-
-    uint8_t* sp8 = reinterpret_cast<uint8_t*>(sp);
-
-    // First, fix up the layout of the callee-save frame.
-    // We have to squeeze in the HandleScope, and relocate the method pointer.
-
-    // "Free" the slot for the method.
-    sp8 += kPointerSize;  // In the callee-save frame we use a full pointer.
-
-    // Under the callee saves put handle scope and new method stack reference.
-    *handle_scope_entries = num_handle_scope_references_;
-
-    size_t handle_scope_size = HandleScope::SizeOf(num_handle_scope_references_);
-    size_t scope_and_method = handle_scope_size + sizeof(StackReference<mirror::ArtMethod>);
-
-    sp8 -= scope_and_method;
-    // Align by kStackAlignment.
-    sp8 = reinterpret_cast<uint8_t*>(RoundDown(reinterpret_cast<uintptr_t>(sp8), kStackAlignment));
-
-    uint8_t* sp8_table = sp8 + sizeof(StackReference<mirror::ArtMethod>);
-    *table = reinterpret_cast<HandleScope*>(sp8_table);
-    (*table)->SetNumberOfReferences(num_handle_scope_references_);
-
-    // Add a slot for the method pointer, and fill it. Fix the pointer-pointer given to us.
-    uint8_t* method_pointer = sp8;
-    StackReference<mirror::ArtMethod>* new_method_ref =
-        reinterpret_cast<StackReference<mirror::ArtMethod>*>(method_pointer);
-    new_method_ref->Assign(method);
-    *m = new_method_ref;
-
-    // Reference cookie and padding
-    sp8 -= 8;
-    // Store HandleScope size
-    *reinterpret_cast<uint32_t*>(sp8) = static_cast<uint32_t>(handle_scope_size & 0xFFFFFFFF);
-
-    // Next comes the native call stack.
+  uint8_t* LayoutCallStack(uint8_t* sp8) {
     sp8 -= GetStackSize();
     // Align by kStackAlignment.
     sp8 = reinterpret_cast<uint8_t*>(RoundDown(reinterpret_cast<uintptr_t>(sp8), kStackAlignment));
-    *start_stack = reinterpret_cast<uintptr_t*>(sp8);
-
-    // put fprs and gprs below
-    // Assumption is OK right now, as we have soft-float arm
-    size_t fregs = BuildGenericJniFrameStateMachine<ComputeGenericJniFrameSize>::kNumNativeFprArgs;
-    sp8 -= fregs * sizeof(uintptr_t);
-    *start_fpr = reinterpret_cast<uint32_t*>(sp8);
-    size_t iregs = BuildGenericJniFrameStateMachine<ComputeGenericJniFrameSize>::kNumNativeGprArgs;
-    sp8 -= iregs * sizeof(uintptr_t);
-    *start_gpr = reinterpret_cast<uintptr_t*>(sp8);
-
-    // reserve space for the code pointer
-    sp8 -= kPointerSize;
-    *code_return = reinterpret_cast<void*>(sp8);
-
-    *overall_size = reinterpret_cast<uint8_t*>(sp) - sp8;
-
-    // The new SP is stored at the end of the alloca, so it can be immediately popped
-    sp8 = reinterpret_cast<uint8_t*>(sp) - 5 * KB;
-    *(reinterpret_cast<uint8_t**>(sp8)) = method_pointer;
+    return sp8;
   }
 
-  void ComputeHandleScopeOffset() { }  // nothing to do, static right now
+  uint8_t* LayoutCallRegisterStacks(uint8_t* sp8, uintptr_t** start_gpr, uint32_t** start_fpr) {
+    // Assumption is OK right now, as we have soft-float arm
+    size_t fregs = BuildNativeCallFrameStateMachine<ComputeNativeCallFrameSize>::kNumNativeFprArgs;
+    sp8 -= fregs * sizeof(uintptr_t);
+    *start_fpr = reinterpret_cast<uint32_t*>(sp8);
+    size_t iregs = BuildNativeCallFrameStateMachine<ComputeNativeCallFrameSize>::kNumNativeGprArgs;
+    sp8 -= iregs * sizeof(uintptr_t);
+    *start_gpr = reinterpret_cast<uintptr_t*>(sp8);
+    return sp8;
+  }
 
-  void ComputeAll(bool is_static, const char* shorty, uint32_t shorty_len)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    BuildGenericJniFrameStateMachine<ComputeGenericJniFrameSize> sm(this);
+  uint8_t* LayoutNativeCall(uint8_t* sp8, uintptr_t** start_stack, uintptr_t** start_gpr,
+                            uint32_t** start_fpr) {
+    // Native call stack.
+    sp8 = LayoutCallStack(sp8);
+    *start_stack = reinterpret_cast<uintptr_t*>(sp8);
 
-    // JNIEnv
-    sm.AdvancePointer(nullptr);
+    // Put fprs and gprs below.
+    sp8 = LayoutCallRegisterStacks(sp8, start_gpr, start_fpr);
 
-    // Class object or this as first argument
-    sm.AdvanceHandleScope(reinterpret_cast<mirror::Object*>(0x12345678));
+    // Return the new bottom.
+    return sp8;
+  }
+
+  virtual void WalkHeader(BuildNativeCallFrameStateMachine<ComputeNativeCallFrameSize>* sm)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {}
+
+  void Walk(const char* shorty, uint32_t shorty_len) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    BuildNativeCallFrameStateMachine<ComputeNativeCallFrameSize> sm(this);
+
+    WalkHeader(&sm);
 
     for (uint32_t i = 1; i < shorty_len; ++i) {
       Primitive::Type cur_type_ = Primitive::GetType(shorty[i]);
       switch (cur_type_) {
         case Primitive::kPrimNot:
-          sm.AdvanceHandleScope(reinterpret_cast<mirror::Object*>(0x12345678));
+          sm.AdvanceHandleScope(
+              reinterpret_cast<mirror::Object*>(0x12345678));
           break;
 
         case Primitive::kPrimBoolean:
@@ -1299,50 +1256,135 @@
     // counting is already done in the superclass
   }
 
-  uintptr_t PushHandle(mirror::Object* /* ptr */) {
-    num_handle_scope_references_++;
+  virtual uintptr_t PushHandle(mirror::Object* /* ptr */) {
     return reinterpret_cast<uintptr_t>(nullptr);
   }
 
- private:
-  uint32_t num_handle_scope_references_;
+ protected:
   uint32_t num_stack_entries_;
 };
 
-// Visits arguments on the stack placing them into a region lower down the stack for the benefit
-// of transitioning into native code.
-class BuildGenericJniFrameVisitor FINAL : public QuickArgumentVisitor {
+class ComputeGenericJniFrameSize FINAL : public ComputeNativeCallFrameSize {
  public:
-  BuildGenericJniFrameVisitor(StackReference<mirror::ArtMethod>** sp, bool is_static,
-                              const char* shorty, uint32_t shorty_len, Thread* self) :
-      QuickArgumentVisitor(*sp, is_static, shorty, shorty_len), sm_(this) {
-    ComputeGenericJniFrameSize fsc;
-    fsc.ComputeLayout(sp, is_static, shorty, shorty_len, *sp, &handle_scope_, &handle_scope_expected_refs_,
-                      &cur_stack_arg_, &cur_gpr_reg_, &cur_fpr_reg_, &code_return_,
-                      &alloca_used_size_);
-    handle_scope_number_of_references_ = 0;
-    cur_hs_entry_ = GetFirstHandleScopeEntry();
+  ComputeGenericJniFrameSize() : num_handle_scope_references_(0) {}
 
-    // jni environment is always first argument
-    sm_.AdvancePointer(self->GetJniEnv());
+  // Lays out the callee-save frame. Assumes that the RefsAndArgs frame that still needs to be
+  // fixed up (the handle scope has not been squeezed in yet) is at *m = sp. Will update *m to
+  // point to the bottom of the save frame.
+  //
+  // Note: assumes Walk() has been run before.
+  void LayoutCalleeSaveFrame(StackReference<mirror::ArtMethod>** m, void* sp, HandleScope** table,
+                             uint32_t* handle_scope_entries)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    mirror::ArtMethod* method = (*m)->AsMirrorPtr();
 
-    if (is_static) {
-      sm_.AdvanceHandleScope((*sp)->AsMirrorPtr()->GetDeclaringClass());
-    }
+    uint8_t* sp8 = reinterpret_cast<uint8_t*>(sp);
+
+    // First, fix up the layout of the callee-save frame.
+    // We have to squeeze in the HandleScope, and relocate the method pointer.
+
+    // "Free" the slot for the method.
+    sp8 += kPointerSize;  // In the callee-save frame we use a full pointer.
+
+    // Under the callee saves put handle scope and new method stack reference.
+    *handle_scope_entries = num_handle_scope_references_;
+
+    size_t handle_scope_size = HandleScope::SizeOf(num_handle_scope_references_);
+    size_t scope_and_method = handle_scope_size + sizeof(StackReference<mirror::ArtMethod>);
+
+    sp8 -= scope_and_method;
+    // Align by kStackAlignment.
+    sp8 = reinterpret_cast<uint8_t*>(RoundDown(
+        reinterpret_cast<uintptr_t>(sp8), kStackAlignment));
+
+    uint8_t* sp8_table = sp8 + sizeof(StackReference<mirror::ArtMethod>);
+    *table = reinterpret_cast<HandleScope*>(sp8_table);
+    (*table)->SetNumberOfReferences(num_handle_scope_references_);
+
+    // Add a slot for the method pointer, and fill it. Fix the pointer-pointer given to us.
+    uint8_t* method_pointer = sp8;
+    StackReference<mirror::ArtMethod>* new_method_ref =
+        reinterpret_cast<StackReference<mirror::ArtMethod>*>(method_pointer);
+    new_method_ref->Assign(method);
+    *m = new_method_ref;
   }
 
-  void Visit() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) OVERRIDE;
-
-  void FinalizeHandleScope(Thread* self) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-
-  StackReference<mirror::Object>* GetFirstHandleScopeEntry()
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    return handle_scope_->GetHandle(0).GetReference();
+  // Adds space for the cookie. Note: may leave stack unaligned.
+  void LayoutCookie(uint8_t** sp) {
+    // Reference cookie and padding
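+    // 4 bytes hold the saved JNI reference cookie (read back as *(sp32 - 1)
+    // in the end trampoline); the other 4 bytes are padding.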
+    *sp -= 8;
   }
 
-  jobject GetFirstHandleScopeJObject()
+  // Re-layout the callee-save frame (insert a handle-scope). Then add space for the cookie.
+  // Returns the new bottom. Note: this may be unaligned.
+  uint8_t* LayoutJNISaveFrame(StackReference<mirror::ArtMethod>** m, void* sp, HandleScope** table,
+                              uint32_t* handle_scope_entries)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    return handle_scope_->GetHandle(0).ToJObject();
+    // First, fix up the layout of the callee-save frame.
+    // We have to squeeze in the HandleScope, and relocate the method pointer.
+    LayoutCalleeSaveFrame(m, sp, table, handle_scope_entries);
+
+    // The bottom of the callee-save frame is now where the method is, *m.
+    uint8_t* sp8 = reinterpret_cast<uint8_t*>(*m);
+
+    // Add space for cookie.
+    LayoutCookie(&sp8);
+
+    return sp8;
+  }
+
+  // WARNING: After this, *sp won't be pointing to the method anymore!
+  uint8_t* ComputeLayout(StackReference<mirror::ArtMethod>** m, bool is_static, const char* shorty,
+                         uint32_t shorty_len, HandleScope** table, uint32_t* handle_scope_entries,
+                         uintptr_t** start_stack, uintptr_t** start_gpr, uint32_t** start_fpr)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    Walk(shorty, shorty_len);
+
+    // JNI part.
+    uint8_t* sp8 = LayoutJNISaveFrame(m, reinterpret_cast<void*>(*m), table, handle_scope_entries);
+
+    sp8 = LayoutNativeCall(sp8, start_stack, start_gpr, start_fpr);
+
+    // Return the new bottom.
+    return sp8;
+  }
+
+  uintptr_t PushHandle(mirror::Object* /* ptr */) OVERRIDE;
+
+  // Add JNIEnv* and jobj/jclass before the shorty-derived elements.
+  void WalkHeader(BuildNativeCallFrameStateMachine<ComputeNativeCallFrameSize>* sm) OVERRIDE
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
+ private:
+  uint32_t num_handle_scope_references_;
+};
+
+uintptr_t ComputeGenericJniFrameSize::PushHandle(mirror::Object* /* ptr */) {
+  num_handle_scope_references_++;
+  return reinterpret_cast<uintptr_t>(nullptr);
+}
+
+void ComputeGenericJniFrameSize::WalkHeader(
+    BuildNativeCallFrameStateMachine<ComputeNativeCallFrameSize>* sm) {
+  // JNIEnv
+  sm->AdvancePointer(nullptr);
+
+  // Class object or this as first argument
+  sm->AdvanceHandleScope(reinterpret_cast<mirror::Object*>(0x12345678));
+}
+
+// Class to push values to three separate regions. Used to fill the native call part. Adheres to
+// the template requirements of BuildNativeCallFrameStateMachine.
+class FillNativeCall {
+ public:
+  FillNativeCall(uintptr_t* gpr_regs, uint32_t* fpr_regs, uintptr_t* stack_args) :
+      cur_gpr_reg_(gpr_regs), cur_fpr_reg_(fpr_regs), cur_stack_arg_(stack_args) {}
+
+  virtual ~FillNativeCall() {}
+
+  void Reset(uintptr_t* gpr_regs, uint32_t* fpr_regs, uintptr_t* stack_args) {
+    cur_gpr_reg_ = gpr_regs;
+    cur_fpr_reg_ = fpr_regs;
+    cur_stack_arg_ = stack_args;
   }
 
   void PushGpr(uintptr_t val) {
@@ -1366,46 +1408,110 @@
     cur_stack_arg_++;
   }
 
-  uintptr_t PushHandle(mirror::Object* ref) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    uintptr_t tmp;
-    if (ref == nullptr) {
-      *cur_hs_entry_ = StackReference<mirror::Object>();
-      tmp = reinterpret_cast<uintptr_t>(nullptr);
-    } else {
-      *cur_hs_entry_ = StackReference<mirror::Object>::FromMirrorPtr(ref);
-      tmp = reinterpret_cast<uintptr_t>(cur_hs_entry_);
-    }
-    cur_hs_entry_++;
-    handle_scope_number_of_references_++;
-    return tmp;
-  }
-
-  // Size of the part of the alloca that we actually need.
-  size_t GetAllocaUsedSize() {
-    return alloca_used_size_;
-  }
-
-  void* GetCodeReturn() {
-    return code_return_;
+  virtual uintptr_t PushHandle(mirror::Object* ref) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    LOG(FATAL) << "(Non-JNI) Native call does not use handles.";
+    return 0U;
   }
 
  private:
-  uint32_t handle_scope_number_of_references_;
-  StackReference<mirror::Object>* cur_hs_entry_;
-  HandleScope* handle_scope_;
-  uint32_t handle_scope_expected_refs_;
   uintptr_t* cur_gpr_reg_;
   uint32_t* cur_fpr_reg_;
   uintptr_t* cur_stack_arg_;
-  // StackReference<mirror::Object>* top_of_handle_scope_;
-  void* code_return_;
-  size_t alloca_used_size_;
+};
 
-  BuildGenericJniFrameStateMachine<BuildGenericJniFrameVisitor> sm_;
+// Visits arguments on the stack placing them into a region lower down the stack for the benefit
+// of transitioning into native code.
+class BuildGenericJniFrameVisitor FINAL : public QuickArgumentVisitor {
+ public:
+  BuildGenericJniFrameVisitor(StackReference<mirror::ArtMethod>** sp, bool is_static,
+                              const char* shorty, uint32_t shorty_len, Thread* self)
+     : QuickArgumentVisitor(*sp, is_static, shorty, shorty_len),
+       jni_call_(nullptr, nullptr, nullptr, nullptr), sm_(&jni_call_) {
+    ComputeGenericJniFrameSize fsc;
+    uintptr_t* start_gpr_reg;
+    uint32_t* start_fpr_reg;
+    uintptr_t* start_stack_arg;
+    uint32_t handle_scope_entries;
+    bottom_of_used_area_ = fsc.ComputeLayout(sp, is_static, shorty, shorty_len, &handle_scope_,
+                                             &handle_scope_entries, &start_stack_arg,
+                                             &start_gpr_reg, &start_fpr_reg);
+
+    handle_scope_->SetNumberOfReferences(handle_scope_entries);
+    jni_call_.Reset(start_gpr_reg, start_fpr_reg, start_stack_arg, handle_scope_);
+
+    // jni environment is always first argument
+    sm_.AdvancePointer(self->GetJniEnv());
+
+    if (is_static) {
+      sm_.AdvanceHandleScope((*sp)->AsMirrorPtr()->GetDeclaringClass());
+    }
+  }
+
+  void Visit() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) OVERRIDE;
+
+  void FinalizeHandleScope(Thread* self) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
+  StackReference<mirror::Object>* GetFirstHandleScopeEntry()
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    return handle_scope_->GetHandle(0).GetReference();
+  }
+
+  jobject GetFirstHandleScopeJObject() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    return handle_scope_->GetHandle(0).ToJObject();
+  }
+
+  void* GetBottomOfUsedArea() {
+    return bottom_of_used_area_;
+  }
+
+ private:
+  // A class to fill a JNI call. Adds reference/handle-scope management to FillNativeCall.
+  class FillJniCall FINAL : public FillNativeCall {
+   public:
+    FillJniCall(uintptr_t* gpr_regs, uint32_t* fpr_regs, uintptr_t* stack_args,
+                HandleScope* handle_scope) : FillNativeCall(gpr_regs, fpr_regs, stack_args),
+                                             handle_scope_(handle_scope), cur_entry_(0) {}
+
+    uintptr_t PushHandle(mirror::Object* ref) OVERRIDE SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
+    void Reset(uintptr_t* gpr_regs, uint32_t* fpr_regs, uintptr_t* stack_args, HandleScope* scope) {
+      FillNativeCall::Reset(gpr_regs, fpr_regs, stack_args);
+      handle_scope_ = scope;
+      cur_entry_ = 0U;
+    }
+
+    void ResetRemainingScopeSlots() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+      // Initialize padding entries.
+      size_t expected_slots = handle_scope_->NumberOfReferences();
+      while (cur_entry_ < expected_slots) {
+        handle_scope_->GetHandle(cur_entry_++).Assign(nullptr);
+      }
+      DCHECK_NE(cur_entry_, 0U);
+    }
+
+   private:
+    HandleScope* handle_scope_;
+    size_t cur_entry_;
+  };
+
+  HandleScope* handle_scope_;
+  FillJniCall jni_call_;
+  void* bottom_of_used_area_;
+
+  BuildNativeCallFrameStateMachine<FillJniCall> sm_;
 
   DISALLOW_COPY_AND_ASSIGN(BuildGenericJniFrameVisitor);
 };
 
+uintptr_t BuildGenericJniFrameVisitor::FillJniCall::PushHandle(mirror::Object* ref) {
+  uintptr_t tmp;
+  Handle<mirror::Object> h = handle_scope_->GetHandle(cur_entry_);
+  h.Assign(ref);
+  tmp = reinterpret_cast<uintptr_t>(h.ToJObject());
+  cur_entry_++;
+  return tmp;
+}
+
 void BuildGenericJniFrameVisitor::Visit() {
   Primitive::Type type = GetParamPrimitiveType();
   switch (type) {
@@ -1453,14 +1559,8 @@
 }
 
 void BuildGenericJniFrameVisitor::FinalizeHandleScope(Thread* self) {
-  // Initialize padding entries.
-  while (handle_scope_number_of_references_ < handle_scope_expected_refs_) {
-    *cur_hs_entry_ = StackReference<mirror::Object>();
-    cur_hs_entry_++;
-    handle_scope_number_of_references_++;
-  }
-  handle_scope_->SetNumberOfReferences(handle_scope_expected_refs_);
-  DCHECK_NE(handle_scope_expected_refs_, 0U);
+  // Clear out rest of the scope.
+  jni_call_.ResetRemainingScopeSlots();
   // Install HandleScope.
   self->PushHandleScope(handle_scope_);
 }
@@ -1495,19 +1595,20 @@
- * 1) How many bytes of the alloca can be released, if the value is non-negative.
- * 2) An error, if the value is negative.
+ * 1) The native code pointer and the bottom of the used alloca area, on success.
+ * 2) An error value (see GetTwoWordFailureValue) if something went wrong.
  */
-extern "C" ssize_t artQuickGenericJniTrampoline(Thread* self, StackReference<mirror::ArtMethod>* sp)
+extern "C" TwoWordReturn artQuickGenericJniTrampoline(Thread* self,
+                                                      StackReference<mirror::ArtMethod>* sp)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
   mirror::ArtMethod* called = sp->AsMirrorPtr();
   DCHECK(called->IsNative()) << PrettyMethod(called, true);
-
-  // run the visitor
   uint32_t shorty_len = 0;
   const char* shorty = called->GetShorty(&shorty_len);
+
+  // Run the visitor.
   BuildGenericJniFrameVisitor visitor(&sp, called->IsStatic(), shorty, shorty_len, self);
   visitor.VisitArguments();
   visitor.FinalizeHandleScope(self);
 
-  // fix up managed-stack things in Thread
+  // Fix up managed-stack things in Thread.
   self->SetTopOfStack(sp, 0);
 
   self->VerifyStack();
@@ -1519,7 +1620,7 @@
     if (self->IsExceptionPending()) {
       self->PopHandleScope();
-      // A negative value denotes an error.
+      // Report the pending exception as an error to the stub.
-      return -1;
+      return GetTwoWordFailureValue();
     }
   } else {
     cookie = JniMethodStart(self);
@@ -1550,36 +1651,31 @@
         artQuickGenericJniEndJNINonRef(self, cookie, lock);
       }
 
-      return -1;
+      return GetTwoWordFailureValue();
     }
     // Note that the native code pointer will be automatically set by artFindNativeMethod().
   }
 
-  // Store the native code pointer in the stack at the right location.
-  uintptr_t* code_pointer = reinterpret_cast<uintptr_t*>(visitor.GetCodeReturn());
-  *code_pointer = reinterpret_cast<uintptr_t>(nativeCode);
-
-  // 5K reserved, window_size + frame pointer used.
-  size_t window_size = visitor.GetAllocaUsedSize();
-  return (5 * KB) - window_size - kPointerSize;
+  // Return native code addr(lo) and bottom of alloca address(hi).
+  return GetTwoWordSuccessValue(reinterpret_cast<uintptr_t>(visitor.GetBottomOfUsedArea()),
+                                reinterpret_cast<uintptr_t>(nativeCode));
 }
 
 /*
  * Is called after the native JNI code. Responsible for cleanup (handle scope, saved state) and
  * unlocking.
  */
-extern "C" uint64_t artQuickGenericJniEndTrampoline(Thread* self,
-                                                    StackReference<mirror::ArtMethod>* sp,
-                                                    jvalue result, uint64_t result_f)
+extern "C" uint64_t artQuickGenericJniEndTrampoline(Thread* self, jvalue result, uint64_t result_f)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+  StackReference<mirror::ArtMethod>* sp = self->GetManagedStack()->GetTopQuickFrame();
   uint32_t* sp32 = reinterpret_cast<uint32_t*>(sp);
   mirror::ArtMethod* called = sp->AsMirrorPtr();
   uint32_t cookie = *(sp32 - 1);
 
   jobject lock = nullptr;
   if (called->IsSynchronized()) {
-    HandleScope* table = reinterpret_cast<HandleScope*>(
-        reinterpret_cast<uint8_t*>(sp) + sizeof(StackReference<mirror::ArtMethod>));
+    HandleScope* table = reinterpret_cast<HandleScope*>(reinterpret_cast<uint8_t*>(sp)
+        + sizeof(StackReference<mirror::ArtMethod>));
     lock = table->GetHandle(0).ToJObject();
   }
 
@@ -1636,8 +1732,7 @@
     FinishCalleeSaveFrameSetup(self, sp, Runtime::kRefsAndArgs);
     const DexFile* dex_file = caller_method->GetDeclaringClass()->GetDexCache()->GetDexFile();
     uint32_t shorty_len;
-    const char* shorty =
-        dex_file->GetMethodShorty(dex_file->GetMethodId(method_idx), &shorty_len);
+    const char* shorty = dex_file->GetMethodShorty(dex_file->GetMethodId(method_idx), &shorty_len);
     {
       // Remember the args in case a GC happens in FindMethodFromCode.
       ScopedObjectAccessUnchecked soa(self->GetJniEnv());
@@ -1657,8 +1752,9 @@
   const void* code = method->GetEntryPointFromQuickCompiledCode();
 
   // When we return, the caller will branch to this address, so it had better not be 0!
-  DCHECK(code != nullptr) << "Code was NULL in method: " << PrettyMethod(method) << " location: "
-      << method->GetDexFile()->GetLocation();
+  DCHECK(code != nullptr) << "Code was NULL in method: " << PrettyMethod(method)
+                          << " location: "
+                          << method->GetDexFile()->GetLocation();
 
   return GetTwoWordSuccessValue(reinterpret_cast<uintptr_t>(code),
                                 reinterpret_cast<uintptr_t>(method));
@@ -1685,47 +1781,50 @@
 EXPLICIT_INVOKE_COMMON_TEMPLATE_DECL(kSuper, true);
 #undef EXPLICIT_INVOKE_COMMON_TEMPLATE_DECL
 
-
 // See comments in runtime_support_asm.S
-extern "C" TwoWordReturn artInvokeInterfaceTrampolineWithAccessCheck(uint32_t method_idx,
-    mirror::Object* this_object,
-    mirror::ArtMethod* caller_method,
-    Thread* self,
-    StackReference<mirror::ArtMethod>* sp) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-  return artInvokeCommon<kInterface, true>(method_idx, this_object, caller_method, self, sp);
+extern "C" TwoWordReturn artInvokeInterfaceTrampolineWithAccessCheck(
+    uint32_t method_idx, mirror::Object* this_object,
+    mirror::ArtMethod* caller_method, Thread* self,
+    StackReference<mirror::ArtMethod>* sp)
+        SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+  return artInvokeCommon<kInterface, true>(method_idx, this_object,
+                                           caller_method, self, sp);
 }
 
-
-extern "C" TwoWordReturn artInvokeDirectTrampolineWithAccessCheck(uint32_t method_idx,
-    mirror::Object* this_object,
-    mirror::ArtMethod* caller_method,
-    Thread* self,
-    StackReference<mirror::ArtMethod>* sp) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-  return artInvokeCommon<kDirect, true>(method_idx, this_object, caller_method, self, sp);
+extern "C" TwoWordReturn artInvokeDirectTrampolineWithAccessCheck(
+    uint32_t method_idx, mirror::Object* this_object,
+    mirror::ArtMethod* caller_method, Thread* self,
+    StackReference<mirror::ArtMethod>* sp)
+        SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+  return artInvokeCommon<kDirect, true>(method_idx, this_object, caller_method,
+                                        self, sp);
 }
 
-extern "C" TwoWordReturn artInvokeStaticTrampolineWithAccessCheck(uint32_t method_idx,
-    mirror::Object* this_object,
-    mirror::ArtMethod* caller_method,
-    Thread* self,
-    StackReference<mirror::ArtMethod>* sp) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-  return artInvokeCommon<kStatic, true>(method_idx, this_object, caller_method, self, sp);
+extern "C" TwoWordReturn artInvokeStaticTrampolineWithAccessCheck(
+    uint32_t method_idx, mirror::Object* this_object,
+    mirror::ArtMethod* caller_method, Thread* self,
+    StackReference<mirror::ArtMethod>* sp)
+        SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+  return artInvokeCommon<kStatic, true>(method_idx, this_object, caller_method,
+                                        self, sp);
 }
 
-extern "C" TwoWordReturn artInvokeSuperTrampolineWithAccessCheck(uint32_t method_idx,
-    mirror::Object* this_object,
-    mirror::ArtMethod* caller_method,
-    Thread* self,
-    StackReference<mirror::ArtMethod>* sp) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-  return artInvokeCommon<kSuper, true>(method_idx, this_object, caller_method, self, sp);
+extern "C" TwoWordReturn artInvokeSuperTrampolineWithAccessCheck(
+    uint32_t method_idx, mirror::Object* this_object,
+    mirror::ArtMethod* caller_method, Thread* self,
+    StackReference<mirror::ArtMethod>* sp)
+        SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+  return artInvokeCommon<kSuper, true>(method_idx, this_object, caller_method,
+                                       self, sp);
 }
 
-extern "C" TwoWordReturn artInvokeVirtualTrampolineWithAccessCheck(uint32_t method_idx,
-    mirror::Object* this_object,
-    mirror::ArtMethod* caller_method,
-    Thread* self,
-    StackReference<mirror::ArtMethod>* sp) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-  return artInvokeCommon<kVirtual, true>(method_idx, this_object, caller_method, self, sp);
+extern "C" TwoWordReturn artInvokeVirtualTrampolineWithAccessCheck(
+    uint32_t method_idx, mirror::Object* this_object,
+    mirror::ArtMethod* caller_method, Thread* self,
+    StackReference<mirror::ArtMethod>* sp)
+        SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+  return artInvokeCommon<kVirtual, true>(method_idx, this_object, caller_method,
+                                         self, sp);
 }
 
 // Determine target of interface dispatch. This object is known non-null.
@@ -1769,10 +1868,11 @@
       dex_method_idx = instr->VRegB_3rc();
     }
 
-    const DexFile* dex_file = caller_method->GetDeclaringClass()->GetDexCache()->GetDexFile();
+    const DexFile* dex_file = caller_method->GetDeclaringClass()->GetDexCache()
+        ->GetDexFile();
     uint32_t shorty_len;
-    const char* shorty =
-        dex_file->GetMethodShorty(dex_file->GetMethodId(dex_method_idx), &shorty_len);
+    const char* shorty = dex_file->GetMethodShorty(dex_file->GetMethodId(dex_method_idx),
+                                                   &shorty_len);
     {
       // Remember the args in case a GC happens in FindMethodFromCode.
       ScopedObjectAccessUnchecked soa(self->GetJniEnv());
@@ -1791,8 +1891,8 @@
   const void* code = method->GetEntryPointFromQuickCompiledCode();
 
   // When we return, the caller will branch to this address, so it had better not be 0!
-  DCHECK(code != nullptr) << "Code was NULL in method: " << PrettyMethod(method) << " location: "
-      << method->GetDexFile()->GetLocation();
+  DCHECK(code != nullptr) << "Code was NULL in method: " << PrettyMethod(method)
+                          << " location: " << method->GetDexFile()->GetLocation();
 
   return GetTwoWordSuccessValue(reinterpret_cast<uintptr_t>(code),
                                 reinterpret_cast<uintptr_t>(method));
diff --git a/runtime/utils.h b/runtime/utils.h
index 68ea475..eb79968 100644
--- a/runtime/utils.h
+++ b/runtime/utils.h
@@ -203,6 +203,19 @@
   return (ch < ' ' || ch > '~');
 }
 
+// Interpret the bit pattern of input (type U) as type V. Requires the size
+// of V >= size of U (compile-time checked).
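+//
+// For example, bit_cast<float, uint32_t>(1.0f) yields 0x3f800000, and
+// bit_cast<uint32_t, float>(0x3f800000U) yields 1.0f.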
+template<typename U, typename V>
+static inline V bit_cast(U in) {
+  COMPILE_ASSERT(sizeof(U) <= sizeof(V), size_of_u_not_le_size_of_v);
+  union {
+    U u;
+    V v;
+  } tmp;
+  tmp.u = in;
+  return tmp.v;
+}
+
 std::string PrintableChar(uint16_t ch);
 
 // Returns an ASCII string corresponding to the given UTF-8 string.