Rewrite GenericJNI frame setup.

Move the handle scope out of the managed frame, move the
register values to be loaded by the stub to the bottom of
the reserved area, and pass the hidden argument to
@CriticalNative methods, to prepare for implementing late
lookup.
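
For orientation, a minimal C++ sketch (illustrative only, not part of
the patch) of the slot order the stubs now expect at the bottom of the
reserved area, assuming the arm64 register counts (8 GPR args, 8 FPR
args); the authoritative layout is ComputeGenericJniFrameSize in
quick_trampoline_entrypoints.cc:

    #include <cstdint>

    // Illustrative only: the slots read by the stub after
    // artQuickGenericJniTrampoline() returns, ordered from the lowest
    // address, i.e. from the stub's SP at the time of the call.
    struct GenericJniReservedAreaBottom {
      uintptr_t gpr_args[8];   // Loaded into x0-x7.
      uint64_t  fpr_args[8];   // Loaded into d0-d7.
      uintptr_t hidden_arg;    // Tagged ArtMethod* for @CriticalNative, dummy otherwise.
      uintptr_t out_args_sp;   // SP for the native call; points at the stack args area.
      // Free scratch, stack args (if any), handle scope and local ref cookie
      // follow above this, up to the managed frame SP.
    };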

Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing --interp-ac
Test: aosp_taimen-userdebug boots.
Test: run-gtests.sh
Test: testrunner.py --target --optimizing --interp-ac
Bug: 112189621
Change-Id: I4672176f9627bcbebafebb3dda0d02b8108e1329
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index cd27a57..9eee345 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -1707,41 +1707,44 @@
     sub sp, sp, #5120
 
     // prepare for artQuickGenericJniTrampoline call
-    // (Thread*,  SP)
-    //    r0      r1   <= C calling convention
-    //  rSELF     r10  <= where they are
+    // (Thread*, managed_sp, reserved_area)
+    //    r0         r1            r2   <= C calling convention
+    //  rSELF       r10            sp   <= where they are
 
     mov r0, rSELF   // Thread*
-    mov r1, r10
-    blx artQuickGenericJniTrampoline  // (Thread*, sp)
+    mov r1, r10     // SP for the managed frame.
+    mov r2, sp      // reserved area for arguments and other saved data (up to managed frame)
+    blx artQuickGenericJniTrampoline  // (Thread*, managed_sp, reserved_area)
 
     // The C call will have registered the complete save-frame on success.
     // The result of the call is:
-    // r0: pointer to native code, 0 on error.
-    // r1: pointer to the bottom of the used area of the alloca, can restore stack till there.
+    //     r0: pointer to native code, 0 on error.
+    //     The bottom of the reserved area contains values for arg registers,
+    //     hidden arg register and SP for out args for the call.
 
-    // Check for error (locking can throw for synchronized native method).
+    // Check for error (class init check or locking for synchronized native method can throw).
     cbz r0, .Lexception_in_native
 
-    // Release part of the alloca.
-    mov sp, r1
-
     // Save the code pointer
-    mov r12, r0
+    mov lr, r0
 
-    // Load parameters from frame into registers.
-    pop {r0-r3}
+    // Load parameters from frame into registers r0-r3 (soft-float),
+    // hidden arg (r4) for @CriticalNative and SP for out args.
+    pop {r0-r3, r4, ip}
+
+    // Apply the new SP for out args, releasing unneeded reserved area.
+    mov sp, ip
 
     // Softfloat.
     // TODO: Change to hardfloat when supported.
 
-    blx r12           // native call.
+    blx lr            // native call.
 
     // result sign extension is handled in C code
     // prepare for artQuickGenericJniEndTrampoline call
     // (Thread*, result, result_f)
     //    r0      r2,r3    stack       <= C calling convention
-    //    r11     r0,r1    r0,r1          <= where they are
+    //    r11     r0,r1    r0,r1       <= where they are
     sub sp, sp, #8 // Stack alignment.
 
     push {r0-r1}
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index 078494a..7260700 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -1887,7 +1887,6 @@
  * | X22               |    callee save
  * | X21               |    callee save
  * | X20               |    callee save
- * | X19               |    callee save
  * | X7                |    arg7
  * | X6                |    arg6
  * | X5                |    arg5
@@ -1903,27 +1902,30 @@
  * | D2                |    float arg 3
  * | D1                |    float arg 2
  * | D0                |    float arg 1
- * | Method*           | <- X0
+ * | padding           | // 8B
+ * | Method*           | <- X0 (Managed frame similar to SaveRefsAndArgs.)
  * #-------------------#
  * | local ref cookie  | // 4B
- * | handle scope size | // 4B
+ * | padding           | // 0B or 4B to align handle scope on 8B address
+ * | handle scope      | // Size depends on number of references; multiple of 4B.
  * #-------------------#
- * | JNI Call Stack    |
- * #-------------------#    <--- SP on native call
+ * | JNI Stack Args    | // Empty if all args fit into registers x0-x7, d0-d7.
+ * #-------------------#    <--- SP on native call (1)
+ * | Free scratch      |
+ * #-------------------#
+ * | SP for JNI call   | // Pointer to (1).
+ * #-------------------#
+ * | Hidden arg        | // For @CriticalNative
+ * #-------------------#
  * |                   |
  * | Stack for Regs    |    The trampoline assembly will pop these values
  * |                   |    into registers for native call
  * #-------------------#
- * | Native code ptr   |
- * #-------------------#
- * | Free scratch      |
- * #-------------------#
- * | Ptr to (1)        |    <--- SP
- * #-------------------#
  */
     /*
      * Called to do a generic JNI down-call
      */
+    .extern artQuickGenericJniTrampoline
 ENTRY art_quick_generic_jni_trampoline
     SETUP_SAVE_REFS_AND_ARGS_FRAME_WITH_METHOD_IN_X0
 
@@ -1939,31 +1941,28 @@
     sub sp, sp, xIP0
 
     // prepare for artQuickGenericJniTrampoline call
-    // (Thread*,  SP)
-    //    x0      x1   <= C calling convention
-    //   xSELF    xFP  <= where they are
+    // (Thread*, managed_sp, reserved_area)
+    //    x0         x1            x2   <= C calling convention
+    //  xSELF       xFP            sp   <= where they are
 
     mov x0, xSELF   // Thread*
-    mov x1, xFP
+    mov x1, xFP     // SP for the managed frame.
+    mov x2, sp      // reserved area for arguments and other saved data (up to managed frame)
-    bl artQuickGenericJniTrampoline  // (Thread*, sp)
+    bl artQuickGenericJniTrampoline  // (Thread*, managed_sp, reserved_area)
 
     // The C call will have registered the complete save-frame on success.
     // The result of the call is:
-    // x0: pointer to native code, 0 on error.
-    // x1: pointer to the bottom of the used area of the alloca, can restore stack till there.
+    //     x0: pointer to native code, 0 on error.
+    //     The bottom of the reserved area contains values for arg registers,
+    //     hidden arg register and SP for out args for the call.
 
-    // Check for error (locking can throw for synchronized native method).
+    // Check for error (class init check or locking for synchronized native method can throw).
     cbz x0, .Lexception_in_native
 
-    // Release part of the alloca.
-    mov sp, x1
-
     // Save the code pointer
     mov xIP0, x0
 
     // Load parameters from frame into registers.
-    // TODO Check with artQuickGenericJniTrampoline.
-    //      Also, check again APPCS64 - the stack arguments are interleaved.
     ldp x0, x1, [sp]
     ldp x2, x3, [sp, #16]
     ldp x4, x5, [sp, #32]
@@ -1974,7 +1973,11 @@
     ldp d4, d5, [sp, #96]
     ldp d6, d7, [sp, #112]
 
-    add sp, sp, #128
+    // Load hidden arg (x15) for @CriticalNative and SP for out args.
+    ldp x15, xIP1, [sp, #128]
+
+    // Apply the new SP for out args, releasing unneeded reserved area.
+    mov sp, xIP1
 
     blr xIP0        // native call.
 
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index 7533cf8..7d2a7e6 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -1919,30 +1919,38 @@
     CFI_DEF_CFA_REGISTER(ebp)
     subl LITERAL(5120), %esp
     // prepare for artQuickGenericJniTrampoline call
-    // (Thread*,  SP)
-    //  (esp)    4(esp)   <= C calling convention
-    //  fs:...    ebp     <= where they are
+    // (Thread*, managed_sp, reserved_area)
+    //   (esp)    4(esp)        8(esp)  <= C calling convention
+    //  fs:...      ebp           esp   <= where they are
 
-    subl LITERAL(8), %esp         // Padding for 16B alignment.
-    pushl %ebp                    // Pass SP (to ArtMethod).
+    movl %esp, %eax
+    subl LITERAL(4), %esp         // Padding for 16B alignment.
+    pushl %eax                    // Pass reserved area.
+    pushl %ebp                    // Pass managed frame SP.
     pushl %fs:THREAD_SELF_OFFSET  // Pass Thread::Current().
-    call SYMBOL(artQuickGenericJniTrampoline)  // (Thread*, sp)
+    call SYMBOL(artQuickGenericJniTrampoline)  // (Thread*, managed_sp, reserved_area)
 
     // The C call will have registered the complete save-frame on success.
     // The result of the call is:
-    // eax: pointer to native code, 0 on error.
-    // edx: pointer to the bottom of the used area of the alloca, can restore stack till there.
+    //     eax: pointer to native code, 0 on error.
+    //     The bottom of the reserved area contains values for arg registers,
+    //     hidden arg register and SP for out args for the call.
 
-    // Check for error (locking can throw for synchronized native method).
+    // Check for error (class init check or locking for synchronized native method can throw).
     test %eax, %eax
     jz .Lexception_in_native
 
-    // Release part of the alloca.
-    movl %edx, %esp
+    // On x86 there are no registers passed, so no native call args to pop here.
 
-    // On x86 there are no registers passed, so nothing to pop here.
+    // Save code pointer in EDX.
+    movl %eax, %edx
+    // Load hidden arg (EAX) for @CriticalNative.
+    movl 16(%esp), %eax
+    // Load SP for out args, releasing unneeded reserved area.
+    movl 20(%esp), %esp
+
     // Native call.
-    call *%eax
+    call *%edx
 
     // result sign extension is handled in C code
     // prepare for artQuickGenericJniEndTrampoline call
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 56961d7..c2f87b2 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -1680,28 +1680,27 @@
  * #-------------------#
  * | caller method...  |
  * #-------------------#
- * | Return            |
- * | Callee-Save Data  |
- * #-------------------#
- * | handle scope      |
- * #-------------------#
+ * | Return PC         |
+ * | Callee-Saves      |
+ * | padding           | // 8B
  * | Method*           |    <--- (1)
  * #-------------------#
  * | local ref cookie  | // 4B
- * | handle scope size | // 4B   TODO: roll into call stack alignment?
+ * | padding           | // 0B or 4B to align handle scope on 8B address
+ * | handle scope      | // Size depends on number of references; multiple of 4B.
  * #-------------------#
- * | JNI Call Stack    |
- * #-------------------#    <--- SP on native call
+ * | JNI Stack Args    | // Empty if all args fit into registers.
+ * #-------------------#    <--- SP on native call (1)
+ * | Free scratch      |
+ * #-------------------#
+ * | SP for JNI call   | // Pointer to (1).
+ * #-------------------#
+ * | Hidden arg        | // For @CriticalNative
+ * #-------------------#
  * |                   |
  * | Stack for Regs    |    The trampoline assembly will pop these values
  * |                   |    into registers for native call
  * #-------------------#
- * | Native code ptr   |
- * #-------------------#
- * | Free scratch      |
- * #-------------------#
- * | Ptr to (1)        |    <--- RSP
- * #-------------------#
  */
     /*
      * Called to do a generic JNI down-call
@@ -1731,25 +1730,24 @@
     // 5k = 5120
     subq LITERAL(5120), %rsp
     // prepare for artQuickGenericJniTrampoline call
-    // (Thread*,  SP)
-    //    rdi    rsi      <= C calling convention
-    //  gs:...   rbp      <= where they are
-    movq %gs:THREAD_SELF_OFFSET, %rdi
-    movq %rbp, %rsi
+    // (Thread*, managed_sp, reserved_area)
+    //    rdi       rsi           rdx   <= C calling convention
+    //  gs:...      rbp           rsp   <= where they are
+    movq %gs:THREAD_SELF_OFFSET, %rdi  // Pass Thread::Current().
+    movq %rbp, %rsi                    // Pass managed frame SP.
+    movq %rsp, %rdx                    // Pass reserved area.
-    call SYMBOL(artQuickGenericJniTrampoline)  // (Thread*, sp)
+    call SYMBOL(artQuickGenericJniTrampoline)  // (Thread*, managed_sp, reserved_area)
 
     // The C call will have registered the complete save-frame on success.
     // The result of the call is:
-    // %rax: pointer to native code, 0 on error.
-    // %rdx: pointer to the bottom of the used area of the alloca, can restore stack till there.
+    //     %rax: pointer to native code, 0 on error.
+    //     The bottom of the reserved area contains values for arg registers,
+    //     hidden arg register and SP for out args for the call.
 
-    // Check for error (locking can throw for synchronized native method).
+    // Check for error (class init check or locking for synchronized native method can throw).
     test %rax, %rax
     jz .Lexception_in_native
 
-    // Release part of the alloca.
-    movq %rdx, %rsp
-
     // pop from the register-passing alloca region
     // what's the right layout?
     popq %rdi
@@ -1767,7 +1765,11 @@
     movq 40(%rsp), %xmm5
     movq 48(%rsp), %xmm6
     movq 56(%rsp), %xmm7
-    addq LITERAL(64), %rsp          // floating-point done
+
+    // Load hidden arg (r11) for @CriticalNative.
+    movq 64(%rsp), %r11
+    // Load SP for out args, releasing unneeded reserved area.
+    movq 72(%rsp), %rsp
 
     // native call
     call *%rax
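
Across all four architectures the stub-side sequence after the trampoline
returns is the same; a hedged C-like restatement (the helper names below are
invented stand-ins for the pop/ldp/mov sequences in the assembly above, not
real functions):

    // Pseudocode only; not part of the patch.
    const void* code = artQuickGenericJniTrampoline(self, managed_sp, reserved_area);
    if (code == nullptr) {
      DeliverPendingException();       // Class init check or locking threw.
    }
    LoadArgRegisters(reserved_area);   // Arg GPRs and FPRs, if the ABI uses any.
    LoadHiddenArg(reserved_area);      // r4 / x15 / eax / r11 for @CriticalNative.
    sp = LoadOutArgsSp(reserved_area); // Releases the unneeded reserved area.
    CallNativeCode(code);              // Out args, if any, are already at `sp`.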
diff --git a/runtime/entrypoints/entrypoint_utils-inl.h b/runtime/entrypoints/entrypoint_utils-inl.h
index cf93150..a31be00 100644
--- a/runtime/entrypoints/entrypoint_utils-inl.h
+++ b/runtime/entrypoints/entrypoint_utils-inl.h
@@ -759,6 +759,14 @@
   return method->IsStatic() && !method->IsConstructor();
 }
 
+inline HandleScope* GetGenericJniHandleScope(ArtMethod** managed_sp,
+                                             size_t num_handle_scope_references) {
+  // The HandleScope is just below the cookie and padding to align as uintptr_t.
+  const size_t offset =
+      RoundUp(HandleScope::SizeOf(num_handle_scope_references) + kJniCookieSize, sizeof(uintptr_t));
+  return reinterpret_cast<HandleScope*>(reinterpret_cast<uint8_t*>(managed_sp) - offset);
+}
+
 }  // namespace art
 
 #endif  // ART_RUNTIME_ENTRYPOINTS_ENTRYPOINT_UTILS_INL_H_
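
A small check-style sketch (not part of the patch; assumes the ART headers
above) of the invariant GetGenericJniHandleScope() encodes: the handle scope,
the 4B cookie and at most one word of alignment padding sit directly below the
managed SP:

    // Illustrative only; mirrors the DCHECKs added in ComputeLayout() in
    // quick_trampoline_entrypoints.cc. `managed_sp` and `num_refs` are
    // hypothetical inputs.
    void CheckGenericJniHandleScopePlacement(ArtMethod** managed_sp, size_t num_refs) {
      HandleScope* hs = GetGenericJniHandleScope(managed_sp, num_refs);
      uint8_t* hs8 = reinterpret_cast<uint8_t*>(hs);
      uint8_t* sp8 = reinterpret_cast<uint8_t*>(managed_sp);
      CHECK_ALIGNED(hs, sizeof(uintptr_t));
      CHECK_GE(static_cast<size_t>(sp8 - hs8),
               HandleScope::SizeOf(num_refs) + kJniCookieSize);
      CHECK_LT(static_cast<size_t>(sp8 - hs8),
               HandleScope::SizeOf(num_refs) + kJniCookieSize + sizeof(uintptr_t));
    }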
diff --git a/runtime/entrypoints/entrypoint_utils.h b/runtime/entrypoints/entrypoint_utils.h
index 667f7fc..85082d3 100644
--- a/runtime/entrypoints/entrypoint_utils.h
+++ b/runtime/entrypoints/entrypoint_utils.h
@@ -42,6 +42,7 @@
 
 class ArtField;
 class ArtMethod;
+class HandleScope;
 enum InvokeType : uint32_t;
 class OatQuickMethodHeader;
 class ScopedObjectAccessAlreadyRunnable;
@@ -212,6 +213,11 @@
 // The caller is responsible for performing that check.
 bool NeedsClinitCheckBeforeCall(ArtMethod* method) REQUIRES_SHARED(Locks::mutator_lock_);
 
+constexpr size_t kJniCookieSize = sizeof(uint32_t);
+
+inline HandleScope* GetGenericJniHandleScope(ArtMethod** managed_sp,
+                                             size_t num_handle_scope_references);
+
 }  // namespace art
 
 #endif  // ART_RUNTIME_ENTRYPOINTS_ENTRYPOINT_UTILS_H_
diff --git a/runtime/entrypoints/quick/quick_jni_entrypoints.cc b/runtime/entrypoints/quick/quick_jni_entrypoints.cc
index 5551df2..38c6d3c 100644
--- a/runtime/entrypoints/quick/quick_jni_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_jni_entrypoints.cc
@@ -216,8 +216,7 @@
                                     uint32_t saved_local_ref_cookie,
                                     jvalue result,
                                     uint64_t result_f,
-                                    ArtMethod* called,
-                                    HandleScope* handle_scope)
+                                    ArtMethod* called)
     // TODO: NO_THREAD_SAFETY_ANALYSIS as GoToRunnable() is NO_THREAD_SAFETY_ANALYSIS
     NO_THREAD_SAFETY_ANALYSIS {
   bool critical_native = called->IsCriticalNative();
@@ -232,6 +231,7 @@
   // locked object.
   if (called->IsSynchronized()) {
     DCHECK(normal_native) << "@FastNative/@CriticalNative and synchronize is not supported";
+    HandleScope* handle_scope = down_cast<HandleScope*>(self->GetTopHandleScope());
     jobject lock = handle_scope->GetHandle(0).ToJObject();
     DCHECK(lock != nullptr);
     UnlockJniSynchronizedMethod(lock, self);
@@ -242,7 +242,7 @@
         result.l, saved_local_ref_cookie, self));
   } else {
     if (LIKELY(!critical_native)) {
-      PopLocalReferences(saved_local_ref_cookie, self);  // Invalidates `handle_scope`.
+      PopLocalReferences(saved_local_ref_cookie, self);  // Invalidates top handle scope.
     }
     switch (return_shorty_char) {
       case 'F': {
diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
index 1304c0d..aab5ff5 100644
--- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
@@ -1560,7 +1560,7 @@
   static constexpr bool kAlignDoubleOnStack = true;
 #elif defined(__aarch64__)
   static constexpr bool kNativeSoftFloatAbi = false;  // This is a hard float ABI.
-  static constexpr size_t kNumNativeGprArgs = 8;  // 6 arguments passed in GPRs.
+  static constexpr size_t kNumNativeGprArgs = 8;  // 8 arguments passed in GPRs.
   static constexpr size_t kNumNativeFprArgs = 8;  // 8 arguments passed in FPRs.
 
   static constexpr size_t kRegistersNeededForLong = 1;
@@ -1598,8 +1598,8 @@
 #elif defined(__i386__)
   // TODO: Check these!
   static constexpr bool kNativeSoftFloatAbi = false;  // Not using int registers for fp
-  static constexpr size_t kNumNativeGprArgs = 0;  // 6 arguments passed in GPRs.
-  static constexpr size_t kNumNativeFprArgs = 0;  // 8 arguments passed in FPRs.
+  static constexpr size_t kNumNativeGprArgs = 0;  // 0 arguments passed in GPRs.
+  static constexpr size_t kNumNativeFprArgs = 0;  // 0 arguments passed in FPRs.
 
   static constexpr size_t kRegistersNeededForLong = 2;
   static constexpr size_t kRegistersNeededForDouble = 2;
@@ -1871,38 +1871,13 @@
     return num_stack_entries_ * sizeof(uintptr_t);
   }
 
-  uint8_t* LayoutCallStack(uint8_t* sp8) const {
+  uint8_t* LayoutStackArgs(uint8_t* sp8) const {
     sp8 -= GetStackSize();
-    // Align by kStackAlignment.
+    // Align by kStackAlignment; it is at least as strict as native stack alignment.
     sp8 = reinterpret_cast<uint8_t*>(RoundDown(reinterpret_cast<uintptr_t>(sp8), kStackAlignment));
     return sp8;
   }
 
-  uint8_t* LayoutCallRegisterStacks(uint8_t* sp8, uintptr_t** start_gpr, uint32_t** start_fpr)
-      const {
-    // Assumption is OK right now, as we have soft-float arm
-    size_t fregs = BuildNativeCallFrameStateMachine<ComputeNativeCallFrameSize>::kNumNativeFprArgs;
-    sp8 -= fregs * sizeof(uintptr_t);
-    *start_fpr = reinterpret_cast<uint32_t*>(sp8);
-    size_t iregs = BuildNativeCallFrameStateMachine<ComputeNativeCallFrameSize>::kNumNativeGprArgs;
-    sp8 -= iregs * sizeof(uintptr_t);
-    *start_gpr = reinterpret_cast<uintptr_t*>(sp8);
-    return sp8;
-  }
-
-  uint8_t* LayoutNativeCall(uint8_t* sp8, uintptr_t** start_stack, uintptr_t** start_gpr,
-                            uint32_t** start_fpr) const {
-    // Native call stack.
-    sp8 = LayoutCallStack(sp8);
-    *start_stack = reinterpret_cast<uintptr_t*>(sp8);
-
-    // Put fprs and gprs below.
-    sp8 = LayoutCallRegisterStacks(sp8, start_gpr, start_fpr);
-
-    // Return the new bottom.
-    return sp8;
-  }
-
   virtual void WalkHeader(
       BuildNativeCallFrameStateMachine<ComputeNativeCallFrameSize>* sm ATTRIBUTE_UNUSED)
       REQUIRES_SHARED(Locks::mutator_lock_) {
@@ -1976,80 +1951,53 @@
   explicit ComputeGenericJniFrameSize(bool critical_native)
     : num_handle_scope_references_(0), critical_native_(critical_native) {}
 
-  // Lays out the callee-save frame. Assumes that the incorrect frame corresponding to RefsAndArgs
-  // is at *m = sp. Will update to point to the bottom of the save frame.
-  //
-  // Note: assumes ComputeAll() has been run before.
-  void LayoutCalleeSaveFrame(Thread* self, ArtMethod*** m, void* sp, HandleScope** handle_scope)
-      REQUIRES_SHARED(Locks::mutator_lock_) {
-    ArtMethod* method = **m;
-
+  uintptr_t* ComputeLayout(Thread* self,
+                           ArtMethod** managed_sp,
+                           const char* shorty,
+                           uint32_t shorty_len,
+                           HandleScope** handle_scope) REQUIRES_SHARED(Locks::mutator_lock_) {
     DCHECK_EQ(Runtime::Current()->GetClassLinker()->GetImagePointerSize(), kRuntimePointerSize);
 
-    uint8_t* sp8 = reinterpret_cast<uint8_t*>(sp);
-
-    // First, fix up the layout of the callee-save frame.
-    // We have to squeeze in the HandleScope, and relocate the method pointer.
-
-    // "Free" the slot for the method.
-    sp8 += sizeof(void*);  // In the callee-save frame we use a full pointer.
-
-    // Under the callee saves put handle scope and new method stack reference.
-    size_t handle_scope_size = HandleScope::SizeOf(num_handle_scope_references_);
-    size_t scope_and_method = handle_scope_size + sizeof(ArtMethod*);
-
-    sp8 -= scope_and_method;
-    // Align by kStackAlignment.
-    sp8 = reinterpret_cast<uint8_t*>(RoundDown(reinterpret_cast<uintptr_t>(sp8), kStackAlignment));
-
-    uint8_t* sp8_table = sp8 + sizeof(ArtMethod*);
-    *handle_scope = HandleScope::Create(sp8_table, self->GetTopHandleScope(),
-                                        num_handle_scope_references_);
-
-    // Add a slot for the method pointer, and fill it. Fix the pointer-pointer given to us.
-    uint8_t* method_pointer = sp8;
-    auto** new_method_ref = reinterpret_cast<ArtMethod**>(method_pointer);
-    *new_method_ref = method;
-    *m = new_method_ref;
-  }
-
-  // Adds space for the cookie. Note: may leave stack unaligned.
-  void LayoutCookie(uint8_t** sp) const {
-    // Reference cookie and padding
-    *sp -= 8;
-  }
-
-  // Re-layout the callee-save frame (insert a handle-scope). Then add space for the cookie.
-  // Returns the new bottom. Note: this may be unaligned.
-  uint8_t* LayoutJNISaveFrame(Thread* self, ArtMethod*** m, void* sp, HandleScope** handle_scope)
-      REQUIRES_SHARED(Locks::mutator_lock_) {
-    // First, fix up the layout of the callee-save frame.
-    // We have to squeeze in the HandleScope, and relocate the method pointer.
-    LayoutCalleeSaveFrame(self, m, sp, handle_scope);
-
-    // The bottom of the callee-save frame is now where the method is, *m.
-    uint8_t* sp8 = reinterpret_cast<uint8_t*>(*m);
-
-    // Add space for cookie.
-    LayoutCookie(&sp8);
-
-    return sp8;
-  }
-
-  // WARNING: After this, *sp won't be pointing to the method anymore!
-  uint8_t* ComputeLayout(Thread* self, ArtMethod*** m, const char* shorty, uint32_t shorty_len,
-                         HandleScope** handle_scope, uintptr_t** start_stack, uintptr_t** start_gpr,
-                         uint32_t** start_fpr)
-      REQUIRES_SHARED(Locks::mutator_lock_) {
     Walk(shorty, shorty_len);
 
-    // JNI part.
-    uint8_t* sp8 = LayoutJNISaveFrame(self, m, reinterpret_cast<void*>(*m), handle_scope);
+    // Add space for cookie and HandleScope.
+    void* storage = GetGenericJniHandleScope(managed_sp, num_handle_scope_references_);
+    DCHECK_ALIGNED(storage, sizeof(uintptr_t));
+    *handle_scope =
+        HandleScope::Create(storage, self->GetTopHandleScope(), num_handle_scope_references_);
+    DCHECK_EQ(*handle_scope, storage);
+    uint8_t* sp8 = reinterpret_cast<uint8_t*>(*handle_scope);
+    DCHECK_GE(static_cast<size_t>(reinterpret_cast<uint8_t*>(managed_sp) - sp8),
+              HandleScope::SizeOf(num_handle_scope_references_) + kJniCookieSize);
 
-    sp8 = LayoutNativeCall(sp8, start_stack, start_gpr, start_fpr);
+    // Layout stack arguments.
+    sp8 = LayoutStackArgs(sp8);
 
     // Return the new bottom.
-    return sp8;
+    DCHECK_ALIGNED(sp8, sizeof(uintptr_t));
+    return reinterpret_cast<uintptr_t*>(sp8);
+  }
+
+  static uintptr_t* GetStartGprRegs(uintptr_t* reserved_area) {
+    return reserved_area;
+  }
+
+  static uint32_t* GetStartFprRegs(uintptr_t* reserved_area) {
+    constexpr size_t num_gprs =
+        BuildNativeCallFrameStateMachine<ComputeNativeCallFrameSize>::kNumNativeGprArgs;
+    return reinterpret_cast<uint32_t*>(GetStartGprRegs(reserved_area) + num_gprs);
+  }
+
+  static uintptr_t* GetHiddenArgSlot(uintptr_t* reserved_area) {
+    // Note: `num_fprs` is 0 on architectures where sizeof(uintptr_t) does not match the
+    // FP register size (it is actually 0 on all supported 32-bit architectures).
+    constexpr size_t num_fprs =
+        BuildNativeCallFrameStateMachine<ComputeNativeCallFrameSize>::kNumNativeFprArgs;
+    return reinterpret_cast<uintptr_t*>(GetStartFprRegs(reserved_area)) + num_fprs;
+  }
+
+  static uintptr_t* GetOutArgsSpSlot(uintptr_t* reserved_area) {
+    return GetHiddenArgSlot(reserved_area) + 1;
   }
 
   uintptr_t PushHandle(mirror::Object* /* ptr */) override;
@@ -2138,20 +2086,33 @@
                               bool critical_native,
                               const char* shorty,
                               uint32_t shorty_len,
-                              ArtMethod*** sp)
-     : QuickArgumentVisitor(*sp, is_static, shorty, shorty_len),
+                              ArtMethod** managed_sp,
+                              uintptr_t* reserved_area)
+     : QuickArgumentVisitor(managed_sp, is_static, shorty, shorty_len),
        jni_call_(nullptr, nullptr, nullptr, nullptr, critical_native),
        sm_(&jni_call_) {
-    ComputeGenericJniFrameSize fsc(critical_native);
-    uintptr_t* start_gpr_reg;
-    uint32_t* start_fpr_reg;
-    uintptr_t* start_stack_arg;
-    bottom_of_used_area_ = fsc.ComputeLayout(self, sp, shorty, shorty_len,
-                                             &handle_scope_,
-                                             &start_stack_arg,
-                                             &start_gpr_reg, &start_fpr_reg);
+    DCHECK_ALIGNED(managed_sp, kStackAlignment);
+    DCHECK_ALIGNED(reserved_area, sizeof(uintptr_t));
 
-    jni_call_.Reset(start_gpr_reg, start_fpr_reg, start_stack_arg, handle_scope_);
+    ComputeGenericJniFrameSize fsc(critical_native);
+    uintptr_t* out_args_sp =
+        fsc.ComputeLayout(self, managed_sp, shorty, shorty_len, &handle_scope_);
+
+    // Store hidden argument for @CriticalNative.
+    uintptr_t* hidden_arg_slot = fsc.GetHiddenArgSlot(reserved_area);
+    constexpr uintptr_t kGenericJniTag = 1u;
+    ArtMethod* method = *managed_sp;
+    *hidden_arg_slot = critical_native ? (reinterpret_cast<uintptr_t>(method) | kGenericJniTag)
+                                       : 0xebad6a89u;  // Bad value.
+
+    // Set out args SP.
+    uintptr_t* out_args_sp_slot = fsc.GetOutArgsSpSlot(reserved_area);
+    *out_args_sp_slot = reinterpret_cast<uintptr_t>(out_args_sp);
+
+    jni_call_.Reset(fsc.GetStartGprRegs(reserved_area),
+                    fsc.GetStartFprRegs(reserved_area),
+                    out_args_sp,
+                    handle_scope_);
 
     // First 2 parameters are always excluded for CriticalNative methods.
     if (LIKELY(!critical_native)) {
@@ -2159,7 +2120,7 @@
       sm_.AdvancePointer(self->GetJniEnv());
 
       if (is_static) {
-        sm_.AdvanceHandleScope((**sp)->GetDeclaringClass().Ptr());
+        sm_.AdvanceHandleScope(method->GetDeclaringClass().Ptr());
       }  // else "this" reference is already handled by QuickArgumentVisitor.
     }
   }
@@ -2176,10 +2137,6 @@
     return handle_scope_->GetHandle(0).ToJObject();
   }
 
-  void* GetBottomOfUsedArea() const {
-    return bottom_of_used_area_;
-  }
-
  private:
   // A class to fill a JNI call. Adds reference/handle-scope management to FillNativeCall.
   class FillJniCall final : public FillNativeCall {
@@ -2224,7 +2181,6 @@
 
   HandleScope* handle_scope_;
   FillJniCall jni_call_;
-  void* bottom_of_used_area_;
 
   BuildNativeCallFrameStateMachine<FillJniCall> sm_;
 
@@ -2296,21 +2252,27 @@
 }
 
 /*
- * Initializes an alloca region assumed to be directly below sp for a native call:
- * Create a HandleScope and call stack and fill a mini stack with values to be pushed to registers.
- * The final element on the stack is a pointer to the native code.
+ * Initializes the reserved area assumed to be directly below `managed_sp` for a native call:
  *
- * On entry, the stack has a standard callee-save frame above sp, and an alloca below it.
- * We need to fix this, as the handle scope needs to go into the callee-save frame.
+ * On entry, the stack has a standard callee-save frame above `managed_sp`,
+ * and the reserved area below it. Starting below `managed_sp`, we reserve space
+ * for local reference cookie (not present for @CriticalNative), HandleScope
+ * (not present for @CriticalNative) and stack args (if args do not fit into
+ * registers). At the bottom of the reserved area, there is space for register
+ * arguments, hidden arg (for @CriticalNative) and the SP for the native call
+ * (i.e. pointer to the stack args area), which the calling stub shall load
+ * to perform the native call. We fill all these fields, perform class init
+ * check (for static methods) and/or locking (for synchronized methods) if
+ * needed and return to the stub.
  *
- * The return of this function denotes:
- * 1) How many bytes of the alloca can be released, if the value is non-negative.
- * 2) An error, if the value is negative.
+ * The return value is the pointer to the native code, null on failure.
  */
-extern "C" TwoWordReturn artQuickGenericJniTrampoline(Thread* self, ArtMethod** sp)
+extern "C" const void* artQuickGenericJniTrampoline(Thread* self,
+                                                    ArtMethod** managed_sp,
+                                                    uintptr_t* reserved_area)
     REQUIRES_SHARED(Locks::mutator_lock_) {
   // Note: We cannot walk the stack properly until fixed up below.
-  ArtMethod* called = *sp;
+  ArtMethod* called = *managed_sp;
   DCHECK(called->IsNative()) << called->PrettyMethod(true);
   Runtime* runtime = Runtime::Current();
   uint32_t shorty_len = 0;
@@ -2325,7 +2287,8 @@
                                       critical_native,
                                       shorty,
                                       shorty_len,
-                                      &sp);
+                                      managed_sp,
+                                      reserved_area);
   {
     ScopedAssertNoThreadSuspension sants(__FUNCTION__);
     visitor.VisitArguments();
@@ -2334,7 +2297,7 @@
   }
 
   // Fix up managed-stack things in Thread. After this we can walk the stack.
-  self->SetTopOfStackTagged(sp);
+  self->SetTopOfStackTagged(managed_sp);
 
   self->VerifyStack();
 
@@ -2356,8 +2319,7 @@
       if (!runtime->GetClassLinker()->EnsureInitialized(self, h_class, true, true)) {
         DCHECK(Thread::Current()->IsExceptionPending()) << called->PrettyMethod();
         self->PopHandleScope();
-        // A negative value denotes an error.
-        return GetTwoWordFailureValue();
+        return nullptr;  // Report error.
       }
     }
   }
@@ -2372,8 +2334,7 @@
       cookie = JniMethodStartSynchronized(visitor.GetFirstHandleScopeJObject(), self);
       if (self->IsExceptionPending()) {
         self->PopHandleScope();
-        // A negative value denotes an error.
-        return GetTwoWordFailureValue();
+        return nullptr;  // Report error.
       }
     } else {
       if (fast_native) {
@@ -2383,7 +2344,7 @@
         cookie = JniMethodStart(self);
       }
     }
-    sp32 = reinterpret_cast<uint32_t*>(sp);
+    sp32 = reinterpret_cast<uint32_t*>(managed_sp);
     *(sp32 - 1) = cookie;
   }
 
@@ -2433,15 +2394,17 @@
                         << " -> "
                         << std::hex << reinterpret_cast<uintptr_t>(nativeCode);
 
-  // Return native code addr(lo) and bottom of alloca address(hi).
-  return GetTwoWordSuccessValue(reinterpret_cast<uintptr_t>(visitor.GetBottomOfUsedArea()),
-                                reinterpret_cast<uintptr_t>(nativeCode));
+  // Return native code.
+  return nativeCode;
 }
 
 // Defined in quick_jni_entrypoints.cc.
-extern uint64_t GenericJniMethodEnd(Thread* self, uint32_t saved_local_ref_cookie,
-                                    jvalue result, uint64_t result_f, ArtMethod* called,
-                                    HandleScope* handle_scope);
+extern uint64_t GenericJniMethodEnd(Thread* self,
+                                    uint32_t saved_local_ref_cookie,
+                                    jvalue result,
+                                    uint64_t result_f,
+                                    ArtMethod* called);
+
 /*
  * Is called after the native JNI code. Responsible for cleanup (handle scope, saved state) and
  * unlocking.
@@ -2458,8 +2421,15 @@
   uint32_t* sp32 = reinterpret_cast<uint32_t*>(sp);
   ArtMethod* called = *sp;
   uint32_t cookie = *(sp32 - 1);
-  HandleScope* table = reinterpret_cast<HandleScope*>(reinterpret_cast<uint8_t*>(sp) + sizeof(*sp));
-  return GenericJniMethodEnd(self, cookie, result, result_f, called, table);
+  if (kIsDebugBuild && !called->IsCriticalNative()) {
+    BaseHandleScope* handle_scope = self->GetTopHandleScope();
+    DCHECK(handle_scope != nullptr);
+    DCHECK(!handle_scope->IsVariableSized());
+    // Note: We do not hold mutator lock here for normal JNI, so we cannot use the method's shorty
+    // to determine the number of references. Instead rely on the value from the HandleScope.
+    DCHECK_EQ(handle_scope, GetGenericJniHandleScope(sp, handle_scope->NumberOfReferences()));
+  }
+  return GenericJniMethodEnd(self, cookie, result, result_f, called);
 }
 
 // We use TwoWordReturn to optimize scalar returns. We use the hi value for code, and the lo value
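
Since the hidden argument for @CriticalNative is now a tagged ArtMethod*, a
later consumer (e.g. the planned late lookup) could recover the method with a
helper along these lines; this is a sketch, not part of this change:

    // Hypothetical helper; assumes the low bit is the kGenericJniTag set by
    // BuildGenericJniFrameVisitor above.
    inline ArtMethod* UntagGenericJniHiddenArg(uintptr_t hidden_arg) {
      constexpr uintptr_t kGenericJniTag = 1u;
      DCHECK_NE(hidden_arg & kGenericJniTag, 0u);  // Came from the GenericJNI stub.
      return reinterpret_cast<ArtMethod*>(hidden_arg & ~kGenericJniTag);
    }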
diff --git a/runtime/stack.cc b/runtime/stack.cc
index 336b253..0459091 100644
--- a/runtime/stack.cc
+++ b/runtime/stack.cc
@@ -149,8 +149,18 @@
     return nullptr;
   } else if (m->IsNative()) {
     if (cur_quick_frame_ != nullptr) {
-      HandleScope* hs = reinterpret_cast<HandleScope*>(
-          reinterpret_cast<char*>(cur_quick_frame_) + sizeof(ArtMethod*));
+      HandleScope* hs;
+      if (cur_oat_quick_method_header_ != nullptr) {
+        hs = reinterpret_cast<HandleScope*>(
+            reinterpret_cast<char*>(cur_quick_frame_) + sizeof(ArtMethod*));
+      } else {
+        // GenericJNI frames have the HandleScope under the managed frame.
+        uint32_t shorty_len;
+        const char* shorty = m->GetShorty(&shorty_len);
+        const size_t num_handle_scope_references =
+            /* this */ 1u + std::count(shorty + 1, shorty + shorty_len, 'L');
+        hs = GetGenericJniHandleScope(cur_quick_frame_, num_handle_scope_references);
+      }
       return hs->GetReference(0);
     } else {
       return cur_shadow_frame_->GetVRegReference(0);
@@ -772,21 +782,6 @@
   }
 }
 
-// Counts the number of references in the parameter list of the corresponding method.
-// Note: Thus does _not_ include "this" for non-static methods.
-static uint32_t GetNumberOfReferenceArgsWithoutReceiver(ArtMethod* method)
-    REQUIRES_SHARED(Locks::mutator_lock_) {
-  uint32_t shorty_len;
-  const char* shorty = method->GetShorty(&shorty_len);
-  uint32_t refs = 0;
-  for (uint32_t i = 1; i < shorty_len ; ++i) {
-    if (shorty[i] == 'L') {
-      refs++;
-    }
-  }
-  return refs;
-}
-
 QuickMethodFrameInfo StackVisitor::GetCurrentQuickFrameInfo() const {
   if (cur_oat_quick_method_header_ != nullptr) {
     if (cur_oat_quick_method_header_->IsOptimized()) {
@@ -831,18 +826,9 @@
           (runtime->GetJit() != nullptr &&
            runtime->GetJit()->GetCodeCache()->ContainsPc(entry_point))) << method->PrettyMethod();
   }
-  // Generic JNI frame.
-  uint32_t handle_refs = GetNumberOfReferenceArgsWithoutReceiver(method) + 1;
-  size_t scope_size = HandleScope::SizeOf(handle_refs);
-  constexpr QuickMethodFrameInfo callee_info =
-      RuntimeCalleeSaveFrame::GetMethodFrameInfo(CalleeSaveType::kSaveRefsAndArgs);
-
-  // Callee saves + handle scope + method ref + alignment
-  // Note: -sizeof(void*) since callee-save frame stores a whole method pointer.
-  size_t frame_size = RoundUp(
-      callee_info.FrameSizeInBytes() - sizeof(void*) + sizeof(ArtMethod*) + scope_size,
-      kStackAlignment);
-  return QuickMethodFrameInfo(frame_size, callee_info.CoreSpillMask(), callee_info.FpSpillMask());
+  // Generic JNI frame is just like the SaveRefsAndArgs frame.
+  // Note that HandleScope, if any, is below the frame.
+  return RuntimeCalleeSaveFrame::GetMethodFrameInfo(CalleeSaveType::kSaveRefsAndArgs);
 }
 
 template <StackVisitor::CountTransitions kCount>
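
For reference, the reference counting used for GenericJNI frames in
StackVisitor::GetThisObject() above, as a standalone sketch (illustrative
only, not part of the patch):

    #include <algorithm>
    #include <cstring>

    // shorty[0] is the return type and is skipped; the extra 1u covers `this`
    // (or the declaring class for a static method).
    size_t NumGenericJniHandleScopeRefs(const char* shorty, size_t shorty_len) {
      return 1u + std::count(shorty + 1, shorty + shorty_len, 'L');
    }
    // e.g. NumGenericJniHandleScopeRefs("VLIL", 4u) == 3u for a hypothetical
    // native method `void f(Object, int, Object)`.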