jni: Do not create a managed frame for @CriticalNative.

Omit the managed frame for @CriticalNative methods, do not check
for exceptions, and make a tail call when possible.
Pass the method pointer in a hidden argument to prepare for
implementing late binding for @CriticalNative methods.

This changes only the JNI compiler; Generic JNI shall be
updated in a separate change.

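For reference, a @CriticalNative method is a static, non-synchronized
native method whose parameters and return type are all primitive; its
native implementation receives neither JNIEnv* nor jclass. A minimal
sketch (names are hypothetical; the function would typically be bound
via RegisterNatives()):

  // Java declaration:
  //   import dalvik.annotation.optimization.CriticalNative;
  //   class NativeMath {
  //     @CriticalNative static native int add(int a, int b);
  //   }
  //
  // Native implementation: only the primitive arguments are passed,
  // with no JNIEnv* or jclass. The stub generated by this change moves
  // the ArtMethod* into a hidden register and tail-calls when possible.
  #include <jni.h>

  extern "C" jint add_impl(jint a, jint b) {
    return a + b;
  }
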
Performance improvements reported by Golem (art-opt-cc):
                                 x86 x86-64    arm  arm64
NativeDowncallStaticCritical6   +17%   +50%   +88%  +139%
NativeDowncallStaticCritical    +37%   +32%  +103%  +216%

Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing
Test: aosp_taimen-userdebug boots.
Test: run-gtests.sh
Test: testrunner.py --target --optimizing
Bug: 112189621
Change-Id: I5758c8f478627f2eee8f615b4537a907c211b9f8
diff --git a/compiler/jni/quick/arm/calling_convention_arm.cc b/compiler/jni/quick/arm/calling_convention_arm.cc
index 42a4603..2072302 100644
--- a/compiler/jni/quick/arm/calling_convention_arm.cc
+++ b/compiler/jni/quick/arm/calling_convention_arm.cc
@@ -74,6 +74,7 @@
     ArmManagedRegister::FromCoreRegister(R8),
     ArmManagedRegister::FromCoreRegister(R10),
     ArmManagedRegister::FromCoreRegister(R11),
+    ArmManagedRegister::FromCoreRegister(LR),
     // Hard float registers.
     ArmManagedRegister::FromSRegister(S16),
     ArmManagedRegister::FromSRegister(S17),
@@ -93,37 +94,79 @@
     ArmManagedRegister::FromSRegister(S31)
 };
 
-static constexpr uint32_t CalculateCoreCalleeSpillMask() {
+template <size_t size>
+static constexpr uint32_t CalculateCoreCalleeSpillMask(
+    const ManagedRegister (&callee_saves)[size]) {
   // LR is a special callee save which is not reported by CalleeSaveRegisters().
-  uint32_t result = 1 << LR;
-  for (auto&& r : kCalleeSaveRegisters) {
+  uint32_t result = 0u;
+  for (auto&& r : callee_saves) {
     if (r.AsArm().IsCoreRegister()) {
-      result |= (1 << r.AsArm().AsCoreRegister());
+      result |= (1u << r.AsArm().AsCoreRegister());
     }
   }
   return result;
 }
 
-static constexpr uint32_t CalculateFpCalleeSpillMask() {
-  uint32_t result = 0;
-  for (auto&& r : kCalleeSaveRegisters) {
+template <size_t size>
+static constexpr uint32_t CalculateFpCalleeSpillMask(const ManagedRegister (&callee_saves)[size]) {
+  uint32_t result = 0u;
+  for (auto&& r : callee_saves) {
     if (r.AsArm().IsSRegister()) {
-      result |= (1 << r.AsArm().AsSRegister());
+      result |= (1u << r.AsArm().AsSRegister());
     }
   }
   return result;
 }
 
-static constexpr uint32_t kCoreCalleeSpillMask = CalculateCoreCalleeSpillMask();
-static constexpr uint32_t kFpCalleeSpillMask = CalculateFpCalleeSpillMask();
+static constexpr uint32_t kCoreCalleeSpillMask = CalculateCoreCalleeSpillMask(kCalleeSaveRegisters);
+static constexpr uint32_t kFpCalleeSpillMask = CalculateFpCalleeSpillMask(kCalleeSaveRegisters);
+
+// The AAPCS requires 8-byte alignment. This is not as strict as the Managed ABI stack alignment.
+static constexpr size_t kAapcsStackAlignment = 8u;
+static_assert(kAapcsStackAlignment < kStackAlignment);
+
+static constexpr ManagedRegister kAapcsCalleeSaveRegisters[] = {
+    // Core registers.
+    ArmManagedRegister::FromCoreRegister(R4),
+    ArmManagedRegister::FromCoreRegister(R5),
+    ArmManagedRegister::FromCoreRegister(R6),
+    ArmManagedRegister::FromCoreRegister(R7),
+    ArmManagedRegister::FromCoreRegister(R8),
+    ArmManagedRegister::FromCoreRegister(R9),  // The platform register is callee-save on Android.
+    ArmManagedRegister::FromCoreRegister(R10),
+    ArmManagedRegister::FromCoreRegister(R11),
+    ArmManagedRegister::FromCoreRegister(LR),
+    // Hard float registers.
+    ArmManagedRegister::FromSRegister(S16),
+    ArmManagedRegister::FromSRegister(S17),
+    ArmManagedRegister::FromSRegister(S18),
+    ArmManagedRegister::FromSRegister(S19),
+    ArmManagedRegister::FromSRegister(S20),
+    ArmManagedRegister::FromSRegister(S21),
+    ArmManagedRegister::FromSRegister(S22),
+    ArmManagedRegister::FromSRegister(S23),
+    ArmManagedRegister::FromSRegister(S24),
+    ArmManagedRegister::FromSRegister(S25),
+    ArmManagedRegister::FromSRegister(S26),
+    ArmManagedRegister::FromSRegister(S27),
+    ArmManagedRegister::FromSRegister(S28),
+    ArmManagedRegister::FromSRegister(S29),
+    ArmManagedRegister::FromSRegister(S30),
+    ArmManagedRegister::FromSRegister(S31)
+};
+
+static constexpr uint32_t kAapcsCoreCalleeSpillMask =
+    CalculateCoreCalleeSpillMask(kAapcsCalleeSaveRegisters);
+static constexpr uint32_t kAapcsFpCalleeSpillMask =
+    CalculateFpCalleeSpillMask(kAapcsCalleeSaveRegisters);
 
 // Calling convention
 
-ManagedRegister ArmManagedRuntimeCallingConvention::InterproceduralScratchRegister() {
+ManagedRegister ArmManagedRuntimeCallingConvention::InterproceduralScratchRegister() const {
   return ArmManagedRegister::FromCoreRegister(IP);  // R12
 }
 
-ManagedRegister ArmJniCallingConvention::InterproceduralScratchRegister() {
+ManagedRegister ArmJniCallingConvention::InterproceduralScratchRegister() const {
   return ArmManagedRegister::FromCoreRegister(IP);  // R12
 }
 
@@ -179,11 +222,9 @@
 
 FrameOffset ArmManagedRuntimeCallingConvention::CurrentParamStackOffset() {
   CHECK(IsCurrentParamOnStack());
-  FrameOffset result =
-      FrameOffset(displacement_.Int32Value() +        // displacement
-                  kFramePointerSize +                 // Method*
-                  (itr_slots_ * kFramePointerSize));  // offset into in args
-  return result;
+  return FrameOffset(displacement_.Int32Value() +        // displacement
+                     kFramePointerSize +                 // Method*
+                     (itr_slots_ * kFramePointerSize));  // offset into in args
 }
 
 const ManagedRegisterEntrySpills& ArmManagedRuntimeCallingConvention::EntrySpills() {
@@ -252,6 +293,7 @@
   }
   return entry_spills_;
 }
+
 // JNI calling convention
 
 ArmJniCallingConvention::ArmJniCallingConvention(bool is_static,
@@ -321,7 +363,7 @@
     }
   }
 
-  if (cur_reg < kJniArgumentRegisterCount) {
+  if (cur_reg <= kJniArgumentRegisterCount) {
     // As a special case when, as a result of shifting (or not) there are no arguments on the stack,
     // we actually have 0 stack padding.
     //
@@ -347,53 +389,84 @@
 
 uint32_t ArmJniCallingConvention::CoreSpillMask() const {
   // Compute spill mask to agree with callee saves initialized in the constructor
-  return kCoreCalleeSpillMask;
+  return is_critical_native_ ? 0u : kCoreCalleeSpillMask;
 }
 
 uint32_t ArmJniCallingConvention::FpSpillMask() const {
-  return kFpCalleeSpillMask;
+  return is_critical_native_ ? 0u : kFpCalleeSpillMask;
 }
 
 ManagedRegister ArmJniCallingConvention::ReturnScratchRegister() const {
   return ArmManagedRegister::FromCoreRegister(R2);
 }
 
-size_t ArmJniCallingConvention::FrameSize() {
-  // Method*, LR and callee save area size, local reference segment state
+size_t ArmJniCallingConvention::FrameSize() const {
+  if (UNLIKELY(is_critical_native_)) {
+    CHECK(!SpillsMethod());
+    CHECK(!HasLocalReferenceSegmentState());
+    CHECK(!HasHandleScope());
+    CHECK(!SpillsReturnValue());
+    return 0u;  // There is no managed frame for @CriticalNative.
+  }
+
+  // Method*, callee save area size, local reference segment state
+  CHECK(SpillsMethod());
   const size_t method_ptr_size = static_cast<size_t>(kArmPointerSize);
-  const size_t lr_return_addr_size = kFramePointerSize;
   const size_t callee_save_area_size = CalleeSaveRegisters().size() * kFramePointerSize;
-  size_t frame_data_size = method_ptr_size + lr_return_addr_size + callee_save_area_size;
+  size_t total_size = method_ptr_size + callee_save_area_size;
 
-  if (LIKELY(HasLocalReferenceSegmentState())) {
-    // local reference segment state
-    frame_data_size += kFramePointerSize;
-    // TODO: Probably better to use sizeof(IRTSegmentState) here...
-  }
+  CHECK(HasLocalReferenceSegmentState());
+  // local reference segment state
+  total_size += kFramePointerSize;
+  // TODO: Probably better to use sizeof(IRTSegmentState) here...
 
-  // References plus link_ (pointer) and number_of_references_ (uint32_t) for HandleScope header
-  const size_t handle_scope_size = HandleScope::SizeOf(kArmPointerSize, ReferenceCount());
-
-  size_t total_size = frame_data_size;
-  if (LIKELY(HasHandleScope())) {
-    // HandleScope is sometimes excluded.
-    total_size += handle_scope_size;                                 // handle scope size
-  }
+  CHECK(HasHandleScope());
+  total_size += HandleScope::SizeOf(kArmPointerSize, ReferenceCount());
 
   // Plus return value spill area size
+  CHECK(SpillsReturnValue());
   total_size += SizeOfReturnValue();
 
   return RoundUp(total_size, kStackAlignment);
 }
 
-size_t ArmJniCallingConvention::OutArgSize() {
-  // TODO: Identical to x86_64 except for also adding additional padding.
-  return RoundUp(NumberOfOutgoingStackArgs() * kFramePointerSize + padding_,
-                 kStackAlignment);
+size_t ArmJniCallingConvention::OutArgSize() const {
+  // Count param args, including JNIEnv* and jclass*; count 8-byte args twice.
+  size_t all_args = NumberOfExtraArgumentsForJni() + NumArgs() + NumLongOrDoubleArgs();
+  // Account for arguments passed through r0-r3. (No FP args, AAPCS32 is soft-float.)
+  size_t stack_args = all_args - std::min(kJniArgumentRegisterCount, all_args);
+  // The size of outgoing arguments.
+  size_t size = stack_args * kFramePointerSize + padding_;
+
+  // @CriticalNative can use tail call as all managed callee saves are preserved by AAPCS.
+  static_assert((kCoreCalleeSpillMask & ~kAapcsCoreCalleeSpillMask) == 0u);
+  static_assert((kFpCalleeSpillMask & ~kAapcsFpCalleeSpillMask) == 0u);
+
+  // For @CriticalNative, we can make a tail call if there are no stack args and the
+  // return type is not an FP type (otherwise we need to move the result to FP register).
+  DCHECK(!RequiresSmallResultTypeExtension());
+  if (is_critical_native_ && (size != 0u || GetShorty()[0] == 'F' || GetShorty()[0] == 'D')) {
+    size += kFramePointerSize;  // We need to spill LR with the args.
+  }
+  return RoundUp(size, kAapcsStackAlignment);
 }
 
 ArrayRef<const ManagedRegister> ArmJniCallingConvention::CalleeSaveRegisters() const {
-  return ArrayRef<const ManagedRegister>(kCalleeSaveRegisters);
+  if (UNLIKELY(IsCriticalNative())) {
+    if (UseTailCall()) {
+      return ArrayRef<const ManagedRegister>();  // Do not spill anything.
+    } else {
+      // Spill LR with out args.
+      static_assert((kCoreCalleeSpillMask >> LR) == 1u);  // Contains LR as the highest bit.
+      constexpr size_t lr_index = POPCOUNT(kCoreCalleeSpillMask) - 1u;
+      static_assert(kCalleeSaveRegisters[lr_index].Equals(
+                        ArmManagedRegister::FromCoreRegister(LR)));
+      return ArrayRef<const ManagedRegister>(kCalleeSaveRegisters).SubArray(
+          /*pos=*/ lr_index, /*length=*/ 1u);
+    }
+  } else {
+    return ArrayRef<const ManagedRegister>(kCalleeSaveRegisters);
+  }
 }
 
 // JniCallingConvention ABI follows AAPCS where longs and doubles must occur
@@ -451,18 +524,27 @@
   return FrameOffset(offset);
 }
 
-size_t ArmJniCallingConvention::NumberOfOutgoingStackArgs() {
-  size_t static_args = HasSelfClass() ? 1 : 0;  // count jclass
-  // regular argument parameters and this
-  size_t param_args = NumArgs() + NumLongOrDoubleArgs();  // twice count 8-byte args
-  // XX: Why is the long/ordouble counted twice but not JNIEnv* ???
-  // count JNIEnv* less arguments in registers
-  size_t internal_args = (HasJniEnv() ? 1 : 0 /* jni env */);
-  size_t total_args = static_args + param_args + internal_args;
+ManagedRegister ArmJniCallingConvention::HiddenArgumentRegister() const {
+  CHECK(IsCriticalNative());
+  // R4 is neither managed callee-save, nor argument register, nor scratch register.
+  // (It is native callee-save but the value coming from managed code can be clobbered.)
+  // TODO: Change to static_assert; std::none_of should be constexpr since C++20.
+  DCHECK(std::none_of(kCalleeSaveRegisters,
+                      kCalleeSaveRegisters + std::size(kCalleeSaveRegisters),
+                      [](ManagedRegister callee_save) constexpr {
+                        return callee_save.Equals(ArmManagedRegister::FromCoreRegister(R4));
+                      }));
+  DCHECK(std::none_of(kJniArgumentRegisters,
+                      kJniArgumentRegisters + std::size(kJniArgumentRegisters),
+                      [](Register reg) { return reg == R4; }));
+  DCHECK(!InterproceduralScratchRegister().Equals(ArmManagedRegister::FromCoreRegister(R4)));
+  return ArmManagedRegister::FromCoreRegister(R4);
+}
 
-  return total_args - std::min(kJniArgumentRegisterCount, static_cast<size_t>(total_args));
-
-  // TODO: Very similar to x86_64 except for the return pc.
+// Whether to use tail call (used only for @CriticalNative).
+bool ArmJniCallingConvention::UseTailCall() const {
+  CHECK(IsCriticalNative());
+  return OutArgSize() == 0u;
 }
 
 }  // namespace arm
diff --git a/compiler/jni/quick/arm/calling_convention_arm.h b/compiler/jni/quick/arm/calling_convention_arm.h
index b327898..04ad00b 100644
--- a/compiler/jni/quick/arm/calling_convention_arm.h
+++ b/compiler/jni/quick/arm/calling_convention_arm.h
@@ -35,7 +35,7 @@
   ~ArmManagedRuntimeCallingConvention() override {}
   // Calling convention
   ManagedRegister ReturnRegister() override;
-  ManagedRegister InterproceduralScratchRegister() override;
+  ManagedRegister InterproceduralScratchRegister() const override;
   // Managed runtime calling convention
   ManagedRegister MethodRegister() override;
   bool IsCurrentParamInRegister() override;
@@ -60,11 +60,11 @@
   // Calling convention
   ManagedRegister ReturnRegister() override;
   ManagedRegister IntReturnRegister() override;
-  ManagedRegister InterproceduralScratchRegister() override;
+  ManagedRegister InterproceduralScratchRegister() const override;
   // JNI calling convention
   void Next() override;  // Override default behavior for AAPCS
-  size_t FrameSize() override;
-  size_t OutArgSize() override;
+  size_t FrameSize() const override;
+  size_t OutArgSize() const override;
   ArrayRef<const ManagedRegister> CalleeSaveRegisters() const override;
   ManagedRegister ReturnScratchRegister() const override;
   uint32_t CoreSpillMask() const override;
@@ -79,8 +79,11 @@
     return false;
   }
 
- protected:
-  size_t NumberOfOutgoingStackArgs() override;
+  // Hidden argument register, used to pass the method pointer for @CriticalNative call.
+  ManagedRegister HiddenArgumentRegister() const override;
+
+  // Whether to use tail call (used only for @CriticalNative).
+  bool UseTailCall() const override;
 
  private:
   // Padding to ensure longs and doubles are not split in AAPCS
diff --git a/compiler/jni/quick/arm64/calling_convention_arm64.cc b/compiler/jni/quick/arm64/calling_convention_arm64.cc
index 4a6a754..44218ef 100644
--- a/compiler/jni/quick/arm64/calling_convention_arm64.cc
+++ b/compiler/jni/quick/arm64/calling_convention_arm64.cc
@@ -59,6 +59,8 @@
     // Jni function is the native function which the java code wants to call.
     // Jni method is the method that is compiled by jni compiler.
     // Call chain: managed code(java) --> jni method --> jni function.
+    // This does not apply to @CriticalNative methods.
+
     // Thread register(X19) is saved on stack.
     Arm64ManagedRegister::FromXRegister(X19),
     Arm64ManagedRegister::FromXRegister(X20),
@@ -86,58 +88,73 @@
     Arm64ManagedRegister::FromDRegister(D15),
 };
 
-static constexpr uint32_t CalculateCoreCalleeSpillMask() {
+template <size_t size>
+static constexpr uint32_t CalculateCoreCalleeSpillMask(
+    const ManagedRegister (&callee_saves)[size]) {
   uint32_t result = 0u;
-  for (auto&& r : kCalleeSaveRegisters) {
+  for (auto&& r : callee_saves) {
     if (r.AsArm64().IsXRegister()) {
-      result |= (1 << r.AsArm64().AsXRegister());
+      result |= (1u << r.AsArm64().AsXRegister());
     }
   }
   return result;
 }
 
-static constexpr uint32_t CalculateFpCalleeSpillMask() {
-  uint32_t result = 0;
-  for (auto&& r : kCalleeSaveRegisters) {
+template <size_t size>
+static constexpr uint32_t CalculateFpCalleeSpillMask(const ManagedRegister (&callee_saves)[size]) {
+  uint32_t result = 0u;
+  for (auto&& r : callee_saves) {
     if (r.AsArm64().IsDRegister()) {
-      result |= (1 << r.AsArm64().AsDRegister());
+      result |= (1u << r.AsArm64().AsDRegister());
     }
   }
   return result;
 }
 
-static constexpr uint32_t kCoreCalleeSpillMask = CalculateCoreCalleeSpillMask();
-static constexpr uint32_t kFpCalleeSpillMask = CalculateFpCalleeSpillMask();
+static constexpr uint32_t kCoreCalleeSpillMask = CalculateCoreCalleeSpillMask(kCalleeSaveRegisters);
+static constexpr uint32_t kFpCalleeSpillMask = CalculateFpCalleeSpillMask(kCalleeSaveRegisters);
+
+// The AAPCS64 requires 16-byte alignment. This is the same as the Managed ABI stack alignment.
+static constexpr size_t kAapcs64StackAlignment = 16u;
+static_assert(kAapcs64StackAlignment == kStackAlignment);
+
+static constexpr ManagedRegister kAapcs64CalleeSaveRegisters[] = {
+    // Core registers.
+    Arm64ManagedRegister::FromXRegister(X19),
+    Arm64ManagedRegister::FromXRegister(X20),
+    Arm64ManagedRegister::FromXRegister(X21),
+    Arm64ManagedRegister::FromXRegister(X22),
+    Arm64ManagedRegister::FromXRegister(X23),
+    Arm64ManagedRegister::FromXRegister(X24),
+    Arm64ManagedRegister::FromXRegister(X25),
+    Arm64ManagedRegister::FromXRegister(X26),
+    Arm64ManagedRegister::FromXRegister(X27),
+    Arm64ManagedRegister::FromXRegister(X28),
+    Arm64ManagedRegister::FromXRegister(X29),
+    Arm64ManagedRegister::FromXRegister(LR),
+    // Hard float registers.
+    Arm64ManagedRegister::FromDRegister(D8),
+    Arm64ManagedRegister::FromDRegister(D9),
+    Arm64ManagedRegister::FromDRegister(D10),
+    Arm64ManagedRegister::FromDRegister(D11),
+    Arm64ManagedRegister::FromDRegister(D12),
+    Arm64ManagedRegister::FromDRegister(D13),
+    Arm64ManagedRegister::FromDRegister(D14),
+    Arm64ManagedRegister::FromDRegister(D15),
+};
+
+static constexpr uint32_t kAapcs64CoreCalleeSpillMask =
+    CalculateCoreCalleeSpillMask(kAapcs64CalleeSaveRegisters);
+static constexpr uint32_t kAapcs64FpCalleeSpillMask =
+    CalculateFpCalleeSpillMask(kAapcs64CalleeSaveRegisters);
 
 // Calling convention
-ManagedRegister Arm64ManagedRuntimeCallingConvention::InterproceduralScratchRegister() {
-  // X20 is safe to use as a scratch register:
-  // - with Baker read barriers (in the case of a non-critical native
-  //   method), it is reserved as Marking Register, and thus does not
-  //   actually need to be saved/restored; it is refreshed on exit
-  //   (see Arm64JNIMacroAssembler::RemoveFrame);
-  // - in other cases, it is saved on entry (in
-  //   Arm64JNIMacroAssembler::BuildFrame) and restored on exit (in
-  //   Arm64JNIMacroAssembler::RemoveFrame). This is also expected in
-  //   the case of a critical native method in the Baker read barrier
-  //   configuration, where the value of MR must be preserved across
-  //   the JNI call (as there is no MR refresh in that case).
-  return Arm64ManagedRegister::FromXRegister(X20);
+ManagedRegister Arm64ManagedRuntimeCallingConvention::InterproceduralScratchRegister() const {
+  return Arm64ManagedRegister::FromXRegister(IP0);  // X16
 }
 
-ManagedRegister Arm64JniCallingConvention::InterproceduralScratchRegister() {
-  // X20 is safe to use as a scratch register:
-  // - with Baker read barriers (in the case of a non-critical native
-  //   method), it is reserved as Marking Register, and thus does not
-  //   actually need to be saved/restored; it is refreshed on exit
-  //   (see Arm64JNIMacroAssembler::RemoveFrame);
-  // - in other cases, it is saved on entry (in
-  //   Arm64JNIMacroAssembler::BuildFrame) and restored on exit (in
-  //   Arm64JNIMacroAssembler::RemoveFrame). This is also expected in
-  //   the case of a critical native method in the Baker read barrier
-  //   configuration, where the value of MR must be preserved across
-  //   the JNI call (as there is no MR refresh in that case).
-  return Arm64ManagedRegister::FromXRegister(X20);
+ManagedRegister Arm64JniCallingConvention::InterproceduralScratchRegister() const {
+  return Arm64ManagedRegister::FromXRegister(IP0);  // X16
 }
 
 static ManagedRegister ReturnRegisterForShorty(const char* shorty) {
@@ -187,11 +204,9 @@
 
 FrameOffset Arm64ManagedRuntimeCallingConvention::CurrentParamStackOffset() {
   CHECK(IsCurrentParamOnStack());
-  FrameOffset result =
-      FrameOffset(displacement_.Int32Value() +  // displacement
-                  kFramePointerSize +  // Method ref
-                  (itr_slots_ * sizeof(uint32_t)));  // offset into in args
-  return result;
+  return FrameOffset(displacement_.Int32Value() +  // displacement
+                     kFramePointerSize +  // Method ref
+                     (itr_slots_ * sizeof(uint32_t)));  // offset into in args
 }
 
 const ManagedRegisterEntrySpills& Arm64ManagedRuntimeCallingConvention::EntrySpills() {
@@ -243,6 +258,7 @@
 }
 
 // JNI calling convention
+
 Arm64JniCallingConvention::Arm64JniCallingConvention(bool is_static,
                                                      bool is_synchronized,
                                                      bool is_critical_native,
@@ -255,52 +271,88 @@
 }
 
 uint32_t Arm64JniCallingConvention::CoreSpillMask() const {
-  return kCoreCalleeSpillMask;
+  return is_critical_native_ ? 0u : kCoreCalleeSpillMask;
 }
 
 uint32_t Arm64JniCallingConvention::FpSpillMask() const {
-  return kFpCalleeSpillMask;
+  return is_critical_native_ ? 0u : kFpCalleeSpillMask;
 }
 
 ManagedRegister Arm64JniCallingConvention::ReturnScratchRegister() const {
   return ManagedRegister::NoRegister();
 }
 
-size_t Arm64JniCallingConvention::FrameSize() {
+size_t Arm64JniCallingConvention::FrameSize() const {
+  if (is_critical_native_) {
+    CHECK(!SpillsMethod());
+    CHECK(!HasLocalReferenceSegmentState());
+    CHECK(!HasHandleScope());
+    CHECK(!SpillsReturnValue());
+    return 0u;  // There is no managed frame for @CriticalNative.
+  }
+
   // Method*, callee save area size, local reference segment state
-  //
-  // (Unlike x86_64, do not include return address, and the segment state is uint32
-  // instead of pointer).
+  CHECK(SpillsMethod());
   size_t method_ptr_size = static_cast<size_t>(kFramePointerSize);
   size_t callee_save_area_size = CalleeSaveRegisters().size() * kFramePointerSize;
+  size_t total_size = method_ptr_size + callee_save_area_size;
 
-  size_t frame_data_size = method_ptr_size + callee_save_area_size;
-  if (LIKELY(HasLocalReferenceSegmentState())) {
-    frame_data_size += sizeof(uint32_t);
-  }
-  // References plus 2 words for HandleScope header
-  size_t handle_scope_size = HandleScope::SizeOf(kArm64PointerSize, ReferenceCount());
+  CHECK(HasLocalReferenceSegmentState());
+  total_size += sizeof(uint32_t);
 
-  size_t total_size = frame_data_size;
-  if (LIKELY(HasHandleScope())) {
-    // HandleScope is sometimes excluded.
-    total_size += handle_scope_size;                                 // handle scope size
-  }
+  CHECK(HasHandleScope());
+  total_size += HandleScope::SizeOf(kArm64PointerSize, ReferenceCount());
 
   // Plus return value spill area size
+  CHECK(SpillsReturnValue());
   total_size += SizeOfReturnValue();
 
   return RoundUp(total_size, kStackAlignment);
 }
 
-size_t Arm64JniCallingConvention::OutArgSize() {
-  // Same as X86_64
-  return RoundUp(NumberOfOutgoingStackArgs() * kFramePointerSize, kStackAlignment);
+size_t Arm64JniCallingConvention::OutArgSize() const {
+  // Count param args, including JNIEnv* and jclass*.
+  size_t all_args = NumberOfExtraArgumentsForJni() + NumArgs();
+  size_t num_fp_args = NumFloatOrDoubleArgs();
+  DCHECK_GE(all_args, num_fp_args);
+  size_t num_non_fp_args = all_args - num_fp_args;
+  // Account for FP arguments passed through v0-v7.
+  size_t num_stack_fp_args =
+      num_fp_args - std::min(kMaxFloatOrDoubleRegisterArguments, num_fp_args);
+  // Account for other (integer and pointer) arguments passed through GPR (x0-x7).
+  size_t num_stack_non_fp_args =
+      num_non_fp_args - std::min(kMaxIntLikeRegisterArguments, num_non_fp_args);
+  // The size of outgoing arguments.
+  size_t size = (num_stack_fp_args + num_stack_non_fp_args) * kFramePointerSize;
+
+  // @CriticalNative can use tail call as all managed callee saves are preserved by AAPCS64.
+  static_assert((kCoreCalleeSpillMask & ~kAapcs64CoreCalleeSpillMask) == 0u);
+  static_assert((kFpCalleeSpillMask & ~kAapcs64FpCalleeSpillMask) == 0u);
+
+  // For @CriticalNative, we can make a tail call if there are no stack args and
+  // we do not need to extend the result. Otherwise, add space for return PC.
+  if (is_critical_native_ && (size != 0u || RequiresSmallResultTypeExtension())) {
+    size += kFramePointerSize;  // We need to spill LR with the args.
+  }
+  return RoundUp(size, kStackAlignment);
 }
 
 ArrayRef<const ManagedRegister> Arm64JniCallingConvention::CalleeSaveRegisters() const {
-  // Same as X86_64
-  return ArrayRef<const ManagedRegister>(kCalleeSaveRegisters);
+  if (UNLIKELY(IsCriticalNative())) {
+    if (UseTailCall()) {
+      return ArrayRef<const ManagedRegister>();  // Do not spill anything.
+    } else {
+      // Spill LR with out args.
+      static_assert((kCoreCalleeSpillMask >> LR) == 1u);  // Contains LR as the highest bit.
+      constexpr size_t lr_index = POPCOUNT(kCoreCalleeSpillMask) - 1u;
+      static_assert(kCalleeSaveRegisters[lr_index].Equals(
+                        Arm64ManagedRegister::FromXRegister(LR)));
+      return ArrayRef<const ManagedRegister>(kCalleeSaveRegisters).SubArray(
+          /*pos=*/ lr_index, /*length=*/ 1u);
+    }
+  } else {
+    return ArrayRef<const ManagedRegister>(kCalleeSaveRegisters);
+  }
 }
 
 bool Arm64JniCallingConvention::IsCurrentParamInRegister() {
@@ -347,25 +399,28 @@
   size_t offset = displacement_.Int32Value() - OutArgSize() + (args_on_stack * kFramePointerSize);
   CHECK_LT(offset, OutArgSize());
   return FrameOffset(offset);
-  // TODO: Seems identical to X86_64 code.
 }
 
-size_t Arm64JniCallingConvention::NumberOfOutgoingStackArgs() {
-  // all arguments including JNI args
-  size_t all_args = NumArgs() + NumberOfExtraArgumentsForJni();
+ManagedRegister Arm64JniCallingConvention::HiddenArgumentRegister() const {
+  CHECK(IsCriticalNative());
+  // X15 is neither managed callee-save, nor argument register, nor scratch register.
+  // TODO: Change to static_assert; std::none_of should be constexpr since C++20.
+  DCHECK(std::none_of(kCalleeSaveRegisters,
+                      kCalleeSaveRegisters + std::size(kCalleeSaveRegisters),
+                      [](ManagedRegister callee_save) constexpr {
+                        return callee_save.Equals(Arm64ManagedRegister::FromXRegister(X15));
+                      }));
+  DCHECK(std::none_of(kXArgumentRegisters,
+                      kXArgumentRegisters + std::size(kXArgumentRegisters),
+                      [](XRegister reg) { return reg == X15; }));
+  DCHECK(!InterproceduralScratchRegister().Equals(Arm64ManagedRegister::FromXRegister(X15)));
+  return Arm64ManagedRegister::FromXRegister(X15);
+}
 
-  DCHECK_GE(all_args, NumFloatOrDoubleArgs());
-
-  size_t all_stack_args =
-      all_args
-      - std::min(kMaxFloatOrDoubleRegisterArguments,
-                 static_cast<size_t>(NumFloatOrDoubleArgs()))
-      - std::min(kMaxIntLikeRegisterArguments,
-                 static_cast<size_t>((all_args - NumFloatOrDoubleArgs())));
-
-  // TODO: Seems similar to X86_64 code except it doesn't count return pc.
-
-  return all_stack_args;
+// Whether to use tail call (used only for @CriticalNative).
+bool Arm64JniCallingConvention::UseTailCall() const {
+  CHECK(IsCriticalNative());
+  return OutArgSize() == 0u;
 }
 
 }  // namespace arm64
diff --git a/compiler/jni/quick/arm64/calling_convention_arm64.h b/compiler/jni/quick/arm64/calling_convention_arm64.h
index ed0ddeb..f4148c7 100644
--- a/compiler/jni/quick/arm64/calling_convention_arm64.h
+++ b/compiler/jni/quick/arm64/calling_convention_arm64.h
@@ -35,7 +35,7 @@
   ~Arm64ManagedRuntimeCallingConvention() override {}
   // Calling convention
   ManagedRegister ReturnRegister() override;
-  ManagedRegister InterproceduralScratchRegister() override;
+  ManagedRegister InterproceduralScratchRegister() const override;
   // Managed runtime calling convention
   ManagedRegister MethodRegister() override;
   bool IsCurrentParamInRegister() override;
@@ -60,10 +60,10 @@
   // Calling convention
   ManagedRegister ReturnRegister() override;
   ManagedRegister IntReturnRegister() override;
-  ManagedRegister InterproceduralScratchRegister() override;
+  ManagedRegister InterproceduralScratchRegister() const override;
   // JNI calling convention
-  size_t FrameSize() override;
-  size_t OutArgSize() override;
+  size_t FrameSize() const override;
+  size_t OutArgSize() const override;
   ArrayRef<const ManagedRegister> CalleeSaveRegisters() const override;
   ManagedRegister ReturnScratchRegister() const override;
   uint32_t CoreSpillMask() const override;
@@ -75,11 +75,14 @@
 
   // aarch64 calling convention leaves upper bits undefined.
   bool RequiresSmallResultTypeExtension() const override {
-    return true;
+    return HasSmallReturnType();
   }
 
- protected:
-  size_t NumberOfOutgoingStackArgs() override;
+  // Hidden argument register, used to pass the method pointer for @CriticalNative call.
+  ManagedRegister HiddenArgumentRegister() const override;
+
+  // Whether to use tail call (used only for @CriticalNative).
+  bool UseTailCall() const override;
 
  private:
   DISALLOW_COPY_AND_ASSIGN(Arm64JniCallingConvention);
diff --git a/compiler/jni/quick/calling_convention.cc b/compiler/jni/quick/calling_convention.cc
index f031b9b..15af248 100644
--- a/compiler/jni/quick/calling_convention.cc
+++ b/compiler/jni/quick/calling_convention.cc
@@ -347,21 +347,6 @@
   }
 }
 
-bool JniCallingConvention::HasHandleScope() const {
-  // Exclude HandleScope for @CriticalNative methods for optimization speed.
-  return is_critical_native_ == false;
-}
-
-bool JniCallingConvention::HasLocalReferenceSegmentState() const {
-  // Exclude local reference segment states for @CriticalNative methods for optimization speed.
-  return is_critical_native_ == false;
-}
-
-bool JniCallingConvention::HasJniEnv() const {
-  // Exclude "JNIEnv*" parameter for @CriticalNative methods.
-  return HasExtraArgumentsForJni();
-}
-
 bool JniCallingConvention::HasSelfClass() const {
   if (!IsStatic()) {
     // Virtual functions: There is never an implicit jclass parameter.
@@ -372,11 +357,6 @@
   }
 }
 
-bool JniCallingConvention::HasExtraArgumentsForJni() const {
-  // @CriticalNative jni implementations exclude both JNIEnv* and the jclass/jobject parameters.
-  return is_critical_native_ == false;
-}
-
 unsigned int JniCallingConvention::GetIteratorPositionWithinShorty() const {
   // We need to subtract out the extra JNI arguments if we want to use this iterator position
   // with the inherited CallingConvention member functions, which rely on scanning the shorty.
diff --git a/compiler/jni/quick/calling_convention.h b/compiler/jni/quick/calling_convention.h
index 77a5d59..3d4cefe 100644
--- a/compiler/jni/quick/calling_convention.h
+++ b/compiler/jni/quick/calling_convention.h
@@ -49,12 +49,7 @@
   // Register that holds result of this method invocation.
   virtual ManagedRegister ReturnRegister() = 0;
   // Register reserved for scratch usage during procedure calls.
-  virtual ManagedRegister InterproceduralScratchRegister() = 0;
-
-  // Offset of Method within the frame.
-  FrameOffset MethodStackOffset() {
-    return displacement_;
-  }
+  virtual ManagedRegister InterproceduralScratchRegister() const = 0;
 
   // Iterator interface
 
@@ -70,6 +65,14 @@
     itr_float_and_doubles_ = 0;
   }
 
+  FrameOffset GetDisplacement() const {
+    return displacement_;
+  }
+
+  PointerSize GetFramePointerSize() const {
+    return frame_pointer_size_;
+  }
+
   virtual ~CallingConvention() {}
 
  protected:
@@ -239,6 +242,11 @@
                                                                  const char* shorty,
                                                                  InstructionSet instruction_set);
 
+  // Offset of Method within the managed frame.
+  FrameOffset MethodStackOffset() {
+    return FrameOffset(0u);
+  }
+
   // Register that holds the incoming method argument
   virtual ManagedRegister MethodRegister() = 0;
 
@@ -296,10 +304,10 @@
   // Size of frame excluding space for outgoing args (its assumed Method* is
   // always at the bottom of a frame, but this doesn't work for outgoing
   // native args). Includes alignment.
-  virtual size_t FrameSize() = 0;
+  virtual size_t FrameSize() const = 0;
   // Size of outgoing arguments (stack portion), including alignment.
   // -- Arguments that are passed via registers are excluded from this size.
-  virtual size_t OutArgSize() = 0;
+  virtual size_t OutArgSize() const = 0;
   // Number of references in stack indirect reference table
   size_t ReferenceCount() const;
   // Location where the segment state of the local indirect reference table is saved
@@ -365,6 +373,32 @@
 
   virtual ~JniCallingConvention() {}
 
+  bool IsCriticalNative() const {
+    return is_critical_native_;
+  }
+
+  // Does the transition have a method pointer in the stack frame?
+  bool SpillsMethod() const {
+    // Exclude method pointer for @CriticalNative methods for optimization speed.
+    return !IsCriticalNative();
+  }
+
+  // Hidden argument register, used to pass the method pointer for @CriticalNative call.
+  virtual ManagedRegister HiddenArgumentRegister() const = 0;
+
+  // Whether to use tail call (used only for @CriticalNative).
+  virtual bool UseTailCall() const = 0;
+
+  // Whether the return type is small. Used for RequiresSmallResultTypeExtension()
+  // on architectures that require the sign/zero extension.
+  bool HasSmallReturnType() const {
+    Primitive::Type return_type = GetReturnType();
+    return return_type == Primitive::kPrimByte ||
+           return_type == Primitive::kPrimShort ||
+           return_type == Primitive::kPrimBoolean ||
+           return_type == Primitive::kPrimChar;
+  }
+
  protected:
   // Named iterator positions
   enum IteratorPos {
@@ -380,24 +414,41 @@
       : CallingConvention(is_static, is_synchronized, shorty, frame_pointer_size),
         is_critical_native_(is_critical_native) {}
 
-  // Number of stack slots for outgoing arguments, above which the handle scope is
-  // located
-  virtual size_t NumberOfOutgoingStackArgs() = 0;
-
  protected:
   size_t NumberOfExtraArgumentsForJni() const;
 
   // Does the transition have a StackHandleScope?
-  bool HasHandleScope() const;
+  bool HasHandleScope() const {
+    // Exclude HandleScope for @CriticalNative methods for optimization speed.
+    return !IsCriticalNative();
+  }
+
   // Does the transition have a local reference segment state?
-  bool HasLocalReferenceSegmentState() const;
-  // Has a JNIEnv* parameter implicitly?
-  bool HasJniEnv() const;
-  // Has a 'jclass' parameter implicitly?
-  bool HasSelfClass() const;
+  bool HasLocalReferenceSegmentState() const {
+    // Exclude local reference segment states for @CriticalNative methods for optimization speed.
+    return !IsCriticalNative();
+  }
+
+  // Does the transition back spill the return value in the stack frame?
+  bool SpillsReturnValue() const {
+    // Exclude return value for @CriticalNative methods for optimization speed.
+    return !IsCriticalNative();
+  }
 
   // Are there extra JNI arguments (JNIEnv* and maybe jclass)?
-  bool HasExtraArgumentsForJni() const;
+  bool HasExtraArgumentsForJni() const {
+    // @CriticalNative jni implementations exclude both JNIEnv* and the jclass/jobject parameters.
+    return !IsCriticalNative();
+  }
+
+  // Has a JNIEnv* parameter implicitly?
+  bool HasJniEnv() const {
+    // Exclude "JNIEnv*" parameter for @CriticalNative methods.
+    return HasExtraArgumentsForJni();
+  }
+
+  // Has a 'jclass' parameter implicitly?
+  bool HasSelfClass() const;
 
   // Returns the position of itr_args_, fixed up by removing the offset of extra JNI arguments.
   unsigned int GetIteratorPositionWithinShorty() const;
diff --git a/compiler/jni/quick/jni_compiler.cc b/compiler/jni/quick/jni_compiler.cc
index 7054078..363e646 100644
--- a/compiler/jni/quick/jni_compiler.cc
+++ b/compiler/jni/quick/jni_compiler.cc
@@ -54,8 +54,7 @@
 template <PointerSize kPointerSize>
 static void CopyParameter(JNIMacroAssembler<kPointerSize>* jni_asm,
                           ManagedRuntimeCallingConvention* mr_conv,
-                          JniCallingConvention* jni_conv,
-                          size_t frame_size, size_t out_arg_size);
+                          JniCallingConvention* jni_conv);
 template <PointerSize kPointerSize>
 static void SetNativeParameter(JNIMacroAssembler<kPointerSize>* jni_asm,
                                JniCallingConvention* jni_conv,
@@ -131,7 +130,7 @@
   const bool is_fast_native = (access_flags & kAccFastNative) != 0u;
 
   // i.e. if the method was annotated with @CriticalNative
-  bool is_critical_native = (access_flags & kAccCriticalNative) != 0u;
+  const bool is_critical_native = (access_flags & kAccCriticalNative) != 0u;
 
   VLOG(jni) << "JniCompile: Method :: "
               << dex_file.PrettyMethod(method_idx, /* with signature */ true)
@@ -220,17 +219,22 @@
   jni_asm->SetEmitRunTimeChecksInDebugMode(compiler_options.EmitRunTimeChecksInDebugMode());
 
   // 1. Build the frame saving all callee saves, Method*, and PC return address.
-  const size_t frame_size(main_jni_conv->FrameSize());  // Excludes outgoing args.
+  //    For @CriticalNative, this includes space for out args, otherwise just the managed frame.
+  const size_t managed_frame_size = main_jni_conv->FrameSize();
+  const size_t main_out_arg_size = main_jni_conv->OutArgSize();
+  size_t current_frame_size = is_critical_native ? main_out_arg_size : managed_frame_size;
+  ManagedRegister method_register =
+      is_critical_native ? ManagedRegister::NoRegister() : mr_conv->MethodRegister();
   ArrayRef<const ManagedRegister> callee_save_regs = main_jni_conv->CalleeSaveRegisters();
-  __ BuildFrame(frame_size, mr_conv->MethodRegister(), callee_save_regs, mr_conv->EntrySpills());
-  DCHECK_EQ(jni_asm->cfi().GetCurrentCFAOffset(), static_cast<int>(frame_size));
+  __ BuildFrame(current_frame_size, method_register, callee_save_regs, mr_conv->EntrySpills());
+  DCHECK_EQ(jni_asm->cfi().GetCurrentCFAOffset(), static_cast<int>(current_frame_size));
 
   if (LIKELY(!is_critical_native)) {
     // NOTE: @CriticalNative methods don't have a HandleScope
     //       because they can't have any reference parameters or return values.
 
     // 2. Set up the HandleScope
-    mr_conv->ResetIterator(FrameOffset(frame_size));
+    mr_conv->ResetIterator(FrameOffset(current_frame_size));
     main_jni_conv->ResetIterator(FrameOffset(0));
     __ StoreImmediateToFrame(main_jni_conv->HandleScopeNumRefsOffset(),
                              main_jni_conv->ReferenceCount(),
@@ -249,7 +253,7 @@
     if (is_static) {
       FrameOffset handle_scope_offset = main_jni_conv->CurrentParamHandleScopeEntryOffset();
       // Check handle scope offset is within frame
-      CHECK_LT(handle_scope_offset.Uint32Value(), frame_size);
+      CHECK_LT(handle_scope_offset.Uint32Value(), current_frame_size);
       // Note this LoadRef() doesn't need heap unpoisoning since it's from the ArtMethod.
       // Note this LoadRef() does not include read barrier. It will be handled below.
       //
@@ -272,7 +276,7 @@
         // must be null.
         FrameOffset handle_scope_offset = main_jni_conv->CurrentParamHandleScopeEntryOffset();
         // Check handle scope offset is within frame and doesn't run into the saved segment state.
-        CHECK_LT(handle_scope_offset.Uint32Value(), frame_size);
+        CHECK_LT(handle_scope_offset.Uint32Value(), current_frame_size);
         CHECK_NE(handle_scope_offset.Uint32Value(),
                  main_jni_conv->SavedLocalReferenceCookieOffset().Uint32Value());
         bool input_in_reg = mr_conv->IsCurrentParamInRegister();
@@ -304,9 +308,17 @@
   }  // if (!is_critical_native)
 
   // 5. Move frame down to allow space for out going args.
-  const size_t main_out_arg_size = main_jni_conv->OutArgSize();
   size_t current_out_arg_size = main_out_arg_size;
-  __ IncreaseFrameSize(main_out_arg_size);
+  if (UNLIKELY(is_critical_native)) {
+    DCHECK_EQ(main_out_arg_size, current_frame_size);
+    // Move the method pointer to the hidden argument register.
+    __ Move(main_jni_conv->HiddenArgumentRegister(),
+            mr_conv->MethodRegister(),
+            static_cast<size_t>(main_jni_conv->GetFramePointerSize()));
+  } else {
+    __ IncreaseFrameSize(main_out_arg_size);
+    current_frame_size += main_out_arg_size;
+  }
 
   // Call the read barrier for the declaring class loaded from the method for a static call.
   // Skip this for @CriticalNative because we didn't build a HandleScope to begin with.
@@ -376,6 +388,8 @@
   //    abuse the JNI calling convention here, that is guaranteed to support passing 2 pointer
   //    arguments.
   FrameOffset locked_object_handle_scope_offset(0xBEEFDEAD);
+  FrameOffset saved_cookie_offset(
+      FrameOffset(0xDEADBEEFu));  // @CriticalNative - use obviously bad value for debugging
   if (LIKELY(!is_critical_native)) {
     // Skip this for @CriticalNative methods. They do not call JniMethodStart.
     ThreadOffset<kPointerSize> jni_start(
@@ -414,12 +428,8 @@
     if (is_synchronized) {  // Check for exceptions from monitor enter.
       __ ExceptionPoll(main_jni_conv->InterproceduralScratchRegister(), main_out_arg_size);
     }
-  }
 
-  // Store into stack_frame[saved_cookie_offset] the return value of JniMethodStart.
-  FrameOffset saved_cookie_offset(
-      FrameOffset(0xDEADBEEFu));  // @CriticalNative - use obviously bad value for debugging
-  if (LIKELY(!is_critical_native)) {
+    // Store into stack_frame[saved_cookie_offset] the return value of JniMethodStart.
     saved_cookie_offset = main_jni_conv->SavedLocalReferenceCookieOffset();
     __ Store(saved_cookie_offset, main_jni_conv->IntReturnRegister(), 4 /* sizeof cookie */);
   }
@@ -430,7 +440,7 @@
   //    null (which must be encoded as null).
   //    Note: we do this prior to materializing the JNIEnv* and static's jclass to
   //    give as many free registers for the shuffle as possible.
-  mr_conv->ResetIterator(FrameOffset(frame_size + main_out_arg_size));
+  mr_conv->ResetIterator(FrameOffset(current_frame_size));
   uint32_t args_count = 0;
   while (mr_conv->HasNext()) {
     args_count++;
@@ -440,8 +450,12 @@
   // Do a backward pass over arguments, so that the generated code will be "mov
   // R2, R3; mov R1, R2" instead of "mov R1, R2; mov R2, R3."
   // TODO: A reverse iterator to improve readability.
+  // TODO: This is currently useless as all archs spill args when building the frame.
+  //       To avoid the full spilling, we would have to do one pass before the BuildFrame()
+  //       to determine which arg registers are clobbered before they are needed.
+  // TODO: For @CriticalNative, do a forward pass because there are no JNIEnv* and jclass* args.
   for (uint32_t i = 0; i < args_count; ++i) {
-    mr_conv->ResetIterator(FrameOffset(frame_size + main_out_arg_size));
+    mr_conv->ResetIterator(FrameOffset(current_frame_size));
     main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
 
     // Skip the extra JNI parameters for now.
@@ -456,11 +470,11 @@
       mr_conv->Next();
       main_jni_conv->Next();
     }
-    CopyParameter(jni_asm.get(), mr_conv.get(), main_jni_conv.get(), frame_size, main_out_arg_size);
+    CopyParameter(jni_asm.get(), mr_conv.get(), main_jni_conv.get());
   }
   if (is_static && !is_critical_native) {
     // Create argument for Class
-    mr_conv->ResetIterator(FrameOffset(frame_size + main_out_arg_size));
+    mr_conv->ResetIterator(FrameOffset(current_frame_size));
     main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
     main_jni_conv->Next();  // Skip JNIEnv*
     FrameOffset handle_scope_offset = main_jni_conv->CurrentParamHandleScopeEntryOffset();
@@ -496,20 +510,33 @@
   // 9. Plant call to native code associated with method.
   MemberOffset jni_entrypoint_offset =
       ArtMethod::EntryPointFromJniOffset(InstructionSetPointerSize(instruction_set));
-  // FIXME: Not sure if MethodStackOffset will work here. What does it even do?
-  __ Call(main_jni_conv->MethodStackOffset(),
-          jni_entrypoint_offset,
-          // XX: Why not the jni conv scratch register?
-          mr_conv->InterproceduralScratchRegister());
+  if (UNLIKELY(is_critical_native)) {
+    if (main_jni_conv->UseTailCall()) {
+      __ Jump(main_jni_conv->HiddenArgumentRegister(),
+              jni_entrypoint_offset,
+              main_jni_conv->InterproceduralScratchRegister());
+    } else {
+      __ Call(main_jni_conv->HiddenArgumentRegister(),
+              jni_entrypoint_offset,
+              main_jni_conv->InterproceduralScratchRegister());
+    }
+  } else {
+    __ Call(FrameOffset(main_out_arg_size + mr_conv->MethodStackOffset().SizeValue()),
+            jni_entrypoint_offset,
+            main_jni_conv->InterproceduralScratchRegister());
+  }
 
   // 10. Fix differences in result widths.
   if (main_jni_conv->RequiresSmallResultTypeExtension()) {
+    DCHECK(main_jni_conv->HasSmallReturnType());
+    CHECK(!is_critical_native || !main_jni_conv->UseTailCall());
     if (main_jni_conv->GetReturnType() == Primitive::kPrimByte ||
         main_jni_conv->GetReturnType() == Primitive::kPrimShort) {
       __ SignExtend(main_jni_conv->ReturnRegister(),
                     Primitive::ComponentSize(main_jni_conv->GetReturnType()));
-    } else if (main_jni_conv->GetReturnType() == Primitive::kPrimBoolean ||
-               main_jni_conv->GetReturnType() == Primitive::kPrimChar) {
+    } else {
+      CHECK(main_jni_conv->GetReturnType() == Primitive::kPrimBoolean ||
+            main_jni_conv->GetReturnType() == Primitive::kPrimChar);
       __ ZeroExtend(main_jni_conv->ReturnRegister(),
                     Primitive::ComponentSize(main_jni_conv->GetReturnType()));
     }
@@ -531,7 +558,7 @@
         // TODO: refactor this into the JniCallingConvention code
         // as a return value alignment requirement.
       }
-      CHECK_LT(return_save_location.Uint32Value(), frame_size + main_out_arg_size);
+      CHECK_LT(return_save_location.Uint32Value(), current_frame_size);
       __ Store(return_save_location,
                main_jni_conv->ReturnRegister(),
                main_jni_conv->SizeOfReturnValue());
@@ -545,6 +572,7 @@
       // If they differ, only then do we have to do anything about it.
       // Otherwise the return value is already in the right place when we return.
       if (!jni_return_reg.Equals(mr_return_reg)) {
+        CHECK(!main_jni_conv->UseTailCall());
         // This is typically only necessary on ARM32 due to native being softfloat
         // while managed is hardfloat.
         // -- For example VMOV {r0, r1} -> D0; VMOV r0 -> S0.
@@ -557,23 +585,21 @@
     }
   }
 
-  // Increase frame size for out args if needed by the end_jni_conv.
-  const size_t end_out_arg_size = end_jni_conv->OutArgSize();
-  if (end_out_arg_size > current_out_arg_size) {
-    size_t out_arg_size_diff = end_out_arg_size - current_out_arg_size;
-    current_out_arg_size = end_out_arg_size;
-    // TODO: This is redundant for @CriticalNative but we need to
-    // conditionally do __DecreaseFrameSize below.
-    __ IncreaseFrameSize(out_arg_size_diff);
-    saved_cookie_offset = FrameOffset(saved_cookie_offset.SizeValue() + out_arg_size_diff);
-    locked_object_handle_scope_offset =
-        FrameOffset(locked_object_handle_scope_offset.SizeValue() + out_arg_size_diff);
-    return_save_location = FrameOffset(return_save_location.SizeValue() + out_arg_size_diff);
-  }
-  //     thread.
-  end_jni_conv->ResetIterator(FrameOffset(end_out_arg_size));
-
   if (LIKELY(!is_critical_native)) {
+    // Increase frame size for out args if needed by the end_jni_conv.
+    const size_t end_out_arg_size = end_jni_conv->OutArgSize();
+    if (end_out_arg_size > current_out_arg_size) {
+      size_t out_arg_size_diff = end_out_arg_size - current_out_arg_size;
+      current_out_arg_size = end_out_arg_size;
+      __ IncreaseFrameSize(out_arg_size_diff);
+      current_frame_size += out_arg_size_diff;
+      saved_cookie_offset = FrameOffset(saved_cookie_offset.SizeValue() + out_arg_size_diff);
+      locked_object_handle_scope_offset =
+          FrameOffset(locked_object_handle_scope_offset.SizeValue() + out_arg_size_diff);
+      return_save_location = FrameOffset(return_save_location.SizeValue() + out_arg_size_diff);
+    }
+    end_jni_conv->ResetIterator(FrameOffset(end_out_arg_size));
+
     // 12. Call JniMethodEnd
     ThreadOffset<kPointerSize> jni_end(
         GetJniEntrypointThreadOffset<kPointerSize>(JniEntrypoint::kEnd,
@@ -629,19 +655,28 @@
   }  // if (!is_critical_native)
 
   // 14. Move frame up now we're done with the out arg space.
-  __ DecreaseFrameSize(current_out_arg_size);
+  //     For @CriticalNative, out args are removed together with the frame in RemoveFrame().
+  if (LIKELY(!is_critical_native)) {
+    __ DecreaseFrameSize(current_out_arg_size);
+    current_frame_size -= current_out_arg_size;
+  }
 
   // 15. Process pending exceptions from JNI call or monitor exit.
-  __ ExceptionPoll(main_jni_conv->InterproceduralScratchRegister(), 0 /* stack_adjust= */);
+  //     @CriticalNative methods do not need exception poll in the stub.
+  if (LIKELY(!is_critical_native)) {
+    __ ExceptionPoll(main_jni_conv->InterproceduralScratchRegister(), 0 /* stack_adjust= */);
+  }
 
   // 16. Remove activation - need to restore callee save registers since the GC may have changed
   //     them.
-  DCHECK_EQ(jni_asm->cfi().GetCurrentCFAOffset(), static_cast<int>(frame_size));
-  // We expect the compiled method to possibly be suspended during its
-  // execution, except in the case of a CriticalNative method.
-  bool may_suspend = !is_critical_native;
-  __ RemoveFrame(frame_size, callee_save_regs, may_suspend);
-  DCHECK_EQ(jni_asm->cfi().GetCurrentCFAOffset(), static_cast<int>(frame_size));
+  DCHECK_EQ(jni_asm->cfi().GetCurrentCFAOffset(), static_cast<int>(current_frame_size));
+  if (LIKELY(!is_critical_native) || !main_jni_conv->UseTailCall()) {
+    // We expect the compiled method to possibly be suspended during its
+    // execution, except in the case of a CriticalNative method.
+    bool may_suspend = !is_critical_native;
+    __ RemoveFrame(current_frame_size, callee_save_regs, may_suspend);
+    DCHECK_EQ(jni_asm->cfi().GetCurrentCFAOffset(), static_cast<int>(current_frame_size));
+  }
 
   // 17. Finalize code generation
   __ FinalizeCode();
@@ -652,7 +687,7 @@
 
   return JniCompiledMethod(instruction_set,
                            std::move(managed_code),
-                           frame_size,
+                           managed_frame_size,
                            main_jni_conv->CoreSpillMask(),
                            main_jni_conv->FpSpillMask(),
                            ArrayRef<const uint8_t>(*jni_asm->cfi().data()));
@@ -662,9 +697,7 @@
 template <PointerSize kPointerSize>
 static void CopyParameter(JNIMacroAssembler<kPointerSize>* jni_asm,
                           ManagedRuntimeCallingConvention* mr_conv,
-                          JniCallingConvention* jni_conv,
-                          size_t frame_size,
-                          size_t out_arg_size) {
+                          JniCallingConvention* jni_conv) {
   bool input_in_reg = mr_conv->IsCurrentParamInRegister();
   bool output_in_reg = jni_conv->IsCurrentParamInRegister();
   FrameOffset handle_scope_offset(0);
@@ -686,7 +719,7 @@
     // as with regular references).
     handle_scope_offset = jni_conv->CurrentParamHandleScopeEntryOffset();
     // Check handle scope offset is within frame.
-    CHECK_LT(handle_scope_offset.Uint32Value(), (frame_size + out_arg_size));
+    CHECK_LT(handle_scope_offset.Uint32Value(), mr_conv->GetDisplacement().Uint32Value());
   }
   if (input_in_reg && output_in_reg) {
     ManagedRegister in_reg = mr_conv->CurrentParamRegister();
@@ -716,7 +749,7 @@
     FrameOffset in_off = mr_conv->CurrentParamStackOffset();
     ManagedRegister out_reg = jni_conv->CurrentParamRegister();
     // Check that incoming stack arguments are above the current stack frame.
-    CHECK_GT(in_off.Uint32Value(), frame_size);
+    CHECK_GT(in_off.Uint32Value(), mr_conv->GetDisplacement().Uint32Value());
     if (ref_param) {
       __ CreateHandleScopeEntry(out_reg, handle_scope_offset, ManagedRegister::NoRegister(), null_allowed);
     } else {
@@ -728,8 +761,8 @@
     CHECK(input_in_reg && !output_in_reg);
     ManagedRegister in_reg = mr_conv->CurrentParamRegister();
     FrameOffset out_off = jni_conv->CurrentParamStackOffset();
-    // Check outgoing argument is within frame
-    CHECK_LT(out_off.Uint32Value(), frame_size);
+    // Check outgoing argument is within frame part dedicated to out args.
+    CHECK_LT(out_off.Uint32Value(), jni_conv->GetDisplacement().Uint32Value());
     if (ref_param) {
       // TODO: recycle value in in_reg rather than reload from handle scope
       __ CreateHandleScopeEntry(out_off, handle_scope_offset, mr_conv->InterproceduralScratchRegister(),
diff --git a/compiler/jni/quick/mips/calling_convention_mips.cc b/compiler/jni/quick/mips/calling_convention_mips.cc
index c69854d..cbb692e 100644
--- a/compiler/jni/quick/mips/calling_convention_mips.cc
+++ b/compiler/jni/quick/mips/calling_convention_mips.cc
@@ -75,11 +75,11 @@
 static constexpr uint32_t kFpCalleeSpillMask = 0u;
 
 // Calling convention
-ManagedRegister MipsManagedRuntimeCallingConvention::InterproceduralScratchRegister() {
+ManagedRegister MipsManagedRuntimeCallingConvention::InterproceduralScratchRegister() const {
   return MipsManagedRegister::FromCoreRegister(T9);
 }
 
-ManagedRegister MipsJniCallingConvention::InterproceduralScratchRegister() {
+ManagedRegister MipsJniCallingConvention::InterproceduralScratchRegister() const {
   return MipsManagedRegister::FromCoreRegister(T9);
 }
 
@@ -334,7 +334,7 @@
   return MipsManagedRegister::FromCoreRegister(AT);
 }
 
-size_t MipsJniCallingConvention::FrameSize() {
+size_t MipsJniCallingConvention::FrameSize() const {
   // ArtMethod*, RA and callee save area size, local reference segment state.
   const size_t method_ptr_size = static_cast<size_t>(kMipsPointerSize);
   const size_t ra_return_addr_size = kFramePointerSize;
@@ -362,7 +362,7 @@
   return RoundUp(total_size, kStackAlignment);
 }
 
-size_t MipsJniCallingConvention::OutArgSize() {
+size_t MipsJniCallingConvention::OutArgSize() const {
   // Argument Passing (3-17):
   //   "Despite the fact that some or all of the arguments to a function are passed in registers,
   // always allocate space on the stack for all arguments. This stack space should be a structure
@@ -371,8 +371,19 @@
   // for arguments are called the home locations."
   //
   // Allocate 16 bytes for home locations + space needed for stack arguments.
+
+  size_t static_args = HasSelfClass() ? 1 : 0;            // Count jclass.
+  // Regular argument parameters and this.
+  size_t param_args = NumArgs() + NumLongOrDoubleArgs();  // Twice count 8-byte args.
+  // Count JNIEnv* less arguments in registers.
+  size_t internal_args = (HasJniEnv() ? 1 : 0);
+  size_t total_args = static_args + param_args + internal_args;
+
+  size_t stack_args =
+      total_args - std::min(kMaxIntLikeRegisterArguments, static_cast<size_t>(total_args));
+
   return RoundUp(
-      (kMaxIntLikeRegisterArguments + NumberOfOutgoingStackArgs()) * kFramePointerSize + padding_,
+      (kMaxIntLikeRegisterArguments + stack_args) * kFramePointerSize + padding_,
       kStackAlignment);
 }
 
@@ -446,15 +457,14 @@
   return FrameOffset(offset);
 }
 
-size_t MipsJniCallingConvention::NumberOfOutgoingStackArgs() {
-  size_t static_args = HasSelfClass() ? 1 : 0;            // Count jclass.
-  // Regular argument parameters and this.
-  size_t param_args = NumArgs() + NumLongOrDoubleArgs();  // Twice count 8-byte args.
-  // Count JNIEnv* less arguments in registers.
-  size_t internal_args = (HasJniEnv() ? 1 : 0);
-  size_t total_args = static_args + param_args + internal_args;
+ManagedRegister MipsJniCallingConvention::HiddenArgumentRegister() const {
+  UNIMPLEMENTED(FATAL);
+  UNREACHABLE();
+}
 
-  return total_args - std::min(kMaxIntLikeRegisterArguments, static_cast<size_t>(total_args));
+bool MipsJniCallingConvention::UseTailCall() const {
+  UNIMPLEMENTED(FATAL);
+  UNREACHABLE();
 }
 
 }  // namespace mips
diff --git a/compiler/jni/quick/mips/calling_convention_mips.h b/compiler/jni/quick/mips/calling_convention_mips.h
index 8b395a0..af27dc8 100644
--- a/compiler/jni/quick/mips/calling_convention_mips.h
+++ b/compiler/jni/quick/mips/calling_convention_mips.h
@@ -37,7 +37,7 @@
   ~MipsManagedRuntimeCallingConvention() override {}
   // Calling convention
   ManagedRegister ReturnRegister() override;
-  ManagedRegister InterproceduralScratchRegister() override;
+  ManagedRegister InterproceduralScratchRegister() const override;
   // Managed runtime calling convention
   ManagedRegister MethodRegister() override;
   bool IsCurrentParamInRegister() override;
@@ -62,11 +62,11 @@
   // Calling convention
   ManagedRegister ReturnRegister() override;
   ManagedRegister IntReturnRegister() override;
-  ManagedRegister InterproceduralScratchRegister() override;
+  ManagedRegister InterproceduralScratchRegister() const override;
   // JNI calling convention
   void Next() override;  // Override default behavior for o32.
-  size_t FrameSize() override;
-  size_t OutArgSize() override;
+  size_t FrameSize() const override;
+  size_t OutArgSize() const override;
   ArrayRef<const ManagedRegister> CalleeSaveRegisters() const override;
   ManagedRegister ReturnScratchRegister() const override;
   uint32_t CoreSpillMask() const override;
@@ -81,8 +81,11 @@
     return false;
   }
 
- protected:
-  size_t NumberOfOutgoingStackArgs() override;
+  // Hidden argument register, used to pass the method pointer for @CriticalNative call.
+  ManagedRegister HiddenArgumentRegister() const override;
+
+  // Whether to use tail call (used only for @CriticalNative).
+  bool UseTailCall() const override;
 
  private:
   // Padding to ensure longs and doubles are not split in o32.
diff --git a/compiler/jni/quick/mips64/calling_convention_mips64.cc b/compiler/jni/quick/mips64/calling_convention_mips64.cc
index 2c297b3..e65ad83 100644
--- a/compiler/jni/quick/mips64/calling_convention_mips64.cc
+++ b/compiler/jni/quick/mips64/calling_convention_mips64.cc
@@ -64,11 +64,11 @@
 static constexpr uint32_t kFpCalleeSpillMask = 0u;
 
 // Calling convention
-ManagedRegister Mips64ManagedRuntimeCallingConvention::InterproceduralScratchRegister() {
+ManagedRegister Mips64ManagedRuntimeCallingConvention::InterproceduralScratchRegister() const {
   return Mips64ManagedRegister::FromGpuRegister(T9);
 }
 
-ManagedRegister Mips64JniCallingConvention::InterproceduralScratchRegister() {
+ManagedRegister Mips64JniCallingConvention::InterproceduralScratchRegister() const {
   return Mips64ManagedRegister::FromGpuRegister(T9);
 }
 
@@ -178,7 +178,7 @@
   return Mips64ManagedRegister::FromGpuRegister(AT);
 }
 
-size_t Mips64JniCallingConvention::FrameSize() {
+size_t Mips64JniCallingConvention::FrameSize() const {
   // ArtMethod*, RA and callee save area size, local reference segment state.
   size_t method_ptr_size = static_cast<size_t>(kFramePointerSize);
   size_t ra_and_callee_save_area_size = (CalleeSaveRegisters().size() + 1) * kFramePointerSize;
@@ -203,8 +203,14 @@
   return RoundUp(total_size, kStackAlignment);
 }
 
-size_t Mips64JniCallingConvention::OutArgSize() {
-  return RoundUp(NumberOfOutgoingStackArgs() * kFramePointerSize, kStackAlignment);
+size_t Mips64JniCallingConvention::OutArgSize() const {
+  // all arguments including JNI args
+  size_t all_args = NumArgs() + NumberOfExtraArgumentsForJni();
+
+  // Nothing on the stack unless there are more than 8 arguments
+  size_t stack_args = (all_args > kMaxRegisterArguments) ? all_args - kMaxRegisterArguments : 0;
+
+  return RoundUp(stack_args * kFramePointerSize, kStackAlignment);
 }
 
 ArrayRef<const ManagedRegister> Mips64JniCallingConvention::CalleeSaveRegisters() const {
@@ -236,12 +242,15 @@
   return FrameOffset(offset);
 }
 
-size_t Mips64JniCallingConvention::NumberOfOutgoingStackArgs() {
-  // all arguments including JNI args
-  size_t all_args = NumArgs() + NumberOfExtraArgumentsForJni();
-
-  // Nothing on the stack unless there are more than 8 arguments
-  return (all_args > kMaxRegisterArguments) ? all_args - kMaxRegisterArguments : 0;
+ManagedRegister Mips64JniCallingConvention::HiddenArgumentRegister() const {
+  UNIMPLEMENTED(FATAL);
+  UNREACHABLE();
 }
+
+bool Mips64JniCallingConvention::UseTailCall() const {
+  UNIMPLEMENTED(FATAL);
+  UNREACHABLE();
+}
+
 }  // namespace mips64
 }  // namespace art
diff --git a/compiler/jni/quick/mips64/calling_convention_mips64.h b/compiler/jni/quick/mips64/calling_convention_mips64.h
index d87f73a..e9a42a4 100644
--- a/compiler/jni/quick/mips64/calling_convention_mips64.h
+++ b/compiler/jni/quick/mips64/calling_convention_mips64.h
@@ -37,7 +37,7 @@
   ~Mips64ManagedRuntimeCallingConvention() override {}
   // Calling convention
   ManagedRegister ReturnRegister() override;
-  ManagedRegister InterproceduralScratchRegister() override;
+  ManagedRegister InterproceduralScratchRegister() const override;
   // Managed runtime calling convention
   ManagedRegister MethodRegister() override;
   bool IsCurrentParamInRegister() override;
@@ -62,10 +62,10 @@
   // Calling convention
   ManagedRegister ReturnRegister() override;
   ManagedRegister IntReturnRegister() override;
-  ManagedRegister InterproceduralScratchRegister() override;
+  ManagedRegister InterproceduralScratchRegister() const override;
   // JNI calling convention
-  size_t FrameSize() override;
-  size_t OutArgSize() override;
+  size_t FrameSize() const override;
+  size_t OutArgSize() const override;
   ArrayRef<const ManagedRegister> CalleeSaveRegisters() const override;
   ManagedRegister ReturnScratchRegister() const override;
   uint32_t CoreSpillMask() const override;
@@ -80,8 +80,11 @@
     return false;
   }
 
- protected:
-  size_t NumberOfOutgoingStackArgs() override;
+  // Hidden argument register, used to pass the method pointer for @CriticalNative call.
+  ManagedRegister HiddenArgumentRegister() const override;
+
+  // Whether to use tail call (used only for @CriticalNative).
+  bool UseTailCall() const override;
 
  private:
   DISALLOW_COPY_AND_ASSIGN(Mips64JniCallingConvention);
diff --git a/compiler/jni/quick/x86/calling_convention_x86.cc b/compiler/jni/quick/x86/calling_convention_x86.cc
index 1f255e2..d12eb9b 100644
--- a/compiler/jni/quick/x86/calling_convention_x86.cc
+++ b/compiler/jni/quick/x86/calling_convention_x86.cc
@@ -26,7 +26,6 @@
 namespace x86 {
 
 static_assert(kX86PointerSize == PointerSize::k32, "Unexpected x86 pointer size");
-static_assert(kStackAlignment >= 16u, "IA-32 cdecl requires at least 16 byte stack alignment");
 
 static constexpr ManagedRegister kCalleeSaveRegisters[] = {
     // Core registers.
@@ -36,10 +35,12 @@
     // No hard float callee saves.
 };
 
-static constexpr uint32_t CalculateCoreCalleeSpillMask() {
+template <size_t size>
+static constexpr uint32_t CalculateCoreCalleeSpillMask(
+    const ManagedRegister (&callee_saves)[size]) {
   // The spilled PC gets a special marker.
   uint32_t result = 1 << kNumberOfCpuRegisters;
-  for (auto&& r : kCalleeSaveRegisters) {
+  for (auto&& r : callee_saves) {
     if (r.AsX86().IsCpuRegister()) {
       result |= (1 << r.AsX86().AsCpuRegister());
     }
@@ -47,16 +48,32 @@
   return result;
 }
 
-static constexpr uint32_t kCoreCalleeSpillMask = CalculateCoreCalleeSpillMask();
+static constexpr uint32_t kCoreCalleeSpillMask = CalculateCoreCalleeSpillMask(kCalleeSaveRegisters);
 static constexpr uint32_t kFpCalleeSpillMask = 0u;
 
+static constexpr size_t kNativeStackAlignment = 16;  // IA-32 cdecl requires 16 byte alignment.
+static_assert(kNativeStackAlignment == kStackAlignment);
+
+static constexpr ManagedRegister kNativeCalleeSaveRegisters[] = {
+    // Core registers.
+    X86ManagedRegister::FromCpuRegister(EBX),
+    X86ManagedRegister::FromCpuRegister(EBP),
+    X86ManagedRegister::FromCpuRegister(ESI),
+    X86ManagedRegister::FromCpuRegister(EDI),
+    // No hard float callee saves.
+};
+
+static constexpr uint32_t kNativeCoreCalleeSpillMask =
+    CalculateCoreCalleeSpillMask(kNativeCalleeSaveRegisters);
+static constexpr uint32_t kNativeFpCalleeSpillMask = 0u;
+
 // Calling convention
 
-ManagedRegister X86ManagedRuntimeCallingConvention::InterproceduralScratchRegister() {
+ManagedRegister X86ManagedRuntimeCallingConvention::InterproceduralScratchRegister() const {
   return X86ManagedRegister::FromCpuRegister(ECX);
 }
 
-ManagedRegister X86JniCallingConvention::InterproceduralScratchRegister() {
+ManagedRegister X86JniCallingConvention::InterproceduralScratchRegister() const {
   return X86ManagedRegister::FromCpuRegister(ECX);
 }
 
@@ -205,47 +222,81 @@
 }
 
 uint32_t X86JniCallingConvention::CoreSpillMask() const {
-  return kCoreCalleeSpillMask;
+  return is_critical_native_ ? 0u : kCoreCalleeSpillMask;
 }
 
 uint32_t X86JniCallingConvention::FpSpillMask() const {
-  return kFpCalleeSpillMask;
+  return is_critical_native_ ? 0u : kFpCalleeSpillMask;
 }
 
-size_t X86JniCallingConvention::FrameSize() {
+size_t X86JniCallingConvention::FrameSize() const {
+  if (is_critical_native_) {
+    CHECK(!SpillsMethod());
+    CHECK(!HasLocalReferenceSegmentState());
+    CHECK(!HasHandleScope());
+    CHECK(!SpillsReturnValue());
+    return 0u;  // There is no managed frame for @CriticalNative.
+  }
+
   // Method*, PC return address and callee save area size, local reference segment state
+  CHECK(SpillsMethod());
   const size_t method_ptr_size = static_cast<size_t>(kX86PointerSize);
   const size_t pc_return_addr_size = kFramePointerSize;
   const size_t callee_save_area_size = CalleeSaveRegisters().size() * kFramePointerSize;
-  size_t frame_data_size = method_ptr_size + pc_return_addr_size + callee_save_area_size;
+  size_t total_size = method_ptr_size + pc_return_addr_size + callee_save_area_size;
 
-  if (LIKELY(HasLocalReferenceSegmentState())) {                     // local ref. segment state
-    // Local reference segment state is sometimes excluded.
-    frame_data_size += kFramePointerSize;
-  }
+  CHECK(HasLocalReferenceSegmentState());
+  total_size += kFramePointerSize;
 
-  // References plus link_ (pointer) and number_of_references_ (uint32_t) for HandleScope header
-  const size_t handle_scope_size = HandleScope::SizeOf(kX86PointerSize, ReferenceCount());
-
-  size_t total_size = frame_data_size;
-  if (LIKELY(HasHandleScope())) {
-    // HandleScope is sometimes excluded.
-    total_size += handle_scope_size;                                 // handle scope size
-  }
+  CHECK(HasHandleScope());
+  total_size += HandleScope::SizeOf(kX86PointerSize, ReferenceCount());
 
   // Plus return value spill area size
+  CHECK(SpillsReturnValue());
   total_size += SizeOfReturnValue();
 
   return RoundUp(total_size, kStackAlignment);
-  // TODO: Same thing as x64 except using different pointer size. Refactor?
 }
 
-size_t X86JniCallingConvention::OutArgSize() {
-  return RoundUp(NumberOfOutgoingStackArgs() * kFramePointerSize, kStackAlignment);
+size_t X86JniCallingConvention::OutArgSize() const {
+  // Count param args, including JNIEnv* and jclass*; count 8-byte args twice.
+  size_t all_args = NumberOfExtraArgumentsForJni() + NumArgs() + NumLongOrDoubleArgs();
+  // The size of outgoing arguments.
+  size_t size = all_args * kFramePointerSize;
+
+  // @CriticalNative can use a tail call as all managed callee-saves are preserved
+  // by the native ABI (IA-32 cdecl).
+  static_assert((kCoreCalleeSpillMask & ~kNativeCoreCalleeSpillMask) == 0u);
+  static_assert((kFpCalleeSpillMask & ~kNativeFpCalleeSpillMask) == 0u);
+
+  if (is_critical_native_) {
+    // Add return address size for @CriticalNative
+    // For normal native the return PC is part of the managed stack frame instead of out args.
+    size += kFramePointerSize;
+    // For @CriticalNative, we can make a tail call if there are no stack args
+    // and the return type is not an FP type (which would need moving from ST0 to XMM0) and
+    // we do not need to extend the result.
+    bool return_type_ok = GetShorty()[0] == 'I' || GetShorty()[0] == 'J' || GetShorty()[0] == 'V';
+    DCHECK_EQ(
+        return_type_ok,
+        GetShorty()[0] != 'F' && GetShorty()[0] != 'D' && !RequiresSmallResultTypeExtension());
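+    // For example, a no-arg @CriticalNative method returning jint, jlong or void
+    // reaches this point with size == kFramePointerSize (just the return PC),
+    // so the stub can tail-call straight into the native code.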
+    if (return_type_ok && size == kFramePointerSize) {
+      // Note: This is not aligned to kNativeStackAlignment but that's OK for tail call.
+      DCHECK_EQ(size, kFramePointerSize);
+      static_assert(kFramePointerSize < kNativeStackAlignment);
+      return kFramePointerSize;
+    }
+  }
+
+  return RoundUp(size, kNativeStackAlignment);
 }
 
 ArrayRef<const ManagedRegister> X86JniCallingConvention::CalleeSaveRegisters() const {
-  return ArrayRef<const ManagedRegister>(kCalleeSaveRegisters);
+  if (UNLIKELY(IsCriticalNative())) {
+    // Do not spill anything, whether tail call or not (return PC is already on the stack).
+    return ArrayRef<const ManagedRegister>();
+  } else {
+    return ArrayRef<const ManagedRegister>(kCalleeSaveRegisters);
+  }
 }
 
 bool X86JniCallingConvention::IsCurrentParamInRegister() {
@@ -265,15 +316,21 @@
   return FrameOffset(displacement_.Int32Value() - OutArgSize() + (itr_slots_ * kFramePointerSize));
 }
 
-size_t X86JniCallingConvention::NumberOfOutgoingStackArgs() {
-  size_t static_args = HasSelfClass() ? 1 : 0;  // count jclass
-  // regular argument parameters and this
-  size_t param_args = NumArgs() + NumLongOrDoubleArgs();
-  // count JNIEnv* and return pc (pushed after Method*)
-  size_t internal_args = 1 /* return pc */ + (HasJniEnv() ? 1 : 0 /* jni env */);
-  // No register args.
-  size_t total_args = static_args + param_args + internal_args;
-  return total_args;
+ManagedRegister X86JniCallingConvention::HiddenArgumentRegister() const {
+  CHECK(IsCriticalNative());
+  // EAX is neither managed callee-save, nor argument register, nor scratch register.
+  DCHECK(std::none_of(kCalleeSaveRegisters,
+                      kCalleeSaveRegisters + std::size(kCalleeSaveRegisters),
+                      [](ManagedRegister callee_save) constexpr {
+                        return callee_save.Equals(X86ManagedRegister::FromCpuRegister(EAX));
+                      }));
+  DCHECK(!InterproceduralScratchRegister().Equals(X86ManagedRegister::FromCpuRegister(EAX)));
+  return X86ManagedRegister::FromCpuRegister(EAX);
+}
+
+bool X86JniCallingConvention::UseTailCall() const {
+  CHECK(IsCriticalNative());
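+  // A tail call is possible only when the out args consist of nothing more than
+  // the return PC slot; see OutArgSize() above.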
+  return OutArgSize() == kFramePointerSize;
 }
 
 }  // namespace x86
diff --git a/compiler/jni/quick/x86/calling_convention_x86.h b/compiler/jni/quick/x86/calling_convention_x86.h
index d0c6198..4d65fc3 100644
--- a/compiler/jni/quick/x86/calling_convention_x86.h
+++ b/compiler/jni/quick/x86/calling_convention_x86.h
@@ -36,7 +36,7 @@
   ~X86ManagedRuntimeCallingConvention() override {}
   // Calling convention
   ManagedRegister ReturnRegister() override;
-  ManagedRegister InterproceduralScratchRegister() override;
+  ManagedRegister InterproceduralScratchRegister() const override;
   // Managed runtime calling convention
   ManagedRegister MethodRegister() override;
   bool IsCurrentParamInRegister() override;
@@ -63,10 +63,10 @@
   // Calling convention
   ManagedRegister ReturnRegister() override;
   ManagedRegister IntReturnRegister() override;
-  ManagedRegister InterproceduralScratchRegister() override;
+  ManagedRegister InterproceduralScratchRegister() const override;
   // JNI calling convention
-  size_t FrameSize() override;
-  size_t OutArgSize() override;
+  size_t FrameSize() const override;
+  size_t OutArgSize() const override;
   ArrayRef<const ManagedRegister> CalleeSaveRegisters() const override;
   ManagedRegister ReturnScratchRegister() const override;
   uint32_t CoreSpillMask() const override;
@@ -78,11 +78,14 @@
 
   // x86 needs to extend small return types.
   bool RequiresSmallResultTypeExtension() const override {
-    return true;
+    return HasSmallReturnType();
   }
 
- protected:
-  size_t NumberOfOutgoingStackArgs() override;
+  // Hidden argument register, used to pass the method pointer for @CriticalNative call.
+  ManagedRegister HiddenArgumentRegister() const override;
+
+  // Whether to use tail call (used only for @CriticalNative).
+  bool UseTailCall() const override;
 
  private:
   DISALLOW_COPY_AND_ASSIGN(X86JniCallingConvention);
diff --git a/compiler/jni/quick/x86_64/calling_convention_x86_64.cc b/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
index 9e77d6b..b15d904 100644
--- a/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
+++ b/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
@@ -28,7 +28,8 @@
 
 constexpr size_t kFramePointerSize = static_cast<size_t>(PointerSize::k64);
 static_assert(kX86_64PointerSize == PointerSize::k64, "Unexpected x86_64 pointer size");
-static_assert(kStackAlignment >= 16u, "System V AMD64 ABI requires at least 16 byte stack alignment");
+
+constexpr size_t kMmxSpillSize = 8u;
 
 // XMM0..XMM7 can be used to pass the first 8 floating args. The rest must go on the stack.
 // -- Managed and JNI calling conventions.
@@ -53,37 +54,59 @@
     X86_64ManagedRegister::FromXmmRegister(XMM15),
 };
 
-static constexpr uint32_t CalculateCoreCalleeSpillMask() {
+template <size_t size>
+static constexpr uint32_t CalculateCoreCalleeSpillMask(
+    const ManagedRegister (&callee_saves)[size]) {
   // The spilled PC gets a special marker.
-  uint32_t result = 1 << kNumberOfCpuRegisters;
-  for (auto&& r : kCalleeSaveRegisters) {
+  uint32_t result = 1u << kNumberOfCpuRegisters;
+  for (auto&& r : callee_saves) {
     if (r.AsX86_64().IsCpuRegister()) {
-      result |= (1 << r.AsX86_64().AsCpuRegister().AsRegister());
+      result |= (1u << r.AsX86_64().AsCpuRegister().AsRegister());
     }
   }
   return result;
 }
 
-static constexpr uint32_t CalculateFpCalleeSpillMask() {
-  uint32_t result = 0;
-  for (auto&& r : kCalleeSaveRegisters) {
+template <size_t size>
+static constexpr uint32_t CalculateFpCalleeSpillMask(const ManagedRegister (&callee_saves)[size]) {
+  uint32_t result = 0u;
+  for (auto&& r : callee_saves) {
     if (r.AsX86_64().IsXmmRegister()) {
-      result |= (1 << r.AsX86_64().AsXmmRegister().AsFloatRegister());
+      result |= (1u << r.AsX86_64().AsXmmRegister().AsFloatRegister());
     }
   }
   return result;
 }
 
-static constexpr uint32_t kCoreCalleeSpillMask = CalculateCoreCalleeSpillMask();
-static constexpr uint32_t kFpCalleeSpillMask = CalculateFpCalleeSpillMask();
+static constexpr uint32_t kCoreCalleeSpillMask = CalculateCoreCalleeSpillMask(kCalleeSaveRegisters);
+static constexpr uint32_t kFpCalleeSpillMask = CalculateFpCalleeSpillMask(kCalleeSaveRegisters);
+
+static constexpr size_t kNativeStackAlignment = 16;
+static_assert(kNativeStackAlignment == kStackAlignment);
+
+static constexpr ManagedRegister kNativeCalleeSaveRegisters[] = {
+    // Core registers.
+    X86_64ManagedRegister::FromCpuRegister(RBX),
+    X86_64ManagedRegister::FromCpuRegister(RBP),
+    X86_64ManagedRegister::FromCpuRegister(R12),
+    X86_64ManagedRegister::FromCpuRegister(R13),
+    X86_64ManagedRegister::FromCpuRegister(R14),
+    X86_64ManagedRegister::FromCpuRegister(R15),
+    // No callee-save float registers.
+};
+
+static constexpr uint32_t kNativeCoreCalleeSpillMask =
+    CalculateCoreCalleeSpillMask(kNativeCalleeSaveRegisters);
+static constexpr uint32_t kNativeFpCalleeSpillMask =
+    CalculateFpCalleeSpillMask(kNativeCalleeSaveRegisters);
 
 // Calling convention
 
-ManagedRegister X86_64ManagedRuntimeCallingConvention::InterproceduralScratchRegister() {
+ManagedRegister X86_64ManagedRuntimeCallingConvention::InterproceduralScratchRegister() const {
   return X86_64ManagedRegister::FromCpuRegister(RAX);
 }
 
-ManagedRegister X86_64JniCallingConvention::InterproceduralScratchRegister() {
+ManagedRegister X86_64JniCallingConvention::InterproceduralScratchRegister() const {
   return X86_64ManagedRegister::FromCpuRegister(RAX);
 }
 
@@ -149,6 +172,7 @@
 }
 
 FrameOffset X86_64ManagedRuntimeCallingConvention::CurrentParamStackOffset() {
+  CHECK(IsCurrentParamOnStack());
   return FrameOffset(displacement_.Int32Value() +  // displacement
                      static_cast<size_t>(kX86_64PointerSize) +  // Method ref
                      itr_slots_ * sizeof(uint32_t));  // offset into in args
@@ -187,46 +211,86 @@
 }
 
 uint32_t X86_64JniCallingConvention::CoreSpillMask() const {
-  return kCoreCalleeSpillMask;
+  return is_critical_native_ ? 0u : kCoreCalleeSpillMask;
 }
 
 uint32_t X86_64JniCallingConvention::FpSpillMask() const {
-  return kFpCalleeSpillMask;
+  return is_critical_native_ ? 0u : kFpCalleeSpillMask;
 }
 
-size_t X86_64JniCallingConvention::FrameSize() {
+size_t X86_64JniCallingConvention::FrameSize() const {
+  if (is_critical_native_) {
+    CHECK(!SpillsMethod());
+    CHECK(!HasLocalReferenceSegmentState());
+    CHECK(!HasHandleScope());
+    CHECK(!SpillsReturnValue());
+    return 0u;  // There is no managed frame for @CriticalNative.
+  }
+
   // Method*, PC return address and callee save area size, local reference segment state
+  CHECK(SpillsMethod());
   const size_t method_ptr_size = static_cast<size_t>(kX86_64PointerSize);
   const size_t pc_return_addr_size = kFramePointerSize;
   const size_t callee_save_area_size = CalleeSaveRegisters().size() * kFramePointerSize;
-  size_t frame_data_size = method_ptr_size + pc_return_addr_size + callee_save_area_size;
+  size_t total_size = method_ptr_size + pc_return_addr_size + callee_save_area_size;
 
-  if (LIKELY(HasLocalReferenceSegmentState())) {                     // local ref. segment state
-    // Local reference segment state is sometimes excluded.
-    frame_data_size += kFramePointerSize;
-  }
+  CHECK(HasLocalReferenceSegmentState());
+  total_size += kFramePointerSize;
 
-  // References plus link_ (pointer) and number_of_references_ (uint32_t) for HandleScope header
-  const size_t handle_scope_size = HandleScope::SizeOf(kX86_64PointerSize, ReferenceCount());
-
-  size_t total_size = frame_data_size;
-  if (LIKELY(HasHandleScope())) {
-    // HandleScope is sometimes excluded.
-    total_size += handle_scope_size;                                 // handle scope size
-  }
+  CHECK(HasHandleScope());
+  total_size += HandleScope::SizeOf(kX86_64PointerSize, ReferenceCount());
 
   // Plus return value spill area size
+  CHECK(SpillsReturnValue());
   total_size += SizeOfReturnValue();
 
   return RoundUp(total_size, kStackAlignment);
 }
 
-size_t X86_64JniCallingConvention::OutArgSize() {
-  return RoundUp(NumberOfOutgoingStackArgs() * kFramePointerSize, kStackAlignment);
+size_t X86_64JniCallingConvention::OutArgSize() const {
+  // Count param args, including JNIEnv* and jclass*.
+  size_t all_args = NumberOfExtraArgumentsForJni() + NumArgs();
+  size_t num_fp_args = NumFloatOrDoubleArgs();
+  DCHECK_GE(all_args, num_fp_args);
+  size_t num_non_fp_args = all_args - num_fp_args;
+  // Account for FP arguments passed through Xmm0..Xmm7.
+  size_t num_stack_fp_args =
+      num_fp_args - std::min(kMaxFloatOrDoubleRegisterArguments, num_fp_args);
+  // Account for other (integer) arguments passed through GPR (RDI, RSI, RDX, RCX, R8, R9).
+  size_t num_stack_non_fp_args =
+      num_non_fp_args - std::min(kMaxIntLikeRegisterArguments, num_non_fp_args);
+  // The size of outgoing arguments.
+  static_assert(kFramePointerSize == kMmxSpillSize);
+  size_t size = (num_stack_fp_args + num_stack_non_fp_args) * kFramePointerSize;
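+  // For example, a static native method with 10 double and 2 int args (plus JNIEnv*
+  // and jclass for non-critical JNI) passes 8 doubles in XMM0-XMM7 and the 4 int-like
+  // args in GPRs, leaving 2 doubles on the stack: size = 2 * kFramePointerSize.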
+
+  if (UNLIKELY(IsCriticalNative())) {
+    // We always need to spill xmm12-xmm15 as they are managed callee-saves
+    // but not native callee-saves.
+    static_assert((kCoreCalleeSpillMask & ~kNativeCoreCalleeSpillMask) == 0u);
+    static_assert((kFpCalleeSpillMask & ~kNativeFpCalleeSpillMask) != 0u);
+    size += POPCOUNT(kFpCalleeSpillMask & ~kNativeFpCalleeSpillMask) * kMmxSpillSize;
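+    // With XMM12-XMM15 not preserved natively, this adds 4 * kMmxSpillSize = 32 bytes.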
+    // Add return address size for @CriticalNative
+    // For normal native the return PC is part of the managed stack frame instead of out args.
+    size += kFramePointerSize;
+  }
+
+  return RoundUp(size, kNativeStackAlignment);
 }
 
 ArrayRef<const ManagedRegister> X86_64JniCallingConvention::CalleeSaveRegisters() const {
-  return ArrayRef<const ManagedRegister>(kCalleeSaveRegisters);
+  if (UNLIKELY(IsCriticalNative())) {
+    DCHECK(!UseTailCall());
+    static_assert(std::size(kCalleeSaveRegisters) > std::size(kNativeCalleeSaveRegisters));
+    // TODO: Change to static_assert; std::equal should be constexpr since C++20.
+    DCHECK(std::equal(kCalleeSaveRegisters,
+                      kCalleeSaveRegisters + std::size(kNativeCalleeSaveRegisters),
+                      kNativeCalleeSaveRegisters,
+                      [](ManagedRegister lhs, ManagedRegister rhs) { return lhs.Equals(rhs); }));
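+    // The tail of kCalleeSaveRegisters returned by SubArray() below is exactly
+    // XMM12-XMM15, i.e. the managed callee-saves that are not native callee-saves.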
+    return ArrayRef<const ManagedRegister>(kCalleeSaveRegisters).SubArray(
+        /*pos=*/ std::size(kNativeCalleeSaveRegisters));
+  } else {
+    return ArrayRef<const ManagedRegister>(kCalleeSaveRegisters);
+  }
 }
 
 bool X86_64JniCallingConvention::IsCurrentParamInRegister() {
@@ -271,24 +335,24 @@
   return FrameOffset(offset);
 }
 
-// TODO: Calling this "NumberArgs" is misleading.
-// It's really more like NumberSlots (like itr_slots_)
-// because doubles/longs get counted twice.
-size_t X86_64JniCallingConvention::NumberOfOutgoingStackArgs() {
-  size_t static_args = HasSelfClass() ? 1 : 0;  // count jclass
-  // regular argument parameters and this
-  size_t param_args = NumArgs() + NumLongOrDoubleArgs();
-  // count JNIEnv* and return pc (pushed after Method*)
-  size_t internal_args = 1 /* return pc */ + (HasJniEnv() ? 1 : 0 /* jni env */);
-  size_t total_args = static_args + param_args + internal_args;
+ManagedRegister X86_64JniCallingConvention::HiddenArgumentRegister() const {
+  CHECK(IsCriticalNative());
+  // R11 is neither managed callee-save, nor argument register, nor scratch register.
+  DCHECK(std::none_of(kCalleeSaveRegisters,
+                      kCalleeSaveRegisters + std::size(kCalleeSaveRegisters),
+                      [](ManagedRegister callee_save) constexpr {
+                        return callee_save.Equals(X86_64ManagedRegister::FromCpuRegister(R11));
+                      }));
+  DCHECK(!InterproceduralScratchRegister().Equals(X86_64ManagedRegister::FromCpuRegister(R11)));
+  return X86_64ManagedRegister::FromCpuRegister(R11);
+}
 
-  // Float arguments passed through Xmm0..Xmm7
-  // Other (integer) arguments passed through GPR (RDI, RSI, RDX, RCX, R8, R9)
-  size_t total_stack_args = total_args
-                            - std::min(kMaxFloatOrDoubleRegisterArguments, static_cast<size_t>(NumFloatOrDoubleArgs()))
-                            - std::min(kMaxIntLikeRegisterArguments, static_cast<size_t>(NumArgs() - NumFloatOrDoubleArgs()));
-
-  return total_stack_args;
+// Whether to use tail call (used only for @CriticalNative).
+bool X86_64JniCallingConvention::UseTailCall() const {
+  CHECK(IsCriticalNative());
+  // We always need to spill xmm12-xmm15 as they are managed callee-saves
+  // but not native callee-saves, so we can never use a tail call.
+  return false;
 }
 
 }  // namespace x86_64
diff --git a/compiler/jni/quick/x86_64/calling_convention_x86_64.h b/compiler/jni/quick/x86_64/calling_convention_x86_64.h
index dfab41b..37b5978 100644
--- a/compiler/jni/quick/x86_64/calling_convention_x86_64.h
+++ b/compiler/jni/quick/x86_64/calling_convention_x86_64.h
@@ -33,7 +33,7 @@
   ~X86_64ManagedRuntimeCallingConvention() override {}
   // Calling convention
   ManagedRegister ReturnRegister() override;
-  ManagedRegister InterproceduralScratchRegister() override;
+  ManagedRegister InterproceduralScratchRegister() const override;
   // Managed runtime calling convention
   ManagedRegister MethodRegister() override;
   bool IsCurrentParamInRegister() override;
@@ -56,10 +56,10 @@
   // Calling convention
   ManagedRegister ReturnRegister() override;
   ManagedRegister IntReturnRegister() override;
-  ManagedRegister InterproceduralScratchRegister() override;
+  ManagedRegister InterproceduralScratchRegister() const override;
   // JNI calling convention
-  size_t FrameSize() override;
-  size_t OutArgSize() override;
+  size_t FrameSize() const override;
+  size_t OutArgSize() const override;
   ArrayRef<const ManagedRegister> CalleeSaveRegisters() const override;
   ManagedRegister ReturnScratchRegister() const override;
   uint32_t CoreSpillMask() const override;
@@ -71,11 +71,14 @@
 
   // x86-64 needs to extend small return types.
   bool RequiresSmallResultTypeExtension() const override {
-    return true;
+    return HasSmallReturnType();
   }
 
- protected:
-  size_t NumberOfOutgoingStackArgs() override;
+  // Hidden argument register, used to pass the method pointer for @CriticalNative call.
+  ManagedRegister HiddenArgumentRegister() const override;
+
+  // Whether to use tail call (used only for @CriticalNative).
+  bool UseTailCall() const override;
 
  private:
   DISALLOW_COPY_AND_ASSIGN(X86_64JniCallingConvention);
diff --git a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
index 47a067b..ffb58ac 100644
--- a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
+++ b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
@@ -37,6 +37,10 @@
 #define ___   asm_.GetVIXLAssembler()->
 #endif
 
+// The AAPCS requires 8-byte alignment. This is not as strict as the Managed ABI stack alignment.
+static constexpr size_t kAapcsStackAlignment = 8u;
+static_assert(kAapcsStackAlignment < kStackAlignment);
+
 vixl::aarch32::Register AsVIXLRegister(ArmManagedRegister reg) {
   CHECK(reg.IsCoreRegister());
   return vixl::aarch32::Register(reg.RegId());
@@ -74,11 +78,16 @@
                                           ManagedRegister method_reg,
                                           ArrayRef<const ManagedRegister> callee_save_regs,
                                           const ManagedRegisterEntrySpills& entry_spills) {
-  CHECK_ALIGNED(frame_size, kStackAlignment);
-  CHECK(r0.Is(AsVIXLRegister(method_reg.AsArm())));
+  // If we're creating an actual frame with the method, enforce managed stack alignment,
+  // otherwise only the native stack alignment.
+  if (method_reg.IsNoRegister()) {
+    CHECK_ALIGNED_PARAM(frame_size, kAapcsStackAlignment);
+  } else {
+    CHECK_ALIGNED_PARAM(frame_size, kStackAlignment);
+  }
 
   // Push callee saves and link register.
-  RegList core_spill_mask = 1 << LR;
+  RegList core_spill_mask = 0;
   uint32_t fp_spill_mask = 0;
   for (const ManagedRegister& reg : callee_save_regs) {
     if (reg.AsArm().IsCoreRegister()) {
@@ -87,9 +96,11 @@
       fp_spill_mask |= 1 << reg.AsArm().AsSRegister();
     }
   }
-  ___ Push(RegisterList(core_spill_mask));
-  cfi().AdjustCFAOffset(POPCOUNT(core_spill_mask) * kFramePointerSize);
-  cfi().RelOffsetForMany(DWARFReg(r0), 0, core_spill_mask, kFramePointerSize);
+  if (core_spill_mask != 0u) {
+    ___ Push(RegisterList(core_spill_mask));
+    cfi().AdjustCFAOffset(POPCOUNT(core_spill_mask) * kFramePointerSize);
+    cfi().RelOffsetForMany(DWARFReg(r0), 0, core_spill_mask, kFramePointerSize);
+  }
   if (fp_spill_mask != 0) {
     uint32_t first = CTZ(fp_spill_mask);
 
@@ -103,12 +114,15 @@
 
   // Increase frame to required size.
   int pushed_values = POPCOUNT(core_spill_mask) + POPCOUNT(fp_spill_mask);
-  // Must at least have space for Method*.
-  CHECK_GT(frame_size, pushed_values * kFramePointerSize);
+  // Must at least have space for Method* if we're going to spill it.
+  CHECK_GE(frame_size, (pushed_values + (method_reg.IsRegister() ? 1u : 0u)) * kFramePointerSize);
   IncreaseFrameSize(frame_size - pushed_values * kFramePointerSize);  // handles CFI as well.
 
-  // Write out Method*.
-  asm_.StoreToOffset(kStoreWord, r0, sp, 0);
+  if (method_reg.IsRegister()) {
+    // Write out Method*.
+    CHECK(r0.Is(AsVIXLRegister(method_reg.AsArm())));
+    asm_.StoreToOffset(kStoreWord, r0, sp, 0);
+  }
 
   // Write out entry spills.
   int32_t offset = frame_size + kFramePointerSize;
@@ -133,27 +147,27 @@
 void ArmVIXLJNIMacroAssembler::RemoveFrame(size_t frame_size,
                                            ArrayRef<const ManagedRegister> callee_save_regs,
                                            bool may_suspend) {
-  CHECK_ALIGNED(frame_size, kStackAlignment);
+  CHECK_ALIGNED(frame_size, kAapcsStackAlignment);
   cfi().RememberState();
 
-  // Compute callee saves to pop and LR.
-  RegList core_spill_mask = 1 << LR;
-  uint32_t fp_spill_mask = 0;
+  // Compute callee saves to pop.
+  RegList core_spill_mask = 0u;
+  uint32_t fp_spill_mask = 0u;
   for (const ManagedRegister& reg : callee_save_regs) {
     if (reg.AsArm().IsCoreRegister()) {
-      core_spill_mask |= 1 << reg.AsArm().AsCoreRegister();
+      core_spill_mask |= 1u << reg.AsArm().AsCoreRegister();
     } else {
-      fp_spill_mask |= 1 << reg.AsArm().AsSRegister();
+      fp_spill_mask |= 1u << reg.AsArm().AsSRegister();
     }
   }
 
   // Decrease frame to start of callee saves.
-  int pop_values = POPCOUNT(core_spill_mask) + POPCOUNT(fp_spill_mask);
-  CHECK_GT(frame_size, pop_values * kFramePointerSize);
+  size_t pop_values = POPCOUNT(core_spill_mask) + POPCOUNT(fp_spill_mask);
+  CHECK_GE(frame_size, pop_values * kFramePointerSize);
   DecreaseFrameSize(frame_size - (pop_values * kFramePointerSize));  // handles CFI as well.
 
   // Pop FP callee saves.
-  if (fp_spill_mask != 0) {
+  if (fp_spill_mask != 0u) {
     uint32_t first = CTZ(fp_spill_mask);
     // Check that list is contiguous.
      DCHECK_EQ(fp_spill_mask >> CTZ(fp_spill_mask), ~0u >> (32 - POPCOUNT(fp_spill_mask)));
@@ -164,7 +178,9 @@
   }
 
   // Pop core callee saves and LR.
-  ___ Pop(RegisterList(core_spill_mask));
+  if (core_spill_mask != 0u) {
+    ___ Pop(RegisterList(core_spill_mask));
+  }
 
   if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
     if (may_suspend) {
@@ -173,11 +189,8 @@
     } else {
       // The method shall not be suspended; no need to refresh the Marking Register.
 
-      // Check that the Marking Register is a callee-save register,
-      // and thus has been preserved by native code following the
-      // AAPCS calling convention.
-      DCHECK_NE(core_spill_mask & (1 << MR), 0)
-          << "core_spill_mask should contain Marking Register R" << MR;
+      // The Marking Register is a callee-save register, and thus has been
+      // preserved by native code following the AAPCS calling convention.
 
       // The following condition is a compile-time one, so it does not have a run-time cost.
       if (kIsDebugBuild) {
@@ -206,13 +219,17 @@
 
 
 void ArmVIXLJNIMacroAssembler::IncreaseFrameSize(size_t adjust) {
-  asm_.AddConstant(sp, -adjust);
-  cfi().AdjustCFAOffset(adjust);
+  if (adjust != 0u) {
+    asm_.AddConstant(sp, -adjust);
+    cfi().AdjustCFAOffset(adjust);
+  }
 }
 
 void ArmVIXLJNIMacroAssembler::DecreaseFrameSize(size_t adjust) {
-  asm_.AddConstant(sp, adjust);
-  cfi().AdjustCFAOffset(-adjust);
+  if (adjust != 0u) {
+    asm_.AddConstant(sp, adjust);
+    cfi().AdjustCFAOffset(-adjust);
+  }
 }
 
 void ArmVIXLJNIMacroAssembler::Store(FrameOffset dest, ManagedRegister m_src, size_t size) {
@@ -562,6 +579,17 @@
   // TODO: not validating references.
 }
 
+void ArmVIXLJNIMacroAssembler::Jump(ManagedRegister mbase,
+                                    Offset offset,
+                                    ManagedRegister mscratch) {
+  vixl::aarch32::Register base = AsVIXLRegister(mbase.AsArm());
+  vixl::aarch32::Register scratch = AsVIXLRegister(mscratch.AsArm());
+  UseScratchRegisterScope temps(asm_.GetVIXLAssembler());
+  temps.Exclude(scratch);
+  asm_.LoadFromOffset(kLoadWord, scratch, base, offset.Int32Value());
+  ___ Bx(scratch);
+}
+
 void ArmVIXLJNIMacroAssembler::Call(ManagedRegister mbase,
                                     Offset offset,
                                     ManagedRegister mscratch) {
@@ -602,7 +630,7 @@
 }
 
 void ArmVIXLJNIMacroAssembler::ExceptionPoll(ManagedRegister mscratch, size_t stack_adjust) {
-  CHECK_ALIGNED(stack_adjust, kStackAlignment);
+  CHECK_ALIGNED(stack_adjust, kAapcsStackAlignment);
   vixl::aarch32::Register scratch = AsVIXLRegister(mscratch.AsArm());
   UseScratchRegisterScope temps(asm_.GetVIXLAssembler());
   temps.Exclude(scratch);
diff --git a/compiler/utils/arm/jni_macro_assembler_arm_vixl.h b/compiler/utils/arm/jni_macro_assembler_arm_vixl.h
index 0b1b6d2..1724671 100644
--- a/compiler/utils/arm/jni_macro_assembler_arm_vixl.h
+++ b/compiler/utils/arm/jni_macro_assembler_arm_vixl.h
@@ -181,6 +181,9 @@
   void VerifyObject(ManagedRegister src, bool could_be_null) override;
   void VerifyObject(FrameOffset src, bool could_be_null) override;
 
+  // Jump to address held at [base+offset] (used for tail calls).
+  void Jump(ManagedRegister base, Offset offset, ManagedRegister scratch) override;
+
   // Call to address held at [base+offset].
   void Call(ManagedRegister base, Offset offset, ManagedRegister scratch) override;
   void Call(FrameOffset base, Offset offset, ManagedRegister scratch) override;
diff --git a/compiler/utils/arm64/jni_macro_assembler_arm64.cc b/compiler/utils/arm64/jni_macro_assembler_arm64.cc
index 0eab49f..5b46971 100644
--- a/compiler/utils/arm64/jni_macro_assembler_arm64.cc
+++ b/compiler/utils/arm64/jni_macro_assembler_arm64.cc
@@ -37,6 +37,10 @@
 #define reg_d(D) Arm64Assembler::reg_d(D)
 #define reg_s(S) Arm64Assembler::reg_s(S)
 
+// The AAPCS64 requires 16-byte alignment. This is the same as the Managed ABI stack alignment.
+static constexpr size_t kAapcs64StackAlignment = 16u;
+static_assert(kAapcs64StackAlignment == kStackAlignment);
+
 Arm64JNIMacroAssembler::~Arm64JNIMacroAssembler() {
 }
 
@@ -57,16 +61,20 @@
 
 // See Arm64 PCS Section 5.2.2.1.
 void Arm64JNIMacroAssembler::IncreaseFrameSize(size_t adjust) {
-  CHECK_ALIGNED(adjust, kStackAlignment);
-  AddConstant(SP, -adjust);
-  cfi().AdjustCFAOffset(adjust);
+  if (adjust != 0u) {
+    CHECK_ALIGNED(adjust, kStackAlignment);
+    AddConstant(SP, -adjust);
+    cfi().AdjustCFAOffset(adjust);
+  }
 }
 
 // See Arm64 PCS Section 5.2.2.1.
 void Arm64JNIMacroAssembler::DecreaseFrameSize(size_t adjust) {
-  CHECK_ALIGNED(adjust, kStackAlignment);
-  AddConstant(SP, adjust);
-  cfi().AdjustCFAOffset(-adjust);
+  if (adjust != 0u) {
+    CHECK_ALIGNED(adjust, kStackAlignment);
+    AddConstant(SP, adjust);
+    cfi().AdjustCFAOffset(-adjust);
+  }
 }
 
 void Arm64JNIMacroAssembler::AddConstant(XRegister rd, int32_t value, Condition cond) {
@@ -531,6 +539,15 @@
   // TODO: not validating references.
 }
 
+void Arm64JNIMacroAssembler::Jump(ManagedRegister m_base, Offset offs, ManagedRegister m_scratch) {
+  Arm64ManagedRegister base = m_base.AsArm64();
+  Arm64ManagedRegister scratch = m_scratch.AsArm64();
+  CHECK(base.IsXRegister()) << base;
+  CHECK(scratch.IsXRegister()) << scratch;
+  LoadFromOffset(scratch.AsXRegister(), base.AsXRegister(), offs.Int32Value());
+  ___ Br(reg_x(scratch.AsXRegister()));
+}
+
 void Arm64JNIMacroAssembler::Call(ManagedRegister m_base, Offset offs, ManagedRegister m_scratch) {
   Arm64ManagedRegister base = m_base.AsArm64();
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
@@ -704,18 +721,20 @@
 
   // Increase frame to required size.
   DCHECK_ALIGNED(frame_size, kStackAlignment);
-  DCHECK_GE(frame_size, core_reg_size + fp_reg_size + static_cast<size_t>(kArm64PointerSize));
+  // Must at least have space for Method* if we're going to spill it.
+  DCHECK_GE(frame_size,
+            core_reg_size + fp_reg_size + (method_reg.IsRegister() ? kXRegSizeInBytes : 0u));
   IncreaseFrameSize(frame_size);
 
   // Save callee-saves.
   asm_.SpillRegisters(core_reg_list, frame_size - core_reg_size);
   asm_.SpillRegisters(fp_reg_list, frame_size - core_reg_size - fp_reg_size);
 
-  DCHECK(core_reg_list.IncludesAliasOf(reg_x(TR)));
-
-  // Write ArtMethod*
-  DCHECK(X0 == method_reg.AsArm64().AsXRegister());
-  StoreToOffset(X0, SP, 0);
+  if (method_reg.IsRegister()) {
+    // Write ArtMethod*
+    DCHECK(X0 == method_reg.AsArm64().AsXRegister());
+    StoreToOffset(X0, SP, 0);
+  }
 
   // Write out entry spills
   int32_t offset = frame_size + static_cast<size_t>(kArm64PointerSize);
@@ -760,10 +779,8 @@
 
   // For now we only check that the size of the frame is large enough to hold spills and method
   // reference.
-  DCHECK_GE(frame_size, core_reg_size + fp_reg_size + static_cast<size_t>(kArm64PointerSize));
-  DCHECK_ALIGNED(frame_size, kStackAlignment);
-
-  DCHECK(core_reg_list.IncludesAliasOf(reg_x(TR)));
+  DCHECK_GE(frame_size, core_reg_size + fp_reg_size);
+  DCHECK_ALIGNED(frame_size, kAapcs64StackAlignment);
 
   cfi().RememberState();
 
@@ -781,11 +798,8 @@
     } else {
       // The method shall not be suspended; no need to refresh the Marking Register.
 
-      // Check that the Marking Register is a callee-save register,
-      // and thus has been preserved by native code following the
-      // AAPCS64 calling convention.
-      DCHECK(core_reg_list.IncludesAliasOf(mr))
-          << "core_reg_list should contain Marking Register X" << mr.GetCode();
+      // The Marking Register is a callee-save register and thus has been
+      // preserved by native code following the AAPCS64 calling convention.
 
       // The following condition is a compile-time one, so it does not have a run-time cost.
       if (kIsDebugBuild) {
diff --git a/compiler/utils/arm64/jni_macro_assembler_arm64.h b/compiler/utils/arm64/jni_macro_assembler_arm64.h
index 45316ed..54592a3 100644
--- a/compiler/utils/arm64/jni_macro_assembler_arm64.h
+++ b/compiler/utils/arm64/jni_macro_assembler_arm64.h
@@ -162,6 +162,9 @@
   void VerifyObject(ManagedRegister src, bool could_be_null) override;
   void VerifyObject(FrameOffset src, bool could_be_null) override;
 
+  // Jump to address held at [base+offset] (used for tail calls).
+  void Jump(ManagedRegister base, Offset offset, ManagedRegister scratch) override;
+
   // Call to address held at [base+offset].
   void Call(ManagedRegister base, Offset offset, ManagedRegister scratch) override;
   void Call(FrameOffset base, Offset offset, ManagedRegister scratch) override;
diff --git a/compiler/utils/jni_macro_assembler.h b/compiler/utils/jni_macro_assembler.h
index e6130cf..bbe0f73 100644
--- a/compiler/utils/jni_macro_assembler.h
+++ b/compiler/utils/jni_macro_assembler.h
@@ -197,6 +197,9 @@
   virtual void VerifyObject(ManagedRegister src, bool could_be_null) = 0;
   virtual void VerifyObject(FrameOffset src, bool could_be_null) = 0;
 
+  // Jump to address held at [base+offset] (used for tail calls).
+  virtual void Jump(ManagedRegister base, Offset offset, ManagedRegister scratch) = 0;
+
   // Call to address held at [base+offset]
   virtual void Call(ManagedRegister base, Offset offset, ManagedRegister scratch) = 0;
   virtual void Call(FrameOffset base, Offset offset, ManagedRegister scratch) = 0;
diff --git a/compiler/utils/managed_register.h b/compiler/utils/managed_register.h
index db9c36c..fb41153 100644
--- a/compiler/utils/managed_register.h
+++ b/compiler/utils/managed_register.h
@@ -66,6 +66,10 @@
     return id_ == other.id_;
   }
 
+  constexpr bool IsRegister() const {
+    return id_ != kNoRegister;
+  }
+
   constexpr bool IsNoRegister() const {
     return id_ == kNoRegister;
   }
diff --git a/compiler/utils/mips/assembler_mips.cc b/compiler/utils/mips/assembler_mips.cc
index a9d1a25..6b73695 100644
--- a/compiler/utils/mips/assembler_mips.cc
+++ b/compiler/utils/mips/assembler_mips.cc
@@ -5191,6 +5191,17 @@
   // TODO: not validating references.
 }
 
+void MipsAssembler::Jump(ManagedRegister mbase, Offset offset, ManagedRegister mscratch) {
+  MipsManagedRegister base = mbase.AsMips();
+  MipsManagedRegister scratch = mscratch.AsMips();
+  CHECK(base.IsCoreRegister()) << base;
+  CHECK(scratch.IsCoreRegister()) << scratch;
+  LoadFromOffset(kLoadWord, scratch.AsCoreRegister(),
+                 base.AsCoreRegister(), offset.Int32Value());
+  Jr(scratch.AsCoreRegister());
+  NopIfNoReordering();
+}
+
 void MipsAssembler::Call(ManagedRegister mbase, Offset offset, ManagedRegister mscratch) {
   MipsManagedRegister base = mbase.AsMips();
   MipsManagedRegister scratch = mscratch.AsMips();
diff --git a/compiler/utils/mips/assembler_mips.h b/compiler/utils/mips/assembler_mips.h
index a24071d..3a4e0ce 100644
--- a/compiler/utils/mips/assembler_mips.h
+++ b/compiler/utils/mips/assembler_mips.h
@@ -1359,6 +1359,9 @@
   void VerifyObject(ManagedRegister src, bool could_be_null) override;
   void VerifyObject(FrameOffset src, bool could_be_null) override;
 
+  // Jump to address held at [base+offset] (used for tail calls).
+  void Jump(ManagedRegister base, Offset offset, ManagedRegister scratch) override;
+
   // Call to address held at [base+offset].
   void Call(ManagedRegister base, Offset offset, ManagedRegister mscratch) override;
   void Call(FrameOffset base, Offset offset, ManagedRegister mscratch) override;
diff --git a/compiler/utils/mips64/assembler_mips64.cc b/compiler/utils/mips64/assembler_mips64.cc
index 70313ca..07d3716 100644
--- a/compiler/utils/mips64/assembler_mips64.cc
+++ b/compiler/utils/mips64/assembler_mips64.cc
@@ -4027,6 +4027,17 @@
   // TODO: not validating references
 }
 
+void Mips64Assembler::Jump(ManagedRegister mbase, Offset offset, ManagedRegister mscratch) {
+  Mips64ManagedRegister base = mbase.AsMips64();
+  Mips64ManagedRegister scratch = mscratch.AsMips64();
+  CHECK(base.IsGpuRegister()) << base;
+  CHECK(scratch.IsGpuRegister()) << scratch;
+  LoadFromOffset(kLoadDoubleword, scratch.AsGpuRegister(),
+                 base.AsGpuRegister(), offset.Int32Value());
+  Jr(scratch.AsGpuRegister());
+  Nop();
+}
+
 void Mips64Assembler::Call(ManagedRegister mbase, Offset offset, ManagedRegister mscratch) {
   Mips64ManagedRegister base = mbase.AsMips64();
   Mips64ManagedRegister scratch = mscratch.AsMips64();
diff --git a/compiler/utils/mips64/assembler_mips64.h b/compiler/utils/mips64/assembler_mips64.h
index b331cee..03eae91 100644
--- a/compiler/utils/mips64/assembler_mips64.h
+++ b/compiler/utils/mips64/assembler_mips64.h
@@ -1424,6 +1424,9 @@
   void VerifyObject(ManagedRegister src, bool could_be_null) override;
   void VerifyObject(FrameOffset src, bool could_be_null) override;
 
+  // Jump to address held at [base+offset] (used for tail calls).
+  void Jump(ManagedRegister base, Offset offset, ManagedRegister scratch) override;
+
   // Call to address held at [base+offset].
   void Call(ManagedRegister base, Offset offset, ManagedRegister mscratch) override;
   void Call(FrameOffset base, Offset offset, ManagedRegister mscratch) override;
diff --git a/compiler/utils/x86/jni_macro_assembler_x86.cc b/compiler/utils/x86/jni_macro_assembler_x86.cc
index 540d72b..f4ea004 100644
--- a/compiler/utils/x86/jni_macro_assembler_x86.cc
+++ b/compiler/utils/x86/jni_macro_assembler_x86.cc
@@ -39,6 +39,9 @@
 
 constexpr size_t kFramePointerSize = 4;
 
+static constexpr size_t kNativeStackAlignment = 16;
+static_assert(kNativeStackAlignment == kStackAlignment);
+
 #define __ asm_.
 
 void X86JNIMacroAssembler::BuildFrame(size_t frame_size,
@@ -47,7 +50,15 @@
                                       const ManagedRegisterEntrySpills& entry_spills) {
   DCHECK_EQ(CodeSize(), 0U);  // Nothing emitted yet.
   cfi().SetCurrentCFAOffset(4);  // Return address on stack.
-  CHECK_ALIGNED(frame_size, kStackAlignment);
+  if (frame_size == kFramePointerSize) {
+    // For @CriticalNative tail call.
+    CHECK(method_reg.IsNoRegister());
+    CHECK(spill_regs.empty());
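+    // The "frame" consists solely of the return PC already pushed by the call;
+    // the stack adjustment below computes to zero and there is no Method* to store.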
+  } else if (method_reg.IsNoRegister()) {
+    CHECK_ALIGNED(frame_size, kNativeStackAlignment);
+  } else {
+    CHECK_ALIGNED(frame_size, kStackAlignment);
+  }
   int gpr_count = 0;
   for (int i = spill_regs.size() - 1; i >= 0; --i) {
     Register spill = spill_regs[i].AsX86().AsCpuRegister();
@@ -59,12 +70,16 @@
 
   // return address then method on stack.
   int32_t adjust = frame_size - gpr_count * kFramePointerSize -
-      kFramePointerSize /*method*/ -
-      kFramePointerSize /*return address*/;
-  __ addl(ESP, Immediate(-adjust));
-  cfi().AdjustCFAOffset(adjust);
-  __ pushl(method_reg.AsX86().AsCpuRegister());
-  cfi().AdjustCFAOffset(kFramePointerSize);
+      kFramePointerSize /*return address*/ -
+      (method_reg.IsRegister() ? kFramePointerSize /*method*/ : 0u);
+  if (adjust != 0) {
+    __ addl(ESP, Immediate(-adjust));
+    cfi().AdjustCFAOffset(adjust);
+  }
+  if (method_reg.IsRegister()) {
+    __ pushl(method_reg.AsX86().AsCpuRegister());
+    cfi().AdjustCFAOffset(kFramePointerSize);
+  }
   DCHECK_EQ(static_cast<size_t>(cfi().GetCurrentCFAOffset()), frame_size);
 
   for (const ManagedRegisterSpill& spill : entry_spills) {
@@ -86,12 +101,14 @@
 void X86JNIMacroAssembler::RemoveFrame(size_t frame_size,
                                        ArrayRef<const ManagedRegister> spill_regs,
                                        bool may_suspend ATTRIBUTE_UNUSED) {
-  CHECK_ALIGNED(frame_size, kStackAlignment);
+  CHECK_ALIGNED(frame_size, kNativeStackAlignment);
   cfi().RememberState();
   // -kFramePointerSize for ArtMethod*.
   int adjust = frame_size - spill_regs.size() * kFramePointerSize - kFramePointerSize;
-  __ addl(ESP, Immediate(adjust));
-  cfi().AdjustCFAOffset(-adjust);
+  if (adjust != 0) {
+    __ addl(ESP, Immediate(adjust));
+    cfi().AdjustCFAOffset(-adjust);
+  }
   for (size_t i = 0; i < spill_regs.size(); ++i) {
     Register spill = spill_regs[i].AsX86().AsCpuRegister();
     __ popl(spill);
@@ -105,15 +122,19 @@
 }
 
 void X86JNIMacroAssembler::IncreaseFrameSize(size_t adjust) {
-  CHECK_ALIGNED(adjust, kStackAlignment);
-  __ addl(ESP, Immediate(-adjust));
-  cfi().AdjustCFAOffset(adjust);
+  if (adjust != 0u) {
+    CHECK_ALIGNED(adjust, kNativeStackAlignment);
+    __ addl(ESP, Immediate(-adjust));
+    cfi().AdjustCFAOffset(adjust);
+  }
 }
 
 static void DecreaseFrameSizeImpl(X86Assembler* assembler, size_t adjust) {
-  CHECK_ALIGNED(adjust, kStackAlignment);
-  assembler->addl(ESP, Immediate(adjust));
-  assembler->cfi().AdjustCFAOffset(-adjust);
+  if (adjust != 0u) {
+    CHECK_ALIGNED(adjust, kNativeStackAlignment);
+    assembler->addl(ESP, Immediate(adjust));
+    assembler->cfi().AdjustCFAOffset(-adjust);
+  }
 }
 
 void X86JNIMacroAssembler::DecreaseFrameSize(size_t adjust) {
@@ -301,7 +322,7 @@
       __ movl(dest.AsCpuRegister(), src.AsCpuRegister());
     } else if (src.IsX87Register() && dest.IsXmmRegister()) {
       // Pass via stack and pop X87 register
-      __ subl(ESP, Immediate(16));
+      IncreaseFrameSize(16);
       if (size == 4) {
         CHECK_EQ(src.AsX87Register(), ST0);
         __ fstps(Address(ESP, 0));
@@ -311,7 +332,7 @@
         __ fstpl(Address(ESP, 0));
         __ movsd(dest.AsXmmRegister(), Address(ESP, 0));
       }
-      __ addl(ESP, Immediate(16));
+      DecreaseFrameSize(16);
     } else {
       // TODO: x87, SSE
       UNIMPLEMENTED(FATAL) << ": Move " << dest << ", " << src;
@@ -487,6 +508,12 @@
   // TODO: not validating references
 }
 
+void X86JNIMacroAssembler::Jump(ManagedRegister mbase, Offset offset, ManagedRegister) {
+  X86ManagedRegister base = mbase.AsX86();
+  CHECK(base.IsCpuRegister());
+  __ jmp(Address(base.AsCpuRegister(), offset.Int32Value()));
+}
+
 void X86JNIMacroAssembler::Call(ManagedRegister mbase, Offset offset, ManagedRegister) {
   X86ManagedRegister base = mbase.AsX86();
   CHECK(base.IsCpuRegister());
diff --git a/compiler/utils/x86/jni_macro_assembler_x86.h b/compiler/utils/x86/jni_macro_assembler_x86.h
index a701080..7bf2f98 100644
--- a/compiler/utils/x86/jni_macro_assembler_x86.h
+++ b/compiler/utils/x86/jni_macro_assembler_x86.h
@@ -146,6 +146,9 @@
   void VerifyObject(ManagedRegister src, bool could_be_null) override;
   void VerifyObject(FrameOffset src, bool could_be_null) override;
 
+  // Jump to address held at [base+offset] (used for tail calls).
+  void Jump(ManagedRegister base, Offset offset, ManagedRegister scratch) override;
+
   // Call to address held at [base+offset]
   void Call(ManagedRegister base, Offset offset, ManagedRegister scratch) override;
   void Call(FrameOffset base, Offset offset, ManagedRegister scratch) override;
diff --git a/compiler/utils/x86_64/assembler_x86_64_test.cc b/compiler/utils/x86_64/assembler_x86_64_test.cc
index 3921c4a..993cf95 100644
--- a/compiler/utils/x86_64/assembler_x86_64_test.cc
+++ b/compiler/utils/x86_64/assembler_x86_64_test.cc
@@ -2410,7 +2410,7 @@
 
   // Construct assembly text counterpart.
   std::ostringstream str;
-  str << "addq $0, %rsp\n";
+  // Increase by 0 is a NO-OP and ignored by the assembler.
   str << "addq $-" << kStackAlignment << ", %rsp\n";
   str << "addq $-" << 10 * kStackAlignment << ", %rsp\n";
 
@@ -2430,7 +2430,7 @@
 
   // Construct assembly text counterpart.
   std::ostringstream str;
-  str << "addq $0, %rsp\n";
+  // Decrease by 0 is a NO-OP and ignored by the assembler.
   str << "addq $" << kStackAlignment << ", %rsp\n";
   str << "addq $" << 10 * kStackAlignment << ", %rsp\n";
 
diff --git a/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc b/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
index 5924a8b..ffe9020 100644
--- a/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
+++ b/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
@@ -33,6 +33,9 @@
 
 constexpr size_t kFramePointerSize = 8;
 
+static constexpr size_t kNativeStackAlignment = 16;
+static_assert(kNativeStackAlignment == kStackAlignment);
+
 #define __ asm_.
 
 void X86_64JNIMacroAssembler::BuildFrame(size_t frame_size,
@@ -41,8 +44,13 @@
                                          const ManagedRegisterEntrySpills& entry_spills) {
   DCHECK_EQ(CodeSize(), 0U);  // Nothing emitted yet.
   cfi().SetCurrentCFAOffset(8);  // Return address on stack.
-  CHECK_ALIGNED(frame_size, kStackAlignment);
-  int gpr_count = 0;
+  // Note: @CriticalNative tail call is not used (would have frame_size == kFramePointerSize).
+  if (method_reg.IsNoRegister()) {
+    CHECK_ALIGNED(frame_size, kNativeStackAlignment);
+  } else {
+    CHECK_ALIGNED(frame_size, kStackAlignment);
+  }
+  size_t gpr_count = 0u;
   for (int i = spill_regs.size() - 1; i >= 0; --i) {
     x86_64::X86_64ManagedRegister spill = spill_regs[i].AsX86_64();
     if (spill.IsCpuRegister()) {
@@ -56,8 +64,10 @@
   int64_t rest_of_frame = static_cast<int64_t>(frame_size)
                           - (gpr_count * kFramePointerSize)
                           - kFramePointerSize /*return address*/;
-  __ subq(CpuRegister(RSP), Immediate(rest_of_frame));
-  cfi().AdjustCFAOffset(rest_of_frame);
+  if (rest_of_frame != 0) {
+    __ subq(CpuRegister(RSP), Immediate(rest_of_frame));
+    cfi().AdjustCFAOffset(rest_of_frame);
+  }
 
   // spill xmms
   int64_t offset = rest_of_frame;
@@ -73,7 +83,9 @@
   static_assert(static_cast<size_t>(kX86_64PointerSize) == kFramePointerSize,
                 "Unexpected frame pointer size.");
 
-  __ movq(Address(CpuRegister(RSP), 0), method_reg.AsX86_64().AsCpuRegister());
+  if (method_reg.IsRegister()) {
+    __ movq(Address(CpuRegister(RSP), 0), method_reg.AsX86_64().AsCpuRegister());
+  }
 
   for (const ManagedRegisterSpill& spill : entry_spills) {
     if (spill.AsX86_64().IsCpuRegister()) {
@@ -101,26 +113,29 @@
 void X86_64JNIMacroAssembler::RemoveFrame(size_t frame_size,
                                           ArrayRef<const ManagedRegister> spill_regs,
                                           bool may_suspend ATTRIBUTE_UNUSED) {
-  CHECK_ALIGNED(frame_size, kStackAlignment);
+  CHECK_ALIGNED(frame_size, kNativeStackAlignment);
   cfi().RememberState();
   int gpr_count = 0;
   // unspill xmms
   int64_t offset = static_cast<int64_t>(frame_size)
       - (spill_regs.size() * kFramePointerSize)
-      - 2 * kFramePointerSize;
+      - kFramePointerSize;
   for (size_t i = 0; i < spill_regs.size(); ++i) {
     x86_64::X86_64ManagedRegister spill = spill_regs[i].AsX86_64();
     if (spill.IsXmmRegister()) {
-      offset += sizeof(double);
       __ movsd(spill.AsXmmRegister(), Address(CpuRegister(RSP), offset));
       cfi().Restore(DWARFReg(spill.AsXmmRegister().AsFloatRegister()));
+      offset += sizeof(double);
     } else {
       gpr_count++;
     }
   }
-  int adjust = static_cast<int>(frame_size) - (gpr_count * kFramePointerSize) - kFramePointerSize;
-  __ addq(CpuRegister(RSP), Immediate(adjust));
-  cfi().AdjustCFAOffset(-adjust);
+  DCHECK_EQ(static_cast<size_t>(offset),
+            frame_size - (gpr_count * kFramePointerSize) - kFramePointerSize);
+  if (offset != 0) {
+    __ addq(CpuRegister(RSP), Immediate(offset));
+    cfi().AdjustCFAOffset(-offset);
+  }
   for (size_t i = 0; i < spill_regs.size(); ++i) {
     x86_64::X86_64ManagedRegister spill = spill_regs[i].AsX86_64();
     if (spill.IsCpuRegister()) {
@@ -136,15 +151,19 @@
 }
 
 void X86_64JNIMacroAssembler::IncreaseFrameSize(size_t adjust) {
-  CHECK_ALIGNED(adjust, kStackAlignment);
-  __ addq(CpuRegister(RSP), Immediate(-static_cast<int64_t>(adjust)));
-  cfi().AdjustCFAOffset(adjust);
+  if (adjust != 0u) {
+    CHECK_ALIGNED(adjust, kNativeStackAlignment);
+    __ addq(CpuRegister(RSP), Immediate(-static_cast<int64_t>(adjust)));
+    cfi().AdjustCFAOffset(adjust);
+  }
 }
 
 static void DecreaseFrameSizeImpl(size_t adjust, X86_64Assembler* assembler) {
-  CHECK_ALIGNED(adjust, kStackAlignment);
-  assembler->addq(CpuRegister(RSP), Immediate(adjust));
-  assembler->cfi().AdjustCFAOffset(-adjust);
+  if (adjust != 0u) {
+    CHECK_ALIGNED(adjust, kNativeStackAlignment);
+    assembler->addq(CpuRegister(RSP), Immediate(adjust));
+    assembler->cfi().AdjustCFAOffset(-adjust);
+  }
 }
 
 void X86_64JNIMacroAssembler::DecreaseFrameSize(size_t adjust) {
@@ -544,6 +563,12 @@
   // TODO: not validating references
 }
 
+void X86_64JNIMacroAssembler::Jump(ManagedRegister mbase, Offset offset, ManagedRegister) {
+  X86_64ManagedRegister base = mbase.AsX86_64();
+  CHECK(base.IsCpuRegister());
+  __ jmp(Address(base.AsCpuRegister(), offset.Int32Value()));
+}
+
 void X86_64JNIMacroAssembler::Call(ManagedRegister mbase, Offset offset, ManagedRegister) {
   X86_64ManagedRegister base = mbase.AsX86_64();
   CHECK(base.IsCpuRegister());
diff --git a/compiler/utils/x86_64/jni_macro_assembler_x86_64.h b/compiler/utils/x86_64/jni_macro_assembler_x86_64.h
index 4c2fd8f..d3f1fce 100644
--- a/compiler/utils/x86_64/jni_macro_assembler_x86_64.h
+++ b/compiler/utils/x86_64/jni_macro_assembler_x86_64.h
@@ -172,6 +172,9 @@
   void VerifyObject(ManagedRegister src, bool could_be_null) override;
   void VerifyObject(FrameOffset src, bool could_be_null) override;
 
+  // Jump to address held at [base+offset] (used for tail calls).
+  void Jump(ManagedRegister base, Offset offset, ManagedRegister scratch) override;
+
   // Call to address held at [base+offset]
   void Call(ManagedRegister base, Offset offset, ManagedRegister scratch) override;
   void Call(FrameOffset base, Offset offset, ManagedRegister scratch) override;