When entering nterp, take a fast path for instance calls with 1 argument.

Such methods take only 'this' as an argument, so we don't need to fetch
the shorty to set up their parameters.

The same optimization can be applied to nterp->compiled calls as a
follow-up, by checking that the instruction following the invoke is not
move-result(-wide).

Test: test.py
Bug: 112676029
Change-Id: Ibc7b4d4ca1c636f4ad6572484e0990ccdbd63293
diff --git a/runtime/interpreter/mterp/arm64ng/main.S b/runtime/interpreter/mterp/arm64ng/main.S
index b6d9db6..a977a90 100644
--- a/runtime/interpreter/mterp/arm64ng/main.S
+++ b/runtime/interpreter/mterp/arm64ng/main.S
@@ -383,7 +383,7 @@
 //
 // Outputs
 // - ip contains the dex registers size
-// - x13 contains the old stack pointer.
+// - x28 contains the old stack pointer.
 // - \code_item is replaced with a pointer to the instructions
 // - if load_ins is 1, w15 contains the ins
 //
@@ -410,10 +410,10 @@
     add \fp, \refs, ip, lsl #2
 
     // Now setup the stack pointer.
-    mov x13, sp
-    .cfi_def_cfa_register x13
+    mov x28, sp
+    .cfi_def_cfa_register x28
     mov sp, x14
-    str x13, [\refs, #-8]
+    str x28, [\refs, #-8]
     CFI_DEF_CFA_BREG_PLUS_UCONST \cfi_refs, -8, CALLEE_SAVES_SIZE
 
     // Put nulls in reference frame.
@@ -483,14 +483,10 @@
 .endm
 
 .macro SPILL_ALL_ARGUMENTS
-    INCREASE_FRAME 128
-    // GP arguments.
-    SAVE_TWO_REGS x0, x1, 0
-    SAVE_TWO_REGS x2, x3, 16
-    SAVE_TWO_REGS x4, x5, 32
-    SAVE_TWO_REGS x6, x7, 48
-
-    // FP arguments
+    stp x0, x1, [sp, #-128]!
+    stp x2, x3, [sp, #16]
+    stp x4, x5, [sp, #32]
+    stp x6, x7, [sp, #48]
     stp d0, d1, [sp, #64]
     stp d2, d3, [sp, #80]
     stp d4, d5, [sp, #96]
@@ -498,18 +494,14 @@
 .endm
 
 .macro RESTORE_ALL_ARGUMENTS
-    // GP arguments.
-    RESTORE_TWO_REGS x0, x1, 0
-    RESTORE_TWO_REGS x2, x3, 16
-    RESTORE_TWO_REGS x4, x5, 32
-    RESTORE_TWO_REGS x6, x7, 48
-
-    // FP arguments
+    ldp x2, x3, [sp, #16]
+    ldp x4, x5, [sp, #32]
+    ldp x6, x7, [sp, #48]
     ldp d0, d1, [sp, #64]
     ldp d2, d3, [sp, #80]
     ldp d4, d5, [sp, #96]
     ldp d6, d7, [sp, #112]
-    DECREASE_FRAME 128
+    ldp x0, x1, [sp], #128
 .endm
 
 // Helper to setup the stack after doing a nterp to nterp call. This will setup:
@@ -1356,16 +1348,7 @@
     /* Spill callee save regs */
     SPILL_ALL_CALLEE_SAVES
 
-    // TODO: Get shorty in a better way and remove below
-    SPILL_ALL_ARGUMENTS
-
-    bl NterpGetShorty
-    // Save shorty in callee-save xIBASE.
-    mov xIBASE, x0
-
-    RESTORE_ALL_ARGUMENTS
     ldr xPC, [x0, #ART_METHOD_DATA_OFFSET_64]
-
     // Setup the stack for executing the method.
     SETUP_STACK_FRAME xPC, xREFS, xFP, CFI_REFS, load_ins=1
 
@@ -1373,24 +1356,37 @@
     cbz w15, .Lxmm_setup_finished
 
     sub ip2, ip, x15
-    lsl x8, ip2, #2 // x8 is now the offset for inputs into the registers array.
+    ldr w26, [x0, #ART_METHOD_ACCESS_FLAGS_OFFSET]
+    lsl x21, ip2, #2 // x21 is now the offset for inputs into the registers array.
 
-    // Setup shorty, pointer to inputs in FP and pointer to inputs in REFS
-    add x9, xIBASE, #1  // shorty + 1  ; ie skip return arg character
-    add x10, xFP, x8
-    add x11, xREFS, x8
+    // If the method is not static and there is one argument ('this'), we don't need to fetch the
+    // shorty.
+    tbnz w26, #ART_METHOD_IS_STATIC_FLAG_BIT, .Lsetup_with_shorty
+    str w1, [xFP, x21]
+    str w1, [xREFS, x21]
+    cmp w15, #1
+    b.eq .Lxmm_setup_finished
 
-    ldr wip, [x0, #ART_METHOD_ACCESS_FLAGS_OFFSET]
-    // TODO: could be TBNZ but we'd need a constant for log2(ART_METHOD_IS_STATIC_FLAG).
-    tst wip, #ART_METHOD_IS_STATIC_FLAG
-    b.ne .Lhandle_static_method
-    str w1, [x10], #4
-    str w1, [x11], #4
-    add x13, x13, #4
+.Lsetup_with_shorty:
+    // TODO: Get shorty in a better way and remove below
+    SPILL_ALL_ARGUMENTS
+    bl NterpGetShorty
+    // Save shorty in callee-save xIBASE.
+    mov xIBASE, x0
+    RESTORE_ALL_ARGUMENTS
+
+    // Setup pointer to inputs in FP and pointer to inputs in REFS
+    add x10, xFP, x21
+    add x11, xREFS, x21
     mov x12, #0
+
+    add x9, xIBASE, #1  // shorty + 1  ; ie skip return arg character
+    tbnz w26, #ART_METHOD_IS_STATIC_FLAG_BIT, .Lhandle_static_method
+    add x10, x10, #4
+    add x11, x11, #4
+    add x28, x28, #4
     b .Lcontinue_setup_gprs
 .Lhandle_static_method:
-    mov x12, #0
     LOOP_OVER_SHORTY_STORING_GPRS x1, w1, x9, x12, x10, x11, .Lgpr_setup_finished
 .Lcontinue_setup_gprs:
     LOOP_OVER_SHORTY_STORING_GPRS x2, w2, x9, x12, x10, x11, .Lgpr_setup_finished
@@ -1399,7 +1395,7 @@
     LOOP_OVER_SHORTY_STORING_GPRS x5, w5, x9, x12, x10, x11, .Lgpr_setup_finished
     LOOP_OVER_SHORTY_STORING_GPRS x6, w6, x9, x12, x10, x11, .Lgpr_setup_finished
     LOOP_OVER_SHORTY_STORING_GPRS x7, w7, x9, x12, x10, x11, .Lgpr_setup_finished
-    LOOP_OVER_INTs x9, x12, x10, x11, x13, .Lgpr_setup_finished
+    LOOP_OVER_INTs x9, x12, x10, x11, x28, .Lgpr_setup_finished
 .Lgpr_setup_finished:
     add x9, xIBASE, #1  // shorty + 1  ; ie skip return arg character
     mov x12, #0  // reset counter
@@ -1411,7 +1407,7 @@
     LOOP_OVER_SHORTY_STORING_FPS d5, s5, x9, x12, x10, .Lxmm_setup_finished
     LOOP_OVER_SHORTY_STORING_FPS d6, s6, x9, x12, x10, .Lxmm_setup_finished
     LOOP_OVER_SHORTY_STORING_FPS d7, s7, x9, x12, x10, .Lxmm_setup_finished
-    LOOP_OVER_FPs x9, x12, x10, x13, .Lxmm_setup_finished
+    LOOP_OVER_FPs x9, x12, x10, x28, .Lxmm_setup_finished
 .Lxmm_setup_finished:
     CFI_DEFINE_DEX_PC_WITH_OFFSET(CFI_TMP, CFI_DEX, 0)
 
diff --git a/runtime/interpreter/mterp/armng/main.S b/runtime/interpreter/mterp/armng/main.S
index 7095f58..d2ca06f 100644
--- a/runtime/interpreter/mterp/armng/main.S
+++ b/runtime/interpreter/mterp/armng/main.S
@@ -505,31 +505,6 @@
     .endif
 .endm
 
-.macro SPILL_ALL_ARGUMENTS
-    // We spill r4 for stack alignment.
-    push {r0-r4}
-    .cfi_adjust_cfa_offset 20
-    .cfi_rel_offset r0, 0
-    .cfi_rel_offset r1, 4
-    .cfi_rel_offset r2, 8
-    .cfi_rel_offset r3, 12
-    .cfi_rel_offset r4, 16
-    vpush {s0-s15}
-    .cfi_adjust_cfa_offset 64
-.endm
-
-.macro RESTORE_ALL_ARGUMENTS
-    vpop {s0-s15}
-    .cfi_adjust_cfa_offset -64
-    pop {r0-r4}
-    .cfi_restore r0
-    .cfi_restore r1
-    .cfi_restore r2
-    .cfi_restore r3
-    .cfi_restore r4
-    .cfi_adjust_cfa_offset -20
-.endm
-
 // Helper to setup the stack after doing a nterp to nterp call. This will setup:
 // - rNEW_FP: the new pointer to dex registers
 // - rNEW_REFS: the new pointer to references
@@ -1396,15 +1371,6 @@
     /* Spill callee save regs */
     SPILL_ALL_CALLEE_SAVES
 
-    // TODO: Get shorty in a better way and remove below
-    SPILL_ALL_ARGUMENTS
-
-    bl NterpGetShorty
-    // Save shorty in callee-save rIBASE.
-    mov rIBASE, r0
-
-    RESTORE_ALL_ARGUMENTS
-
     ldr rPC, [r0, #ART_METHOD_DATA_OFFSET_32]
 
     // Setup the stack for executing the method.
@@ -1414,36 +1380,54 @@
     cmp r4, #0
     beq .Lxmm_setup_finished
 
-    sub r4, rINST, r4
-    lsl r4, r4, #2 // r4 is now the offset for inputs into the registers array.
+    sub rINST, rINST, r4
+    ldr r8, [r0, #ART_METHOD_ACCESS_FLAGS_OFFSET]
+    lsl rINST, rINST, #2 // rINST is now the offset for inputs into the registers array.
+    mov rIBASE, ip // rIBASE contains the old stack pointer
 
-    mov lr, ip // lr contains the old stack pointer
+    // If the method is not static and there is one argument ('this'), we don't need to fetch the
+    // shorty.
+    tst r8, #ART_METHOD_IS_STATIC_FLAG
+    bne .Lsetup_with_shorty
+    str r1, [rFP, rINST]
+    str r1, [rREFS, rINST]
+    cmp r4, #1
+    beq .Lxmm_setup_finished
 
-    ldr ip, [r0, #ART_METHOD_ACCESS_FLAGS_OFFSET]
-    // r0 is now available.
+.Lsetup_with_shorty:
+    // Save arguments that were passed before calling into the runtime.
+    // No need to save r0 (ArtMethod) as we're not using it later in this code.
+    // Save r4 for stack alignment.
+    // TODO: Get shorty in a better way and remove below
+    push {r1-r4}
+    vpush {s0-s15}
+    bl NterpGetShorty
+    vpop {s0-s15}
+    pop {r1-r4}
+
+    mov ip, r8
+    add r8, rREFS, rINST
+    add r7, rFP, rINST
+    mov r4, #0
     // Setup shorty, pointer to inputs in FP and pointer to inputs in REFS
-    add r0, rIBASE, #1  // shorty + 1  ; ie skip return arg character
-    add r7, rFP, r4
-    add r8, rREFS, r4
+    add lr, r0, #1  // shorty + 1  ; ie skip return arg character
     tst ip, #ART_METHOD_IS_STATIC_FLAG
     bne .Lhandle_static_method
-    str r1, [r7], #4
-    str r1, [r8], #4
-    add lr, lr, #4
-    mov r4, #0
+    add r7, r7, #4
+    add r8, r8, #4
+    add rIBASE, rIBASE, #4
     b .Lcontinue_setup_gprs
 .Lhandle_static_method:
-    mov r4, #0
-    LOOP_OVER_SHORTY_STORING_GPRS r1, r0, r4, r7, r8, .Lgpr_setup_finished, .Lif_long, is_r3=0
+    LOOP_OVER_SHORTY_STORING_GPRS r1, lr, r4, r7, r8, .Lgpr_setup_finished, .Lif_long, is_r3=0
 .Lcontinue_setup_gprs:
-    LOOP_OVER_SHORTY_STORING_GPRS r2, r0, r4, r7, r8, .Lgpr_setup_finished, .Lif_long, is_r3=0
-    LOOP_OVER_SHORTY_STORING_GPRS r3, r0, r4, r7, r8, .Lgpr_setup_finished, .Lif_long, is_r3=1
+    LOOP_OVER_SHORTY_STORING_GPRS r2, lr, r4, r7, r8, .Lgpr_setup_finished, .Lif_long, is_r3=0
+    LOOP_OVER_SHORTY_STORING_GPRS r3, lr, r4, r7, r8, .Lgpr_setup_finished, .Lif_long, is_r3=1
 .Lif_long:
-    LOOP_OVER_INTs r0, r4, r7, r8, lr, ip, r1, .Lgpr_setup_finished
+    LOOP_OVER_INTs lr, r4, r7, r8, rIBASE, ip, r1, .Lgpr_setup_finished
 .Lgpr_setup_finished:
-    add r0, rIBASE, #1  // shorty + 1  ; ie skip return arg character
+    add r0, r0, #1  // shorty + 1  ; ie skip return arg character
     mov r1, r7
-    add r2, lr, #OFFSET_TO_FIRST_ARGUMENT_IN_STACK
+    add r2, rIBASE, #OFFSET_TO_FIRST_ARGUMENT_IN_STACK
     vpush {s0-s15}
     mov r3, sp
     bl NterpStoreArm32Fprs
diff --git a/runtime/interpreter/mterp/x86_64ng/main.S b/runtime/interpreter/mterp/x86_64ng/main.S
index 20dc760..02f0c5a 100644
--- a/runtime/interpreter/mterp/x86_64ng/main.S
+++ b/runtime/interpreter/mterp/x86_64ng/main.S
@@ -1450,17 +1450,40 @@
     /* Spill callee save regs */
     SPILL_ALL_CALLEE_SAVES
 
+    movq ART_METHOD_DATA_OFFSET_64(%rdi), rPC
+
+    // Setup the stack for executing the method.
+    SETUP_STACK_FRAME rPC, rREFS, rREFS32, rFP, CFI_REFS, load_ins=1
+
+    // Setup the parameters
+    testl %r14d, %r14d
+    je .Lxmm_setup_finished
+
+    subq %r14, %rbx
+    salq $$2, %rbx // rbx is now the offset for inputs into the registers array.
+
+    // If the method is not static and there is one argument ('this'), we don't need to fetch the
+    // shorty.
+    testl $$ART_METHOD_IS_STATIC_FLAG, ART_METHOD_ACCESS_FLAGS_OFFSET(%rdi)
+    jne .Lsetup_with_shorty
+
+    movl %esi, (rFP, %rbx)
+    movl %esi, (rREFS, %rbx)
+
+    cmpl $$1, %r14d
+    je .Lxmm_setup_finished
+
+.Lsetup_with_shorty:
     // TODO: Get shorty in a better way and remove below
-    PUSH rdi
-    PUSH rsi
-    PUSH rdx
-    PUSH rcx
-    PUSH r8
-    PUSH r9
+    push %rdi
+    push %rsi
+    push %rdx
+    push %rcx
+    push %r8
+    push %r9
 
     // Save xmm registers + alignment.
     subq MACRO_LITERAL(8 * 8 + 8), %rsp
-    CFI_ADJUST_CFA_OFFSET(8 * 8 + 8)
     movq %xmm0, 0(%rsp)
     movq %xmm1, 8(%rsp)
     movq %xmm2, 16(%rsp)
@@ -1484,45 +1507,31 @@
     movq 48(%rsp), %xmm6
     movq 56(%rsp), %xmm7
     addq MACRO_LITERAL(8 * 8 + 8), %rsp
-    CFI_ADJUST_CFA_OFFSET(-8 * 8 - 8)
 
-    POP r9
-    POP r8
-    POP rcx
-    POP rdx
-    POP rsi
-    POP rdi
+    pop %r9
+    pop %r8
+    pop %rcx
+    pop %rdx
+    pop %rsi
+    pop %rdi
+    // Reload the old stack pointer: %r11 is not callee-saved, so the call above may clobber it.
+    movq -8(rREFS), %r11
     // TODO: Get shorty in a better way and remove above
 
-    movq ART_METHOD_DATA_OFFSET_64(%rdi), rPC
-
-    // Setup the stack for executing the method.
-    SETUP_STACK_FRAME rPC, rREFS, rREFS32, rFP, CFI_REFS, load_ins=1
-
-    // Setup the parameters
-    testl %r14d, %r14d
-    je .Lxmm_setup_finished
-
-    subq %r14, %rbx
-    salq $$2, %rbx // rbx is now the offset for inputs into the registers array.
-
+    movq $$0, %r14
     testl $$ART_METHOD_IS_STATIC_FLAG, ART_METHOD_ACCESS_FLAGS_OFFSET(%rdi)
 
-    // Available: rdi, r10, r14
+    // Available: rdi, r10
     // Note the leaq below don't change the flags.
     leaq 1(%rbp), %r10  // shorty + 1  ; ie skip return arg character
     leaq (rFP, %rbx, 1), %rdi
     leaq (rREFS, %rbx, 1), %rbx
     jne .Lhandle_static_method
-    movl %esi, (%rdi)
-    movl %esi, (%rbx)
     addq $$4, %rdi
     addq $$4, %rbx
     addq $$4, %r11
-    movq $$0, %r14
     jmp .Lcontinue_setup_gprs
 .Lhandle_static_method:
-    movq $$0, %r14
     LOOP_OVER_SHORTY_STORING_GPRS rsi, esi, r10, r14, rdi, rbx, .Lgpr_setup_finished
 .Lcontinue_setup_gprs:
     LOOP_OVER_SHORTY_STORING_GPRS rdx, edx, r10, r14, rdi, rbx, .Lgpr_setup_finished
diff --git a/tools/cpp-define-generator/art_method.def b/tools/cpp-define-generator/art_method.def
index 7b5606f..c2e18b1 100644
--- a/tools/cpp-define-generator/art_method.def
+++ b/tools/cpp-define-generator/art_method.def
@@ -25,6 +25,8 @@
            art::kAccStatic)
 ASM_DEFINE(ART_METHOD_IMT_MASK,
            art::ImTable::kSizeTruncToPowerOfTwo - 1)
+ASM_DEFINE(ART_METHOD_IS_STATIC_FLAG_BIT,
+           art::MostSignificantBit(art::kAccStatic))
 ASM_DEFINE(ART_METHOD_DECLARING_CLASS_OFFSET,
            art::ArtMethod::DeclaringClassOffset().Int32Value())
 ASM_DEFINE(ART_METHOD_JNI_OFFSET_32,