Minor improvements in JNI assemblers.

Address issues identified in
    https://android-review.googlesource.com/1246286
and clean up a few other things:
  - Use LR instead of an acquired scratch register for indirect
    calls and exception delivery in the arm and arm64 JNI macro
    assemblers; BLX/BLR clobbers LR anyway.
  - On arm64, use W registers directly in StoreImmediateToFrame
    and TestGcMarking, let the macro assembler handle large
    offsets in Copy(), and select the handle scope entry with a
    single CSEL against XZR.
  - On x86 and x86-64, compare self->tls32_.is_gc_marking with
    immediate 0 in memory instead of loading it to a scratch
    register; on x86-64, use MOVQ for 8-byte stack-to-stack
    copies.
  - Change the x86-64 @CriticalNative hidden argument register
    from R11 to RAX so that R11 can serve as the scratch
    register, in line with Optimizing; update the dlsym lookup
    critical stub and GenericJNI trampoline accordingly and
    bump the oat version.

Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing
Test: aosp_taimen-userdebug boots.
Test: run-gtests.sh
Test: testrunner.py --target --optimizing
Bug: 12189621
Change-Id: I9ee1a9c113ef756d7aa3bd4d3f17ef1aaa4306f5
diff --git a/compiler/jni/quick/x86_64/calling_convention_x86_64.cc b/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
index 579347f9..e97cab8 100644
--- a/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
+++ b/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
@@ -304,13 +304,13 @@
 
 ManagedRegister X86_64JniCallingConvention::HiddenArgumentRegister() const {
   CHECK(IsCriticalNative());
-  // R11 is neither managed callee-save, nor argument register, nor scratch register.
+  // RAX is neither managed callee-save, nor argument register, nor scratch register.
   DCHECK(std::none_of(kCalleeSaveRegisters,
                       kCalleeSaveRegisters + std::size(kCalleeSaveRegisters),
                       [](ManagedRegister callee_save) constexpr {
-                        return callee_save.Equals(X86_64ManagedRegister::FromCpuRegister(R11));
+                        return callee_save.Equals(X86_64ManagedRegister::FromCpuRegister(RAX));
                       }));
-  return X86_64ManagedRegister::FromCpuRegister(R11);
+  return X86_64ManagedRegister::FromCpuRegister(RAX);
 }
 
 // Whether to use tail call (used only for @CriticalNative).
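
Why RAX qualifies as the @CriticalNative hidden argument register: under
the System V x86-64 ABI used for @CriticalNative calls, the integer
argument registers are RDI, RSI, RDX, RCX, R8 and R9; RAX is only the
return (and varargs AL-count) register; and the callee-save set is RBX,
RBP and R12-R15. A minimal sketch of the argument-register half of that
claim, mirroring the DCHECK above (kArgumentRegisters is an illustrative
name, not an actual array in this file):

    static constexpr Register kArgumentRegisters[] = { RDI, RSI, RDX, RCX, R8, R9 };
    DCHECK(std::none_of(kArgumentRegisters,
                        kArgumentRegisters + std::size(kArgumentRegisters),
                        [](Register arg) constexpr { return arg == RAX; }));
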
diff --git a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
index 4efbe9f..5a355be 100644
--- a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
+++ b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
@@ -576,20 +576,16 @@
 
 void ArmVIXLJNIMacroAssembler::Call(ManagedRegister mbase, Offset offset) {
   vixl::aarch32::Register base = AsVIXLRegister(mbase.AsArm());
-  UseScratchRegisterScope temps(asm_.GetVIXLAssembler());
-  vixl32::Register scratch = temps.Acquire();
-  asm_.LoadFromOffset(kLoadWord, scratch, base, offset.Int32Value());
-  ___ Blx(scratch);
+  asm_.LoadFromOffset(kLoadWord, lr, base, offset.Int32Value());
+  ___ Blx(lr);
   // TODO: place reference map on call.
 }
 
 void ArmVIXLJNIMacroAssembler::Call(FrameOffset base, Offset offset) {
-  UseScratchRegisterScope temps(asm_.GetVIXLAssembler());
-  vixl32::Register scratch = temps.Acquire();
   // Call *(*(SP + base) + offset)
-  asm_.LoadFromOffset(kLoadWord, scratch, sp, base.Int32Value());
-  asm_.LoadFromOffset(kLoadWord, scratch, scratch, offset.Int32Value());
-  ___ Blx(scratch);
+  asm_.LoadFromOffset(kLoadWord, lr, sp, base.Int32Value());
+  asm_.LoadFromOffset(kLoadWord, lr, lr, offset.Int32Value());
+  ___ Blx(lr);
   // TODO: place reference map on call
 }
 
@@ -671,13 +667,10 @@
   // Pass exception object as argument.
   // Don't care about preserving r0 as this won't return.
   ___ Mov(r0, scratch);
-  temps.Include(scratch);
-  // TODO: check that exception->scratch_ is dead by this point.
-  vixl32::Register temp = temps.Acquire();
-  ___ Ldr(temp,
+  ___ Ldr(lr,
           MemOperand(tr,
               QUICK_ENTRYPOINT_OFFSET(kArmPointerSize, pDeliverException).Int32Value()));
-  ___ Blx(temp);
+  ___ Blx(lr);
 }
 
 void ArmVIXLJNIMacroAssembler::MemoryBarrier(ManagedRegister scratch ATTRIBUTE_UNUSED) {
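
Using LR as the load destination for these calls is free: BLX writes the
return address to LR, so whatever LR held is clobbered by the call anyway,
and dropping the UseScratchRegisterScope keeps IP available for other
uses. A C++ model of the rewritten Call(FrameOffset, Offset) semantics
(sp, base and offset are illustrative locals):

    using Fn = void (*)();
    char* table = *reinterpret_cast<char**>(sp + base);  // ldr lr, [sp, #base]
    Fn target = *reinterpret_cast<Fn*>(table + offset);  // ldr lr, [lr, #offset]
    target();                                            // blx lr
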
diff --git a/compiler/utils/arm64/jni_macro_assembler_arm64.cc b/compiler/utils/arm64/jni_macro_assembler_arm64.cc
index c2aef83..a31ed93 100644
--- a/compiler/utils/arm64/jni_macro_assembler_arm64.cc
+++ b/compiler/utils/arm64/jni_macro_assembler_arm64.cc
@@ -165,7 +165,7 @@
 void Arm64JNIMacroAssembler::StoreImmediateToFrame(FrameOffset offs, uint32_t imm) {
   UseScratchRegisterScope temps(asm_.GetVIXLAssembler());
   Register scratch = temps.AcquireW();
-  ___ Mov(scratch.X(), imm);  // TODO: Use W register.
+  ___ Mov(scratch, imm);
   ___ Str(scratch, MEM_OP(reg_x(SP), offs.Int32Value()));
 }
 
@@ -408,13 +408,7 @@
   DCHECK(size == 4 || size == 8) << size;
   UseScratchRegisterScope temps(asm_.GetVIXLAssembler());
   Register scratch = (size == 8) ? temps.AcquireX() : temps.AcquireW();
-  if (size < 8u || IsAligned<8u>(src.Int32Value()) || src.Int32Value() < 0x100) {
-    ___ Ldr(scratch, MEM_OP(reg_x(SP), src.Int32Value()));
-  } else {
-    // TODO: Let the macro assembler deal with this case as well (uses another scratch register).
-    ___ Mov(scratch.X(), src.Int32Value());
-    ___ Ldr(scratch, MEM_OP(reg_x(SP), scratch.X()));
-  }
+  ___ Ldr(scratch, MEM_OP(reg_x(SP), src.Int32Value()));
   ___ Str(scratch, MEM_OP(reg_x(SP), dest.Int32Value()));
 }
 
@@ -559,19 +553,15 @@
 void Arm64JNIMacroAssembler::Call(ManagedRegister m_base, Offset offs) {
   Arm64ManagedRegister base = m_base.AsArm64();
   CHECK(base.IsXRegister()) << base;
-  UseScratchRegisterScope temps(asm_.GetVIXLAssembler());
-  Register scratch = temps.AcquireX();
-  ___ Ldr(scratch, MEM_OP(reg_x(base.AsXRegister()), offs.Int32Value()));
-  ___ Blr(scratch);
+  ___ Ldr(lr, MEM_OP(reg_x(base.AsXRegister()), offs.Int32Value()));
+  ___ Blr(lr);
 }
 
 void Arm64JNIMacroAssembler::Call(FrameOffset base, Offset offs) {
-  UseScratchRegisterScope temps(asm_.GetVIXLAssembler());
-  Register scratch = temps.AcquireX();
   // Call *(*(SP + base) + offset)
-  ___ Ldr(scratch, MEM_OP(reg_x(SP), base.Int32Value()));
-  ___ Ldr(scratch, MEM_OP(scratch, offs.Int32Value()));
-  ___ Blr(scratch);
+  ___ Ldr(lr, MEM_OP(reg_x(SP), base.Int32Value()));
+  ___ Ldr(lr, MEM_OP(lr, offs.Int32Value()));
+  ___ Blr(lr);
 }
 
 void Arm64JNIMacroAssembler::CallFromThread(ThreadOffset64 offset ATTRIBUTE_UNUSED) {
@@ -612,15 +602,14 @@
   UseScratchRegisterScope temps(asm_.GetVIXLAssembler());
   Register scratch = temps.AcquireX();
   if (null_allowed) {
-    // TODO: Clean this up; load to temp2 (W register), use xzr for CSEL, reorder ADD earlier.
     Register scratch2 = temps.AcquireW();
-    ___ Ldr(scratch.W(), MEM_OP(reg_x(SP), handle_scope_offset.Int32Value()));
+    ___ Ldr(scratch2, MEM_OP(reg_x(SP), handle_scope_offset.Int32Value()));
+    ___ Add(scratch, reg_x(SP), handle_scope_offset.Int32Value());
     // Null values get a handle scope entry value of 0.  Otherwise, the handle scope entry is
     // the address in the handle scope holding the reference.
     // e.g. scratch = (scratch == 0) ? 0 : (SP+handle_scope_offset)
-    ___ Cmp(scratch.W(), 0);
-    ___ Add(scratch2.X(), reg_x(SP), handle_scope_offset.Int32Value());
-    ___ Csel(scratch, scratch2.X(), scratch, ne);
+    ___ Cmp(scratch2, 0);
+    ___ Csel(scratch, scratch, xzr, ne);
   } else {
     ___ Add(scratch, reg_x(SP), handle_scope_offset.Int32Value());
   }
@@ -669,12 +658,11 @@
   Register scratch = temps.AcquireW();
   ___ Ldr(scratch, MEM_OP(reg_x(TR), Thread::IsGcMarkingOffset<kArm64PointerSize>().Int32Value()));
   switch (cond) {
-    // TODO: Use `scratch` instead of `scratch.X()`.
     case JNIMacroUnaryCondition::kZero:
-      ___ Cbz(scratch.X(), Arm64JNIMacroLabel::Cast(label)->AsArm64());
+      ___ Cbz(scratch, Arm64JNIMacroLabel::Cast(label)->AsArm64());
       break;
     case JNIMacroUnaryCondition::kNotZero:
-      ___ Cbnz(scratch.X(), Arm64JNIMacroLabel::Cast(label)->AsArm64());
+      ___ Cbnz(scratch, Arm64JNIMacroLabel::Cast(label)->AsArm64());
       break;
     default:
       LOG(FATAL) << "Not implemented unary condition: " << static_cast<int>(cond);
@@ -690,7 +678,6 @@
 void Arm64JNIMacroAssembler::EmitExceptionPoll(Arm64Exception* exception) {
   UseScratchRegisterScope temps(asm_.GetVIXLAssembler());
   temps.Exclude(exception->scratch_);
-  Register scratch = temps.AcquireX();
 
   // Bind exception poll entry.
   ___ Bind(exception->Entry());
@@ -700,11 +687,11 @@
   // Pass exception object as argument.
   // Don't care about preserving X0 as this won't return.
   ___ Mov(reg_x(X0), exception->scratch_);
-  ___ Ldr(scratch,
+  ___ Ldr(lr,
           MEM_OP(reg_x(TR),
                  QUICK_ENTRYPOINT_OFFSET(kArm64PointerSize, pDeliverException).Int32Value()));
 
-  ___ Blr(scratch);
+  ___ Blr(lr);
   // Call should never return.
   ___ Brk();
 }
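
The reworked null_allowed path computes the handle scope entry address
unconditionally and then selects against XZR, replacing the old
Cmp/Add/Csel shuffle through a second X-width temporary. A C++ model of
the emitted sequence (sp and handle_scope_offset are illustrative
locals):

    uint32_t ref = *reinterpret_cast<uint32_t*>(sp + handle_scope_offset);  // Ldr scratch2
    char* entry = sp + handle_scope_offset;                                 // Add scratch
    char* result = (ref != 0) ? entry : nullptr;                            // Cmp + Csel vs xzr
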
diff --git a/compiler/utils/assembler_thumb_test_expected.cc.inc b/compiler/utils/assembler_thumb_test_expected.cc.inc
index d6b1c50..b7a6058 100644
--- a/compiler/utils/assembler_thumb_test_expected.cc.inc
+++ b/compiler/utils/assembler_thumb_test_expected.cc.inc
@@ -41,8 +41,8 @@
   "  7e:	f50d 5c80 	add.w	ip, sp, #4096	; 0x1000\n",
   "  82:	f8c9 c200 	str.w	ip, [r9, #512]	; 0x200\n",
   "  86:	f8c9 d200 	str.w	sp, [r9, #512]	; 0x200\n",
-  "  8a:	f8d0 c030 	ldr.w	ip, [r0, #48]	; 0x30\n",
-  "  8e:	47e0      	blx	ip\n",
+  "  8a:	f8d0 e030 	ldr.w	lr, [r0, #48]	; 0x30\n",
+  "  8e:	47f0      	blx	lr\n",
   "  90:	f8dd c02c 	ldr.w	ip, [sp, #44]	; 0x2c\n",
   "  94:	f8cd c030 	str.w	ip, [sp, #48]	; 0x30\n",
   "  98:	f8d9 c200 	ldr.w	ip, [r9, #512]	; 0x200\n",
@@ -153,8 +153,8 @@
   " 21c:	f8d9 8034 	ldr.w	r8, [r9, #52]	; 0x34\n",
   " 220:	4770      	bx	lr\n",
   " 222:	4660      	mov	r0, ip\n",
-  " 224:	f8d9 c2e8 	ldr.w	ip, [r9, #744]	; 0x2e8\n",
-  " 228:	47e0      	blx	ip\n",
+  " 224:	f8d9 e2e8 	ldr.w	lr, [r9, #744]	; 0x2e8\n",
+  " 228:	47f0      	blx	lr\n",
   nullptr
 };
 
diff --git a/compiler/utils/x86/jni_macro_assembler_x86.cc b/compiler/utils/x86/jni_macro_assembler_x86.cc
index e4ce338..1adcc20 100644
--- a/compiler/utils/x86/jni_macro_assembler_x86.cc
+++ b/compiler/utils/x86/jni_macro_assembler_x86.cc
@@ -563,14 +563,10 @@
       UNREACHABLE();
   }
 
-  // TODO: Compare the memory location with immediate 0.
-  Register scratch = GetScratchRegister();
-  DCHECK_EQ(Thread::IsGcMarkingSize(), 4u);
-  __ fs()->movl(scratch, Address::Absolute(Thread::IsGcMarkingOffset<kX86PointerSize>()));
-
-  // TEST reg, reg
+  // CMP self->tls32_.is_gc_marking, 0
   // Jcc <Offset>
-  __ testl(scratch, scratch);
+  DCHECK_EQ(Thread::IsGcMarkingSize(), 4u);
+  __ fs()->cmpl(Address::Absolute(Thread::IsGcMarkingOffset<kX86PointerSize>()), Immediate(0));
   __ j(x86_cond, X86JNIMacroLabel::Cast(label)->AsX86());
 }
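
Comparing the flag in memory drops the scratch register and one
instruction; the DCHECK pins Thread::IsGcMarkingSize() to 4 so the 32-bit
CMPL matches the field width. What the emitted cmpl/jcc pair tests,
modeled in C++ (thread_self and is_gc_marking_offset are illustrative
locals):

    int32_t is_gc_marking =
        *reinterpret_cast<const int32_t*>(thread_self + is_gc_marking_offset);
    bool branch_taken = (is_gc_marking != 0);  // kNotZero; kZero branches on == 0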
 
diff --git a/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc b/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
index ee4ae2e..d57ea41 100644
--- a/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
+++ b/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
@@ -37,8 +37,7 @@
 static_assert(kNativeStackAlignment == kStackAlignment);
 
 static inline CpuRegister GetScratchRegister() {
-  // TODO: Use R11 in line with Optimizing.
-  return CpuRegister(RAX);
+  return CpuRegister(R11);
 }
 
 #define __ asm_.
@@ -403,11 +402,8 @@
   DCHECK(size == 4 || size == 8) << size;
   CpuRegister scratch = GetScratchRegister();
   if (size == 8) {
-    // TODO: Use MOVQ.
-    __ movl(scratch, Address(CpuRegister(RSP), src));
-    __ movl(Address(CpuRegister(RSP), dest), scratch);
-    __ movl(scratch, Address(CpuRegister(RSP), FrameOffset(src.Int32Value() + 4)));
-    __ movl(Address(CpuRegister(RSP), FrameOffset(dest.Int32Value() + 4)), scratch);
+    __ movq(scratch, Address(CpuRegister(RSP), src));
+    __ movq(Address(CpuRegister(RSP), dest), scratch);
   } else {
     __ movl(scratch, Address(CpuRegister(RSP), src));
     __ movl(Address(CpuRegister(RSP), dest), scratch);
@@ -622,14 +618,11 @@
       UNREACHABLE();
   }
 
-  // TODO: Compare the memory location with immediate 0.
-  CpuRegister scratch = GetScratchRegister();
-  DCHECK_EQ(Thread::IsGcMarkingSize(), 4u);
-  __ gs()->movl(scratch, Address::Absolute(Thread::IsGcMarkingOffset<kX86_64PointerSize>(), true));
-
-  // TEST reg, reg
+  // CMP self->tls32_.is_gc_marking, 0
   // Jcc <Offset>
-  __ testq(scratch, scratch);
+  DCHECK_EQ(Thread::IsGcMarkingSize(), 4u);
+  __ gs()->cmpl(Address::Absolute(Thread::IsGcMarkingOffset<kX86_64PointerSize>(), true),
+                Immediate(0));
   __ j(x86_64_cond, X86_64JNIMacroLabel::Cast(label)->AsX86_64());
 }
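
The 8-byte Copy now emits one MOVQ load/store pair instead of two MOVL
pairs at offsets +0 and +4. In effect (rsp, src and dest are illustrative
locals):

    #include <cstring>  // std::memcpy
    std::memcpy(rsp + dest, rsp + src, 8);  // movq scratch, [RSP+src]; movq [RSP+dest], scratch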
 
diff --git a/runtime/arch/x86_64/jni_entrypoints_x86_64.S b/runtime/arch/x86_64/jni_entrypoints_x86_64.S
index e1b8e52..5c80589 100644
--- a/runtime/arch/x86_64/jni_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/jni_entrypoints_x86_64.S
@@ -79,9 +79,9 @@
 END_FUNCTION art_jni_dlsym_lookup_stub
 
 DEFINE_FUNCTION art_jni_dlsym_lookup_critical_stub
-    // The hidden arg holding the tagged method (bit 0 set means GenericJNI) is r11.
+    // The hidden arg holding the tagged method (bit 0 set means GenericJNI) is RAX.
     // For Generic JNI we already have a managed frame, so we reuse the art_jni_dlsym_lookup_stub.
-    testq LITERAL(1), %r11
+    testq LITERAL(1), %rax
     jnz art_jni_dlsym_lookup_stub
 
     // We need to create a GenericJNI managed frame above the stack args.
@@ -121,10 +121,10 @@
     subq MACRO_LITERAL(__SIZEOF_POINTER__), %rsp
     CFI_ADJUST_CFA_OFFSET(__SIZEOF_POINTER__)
     // Save hidden arg.
-    PUSH_ARG r11
+    PUSH_ARG rax
 
     // Call artCriticalNativeOutArgsSize(method).
-    movq %r11, %rdi  // Pass the method from hidden arg.
+    movq %rax, %rdi  // Pass the method from hidden arg.
     call SYMBOL(artCriticalNativeOutArgsSize)
 
     // Calculate the address of the end of the move destination and redefine CFI to take
@@ -148,7 +148,7 @@
     movq %rax, (%rdi)
 
     // Pop the hidden arg and alignment padding.
-    popq %r11    // No `.cfi_adjust_cfa_offset`, CFA register is currently R10, not RSP.
+    popq %rax    // No `.cfi_adjust_cfa_offset`, CFA register is currently R10, not RSP.
     addq MACRO_LITERAL(__SIZEOF_POINTER__), %rsp  // ditto
 
     // Fill the SaveRefsAndArgs frame above the args, without actual args. Note that
@@ -175,7 +175,7 @@
     movq %xmm14, 96(%r10)
     movq %xmm15, 104(%r10)
     // Save the hidden arg as method pointer at the bottom of the stack.
-    movq %r11, (%r10)
+    movq %rax, (%r10)
 
     // Move the frame register to a callee-save register.
     movq %r10, %rbp
@@ -255,7 +255,7 @@
     CFI_RESTORE_STATE_AND_DEF_CFA(%rbp, FRAME_SIZE_SAVE_REFS_AND_ARGS)
 
 2:
-    // Drop the args from the stack (the r11 and padding was already removed).
+    // Drop the args from the stack (the saved RAX and padding were already removed).
     addq LITERAL(14 * __SIZEOF_POINTER__), %rsp
 
     DELIVER_PENDING_EXCEPTION_FRAME_READY
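
The critical lookup stub keys off bit 0 of the hidden arg to tell a
GenericJNI call (which already has a managed frame) from the
@CriticalNative fast path; the change here is the mechanical rename of
R11 to RAX. The tag test, modeled in C++ (hidden_arg is an illustrative
local):

    uintptr_t hidden_arg = /* tagged ArtMethod* from RAX */ 0;
    if ((hidden_arg & 1u) != 0) {
      // GenericJNI: reuse art_jni_dlsym_lookup_stub.
    }
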
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index abc3a8a..6a19bbb 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -1676,13 +1676,16 @@
     movq 48(%rsp), %xmm6
     movq 56(%rsp), %xmm7
 
-    // Load hidden arg (r11) for @CriticalNative.
-    movq 64(%rsp), %r11
+    // Save call target in scratch register.
+    movq %rax, %r11
+
+    // Load hidden arg (rax) for @CriticalNative.
+    movq 64(%rsp), %rax
     // Load SP for out args, releasing unneeded reserved area.
     movq 72(%rsp), %rsp
 
     // native call
-    call *%rax
+    call *%r11
 
     // result sign extension is handled in C code
     // prepare for artQuickGenericJniEndTrampoline call
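
artQuickGenericJniTrampoline returns the native call target in RAX, and
RAX must now hold the @CriticalNative hidden arg at the call, so the
target is parked in R11 first; R11 is caller-save and is exactly the
register freed up by making RAX the hidden arg. The register shuffle as
data flow, in C++ (rax, r11 and rsp are illustrative locals):

    uintptr_t r11 = rax;                            // movq %rax, %r11
    rax = *reinterpret_cast<uintptr_t*>(rsp + 64);  // movq 64(%rsp), %rax
    reinterpret_cast<void (*)()>(r11)();            // call *%r11
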
diff --git a/runtime/oat.h b/runtime/oat.h
index 8c81844..c1659e4 100644
--- a/runtime/oat.h
+++ b/runtime/oat.h
@@ -32,8 +32,8 @@
 class PACKED(4) OatHeader {
  public:
   static constexpr std::array<uint8_t, 4> kOatMagic { { 'o', 'a', 't', '\n' } };
-  // Last oat version changed reason: Allow late lookup for @CriticalNative.
-  static constexpr std::array<uint8_t, 4> kOatVersion { { '1', '8', '0', '\0' } };
+  // Last oat version changed reason: Change x86-64 @CriticalNative hidden arg register to RAX.
+  static constexpr std::array<uint8_t, 4> kOatVersion { { '1', '8', '1', '\0' } };
 
   static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline";
   static constexpr const char* kDebuggableKey = "debuggable";