ART: Rewrite stub-test inline assembly

Rewrite the x86 and x86-64 inline assembly to spill and restore
registers manually. GCC outright rejects clobbering some registers,
while Clang silently ignores the problematic clobbers, which breaks,
for example, ASAN builds, where rbx serves as the frame pointer.
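
For reference, a minimal sketch of the pattern outside of ART (x86-64,
GCC/Clang extended asm). Everything here is illustrative and not part
of this change: stub_fn stands in for an ART stub, SPILL/UNSPILL mirror
the PUSH/POP helpers added below, and FP registers are omitted:

    #include <cstdio>

    // Pair each push/pop with a CFA adjustment so unwind info stays
    // correct across the spill, as the PUSH/POP helpers below do.
    #define SPILL(reg) "pushq " # reg "\n\t.cfi_adjust_cfa_offset 8\n\t"
    #define UNSPILL(reg) "popq " # reg "\n\t.cfi_adjust_cfa_offset -8\n\t"

    extern "C" size_t stub_fn() {  // hypothetical stand-in for a stub
      return 42;                   // that may clobber rbx internally
    }

    size_t call_preserving_rbx() {
      size_t result;
      __asm__ __volatile__(
          // Save rbx by hand rather than declaring it clobbered; such
          // clobbers are rejected by GCC in some configurations and
          // silently ignored by Clang, e.g. in ASAN builds. Pushing
          // twice leaves the 16-byte stack alignment unchanged.
          SPILL(%%rbx)
          SPILL(%%rbx)
          "call stub_fn\n\t"
          UNSPILL(%%rbx)
          UNSPILL(%%rbx)
          : "=a"(result)              // stub_fn returns in rax
          :
          : "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11",
            "memory");                // caller-saved GPRs may be listed
      return result;
    }

    int main() {
      printf("%zu\n", call_preserving_rbx());
      return 0;
    }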

Bug: 24294564
Change-Id: Iab45da0d6082ad77435e75bdc41d547619443d47
diff --git a/runtime/arch/stub_test.cc b/runtime/arch/stub_test.cc
index 0d2457e..1d10e5d 100644
--- a/runtime/arch/stub_test.cc
+++ b/runtime/arch/stub_test.cc
@@ -71,351 +71,7 @@
   // TODO: Set up a frame according to referrer's specs.
   size_t Invoke3WithReferrer(size_t arg0, size_t arg1, size_t arg2, uintptr_t code, Thread* self,
                              ArtMethod* referrer) {
-    // Push a transition back into managed code onto the linked list in thread.
-    ManagedStack fragment;
-    self->PushManagedStackFragment(&fragment);
-
-    size_t result;
-    size_t fpr_result = 0;
-#if defined(__i386__)
-    // TODO: Set the thread?
-    __asm__ __volatile__(
-        "subl $12, %%esp\n\t"       // Align stack.
-        "pushl %[referrer]\n\t"     // Store referrer.
-        "call *%%edi\n\t"           // Call the stub
-        "addl $16, %%esp"           // Pop referrer
-        : "=a" (result)
-          // Use the result from eax
-        : "a"(arg0), "c"(arg1), "d"(arg2), "D"(code), [referrer]"r"(referrer)
-          // This places code into edi, arg0 into eax, arg1 into ecx, and arg2 into edx
-        : "memory");  // clobber.
-    // TODO: Should we clobber the other registers? EBX gets clobbered by some of the stubs,
-    //       but compilation fails when declaring that.
-#elif defined(__arm__)
-    __asm__ __volatile__(
-        "push {r1-r12, lr}\n\t"     // Save state, 13*4B = 52B
-        ".cfi_adjust_cfa_offset 52\n\t"
-        "push {r9}\n\t"
-        ".cfi_adjust_cfa_offset 4\n\t"
-        "mov r9, %[referrer]\n\n"
-        "str r9, [sp, #-8]!\n\t"   // Push referrer, +8B padding so 16B aligned
-        ".cfi_adjust_cfa_offset 8\n\t"
-        "ldr r9, [sp, #8]\n\t"
-
-        // Push everything on the stack, so we don't rely on the order. What a mess. :-(
-        "sub sp, sp, #20\n\t"
-        "str %[arg0], [sp]\n\t"
-        "str %[arg1], [sp, #4]\n\t"
-        "str %[arg2], [sp, #8]\n\t"
-        "str %[code], [sp, #12]\n\t"
-        "str %[self], [sp, #16]\n\t"
-        "ldr r0, [sp]\n\t"
-        "ldr r1, [sp, #4]\n\t"
-        "ldr r2, [sp, #8]\n\t"
-        "ldr r3, [sp, #12]\n\t"
-        "ldr r9, [sp, #16]\n\t"
-        "add sp, sp, #20\n\t"
-
-        "blx r3\n\t"                // Call the stub
-        "add sp, sp, #12\n\t"       // Pop null and padding
-        ".cfi_adjust_cfa_offset -12\n\t"
-        "pop {r1-r12, lr}\n\t"      // Restore state
-        ".cfi_adjust_cfa_offset -52\n\t"
-        "mov %[result], r0\n\t"     // Save the result
-        : [result] "=r" (result)
-          // Use the result from r0
-        : [arg0] "r"(arg0), [arg1] "r"(arg1), [arg2] "r"(arg2), [code] "r"(code), [self] "r"(self),
-          [referrer] "r"(referrer)
-        : "r0", "memory");  // clobber.
-#elif defined(__aarch64__)
-    __asm__ __volatile__(
-        // Spill x0-x7 which we say we don't clobber. May contain args.
-        "sub sp, sp, #64\n\t"
-        ".cfi_adjust_cfa_offset 64\n\t"
-        "stp x0, x1, [sp]\n\t"
-        "stp x2, x3, [sp, #16]\n\t"
-        "stp x4, x5, [sp, #32]\n\t"
-        "stp x6, x7, [sp, #48]\n\t"
-
-        "sub sp, sp, #16\n\t"          // Reserve stack space, 16B aligned
-        ".cfi_adjust_cfa_offset 16\n\t"
-        "str %[referrer], [sp]\n\t"    // referrer
-
-        // Push everything on the stack, so we don't rely on the order. What a mess. :-(
-        "sub sp, sp, #48\n\t"
-        ".cfi_adjust_cfa_offset 48\n\t"
-        // All things are "r" constraints, so direct str/stp should work.
-        "stp %[arg0], %[arg1], [sp]\n\t"
-        "stp %[arg2], %[code], [sp, #16]\n\t"
-        "str %[self], [sp, #32]\n\t"
-
-        // Now we definitely have x0-x3 free, use it to garble d8 - d15
-        "movk x0, #0xfad0\n\t"
-        "movk x0, #0xebad, lsl #16\n\t"
-        "movk x0, #0xfad0, lsl #32\n\t"
-        "movk x0, #0xebad, lsl #48\n\t"
-        "fmov d8, x0\n\t"
-        "add x0, x0, 1\n\t"
-        "fmov d9, x0\n\t"
-        "add x0, x0, 1\n\t"
-        "fmov d10, x0\n\t"
-        "add x0, x0, 1\n\t"
-        "fmov d11, x0\n\t"
-        "add x0, x0, 1\n\t"
-        "fmov d12, x0\n\t"
-        "add x0, x0, 1\n\t"
-        "fmov d13, x0\n\t"
-        "add x0, x0, 1\n\t"
-        "fmov d14, x0\n\t"
-        "add x0, x0, 1\n\t"
-        "fmov d15, x0\n\t"
-
-        // Load call params into the right registers.
-        "ldp x0, x1, [sp]\n\t"
-        "ldp x2, x3, [sp, #16]\n\t"
-        "ldr x19, [sp, #32]\n\t"
-        "add sp, sp, #48\n\t"
-        ".cfi_adjust_cfa_offset -48\n\t"
-
-
-        "blr x3\n\t"              // Call the stub
-        "mov x8, x0\n\t"          // Store result
-        "add sp, sp, #16\n\t"     // Drop the quick "frame"
-        ".cfi_adjust_cfa_offset -16\n\t"
-
-        // Test d8 - d15. We can use x1 and x2.
-        "movk x1, #0xfad0\n\t"
-        "movk x1, #0xebad, lsl #16\n\t"
-        "movk x1, #0xfad0, lsl #32\n\t"
-        "movk x1, #0xebad, lsl #48\n\t"
-        "fmov x2, d8\n\t"
-        "cmp x1, x2\n\t"
-        "b.ne 1f\n\t"
-        "add x1, x1, 1\n\t"
-
-        "fmov x2, d9\n\t"
-        "cmp x1, x2\n\t"
-        "b.ne 1f\n\t"
-        "add x1, x1, 1\n\t"
-
-        "fmov x2, d10\n\t"
-        "cmp x1, x2\n\t"
-        "b.ne 1f\n\t"
-        "add x1, x1, 1\n\t"
-
-        "fmov x2, d11\n\t"
-        "cmp x1, x2\n\t"
-        "b.ne 1f\n\t"
-        "add x1, x1, 1\n\t"
-
-        "fmov x2, d12\n\t"
-        "cmp x1, x2\n\t"
-        "b.ne 1f\n\t"
-        "add x1, x1, 1\n\t"
-
-        "fmov x2, d13\n\t"
-        "cmp x1, x2\n\t"
-        "b.ne 1f\n\t"
-        "add x1, x1, 1\n\t"
-
-        "fmov x2, d14\n\t"
-        "cmp x1, x2\n\t"
-        "b.ne 1f\n\t"
-        "add x1, x1, 1\n\t"
-
-        "fmov x2, d15\n\t"
-        "cmp x1, x2\n\t"
-        "b.ne 1f\n\t"
-
-        "mov x9, #0\n\t"              // Use x9 as flag, in clobber list
-
-        // Finish up.
-        "2:\n\t"
-        "ldp x0, x1, [sp]\n\t"        // Restore stuff not named clobbered, may contain fpr_result
-        "ldp x2, x3, [sp, #16]\n\t"
-        "ldp x4, x5, [sp, #32]\n\t"
-        "ldp x6, x7, [sp, #48]\n\t"
-        "add sp, sp, #64\n\t"         // Free stack space, now sp as on entry
-        ".cfi_adjust_cfa_offset -64\n\t"
-
-        "str x9, %[fpr_result]\n\t"   // Store the FPR comparison result
-        "mov %[result], x8\n\t"              // Store the call result
-
-        "b 3f\n\t"                     // Goto end
-
-        // Failed fpr verification.
-        "1:\n\t"
-        "mov x9, #1\n\t"
-        "b 2b\n\t"                     // Goto finish-up
-
-        // End
-        "3:\n\t"
-        : [result] "=r" (result)
-          // Use the result from r0
-        : [arg0] "0"(arg0), [arg1] "r"(arg1), [arg2] "r"(arg2), [code] "r"(code), [self] "r"(self),
-          [referrer] "r"(referrer), [fpr_result] "m" (fpr_result)
-        : "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20",
-          "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x30",
-          "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
-          "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15",
-          "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
-          "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31",
-          "memory");  // clobber.
-#elif defined(__mips__) && !defined(__LP64__)
-    __asm__ __volatile__ (
-        // Spill a0-a3 and t0-t7 which we say we don't clobber. May contain args.
-        "addiu $sp, $sp, -64\n\t"
-        "sw $a0, 0($sp)\n\t"
-        "sw $a1, 4($sp)\n\t"
-        "sw $a2, 8($sp)\n\t"
-        "sw $a3, 12($sp)\n\t"
-        "sw $t0, 16($sp)\n\t"
-        "sw $t1, 20($sp)\n\t"
-        "sw $t2, 24($sp)\n\t"
-        "sw $t3, 28($sp)\n\t"
-        "sw $t4, 32($sp)\n\t"
-        "sw $t5, 36($sp)\n\t"
-        "sw $t6, 40($sp)\n\t"
-        "sw $t7, 44($sp)\n\t"
-        // Spill gp register since it is caller save.
-        "sw $gp, 52($sp)\n\t"
-
-        "addiu $sp, $sp, -16\n\t"  // Reserve stack space, 16B aligned.
-        "sw %[referrer], 0($sp)\n\t"
-
-        // Push everything on the stack, so we don't rely on the order.
-        "addiu $sp, $sp, -20\n\t"
-        "sw %[arg0], 0($sp)\n\t"
-        "sw %[arg1], 4($sp)\n\t"
-        "sw %[arg2], 8($sp)\n\t"
-        "sw %[code], 12($sp)\n\t"
-        "sw %[self], 16($sp)\n\t"
-
-        // Load call params into the right registers.
-        "lw $a0, 0($sp)\n\t"
-        "lw $a1, 4($sp)\n\t"
-        "lw $a2, 8($sp)\n\t"
-        "lw $t9, 12($sp)\n\t"
-        "lw $s1, 16($sp)\n\t"
-        "addiu $sp, $sp, 20\n\t"
-
-        "jalr $t9\n\t"             // Call the stub.
-        "nop\n\t"
-        "addiu $sp, $sp, 16\n\t"   // Drop the quick "frame".
-
-        // Restore stuff not named clobbered.
-        "lw $a0, 0($sp)\n\t"
-        "lw $a1, 4($sp)\n\t"
-        "lw $a2, 8($sp)\n\t"
-        "lw $a3, 12($sp)\n\t"
-        "lw $t0, 16($sp)\n\t"
-        "lw $t1, 20($sp)\n\t"
-        "lw $t2, 24($sp)\n\t"
-        "lw $t3, 28($sp)\n\t"
-        "lw $t4, 32($sp)\n\t"
-        "lw $t5, 36($sp)\n\t"
-        "lw $t6, 40($sp)\n\t"
-        "lw $t7, 44($sp)\n\t"
-        // Restore gp.
-        "lw $gp, 52($sp)\n\t"
-        "addiu $sp, $sp, 64\n\t"   // Free stack space, now sp as on entry.
-
-        "move %[result], $v0\n\t"  // Store the call result.
-        : [result] "=r" (result)
-        : [arg0] "r"(arg0), [arg1] "r"(arg1), [arg2] "r"(arg2), [code] "r"(code), [self] "r"(self),
-          [referrer] "r"(referrer)
-        : "at", "v0", "v1", "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7", "t8", "t9", "k0", "k1",
-          "fp", "ra",
-          "$f0", "$f1", "$f2", "$f3", "$f4", "$f5", "$f6", "$f7", "$f8", "$f9", "$f10", "$f11",
-          "$f12", "$f13", "$f14", "$f15", "$f16", "$f17", "$f18", "$f19", "$f20", "$f21", "$f22",
-          "$f23", "$f24", "$f25", "$f26", "$f27", "$f28", "$f29", "$f30", "$f31",
-          "memory");  // clobber.
-#elif defined(__mips__) && defined(__LP64__)
-    __asm__ __volatile__ (
-        // Spill a0-a7 which we say we don't clobber. May contain args.
-        "daddiu $sp, $sp, -64\n\t"
-        "sd $a0, 0($sp)\n\t"
-        "sd $a1, 8($sp)\n\t"
-        "sd $a2, 16($sp)\n\t"
-        "sd $a3, 24($sp)\n\t"
-        "sd $a4, 32($sp)\n\t"
-        "sd $a5, 40($sp)\n\t"
-        "sd $a6, 48($sp)\n\t"
-        "sd $a7, 56($sp)\n\t"
-
-        "daddiu $sp, $sp, -16\n\t"  // Reserve stack space, 16B aligned.
-        "sd %[referrer], 0($sp)\n\t"
-
-        // Push everything on the stack, so we don't rely on the order.
-        "daddiu $sp, $sp, -40\n\t"
-        "sd %[arg0], 0($sp)\n\t"
-        "sd %[arg1], 8($sp)\n\t"
-        "sd %[arg2], 16($sp)\n\t"
-        "sd %[code], 24($sp)\n\t"
-        "sd %[self], 32($sp)\n\t"
-
-        // Load call params into the right registers.
-        "ld $a0, 0($sp)\n\t"
-        "ld $a1, 8($sp)\n\t"
-        "ld $a2, 16($sp)\n\t"
-        "ld $t9, 24($sp)\n\t"
-        "ld $s1, 32($sp)\n\t"
-        "daddiu $sp, $sp, 40\n\t"
-
-        "jalr $t9\n\t"              // Call the stub.
-        "nop\n\t"
-        "daddiu $sp, $sp, 16\n\t"   // Drop the quick "frame".
-
-        // Restore stuff not named clobbered.
-        "ld $a0, 0($sp)\n\t"
-        "ld $a1, 8($sp)\n\t"
-        "ld $a2, 16($sp)\n\t"
-        "ld $a3, 24($sp)\n\t"
-        "ld $a4, 32($sp)\n\t"
-        "ld $a5, 40($sp)\n\t"
-        "ld $a6, 48($sp)\n\t"
-        "ld $a7, 56($sp)\n\t"
-        "daddiu $sp, $sp, 64\n\t"
-
-        "move %[result], $v0\n\t"   // Store the call result.
-        : [result] "=r" (result)
-        : [arg0] "r"(arg0), [arg1] "r"(arg1), [arg2] "r"(arg2), [code] "r"(code), [self] "r"(self),
-          [referrer] "r"(referrer)
-        : "at", "v0", "v1", "t0", "t1", "t2", "t3", "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7",
-          "t8", "t9", "k0", "k1", "fp", "ra",
-          "f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9", "f10", "f11", "f12", "f13",
-          "f14", "f15", "f16", "f17", "f18", "f19", "f20", "f21", "f22", "f23", "f24", "f25", "f26",
-          "f27", "f28", "f29", "f30", "f31",
-          "memory");  // clobber.
-#elif defined(__x86_64__) && !defined(__APPLE__) && defined(__clang__)
-    // Note: Uses the native convention
-    // TODO: Set the thread?
-    __asm__ __volatile__(
-        "pushq %[referrer]\n\t"        // Push referrer
-        "pushq (%%rsp)\n\t"             // & 16B alignment padding
-        ".cfi_adjust_cfa_offset 16\n\t"
-        "call *%%rax\n\t"              // Call the stub
-        "addq $16, %%rsp\n\t"          // Pop null and padding
-        ".cfi_adjust_cfa_offset -16\n\t"
-        : "=a" (result)
-          // Use the result from rax
-        : "D"(arg0), "S"(arg1), "d"(arg2), "a"(code), [referrer] "c"(referrer)
-          // This places arg0 into rdi, arg1 into rsi, arg2 into rdx, and code into rax
-        : "rbx", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-          "memory");  // clobber all
-    // TODO: Should we clobber the other registers?
-#else
-    UNUSED(arg0, arg1, arg2, code, referrer);
-    LOG(WARNING) << "Was asked to invoke for an architecture I do not understand.";
-    result = 0;
-#endif
-    // Pop transition.
-    self->PopManagedStackFragment(fragment);
-
-    fp_result = fpr_result;
-    EXPECT_EQ(0U, fp_result);
-
-    return result;
+    return Invoke3WithReferrerAndHidden(arg0, arg1, arg2, code, self, referrer, 0);
   }
 
   // TODO: Set up a frame according to referrer's specs.
@@ -429,19 +85,55 @@
     size_t fpr_result = 0;
 #if defined(__i386__)
     // TODO: Set the thread?
+#define PUSH(reg) "push " # reg "\n\t .cfi_adjust_cfa_offset 4\n\t"
+#define POP(reg) "pop " # reg "\n\t .cfi_adjust_cfa_offset -4\n\t"
     __asm__ __volatile__(
-        "movd %[hidden], %%xmm7\n\t"
-        "subl $12, %%esp\n\t"       // Align stack.
-        "pushl %[referrer]\n\t"     // Store referrer
+        "movd %[hidden], %%xmm7\n\t"  // This is a memory op, so do this early. If it is off of
+                                      // esp, then we won't be able to access it after spilling.
+
+        // Spill 6 registers.
+        PUSH(%%ebx)
+        PUSH(%%ecx)
+        PUSH(%%edx)
+        PUSH(%%esi)
+        PUSH(%%edi)
+        PUSH(%%ebp)
+
+        // Store the inputs on the stack, keeping the referrer up top; that is less work later.
+        PUSH(%[referrer])           // Align stack.
+        PUSH(%[referrer])           // Store referrer
+
+        PUSH(%[arg0])
+        PUSH(%[arg1])
+        PUSH(%[arg2])
+        PUSH(%[code])
+        // Now read them back into the required registers.
+        POP(%%edi)                  // code -> edi
+        POP(%%edx)                  // arg2 -> edx
+        POP(%%ecx)                  // arg1 -> ecx
+        POP(%%eax)                  // arg0 -> eax
+        // Call is prepared now.
+
         "call *%%edi\n\t"           // Call the stub
-        "addl $16, %%esp"           // Pop referrer
+        "addl $8, %%esp\n\t"        // Pop referrer and padding.
+        ".cfi_adjust_cfa_offset -8\n\t"
+
+        // Restore 6 registers.
+        POP(%%ebp)
+        POP(%%edi)
+        POP(%%esi)
+        POP(%%edx)
+        POP(%%ecx)
+        POP(%%ebx)
+
         : "=a" (result)
           // Use the result from eax
-        : "a"(arg0), "c"(arg1), "d"(arg2), "D"(code), [referrer]"r"(referrer), [hidden]"m"(hidden)
+        : [arg0] "r"(arg0), [arg1] "r"(arg1), [arg2] "r"(arg2), [code] "r"(code),
+          [referrer]"r"(referrer), [hidden]"m"(hidden)
           // This places code into edi, arg0 into eax, arg1 into ecx, and arg2 into edx
-        : "memory");  // clobber.
-    // TODO: Should we clobber the other registers? EBX gets clobbered by some of the stubs,
-    //       but compilation fails when declaring that.
+        : "memory", "xmm7");  // clobber.
+#undef PUSH
+#undef POP
 #elif defined(__arm__)
     __asm__ __volatile__(
         "push {r1-r12, lr}\n\t"     // Save state, 13*4B = 52B
@@ -743,23 +435,72 @@
           "f14", "f15", "f16", "f17", "f18", "f19", "f20", "f21", "f22", "f23", "f24", "f25", "f26",
           "f27", "f28", "f29", "f30", "f31",
           "memory");  // clobber.
-#elif defined(__x86_64__) && !defined(__APPLE__) && defined(__clang__)
-    // Note: Uses the native convention
+#elif defined(__x86_64__) && !defined(__APPLE__)
+#define PUSH(reg) "pushq " # reg "\n\t .cfi_adjust_cfa_offset 8\n\t"
+#define POP(reg) "popq " # reg "\n\t .cfi_adjust_cfa_offset -8\n\t"
+    // Note: Uses the native convention. We emulate a callee-save regime by manually spilling
+    //       and restoring almost all registers.
     // TODO: Set the thread?
     __asm__ __volatile__(
-        "pushq %[referrer]\n\t"        // Push referrer
-        "pushq (%%rsp)\n\t"            // & 16B alignment padding
-        ".cfi_adjust_cfa_offset 16\n\t"
-        "call *%%rbx\n\t"              // Call the stub
-        "addq $16, %%rsp\n\t"          // Pop null and padding
+        // Spill almost everything (except rax, rsp). 14 registers.
+        PUSH(%%rbx)
+        PUSH(%%rcx)
+        PUSH(%%rdx)
+        PUSH(%%rsi)
+        PUSH(%%rdi)
+        PUSH(%%rbp)
+        PUSH(%%r8)
+        PUSH(%%r9)
+        PUSH(%%r10)
+        PUSH(%%r11)
+        PUSH(%%r12)
+        PUSH(%%r13)
+        PUSH(%%r14)
+        PUSH(%%r15)
+
+        PUSH(%[referrer])              // Push referrer & 16B alignment padding
+        PUSH(%[referrer])
+
+        // Now juggle the input registers.
+        PUSH(%[arg0])
+        PUSH(%[arg1])
+        PUSH(%[arg2])
+        PUSH(%[hidden])
+        PUSH(%[code])
+        POP(%%r8)                      // code   -> r8
+        POP(%%rax)                     // hidden -> rax
+        POP(%%rdx)                     // arg2   -> rdx
+        POP(%%rsi)                     // arg1   -> rsi
+        POP(%%rdi)                     // arg0   -> rdi
+
+        "call *%%r8\n\t"                  // Call the stub
+        "addq $16, %%rsp\n\t"             // Pop null and padding
         ".cfi_adjust_cfa_offset -16\n\t"
+
+        POP(%%r15)
+        POP(%%r14)
+        POP(%%r13)
+        POP(%%r12)
+        POP(%%r11)
+        POP(%%r10)
+        POP(%%r9)
+        POP(%%r8)
+        POP(%%rbp)
+        POP(%%rdi)
+        POP(%%rsi)
+        POP(%%rdx)
+        POP(%%rcx)
+        POP(%%rbx)
+
         : "=a" (result)
         // Use the result from rax
-        : "D"(arg0), "S"(arg1), "d"(arg2), "b"(code), [referrer] "c"(referrer), [hidden] "a"(hidden)
-        // This places arg0 into rdi, arg1 into rsi, arg2 into rdx, and code into rax
-        : "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-          "memory");  // clobber all
-    // TODO: Should we clobber the other registers?
+        : [arg0] "r"(arg0), [arg1] "r"(arg1), [arg2] "r"(arg2), [code] "r"(code),
+          [referrer] "r"(referrer), [hidden] "r"(hidden)
+        // The stack juggling above places arg0 into rdi, arg1 into rsi, arg2 into rdx, hidden
+        // into rax, and code into r8. We can't use "b" (rbx) for code, as ASAN uses rbx for
+        // the frame pointer.
+        : "memory");  // We spill and restore (almost) all registers, so only mention memory here.
+#undef PUSH
+#undef POP
 #else
     UNUSED(arg0, arg1, arg2, code, referrer, hidden);
     LOG(WARNING) << "Was asked to invoke for an architecture I do not understand.";