ART: Fix stub_test inline assembly for ARM64

The rebased Clang resolves register constraints differently from the
previous toolchain, so the hand-written ARM64 inline assembly can no
longer rely on which registers its operands are assigned to. Rewrite
it: spill x0-x7 instead of d8-d15, move the call result through x8
and the FPR-check flag through x9, and use a much more conservative
clobber list.
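
For reference, a minimal sketch of the constraint pattern the test
relies on (a hypothetical helper, not part of stub_test.cc): the
output uses "=r", one input is tied to it with "0", the remaining
inputs are plain "r", and every register the asm body may touch is
named in the clobber list.

  #include <cstdint>

  // Hypothetical example only; returns a + b through inline asm.
  uint64_t AddThroughAsm(uint64_t a, uint64_t b) {
  #if defined(__aarch64__)
    uint64_t result;
    __asm__ __volatile__(
        "add %[res], %[res], %[addend]\n\t"
        : [res] "=r"(result)        // Output register picked by the compiler.
        : "0"(a), [addend] "r"(b)   // "0" ties a to the same register as res.
        : "x9");                    // x9 is unused here; listed only to show a clobber.
    return result;
  #else
    return a + b;                   // Fallback so the sketch also builds off-target.
  #endif
  }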

Bug: 15393246
Change-Id: Iea2c2693f0a035db541ff91f796ee5f997a2ced7
diff --git a/runtime/arch/stub_test.cc b/runtime/arch/stub_test.cc
index 44edd4b..0b7f268c 100644
--- a/runtime/arch/stub_test.cc
+++ b/runtime/arch/stub_test.cc
@@ -125,27 +125,24 @@
         : );  // clobber.
 #elif defined(__aarch64__)
     __asm__ __volatile__(
-        // Spill space for d8 - d15
+        // Spill x0-x7, which we do not list as clobbered. They may contain args.
         "sub sp, sp, #64\n\t"
         ".cfi_adjust_cfa_offset 64\n\t"
-        "stp d8, d9,   [sp]\n\t"
-        "stp d10, d11, [sp, #16]\n\t"
-        "stp d12, d13, [sp, #32]\n\t"
-        "stp d14, d15, [sp, #48]\n\t"
+        "stp x0, x1, [sp]\n\t"
+        "stp x2, x3, [sp, #16]\n\t"
+        "stp x4, x5, [sp, #32]\n\t"
+        "stp x6, x7, [sp, #48]\n\t"
 
-        "sub sp, sp, #48\n\t"          // Reserve stack space, 16B aligned
-        ".cfi_adjust_cfa_offset 48\n\t"
-        "stp %[referrer], x1, [sp]\n\t"// referrer, x1
-        "stp x2, x3,   [sp, #16]\n\t"   // Save x2, x3
-        "stp x18, x30, [sp, #32]\n\t"   // Save x18(xSELF), xLR
+        "sub sp, sp, #16\n\t"          // Reserve stack space, 16B aligned
+        ".cfi_adjust_cfa_offset 16\n\t"
+        "str %[referrer], [sp]\n\t"    // referrer
 
         // Push everything on the stack, so we don't rely on the order. What a mess. :-(
         "sub sp, sp, #48\n\t"
         ".cfi_adjust_cfa_offset 48\n\t"
-        "str %[arg0], [sp]\n\t"
-        "str %[arg1], [sp, #8]\n\t"
-        "str %[arg2], [sp, #16]\n\t"
-        "str %[code], [sp, #24]\n\t"
+        // All operands are "r" constraints, so direct str/stp should work.
+        "stp %[arg0], %[arg1], [sp]\n\t"
+        "stp %[arg2], %[code], [sp, #16]\n\t"
         "str %[self], [sp, #32]\n\t"
 
         // Now we definitely have x0-x3 free, use it to garble d8 - d15
@@ -169,17 +166,18 @@
         "add x0, x0, 1\n\t"
         "fmov d15, x0\n\t"
 
-        // Load call params
-        "ldr x0, [sp]\n\t"
-        "ldr x1, [sp, #8]\n\t"
-        "ldr x2, [sp, #16]\n\t"
-        "ldr x3, [sp, #24]\n\t"
+        // Load call params into the right registers.
+        "ldp x0, x1, [sp]\n\t"
+        "ldp x2, x3, [sp, #16]\n\t"
         "ldr x18, [sp, #32]\n\t"
         "add sp, sp, #48\n\t"
         ".cfi_adjust_cfa_offset -48\n\t"
 
 
         "blr x3\n\t"              // Call the stub
+        "mov x8, x0\n\t"          // Save the result; x0 is restored below.
+        "add sp, sp, #16\n\t"     // Drop the quick "frame"
+        ".cfi_adjust_cfa_offset -16\n\t"
 
         // Test d8 - d15. We can use x1 and x2.
         "movk x1, #0xfad0\n\t"
@@ -225,31 +223,25 @@
         "cmp x1, x2\n\t"
         "b.ne 1f\n\t"
 
-        "mov x2, #0\n\t"
-        "str x2, %[fpr_result]\n\t"
+        "mov x9, #0\n\t"              // Use x9 as the flag; it is in the clobber list.
 
         // Finish up.
         "2:\n\t"
-        "ldp x1, x2, [sp, #8]\n\t"     // Restore x1, x2
-        "ldp x3, x18, [sp, #24]\n\t"   // Restore x3, xSELF
-        "ldr x30, [sp, #40]\n\t"       // Restore xLR
-        "add sp, sp, #48\n\t"          // Free stack space
-        ".cfi_adjust_cfa_offset -48\n\t"
-        "mov %[result], x0\n\t"        // Save the result
-
-        "ldp d8, d9,   [sp]\n\t"       // Restore d8 - d15
-        "ldp d10, d11, [sp, #16]\n\t"
-        "ldp d12, d13, [sp, #32]\n\t"
-        "ldp d14, d15, [sp, #48]\n\t"
-        "add sp, sp, #64\n\t"
+        "ldp x0, x1, [sp]\n\t"        // Restore x0-x7 (not clobbered); fpr_result may be addressed via one of them.
+        "ldp x2, x3, [sp, #16]\n\t"
+        "ldp x4, x5, [sp, #32]\n\t"
+        "ldp x6, x7, [sp, #48]\n\t"
+        "add sp, sp, #64\n\t"         // Free stack space, now sp as on entry
         ".cfi_adjust_cfa_offset -64\n\t"
 
+        "str x9, %[fpr_result]\n\t"   // Store the FPR comparison result
+        "mov %[result], x8\n\t"       // Store the call result
+
         "b 3f\n\t"                     // Goto end
 
         // Failed fpr verification.
         "1:\n\t"
-        "mov x2, #1\n\t"
-        "str x2, %[fpr_result]\n\t"
+        "mov x9, #1\n\t"
         "b 2b\n\t"                     // Goto finish-up
 
         // End
@@ -258,7 +250,12 @@
           // Use the result from r0
         : [arg0] "0"(arg0), [arg1] "r"(arg1), [arg2] "r"(arg2), [code] "r"(code), [self] "r"(self),
           [referrer] "r"(referrer), [fpr_result] "m" (fpr_result)
-        : "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17");  // clobber.
+        : "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20",
+          "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x30",
+          "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
+          "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15",
+          "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
+          "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31");  // clobber.
 #elif defined(__x86_64__)
     // Note: Uses the native convention
     // TODO: Set the thread?
@@ -351,29 +348,25 @@
             : );  // clobber.
 #elif defined(__aarch64__)
     __asm__ __volatile__(
-        // Spill space for d8 - d15
+        // Spill x0-x7, which we do not list as clobbered. They may contain args.
         "sub sp, sp, #64\n\t"
         ".cfi_adjust_cfa_offset 64\n\t"
-        "stp d8, d9,   [sp]\n\t"
-        "stp d10, d11, [sp, #16]\n\t"
-        "stp d12, d13, [sp, #32]\n\t"
-        "stp d14, d15, [sp, #48]\n\t"
+        "stp x0, x1, [sp]\n\t"
+        "stp x2, x3, [sp, #16]\n\t"
+        "stp x4, x5, [sp, #32]\n\t"
+        "stp x6, x7, [sp, #48]\n\t"
 
-        "sub sp, sp, #48\n\t"          // Reserve stack space, 16B aligned
-        ".cfi_adjust_cfa_offset 48\n\t"
-        "stp %[referrer], x1, [sp]\n\t"// referrer, x1
-        "stp x2, x3,   [sp, #16]\n\t"   // Save x2, x3
-        "stp x18, x30, [sp, #32]\n\t"   // Save x18(xSELF), xLR
+        "sub sp, sp, #16\n\t"          // Reserve stack space, 16B aligned
+        ".cfi_adjust_cfa_offset 16\n\t"
+        "str %[referrer], [sp]\n\t"    // referrer
 
         // Push everything on the stack, so we don't rely on the order. What a mess. :-(
         "sub sp, sp, #48\n\t"
         ".cfi_adjust_cfa_offset 48\n\t"
-        "str %[arg0], [sp]\n\t"
-        "str %[arg1], [sp, #8]\n\t"
-        "str %[arg2], [sp, #16]\n\t"
-        "str %[code], [sp, #24]\n\t"
-        "str %[self], [sp, #32]\n\t"
-        "str %[hidden], [sp, #40]\n\t"
+        // All operands are "r" constraints, so direct str/stp should work.
+        "stp %[arg0], %[arg1], [sp]\n\t"
+        "stp %[arg2], %[code], [sp, #16]\n\t"
+        "stp %[self], %[hidden], [sp, #32]\n\t"
 
         // Now we definitely have x0-x3 free, use it to garble d8 - d15
         "movk x0, #0xfad0\n\t"
@@ -396,18 +389,17 @@
         "add x0, x0, 1\n\t"
         "fmov d15, x0\n\t"
 
-        // Load call params
-        "ldr x0, [sp]\n\t"
-        "ldr x1, [sp, #8]\n\t"
-        "ldr x2, [sp, #16]\n\t"
-        "ldr x3, [sp, #24]\n\t"
-        "ldr x18, [sp, #32]\n\t"
-        "ldr x12, [sp, #40]\n\t"
+        // Load call params into the right registers.
+        "ldp x0, x1, [sp]\n\t"
+        "ldp x2, x3, [sp, #16]\n\t"
+        "ldp x18, x12, [sp, #32]\n\t"
         "add sp, sp, #48\n\t"
         ".cfi_adjust_cfa_offset -48\n\t"
 
-
         "blr x3\n\t"              // Call the stub
+        "mov x8, x0\n\t"          // Save the result; x0 is restored below.
+        "add sp, sp, #16\n\t"     // Drop the quick "frame"
+        ".cfi_adjust_cfa_offset -16\n\t"
 
         // Test d8 - d15. We can use x1 and x2.
         "movk x1, #0xfad0\n\t"
@@ -453,38 +445,39 @@
         "cmp x1, x2\n\t"
         "b.ne 1f\n\t"
 
-        "mov %[fpr_result], #0\n\t"
+        "mov x9, #0\n\t"              // Use x9 as the flag; it is in the clobber list.
 
         // Finish up.
         "2:\n\t"
-        "ldp x1, x2, [sp, #8]\n\t"     // Restore x1, x2
-        "ldp x3, x18, [sp, #24]\n\t"   // Restore x3, xSELF
-        "ldr x30, [sp, #40]\n\t"       // Restore xLR
-        "add sp, sp, #48\n\t"          // Free stack space
-        ".cfi_adjust_cfa_offset -48\n\t"
-        "mov %[result], x0\n\t"        // Save the result
-
-        "ldp d8, d9,   [sp]\n\t"       // Restore d8 - d15
-        "ldp d10, d11, [sp, #16]\n\t"
-        "ldp d12, d13, [sp, #32]\n\t"
-        "ldp d14, d15, [sp, #48]\n\t"
-        "add sp, sp, #64\n\t"
+        "ldp x0, x1, [sp]\n\t"        // Restore x0-x7 (not clobbered); fpr_result may be addressed via one of them.
+        "ldp x2, x3, [sp, #16]\n\t"
+        "ldp x4, x5, [sp, #32]\n\t"
+        "ldp x6, x7, [sp, #48]\n\t"
+        "add sp, sp, #64\n\t"         // Free stack space, now sp as on entry
         ".cfi_adjust_cfa_offset -64\n\t"
 
+        "str x9, %[fpr_result]\n\t"   // Store the FPR comparison result
+        "mov %[result], x8\n\t"       // Store the call result
+
         "b 3f\n\t"                     // Goto end
 
         // Failed fpr verification.
         "1:\n\t"
-        "mov %[fpr_result], #1\n\t"
+        "mov x9, #1\n\t"
         "b 2b\n\t"                     // Goto finish-up
 
         // End
         "3:\n\t"
-        : [result] "=r" (result), [fpr_result] "=r" (fpr_result)
-        // Use the result from r0
+        : [result] "=r" (result)
+          // Use the result from r0
         : [arg0] "0"(arg0), [arg1] "r"(arg1), [arg2] "r"(arg2), [code] "r"(code), [self] "r"(self),
-          [referrer] "r"(referrer), [hidden] "r"(hidden)
-        : "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17");  // clobber.
+          [referrer] "r"(referrer), [hidden] "r"(hidden), [fpr_result] "m" (fpr_result)
+        : "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20",
+          "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x30",
+          "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
+          "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15",
+          "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
+          "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31");  // clobber.
 #elif defined(__x86_64__)
     // Note: Uses the native convention
     // TODO: Set the thread?