Made art/runtime/arch/stub_test.cc compile with -O2 again.

The test file art/runtime/arch/stub_test.cc wasn't compiling with -O2
because the optimisations interacted with -fstack-protector-strong. The
aarch64 __asm__ block in the Invoke3WithReferrerAndHidden function was
clobbering all possible registers, while clang requires at least one
register to be live across an __asm__ block to perform the
stack-protector check.
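
For illustration, this is roughly what -fstack-protector-strong adds to
a qualifying function. It is a source-level sketch, not real compiler
output; __stack_chk_guard and __stack_chk_fail are the standard runtime
hooks:

  #include <cstdint>

  extern "C" uintptr_t __stack_chk_guard;  // Cookie provided by the C runtime.
  extern "C" void __stack_chk_fail();      // Aborts the process on corruption.

  void Instrumented() {
    uintptr_t canary = __stack_chk_guard;  // Cookie is copied on entry; the
    char buf[64];                          // local array is what makes this
    (void)buf;                             // function qualify under "strong".
    // ... function body, e.g. an inline __asm__ block ...
    if (canary != __stack_chk_guard) {     // The exit re-check is why clang
      __stack_chk_fail();                  // needs a register it can rely on
    }                                      // across the body.
  }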

The fix was to remove a callee-saved register, x20, from the clobber
list of the aarch64 __asm__ block in Invoke3WithReferrerAndHidden. The
block was also modified to save and restore x20, to ensure that it is
not clobbered by the stubs invoked through the blr instruction. Some
comments were also added above the clobber list.
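
For reference, the pattern looks roughly like this (a minimal
hypothetical sketch, not the test code; DemoInvoke, arg0 and code are
illustrative names). Every register the callee might trash is clobbered
except x20, which the block preserves by hand so clang can still count
on it:

  #include <cstdint>

  #if defined(__aarch64__)
  uintptr_t DemoInvoke(uintptr_t arg0, uintptr_t code) {
    uintptr_t result;
    __asm__ __volatile__(
        // Spill the registers we promise not to clobber: x0-x7 (they may
        // hold the operands) and x20. 8 bytes of padding keep sp 16-aligned.
        "sub sp, sp, #80\n\t"
        ".cfi_adjust_cfa_offset 80\n\t"
        "stp x0, x1, [sp]\n\t"
        "stp x2, x3, [sp, #16]\n\t"
        "stp x4, x5, [sp, #32]\n\t"
        "stp x6, x7, [sp, #48]\n\t"
        "str x20, [sp, #64]\n\t"

        // Move the target out of the way before setting up the argument
        // registers, since %[code] and %[arg0] live somewhere in x0-x7/x20.
        "mov x8, %[code]\n\t"
        "mov x0, %[arg0]\n\t"
        "blr x8\n\t"                   // The call that may trash x20.
        "mov x8, x0\n\t"               // Stash the result in a clobbered reg.

        // Restore everything we spilled, x20 included, then publish.
        "ldp x0, x1, [sp]\n\t"
        "ldp x2, x3, [sp, #16]\n\t"
        "ldp x4, x5, [sp, #32]\n\t"
        "ldp x6, x7, [sp, #48]\n\t"
        "ldr x20, [sp, #64]\n\t"
        "add sp, sp, #80\n\t"
        ".cfi_adjust_cfa_offset -80\n\t"
        "mov %[result], x8\n\t"
        : [result] "=r"(result)
        : [arg0] "r"(arg0), [code] "r"(code)
        // x20 is deliberately absent from this list.
        : "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17",
          "x18", "x19", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28",
          "x30",
          "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
          "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15",
          "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
          "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31",
          "memory");
    return result;
  }
  #endif

Because the block itself guarantees x20's value, omitting it from the
clobber list is sound even though the stubs may not follow the AAPCS64.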

Change-Id: I03597fd2d14cf2d6e32edf02835aee2eb68bab17
diff --git a/build/Android.gtest.mk b/build/Android.gtest.mk
index 852dcf1..b3832ac 100644
--- a/build/Android.gtest.mk
+++ b/build/Android.gtest.mk
@@ -578,9 +578,6 @@
     LOCAL_MODULE_PATH_64 := $$(ART_TARGET_NATIVETEST_OUT)/$$(ART_TARGET_ARCH_64)
     LOCAL_MULTILIB := both
     LOCAL_CLANG_CFLAGS += -Wno-used-but-marked-unused -Wno-deprecated -Wno-missing-noreturn  # gtest issue
-    # clang fails to compile art/runtime/arch/stub_test.cc for arm64 without -O1
-    # b/26275713
-    LOCAL_CLANG_CFLAGS_arm64 += -O1
     include $$(BUILD_EXECUTABLE)
     library_path :=
     2nd_library_path :=
diff --git a/runtime/arch/stub_test.cc b/runtime/arch/stub_test.cc
index d4b873e..d5807e2 100644
--- a/runtime/arch/stub_test.cc
+++ b/runtime/arch/stub_test.cc
@@ -175,12 +175,16 @@
 #elif defined(__aarch64__)
     __asm__ __volatile__(
         // Spill x0-x7 which we say we don't clobber. May contain args.
-        "sub sp, sp, #64\n\t"
-        ".cfi_adjust_cfa_offset 64\n\t"
+        "sub sp, sp, #80\n\t"
+        ".cfi_adjust_cfa_offset 80\n\t"
         "stp x0, x1, [sp]\n\t"
         "stp x2, x3, [sp, #16]\n\t"
         "stp x4, x5, [sp, #32]\n\t"
         "stp x6, x7, [sp, #48]\n\t"
+        // To be extra defensive, store x20. We do this because some of the stubs might make a
+        // transition into the runtime via the blr instruction below and *not* save x20.
+        "str x20, [sp, #64]\n\t"
+        // 8 bytes of padding keep the frame a multiple of 16.
 
         "sub sp, sp, #16\n\t"          // Reserve stack space, 16B aligned
         ".cfi_adjust_cfa_offset 16\n\t"
@@ -279,8 +283,9 @@
         "ldp x2, x3, [sp, #16]\n\t"
         "ldp x4, x5, [sp, #32]\n\t"
         "ldp x6, x7, [sp, #48]\n\t"
-        "add sp, sp, #64\n\t"         // Free stack space, now sp as on entry
-        ".cfi_adjust_cfa_offset -64\n\t"
+        "ldr x20, [sp, #64]\n\t"
+        "add sp, sp, #80\n\t"         // Free stack space, now sp as on entry
+        ".cfi_adjust_cfa_offset -80\n\t"
 
         "str x9, %[fpr_result]\n\t"   // Store the FPR comparison result
         "mov %[result], x8\n\t"              // Store the call result
@@ -298,13 +303,17 @@
           // Use the result from r0
         : [arg0] "0"(arg0), [arg1] "r"(arg1), [arg2] "r"(arg2), [code] "r"(code), [self] "r"(self),
           [referrer] "r"(referrer), [hidden] "r"(hidden), [fpr_result] "m" (fpr_result)
-        : "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20",
+          // Leave one register unclobbered, which is needed for compiling with
+          // -fstack-protector-strong. According to AAPCS64 registers x9-x15 are caller-saved,
+          // which means we should unclobber one of the callee-saved registers that are unused.
+          // Here we use x20.
+        : "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19",
           "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x30",
           "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
           "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15",
           "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
           "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31",
-          "memory");  // clobber.
+          "memory");
 #elif defined(__mips__) && !defined(__LP64__)
     __asm__ __volatile__ (
         // Spill a0-a3 and t0-t7 which we say we don't clobber. May contain args.