Retry cache flushes on ARMv7 devices

On ARMv7, CPU cache flushing requires a system call. This system call
can fail and return an error. This change moves to using the system
call directly (cacheflush) so flush failures can be detected and
flushing can be re-attempted. For other platforms we continue using
__builtin___clear_cache which is an intrinsic with a void return
type.

The strategy for ARMv7 is to attempt to flush the entire range
required. If this fails (a rare occurrence), we visit the pages in the
flush range sequentially, first reading a byte from the page to
maximize its chance of being resident and then flushing the cache
lines. We repeat this up to 4 times per page if there are failures.

As a final fallback, when neither approach to flushing the JIT code
cache pages succeeds, the code is not committed to the JIT code cache
as the cache lines for the new code are in an unknown state.

This complexity is necessary for the dual view JIT because the
executable range is not writable so the kernel logic does not
(appear to) anticipate the need to flush (or invalidate) cache lines
there. Previously the failing cache flush operations went undetected,
resulting in bad i-cache state and causing crashes. These issues have
only been reported on devices with 32-bit kernels.

(cherry picked from commit eb0223f14a9dbd9e7caaa177d055684b4819799)

Bug: 132205399
Test: art/test.py --host --jit -j32
Test: Manual (described in bug)
Merged-In: I63b56beaac610ea973def0a57118be9a2647da23
Change-Id: I63b56beaac610ea973def0a57118be9a2647da23
diff --git a/compiler/common_compiler_test.cc b/compiler/common_compiler_test.cc
index a44b9ae..18f00e2 100644
--- a/compiler/common_compiler_test.cc
+++ b/compiler/common_compiler_test.cc
@@ -99,7 +99,7 @@
   int result = mprotect(reinterpret_cast<void*>(base), len, PROT_READ | PROT_WRITE | PROT_EXEC);
   CHECK_EQ(result, 0);
 
-  FlushInstructionCache(reinterpret_cast<void*>(base), reinterpret_cast<void*>(base + len));
+  CHECK(FlushCpuCaches(reinterpret_cast<void*>(base), reinterpret_cast<void*>(base + len)));
 }
 
 void CommonCompilerTest::SetUp() {
diff --git a/libartbase/base/utils.cc b/libartbase/base/utils.cc
index 30423a4..5af80f4 100644
--- a/libartbase/base/utils.cc
+++ b/libartbase/base/utils.cc
@@ -29,6 +29,7 @@
 #include "android-base/stringprintf.h"
 #include "android-base/strings.h"
 
+#include "bit_utils.h"
 #include "os.h"
 
 #if defined(__APPLE__)
@@ -62,6 +63,98 @@
 using android::base::ReadFileToString;
 using android::base::StringPrintf;
 
+#if defined(__arm__)
+
+namespace {
+
+// Bitmap of caches to flush for cacheflush(2). Must be zero for ARM.
+static constexpr int kCacheFlushFlags = 0x0;
+
+// Number of retry attempts when flushing cache ranges.
+static constexpr size_t kMaxFlushAttempts = 4;
+
+int CacheFlush(uintptr_t start, uintptr_t limit) {
+  // The signature of cacheflush(2) seems to vary by source. On ARM the system call wrapper
+  //    (bionic/SYSCALLS.TXT) has the form: int cacheflush(long start, long end, long flags);
+  int r = cacheflush(start, limit, kCacheFlushFlags);
+  if (r == -1) {
+    CHECK_NE(errno, EINVAL);
+  }
+  return r;
+}
+
+bool TouchAndFlushCacheLinesWithinPage(uintptr_t start, uintptr_t limit, size_t attempts) {
+  CHECK_LT(start, limit);
+  CHECK_EQ(RoundDown(start, kPageSize), RoundDown(limit - 1, kPageSize)) << "range spans pages";
+  // Declare a volatile variable so the compiler does not elide reads from the page being touched.
+  volatile uint8_t v = 0;
+  for (size_t i = 0; i < attempts; ++i) {
+    // Touch page to maximize chance page is resident.
+    v = *reinterpret_cast<uint8_t*>(start);
+
+    if (LIKELY(CacheFlush(start, limit) == 0)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+}  // namespace
+
+bool FlushCpuCaches(void* begin, void* end) {
+  // This method is specialized for ARM as the generic implementation below uses the
+  // __builtin___clear_cache() intrinsic which is declared as void. On ARMv7 flushing the CPU
+  // caches is a privileged operation. The Linux kernel allows these operations to fail when they
+  // trigger a fault (e.g. page not resident). We use a wrapper for the ARM specific cacheflush()
+  // system call to detect the failure and potential erroneous state of the data and instruction
+  // caches.
+  //
+  // The Android bug for this is b/132205399 and there's a similar discussion on
+  // https://reviews.llvm.org/D37788. This is primarily an issue for the dual view JIT where the
+  // pages where code is executed are only ever RX and never RWX. When attempting to invalidate
+  // instruction cache lines in the RX mapping after writing fresh code in the RW mapping, the
+  // page may not be resident (due to memory pressure), and this means that a fault is raised in
+  // the midst of a cacheflush() call and the instruction cache lines are not invalidated and so
+  // have stale code.
+  //
+  // Other architectures fare better for reasons such as:
+  //
+  // (1) stronger coherence between the data and instruction caches.
+  //
+  // (2) fault handling that allows flushing/invalidation to continue after
+  //     a missing page has been faulted in.
+
+  // In the common case, this flush of the complete range succeeds.
+  uintptr_t start = reinterpret_cast<uintptr_t>(begin);
+  const uintptr_t limit = reinterpret_cast<uintptr_t>(end);
+  if (LIKELY(CacheFlush(start, limit) == 0)) {
+    return true;
+  }
+
+  // A rare failure has occurred implying that part of the range [begin, end) has been swapped
+  // out. Retry flushing but this time grouping cache-line flushes on individual pages and
+  // touching each page before flushing.
+  uintptr_t next_page = RoundUp(start + 1, kPageSize);
+  while (start < limit) {
+    uintptr_t boundary = std::min(next_page, limit);
+    if (!TouchAndFlushCacheLinesWithinPage(start, boundary, kMaxFlushAttempts)) {
+      return false;
+    }
+    start = boundary;
+    next_page += kPageSize;
+  }
+  return true;
+}
+
+#else
+
+bool FlushCpuCaches(void* begin, void* end) {
+  __builtin___clear_cache(reinterpret_cast<char*>(begin), reinterpret_cast<char*>(end));
+  return true;
+}
+
+#endif
+
 pid_t GetTid() {
 #if defined(__APPLE__)
   uint64_t owner;
diff --git a/libartbase/base/utils.h b/libartbase/base/utils.h
index 9284950..f434cb4 100644
--- a/libartbase/base/utils.h
+++ b/libartbase/base/utils.h
@@ -113,15 +113,8 @@
 // Sleep forever and never come back.
 NO_RETURN void SleepForever();
 
-inline void FlushDataCache(void* begin, void* end) {
-  __builtin___clear_cache(reinterpret_cast<char*>(begin), reinterpret_cast<char*>(end));
-}
-
-inline void FlushInstructionCache(void* begin, void* end) {
-  // Same as FlushInstructionCache for lack of other builtin. __builtin___clear_cache
-  // flushes both caches.
-  __builtin___clear_cache(reinterpret_cast<char*>(begin), reinterpret_cast<char*>(end));
-}
+// Flush CPU caches. Returns true on success, false if flush failed.
+WARN_UNUSED bool FlushCpuCaches(void* begin, void* end);
 
 template <typename T>
 constexpr PointerSize ConvertToPointerSize(T any) {
diff --git a/runtime/jit/jit_code_cache.cc b/runtime/jit/jit_code_cache.cc
index fe2d309..333f2da 100644
--- a/runtime/jit/jit_code_cache.cc
+++ b/runtime/jit/jit_code_cache.cc
@@ -992,6 +992,7 @@
   }
 
   OatQuickMethodHeader* method_header = nullptr;
+  uint8_t* nox_memory = nullptr;
   uint8_t* code_ptr = nullptr;
 
   MutexLock mu(self, lock_);
@@ -1008,7 +1009,7 @@
 
     // AllocateCode allocates memory in non-executable region for alignment header and code. The
     // header size may include alignment padding.
-    uint8_t* nox_memory = AllocateCode(total_size);
+    nox_memory = AllocateCode(total_size);
     if (nox_memory == nullptr) {
       return nullptr;
     }
@@ -1052,14 +1053,25 @@
     // For reference, this behavior is caused by this commit:
     // https://android.googlesource.com/kernel/msm/+/3fbe6bc28a6b9939d0650f2f17eb5216c719950c
     //
+    bool cache_flush_success = true;
     if (HasDualCodeMapping()) {
       // Flush the data cache lines associated with the non-executable copy of the code just added.
-      FlushDataCache(nox_memory, nox_memory + total_size);
+      cache_flush_success = FlushCpuCaches(nox_memory, nox_memory + total_size);
     }
-    // FlushInstructionCache() flushes both data and instruction caches lines. The cacheline range
-    // flushed is for the executable mapping of the code just added.
+
+    // Invalidate i-cache for the executable mapping.
     uint8_t* x_memory = reinterpret_cast<uint8_t*>(method_header);
-    FlushInstructionCache(x_memory, x_memory + total_size);
+    if (cache_flush_success) {
+      cache_flush_success = FlushCpuCaches(x_memory, x_memory + total_size);
+    }
+
+    // If flushing the cache has failed, reject the allocation because we can't guarantee
+    // correctness of the instructions present in the processor caches.
+    if (!cache_flush_success) {
+      PLOG(ERROR) << "Cache flush failed for JIT code, code not committed.";
+      FreeCode(nox_memory);
+      return nullptr;
+    }
 
     // Ensure CPU instruction pipelines are flushed for all cores. This is necessary for
     // correctness as code may still be in instruction pipelines despite the i-cache flush. It is
@@ -1127,7 +1139,13 @@
       FillRootTable(roots_data, roots);
       {
         // Flush data cache, as compiled code references literals in it.
-        FlushDataCache(roots_data, roots_data + data_size);
+        // TODO(oth): establish whether this is necessary.
+        if (!FlushCpuCaches(roots_data, roots_data + data_size)) {
+          PLOG(ERROR) << "Cache flush failed for JIT data, code not committed.";
+          ScopedCodeCacheWrite scc(this);
+          FreeCode(nox_memory);
+          return nullptr;
+        }
       }
       method_code_map_.Put(code_ptr, method);
       if (osr) {