ART: Add FlushInstructionPipeline()

Use membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED), where available, to
flush CPU instruction pipelines after JIT code cache updates. This is
needed on architectures where TLB updates do not require a TLB
shootdown.

Bug: 65312375
Bug: 66095511
Bug: 111199492
Test: manual (requires kernel >= 4.16).
Change-Id: I96811c611133ba765a546a09432c0c951ad39e10
diff --git a/libartbase/base/utils.cc b/libartbase/base/utils.cc
index 761c611..74cc5b9 100644
--- a/libartbase/base/utils.cc
+++ b/libartbase/base/utils.cc
@@ -38,6 +38,12 @@
 #include "AvailabilityMacros.h"  // For MAC_OS_X_VERSION_MAX_ALLOWED
 #endif
 
+#if defined(__BIONIC__)
+// membarrier(2) is only supported for target builds (b/111199492).
+#include <linux/membarrier.h>
+#include <sys/syscall.h>
+#endif
+
 #if defined(__linux__)
 #include <linux/unistd.h>
 #endif
@@ -207,4 +213,42 @@
   }
 }
 
+bool FlushInstructionPipeline() {
+  // membarrier(2) is only supported for target builds (b/111199492).
+#if defined(__BIONIC__)
+  static constexpr int kSyncCoreMask =
+      MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE |
+      MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE;
+  static bool have_probed = false;
+  static bool have_sync_core = false;
+
+  if (UNLIKELY(!have_probed)) {
+    // Probe membarrier(2) commands supported by kernel.
+    int commands = syscall(__NR_membarrier, MEMBARRIER_CMD_QUERY, 0);
+    if (commands >= 0) {
+      have_sync_core = (commands & kSyncCoreMask) == kSyncCoreMask;
+      if (have_sync_core) {
+        // Register with kernel that we'll be using the private expedited sync core command.
+        CheckedCall(syscall,
+                    "membarrier register sync core",
+                    __NR_membarrier,
+                    MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE,
+                    0);
+      }
+    }
+    have_probed = true;
+  }
+
+  if (have_sync_core) {
+    CheckedCall(syscall,
+                "membarrier sync core",
+                __NR_membarrier,
+                MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE,
+                0);
+    return true;
+  }
+#endif  // defined(__BIONIC__)
+  return false;
+}
+
 }  // namespace art
diff --git a/libartbase/base/utils.h b/libartbase/base/utils.h
index ba61e1b..4449941 100644
--- a/libartbase/base/utils.h
+++ b/libartbase/base/utils.h
@@ -179,16 +179,19 @@
 // Sleep forever and never come back.
 NO_RETURN void SleepForever();
 
-inline void FlushInstructionCache(char* begin, char* end) {
-  __builtin___clear_cache(begin, end);
-}
-
 inline void FlushDataCache(char* begin, char* end) {
   // Same as FlushInstructionCache for lack of other builtin. __builtin___clear_cache
   // flushes both caches.
   __builtin___clear_cache(begin, end);
 }
 
+inline void FlushInstructionCache(char* begin, char* end) {
+  __builtin___clear_cache(begin, end);
+}
+
+// Flush instruction pipeline. Returns true on success, false if feature is unsupported.
+bool FlushInstructionPipeline();
+
 template <typename T>
 constexpr PointerSize ConvertToPointerSize(T any) {
   if (any == 4 || any == 8) {
diff --git a/runtime/jit/jit_code_cache.cc b/runtime/jit/jit_code_cache.cc
index 2b2898c..461eb81 100644
--- a/runtime/jit/jit_code_cache.cc
+++ b/runtime/jit/jit_code_cache.cc
@@ -28,6 +28,7 @@
 #include "base/stl_util.h"
 #include "base/systrace.h"
 #include "base/time_utils.h"
+#include "base/utils.h"
 #include "cha.h"
 #include "debugger_interface.h"
 #include "dex/dex_file_loader.h"
@@ -53,8 +54,9 @@
 namespace art {
 namespace jit {
 
-static constexpr int kProtData = PROT_READ | PROT_WRITE;
 static constexpr int kProtCode = PROT_READ | PROT_EXEC;
+static constexpr int kProtData = PROT_READ | PROT_WRITE;
+static constexpr int kProtProfile = PROT_READ;
 
 static constexpr size_t kCodeSizeLogThreshold = 50 * KB;
 static constexpr size_t kStackMapSizeLogThreshold = 50 * KB;
@@ -192,7 +194,7 @@
   //         to profile system server.
   // NOTE 2: We could just not create the code section at all but we will need to
   //         special case too many cases.
-  int memmap_flags_prot_code = used_only_for_profile_data ? (kProtCode & ~PROT_EXEC) : kProtCode;
+  int memmap_flags_prot_code = used_only_for_profile_data ? kProtProfile : kProtCode;
 
   std::string error_str;
   // Map name specific for android_os_Debug.cpp accounting.
@@ -801,6 +803,16 @@
     // https://android.googlesource.com/kernel/msm/+/3fbe6bc28a6b9939d0650f2f17eb5216c719950c
     FlushInstructionCache(reinterpret_cast<char*>(code_ptr),
                           reinterpret_cast<char*>(code_ptr + code_size));
+
+    // Ensure CPU instruction pipelines are flushed for all cores. This is necessary for
+    // correctness as code may still be in instruction pipelines despite the i-cache flush. It is
+    // not safe to assume that changing permissions with mprotect (RX->RWX->RX) will cause a TLB
+    // shootdown (incidentally invalidating the CPU pipelines by sending an IPI to all cores to
+    // notify them of the TLB invalidation). Some architectures, notably ARM and ARM64, have
+    // hardware support that broadcasts TLB invalidations and so their kernels have no software
+    // based TLB shootdown. FlushInstructionPipeline() is a wrapper around the Linux
+    // membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED) syscall which does the appropriate flushing.
+    FlushInstructionPipeline();
     DCHECK(!Runtime::Current()->IsAotCompiler());
     if (has_should_deoptimize_flag) {
       method_header->SetHasShouldDeoptimizeFlag();