Fix broken runtime SetStatsEnabled logic

Previously, Runtime::SetStatsEnabled did not take stats_enabled_
into account when deciding whether to increment or decrement the
stats-enabled counter, so redundant enable/disable calls still moved
it. This resulted in counter underflows and other errors which caused
some CTS tests to fail.

Also added instrument_entrypoints_lock_ to guard the counter and the
entry point instrumentation state, preventing races between
concurrent callers.
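
The fix follows the pattern sketched below (a hypothetical standalone
model using std::mutex in place of the new instrument_entrypoints_lock_):
the enabled flag and the counter live under one lock, and the counter
only moves on a genuine state transition.

    #include <mutex>

    // Hypothetical model of the fixed logic: the current state is
    // consulted before touching the counter, all under a single lock.
    class FixedStats {
     public:
      void Set(bool new_state) {
        std::lock_guard<std::mutex> lock(mu_);
        if (enabled_ != new_state) {
          counter_ += new_state ? 1 : -1;  // redundant calls no longer move it
        }
        enabled_ = new_state;
      }
     private:
      std::mutex mu_;
      bool enabled_ = false;
      int counter_ = 0;
    };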

Bug: 17360878

(cherry picked from commit a98ffd745bbecb2e84a492194950c0b94966546b)

Change-Id: I21d241a58d35bd6a607aa2305c6da81720bd0886
diff --git a/runtime/base/mutex.cc b/runtime/base/mutex.cc
index 2c95ede..4383a7c 100644
--- a/runtime/base/mutex.cc
+++ b/runtime/base/mutex.cc
@@ -37,6 +37,7 @@
 ReaderWriterMutex* Locks::classlinker_classes_lock_ = nullptr;
 Mutex* Locks::deoptimization_lock_ = nullptr;
 ReaderWriterMutex* Locks::heap_bitmap_lock_ = nullptr;
+Mutex* Locks::instrument_entrypoints_lock_ = nullptr;
 Mutex* Locks::intern_table_lock_ = nullptr;
 Mutex* Locks::jni_libraries_lock_ = nullptr;
 Mutex* Locks::logging_lock_ = nullptr;
@@ -876,6 +877,10 @@
       } \
       current_lock_level = new_level;
 
+    UPDATE_CURRENT_LOCK_LEVEL(kInstrumentEntrypointsLock);
+    DCHECK(instrument_entrypoints_lock_ == nullptr);
+    instrument_entrypoints_lock_ = new Mutex("instrument entrypoint lock", current_lock_level);
+
     UPDATE_CURRENT_LOCK_LEVEL(kMutatorLock);
     DCHECK(mutator_lock_ == nullptr);
     mutator_lock_ = new ReaderWriterMutex("mutator lock", current_lock_level);
diff --git a/runtime/base/mutex.h b/runtime/base/mutex.h
index 8d2cdce..516fa07 100644
--- a/runtime/base/mutex.h
+++ b/runtime/base/mutex.h
@@ -100,6 +100,7 @@
   kTraceLock,
   kHeapBitmapLock,
   kMutatorLock,
+  kInstrumentEntrypointsLock,
   kThreadListSuspendThreadLock,
   kZygoteCreationLock,
 
@@ -491,6 +492,9 @@
   // potential deadlock cycle.
   static Mutex* thread_list_suspend_thread_lock_;
 
+  // Guards instrumentation of the allocation entry points.
+  static Mutex* instrument_entrypoints_lock_ ACQUIRED_AFTER(thread_list_suspend_thread_lock_);
+
   // The mutator_lock_ is used to allow mutators to execute in a shared (reader) mode or to block
   // mutators by having an exclusive (writer) owner. In normal execution each mutator thread holds
   // a share on the mutator_lock_. The garbage collector may also execute with shared access but
@@ -549,7 +553,7 @@
   // else                                          |  .. running ..
   //   Goto x                                      |  .. running ..
   //  .. running ..                                |  .. running ..
-  static ReaderWriterMutex* mutator_lock_ ACQUIRED_AFTER(thread_list_suspend_thread_lock_);
+  static ReaderWriterMutex* mutator_lock_ ACQUIRED_AFTER(instrument_entrypoints_lock_);
 
   // Allow reader-writer mutual exclusion on the mark and live bitmaps of the heap.
   static ReaderWriterMutex* heap_bitmap_lock_ ACQUIRED_AFTER(mutator_lock_);
diff --git a/runtime/debugger.cc b/runtime/debugger.cc
index 5729ad8..96b44bf 100644
--- a/runtime/debugger.cc
+++ b/runtime/debugger.cc
@@ -4453,7 +4453,7 @@
       recent_allocation_records_ = new AllocRecord[alloc_record_max_];
       CHECK(recent_allocation_records_ != nullptr);
     }
-    Runtime::Current()->GetInstrumentation()->InstrumentQuickAllocEntryPoints(false);
+    Runtime::Current()->GetInstrumentation()->InstrumentQuickAllocEntryPoints();
   } else {
     {
       ScopedObjectAccess soa(self);  // For type_cache_.Clear();
@@ -4469,7 +4469,7 @@
       type_cache_.Clear();
     }
     // If an allocation comes in before we uninstrument, we will safely drop it on the floor.
-    Runtime::Current()->GetInstrumentation()->UninstrumentQuickAllocEntryPoints(false);
+    Runtime::Current()->GetInstrumentation()->UninstrumentQuickAllocEntryPoints();
   }
 }
 
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index 56ab99d..d672510 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -426,7 +426,7 @@
     }
   }
   if (running_on_valgrind_) {
-    Runtime::Current()->GetInstrumentation()->InstrumentQuickAllocEntryPoints(false);
+    Runtime::Current()->GetInstrumentation()->InstrumentQuickAllocEntryPoints();
   }
   if (VLOG_IS_ON(heap) || VLOG_IS_ON(startup)) {
     LOG(INFO) << "Heap() exiting";
diff --git a/runtime/instrumentation.cc b/runtime/instrumentation.cc
index a2e88a6..15be6b7 100644
--- a/runtime/instrumentation.cc
+++ b/runtime/instrumentation.cc
@@ -597,45 +597,50 @@
   thread->ResetQuickAllocEntryPointsForThread();
 }
 
-void Instrumentation::SetEntrypointsInstrumented(bool instrumented, bool suspended) {
+void Instrumentation::SetEntrypointsInstrumented(bool instrumented) {
+  Thread* self = Thread::Current();
   Runtime* runtime = Runtime::Current();
   ThreadList* tl = runtime->GetThreadList();
-  if (suspended) {
-    Locks::mutator_lock_->AssertExclusiveHeld(Thread::Current());
-  }
-  if (runtime->IsStarted() && !suspended) {
+  Locks::mutator_lock_->AssertNotHeld(self);
+  Locks::instrument_entrypoints_lock_->AssertHeld(self);
+  if (runtime->IsStarted()) {
     tl->SuspendAll();
   }
   {
-    MutexLock mu(Thread::Current(), *Locks::runtime_shutdown_lock_);
+    MutexLock mu(self, *Locks::runtime_shutdown_lock_);
     SetQuickAllocEntryPointsInstrumented(instrumented);
     ResetQuickAllocEntryPoints();
   }
-  if (runtime->IsStarted() && !suspended) {
+  if (runtime->IsStarted()) {
     tl->ResumeAll();
   }
 }
 
-void Instrumentation::InstrumentQuickAllocEntryPoints(bool suspended) {
-  // TODO: the read of quick_alloc_entry_points_instrumentation_counter_ is racey and this code
-  //       should be guarded by a lock.
-  DCHECK_GE(quick_alloc_entry_points_instrumentation_counter_.LoadSequentiallyConsistent(), 0);
-  const bool enable_instrumentation =
-      quick_alloc_entry_points_instrumentation_counter_.FetchAndAddSequentiallyConsistent(1) == 0;
-  if (enable_instrumentation) {
-    SetEntrypointsInstrumented(true, suspended);
-  }
+void Instrumentation::InstrumentQuickAllocEntryPoints() {
+  MutexLock mu(Thread::Current(), *Locks::instrument_entrypoints_lock_);
+  InstrumentQuickAllocEntryPointsLocked();
 }
 
-void Instrumentation::UninstrumentQuickAllocEntryPoints(bool suspended) {
-  // TODO: the read of quick_alloc_entry_points_instrumentation_counter_ is racey and this code
-  //       should be guarded by a lock.
-  DCHECK_GT(quick_alloc_entry_points_instrumentation_counter_.LoadSequentiallyConsistent(), 0);
-  const bool disable_instrumentation =
-      quick_alloc_entry_points_instrumentation_counter_.FetchAndSubSequentiallyConsistent(1) == 1;
-  if (disable_instrumentation) {
-    SetEntrypointsInstrumented(false, suspended);
+void Instrumentation::UninstrumentQuickAllocEntryPoints() {
+  MutexLock mu(Thread::Current(), *Locks::instrument_entrypoints_lock_);
+  UninstrumentQuickAllocEntryPointsLocked();
+}
+
+void Instrumentation::InstrumentQuickAllocEntryPointsLocked() {
+  Locks::instrument_entrypoints_lock_->AssertHeld(Thread::Current());
+  if (quick_alloc_entry_points_instrumentation_counter_ == 0) {
+    SetEntrypointsInstrumented(true);
   }
+  ++quick_alloc_entry_points_instrumentation_counter_;
+}
+
+void Instrumentation::UninstrumentQuickAllocEntryPointsLocked() {
+  Locks::instrument_entrypoints_lock_->AssertHeld(Thread::Current());
+  CHECK_GT(quick_alloc_entry_points_instrumentation_counter_, 0U);
+  --quick_alloc_entry_points_instrumentation_counter_;
+  if (quick_alloc_entry_points_instrumentation_counter_ == 0) {
+    SetEntrypointsInstrumented(false);
+  }
 }
 
 void Instrumentation::ResetQuickAllocEntryPoints() {
diff --git a/runtime/instrumentation.h b/runtime/instrumentation.h
index 3c1c756..3017bf6 100644
--- a/runtime/instrumentation.h
+++ b/runtime/instrumentation.h
@@ -182,9 +182,13 @@
     return interpreter_handler_table_;
   }
 
-  void InstrumentQuickAllocEntryPoints(bool suspended)
+  void InstrumentQuickAllocEntryPoints() LOCKS_EXCLUDED(Locks::instrument_entrypoints_lock_);
+  void UninstrumentQuickAllocEntryPoints() LOCKS_EXCLUDED(Locks::instrument_entrypoints_lock_);
+  void InstrumentQuickAllocEntryPointsLocked()
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::instrument_entrypoints_lock_)
       LOCKS_EXCLUDED(Locks::thread_list_lock_, Locks::runtime_shutdown_lock_);
-  void UninstrumentQuickAllocEntryPoints(bool suspended)
+  void UninstrumentQuickAllocEntryPointsLocked()
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::instrument_entrypoints_lock_)
       LOCKS_EXCLUDED(Locks::thread_list_lock_, Locks::runtime_shutdown_lock_);
   void ResetQuickAllocEntryPoints() EXCLUSIVE_LOCKS_REQUIRED(Locks::runtime_shutdown_lock_);
 
@@ -350,7 +354,7 @@
 
   // No thread safety analysis to get around SetQuickAllocEntryPointsInstrumented requiring
   // exclusive access to mutator lock which you can't get if the runtime isn't started.
-  void SetEntrypointsInstrumented(bool instrumented, bool suspended) NO_THREAD_SAFETY_ANALYSIS;
+  void SetEntrypointsInstrumented(bool instrumented) NO_THREAD_SAFETY_ANALYSIS;
 
   void MethodEnterEventImpl(Thread* thread, mirror::Object* this_object,
                             mirror::ArtMethod* method, uint32_t dex_pc) const
@@ -455,8 +459,8 @@
   InterpreterHandlerTable interpreter_handler_table_ GUARDED_BY(Locks::mutator_lock_);
 
   // Greater than 0 if quick alloc entry points instrumented.
-  // TODO: The access and changes to this is racy and should be guarded by a lock.
-  AtomicInteger quick_alloc_entry_points_instrumentation_counter_;
+  size_t quick_alloc_entry_points_instrumentation_counter_
+      GUARDED_BY(Locks::instrument_entrypoints_lock_);
 
   DISALLOW_COPY_AND_ASSIGN(Instrumentation);
 };
diff --git a/runtime/native/dalvik_system_VMDebug.cc b/runtime/native/dalvik_system_VMDebug.cc
index d8a537f..ceff206 100644
--- a/runtime/native/dalvik_system_VMDebug.cc
+++ b/runtime/native/dalvik_system_VMDebug.cc
@@ -60,11 +60,11 @@
 }
 
 static void VMDebug_startAllocCounting(JNIEnv*, jclass) {
-  Runtime::Current()->SetStatsEnabled(true, false);
+  Runtime::Current()->SetStatsEnabled(true);
 }
 
 static void VMDebug_stopAllocCounting(JNIEnv*, jclass) {
-  Runtime::Current()->SetStatsEnabled(false, false);
+  Runtime::Current()->SetStatsEnabled(false);
 }
 
 static jint VMDebug_getAllocCount(JNIEnv*, jclass, jint kind) {
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index 3432aa8..49f8c63 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -1007,14 +1007,18 @@
   }
 }
 
-void Runtime::SetStatsEnabled(bool new_state, bool suspended) {
+void Runtime::SetStatsEnabled(bool new_state) {
+  Thread* self = Thread::Current();
+  MutexLock mu(self, *Locks::instrument_entrypoints_lock_);
   if (new_state == true) {
     GetStats()->Clear(~0);
     // TODO: wouldn't it make more sense to clear _all_ threads' stats?
-    Thread::Current()->GetStats()->Clear(~0);
-    GetInstrumentation()->InstrumentQuickAllocEntryPoints(suspended);
-  } else {
-    GetInstrumentation()->UninstrumentQuickAllocEntryPoints(suspended);
+    self->GetStats()->Clear(~0);
+    if (stats_enabled_ != new_state) {
+      GetInstrumentation()->InstrumentQuickAllocEntryPointsLocked();
+    }
+  } else if (stats_enabled_ != new_state) {
+    GetInstrumentation()->UninstrumentQuickAllocEntryPointsLocked();
   }
   stats_enabled_ = new_state;
 }
diff --git a/runtime/runtime.h b/runtime/runtime.h
index 84e40ad..35e3a88 100644
--- a/runtime/runtime.h
+++ b/runtime/runtime.h
@@ -389,7 +389,8 @@
 
   void ResetStats(int kinds);
 
-  void SetStatsEnabled(bool new_state, bool suspended);
+  void SetStatsEnabled(bool new_state) LOCKS_EXCLUDED(Locks::instrument_entrypoints_lock_,
+                                                      Locks::mutator_lock_);
 
   enum class NativeBridgeAction {  // private
     kUnload,
diff --git a/runtime/trace.cc b/runtime/trace.cc
index b32e042..4bb388f 100644
--- a/runtime/trace.cc
+++ b/runtime/trace.cc
@@ -361,6 +361,10 @@
   }
 
   Runtime* runtime = Runtime::Current();
+
+  // Whether to enable alloc counting, as requested in the trace flags.
+  bool enable_stats = false;
+
   runtime->GetThreadList()->SuspendAll();
 
   // Create Trace object.
@@ -369,13 +373,8 @@
     if (the_trace_ != NULL) {
       LOG(ERROR) << "Trace already in progress, ignoring this request";
     } else {
+      enable_stats = (flags & kTraceCountAllocs) != 0;
       the_trace_ = new Trace(trace_file.release(), buffer_size, flags, sampling_enabled);
-
-      // Enable count of allocs if specified in the flags.
-      if ((flags && kTraceCountAllocs) != 0) {
-        runtime->SetStatsEnabled(true, true);
-      }
-
       if (sampling_enabled) {
         CHECK_PTHREAD_CALL(pthread_create, (&sampling_pthread_, NULL, &RunSamplingThread,
                                             reinterpret_cast<void*>(interval_us)),
@@ -391,9 +390,15 @@
   }
 
   runtime->GetThreadList()->ResumeAll();
+
+  // Can't call this when holding the mutator lock.
+  if (enable_stats) {
+    runtime->SetStatsEnabled(true);
+  }
 }
 
 void Trace::Stop() {
+  bool stop_alloc_counting = false;
   Runtime* runtime = Runtime::Current();
   runtime->GetThreadList()->SuspendAll();
   Trace* the_trace = NULL;
@@ -409,6 +414,7 @@
     }
   }
   if (the_trace != NULL) {
+    stop_alloc_counting = (the_trace->flags_ & kTraceCountAllocs) != 0;
     the_trace->FinishTracing();
 
     if (the_trace->sampling_enabled_) {
@@ -425,6 +431,11 @@
   }
   runtime->GetThreadList()->ResumeAll();
 
+  if (stop_alloc_counting) {
+    // Can be racy if another thread toggles stats between ResumeAll and this call.
+    Runtime::Current()->SetStatsEnabled(false);
+  }
+
   if (sampling_pthread != 0U) {
     CHECK_PTHREAD_CALL(pthread_join, (sampling_pthread, NULL), "sampling thread shutdown");
     sampling_pthread_ = 0U;
@@ -489,10 +500,6 @@
 
   size_t final_offset = cur_offset_.LoadRelaxed();
 
-  if ((flags_ & kTraceCountAllocs) != 0) {
-    Runtime::Current()->SetStatsEnabled(false, true);
-  }
-
   std::set<mirror::ArtMethod*> visited_methods;
   GetVisitedMethods(final_offset, &visited_methods);