ART: checkpoint mechanism optimization

GC thread and trim thread are both using checkpoint mechanism. GC thread
will request java threads to mark their thread roots by themselves. Trim
thread will request java threads to trim their jni local reference
tables by themselves.

The checkpint mechanism semantics is that the runnable java threads will
run checkpoint function itself at safepoint, and finally the java
threads and gc thread or trim thread is synchronized via barrier. If the
java threads are not runnable, gc thread or trim thread will suspend
them and then run their checkpoint functions one by one on behalf of
them. If all the java threads are not runnable, then gc thread or trim
thread will do all the work itself. In this case, there is no need
synchronization. This will save unnecessary synchronization and thread
state transitions.

Change-Id: If55940946cb3f8b1af42c7237c334f09c8ec7a9f
diff --git a/runtime/gc/collector/concurrent_copying.cc b/runtime/gc/collector/concurrent_copying.cc
index 754e217..7026b21 100644
--- a/runtime/gc/collector/concurrent_copying.cc
+++ b/runtime/gc/collector/concurrent_copying.cc
@@ -298,7 +298,11 @@
     Thread* self = Thread::Current();
     CHECK(thread == self || thread->IsSuspended() || thread->GetState() == kWaitingPerformingGc)
         << thread->GetState() << " thread " << thread << " self " << self;
-    concurrent_copying_->GetBarrier().Pass(self);
+    // If thread is a running mutator, then act on behalf of the garbage collector.
+    // See the code in ThreadList::RunCheckpoint.
+    if (thread->GetState() == kRunnable) {
+      concurrent_copying_->GetBarrier().Pass(self);
+    }
   }
 
  private:
@@ -431,6 +435,11 @@
   ThreadList* thread_list = Runtime::Current()->GetThreadList();
   gc_barrier_->Init(self, 0);
   size_t barrier_count = thread_list->RunCheckpoint(&check_point);
+  // If there are no threads to wait which implys that all the checkpoint functions are finished,
+  // then no need to release the mutator lock.
+  if (barrier_count == 0) {
+    return;
+  }
   // Release locks then wait for all mutator threads to pass the barrier.
   Locks::mutator_lock_->SharedUnlock(self);
   {
diff --git a/runtime/gc/collector/mark_sweep.cc b/runtime/gc/collector/mark_sweep.cc
index d7a9292..cd63d26 100644
--- a/runtime/gc/collector/mark_sweep.cc
+++ b/runtime/gc/collector/mark_sweep.cc
@@ -989,7 +989,11 @@
       mark_sweep_->GetHeap()->RevokeRosAllocThreadLocalBuffers(thread);
       ATRACE_END();
     }
-    mark_sweep_->GetBarrier().Pass(self);
+    // If thread is a running mutator, then act on behalf of the garbage collector.
+    // See the code in ThreadList::RunCheckpoint.
+    if (thread->GetState() == kRunnable) {
+      mark_sweep_->GetBarrier().Pass(self);
+    }
   }
 
  private:
@@ -1006,7 +1010,11 @@
   // run through the barrier including self.
   size_t barrier_count = thread_list->RunCheckpoint(&check_point);
   // Release locks then wait for all mutator threads to pass the barrier.
-  // TODO: optimize to not release locks when there are no threads to wait for.
+  // If there are no threads to wait which implys that all the checkpoint functions are finished,
+  // then no need to release locks.
+  if (barrier_count == 0) {
+    return;
+  }
   Locks::heap_bitmap_lock_->ExclusiveUnlock(self);
   Locks::mutator_lock_->SharedUnlock(self);
   {
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index 5a60c87..3f3add8 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -1061,7 +1061,11 @@
     ATRACE_BEGIN("Trimming reference table");
     thread->GetJniEnv()->locals.Trim();
     ATRACE_END();
-    barrier_->Pass(Thread::Current());
+    // If thread is a running mutator, then act on behalf of the trim thread.
+    // See the code in ThreadList::RunCheckpoint.
+    if (thread->GetState() == kRunnable) {
+      barrier_->Pass(Thread::Current());
+    }
   }
 
  private:
@@ -1079,7 +1083,9 @@
   TrimIndirectReferenceTableClosure closure(&barrier);
   ScopedThreadStateChange tsc(self, kWaitingForCheckPointsToRun);
   size_t barrier_count = Runtime::Current()->GetThreadList()->RunCheckpoint(&closure);
-  barrier.Increment(self, barrier_count);
+  if (barrier_count != 0) {
+    barrier.Increment(self, barrier_count);
+  }
   ATRACE_END();
 }
 
diff --git a/runtime/thread_list.cc b/runtime/thread_list.cc
index 58e5b9d..ef24efc 100644
--- a/runtime/thread_list.cc
+++ b/runtime/thread_list.cc
@@ -174,7 +174,9 @@
       MutexLock mu(self, *Locks::logging_lock_);
       *os_ << local_os.str();
     }
-    barrier_.Pass(self);
+    if (thread->GetState() == kRunnable) {
+      barrier_.Pass(self);
+    }
   }
 
   void WaitForThreadsToRunThroughCheckpoint(size_t threads_running_checkpoint) {
@@ -202,7 +204,9 @@
   }
   DumpCheckpoint checkpoint(&os);
   size_t threads_running_checkpoint = RunCheckpoint(&checkpoint);
-  checkpoint.WaitForThreadsToRunThroughCheckpoint(threads_running_checkpoint);
+  if (threads_running_checkpoint != 0) {
+    checkpoint.WaitForThreadsToRunThroughCheckpoint(threads_running_checkpoint);
+  }
 }
 
 void ThreadList::AssertThreadsAreSuspended(Thread* self, Thread* ignore1, Thread* ignore2) {
@@ -324,8 +328,7 @@
     Thread::resume_cond_->Broadcast(self);
   }
 
-  // Add one for self.
-  return count + suspended_count_modified_threads.size() + 1;
+  return count;
 }
 
 // Request that a checkpoint function be run on all active (non-suspended)