Checkpoint root marking.

Added a thread list checkpoint function that runs a checkpoint on every
thread. Runnable threads run the checkpoint callback themselves at their
next suspend check, while suspended threads are left suspended and have
the callback run on them by the requesting thread.

Added a checkpoint function member to each thread; it is invoked when the
checkpoint request flag is set during a transition from runnable to
suspended.

Using the checkpoint to mark thread roots reduces the first pause of a
partial / full GC to around 1 ms.
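
A minimal sketch of the intended usage (illustrative names; the real
implementation is CheckpointMarkThreadRoots / MarkRootsCheckpoint in the
mark_sweep diff below):

  class MarkThreadRootsCheckpoint : public Thread::CheckpointFunction {
   public:
    explicit MarkThreadRootsCheckpoint(MarkSweep* mark_sweep)
        : mark_sweep_(mark_sweep) {}
    virtual void Run(Thread* thread) {
      // Runs on the target thread at its next suspend check, or on the GC
      // thread if the target is already suspended.
      thread->VisitRoots(MarkSweep::MarkObjectVisitor, mark_sweep_);
      mark_sweep_->GetBarrier().Pass(Thread::Current());
    }
   private:
    MarkSweep* mark_sweep_;
  };

  // In the GC (inside a MarkSweep member function): request the checkpoint
  // on all threads, then block on the barrier until every thread has
  // passed it.
  MarkThreadRootsCheckpoint check_point(this);
  ThreadList* thread_list = Runtime::Current()->GetThreadList();
  gc_barrier_->Increment(Thread::Current(),
                         thread_list->RunCheckpoint(&check_point));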

Change-Id: I97239cc72ee0e4a3397e9138a62ee559268dce0a
diff --git a/build/Android.common.mk b/build/Android.common.mk
index 5d89c40..178af64 100644
--- a/build/Android.common.mk
+++ b/build/Android.common.mk
@@ -181,6 +181,7 @@
 	src/dlmalloc.cc \
 	src/file.cc \
 	src/file_linux.cc \
+	src/gc/barrier.cc \
 	src/gc/card_table.cc \
 	src/gc/heap_bitmap.cc \
 	src/gc/large_object_space.cc \
diff --git a/src/gc/barrier.cc b/src/gc/barrier.cc
new file mode 100644
index 0000000..aa9433b
--- /dev/null
+++ b/src/gc/barrier.cc
@@ -0,0 +1,46 @@
+#include "barrier.h"
+#include "../mutex.h"
+#include "thread.h"
+
+namespace art {
+
+Barrier::Barrier()
+    : count_(0),
+      lock_("GC barrier lock"),
+      condition_("GC barrier condition", lock_) {
+}
+
+void Barrier::Pass(Thread* self) {
+  MutexLock mu(self, lock_);
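+  // Decrement the count; SetCountLocked broadcasts to any waiters when it reaches zero.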
+  SetCountLocked(self, count_ - 1);
+}
+
+void Barrier::Wait(Thread* self) {
+  Increment(self, -1);
+}
+
+void Barrier::Init(Thread* self, int count) {
+  MutexLock mu(self, lock_);
+  SetCountLocked(self, count);
+}
+
+void Barrier::Increment(Thread* self, int delta) {
+  MutexLock mu(self, lock_);
+  SetCountLocked(self, count_ + delta);
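+  // If the count is still non-zero, block until another thread brings it to zero and broadcasts.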
+  if (count_ != 0) {
+    condition_.Wait(self);
+  }
+}
+
+void Barrier::SetCountLocked(Thread* self, int count) {
+  count_ = count;
+  if (count_ == 0) {
+    condition_.Broadcast(self);
+  }
+}
+
+Barrier::~Barrier() {
+  CHECK(!count_) << "Attempted to destroy barrier with non-zero count";
+}
+
+}  // namespace art
diff --git a/src/gc/barrier.h b/src/gc/barrier.h
new file mode 100644
index 0000000..207536a
--- /dev/null
+++ b/src/gc/barrier.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_SRC_GC_BARRIER_H_
+#define ART_SRC_GC_BARRIER_H_
+
+#include "../mutex.h"
+#include "locks.h"
+#include "UniquePtr.h"
+
+namespace art {
+
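+// A simple counting barrier used by the GC to wait for all threads to pass a checkpoint.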
+class Barrier {
+ public:
+  Barrier();
+  virtual ~Barrier();
+
+  // Pass through the barrier; decrements the count but does not block.
+  void Pass(Thread* self);
+
+  // Wait on the barrier; decrements the count and blocks until the count reaches zero.
+  void Wait(Thread* self);
+
+  // Set the count to a new value, if the value is 0 then everyone waiting on the condition
+  // variable is resumed.
+  void Init(Thread* self, int count);
+
+  // Increment the count by delta; block on the condition variable if the resulting count is non-zero.
+  void Increment(Thread* self, int delta);
+
+ private:
+  void SetCountLocked(Thread* self, int count) EXCLUSIVE_LOCKS_REQUIRED(lock_);
+
+  // Counter; when it reaches 0 all waiters blocked on the barrier are signalled.
+  int count_ GUARDED_BY(lock_);
+
+  Mutex lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
+  ConditionVariable condition_ GUARDED_BY(lock_);
+};
+
+}  // namespace art
+#endif  // ART_SRC_GC_BARRIER_H_
+
diff --git a/src/gc/card_table.h b/src/gc/card_table.h
index 2715f52..035c073 100644
--- a/src/gc/card_table.h
+++ b/src/gc/card_table.h
@@ -160,4 +160,4 @@
 };
 
 }  // namespace art
-#endif  // DALVIK_ALLOC_CARDTABLE_H_
+#endif  // ART_SRC_GC_CARDTABLE_H_
diff --git a/src/gc/mark_sweep.cc b/src/gc/mark_sweep.cc
index 980eed1..8637370 100644
--- a/src/gc/mark_sweep.cc
+++ b/src/gc/mark_sweep.cc
@@ -19,6 +19,7 @@
 #include <climits>
 #include <vector>
 
+#include "barrier.h"
 #include "card_table.h"
 #include "class_loader.h"
 #include "dex_cache.h"
@@ -37,10 +38,10 @@
 #include "thread.h"
 #include "thread_list.h"
 
-static const bool kUseMarkStackPrefetch = true;
-
 namespace art {
 
+static const bool kUseMarkStackPrefetch = true;
+
 class SetFingerVisitor {
  public:
   SetFingerVisitor(MarkSweep* const mark_sweep) : mark_sweep_(mark_sweep) {
@@ -68,7 +69,8 @@
       phantom_reference_list_(NULL),
       cleared_reference_list_(NULL),
       freed_bytes_(0), freed_objects_(0),
-      class_count_(0), array_count_(0), other_count_(0) {
+      class_count_(0), array_count_(0), other_count_(0),
+      gc_barrier_(new Barrier) {
   DCHECK(mark_stack_ != NULL);
 }
 
@@ -200,6 +202,10 @@
   Runtime::Current()->VisitNonConcurrentRoots(MarkObjectVisitor, this);
 }
 
+void MarkSweep::MarkNonThreadRoots() {
+  Runtime::Current()->VisitNonThreadRoots(MarkObjectVisitor, this);
+}
+
 void MarkSweep::MarkConcurrentRoots() {
   Runtime::Current()->VisitConcurrentRoots(MarkObjectVisitor, this);
 }
@@ -519,6 +525,38 @@
   Thread* self;
 };
 
+class CheckpointMarkThreadRoots : public Thread::CheckpointFunction {
+ public:
+  explicit CheckpointMarkThreadRoots(MarkSweep* mark_sweep) : mark_sweep_(mark_sweep) {}
+
+  virtual void Run(Thread* thread) NO_THREAD_SAFETY_ANALYSIS {
+    // Note: self is not necessarily equal to thread since thread may be suspended.
+    Thread* self = Thread::Current();
+    DCHECK(thread == self || thread->IsSuspended() || thread->GetState() == kWaitingPerformingGc);
+    WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
+    thread->VisitRoots(MarkSweep::MarkObjectVisitor, mark_sweep_);
+    mark_sweep_->GetBarrier().Pass(self);
+  }
+
+ private:
+  MarkSweep* mark_sweep_;
+};
+
+Barrier& MarkSweep::GetBarrier() {
+  return *gc_barrier_;
+}
+
+void MarkSweep::MarkRootsCheckpoint() {
+  UniquePtr<CheckpointMarkThreadRoots> check_point(new CheckpointMarkThreadRoots(this));
+  ThreadList* thread_list = Runtime::Current()->GetThreadList();
+  // Increment the barrier count. If all of the checkpoints have already finished, the count
+  // hits 0 and we continue immediately. Otherwise we are still waiting for some checkpoints,
+  // so the count stays positive and we unblock when it reaches zero.
+  gc_barrier_->Increment(Thread::Current(), thread_list->RunCheckpoint(check_point.get()));
+}
+
 void MarkSweep::SweepCallback(size_t num_ptrs, Object** ptrs, void* arg) {
   size_t freed_objects = num_ptrs;
   size_t freed_bytes = 0;
@@ -538,8 +576,7 @@
     freed_bytes += space->FreeList(self, num_ptrs, ptrs);
   } else {
     for (size_t i = 0; i < num_ptrs; ++i) {
-      Object* obj = static_cast<Object*>(ptrs[i]);
-      freed_bytes += space->Free(self, obj);
+      freed_bytes += space->Free(self, ptrs[i]);
     }
   }
 
diff --git a/src/gc/mark_sweep.h b/src/gc/mark_sweep.h
index ed74f99..d64439f 100644
--- a/src/gc/mark_sweep.h
+++ b/src/gc/mark_sweep.h
@@ -25,6 +25,7 @@
 
 namespace art {
 
+class Barrier;
 class CheckObjectVisitor;
 class Class;
 class Heap;
@@ -52,9 +53,15 @@
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
+  void MarkNonThreadRoots()
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
+
   void MarkConcurrentRoots();
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
+  void MarkRootsCheckpoint()
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
+
   // Verify that image roots point to only marked objects within the alloc space.
   void VerifyImageRoots() EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
@@ -183,6 +190,11 @@
     }
   }
 
+  static void MarkObjectVisitor(const Object* root, void* arg)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
+
+  Barrier& GetBarrier();
+
  private:
   // Returns true if the object has its bit set in the mark bitmap.
   bool IsMarked(const Object* object) const;
@@ -193,9 +205,6 @@
   static bool IsMarkedArrayCallback(const Object* object, void* arg)
       SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
-  static void MarkObjectVisitor(const Object* root, void* arg)
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
-
   static void ReMarkObjectVisitor(const Object* root, void* arg)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
@@ -417,6 +426,8 @@
   size_t array_count_;
   size_t other_count_;
 
+  UniquePtr<Barrier> gc_barrier_;
+
   friend class AddIfReachesAllocSpaceVisitor; // Used by mod-union table.
   friend class CheckBitmapVisitor;
   friend class CheckObjectVisitor;
diff --git a/src/heap.cc b/src/heap.cc
index 584c5b2..33f8366 100644
--- a/src/heap.cc
+++ b/src/heap.cc
@@ -1509,16 +1509,12 @@
 void Heap::CollectGarbageConcurrentMarkSweepPlan(Thread* self, GcType gc_type, GcCause gc_cause,
                                                  bool clear_soft_references) {
   TimingLogger timings("ConcurrentCollectGarbageInternal", true);
-  uint64_t root_begin = NanoTime(), root_end = 0, dirty_begin = 0, dirty_end = 0;
+  uint64_t gc_begin = NanoTime(), root_begin = 0, root_end = 0, dirty_begin = 0, dirty_end = 0;
   std::stringstream gc_type_str;
   gc_type_str << gc_type << " ";
 
-  // Suspend all threads are get exclusive access to the heap.
   ThreadList* thread_list = Runtime::Current()->GetThreadList();
-  thread_list->SuspendAll();
-  timings.AddSplit("SuspendAll");
-  Locks::mutator_lock_->AssertExclusiveHeld(self);
-
+  std::vector<byte*> dirty_cards;
   size_t bytes_freed = 0;
   Object* cleared_references = NULL;
   {
@@ -1528,48 +1524,9 @@
     mark_sweep.Init();
     timings.AddSplit("Init");
 
-    if (verify_pre_gc_heap_) {
-      WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
-      if (!VerifyHeapReferences()) {
-        LOG(FATAL) << "Pre " << gc_type_str.str() << "Gc verification failed";
-      }
-      timings.AddSplit("VerifyHeapReferencesPreGC");
-    }
-
-    // Swap the stacks, this is safe since all the mutators are suspended at this point.
-    SwapStacks();
-
-    // Check that all objects which reference things in the live stack are on dirty cards.
-    if (verify_missing_card_marks_) {
-      ReaderMutexLock mu(self, *Locks::heap_bitmap_lock_);
-      // Sort the live stack so that we can quickly binary search it later.
-      std::sort(live_stack_->Begin(), live_stack_->End());
-      if (!VerifyMissingCardMarks()) {
-        LOG(FATAL) << "Pre GC verification of missing card marks failed";
-      }
-    }
-
-    // We will need to know which cards were dirty for doing concurrent processing of dirty cards.
-    // TODO: Investigate using a mark stack instead of a vector.
-    std::vector<byte*> dirty_cards;
-    if (gc_type == kGcTypeSticky) {
-      dirty_cards.reserve(4 * KB);
-      for (Spaces::iterator it = spaces_.begin(); it != spaces_.end(); ++it) {
-        card_table_->GetDirtyCards(*it, dirty_cards);
-      }
-      timings.AddSplit("GetDirtyCards");
-    }
-
-    // Clear image space cards and keep track of cards we cleared in the mod-union table.
-    ClearCards(timings);
-
     {
       WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
 
-      for (Object** it = live_stack_->Begin(); it != live_stack_->End(); ++it) {
-        DCHECK(!GetLiveBitmap()->Test(*it));
-      }
-
       if (gc_type == kGcTypePartial) {
         // Copy the mark bits over from the live bits, do this as early as possible or else we can
         // accidentally un-mark roots.
@@ -1595,25 +1552,65 @@
                                   reinterpret_cast<Object*>(alloc_space_->Begin()));
       }
       mark_sweep.FindDefaultMarkBitmap();
+    }
 
-      // Marking roots is not necessary for sticky mark bits since we only actually require the
-      // remarking of roots.
-      if (gc_type != kGcTypeSticky) {
-        mark_sweep.MarkRoots();
-        timings.AddSplit("MarkRoots");
+    mark_sweep.MarkRootsCheckpoint();
+    timings.AddSplit("MarkRootsCheckpoint");
+
+    {
+      root_begin = NanoTime();
+
+      // Suspend all threads and get exclusive access to the heap.
+      thread_list->SuspendAll();
+      timings.AddSplit("SuspendAll");
+      Locks::mutator_lock_->AssertExclusiveHeld(self);
+
+      if (verify_pre_gc_heap_) {
+        WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
+        if (!VerifyHeapReferences()) {
+          LOG(FATAL) << "Pre " << gc_type_str.str() << "Gc verification failed";
+        }
+        timings.AddSplit("VerifyHeapReferencesPreGC");
       }
 
-      if (verify_mod_union_table_) {
-        zygote_mod_union_table_->Update();
-        zygote_mod_union_table_->Verify();
-        mod_union_table_->Update();
-        mod_union_table_->Verify();
+      // Swap the stacks, this is safe since all the mutators are suspended at this point.
+      SwapStacks();
+
+      // Check that all objects which reference things in the live stack are on dirty cards.
+      if (verify_missing_card_marks_) {
+        ReaderMutexLock mu(self, *Locks::heap_bitmap_lock_);
+        // Sort the live stack so that we can quickly binary search it later.
+        std::sort(live_stack_->Begin(), live_stack_->End());
+        if (!VerifyMissingCardMarks()) {
+          LOG(FATAL) << "Pre GC verification of missing card marks failed";
+        }
       }
+
+      // We will need to know which cards were dirty for doing concurrent processing of dirty cards.
+      // TODO: Investigate using a mark stack instead of a vector.
+      if (gc_type == kGcTypeSticky) {
+        dirty_cards.reserve(4 * KB);
+        for (Spaces::iterator it = spaces_.begin(); it != spaces_.end(); ++it) {
+          card_table_->GetDirtyCards(*it, dirty_cards);
+        }
+        timings.AddSplit("GetDirtyCards");
+      }
+
+      // Clear image space cards and keep track of cards we cleared in the mod-union table.
+      ClearCards(timings);
     }
 
     // Roots are marked on the bitmap and the mark_stack is empty.
     DCHECK(mark_stack_->IsEmpty());
 
+    if (verify_mod_union_table_) {
+      ReaderMutexLock reader_lock(self, *Locks::heap_bitmap_lock_);
+      zygote_mod_union_table_->Update();
+      zygote_mod_union_table_->Verify();
+      mod_union_table_->Update();
+      mod_union_table_->Verify();
+    }
+
     // Allow mutators to go again, acquire share on mutator_lock_ to continue.
     thread_list->ResumeAll();
     {
@@ -1622,11 +1619,11 @@
       timings.AddSplit("RootEnd");
 
       // Mark the roots which we can do concurrently.
+      WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
       mark_sweep.MarkConcurrentRoots();
       timings.AddSplit("MarkConcurrentRoots");
-
-      WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
-      UpdateAndMarkModUnion(&mark_sweep, timings, gc_type);
+      mark_sweep.MarkNonThreadRoots();
+      timings.AddSplit("MarkNonThreadRoots");
 
       if (gc_type != kGcTypeSticky) {
         // Mark everything allocated since the last as GC live so that we can sweep concurrently,
@@ -1636,6 +1633,8 @@
         timings.AddSplit("MarkStackAsLive");
       }
 
+      UpdateAndMarkModUnion(&mark_sweep, timings, gc_type);
+
       if (gc_type != kGcTypeSticky) {
         // Recursively mark all the non-image bits set in the mark bitmap.
         mark_sweep.RecursiveMark(gc_type == kGcTypePartial, timings);
@@ -1775,7 +1774,7 @@
   // If the GC was slow, then print timings in the log.
   uint64_t pause_roots = (root_end - root_begin) / 1000 * 1000;
   uint64_t pause_dirty = (dirty_end - dirty_begin) / 1000 * 1000;
-  uint64_t duration = (NanoTime() - root_begin) / 1000 * 1000;
+  uint64_t duration = (NanoTime() - gc_begin) / 1000 * 1000;
   total_paused_time_ += pause_roots + pause_dirty;
   if (pause_roots > MsToNs(5) || pause_dirty > MsToNs(5) ||
       (gc_cause == kGcCauseForAlloc && duration > MsToNs(20))) {
diff --git a/src/oat/runtime/support_thread.cc b/src/oat/runtime/support_thread.cc
index 2eef424..c33b42e 100644
--- a/src/oat/runtime/support_thread.cc
+++ b/src/oat/runtime/support_thread.cc
@@ -20,18 +20,32 @@
 
 namespace art {
 
+static void CheckSuspend(Thread* thread)
+    SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+  for (;;) {
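+    // Service any pending checkpoint request first, then any suspend request; loop until
+    // both flags are clear.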
+    if (thread->ReadFlag(kCheckpointRequest)) {
+      thread->RunCheckpointFunction();
+      thread->AtomicClearFlag(kCheckpointRequest);
+    } else if (thread->ReadFlag(kSuspendRequest)) {
+      thread->FullSuspendCheck();
+    } else {
+      break;
+    }
+  }
+}
+
 void CheckSuspendFromCode(Thread* thread)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
   // Called when thread->suspend_count_ != 0 on JNI return. JNI method acts as callee-save frame.
   thread->VerifyStack();
-  thread->FullSuspendCheck();
+  CheckSuspend(thread);
 }
 
 extern "C" void artTestSuspendFromCode(Thread* thread, AbstractMethod** sp)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
   // Called when suspend count check value is 0 and thread->suspend_count_ != 0
   FinishCalleeSaveFrameSetup(thread, sp, Runtime::kRefsOnly);
-  thread->FullSuspendCheck();
+  CheckSuspend(thread);
 }
 
 }  // namespace art
diff --git a/src/runtime.cc b/src/runtime.cc
index 3a5c41c..7bc1b70 100644
--- a/src/runtime.cc
+++ b/src/runtime.cc
@@ -1002,10 +1002,9 @@
   }
 }
 
-void Runtime::VisitNonConcurrentRoots(Heap::RootVisitor* visitor, void* arg) {
+void Runtime::VisitNonThreadRoots(Heap::RootVisitor* visitor, void* arg) {
   Dbg::VisitRoots(visitor, arg);
   java_vm_->VisitRoots(visitor, arg);
-  thread_list_->VisitRoots(visitor, arg);
   if (pre_allocated_OutOfMemoryError_ != NULL) {
     visitor(pre_allocated_OutOfMemoryError_, arg);
   }
@@ -1020,6 +1019,11 @@
   }
 }
 
+void Runtime::VisitNonConcurrentRoots(Heap::RootVisitor* visitor, void* arg) {
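+  // Thread stacks must be visited with the mutators suspended; the rest is shared with
+  // VisitNonThreadRoots.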
+  thread_list_->VisitRoots(visitor, arg);
+  VisitNonThreadRoots(visitor, arg);
+}
+
 void Runtime::DirtyRoots() {
   intern_table_->Dirty();
   class_linker_->Dirty();
diff --git a/src/runtime.h b/src/runtime.h
index 44823a0..02f5206 100644
--- a/src/runtime.h
+++ b/src/runtime.h
@@ -234,6 +234,9 @@
   // Visit all of the roots we can do safely do concurrently.
   void VisitConcurrentRoots(Heap::RootVisitor* visitor, void* arg);
 
+  // Visit all of the non-thread roots; we can do this with the mutators unpaused.
+  void VisitNonThreadRoots(Heap::RootVisitor* visitor, void* arg);
+
   // Visit all other roots which must be done with mutators suspended.
   void VisitNonConcurrentRoots(Heap::RootVisitor* visitor, void* arg)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
diff --git a/src/scoped_thread_state_change.h b/src/scoped_thread_state_change.h
index 32c3685..39f5c3f 100644
--- a/src/scoped_thread_state_change.h
+++ b/src/scoped_thread_state_change.h
@@ -49,6 +49,7 @@
         // A suspended transition to another effectively suspended transition, ok to use Unsafe.
         self_->SetState(new_thread_state);
       }
+
       if (runnable_transition && old_thread_state_ != new_thread_state) {
         if (new_thread_state == kRunnable) {
           self_->TransitionFromSuspendedToRunnable();
diff --git a/src/signal_catcher.cc b/src/signal_catcher.cc
index 80c37d4..b54c819 100644
--- a/src/signal_catcher.cc
+++ b/src/signal_catcher.cc
@@ -145,8 +145,11 @@
     }
   }
   os << "----- end " << getpid() << " -----\n";
-
   CHECK_EQ(self->SetStateUnsafe(old_state), kRunnable);
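+  // SetStateUnsafe does not service checkpoint requests, so run any checkpoint that was
+  // requested while we were dumping.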
+  if (self->ReadFlag(kCheckpointRequest)) {
+    self->RunCheckpointFunction();
+    self->AtomicClearFlag(kCheckpointRequest);
+  }
   self->EndAssertNoThreadSuspension(old_cause);
   thread_list->ResumeAll();
 
@@ -186,7 +189,7 @@
   CHECK(runtime->AttachCurrentThread("Signal Catcher", true, runtime->GetSystemThreadGroup()));
 
   Thread* self = Thread::Current();
-
+  DCHECK_NE(self->GetState(), kRunnable);
   {
     MutexLock mu(self, signal_catcher->lock_);
     signal_catcher->thread_ = self;
diff --git a/src/thread.cc b/src/thread.cc
index cea919f..67773a5 100644
--- a/src/thread.cc
+++ b/src/thread.cc
@@ -555,6 +555,19 @@
   }
 }
 
+bool Thread::RequestCheckpoint(CheckpointFunction* function) {
+  CHECK(!ReadFlag(kCheckpointRequest)) << "Already have a pending checkpoint request";
+  checkpoint_function_ = function;
+  union StateAndFlags old_state_and_flags = state_and_flags_;
+  // We must be runnable to request a checkpoint.
+  old_state_and_flags.as_struct.state = kRunnable;
+  union StateAndFlags new_state_and_flags = old_state_and_flags;
+  new_state_and_flags.as_struct.flags |= kCheckpointRequest;
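+  // Atomically set the request bit; the CAS below fails if the thread's state or flags changed,
+  // e.g. it is no longer runnable.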
+  int succeeded = android_atomic_cmpxchg(old_state_and_flags.as_int, new_state_and_flags.as_int,
+                                         &state_and_flags_.as_int);
+  return succeeded == 0;
+}
+
 void Thread::FullSuspendCheck() {
   VLOG(threads) << this << " self-suspending";
   // Make thread appear suspended to other threads, release mutator_lock_.
@@ -570,7 +583,20 @@
   DCHECK_EQ(this, Thread::Current());
   // Change to non-runnable state, thereby appearing suspended to the system.
   DCHECK_EQ(GetState(), kRunnable);
-  state_and_flags_.as_struct.state = new_state;
+  union StateAndFlags old_state_and_flags;
+  union StateAndFlags new_state_and_flags;
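+  // Use a CAS loop so the state change and the clearing of the checkpoint bit are atomic with
+  // respect to RequestCheckpoint.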
+  do {
+    old_state_and_flags = state_and_flags_;
+    // Copy over flags and try to clear the checkpoint bit if it is set.
+    new_state_and_flags.as_struct.flags = old_state_and_flags.as_struct.flags & ~kCheckpointRequest;
+    new_state_and_flags.as_struct.state = new_state;
+  } while (android_atomic_cmpxchg(old_state_and_flags.as_int, new_state_and_flags.as_int,
+                                  &state_and_flags_.as_int) != 0);
+  // If we toggled the checkpoint flag we must have cleared it.
+  uint16_t flag_change = new_state_and_flags.as_struct.flags ^ old_state_and_flags.as_struct.flags;
+  if ((flag_change & kCheckpointRequest) != 0) {
+    RunCheckpointFunction();
+  }
   // Release share on mutator_lock_.
   Locks::mutator_lock_->SharedUnlock(this);
 }
@@ -935,6 +961,7 @@
       pthread_self_(0),
       no_thread_suspension_(0),
       last_no_thread_suspension_cause_(NULL),
+      checkpoint_function_(0),
       thread_exit_check_count_(0) {
   CHECK_EQ((sizeof(Thread) % 4), 0U) << sizeof(Thread);
   state_and_flags_.as_struct.flags = 0;
diff --git a/src/thread.h b/src/thread.h
index d559449..abfd719 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -101,10 +101,17 @@
   kSuspendRequest   = 1,  // If set implies that suspend_count_ > 0.
   kExceptionPending = 2,  // If set implies that exception_ != NULL.
   kEnterInterpreter = 4,  // Instruct managed code it should enter the interpreter.
+  kCheckpointRequest = 8, // Request that the thread do some set of work.
 };
 
 class PACKED Thread {
  public:
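+  // Function object run at a checkpoint: by the thread itself at its next suspend check, or by
+  // the requesting thread if the target is suspended.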
+  class CheckpointFunction {
+   public:
+    virtual ~CheckpointFunction() { }
+    virtual void Run(Thread* self) = 0;
+  };
+
   // Space to throw a StackOverflowError in.
 #if !defined(ART_USE_LLVM_COMPILER)
   static const size_t kStackOverflowReservedBytes = 4 * KB;
@@ -167,13 +174,17 @@
     return debug_suspend_count_;
   }
 
-  bool IsSuspended() const EXCLUSIVE_LOCKS_REQUIRED(Locks::thread_suspend_count_lock_) {
-    return GetState() != kRunnable && ReadFlag(kSuspendRequest);
+  bool IsSuspended() const {
+    union StateAndFlags state_and_flags = state_and_flags_;
+    return state_and_flags.as_struct.state != kRunnable &&
+        (state_and_flags.as_struct.flags & kSuspendRequest) != 0;
   }
 
   void ModifySuspendCount(Thread* self, int delta, bool for_debugger)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::thread_suspend_count_lock_);
 
+  bool RequestCheckpoint(CheckpointFunction* function);
+
   // Called when thread detected that the thread_suspend_count_ was non-zero. Gives up share of
   // mutator_lock_ and waits until it is resumed and thread_suspend_count_ is zero.
   void FullSuspendCheck()
@@ -570,6 +581,19 @@
     held_mutexes_[level] = mutex;
   }
 
+  void RunCheckpointFunction() {
+    CHECK(checkpoint_function_ != NULL);
+    checkpoint_function_->Run(this);
+  }
+
+  bool ReadFlag(ThreadFlag flag) const {
+    return (state_and_flags_.as_struct.flags & flag) != 0;
+  }
+
+  void AtomicSetFlag(ThreadFlag flag);
+
+  void AtomicClearFlag(ThreadFlag flag);
+
  private:
   // We have no control over the size of 'bool', but want our boolean fields
   // to be 4-byte quantities.
@@ -617,14 +641,6 @@
 
   void NotifyLocked(Thread* self) EXCLUSIVE_LOCKS_REQUIRED(wait_mutex_);
 
-  bool ReadFlag(ThreadFlag flag) const {
-    return (state_and_flags_.as_struct.flags & flag) != 0;
-  }
-
-  void AtomicSetFlag(ThreadFlag flag);
-
-  void AtomicClearFlag(ThreadFlag flag);
-
   static void ThreadExitCallback(void* arg);
 
   // TLS key used to retrieve the Thread*.
@@ -759,6 +775,9 @@
   // Cause for last suspension.
   const char* last_no_thread_suspension_cause_;
 
+  // Pending checkpoint functions.
+  CheckpointFunction* checkpoint_function_;
+
  public:
   // Runtime support function pointers
   // TODO: move this near the top, since changing its offset requires all oats to be recompiled!
diff --git a/src/thread_list.cc b/src/thread_list.cc
index 601a93f..4ad25ae 100644
--- a/src/thread_list.cc
+++ b/src/thread_list.cc
@@ -151,6 +151,79 @@
 }
 #endif
 
+size_t ThreadList::RunCheckpoint(Thread::CheckpointFunction* checkpoint_function) {
+  Thread* self = Thread::Current();
+  if (kIsDebugBuild) {
+    Locks::mutator_lock_->AssertNotHeld(self);
+    Locks::thread_list_lock_->AssertNotHeld(self);
+    Locks::thread_suspend_count_lock_->AssertNotHeld(self);
+    CHECK_NE(self->GetState(), kRunnable);
+  }
+
+  std::vector<Thread*> suspended_count_modified_threads;
+  size_t count = 0;
+  {
+    // Call the checkpoint function for each thread; threads which are suspended get their
+    // checkpoint called manually.
+    MutexLock mu(self, *Locks::thread_list_lock_);
+    // TODO: C++0x auto.
+    for (It it = list_.begin(), end = list_.end(); it != end; ++it) {
+      Thread* thread = *it;
+      if (thread != self) {
+        for (;;) {
+          if (thread->RequestCheckpoint(checkpoint_function)) {
+            // This thread will run its checkpoint some time in the near future.
+            count++;
+            break;
+          } else {
+            // The thread is probably suspended; try to make sure that it stays suspended.
+            MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
+            // If the thread switched back to runnable, retry the checkpoint request.
+            if (thread->GetState() == kRunnable) {
+              continue;
+            }
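+            // Bump the suspend count so the thread cannot become runnable before we run its
+            // checkpoint on its behalf.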
+            thread->ModifySuspendCount(self, +1, false);
+            suspended_count_modified_threads.push_back(thread);
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  // Run the checkpoint on ourself while we wait for threads to suspend.
+  checkpoint_function->Run(self);
+
+  // Run the checkpoint on the suspended threads.
+  for (size_t i = 0; i < suspended_count_modified_threads.size(); ++i) {
+    Thread* thread = suspended_count_modified_threads[i];
+    if (!thread->IsSuspended()) {
+      // Wait until the thread is suspended.
+      uint64_t start = NanoTime();
+      do {
+        // Sleep for 100us.
+        usleep(100);
+      } while (!thread->IsSuspended());
+      uint64_t end = NanoTime();
+      // Shouldn't need to wait for longer than 1 millisecond.
+      const uint64_t threshold = 1;
+      if (NsToMs(end - start) > threshold) {
+        LOG(INFO) << "Warning: waited longer than " << threshold << " ms for thrad suspend"
+                  << std::endl;
+      }
+    }
+    // We know for sure that the thread is suspended at this point.
+    thread->RunCheckpointFunction();
+    {
+      MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
+      thread->ModifySuspendCount(self, -1, false);
+    }
+  }
+
+  // Add one for self.
+  return count + suspended_count_modified_threads.size() + 1;
+}
+
 void ThreadList::SuspendAll() {
   Thread* self = Thread::Current();
 
diff --git a/src/thread_list.h b/src/thread_list.h
index 3142fd3..a41fa57 100644
--- a/src/thread_list.h
+++ b/src/thread_list.h
@@ -55,6 +55,12 @@
       LOCKS_EXCLUDED(Locks::thread_list_lock_,
                      Locks::thread_suspend_count_lock_);
 
+  // Run a checkpoint on all threads. Running threads are not suspended but run the checkpoint
+  // inside of the next suspend check. Returns the number of checkpoints we should expect to run.
+  size_t RunCheckpoint(Thread::CheckpointFunction* checkpoint_function)
+      LOCKS_EXCLUDED(Locks::thread_list_lock_,
+                     Locks::thread_suspend_count_lock_);
+
   // Suspends all threads
   void SuspendAllForDebugger()
       LOCKS_EXCLUDED(Locks::mutator_lock_,