Sticky mark bits "generational" GC

Sticky mark bits GC: implemented using an allocation stack, which lets us reuse the live bitmap from the previous GC as the mark bitmap.
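
The core idea, as a minimal standalone C++ sketch (illustrative only: SimpleHeap, Obj and the member
names below are invented for this example and are not ART types): new allocations are only recorded
on an allocation stack instead of being set live immediately, so the live set computed by the
previous GC can be reused directly as this GC's mark set, and sweeping only needs to look at the
objects allocated since the last GC.

  #include <cstdio>
  #include <vector>

  // Stand-in for a heap object; "reachable" plays the role of being found by marking.
  struct Obj {
    bool reachable;
  };

  class SimpleHeap {
   public:
    // New allocations are only recorded on the allocation stack; the live set left
    // over from the previous GC is untouched and can double as this GC's mark set.
    Obj* Alloc(bool reachable) {
      Obj* obj = new Obj{reachable};
      allocation_stack_.push_back(obj);
      return obj;
    }

    // Sticky GC: old survivors are already "marked" (they sit in live_set_), so only
    // the objects allocated since the last GC need to be visited and swept.
    void StickyGc() {
      size_t freed = 0;
      for (Obj* obj : allocation_stack_) {
        if (obj->reachable) {
          live_set_.push_back(obj);  // Promote into the live set.
        } else {
          delete obj;                // Swept without scanning any old objects.
          ++freed;
        }
      }
      allocation_stack_.clear();
      std::printf("Sticky GC freed %zu new objects, %zu objects live\n",
                  freed, live_set_.size());
    }

   private:
    std::vector<Obj*> live_set_;          // Plays the role of the previous live bitmap.
    std::vector<Obj*> allocation_stack_;  // Objects allocated since the last GC.
  };

  int main() {
    SimpleHeap heap;
    heap.Alloc(true);   // Reachable; survives the sticky GC.
    heap.Alloc(false);  // Garbage created since the last GC; freed.
    heap.Alloc(false);
    heap.StickyGc();    // Visits only the three new objects.
    return 0;
  }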

Removed heap_end_ since checking against it caused more overhead than it saved.

Removed the check for impossible allocations at the start of AllocObject since these
allocations just fall through and fail anyway; they do not happen often enough to be
worth checking for.

Various constant-factor performance optimizations.

Benchmark improvements (measured before the locking regression):
DeltaBlue: ~0.3 sec faster runtime.
CaffeineMark: ~300 points higher total score, due to an improved String score.

Change-Id: I15016f1ae7fdf76fc3aadb5774b527bf802d9701
diff --git a/src/card_table.cc b/src/card_table.cc
index 6c127b6..5bfdfaf 100644
--- a/src/card_table.cc
+++ b/src/card_table.cc
@@ -100,6 +100,25 @@
   memset(mem_map_->Begin(), GC_CARD_CLEAN, mem_map_->Size());
 }
 
+void CardTable::PreClearCards(Space* space, std::vector<byte*>& out_cards) {
+  byte* card_end = CardFromAddr(space->End());
+  for (byte* card_cur = CardFromAddr(space->Begin()); card_cur < card_end; ++card_cur) {
+    if (*card_cur == GC_CARD_DIRTY) {
+      out_cards.push_back(card_cur);
+      *card_cur = GC_CARD_CLEAN;
+    }
+  }
+}
+
+void CardTable::GetDirtyCards(Space* space, std::vector<byte*>& out_cards) const {
+  byte* card_end = CardFromAddr(space->End());
+  for (byte* card_cur = CardFromAddr(space->Begin()); card_cur < card_end; ++card_cur) {
+    if (*card_cur == GC_CARD_DIRTY) {
+      out_cards.push_back(card_cur);
+    }
+  }
+}
+
 void CardTable::CheckAddrIsInCardTable(const byte* addr) const {
   byte* card_addr = biased_begin_ + ((uintptr_t)addr >> GC_CARD_SHIFT);
   if (!IsValidCard(card_addr)) {
diff --git a/src/card_table.h b/src/card_table.h
index e1d0646..66357c9 100644
--- a/src/card_table.h
+++ b/src/card_table.h
@@ -73,8 +73,9 @@
   }
 
   // For every dirty card between begin and end invoke the visitor with the specified argument.
-  template <typename Visitor>
-  void Scan(SpaceBitmap* bitmap, byte* scan_begin, byte* scan_end, const Visitor& visitor) const
+  template <typename Visitor, typename FingerVisitor>
+  void Scan(SpaceBitmap* bitmap, byte* scan_begin, byte* scan_end,
+            const Visitor& visitor, const FingerVisitor& finger_visitor) const
       EXCLUSIVE_LOCKS_REQUIRED(GlobalSynchronization::heap_bitmap_lock_)
       SHARED_LOCKS_REQUIRED(GlobalSynchronization::mutator_lock_) {
     DCHECK(bitmap->HasAddress(scan_begin));
@@ -82,21 +83,22 @@
     byte* card_cur = CardFromAddr(scan_begin);
     byte* card_end = CardFromAddr(scan_end);
     while (card_cur < card_end) {
-      while (card_cur < card_end && *card_cur == GC_CARD_CLEAN) {
-        card_cur++;
+      // Find the first dirty card.
+      card_cur = reinterpret_cast<byte*>(memchr(card_cur, GC_CARD_DIRTY, card_end - card_cur));
+      if (card_cur == NULL) {
+        break;
       }
-      byte* run_start = card_cur;
+      byte* run_start = card_cur++;
 
-      while (card_cur < card_end && *card_cur == GC_CARD_DIRTY) {
+      // Guaranteed to have at least one clean card at the end of the array.
+      while (*card_cur == GC_CARD_DIRTY) {
         card_cur++;
       }
       byte* run_end = card_cur;
 
-      if (run_start != run_end) {
-        uintptr_t start = reinterpret_cast<uintptr_t>(AddrFromCard(run_start));
-        uintptr_t end = reinterpret_cast<uintptr_t>(AddrFromCard(run_end));
-        bitmap->VisitMarkedRange(start, end, visitor);
-      }
+      uintptr_t start = reinterpret_cast<uintptr_t>(AddrFromCard(run_start));
+      uintptr_t end = reinterpret_cast<uintptr_t>(AddrFromCard(run_end));
+      bitmap->VisitMarkedRange(start, end, visitor, finger_visitor);
     }
   }
 
@@ -109,6 +111,12 @@
   // Resets all of the bytes in the card table which do not map to the image space.
   void ClearSpaceCards(Space* space);
 
+  // Clears the dirty cards which map to a space and records them in out_cards.
+  void PreClearCards(Space* space, std::vector<byte*>& out_cards);
+
+  // Returns all of the dirty cards which map to a space.
+  void GetDirtyCards(Space* space, std::vector<byte*>& out_cards) const;
+
   // Returns the first address in the heap which maps to this card.
   void* AddrFromCard(const byte *card_addr) const {
     DCHECK(IsValidCard(card_addr))
diff --git a/src/class_linker.cc b/src/class_linker.cc
index 3514612..7576c66 100644
--- a/src/class_linker.cc
+++ b/src/class_linker.cc
@@ -927,9 +927,14 @@
   }
 
   // reinit clases_ table
-  const Spaces& vec = heap->GetSpaces();
-  for (Spaces::const_iterator cur = vec.begin(); cur != vec.end(); ++cur) {
-    (*cur)->GetLiveBitmap()->Walk(InitFromImageCallback, this);
+  {
+    ReaderMutexLock mu(*GlobalSynchronization::heap_bitmap_lock_);
+    heap->FlushAllocStack();
+    const Spaces& vec = heap->GetSpaces();
+    // TODO: C++0x auto
+    for (Spaces::const_iterator it = vec.begin(); it != vec.end(); ++it) {
+      (*it)->GetLiveBitmap()->Walk(InitFromImageCallback, this);
+    }
   }
 
   // reinit class_roots_
diff --git a/src/heap.cc b/src/heap.cc
index 658755e..2bf1372 100644
--- a/src/heap.cc
+++ b/src/heap.cc
@@ -144,6 +144,7 @@
       concurrent_start_bytes_(std::numeric_limits<size_t>::max()),
       concurrent_start_size_(128 * KB),
       concurrent_min_free_(256 * KB),
+      sticky_gc_count_(0),
       num_bytes_allocated_(0),
       num_objects_allocated_(0),
       last_trim_time_(0),
@@ -237,9 +238,22 @@
   CHECK(zygote_mod_union_table_.get() != NULL) << "Failed to create Zygote mod-union table";
 
   num_bytes_allocated_ = 0;
+  for (Spaces::const_iterator it = spaces_.begin(); it != spaces_.end(); ++it) {
+    if ((*it)->IsImageSpace()) {
+      num_bytes_allocated_ += (*it)->AsImageSpace()->Size();
+    }
+  }
+
+  // TODO: Count objects in the image space here.
   num_objects_allocated_ = 0;
 
-  mark_stack_.reset(MarkStack::Create());
+  // Max stack size in bytes.
+  static const size_t max_stack_size = capacity / SpaceBitmap::kAlignment * kWordSize;
+
+  // TODO: Rename MarkStack to a more generic name?
+  mark_stack_.reset(MarkStack::Create("dalvik-mark-stack", max_stack_size));
+  allocation_stack_.reset(MarkStack::Create("dalvik-allocation-stack", max_stack_size));
+  live_stack_.reset(MarkStack::Create("dalvik-live-stack", max_stack_size));
 
   // It's still too early to take a lock because there are no threads yet,
   // but we can create the heap lock now. We don't create it earlier to
@@ -269,11 +283,35 @@
   DCHECK(space->GetMarkBitmap() != NULL);
   mark_bitmap_->AddSpaceBitmap(space->GetMarkBitmap());
   spaces_.push_back(space);
+  if (space->IsAllocSpace()) {
+    alloc_space_ = space->AsAllocSpace();
+  }
+
   // Ensure that spaces remain sorted in increasing order of start address (required for CMS finger)
   std::sort(spaces_.begin(), spaces_.end(), SpaceSorter());
+
+  // Ensure that ImageSpaces < ZygoteSpaces < AllocSpaces so that we can do address based checks to
+  // avoid redundant marking.
+  bool seen_zygote = false, seen_alloc = false;
+  for (Spaces::const_iterator it = spaces_.begin(); it != spaces_.end(); ++it) {
+    Space* space = *it;
+    if (space->IsImageSpace()) {
+      DCHECK(!seen_zygote);
+      DCHECK(!seen_alloc);
+    } else if (space->IsZygoteSpace()) {
+      DCHECK(!seen_alloc);
+      seen_zygote = true;
+    } else if (space->IsAllocSpace()) {
+      seen_alloc = true;
+    }
+  }
 }
 
 Heap::~Heap() {
+  // If we don't reset then the mark stack complains in its destructor.
+  allocation_stack_->Reset();
+  live_stack_->Reset();
+
   VLOG(heap) << "~Heap()";
   // We can't take the heap lock here because there might be a daemon thread suspended with the
   // heap lock held. We know though that no non-daemon threads are executing, and we know that
@@ -402,35 +440,31 @@
       Runtime::Current()->GetThreadList()->GetLockOwner() == Thread::Current()->GetTid()) {
     return;
   }
-  {
-    ReaderMutexLock mu(GlobalSynchronization::heap_bitmap_lock_);
-    Heap::VerifyObjectLocked(obj);
-  }
+  VerifyObjectBody(obj);
 }
 #endif
 
 void Heap::DumpSpaces() {
   // TODO: C++0x auto
   for (Spaces::iterator it = spaces_.begin(); it != spaces_.end(); ++it) {
-    LOG(INFO) << **it;
+    Space* space = *it;
+    LOG(INFO) << *space;
+    LOG(INFO) << *space->GetLiveBitmap();
+    LOG(INFO) << *space->GetMarkBitmap();
   }
 }
 
-void Heap::VerifyObjectLocked(const Object* obj) {
-  GlobalSynchronization::heap_bitmap_lock_->AssertReaderHeld();
+// Kept compiled even when verification is disabled so that this code does not bit-rot.
+void Heap::VerifyObjectBody(const Object* obj) {
   if (!IsAligned<kObjectAlignment>(obj)) {
     LOG(FATAL) << "Object isn't aligned: " << obj;
   } else if (!GetLiveBitmap()->Test(obj)) {
-    Space* space = FindSpaceFromObject(obj);
-    if (space == NULL) {
-      DumpSpaces();
-      LOG(FATAL) << "Object " << obj << " is not contained in any space";
-    }
-    LOG(FATAL) << "Object is dead: " << obj << " in space " << *space;
+    DumpSpaces();
+    LOG(FATAL) << "Object is dead: " << obj;
   }
-#if !VERIFY_OBJECT_FAST
+
   // Ignore early dawn of the universe verifications
-  if (num_objects_allocated_ > 10) {
+  if (!VERIFY_OBJECT_FAST && num_objects_allocated_ > 10) {
     const byte* raw_addr = reinterpret_cast<const byte*>(obj) +
         Object::ClassOffset().Int32Value();
     const Class* c = *reinterpret_cast<Class* const *>(raw_addr);
@@ -450,12 +484,11 @@
     const Class* c_c_c = *reinterpret_cast<Class* const *>(raw_addr);
     CHECK_EQ(c_c, c_c_c);
   }
-#endif
 }
 
 void Heap::VerificationCallback(Object* obj, void* arg) {
   DCHECK(obj != NULL);
-  reinterpret_cast<Heap*>(arg)->VerifyObjectLocked(obj);
+  reinterpret_cast<Heap*>(arg)->VerifyObjectBody(obj);
 }
 
 void Heap::VerifyHeap() {
@@ -480,31 +513,31 @@
       thread_stats->allocated_bytes += size;
     }
   }
-  {
-    WriterMutexLock mu(*GlobalSynchronization::heap_bitmap_lock_);
-    live_bitmap_->Set(obj);
-  }
+
+  DCHECK(obj);
+
+  allocation_stack_->AtomicPush(obj);
+#if VERIFY_OBJECT_ENABLED
+  WriterMutexLock mu(*GlobalSynchronization::heap_bitmap_lock_);
+  // Object verification requires objects in the allocation stack to be marked as live.
+  live_bitmap_->Set(obj);
+#endif
 }
 
 void Heap::RecordFree(size_t freed_objects, size_t freed_bytes) {
   MutexLock mu(*statistics_lock_);
 
-  if (freed_objects < num_objects_allocated_) {
-    num_objects_allocated_ -= freed_objects;
-  } else {
-    num_objects_allocated_ = 0;
-  }
-  if (freed_bytes < num_bytes_allocated_) {
-    num_bytes_allocated_ -= freed_bytes;
-  } else {
-    num_bytes_allocated_ = 0;
-  }
+  DCHECK_LE(freed_objects, num_objects_allocated_);
+  num_objects_allocated_ -= freed_objects;
+
+  DCHECK_LE(freed_bytes, num_bytes_allocated_);
+  num_bytes_allocated_ -= freed_bytes;
 
   if (Runtime::Current()->HasStatsEnabled()) {
     RuntimeStats* global_stats = Runtime::Current()->GetStats();
     RuntimeStats* thread_stats = Thread::Current()->GetStats();
-    ++global_stats->freed_objects;
-    ++thread_stats->freed_objects;
+    global_stats->freed_objects += freed_objects;
+    thread_stats->freed_objects += freed_objects;
     global_stats->freed_bytes += freed_bytes;
     thread_stats->freed_bytes += freed_bytes;
   }
@@ -532,20 +565,6 @@
   self->AssertThreadSuspensionIsAllowable();
 #endif
 
-  // Fail impossible allocations
-  if (alloc_size > space->Capacity()) {
-    // On failure collect soft references
-    WaitForConcurrentGcToComplete();
-    if (Runtime::Current()->HasStatsEnabled()) {
-      ++Runtime::Current()->GetStats()->gc_for_alloc_count;
-      ++Thread::Current()->GetStats()->gc_for_alloc_count;
-    }
-    self->TransitionFromRunnableToSuspended(kWaitingPerformingGc);
-    CollectGarbageInternal(false, true);
-    self->TransitionFromSuspendedToRunnable();
-    return NULL;
-  }
-
   Object* ptr = space->AllocWithoutGrowth(alloc_size);
   if (ptr != NULL) {
     return ptr;
@@ -560,7 +579,7 @@
       ++Thread::Current()->GetStats()->gc_for_alloc_count;
     }
     self->TransitionFromRunnableToSuspended(kWaitingPerformingGc);
-    CollectGarbageInternal(have_zygote_space_, false);
+    CollectGarbageInternal(have_zygote_space_ ? GC_PARTIAL : GC_FULL, false);
     self->TransitionFromSuspendedToRunnable();
   }
 
@@ -569,6 +588,26 @@
     return ptr;
   }
 
+  const size_t alloc_space_size = alloc_space_->Size();
+  if (alloc_space_size > kMinAllocSpaceSizeForStickyGC &&
+      alloc_space_->Capacity() - alloc_space_size < kMinRemainingSpaceForStickyGC) {
+    // The previous GC didn't free enough memory; try a sticky GC.
+    if (Runtime::Current()->HasStatsEnabled()) {
+      ++Runtime::Current()->GetStats()->gc_for_alloc_count;
+      ++Thread::Current()->GetStats()->gc_for_alloc_count;
+    }
+
+    // Don't bother trying a young GC unless we have a few MB of AllocSpace.
+    self->TransitionFromRunnableToSuspended(kWaitingPerformingGc);
+    CollectGarbageInternal(GC_STICKY, false);
+    self->TransitionFromSuspendedToRunnable();
+
+    ptr = space->AllocWithoutGrowth(alloc_size);
+    if (ptr != NULL) {
+      return ptr;
+    }
+  }
+
   if (!have_zygote_space_) {
     // Partial GC didn't free enough memory, try a full GC.
     if (Runtime::Current()->HasStatsEnabled()) {
@@ -576,8 +615,9 @@
       ++Thread::Current()->GetStats()->gc_for_alloc_count;
     }
     self->TransitionFromRunnableToSuspended(kWaitingPerformingGc);
-    CollectGarbageInternal(false, false);
+    CollectGarbageInternal(GC_PARTIAL, false);
     self->TransitionFromSuspendedToRunnable();
+
     ptr = space->AllocWithoutGrowth(alloc_size);
     if (ptr != NULL) {
       return ptr;
@@ -610,22 +650,18 @@
   }
   // We don't need a WaitForConcurrentGcToComplete here either.
   self->TransitionFromRunnableToSuspended(kWaitingPerformingGc);
-  CollectGarbageInternal(false, true);
+  CollectGarbageInternal(GC_FULL, true);
   self->TransitionFromSuspendedToRunnable();
-  ptr = space->AllocWithGrowth(alloc_size);
-  if (ptr != NULL) {
-    return ptr;
-  }
-  // Allocation failed.
-  return NULL;
+  return space->AllocWithGrowth(alloc_size);
 }
 
 int64_t Heap::GetMaxMemory() {
   size_t total = 0;
   // TODO: C++0x auto
-  for (Spaces::const_iterator cur = spaces_.begin(); cur != spaces_.end(); ++cur) {
-    if ((*cur)->IsAllocSpace()) {
-      total += (*cur)->AsAllocSpace()->Capacity();
+  for (Spaces::const_iterator it = spaces_.begin(); it != spaces_.end(); ++it) {
+    Space* space = *it;
+    if (space->IsAllocSpace()) {
+      total += space->AsAllocSpace()->Capacity();
     }
   }
   return total;
@@ -687,7 +723,7 @@
   // GC unless we clear soft references.
   if (!WaitForConcurrentGcToComplete() || clear_soft_references) {
     ScopedThreadStateChange tsc(Thread::Current(), kWaitingPerformingGc);
-    CollectGarbageInternal(have_zygote_space_, clear_soft_references);
+    CollectGarbageInternal(have_zygote_space_ ? GC_PARTIAL : GC_FULL, clear_soft_references);
   }
 }
 
@@ -700,7 +736,13 @@
     return;
   }
 
-  VLOG(heap) << "Starting PreZygoteFork with alloc space size " << PrettySize(GetBytesAllocated());
+  VLOG(heap) << "Starting PreZygoteFork with alloc space size " << PrettySize(alloc_space_->Size());
+
+  {
+    // Flush the alloc stack.
+    WriterMutexLock mu(*GlobalSynchronization::heap_bitmap_lock_);
+    FlushAllocStack();
+  }
 
   // Replace the first alloc space we find with a zygote space.
   // TODO: C++0x auto
@@ -721,7 +763,40 @@
   }
 }
 
-void Heap::CollectGarbageInternal(bool partial_gc, bool clear_soft_references) {
+void Heap::FlushAllocStack() {
+  MarkStackAsLive(allocation_stack_.get());
+  allocation_stack_->Reset();
+}
+
+void Heap::MarkStackAsLive(MarkStack* alloc_stack) {
+  // We can just assume everything is inside the alloc_space_'s bitmap since we should only have
+  // fresh allocations.
+  SpaceBitmap* live_bitmap = alloc_space_->GetLiveBitmap();
+
+  // Empty the allocation stack.
+  const size_t count = alloc_stack->Size();
+  for (size_t i = 0; i < count; ++i) {
+    const Object* obj = alloc_stack->Get(i);
+    DCHECK(obj != NULL);
+    live_bitmap->Set(obj);
+  }
+}
+
+void Heap::UnMarkStack(MarkStack* alloc_stack) {
+  SpaceBitmap* mark_bitmap = alloc_space_->GetMarkBitmap();
+
+  // Clear the mark bit for each object in the allocation stack.
+  size_t count = alloc_stack->Size();
+  for (size_t i = 0; i < count; ++i) {
+    const Object* obj = alloc_stack->Get(i);
+    DCHECK(obj != NULL);
+    if (mark_bitmap->Test(obj)) {
+      mark_bitmap->Clear(obj);
+    }
+  }
+}
+
+void Heap::CollectGarbageInternal(GcType gc_type, bool clear_soft_references) {
   GlobalSynchronization::mutator_lock_->AssertNotHeld();
 #ifndef NDEBUG
   {
@@ -747,11 +822,28 @@
     }
   }
   gc_complete_lock_->AssertNotHeld();
-  if (concurrent_gc_) {
-    CollectGarbageConcurrentMarkSweepPlan(partial_gc, clear_soft_references);
-  } else {
-    CollectGarbageMarkSweepPlan(partial_gc, clear_soft_references);
+
+  // We need to do partial GCs every now and then to avoid the heap growing too much and
+  // fragmenting.
+  if (gc_type == GC_STICKY && ++sticky_gc_count_ > kPartialGCFrequency) {
+    gc_type = GC_PARTIAL;
   }
+  if (gc_type != GC_STICKY) {
+    sticky_gc_count_ = 0;
+  }
+
+  uint64_t start_time = NanoTime();
+  if (true || concurrent_gc_) {
+    CollectGarbageConcurrentMarkSweepPlan(gc_type, clear_soft_references);
+  } else {
+    CollectGarbageMarkSweepPlan(gc_type, clear_soft_references);
+  }
+  const uint64_t gc_duration = NanoTime() - start_time;
+  // For particularly slow GCs, print an additional warning.
+  if (gc_duration > MsToNs(100)) {
+    LOG(WARNING) << "Slow GC took " << PrettyDuration(gc_duration);
+  }
+
   gc_complete_lock_->AssertNotHeld();
   MutexLock mu(*gc_complete_lock_);
   is_gc_running_ = false;
@@ -759,25 +851,20 @@
   gc_complete_cond_->Broadcast();
 }
 
-void Heap::CollectGarbageMarkSweepPlan(bool partial_gc, bool clear_soft_references) {
-  TimingLogger timings("CollectGarbageInternal");
-  uint64_t t0 = NanoTime(), dirty_end = 0;
+void Heap::CollectGarbageMarkSweepPlan(GcType gc_type, bool clear_soft_references) {
+  TimingLogger timings("CollectGarbageInternal", true);
 
   // Suspend all threads are get exclusive access to the heap.
+  uint64_t start_time = NanoTime();
   ThreadList* thread_list = Runtime::Current()->GetThreadList();
   thread_list->SuspendAll();
   timings.AddSplit("SuspendAll");
   GlobalSynchronization::mutator_lock_->AssertExclusiveHeld();
 
-  size_t initial_size;
-  {
-    MutexLock mu(*statistics_lock_);
-    initial_size = num_bytes_allocated_;
-  }
+  size_t bytes_freed = 0;
   Object* cleared_references = NULL;
   {
     MarkSweep mark_sweep(mark_stack_.get());
-    timings.AddSplit("ctor");
 
     mark_sweep.Init();
     timings.AddSplit("Init");
@@ -786,16 +873,34 @@
     mod_union_table_->Init(&mark_sweep);
     zygote_mod_union_table_->Init(&mark_sweep);
 
+    // Swap allocation stack and live stack, enabling us to have new allocations during this GC.
+    MarkStack* temp = allocation_stack_.release();
+    allocation_stack_.reset(live_stack_.release());
+    live_stack_.reset(temp);
+
+    // We will need to know which cards were dirty for doing concurrent processing of dirty cards.
+    // TODO: Investigate using a mark stack instead of a vector.
+    std::vector<byte*> dirty_cards;
+    if (gc_type == GC_STICKY) {
+      for (Spaces::iterator it = spaces_.begin(); it != spaces_.end(); ++it) {
+        card_table_->GetDirtyCards(*it, dirty_cards);
+      }
+    }
+
     // Clear image space cards and keep track of cards we cleared in the mod-union table.
     for (Spaces::iterator it = spaces_.begin(); it != spaces_.end(); ++it) {
       Space* space = *it;
       if (space->IsImageSpace()) {
         mod_union_table_->ClearCards(*it);
+        timings.AddSplit("ClearModUnionCards");
       } else if (space->GetGcRetentionPolicy() == GCRP_FULL_COLLECT) {
         zygote_mod_union_table_->ClearCards(space);
+        timings.AddSplit("ClearZygoteCards");
+      } else {
+        card_table_->ClearSpaceCards(space);
+        timings.AddSplit("ClearCards");
       }
     }
-    timings.AddSplit("ClearCards");
 
 #if VERIFY_MOD_UNION
     mod_union_table_->Verify();
@@ -803,14 +908,36 @@
 #endif
 
     WriterMutexLock mu(*GlobalSynchronization::heap_bitmap_lock_);
-    if (partial_gc) {
+    if (gc_type == GC_PARTIAL) {
       // Copy the mark bits over from the live bits, do this as early as possible or else we can
       // accidentally un-mark roots.
       // Needed for scanning dirty objects.
-      mark_sweep.CopyMarkBits();
+      for (Spaces::iterator it = spaces_.begin(); it != spaces_.end(); ++it) {
+        if ((*it)->GetGcRetentionPolicy() == GCRP_FULL_COLLECT) {
+          mark_sweep.CopyMarkBits(*it);
+        }
+      }
       timings.AddSplit("CopyMarkBits");
+
+      // We can assume that everything < alloc_space_ start is marked at this point.
+      mark_sweep.SetCondemned(reinterpret_cast<Object*>(alloc_space_->Begin()));
+    } else if (gc_type == GC_STICKY) {
+      for (Spaces::iterator it = spaces_.begin(); it != spaces_.end(); ++it) {
+        if ((*it)->GetGcRetentionPolicy() != GCRP_NEVER_COLLECT) {
+          mark_sweep.CopyMarkBits(*it);
+        }
+      }
+      timings.AddSplit("CopyMarkBits");
+
+      if (VERIFY_OBJECT_ENABLED) {
+        UnMarkStack(live_stack_.get());
+      }
+
+      mark_sweep.SetCondemned(reinterpret_cast<Object*>(alloc_space_->Begin()));
     }
 
+    MarkStackAsLive(live_stack_.get());
+
     mark_sweep.MarkRoots();
     timings.AddSplit("MarkRoots");
 
@@ -818,13 +945,11 @@
     DCHECK(mark_sweep.IsMarkStackEmpty());
 
     // Update zygote mod union table.
-    if (partial_gc) {
-      zygote_mod_union_table_->Update();
-      timings.AddSplit("UpdateZygoteModUnionTable");
+    zygote_mod_union_table_->Update();
+    timings.AddSplit("UpdateZygoteModUnionTable");
 
-      zygote_mod_union_table_->MarkReferences();
-      timings.AddSplit("ZygoteMarkReferences");
-    }
+    zygote_mod_union_table_->MarkReferences();
+    timings.AddSplit("ZygoteMarkReferences");
 
     // Processes the cards we cleared earlier and adds their objects into the mod-union table.
     mod_union_table_->Update();
@@ -835,75 +960,86 @@
     timings.AddSplit("MarkImageToAllocSpaceReferences");
 
     // Recursively mark all the non-image bits set in the mark bitmap.
-    mark_sweep.RecursiveMark(partial_gc);
-    timings.AddSplit(partial_gc ? "PartialMark" : "RecursiveMark");
+    if (gc_type != GC_STICKY) {
+      live_stack_->Reset();
+      mark_sweep.RecursiveMark(gc_type == GC_PARTIAL, timings);
+    } else {
+      mark_sweep.RecursiveMarkCards(card_table_.get(), dirty_cards, timings);
+    }
 
+    // Need to process references before the swap since it uses IsMarked.
     mark_sweep.ProcessReferences(clear_soft_references);
     timings.AddSplit("ProcessReferences");
 
-    // Swap the live and mark bitmaps for each alloc space. This is needed since sweep re-swaps
-    // these bitmaps. Doing this enables us to sweep with the heap unlocked since new allocations
-    // set the live bit, but since we have the bitmaps reversed at this point, this sets the mark bit
-    // instead, resulting in no new allocated objects being incorrectly freed by sweep.
-    for (Spaces::iterator it = spaces_.begin(); it != spaces_.end(); ++it) {
-      Space* space = *it;
-      // We never allocate into zygote spaces.
-      if (space->GetGcRetentionPolicy() == GCRP_ALWAYS_COLLECT) {
-        live_bitmap_->ReplaceBitmap(space->GetLiveBitmap(), space->GetMarkBitmap());
-        mark_bitmap_->ReplaceBitmap(space->GetMarkBitmap(), space->GetLiveBitmap());
-        space->AsAllocSpace()->SwapBitmaps();
+    // This doesn't work with mutators unpaused for some reason; TODO: fix.
+    mark_sweep.SweepSystemWeaks(false);
+    timings.AddSplit("SweepSystemWeaks");
+
+    // Need to swap for VERIFY_OBJECT_ENABLED since we put things in the live bitmap after they
+    // have been allocated.
+    const bool swap = true;
+
+    if (swap) {
+      // Swap the live and mark bitmaps for each alloc space. This is needed since sweep re-swaps
+      // these bitmaps. Doing this enables us to sweep with the heap unlocked since new allocations
+      // set the live bit, but since we have the bitmaps reversed at this point, this sets the mark bit
+      // instead, resulting in no new allocated objects being incorrectly freed by sweep.
+      for (Spaces::iterator it = spaces_.begin(); it != spaces_.end(); ++it) {
+        Space* space = *it;
+        // We only allocate into AllocSpace, so we only need to swap AllocSpaces.
+        if (space->GetGcRetentionPolicy() == GCRP_ALWAYS_COLLECT) {
+          live_bitmap_->ReplaceBitmap(space->GetLiveBitmap(), space->GetMarkBitmap());
+          mark_bitmap_->ReplaceBitmap(space->GetMarkBitmap(), space->GetLiveBitmap());
+          space->AsAllocSpace()->SwapBitmaps();
+        }
       }
     }
 
+#ifndef NDEBUG
     // Verify that we only reach marked objects from the image space
     mark_sweep.VerifyImageRoots();
     timings.AddSplit("VerifyImageRoots");
+#endif
 
-    mark_sweep.Sweep(partial_gc);
+    if (gc_type != GC_STICKY) {
+      mark_sweep.Sweep(gc_type == GC_PARTIAL, swap);
+    } else {
+      mark_sweep.SweepArray(timings, live_stack_.get(), swap);
+    }
     timings.AddSplit("Sweep");
 
     cleared_references = mark_sweep.GetClearedReferences();
+    bytes_freed = mark_sweep.GetFreedBytes();
   }
 
   GrowForUtilization();
   timings.AddSplit("GrowForUtilization");
 
   thread_list->ResumeAll();
-  dirty_end = NanoTime();
+  timings.AddSplit("ResumeAll");
 
   EnqueueClearedReferences(&cleared_references);
   RequestHeapTrim();
   timings.AddSplit("Finish");
 
-  if (VLOG_IS_ON(gc)) {
-    uint64_t t1 = NanoTime();
-
+  // If the GC was slow, then print timings in the log.
+  uint64_t duration = (NanoTime() - start_time) / 1000 * 1000;
+  if (duration > MsToNs(50)) {
     MutexLock mu(*statistics_lock_);
-    // TODO: somehow make the specific GC implementation (here MarkSweep) responsible for logging.
-    // Reason: For CMS sometimes initial_size < num_bytes_allocated_ results in overflow (3GB freed message).
-    size_t bytes_freed = initial_size - num_bytes_allocated_;
-    uint64_t duration_ns = t1 - t0;
-    duration_ns -= duration_ns % 1000;
-
-    // If the GC was slow, then print timings in the log.
-    if (duration_ns > MsToNs(50)) {
-      uint64_t markSweepTime = (dirty_end - t0) / 1000 * 1000;
-      LOG(INFO) << (partial_gc ? "Partial " : "")
-                      << "GC freed " << PrettySize(bytes_freed) << ", " << GetPercentFree() << "% free, "
-                      << PrettySize(num_bytes_allocated_) << "/" << PrettySize(GetTotalMemory()) << ", "
-                      << "paused " << PrettyDuration(markSweepTime)
-                      << ", total " << PrettyDuration(duration_ns);
-    }
+    LOG(INFO) << (gc_type == GC_PARTIAL ? "Partial " : (gc_type == GC_STICKY ? "Sticky " : ""))
+              << "GC freed " << PrettySize(bytes_freed) << ", " << GetPercentFree() << "% free, "
+              << PrettySize(num_bytes_allocated_) << "/" << PrettySize(GetTotalMemory()) << ", "
+              << "paused " << PrettyDuration(duration);
   }
-  Dbg::GcDidFinish();
+
   if (VLOG_IS_ON(heap)) {
     timings.Dump();
   }
 }
 
-void Heap::CollectGarbageConcurrentMarkSweepPlan(bool partial_gc, bool clear_soft_references) {
-  TimingLogger timings("CollectGarbageInternal");
-  uint64_t t0 = NanoTime(), root_end = 0, dirty_begin = 0, dirty_end = 0;
+void Heap::CollectGarbageConcurrentMarkSweepPlan(GcType gc_type, bool clear_soft_references) {
+  TimingLogger timings("ConcurrentCollectGarbageInternal", true);
+  uint64_t root_begin = NanoTime(), root_end = 0, dirty_begin = 0, dirty_end = 0;
 
   // Suspend all threads are get exclusive access to the heap.
   ThreadList* thread_list = Runtime::Current()->GetThreadList();
@@ -911,11 +1047,7 @@
   timings.AddSplit("SuspendAll");
   GlobalSynchronization::mutator_lock_->AssertExclusiveHeld();
 
-  size_t initial_size;
-  {
-    MutexLock mu(*statistics_lock_);
-    initial_size = num_bytes_allocated_;
-  }
+  size_t bytes_freed = 0;
   Object* cleared_references = NULL;
   {
     MarkSweep mark_sweep(mark_stack_.get());
@@ -924,6 +1056,20 @@
     mark_sweep.Init();
     timings.AddSplit("Init");
 
+    // Swap the stacks; this is safe since all the mutators are suspended at this point.
+    MarkStack* temp = allocation_stack_.release();
+    allocation_stack_.reset(live_stack_.release());
+    live_stack_.reset(temp);
+
+    // We will need to know which cards were dirty for doing concurrent processing of dirty cards.
+    // TODO: Investigate using a mark stack instead of a vector.
+    std::vector<byte*> dirty_cards;
+    if (gc_type == GC_STICKY) {
+      for (Spaces::iterator it = spaces_.begin(); it != spaces_.end(); ++it) {
+        card_table_->GetDirtyCards(*it, dirty_cards);
+      }
+    }
+
     // Make sure that the tables have the correct pointer for the mark sweep.
     mod_union_table_->Init(&mark_sweep);
     zygote_mod_union_table_->Init(&mark_sweep);
@@ -933,31 +1079,59 @@
       Space* space = *it;
       if (space->IsImageSpace()) {
         mod_union_table_->ClearCards(*it);
+        timings.AddSplit("ModUnionClearCards");
       } else if (space->GetGcRetentionPolicy() == GCRP_FULL_COLLECT) {
         zygote_mod_union_table_->ClearCards(space);
+        timings.AddSplit("ZygoteModUnionClearCards");
       } else {
         card_table_->ClearSpaceCards(space);
+        timings.AddSplit("ClearCards");
       }
     }
-    timings.AddSplit("ClearCards");
 
 #if VERIFY_MOD_UNION
     mod_union_table_->Verify();
     zygote_mod_union_table_->Verify();
 #endif
 
-    if (partial_gc) {
-      // Copy the mark bits over from the live bits, do this as early as possible or else we can
-      // accidentally un-mark roots.
-      // Needed for scanning dirty objects.
-      mark_sweep.CopyMarkBits();
-      timings.AddSplit("CopyMarkBits");
-    }
 
     {
       WriterMutexLock mu(*GlobalSynchronization::heap_bitmap_lock_);
-      mark_sweep.MarkRoots();
-      timings.AddSplit("MarkRoots");
+
+      if (gc_type == GC_PARTIAL) {
+        // Copy the mark bits over from the live bits, do this as early as possible or else we can
+        // accidentally un-mark roots.
+        // Needed for scanning dirty objects.
+        for (Spaces::iterator it = spaces_.begin(); it != spaces_.end(); ++it) {
+          if ((*it)->GetGcRetentionPolicy() == GCRP_FULL_COLLECT) {
+            mark_sweep.CopyMarkBits(*it);
+          }
+        }
+        timings.AddSplit("CopyMarkBits");
+        mark_sweep.SetCondemned(reinterpret_cast<Object*>(alloc_space_->Begin()));
+      } else if (gc_type == GC_STICKY) {
+        for (Spaces::iterator it = spaces_.begin(); it != spaces_.end(); ++it) {
+          if ((*it)->GetGcRetentionPolicy() != GCRP_NEVER_COLLECT) {
+            mark_sweep.CopyMarkBits(*it);
+          }
+        }
+        timings.AddSplit("CopyMarkBits");
+        // We need to unmark the new objects since we marked them as live earlier to avoid verify
+        // objects failing.
+        if (VERIFY_OBJECT_ENABLED) {
+          UnMarkStack(live_stack_.get());
+        }
+        mark_sweep.SetCondemned(reinterpret_cast<Object*>(alloc_space_->Begin()));
+      }
+
+      // TODO: Investigate whether or not this is really necessary for sticky mark bits.
+      MarkStackAsLive(live_stack_.get());
+
+      if (gc_type != GC_STICKY) {
+        live_stack_->Reset();
+        mark_sweep.MarkRoots();
+        timings.AddSplit("MarkRoots");
+      }
     }
 
     // Roots are marked on the bitmap and the mark_stack is empty.
@@ -970,10 +1144,10 @@
       root_end = NanoTime();
       timings.AddSplit("RootEnd");
 
-      {
-        ReaderMutexLock mu(*GlobalSynchronization::heap_bitmap_lock_);
+      WriterMutexLock mu(*GlobalSynchronization::heap_bitmap_lock_);
+      if (gc_type != GC_STICKY) {
         // Update zygote mod union table.
-        if (partial_gc) {
+        if (gc_type == GC_PARTIAL) {
           zygote_mod_union_table_->Update();
           timings.AddSplit("UpdateZygoteModUnionTable");
 
@@ -984,16 +1158,16 @@
         // Processes the cards we cleared earlier and adds their objects into the mod-union table.
         mod_union_table_->Update();
         timings.AddSplit("UpdateModUnionTable");
-      }
-      {
-        WriterMutexLock mu(*GlobalSynchronization::heap_bitmap_lock_);
+
         // Scans all objects in the mod-union table.
         mod_union_table_->MarkReferences();
         timings.AddSplit("MarkImageToAllocSpaceReferences");
 
         // Recursively mark all the non-image bits set in the mark bitmap.
-        mark_sweep.RecursiveMark(partial_gc);
-        timings.AddSplit(partial_gc ? "PartialMark" : "RecursiveMark");
+        mark_sweep.RecursiveMark(gc_type == GC_PARTIAL, timings);
+      } else {
+        mark_sweep.RecursiveMarkCards(card_table_.get(), dirty_cards, timings);
+        mark_sweep.DisableFinger();
       }
     }
     // Release share on mutator_lock_ and then get exclusive access.
@@ -1004,24 +1178,30 @@
 
     {
       WriterMutexLock mu(*GlobalSynchronization::heap_bitmap_lock_);
+
       // Re-mark root set.
       mark_sweep.ReMarkRoots();
       timings.AddSplit("ReMarkRoots");
 
       // Scan dirty objects, this is only required if we are not doing concurrent GC.
-      mark_sweep.RecursiveMarkDirtyObjects();
+      mark_sweep.RecursiveMarkDirtyObjects(false);
       timings.AddSplit("RecursiveMarkDirtyObjects");
     }
     {
       ReaderMutexLock mu(*GlobalSynchronization::heap_bitmap_lock_);
       mark_sweep.ProcessReferences(clear_soft_references);
       timings.AddSplit("ProcessReferences");
+
+      // This doesn't work with mutators unpaused for some reason; TODO: fix.
+      mark_sweep.SweepSystemWeaks(false);
+      timings.AddSplit("SweepSystemWeaks");
     }
     // Swap the live and mark bitmaps for each alloc space. This is needed since sweep re-swaps
     // these bitmaps. Doing this enables us to sweep with the heap unlocked since new allocations
     // set the live bit, but since we have the bitmaps reversed at this point, this sets the mark
     // bit instead, resulting in no new allocated objects being incorrectly freed by sweep.
-    {
+    bool swap = true;
+    if (swap) {
       WriterMutexLock mu(*GlobalSynchronization::heap_bitmap_lock_);
       for (Spaces::iterator it = spaces_.begin(); it != spaces_.end(); ++it) {
         Space* space = *it;
@@ -1040,6 +1220,7 @@
       mark_sweep.VerifyImageRoots();
       timings.AddSplit("VerifyImageRoots");
     }
+
     thread_list->ResumeAll();
     dirty_end = NanoTime();
     GlobalSynchronization::mutator_lock_->AssertNotHeld();
@@ -1047,11 +1228,16 @@
     {
       // TODO: this lock shouldn't be necessary (it's why we did the bitmap flip above).
       WriterMutexLock mu(*GlobalSynchronization::heap_bitmap_lock_);
-      mark_sweep.Sweep(partial_gc);
+      if (gc_type != GC_STICKY) {
+        mark_sweep.Sweep(gc_type == GC_PARTIAL, swap);
+      } else {
+        mark_sweep.SweepArray(timings, live_stack_.get(), swap);
+      }
       timings.AddSplit("Sweep");
     }
 
     cleared_references = mark_sweep.GetClearedReferences();
+    bytes_freed = mark_sweep.GetFreedBytes();
   }
 
   GrowForUtilization();
@@ -1061,28 +1247,18 @@
   RequestHeapTrim();
   timings.AddSplit("Finish");
 
-  if (VLOG_IS_ON(gc)) {
-    uint64_t t1 = NanoTime();
-
+  // If the GC was slow, then print timings in the log.
+  uint64_t pause_roots = (root_end - root_begin) / 1000 * 1000;
+  uint64_t pause_dirty = (dirty_end - dirty_begin) / 1000 * 1000;
+  if (pause_roots > MsToNs(5) || pause_dirty > MsToNs(5)) {
     MutexLock mu(*statistics_lock_);
-    // TODO: somehow make the specific GC implementation (here MarkSweep) responsible for logging.
-    // Reason: For CMS sometimes initial_size < num_bytes_allocated_ results in overflow (3GB freed message).
-    size_t bytes_freed = initial_size - num_bytes_allocated_;
-    uint64_t duration_ns = t1 - t0;
-    duration_ns -= duration_ns % 1000;
-
-    // If the GC was slow, then print timings in the log.
-    uint64_t pause_roots = (root_end - t0) / 1000 * 1000;
-    uint64_t pause_dirty = (dirty_end - dirty_begin) / 1000 * 1000;
-    if (pause_roots > MsToNs(5) || pause_dirty > MsToNs(5)) {
-      LOG(INFO) << (partial_gc ? "Partial " : "")
-                      << "GC freed " << PrettySize(bytes_freed) << ", " << GetPercentFree() << "% free, "
-                      << PrettySize(num_bytes_allocated_) << "/" << PrettySize(GetTotalMemory()) << ", "
-                      << "paused " << PrettyDuration(pause_roots) << "+" << PrettyDuration(pause_dirty)
-                      << ", total " << PrettyDuration(duration_ns);
-    }
+    LOG(INFO) << (gc_type == GC_PARTIAL ? "Partial " : (gc_type == GC_STICKY ? "Sticky " : ""))
+              << "Concurrent GC freed " << PrettySize(bytes_freed) << ", " << GetPercentFree()
+              << "% free, " << PrettySize(num_bytes_allocated_) << "/"
+              << PrettySize(GetTotalMemory()) << ", " << "paused " << PrettyDuration(pause_roots)
+              << "+" << PrettyDuration(pause_dirty);
   }
-  Dbg::GcDidFinish();
+
   if (VLOG_IS_ON(heap)) {
     timings.Dump();
   }
@@ -1173,6 +1349,7 @@
       use_footprint_limit = true;
     }
   }
+
   if (use_footprint_limit) {
     size_t foot_print_limit = alloc_space_->GetFootprintLimit();
     MutexLock mu(*statistics_lock_);
@@ -1324,13 +1501,18 @@
   if (Runtime::Current()->IsShuttingDown() || !concurrent_gc_) {
     return;
   }
+
   // TODO: We shouldn't need a WaitForConcurrentGcToComplete here since only
   //       concurrent GC resumes threads before the GC is completed and this function
   //       is only called within the GC daemon thread.
   if (!WaitForConcurrentGcToComplete()) {
     // Start a concurrent GC as one wasn't in progress
     ScopedThreadStateChange tsc(Thread::Current(), kWaitingPerformingGc);
-    CollectGarbageInternal(have_zygote_space_, false);
+    if (alloc_space_->Size() > kMinAllocSpaceSizeForStickyGC) {
+      CollectGarbageInternal(GC_STICKY, false);
+    } else {
+      CollectGarbageInternal(GC_PARTIAL, false);
+    }
   }
 }
 
diff --git a/src/heap.h b/src/heap.h
index 89b6ac4..b6536d0 100644
--- a/src/heap.h
+++ b/src/heap.h
@@ -46,15 +46,36 @@
 class Space;
 class SpaceTest;
 class Thread;
+class TimingLogger;
 
 typedef std::vector<Space*> Spaces;
 
+enum GcType {
+  // Full GC
+  GC_FULL,
+  // Sticky mark bits "generational" GC.
+  GC_STICKY,
+  // Partial GC, over only the alloc space
+  GC_PARTIAL,
+};
+
 class LOCKABLE Heap {
  public:
   static const size_t kInitialSize = 2 * MB;
 
   static const size_t kMaximumSize = 32 * MB;
 
+  // Number of sticky GCs after which we force a partial GC instead of another sticky mark bits GC.
+  static const size_t kPartialGCFrequency = 10;
+
+  // Sticky mark bits GC has some overhead, so if we have less than a few megabytes of AllocSpace then
+  // it's probably better to just do a partial GC.
+  static const size_t kMinAllocSpaceSizeForStickyGC = 6 * MB;
+
+  // Minimum remaining space for sticky GC. Since sticky GC doesn't free up as much memory as a
+  // normal GC, it is important to not use it when we are almost out of memory.
+  static const size_t kMinRemainingSpaceForStickyGC = 1 * MB;
+
   typedef void (RootVisitor)(const Object* root, void* arg);
   typedef bool (IsMarkedTester)(const Object* object, void* arg);
 
@@ -228,6 +249,16 @@
 
   void PreZygoteFork();
 
+  // Mark the objects in the allocation stack as live, then empty it.
+  void FlushAllocStack()
+      EXCLUSIVE_LOCKS_REQUIRED(GlobalSynchronization::heap_bitmap_lock_);
+
+  // Mark all the objects in the allocation stack as live.
+  void MarkStackAsLive(MarkStack* alloc_stack);
+
+  // Un-mark all the objects in the allocation stack.
+  void UnMarkStack(MarkStack* alloc_stack);
+
   // DEPRECATED: Should remove in "near" future when support for multiple image spaces is added.
   // Assumes there is only one image space.
   ImageSpace* GetImageSpace();
@@ -251,15 +282,15 @@
   void RecordAllocation(AllocSpace* space, const Object* object)
       LOCKS_EXCLUDED(statistics_lock_, GlobalSynchronization::heap_bitmap_lock_);
 
-  void CollectGarbageInternal(bool partial_gc, bool clear_soft_references)
+  void CollectGarbageInternal(GcType gc_plan, bool clear_soft_references)
       LOCKS_EXCLUDED(gc_complete_lock_,
                      GlobalSynchronization::heap_bitmap_lock_,
                      GlobalSynchronization::mutator_lock_,
                      GlobalSynchronization::thread_suspend_count_lock_);
-  void CollectGarbageMarkSweepPlan(bool partial_gc, bool clear_soft_references)
+  void CollectGarbageMarkSweepPlan(GcType gc_plan, bool clear_soft_references)
       LOCKS_EXCLUDED(GlobalSynchronization::heap_bitmap_lock_,
                      GlobalSynchronization::mutator_lock_);
-  void CollectGarbageConcurrentMarkSweepPlan(bool partial_gc, bool clear_soft_references)
+  void CollectGarbageConcurrentMarkSweepPlan(GcType gc_plan, bool clear_soft_references)
       LOCKS_EXCLUDED(GlobalSynchronization::heap_bitmap_lock_,
                      GlobalSynchronization::mutator_lock_);
 
@@ -272,8 +303,10 @@
 
   void AddSpace(Space* space) LOCKS_EXCLUDED(GlobalSynchronization::heap_bitmap_lock_);
 
-  void VerifyObjectLocked(const Object *obj)
-      SHARED_LOCKS_REQUIRED(GlobalSychronization::heap_bitmap_lock_);
+  // No thread safety analysis since we call this everywhere and it is impossible to find a proper
+  // lock ordering for it.
+  void VerifyObjectBody(const Object *obj)
+      NO_THREAD_SAFETY_ANALYSIS;
 
   static void VerificationCallback(Object* obj, void* arg)
       SHARED_LOCKS_REQUIRED(GlobalSychronization::heap_bitmap_lock_);
@@ -318,6 +351,7 @@
   size_t concurrent_start_bytes_ GUARDED_BY(statistics_lock_);
   size_t concurrent_start_size_;
   size_t concurrent_min_free_;
+  size_t sticky_gc_count_;
 
   // Number of bytes allocated.  Adjusted after each allocation and free.
   size_t num_bytes_allocated_ GUARDED_BY(statistics_lock_);
@@ -342,6 +376,13 @@
   // Mark stack that we reuse to avoid re-allocating the mark stack
   UniquePtr<MarkStack> mark_stack_;
 
+  // Allocation stack; new allocations go here so that we can do sticky mark bits. This enables us
+  // to use the live bitmap from the previous GC as the mark bitmap.
+  UniquePtr<MarkStack> allocation_stack_;
+
+  // Second allocation stack so that we can process allocations with the heap unlocked.
+  UniquePtr<MarkStack> live_stack_;
+
   // offset of java.lang.ref.Reference.referent
   MemberOffset reference_referent_offset_;
 
diff --git a/src/heap_bitmap.cc b/src/heap_bitmap.cc
index 50a037b..9cab8c2 100644
--- a/src/heap_bitmap.cc
+++ b/src/heap_bitmap.cc
@@ -20,8 +20,8 @@
   // Check for interval overlap.
   for (Bitmaps::const_iterator it = bitmaps_.begin(); it != bitmaps_.end(); ++it) {
     SpaceBitmap* cur_bitmap = *it;
-    if (bitmap->HeapBegin() < cur_bitmap->HeapSize() + cur_bitmap->HeapSize() &&
-        bitmap->HeapBegin() + bitmap->HeapSize() > cur_bitmap->HeapBegin()) {
+    if (bitmap->HeapBegin() < cur_bitmap->HeapLimit() &&
+        bitmap->HeapLimit() > cur_bitmap->HeapBegin()) {
       LOG(FATAL) << "Overlapping space bitmaps added to heap bitmap!";
     }
   }
diff --git a/src/hprof/hprof.cc b/src/hprof/hprof.cc
index d0c87be..356c1fb 100644
--- a/src/hprof/hprof.cc
+++ b/src/hprof/hprof.cc
@@ -408,6 +408,10 @@
     current_record_.StartNewRecord(body_fp_, HPROF_TAG_HEAP_DUMP_SEGMENT, HPROF_TIME);
     Runtime::Current()->VisitRoots(RootVisitor, this);
     {
+      WriterMutexLock mu(*GlobalSynchronization::heap_bitmap_lock_);
+      Runtime::Current()->GetHeap()->FlushAllocStack();
+    }
+    {
       ReaderMutexLock mu(*GlobalSynchronization::heap_bitmap_lock_);
       Runtime::Current()->GetHeap()->GetLiveBitmap()->Walk(HeapBitmapCallback, this);
     }
diff --git a/src/image_test.cc b/src/image_test.cc
index 9c947c1..82b4760 100644
--- a/src/image_test.cc
+++ b/src/image_test.cc
@@ -47,6 +47,7 @@
       EXPECT_TRUE(klass != NULL) << descriptor;
     }
   }
+
   ImageWriter writer(NULL);
   ScratchFile tmp_image;
   const uintptr_t requested_image_base = 0x60000000;
diff --git a/src/image_writer.cc b/src/image_writer.cc
index 7c88c95..969f4ab0 100644
--- a/src/image_writer.cc
+++ b/src/image_writer.cc
@@ -92,7 +92,10 @@
     return false;
   }
 #ifndef NDEBUG
-  CheckNonImageClassesRemoved();
+  {
+    ScopedObjectAccess soa(Thread::Current());
+    CheckNonImageClassesRemoved();
+  }
 #endif
   heap->DisableCardMarking();
   {
@@ -185,10 +188,13 @@
   }
 }
 
-void ImageWriter::ComputeEagerResolvedStrings() {
+void ImageWriter::ComputeEagerResolvedStrings()
+    SHARED_LOCKS_REQUIRED(GlobalSynchronization::mutator_lock_) {
   // TODO: Check image spaces only?
+  Heap* heap = Runtime::Current()->GetHeap();
   ReaderMutexLock mu(*GlobalSynchronization::heap_bitmap_lock_);
-  Runtime::Current()->GetHeap()->GetLiveBitmap()->Walk(ComputeEagerResolvedStringsCallback, this);
+  heap->FlushAllocStack();
+  heap->GetLiveBitmap()->Walk(ComputeEagerResolvedStringsCallback, this);
 }
 
 bool ImageWriter::IsImageClass(const Class* klass) {
@@ -263,13 +269,20 @@
   return true;
 }
 
-void ImageWriter::CheckNonImageClassesRemoved() {
+void ImageWriter::CheckNonImageClassesRemoved()
+    SHARED_LOCKS_REQUIRED(GlobalSynchronization::mutator_lock_) {
   if (image_classes_ == NULL) {
     return;
   }
 
+  Heap* heap = Runtime::Current()->GetHeap();
+  {
+    WriterMutexLock mu(*GlobalSynchronization::heap_bitmap_lock_);
+    heap->FlushAllocStack();
+  }
+
   ReaderMutexLock mu(*GlobalSynchronization::heap_bitmap_lock_);
-  Runtime::Current()->GetHeap()->GetLiveBitmap()->Walk(CheckNonImageClassesRemovedCallback, this);
+  heap->GetLiveBitmap()->Walk(CheckNonImageClassesRemovedCallback, this);
 }
 
 void ImageWriter::CheckNonImageClassesRemovedCallback(Object* obj, void* arg) {
@@ -379,10 +392,18 @@
   // know where image_roots is going to end up
   image_end_ += RoundUp(sizeof(ImageHeader), 8); // 64-bit-alignment
 
+  {
+    Heap* heap = Runtime::Current()->GetHeap();
+    ReaderMutexLock mu(*GlobalSynchronization::heap_bitmap_lock_);
+    heap->FlushAllocStack();
+  }
+
   // TODO: Image spaces only?
-  for (SpaceVec::const_iterator cur = spaces.begin(); cur != spaces.end(); ++cur) {
-    (*cur)->GetLiveBitmap()->InOrderWalk(CalculateNewObjectOffsetsCallback, this);
-    DCHECK_LT(image_end_, image_->Size());
+  {
+    for (SpaceVec::const_iterator cur = spaces.begin(); cur != spaces.end(); ++cur) {
+      (*cur)->GetLiveBitmap()->InOrderWalk(CalculateNewObjectOffsetsCallback, this);
+      DCHECK_LT(image_end_, image_->Size());
+    }
   }
 
   // Note that image_top_ is left at end of used space
@@ -398,12 +419,14 @@
   memcpy(image_->Begin(), &image_header, sizeof(image_header));
 }
 
-void ImageWriter::CopyAndFixupObjects() {
+void ImageWriter::CopyAndFixupObjects()
+    SHARED_LOCKS_REQUIRED(GlobalSynchronization::mutator_lock_) {
   Heap* heap = Runtime::Current()->GetHeap();
   // TODO: heap validation can't handle this fix up pass
   heap->DisableObjectValidation();
   // TODO: Image spaces only?
   ReaderMutexLock mu(*GlobalSynchronization::heap_bitmap_lock_);
+  heap->FlushAllocStack();
   heap->GetLiveBitmap()->Walk(CopyAndFixupObjectsCallback, this);
 }
 
diff --git a/src/mark_stack.cc b/src/mark_stack.cc
index e1aa616..2f18898 100644
--- a/src/mark_stack.cc
+++ b/src/mark_stack.cc
@@ -23,15 +23,14 @@
 
 namespace art {
 
-MarkStack* MarkStack::Create() {
-  UniquePtr<MarkStack> mark_stack(new MarkStack);
-  mark_stack->Init();
+MarkStack* MarkStack::Create(const std::string& name, size_t length) {
+  UniquePtr<MarkStack> mark_stack(new MarkStack(name));
+  mark_stack->Init(length);
   return mark_stack.release();
 }
 
-void MarkStack::Init() {
-  size_t length = 64 * MB;
-  mem_map_.reset(MemMap::MapAnonymous("dalvik-mark-stack", NULL, length, PROT_READ | PROT_WRITE));
+void MarkStack::Init(size_t length) {
+  mem_map_.reset(MemMap::MapAnonymous(name_.c_str(), NULL, length, PROT_READ | PROT_WRITE));
   if (mem_map_.get() == NULL) {
     std::string maps;
     ReadFileToString("/proc/self/maps", &maps);
diff --git a/src/mark_stack.h b/src/mark_stack.h
index 59e12d0..73a77b4 100644
--- a/src/mark_stack.h
+++ b/src/mark_stack.h
@@ -17,6 +17,9 @@
 #ifndef ART_SRC_MARK_STACK_H_
 #define ART_SRC_MARK_STACK_H_
 
+#include <string>
+
+#include "atomic.h"
 #include "UniquePtr.h"
 #include "logging.h"
 #include "macros.h"
@@ -28,7 +31,8 @@
 
 class MarkStack {
  public:
-  static MarkStack* Create();
+  // Length is in bytes.
+  static MarkStack* Create(const std::string& name, size_t length);
 
   ~MarkStack();
 
@@ -39,6 +43,15 @@
     ++ptr_;
   }
 
+  // Beware: Atomic pushes and pops don't mix well.
+  void AtomicPush(const Object* obj) {
+    DCHECK(obj != NULL);
+    DCHECK_NE(ptr_, limit_);
+    DCHECK_EQ(sizeof(ptr_), sizeof(int32_t));
+    int32_t* ptr  = reinterpret_cast<int32_t*>(&ptr_);
+    *reinterpret_cast<const Object**>(android_atomic_add(sizeof(*ptr_), ptr)) = obj;
+  }
+
   const Object* Pop() {
     DCHECK_NE(ptr_, begin_);
     --ptr_;
@@ -50,12 +63,29 @@
     return ptr_ == begin_;
   }
 
+  size_t Size() const {
+    DCHECK_GE(ptr_, begin_);
+    return ptr_ - begin_;
+  }
+
+  const Object* Get(size_t index) const {
+    DCHECK_LT(index, Size());
+    return begin_[index];
+  }
+
+  Object** Begin() {
+    return const_cast<Object**>(begin_);
+  }
+
   void Reset();
 
  private:
-  MarkStack() : begin_(NULL), limit_(NULL), ptr_(NULL) {}
+  MarkStack(const std::string& name) : name_(name), begin_(NULL), limit_(NULL), ptr_(NULL) {}
 
-  void Init();
+  void Init(size_t length);
+
+  // Name of the mark stack.
+  const std::string name_;
 
   // Memory mapping of the mark stack.
   UniquePtr<MemMap> mem_map_;
@@ -67,7 +97,7 @@
   const Object* const* limit_;
 
   // Pointer to the top of the mark stack.
-  const Object**  ptr_;
+  const Object** ptr_;
 
   DISALLOW_COPY_AND_ASSIGN(MarkStack);
 };
diff --git a/src/mark_sweep.cc b/src/mark_sweep.cc
index 227614d..c378234 100644
--- a/src/mark_sweep.cc
+++ b/src/mark_sweep.cc
@@ -19,6 +19,7 @@
 #include <climits>
 #include <vector>
 
+#include "card_table.h"
 #include "class_loader.h"
 #include "dex_cache.h"
 #include "heap.h"
@@ -34,8 +35,24 @@
 #include "timing_logger.h"
 #include "thread.h"
 
+#define MARK_STACK_PREFETCH 1
+
 namespace art {
 
+class SetFingerVisitor {
+ public:
+  SetFingerVisitor(MarkSweep* const mark_sweep) : mark_sweep_(mark_sweep) {
+
+  }
+
+  void operator ()(void* finger) const {
+    mark_sweep_->SetFinger(reinterpret_cast<Object*>(finger));
+  }
+
+ private:
+  MarkSweep* const mark_sweep_;
+};
+
 MarkSweep::MarkSweep(MarkStack* mark_stack)
     : current_mark_bitmap_(NULL),
       mark_stack_(mark_stack),
@@ -47,6 +64,7 @@
       finalizer_reference_list_(NULL),
       phantom_reference_list_(NULL),
       cleared_reference_list_(NULL),
+      freed_bytes_(0), freed_objects_(0),
       class_count_(0), array_count_(0), other_count_(0) {
   DCHECK(mark_stack_ != NULL);
 }
@@ -58,35 +76,45 @@
   const Spaces& spaces = heap_->GetSpaces();
   // TODO: C++0x auto
   for (Spaces::const_iterator cur = spaces.begin(); cur != spaces.end(); ++cur) {
-    if ((*cur)->GetGcRetentionPolicy() == GCRP_ALWAYS_COLLECT) {
+    if (current_mark_bitmap_ == NULL || (*cur)->GetGcRetentionPolicy() == GCRP_ALWAYS_COLLECT) {
       current_mark_bitmap_ = (*cur)->GetMarkBitmap();
       break;
     }
   }
+  if (current_mark_bitmap_ == NULL) {
+    GetHeap()->DumpSpaces();
+    DCHECK(false) << "current_mark_bitmap_ == NULL";
+  }
   // TODO: if concurrent, enable card marking in compiler
   // TODO: check that the mark bitmap is entirely clear.
 }
 
-void MarkSweep::MarkObject0(const Object* obj, bool check_finger) {
+inline void MarkSweep::MarkObject0(const Object* obj, bool check_finger) {
   DCHECK(obj != NULL);
 
-  SpaceBitmap* space_bitmap = NULL;
-  // Try to take advantage of locality of references within a space, failing this find the space
-  // the hard way.
-  if (current_mark_bitmap_->HasAddress(obj)) {
-    space_bitmap = current_mark_bitmap_;
-  } else {
-    space_bitmap = heap_->GetMarkBitmap()->GetSpaceBitmap(obj);
-  }
-
   if (obj < condemned_) {
-    DCHECK(IsMarked(obj));
+    // May later use condemned for the NULL check.
+    DCHECK(obj == NULL || IsMarked(obj));
     return;
   }
-  bool is_marked = space_bitmap->Test(obj);
+
+  // Try to take advantage of locality of references within a space, failing this find the space
+  // the hard way.
+  if (!current_mark_bitmap_->HasAddress(obj)) {
+    current_mark_bitmap_ = heap_->GetMarkBitmap()->GetSpaceBitmap(obj);
+#ifndef NDEBUG
+    if (current_mark_bitmap_ == NULL) {
+      LOG(WARNING) << obj;
+      GetHeap()->DumpSpaces();
+      LOG(FATAL) << "space_bitmap == NULL";
+    }
+#endif
+  }
+
+  bool is_marked = current_mark_bitmap_->Test(obj);
   // This object was not previously marked.
   if (!is_marked) {
-    space_bitmap->Set(obj);
+    current_mark_bitmap_->Set(obj);
     if (check_finger && obj < finger_) {
       // The object must be pushed on to the mark stack.
       mark_stack_->Push(obj);
@@ -109,7 +137,6 @@
   DCHECK(root != NULL);
   DCHECK(arg != NULL);
   MarkSweep* mark_sweep = reinterpret_cast<MarkSweep*>(arg);
-  DCHECK(mark_sweep->finger_ == NULL);  // no point to check finger if it is NULL
   mark_sweep->MarkObject0(root, false);
 }
 
@@ -156,17 +183,10 @@
   mark_sweep->CheckObject(root);
 }
 
-void MarkSweep::CopyMarkBits() {
-  const std::vector<Space*>& spaces = heap_->GetSpaces();
-  for (size_t i = 0; i < spaces.size(); ++i) {
-    Space* space = spaces[i];
-    if (space->GetGcRetentionPolicy() == GCRP_FULL_COLLECT) {
-      SpaceBitmap* live_bitmap = space->GetLiveBitmap();
-      SpaceBitmap* mark_bitmap = space->GetMarkBitmap();
-      DCHECK_EQ(live_bitmap->Size(), mark_bitmap->Size());
-      std::copy(live_bitmap->Begin(), live_bitmap->Begin() + live_bitmap->Size() / kWordSize, mark_bitmap->Begin());
-    }
-  }
+void MarkSweep::CopyMarkBits(Space* space) {
+  SpaceBitmap* live_bitmap = space->GetLiveBitmap();
+  SpaceBitmap* mark_bitmap = space->GetMarkBitmap();
+  mark_bitmap->CopyFrom(live_bitmap);
 }
 
 class ScanImageRootVisitor {
@@ -187,37 +207,35 @@
 
 // Marks all objects that are in images and have been touched by the mutator
 void MarkSweep::ScanDirtyImageRoots() {
-  const std::vector<Space*>& spaces = heap_->GetSpaces();
+  const Spaces& spaces = heap_->GetSpaces();
   CardTable* card_table = heap_->GetCardTable();
   ScanImageRootVisitor image_root_visitor(this);
-  for (size_t i = 0; i < spaces.size(); ++i) {
-    Space* space = spaces[i];
+  for (Spaces::const_iterator it = spaces.begin(); it != spaces.end(); ++it) {
+    Space* space = *it;
     if (space->IsImageSpace()) {
-      card_table->Scan(space->GetLiveBitmap(), space->Begin(), space->End(), image_root_visitor);
+      card_table->Scan(space->GetLiveBitmap(), space->Begin(), space->End(), image_root_visitor,
+                       IdentityFunctor());
     }
   }
 }
 
-void MarkSweep::ScanBitmapCallback(Object* obj, void* finger, void* arg) {
-  MarkSweep* mark_sweep = reinterpret_cast<MarkSweep*>(arg);
-  mark_sweep->finger_ = reinterpret_cast<Object*>(finger);
-  mark_sweep->ScanObject(obj);
-}
-
-void MarkSweep::ScanDirtyCardCallback(Object* obj, void* arg) {
-  MarkSweep* mark_sweep = reinterpret_cast<MarkSweep*>(arg);
-  mark_sweep->ScanObject(obj);
-}
-
-void MarkSweep::ScanGrayObjects() {
-  const std::vector<Space*>& spaces = heap_->GetSpaces();
+void MarkSweep::ScanGrayObjects(bool update_finger) {
+  const Spaces& spaces = heap_->GetSpaces();
   CardTable* card_table = heap_->GetCardTable();
   ScanImageRootVisitor image_root_visitor(this);
-  for (size_t i = 0; i < spaces.size(); ++i) {
-    byte* begin = spaces[i]->Begin();
-    byte* end = spaces[i]->End();
+  SetFingerVisitor finger_visitor(this);
+
+  for (Spaces::const_iterator it = spaces.begin(); it != spaces.end(); ++it) {
+    Space* space = *it;
+    byte* begin = space->Begin();
+    byte* end = space->End();
     // Image spaces are handled properly since live == marked for them.
-    card_table->Scan(spaces[i]->GetMarkBitmap(), begin, end, image_root_visitor);
+    SpaceBitmap* mark_bitmap = space->GetMarkBitmap();
+    if (update_finger) {
+      card_table->Scan(mark_bitmap, begin, end, image_root_visitor, finger_visitor);
+    } else {
+      card_table->Scan(mark_bitmap, begin, end, image_root_visitor, IdentityFunctor());
+    }
   }
 }
 
@@ -242,7 +260,6 @@
   // Verify roots ensures that all the references inside the image space point to
   // objects which are either in the image space or marked objects in the alloc
   // space.
-#ifndef NDEBUG
   CheckBitmapVisitor visitor(this);
   const Spaces& spaces = heap_->GetSpaces();
   for (Spaces::const_iterator it = spaces.begin(); it != spaces.end(); ++it) {
@@ -252,15 +269,30 @@
       uintptr_t end = reinterpret_cast<uintptr_t>(space->End());
       SpaceBitmap* live_bitmap = space->GetLiveBitmap();
       DCHECK(live_bitmap != NULL);
-      live_bitmap->VisitMarkedRange(begin, end, visitor);
+      live_bitmap->VisitMarkedRange(begin, end, visitor, IdentityFunctor());
     }
   }
-#endif
 }
 
+class ScanObjectVisitor {
+ public:
+  explicit ScanObjectVisitor(MarkSweep* const mark_sweep) : mark_sweep_(mark_sweep) {
+  }
+
+  void operator ()(const Object* obj) const
+      EXCLUSIVE_LOCKS_REQUIRED(GlobalSynchronization::heap_bitmap_lock_)
+      SHARED_LOCKS_REQUIRED(GlobalSynchronization::mutator_lock_) {
+    mark_sweep_->ScanObject(obj);
+  }
+
+ private:
+  MarkSweep* const mark_sweep_;
+};
+
 // Populates the mark stack based on the set of marked objects and
 // recursively marks until the mark stack is emptied.
-void MarkSweep::RecursiveMark(bool partial) {
+void MarkSweep::RecursiveMark(bool partial, TimingLogger& timings) {
   // RecursiveMark will build the lists of known instances of the Reference classes.
   // See DelayReferenceReferent for details.
   CHECK(soft_reference_list_ == NULL);
@@ -269,28 +301,76 @@
   CHECK(phantom_reference_list_ == NULL);
   CHECK(cleared_reference_list_ == NULL);
 
-  void* arg = reinterpret_cast<void*>(this);
   const Spaces& spaces = heap_->GetSpaces();
 
-  for (size_t i = 0; i < spaces.size(); ++i) {
-    Space* space = spaces[i];
+  SetFingerVisitor set_finger_visitor(this);
+  ScanObjectVisitor scan_visitor(this);
+  for (Spaces::const_iterator it = spaces.begin(); it != spaces.end(); ++it) {
+    Space* space = *it;
     if (space->GetGcRetentionPolicy() == GCRP_ALWAYS_COLLECT ||
         (!partial && space->GetGcRetentionPolicy() == GCRP_FULL_COLLECT)
         ) {
+      current_mark_bitmap_ = space->GetMarkBitmap();
+      if (current_mark_bitmap_ == NULL) {
+        GetHeap()->DumpSpaces();
+        DCHECK(false) << "invalid bitmap";
+      }
+      // This function does not handle heap end increasing, so we must use the space end.
       uintptr_t begin = reinterpret_cast<uintptr_t>(space->Begin());
       uintptr_t end = reinterpret_cast<uintptr_t>(space->End());
-
-      current_mark_bitmap_ = space->GetMarkBitmap();
-      current_mark_bitmap_->ScanWalk(begin, end, &ScanBitmapCallback, arg);
+      current_mark_bitmap_->VisitMarkedRange(begin, end, scan_visitor, set_finger_visitor);
     }
   }
   finger_ = reinterpret_cast<Object*>(~0);
+  timings.AddSplit("RecursiveMark");
   // TODO: tune the frequency of emptying the mark stack
   ProcessMarkStack();
+  timings.AddSplit("ProcessMarkStack");
 }
 
-void MarkSweep::RecursiveMarkDirtyObjects() {
-  ScanGrayObjects();
+void MarkSweep::RecursiveMarkCards(CardTable* card_table, const std::vector<byte*>& cards,
+                                   TimingLogger& timings) {
+  ScanImageRootVisitor image_root_visitor(this);
+  SetFingerVisitor finger_visitor(this);
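+  // Coalesce runs of adjacent dirty cards into a single [begin, end) range so that each
+  // VisitMarkedRange call covers as large a contiguous region as possible.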
+  for (size_t i = 0; i < cards.size();) {
+    Object* start_obj = reinterpret_cast<Object*>(card_table->AddrFromCard(cards[i]));
+    uintptr_t begin = reinterpret_cast<uintptr_t>(start_obj);
+    uintptr_t end = begin + GC_CARD_SIZE;
+    for (++i; i < cards.size() &&
+         reinterpret_cast<uintptr_t>(card_table->AddrFromCard(cards[i])) == end; ++i) {
+      end += GC_CARD_SIZE;
+    }
+    if (current_mark_bitmap_ == NULL || !current_mark_bitmap_->HasAddress(start_obj)) {
+      current_mark_bitmap_ = heap_->GetMarkBitmap()->GetSpaceBitmap(start_obj);
+#ifndef NDEBUG
+      if (current_mark_bitmap_ == NULL) {
+        GetHeap()->DumpSpaces();
+        DCHECK(false) << "Object " << reinterpret_cast<const void*>(start_obj);
+      }
+#endif
+    }
+    current_mark_bitmap_->VisitMarkedRange(begin, end, image_root_visitor, finger_visitor);
+  }
+  timings.AddSplit("RecursiveMarkCards");
+  ProcessMarkStack();
+  timings.AddSplit("ProcessMarkStack");
+}
+
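+// Returns IsMarked(object) || !IsLive(object); see the comment in SweepSystemWeaks for the derivation.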
+bool MarkSweep::IsMarkedCallback(const Object* object, void* arg) {
+  return reinterpret_cast<MarkSweep*>(arg)->IsMarked(object) ||
+      !reinterpret_cast<MarkSweep*>(arg)->GetHeap()->GetLiveBitmap()->Test(object);
+}
+
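+// Swapped-bitmap variant: returns IsLive(object) || !IsMarked(object).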
+bool MarkSweep::IsLiveCallback(const Object* object, void* arg) {
+  return reinterpret_cast<MarkSweep*>(arg)->GetHeap()->GetLiveBitmap()->Test(object) ||
+      !reinterpret_cast<MarkSweep*>(arg)->IsMarked(object);
+}
+
+void MarkSweep::RecursiveMarkDirtyObjects(bool update_finger) {
+  ScanGrayObjects(update_finger);
   ProcessMarkStack();
 }
 
@@ -298,14 +378,21 @@
   Runtime::Current()->VisitRoots(ReMarkObjectVisitor, this);
 }
 
-void MarkSweep::SweepJniWeakGlobals(HeapBitmap* bitmap) {
+void MarkSweep::SweepJniWeakGlobals(bool swap_bitmaps) {
+  HeapBitmap* live_bitmap = GetHeap()->GetLiveBitmap();
+  HeapBitmap* mark_bitmap = GetHeap()->GetMarkBitmap();
+
+  if (swap_bitmaps) {
+    std::swap(live_bitmap, mark_bitmap);
+  }
+
   JavaVMExt* vm = Runtime::Current()->GetJavaVM();
   MutexLock mu(vm->weak_globals_lock);
   IndirectReferenceTable* table = &vm->weak_globals;
   typedef IndirectReferenceTable::iterator It;  // TODO: C++0x auto
   for (It it = table->begin(), end = table->end(); it != end; ++it) {
     const Object** entry = *it;
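+    // Only clear entries that were live at the start of the GC but never marked; entries not in
+    // the live bitmap (e.g. allocated since the bitmaps were captured) are conservatively kept.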
-    if (!bitmap->Test(*entry)) {
+    if (live_bitmap->Test(*entry) && !mark_bitmap->Test(*entry)) {
       *entry = kClearedJniWeakGlobal;
     }
   }
@@ -313,15 +400,20 @@
 
 void MarkSweep::SweepSystemWeaks(bool swap_bitmaps) {
   Runtime* runtime = Runtime::Current();
+  // The sweep routines clear an entry when the callback returns false, i.e. when !callback(obj).
+  // We want to clear entries that are !IsMarked && IsLive, so the callback must return
+  // !(!IsMarked && IsLive), which simplifies to (IsMarked || !IsLive).
+  // With swapped bitmaps this becomes (IsLive || !IsMarked).
   runtime->GetInternTable()->SweepInternTableWeaks(swap_bitmaps ? IsLiveCallback : IsMarkedCallback,
                                                    this);
   runtime->GetMonitorList()->SweepMonitorList(swap_bitmaps ? IsLiveCallback : IsMarkedCallback,
                                               this);
-  SweepJniWeakGlobals(swap_bitmaps ? GetHeap()->GetLiveBitmap() : GetHeap()->GetMarkBitmap());
+  SweepJniWeakGlobals(swap_bitmaps);
 }
 
 struct SweepCallbackContext {
-  Heap* heap;
+  MarkSweep* mark_sweep;
   AllocSpace* space;
 };
 
@@ -331,7 +423,8 @@
   size_t freed_objects = num_ptrs;
   size_t freed_bytes = 0;
   SweepCallbackContext* context = static_cast<SweepCallbackContext*>(arg);
-  Heap* heap = context->heap;
+  MarkSweep* mark_sweep = context->mark_sweep;
+  Heap* heap = mark_sweep->GetHeap();
   AllocSpace* space = context->space;
   // Use a bulk free, that merges consecutive objects before freeing or free per object?
   // Documentation suggests better free performance with merging, but this may be at the expensive
@@ -352,14 +445,17 @@
       space->Free(obj);
     }
   }
+
   heap->RecordFree(freed_objects, freed_bytes);
+  mark_sweep->freed_objects_ += freed_objects;
+  mark_sweep->freed_bytes_ += freed_bytes;
 }
 
 void MarkSweep::ZygoteSweepCallback(size_t num_ptrs, Object** ptrs, void* arg) {
   GlobalSynchronization::heap_bitmap_lock_->AssertExclusiveHeld();
 
   SweepCallbackContext* context = static_cast<SweepCallbackContext*>(arg);
-  Heap* heap = context->heap;
+  Heap* heap = context->mark_sweep->GetHeap();
   // We don't free any actual memory to avoid dirtying the shared zygote pages.
   for (size_t i = 0; i < num_ptrs; ++i) {
     Object* obj = static_cast<Object*>(ptrs[i]);
@@ -368,17 +464,75 @@
   }
 }
 
-void MarkSweep::Sweep(bool partial) {
-  // If we don't swap bitmaps then we can not do this concurrently.
-  SweepSystemWeaks(true);
+void MarkSweep::SweepArray(TimingLogger& logger, MarkStack* allocations, bool swap_bitmaps) {
+  size_t freed_bytes = 0;
+  AllocSpace* space = heap_->GetAllocSpace();
 
+  // If we don't swap bitmaps then newly allocated Weaks go into the live bitmap but not mark
+  // bitmap, resulting in occasional frees of Weaks which are still in use.
+  // TODO: Fix when sweeping weaks works properly with mutators unpaused + allocation list.
+  // SweepSystemWeaks(swap_bitmaps);
+
+  // Newly allocated objects MUST be in the alloc space and those are the only objects which we are
+  // going to free.
+  SpaceBitmap* live_bitmap = space->GetLiveBitmap();
+  SpaceBitmap* mark_bitmap = space->GetMarkBitmap();
+  if (swap_bitmaps) {
+    std::swap(live_bitmap, mark_bitmap);
+  }
+
+  size_t count = allocations->Size();
+  Object** objects = allocations->Begin();
+  Object** out = objects;
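+  // Unmarked objects are compacted to the front of the allocation stack so that the same array
+  // can be handed to FreeList below.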
+
+  // Empty the allocation stack.
+  for (size_t i = 0; i < count; ++i) {
+    Object* obj = objects[i];
+    // There should only be objects in the AllocSpace in the allocation stack.
+    DCHECK(space->Contains(obj));
+    DCHECK(mark_bitmap->HasAddress(obj));
+    // Mark as live in the live bitmap if and only if we are marked in the mark bitmap.
+    if (mark_bitmap->Test(obj)) {
+      live_bitmap->Set(obj);
+    } else {
+      // Due to bitmap swapping, a young object can end up being marked as live.
+      live_bitmap->Clear(obj);
+      // Don't bother un-marking since we clear the mark bitmap anyways.
+      *(out++) = obj;
+      size_t size = space->AllocationSize(obj);
+      freed_bytes += size;
+    }
+  }
+  logger.AddSplit("Process allocation stack");
+
+  size_t freed_objects = out - objects;
+  if (freed_objects != 0) {
+    VLOG(heap) << "Freed " << freed_objects << "/" << count
+              << " objects with size " << PrettySize(freed_bytes);
+    space->FreeList(freed_objects, objects);
+    heap_->RecordFree(freed_objects, freed_bytes);
+    freed_objects_ += freed_objects;
+    freed_bytes_ += freed_bytes;
+    logger.AddSplit("FreeList");
+  }
+
+  allocations->Reset();
+  logger.AddSplit("Reset stack");
+}
+
+void MarkSweep::Sweep(bool partial, bool swap_bitmaps) {
   DCHECK(mark_stack_->IsEmpty());
 
+  // If we don't swap bitmaps then newly allocated Weaks go into the live bitmap but not mark
+  // bitmap, resulting in occasional frees of Weaks which are still in use.
+  // SweepSystemWeaks(swap_bitmaps);
+
   const Spaces& spaces = heap_->GetSpaces();
   SweepCallbackContext scc;
-  scc.heap = heap_;
-  for (size_t i = 0; i < spaces.size(); ++i) {
-    Space* space = spaces[i];
+  scc.mark_sweep = this;
+  // TODO: C++0x auto
+  for (Spaces::const_iterator it = spaces.begin(); it != spaces.end(); ++it) {
+    Space* space = *it;
     if (
         space->GetGcRetentionPolicy() == GCRP_ALWAYS_COLLECT ||
         (!partial && space->GetGcRetentionPolicy() == GCRP_FULL_COLLECT)
@@ -390,12 +544,15 @@
       SpaceBitmap* mark_bitmap = space->GetMarkBitmap();
       if (space->GetGcRetentionPolicy() == GCRP_ALWAYS_COLLECT) {
         // Bitmaps are pre-swapped for optimization which enables sweeping with the heap unlocked.
-        SpaceBitmap::SweepWalk(
-            *mark_bitmap, *live_bitmap, begin, end, &SweepCallback, reinterpret_cast<void*>(&scc));
+        if (swap_bitmaps) {
+          std::swap(live_bitmap, mark_bitmap);
+        }
+        SpaceBitmap::SweepWalk(*live_bitmap, *mark_bitmap, begin, end,
+                               &SweepCallback, reinterpret_cast<void*>(&scc));
       } else {
         // Zygote sweep takes care of dirtying cards and clearing live bits, does not free actual memory.
-        SpaceBitmap::SweepWalk(
-            *live_bitmap, *mark_bitmap, begin, end, &ZygoteSweepCallback, reinterpret_cast<void*>(&scc));
+        SpaceBitmap::SweepWalk(*live_bitmap, *mark_bitmap, begin, end,
+                               &ZygoteSweepCallback, reinterpret_cast<void*>(&scc));
       }
     }
   }
@@ -419,11 +576,11 @@
   if (ref_offsets != CLASS_WALK_SUPER) {
     // Found a reference offset bitmap.  Mark the specified offsets.
     while (ref_offsets != 0) {
-      size_t right_shift = CLZ(ref_offsets);
+      const size_t right_shift = CLZ(ref_offsets);
       MemberOffset byte_offset = CLASS_OFFSET_FROM_CLZ(right_shift);
       const Object* ref = obj->GetFieldObject<const Object*>(byte_offset, false);
       MarkObject(ref);
-      ref_offsets &= ~(CLASS_HIGH_BIT >> right_shift);
+      ref_offsets ^= CLASS_HIGH_BIT >> right_shift;
     }
   } else {
     // There is no reference offset bitmap.  In the non-static case,
@@ -557,12 +714,21 @@
   }
 }
 
+void MarkSweep::ScanRoot(const Object* obj) {
+  ScanObject(obj);
+}
+
 // Scans an object reference.  Determines the type of the reference
 // and dispatches to a specialized scanning routine.
 void MarkSweep::ScanObject(const Object* obj) {
   DCHECK(obj != NULL);
   DCHECK(obj->GetClass() != NULL);
-  DCHECK(heap_->GetMarkBitmap()->Test(obj));
+#ifndef NDEBUG
+  if (!IsMarked(obj)) {
+    heap_->DumpSpaces();
+    LOG(FATAL) << "Scanning unmarked object " << reinterpret_cast<const void*>(obj);
+  }
+#endif
   if (obj->IsClass()) {
     ScanClass(obj);
   } else if (obj->IsArrayInstance()) {
@@ -574,11 +740,44 @@
 
 // Scan anything that's on the mark stack.
 void MarkSweep::ProcessMarkStack() {
+#if MARK_STACK_PREFETCH
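+  // Pop objects into a small ring buffer and issue a prefetch for each one, then scan it on a
+  // later iteration once its cache lines have (hopefully) arrived. fifo_size must stay a power
+  // of two for the (fifo_pos & fifo_mask) wrap-around to work.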
+  const size_t fifo_size = 4;
+  const size_t fifo_mask = fifo_size - 1;
+  const Object* fifo[fifo_size];
+  for (size_t i = 0; i < fifo_size; ++i) {
+    fifo[i] = NULL;
+  }
+  size_t fifo_pos = 0;
+  size_t fifo_count = 0;
+  for (;;) {
+    const Object* obj = fifo[fifo_pos & fifo_mask];
+    if (obj != NULL) {
+      ScanObject(obj);
+      fifo[fifo_pos & fifo_mask] = NULL;
+      --fifo_count;
+    }
+
+    if (!mark_stack_->IsEmpty()) {
+      const Object* obj = mark_stack_->Pop();
+      DCHECK(obj != NULL);
+      fifo[fifo_pos & fifo_mask] = obj;
+      __builtin_prefetch(obj);
+      fifo_count++;
+    }
+    fifo_pos++;
+
+    if (!fifo_count) {
+      CHECK(mark_stack_->IsEmpty()) << mark_stack_->Size();
+      break;
+    }
+  }
+#else
   while (!mark_stack_->IsEmpty()) {
     const Object* obj = mark_stack_->Pop();
     DCHECK(obj != NULL);
     ScanObject(obj);
   }
+#endif
 }
 
 // Walks the reference list marking any references subject to the
@@ -705,12 +904,15 @@
 #ifndef NDEBUG
   VLOG(heap) << "MarkSweep scanned classes=" << class_count_ << " arrays=" << array_count_ << " other=" << other_count_;
 #endif
+  // Ensure that the mark stack is empty.
+  CHECK(mark_stack_->IsEmpty());
+
   // Clear all of the alloc spaces' mark bitmaps.
   const Spaces& spaces = heap_->GetSpaces();
   // TODO: C++0x auto
-  for (Spaces::const_iterator cur = spaces.begin(); cur != spaces.end(); ++cur) {
-    if ((*cur)->GetGcRetentionPolicy() != GCRP_NEVER_COLLECT) {
-      (*cur)->GetMarkBitmap()->Clear();
+  for (Spaces::const_iterator it = spaces.begin(); it != spaces.end(); ++it) {
+    if ((*it)->GetGcRetentionPolicy() != GCRP_NEVER_COLLECT) {
+      (*it)->GetMarkBitmap()->Clear();
     }
   }
   mark_stack_->Reset();
diff --git a/src/mark_sweep.h b/src/mark_sweep.h
index bb48b7a..db845b7 100644
--- a/src/mark_sweep.h
+++ b/src/mark_sweep.h
@@ -33,6 +33,7 @@
 class ModUnionVisitor;
 class ModUnionTableBitmap;
 class Object;
+class TimingLogger;
 
 class MarkSweep {
  public:
@@ -59,19 +60,25 @@
   }
 
   // Builds a mark stack and recursively mark until it empties.
-  void RecursiveMark(bool partial)
+  void RecursiveMark(bool partial, TimingLogger& timings)
       EXCLUSIVE_LOCKS_REQUIRED(GlobalSynchronization::heap_bitmap_lock_)
       SHARED_LOCKS_REQUIRED(GlobalSynchronization::mutator_lock_);
 
-  // Copies mark bits from live bitmap of zygote space to mark bitmap for partial GCs.
-  void CopyMarkBits();
+  // Copies mark bits from live bitmap of ZygoteSpace to mark bitmap for partial GCs.
+  void CopyMarkBits(Space* space);
 
   // Builds a mark stack with objects on dirty cards and recursively mark
   // until it empties.
-  void RecursiveMarkDirtyObjects()
+  void RecursiveMarkDirtyObjects(bool update_finger)
       EXCLUSIVE_LOCKS_REQUIRED(GlobalSynchronization::heap_bitmap_lock_)
       SHARED_LOCKS_REQUIRED(GlobalSynchronization::mutator_lock_);
 
+  // Recursive mark objects on specified cards. Updates finger.
+  void RecursiveMarkCards(CardTable* card_table, const std::vector<byte*>& cards,
+                          TimingLogger& timings)
+      EXCLUSIVE_LOCKS_REQUIRED(GlobalSynchronization::heap_bitmap_lock_)
+      SHARED_LOCKS_REQUIRED(GlobalSynchronization::mutator_lock_);
+
   // Remarks the root set after completing the concurrent mark.
   void ReMarkRoots()
       EXCLUSIVE_LOCKS_REQUIRED(GlobalSynchronization::heap_bitmap_lock_)
@@ -91,21 +98,55 @@
   }
 
   // Sweeps unmarked objects to complete the garbage collection.
-  void Sweep(bool partial) EXCLUSIVE_LOCKS_REQUIRED(GlobalSynchronization::heap_bitmap_lock_);
+  void Sweep(bool partial, bool swap_bitmaps)
+      EXCLUSIVE_LOCKS_REQUIRED(GlobalSynchronization::heap_bitmap_lock_);
+
+  // Sweep only pointers within an array. WARNING: Trashes objects.
+  void SweepArray(TimingLogger& logger, MarkStack* allocation_stack_, bool swap_bitmaps)
+      EXCLUSIVE_LOCKS_REQUIRED(GlobalSynchronization::heap_bitmap_lock_);
 
   Object* GetClearedReferences() {
     return cleared_reference_list_;
   }
 
+  // Proxy for external access to ScanObject.
+  void ScanRoot(const Object* obj)
+      EXCLUSIVE_LOCKS_REQUIRED(GlobalSynchronization::heap_bitmap_lock_)
+      SHARED_LOCKS_REQUIRED(GlobalSynchronization::mutator_lock_);
+
   // Blackens an object.
   void ScanObject(const Object* obj)
       EXCLUSIVE_LOCKS_REQUIRED(GlobalSynchronization::heap_bitmap_lock_)
       SHARED_LOCKS_REQUIRED(GlobalSynchronization::mutator_lock_);
 
+  void SetFinger(Object* new_finger) {
+    finger_ = new_finger;
+  }
+
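+  // Setting the finger to the maximum address means every object marked from now on is below
+  // the finger and is therefore pushed on the mark stack (when the finger check is enabled).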
+  void DisableFinger() {
+    SetFinger(reinterpret_cast<Object*>(~static_cast<uintptr_t>(0)));
+  }
+
+  size_t GetFreedBytes() const {
+    return freed_bytes_;
+  }
+
+  size_t GetFreedObjects() const {
+    return freed_objects_;
+  }
+
+  void SetCondemned(Object* condemned) {
+    condemned_ = condemned;
+  }
+
+  void SweepSystemWeaks(bool swap_bitmaps)
+      SHARED_LOCKS_REQUIRED(GlobalSynchronization::heap_bitmap_lock_);
+
  private:
   // Returns true if the object has its bit set in the mark bitmap.
   bool IsMarked(const Object* object) const
       SHARED_LOCKS_REQUIRED(GlobalSynchronization::heap_bitmap_lock_) {
+    DCHECK(current_mark_bitmap_ != NULL);
     if (current_mark_bitmap_->HasAddress(object)) {
       return current_mark_bitmap_->Test(object);
     }
@@ -113,14 +154,10 @@
   }
 
   static bool IsMarkedCallback(const Object* object, void* arg)
-      SHARED_LOCKS_REQUIRED(GlobalSynchronization::heap_bitmap_lock_) {
-    return reinterpret_cast<MarkSweep*>(arg)->IsMarked(object);
-  }
+      SHARED_LOCKS_REQUIRED(GlobalSynchronization::heap_bitmap_lock_);
 
   static bool IsLiveCallback(const Object* object, void* arg)
-      SHARED_LOCKS_REQUIRED(GlobalSynchronization::heap_bitmap_lock_) {
-    return reinterpret_cast<MarkSweep*>(arg)->GetHeap()->GetLiveBitmap()->Test(object);
-  }
+      SHARED_LOCKS_REQUIRED(GlobalSynchronization::heap_bitmap_lock_);
 
   static void MarkObjectVisitor(const Object* root, void* arg)
       EXCLUSIVE_LOCKS_REQUIRED(GlobalSynchronization::heap_bitmap_lock_);
@@ -148,7 +185,6 @@
       EXCLUSIVE_LOCKS_REQUIRED(GlobalSynchronization::heap_bitmap_lock_)
       SHARED_LOCKS_REQUIRED(GlobalSynchronization::mutator_lock_);
 
-
   static void SweepCallback(size_t num_ptrs, Object** ptrs, void* arg)
       EXCLUSIVE_LOCKS_REQUIRED(GlobalSynchronization::heap_bitmap_lock_);
 
@@ -295,7 +331,8 @@
   }
 
   // Blackens objects grayed during a garbage collection.
-  void ScanGrayObjects() EXCLUSIVE_LOCKS_REQUIRED(GlobalSynchronization::heap_bitmap_lock_);
+  void ScanGrayObjects(bool update_finger)
+      EXCLUSIVE_LOCKS_REQUIRED(GlobalSynchronization::heap_bitmap_lock_);
 
   // Schedules an unmarked object for reference processing.
   void DelayReferenceReferent(Object* reference)
@@ -324,9 +361,7 @@
       EXCLUSIVE_LOCKS_REQUIRED(GlobalSynchronization::heap_bitmap_lock_)
       SHARED_LOCKS_REQUIRED(GlobalSynchronization::mutator_lock_);
 
-  void SweepSystemWeaks(bool swap_bitmaps)
-      SHARED_LOCKS_REQUIRED(GlobalSynchronization::heap_bitmap_lock_);
-  void SweepJniWeakGlobals(HeapBitmap* bitmap)
+  void SweepJniWeakGlobals(bool swap_bitmaps)
       SHARED_LOCKS_REQUIRED(GlobalSynchronization::heap_bitmap_lock_);
 
   // Current space, we check this space first to avoid searching for the appropriate space for an object.
@@ -350,6 +385,9 @@
 
   Object* cleared_reference_list_;
 
+  size_t freed_bytes_;
+  size_t freed_objects_;
+
   size_t class_count_;
   size_t array_count_;
   size_t other_count_;
diff --git a/src/mod_union_table.cc b/src/mod_union_table.cc
index 3b928e3..51fe70a 100644
--- a/src/mod_union_table.cc
+++ b/src/mod_union_table.cc
@@ -70,6 +70,19 @@
   SpaceBitmap* bitmap_;
 };
 
+class ModUnionClearCardSetVisitor {
+ public:
+  explicit ModUnionClearCardSetVisitor(std::set<byte*>* const cleared_cards)
+    : cleared_cards_(cleared_cards) {
+  }
+
+  inline void operator ()(byte* card) const {
+    cleared_cards_->insert(card);
+  }
+ private:
+  std::set<byte*>* const cleared_cards_;
+};
+
 class ModUnionClearCardVisitor {
  public:
   explicit ModUnionClearCardVisitor(std::vector<byte*>* cleared_cards)
@@ -95,7 +108,8 @@
     if (space->IsImageSpace()) {
       // The mod-union table is only needed when we have an image space since its purpose is to
       // cache image roots.
-      UniquePtr<SpaceBitmap> bitmap(SpaceBitmap::Create("mod-union table bitmap", space->Begin(), space->Capacity()));
+      UniquePtr<SpaceBitmap> bitmap(SpaceBitmap::Create("mod-union table bitmap", space->Begin(),
+                                                        space->Capacity()));
       CHECK(bitmap.get() != NULL) << "Failed to create mod-union bitmap";
       bitmaps_.Put(space, bitmap.release());
     }
@@ -131,7 +145,7 @@
     // At this point we need to update the mod-union bitmap to contain all the objects which reach
     // the alloc space.
     ModUnionVisitor visitor(mark_sweep_, bitmap);
-    space->GetLiveBitmap()->VisitMarkedRange(start, end, visitor);
+    space->GetLiveBitmap()->VisitMarkedRange(start, end, visitor, IdentityFunctor());
   }
 }
 
@@ -144,7 +158,7 @@
       EXCLUSIVE_LOCKS_REQUIRED(GlobalSynchronization::heap_bitmap_lock_)
       SHARED_LOCKS_REQUIRED(GlobalSynchronization::mutator_lock_) {
     DCHECK(root != NULL);
-    mark_sweep_->ScanObject(root);
+    mark_sweep_->ScanRoot(root);
   }
 
  private:
@@ -158,13 +172,13 @@
     const Space* space = cur->first;
     uintptr_t begin = reinterpret_cast<uintptr_t>(space->Begin());
     uintptr_t end = reinterpret_cast<uintptr_t>(space->End());
-    cur->second->VisitMarkedRange(begin, end, image_root_scanner);
+    cur->second->VisitMarkedRange(begin, end, image_root_scanner, IdentityFunctor());
   }
 }
 
 
 ModUnionTableReferenceCache::ModUnionTableReferenceCache(Heap* heap) : ModUnionTable(heap) {
-  cleared_cards_.reserve(32);
+
 }
 
 ModUnionTableReferenceCache::~ModUnionTableReferenceCache() {
@@ -174,7 +188,7 @@
 
 void ModUnionTableReferenceCache::ClearCards(Space* space) {
   CardTable* card_table = GetMarkSweep()->GetHeap()->GetCardTable();
-  ModUnionClearCardVisitor visitor(&cleared_cards_);
+  ModUnionClearCardSetVisitor visitor(&cleared_cards_);
   // Clear dirty cards in this space and update the corresponding mod-union bits.
   card_table->VisitClear(space->Begin(), space->End(), visitor);
 }
@@ -269,7 +283,7 @@
       references_(references) {
   }
 
-  void operator ()(Object* obj) const {
+  void operator ()(Object* obj, void* /* finger */) const {
     DCHECK(obj != NULL);
     MarkSweep* mark_sweep = mod_union_table_->GetMarkSweep();
     CheckReferenceVisitor visitor(mod_union_table_, references_);
@@ -318,16 +332,15 @@
   ReferenceArray cards_references;
   ModUnionReferenceVisitor visitor(this, &cards_references);
 
-  for (size_t i = 0; i < cleared_cards_.size(); ++i) {
-    byte* card = cleared_cards_[i];
-
+  for (ClearedCards::iterator it = cleared_cards_.begin(); it != cleared_cards_.end(); ++it) {
+    byte* card = *it;
     // Clear and re-compute alloc space references associated with this card.
     cards_references.clear();
     uintptr_t start = reinterpret_cast<uintptr_t>(card_table->AddrFromCard(card));
     uintptr_t end = start + GC_CARD_SIZE;
     SpaceBitmap* live_bitmap =
         heap->FindSpaceFromObject(reinterpret_cast<Object*>(start))->GetLiveBitmap();
-    live_bitmap->VisitMarkedRange(start, end, visitor);
+    live_bitmap->VisitMarkedRange(start, end, visitor, IdentityFunctor());
 
     // Update the corresponding references for the card.
     // TODO: C++0x auto
@@ -369,19 +382,6 @@
 
 }
 
-class ModUnionClearCardSetVisitor {
- public:
-  explicit ModUnionClearCardSetVisitor(std::set<byte*>* const cleared_cards)
-    : cleared_cards_(cleared_cards) {
-  }
-
-  void operator ()(byte* card) const {
-    cleared_cards_->insert(card);
-  }
- private:
-  std::set<byte*>* const cleared_cards_;
-};
-
 void ModUnionTableCardCache::ClearCards(Space* space) {
   CardTable* card_table = GetMarkSweep()->GetHeap()->GetCardTable();
   ModUnionClearCardSetVisitor visitor(&cleared_cards_);
@@ -399,7 +399,7 @@
     uintptr_t end = start + GC_CARD_SIZE;
     SpaceBitmap* live_bitmap =
         heap_->FindSpaceFromObject(reinterpret_cast<Object*>(start))->GetLiveBitmap();
-    live_bitmap->VisitMarkedRange(start, end, visitor);
+    live_bitmap->VisitMarkedRange(start, end, visitor, IdentityFunctor());
   }
 }
 
diff --git a/src/mod_union_table.h b/src/mod_union_table.h
index 424f2f3..63a46f0 100644
--- a/src/mod_union_table.h
+++ b/src/mod_union_table.h
@@ -33,6 +33,7 @@
 class ModUnionTable {
  public:
   typedef std::vector<const Object*> ReferenceArray;
+  typedef std::set<byte*> ClearedCards;
 
   ModUnionTable(Heap* heap) : heap_(heap), mark_sweep_(0) {
 
@@ -128,7 +129,7 @@
 
  protected:
   // Cleared card array, used to update the mod-union table.
-  std::vector<byte*> cleared_cards_;
+  ClearedCards cleared_cards_;
 
   // Maps from dirty cards to their corresponding alloc space references.
   ReferenceMap references_;
@@ -137,7 +138,6 @@
 // Card caching implementation. Keeps track of which cards we cleared and only this information.
 class ModUnionTableCardCache : public ModUnionTable {
  public:
-  typedef std::set<byte*> ClearedCards;
   typedef SafeMap<const byte*, ReferenceArray > ReferenceMap;
 
   ModUnionTableCardCache(Heap* heap);
diff --git a/src/mutex.h b/src/mutex.h
index edd458d..6947a89 100644
--- a/src/mutex.h
+++ b/src/mutex.h
@@ -55,9 +55,9 @@
   kClassLinkerClassesLock = 6,
   kThreadListLock = 7,
   kHeapBitmapLock = 8,
-  kZygoteCreationLock = 9,
-  kMonitorLock = 10,
-  kMutatorLock = 11,
+  kMonitorLock = 9,
+  kMutatorLock = 10,
+  kZygoteCreationLock = 11,
   kMaxMutexLevel = kMutatorLock,
 };
 std::ostream& operator<<(std::ostream& os, const MutexLevel& rhs);
diff --git a/src/oatdump.cc b/src/oatdump.cc
index 2fc728e..05d7250 100644
--- a/src/oatdump.cc
+++ b/src/oatdump.cc
@@ -558,6 +558,11 @@
     Heap* heap = Runtime::Current()->GetHeap();
     const Spaces& spaces = heap->GetSpaces();
     // TODO: C++0x auto
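+    // Flush the allocation stack first so that objects allocated since the last GC are visible
+    // in the live bitmaps walked below.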
+    {
+      WriterMutexLock mu(*GlobalSynchronization::heap_bitmap_lock_);
+      heap->FlushAllocStack();
+    }
+    ReaderMutexLock mu(*GlobalSynchronization::heap_bitmap_lock_);
     for (Spaces::const_iterator cur = spaces.begin(); cur != spaces.end(); ++cur) {
       (*cur)->GetLiveBitmap()->Walk(ImageDumper::Callback, this);
       os_ << "\n";
diff --git a/src/space.cc b/src/space.cc
index a828d91..f0f9323 100644
--- a/src/space.cc
+++ b/src/space.cc
@@ -154,6 +154,10 @@
   SpaceBitmap* temp_live_bitmap = live_bitmap_.release();
   live_bitmap_.reset(mark_bitmap_.release());
   mark_bitmap_.reset(temp_live_bitmap);
+  // Swap names to get more descriptive diagnostics.
+  std::string temp_name = live_bitmap_->GetName();
+  live_bitmap_->SetName(mark_bitmap_->GetName());
+  mark_bitmap_->SetName(temp_name);
 }
 
 Object* AllocSpace::AllocWithoutGrowthLocked(size_t num_bytes) {
@@ -190,6 +194,14 @@
   return result;
 }
 
+void AllocSpace::SetGrowthLimit(size_t growth_limit) {
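+  // Round the limit up to a whole page and, if the space has already grown past it, pull end_
+  // back to the new limit.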
+  growth_limit = RoundUp(growth_limit, kPageSize);
+  growth_limit_ = growth_limit;
+  if (Size() > growth_limit_) {
+    end_ = begin_ + growth_limit;
+  }
+}
+
 AllocSpace* AllocSpace::CreateZygoteSpace() {
   end_ = reinterpret_cast<byte*>(RoundUp(reinterpret_cast<uintptr_t>(end_), kPageSize));
   DCHECK(IsAligned<GC_CARD_SIZE>(begin_));
@@ -212,7 +224,8 @@
   VLOG(heap) << "Size " << size;
   VLOG(heap) << "GrowthLimit " << growth_limit_;
   VLOG(heap) << "Capacity " << Capacity();
-  growth_limit_ = RoundUp(size, kPageSize);
+  SetGrowthLimit(RoundUp(size, kPageSize));
+  SetFootprintLimit(RoundUp(size, kPageSize));
   // FIXME: Do we need reference counted pointers here?
   // Make the two spaces share the same mark bitmaps since the bitmaps span both of the spaces.
   VLOG(heap) << "Creating new AllocSpace: ";
@@ -228,9 +241,9 @@
   }
   AllocSpace* alloc_space = new AllocSpace(name_, mem_map.release(), mspace, end_, end, growth_limit);
   live_bitmap_->SetHeapLimit(reinterpret_cast<uintptr_t>(end_));
-  CHECK(live_bitmap_->HeapLimit() == reinterpret_cast<uintptr_t>(end_));
+  CHECK_EQ(live_bitmap_->HeapLimit(), reinterpret_cast<uintptr_t>(end_));
   mark_bitmap_->SetHeapLimit(reinterpret_cast<uintptr_t>(end_));
-  CHECK(mark_bitmap_->HeapLimit() == reinterpret_cast<uintptr_t>(end_));
+  CHECK_EQ(mark_bitmap_->HeapLimit(), reinterpret_cast<uintptr_t>(end_));
   name_ += "-zygote-transformed";
   VLOG(heap) << "zygote space creation done";
   return alloc_space;
@@ -264,25 +277,8 @@
 // Callback from dlmalloc when it needs to increase the footprint
 extern "C" void* art_heap_morecore(void* mspace, intptr_t increment) {
   Heap* heap = Runtime::Current()->GetHeap();
-  if (heap->GetAllocSpace()->GetMspace() == mspace) {
-    return heap->GetAllocSpace()->MoreCore(increment);
-  } else {
-    // Exhaustively search alloc spaces.
-    const Spaces& spaces = heap->GetSpaces();
-    // TODO: C++0x auto
-    for (Spaces::const_iterator cur = spaces.begin(); cur != spaces.end(); ++cur) {
-      if ((*cur)->IsAllocSpace()) {
-        AllocSpace* space = (*cur)->AsAllocSpace();
-        if (mspace == space->GetMspace()) {
-          return space->MoreCore(increment);
-        }
-      }
-    }
-  }
-
-  LOG(FATAL) << "Unexpected call to art_heap_morecore. mspace: " << mspace
-             << " increment: " << increment;
-  return NULL;
+  DCHECK_EQ(heap->GetAllocSpace()->GetMspace(), mspace);
+  return heap->GetAllocSpace()->MoreCore(increment);
 }
 
 void* AllocSpace::MoreCore(intptr_t increment) {
diff --git a/src/space.h b/src/space.h
index 6ab3302..bc1493d 100644
--- a/src/space.h
+++ b/src/space.h
@@ -224,6 +224,8 @@
     return mark_bitmap_.get();
   }
 
+  void SetGrowthLimit(size_t growth_limit);
+
   // Swap the live and mark bitmaps of this space. This is used by the GC for concurrent sweeping.
   void SwapBitmaps();
 
diff --git a/src/space_bitmap.cc b/src/space_bitmap.cc
index 438237d..439e637 100644
--- a/src/space_bitmap.cc
+++ b/src/space_bitmap.cc
@@ -22,6 +22,14 @@
 
 namespace art {
 
+std::string SpaceBitmap::GetName() const {
+  return name_;
+}
+
+void SpaceBitmap::SetName(const std::string& name) {
+  name_ = name;
+}
+
 SpaceBitmap* SpaceBitmap::Create(const std::string& name, byte* heap_begin, size_t heap_capacity) {
   CHECK(heap_begin != NULL);
   // Round up since heap_capacity is not necessarily a multiple of kAlignment * kBitsPerWord.
@@ -60,10 +68,14 @@
     if (result == -1) {
       PLOG(WARNING) << "madvise failed";
     }
-    heap_end_ = heap_begin_ - 1;
   }
 }
 
+void SpaceBitmap::CopyFrom(SpaceBitmap* source_bitmap) {
+  DCHECK_EQ(Size(), source_bitmap->Size());
+  std::copy(source_bitmap->Begin(), source_bitmap->Begin() + source_bitmap->Size() / kWordSize, Begin());
+}
+
 // Return true iff <obj> is within the range of pointers that this bitmap could potentially cover,
 // even if a bit has not been set for it.
 bool SpaceBitmap::HasAddress(const void* obj) const {
@@ -78,84 +90,19 @@
 void SpaceBitmap::Walk(SpaceBitmap::Callback* callback, void* arg) {
   CHECK(bitmap_begin_ != NULL);
   CHECK(callback != NULL);
-  if (heap_end_ < heap_begin_) {
-    return;  // Bitmap is empty.
-  }
-  uintptr_t end = OffsetToIndex(heap_end_ - heap_begin_);
+
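+  // heap_end_ no longer exists, so walk the bitmap's full extent up to HeapLimit().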
+  uintptr_t end = OffsetToIndex(HeapLimit() - heap_begin_ - 1);
+  word* bitmap_begin = bitmap_begin_;
   for (uintptr_t i = 0; i <= end; ++i) {
-    word w = bitmap_begin_[i];
-    if (UNLIKELY(w != 0)) {
+    word w = bitmap_begin[i];
+    if (w != 0) {
       uintptr_t ptr_base = IndexToOffset(i) + heap_begin_;
-      while (w != 0) {
+      do {
         const size_t shift = CLZ(w);
         Object* obj = reinterpret_cast<Object*>(ptr_base + shift * kAlignment);
         (*callback)(obj, arg);
         w ^= static_cast<size_t>(kWordHighBitMask) >> shift;
-      }
-    }
-  }
-}
-
-// Similar to Walk but the callback routine is permitted to change the bitmap bits and end during
-// traversal.  Used by the the root marking scan exclusively.
-//
-// The callback is invoked with a finger argument.  The finger is a pointer to an address not yet
-// visited by the traversal.  If the callback sets a bit for an address at or above the finger, this
-// address will be visited by the traversal.  If the callback sets a bit for an address below the
-// finger, this address will not be visited (typiscally such an address would be placed on the
-// marking stack).
-void SpaceBitmap::ScanWalk(uintptr_t scan_begin, uintptr_t scan_end, ScanCallback* callback, void* arg) {
-  CHECK(bitmap_begin_ != NULL);
-  CHECK(callback != NULL);
-  CHECK_LE(scan_begin, scan_end);
-  CHECK_GE(scan_begin, heap_begin_);
-
-  // This function doesn't support unaligned boundaries yet.
-  size_t begin_offset = scan_begin - heap_begin_;
-  size_t end_offset = scan_end - heap_begin_;
-  DCHECK((begin_offset / kAlignment) % kBitsPerWord == 0)
-      << "scan begin " << reinterpret_cast<const void*>(scan_begin)
-      << " with offset " << begin_offset
-      << " not aligned to word boundary";
-  DCHECK((end_offset / kAlignment) % kBitsPerWord == 0)
-      << "scan end " << reinterpret_cast<const void*>(scan_end)
-      << " with offset " << end_offset
-      << " not aligned to word boundary";
-
-  size_t start = OffsetToIndex(begin_offset);
-  if (scan_end < heap_end_) {
-    // The end of the space we're looking at is before the current maximum bitmap PC, scan to that
-    // and don't recompute end on each iteration
-    size_t end = OffsetToIndex(end_offset - 1);
-    for (size_t i = start; i <= end; i++) {
-      word w = bitmap_begin_[i];
-      if (UNLIKELY(w != 0)) {
-        uintptr_t ptr_base = IndexToOffset(i) + heap_begin_;
-        void* finger = reinterpret_cast<void*>(IndexToOffset(i + 1) + heap_begin_);
-        while (w != 0) {
-          const size_t shift = CLZ(w);
-          Object* obj = reinterpret_cast<Object*>(ptr_base + shift * kAlignment);
-          (*callback)(obj, finger, arg);
-          w ^= static_cast<size_t>(kWordHighBitMask) >> shift;
-        }
-      }
-    }
-  } else {
-    size_t end = OffsetToIndex(heap_end_ - heap_begin_);
-    for (size_t i = start; i <= end; i++) {
-      word w = bitmap_begin_[i];
-      if (UNLIKELY(w != 0)) {
-        uintptr_t ptr_base = IndexToOffset(i) + heap_begin_;
-        void* finger = reinterpret_cast<void*>(IndexToOffset(i + 1) + heap_begin_);
-        while (w != 0) {
-          const size_t shift = CLZ(w);
-          Object* obj = reinterpret_cast<Object*>(ptr_base + shift * kAlignment);
-          (*callback)(obj, finger, arg);
-          w ^= static_cast<size_t>(kWordHighBitMask) >> shift;
-        }
-      }
-      // update 'end' in case callback modified bitmap
-      end = OffsetToIndex(heap_end_ - heap_begin_);
+      } while (w != 0);
     }
   }
 }
@@ -176,31 +123,32 @@
   CHECK(callback != NULL);
   CHECK_LE(sweep_begin, sweep_end);
   CHECK_GE(sweep_begin, live_bitmap.heap_begin_);
-  sweep_end = std::min(sweep_end - 1, live_bitmap.heap_end_);
-  if (live_bitmap.heap_end_ < live_bitmap.heap_begin_) {
-    // Easy case; both are obviously empty.
-    // TODO: this should never happen
+
+  if (sweep_end <= sweep_begin) {
     return;
   }
+
   // TODO: rewrite the callbacks to accept a std::vector<Object*> rather than a Object**?
-  std::vector<Object*> pointer_buf(4 * kBitsPerWord);
+  const size_t buffer_size = kWordSize * kBitsPerWord;
+  Object* pointer_buf[buffer_size];
   Object** pb = &pointer_buf[0];
   size_t start = OffsetToIndex(sweep_begin - live_bitmap.heap_begin_);
-  size_t end = OffsetToIndex(sweep_end - live_bitmap.heap_begin_);
+  size_t end = OffsetToIndex(sweep_end - live_bitmap.heap_begin_ - 1);
+  CHECK_LT(end, live_bitmap.Size() / kWordSize);
   word* live = live_bitmap.bitmap_begin_;
   word* mark = mark_bitmap.bitmap_begin_;
   for (size_t i = start; i <= end; i++) {
     word garbage = live[i] & ~mark[i];
     if (UNLIKELY(garbage != 0)) {
       uintptr_t ptr_base = IndexToOffset(i) + live_bitmap.heap_begin_;
-      while (garbage != 0) {
+      do {
         const size_t shift = CLZ(garbage);
         garbage ^= static_cast<size_t>(kWordHighBitMask) >> shift;
         *pb++ = reinterpret_cast<Object*>(ptr_base + shift * kAlignment);
-      }
+      } while (garbage != 0);
       // Make sure that there are always enough slots available for an
       // entire word of one bits.
-      if (pb >= &pointer_buf[pointer_buf.size() - kBitsPerWord]) {
+      if (pb >= &pointer_buf[buffer_size - kBitsPerWord]) {
         (*callback)(pb - &pointer_buf[0], &pointer_buf[0], arg);
         pb = &pointer_buf[0];
       }
@@ -297,8 +245,8 @@
                                        IndexToOffset(bitmap_size_ / kWordSize)));
   CHECK(bitmap_begin_ != NULL);
   CHECK(callback != NULL);
-  uintptr_t end = OffsetToIndex(heap_end_ - heap_begin_);
-  for (uintptr_t i = 0; i <= end; ++i) {
+  uintptr_t end = Size() / kWordSize;
+  for (uintptr_t i = 0; i < end; ++i) {
     word w = bitmap_begin_[i];
     if (UNLIKELY(w != 0)) {
       uintptr_t ptr_base = IndexToOffset(i) + heap_begin_;
@@ -312,4 +260,12 @@
   }
 }
 
+std::ostream& operator << (std::ostream& stream, const SpaceBitmap& bitmap) {
+  return stream
+    << bitmap.GetName() << "["
+    << "begin=" << reinterpret_cast<const void*>(bitmap.HeapBegin())
+    << ",end=" << reinterpret_cast<const void*>(bitmap.HeapLimit())
+    << "]";
+}
+
 }  // namespace art
diff --git a/src/space_bitmap.h b/src/space_bitmap.h
index bbf60f3..db1a5eb 100644
--- a/src/space_bitmap.h
+++ b/src/space_bitmap.h
@@ -78,12 +78,8 @@
     DCHECK(HasAddress(obj)) << obj;
     DCHECK(bitmap_begin_ != NULL);
     DCHECK_GE(addr, heap_begin_);
-    if (addr <= heap_end_) {
-      const uintptr_t offset = addr - heap_begin_;
-      return (bitmap_begin_[OffsetToIndex(offset)] & OffsetToMask(offset)) != 0;
-    } else {
-      return false;
-    }
+    const uintptr_t offset = addr - heap_begin_;
+    return (bitmap_begin_[OffsetToIndex(offset)] & OffsetToMask(offset)) != 0;
   }
 
   bool HasAddress(const void* addr) const;
@@ -110,11 +106,13 @@
     }
   }
 
-  template <typename Visitor>
-  void VisitMarkedRange(uintptr_t visit_begin, uintptr_t visit_end, const Visitor& visitor) const
+  template <typename Visitor, typename FingerVisitor>
+  void VisitMarkedRange(uintptr_t visit_begin, uintptr_t visit_end,
+                        const Visitor& visitor, const FingerVisitor& finger_visitor) const
       EXCLUSIVE_LOCKS_REQUIRED(GlobalSynchronization::heap_bitmap_lock_) {
     DCHECK_LT(visit_begin, visit_end);
 
+    const size_t word_span = kAlignment * kBitsPerWord; // Equals IndexToOffset(1).
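+    // The finger visitor is handed the first address past the word about to be scanned, i.e. an
+    // address none of whose objects have been visited yet.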
     const size_t bit_index_start = (visit_begin - heap_begin_) / kAlignment;
     const size_t bit_index_end = (visit_end - heap_begin_ - 1) / kAlignment;
 
@@ -134,6 +132,7 @@
     // If word_start == word_end then handle this case at the same place we handle the right edge.
     if (edge_word != 0 && word_start < word_end) {
       uintptr_t ptr_base = IndexToOffset(word_start) + heap_begin_;
+      finger_visitor(reinterpret_cast<void*>(ptr_base + word_span));
       do {
         const size_t shift = CLZ(edge_word);
         Object* obj = reinterpret_cast<Object*>(ptr_base + shift * kAlignment);
@@ -147,6 +146,7 @@
       size_t w = bitmap_begin_[i];
       if (w != 0) {
         uintptr_t ptr_base = IndexToOffset(i) + heap_begin_;
+        finger_visitor(reinterpret_cast<void*>(ptr_base + word_span));
         do {
           const size_t shift = CLZ(w);
           Object* obj = reinterpret_cast<Object*>(ptr_base + shift * kAlignment);
@@ -165,9 +165,9 @@
     }
 
     // Bits that we trim off the right.
-    const size_t trim_bits = kBitsPerWord - 1 - right_bits;
-    edge_word &= ~((1 << trim_bits) - 1);
+    edge_word &= ~((static_cast<size_t>(kWordHighBitMask) >> right_bits) - 1);
     uintptr_t ptr_base = IndexToOffset(word_end) + heap_begin_;
+    finger_visitor(reinterpret_cast<void*>(ptr_base + word_span));
     while (edge_word != 0) {
       const size_t shift = CLZ(edge_word);
       Object* obj = reinterpret_cast<Object*>(ptr_base + shift * kAlignment);
@@ -176,18 +176,20 @@
     }
   }
 
-  void Walk(Callback* callback, void* arg);
+  void Walk(Callback* callback, void* arg)
+      SHARED_LOCKS_REQUIRED(GlobalSynchronization::heap_bitmap_lock_);
 
   void InOrderWalk(Callback* callback, void* arg)
+      SHARED_LOCKS_REQUIRED(GlobalSynchronization::heap_bitmap_lock_)
       SHARED_LOCKS_REQUIRED(GlobalSynchronization::mutator_lock_);
 
-  void ScanWalk(uintptr_t base, uintptr_t max, ScanCallback* thunk, void* arg);
-
   static void SweepWalk(const SpaceBitmap& live,
                         const SpaceBitmap& mark,
                         uintptr_t base, uintptr_t max,
                         SweepCallback* thunk, void* arg);
 
+  void CopyFrom(SpaceBitmap* source_bitmap);
+
   // Starting address of our internal storage.
   word* Begin() {
     return bitmap_begin_;
@@ -215,12 +217,15 @@
   // Set the max address which can covered by the bitmap.
   void SetHeapLimit(uintptr_t new_end);
 
+  std::string GetName() const;
+  void SetName(const std::string& name);
+
  private:
   // TODO: heap_end_ is initialized so that the heap bitmap is empty, this doesn't require the -1,
   // however, we document that this is expected on heap_end_
   SpaceBitmap(const std::string& name, MemMap* mem_map, word* bitmap_begin, size_t bitmap_size, const void* heap_begin)
       : mem_map_(mem_map), bitmap_begin_(bitmap_begin), bitmap_size_(bitmap_size),
-        heap_begin_(reinterpret_cast<uintptr_t>(heap_begin)), heap_end_(heap_begin_ - 1),
+        heap_begin_(reinterpret_cast<uintptr_t>(heap_begin)),
         name_(name) {}
 
   inline void Modify(const Object* obj, bool do_set) {
@@ -231,9 +236,6 @@
     const word mask = OffsetToMask(offset);
     DCHECK_LT(index, bitmap_size_ / kWordSize) << " bitmap_size_ = " << bitmap_size_;
     if (do_set) {
-      if (addr > heap_end_) {
-        heap_end_ = addr;
-      }
       bitmap_begin_[index] |= mask;
     } else {
       bitmap_begin_[index] &= ~mask;
@@ -253,15 +255,12 @@
   // bitmap.
   const uintptr_t heap_begin_;
 
-  // The highest pointer value ever returned by an allocation from
-  // this heap.  I.e., the highest address that may correspond to a
-  // set bit.  If there are no bits set, (heap_end_ < heap_begin_).
-  uintptr_t heap_end_;
-
   // Name of this bitmap.
   std::string name_;
 };
 
+std::ostream& operator << (std::ostream& stream, const SpaceBitmap& bitmap);
+
 }  // namespace art
 
 #endif  // ART_SRC_SPACE_BITMAP_H_
diff --git a/src/space_test.cc b/src/space_test.cc
index c1c1dca..2dd8dc1 100644
--- a/src/space_test.cc
+++ b/src/space_test.cc
@@ -78,7 +78,7 @@
     AllocSpace* space(Space::CreateAllocSpace("test", 4 * MB, 16 * MB, 16 * MB, NULL));
     ASSERT_TRUE(space != NULL);
 
-  // Make space findable to the heap, will also delete space when runtime is cleaned up
+    // Make space findable to the heap, will also delete space when runtime is cleaned up
     Runtime::Current()->GetHeap()->AddSpace(space);
 
     // Succeeds, fits without adjusting the footprint limit.
diff --git a/src/thread.h b/src/thread.h
index 48278d8..efa92f4 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -450,9 +450,9 @@
       SHARED_LOCKS_REQUIRED(GlobalSynchronization::mutator_lock_);
 
 #if VERIFY_OBJECT_ENABLED
-  void VerifyStack();
+  void VerifyStack() SHARED_LOCKS_REQUIRED(GlobalSynchronization::mutator_lock_);
 #else
-  void VerifyStack() {}
+  void VerifyStack() SHARED_LOCKS_REQUIRED(GlobalSynchronization::mutator_lock_) {}
 #endif
 
   //
diff --git a/src/timing_logger.h b/src/timing_logger.h
index 6043d5a..0f55798 100644
--- a/src/timing_logger.h
+++ b/src/timing_logger.h
@@ -29,7 +29,8 @@
 
 class TimingLogger {
  public:
-  explicit TimingLogger(const char* name) : name_(name) {
+  explicit TimingLogger(const char* name, bool precise = false)
+      : name_(name), precise_(precise) {
     AddSplit("");
   }
 
@@ -45,7 +46,13 @@
   void Dump(std::ostream& os) const {
     os << name_ << ": begin\n";
     for (size_t i = 1; i < times_.size(); ++i) {
-      os << name_ << StringPrintf(": %8lld ms, ", NsToMs(times_[i] - times_[i-1])) << labels_[i] << "\n";
+      if (precise_) {
+        os << name_ << ": " << std::setw(12) << PrettyDuration(times_[i] - times_[i-1]) << " "
+           << labels_[i] << "\n";
+      } else {
+        os << name_ << StringPrintf(": %8lld ms, ", NsToMs(times_[i] - times_[i-1])) << labels_[i]
+           << "\n";
+      }
     }
     os << name_ << ": end, " << NsToMs(GetTotalNs()) << " ms\n";
   }
@@ -56,6 +63,7 @@
 
  private:
   std::string name_;
+  bool precise_;
   std::vector<uint64_t> times_;
   std::vector<std::string> labels_;
 };
diff --git a/src/utils.h b/src/utils.h
index 851c6b1..2846dad 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -314,6 +314,12 @@
 bool IsValidDexFilename(const std::string& filename);
 bool IsValidOatFilename(const std::string& filename);
 
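+// No-op functor used where a visitor parameter (e.g. the finger visitor in card and bitmap
+// scans) is required but nothing needs to be done.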
+class IdentityFunctor {
+ public:
+  template <typename T>
+  inline T operator () (T t) const { return t; }
+};
+
 }  // namespace art
 
 #endif  // ART_SRC_UTILS_H_