Remove first GC pause.

Fixed an error where we were prefetching the addresses of the mark
stack slots (Object**) instead of the objects they point to.
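
In essence (simplified from the mark stack task in mark_sweep.cc
below, where data_ is an array of Object*):

  // Before: prefetched the stack slot itself, an Object**.
  __builtin_prefetch(&data_[index + prefetch_look_ahead]);
  // After: prefetch the object the slot points to.
  __builtin_prefetch(data_[std::min(index + prefetch_look_ahead, length_ - 1)]);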

Removed the first pause by removing GetDirtyCards and replacing it
with card aging. We now age the cards before doing the checkpoint
instead of clearing them. This lets us distinguish the cards which
were dirtied before the start of the GC from the cards which were
dirtied after.
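
Concretely, for a sticky GC (abridged from heap.cc below):

  // Age the cards: kCardDirty cards age to kCardDirty - 1, anything older clears to 0.
  card_table_->ModifyCardsAtomic(space->Begin(), space->End(),
                                 AgeCardVisitor(), VoidFunctor());
  ...
  // Scan every card that was dirty before the GC started, i.e. aged to kCardDirty - 1.
  mark_sweep.RecursiveMarkDirtyObjects(CardTable::kCardDirty - 1);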

Optimized DlMallocSpace::FreeList slightly: prefetch the dlmalloc
chunk headers ahead of use, sum the freed bytes outside of the lock,
and use a non-virtual InternalAllocationSize.
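
The prefetch works because dlmalloc keeps each chunk header
sizeof(size_t) bytes before the allocation (from space.cc below):

  const size_t look_ahead = 8;
  if (kPrefetchDuringDlMallocFreeList && i + look_ahead < num_ptrs) {
    __builtin_prefetch(reinterpret_cast<char*>(ptrs[i + look_ahead]) - sizeof(size_t));
  }
  bytes_freed += InternalAllocationSize(ptrs[i]);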

Change-Id: I39d6aac1839476d7541d83970c8b27b266e8a117
diff --git a/src/gc/card_table.cc b/src/gc/card_table.cc
index a5531d8..1e978b5 100644
--- a/src/gc/card_table.cc
+++ b/src/gc/card_table.cc
@@ -96,25 +96,6 @@
   memset(mem_map_->Begin(), kCardClean, mem_map_->Size());
 }
 
-void CardTable::PreClearCards(ContinuousSpace* space, std::vector<byte*>& out_cards) {
-  byte* card_end = CardFromAddr(space->End());
-  for (byte* card_cur = CardFromAddr(space->Begin()); card_cur < card_end; ++card_cur) {
-    if (*card_cur == kCardDirty) {
-      out_cards.push_back(card_cur);
-      *card_cur = kCardClean;
-    }
-  }
-}
-
-void CardTable::GetDirtyCards(ContinuousSpace* space, std::vector<byte*>& out_cards) const {
-  byte* card_end = CardFromAddr(space->End());
-  for (byte* card_cur = CardFromAddr(space->Begin());card_cur < card_end; ++card_cur) {
-    if (*card_cur == kCardDirty) {
-      out_cards.push_back(card_cur);
-    }
-  }
-}
-
 bool CardTable::AddrIsInCardTable(const void* addr) const {
   return IsValidCard(biased_begin_ + ((uintptr_t)addr >> kCardShift));
 }
diff --git a/src/gc/card_table.h b/src/gc/card_table.h
index 035c073..2a20fc8 100644
--- a/src/gc/card_table.h
+++ b/src/gc/card_table.h
@@ -22,6 +22,7 @@
 #include "mem_map.h"
 #include "space_bitmap.h"
 #include "UniquePtr.h"
+#include "utils.h"
 
 namespace art {
 
@@ -72,32 +73,152 @@
     return biased_begin_;
   }
 
-  // For every dirty card between begin and end invoke the visitor with the specified argument.
+  /*
+   * Atomically updates every card in [scan_begin, scan_end): the visitor maps each card value to
+   * a new value, and whenever a card actually changes the modified visitor is called.
+   * visitor: The visitor which modifies the cards. Returns the new value for a card given its
+   * old value.
+   * modified: Called as modified(card_address, old_value, new_value) whenever the visitor
+   * changes a card. Enables us to know, for example, which cards got cleared.
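+   *
+   * Example (as used from Heap::ProcessCards below):
+   *   ModifyCardsAtomic(space->Begin(), space->End(), AgeCardVisitor(), VoidFunctor());
+   * ages every card in a space without tracking which cards changed.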
+   */
+  template <typename Visitor, typename ModifiedVisitor>
+  void ModifyCardsAtomic(byte* scan_begin, byte* scan_end, const Visitor& visitor,
+                         const ModifiedVisitor& modified = VoidFunctor()) {
+    byte* card_cur = CardFromAddr(scan_begin);
+    byte* card_end = CardFromAddr(scan_end);
+    CheckCardValid(card_cur);
+    CheckCardValid(card_end);
+
+    // Handle any unaligned cards at the start.
+    while (!IsAligned<sizeof(word)>(card_cur) && card_cur < card_end) {
+      byte expected, new_value;
+      do {
+        expected = *card_cur;
+        new_value = visitor(expected);
+      } while (expected != new_value && UNLIKELY(byte_cas(expected, new_value, card_cur) != 0));
+      if (expected != new_value) {
+        modified(card_cur, expected, new_value);
+      }
+      ++card_cur;
+    }
+
+    // Handle unaligned cards at the end.
+    while (!IsAligned<sizeof(word)>(card_end) && card_end > card_cur) {
+      --card_end;
+      byte expected, new_value;
+      do {
+        expected = *card_end;
+        new_value = visitor(expected);
+      } while (expected != new_value && UNLIKELY(byte_cas(expected, new_value, card_end) != 0));
+      if (expected != new_value) {
+        modified(card_end, expected, new_value);
+      }
+    }
+
+    // The remaining cards are word-aligned; process a word's worth of cards at a time.
+    uintptr_t* word_cur = reinterpret_cast<uintptr_t*>(card_cur);
+    uintptr_t* word_end = reinterpret_cast<uintptr_t*>(card_end);
+    uintptr_t expected_word;
+    uintptr_t new_word;
+
+    // TODO: Parallelize.
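+    // Note: this loop and byte_cas assume sizeof(uintptr_t) == 4, since android_atomic_cas
+    // operates on int32_t words.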
+    while (word_cur < word_end) {
+      while ((expected_word = *word_cur) != 0) {
+        new_word =
+            (visitor((expected_word >> 0) & 0xFF) << 0) |
+            (visitor((expected_word >> 8) & 0xFF) << 8) |
+            (visitor((expected_word >> 16) & 0xFF) << 16) |
+            (visitor((expected_word >> 24) & 0xFF) << 24);
+        if (new_word == expected_word) {
+          // No need to do a cas.
+          break;
+        }
+        if (LIKELY(android_atomic_cas(expected_word, new_word,
+                                      reinterpret_cast<int32_t*>(word_cur)) == 0)) {
+          for (size_t i = 0; i < sizeof(uintptr_t); ++i) {
+            const byte expected_byte = (expected_word >> (8 * i)) & 0xFF;
+            const byte new_byte = (new_word >> (8 * i)) & 0xFF;
+            if (expected_byte != new_byte) {
+              modified(reinterpret_cast<byte*>(word_cur) + i, expected_byte, new_byte);
+            }
+          }
+          break;
+        }
+      }
+      ++word_cur;
+    }
+  }
+
+  // For every card with a value of at least minimum_age between scan_begin and scan_end, visit
+  // the marked objects on that card with the given visitor.
   template <typename Visitor, typename FingerVisitor>
   void Scan(SpaceBitmap* bitmap, byte* scan_begin, byte* scan_end,
-            const Visitor& visitor, const FingerVisitor& finger_visitor) const
+            const Visitor& visitor, const FingerVisitor& finger_visitor,
+            const byte minimum_age = kCardDirty) const
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     DCHECK(bitmap->HasAddress(scan_begin));
     DCHECK(bitmap->HasAddress(scan_end - 1));  // scan_end is the byte after the last byte we scan.
     byte* card_cur = CardFromAddr(scan_begin);
     byte* card_end = CardFromAddr(scan_end);
-    while (card_cur < card_end) {
+    CheckCardValid(card_cur);
+    CheckCardValid(card_end);
+
+    // Handle any unaligned cards at the start.
+    while (!IsAligned<sizeof(word)>(card_cur) && card_cur < card_end) {
+      if (*card_cur >= minimum_age) {
+        uintptr_t start = reinterpret_cast<uintptr_t>(AddrFromCard(card_cur));
+        uintptr_t end = start + kCardSize;
+        bitmap->VisitMarkedRange(start, end, visitor, finger_visitor);
+      }
+      ++card_cur;
+    }
+
+    byte* aligned_end = card_end -
+        (reinterpret_cast<uintptr_t>(card_end) & (sizeof(uintptr_t) - 1));
+
+    // The range [card_cur, aligned_end) is word-aligned; scan a word's worth of cards at a time.
+    uintptr_t* word_cur = reinterpret_cast<uintptr_t*>(card_cur);
+    uintptr_t* word_end = reinterpret_cast<uintptr_t*>(aligned_end);
+
+    // TODO: Parallelize
+    while (word_cur < word_end) {
       // Find the first dirty card.
-      card_cur = reinterpret_cast<byte*>(memchr(card_cur, kCardDirty, card_end - card_cur));
-      if (card_cur == NULL) {
+      while (word_cur < word_end && *word_cur == 0) {
+        word_cur++;
+      }
+      if (word_cur >= word_end) {
         break;
       }
-      byte* run_start = card_cur++;
-
-      while (*card_cur == kCardDirty && card_cur < card_end) {
-        card_cur++;
+      uintptr_t start_word = *word_cur;
+      for (size_t i = 0; i < sizeof(uintptr_t); ++i) {
+        if ((start_word & 0xFF) >= minimum_age) {
+          byte* card = reinterpret_cast<byte*>(word_cur) + i;
+          DCHECK_GE(*card, start_word & 0xFF);  // A mutator may have re-dirtied the card.
+          uintptr_t start = reinterpret_cast<uintptr_t>(AddrFromCard(card));
+          uintptr_t end = start + kCardSize;
+          bitmap->VisitMarkedRange(start, end, visitor, finger_visitor);
+        }
+        start_word >>= 8;
       }
-      byte* run_end = card_cur;
+      ++word_cur;
+    }
 
-      uintptr_t start = reinterpret_cast<uintptr_t>(AddrFromCard(run_start));
-      uintptr_t end = reinterpret_cast<uintptr_t>(AddrFromCard(run_end));
-      bitmap->VisitMarkedRange(start, end, visitor, finger_visitor);
+    // Handle any unaligned cards at the end.
+    card_cur = reinterpret_cast<byte*>(word_end);
+    while (card_cur < card_end) {
+      if (*card_cur >= minimum_age) {
+        uintptr_t start = reinterpret_cast<uintptr_t>(AddrFromCard(card_cur));
+        uintptr_t end = start + kCardSize;
+        bitmap->VisitMarkedRange(start, end, visitor, finger_visitor);
+      }
+      ++card_cur;
     }
   }
 
@@ -110,12 +225,6 @@
   // Resets all of the bytes in the card table which do not map to the image space.
   void ClearSpaceCards(ContinuousSpace* space);
 
-  // Clean all the cards which map to a space.
-  void PreClearCards(ContinuousSpace* space, std::vector<byte*>& out_cards);
-
-  // Returns all of the dirty cards which map to a space.
-  void GetDirtyCards(ContinuousSpace* space, std::vector<byte*>& out_cards) const;
-
   // Returns the first address in the heap which maps to this card.
   void* AddrFromCard(const byte *card_addr) const {
     DCHECK(IsValidCard(card_addr))
@@ -138,6 +247,19 @@
   bool AddrIsInCardTable(const void* addr) const;
 
  private:
+  static int byte_cas(byte old_value, byte new_value, byte* address) {
+    // On little endian the byte at the lowest address is the least significant byte of the word.
+    const size_t shift = (reinterpret_cast<uintptr_t>(address) % sizeof(uintptr_t)) * 8;
+    // Align the address down to the containing word.
+    address -= shift / 8;
+    int32_t* word_address = reinterpret_cast<int32_t*>(address);
+    // Word with the byte we are trying to cas cleared.
+    const int32_t cur_word = *word_address & ~(0xFF << shift);
+    const int32_t old_word = cur_word | (static_cast<int32_t>(old_value) << shift);
+    const int32_t new_word = cur_word | (static_cast<int32_t>(new_value) << shift);
+    return android_atomic_cas(old_word, new_word, word_address);
+  }
+
   CardTable(MemMap* begin, byte* biased_begin, size_t offset);
 
   // Returns true iff the card table address is within the bounds of the card table.
@@ -147,6 +269,13 @@
     return card_addr >= begin && card_addr < end;
   }
 
+  void CheckCardValid(byte* card) const {
+    DCHECK(IsValidCard(card))
+        << " card_addr: " << reinterpret_cast<const void*>(card)
+        << " begin: " << reinterpret_cast<void*>(mem_map_->Begin() + offset_)
+        << " end: " << reinterpret_cast<void*>(mem_map_->End());
+  }
+
   // Verifies that all gray objects are on a dirty card.
   void VerifyCardTable();
 
diff --git a/src/gc/heap_bitmap.h b/src/gc/heap_bitmap.h
index 666fcc7..42c4166 100644
--- a/src/gc/heap_bitmap.h
+++ b/src/gc/heap_bitmap.h
@@ -73,8 +73,7 @@
       // TODO: C++0x auto
       for (Bitmaps::iterator it = bitmaps_.begin(); it != bitmaps_.end(); ++it) {
         SpaceBitmap* bitmap = *it;
-        bitmap->VisitMarkedRange(bitmap->HeapBegin(), bitmap->HeapLimit(), visitor,
-                                 IdentityFunctor());
+        bitmap->VisitMarkedRange(bitmap->HeapBegin(), bitmap->HeapLimit(), visitor, VoidFunctor());
       }
       large_objects_->Visit(visitor);
     }
diff --git a/src/gc/mark_sweep.cc b/src/gc/mark_sweep.cc
index 4662cf6..f2617e4 100644
--- a/src/gc/mark_sweep.cc
+++ b/src/gc/mark_sweep.cc
@@ -51,6 +51,7 @@
 static const bool kProfileLargeObjects = false;
 static const bool kMeasureOverhead = false;
 static const bool kCountTasks = false;
+static const bool kCountJavaLangRefs = false;
 
 class SetFingerVisitor {
  public:
@@ -83,6 +84,7 @@
       large_object_test_(0), large_object_mark_(0),
       classes_marked_(0), overhead_time_(0),
       work_chunks_created_(0), work_chunks_deleted_(0),
+      reference_count_(0),
       gc_barrier_(new Barrier),
       large_object_lock_("large object lock") {
   DCHECK(mark_stack_ != NULL);
@@ -272,8 +274,7 @@
   }
 
   void operator ()(const Object* obj, const Object* ref, MemberOffset offset, bool is_static) const
-      SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_,
-                            Locks::mutator_lock_) {
+      NO_THREAD_SAFETY_ANALYSIS {
     mark_sweep_->CheckReference(obj, ref, offset, is_static);
   }
 
@@ -327,7 +328,7 @@
   MarkSweep* const mark_sweep_;
 };
 
-void MarkSweep::ScanGrayObjects() {
+void MarkSweep::ScanGrayObjects(byte minimum_age) {
   const Spaces& spaces = heap_->GetSpaces();
   CardTable* card_table = heap_->GetCardTable();
   ScanObjectVisitor visitor(this);
@@ -339,7 +340,7 @@
     byte* end = space->End();
     // Image spaces are handled properly since live == marked for them.
     SpaceBitmap* mark_bitmap = space->GetMarkBitmap();
-    card_table->Scan(mark_bitmap, begin, end, visitor, IdentityFunctor());
+    card_table->Scan(mark_bitmap, begin, end, visitor, VoidFunctor(), minimum_age);
   }
 }
 
@@ -373,7 +374,7 @@
       uintptr_t end = reinterpret_cast<uintptr_t>(space->End());
       SpaceBitmap* live_bitmap = space->GetLiveBitmap();
       DCHECK(live_bitmap != NULL);
-      live_bitmap->VisitMarkedRange(begin, end, visitor, IdentityFunctor());
+      live_bitmap->VisitMarkedRange(begin, end, visitor, VoidFunctor());
     }
   }
 }
@@ -436,7 +437,7 @@
       }
     }
     if (kDisableFinger) {
-      active_bitmap->VisitMarkedRange(begin, end, image_root_visitor, IdentityFunctor());
+      active_bitmap->VisitMarkedRange(begin, end, image_root_visitor, VoidFunctor());
     } else {
       active_bitmap->VisitMarkedRange(begin, end, image_root_visitor, finger_visitor);
     }
@@ -452,8 +453,8 @@
       !reinterpret_cast<MarkSweep*>(arg)->GetHeap()->GetLiveBitmap()->Test(object);
 }
 
-void MarkSweep::RecursiveMarkDirtyObjects() {
-  ScanGrayObjects();
+void MarkSweep::RecursiveMarkDirtyObjects(byte minimum_age) {
+  ScanGrayObjects(minimum_age);
   ProcessMarkStack();
 }
 
@@ -569,8 +570,8 @@
   virtual void Run(Thread* thread) NO_THREAD_SAFETY_ANALYSIS {
     // Note: self is not necessarily equal to thread since thread may be suspended.
     Thread* self = Thread::Current();
-    DCHECK(thread == self || thread->IsSuspended() || thread->GetState() == kWaitingPerformingGc)
-        << thread->GetState();
+    CHECK(thread == self || thread->IsSuspended() || thread->GetState() == kWaitingPerformingGc)
+        << thread->GetState() << " thread " << thread << " self " << self;
     WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
     thread->VisitRoots(MarkSweep::MarkObjectCallback, mark_sweep_);
     mark_sweep_->GetBarrier().Pass(self);
@@ -585,12 +586,12 @@
 }
 
 void MarkSweep::MarkRootsCheckpoint() {
-  UniquePtr<CheckpointMarkThreadRoots> check_point(new CheckpointMarkThreadRoots(this));
+  CheckpointMarkThreadRoots check_point(this);
   ThreadList* thread_list = Runtime::Current()->GetThreadList();
   // Increment the count of the barrier. If all of the checkpoints have already been finished then
   // will hit 0 and continue. Otherwise we are still waiting for some checkpoints, so the counter
   // will go positive and we will unblock when it hits zero.
-  gc_barrier_->Increment(Thread::Current(), thread_list->RunCheckpoint(check_point.get()));
+  gc_barrier_->Increment(Thread::Current(), thread_list->RunCheckpoint(&check_point));
 }
 
 void MarkSweep::SweepCallback(size_t num_ptrs, Object** ptrs, void* arg) {
@@ -675,7 +676,7 @@
   freed_bytes_ += freed_bytes;
   logger.AddSplit("FreeList");
   allocations->Reset();
-  logger.AddSplit("Reset stack");
+  logger.AddSplit("ResetStack");
 }
 
 void MarkSweep::Sweep(TimingLogger& timings, bool partial, bool swap_bitmaps) {
@@ -799,6 +800,9 @@
   DCHECK(klass->IsReferenceClass());
   Object* pending = obj->GetFieldObject<Object*>(heap_->GetReferencePendingNextOffset(), false);
   Object* referent = heap_->GetReferenceReferent(obj);
+  if (kCountJavaLangRefs) {
+    ++reference_count_;
+  }
   if (pending == NULL && referent != NULL && !IsMarked(referent)) {
     Object** list = NULL;
     if (klass->IsSoftReferenceClass()) {
@@ -826,9 +830,7 @@
   }
 
   void operator ()(const Object* /* obj */, const Object* ref, const MemberOffset& /* offset */,
-                   bool /* is_static */) const
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+                   bool /* is_static */) const EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_) {
     mark_sweep_->MarkObject(ref);
   }
 
@@ -963,13 +965,9 @@
   virtual void Run(Thread* self) {
     int index;
     while ((index = index_++) < length_) {
-      static const size_t prefetch_look_ahead = 4;
       if (kUseMarkStackPrefetch) {
-        if (index + prefetch_look_ahead < length_) {
-          __builtin_prefetch(&data_[index + prefetch_look_ahead]);
-        } else {
-          __builtin_prefetch(&data_[length_ - 1]);
-        }
+        static const size_t prefetch_look_ahead = 1;
+        __builtin_prefetch(data_[std::min(index + prefetch_look_ahead, length_ - 1)]);
       }
       Object* obj = data_[index];
       DCHECK(obj != NULL);
@@ -1203,25 +1201,29 @@
 }
 
 MarkSweep::~MarkSweep() {
-  if (class_count_ != 0 || array_count_ != 0 || other_count_ != 0) {
-    LOG(INFO) << "MarkSweep scanned classes=" << class_count_ << " arrays=" << array_count_
-               << " other=" << other_count_;
+  if (kCountScannedTypes) {
+    VLOG(gc) << "MarkSweep scanned classes=" << class_count_ << " arrays=" << array_count_
+             << " other=" << other_count_;
   }
 
   if (kCountTasks) {
-    LOG(INFO) << "Total number of work chunks allocated: " << work_chunks_created_;
+    VLOG(gc) << "Total number of work chunks allocated: " << work_chunks_created_;
   }
 
   if (kMeasureOverhead) {
-    LOG(INFO) << "Overhead time " << PrettyDuration(overhead_time_);
+    VLOG(gc) << "Overhead time " << PrettyDuration(overhead_time_);
   }
 
   if (kProfileLargeObjects) {
-    LOG(INFO) << "Large objects tested " << large_object_test_ << " marked " << large_object_mark_;
+    VLOG(gc) << "Large objects tested " << large_object_test_ << " marked " << large_object_mark_;
   }
 
   if (kCountClassesMarked) {
-    LOG(INFO) << "Classes marked " << classes_marked_;
+    VLOG(gc) << "Classes marked " << classes_marked_;
+  }
+
+  if (kCountJavaLangRefs) {
+    VLOG(gc) << "References scanned " << reference_count_;
   }
 
   // Ensure that the mark stack is empty.
diff --git a/src/gc/mark_sweep.h b/src/gc/mark_sweep.h
index f510943..2297ce1 100644
--- a/src/gc/mark_sweep.h
+++ b/src/gc/mark_sweep.h
@@ -82,7 +82,7 @@
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
   // Builds a mark stack with objects on dirty cards and recursively mark until it empties.
-  void RecursiveMarkDirtyObjects()
+  void RecursiveMarkDirtyObjects(byte minimum_age = CardTable::kCardDirty)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
@@ -165,7 +165,7 @@
         ++other_count_;
       }
       VisitOtherReferences(klass, obj, visitor);
-      if (klass->IsReferenceClass()) {
+      if (UNLIKELY(klass->IsReferenceClass())) {
         DelayReferenceReferent(const_cast<Object*>(obj));
       }
     }
@@ -329,9 +329,8 @@
   template <typename Visitor>
   static void VisitFieldsReferences(const Object* obj, uint32_t ref_offsets, bool is_static,
                              const Visitor& visitor)
-      SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_,
-                            Locks::mutator_lock_) {
-    if (ref_offsets != CLASS_WALK_SUPER) {
+      SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_) {
+    if (LIKELY(ref_offsets != CLASS_WALK_SUPER)) {
       // Found a reference offset bitmap.  Mark the specified offsets.
       while (ref_offsets != 0) {
         size_t right_shift = CLZ(ref_offsets);
@@ -386,8 +385,9 @@
   }
 
   // Blackens objects grayed during a garbage collection.
-  void ScanGrayObjects()
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
+  void ScanGrayObjects(byte minimum_age)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Schedules an unmarked object for reference processing.
   void DelayReferenceReferent(Object* reference)
@@ -459,6 +459,7 @@
   AtomicInteger overhead_time_;
   AtomicInteger work_chunks_created_;
   AtomicInteger work_chunks_deleted_;
+  AtomicInteger reference_count_;
 
   UniquePtr<Barrier> gc_barrier_;
   Mutex large_object_lock_;
diff --git a/src/gc/mod_union_table.cc b/src/gc/mod_union_table.cc
index 967f795..5dd61e7 100644
--- a/src/gc/mod_union_table.cc
+++ b/src/gc/mod_union_table.cc
@@ -56,7 +56,7 @@
       bitmap_(bitmap) {
   }
 
-  void operator ()(Object* obj) const
+  void operator ()(const Object* obj) const
       SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_,
                             Locks::mutator_lock_) {
     DCHECK(obj != NULL);
@@ -76,9 +76,12 @@
     : cleared_cards_(cleared_cards) {
   }
 
-  inline void operator ()(byte* card) const {
-    cleared_cards_->insert(card);
+  inline void operator ()(byte* card, byte expected_value, byte new_value) const {
+    if (expected_value == CardTable::kCardDirty) {
+      cleared_cards_->insert(card);
+    }
   }
+
  private:
   std::set<byte*>* const cleared_cards_;
 };
@@ -89,8 +92,10 @@
     : cleared_cards_(cleared_cards) {
   }
 
-  void operator ()(byte* card) const {
-    cleared_cards_->push_back(card);
+  void operator ()(byte* card, byte expected_card, byte new_card) const {
+    if (expected_card == CardTable::kCardDirty) {
+      cleared_cards_->push_back(card);
+    }
   }
  private:
   std::vector<byte*>* cleared_cards_;
@@ -124,7 +129,7 @@
   CardTable* card_table = heap_->GetCardTable();
   ModUnionClearCardVisitor visitor(&cleared_cards_);
   // Clear dirty cards in the this image space and update the corresponding mod-union bits.
-  card_table->VisitClear(space->Begin(), space->End(), visitor);
+  card_table->ModifyCardsAtomic(space->Begin(), space->End(), AgeCardVisitor(), visitor);
 }
 
 void ModUnionTableBitmap::Update() {
@@ -145,7 +150,7 @@
     // At this point we need to update the mod-union bitmap to contain all the objects which reach
     // the alloc space.
     ModUnionVisitor visitor(heap_, bitmap);
-    space->GetLiveBitmap()->VisitMarkedRange(start, end, visitor, IdentityFunctor());
+    space->GetLiveBitmap()->VisitMarkedRange(start, end, visitor, VoidFunctor());
   }
 }
 
@@ -172,7 +177,7 @@
     const ContinuousSpace* space = it->first;
     uintptr_t begin = reinterpret_cast<uintptr_t>(space->Begin());
     uintptr_t end = reinterpret_cast<uintptr_t>(space->End());
-    it->second->VisitMarkedRange(begin, end, image_root_scanner, IdentityFunctor());
+    it->second->VisitMarkedRange(begin, end, image_root_scanner, VoidFunctor());
   }
 }
 
@@ -189,7 +194,7 @@
   CardTable* card_table = GetHeap()->GetCardTable();
   ModUnionClearCardSetVisitor visitor(&cleared_cards_);
   // Clear dirty cards in the this space and update the corresponding mod-union bits.
-  card_table->VisitClear(space->Begin(), space->End(), visitor);
+  card_table->ModifyCardsAtomic(space->Begin(), space->End(), AgeCardVisitor(), visitor);
 }
 
 class AddToReferenceArrayVisitor {
@@ -204,8 +209,8 @@
   // Extra parameters are required since we use this same visitor signature for checking objects.
   void operator ()(const Object* obj, const Object* ref, const MemberOffset& /* offset */,
                      bool /* is_static */) const {
-    // Only add the reference if it fits our criteria.
-    if (mod_union_table_->AddReference(obj, ref)) {
+    // Only add the reference if it is non null and fits our criteria.
+    if (ref != NULL && mod_union_table_->AddReference(obj, ref)) {
       references_->push_back(ref);
     }
   }
@@ -224,7 +229,7 @@
       references_(references) {
   }
 
-  void operator ()(Object* obj) const
+  void operator ()(const Object* obj) const
       SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_,
                             Locks::mutator_lock_) {
     DCHECK(obj != NULL);
@@ -253,9 +258,10 @@
   // Extra parameters are required since we use this same visitor signature for checking objects.
   void operator ()(const Object* obj, const Object* ref, const MemberOffset& /* offset */,
                      bool /* is_static */) const
-      SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_) {
+      SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_) {
     Heap* heap = mod_union_table_->GetHeap();
-    if (mod_union_table_->AddReference(obj, ref) && references_.find(ref) == references_.end()) {
+    if (ref != NULL && mod_union_table_->AddReference(obj, ref) &&
+        references_.find(ref) == references_.end()) {
       ContinuousSpace* from_space = heap->FindSpaceFromObject(obj);
       ContinuousSpace* to_space = heap->FindSpaceFromObject(ref);
       LOG(INFO) << "Object " << reinterpret_cast<const void*>(obj) << "(" << PrettyTypeOf(obj) << ")"
@@ -284,7 +290,7 @@
       references_(references) {
   }
 
-  void operator ()(Object* obj) const
+  void operator ()(const Object* obj) const
       SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_) {
     DCHECK(obj != NULL);
     CheckReferenceVisitor visitor(mod_union_table_, references_);
@@ -319,7 +325,7 @@
       uintptr_t end = start + CardTable::kCardSize;
       SpaceBitmap* live_bitmap =
               heap->FindSpaceFromObject(reinterpret_cast<Object*>(start))->GetLiveBitmap();
-      live_bitmap->VisitMarkedRange(start, end, visitor, IdentityFunctor());
+      live_bitmap->VisitMarkedRange(start, end, visitor, VoidFunctor());
     }
   }
 }
@@ -339,7 +345,7 @@
     uintptr_t end = start + CardTable::kCardSize;
     SpaceBitmap* live_bitmap =
         heap->FindSpaceFromObject(reinterpret_cast<Object*>(start))->GetLiveBitmap();
-    live_bitmap->VisitMarkedRange(start, end, visitor, IdentityFunctor());
+    live_bitmap->VisitMarkedRange(start, end, visitor, VoidFunctor());
 
     // Update the corresponding references for the card.
     // TODO: C++0x auto
@@ -383,7 +389,7 @@
   CardTable* card_table = GetHeap()->GetCardTable();
   ModUnionClearCardSetVisitor visitor(&cleared_cards_);
   // Clear dirty cards in the this space and update the corresponding mod-union bits.
-  card_table->VisitClear(space->Begin(), space->End(), visitor);
+  card_table->ModifyCardsAtomic(space->Begin(), space->End(), AgeCardVisitor(), visitor);
 }
 
 // Mark all references to the alloc space(s).
@@ -396,7 +402,7 @@
     uintptr_t end = start + CardTable::kCardSize;
     SpaceBitmap* live_bitmap =
         heap_->FindSpaceFromObject(reinterpret_cast<Object*>(start))->GetLiveBitmap();
-    live_bitmap->VisitMarkedRange(start, end, visitor, IdentityFunctor());
+    live_bitmap->VisitMarkedRange(start, end, visitor, VoidFunctor());
   }
 }
 
diff --git a/src/gc/mod_union_table.h b/src/gc/mod_union_table.h
index 5e4733a..84592a4 100644
--- a/src/gc/mod_union_table.h
+++ b/src/gc/mod_union_table.h
@@ -185,11 +185,9 @@
         return (*it)->GetGcRetentionPolicy() == kGcRetentionPolicyAlwaysCollect;
       }
     }
-    if (ref != NULL) {
-      Implementation::GetHeap()->DumpSpaces();
-      LOG(FATAL) << "Reference " << ref << " not in any space!";
-    }
-    return false;
+    // Assume it points to a large object.
+    // TODO: Check.
+    return true;
   }
 };
 
diff --git a/src/gc/space.cc b/src/gc/space.cc
index a7a5942..8595dd0 100644
--- a/src/gc/space.cc
+++ b/src/gc/space.cc
@@ -28,6 +28,8 @@
 
 namespace art {
 
+static const bool kPrefetchDuringDlMallocFreeList = true;
+
 // Magic padding value that we use to check for buffer overruns.
 static const word kPaddingValue = 0xBAC0BAC0;
 
@@ -308,7 +310,7 @@
         *reinterpret_cast<word*>(reinterpret_cast<byte*>(ptr) + AllocationSize(ptr) -
             sizeof(word) - kChunkOverhead), kPaddingValue);
   }
-  const size_t bytes_freed = AllocationSize(ptr);
+  const size_t bytes_freed = InternalAllocationSize(ptr);
   num_bytes_allocated_ -= bytes_freed;
   --num_objects_allocated_;
   mspace_free(mspace_, ptr);
@@ -316,15 +318,21 @@
 }
 
 size_t DlMallocSpace::FreeList(Thread* self, size_t num_ptrs, Object** ptrs) {
+  DCHECK(ptrs != NULL);
+
   // Don't need the lock to calculate the size of the freed pointers.
   size_t bytes_freed = 0;
   for (size_t i = 0; i < num_ptrs; i++) {
-    bytes_freed += AllocationSize(ptrs[i]);
+    Object* ptr = ptrs[i];
+    const size_t look_ahead = 8;
+    if (kPrefetchDuringDlMallocFreeList && i + look_ahead < num_ptrs) {
+      // The chunk header for the allocation is sizeof(size_t) behind the allocation.
+      __builtin_prefetch(reinterpret_cast<char*>(ptrs[i + look_ahead]) - sizeof(size_t));
+    }
+    bytes_freed += InternalAllocationSize(ptr);
   }
 
-  MutexLock mu(self, lock_);
   if (kDebugSpaces) {
-    CHECK(ptrs != NULL);
     size_t num_broken_ptrs = 0;
     for (size_t i = 0; i < num_ptrs; i++) {
       if (!Contains(ptrs[i])) {
@@ -337,10 +345,14 @@
     }
     CHECK_EQ(num_broken_ptrs, 0u);
   }
-  num_bytes_allocated_ -= bytes_freed;
-  num_objects_allocated_ -= num_ptrs;
-  mspace_bulk_free(mspace_, reinterpret_cast<void**>(ptrs), num_ptrs);
-  return bytes_freed;
+
+  {
+    MutexLock mu(self, lock_);
+    num_bytes_allocated_ -= bytes_freed;
+    num_objects_allocated_ -= num_ptrs;
+    mspace_bulk_free(mspace_, reinterpret_cast<void**>(ptrs), num_ptrs);
+    return bytes_freed;
+  }
 }
 
 // Callback from dlmalloc when it needs to increase the footprint
@@ -384,11 +396,16 @@
   return original_end;
 }
 
-size_t DlMallocSpace::AllocationSize(const Object* obj) {
+// Virtual calls can't be inlined, so route AllocationSize through this non-virtual helper.
+inline size_t DlMallocSpace::InternalAllocationSize(const Object* obj) {
   return mspace_usable_size(const_cast<void*>(reinterpret_cast<const void*>(obj))) +
       kChunkOverhead;
 }
 
+size_t DlMallocSpace::AllocationSize(const Object* obj) {
+  return InternalAllocationSize(obj);
+}
+
 void MspaceMadviseCallback(void* start, void* end, size_t used_bytes, void* /* arg */) {
   // Is this chunk in use?
   if (used_bytes != 0) {
diff --git a/src/gc/space.h b/src/gc/space.h
index 81b03fa..d816a10 100644
--- a/src/gc/space.h
+++ b/src/gc/space.h
@@ -29,11 +29,7 @@
 
 namespace art {
 
-#ifndef NDEBUG
-static const bool kDebugSpaces = true;
-#else
-static const bool kDebugSpaces = false;
-#endif
+static const bool kDebugSpaces = kIsDebugBuild;
 
 class DlMallocSpace;
 class ImageSpace;
@@ -357,6 +353,7 @@
   }
 
  private:
+  size_t InternalAllocationSize(const Object* obj);
   Object* AllocWithoutGrowthLocked(size_t num_bytes) EXCLUSIVE_LOCKS_REQUIRED(lock_);
 
   UniquePtr<SpaceBitmap> live_bitmap_;
diff --git a/src/heap.cc b/src/heap.cc
index b4cf4a9..2c52ff2 100644
--- a/src/heap.cc
+++ b/src/heap.cc
@@ -268,7 +268,7 @@
   // TODO: Count objects in the image space here.
   num_bytes_allocated_ = 0;
 
-  // Max stack size in bytes.
+  // Default mark stack size in bytes.
   static const size_t default_mark_stack_size = 64 * KB;
   mark_stack_.reset(ObjectStack::Create("dalvik-mark-stack", default_mark_stack_size));
   allocation_stack_.reset(ObjectStack::Create("dalvik-allocation-stack",
@@ -854,6 +854,8 @@
 
 void Heap::PreZygoteFork() {
   static Mutex zygote_creation_lock_("zygote creation lock", kZygoteCreationLock);
+  // Do this before acquiring the zygote creation lock so that we don't get lock order violations.
+  CollectGarbage(false);
   Thread* self = Thread::Current();
   MutexLock mu(self, zygote_creation_lock_);
 
@@ -973,6 +975,7 @@
   if (gc_type != kGcTypeSticky) {
     sticky_gc_count_ = 0;
   }
+
   if (concurrent_gc_) {
     CollectGarbageConcurrentMarkSweepPlan(self, gc_type, gc_cause, clear_soft_references);
   } else {
@@ -1024,17 +1027,8 @@
     // Swap allocation stack and live stack, enabling us to have new allocations during this GC.
     SwapStacks();
 
-    // We will need to know which cards were dirty for doing concurrent processing of dirty cards.
-    // TODO: Investigate using a mark stack instead of a vector.
-    std::vector<byte*> dirty_cards;
-    if (gc_type == kGcTypeSticky) {
-      for (Spaces::iterator it = spaces_.begin(); it != spaces_.end(); ++it) {
-        card_table_->GetDirtyCards(*it, dirty_cards);
-      }
-    }
-
-    // Clear image space cards and keep track of cards we cleared in the mod-union table.
-    ClearCards(timings);
+    // Process the cards: age alloc space cards and record cleared cards in the mod-union tables.
+    ProcessCards(timings);
 
     WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
     if (gc_type == kGcTypePartial) {
@@ -1088,7 +1082,9 @@
     if (gc_type != kGcTypeSticky) {
       mark_sweep.RecursiveMark(gc_type == kGcTypePartial, timings);
     } else {
-      mark_sweep.RecursiveMarkCards(card_table_.get(), dirty_cards, timings);
+      // Use kCardDirty - 1 since we want to scan all of the cards which we aged earlier in
+      // ProcessCards. These are the cards which were dirty before the GC started.
+      mark_sweep.RecursiveMarkDirtyObjects(CardTable::kCardDirty - 1);
     }
     mark_sweep.DisableFinger();
 
@@ -1261,7 +1257,7 @@
       ScanVisitor scan_visitor;
       byte* byte_cover_begin = reinterpret_cast<byte*>(card_table->AddrFromCard(card_addr));
       card_table->Scan(bitmap, byte_cover_begin, byte_cover_begin + CardTable::kCardSize,
-                       scan_visitor, IdentityFunctor());
+                       scan_visitor, VoidFunctor());
 
       // Try and see if a mark sweep collector scans the reference.
       ObjectStack* mark_stack = heap_->mark_stack_.get();
@@ -1492,9 +1488,7 @@
 }
 
 void Heap::SwapStacks() {
-  ObjectStack* temp = allocation_stack_.release();
-  allocation_stack_.reset(live_stack_.release());
-  live_stack_.reset(temp);
+  allocation_stack_.swap(live_stack_);
 
   // Sort the live stack so that we can quickly binary search it later.
   if (VERIFY_OBJECT_ENABLED) {
@@ -1502,7 +1496,7 @@
   }
 }
 
-void Heap::ClearCards(TimingLogger& timings) {
+void Heap::ProcessCards(TimingLogger& timings) {
   // Clear image space cards and keep track of cards we cleared in the mod-union table.
   for (Spaces::iterator it = spaces_.begin(); it != spaces_.end(); ++it) {
     ContinuousSpace* space = *it;
@@ -1513,8 +1507,10 @@
       zygote_mod_union_table_->ClearCards(space);
       timings.AddSplit("ZygoteModUnionClearCards");
     } else {
-      card_table_->ClearSpaceCards(space);
-      timings.AddSplit("ClearCards");
+      // No mod union table for the AllocSpace. Age the cards so that the GC knows that these cards
+      // were dirty before the GC started.
+      card_table_->ModifyCardsAtomic(space->Begin(), space->End(), AgeCardVisitor(), VoidFunctor());
+      timings.AddSplit("AllocSpaceClearCards");
     }
   }
 }
@@ -1522,12 +1518,11 @@
 void Heap::CollectGarbageConcurrentMarkSweepPlan(Thread* self, GcType gc_type, GcCause gc_cause,
                                                  bool clear_soft_references) {
   TimingLogger timings("ConcurrentCollectGarbageInternal", true);
-  uint64_t gc_begin = NanoTime(), root_begin = 0, root_end = 0, dirty_begin = 0, dirty_end = 0;
+  uint64_t gc_begin = NanoTime(), dirty_begin = 0, dirty_end = 0;
   std::stringstream gc_type_str;
   gc_type_str << gc_type << " ";
 
   ThreadList* thread_list = Runtime::Current()->GetThreadList();
-  std::vector<byte*> dirty_cards;
   size_t bytes_freed = 0;
   Object* cleared_references = NULL;
   {
@@ -1567,30 +1562,34 @@
       mark_sweep.FindDefaultMarkBitmap();
     }
 
-    mark_sweep.MarkRootsCheckpoint();
-    timings.AddSplit("MarkRootsCheckpoint");
-
-    {
-      root_begin = NanoTime();
-
-      // Suspend all threads are get exclusive access to the heap.
+    if (verify_pre_gc_heap_) {
       thread_list->SuspendAll();
-      timings.AddSplit("SuspendAll");
-      Locks::mutator_lock_->AssertExclusiveHeld(self);
-
-      if (verify_pre_gc_heap_) {
+      {
         WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
         if (!VerifyHeapReferences()) {
           LOG(FATAL) << "Pre " << gc_type_str.str() << "Gc verification failed";
         }
         timings.AddSplit("VerifyHeapReferencesPreGC");
       }
+      thread_list->ResumeAll();
+    }
 
-      // Swap the stacks, this is safe since all the mutators are suspended at this point.
-      SwapStacks();
+    // Process the cards: age alloc space cards and record cleared cards in the mod-union tables.
+    ProcessCards(timings);
 
-      // Check that all objects which reference things in the live stack are on dirty cards.
-      if (verify_missing_card_marks_) {
+    // Need to do this before the checkpoint since we don't want any threads to add references to
+    // the live stack during the recursive mark.
+    SwapStacks();
+    timings.AddSplit("SwapStacks");
+
+    // Tell the running threads to suspend and mark their roots.
+    mark_sweep.MarkRootsCheckpoint();
+    timings.AddSplit("MarkRootsCheckpoint");
+
+    // Check that all objects which reference things in the live stack are on dirty cards.
+    if (verify_missing_card_marks_) {
+      thread_list->SuspendAll();
+      {
         ReaderMutexLock mu(self, *Locks::heap_bitmap_lock_);
         // Sort the live stack so that we can quickly binary search it later.
         std::sort(live_stack_->Begin(), live_stack_->End());
@@ -1598,35 +1597,22 @@
           LOG(FATAL) << "Pre GC verification of missing card marks failed";
         }
       }
-
-      // We will need to know which cards were dirty for doing concurrent processing of dirty cards.
-      // TODO: Investigate using a mark stack instead of a vector.
-      if (gc_type == kGcTypeSticky) {
-        dirty_cards.reserve(4 * KB);
-        for (Spaces::iterator it = spaces_.begin(); it != spaces_.end(); ++it) {
-          card_table_->GetDirtyCards(*it, dirty_cards);
-        }
-        timings.AddSplit("GetDirtyCards");
-      }
-
-      // Clear image space cards and keep track of cards we cleared in the mod-union table.
-      ClearCards(timings);
+      thread_list->ResumeAll();
     }
 
     if (verify_mod_union_table_) {
+      thread_list->SuspendAll();
       ReaderMutexLock reader_lock(self, *Locks::heap_bitmap_lock_);
       zygote_mod_union_table_->Update();
       zygote_mod_union_table_->Verify();
       mod_union_table_->Update();
       mod_union_table_->Verify();
+      thread_list->ResumeAll();
     }
 
-    // Allow mutators to go again, acquire share on mutator_lock_ to continue.
-    thread_list->ResumeAll();
     {
+      // Allow mutators to go again, acquire share on mutator_lock_ to continue.
       ReaderMutexLock reader_lock(self, *Locks::mutator_lock_);
-      root_end = NanoTime();
-      timings.AddSplit("RootEnd");
 
       // Mark the roots which we can do concurrently.
       WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
@@ -1649,7 +1635,8 @@
         // Recursively mark all the non-image bits set in the mark bitmap.
         mark_sweep.RecursiveMark(gc_type == kGcTypePartial, timings);
       } else {
-        mark_sweep.RecursiveMarkCards(card_table_.get(), dirty_cards, timings);
+        mark_sweep.RecursiveMarkDirtyObjects(CardTable::kCardDirty - 1);
+        timings.AddSplit("RecursiveMarkCards");
       }
       mark_sweep.DisableFinger();
     }
@@ -1779,20 +1766,18 @@
   timings.AddSplit("Finish");
 
   // If the GC was slow, then print timings in the log.
-  uint64_t pause_roots = (root_end - root_begin) / 1000 * 1000;
-  uint64_t pause_dirty = (dirty_end - dirty_begin) / 1000 * 1000;
+  uint64_t pause_time = (dirty_end - dirty_begin) / 1000 * 1000;
   uint64_t duration = (NanoTime() - gc_begin) / 1000 * 1000;
-  total_paused_time_ += pause_roots + pause_dirty;
-  if (pause_roots > MsToNs(5) || pause_dirty > MsToNs(5) ||
-      (gc_cause == kGcCauseForAlloc && duration > MsToNs(20))) {
+  total_paused_time_ += pause_time;
+  if (pause_time > MsToNs(5) || (gc_cause == kGcCauseForAlloc && duration > MsToNs(20))) {
     const size_t percent_free = GetPercentFree();
     const size_t current_heap_size = GetUsedMemorySize();
     const size_t total_memory = GetTotalMemory();
     LOG(INFO) << gc_cause << " " << gc_type_str.str()
               << "Concurrent GC freed " << PrettySize(bytes_freed) << ", " << percent_free
               << "% free, " << PrettySize(current_heap_size) << "/"
-              << PrettySize(total_memory) << ", " << "paused " << PrettyDuration(pause_roots)
-              << "+" << PrettyDuration(pause_dirty) << " total " << PrettyDuration(duration);
+              << PrettySize(total_memory) << ", " << "paused " << PrettyDuration(pause_time)
+              << " total " << PrettyDuration(duration);
     if (VLOG_IS_ON(heap)) {
       timings.Dump();
     }
@@ -1946,8 +1931,8 @@
   Object* head = (*list)->GetFieldObject<Object*>(reference_pendingNext_offset_, false);
   Object* ref;
 
-  // TODO: Remove this lock, use atomic stacks for storing references.
-  MutexLock mu(Thread::Current(), *reference_queue_lock_);
+  // Note: the following code is thread-safe because it is only called from ProcessReferences which
+  // is single threaded.
   if (*list == head) {
     ref = *list;
     *list = NULL;
diff --git a/src/heap.h b/src/heap.h
index 6c4c38b..584718e 100644
--- a/src/heap.h
+++ b/src/heap.h
@@ -59,6 +59,17 @@
 typedef AtomicStack<Object*> ObjectStack;
 typedef std::vector<ContinuousSpace*> Spaces;
 
+class AgeCardVisitor {
+ public:
+  byte operator ()(byte card) const {
+    if (card == CardTable::kCardDirty) {
+      return card - 1;
+    } else {
+      return 0;
+    }
+  }
+};
+
 // The ordering of the enum matters, it is used to determine which GCs are run first.
 enum GcType {
   // No Gc
@@ -382,7 +393,7 @@
   void SwapStacks();
 
   // Clear cards and update the mod union table.
-  void ClearCards(TimingLogger& timings);
+  void ProcessCards(TimingLogger& timings);
 
   Spaces spaces_;
 
diff --git a/src/utils.h b/src/utils.h
index d33cc4b..b2b2de9 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -342,10 +342,25 @@
 bool IsValidDexFilename(const std::string& filename);
 bool IsValidOatFilename(const std::string& filename);
 
-class IdentityFunctor {
+class VoidFunctor {
  public:
-  template <typename T>
-  inline T operator () (T t) const { return t; }
+  template <typename A>
+  inline void operator () (A a) const {
+    UNUSED(a);
+  }
+
+  template <typename A, typename B>
+  inline void operator () (A a, B b) const {
+    UNUSED(a);
+    UNUSED(b);
+  }
+
+  template <typename A, typename B, typename C>
+  inline void operator () (A a, B b, C c) const {
+    UNUSED(a);
+    UNUSED(b);
+    UNUSED(c);
+  }
 };
 
 }  // namespace art