More parallel GC, rewritten parallel mark stack processing.

Card scanning may now be done in parallel. This speeds up sticky GC and
reduces pause times for all GC types.

Speedup on my mako (ritz perf):
Average pause time for sticky GC (~250 samples):
Without parallel cards scanning enabled: 2.524904215ms
Parallel card scanning (num_gc_threads_): 1.552123552ms
Throughput (~250 samples):
Sticky GC throughput with parallel card scanning: 69MB/s
Sticky GC throughput without parallel card scanning: 51MB/s
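
A rough standalone sketch of the work split for the parallel card scan
(kCardSize, CardRange, and SplitCardRanges are illustrative stand-ins, not
the real CardTable/ThreadPool API): each space's address range is divided
into card-aligned chunks of roughly range / thread_count bytes, and one
card-scan task is created per chunk.

  #include <algorithm>
  #include <cstddef>
  #include <cstdint>
  #include <vector>

  constexpr size_t kCardSize = 128;  // Illustrative card size.

  struct CardRange { uintptr_t begin; uintptr_t end; };

  // Split [begin, end) into card-aligned chunks, one per worker task.
  // thread_count must be > 0.
  std::vector<CardRange> SplitCardRanges(uintptr_t begin, uintptr_t end,
                                         size_t thread_count) {
    std::vector<CardRange> ranges;
    const size_t address_range = end - begin;
    size_t delta = address_range / thread_count + 1;
    delta = (delta + kCardSize - 1) / kCardSize * kCardSize;  // Round up to a card boundary.
    while (begin != end) {
      const size_t increment = std::min(delta, static_cast<size_t>(end - begin));
      ranges.push_back(CardRange{begin, begin + increment});
      begin += increment;
    }
    return ranges;
  }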

Rewrote the mark stack processing to be LIFO and use a prefetch queue
like the non-parallel version.
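
A minimal sketch of the drain loop (Object and the scan visitor are opaque
stand-ins; the real code uses the new BoundedFifoPowerOfTwo instead of
std::deque): entries are popped LIFO from the mark stack, prefetched, and
then scanned in FIFO order so each prefetch has time in flight.

  #include <cstddef>
  #include <deque>
  #include <vector>

  struct Object;  // Opaque heap object for illustration.

  template <typename Visitor>
  void DrainMarkStack(std::vector<const Object*>& mark_stack, const Visitor& scan) {
    constexpr size_t kFifoSize = 4;
    std::deque<const Object*> prefetch_fifo;
    for (;;) {
      // Refill: pop from the back of the mark stack (LIFO) and prefetch.
      while (!mark_stack.empty() && prefetch_fifo.size() < kFifoSize) {
        const Object* obj = mark_stack.back();
        mark_stack.pop_back();
        __builtin_prefetch(obj);  // Warm the cache before the object is scanned.
        prefetch_fifo.push_back(obj);
      }
      if (prefetch_fifo.empty()) {
        break;
      }
      // Scan the oldest prefetched entry so newer prefetches have time in flight.
      const Object* obj = prefetch_fifo.front();
      prefetch_fifo.pop_front();
      scan(obj);
    }
  }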

Cleaned up some of the logcat printing for the activity manager
process state listening.

Added unlikely hints to object scanning since arrays and classes are
scanned much less often than normal objects.
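
The hinting is just the usual __builtin_expect wrapper around the rare
branches; a tiny illustrative sketch (ObjKind and ScanByKind are made-up
names, not ART code):

  #define UNLIKELY(x) __builtin_expect(!!(x), 0)

  enum class ObjKind { kArray, kClass, kOther };

  void ScanByKind(ObjKind kind) {
    if (UNLIKELY(kind == ObjKind::kArray)) {
      // Rare: visit array elements.
    } else if (UNLIKELY(kind == ObjKind::kClass)) {
      // Rare: visit the class's reference fields.
    } else {
      // Common case stays on the fast path: visit instance fields.
    }
  }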

Fixed a bug where the number of GC threads was clamped to 1 because it
was stored in a bool instead of a size_t.

Fixed a race condition when adding references to the reference queues.
Sharded the reference queue lock into one lock for each reference type
(weak, soft, phantom, finalizer).
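
A minimal sketch of the sharding idea using std::mutex (RefQueues,
DelayReference, and EnqueuePending are illustrative names, not the Heap
API): each reference type takes only its own lock, so parallel workers
enqueueing different types no longer contend on a single mutex.

  #include <cstddef>
  #include <mutex>

  struct Object;  // Opaque for illustration.
  enum class RefType { kSoft, kWeak, kFinalizer, kPhantom };

  class RefQueues {
   public:
    void DelayReference(RefType type, Object* ref) {
      const size_t i = static_cast<size_t>(type);
      std::lock_guard<std::mutex> guard(locks_[i]);  // Per-type lock only.
      EnqueuePending(ref, &lists_[i]);
    }

   private:
    static void EnqueuePending(Object* ref, Object** list) {
      // Stand-in for Heap::EnqueuePendingReference: links ref into *list.
      (void)ref;
      (void)list;
    }

    std::mutex locks_[4];
    Object* lists_[4] = {nullptr, nullptr, nullptr, nullptr};
  };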

Changed timing splits to be different for processing gray objects with
and without mutators paused since sticky GC does both.

Mask out the class bit when visiting fields as an optimization; this is
valid since classes are held live by the class linker.
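
The mask itself is a one-liner; a sketch under the assumption (checked by a
DCHECK in the change) that field offset 0 is the class pointer and maps to
the top bit of the CLZ-indexed reference-offset bitmap:

  #include <cstdint>

  // Classes are kept live via the class linker roots, so the class field does
  // not need to be visited; clear its (top) bit before walking the bitmap.
  inline uint32_t ClearClassBit(uint32_t ref_offsets) {
    return ref_offsets & ((1u << (sizeof(ref_offsets) * 8 - 1)) - 1u);
  }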

Partially completed: Parallel recursive mark + finger.

Bug: 10245302
Bug: 9969166
Bug: 9986532
Bug: 9961698

Change-Id: I142d09718c4609b7c2387cb28f517a6983c73288
diff --git a/runtime/base/bounded_fifo.h b/runtime/base/bounded_fifo.h
new file mode 100644
index 0000000..cb92d40
--- /dev/null
+++ b/runtime/base/bounded_fifo.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_BASE_BOUNDED_FIFO_H_
+#define ART_RUNTIME_BASE_BOUNDED_FIFO_H_
+
+#include "cutils/atomic.h"
+#include "cutils/atomic-inline.h"
+
+namespace art {
+
+// A bounded fifo is a fifo with a fixed maximum size. The power of two version uses a bit mask
+// to avoid having to handle integer wrap-around or use a modulo operation.
+template <typename T, const size_t MaxSize>
+class BoundedFifoPowerOfTwo {
+ public:
+  BoundedFifoPowerOfTwo() {
+    // TODO: Do this with a compile time check.
+    CHECK(IsPowerOfTwo(MaxSize));
+    clear();
+  }
+
+  void clear() {
+    back_index_ = 0;
+    size_ = 0;
+  }
+
+  bool empty() const {
+    return size() == 0;
+  }
+
+  size_t size() const {
+    return size_;
+  }
+
+  void push_back(const T& value) {
+    ++size_;
+    DCHECK_LE(size_, MaxSize);
+    // Relies on well-defined unsigned integer wrap-around.
+    data_[back_index_++ & mask_] = value;
+  }
+
+  const T& front() const {
+    DCHECK_GT(size_, 0U);
+    return data_[(back_index_ - size_) & mask_];
+  }
+
+  void pop_front() {
+    DCHECK_GT(size_, 0U);
+    --size_;
+  }
+
+ private:
+  static const size_t mask_ = MaxSize - 1;
+  size_t back_index_, size_;
+  T data_[MaxSize];
+};
+
+}  // namespace art
+
+#endif  // ART_RUNTIME_BASE_BOUNDED_FIFO_H_
diff --git a/runtime/gc/accounting/atomic_stack.h b/runtime/gc/accounting/atomic_stack.h
index a732566..997d725 100644
--- a/runtime/gc/accounting/atomic_stack.h
+++ b/runtime/gc/accounting/atomic_stack.h
@@ -98,6 +98,12 @@
     return begin_[index];
   }
 
+  // Pop a number of elements.
+  void PopBackCount(int32_t n) {
+    DCHECK_GE(Size(), static_cast<size_t>(n));
+    back_index_.fetch_sub(n);
+  }
+
   bool IsEmpty() const {
     return Size() == 0;
   }
@@ -108,11 +114,11 @@
   }
 
   T* Begin() const {
-    return const_cast<mirror::Object**>(begin_ + front_index_);
+    return const_cast<T*>(begin_ + front_index_);
   }
 
   T* End() const {
-    return const_cast<mirror::Object**>(begin_ + back_index_);
+    return const_cast<T*>(begin_ + back_index_);
   }
 
   size_t Capacity() const {
diff --git a/runtime/gc/accounting/card_table-inl.h b/runtime/gc/accounting/card_table-inl.h
index fb0f61b..fa2ab27 100644
--- a/runtime/gc/accounting/card_table-inl.h
+++ b/runtime/gc/accounting/card_table-inl.h
@@ -65,33 +65,32 @@
       (reinterpret_cast<uintptr_t>(card_end) & (sizeof(uintptr_t) - 1));
 
   // Now we have the words, we can send these to be processed in parallel.
-  uintptr_t* word_cur = reinterpret_cast<uintptr_t*>(card_cur);
-  uintptr_t* word_end = reinterpret_cast<uintptr_t*>(aligned_end);
+  auto* word_cur = reinterpret_cast<uintptr_t*>(card_cur);
+  auto* word_end = reinterpret_cast<uintptr_t*>(aligned_end);
+  for (;;) {
+    while (LIKELY(*word_cur == 0)) {
+      ++word_cur;
+      if (UNLIKELY(word_cur >= word_end)) {
+        goto exit_for;
+      }
+    }
 
-  // TODO: Parallelize
-  while (word_cur < word_end) {
     // Find the first dirty card.
-    while (*word_cur == 0 && word_cur < word_end) {
-      word_cur++;
-    }
-    if (word_cur >= word_end) {
-      break;
-    }
     uintptr_t start_word = *word_cur;
+    uintptr_t start = reinterpret_cast<uintptr_t>(AddrFromCard(reinterpret_cast<byte*>(word_cur)));
     for (size_t i = 0; i < sizeof(uintptr_t); ++i) {
-      if ((start_word & 0xFF) >= minimum_age) {
-        byte* card = reinterpret_cast<byte*>(word_cur) + i;
-        const byte card_byte = *card;
-        DCHECK(card_byte == (start_word & 0xFF) || card_byte == kCardDirty)
-        << "card " << static_cast<size_t>(card_byte) << " word " << (start_word & 0xFF);
-        uintptr_t start = reinterpret_cast<uintptr_t>(AddrFromCard(card));
-        uintptr_t end = start + kCardSize;
-        bitmap->VisitMarkedRange(start, end, visitor);
+      if (static_cast<byte>(start_word) >= minimum_age) {
+        auto* card = reinterpret_cast<byte*>(word_cur) + i;
+        DCHECK(*card == static_cast<byte>(start_word) || *card == kCardDirty)
+            << "card " << static_cast<size_t>(*card) << " word " << (start_word & 0xFF);
+        bitmap->VisitMarkedRange(start, start + kCardSize, visitor);
       }
       start_word >>= 8;
+      start += kCardSize;
     }
     ++word_cur;
   }
+  exit_for:
 
   // Handle any unaligned cards at the end.
   card_cur = reinterpret_cast<byte*>(word_end);
diff --git a/runtime/gc/accounting/card_table.h b/runtime/gc/accounting/card_table.h
index a1936de..37e719f 100644
--- a/runtime/gc/accounting/card_table.h
+++ b/runtime/gc/accounting/card_table.h
@@ -117,10 +117,10 @@
   void ClearSpaceCards(space::ContinuousSpace* space);
 
   // Returns the first address in the heap which maps to this card.
-  void* AddrFromCard(const byte *card_addr) const;
+  void* AddrFromCard(const byte *card_addr) const ALWAYS_INLINE;
 
   // Returns the address of the relevant byte in the card table, given an address on the heap.
-  byte* CardFromAddr(const void *addr) const;
+  byte* CardFromAddr(const void *addr) const ALWAYS_INLINE;
 
   bool AddrIsInCardTable(const void* addr) const;
 
@@ -134,7 +134,7 @@
     return card_addr >= begin && card_addr < end;
   }
 
-  void CheckCardValid(byte* card) const;
+  void CheckCardValid(byte* card) const ALWAYS_INLINE;
 
   // Verifies that all gray objects are on a dirty card.
   void VerifyCardTable();
diff --git a/runtime/gc/collector/garbage_collector.cc b/runtime/gc/collector/garbage_collector.cc
index 9260137..039415e 100644
--- a/runtime/gc/collector/garbage_collector.cc
+++ b/runtime/gc/collector/garbage_collector.cc
@@ -61,9 +61,7 @@
 }
 
 void GarbageCollector::Run() {
-  Thread* self = Thread::Current();
   ThreadList* thread_list = Runtime::Current()->GetThreadList();
-
   uint64_t start_time = NanoTime();
   pause_times_.clear();
   duration_ns_ = 0;
@@ -82,6 +80,7 @@
     uint64_t pause_end = NanoTime();
     pause_times_.push_back(pause_end - pause_start);
   } else {
+    auto* self = Thread::Current();
     {
       ReaderMutexLock mu(self, *Locks::mutator_lock_);
       MarkingPhase();
@@ -134,7 +133,7 @@
     accounting::SpaceSetMap* mark_set = space->GetMarkObjects();
     heap_->GetLiveBitmap()->ReplaceObjectSet(live_set, mark_set);
     heap_->GetMarkBitmap()->ReplaceObjectSet(mark_set, live_set);
-    space->SwapBitmaps();
+    down_cast<space::LargeObjectSpace*>(space)->SwapBitmaps();
   }
 }
 
diff --git a/runtime/gc/collector/mark_sweep-inl.h b/runtime/gc/collector/mark_sweep-inl.h
index e158952..d0b0b5c 100644
--- a/runtime/gc/collector/mark_sweep-inl.h
+++ b/runtime/gc/collector/mark_sweep-inl.h
@@ -37,27 +37,26 @@
   }
   mirror::Class* klass = obj->GetClass();
   DCHECK(klass != NULL);
-  if (klass == java_lang_Class_) {
+  if (UNLIKELY(klass->IsArrayClass())) {
+    if (kCountScannedTypes) {
+      ++array_count_;
+    }
+    if (klass->IsObjectArrayClass()) {
+      VisitObjectArrayReferences(obj->AsObjectArray<mirror::Object>(), visitor);
+    }
+  } else if (UNLIKELY(klass == java_lang_Class_)) {
     DCHECK_EQ(klass->GetClass(), java_lang_Class_);
     if (kCountScannedTypes) {
       ++class_count_;
     }
     VisitClassReferences(klass, obj, visitor);
-  } else if (klass->IsArrayClass()) {
-    if (kCountScannedTypes) {
-      ++array_count_;
-    }
-    visitor(obj, klass, mirror::Object::ClassOffset(), false);
-    if (klass->IsObjectArrayClass()) {
-      VisitObjectArrayReferences(obj->AsObjectArray<mirror::Object>(), visitor);
-    }
   } else {
     if (kCountScannedTypes) {
       ++other_count_;
     }
     VisitOtherReferences(klass, obj, visitor);
     if (UNLIKELY(klass->IsReferenceClass())) {
-      DelayReferenceReferent(const_cast<mirror::Object*>(obj));
+      DelayReferenceReferent(klass, const_cast<mirror::Object*>(obj));
     }
   }
 }
@@ -117,6 +116,11 @@
                                              bool is_static, const Visitor& visitor) {
   if (LIKELY(ref_offsets != CLASS_WALK_SUPER)) {
     // Found a reference offset bitmap.  Mark the specified offsets.
+#ifndef MOVING_COLLECTOR
+    // Clear the class bit since we mark the class as part of marking the class linker roots.
+    DCHECK_EQ(mirror::Object::ClassOffset().Uint32Value(), 0U);
+    ref_offsets &= (1U << (sizeof(ref_offsets) * 8 - 1)) - 1;
+#endif
     while (ref_offsets != 0) {
       size_t right_shift = CLZ(ref_offsets);
       MemberOffset field_offset = CLASS_OFFSET_FROM_CLZ(right_shift);
@@ -149,11 +153,11 @@
 template <typename Visitor>
 inline void MarkSweep::VisitObjectArrayReferences(const mirror::ObjectArray<mirror::Object>* array,
                                                   const Visitor& visitor) {
-  const int32_t length = array->GetLength();
-  for (int32_t i = 0; i < length; ++i) {
-    const mirror::Object* element = array->GetWithoutChecks(i);
+  const size_t length = static_cast<size_t>(array->GetLength());
+  for (size_t i = 0; i < length; ++i) {
+    const mirror::Object* element = array->GetWithoutChecks(static_cast<int32_t>(i));
     const size_t width = sizeof(mirror::Object*);
-    MemberOffset offset = MemberOffset(i * width + mirror::Array::DataOffset(width).Int32Value());
+    MemberOffset offset(i * width + mirror::Array::DataOffset(width).Int32Value());
     visitor(array, element, offset, false);
   }
 }
diff --git a/runtime/gc/collector/mark_sweep.cc b/runtime/gc/collector/mark_sweep.cc
index e93bcd1..5c31eb1 100644
--- a/runtime/gc/collector/mark_sweep.cc
+++ b/runtime/gc/collector/mark_sweep.cc
@@ -21,6 +21,7 @@
 #include <climits>
 #include <vector>
 
+#include "base/bounded_fifo.h"
 #include "base/logging.h"
 #include "base/macros.h"
 #include "base/mutex-inl.h"
@@ -60,17 +61,29 @@
 namespace collector {
 
 // Performance options.
-static const bool kParallelMarkStack = true;
-static const bool kDisableFinger = true;  // TODO: Fix, bit rotten.
-static const bool kUseMarkStackPrefetch = true;
-static const size_t kSweepArrayChunkFreeSize = 1024;
+constexpr bool kUseRecursiveMark = false;
+constexpr bool kUseMarkStackPrefetch = true;
+constexpr size_t kSweepArrayChunkFreeSize = 1024;
+
+// Parallelism options.
+constexpr bool kParallelCardScan = true;
+constexpr bool kParallelRecursiveMark = true;
+// Don't attempt to parallelize mark stack processing unless the mark stack has at least n
+// elements. This is temporary until we reduce the overhead caused by allocating tasks, etc. Not
+// having this can add overhead in ProcessReferences since we may end up making many calls to
+// ProcessMarkStack with very small mark stacks.
+constexpr size_t kMinimumParallelMarkStackSize = 128;
+constexpr bool kParallelProcessMarkStack = true;
 
 // Profiling and information flags.
-static const bool kCountClassesMarked = false;
-static const bool kProfileLargeObjects = false;
-static const bool kMeasureOverhead = false;
-static const bool kCountTasks = false;
-static const bool kCountJavaLangRefs = false;
+constexpr bool kCountClassesMarked = false;
+constexpr bool kProfileLargeObjects = false;
+constexpr bool kMeasureOverhead = false;
+constexpr bool kCountTasks = false;
+constexpr bool kCountJavaLangRefs = false;
+
+// Turn off kCheckLocks when profiling the GC since it slows the GC down by up to 40%.
+constexpr bool kCheckLocks = kDebugLocking;
 
 void MarkSweep::ImmuneSpace(space::ContinuousSpace* space) {
   // Bind live to mark bitmap if necessary.
@@ -92,7 +105,6 @@
       }
       prev_space = cur_space;
     }
-
     // If previous space was immune, then extend the immune region. Relies on continuous spaces
     // being sorted by Heap::AddContinuousSpace.
     if (prev_space != NULL &&
@@ -106,13 +118,9 @@
 
 void MarkSweep::BindBitmaps() {
   timings_.StartSplit("BindBitmaps");
-  const std::vector<space::ContinuousSpace*>& spaces = GetHeap()->GetContinuousSpaces();
   WriterMutexLock mu(Thread::Current(), *Locks::heap_bitmap_lock_);
-
   // Mark all of the spaces we never collect as immune.
-  typedef std::vector<space::ContinuousSpace*>::const_iterator It;
-  for (It it = spaces.begin(), end = spaces.end(); it != end; ++it) {
-    space::ContinuousSpace* space = *it;
+  for (const auto& space : GetHeap()->GetContinuousSpaces()) {
     if (space->GetGcRetentionPolicy() == space::kGcRetentionPolicyNeverCollect) {
       ImmuneSpace(space);
     }
@@ -169,7 +177,7 @@
 
   FindDefaultMarkBitmap();
 
-// Do any pre GC verification.
+  // Do any pre GC verification.
   timings_.NewSplit("PreGcVerification");
   heap_->PreGcVerification(this);
 }
@@ -183,7 +191,6 @@
 bool MarkSweep::HandleDirtyObjectsPhase() {
   base::TimingLogger::ScopedSplit split("HandleDirtyObjectsPhase", &timings_);
   Thread* self = Thread::Current();
-  accounting::ObjectStack* allocation_stack = GetHeap()->allocation_stack_.get();
   Locks::mutator_lock_->AssertExclusiveHeld(self);
 
   {
@@ -193,7 +200,7 @@
     ReMarkRoots();
 
     // Scan dirty objects, this is only required if we are not doing concurrent GC.
-    RecursiveMarkDirtyObjects(accounting::CardTable::kCardDirty);
+    RecursiveMarkDirtyObjects(true, accounting::CardTable::kCardDirty);
   }
 
   ProcessReferences(self);
@@ -203,7 +210,7 @@
     WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
     // This second sweep makes sure that we don't have any objects in the live stack which point to
     // freed objects. These cause problems since their references may be previously freed objects.
-    SweepArray(allocation_stack, false);
+    SweepArray(GetHeap()->allocation_stack_.get(), false);
   }
   return true;
 }
@@ -233,7 +240,7 @@
     // If we exclusively hold the mutator lock, all threads must be suspended.
     MarkRoots();
   } else {
-    MarkRootsCheckpoint(self);
+    MarkThreadRoots(self);
     MarkNonThreadRoots();
   }
   MarkConcurrentRoots();
@@ -242,14 +249,17 @@
   MarkReachableObjects();
 }
 
+void MarkSweep::MarkThreadRoots(Thread* self) {
+  MarkRootsCheckpoint(self);
+}
+
 void MarkSweep::MarkReachableObjects() {
   // Mark everything allocated since the last as GC live so that we can sweep concurrently,
   // knowing that new allocations won't be marked as live.
   timings_.StartSplit("MarkStackAsLive");
   accounting::ObjectStack* live_stack = heap_->GetLiveStack();
   heap_->MarkAllocStack(heap_->alloc_space_->GetLiveBitmap(),
-                       heap_->large_object_space_->GetLiveObjects(),
-                       live_stack);
+                        heap_->large_object_space_->GetLiveObjects(), live_stack);
   live_stack->Reset();
   timings_.EndSplit();
   // Recursively mark all the non-image bits set in the mark bitmap.
@@ -258,7 +268,7 @@
 
 void MarkSweep::ReclaimPhase() {
   base::TimingLogger::ScopedSplit split("ReclaimPhase", &timings_);
-  Thread* self = Thread::Current();
+  auto* self = Thread::Current();
 
   if (!IsConcurrent()) {
     base::TimingLogger::ScopedSplit split("ProcessReferences", &timings_);
@@ -278,7 +288,7 @@
     // first place.
     mirror::Object** end = allocation_stack->End();
     for (mirror::Object** it = allocation_stack->Begin(); it != end; ++it) {
-      Object* obj = *it;
+      const Object* obj = *it;
       if (obj != NULL) {
         UnMarkObjectNonNull(obj);
       }
@@ -466,7 +476,7 @@
 // objects.  Any newly-marked objects whose addresses are lower than
 // the finger won't be visited by the bitmap scan, so those objects
 // need to be added to the mark stack.
-void MarkSweep::MarkObject(const Object* obj) {
+inline void MarkSweep::MarkObject(const Object* obj) {
   if (obj != NULL) {
     MarkObjectNonNull(obj);
   }
@@ -541,26 +551,13 @@
   timings_.EndSplit();
 }
 
-class CheckObjectVisitor {
- public:
-  explicit CheckObjectVisitor(MarkSweep* const mark_sweep) : mark_sweep_(mark_sweep) {}
-
-  void operator()(const Object* obj, const Object* ref, MemberOffset offset, bool is_static) const
-      NO_THREAD_SAFETY_ANALYSIS {
-    if (kDebugLocking) {
-      Locks::heap_bitmap_lock_->AssertSharedHeld(Thread::Current());
-    }
-    mark_sweep_->CheckReference(obj, ref, offset, is_static);
-  }
-
- private:
-  MarkSweep* const mark_sweep_;
-};
-
 void MarkSweep::CheckObject(const Object* obj) {
   DCHECK(obj != NULL);
-  CheckObjectVisitor visitor(this);
-  VisitObjectReferences(obj, visitor);
+  VisitObjectReferences(obj, [this](const Object* obj, const Object* ref, MemberOffset offset,
+      bool is_static) NO_THREAD_SAFETY_ANALYSIS {
+    Locks::heap_bitmap_lock_->AssertSharedHeld(Thread::Current());
+    CheckReference(obj, ref, offset, is_static);
+  });
 }
 
 void MarkSweep::VerifyImageRootVisitor(Object* root, void* arg) {
@@ -583,11 +580,12 @@
 
 class ScanObjectVisitor {
  public:
-  explicit ScanObjectVisitor(MarkSweep* const mark_sweep) : mark_sweep_(mark_sweep) {}
+  explicit ScanObjectVisitor(MarkSweep* const mark_sweep) ALWAYS_INLINE
+      : mark_sweep_(mark_sweep) {}
 
   // TODO: Fixme when anotatalysis works with visitors.
-  void operator()(const Object* obj) const NO_THREAD_SAFETY_ANALYSIS {
-    if (kDebugLocking) {
+  void operator()(const Object* obj) const ALWAYS_INLINE NO_THREAD_SAFETY_ANALYSIS {
+    if (kCheckLocks) {
       Locks::mutator_lock_->AssertSharedHeld(Thread::Current());
       Locks::heap_bitmap_lock_->AssertExclusiveHeld(Thread::Current());
     }
@@ -598,52 +596,229 @@
   MarkSweep* const mark_sweep_;
 };
 
-void MarkSweep::ScanGrayObjects(byte minimum_age) {
-  accounting::CardTable* card_table = GetHeap()->GetCardTable();
-  ScanObjectVisitor visitor(this);
-  for (const auto& space : GetHeap()->GetContinuousSpaces()) {
-    switch (space->GetGcRetentionPolicy()) {
-      case space::kGcRetentionPolicyNeverCollect:
-        timings_.StartSplit("ScanGrayImageSpaceObjects");
-        break;
-      case space::kGcRetentionPolicyFullCollect:
-        timings_.StartSplit("ScanGrayZygoteSpaceObjects");
-        break;
-      case space::kGcRetentionPolicyAlwaysCollect:
-        timings_.StartSplit("ScanGrayAllocSpaceObjects");
-        break;
+template <bool kUseFinger = false>
+class MarkStackTask : public Task {
+ public:
+  MarkStackTask(ThreadPool* thread_pool, MarkSweep* mark_sweep, size_t mark_stack_size,
+                const Object** mark_stack)
+      : mark_sweep_(mark_sweep),
+        thread_pool_(thread_pool),
+        mark_stack_pos_(mark_stack_size) {
+    // We may have to copy part of an existing mark stack when another mark stack overflows.
+    if (mark_stack_size != 0) {
+      DCHECK(mark_stack != NULL);
+      // TODO: Check performance?
+      std::copy(mark_stack, mark_stack + mark_stack_size, mark_stack_);
     }
-    byte* begin = space->Begin();
-    byte* end = space->End();
-    // Image spaces are handled properly since live == marked for them.
-    accounting::SpaceBitmap* mark_bitmap = space->GetMarkBitmap();
-    card_table->Scan(mark_bitmap, begin, end, visitor, minimum_age);
+    if (kCountTasks) {
+      ++mark_sweep_->work_chunks_created_;
+    }
+  }
+
+  static const size_t kMaxSize = 1 * KB;
+
+ protected:
+  class ScanObjectParallelVisitor {
+   public:
+    explicit ScanObjectParallelVisitor(MarkStackTask<kUseFinger>* chunk_task) ALWAYS_INLINE
+        : chunk_task_(chunk_task) {}
+
+    void operator()(const Object* obj) const {
+      MarkSweep* mark_sweep = chunk_task_->mark_sweep_;
+      mark_sweep->ScanObjectVisit(obj,
+          [mark_sweep, this](const Object* /* obj */, const Object* ref,
+              const MemberOffset& /* offset */, bool /* is_static */) ALWAYS_INLINE {
+        if (ref != nullptr && mark_sweep->MarkObjectParallel(ref)) {
+          if (kUseFinger) {
+            android_memory_barrier();
+            if (reinterpret_cast<uintptr_t>(ref) >=
+                static_cast<uintptr_t>(mark_sweep->atomic_finger_)) {
+              return;
+            }
+          }
+          chunk_task_->MarkStackPush(ref);
+        }
+      });
+    }
+
+   private:
+    MarkStackTask<kUseFinger>* const chunk_task_;
+  };
+
+  virtual ~MarkStackTask() {
+    // Make sure that we have cleared our mark stack.
+    DCHECK_EQ(mark_stack_pos_, 0U);
+    if (kCountTasks) {
+      ++mark_sweep_->work_chunks_deleted_;
+    }
+  }
+
+  MarkSweep* const mark_sweep_;
+  ThreadPool* const thread_pool_;
+  // Thread local mark stack for this task.
+  const Object* mark_stack_[kMaxSize];
+  // Mark stack position.
+  size_t mark_stack_pos_;
+
+  void MarkStackPush(const Object* obj) ALWAYS_INLINE {
+    if (UNLIKELY(mark_stack_pos_ == kMaxSize)) {
+      // Mark stack overflow, give 1/2 the stack to the thread pool as a new work task.
+      mark_stack_pos_ /= 2;
+      auto* task = new MarkStackTask(thread_pool_, mark_sweep_, kMaxSize - mark_stack_pos_,
+                                     mark_stack_ + mark_stack_pos_);
+      thread_pool_->AddTask(Thread::Current(), task);
+    }
+    DCHECK(obj != nullptr);
+    DCHECK(mark_stack_pos_ < kMaxSize);
+    mark_stack_[mark_stack_pos_++] = obj;
+  }
+
+  virtual void Finalize() {
+    delete this;
+  }
+
+  // Scans all of the objects
+  virtual void Run(Thread* self) {
+    ScanObjectParallelVisitor visitor(this);
+    // TODO: Tune this.
+    static const size_t kFifoSize = 4;
+    BoundedFifoPowerOfTwo<const Object*, kFifoSize> prefetch_fifo;
+    for (;;) {
+      const Object* obj = NULL;
+      if (kUseMarkStackPrefetch) {
+        while (mark_stack_pos_ != 0 && prefetch_fifo.size() < kFifoSize) {
+          const Object* obj = mark_stack_[--mark_stack_pos_];
+          DCHECK(obj != NULL);
+          __builtin_prefetch(obj);
+          prefetch_fifo.push_back(obj);
+        }
+        if (UNLIKELY(prefetch_fifo.empty())) {
+          break;
+        }
+        obj = prefetch_fifo.front();
+        prefetch_fifo.pop_front();
+      } else {
+        if (UNLIKELY(mark_stack_pos_ == 0)) {
+          break;
+        }
+        obj = mark_stack_[--mark_stack_pos_];
+      }
+      DCHECK(obj != NULL);
+      visitor(obj);
+    }
+  }
+};
+
+class CardScanTask : public MarkStackTask<false> {
+ public:
+  CardScanTask(ThreadPool* thread_pool, MarkSweep* mark_sweep, accounting::SpaceBitmap* bitmap,
+               byte* begin, byte* end, byte minimum_age, size_t mark_stack_size,
+               const Object** mark_stack_obj)
+      : MarkStackTask<false>(thread_pool, mark_sweep, mark_stack_size, mark_stack_obj),
+        bitmap_(bitmap),
+        begin_(begin),
+        end_(end),
+        minimum_age_(minimum_age) {
+  }
+
+ protected:
+  accounting::SpaceBitmap* const bitmap_;
+  byte* const begin_;
+  byte* const end_;
+  const byte minimum_age_;
+
+  virtual void Finalize() {
+    delete this;
+  }
+
+  virtual void Run(Thread* self) NO_THREAD_SAFETY_ANALYSIS {
+    ScanObjectParallelVisitor visitor(this);
+    accounting::CardTable* card_table = mark_sweep_->GetHeap()->GetCardTable();
+    card_table->Scan(bitmap_, begin_, end_, visitor, minimum_age_);
+    // Finish by emptying our local mark stack.
+    MarkStackTask::Run(self);
+  }
+};
+
+void MarkSweep::ScanGrayObjects(bool paused, byte minimum_age) {
+  accounting::CardTable* card_table = GetHeap()->GetCardTable();
+  ThreadPool* thread_pool = GetHeap()->GetThreadPool();
+  const bool parallel = kParallelCardScan && thread_pool != nullptr;
+  if (parallel) {
+    auto* self = Thread::Current();
+    // Can't have a different split for each space since cards from multiple spaces may be
+    // scanned at the same time.
+    timings_.StartSplit(paused ? "(Paused)ScanGrayObjects" : "ScanGrayObjects");
+    // Try to take some of the mark stack since we can pass this off to the worker tasks.
+    const Object** mark_stack_begin = const_cast<const Object**>(mark_stack_->Begin());
+    const Object** mark_stack_end = const_cast<const Object**>(mark_stack_->End());
+    const auto mark_stack_size = mark_stack_end - mark_stack_begin;
+    const size_t thread_count = thread_pool->GetThreadCount() + 1;
+    // Estimated number of work tasks we will create.
+    const size_t mark_stack_tasks = GetHeap()->GetContinuousSpaces().size() * thread_count;
+    DCHECK_NE(mark_stack_tasks, 0U);
+    const size_t mark_stack_delta = std::min(CardScanTask::kMaxSize / 2,
+                                             mark_stack_size / mark_stack_tasks + 1);
+    for (const auto& space : GetHeap()->GetContinuousSpaces()) {
+      byte* card_begin = space->Begin();
+      byte* card_end = space->End();
+      // Calculate how many bytes of heap we will scan.
+      const size_t address_range = card_end - card_begin;
+      // Calculate how much address range each task gets.
+      const size_t card_delta = RoundUp(address_range / thread_count + 1,
+                                        accounting::CardTable::kCardSize);
+      // Create the worker tasks for this space.
+      while (card_begin != card_end) {
+        // Add a range of cards.
+        size_t addr_remaining = card_end - card_begin;
+        size_t card_increment = std::min(card_delta, addr_remaining);
+        // Take from the back of the mark stack.
+        size_t mark_stack_remaining = mark_stack_end - mark_stack_begin;
+        size_t mark_stack_increment = std::min(mark_stack_delta, mark_stack_remaining);
+        mark_stack_end -= mark_stack_increment;
+        mark_stack_->PopBackCount(static_cast<int32_t>(mark_stack_increment));
+        DCHECK_EQ(mark_stack_end, mark_stack_->End());
+        // Add the new task to the thread pool.
+        auto* task = new CardScanTask(thread_pool, this, space->GetMarkBitmap(), card_begin,
+                                      card_begin + card_increment, minimum_age,
+                                      mark_stack_increment, mark_stack_end);
+        thread_pool->AddTask(self, task);
+        card_begin += card_increment;
+      }
+    }
+    thread_pool->StartWorkers(self);
+    thread_pool->Wait(self, paused, true);  // Only do work in the main thread if we are paused.
+    thread_pool->StopWorkers(self);
     timings_.EndSplit();
+  } else {
+    for (const auto& space : GetHeap()->GetContinuousSpaces()) {
+      // Image spaces are handled properly since live == marked for them.
+      switch (space->GetGcRetentionPolicy()) {
+        case space::kGcRetentionPolicyNeverCollect:
+          timings_.StartSplit(paused ? "(Paused)ScanGrayImageSpaceObjects" :
+              "ScanGrayImageSpaceObjects");
+          break;
+        case space::kGcRetentionPolicyFullCollect:
+          timings_.StartSplit(paused ? "(Paused)ScanGrayZygoteSpaceObjects" :
+              "ScanGrayZygoteSpaceObjects");
+          break;
+        case space::kGcRetentionPolicyAlwaysCollect:
+          timings_.StartSplit(paused ? "(Paused)ScanGrayAllocSpaceObjects" :
+              "ScanGrayAllocSpaceObjects");
+          break;
+        }
+      ScanObjectVisitor visitor(this);
+      card_table->Scan(space->GetMarkBitmap(), space->Begin(), space->End(), visitor, minimum_age);
+      timings_.EndSplit();
+    }
   }
 }
 
-class CheckBitmapVisitor {
- public:
-  explicit CheckBitmapVisitor(MarkSweep* mark_sweep) : mark_sweep_(mark_sweep) {}
-
-  void operator()(const Object* obj) const NO_THREAD_SAFETY_ANALYSIS {
-    if (kDebugLocking) {
-      Locks::heap_bitmap_lock_->AssertSharedHeld(Thread::Current());
-    }
-    DCHECK(obj != NULL);
-    mark_sweep_->CheckObject(obj);
-  }
-
- private:
-  MarkSweep* mark_sweep_;
-};
-
 void MarkSweep::VerifyImageRoots() {
   // Verify roots ensures that all the references inside the image space point
   // objects which are either in the image space or marked objects in the alloc
   // space
   timings_.StartSplit("VerifyImageRoots");
-  CheckBitmapVisitor visitor(this);
   for (const auto& space : GetHeap()->GetContinuousSpaces()) {
     if (space->IsImageSpace()) {
       space::ImageSpace* image_space = space->AsImageSpace();
@@ -651,12 +826,46 @@
       uintptr_t end = reinterpret_cast<uintptr_t>(image_space->End());
       accounting::SpaceBitmap* live_bitmap = image_space->GetLiveBitmap();
       DCHECK(live_bitmap != NULL);
-      live_bitmap->VisitMarkedRange(begin, end, visitor);
+      live_bitmap->VisitMarkedRange(begin, end, [this](const Object* obj) {
+        if (kCheckLocks) {
+          Locks::heap_bitmap_lock_->AssertSharedHeld(Thread::Current());
+        }
+        DCHECK(obj != NULL);
+        CheckObject(obj);
+      });
     }
   }
   timings_.EndSplit();
 }
 
+class RecursiveMarkTask : public MarkStackTask<false> {
+ public:
+  RecursiveMarkTask(ThreadPool* thread_pool, MarkSweep* mark_sweep,
+                    accounting::SpaceBitmap* bitmap, uintptr_t begin, uintptr_t end)
+      : MarkStackTask<false>(thread_pool, mark_sweep, 0, NULL),
+        bitmap_(bitmap),
+        begin_(begin),
+        end_(end) {
+  }
+
+ protected:
+  accounting::SpaceBitmap* const bitmap_;
+  const uintptr_t begin_;
+  const uintptr_t end_;
+
+  virtual void Finalize() {
+    delete this;
+  }
+
+  // Scans all of the objects
+  virtual void Run(Thread* self) NO_THREAD_SAFETY_ANALYSIS {
+    ScanObjectParallelVisitor visitor(this);
+    bitmap_->VisitMarkedRange(begin_, end_, visitor);
+    // Finish by emptying our local mark stack.
+    MarkStackTask::Run(self);
+  }
+};
+
 // Populates the mark stack based on the set of marked objects and
 // recursively marks until the mark stack is emptied.
 void MarkSweep::RecursiveMark() {
@@ -669,9 +878,13 @@
   CHECK(phantom_reference_list_ == NULL);
   CHECK(cleared_reference_list_ == NULL);
 
-  const bool partial = GetGcType() == kGcTypePartial;
-  ScanObjectVisitor scan_visitor(this);
-  if (!kDisableFinger) {
+  if (kUseRecursiveMark) {
+    const bool partial = GetGcType() == kGcTypePartial;
+    ScanObjectVisitor scan_visitor(this);
+    auto* self = Thread::Current();
+    ThreadPool* thread_pool = heap_->GetThreadPool();
+    const bool parallel = kParallelRecursiveMark && thread_pool != NULL;
+    mark_stack_->Reset();
     for (const auto& space : GetHeap()->GetContinuousSpaces()) {
       if ((space->GetGcRetentionPolicy() == space::kGcRetentionPolicyAlwaysCollect) ||
           (!partial && space->GetGcRetentionPolicy() == space::kGcRetentionPolicyFullCollect)) {
@@ -680,14 +893,39 @@
           GetHeap()->DumpSpaces();
           LOG(FATAL) << "invalid bitmap";
         }
-        // This function does not handle heap end increasing, so we must use the space end.
-        uintptr_t begin = reinterpret_cast<uintptr_t>(space->Begin());
-        uintptr_t end = reinterpret_cast<uintptr_t>(space->End());
-        current_mark_bitmap_->VisitMarkedRange(begin, end, scan_visitor);
+        if (parallel) {
+          // We will use the mark stack in the future.
+          // CHECK(mark_stack_->IsEmpty());
+          // This function does not handle heap end increasing, so we must use the space end.
+          uintptr_t begin = reinterpret_cast<uintptr_t>(space->Begin());
+          uintptr_t end = reinterpret_cast<uintptr_t>(space->End());
+          atomic_finger_ = static_cast<int32_t>(0xFFFFFFFF);
+
+          // Create a few worker tasks.
+          size_t n = (thread_pool->GetThreadCount() + 1) * 2;
+          while (begin != end) {
+            uintptr_t start = begin;
+            uintptr_t delta = (end - begin) / n;
+            delta = RoundUp(delta, KB);
+            if (delta < 16 * KB) delta = end - begin;
+            begin += delta;
+            auto* task = new RecursiveMarkTask(thread_pool, this, current_mark_bitmap_, start,
+                                               begin);
+            thread_pool->AddTask(self, task);
+          }
+          thread_pool->StartWorkers(self);
+          thread_pool->Wait(self, false, true);
+          thread_pool->StopWorkers(self);
+        } else {
+          // This function does not handle heap end increasing, so we must use the space end.
+          uintptr_t begin = reinterpret_cast<uintptr_t>(space->Begin());
+          uintptr_t end = reinterpret_cast<uintptr_t>(space->End());
+          current_mark_bitmap_->VisitMarkedRange(begin, end, scan_visitor);
+        }
       }
     }
   }
-  ProcessMarkStack();
+  ProcessMarkStack(false);
 }
 
 bool MarkSweep::IsMarkedCallback(const Object* object, void* arg) {
@@ -696,9 +934,9 @@
       !reinterpret_cast<MarkSweep*>(arg)->GetHeap()->GetLiveBitmap()->Test(object);
 }
 
-void MarkSweep::RecursiveMarkDirtyObjects(byte minimum_age) {
-  ScanGrayObjects(minimum_age);
-  ProcessMarkStack();
+void MarkSweep::RecursiveMarkDirtyObjects(bool paused, byte minimum_age) {
+  ScanGrayObjects(paused, minimum_age);
+  ProcessMarkStack(paused);
 }
 
 void MarkSweep::ReMarkRoots() {
@@ -879,7 +1117,7 @@
   // bitmap, resulting in occasional frees of Weaks which are still in use.
   SweepSystemWeaksArray(allocations);
 
-  timings_.StartSplit("Process allocation stack");
+  timings_.StartSplit("SweepArray");
   // Newly allocated objects MUST be in the alloc space and those are the only objects which we are
   // going to free.
   accounting::SpaceBitmap* live_bitmap = space->GetLiveBitmap();
@@ -912,12 +1150,12 @@
         DCHECK_GE(out, objects_to_chunk_free);
         DCHECK_LE(static_cast<size_t>(out - objects_to_chunk_free), kSweepArrayChunkFreeSize);
         if (static_cast<size_t>(out - objects_to_chunk_free) == kSweepArrayChunkFreeSize) {
-          timings_.StartSplit("FreeList");
+          // timings_.StartSplit("FreeList");
           size_t chunk_freed_objects = out - objects_to_chunk_free;
           freed_objects += chunk_freed_objects;
           freed_bytes += space->FreeList(self, chunk_freed_objects, objects_to_chunk_free);
           objects_to_chunk_free = out;
-          timings_.EndSplit();
+          // timings_.EndSplit();
         }
       }
     } else if (!large_mark_objects->Test(obj)) {
@@ -929,11 +1167,11 @@
   DCHECK_GE(out, objects_to_chunk_free);
   DCHECK_LE(static_cast<size_t>(out - objects_to_chunk_free), kSweepArrayChunkFreeSize);
   if (out - objects_to_chunk_free > 0) {
-    timings_.StartSplit("FreeList");
+    // timings_.StartSplit("FreeList");
     size_t chunk_freed_objects = out - objects_to_chunk_free;
     freed_objects += chunk_freed_objects;
     freed_bytes += space->FreeList(self, chunk_freed_objects, objects_to_chunk_free);
-    timings_.EndSplit();
+    // timings_.EndSplit();
   }
   CHECK_EQ(count, allocations->Size());
   timings_.EndSplit();
@@ -971,8 +1209,8 @@
       sweep_space = (space->GetGcRetentionPolicy() == space::kGcRetentionPolicyFullCollect);
     }
     if (sweep_space) {
-      uintptr_t begin = reinterpret_cast<uintptr_t>(space->Begin());
-      uintptr_t end = reinterpret_cast<uintptr_t>(space->End());
+      auto begin = reinterpret_cast<uintptr_t>(space->Begin());
+      auto end = reinterpret_cast<uintptr_t>(space->End());
       scc.space = space->AsDlMallocSpace();
       accounting::SpaceBitmap* live_bitmap = space->GetLiveBitmap();
       accounting::SpaceBitmap* mark_bitmap = space->GetMarkBitmap();
@@ -1066,31 +1304,34 @@
 
 // Process the "referent" field in a java.lang.ref.Reference.  If the
 // referent has not yet been marked, put it on the appropriate list in
-// the gcHeap for later processing.
-void MarkSweep::DelayReferenceReferent(Object* obj) {
-  DCHECK(obj != NULL);
-  Class* klass = obj->GetClass();
-  DCHECK(klass != NULL);
+// the heap for later processing.
+void MarkSweep::DelayReferenceReferent(mirror::Class* klass, Object* obj) {
+  DCHECK(klass != nullptr);
   DCHECK(klass->IsReferenceClass());
-  Object* pending = obj->GetFieldObject<Object*>(heap_->GetReferencePendingNextOffset(), false);
+  DCHECK(obj != NULL);
   Object* referent = heap_->GetReferenceReferent(obj);
-  if (kCountJavaLangRefs) {
-    ++reference_count_;
-  }
-  if (pending == NULL && referent != NULL && !IsMarked(referent)) {
-    Object** list = NULL;
-    if (klass->IsSoftReferenceClass()) {
-      list = &soft_reference_list_;
-    } else if (klass->IsWeakReferenceClass()) {
-      list = &weak_reference_list_;
-    } else if (klass->IsFinalizerReferenceClass()) {
-      list = &finalizer_reference_list_;
-    } else if (klass->IsPhantomReferenceClass()) {
-      list = &phantom_reference_list_;
+  if (referent != NULL && !IsMarked(referent)) {
+    if (kCountJavaLangRefs) {
+      ++reference_count_;
     }
-    DCHECK(list != NULL) << PrettyClass(klass) << " " << std::hex << klass->GetAccessFlags();
-    // TODO: One lock per list?
-    heap_->EnqueuePendingReference(obj, list);
+    Thread* self = Thread::Current();
+    // TODO: Remove these locks, and use atomic stacks for storing references?
+    if (klass->IsSoftReferenceClass()) {
+      MutexLock mu(self, *heap_->GetSoftRefQueueLock());
+      heap_->EnqueuePendingReference(obj, &soft_reference_list_);
+    } else if (klass->IsWeakReferenceClass()) {
+      MutexLock mu(self, *heap_->GetWeakRefQueueLock());
+      heap_->EnqueuePendingReference(obj, &weak_reference_list_);
+    } else if (klass->IsFinalizerReferenceClass()) {
+      MutexLock mu(self, *heap_->GetFinalizerRefQueueLock());
+      heap_->EnqueuePendingReference(obj, &finalizer_reference_list_);
+    } else if (klass->IsPhantomReferenceClass()) {
+      MutexLock mu(self, *heap_->GetPhantomRefQueueLock());
+      heap_->EnqueuePendingReference(obj, &phantom_reference_list_);
+    } else {
+      LOG(FATAL) << "Invalid reference type " << PrettyClass(klass)
+                 << " " << std::hex << klass->GetAccessFlags();
+    }
   }
 }
 
@@ -1100,13 +1341,13 @@
 
 class MarkObjectVisitor {
  public:
-  explicit MarkObjectVisitor(MarkSweep* const mark_sweep) : mark_sweep_(mark_sweep) {}
+  explicit MarkObjectVisitor(MarkSweep* const mark_sweep) ALWAYS_INLINE : mark_sweep_(mark_sweep) {}
 
   // TODO: Fixme when anotatalysis works with visitors.
   void operator()(const Object* /* obj */, const Object* ref, const MemberOffset& /* offset */,
-                  bool /* is_static */) const
+                  bool /* is_static */) const ALWAYS_INLINE
       NO_THREAD_SAFETY_ANALYSIS {
-    if (kDebugLocking) {
+    if (kCheckLocks) {
       Locks::mutator_lock_->AssertSharedHeld(Thread::Current());
       Locks::heap_bitmap_lock_->AssertExclusiveHeld(Thread::Current());
     }
@@ -1124,204 +1365,61 @@
   ScanObjectVisit(obj, visitor);
 }
 
-class MarkStackChunk : public Task {
- public:
-  MarkStackChunk(ThreadPool* thread_pool, MarkSweep* mark_sweep, Object** begin, Object** end)
-      : mark_sweep_(mark_sweep),
-        thread_pool_(thread_pool),
-        index_(0),
-        length_(0),
-        output_(NULL) {
-    length_ = end - begin;
-    if (begin != end) {
-      // Cost not significant since we only do this for the initial set of mark stack chunks.
-      memcpy(data_, begin, length_ * sizeof(*begin));
-    }
-    if (kCountTasks) {
-      ++mark_sweep_->work_chunks_created_;
-    }
-  }
-
-  ~MarkStackChunk() {
-    DCHECK(output_ == NULL || output_->length_ == 0);
-    DCHECK_GE(index_, length_);
-    delete output_;
-    if (kCountTasks) {
-      ++mark_sweep_->work_chunks_deleted_;
-    }
-  }
-
-  MarkSweep* const mark_sweep_;
-  ThreadPool* const thread_pool_;
-  static const size_t max_size = 1 * KB;
-  // Index of which object we are scanning. Only needs to be atomic if we are doing work stealing.
-  size_t index_;
-  // Input / output mark stack. We add newly marked references to data_ until length reaches
-  // max_size. This is an optimization so that less tasks are created.
-  // TODO: Investigate using a bounded buffer FIFO.
-  Object* data_[max_size];
-  // How many elements in data_ we need to scan.
-  size_t length_;
-  // Output block, newly marked references get added to the ouput block so that another thread can
-  // scan them.
-  MarkStackChunk* output_;
-
-  class MarkObjectParallelVisitor {
-   public:
-    explicit MarkObjectParallelVisitor(MarkStackChunk* chunk_task) : chunk_task_(chunk_task) {}
-
-    void operator()(const Object* /* obj */, const Object* ref,
-                    const MemberOffset& /* offset */, bool /* is_static */) const {
-      if (ref != NULL && chunk_task_->mark_sweep_->MarkObjectParallel(ref)) {
-        chunk_task_->MarkStackPush(ref);
-      }
-    }
-
-   private:
-    MarkStackChunk* const chunk_task_;
-  };
-
-  // Push an object into the block.
-  // Don't need to use atomic ++ since we only one thread is writing to an output block at any
-  // given time.
-  void Push(Object* obj) {
-    CHECK(obj != NULL);
-    data_[length_++] = obj;
-  }
-
-  void MarkStackPush(const Object* obj) {
-    if (static_cast<size_t>(length_) < max_size) {
-      Push(const_cast<Object*>(obj));
-    } else {
-      // Internal (thread-local) buffer is full, push to a new buffer instead.
-      if (UNLIKELY(output_ == NULL)) {
-        AllocateOutputChunk();
-      } else if (UNLIKELY(static_cast<size_t>(output_->length_) == max_size)) {
-        // Output block is full, queue it up for processing and obtain a new block.
-        EnqueueOutput();
-        AllocateOutputChunk();
-      }
-      output_->Push(const_cast<Object*>(obj));
-    }
-  }
-
-  void ScanObject(Object* obj) {
-    mark_sweep_->ScanObjectVisit(obj, MarkObjectParallelVisitor(this));
-  }
-
-  void EnqueueOutput() {
-    if (output_ != NULL) {
-      uint64_t start = 0;
-      if (kMeasureOverhead) {
-        start = NanoTime();
-      }
-      thread_pool_->AddTask(Thread::Current(), output_);
-      output_ = NULL;
-      if (kMeasureOverhead) {
-        mark_sweep_->overhead_time_.fetch_add(NanoTime() - start);
-      }
-    }
-  }
-
-  void AllocateOutputChunk() {
-    uint64_t start = 0;
-    if (kMeasureOverhead) {
-      start = NanoTime();
-    }
-    output_ = new MarkStackChunk(thread_pool_, mark_sweep_, NULL, NULL);
-    if (kMeasureOverhead) {
-      mark_sweep_->overhead_time_.fetch_add(NanoTime() - start);
-    }
-  }
-
-  void Finalize() {
-    EnqueueOutput();
-    delete this;
-  }
-
-  // Scans all of the objects
-  virtual void Run(Thread* self) {
-    size_t index;
-    while ((index = index_++) < length_) {
-      if (kUseMarkStackPrefetch) {
-        static const size_t prefetch_look_ahead = 1;
-        __builtin_prefetch(data_[std::min(index + prefetch_look_ahead, length_ - 1)]);
-      }
-      Object* obj = data_[index];
-      DCHECK(obj != NULL);
-      ScanObject(obj);
-    }
-  }
-};
-
-void MarkSweep::ProcessMarkStackParallel() {
-  CHECK(kDisableFinger) << "parallel mark stack processing cannot work when finger is enabled";
+void MarkSweep::ProcessMarkStackParallel(bool paused) {
   Thread* self = Thread::Current();
   ThreadPool* thread_pool = GetHeap()->GetThreadPool();
-  // Split the current mark stack up into work tasks.
   const size_t num_threads = thread_pool->GetThreadCount();
-  const size_t stack_size = mark_stack_->Size();
   const size_t chunk_size =
-      std::min((stack_size + num_threads - 1) / num_threads,
-               static_cast<size_t>(MarkStackChunk::max_size));
-  size_t index = 0;
-  for (size_t i = 0; i < num_threads || index < stack_size; ++i) {
-    Object** begin = &mark_stack_->Begin()[std::min(stack_size, index)];
-    Object** end = &mark_stack_->Begin()[std::min(stack_size, index + chunk_size)];
-    index += chunk_size;
-    thread_pool->AddTask(self, new MarkStackChunk(thread_pool, this, begin, end));
+      std::min(mark_stack_->Size() / num_threads + 1,
+               static_cast<size_t>(MarkStackTask<false>::kMaxSize));
+  CHECK_GT(chunk_size, 0U);
+  // Split the current mark stack up into work tasks.
+  for (mirror::Object **it = mark_stack_->Begin(), **end = mark_stack_->End(); it < end; ) {
+    const size_t delta = std::min(static_cast<size_t>(end - it), chunk_size);
+    thread_pool->AddTask(self, new MarkStackTask<false>(thread_pool, this, delta,
+                                                        const_cast<const mirror::Object**>(it)));
+    it += delta;
   }
   thread_pool->StartWorkers(self);
-  thread_pool->Wait(self, true, true);
+  // Don't do work in the main thread since it is assumed at least one other thread will require
+  // CPU time during the GC.
+  thread_pool->Wait(self, paused, true);
+  thread_pool->StopWorkers(self);
   mark_stack_->Reset();
-  // LOG(INFO) << "Idle wait time " << PrettyDuration(thread_pool->GetWaitTime());
   CHECK_EQ(work_chunks_created_, work_chunks_deleted_) << " some of the work chunks were leaked";
 }
 
 // Scan anything that's on the mark stack.
-void MarkSweep::ProcessMarkStack() {
-  ThreadPool* thread_pool = GetHeap()->GetThreadPool();
+void MarkSweep::ProcessMarkStack(bool paused) {
   timings_.StartSplit("ProcessMarkStack");
-  if (kParallelMarkStack && thread_pool != NULL && thread_pool->GetThreadCount() > 0) {
-    ProcessMarkStackParallel();
-    timings_.EndSplit();
-    return;
-  }
-
-  if (kUseMarkStackPrefetch) {
-    const size_t fifo_size = 4;
-    const size_t fifo_mask = fifo_size - 1;
-    const Object* fifo[fifo_size];
-    for (size_t i = 0; i < fifo_size; ++i) {
-      fifo[i] = NULL;
-    }
-    size_t fifo_pos = 0;
-    size_t fifo_count = 0;
-    for (;;) {
-      const Object* obj = fifo[fifo_pos & fifo_mask];
-      if (obj != NULL) {
-        ScanObject(obj);
-        fifo[fifo_pos & fifo_mask] = NULL;
-        --fifo_count;
-      }
-
-      if (!mark_stack_->IsEmpty()) {
-        const Object* obj = mark_stack_->PopBack();
-        DCHECK(obj != NULL);
-        fifo[fifo_pos & fifo_mask] = obj;
-        __builtin_prefetch(obj);
-        fifo_count++;
-      }
-      fifo_pos++;
-
-      if (!fifo_count) {
-        CHECK(mark_stack_->IsEmpty()) << mark_stack_->Size();
-        break;
-      }
-    }
+  const bool parallel = kParallelProcessMarkStack && GetHeap()->GetThreadPool() &&
+      mark_stack_->Size() >= kMinimumParallelMarkStackSize;
+  if (parallel) {
+    ProcessMarkStackParallel(paused);
   } else {
-    while (!mark_stack_->IsEmpty()) {
-      const Object* obj = mark_stack_->PopBack();
+    // TODO: Tune this.
+    static const size_t kFifoSize = 4;
+    BoundedFifoPowerOfTwo<const Object*, kFifoSize> prefetch_fifo;
+    for (;;) {
+      const Object* obj = NULL;
+      if (kUseMarkStackPrefetch) {
+        while (!mark_stack_->IsEmpty() && prefetch_fifo.size() < kFifoSize) {
+          const Object* obj = mark_stack_->PopBack();
+          DCHECK(obj != NULL);
+          __builtin_prefetch(obj);
+          prefetch_fifo.push_back(obj);
+        }
+        if (prefetch_fifo.empty()) {
+          break;
+        }
+        obj = prefetch_fifo.front();
+        prefetch_fifo.pop_front();
+      } else {
+        if (mark_stack_->IsEmpty()) {
+          break;
+        }
+        obj = mark_stack_->PopBack();
+      }
       DCHECK(obj != NULL);
       ScanObject(obj);
     }
@@ -1362,9 +1460,8 @@
   *list = clear;
   timings_.EndSplit();
 
-  // Restart the mark with the newly black references added to the
-  // root set.
-  ProcessMarkStack();
+  // Restart the mark with the newly black references added to the root set.
+  ProcessMarkStack(true);
 }
 
 inline bool MarkSweep::IsMarked(const Object* object) const
@@ -1422,7 +1519,7 @@
   }
   timings_.EndSplit();
   if (has_enqueued) {
-    ProcessMarkStack();
+    ProcessMarkStack(true);
   }
   DCHECK(*list == NULL);
 }
diff --git a/runtime/gc/collector/mark_sweep.h b/runtime/gc/collector/mark_sweep.h
index 8db03d3..ba12e64 100644
--- a/runtime/gc/collector/mark_sweep.h
+++ b/runtime/gc/collector/mark_sweep.h
@@ -54,7 +54,6 @@
   class ContinuousSpace;
 }  // namespace space
 
-class CheckObjectVisitor;
 class Heap;
 
 namespace collector {
@@ -126,7 +125,7 @@
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
   // Builds a mark stack with objects on dirty cards and recursively mark until it empties.
-  void RecursiveMarkDirtyObjects(byte minimum_age)
+  void RecursiveMarkDirtyObjects(bool paused, byte minimum_age)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
@@ -260,8 +259,13 @@
   // Unmarks an object by clearing the bit inside of the corresponding bitmap, or if it is in a
   // space set, removing the object from the set.
   void UnMarkObjectNonNull(const mirror::Object* obj)
-        SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
-        EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
+
+  // Mark the vm thread roots.
+  virtual void MarkThreadRoots(Thread* self)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Marks an object atomically, safe to use from multiple threads.
   void MarkObjectNonNullParallel(const mirror::Object* obj);
@@ -342,20 +346,20 @@
   }
 
   // Blackens objects grayed during a garbage collection.
-  void ScanGrayObjects(byte minimum_age)
+  void ScanGrayObjects(bool paused, byte minimum_age)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Schedules an unmarked object for reference processing.
-  void DelayReferenceReferent(mirror::Object* reference)
+  void DelayReferenceReferent(mirror::Class* klass, mirror::Object* reference)
       SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
 
   // Recursively blackens objects on the mark stack.
-  void ProcessMarkStack()
+  void ProcessMarkStack(bool paused)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  void ProcessMarkStackParallel()
+  void ProcessMarkStackParallel(bool paused)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
@@ -402,6 +406,9 @@
   mirror::Object* phantom_reference_list_;
   mirror::Object* cleared_reference_list_;
 
+  // Parallel finger.
+  AtomicInteger atomic_finger_;
+
   // Number of bytes freed in this collection.
   AtomicInteger freed_bytes_;
   // Number of objects freed in this collection.
@@ -431,7 +438,6 @@
  private:
   friend class AddIfReachesAllocSpaceVisitor;  // Used by mod-union table.
   friend class CheckBitmapVisitor;
-  friend class CheckObjectVisitor;
   friend class CheckReferenceVisitor;
   friend class art::gc::Heap;
   friend class InternTableEntryIsUnmarked;
@@ -445,7 +451,7 @@
   friend class ModUnionScanImageRootVisitor;
   friend class ScanBitmapVisitor;
   friend class ScanImageRootVisitor;
-  friend class MarkStackChunk;
+  template<bool kUseFinger> friend class MarkStackTask;
   friend class FifoMarkStackChunk;
 
   DISALLOW_COPY_AND_ASSIGN(MarkSweep);
diff --git a/runtime/gc/collector/sticky_mark_sweep.cc b/runtime/gc/collector/sticky_mark_sweep.cc
index 008d3e0..19cbe6b 100644
--- a/runtime/gc/collector/sticky_mark_sweep.cc
+++ b/runtime/gc/collector/sticky_mark_sweep.cc
@@ -47,7 +47,11 @@
 }
 
 void StickyMarkSweep::MarkReachableObjects() {
-  RecursiveMarkDirtyObjects(accounting::CardTable::kCardDirty - 1);
+  // All reachable objects must be referenced by a root or a dirty card, so we can clear the mark
+  // stack here since all objects in the mark stack will get scanned by the card scanning anyway.
+  // TODO: Avoid putting these objects in the mark stack in the first place.
+  mark_stack_->Reset();
+  RecursiveMarkDirtyObjects(false, accounting::CardTable::kCardDirty - 1);
 }
 
 void StickyMarkSweep::Sweep(bool swap_bitmaps) {
@@ -55,6 +59,11 @@
   SweepArray(live_stack, false);
 }
 
+void StickyMarkSweep::MarkThreadRoots(Thread* self) {
+  MarkRootsCheckpoint(self);
+}
+
+
 }  // namespace collector
 }  // namespace gc
 }  // namespace art
diff --git a/runtime/gc/collector/sticky_mark_sweep.h b/runtime/gc/collector/sticky_mark_sweep.h
index 2099c79..79c4359 100644
--- a/runtime/gc/collector/sticky_mark_sweep.h
+++ b/runtime/gc/collector/sticky_mark_sweep.h
@@ -43,6 +43,10 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
+  virtual void MarkThreadRoots(Thread* self)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
+
   void Sweep(bool swap_bitmaps) EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
  private:
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index d5a8d75..159e379 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -80,7 +80,10 @@
       num_gc_threads_(num_gc_threads),
       low_memory_mode_(low_memory_mode),
       have_zygote_space_(false),
-      reference_queue_lock_(NULL),
+      soft_ref_queue_lock_(NULL),
+      weak_ref_queue_lock_(NULL),
+      finalizer_ref_queue_lock_(NULL),
+      phantom_ref_queue_lock_(NULL),
       is_gc_running_(false),
       last_gc_type_(collector::kGcTypeNone),
       next_gc_type_(collector::kGcTypePartial),
@@ -97,7 +100,7 @@
       // Initially care about pauses in case we never get notified of process states, or if the JNI
       // code becomes broken.
       care_about_pause_times_(true),
-      concurrent_start_bytes_(concurrent_gc ? initial_size - (kMinConcurrentRemainingBytes)
+      concurrent_start_bytes_(concurrent_gc ? initial_size - kMinConcurrentRemainingBytes
                                             :  std::numeric_limits<size_t>::max()),
       total_bytes_freed_ever_(0),
       total_objects_freed_ever_(0),
@@ -219,8 +222,11 @@
   gc_complete_cond_.reset(new ConditionVariable("GC complete condition variable",
                                                 *gc_complete_lock_));
 
-  // Create the reference queue lock, this is required so for parallel object scanning in the GC.
-  reference_queue_lock_ = new Mutex("reference queue lock");
+  // Create the reference queue locks; these are required for parallel object scanning in the GC.
+  soft_ref_queue_lock_ = new Mutex("Soft reference queue lock");
+  weak_ref_queue_lock_ = new Mutex("Weak reference queue lock");
+  finalizer_ref_queue_lock_ = new Mutex("Finalizer reference queue lock");
+  phantom_ref_queue_lock_ = new Mutex("Phantom reference queue lock");
 
   last_gc_time_ns_ = NanoTime();
   last_gc_size_ = GetBytesAllocated();
@@ -240,20 +246,15 @@
 }
 
 void Heap::CreateThreadPool() {
-  thread_pool_.reset(new ThreadPool(num_gc_threads_));
+  if (num_gc_threads_ != 0) {
+    thread_pool_.reset(new ThreadPool(num_gc_threads_));
+  }
 }
 
 void Heap::DeleteThreadPool() {
   thread_pool_.reset(NULL);
 }
 
-// Sort spaces based on begin address
-struct ContinuousSpaceSorter {
-  bool operator()(const space::ContinuousSpace* a, const space::ContinuousSpace* b) const {
-    return a->Begin() < b->Begin();
-  }
-};
-
 static bool ReadStaticInt(JNIEnvExt* env, jclass clz, const char* name, int* out_value) {
   CHECK(out_value != NULL);
   jfieldID field = env->GetStaticFieldID(clz, name, "I");
@@ -266,7 +267,7 @@
 }
 
 void Heap::ListenForProcessStateChange() {
-  VLOG(gc) << "Heap notified of process state change";
+  VLOG(heap) << "Heap notified of process state change";
 
   Thread* self = Thread::Current();
   JNIEnvExt* env = self->GetJniEnv();
@@ -320,8 +321,8 @@
       int process_state = 0;
       if (ReadStaticInt(env, activity_manager.get(), care_about_pauses[i], &process_state)) {
         process_state_cares_about_pause_time_.insert(process_state);
-        VLOG(gc) << "Adding process state " << process_state
-                 << " to set of states which care about pause time";
+        VLOG(heap) << "Adding process state " << process_state
+                   << " to set of states which care about pause time";
       }
     }
   }
@@ -367,8 +368,8 @@
     care_about_pause_times_ = process_state_cares_about_pause_time_.find(process_state) !=
         process_state_cares_about_pause_time_.end();
 
-    VLOG(gc) << "New process state " << process_state
-             << " care about pauses " << care_about_pause_times_;
+    VLOG(heap) << "New process state " << process_state
+               << " care about pauses " << care_about_pause_times_;
   }
 }
 
@@ -385,7 +386,10 @@
   }
 
   // Ensure that spaces remain sorted in increasing order of start address (required for CMS finger)
-  std::sort(continuous_spaces_.begin(), continuous_spaces_.end(), ContinuousSpaceSorter());
+  std::sort(continuous_spaces_.begin(), continuous_spaces_.end(),
+            [](const space::ContinuousSpace* a, const space::ContinuousSpace* b) {
+              return a->Begin() < b->Begin();
+            });
 
   // Ensure that ImageSpaces < ZygoteSpaces < AllocSpaces so that we can do address based checks to
   // avoid redundant marking.
@@ -493,7 +497,10 @@
   STLDeleteElements(&continuous_spaces_);
   STLDeleteElements(&discontinuous_spaces_);
   delete gc_complete_lock_;
-  delete reference_queue_lock_;
+  delete soft_ref_queue_lock_;
+  delete weak_ref_queue_lock_;
+  delete finalizer_ref_queue_lock_;
+  delete phantom_ref_queue_lock_;
 }
 
 space::ContinuousSpace* Heap::FindContinuousSpaceFromObject(const mirror::Object* obj,
@@ -1155,7 +1162,6 @@
                                                bool clear_soft_references) {
   Thread* self = Thread::Current();
 
-
   ScopedThreadStateChange tsc(self, kWaitingPerformingGc);
   Locks::mutator_lock_->AssertNotHeld(self);
 
@@ -1249,7 +1255,7 @@
                        << ((i != pauses.size() - 1) ? ", " : "");
       }
       LOG(INFO) << gc_cause << " " << collector->GetName()
-                << "GC freed " << PrettySize(collector->GetFreedBytes()) << ", "
+                << " GC freed " << PrettySize(collector->GetFreedBytes()) << ", "
                 << percent_free << "% free, " << PrettySize(current_heap_size) << "/"
                 << PrettySize(total_memory) << ", " << "paused " << pause_string.str()
                 << " total " << PrettyDuration((duration / 1000) * 1000);
@@ -1835,17 +1841,18 @@
 void Heap::EnqueuePendingReference(mirror::Object* ref, mirror::Object** list) {
   DCHECK(ref != NULL);
   DCHECK(list != NULL);
-
-  // TODO: Remove this lock, use atomic stacks for storing references.
-  MutexLock mu(Thread::Current(), *reference_queue_lock_);
-  if (*list == NULL) {
-    ref->SetFieldObject(reference_pendingNext_offset_, ref, false);
-    *list = ref;
-  } else {
-    mirror::Object* head =
-        (*list)->GetFieldObject<mirror::Object*>(reference_pendingNext_offset_, false);
-    ref->SetFieldObject(reference_pendingNext_offset_, head, false);
-    (*list)->SetFieldObject(reference_pendingNext_offset_, ref, false);
+  mirror::Object* pending =
+      ref->GetFieldObject<mirror::Object*>(reference_pendingNext_offset_, false);
+  if (pending == NULL) {
+    if (*list == NULL) {
+      ref->SetFieldObject(reference_pendingNext_offset_, ref, false);
+      *list = ref;
+    } else {
+      mirror::Object* head =
+          (*list)->GetFieldObject<mirror::Object*>(reference_pendingNext_offset_, false);
+      ref->SetFieldObject(reference_pendingNext_offset_, head, false);
+      (*list)->SetFieldObject(reference_pendingNext_offset_, ref, false);
+    }
   }
 }
 
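Two things changed in EnqueuePendingReference above. First, a reference whose pendingNext field is already set is skipped, so enqueueing the same reference twice is now a no-op. Second, the single global reference_queue_lock_ is gone; callers are expected to hold the per-type lock exposed by the new Heap getters (added in heap.h below), so parallel markers only contend with markers handling the same reference type. A sketch of the assumed caller pattern, written against the surrounding runtime headers rather than as a verbatim excerpt:

// Assumed usage, not actual ART code: serialize on the soft-reference queue
// lock only, then delegate to the enqueue helper, which no longer locks internally.
void EnqueueSoftReference(gc::Heap* heap, Thread* self, mirror::Object* ref,
                          mirror::Object** soft_reference_list) {
  MutexLock mu(self, *heap->GetSoftRefQueueLock());
  heap->EnqueuePendingReference(ref, soft_reference_list);
}
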
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index b3346e8..72e6e43 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -380,6 +380,22 @@
     return large_object_space_;
   }
 
+  Mutex* GetSoftRefQueueLock() {
+    return soft_ref_queue_lock_;
+  }
+
+  Mutex* GetWeakRefQueueLock() {
+    return weak_ref_queue_lock_;
+  }
+
+  Mutex* GetFinalizerRefQueueLock() {
+    return finalizer_ref_queue_lock_;
+  }
+
+  Mutex* GetPhantomRefQueueLock() {
+    return phantom_ref_queue_lock_;
+  }
+
   void DumpSpaces();
 
   // GC performance measuring
@@ -499,7 +515,7 @@
   const bool concurrent_gc_;
 
   // How many GC threads we may use for garbage collection.
-  const bool num_gc_threads_;
+  const size_t num_gc_threads_;
 
   // Boolean for if we are in low memory mode.
   const bool low_memory_mode_;
@@ -512,9 +528,12 @@
   Mutex* gc_complete_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
   UniquePtr<ConditionVariable> gc_complete_cond_ GUARDED_BY(gc_complete_lock_);
 
-  // Mutex held when adding references to reference queues.
+  // Mutexes held when adding references to reference queues.
   // TODO: move to a UniquePtr, currently annotalysis is confused that UniquePtr isn't lockable.
-  Mutex* reference_queue_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
+  Mutex* soft_ref_queue_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
+  Mutex* weak_ref_queue_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
+  Mutex* finalizer_ref_queue_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
+  Mutex* phantom_ref_queue_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
 
   // True while the garbage collector is running.
   volatile bool is_gc_running_ GUARDED_BY(gc_complete_lock_);
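
The num_gc_threads_ change above is the clamp bug called out in the commit message: declaring the field as const bool meant any requested thread count was converted to true or false before being stored, so the thread pool was always created with at most one worker. Together with the new guard in CreateThreadPool, a count of zero now means no pool is created at all, so users of thread_pool_ have to tolerate a null pointer. A standalone demonstration of the implicit-conversion trap (plain C++, independent of the runtime):

#include <cstddef>
#include <iostream>

int main() {
  const size_t requested = 4;
  const bool stored = requested;    // Any non-zero count collapses to true...
  const size_t used = stored;       // ...which converts back to exactly 1.
  std::cout << requested << " -> " << used << std::endl;  // Prints "4 -> 1".
  return 0;
}
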
diff --git a/runtime/thread_pool.cc b/runtime/thread_pool.cc
index 067ef2d..f7fdcfb 100644
--- a/runtime/thread_pool.cc
+++ b/runtime/thread_pool.cc
@@ -23,6 +23,8 @@
 
 namespace art {
 
+static const bool kMeasureWaitTime = false;
+
 ThreadPoolWorker::ThreadPoolWorker(ThreadPool* thread_pool, const std::string& name,
                                    size_t stack_size)
     : thread_pool_(thread_pool),
@@ -64,7 +66,7 @@
   MutexLock mu(self, task_queue_lock_);
   tasks_.push_back(task);
   // If we have any waiters, signal one.
-  if (waiting_count_ != 0) {
+  if (started_ && waiting_count_ != 0) {
     task_queue_condition_.Signal(self);
   }
 }
@@ -129,11 +131,13 @@
       // We may be done, lets broadcast to the completion condition.
       completion_condition_.Broadcast(self);
     }
-    const uint64_t wait_start = NanoTime();
+    const uint64_t wait_start = kMeasureWaitTime ? NanoTime() : 0;
     task_queue_condition_.Wait(self);
-    const uint64_t wait_end = NanoTime();
-    total_wait_time_ += wait_end - std::max(wait_start, start_time_);
-    waiting_count_--;
+    if (kMeasureWaitTime) {
+      const uint64_t wait_end = NanoTime();
+      total_wait_time_ += wait_end - std::max(wait_start, start_time_);
+    }
+    --waiting_count_;
   }
 
   // We are shutting down, return NULL to tell the worker thread to stop looping.
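
Two small thread-pool changes are worth spelling out. kMeasureWaitTime is a compile-time constant set to false, so the two NanoTime() reads and the accumulation into total_wait_time_ become dead code the compiler can drop from the per-task loop; flipping the constant restores the measurement for profiling. The started_ check in AddTask stops tasks queued before StartWorkers from signalling workers that are not yet consuming tasks. A small sketch of the same constant-guarded timing pattern in isolation (names here are illustrative, not the runtime's):

#include <chrono>
#include <cstdint>

static const bool kMeasureWaitTime = false;  // Flip to true when profiling.

static uint64_t NowNs() {
  return std::chrono::duration_cast<std::chrono::nanoseconds>(
      std::chrono::steady_clock::now().time_since_epoch()).count();
}

// Wraps a blocking wait; both timing reads compile away when the flag is false.
template <typename WaitFn>
void TimedWait(WaitFn&& wait, uint64_t* total_wait_ns) {
  const uint64_t wait_start = kMeasureWaitTime ? NowNs() : 0;
  wait();
  if (kMeasureWaitTime) {
    *total_wait_ns += NowNs() - wait_start;
  }
}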