Merge "Thread-local mark stacks for the CC collector."
diff --git a/runtime/asm_support.h b/runtime/asm_support.h
index 20d75f3..9142012 100644
--- a/runtime/asm_support.h
+++ b/runtime/asm_support.h
@@ -89,7 +89,7 @@
             art::Thread::ThinLockIdOffset<__SIZEOF_POINTER__>().Int32Value())
 
 // Offset of field Thread::tlsPtr_.card_table.
-#define THREAD_CARD_TABLE_OFFSET 128
+#define THREAD_CARD_TABLE_OFFSET 136
 ADD_TEST_EQ(THREAD_CARD_TABLE_OFFSET,
             art::Thread::CardTableOffset<__SIZEOF_POINTER__>().Int32Value())
 
diff --git a/runtime/entrypoints_order_test.cc b/runtime/entrypoints_order_test.cc
index 0a5ebfa..77b1e86 100644
--- a/runtime/entrypoints_order_test.cc
+++ b/runtime/entrypoints_order_test.cc
@@ -133,7 +133,8 @@
                         sizeof(void*) * kLockLevelCount);
     EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, nested_signal_state, flip_function, sizeof(void*));
     EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, flip_function, method_verifier, sizeof(void*));
-    EXPECT_OFFSET_DIFF(Thread, tlsPtr_.method_verifier, Thread, wait_mutex_, sizeof(void*),
+    EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, method_verifier, thread_local_mark_stack, sizeof(void*));
+    EXPECT_OFFSET_DIFF(Thread, tlsPtr_.thread_local_mark_stack, Thread, wait_mutex_, sizeof(void*),
                        thread_tlsptr_end);
   }
 
diff --git a/runtime/gc/accounting/atomic_stack.h b/runtime/gc/accounting/atomic_stack.h
index ac716ea..93f32e8 100644
--- a/runtime/gc/accounting/atomic_stack.h
+++ b/runtime/gc/accounting/atomic_stack.h
@@ -156,6 +156,10 @@
     return Size() == 0;
   }
 
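+  // Returns true if the stack has reached its growth limit and cannot accept more elements.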
+  bool IsFull() const {
+    return Size() == growth_limit_;
+  }
+
   size_t Size() const {
     DCHECK_LE(front_index_.LoadRelaxed(), back_index_.LoadRelaxed());
     return back_index_.LoadRelaxed() - front_index_.LoadRelaxed();
diff --git a/runtime/gc/allocation_record.cc b/runtime/gc/allocation_record.cc
index 6537ed2..88c475b 100644
--- a/runtime/gc/allocation_record.cc
+++ b/runtime/gc/allocation_record.cc
@@ -265,7 +265,8 @@
   }
 
   // Wait for GC's sweeping to complete and allow new records
-  while (UNLIKELY(!records->allow_new_record_)) {
+  while (UNLIKELY((!kUseReadBarrier && !records->allow_new_record_) ||
+                  (kUseReadBarrier && !self->GetWeakRefAccessEnabled()))) {
     records->new_record_condition_.WaitHoldingLocks(self);
   }
 
diff --git a/runtime/gc/collector/concurrent_copying.cc b/runtime/gc/collector/concurrent_copying.cc
index ccf5154..9316b27 100644
--- a/runtime/gc/collector/concurrent_copying.cc
+++ b/runtime/gc/collector/concurrent_copying.cc
@@ -17,6 +17,7 @@
 #include "concurrent_copying.h"
 
 #include "art_field-inl.h"
+#include "base/stl_util.h"
 #include "gc/accounting/heap_bitmap-inl.h"
 #include "gc/accounting/space_bitmap-inl.h"
 #include "gc/reference_processor.h"
@@ -38,17 +39,22 @@
     : GarbageCollector(heap,
                        name_prefix + (name_prefix.empty() ? "" : " ") +
                        "concurrent copying + mark sweep"),
-      region_space_(nullptr), gc_barrier_(new Barrier(0)), mark_queue_(2 * MB),
+      region_space_(nullptr), gc_barrier_(new Barrier(0)),
+      gc_mark_stack_(accounting::ObjectStack::Create("concurrent copying gc mark stack",
+                                                     2 * MB, 2 * MB)),
+      mark_stack_lock_("concurrent copying mark stack lock", kMarkSweepMarkStackLock),
+      thread_running_gc_(nullptr),
       is_marking_(false), is_active_(false), is_asserting_to_space_invariant_(false),
-      heap_mark_bitmap_(nullptr), live_stack_freeze_size_(0),
+      heap_mark_bitmap_(nullptr), live_stack_freeze_size_(0), mark_stack_mode_(kMarkStackModeOff),
+      weak_ref_access_enabled_(true),
       skipped_blocks_lock_("concurrent copying bytes blocks lock", kMarkSweepMarkStackLock),
       rb_table_(heap_->GetReadBarrierTable()),
       force_evacuate_all_(false) {
   static_assert(space::RegionSpace::kRegionSize == accounting::ReadBarrierTable::kRegionSize,
                 "The region space size and the read barrier table region size must match");
   cc_heap_bitmap_.reset(new accounting::HeapBitmap(heap));
+  Thread* self = Thread::Current();
   {
-    Thread* self = Thread::Current();
     ReaderMutexLock mu(self, *Locks::heap_bitmap_lock_);
     // Cache this so that we won't have to lock heap_bitmap_lock_ in
     // Mark() which could cause a nested lock on heap_bitmap_lock_
@@ -56,9 +62,19 @@
     // (class_linker_lock_ and heap_bitmap_lock_).
     heap_mark_bitmap_ = heap->GetMarkBitmap();
   }
+  {
+    MutexLock mu(self, mark_stack_lock_);
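+    // Pre-allocate a pool of mark stacks so that mutators can usually grab one without
+    // allocating while marking is in progress.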
+    for (size_t i = 0; i < kMarkStackPoolSize; ++i) {
+      accounting::AtomicStack<mirror::Object>* mark_stack =
+          accounting::AtomicStack<mirror::Object>::Create(
+              "thread local mark stack", kMarkStackSize, kMarkStackSize);
+      pooled_mark_stacks_.push_back(mark_stack);
+    }
+  }
 }
 
 ConcurrentCopying::~ConcurrentCopying() {
+  STLDeleteElements(&pooled_mark_stacks_);
 }
 
 void ConcurrentCopying::RunPhases() {
@@ -66,6 +82,7 @@
   CHECK(!is_active_);
   is_active_ = true;
   Thread* self = Thread::Current();
+  thread_running_gc_ = self;
   Locks::mutator_lock_->AssertNotHeld(self);
   {
     ReaderMutexLock mu(self, *Locks::mutator_lock_);
@@ -80,7 +97,7 @@
   if (kEnableNoFromSpaceRefsVerification || kIsDebugBuild) {
     TimingLogger::ScopedTiming split("(Paused)VerifyNoFromSpaceReferences", GetTimings());
     ScopedPause pause(this);
-    CheckEmptyMarkQueue();
+    CheckEmptyMarkStack();
     if (kVerboseMode) {
       LOG(INFO) << "Verifying no from-space refs";
     }
@@ -88,7 +105,7 @@
     if (kVerboseMode) {
       LOG(INFO) << "Done verifying no from-space refs";
     }
-    CheckEmptyMarkQueue();
+    CheckEmptyMarkStack();
   }
   {
     ReaderMutexLock mu(self, *Locks::mutator_lock_);
@@ -97,6 +114,7 @@
   FinishPhase();
   CHECK(is_active_);
   is_active_ = false;
+  thread_running_gc_ = nullptr;
 }
 
 void ConcurrentCopying::BindBitmaps() {
@@ -133,7 +151,7 @@
     LOG(INFO) << "Region-space : " << reinterpret_cast<void*>(region_space_->Begin()) << "-"
               << reinterpret_cast<void*>(region_space_->Limit());
   }
-  CHECK(mark_queue_.IsEmpty());
+  CheckEmptyMarkStack();
   immune_region_.Reset();
   bytes_moved_.StoreRelaxed(0);
   objects_moved_.StoreRelaxed(0);
@@ -210,6 +228,7 @@
       cc->from_space_num_bytes_at_first_pause_ = cc->region_space_->GetBytesAllocated();
     }
     cc->is_marking_ = true;
+    cc->mark_stack_mode_.StoreRelaxed(ConcurrentCopying::kMarkStackModeThreadLocal);
     if (UNLIKELY(Runtime::Current()->IsActiveTransaction())) {
       CHECK(Runtime::Current()->IsAotCompiler());
       TimingLogger::ScopedTiming split2("(Paused)VisitTransactionRoots", cc->GetTimings());
@@ -284,7 +303,7 @@
     } else {
       // Newly marked. Set the gray bit and push it onto the mark stack.
       CHECK(!kUseBakerReadBarrier || obj->GetReadBarrierPointer() == ReadBarrier::GrayPtr());
-      collector_->PushOntoMarkStack<true>(obj);
+      collector_->PushOntoMarkStack(obj);
     }
   }
 
@@ -320,6 +339,7 @@
   if (kVerboseMode) {
     LOG(INFO) << "GC MarkingPhase";
   }
+  CHECK(weak_ref_access_enabled_);
   {
     // Mark the image root. The WB-based collectors do not need to
     // scan the image objects from roots by relying on the card table,
@@ -371,37 +391,47 @@
   Thread* self = Thread::Current();
   {
     TimingLogger::ScopedTiming split6("ProcessMarkStack", GetTimings());
-    // Process the mark stack and issue an empty check point. If the
-    // mark stack is still empty after the check point, we're
-    // done. Otherwise, repeat.
+    // We transition through three mark stack modes (thread-local, shared, GC-exclusive). The
+    // primary reasons are that we need a checkpoint to process thread-local mark stacks, but we
+    // can't use a checkpoint once weak ref accesses are disabled (running threads may be blocked
+    // at WaitHoldingLocks, so a checkpoint would deadlock), and that once we reach the point
+    // where we process weak references, we can avoid using a lock when accessing the GC mark
+    // stack, which makes mark stack processing more efficient.
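+    //
+    // The overall sequence is: thread-local mode -> (single checkpoint: revoke thread-local mark
+    // stacks and disable weak ref access) -> shared mode -> GC-exclusive mode -> process weak
+    // refs -> re-enable weak ref access.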
+
+    // Process the mark stack once in the thread-local stack mode. This marks most of the live
+    // objects, aside from weak ref accesses with read barriers (Reference::GetReferent() and
+    // system weaks) that may happen concurrently while we process the mark stack and may newly
+    // mark/gray objects and push refs onto the mark stack.
     ProcessMarkStack();
-    size_t count = 0;
-    while (!ProcessMarkStack()) {
-      ++count;
-      if (kVerboseMode) {
-        LOG(INFO) << "Issue an empty check point. " << count;
-      }
-      IssueEmptyCheckpoint();
-    }
-    // Need to ensure the mark stack is empty before reference
-    // processing to get rid of non-reference gray objects.
-    CheckEmptyMarkQueue();
-    // Enable the GetReference slow path and disallow access to the system weaks.
-    GetHeap()->GetReferenceProcessor()->EnableSlowPath();
-    Runtime::Current()->DisallowNewSystemWeaks();
-    QuasiAtomic::ThreadFenceForConstructor();
-    // Lock-unlock the system weak locks so that there's no thread in
-    // the middle of accessing system weaks.
-    Runtime::Current()->EnsureNewSystemWeaksDisallowed();
-    // Note: Do not issue a checkpoint from here to the
-    // SweepSystemWeaks call or else a deadlock due to
-    // WaitHoldingLocks() would occur.
+    // Switch to the shared mark stack mode. That is, revoke and process the thread-local mark
+    // stacks one last time before transitioning to the shared mark stack mode, which processes any
+    // new refs that may have been concurrently pushed onto the mark stacks during the
+    // ProcessMarkStack() call above. At the same time, disable weak ref accesses using a
+    // per-thread flag. It's important to do these together in a single checkpoint so that we can
+    // ensure, without a race, that mutators won't newly gray objects and push new refs onto the
+    // mark stack due to weak ref accesses, and that mutators safely transition to the shared mark
+    // stack mode (without leaving unprocessed refs on their thread-local mark stacks). This is why
+    // we use the thread-local flag Thread::tls32_.weak_ref_access_enabled instead of global ones.
+    SwitchToSharedMarkStackMode();
+    CHECK(!self->GetWeakRefAccessEnabled());
+    // Now that weak ref accesses are disabled, once we exhaust the shared mark stack again here
+    // (which may be non-empty if refs were found on thread-local mark stacks during the above
+    // SwitchToSharedMarkStackMode() call), we won't have new refs to process; that is, mutators
+    // (via read barriers) have no way to produce any more refs to process. Marking converges once
+    // here, before we process weak refs below.
+    ProcessMarkStack();
+    CheckEmptyMarkStack();
+    // Switch to the GC exclusive mark stack mode so that we can process the mark stack without a
+    // lock from this point on.
+    SwitchToGcExclusiveMarkStackMode();
+    CheckEmptyMarkStack();
     if (kVerboseMode) {
-      LOG(INFO) << "Enabled the ref proc slow path & disabled access to system weaks.";
       LOG(INFO) << "ProcessReferences";
     }
-    ProcessReferences(self, true);
-    CheckEmptyMarkQueue();
+    // Process weak references. This may produce new refs to process and have them processed via
+    // ProcessMarkStackCallback (in the GC exclusive mark stack mode).
+    ProcessReferences(self);
+    CheckEmptyMarkStack();
     if (kVerboseMode) {
       LOG(INFO) << "SweepSystemWeaks";
     }
@@ -409,33 +439,52 @@
     if (kVerboseMode) {
       LOG(INFO) << "SweepSystemWeaks done";
     }
-    // Because hash_set::Erase() can call the hash function for
-    // arbitrary elements in the weak intern table in
-    // InternTable::Table::SweepWeaks(), the above SweepSystemWeaks()
-    // call may have marked some objects (strings) alive. So process
-    // the mark stack here once again.
+    // Process the mark stack here one last time because the above SweepSystemWeaks() call may have
+    // marked some objects (strings) alive, as hash_set::Erase() can call the hash function for
+    // arbitrary elements in the weak intern table in InternTable::Table::SweepWeaks().
     ProcessMarkStack();
-    CheckEmptyMarkQueue();
-    if (kVerboseMode) {
-      LOG(INFO) << "AllowNewSystemWeaks";
-    }
-    Runtime::Current()->AllowNewSystemWeaks();
+    CheckEmptyMarkStack();
+    // Re-enable weak ref accesses.
+    ReenableWeakRefAccess(self);
+    // Issue an empty checkpoint to ensure no threads are still in the middle of a read barrier
+    // which may have a from-space ref cached in a local variable.
     IssueEmptyCheckpoint();
-    // Disable marking.
+    // Marking is done. Disable marking.
     if (kUseTableLookupReadBarrier) {
       heap_->rb_table_->ClearAll();
       DCHECK(heap_->rb_table_->IsAllCleared());
     }
-    is_mark_queue_push_disallowed_.StoreSequentiallyConsistent(1);
-    is_marking_ = false;
-    CheckEmptyMarkQueue();
+    is_mark_stack_push_disallowed_.StoreSequentiallyConsistent(1);
+    is_marking_ = false;  // This disables the read barrier/marking of weak roots.
+    mark_stack_mode_.StoreSequentiallyConsistent(kMarkStackModeOff);
+    CheckEmptyMarkStack();
   }
 
+  CHECK(weak_ref_access_enabled_);
   if (kVerboseMode) {
     LOG(INFO) << "GC end of MarkingPhase";
   }
 }
 
+void ConcurrentCopying::ReenableWeakRefAccess(Thread* self) {
+  if (kVerboseMode) {
+    LOG(INFO) << "ReenableWeakRefAccess";
+  }
+  weak_ref_access_enabled_.StoreRelaxed(true);  // This is for new threads.
+  QuasiAtomic::ThreadFenceForConstructor();
+  // Iterate over all threads (we don't need to, and can't, use a checkpoint) and re-enable weak ref access.
+  {
+    MutexLock mu(self, *Locks::thread_list_lock_);
+    std::list<Thread*> thread_list = Runtime::Current()->GetThreadList()->GetList();
+    for (Thread* thread : thread_list) {
+      thread->SetWeakRefAccessEnabled(true);
+    }
+  }
+  // Unblock blocking threads.
+  GetHeap()->GetReferenceProcessor()->BroadcastForSlowPath(self);
+  Runtime::Current()->BroadcastForNewSystemWeaks();
+}
+
 void ConcurrentCopying::IssueEmptyCheckpoint() {
   Thread* self = Thread::Current();
   EmptyCheckpoint check_point(this);
@@ -456,18 +505,61 @@
   Locks::mutator_lock_->SharedLock(self);
 }
 
-mirror::Object* ConcurrentCopying::PopOffMarkStack() {
-  return mark_queue_.Dequeue();
-}
-
-template<bool kThreadSafe>
 void ConcurrentCopying::PushOntoMarkStack(mirror::Object* to_ref) {
-  CHECK_EQ(is_mark_queue_push_disallowed_.LoadRelaxed(), 0)
+  CHECK_EQ(is_mark_stack_push_disallowed_.LoadRelaxed(), 0)
       << " " << to_ref << " " << PrettyTypeOf(to_ref);
-  if (kThreadSafe) {
-    CHECK(mark_queue_.Enqueue(to_ref)) << "Mark queue overflow";
+  Thread* self = Thread::Current();  // TODO: pass self as an argument from call sites?
+  CHECK(thread_running_gc_ != nullptr);
+  MarkStackMode mark_stack_mode = mark_stack_mode_.LoadRelaxed();
+  if (mark_stack_mode == kMarkStackModeThreadLocal) {
+    if (self == thread_running_gc_) {
+      // If GC-running thread, use the GC mark stack instead of a thread-local mark stack.
+      CHECK(self->GetThreadLocalMarkStack() == nullptr);
+      CHECK(!gc_mark_stack_->IsFull());
+      gc_mark_stack_->PushBack(to_ref);
+    } else {
+      // Otherwise, use a thread-local mark stack.
+      accounting::AtomicStack<mirror::Object>* tl_mark_stack = self->GetThreadLocalMarkStack();
+      if (UNLIKELY(tl_mark_stack == nullptr || tl_mark_stack->IsFull())) {
+        MutexLock mu(self, mark_stack_lock_);
+        // Get a new thread local mark stack.
+        accounting::AtomicStack<mirror::Object>* new_tl_mark_stack;
+        if (!pooled_mark_stacks_.empty()) {
+          // Use a pooled mark stack.
+          new_tl_mark_stack = pooled_mark_stacks_.back();
+          pooled_mark_stacks_.pop_back();
+        } else {
+          // None pooled. Create a new one.
+          new_tl_mark_stack =
+              accounting::AtomicStack<mirror::Object>::Create(
+                  "thread local mark stack", 4 * KB, 4 * KB);
+        }
+        DCHECK(new_tl_mark_stack != nullptr);
+        DCHECK(new_tl_mark_stack->IsEmpty());
+        new_tl_mark_stack->PushBack(to_ref);
+        self->SetThreadLocalMarkStack(new_tl_mark_stack);
+        if (tl_mark_stack != nullptr) {
+          // Store the old full stack into a vector.
+          revoked_mark_stacks_.push_back(tl_mark_stack);
+        }
+      } else {
+        tl_mark_stack->PushBack(to_ref);
+      }
+    }
+  } else if (mark_stack_mode == kMarkStackModeShared) {
+    // Access the shared GC mark stack with a lock.
+    MutexLock mu(self, mark_stack_lock_);
+    CHECK(!gc_mark_stack_->IsFull());
+    gc_mark_stack_->PushBack(to_ref);
   } else {
-    CHECK(mark_queue_.EnqueueThreadUnsafe(to_ref)) << "Mark queue overflow";
+    CHECK_EQ(static_cast<uint32_t>(mark_stack_mode),
+             static_cast<uint32_t>(kMarkStackModeGcExclusive));
+    CHECK(self == thread_running_gc_)
+        << "Only GC-running thread should access the mark stack "
+        << "in the GC exclusive mark stack mode";
+    // Access the GC mark stack without a lock.
+    CHECK(!gc_mark_stack_->IsFull());
+    gc_mark_stack_->PushBack(to_ref);
   }
 }
 
@@ -696,83 +788,300 @@
   ConcurrentCopying* collector_;
 };
 
-bool ConcurrentCopying::ProcessMarkStack() {
+class RevokeThreadLocalMarkStackCheckpoint : public Closure {
+ public:
+  explicit RevokeThreadLocalMarkStackCheckpoint(ConcurrentCopying* concurrent_copying,
+                                                bool disable_weak_ref_access)
+      : concurrent_copying_(concurrent_copying),
+        disable_weak_ref_access_(disable_weak_ref_access) {
+  }
+
+  virtual void Run(Thread* thread) OVERRIDE NO_THREAD_SAFETY_ANALYSIS {
+    // Note: self is not necessarily equal to thread since thread may be suspended.
+    Thread* self = Thread::Current();
+    CHECK(thread == self || thread->IsSuspended() || thread->GetState() == kWaitingPerformingGc)
+        << thread->GetState() << " thread " << thread << " self " << self;
+    // Revoke thread local mark stacks.
+    accounting::AtomicStack<mirror::Object>* tl_mark_stack = thread->GetThreadLocalMarkStack();
+    if (tl_mark_stack != nullptr) {
+      MutexLock mu(self, concurrent_copying_->mark_stack_lock_);
+      concurrent_copying_->revoked_mark_stacks_.push_back(tl_mark_stack);
+      thread->SetThreadLocalMarkStack(nullptr);
+    }
+    // Disable weak ref access.
+    if (disable_weak_ref_access_) {
+      thread->SetWeakRefAccessEnabled(false);
+    }
+    // If thread is a running mutator, then act on behalf of the garbage collector.
+    // See the code in ThreadList::RunCheckpoint.
+    if (thread->GetState() == kRunnable) {
+      concurrent_copying_->GetBarrier().Pass(self);
+    }
+  }
+
+ private:
+  ConcurrentCopying* const concurrent_copying_;
+  const bool disable_weak_ref_access_;
+};
+
+void ConcurrentCopying::RevokeThreadLocalMarkStacks(bool disable_weak_ref_access) {
+  Thread* self = Thread::Current();
+  RevokeThreadLocalMarkStackCheckpoint check_point(this, disable_weak_ref_access);
+  ThreadList* thread_list = Runtime::Current()->GetThreadList();
+  gc_barrier_->Init(self, 0);
+  size_t barrier_count = thread_list->RunCheckpoint(&check_point);
+  // If there are no threads to wait for, which implies that all the checkpoint functions have
+  // finished, then there is no need to release the mutator lock.
+  if (barrier_count == 0) {
+    return;
+  }
+  Locks::mutator_lock_->SharedUnlock(self);
+  {
+    ScopedThreadStateChange tsc(self, kWaitingForCheckPointsToRun);
+    gc_barrier_->Increment(self, barrier_count);
+  }
+  Locks::mutator_lock_->SharedLock(self);
+}
+
+void ConcurrentCopying::RevokeThreadLocalMarkStack(Thread* thread) {
+  Thread* self = Thread::Current();
+  CHECK_EQ(self, thread);
+  accounting::AtomicStack<mirror::Object>* tl_mark_stack = thread->GetThreadLocalMarkStack();
+  if (tl_mark_stack != nullptr) {
+    CHECK(is_marking_);
+    MutexLock mu(self, mark_stack_lock_);
+    revoked_mark_stacks_.push_back(tl_mark_stack);
+    thread->SetThreadLocalMarkStack(nullptr);
+  }
+}
+
+void ConcurrentCopying::ProcessMarkStack() {
   if (kVerboseMode) {
     LOG(INFO) << "ProcessMarkStack. ";
   }
-  size_t count = 0;
-  mirror::Object* to_ref;
-  while ((to_ref = PopOffMarkStack()) != nullptr) {
-    ++count;
-    DCHECK(!region_space_->IsInFromSpace(to_ref));
-    if (kUseBakerReadBarrier) {
-      DCHECK(to_ref->GetReadBarrierPointer() == ReadBarrier::GrayPtr())
-          << " " << to_ref << " " << to_ref->GetReadBarrierPointer()
-          << " is_marked=" << IsMarked(to_ref);
+  bool empty_prev = false;
+  while (true) {
+    bool empty = ProcessMarkStackOnce();
+    if (empty_prev && empty) {
+      // Saw empty mark stack for a second time, done.
+      break;
     }
-    // Scan ref fields.
-    Scan(to_ref);
-    // Mark the gray ref as white or black.
-    if (kUseBakerReadBarrier) {
-      DCHECK(to_ref->GetReadBarrierPointer() == ReadBarrier::GrayPtr())
-          << " " << to_ref << " " << to_ref->GetReadBarrierPointer()
-          << " is_marked=" << IsMarked(to_ref);
-    }
-    if (to_ref->GetClass<kVerifyNone, kWithoutReadBarrier>()->IsTypeOfReferenceClass() &&
-        to_ref->AsReference()->GetReferent<kWithoutReadBarrier>() != nullptr &&
-        !IsInToSpace(to_ref->AsReference()->GetReferent<kWithoutReadBarrier>())) {
-      // Leave References gray so that GetReferent() will trigger RB.
-      CHECK(to_ref->AsReference()->IsEnqueued()) << "Left unenqueued ref gray " << to_ref;
-    } else {
-#ifdef USE_BAKER_OR_BROOKS_READ_BARRIER
-      if (kUseBakerReadBarrier) {
-        if (region_space_->IsInToSpace(to_ref)) {
-          // If to-space, change from gray to white.
-          bool success = to_ref->AtomicSetReadBarrierPointer(ReadBarrier::GrayPtr(),
-                                                             ReadBarrier::WhitePtr());
-          CHECK(success) << "Must succeed as we won the race.";
-          CHECK(to_ref->GetReadBarrierPointer() == ReadBarrier::WhitePtr());
-        } else {
-          // If non-moving space/unevac from space, change from gray
-          // to black. We can't change gray to white because it's not
-          // safe to use CAS if two threads change values in opposite
-          // directions (A->B and B->A). So, we change it to black to
-          // indicate non-moving objects that have been marked
-          // through. Note we'd need to change from black to white
-          // later (concurrently).
-          bool success = to_ref->AtomicSetReadBarrierPointer(ReadBarrier::GrayPtr(),
-                                                             ReadBarrier::BlackPtr());
-          CHECK(success) << "Must succeed as we won the race.";
-          CHECK(to_ref->GetReadBarrierPointer() == ReadBarrier::BlackPtr());
-        }
-      }
-#else
-      DCHECK(!kUseBakerReadBarrier);
-#endif
-    }
-    if (ReadBarrier::kEnableToSpaceInvariantChecks || kIsDebugBuild) {
-      ConcurrentCopyingAssertToSpaceInvariantObjectVisitor visitor(this);
-      visitor(to_ref);
-    }
+    empty_prev = empty;
   }
+}
+
+bool ConcurrentCopying::ProcessMarkStackOnce() {
+  Thread* self = Thread::Current();
+  CHECK(thread_running_gc_ != nullptr);
+  CHECK(self == thread_running_gc_);
+  CHECK(self->GetThreadLocalMarkStack() == nullptr);
+  size_t count = 0;
+  MarkStackMode mark_stack_mode = mark_stack_mode_.LoadRelaxed();
+  if (mark_stack_mode == kMarkStackModeThreadLocal) {
+    // Process the thread-local mark stacks and the GC mark stack.
+    count += ProcessThreadLocalMarkStacks(false);
+    while (!gc_mark_stack_->IsEmpty()) {
+      mirror::Object* to_ref = gc_mark_stack_->PopBack();
+      ProcessMarkStackRef(to_ref);
+      ++count;
+    }
+    gc_mark_stack_->Reset();
+  } else if (mark_stack_mode == kMarkStackModeShared) {
+    // Process the shared GC mark stack with a lock.
+    {
+      MutexLock mu(self, mark_stack_lock_);
+      CHECK(revoked_mark_stacks_.empty());
+    }
+    while (true) {
+      std::vector<mirror::Object*> refs;
+      {
+        // Copy refs with lock. Note the number of refs should be small.
+        MutexLock mu(self, mark_stack_lock_);
+        if (gc_mark_stack_->IsEmpty()) {
+          break;
+        }
+        for (StackReference<mirror::Object>* p = gc_mark_stack_->Begin();
+             p != gc_mark_stack_->End(); ++p) {
+          refs.push_back(p->AsMirrorPtr());
+        }
+        gc_mark_stack_->Reset();
+      }
+      for (mirror::Object* ref : refs) {
+        ProcessMarkStackRef(ref);
+        ++count;
+      }
+    }
+  } else {
+    CHECK_EQ(static_cast<uint32_t>(mark_stack_mode),
+             static_cast<uint32_t>(kMarkStackModeGcExclusive));
+    {
+      MutexLock mu(self, mark_stack_lock_);
+      CHECK(revoked_mark_stacks_.empty());
+    }
+    // Process the GC mark stack in the exclusive mode. No need to take the lock.
+    while (!gc_mark_stack_->IsEmpty()) {
+      mirror::Object* to_ref = gc_mark_stack_->PopBack();
+      ProcessMarkStackRef(to_ref);
+      ++count;
+    }
+    gc_mark_stack_->Reset();
+  }
+
   // Return true if the stack was empty.
   return count == 0;
 }
 
-void ConcurrentCopying::CheckEmptyMarkQueue() {
-  if (!mark_queue_.IsEmpty()) {
-    while (!mark_queue_.IsEmpty()) {
-      mirror::Object* obj = mark_queue_.Dequeue();
-      if (kUseBakerReadBarrier) {
-        mirror::Object* rb_ptr = obj->GetReadBarrierPointer();
-        LOG(INFO) << "On mark queue : " << obj << " " << PrettyTypeOf(obj) << " rb_ptr=" << rb_ptr
-                  << " is_marked=" << IsMarked(obj);
+size_t ConcurrentCopying::ProcessThreadLocalMarkStacks(bool disable_weak_ref_access) {
+  // Run a checkpoint to collect all thread local mark stacks and iterate over them all.
+  RevokeThreadLocalMarkStacks(disable_weak_ref_access);
+  size_t count = 0;
+  std::vector<accounting::AtomicStack<mirror::Object>*> mark_stacks;
+  {
+    MutexLock mu(Thread::Current(), mark_stack_lock_);
+    // Make a copy of the mark stack vector.
+    mark_stacks = revoked_mark_stacks_;
+    revoked_mark_stacks_.clear();
+  }
+  for (accounting::AtomicStack<mirror::Object>* mark_stack : mark_stacks) {
+    for (StackReference<mirror::Object>* p = mark_stack->Begin(); p != mark_stack->End(); ++p) {
+      mirror::Object* to_ref = p->AsMirrorPtr();
+      ProcessMarkStackRef(to_ref);
+      ++count;
+    }
+    {
+      MutexLock mu(Thread::Current(), mark_stack_lock_);
+      if (pooled_mark_stacks_.size() >= kMarkStackPoolSize) {
+        // The pool already has enough mark stacks; delete this one.
+        delete mark_stack;
       } else {
-        LOG(INFO) << "On mark queue : " << obj << " " << PrettyTypeOf(obj)
-                  << " is_marked=" << IsMarked(obj);
+        // Otherwise, put it into the pool for later reuse.
+        mark_stack->Reset();
+        pooled_mark_stacks_.push_back(mark_stack);
       }
     }
-    LOG(FATAL) << "mark queue is not empty";
+  }
+  return count;
+}
+
+void ConcurrentCopying::ProcessMarkStackRef(mirror::Object* to_ref) {
+  DCHECK(!region_space_->IsInFromSpace(to_ref));
+  if (kUseBakerReadBarrier) {
+    DCHECK(to_ref->GetReadBarrierPointer() == ReadBarrier::GrayPtr())
+        << " " << to_ref << " " << to_ref->GetReadBarrierPointer()
+        << " is_marked=" << IsMarked(to_ref);
+  }
+  // Scan ref fields.
+  Scan(to_ref);
+  // Mark the gray ref as white or black.
+  if (kUseBakerReadBarrier) {
+    DCHECK(to_ref->GetReadBarrierPointer() == ReadBarrier::GrayPtr())
+        << " " << to_ref << " " << to_ref->GetReadBarrierPointer()
+        << " is_marked=" << IsMarked(to_ref);
+  }
+  if (to_ref->GetClass<kVerifyNone, kWithoutReadBarrier>()->IsTypeOfReferenceClass() &&
+      to_ref->AsReference()->GetReferent<kWithoutReadBarrier>() != nullptr &&
+      !IsInToSpace(to_ref->AsReference()->GetReferent<kWithoutReadBarrier>())) {
+    // Leave References gray so that GetReferent() will trigger RB.
+    CHECK(to_ref->AsReference()->IsEnqueued()) << "Left unenqueued ref gray " << to_ref;
+  } else {
+#ifdef USE_BAKER_OR_BROOKS_READ_BARRIER
+    if (kUseBakerReadBarrier) {
+      if (region_space_->IsInToSpace(to_ref)) {
+        // If to-space, change from gray to white.
+        bool success = to_ref->AtomicSetReadBarrierPointer(ReadBarrier::GrayPtr(),
+                                                           ReadBarrier::WhitePtr());
+        CHECK(success) << "Must succeed as we won the race.";
+        CHECK(to_ref->GetReadBarrierPointer() == ReadBarrier::WhitePtr());
+      } else {
+        // If non-moving space/unevac from space, change from gray
+        // to black. We can't change gray to white because it's not
+        // safe to use CAS if two threads change values in opposite
+        // directions (A->B and B->A). So, we change it to black to
+        // indicate non-moving objects that have been marked
+        // through. Note we'd need to change from black to white
+        // later (concurrently).
+        bool success = to_ref->AtomicSetReadBarrierPointer(ReadBarrier::GrayPtr(),
+                                                           ReadBarrier::BlackPtr());
+        CHECK(success) << "Must succeed as we won the race.";
+        CHECK(to_ref->GetReadBarrierPointer() == ReadBarrier::BlackPtr());
+      }
+    }
+#else
+    DCHECK(!kUseBakerReadBarrier);
+#endif
+  }
+  if (ReadBarrier::kEnableToSpaceInvariantChecks || kIsDebugBuild) {
+    ConcurrentCopyingAssertToSpaceInvariantObjectVisitor visitor(this);
+    visitor(to_ref);
+  }
+}
+
+void ConcurrentCopying::SwitchToSharedMarkStackMode() {
+  Thread* self = Thread::Current();
+  CHECK(thread_running_gc_ != nullptr);
+  CHECK_EQ(self, thread_running_gc_);
+  CHECK(self->GetThreadLocalMarkStack() == nullptr);
+  MarkStackMode before_mark_stack_mode = mark_stack_mode_.LoadRelaxed();
+  CHECK_EQ(static_cast<uint32_t>(before_mark_stack_mode),
+           static_cast<uint32_t>(kMarkStackModeThreadLocal));
+  mark_stack_mode_.StoreRelaxed(kMarkStackModeShared);
+  CHECK(weak_ref_access_enabled_.LoadRelaxed());
+  weak_ref_access_enabled_.StoreRelaxed(false);
+  QuasiAtomic::ThreadFenceForConstructor();
+  // Process the thread-local mark stacks one last time after switching to the shared mark stack
+  // mode and disabling weak ref accesses.
+  ProcessThreadLocalMarkStacks(true);
+  if (kVerboseMode) {
+    LOG(INFO) << "Switched to shared mark stack mode and disabled weak ref access";
+  }
+}
+
+void ConcurrentCopying::SwitchToGcExclusiveMarkStackMode() {
+  Thread* self = Thread::Current();
+  CHECK(thread_running_gc_ != nullptr);
+  CHECK_EQ(self, thread_running_gc_);
+  CHECK(self->GetThreadLocalMarkStack() == nullptr);
+  MarkStackMode before_mark_stack_mode = mark_stack_mode_.LoadRelaxed();
+  CHECK_EQ(static_cast<uint32_t>(before_mark_stack_mode),
+           static_cast<uint32_t>(kMarkStackModeShared));
+  mark_stack_mode_.StoreRelaxed(kMarkStackModeGcExclusive);
+  QuasiAtomic::ThreadFenceForConstructor();
+  if (kVerboseMode) {
+    LOG(INFO) << "Switched to GC exclusive mark stack mode";
+  }
+}
+
+void ConcurrentCopying::CheckEmptyMarkStack() {
+  Thread* self = Thread::Current();
+  CHECK(thread_running_gc_ != nullptr);
+  CHECK_EQ(self, thread_running_gc_);
+  CHECK(self->GetThreadLocalMarkStack() == nullptr);
+  MarkStackMode mark_stack_mode = mark_stack_mode_.LoadRelaxed();
+  if (mark_stack_mode == kMarkStackModeThreadLocal) {
+    // Thread-local mark stack mode.
+    RevokeThreadLocalMarkStacks(false);
+    MutexLock mu(Thread::Current(), mark_stack_lock_);
+    if (!revoked_mark_stacks_.empty()) {
+      for (accounting::AtomicStack<mirror::Object>* mark_stack : revoked_mark_stacks_) {
+        while (!mark_stack->IsEmpty()) {
+          mirror::Object* obj = mark_stack->PopBack();
+          if (kUseBakerReadBarrier) {
+            mirror::Object* rb_ptr = obj->GetReadBarrierPointer();
+            LOG(INFO) << "On mark stack : " << obj << " " << PrettyTypeOf(obj)
+                      << " rb_ptr=" << rb_ptr << " is_marked=" << IsMarked(obj);
+          } else {
+            LOG(INFO) << "On mark stack : " << obj << " " << PrettyTypeOf(obj)
+                      << " is_marked=" << IsMarked(obj);
+          }
+        }
+      }
+      LOG(FATAL) << "mark stack is not empty";
+    }
+  } else {
+    // Shared, GC-exclusive, or off.
+    MutexLock mu(Thread::Current(), mark_stack_lock_);
+    CHECK(gc_mark_stack_->IsEmpty());
+    CHECK(revoked_mark_stacks_.empty());
   }
 }
 
@@ -792,7 +1101,7 @@
     heap_->MarkAllocStackAsLive(live_stack);
     live_stack->Reset();
   }
-  CHECK(mark_queue_.IsEmpty());
+  CheckEmptyMarkStack();
   TimingLogger::ScopedTiming split("Sweep", GetTimings());
   for (const auto& space : GetHeap()->GetContinuousSpaces()) {
     if (space->IsContinuousMemMapAllocSpace()) {
@@ -888,8 +1197,8 @@
     }
     IssueEmptyCheckpoint();
     // Disable the check.
-    is_mark_queue_push_disallowed_.StoreSequentiallyConsistent(0);
-    CheckEmptyMarkQueue();
+    is_mark_stack_push_disallowed_.StoreSequentiallyConsistent(0);
+    CheckEmptyMarkStack();
   }
 
   {
@@ -956,6 +1265,8 @@
     region_space_bitmap_ = nullptr;
   }
 
+  CheckEmptyMarkStack();
+
   if (kVerboseMode) {
     LOG(INFO) << "GC end of ReclaimPhase";
   }
@@ -1479,7 +1790,7 @@
       }
       DCHECK(GetFwdPtr(from_ref) == to_ref);
       CHECK_NE(to_ref->GetLockWord(false).GetState(), LockWord::kForwardingAddress);
-      PushOntoMarkStack<true>(to_ref);
+      PushOntoMarkStack(to_ref);
       return to_ref;
     } else {
       // The CAS failed. It may have lost the race or may have failed
@@ -1612,7 +1923,7 @@
       if (kUseBakerReadBarrier) {
         DCHECK(to_ref->GetReadBarrierPointer() == ReadBarrier::GrayPtr());
       }
-      PushOntoMarkStack<true>(to_ref);
+      PushOntoMarkStack(to_ref);
     }
   } else {
     // from_ref is in a non-moving space.
@@ -1639,7 +1950,7 @@
         if (kUseBakerReadBarrier) {
           DCHECK(to_ref->GetReadBarrierPointer() == ReadBarrier::GrayPtr());
         }
-        PushOntoMarkStack<true>(to_ref);
+        PushOntoMarkStack(to_ref);
       }
     } else {
       // Use the mark bitmap.
@@ -1695,7 +2006,7 @@
             if (kUseBakerReadBarrier) {
               DCHECK(to_ref->GetReadBarrierPointer() == ReadBarrier::GrayPtr());
             }
-            PushOntoMarkStack<true>(to_ref);
+            PushOntoMarkStack(to_ref);
           }
         }
       }
@@ -1705,9 +2016,11 @@
 }
 
 void ConcurrentCopying::FinishPhase() {
+  {
+    MutexLock mu(Thread::Current(), mark_stack_lock_);
+    CHECK_EQ(pooled_mark_stacks_.size(), kMarkStackPoolSize);
+  }
   region_space_ = nullptr;
-  CHECK(mark_queue_.IsEmpty());
-  mark_queue_.Clear();
   {
     MutexLock mu(Thread::Current(), skipped_blocks_lock_);
     skipped_blocks_map_.clear();
@@ -1740,7 +2053,8 @@
 }
 
 void ConcurrentCopying::ProcessMarkStackCallback(void* arg) {
-  reinterpret_cast<ConcurrentCopying*>(arg)->ProcessMarkStack();
+  ConcurrentCopying* concurrent_copying = reinterpret_cast<ConcurrentCopying*>(arg);
+  concurrent_copying->ProcessMarkStack();
 }
 
 void ConcurrentCopying::DelayReferenceReferent(mirror::Class* klass, mirror::Reference* reference) {
@@ -1748,11 +2062,12 @@
       klass, reference, &IsHeapReferenceMarkedCallback, this);
 }
 
-void ConcurrentCopying::ProcessReferences(Thread* self, bool concurrent) {
+void ConcurrentCopying::ProcessReferences(Thread* self) {
   TimingLogger::ScopedTiming split("ProcessReferences", GetTimings());
+  // We don't really need to take the heap bitmap lock here, as we use CAS to mark in the bitmaps.
   WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
   GetHeap()->GetReferenceProcessor()->ProcessReferences(
-      concurrent, GetTimings(), GetCurrentIteration()->GetClearSoftReferences(),
+      true /*concurrent*/, GetTimings(), GetCurrentIteration()->GetClearSoftReferences(),
       &IsHeapReferenceMarkedCallback, &MarkCallback, &ProcessMarkStackCallback, this);
 }
 
diff --git a/runtime/gc/collector/concurrent_copying.h b/runtime/gc/collector/concurrent_copying.h
index b1897b8..1fb4703 100644
--- a/runtime/gc/collector/concurrent_copying.h
+++ b/runtime/gc/collector/concurrent_copying.h
@@ -49,89 +49,6 @@
 
 namespace collector {
 
-// Concurrent queue. Used as the mark stack. TODO: use a concurrent
-// stack for locality.
-class MarkQueue {
- public:
-  explicit MarkQueue(size_t size) : size_(size) {
-    CHECK(IsPowerOfTwo(size_));
-    buf_.reset(new Atomic<mirror::Object*>[size_]);
-    CHECK(buf_.get() != nullptr);
-    Clear();
-  }
-
-  ALWAYS_INLINE Atomic<mirror::Object*>* GetSlotAddr(size_t index) {
-    return &(buf_.get()[index & (size_ - 1)]);
-  }
-
-  // Multiple-producer enqueue.
-  bool Enqueue(mirror::Object* to_ref) {
-    size_t t;
-    do {
-      t = tail_.LoadRelaxed();
-      size_t h = head_.LoadSequentiallyConsistent();
-      if (t + size_ == h) {
-        // It's full.
-        return false;
-      }
-    } while (!tail_.CompareExchangeWeakSequentiallyConsistent(t, t + 1));
-    // We got a slot but its content has not been filled yet at this point.
-    GetSlotAddr(t)->StoreSequentiallyConsistent(to_ref);
-    return true;
-  }
-
-  // Thread-unsafe.
-  bool EnqueueThreadUnsafe(mirror::Object* to_ref) {
-    size_t t = tail_.LoadRelaxed();
-    size_t h = head_.LoadRelaxed();
-    if (t + size_ == h) {
-      // It's full.
-      return false;
-    }
-    GetSlotAddr(t)->StoreRelaxed(to_ref);
-    tail_.StoreRelaxed(t + 1);
-    return true;
-  }
-
-  // Single-consumer dequeue.
-  mirror::Object* Dequeue() {
-    size_t h = head_.LoadRelaxed();
-    size_t t = tail_.LoadSequentiallyConsistent();
-    if (h == t) {
-      // it's empty.
-      return nullptr;
-    }
-    Atomic<mirror::Object*>* slot = GetSlotAddr(h);
-    mirror::Object* ref = slot->LoadSequentiallyConsistent();
-    while (ref == nullptr) {
-      // Wait until the slot content becomes visible.
-      ref = slot->LoadSequentiallyConsistent();
-    }
-    slot->StoreRelaxed(nullptr);
-    head_.StoreSequentiallyConsistent(h + 1);
-    return ref;
-  }
-
-  bool IsEmpty() {
-    size_t h = head_.LoadSequentiallyConsistent();
-    size_t t = tail_.LoadSequentiallyConsistent();
-    return h == t;
-  }
-
-  void Clear() {
-    head_.StoreRelaxed(0);
-    tail_.StoreRelaxed(0);
-    memset(buf_.get(), 0, size_ * sizeof(Atomic<mirror::Object*>));
-  }
-
- private:
-  Atomic<size_t> head_;
-  Atomic<size_t> tail_;
-
-  size_t size_;
-  std::unique_ptr<Atomic<mirror::Object*>[]> buf_;
-};
-
 class ConcurrentCopying : public GarbageCollector {
  public:
   // TODO: disable these flags for production use.
@@ -185,10 +102,12 @@
   Barrier& GetBarrier() {
     return *gc_barrier_;
   }
+  bool IsWeakRefAccessEnabled() {
+    return weak_ref_access_enabled_.LoadRelaxed();
+  }
+  void RevokeThreadLocalMarkStack(Thread* thread) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
  private:
-  mirror::Object* PopOffMarkStack();
-  template<bool kThreadSafe>
   void PushOntoMarkStack(mirror::Object* obj) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   mirror::Object* Copy(mirror::Object* from_ref) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void Scan(mirror::Object* to_ref) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
@@ -202,11 +121,18 @@
   void VerifyNoFromSpaceReferences() EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_);
   accounting::ObjectStack* GetAllocationStack();
   accounting::ObjectStack* GetLiveStack();
-  bool ProcessMarkStack() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void ProcessMarkStack() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  bool ProcessMarkStackOnce() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void ProcessMarkStackRef(mirror::Object* to_ref) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  size_t ProcessThreadLocalMarkStacks(bool disable_weak_ref_access)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void RevokeThreadLocalMarkStacks(bool disable_weak_ref_access)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void SwitchToSharedMarkStackMode() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void SwitchToGcExclusiveMarkStackMode() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void DelayReferenceReferent(mirror::Class* klass, mirror::Reference* reference)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  void ProcessReferences(Thread* self, bool concurrent)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void ProcessReferences(Thread* self) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   mirror::Object* IsMarked(mirror::Object* from_ref) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   static mirror::Object* MarkCallback(mirror::Object* from_ref, void* arg)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
@@ -229,7 +155,7 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   mirror::Object* AllocateInSkippedBlock(size_t alloc_size)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  void CheckEmptyMarkQueue() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void CheckEmptyMarkStack() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void IssueEmptyCheckpoint() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   bool IsOnAllocStack(mirror::Object* ref) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   mirror::Object* GetFwdPtr(mirror::Object* from_ref)
@@ -242,10 +168,19 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void AssertToSpaceInvariantInNonMovingSpace(mirror::Object* obj, mirror::Object* ref)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void ReenableWeakRefAccess(Thread* self) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   space::RegionSpace* region_space_;      // The underlying region space.
   std::unique_ptr<Barrier> gc_barrier_;
-  MarkQueue mark_queue_;
+  std::unique_ptr<accounting::ObjectStack> gc_mark_stack_;
+  Mutex mark_stack_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
+  std::vector<accounting::ObjectStack*> revoked_mark_stacks_
+      GUARDED_BY(mark_stack_lock_);
+  static constexpr size_t kMarkStackSize = kPageSize;
+  static constexpr size_t kMarkStackPoolSize = 256;
+  std::vector<accounting::ObjectStack*> pooled_mark_stacks_
+      GUARDED_BY(mark_stack_lock_);
+  Thread* thread_running_gc_;
   bool is_marking_;                       // True while marking is ongoing.
   bool is_active_;                        // True while the collection is ongoing.
   bool is_asserting_to_space_invariant_;  // True while asserting the to-space invariant.
@@ -258,7 +193,18 @@
   size_t live_stack_freeze_size_;
   size_t from_space_num_objects_at_first_pause_;
   size_t from_space_num_bytes_at_first_pause_;
-  Atomic<int> is_mark_queue_push_disallowed_;
+  Atomic<int> is_mark_stack_push_disallowed_;
+  enum MarkStackMode {
+    kMarkStackModeOff = 0,      // Mark stack is off.
+    kMarkStackModeThreadLocal,  // All threads except for the GC-running thread push refs onto
+                                // thread-local mark stacks. The GC-running thread pushes onto and
+                                // pops off the GC mark stack without a lock.
+    kMarkStackModeShared,       // All threads share the GC mark stack with a lock.
+    kMarkStackModeGcExclusive   // The GC-running thread pushes onto and pops from the GC mark stack
+                                // without a lock. Other threads won't access the mark stack.
+  };
+  Atomic<MarkStackMode> mark_stack_mode_;
+  Atomic<bool> weak_ref_access_enabled_;
 
   // How many objects and bytes we moved. Used for accounting.
   Atomic<size_t> bytes_moved_;
@@ -284,6 +230,7 @@
   friend class ThreadFlipVisitor;
   friend class FlipCallback;
   friend class ConcurrentCopyingComputeUnevacFromSpaceLiveRatioVisitor;
+  friend class RevokeThreadLocalMarkStackCheckpoint;
 
   DISALLOW_IMPLICIT_CONSTRUCTORS(ConcurrentCopying);
 };
diff --git a/runtime/gc/reference_processor.cc b/runtime/gc/reference_processor.cc
index 4d51d38..c08ed0e 100644
--- a/runtime/gc/reference_processor.cc
+++ b/runtime/gc/reference_processor.cc
@@ -53,15 +53,27 @@
   condition_.Broadcast(self);
 }
 
+void ReferenceProcessor::BroadcastForSlowPath(Thread* self) {
+  CHECK(kUseReadBarrier);
+  MutexLock mu(self, *Locks::reference_processor_lock_);
+  condition_.Broadcast(self);
+}
+
 mirror::Object* ReferenceProcessor::GetReferent(Thread* self, mirror::Reference* reference) {
-  mirror::Object* const referent = reference->GetReferent();
-  // If the referent is null then it is already cleared, we can just return null since there is no
-  // scenario where it becomes non-null during the reference processing phase.
-  if (UNLIKELY(!SlowPathEnabled()) || referent == nullptr) {
-    return referent;
+  if (!kUseReadBarrier || self->GetWeakRefAccessEnabled()) {
+    // Under the read barrier / concurrent copying collector, it's not safe to call GetReferent()
+    // when weak ref access is disabled, as the call includes a read barrier which may push a ref
+    // onto the mark stack and interfere with the termination of marking.
+    mirror::Object* const referent = reference->GetReferent();
+    // If the referent is null then it is already cleared, we can just return null since there is no
+    // scenario where it becomes non-null during the reference processing phase.
+    if (UNLIKELY(!SlowPathEnabled()) || referent == nullptr) {
+      return referent;
+    }
   }
   MutexLock mu(self, *Locks::reference_processor_lock_);
-  while (SlowPathEnabled()) {
+  while ((!kUseReadBarrier && SlowPathEnabled()) ||
+         (kUseReadBarrier && !self->GetWeakRefAccessEnabled())) {
     mirror::HeapReference<mirror::Object>* const referent_addr =
         reference->GetReferentReferenceAddr();
     // If the referent became cleared, return it. Don't need barrier since thread roots can't get
@@ -128,7 +140,12 @@
     process_references_args_.is_marked_callback_ = is_marked_callback;
     process_references_args_.mark_callback_ = mark_object_callback;
     process_references_args_.arg_ = arg;
-    CHECK_EQ(SlowPathEnabled(), concurrent) << "Slow path must be enabled iff concurrent";
+    if (!kUseReadBarrier) {
+      CHECK_EQ(SlowPathEnabled(), concurrent) << "Slow path must be enabled iff concurrent";
+    } else {
+      // Weak ref access is enabled at Zygote compaction by SemiSpace (concurrent == false).
+      CHECK_EQ(!self->GetWeakRefAccessEnabled(), concurrent);
+    }
   }
   // Unless required to clear soft references with white references, preserve some white referents.
   if (!clear_soft_references) {
@@ -178,9 +195,11 @@
     // starts since there is a small window of time where slow_path_enabled_ is enabled but the
     // callback isn't yet set.
     process_references_args_.is_marked_callback_ = nullptr;
-    if (concurrent) {
-      // Done processing, disable the slow path and broadcast to the waiters.
-      DisableSlowPath(self);
+    if (!kUseReadBarrier) {
+      if (concurrent) {
+        // Done processing, disable the slow path and broadcast to the waiters.
+        DisableSlowPath(self);
+      }
     }
   }
 }
@@ -264,7 +283,8 @@
   Thread* self = Thread::Current();
   MutexLock mu(self, *Locks::reference_processor_lock_);
   // Wait until we are done processing references.
-  while (SlowPathEnabled()) {
+  while ((!kUseReadBarrier && SlowPathEnabled()) ||
+         (kUseReadBarrier && !self->GetWeakRefAccessEnabled())) {
     condition_.WaitHoldingLocks(self);
   }
   // At this point, since the sentinel of the reference is live, it is guaranteed to not be
diff --git a/runtime/gc/reference_processor.h b/runtime/gc/reference_processor.h
index a44319b..284d13c 100644
--- a/runtime/gc/reference_processor.h
+++ b/runtime/gc/reference_processor.h
@@ -54,6 +54,7 @@
   // Only allow setting this with mutators suspended so that we can avoid using a lock in the
   // GetReferent fast path as an optimization.
   void EnableSlowPath() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void BroadcastForSlowPath(Thread* self);
   // Decode the referent, may block if references are being processed.
   mirror::Object* GetReferent(Thread* self, mirror::Reference* reference)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) LOCKS_EXCLUDED(Locks::reference_processor_lock_);
diff --git a/runtime/intern_table.cc b/runtime/intern_table.cc
index 2a96278..2a06ab3 100644
--- a/runtime/intern_table.cc
+++ b/runtime/intern_table.cc
@@ -231,13 +231,21 @@
   CHECK(!allow_new_interns_);
 }
 
+void InternTable::BroadcastForNewInterns() {
+  CHECK(kUseReadBarrier);
+  Thread* self = Thread::Current();
+  MutexLock mu(self, *Locks::intern_table_lock_);
+  new_intern_condition_.Broadcast(self);
+}
+
 mirror::String* InternTable::Insert(mirror::String* s, bool is_strong) {
   if (s == nullptr) {
     return nullptr;
   }
   Thread* self = Thread::Current();
   MutexLock mu(self, *Locks::intern_table_lock_);
-  while (UNLIKELY(!allow_new_interns_)) {
+  while (UNLIKELY((!kUseReadBarrier && !allow_new_interns_) ||
+                  (kUseReadBarrier && !self->GetWeakRefAccessEnabled()))) {
     new_intern_condition_.WaitHoldingLocks(self);
   }
   // Check the strong table for a match.
diff --git a/runtime/intern_table.h b/runtime/intern_table.h
index 97ce73c..53f6f75 100644
--- a/runtime/intern_table.h
+++ b/runtime/intern_table.h
@@ -88,6 +88,7 @@
   void DisallowNewInterns() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void AllowNewInterns() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void EnsureNewInternsDisallowed() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void BroadcastForNewInterns() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Adds all of the resolved image strings from the image space into the intern table. The
   // advantage of doing this is preventing expensive DexFile::FindStringId calls.
diff --git a/runtime/java_vm_ext.cc b/runtime/java_vm_ext.cc
index f1deacf..36adbea 100644
--- a/runtime/java_vm_ext.cc
+++ b/runtime/java_vm_ext.cc
@@ -473,7 +473,8 @@
     return nullptr;
   }
   MutexLock mu(self, weak_globals_lock_);
-  while (UNLIKELY(!allow_new_weak_globals_)) {
+  while (UNLIKELY((!kUseReadBarrier && !allow_new_weak_globals_) ||
+                  (kUseReadBarrier && !self->GetWeakRefAccessEnabled()))) {
     weak_globals_add_condition_.WaitHoldingLocks(self);
   }
   IndirectRef ref = weak_globals_.Add(IRT_FIRST_SEGMENT, obj);
@@ -559,6 +560,13 @@
   CHECK(!allow_new_weak_globals_);
 }
 
+void JavaVMExt::BroadcastForNewWeakGlobals() {
+  CHECK(kUseReadBarrier);
+  Thread* self = Thread::Current();
+  MutexLock mu(self, weak_globals_lock_);
+  weak_globals_add_condition_.Broadcast(self);
+}
+
 mirror::Object* JavaVMExt::DecodeGlobal(Thread* self, IndirectRef ref) {
   return globals_.SynchronizedGet(self, &globals_lock_, ref);
 }
@@ -570,7 +578,8 @@
 
 mirror::Object* JavaVMExt::DecodeWeakGlobal(Thread* self, IndirectRef ref) {
   MutexLock mu(self, weak_globals_lock_);
-  while (UNLIKELY(!allow_new_weak_globals_)) {
+  while (UNLIKELY((!kUseReadBarrier && !allow_new_weak_globals_) ||
+                  (kUseReadBarrier && !self->GetWeakRefAccessEnabled()))) {
     weak_globals_add_condition_.WaitHoldingLocks(self);
   }
   return weak_globals_.Get(ref);
diff --git a/runtime/java_vm_ext.h b/runtime/java_vm_ext.h
index 4fdf45a..694a545 100644
--- a/runtime/java_vm_ext.h
+++ b/runtime/java_vm_ext.h
@@ -108,6 +108,7 @@
   void DisallowNewWeakGlobals() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void AllowNewWeakGlobals() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void EnsureNewWeakGlobalsDisallowed() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void BroadcastForNewWeakGlobals() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   jobject AddGlobalRef(Thread* self, mirror::Object* obj)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
diff --git a/runtime/monitor.cc b/runtime/monitor.cc
index 4be25d6..bc89890 100644
--- a/runtime/monitor.cc
+++ b/runtime/monitor.cc
@@ -1146,10 +1146,18 @@
   CHECK(!allow_new_monitors_);
 }
 
+void MonitorList::BroadcastForNewMonitors() {
+  CHECK(kUseReadBarrier);
+  Thread* self = Thread::Current();
+  MutexLock mu(self, monitor_list_lock_);
+  monitor_add_condition_.Broadcast(self);
+}
+
 void MonitorList::Add(Monitor* m) {
   Thread* self = Thread::Current();
   MutexLock mu(self, monitor_list_lock_);
-  while (UNLIKELY(!allow_new_monitors_)) {
+  while (UNLIKELY((!kUseReadBarrier && !allow_new_monitors_) ||
+                  (kUseReadBarrier && !self->GetWeakRefAccessEnabled()))) {
     monitor_add_condition_.WaitHoldingLocks(self);
   }
   list_.push_front(m);
diff --git a/runtime/monitor.h b/runtime/monitor.h
index 8f3a91d..8f6fb75 100644
--- a/runtime/monitor.h
+++ b/runtime/monitor.h
@@ -292,6 +292,7 @@
   void DisallowNewMonitors() LOCKS_EXCLUDED(monitor_list_lock_);
   void AllowNewMonitors() LOCKS_EXCLUDED(monitor_list_lock_);
   void EnsureNewMonitorsDisallowed() LOCKS_EXCLUDED(monitor_list_lock_);
+  void BroadcastForNewMonitors() LOCKS_EXCLUDED(monitor_list_lock_);
   // Returns how many monitors were deflated.
   size_t DeflateMonitors() LOCKS_EXCLUDED(monitor_list_lock_)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_);
diff --git a/runtime/oat.h b/runtime/oat.h
index 5706c4e..3451d0f 100644
--- a/runtime/oat.h
+++ b/runtime/oat.h
@@ -32,7 +32,7 @@
 class PACKED(4) OatHeader {
  public:
   static constexpr uint8_t kOatMagic[] = { 'o', 'a', 't', '\n' };
-  static constexpr uint8_t kOatVersion[] = { '0', '6', '5', '\0' };
+  static constexpr uint8_t kOatVersion[] = { '0', '6', '6', '\0' };
 
   static constexpr const char* kImageLocationKey = "image-location";
   static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline";
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index 884662d..1f954ca 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -1517,6 +1517,13 @@
   java_vm_->EnsureNewWeakGlobalsDisallowed();
 }
 
+void Runtime::BroadcastForNewSystemWeaks() {
+  CHECK(kUseReadBarrier);
+  monitor_list_->BroadcastForNewMonitors();
+  intern_table_->BroadcastForNewInterns();
+  java_vm_->BroadcastForNewWeakGlobals();
+}
+
 void Runtime::SetInstructionSet(InstructionSet instruction_set) {
   instruction_set_ = instruction_set;
   if ((instruction_set_ == kThumb2) || (instruction_set_ == kArm)) {
diff --git a/runtime/runtime.h b/runtime/runtime.h
index 6fd1b07..f2f3c97 100644
--- a/runtime/runtime.h
+++ b/runtime/runtime.h
@@ -299,6 +299,7 @@
   void DisallowNewSystemWeaks() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void AllowNewSystemWeaks() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void EnsureNewSystemWeaksDisallowed() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void BroadcastForNewSystemWeaks() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Visit all the roots. If only_dirty is true then non-dirty roots won't be visited. If
   // clean_dirty is true then dirty roots will be marked as non-dirty after visiting.
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 6656fe5..e0d65ad 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -1317,6 +1317,7 @@
     tlsPtr_.checkpoint_functions[i] = nullptr;
   }
   tlsPtr_.flip_function = nullptr;
+  tlsPtr_.thread_local_mark_stack = nullptr;
   tls32_.suspended_at_suspend_check = false;
 }
 
@@ -1435,6 +1436,9 @@
   {
     ScopedObjectAccess soa(self);
     Runtime::Current()->GetHeap()->RevokeThreadLocalBuffers(this);
+    if (kUseReadBarrier) {
+      Runtime::Current()->GetHeap()->ConcurrentCopyingCollector()->RevokeThreadLocalMarkStack(this);
+    }
   }
 }
 
diff --git a/runtime/thread.h b/runtime/thread.h
index 0e71c08..ee308a8 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -46,6 +46,9 @@
 namespace art {
 
 namespace gc {
+namespace accounting {
+  template<class T> class AtomicStack;
+}  // namespace accounting
 namespace collector {
   class SemiSpace;
 }  // namespace collector
@@ -232,6 +235,15 @@
   void SetFlipFunction(Closure* function);
   Closure* GetFlipFunction();
 
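+  // Accessors for the thread-local mark stack used by the concurrent copying (CC) collector;
+  // only used when read barriers are enabled.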
+  gc::accounting::AtomicStack<mirror::Object>* GetThreadLocalMarkStack() {
+    CHECK(kUseReadBarrier);
+    return tlsPtr_.thread_local_mark_stack;
+  }
+  void SetThreadLocalMarkStack(gc::accounting::AtomicStack<mirror::Object>* stack) {
+    CHECK(kUseReadBarrier);
+    tlsPtr_.thread_local_mark_stack = stack;
+  }
+
   // Called when thread detected that the thread_suspend_count_ was non-zero. Gives up share of
   // mutator_lock_ and waits until it is resumed and thread_suspend_count_ is zero.
   void FullSuspendCheck()
@@ -772,6 +784,16 @@
     tls32_.debug_method_entry_ = false;
   }
 
+  bool GetWeakRefAccessEnabled() const {
+    CHECK(kUseReadBarrier);
+    return tls32_.weak_ref_access_enabled;
+  }
+
+  void SetWeakRefAccessEnabled(bool enabled) {
+    CHECK(kUseReadBarrier);
+    tls32_.weak_ref_access_enabled = enabled;
+  }
+
   // Activates single step control for debugging. The thread takes the
   // ownership of the given SingleStepControl*. It is deleted by a call
   // to DeactivateSingleStepControl or upon thread destruction.
@@ -1060,7 +1082,7 @@
       daemon(is_daemon), throwing_OutOfMemoryError(false), no_thread_suspension(0),
       thread_exit_check_count(0), handling_signal_(false),
       deoptimization_return_value_is_reference(false), suspended_at_suspend_check(false),
-      ready_for_debug_invoke(false), debug_method_entry_(false) {
+      ready_for_debug_invoke(false), debug_method_entry_(false), weak_ref_access_enabled(true) {
     }
 
     union StateAndFlags state_and_flags;
@@ -1117,6 +1139,15 @@
     // True if the thread enters a method. This is used to detect method entry
     // event for the debugger.
     bool32_t debug_method_entry_;
+
+    // True if the thread is allowed to access a weak ref (Reference::GetReferent() and system
+    // weaks) and to potentially mark an object alive/gray. This is used for concurrent reference
+    // processing of the CC collector only. This is thread local so that we can enable/disable weak
+    // ref access by using a checkpoint and avoid a race around the time weak ref access gets
+    // disabled and concurrent reference processing begins (if weak ref access is disabled during a
+    // pause, this is not an issue.) Other collectors use Runtime::DisallowNewSystemWeaks() and
+    // ReferenceProcessor::EnableSlowPath().
+    bool32_t weak_ref_access_enabled;
   } tls32_;
 
   struct PACKED(8) tls_64bit_sized_values {
@@ -1268,6 +1299,9 @@
 
     // Current method verifier, used for root marking.
     verifier::MethodVerifier* method_verifier;
+
+    // Thread-local mark stack for the concurrent copying collector.
+    gc::accounting::AtomicStack<mirror::Object>* thread_local_mark_stack;
   } tlsPtr_;
 
   // Guards the 'interrupted_' and 'wait_monitor_' members.
diff --git a/runtime/thread_list.cc b/runtime/thread_list.cc
index 7e8128f..9d907eb 100644
--- a/runtime/thread_list.cc
+++ b/runtime/thread_list.cc
@@ -32,6 +32,7 @@
 #include "base/time_utils.h"
 #include "base/timing_logger.h"
 #include "debugger.h"
+#include "gc/collector/concurrent_copying.h"
 #include "jni_internal.h"
 #include "lock_word.h"
 #include "monitor.h"
@@ -1101,6 +1102,12 @@
   }
   CHECK(!Contains(self));
   list_.push_back(self);
+  if (kUseReadBarrier) {
+    // Initialize this according to the state of the CC collector.
+    bool weak_ref_access_enabled =
+        Runtime::Current()->GetHeap()->ConcurrentCopyingCollector()->IsWeakRefAccessEnabled();
+    self->SetWeakRefAccessEnabled(weak_ref_access_enabled);
+  }
 }
 
 void ThreadList::Unregister(Thread* self) {