Don't expand mark-stack during mutator stack scanning

With the current implementation, there is a window in which one mutator
thread is writing to the mark-stack while another thread finds the
stack full and tries to expand it. This can cause a data race and lead
to references being missed during marking.

We can instead allocate thread-local overflow arrays when the
mark-stack is full, and pass them to the gc-thread to be processed
after the checkpoint.

The CL also now pushes references directly onto the mark-stack, rather
than first collecting them in a buffer on the thread's stack. This
avoids one unnecessary copy of the references.

Test: art/test/testrunner/testrunner.py --host --debug
Change-Id: Id9d4303f9cc9e3f172e7b172508157836bd6a915
diff --git a/runtime/gc/collector/mark_compact.cc b/runtime/gc/collector/mark_compact.cc
index 82f5e1b..ce50f40 100644
--- a/runtime/gc/collector/mark_compact.cc
+++ b/runtime/gc/collector/mark_compact.cc
@@ -480,6 +480,7 @@
 
 MarkCompact::MarkCompact(Heap* heap)
     : GarbageCollector(heap, "concurrent mark compact"),
+      overflow_arrays_(nullptr),
       gc_barrier_(0),
       lock_("mark compact lock", kGenericBottomLock),
       sigbus_in_progress_count_{kSigbusCounterCompactionDoneMask, kSigbusCounterCompactionDoneMask},
@@ -4135,11 +4136,31 @@
 template <size_t kBufferSize>
 class MarkCompact::ThreadRootsVisitor : public RootVisitor {
  public:
+  using RefType = StackReference<mirror::Object>;
+
   explicit ThreadRootsVisitor(MarkCompact* mark_compact, Thread* const self)
         : mark_compact_(mark_compact), self_(self) {}
 
   ~ThreadRootsVisitor() {
-    Flush();
+    if (overflow_arr_start_ != nullptr) {
+      // Pass on the thread-local overflow array to the gc-thread for processing
+      // after checkpoint.
+      CHECK_GT(top_, overflow_arr_start_);
+      auto pair = std::make_pair(overflow_arr_start_, top_ - overflow_arr_start_);
+      MutexLock mu(self_, mark_compact_->lock_);
+      if (mark_compact_->overflow_arrays_ == nullptr) {
+        mark_compact_->overflow_arrays_ = new std::vector<std::pair<RefType*, size_t>>(1, pair);
+      } else {
+        mark_compact_->overflow_arrays_->push_back(pair);
+      }
+    } else {
+      // Since we don't reset mark-stack between the two stack-scan checkpoints
+      // in marking phase, we need to clear the stale references that are left
+      // unused in the stack.
+      for (; top_ < end_; top_++) {
+        top_->Assign(nullptr);
+      }
+    }
   }
 
   void VisitRoots(mirror::Object*** roots,
@@ -4167,35 +4188,46 @@
   }
 
  private:
-  void Flush() REQUIRES_SHARED(Locks::mutator_lock_)
-               REQUIRES(Locks::heap_bitmap_lock_) {
-    StackReference<mirror::Object>* start;
-    StackReference<mirror::Object>* end;
-    {
-      MutexLock mu(self_, mark_compact_->lock_);
-      // Loop here because even after expanding once it may not be sufficient to
-      // accommodate all references. It's almost impossible, but there is no harm
-      // in implementing it this way.
-      while (!mark_compact_->mark_stack_->BumpBack(idx_, &start, &end)) {
-        mark_compact_->ExpandMarkStack();
+  void FetchBuffer() REQUIRES_SHARED(Locks::mutator_lock_) {
+    size_t requested_size;
+    ptrdiff_t new_top_offset;
+    if (LIKELY(overflow_arr_start_ == nullptr)) {
+      // During stack scanning threads can only be calling AtomicBumpBack() on
+      // the mark-stack.
+      if (mark_compact_->mark_stack_->AtomicBumpBack(kBufferSize, &top_, &end_)) {
+        return;
       }
+      new_top_offset = 0;
+      requested_size = kBufferSize;
+    } else {
+      DCHECK_GT(end_, overflow_arr_start_);
+      new_top_offset = end_ - overflow_arr_start_;
+      requested_size = 2 * new_top_offset;
     }
-    while (idx_ > 0) {
-      *start++ = roots_[--idx_];
-    }
-    DCHECK_EQ(start, end);
+    // realloc() acts like malloc() when overflow_arr_start_ is null.
+    overflow_arr_start_ =
+        static_cast<RefType*>(realloc(overflow_arr_start_, requested_size * sizeof(RefType)));
+    top_ = overflow_arr_start_ + new_top_offset;
+    end_ = overflow_arr_start_ + requested_size;
   }
 
   void Push(mirror::Object* obj) REQUIRES_SHARED(Locks::mutator_lock_)
                                  REQUIRES(Locks::heap_bitmap_lock_) {
-    if (UNLIKELY(idx_ >= kBufferSize)) {
-      Flush();
+    if (UNLIKELY(top_ == end_)) {
+      FetchBuffer();
+      DCHECK_GE(end_ - top_, static_cast<ssize_t>(kBufferSize));
     }
-    roots_[idx_++].Assign(obj);
+    top_->Assign(obj);
+    top_++;
   }
 
-  StackReference<mirror::Object> roots_[kBufferSize];
-  size_t idx_ = 0;
+  // If mark-stack has slots available, [top_, end_) represents the slots
+  // acquired from the mark-stack for storing references. After mark-stack
+  // is full, [top_, end_) is the range in overflow array.
+  RefType* top_ = nullptr;
+  RefType* end_ = nullptr;
+  // Thread-local array of references to be used if and when mark-stack is full.
+  RefType* overflow_arr_start_ = nullptr;
   MarkCompact* const mark_compact_;
   Thread* const self_;
 };
@@ -4229,6 +4261,21 @@
   MarkCompact* const mark_compact_;
 };
 
+inline void MarkCompact::ProcessMarkObject(mirror::Object* obj) {
+  DCHECK(obj != nullptr);
+  ScanObject</*kUpdateLiveWords=*/true>(obj);
+}
+
+void MarkCompact::ProcessMarkStackNonNull() {
+  TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
+  while (!mark_stack_->IsEmpty()) {
+    mirror::Object* obj = mark_stack_->PopBack();
+    if (obj != nullptr) {
+      ProcessMarkObject(obj);
+    }
+  }
+}
+
 void MarkCompact::MarkRootsCheckpoint(Thread* self, Runtime* runtime) {
   // We revote TLABs later during paused round of marking.
   TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
@@ -4239,20 +4286,39 @@
   // run through the barrier including self.
   size_t barrier_count = thread_list->RunCheckpoint(&check_point);
   // Release locks then wait for all mutator threads to pass the barrier.
-  // If there are no threads to wait which implys that all the checkpoint functions are finished,
+  // If there are no threads to wait which implies that all the checkpoint functions are finished,
   // then no need to release locks.
-  if (barrier_count == 0) {
-    return;
+  if (barrier_count > 0) {
+    Locks::heap_bitmap_lock_->ExclusiveUnlock(self);
+    Locks::mutator_lock_->SharedUnlock(self);
+    {
+      ScopedThreadStateChange tsc(self, ThreadState::kWaitingForCheckPointsToRun);
+      gc_barrier_.Increment(self, barrier_count);
+    }
+    Locks::mutator_lock_->SharedLock(self);
+    Locks::heap_bitmap_lock_->ExclusiveLock(self);
   }
-  Locks::heap_bitmap_lock_->ExclusiveUnlock(self);
-  Locks::mutator_lock_->SharedUnlock(self);
+  // We may have null in the mark-stack as some thread(s) may have not filled
+  // the buffer completely.
+  ProcessMarkStackNonNull();
+  std::vector<std::pair<StackReference<mirror::Object>*, size_t>>* vec = nullptr;
   {
-    ScopedThreadStateChange tsc(self, ThreadState::kWaitingForCheckPointsToRun);
-    gc_barrier_.Increment(self, barrier_count);
+    MutexLock mu(self, lock_);
+    if (overflow_arrays_ != nullptr) {
+      vec = overflow_arrays_;
+      overflow_arrays_ = nullptr;
+    }
   }
-  Locks::mutator_lock_->SharedLock(self);
-  Locks::heap_bitmap_lock_->ExclusiveLock(self);
-  ProcessMarkStack();
+  if (vec != nullptr) {
+    for (auto [arr, size] : *vec) {
+      for (size_t i = 0; i < size; i++) {
+        ProcessMarkObject(arr[i].AsMirrorPtr());
+      }
+      free(arr);
+      ProcessMarkStack();
+    }
+    delete vec;
+  }
 }
 
 void MarkCompact::MarkNonThreadRoots(Runtime* runtime) {
@@ -4625,14 +4691,14 @@
   TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
   // TODO: try prefetch like in CMS
   while (!mark_stack_->IsEmpty()) {
-    mirror::Object* obj = mark_stack_->PopBack();
-    DCHECK(obj != nullptr);
-    ScanObject</*kUpdateLiveWords*/ true>(obj);
+    ProcessMarkObject(mark_stack_->PopBack());
   }
 }
 
 void MarkCompact::ExpandMarkStack() {
   const size_t new_size = mark_stack_->Capacity() * 2;
+  // TODO: We could reduce the overhead here by making the Resize() of
+  // AtomicStack take care of transferring references.
   std::vector<StackReference<mirror::Object>> temp(mark_stack_->Begin(),
                                                    mark_stack_->End());
   mark_stack_->Resize(new_size);
diff --git a/runtime/gc/collector/mark_compact.h b/runtime/gc/collector/mark_compact.h
index 41d2ab3..21d626e 100644
--- a/runtime/gc/collector/mark_compact.h
+++ b/runtime/gc/collector/mark_compact.h
@@ -692,6 +692,18 @@
   void VerifyPostGCObjects(bool performed_compaction, uint8_t* mark_bitmap_clear_end)
       REQUIRES(Locks::heap_bitmap_lock_) REQUIRES_SHARED(Locks::mutator_lock_);
 
+  // Like ProcessMarkStack(), but ensures that a non-null popped reference is
+  // scanned.
+  void ProcessMarkStackNonNull() REQUIRES_SHARED(Locks::mutator_lock_)
+      REQUIRES(Locks::heap_bitmap_lock_);
+  // Process one object popped out of mark_stack. Expects obj to be non-null.
+  void ProcessMarkObject(mirror::Object* obj) REQUIRES_SHARED(Locks::mutator_lock_)
+      REQUIRES(Locks::heap_bitmap_lock_);
+
+  // Vector to hold thread-local overflow arrays (and the number of entries in
+  // there) of gc-roots found during mutator-stack scanning in marking phase.
+  std::vector<std::pair<StackReference<mirror::Object>*, size_t>>* overflow_arrays_
+      GUARDED_BY(lock_);
   // For checkpoints
   Barrier gc_barrier_;
   // Required only when mark-stack is accessed in shared mode, which happens
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index dcb25a0..dee86de 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -161,8 +161,10 @@
 // How many reserve entries are at the end of the allocation stack, these are only needed if the
 // allocation stack overflows.
 static constexpr size_t kAllocationStackReserveSize = 1024;
-// Default mark stack size in bytes.
-static const size_t kDefaultMarkStackSize = 64 * KB;
+// Default mark stack size in bytes. Use a smaller size for debug builds to
+// stress stack expansion logic in GC code.
+static const size_t kDefaultMarkStackSize =
+    kIsDebugBuild ? (kMaxPageSize / sizeof(StackReference<mirror::Object>)) : 64 * KB;
 // Define space name.
 static const char* kDlMallocSpaceName[2] = {"main dlmalloc space", "main dlmalloc space 1"};
 static const char* kRosAllocSpaceName[2] = {"main rosalloc space", "main rosalloc space 1"};