Don't expand mark-stack during mutator stack scanning With the current implementation, we left a window where one mutator is writing to the mark-stack while another thread finds it full and tries to expand it. This can cause a data race and result in references being missed during marking. We can instead have thread-local overflow arrays be allocated when the mark-stack is full, and pass them to the gc-thread to be processed after the checkpoint. The CL also now pushes references directly onto the mark-stack, rather than first collecting them in a buffer on the thread's stack. This avoids one (unnecessary) copy of references. Test: art/test/testrunner/testrunner.py --host --debug Change-Id: Id9d4303f9cc9e3f172e7b172508157836bd6a915
diff --git a/runtime/gc/collector/mark_compact.cc b/runtime/gc/collector/mark_compact.cc index 82f5e1b..ce50f40 100644 --- a/runtime/gc/collector/mark_compact.cc +++ b/runtime/gc/collector/mark_compact.cc
@@ -480,6 +480,7 @@ MarkCompact::MarkCompact(Heap* heap) : GarbageCollector(heap, "concurrent mark compact"), + overflow_arrays_(nullptr), gc_barrier_(0), lock_("mark compact lock", kGenericBottomLock), sigbus_in_progress_count_{kSigbusCounterCompactionDoneMask, kSigbusCounterCompactionDoneMask}, @@ -4135,11 +4136,31 @@ template <size_t kBufferSize> class MarkCompact::ThreadRootsVisitor : public RootVisitor { public: + using RefType = StackReference<mirror::Object>; + explicit ThreadRootsVisitor(MarkCompact* mark_compact, Thread* const self) : mark_compact_(mark_compact), self_(self) {} ~ThreadRootsVisitor() { - Flush(); + if (overflow_arr_start_ != nullptr) { + // Pass on the thread-local overflow array to the gc-thread for processing + // after checkpoint. + CHECK_GT(top_, overflow_arr_start_); + auto pair = std::make_pair(overflow_arr_start_, top_ - overflow_arr_start_); + MutexLock mu(self_, mark_compact_->lock_); + if (mark_compact_->overflow_arrays_ == nullptr) { + mark_compact_->overflow_arrays_ = new std::vector<std::pair<RefType*, size_t>>(1, pair); + } else { + mark_compact_->overflow_arrays_->push_back(pair); + } + } else { + // Since we don't reset mark-stack between the two stack-scan checkpoints + // in marking phase, we need to clear the stale references that are left + // unused in the stack. + for (; top_ < end_; top_++) { + top_->Assign(nullptr); + } + } } void VisitRoots(mirror::Object*** roots, @@ -4167,35 +4188,46 @@ } private: - void Flush() REQUIRES_SHARED(Locks::mutator_lock_) - REQUIRES(Locks::heap_bitmap_lock_) { - StackReference<mirror::Object>* start; - StackReference<mirror::Object>* end; - { - MutexLock mu(self_, mark_compact_->lock_); - // Loop here because even after expanding once it may not be sufficient to - // accommodate all references. It's almost impossible, but there is no harm - // in implementing it this way. 
- while (!mark_compact_->mark_stack_->BumpBack(idx_, &start, &end)) { - mark_compact_->ExpandMarkStack(); + void FetchBuffer() REQUIRES_SHARED(Locks::mutator_lock_) { + size_t requested_size; + ptrdiff_t new_top_offset; + if (LIKELY(overflow_arr_start_ == nullptr)) { + // During stack scanning threads can only be calling AtomicBumpBack() on + // the mark-stack. + if (mark_compact_->mark_stack_->AtomicBumpBack(kBufferSize, &top_, &end_)) { + return; } + new_top_offset = 0; + requested_size = kBufferSize; + } else { + DCHECK_GT(end_, overflow_arr_start_); + new_top_offset = end_ - overflow_arr_start_; + requested_size = 2 * new_top_offset; } - while (idx_ > 0) { - *start++ = roots_[--idx_]; - } - DCHECK_EQ(start, end); + // realloc() acts like malloc() when overflow_arr_start_ is null. + overflow_arr_start_ = + static_cast<RefType*>(realloc(overflow_arr_start_, requested_size * sizeof(RefType))); + top_ = overflow_arr_start_ + new_top_offset; + end_ = overflow_arr_start_ + requested_size; } void Push(mirror::Object* obj) REQUIRES_SHARED(Locks::mutator_lock_) REQUIRES(Locks::heap_bitmap_lock_) { - if (UNLIKELY(idx_ >= kBufferSize)) { - Flush(); + if (UNLIKELY(top_ == end_)) { + FetchBuffer(); + DCHECK_GE(end_ - top_, static_cast<ssize_t>(kBufferSize)); } - roots_[idx_++].Assign(obj); + top_->Assign(obj); + top_++; } - StackReference<mirror::Object> roots_[kBufferSize]; - size_t idx_ = 0; + // If mark-stack has slots available, [top_, end_) represents the slots + // acquired from the mark-stack for storing references. After mark-stack + // is full, [top_, end_) is the range in overflow array. + RefType* top_ = nullptr; + RefType* end_ = nullptr; + // Thread-local array of references to be used if and when mark-stack is full. 
+ RefType* overflow_arr_start_ = nullptr; MarkCompact* const mark_compact_; Thread* const self_; }; @@ -4229,6 +4261,21 @@ MarkCompact* const mark_compact_; }; +inline void MarkCompact::ProcessMarkObject(mirror::Object* obj) { + DCHECK(obj != nullptr); + ScanObject</*kUpdateLiveWords=*/true>(obj); +} + +void MarkCompact::ProcessMarkStackNonNull() { + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); + while (!mark_stack_->IsEmpty()) { + mirror::Object* obj = mark_stack_->PopBack(); + if (obj != nullptr) { + ProcessMarkObject(obj); + } + } +} + void MarkCompact::MarkRootsCheckpoint(Thread* self, Runtime* runtime) { // We revote TLABs later during paused round of marking. TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); @@ -4239,20 +4286,39 @@ // run through the barrier including self. size_t barrier_count = thread_list->RunCheckpoint(&check_point); // Release locks then wait for all mutator threads to pass the barrier. - // If there are no threads to wait which implys that all the checkpoint functions are finished, + // If there are no threads to wait which implies that all the checkpoint functions are finished, // then no need to release locks. - if (barrier_count == 0) { - return; + if (barrier_count > 0) { + Locks::heap_bitmap_lock_->ExclusiveUnlock(self); + Locks::mutator_lock_->SharedUnlock(self); + { + ScopedThreadStateChange tsc(self, ThreadState::kWaitingForCheckPointsToRun); + gc_barrier_.Increment(self, barrier_count); + } + Locks::mutator_lock_->SharedLock(self); + Locks::heap_bitmap_lock_->ExclusiveLock(self); } - Locks::heap_bitmap_lock_->ExclusiveUnlock(self); - Locks::mutator_lock_->SharedUnlock(self); + // We may have null in the mark-stack as some thread(s) may have not filled + // the buffer completely. 
+ ProcessMarkStackNonNull(); + std::vector<std::pair<StackReference<mirror::Object>*, size_t>>* vec = nullptr; { - ScopedThreadStateChange tsc(self, ThreadState::kWaitingForCheckPointsToRun); - gc_barrier_.Increment(self, barrier_count); + MutexLock mu(self, lock_); + if (overflow_arrays_ != nullptr) { + vec = overflow_arrays_; + overflow_arrays_ = nullptr; + } } - Locks::mutator_lock_->SharedLock(self); - Locks::heap_bitmap_lock_->ExclusiveLock(self); - ProcessMarkStack(); + if (vec != nullptr) { + for (auto [arr, size] : *vec) { + for (size_t i = 0; i < size; i++) { + ProcessMarkObject(arr[i].AsMirrorPtr()); + } + free(arr); + ProcessMarkStack(); + } + delete vec; + } } void MarkCompact::MarkNonThreadRoots(Runtime* runtime) { @@ -4625,14 +4691,14 @@ TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); // TODO: try prefetch like in CMS while (!mark_stack_->IsEmpty()) { - mirror::Object* obj = mark_stack_->PopBack(); - DCHECK(obj != nullptr); - ScanObject</*kUpdateLiveWords*/ true>(obj); + ProcessMarkObject(mark_stack_->PopBack()); } } void MarkCompact::ExpandMarkStack() { const size_t new_size = mark_stack_->Capacity() * 2; + // TODO: We could reduce the overhead here by making the Resize() of + // AtomicStack take care of transferring references. std::vector<StackReference<mirror::Object>> temp(mark_stack_->Begin(), mark_stack_->End()); mark_stack_->Resize(new_size);
diff --git a/runtime/gc/collector/mark_compact.h b/runtime/gc/collector/mark_compact.h index 41d2ab3..21d626e 100644 --- a/runtime/gc/collector/mark_compact.h +++ b/runtime/gc/collector/mark_compact.h
@@ -692,6 +692,18 @@ void VerifyPostGCObjects(bool performed_compaction, uint8_t* mark_bitmap_clear_end) REQUIRES(Locks::heap_bitmap_lock_) REQUIRES_SHARED(Locks::mutator_lock_); + // Like ProcessMarkStack(), but ensures that a non-null popped reference is + // scanned. + void ProcessMarkStackNonNull() REQUIRES_SHARED(Locks::mutator_lock_) + REQUIRES(Locks::heap_bitmap_lock_); + // Process one object popped out of mark_stack. Expects obj to be non-null. + void ProcessMarkObject(mirror::Object* obj) REQUIRES_SHARED(Locks::mutator_lock_) + REQUIRES(Locks::heap_bitmap_lock_); + + // Vector to hold thread-local overflow arrays (and the number of entries in + // there) of gc-roots found during mutator-stack scanning in marking phase. + std::vector<std::pair<StackReference<mirror::Object>*, size_t>>* overflow_arrays_ + GUARDED_BY(lock_); // For checkpoints Barrier gc_barrier_; // Required only when mark-stack is accessed in shared mode, which happens
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc index dcb25a0..dee86de 100644 --- a/runtime/gc/heap.cc +++ b/runtime/gc/heap.cc
@@ -161,8 +161,10 @@ // How many reserve entries are at the end of the allocation stack, these are only needed if the // allocation stack overflows. static constexpr size_t kAllocationStackReserveSize = 1024; -// Default mark stack size in bytes. -static const size_t kDefaultMarkStackSize = 64 * KB; +// Default mark stack size in bytes. Use a smaller size for debug builds to +// stress stack expansion logic in GC code. +static const size_t kDefaultMarkStackSize = + kIsDebugBuild ? (kMaxPageSize / sizeof(StackReference<mirror::Object>)) : 64 * KB; // Define space name. static const char* kDlMallocSpaceName[2] = {"main dlmalloc space", "main dlmalloc space 1"}; static const char* kRosAllocSpaceName[2] = {"main rosalloc space", "main rosalloc space 1"};