Speed up marking inter-region refs in 2-phase CC

Using the card table to indicate objects with inter-region references has
performance implications. Due to the coarse granularity of the card table,
an excessive number of objects are needlessly re-visited. Also, it forces
Scan to be called with false for kNoUnEvac, as the distinction between
mutated objects and objects with inter-region references is lost.

A separate bitmap is introduced, which is updated during the marking phase
of a full-heap GC to track objects with inter-region references. Then, in
the copying phase, this bitmap is scanned to re-visit such objects.
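
The rough idea, as an illustrative sketch (not ART code; SimpleBitmap and
the addresses below are hypothetical stand-ins for
accounting::ContinuousSpaceBitmap and real object addresses):

  #include <cstdint>
  #include <cstdio>
  #include <vector>

  // One bit per kAlignment bytes of a contiguous address range, mimicking
  // what accounting::ContinuousSpaceBitmap provides.
  class SimpleBitmap {
   public:
    SimpleBitmap(uintptr_t begin, size_t size)
        : begin_(begin), bits_(size / kAlignment, false) {}
    void Set(uintptr_t addr) { bits_[(addr - begin_) / kAlignment] = true; }
    void Clear(uintptr_t addr) { bits_[(addr - begin_) / kAlignment] = false; }
    // Calls visitor(addr) for every address whose bit is set.
    template <typename Visitor>
    void VisitSetBits(Visitor&& visitor) const {
      for (size_t i = 0; i < bits_.size(); ++i) {
        if (bits_[i]) {
          visitor(begin_ + i * kAlignment);
        }
      }
    }
   private:
    static constexpr size_t kAlignment = 8;  // assumed object alignment
    uintptr_t begin_;
    std::vector<bool> bits_;
  };

  int main() {
    SimpleBitmap inter_region_bitmap(/*begin=*/0x1000, /*size=*/4096);
    // Marking phase: the object at 0x1040 holds an inter-region reference.
    inter_region_bitmap.Set(0x1040);
    // Copying phase: only the recorded objects are re-visited, instead of
    // every object on a dirtied card.
    inter_region_bitmap.VisitSetBits([](uintptr_t addr) {
      std::printf("re-visit object at 0x%zx\n", static_cast<size_t>(addr));
    });
    return 0;
  }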

Test: art/test/testrunner/testrunner.py --target --runtime-option=-XX:DumpGCPerformanceOnShutdown
Bug: 112720851
Change-Id: Idd032c18ffdddc13c71668502ef1f53a19dcee71
diff --git a/runtime/gc/collector/concurrent_copying.cc b/runtime/gc/collector/concurrent_copying.cc
index 861f0d3..23bc7a2 100644
--- a/runtime/gc/collector/concurrent_copying.cc
+++ b/runtime/gc/collector/concurrent_copying.cc
@@ -95,6 +95,7 @@
       weak_ref_access_enabled_(true),
       copied_live_bytes_ratio_sum_(0.f),
       gc_count_(0),
+      inter_region_bitmap_(nullptr),
       reclaimed_bytes_ratio_sum_(0.f),
       young_gen_(young_gen),
       skipped_blocks_lock_("concurrent copying bytes blocks lock", kMarkSweepMarkStackLock),
@@ -288,6 +289,9 @@
 void ConcurrentCopying::BindBitmaps() {
   Thread* self = Thread::Current();
   WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
+  uintptr_t continuous_spaces_begin = UINTPTR_MAX;
+  uintptr_t continuous_spaces_limit = 0;
+  DCHECK(inter_region_bitmap_ == nullptr);
   // Mark all of the spaces we never collect as immune.
   for (const auto& space : heap_->GetContinuousSpaces()) {
     if (space->GetGcRetentionPolicy() == space::kGcRetentionPolicyNeverCollect ||
@@ -319,6 +323,11 @@
           // be captured after the thread-flip of this GC cycle, as that is when
           // the young-gen for the next GC cycle starts getting populated.
           heap_->GetCardTable()->ClearCardRange(space->Begin(), space->Limit());
+
+          continuous_spaces_begin =
+              std::min(continuous_spaces_begin, reinterpret_cast<uintptr_t>(space->Begin()));
+          continuous_spaces_limit =
+              std::max(continuous_spaces_limit, reinterpret_cast<uintptr_t>(space->Limit()));
         }
       } else {
         if (space == region_space_) {
@@ -330,10 +339,18 @@
       }
     }
   }
-  if (kEnableGenerationalConcurrentCopyingCollection && young_gen_) {
-    for (const auto& space : GetHeap()->GetDiscontinuousSpaces()) {
-      CHECK(space->IsLargeObjectSpace());
-      space->AsLargeObjectSpace()->CopyLiveToMarked();
+  if (kEnableGenerationalConcurrentCopyingCollection) {
+    if (young_gen_) {
+      for (const auto& space : GetHeap()->GetDiscontinuousSpaces()) {
+        CHECK(space->IsLargeObjectSpace());
+        space->AsLargeObjectSpace()->CopyLiveToMarked();
+      }
+    } else {
+      inter_region_bitmap_.reset(accounting::ContinuousSpaceBitmap::Create(
+          "inter region ref bitmap",
+          reinterpret_cast<uint8_t*>(continuous_spaces_begin),
+          continuous_spaces_limit - continuous_spaces_begin));
+      CHECK(inter_region_bitmap_ != nullptr) << "Couldn't allocate inter region ref bitmap";
     }
   }
 }
@@ -1100,7 +1117,7 @@
-  // Mark the corresponding card dirty if the object contains any
-  // inter-region reference.
+  // Set the bit in the inter-region ref bitmap if the object contains any
+  // inter-region reference.
   if (visitor.ContainsInterRegionRefs()) {
-    heap_->GetCardTable()->MarkCard(ref);
+    inter_region_bitmap_->Set(ref);
   }
 }
 
@@ -1316,15 +1333,6 @@
   // Process mark stack
   ProcessMarkStackForMarkingAndComputeLiveBytes();
 
-  // Age the cards.
-  for (space::ContinuousSpace* space : GetHeap()->GetContinuousSpaces()) {
-    if (space->IsImageSpace() || space->IsZygoteSpace()) {
-      // Image and zygote spaces are already handled since we gray the objects in the pause.
-      continue;
-    }
-    card_table->ModifyCardsAtomic(space->Begin(), space->End(), AgeCardVisitor(), VoidFunctor());
-  }
-
   if (kVerboseMode) {
     LOG(INFO) << "GC end of MarkingPhase";
   }
@@ -1420,10 +1428,28 @@
               }
               ScanDirtyObject</*kNoUnEvac*/ true>(obj);
             } else if (space != region_space_ || region_space_->IsInUnevacFromSpace(obj)) {
+              // We need to process un-evac references, as they may have been
+              // missed in the marking phase due to heap mutation.
               ScanDirtyObject</*kNoUnEvac*/ false>(obj);
+              inter_region_bitmap_->Clear(obj);
             }
           },
-          accounting::CardTable::kCardDirty - 1);
+          accounting::CardTable::kCardAged);
+
+      if (!young_gen_) {
+        auto visitor = [this](mirror::Object* obj) REQUIRES_SHARED(Locks::mutator_lock_) {
+                         // We don't need to process un-evac references as any unprocessed
+                         // ones will be taken care of in the card-table scan above.
+                         ScanDirtyObject</*kNoUnEvac*/ true>(obj);
+                       };
+        if (space == region_space_) {
+          region_space_->ScanUnevacFromSpace(inter_region_bitmap_.get(), visitor);
+        } else {
+          inter_region_bitmap_->VisitMarkedRange(reinterpret_cast<uintptr_t>(space->Begin()),
+                                                 reinterpret_cast<uintptr_t>(space->End()),
+                                                 visitor);
+        }
+      }
     }
     // Done scanning unevac space.
     done_scanning_.store(true, std::memory_order_release);
@@ -3495,6 +3521,8 @@
     TimingLogger::ScopedTiming split("ClearRegionSpaceCards", GetTimings());
     // We do not currently use the region space cards at all, madvise them away to save ram.
     heap_->GetCardTable()->ClearCardRange(region_space_->Begin(), region_space_->Limit());
+  } else if (kEnableGenerationalConcurrentCopyingCollection && !young_gen_) {
+    inter_region_bitmap_.reset();
   }
   {
     MutexLock mu(self, skipped_blocks_lock_);
diff --git a/runtime/gc/collector/concurrent_copying.h b/runtime/gc/collector/concurrent_copying.h
index 4442ad5..aabfc8e 100644
--- a/runtime/gc/collector/concurrent_copying.h
+++ b/runtime/gc/collector/concurrent_copying.h
@@ -25,7 +25,7 @@
 #include "mirror/object_reference.h"
 #include "offsets.h"
 
-#include <unordered_map>
+#include <memory>
 #include <vector>
 
 namespace art {
@@ -389,6 +389,9 @@
   // possible for minor GC if all allocated objects are in non-moving
   // space.)
   size_t gc_count_;
+  // Bit is set if the corresponding object has inter-region references that
+  // were found during the marking phase of a two-phase full-heap GC cycle.
+  std::unique_ptr<accounting::ContinuousSpaceBitmap> inter_region_bitmap_;
 
   // reclaimed_bytes_ratio = reclaimed_bytes/num_allocated_bytes per GC cycle
   float reclaimed_bytes_ratio_sum_;
diff --git a/runtime/gc/space/region_space-inl.h b/runtime/gc/space/region_space-inl.h
index dbec4ea..9f5c117 100644
--- a/runtime/gc/space/region_space-inl.h
+++ b/runtime/gc/space/region_space-inl.h
@@ -193,6 +193,40 @@
   return bytes;
 }
 
+template <typename Visitor>
+inline void RegionSpace::ScanUnevacFromSpace(accounting::ContinuousSpaceBitmap* bitmap,
+                                             Visitor&& visitor) {
+  const size_t iter_limit = kUseTableLookupReadBarrier
+      ? num_regions_ : std::min(num_regions_, non_free_region_index_limit_);
+  // Instead of a region-wise scan, find contiguous blocks of un-evac regions and then
+  // visit them. Everything before visit_block_begin has been processed, while
+  // [visit_block_begin, visit_block_end) still needs to be visited.
+  uint8_t* visit_block_begin = nullptr;
+  uint8_t* visit_block_end = nullptr;
+  for (size_t i = 0; i < iter_limit; ++i) {
+    Region* r = &regions_[i];
+    if (r->IsInUnevacFromSpace()) {
+      // visit_block_begin set to nullptr means a new visit block needs to be started.
+      if (visit_block_begin == nullptr) {
+        visit_block_begin = r->Begin();
+      }
+      visit_block_end = r->End();
+    } else if (visit_block_begin != nullptr) {
+      // Visit the block range as r is not adjacent to current visit block.
+      bitmap->VisitMarkedRange(reinterpret_cast<uintptr_t>(visit_block_begin),
+                               reinterpret_cast<uintptr_t>(visit_block_end),
+                               visitor);
+      visit_block_begin = nullptr;
+    }
+  }
+  // Visit last block, if not processed yet.
+  if (visit_block_begin != nullptr) {
+    bitmap->VisitMarkedRange(reinterpret_cast<uintptr_t>(visit_block_begin),
+                             reinterpret_cast<uintptr_t>(visit_block_end),
+                             visitor);
+  }
+}
+
 template<bool kToSpaceOnly, typename Visitor>
 inline void RegionSpace::WalkInternal(Visitor&& visitor) {
   // TODO: MutexLock on region_lock_ won't work due to lock order
diff --git a/runtime/gc/space/region_space.h b/runtime/gc/space/region_space.h
index 0d5ebcc..75c99ec 100644
--- a/runtime/gc/space/region_space.h
+++ b/runtime/gc/space/region_space.h
@@ -209,6 +209,15 @@
   template <typename Visitor>
   ALWAYS_INLINE void WalkToSpace(Visitor&& visitor) REQUIRES(Locks::mutator_lock_);
 
+  // Scans regions and calls visitor for objects in unevac-space corresponding
+  // to the bits set in 'bitmap'.
+  // Cannot acquire region_lock_ as visitor may need to acquire it for allocation.
+  // Should not be called concurrently with functions (like SetFromSpace()) which
+  // change regions' type.
+  template <typename Visitor>
+  ALWAYS_INLINE void ScanUnevacFromSpace(accounting::ContinuousSpaceBitmap* bitmap,
+                                         Visitor&& visitor) NO_THREAD_SAFETY_ANALYSIS;
+
   accounting::ContinuousSpaceBitmap::SweepCallback* GetSweepCallback() override {
     return nullptr;
   }