[metrics] Add GC-work throughput metrics

Track the work done (bytes scanned) per second by the GC.

Some other minor changes:
1) Adjusted the order of ConcurrentCopying class members to make access
to them more cache friendly. Counters accessed by the GC thread should
not share a cache line with counters meant for mutators if either of
the two modifies those counters.
2) Increased the max to 10'000 for the throughput histograms in case
the throughput exceeds 1 GB/s.
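
For reference, a minimal standalone sketch (not part of the patch) of how
the tracing throughput is derived from the bytes scanned and the GC
duration. kMB, NsToUs and TracingThroughputMBps are local stand-ins for
the corresponding ART definitions; the example also shows why the old
1'000 MB/s histogram max was too low:

  #include <cstdint>

  constexpr uint64_t kMB = 1024 * 1024;
  constexpr uint64_t NsToUs(uint64_t ns) { return ns / 1000; }

  // Tracing throughput in MB/s. The +1 us guards against division by
  // zero for extremely short collections.
  constexpr uint64_t TracingThroughputMBps(uint64_t bytes_scanned,
                                           uint64_t duration_ns) {
    uint64_t bytes_per_sec =
        (bytes_scanned * 1'000'000) / (NsToUs(duration_ns) + 1);
    return bytes_per_sec / kMB;
  }

  // 512 MiB scanned in 400 ms is already ~1'280 MB/s, well above the
  // old 1'000 cap.
  static_assert(TracingThroughputMBps(512 * kMB, 400'000'000) > 1'000,
                "tracing throughput can exceed 1 GB/s");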
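
On the member-reordering point in 1), the patch separates counters used
only by the GC thread from the mutator-updated atomics so that they land
on different cache lines. The following standalone illustration expresses
the same intent with explicit alignment rather than member ordering;
CollectorCounters and the 64-byte line size are assumptions, not ART
code:

  #include <atomic>
  #include <cstddef>
  #include <cstdint>

  struct CollectorCounters {
    // Updated concurrently by mutator threads (fetch_add on every copy).
    alignas(64) std::atomic<size_t> bytes_moved{0};
    std::atomic<size_t> objects_moved{0};
    // Written only by the GC thread; a separate cache line avoids
    // invalidating the mutators' line on every GC-side update.
    alignas(64) size_t bytes_moved_gc_thread = 0;
    size_t objects_moved_gc_thread = 0;
    uint64_t bytes_scanned = 0;
  };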

Bug: 191404436
Test: manual
Merged-In: Iefaf1106690b6bae670a3a917f61194b3fcacfe0
Change-Id: Iefaf1106690b6bae670a3a917f61194b3fcacfe0
(cherry picked from commit 7f0473851d9a8d5644fde8c483390a985c238433)
diff --git a/libartbase/base/metrics/metrics.h b/libartbase/base/metrics/metrics.h
index 72d8365..78a6387 100644
--- a/libartbase/base/metrics/metrics.h
+++ b/libartbase/base/metrics/metrics.h
@@ -49,8 +49,10 @@
   METRIC(JitMethodCompileCount, MetricsCounter)                    \
   METRIC(YoungGcCollectionTime, MetricsHistogram, 15, 0, 60'000)   \
   METRIC(FullGcCollectionTime, MetricsHistogram, 15, 0, 60'000)    \
-  METRIC(YoungGcThroughput, MetricsHistogram, 15, 0, 1'000)        \
-  METRIC(FullGcThroughput, MetricsHistogram, 15, 0, 1'000)
+  METRIC(YoungGcThroughput, MetricsHistogram, 15, 0, 10'000)       \
+  METRIC(FullGcThroughput, MetricsHistogram, 15, 0, 10'000)        \
+  METRIC(YoungGcTracingThroughput, MetricsHistogram, 15, 0, 10'000)   \
+  METRIC(FullGcTracingThroughput, MetricsHistogram, 15, 0, 10'000)
 
 // A lot of the metrics implementation code is generated by passing one-off macros into ART_COUNTERS
 // and ART_HISTOGRAMS. This means metrics.h and metrics.cc are very #define-heavy, which can be
diff --git a/libartbase/base/time_utils.h b/libartbase/base/time_utils.h
index fbf3e94..dd73b1c 100644
--- a/libartbase/base/time_utils.h
+++ b/libartbase/base/time_utils.h
@@ -77,6 +77,11 @@
   return ns / 1000 / 1000;
 }
 
+// Converts the given number of nanoseconds to microseconds.
+static constexpr uint64_t NsToUs(uint64_t ns) {
+  return ns / 1000;
+}
+
 // Converts the given number of milliseconds to nanoseconds
 static constexpr uint64_t MsToNs(uint64_t ms) {
   return ms * 1000 * 1000;
diff --git a/runtime/gc/collector/concurrent_copying.cc b/runtime/gc/collector/concurrent_copying.cc
index f58224d..96c792b 100644
--- a/runtime/gc/collector/concurrent_copying.cc
+++ b/runtime/gc/collector/concurrent_copying.cc
@@ -102,6 +102,8 @@
       copied_live_bytes_ratio_sum_(0.f),
       gc_count_(0),
       reclaimed_bytes_ratio_sum_(0.f),
+      cumulative_bytes_moved_(0),
+      cumulative_objects_moved_(0),
       skipped_blocks_lock_("concurrent copying bytes blocks lock", kMarkSweepMarkStackLock),
       measure_read_barrier_slow_path_(measure_read_barrier_slow_path),
       mark_from_read_barrier_measurements_(false),
@@ -158,10 +160,12 @@
     gc_time_histogram_ = metrics->YoungGcCollectionTime();
     metrics_gc_count_ = metrics->YoungGcCount();
     gc_throughput_histogram_ = metrics->YoungGcThroughput();
+    gc_tracing_throughput_hist_ = metrics->YoungGcTracingThroughput();
   } else {
     gc_time_histogram_ = metrics->FullGcCollectionTime();
     metrics_gc_count_ = metrics->FullGcCount();
     gc_throughput_histogram_ = metrics->FullGcThroughput();
+    gc_tracing_throughput_hist_ = metrics->FullGcTracingThroughput();
   }
 }
 
@@ -401,6 +405,7 @@
   objects_moved_.store(0, std::memory_order_relaxed);
   bytes_moved_gc_thread_ = 0;
   objects_moved_gc_thread_ = 0;
+  bytes_scanned_ = 0;
   GcCause gc_cause = GetCurrentIteration()->GetGcCause();
 
   force_evacuate_all_ = false;
@@ -2196,6 +2201,7 @@
 
 inline void ConcurrentCopying::ProcessMarkStackRef(mirror::Object* to_ref) {
   DCHECK(!region_space_->IsInFromSpace(to_ref));
+  size_t obj_size = 0;
   space::RegionSpace::RegionType rtype = region_space_->GetRegionType(to_ref);
   if (kUseBakerReadBarrier) {
     DCHECK(to_ref->GetReadBarrierState() == ReadBarrier::GrayState())
@@ -2282,10 +2288,11 @@
       }
   }
   if (perform_scan) {
+    obj_size = to_ref->SizeOf<kDefaultVerifyFlags>();
     if (use_generational_cc_ && young_gen_) {
-      Scan<true>(to_ref);
+      Scan<true>(to_ref, obj_size);
     } else {
-      Scan<false>(to_ref);
+      Scan<false>(to_ref, obj_size);
     }
   }
   if (kUseBakerReadBarrier) {
@@ -2329,9 +2336,10 @@
     // Add to the live bytes per unevacuated from-space. Note this code is always run by the
     // GC-running thread (no synchronization required).
     DCHECK(region_space_bitmap_->Test(to_ref));
-    size_t obj_size = to_ref->SizeOf<kDefaultVerifyFlags>();
-    size_t alloc_size = RoundUp(obj_size, space::RegionSpace::kAlignment);
-    region_space_->AddLiveBytes(to_ref, alloc_size);
+    if (obj_size == 0) {
+      obj_size = to_ref->SizeOf<kDefaultVerifyFlags>();
+    }
+    region_space_->AddLiveBytes(to_ref, RoundUp(obj_size, space::RegionSpace::kAlignment));
   }
   if (ReadBarrier::kEnableToSpaceInvariantChecks) {
     CHECK(to_ref != nullptr);
@@ -2713,9 +2721,9 @@
     const uint64_t unevac_from_bytes = region_space_->GetBytesAllocatedInUnevacFromSpace();
     const uint64_t unevac_from_objects = region_space_->GetObjectsAllocatedInUnevacFromSpace();
     uint64_t to_bytes = bytes_moved_.load(std::memory_order_relaxed) + bytes_moved_gc_thread_;
-    cumulative_bytes_moved_.fetch_add(to_bytes, std::memory_order_relaxed);
+    cumulative_bytes_moved_ += to_bytes;
     uint64_t to_objects = objects_moved_.load(std::memory_order_relaxed) + objects_moved_gc_thread_;
-    cumulative_objects_moved_.fetch_add(to_objects, std::memory_order_relaxed);
+    cumulative_objects_moved_ += to_objects;
     if (kEnableFromSpaceAccountingCheck) {
       CHECK_EQ(from_space_num_objects_at_first_pause_, from_objects + unevac_from_objects);
       CHECK_EQ(from_space_num_bytes_at_first_pause_, from_bytes + unevac_from_bytes);
@@ -2757,6 +2765,7 @@
                 << heap_->num_bytes_allocated_.load();
     }
     RecordFree(ObjectBytePair(freed_objects, freed_bytes));
+    GetCurrentIteration()->SetScannedBytes(bytes_scanned_);
     if (kVerboseMode) {
       LOG(INFO) << "(after) num_bytes_allocated="
                 << heap_->num_bytes_allocated_.load();
@@ -3135,7 +3144,7 @@
 };
 
 template <bool kNoUnEvac>
-inline void ConcurrentCopying::Scan(mirror::Object* to_ref) {
+inline void ConcurrentCopying::Scan(mirror::Object* to_ref, size_t obj_size) {
   // Cannot have `kNoUnEvac` when Generational CC collection is disabled.
   DCHECK(!kNoUnEvac || use_generational_cc_);
   if (kDisallowReadBarrierDuringScan && !Runtime::Current()->IsActiveTransaction()) {
@@ -3144,6 +3153,11 @@
     // trigger read barriers.
     Thread::Current()->ModifyDebugDisallowReadBarrier(1);
   }
+  if (obj_size == 0) {
+    obj_size = to_ref->SizeOf<kDefaultVerifyFlags>();
+  }
+  bytes_scanned_ += obj_size;
+
   DCHECK(!region_space_->IsInFromSpace(to_ref));
   DCHECK_EQ(Thread::Current(), thread_running_gc_);
   RefFieldsVisitor<kNoUnEvac> visitor(this, thread_running_gc_);
@@ -3841,10 +3855,8 @@
      << (copied_live_bytes_ratio_sum_ / gc_count_) << " over " << gc_count_
      << " " << (young_gen_ ? "minor" : "major") << " GCs\n";
 
-  os << "Cumulative bytes moved "
-     << cumulative_bytes_moved_.load(std::memory_order_relaxed) << "\n";
-  os << "Cumulative objects moved "
-     << cumulative_objects_moved_.load(std::memory_order_relaxed) << "\n";
+  os << "Cumulative bytes moved " << cumulative_bytes_moved_ << "\n";
+  os << "Cumulative objects moved " << cumulative_objects_moved_ << "\n";
 
   os << "Peak regions allocated "
      << region_space_->GetMaxPeakNumNonFreeRegions() << " ("
diff --git a/runtime/gc/collector/concurrent_copying.h b/runtime/gc/collector/concurrent_copying.h
index 582328f..72d460d 100644
--- a/runtime/gc/collector/concurrent_copying.h
+++ b/runtime/gc/collector/concurrent_copying.h
@@ -177,7 +177,7 @@
       REQUIRES(!mark_stack_lock_, !skipped_blocks_lock_, !immune_gray_stack_lock_);
   // Scan the reference fields of object `to_ref`.
   template <bool kNoUnEvac>
-  void Scan(mirror::Object* to_ref) REQUIRES_SHARED(Locks::mutator_lock_)
+  void Scan(mirror::Object* to_ref, size_t obj_size = 0) REQUIRES_SHARED(Locks::mutator_lock_)
       REQUIRES(!mark_stack_lock_);
   // Scan the reference fields of object 'obj' in the dirty cards during
   // card-table scan. In addition to visiting the references, it also sets the
@@ -410,10 +410,6 @@
   // bytes_moved_gc_thread_ are critical for GC triggering; the others are just informative.
   Atomic<size_t> bytes_moved_;  // Used by mutators
   Atomic<size_t> objects_moved_;  // Used by mutators
-  size_t bytes_moved_gc_thread_;  // Used by GC
-  size_t objects_moved_gc_thread_;  // Used by GC
-  Atomic<uint64_t> cumulative_bytes_moved_;
-  Atomic<uint64_t> cumulative_objects_moved_;
 
   // copied_live_bytes_ratio_sum_ is read and written by CC per GC, in
   // ReclaimPhase, and is read by DumpPerformanceInfo (potentially from another
@@ -436,6 +432,15 @@
   // reclaimed_bytes_ratio = reclaimed_bytes/num_allocated_bytes per GC cycle
   float reclaimed_bytes_ratio_sum_;
 
+  // Used only by the GC thread, so they need not be atomic. Also, they should
+  // be kept in a different cache line from bytes/objects_moved_ (above) to
+  // avoid false sharing.
+  size_t bytes_moved_gc_thread_;
+  size_t objects_moved_gc_thread_;
+  uint64_t bytes_scanned_;
+  uint64_t cumulative_bytes_moved_;
+  uint64_t cumulative_objects_moved_;
+
   // The skipped blocks are memory blocks/chucks that were copies of
   // objects that were unused due to lost races (cas failures) at
   // object copy/forward pointer install. They may be reused.
diff --git a/runtime/gc/collector/garbage_collector.cc b/runtime/gc/collector/garbage_collector.cc
index f6bd996..70aec9a 100644
--- a/runtime/gc/collector/garbage_collector.cc
+++ b/runtime/gc/collector/garbage_collector.cc
@@ -51,6 +51,7 @@
   timings_.Reset();
   pause_times_.clear();
   duration_ns_ = 0;
+  bytes_scanned_ = 0;
   clear_soft_references_ = clear_soft_references;
   gc_cause_ = gc_cause;
   freed_ = ObjectBytePair();
@@ -72,6 +73,7 @@
       gc_time_histogram_(nullptr),
       metrics_gc_count_(nullptr),
       gc_throughput_histogram_(nullptr),
+      gc_tracing_throughput_hist_(nullptr),
       cumulative_timings_(name),
       pause_histogram_lock_("pause histogram lock", kDefaultMutexLevel, true),
       is_transaction_active_(false) {
@@ -88,6 +90,7 @@
   total_time_ns_ = 0u;
   total_freed_objects_ = 0u;
   total_freed_bytes_ = 0;
+  total_scanned_bytes_ = 0;
   rss_histogram_.Reset();
   freed_bytes_histogram_.Reset();
   MutexLock mu(Thread::Current(), pause_histogram_lock_);
@@ -169,6 +172,7 @@
   // Update cumulative statistics with how many bytes the GC iteration freed.
   total_freed_objects_ += current_iteration->GetFreedObjects() +
       current_iteration->GetFreedLargeObjects();
+  total_scanned_bytes_ += current_iteration->GetScannedBytes();
   int64_t freed_bytes = current_iteration->GetFreedBytes() +
       current_iteration->GetFreedLargeObjectBytes();
   total_freed_bytes_ += freed_bytes;
@@ -177,17 +181,18 @@
   uint64_t end_time = NanoTime();
   uint64_t thread_cpu_end_time = ThreadCpuNanoTime();
   total_thread_cpu_time_ns_ += thread_cpu_end_time - thread_cpu_start_time;
-  current_iteration->SetDurationNs(end_time - start_time);
+  uint64_t duration_ns = end_time - start_time;
+  current_iteration->SetDurationNs(duration_ns);
   if (Locks::mutator_lock_->IsExclusiveHeld(self)) {
     // The entire GC was paused, clear the fake pauses which might be in the pause times and add
     // the whole GC duration.
     current_iteration->pause_times_.clear();
-    RegisterPause(current_iteration->GetDurationNs());
+    RegisterPause(duration_ns);
   }
-  total_time_ns_ += current_iteration->GetDurationNs();
+  total_time_ns_ += duration_ns;
   if (gc_time_histogram_ != nullptr) {
     // Report GC time in milliseconds.
-    gc_time_histogram_->Add(NsToMs(current_iteration->GetDurationNs()));
+    gc_time_histogram_->Add(NsToMs(duration_ns));
   }
   uint64_t total_pause_time = 0;
   for (uint64_t pause_time : current_iteration->GetPauseTimes()) {
@@ -201,6 +206,15 @@
   if (metrics_gc_count_ != nullptr) {
     metrics_gc_count_->Add(1);
   }
+
+  if (gc_tracing_throughput_hist_ != nullptr) {
+    // Throughput in bytes/s. Add 1us to prevent possible division by 0.
+    uint64_t throughput = (current_iteration->GetScannedBytes() * 1'000'000)
+            / (NsToUs(duration_ns) + 1);
+    // Report in MB/s.
+    gc_tracing_throughput_hist_->Add(throughput / MB);
+  }
+
   if (gc_throughput_histogram_ != nullptr) {
     // Report GC throughput in MB/s.
     gc_throughput_histogram_->Add(current_iteration->GetEstimatedThroughput() / MB);
@@ -247,6 +261,7 @@
   total_time_ns_ = 0u;
   total_freed_objects_ = 0u;
   total_freed_bytes_ = 0;
+  total_scanned_bytes_ = 0u;
 }
 
 GarbageCollector::ScopedPause::ScopedPause(GarbageCollector* collector, bool with_reporting)
@@ -306,6 +321,7 @@
   const double seconds = NsToMs(total_ns) / 1000.0;
   const uint64_t freed_bytes = GetTotalFreedBytes();
   const uint64_t freed_objects = GetTotalFreedObjects();
+  const uint64_t scanned_bytes = GetTotalScannedBytes();
   {
     MutexLock mu(Thread::Current(), pause_histogram_lock_);
     if (pause_histogram_.SampleSize() > 0) {
@@ -343,7 +359,11 @@
      << PrettySize(freed_bytes / seconds) << "/s"
      << "  per cpu-time: "
      << static_cast<uint64_t>(freed_bytes / cpu_seconds) << "/s / "
-     << PrettySize(freed_bytes / cpu_seconds) << "/s\n";
+     << PrettySize(freed_bytes / cpu_seconds) << "/s\n"
+     << GetName() << " tracing throughput: "
+     << PrettySize(scanned_bytes / seconds) << "/s "
+     << " per cpu-time: "
+     << PrettySize(scanned_bytes / cpu_seconds) << "/s\n";
 }
 
 }  // namespace collector
diff --git a/runtime/gc/collector/garbage_collector.h b/runtime/gc/collector/garbage_collector.h
index b42f189..d2b1170 100644
--- a/runtime/gc/collector/garbage_collector.h
+++ b/runtime/gc/collector/garbage_collector.h
@@ -92,6 +92,9 @@
   uint64_t GetTotalFreedObjects() const {
     return total_freed_objects_;
   }
+  uint64_t GetTotalScannedBytes() const {
+    return total_scanned_bytes_;
+  }
   // Reset the cumulative timings and pause histogram.
   void ResetMeasurements() REQUIRES(!pause_histogram_lock_);
   // Returns the estimated throughput in bytes / second.
@@ -160,10 +163,12 @@
   metrics::MetricsBase<int64_t>* gc_time_histogram_;
   metrics::MetricsBase<uint64_t>* metrics_gc_count_;
   metrics::MetricsBase<int64_t>* gc_throughput_histogram_;
+  metrics::MetricsBase<int64_t>* gc_tracing_throughput_hist_;
   uint64_t total_thread_cpu_time_ns_;
   uint64_t total_time_ns_;
   uint64_t total_freed_objects_;
   int64_t total_freed_bytes_;
+  uint64_t total_scanned_bytes_;
   CumulativeLogger cumulative_timings_;
   mutable Mutex pause_histogram_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
   bool is_transaction_active_;
diff --git a/runtime/gc/collector/iteration.h b/runtime/gc/collector/iteration.h
index 363459a..348f49d 100644
--- a/runtime/gc/collector/iteration.h
+++ b/runtime/gc/collector/iteration.h
@@ -60,6 +60,12 @@
   uint64_t GetFreedRevokeBytes() const {
     return freed_bytes_revoke_;
   }
+  uint64_t GetScannedBytes() const {
+    return bytes_scanned_;
+  }
+  void SetScannedBytes(uint64_t bytes) {
+    bytes_scanned_ = bytes;
+  }
   void SetFreedRevoke(uint64_t freed) {
     freed_bytes_revoke_ = freed;
   }
@@ -84,6 +90,7 @@
   GcCause gc_cause_;
   bool clear_soft_references_;
   uint64_t duration_ns_;
+  uint64_t bytes_scanned_;
   TimingLogger timings_;
   ObjectBytePair freed_;
   ObjectBytePair freed_los_;
diff --git a/runtime/metrics/statsd.cc b/runtime/metrics/statsd.cc
index dac41b9..7e74cf6 100644
--- a/runtime/metrics/statsd.cc
+++ b/runtime/metrics/statsd.cc
@@ -31,6 +31,7 @@
 // EncodeDatumId returns a std::optional that provides a enum value from atoms.proto if the datum is
 // one that we support logging to statsd. The list of datums that ART collects is a superset of what
 // we report to statsd. Therefore, we only have mappings for the DatumIds that statsd recognizes.
+// Note also that histograms are not yet handled by statsd.
 //
 // Other code can use whether the result of this function has a value to decide whether to report
 // the atom to statsd.
@@ -76,14 +77,22 @@
     case DatumId::kYoungGcThroughput:
       return std::make_optional(
           statsd::
-              ART_DATUM_REPORTED__KIND__ART_DATUM_GC_YOUNG_GENERATION_COLLECTION_THROUGHPUT_MB_PER_SEC);
+              ART_DATUM_REPORTED__KIND__ART_DATUM_GC_YOUNG_GENERATION_COLLECTION_THROUGHPUT_HISTO_MB_PER_SEC);
     case DatumId::kFullGcThroughput:
       return std::make_optional(
           statsd::
-              ART_DATUM_REPORTED__KIND__ART_DATUM_GC_FULL_HEAP_COLLECTION_THROUGHPUT_MB_PER_SEC);
+              ART_DATUM_REPORTED__KIND__ART_DATUM_GC_FULL_HEAP_COLLECTION_THROUGHPUT_HISTO_MB_PER_SEC);
     case DatumId::kJitMethodCompileCount:
       return std::make_optional(
           statsd::ART_DATUM_REPORTED__KIND__ART_DATUM_JIT_METHOD_COMPILE_COUNT);
+    case DatumId::kYoungGcTracingThroughput:
+      return std::make_optional(
+          statsd::
+              ART_DATUM_REPORTED__KIND__ART_DATUM_GC_YOUNG_GENERATION_TRACING_THROUGHPUT_HISTO_MB_PER_SEC);
+    case DatumId::kFullGcTracingThroughput:
+      return std::make_optional(
+          statsd::
+              ART_DATUM_REPORTED__KIND__ART_DATUM_GC_FULL_HEAP_TRACING_THROUGHPUT_HISTO_MB_PER_SEC);
   }
 }