Trigger fewer GCs during startup

Instead of explicitly triggering a GC after two seconds, gradually
reduce the GC triggering threshold. In particular, a small process
that almost immediately goes into the background should GC only
as part of the transition to background.
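
A minimal standalone sketch of the resulting schedule, for illustration
only: the FakeHeap type and the 256/16 MiB sizes are made up, and the
"~2 s" / "~10 s" steps assume kPostForkMaxHeapDurationMS is the existing
two-second delay mentioned above.

  // Standalone sketch of the post-fork shrink schedule; not ART code.
  #include <algorithm>
  #include <cstddef>
  #include <cstdint>
  #include <cstdio>

  struct FakeHeap {
    size_t growth_limit;
    size_t initial_heap_size;
    size_t target_footprint;
    uint32_t gc_num;  // Incremented whenever a GC completes.
  };

  // Mirrors ReduceTargetFootprintTask::Run(): shrink only if no GC has run
  // since the task was scheduled and the new target is actually smaller.
  void ReduceTargetFootprint(FakeHeap& heap, size_t new_target_sz, uint32_t initial_gc_num) {
    if (heap.gc_num == initial_gc_num && heap.target_footprint > new_target_sz) {
      heap.target_footprint = new_target_sz;
    }
  }

  int main() {
    FakeHeap heap = {/*growth_limit=*/256u << 20, /*initial_heap_size=*/16u << 20,
                     /*target_footprint=*/256u << 20, /*gc_num=*/0};
    const uint32_t starting_gc_num = heap.gc_num;
    // ~2 s after fork: shrink to max(growth_limit / 4, initial_heap_size).
    ReduceTargetFootprint(heap, std::max(heap.growth_limit / 4, heap.initial_heap_size),
                          starting_gc_num);
    std::printf("after first task:  %zu MiB\n", heap.target_footprint >> 20);
    // ~10 s after fork: shrink to initial_heap_size, typically forcing a GC in a process
    // that kept allocating; a no-op if a GC (e.g. on a background transition) ran first.
    ReduceTargetFootprint(heap, heap.initial_heap_size, starting_gc_num);
    std::printf("after second task: %zu MiB\n", heap.target_footprint >> 20);
    return 0;
  }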

Ensure that the first collection is a full (technically "partial",
non-sticky) GC that tries to collect everything except zygote space.
There should be very few allocated objects except in zygote space.

Along the way, clarify the concurrency rules for accessing
concurrent_start_bytes_.
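
As a reference for the new default computation, a rough free-function
approximation (the two constants below are placeholders, not ART's real
kMin/kMaxConcurrentRemainingBytes values):

  // Sketch of SetDefaultConcurrentStartBytesLocked(); not ART code.
  #include <algorithm>
  #include <cstddef>
  #include <limits>

  constexpr size_t kMinConcurrentRemainingBytes = 128 * 1024;  // Placeholder value.
  constexpr size_t kMaxConcurrentRemainingBytes = 512 * 1024;  // Placeholder value.

  size_t UnsignedDifference(size_t a, size_t b) { return a > b ? a - b : 0; }

  // Request a concurrent GC once allocation comes within a clamped quarter of the
  // target footprint, so the collection can finish before an allocation would fail.
  size_t DefaultConcurrentStartBytes(size_t target_footprint, bool gc_is_concurrent) {
    if (!gc_is_concurrent) {
      return std::numeric_limits<size_t>::max();  // Never trigger a concurrent GC.
    }
    size_t reserve_bytes = target_footprint / 4;
    reserve_bytes = std::min(reserve_bytes, kMaxConcurrentRemainingBytes);
    reserve_bytes = std::max(reserve_bytes, kMinConcurrentRemainingBytes);
    return UnsignedDifference(target_footprint, reserve_bytes);
  }

The non-Locked entry point takes gc_complete_lock_ and returns early if a
collection is running; the Locked variant assumes the caller already
excludes concurrent updaters.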

Test: Build and boot AOSP
Bug: 181351667
Change-Id: Id85d7e405b9305a70ba751a82ab7c2226eb83308
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index 55fdb72..b9f1eba 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -326,6 +326,7 @@
       next_gc_type_(collector::kGcTypePartial),
       capacity_(capacity),
       growth_limit_(growth_limit),
+      initial_heap_size_(initial_size),
       target_footprint_(initial_size),
       // Using kPostMonitorLock as a lock at kDefaultMutexLevel is acquired after
       // this one.
@@ -2142,6 +2143,27 @@
   return HomogeneousSpaceCompactResult::kSuccess;
 }
 
+void Heap::SetDefaultConcurrentStartBytes() {
+  MutexLock mu(Thread::Current(), *gc_complete_lock_);
+  if (collector_type_running_ != kCollectorTypeNone) {
+    // If a collector is already running, just let it set concurrent_start_bytes_ .
+    return;
+  }
+  SetDefaultConcurrentStartBytesLocked();
+}
+
+void Heap::SetDefaultConcurrentStartBytesLocked() {
+  if (IsGcConcurrent()) {
+    size_t target_footprint = target_footprint_.load(std::memory_order_relaxed);
+    size_t reserve_bytes = target_footprint / 4;
+    reserve_bytes = std::min(reserve_bytes, kMaxConcurrentRemainingBytes);
+    reserve_bytes = std::max(reserve_bytes, kMinConcurrentRemainingBytes);
+    concurrent_start_bytes_ = UnsignedDifference(target_footprint, reserve_bytes);
+  } else {
+    concurrent_start_bytes_ = std::numeric_limits<size_t>::max();
+  }
+}
+
 void Heap::ChangeCollector(CollectorType collector_type) {
   // TODO: Only do this with all mutators suspended to avoid races.
   if (collector_type != collector_type_) {
@@ -2188,13 +2210,7 @@
         UNREACHABLE();
       }
     }
-    if (IsGcConcurrent()) {
-      concurrent_start_bytes_ =
-          UnsignedDifference(target_footprint_.load(std::memory_order_relaxed),
-                             kMinConcurrentRemainingBytes);
-    } else {
-      concurrent_start_bytes_ = std::numeric_limits<size_t>::max();
-    }
+    SetDefaultConcurrentStartBytesLocked();
   }
 }
 
@@ -3568,6 +3584,11 @@
 
 void Heap::GrowForUtilization(collector::GarbageCollector* collector_ran,
                               size_t bytes_allocated_before_gc) {
+  // We're running in the thread that set collector_type_running_ to something other than
+  // kCollectorTypeNone, which ensures that only one of us is running. Hence
+  // collector_type_running_ != kCollectorTypeNone here, but that is a little tricky to turn
+  // into a DCHECK.
+
   // We know what our utilization is at this moment.
   // This doesn't actually resize any memory. It just lets the heap grow more when necessary.
   const size_t bytes_allocated = GetBytesAllocated();
@@ -3693,8 +3714,7 @@
   if (target_footprint_.load(std::memory_order_relaxed) == growth_limit_
       && growth_limit_ < capacity_) {
     target_footprint_.store(capacity_, std::memory_order_relaxed);
-    concurrent_start_bytes_ =
-        UnsignedDifference(capacity_, kMinConcurrentRemainingBytes);
+    SetDefaultConcurrentStartBytes();
   }
   growth_limit_ = capacity_;
   ScopedObjectAccess soa(Thread::Current());
@@ -4471,17 +4491,29 @@
              << PrettySize(new_footprint) << " for a " << PrettySize(alloc_size) << " allocation";
 }
 
-class Heap::TriggerPostForkCCGcTask : public HeapTask {
+// Reduce the target footprint if no GC has occurred since initial_gc_num.
+// If a GC already occurred, it will have done this for us.
+class Heap::ReduceTargetFootprintTask : public HeapTask {
  public:
-  explicit TriggerPostForkCCGcTask(uint64_t target_time) : HeapTask(target_time) {}
+  explicit ReduceTargetFootprintTask(uint64_t target_time, size_t new_target_sz,
+                                     uint32_t initial_gc_num) :
+      HeapTask(target_time), new_target_sz_(new_target_sz), initial_gc_num_(initial_gc_num) {}
   void Run(Thread* self) override {
     gc::Heap* heap = Runtime::Current()->GetHeap();
-    // Trigger a GC, if not already done. The first GC after fork, whenever it
-    // takes place, will adjust the thresholds to normal levels.
-    if (heap->target_footprint_.load(std::memory_order_relaxed) == heap->growth_limit_) {
-      heap->RequestConcurrentGC(self, kGcCauseBackground, false, heap->GetCurrentGcNum());
+    MutexLock mu(self, *(heap->gc_complete_lock_));
+    if (heap->GetCurrentGcNum() == initial_gc_num_
+        && heap->collector_type_running_ == kCollectorTypeNone) {
+      size_t target_footprint = heap->target_footprint_.load(std::memory_order_relaxed);
+      if (target_footprint > new_target_sz_) {
+        if (heap->target_footprint_.CompareAndSetStrongRelaxed(target_footprint, new_target_sz_)) {
+          heap->SetDefaultConcurrentStartBytesLocked();
+        }
+      }
     }
   }
+ private:
+  size_t new_target_sz_;
+  uint32_t initial_gc_num_;
 };
 
 void Heap::PostForkChildAction(Thread* self) {
@@ -4490,12 +4522,30 @@
   if (collector_type_ == kCollectorTypeCC && !IsLowMemoryMode()) {
     // Set target_footprint_ to the largest allowed value.
     SetIdealFootprint(growth_limit_);
-    // Set concurrent_start_bytes_ to half of the heap size.
-    size_t target_footprint = target_footprint_.load(std::memory_order_relaxed);
-    concurrent_start_bytes_ = std::max(target_footprint / 2, GetBytesAllocated());
+    SetDefaultConcurrentStartBytes();
 
-    GetTaskProcessor()->AddTask(
-        self, new TriggerPostForkCCGcTask(NanoTime() + MsToNs(kPostForkMaxHeapDurationMS)));
+    uint32_t starting_gc_num = GetCurrentGcNum();
+    uint64_t current_time = NanoTime();
+    next_gc_type_ = NonStickyGcType();
+    // Shrink the heap after kPostForkMaxHeapDurationMS, to force a memory-hog process to GC.
+    // The reduced target remains high enough that many processes can continue without a GC.
+    if (initial_heap_size_ < growth_limit_) {
+      size_t first_shrink_size = std::max(growth_limit_ / 4, initial_heap_size_);
+      GetTaskProcessor()->AddTask(
+          self, new ReduceTargetFootprintTask(current_time + MsToNs(kPostForkMaxHeapDurationMS),
+                                              first_shrink_size,
+                                              starting_gc_num));
+      // Shrink to a small value after a substantial time period. This will typically force a
+      // GC if none has occurred yet. It has no effect if a GC already happened earlier, which
+      // is commonly the case, e.g. because of a process state transition.
+      if (initial_heap_size_ < first_shrink_size) {
+        GetTaskProcessor()->AddTask(
+            self,
+            new ReduceTargetFootprintTask(current_time + 5 * MsToNs(kPostForkMaxHeapDurationMS),
+                                          initial_heap_size_,
+                                          starting_gc_num));
+      }
+    }
   }
 }
 
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index 27616fb..1a5663e 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -336,9 +336,10 @@
   void ChangeAllocator(AllocatorType allocator)
       REQUIRES(Locks::mutator_lock_, !Locks::runtime_shutdown_lock_);
 
-  // Change the collector to be one of the possible options (MS, CMS, SS).
+  // Change the collector to be one of the possible options (MS, CMS, SS). Only safe when no
+  // concurrent accesses to the heap are possible.
   void ChangeCollector(CollectorType collector_type)
-      REQUIRES(Locks::mutator_lock_);
+      REQUIRES(Locks::mutator_lock_, !*gc_complete_lock_);
 
   // The given reference is believed to be to an object in the Java heap, check the soundness of it.
   // TODO: NO_THREAD_SAFETY_ANALYSIS since we call this everywhere and it is impossible to find a
@@ -411,7 +412,7 @@
 
   // Removes the growth limit on the alloc space so it may grow to its maximum capacity. Used to
   // implement dalvik.system.VMRuntime.clearGrowthLimit.
-  void ClearGrowthLimit();
+  void ClearGrowthLimit() REQUIRES(!*gc_complete_lock_);
 
   // Make the current growth limit the new maximum capacity, unmaps pages at the end of spaces
   // which will never be used. Used to implement dalvik.system.VMRuntime.clampGrowthLimit.
@@ -464,6 +465,7 @@
 
   // For the alloc space, sets the maximum number of bytes that the heap is allowed to allocate
   // from the system. Doesn't allow the space to exceed its growth limit.
+  // Set while we hold gc_complete_lock or collector_type_running_ != kCollectorTypeNone.
   void SetIdealFootprint(size_t max_allowed_footprint);
 
   // Blocks the caller until the garbage collector becomes idle and returns the type of GC we
@@ -960,7 +962,7 @@
 
   const Verification* GetVerification() const;
 
-  void PostForkChildAction(Thread* self);
+  void PostForkChildAction(Thread* self) REQUIRES(!*gc_complete_lock_);
 
   void TraceHeapSize(size_t heap_size);
 
@@ -970,7 +972,7 @@
   class ConcurrentGCTask;
   class CollectorTransitionTask;
   class HeapTrimTask;
-  class TriggerPostForkCCGcTask;
+  class ReduceTargetFootprintTask;
 
   // Compact source space to target space. Returns the collector used.
   collector::GarbageCollector* Compact(space::ContinuousMemMapAllocSpace* target_space,
@@ -1177,6 +1179,9 @@
   // the target utilization ratio.  This should only be called immediately after a full garbage
   // collection. bytes_allocated_before_gc is used to measure bytes / second for the period which
   // the GC was run.
+  // This is only called by the thread that set collector_type_running_ to a value other than
+  // kCollectorTypeNone, or while holding gc_complete_lock, and ensuring that
+  // collector_type_running_ is kCollectorTypeNone.
   void GrowForUtilization(collector::GarbageCollector* collector_ran,
                           size_t bytes_allocated_before_gc = 0)
       REQUIRES(!process_state_update_lock_);
@@ -1269,6 +1274,11 @@
   // of a garbage collection.
   size_t GetNativeBytes();
 
+  // Set concurrent_start_bytes_ to a reasonable guess, given target_footprint_ .
+  void SetDefaultConcurrentStartBytes() REQUIRES(!*gc_complete_lock_);
+  // This version assumes no concurrent updaters.
+  void SetDefaultConcurrentStartBytesLocked();
+
   // All-known continuous spaces, where objects lie within fixed bounds.
   std::vector<space::ContinuousSpace*> continuous_spaces_ GUARDED_BY(Locks::mutator_lock_);
 
@@ -1385,6 +1395,9 @@
   // Task processor, proxies heap trim requests to the daemon threads.
   std::unique_ptr<TaskProcessor> task_processor_;
 
+  // The following fields are declared volatile only for debugging purposes; this shouldn't
+  // otherwise matter.
+
   // Collector type of the running GC.
   volatile CollectorType collector_type_running_ GUARDED_BY(gc_complete_lock_);
 
@@ -1406,21 +1419,29 @@
   // Only weakly enforced for simultaneous allocations.
   size_t growth_limit_;
 
+  // Requested initial heap size. Temporarily ignored after a fork, but reestablished a while
+  // later, which usually triggers the initial GC.
+  size_t initial_heap_size_;
+
   // Target size (as in maximum allocatable bytes) for the heap. Weakly enforced as a limit for
   // non-concurrent GC. Used as a guideline for computing concurrent_start_bytes_ in the
-  // concurrent GC case.
+  // concurrent GC case. Updates normally occur while collector_type_running_ is not none.
   Atomic<size_t> target_footprint_;
 
+  Mutex process_state_update_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
+
   // Computed with foreground-multiplier in GrowForUtilization() when run in
   // jank non-perceptible state. On update to process state from background to
   // foreground we set target_footprint_ to this value.
-  Mutex process_state_update_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
   size_t min_foreground_target_footprint_ GUARDED_BY(process_state_update_lock_);
 
   // When num_bytes_allocated_ exceeds this amount then a concurrent GC should be requested so that
   // it completes ahead of an allocation failing.
   // A multiple of this is also used to determine when to trigger a GC in response to native
   // allocation.
+  // After initialization, this is only updated by the thread that set collector_type_running_ to
+  // a value other than kCollectorTypeNone, or while holding gc_complete_lock, and ensuring that
+  // collector_type_running_ is kCollectorTypeNone.
   size_t concurrent_start_bytes_;
 
   // Since the heap was created, how many bytes have been freed.