New experimental GC stress mode

Tries to do a GC for every unique call stack (up to 16 frames).
The goal is to catch moving GC bugs and lock violations without being
ridiculously slow. Some tests still fail on 64-bit, and the 32-bit host
build doesn't work yet. N5 is booting.
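
For illustration only (not part of the patch): a minimal standalone
sketch of the dedup logic, assuming hypothetical helpers HashBacktrace,
MaybeGcForBacktrace and a CollectGarbage stub as stand-ins for the real
Heap::CheckGcStressMode and Heap::CollectGarbage in the heap.cc hunk
below.

  #include <cstddef>
  #include <cstdint>
  #include <mutex>
  #include <unordered_set>

  // Stand-in for the real collector entry point (Heap::CollectGarbage).
  static void CollectGarbage() { /* trigger a full GC here */ }

  // Same multiply/shift mixing of raw return addresses as in
  // Heap::CheckGcStressMode.
  static uint64_t HashBacktrace(const uintptr_t* frames, size_t count) {
    uint64_t hash = 0;
    for (size_t i = 0; i < count; ++i) {
      hash = hash * 2654435761u + frames[i];
      hash += (hash >> 13) ^ (hash << 6);
    }
    return hash;
  }

  // Force a GC only the first time a given call stack (hash) is seen.
  static void MaybeGcForBacktrace(const uintptr_t* frames, size_t count) {
    static std::mutex backtrace_lock;
    static std::unordered_set<uint64_t> seen_backtraces;
    bool new_backtrace;
    {
      std::lock_guard<std::mutex> lock(backtrace_lock);
      // insert() reports whether the hash was newly added.
      new_backtrace =
          seen_backtraces.insert(HashBacktrace(frames, count)).second;
    }
    if (new_backtrace) {
      CollectGarbage();
    }
  }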

Added runtime -Xgc options: gcstress and nogcstress.
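
The option is parsed by ParseXGcOption (cmdline_types.h hunk below), so
it is passed as -Xgc:gcstress and disabled with -Xgc:nogcstress. As a
hedged illustration of one way an embedder could enable it through the
JNI invocation API (StartVmWithGcStress and the single option string
are hypothetical; a real embedder would also set a classpath and other
options):

  #include <jni.h>

  int StartVmWithGcStress() {
    JavaVMOption options[1];
    options[0].optionString = const_cast<char*>("-Xgc:gcstress");

    JavaVMInitArgs args;
    args.version = JNI_VERSION_1_6;
    args.nOptions = 1;
    args.options = options;
    args.ignoreUnrecognized = JNI_FALSE;

    JavaVM* vm = nullptr;
    JNIEnv* env = nullptr;
    // Returns JNI_OK on success; the started runtime then does a GC for
    // each new allocation backtrace it sees.
    return JNI_CreateJavaVM(&vm, reinterpret_cast<void**>(&env), &args);
  }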

Bug: 21664466

(cherry picked from commit 310008008c90fea246efd00cb99ee7ded97c5209)

Change-Id: Icb8e420f2048e8ee83bcca7937563166a2638f5c
diff --git a/cmdline/cmdline_types.h b/cmdline/cmdline_types.h
index f38478c..2cb86a6 100644
--- a/cmdline/cmdline_types.h
+++ b/cmdline/cmdline_types.h
@@ -472,6 +472,7 @@
   bool verify_pre_gc_rosalloc_ = kIsDebugBuild;
   bool verify_pre_sweeping_rosalloc_ = false;
   bool verify_post_gc_rosalloc_ = false;
+  bool gcstress_ = false;
 };
 
 template <>
@@ -509,6 +510,10 @@
         xgc.verify_post_gc_rosalloc_ = true;
       } else if (gc_option == "nopostverify_rosalloc") {
         xgc.verify_post_gc_rosalloc_ = false;
+      } else if (gc_option == "gcstress") {
+        xgc.gcstress_ = true;
+      } else if (gc_option == "nogcstress") {
+        xgc.gcstress_ = false;
       } else if ((gc_option == "precise") ||
                  (gc_option == "noprecise") ||
                  (gc_option == "verifycardtable") ||
diff --git a/runtime/base/mutex.h b/runtime/base/mutex.h
index 0ab148e..aa91ca1 100644
--- a/runtime/base/mutex.h
+++ b/runtime/base/mutex.h
@@ -61,6 +61,7 @@
   kAbortLock,
   kJdwpSocketLock,
   kRegionSpaceRegionLock,
+  kTransactionLogLock,
   kReferenceQueueSoftReferencesLock,
   kReferenceQueuePhantomReferencesLock,
   kReferenceQueueFinalizerReferencesLock,
@@ -77,7 +78,6 @@
   kDexFileMethodInlinerLock,
   kDexFileToMethodInlinerMapLock,
   kMarkSweepMarkStackLock,
-  kTransactionLogLock,
   kInternTableLock,
   kOatFileSecondaryLookupLock,
   kDefaultMutexLevel,
diff --git a/runtime/class_linker.cc b/runtime/class_linker.cc
index d2805cd..2e97f3c 100644
--- a/runtime/class_linker.cc
+++ b/runtime/class_linker.cc
@@ -1310,7 +1310,7 @@
 // reinit references to when reinitializing a ClassLinker from a
 // mapped image.
 void ClassLinker::VisitRoots(RootVisitor* visitor, VisitRootFlags flags) {
-  class_roots_.VisitRoot(visitor, RootInfo(kRootVMInternal));
+  class_roots_.VisitRootIfNonNull(visitor, RootInfo(kRootVMInternal));
   Thread* const self = Thread::Current();
   {
     ReaderMutexLock mu(self, dex_lock_);
@@ -1333,9 +1333,9 @@
     }
   }
   VisitClassRoots(visitor, flags);
-  array_iftable_.VisitRoot(visitor, RootInfo(kRootVMInternal));
-  for (size_t i = 0; i < kFindArrayCacheSize; ++i) {
-    find_array_class_cache_[i].VisitRootIfNonNull(visitor, RootInfo(kRootVMInternal));
+  array_iftable_.VisitRootIfNonNull(visitor, RootInfo(kRootVMInternal));
+  for (GcRoot<mirror::Class>& root : find_array_class_cache_) {
+    root.VisitRootIfNonNull(visitor, RootInfo(kRootVMInternal));
   }
 }
 
diff --git a/runtime/gc/heap-inl.h b/runtime/gc/heap-inl.h
index a6cafd6..2ec9c86 100644
--- a/runtime/gc/heap-inl.h
+++ b/runtime/gc/heap-inl.h
@@ -174,6 +174,13 @@
   } else {
     DCHECK(!Dbg::IsAllocTrackingEnabled());
   }
+  if (kInstrumented) {
+    if (gc_stress_mode_) {
+      CheckGcStressMode(self, &obj);
+    }
+  } else {
+    DCHECK(!gc_stress_mode_);
+  }
   // IsConcurrentGc() isn't known at compile time so we can optimize by not checking it for
   // the BumpPointer or TLAB allocators. This is nice since it allows the entire if statement to be
   // optimized out. And for the other allocators, AllocatorMayHaveConcurrentGC is a constant since
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index 9a70d69..57557e2 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -21,6 +21,7 @@
 
 #include <limits>
 #include <memory>
+#include <unwind.h>  // For GC verification.
 #include <vector>
 
 #include "art_field-inl.h"
@@ -125,7 +126,8 @@
            bool ignore_max_footprint, bool use_tlab,
            bool verify_pre_gc_heap, bool verify_pre_sweeping_heap, bool verify_post_gc_heap,
            bool verify_pre_gc_rosalloc, bool verify_pre_sweeping_rosalloc,
-           bool verify_post_gc_rosalloc, bool use_homogeneous_space_compaction_for_oom,
+           bool verify_post_gc_rosalloc, bool gc_stress_mode,
+           bool use_homogeneous_space_compaction_for_oom,
            uint64_t min_interval_homogeneous_space_compaction_by_oom)
     : non_moving_space_(nullptr),
       rosalloc_space_(nullptr),
@@ -170,6 +172,7 @@
       verify_pre_gc_rosalloc_(verify_pre_gc_rosalloc),
       verify_pre_sweeping_rosalloc_(verify_pre_sweeping_rosalloc),
       verify_post_gc_rosalloc_(verify_post_gc_rosalloc),
+      gc_stress_mode_(gc_stress_mode),
       /* For GC a lot mode, we limit the allocations stacks to be kGcAlotInterval allocations. This
        * causes a lot of GC since we do a GC for alloc whenever the stack is full. When heap
        * verification is enabled, we limit the size of allocation stacks to speed up their
@@ -209,13 +212,17 @@
       blocking_gc_count_last_window_(0U),
       gc_count_rate_histogram_("gc count rate histogram", 1U, kGcCountRateMaxBucketCount),
       blocking_gc_count_rate_histogram_("blocking gc count rate histogram", 1U,
-                                        kGcCountRateMaxBucketCount) {
+                                        kGcCountRateMaxBucketCount),
+      backtrace_lock_(nullptr),
+      seen_backtrace_count_(0u),
+      unique_backtrace_count_(0u) {
   if (VLOG_IS_ON(heap) || VLOG_IS_ON(startup)) {
     LOG(INFO) << "Heap() entering";
   }
+  Runtime* const runtime = Runtime::Current();
   // If we aren't the zygote, switch to the default non zygote allocator. This may update the
   // entrypoints.
-  const bool is_zygote = Runtime::Current()->IsZygote();
+  const bool is_zygote = runtime->IsZygote();
   if (!is_zygote) {
     // Background compaction is currently not supported for command line runs.
     if (background_collector_type_ != foreground_collector_type_) {
@@ -507,8 +514,12 @@
       LOG(FATAL) << "There's a gap between the image space and the non-moving space";
     }
   }
-  if (running_on_valgrind_) {
-    Runtime::Current()->GetInstrumentation()->InstrumentQuickAllocEntryPoints();
+  instrumentation::Instrumentation* const instrumentation = runtime->GetInstrumentation();
+  if (gc_stress_mode_) {
+    backtrace_lock_ = new Mutex("GC backtrace lock");
+  }
+  if (running_on_valgrind_ || gc_stress_mode_) {
+    instrumentation->InstrumentQuickAllocEntryPoints();
   }
   if (VLOG_IS_ON(heap) || VLOG_IS_ON(startup)) {
     LOG(INFO) << "Heap() exiting";
@@ -1072,6 +1083,12 @@
   STLDeleteElements(&discontinuous_spaces_);
   delete gc_complete_lock_;
   delete pending_task_lock_;
+  delete backtrace_lock_;
+  if (unique_backtrace_count_.LoadRelaxed() != 0 || seen_backtrace_count_.LoadRelaxed() != 0) {
+    LOG(INFO) << "gc stress unique=" << unique_backtrace_count_.LoadRelaxed()
+        << " total=" << seen_backtrace_count_.LoadRelaxed() +
+            unique_backtrace_count_.LoadRelaxed();
+  }
   VLOG(heap) << "Finished ~Heap()";
 }
 
@@ -3675,5 +3692,73 @@
   }
 }
 
+// Based on debug malloc logic from libc/bionic/debug_stacktrace.cpp.
+class StackCrawlState {
+ public:
+  StackCrawlState(uintptr_t* frames, size_t max_depth, size_t skip_count)
+      : frames_(frames), frame_count_(0), max_depth_(max_depth), skip_count_(skip_count) {
+  }
+  size_t GetFrameCount() const {
+    return frame_count_;
+  }
+  static _Unwind_Reason_Code Callback(_Unwind_Context* context, void* arg) {
+    auto* const state = reinterpret_cast<StackCrawlState*>(arg);
+    const uintptr_t ip = _Unwind_GetIP(context);
+    // The first stack frame is get_backtrace itself. Skip it.
+    if (ip != 0 && state->skip_count_ > 0) {
+      --state->skip_count_;
+      return _URC_NO_REASON;
+    }
+    // ip may be off for ARM but it shouldn't matter since we only use it for hashing.
+    state->frames_[state->frame_count_] = ip;
+    state->frame_count_++;
+    return state->frame_count_ >= state->max_depth_ ? _URC_END_OF_STACK : _URC_NO_REASON;
+  }
+
+ private:
+  uintptr_t* const frames_;
+  size_t frame_count_;
+  const size_t max_depth_;
+  size_t skip_count_;
+};
+
+static size_t get_backtrace(uintptr_t* frames, size_t max_depth) {
+  StackCrawlState state(frames, max_depth, 0u);
+  _Unwind_Backtrace(&StackCrawlState::Callback, &state);
+  return state.GetFrameCount();
+}
+
+void Heap::CheckGcStressMode(Thread* self, mirror::Object** obj) {
+  auto* const runtime = Runtime::Current();
+  if (gc_stress_mode_ && runtime->GetClassLinker()->IsInitialized() &&
+      !runtime->IsActiveTransaction() && mirror::Class::HasJavaLangClass()) {
+    // Check if we should GC.
+    bool new_backtrace = false;
+    {
+      static constexpr size_t kMaxFrames = 16u;
+      uintptr_t backtrace[kMaxFrames];
+      const size_t frames = get_backtrace(backtrace, kMaxFrames);
+      uint64_t hash = 0;
+      for (size_t i = 0; i < frames; ++i) {
+        hash = hash * 2654435761 + backtrace[i];
+        hash += (hash >> 13) ^ (hash << 6);
+      }
+      MutexLock mu(self, *backtrace_lock_);
+      new_backtrace = seen_backtraces_.find(hash) == seen_backtraces_.end();
+      if (new_backtrace) {
+        seen_backtraces_.insert(hash);
+      }
+    }
+    if (new_backtrace) {
+      StackHandleScope<1> hs(self);
+      auto h = hs.NewHandleWrapper(obj);
+      CollectGarbage(false);
+      unique_backtrace_count_.FetchAndAddSequentiallyConsistent(1);
+    } else {
+      seen_backtrace_count_.FetchAndAddSequentiallyConsistent(1);
+    }
+  }
+}
+
 }  // namespace gc
 }  // namespace art
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index dac747b..81476a4 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -19,6 +19,7 @@
 
 #include <iosfwd>
 #include <string>
+#include <unordered_set>
 #include <vector>
 
 #include "allocator_type.h"
@@ -180,7 +181,8 @@
                 bool ignore_max_footprint, bool use_tlab,
                 bool verify_pre_gc_heap, bool verify_pre_sweeping_heap, bool verify_post_gc_heap,
                 bool verify_pre_gc_rosalloc, bool verify_pre_sweeping_rosalloc,
-                bool verify_post_gc_rosalloc, bool use_homogeneous_space_compaction,
+                bool verify_post_gc_rosalloc, bool gc_stress_mode,
+                bool use_homogeneous_space_compaction,
                 uint64_t min_interval_homogeneous_space_compaction_by_oom);
 
   ~Heap();
@@ -887,6 +889,10 @@
 
   void UpdateGcCountRateHistograms() EXCLUSIVE_LOCKS_REQUIRED(gc_complete_lock_);
 
+  // GC stress mode attempts to do one GC per unique backtrace.
+  void CheckGcStressMode(Thread* self, mirror::Object** obj)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
   // All-known continuous spaces, where objects lie within fixed bounds.
   std::vector<space::ContinuousSpace*> continuous_spaces_;
 
@@ -1042,6 +1048,7 @@
   bool verify_pre_gc_rosalloc_;
   bool verify_pre_sweeping_rosalloc_;
   bool verify_post_gc_rosalloc_;
+  const bool gc_stress_mode_;
 
   // RAII that temporarily disables the rosalloc verification during
   // the zygote fork.
@@ -1192,6 +1199,14 @@
   // The histogram of the number of blocking GC invocations per window duration.
   Histogram<uint64_t> blocking_gc_count_rate_histogram_ GUARDED_BY(gc_complete_lock_);
 
+  // GC stress related data structures.
+  Mutex* backtrace_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
+  // Debugging counters: seen backtraces vs. unique backtraces.
+  Atomic<uint64_t> seen_backtrace_count_;
+  Atomic<uint64_t> unique_backtrace_count_;
+  // Stack trace hashes that we have already seen.
+  std::unordered_set<uint64_t> seen_backtraces_ GUARDED_BY(backtrace_lock_);
+
   friend class CollectorTransitionTask;
   friend class collector::GarbageCollector;
   friend class collector::MarkCompact;
diff --git a/runtime/mirror/class-inl.h b/runtime/mirror/class-inl.h
index 835b94a..0538f4b 100644
--- a/runtime/mirror/class-inl.h
+++ b/runtime/mirror/class-inl.h
@@ -757,7 +757,7 @@
 }
 
 inline void Class::SetSlowPath(bool enabled) {
-  SetFieldBoolean<false>(GetSlowPathFlagOffset(), enabled);
+  SetFieldBoolean<false, false>(GetSlowPathFlagOffset(), enabled);
 }
 
 inline void Class::InitializeClassVisitor::operator()(
diff --git a/runtime/mirror/class.h b/runtime/mirror/class.h
index ba8a693..0453906 100644
--- a/runtime/mirror/class.h
+++ b/runtime/mirror/class.h
@@ -1030,10 +1030,14 @@
   }
 
   static Class* GetJavaLangClass() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    DCHECK(!java_lang_Class_.IsNull());
+    DCHECK(HasJavaLangClass());
     return java_lang_Class_.Read();
   }
 
+  static bool HasJavaLangClass() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    return !java_lang_Class_.IsNull();
+  }
+
   // Can't call this SetClass or else gets called instead of Object::SetClass in places.
   static void SetClassClass(Class* java_lang_Class) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   static void ResetClass();
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index 4a2a0c9..6c55129 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -144,7 +144,10 @@
 };
 
 Runtime::Runtime()
-    : instruction_set_(kNone),
+    : resolution_method_(nullptr),
+      imt_conflict_method_(nullptr),
+      imt_unimplemented_method_(nullptr),
+      instruction_set_(kNone),
       compiler_callbacks_(nullptr),
       is_zygote_(false),
       must_relocate_(false),
@@ -870,6 +873,7 @@
                        xgc_option.verify_pre_gc_rosalloc_,
                        xgc_option.verify_pre_sweeping_rosalloc_,
                        xgc_option.verify_post_gc_rosalloc_,
+                       xgc_option.gcstress_,
                        runtime_options.GetOrDefault(Opt::EnableHSpaceCompactForOOM),
                        runtime_options.GetOrDefault(Opt::HSpaceCompactForOOMMinIntervalsMs));
   ATRACE_END();