Thread-local allocation stack.

Each mutator thread now reserves a fixed-size segment (128 references) of the
shared allocation stack via AtomicBumpBack() and pushes newly allocated
objects into it with plain stores, avoiding a CAS per allocation. The
thread-local segments are revoked during the GC pause, and any unused (null)
slots are skipped when the stack is marked, swept, or verified.

With this change, Ritz MemAllocTest gets ~14% faster on N4.

Bug: 9986565
Change-Id: I2fb7d6f7c5daa63dd4fc73ba739e6ae4ed820617
diff --git a/runtime/gc/accounting/atomic_stack.h b/runtime/gc/accounting/atomic_stack.h
index ea8f89c..d6f3228 100644
--- a/runtime/gc/accounting/atomic_stack.h
+++ b/runtime/gc/accounting/atomic_stack.h
@@ -73,6 +73,41 @@
     return true;
   }
 
+  // Atomically bump the back index by the given number of
+  // slots. Returns false if we overflowed the stack.
+  bool AtomicBumpBack(size_t num_slots, T** start_address, T** end_address) {
+    if (kIsDebugBuild) {
+      debug_is_sorted_ = false;
+    }
+    int32_t index;
+    int32_t new_index;
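+    // CAS loop: reserve the half-open range [index, new_index); retry if another
+    // thread advanced back_index_ first.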
+    do {
+      index = back_index_;
+      new_index = index + num_slots;
+      if (UNLIKELY(static_cast<size_t>(new_index) >= capacity_)) {
+        // Stack overflow.
+        return false;
+      }
+    } while (!back_index_.CompareAndSwap(index, new_index));
+    *start_address = &begin_[index];
+    *end_address = &begin_[new_index];
+    if (kIsDebugBuild) {
+      // Sanity check that the memory is zero.
+      for (int32_t i = index; i < new_index; ++i) {
+        DCHECK_EQ(begin_[i], static_cast<T>(0)) << "i=" << i << " index=" << index << " new_index=" << new_index;
+      }
+    }
+    return true;
+  }
+
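+  // Debug-only check that every slot in the stack's backing store is null.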
+  void AssertAllZero() {
+    if (kIsDebugBuild) {
+      for (size_t i = 0; i < capacity_; ++i) {
+        DCHECK_EQ(begin_[i], static_cast<T>(0)) << "i=" << i;
+      }
+    }
+  }
+
   void PushBack(const T& value) {
     if (kIsDebugBuild) {
       debug_is_sorted_ = false;
diff --git a/runtime/gc/allocator/rosalloc.cc b/runtime/gc/allocator/rosalloc.cc
index 65d4c441..d02b851 100644
--- a/runtime/gc/allocator/rosalloc.cc
+++ b/runtime/gc/allocator/rosalloc.cc
@@ -1560,7 +1560,8 @@
 void RosAlloc::RevokeAllThreadLocalRuns() {
   // This is called when a mutator thread won't allocate such as at
   // the Zygote creation time or during the GC pause.
-  MutexLock mu(Thread::Current(), *Locks::thread_list_lock_);
+  MutexLock mu(Thread::Current(), *Locks::runtime_shutdown_lock_);
+  MutexLock mu2(Thread::Current(), *Locks::thread_list_lock_);
   std::list<Thread*> thread_list = Runtime::Current()->GetThreadList()->GetList();
   for (auto it = thread_list.begin(); it != thread_list.end(); ++it) {
     Thread* t = *it;
diff --git a/runtime/gc/collector/mark_sweep.cc b/runtime/gc/collector/mark_sweep.cc
index de9f59e..dbbc115 100644
--- a/runtime/gc/collector/mark_sweep.cc
+++ b/runtime/gc/collector/mark_sweep.cc
@@ -206,6 +206,10 @@
     // This second sweep makes sure that we don't have any objects in the live stack which point to
     // freed objects. These cause problems since their references may be previously freed objects.
     SweepArray(GetHeap()->allocation_stack_.get(), false);
+    // Since SweepArray() above resets the (active) allocation stack, we need
+    // to revoke the thread-local allocation stacks that point into it.
+    GetHeap()->RevokeAllThreadLocalAllocationStacks(self);
   }
 
   timings_.StartSplit("PreSweepingGcVerification");
@@ -241,12 +245,15 @@
   // Need to do this before the checkpoint since we don't want any threads to add references to
   // the live stack during the recursive mark.
   timings_.NewSplit("SwapStacks");
-  heap_->SwapStacks();
+  heap_->SwapStacks(self);
 
   WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
   if (Locks::mutator_lock_->IsExclusiveHeld(self)) {
     // If we exclusively hold the mutator lock, all threads must be suspended.
     MarkRoots();
+    if (kUseThreadLocalAllocationStack) {
+      heap_->RevokeAllThreadLocalAllocationStacks(self);
+    }
   } else {
     MarkThreadRoots(self);
     // At this point the live stack should no longer have any mutators which push into it.
@@ -995,6 +1002,9 @@
         << thread->GetState() << " thread " << thread << " self " << self;
     thread->VisitRoots(MarkSweep::MarkRootParallelCallback, mark_sweep_);
     ATRACE_END();
+    if (kUseThreadLocalAllocationStack) {
+      thread->RevokeThreadLocalAllocationStack();
+    }
     mark_sweep_->GetBarrier().Pass(self);
   }
 
@@ -1062,6 +1072,9 @@
     Object** out = objects;
     for (size_t i = 0; i < count; ++i) {
       Object* obj = objects[i];
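+      // Thread-local allocation stack segments can leave unused (null) slots; skip them.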
+      if (kUseThreadLocalAllocationStack && obj == nullptr) {
+        continue;
+      }
       if (space->HasAddress(obj)) {
         // This object is in the space, remove it from the array and add it to the sweep buffer
         // if needed.
@@ -1100,6 +1113,9 @@
   for (size_t i = 0; i < count; ++i) {
     Object* obj = objects[i];
     // Handle large objects.
+    if (kUseThreadLocalAllocationStack && obj == nullptr) {
+      continue;
+    }
     if (!large_mark_objects->Test(obj)) {
       ++freed_large_objects;
       freed_large_object_bytes += large_object_space->Free(self, obj);
diff --git a/runtime/gc/collector/semi_space.cc b/runtime/gc/collector/semi_space.cc
index ac33cc7..b1122b9 100644
--- a/runtime/gc/collector/semi_space.cc
+++ b/runtime/gc/collector/semi_space.cc
@@ -210,7 +210,10 @@
   // Need to do this before the checkpoint since we don't want any threads to add references to
   // the live stack during the recursive mark.
   timings_.NewSplit("SwapStacks");
-  heap_->SwapStacks();
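+  // Revoke the thread-local allocation stack segments before the swap so that no
+  // thread keeps a segment pointing into what becomes the live stack.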
+  if (kUseThreadLocalAllocationStack) {
+    heap_->RevokeAllThreadLocalAllocationStacks(self_);
+  }
+  heap_->SwapStacks(self_);
   WriterMutexLock mu(self_, *Locks::heap_bitmap_lock_);
   MarkRoots();
   // Mark roots of immune spaces.
diff --git a/runtime/gc/heap-inl.h b/runtime/gc/heap-inl.h
index 5e1136b..9c91b0e 100644
--- a/runtime/gc/heap-inl.h
+++ b/runtime/gc/heap-inl.h
@@ -82,11 +82,7 @@
     DCHECK(!Runtime::Current()->HasStatsEnabled());
   }
   if (AllocatorHasAllocationStack(allocator)) {
-    // This is safe to do since the GC will never free objects which are neither in the allocation
-    // stack or the live bitmap.
-    while (!allocation_stack_->AtomicPushBack(obj)) {
-      CollectGarbageInternal(collector::kGcTypeSticky, kGcCauseForAlloc, false);
-    }
+    PushOnAllocationStack(self, obj);
   }
   if (kInstrumented) {
     if (Dbg::IsAllocTrackingEnabled()) {
@@ -111,6 +107,35 @@
   return obj;
 }
 
+// The size of a thread-local allocation stack, in number of references.
+static constexpr size_t kThreadLocalAllocationStackSize = 128;
+
+inline void Heap::PushOnAllocationStack(Thread* self, mirror::Object* obj) {
+  if (kUseThreadLocalAllocationStack) {
+    bool success = self->PushOnThreadLocalAllocationStack(obj);
+    if (UNLIKELY(!success)) {
+      // Slow path. Allocate a new thread-local allocation stack.
+      mirror::Object** start_address;
+      mirror::Object** end_address;
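+      // Reserve a fresh kThreadLocalAllocationStackSize-slot segment of the shared
+      // allocation stack; if it is full, run a sticky GC and retry.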
+      while (!allocation_stack_->AtomicBumpBack(kThreadLocalAllocationStackSize,
+                                                &start_address, &end_address)) {
+        CollectGarbageInternal(collector::kGcTypeSticky, kGcCauseForAlloc, false);
+      }
+      self->SetThreadLocalAllocationStack(start_address, end_address);
+      // Retry on the new thread-local allocation stack.
+      success = self->PushOnThreadLocalAllocationStack(obj);
+      // Must succeed.
+      CHECK(success);
+    }
+  } else {
+    // This is safe to do since the GC will never free objects which are neither in the allocation
+    // stack nor in the live bitmap.
+    while (!allocation_stack_->AtomicPushBack(obj)) {
+      CollectGarbageInternal(collector::kGcTypeSticky, kGcCauseForAlloc, false);
+    }
+  }
+}
+
 template <bool kInstrumented, typename PreFenceVisitor>
 inline mirror::Object* Heap::AllocLargeObject(Thread* self, mirror::Class* klass,
                                               size_t byte_count,
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index 5c174f8..f1126ef 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -383,7 +383,8 @@
     mirror::Object* obj = *it;
     if (obj != nullptr && obj->GetClass() != nullptr) {
       // Avoid the race condition caused by the object not yet being written into the allocation
-      // stack or the class not yet being written in the object.
+      // stack or the class not yet being written into the object. Also, if
+      // kUseThreadLocalAllocationStack is true, there can be null slots on the allocation stack.
       callback(obj, arg);
     }
   }
@@ -1533,13 +1534,14 @@
   mirror::Object** limit = stack->End();
   for (mirror::Object** it = stack->Begin(); it != limit; ++it) {
     const mirror::Object* obj = *it;
-    DCHECK(obj != nullptr);
-    if (bitmap1->HasAddress(obj)) {
-      bitmap1->Set(obj);
-    } else if (bitmap2->HasAddress(obj)) {
-      bitmap2->Set(obj);
-    } else {
-      large_objects->Set(obj);
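+    // Skip null slots left by partially filled thread-local allocation stack segments.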
+    if (!kUseThreadLocalAllocationStack || obj != nullptr) {
+      if (bitmap1->HasAddress(obj)) {
+        bitmap1->Set(obj);
+      } else if (bitmap2->HasAddress(obj)) {
+        bitmap2->Set(obj);
+      } else {
+        large_objects->Set(obj);
+      }
     }
   }
 }
@@ -2004,7 +2006,9 @@
 
   // We can verify objects in the live stack since none of these should reference dead objects.
   for (mirror::Object** it = live_stack_->Begin(); it != live_stack_->End(); ++it) {
-    visitor(*it);
+    if (!kUseThreadLocalAllocationStack || *it != nullptr) {
+      visitor(*it);
+    }
   }
 
   if (visitor.Failed()) {
@@ -2014,10 +2018,30 @@
   return true;
 }
 
-void Heap::SwapStacks() {
+void Heap::SwapStacks(Thread* self) {
+  if (kUseThreadLocalAllocationStack) {
+    live_stack_->AssertAllZero();
+  }
   allocation_stack_.swap(live_stack_);
 }
 
+void Heap::RevokeAllThreadLocalAllocationStacks(Thread* self) {
+  if (!Runtime::Current()->IsStarted()) {
+    // There's no thread list if the runtime hasn't started (e.g.,
+    // dex2oat or a test). Just revoke for self.
+    self->RevokeThreadLocalAllocationStack();
+    return;
+  }
+  // This must be called only during the pause.
+  CHECK(Locks::mutator_lock_->IsExclusiveHeld(self));
+  MutexLock mu(self, *Locks::runtime_shutdown_lock_);
+  MutexLock mu2(self, *Locks::thread_list_lock_);
+  std::list<Thread*> thread_list = Runtime::Current()->GetThreadList()->GetList();
+  for (Thread* t : thread_list) {
+    t->RevokeThreadLocalAllocationStack();
+  }
+}
+
 accounting::ModUnionTable* Heap::FindModUnionTableFromSpace(space::Space* space) {
   auto it = mod_union_tables_.find(space);
   if (it == mod_union_tables_.end()) {
@@ -2072,12 +2096,12 @@
     thread_list->SuspendAll();
     {
       ReaderMutexLock mu(self, *Locks::heap_bitmap_lock_);
-      SwapStacks();
+      SwapStacks(self);
       // Sort the live stack so that we can quickly binary search it later.
       if (!VerifyMissingCardMarks()) {
         LOG(FATAL) << "Pre " << gc->GetName() << " missing card mark verification failed";
       }
-      SwapStacks();
+      SwapStacks(self);
     }
     thread_list->ResumeAll();
   }
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index e416c0e..80a5a1a 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -111,6 +111,9 @@
 // If true, use rosalloc/RosAllocSpace instead of dlmalloc/DlMallocSpace
 static constexpr bool kUseRosAlloc = true;
 
+// If true, use a thread-local allocation stack.
+static constexpr bool kUseThreadLocalAllocationStack = true;
+
 // The process state passed in from the activity manager, used to determine when to do trimming
 // and compaction.
 enum ProcessState {
@@ -665,11 +668,17 @@
       SHARED_LOCKS_REQUIRED(GlobalSychronization::heap_bitmap_lock_);
 
   // Swap the allocation stack with the live stack.
-  void SwapStacks();
+  void SwapStacks(Thread* self);
+
+  // Revoke all the thread-local allocation stacks.
+  void RevokeAllThreadLocalAllocationStacks(Thread* self);
 
   // Clear cards and update the mod union table.
   void ProcessCards(TimingLogger& timings);
 
+  // Push an object onto the allocation stack.
+  void PushOnAllocationStack(Thread* self, mirror::Object* obj);
+
   // All-known continuous spaces, where objects lie within fixed bounds.
   std::vector<space::ContinuousSpace*> continuous_spaces_;
 
diff --git a/runtime/thread-inl.h b/runtime/thread-inl.h
index 9420e7b..c0bf377 100644
--- a/runtime/thread-inl.h
+++ b/runtime/thread-inl.h
@@ -170,6 +170,42 @@
   return ret;
 }
 
+inline bool Thread::PushOnThreadLocalAllocationStack(mirror::Object* obj) {
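+  // Fast path: a plain store and pointer bump within the thread-private segment;
+  // no atomic operation is needed.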
+  DCHECK_LE(thread_local_alloc_stack_top_, thread_local_alloc_stack_end_);
+  if (thread_local_alloc_stack_top_ < thread_local_alloc_stack_end_) {
+    // There's room.
+    DCHECK_LE(reinterpret_cast<byte*>(thread_local_alloc_stack_top_) + sizeof(mirror::Object*),
+              reinterpret_cast<byte*>(thread_local_alloc_stack_end_));
+    DCHECK(*thread_local_alloc_stack_top_ == nullptr);
+    *thread_local_alloc_stack_top_ = obj;
+    ++thread_local_alloc_stack_top_;
+    return true;
+  }
+  return false;
+}
+
+inline void Thread::SetThreadLocalAllocationStack(mirror::Object** start, mirror::Object** end) {
+  DCHECK(Thread::Current() == this) << "Should be called by self";
+  DCHECK(start != nullptr);
+  DCHECK(end != nullptr);
+  DCHECK_ALIGNED(start, sizeof(mirror::Object*));
+  DCHECK_ALIGNED(end, sizeof(mirror::Object*));
+  DCHECK_LT(start, end);
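+  // Install the new segment: the top pointer starts at the segment start and grows toward the end.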
+  thread_local_alloc_stack_end_ = end;
+  thread_local_alloc_stack_top_ = start;
+}
+
+inline void Thread::RevokeThreadLocalAllocationStack() {
+  if (kIsDebugBuild) {
+    // Note: self is not necessarily equal to this thread since this thread may be suspended.
+    Thread* self = Thread::Current();
+    DCHECK(this == self || IsSuspended() || GetState() == kWaitingPerformingGc)
+        << GetState() << " thread " << this << " self " << self;
+  }
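+  // Clearing both pointers forces the next push to take the slow path and
+  // reserve a fresh segment.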
+  thread_local_alloc_stack_end_ = nullptr;
+  thread_local_alloc_stack_top_ = nullptr;
+}
+
 }  // namespace art
 
 #endif  // ART_RUNTIME_THREAD_INL_H_
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 9797a48..3382811 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -963,7 +963,9 @@
       thread_local_start_(nullptr),
       thread_local_pos_(nullptr),
       thread_local_end_(nullptr),
-      thread_local_objects_(0) {
+      thread_local_objects_(0),
+      thread_local_alloc_stack_top_(nullptr),
+      thread_local_alloc_stack_end_(nullptr) {
   CHECK_EQ((sizeof(Thread) % 4), 0U) << sizeof(Thread);
   state_and_flags_.as_struct.flags = 0;
   state_and_flags_.as_struct.state = kNative;
diff --git a/runtime/thread.h b/runtime/thread.h
index a3a77bb..6c072ba 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -829,6 +829,19 @@
   static const size_t kRosAllocNumOfSizeBrackets = 34;
   void* rosalloc_runs_[kRosAllocNumOfSizeBrackets];
 
+  // Thread-local allocation stack data/routines.
+  mirror::Object** thread_local_alloc_stack_top_;
+  mirror::Object** thread_local_alloc_stack_end_;
+
+  // Pushes an object onto the thread-local allocation stack; returns false if it is full.
+  bool PushOnThreadLocalAllocationStack(mirror::Object* obj);
+
+  // Sets the thread-local allocation stack pointers to the given start and end.
+  void SetThreadLocalAllocationStack(mirror::Object** start, mirror::Object** end);
+
+  // Resets the thread-local allocation stack pointers.
+  void RevokeThreadLocalAllocationStack();
+
  private:
   friend class Dbg;  // For SetStateUnsafe.
   friend class Monitor;