Move startup thread pool back into runtime

Added logic in heap trim to delete the thread pool if there are no
active users.

Added a scoped accessor to prevent reference-counting errors.

Motivation: have the workers already created by the time the app
images are loaded.
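
For reference, a minimal sketch of how the scoped accessor is meant to
be used (mirroring the MapImageFile change below; the null check is
needed because heap trim may already have deleted the pool):

  Runtime::ScopedThreadPoolUsage stpu;
  ThreadPool* const pool = stpu.GetThreadPool();
  if (pool != nullptr) {
    // Dispatch startup work to the pool here.
  }
  // ~ScopedThreadPoolUsage drops the reference; once the count reaches
  // zero, a later heap trim can delete the pool.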

Bug: 116052292
Test: test-art-host
Change-Id: I8ea776d74e88601222a9989e0c6dac34cf77c683
diff --git a/runtime/base/locks.cc b/runtime/base/locks.cc
index cfc9f1d..a7922a2 100644
--- a/runtime/base/locks.cc
+++ b/runtime/base/locks.cc
@@ -61,6 +61,7 @@
 Mutex* Locks::reference_queue_soft_references_lock_ = nullptr;
 Mutex* Locks::reference_queue_weak_references_lock_ = nullptr;
 Mutex* Locks::runtime_shutdown_lock_ = nullptr;
+Mutex* Locks::runtime_thread_pool_lock_ = nullptr;
 Mutex* Locks::cha_lock_ = nullptr;
 Mutex* Locks::subtype_check_lock_ = nullptr;
 Mutex* Locks::thread_list_lock_ = nullptr;
@@ -154,6 +155,7 @@
     DCHECK(user_code_suspension_lock_ != nullptr);
     DCHECK(dex_lock_ != nullptr);
     DCHECK(native_debug_interface_lock_ != nullptr);
+    DCHECK(runtime_thread_pool_lock_ != nullptr);
   } else {
     // Create global locks in level order from highest lock level to lowest.
     LockLevel current_lock_level = kInstrumentEntrypointsLock;
@@ -189,6 +191,10 @@
     DCHECK(runtime_shutdown_lock_ == nullptr);
     runtime_shutdown_lock_ = new Mutex("runtime shutdown lock", current_lock_level);
 
+    UPDATE_CURRENT_LOCK_LEVEL(kRuntimeThreadPoolLock);
+    DCHECK(runtime_thread_pool_lock_ == nullptr);
+    runtime_thread_pool_lock_ = new Mutex("runtime thread pool lock", current_lock_level);
+
     UPDATE_CURRENT_LOCK_LEVEL(kProfilerLock);
     DCHECK(profiler_lock_ == nullptr);
     profiler_lock_ = new Mutex("profiler lock", current_lock_level);
diff --git a/runtime/base/locks.h b/runtime/base/locks.h
index 8cbe372..57719f1 100644
--- a/runtime/base/locks.h
+++ b/runtime/base/locks.h
@@ -117,6 +117,7 @@
   kJdwpEventListLock,
   kJdwpAttachLock,
   kJdwpStartLock,
+  kRuntimeThreadPoolLock,
   kRuntimeShutdownLock,
   kTraceLock,
   kHeapBitmapLock,
@@ -224,8 +225,11 @@
   // Guards shutdown of the runtime.
   static Mutex* runtime_shutdown_lock_ ACQUIRED_AFTER(heap_bitmap_lock_);
 
+  // Guards the runtime startup thread pool and its reference count.
+  static Mutex* runtime_thread_pool_lock_ ACQUIRED_AFTER(runtime_shutdown_lock_);
+
   // Guards background profiler global state.
-  static Mutex* profiler_lock_ ACQUIRED_AFTER(runtime_shutdown_lock_);
+  static Mutex* profiler_lock_ ACQUIRED_AFTER(runtime_thread_pool_lock_);
 
   // Guards trace (ie traceview) requests.
   static Mutex* trace_lock_ ACQUIRED_AFTER(profiler_lock_);
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index bf8aaae..8f9967f 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -1416,6 +1416,11 @@
   TrimSpaces(self);
   // Trim arenas that may have been used by JIT or verifier.
   runtime->GetArenaPool()->TrimMaps();
+  {
+    // TODO: Move this to a callback called when startup is finished (b/120671223).
+    ScopedTrace trace2("Delete thread pool");
+    runtime->DeleteThreadPool();
+  }
 }
 
 class TrimIndirectReferenceTableClosure : public Closure {
diff --git a/runtime/gc/space/image_space.cc b/runtime/gc/space/image_space.cc
index 66db063..4f9b3f9 100644
--- a/runtime/gc/space/image_space.cc
+++ b/runtime/gc/space/image_space.cc
@@ -21,7 +21,6 @@
 #include <unistd.h>
 
 #include <random>
-#include <thread>
 
 #include "android-base/stringprintf.h"
 #include "android-base/strings.h"
@@ -685,40 +684,12 @@
       REQUIRES_SHARED(Locks::mutator_lock_) {
     TimingLogger logger(__PRETTY_FUNCTION__, /*precise=*/ true, VLOG_IS_ON(image));
 
-    std::unique_ptr<ThreadPool> thread_pool;
     std::unique_ptr<ImageSpace> space = Init(image_filename,
                                              image_location,
                                              oat_file,
                                              &logger,
-                                             &thread_pool,
                                              image_reservation,
                                              error_msg);
-    if (thread_pool != nullptr) {
-      // Delay the thread pool deletion to prevent the deletion slowing down the startup by causing
-      // preemption. TODO: Just do this in heap trim.
-      static constexpr uint64_t kThreadPoolDeleteDelay = MsToNs(5000);
-
-      class DeleteThreadPoolTask : public HeapTask {
-       public:
-        explicit DeleteThreadPoolTask(std::unique_ptr<ThreadPool>&& thread_pool)
-            : HeapTask(NanoTime() + kThreadPoolDeleteDelay), thread_pool_(std::move(thread_pool)) {}
-
-        void Run(Thread* self) override {
-          ScopedTrace trace("DestroyThreadPool");
-          ScopedThreadStateChange stsc(self, kNative);
-          thread_pool_.reset();
-        }
-
-       private:
-        std::unique_ptr<ThreadPool> thread_pool_;
-      };
-      gc::TaskProcessor* const processor = Runtime::Current()->GetHeap()->GetTaskProcessor();
-      // The thread pool is already done being used since Init has finished running. Deleting the
-      // thread pool is done async since it takes a non-trivial amount of time to do.
-      if (processor != nullptr) {
-        processor->AddTask(Thread::Current(), new DeleteThreadPoolTask(std::move(thread_pool)));
-      }
-    }
     if (space != nullptr) {
       uint32_t expected_reservation_size =
           RoundUp(space->GetImageHeader().GetImageSize(), kPageSize);
@@ -779,7 +750,6 @@
                                           const char* image_location,
                                           const OatFile* oat_file,
                                           TimingLogger* logger,
-                                          std::unique_ptr<ThreadPool>* thread_pool,
                                           /*inout*/MemMap* image_reservation,
                                           /*out*/std::string* error_msg)
       REQUIRES_SHARED(Locks::mutator_lock_) {
@@ -856,18 +826,6 @@
       return nullptr;
     }
 
-    const size_t kMinBlocks = 2;
-    if (thread_pool != nullptr && image_header->GetBlockCount() >= kMinBlocks) {
-      TimingLogger::ScopedTiming timing("CreateThreadPool", logger);
-      ScopedThreadStateChange stsc(Thread::Current(), kNative);
-      constexpr size_t kStackSize = 64 * KB;
-      constexpr size_t kMaxRuntimeWorkers = 4u;
-      const size_t num_workers =
-          std::min(static_cast<size_t>(std::thread::hardware_concurrency()), kMaxRuntimeWorkers);
-      thread_pool->reset(new ThreadPool("Image", num_workers, /*create_peers=*/false, kStackSize));
-      thread_pool->get()->StartWorkers(Thread::Current());
-    }
-
     // GetImageBegin is the preferred address to map the image. If we manage to map the
     // image at the image begin, the amount of fixup work required is minimized.
     // If it is pic we will retry with error_msg for the failure case. Pass a null error_msg to
@@ -880,7 +838,6 @@
         *image_header,
         file->Fd(),
         logger,
-        thread_pool != nullptr ? thread_pool->get() : nullptr,
         image_reservation,
         error_msg);
     if (!map.IsValid()) {
@@ -971,7 +928,6 @@
                               const ImageHeader& image_header,
                               int fd,
                               TimingLogger* logger,
-                              ThreadPool* pool,
                               /*inout*/MemMap* image_reservation,
                               /*out*/std::string* error_msg) {
     TimingLogger::ScopedTiming timing("MapImageFile", logger);
@@ -1015,9 +971,12 @@
       }
       memcpy(map.Begin(), &image_header, sizeof(ImageHeader));
 
+      Runtime::ScopedThreadPoolUsage stpu;
+      ThreadPool* const pool = stpu.GetThreadPool();
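+      // The pool may be null if it has already been deleted (e.g. by a heap
+      // trim); in that case the copy below falls back to the serial path.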
       const uint64_t start = NanoTime();
       Thread* const self = Thread::Current();
-      const bool use_parallel = pool != nullptr;
+      static constexpr size_t kMinBlocks = 2u;
+      const bool use_parallel = pool != nullptr && image_header.GetBlockCount() >= kMinBlocks;
       for (const ImageHeader::Block& block : image_header.GetBlocks(temp_map.Begin())) {
         auto function = [&](Thread*) {
           const uint64_t start2 = NanoTime();
@@ -1915,7 +1874,6 @@
                         image_location.c_str(),
                         /*oat_file=*/ nullptr,
                         logger,
-                        /*thread_pool=*/ nullptr,
                         image_reservation,
                         error_msg);
   }
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index 7eac3d9..bd0e5a4 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -34,6 +34,7 @@
 #include <cstdio>
 #include <cstdlib>
 #include <limits>
+#include <thread>
 #include <vector>
 
 #include "android-base/strings.h"
@@ -233,6 +234,7 @@
       class_linker_(nullptr),
       signal_catcher_(nullptr),
       java_vm_(nullptr),
+      thread_pool_ref_count_(0u),
       fault_message_(nullptr),
       threads_being_born_(0),
       shutdown_cond_(new ConditionVariable("Runtime shutdown", *Locks::runtime_shutdown_lock_)),
@@ -348,6 +350,8 @@
         << "\n";
   }
 
+  WaitForThreadPoolWorkersToStart();
+
   if (jit_ != nullptr) {
     // Wait for the workers to be created since there can't be any threads attaching during
     // shutdown.
@@ -400,6 +404,8 @@
     // JIT compiler threads.
     jit_->DeleteThreadPool();
   }
+  DeleteThreadPool();
+  CHECK(thread_pool_ == nullptr);
 
   // Make sure our internal threads are dead before we start tearing down things they're using.
   GetRuntimeCallbacks()->StopDebugger();
@@ -930,6 +936,18 @@
 
   // Create the thread pools.
   heap_->CreateThreadPool();
+  {
+    ScopedTrace timing("CreateThreadPool");
+    constexpr size_t kStackSize = 64 * KB;
+    constexpr size_t kMaxRuntimeWorkers = 4u;
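+    // Use at most kMaxRuntimeWorkers; hardware_concurrency() may report fewer
+    // cores, or zero when the value is not computable.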
+    const size_t num_workers =
+        std::min(static_cast<size_t>(std::thread::hardware_concurrency()), kMaxRuntimeWorkers);
+    MutexLock mu(Thread::Current(), *Locks::runtime_thread_pool_lock_);
+    CHECK(thread_pool_ == nullptr);
+    thread_pool_.reset(new ThreadPool("Runtime", num_workers, /*create_peers=*/false, kStackSize));
+    thread_pool_->StartWorkers(Thread::Current());
+  }
+
   // Reset the gc performance data at zygote fork so that the GCs
   // before fork aren't attributed to an app.
   heap_->ResetGcPerformanceInfo();
@@ -2658,4 +2676,45 @@
     GetClassLinker()->VisitClasses(&visitor);
   }
 }
+
+Runtime::ScopedThreadPoolUsage::ScopedThreadPoolUsage()
+    : thread_pool_(Runtime::Current()->AcquireThreadPool()) {}
+
+Runtime::ScopedThreadPoolUsage::~ScopedThreadPoolUsage() {
+  Runtime::Current()->ReleaseThreadPool();
+}
+
+bool Runtime::DeleteThreadPool() {
+  // Make sure the workers have finished starting to prevent thread shutdown errors.
+  WaitForThreadPoolWorkersToStart();
+  std::unique_ptr<ThreadPool> thread_pool;
+  {
+    MutexLock mu(Thread::Current(), *Locks::runtime_thread_pool_lock_);
+    if (thread_pool_ref_count_ == 0) {
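+      // No active ScopedThreadPoolUsage; take ownership so that the pool is
+      // destroyed outside of the lock when thread_pool goes out of scope.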
+      thread_pool = std::move(thread_pool_);
+    }
+  }
+  return thread_pool != nullptr;
+}
+
+ThreadPool* Runtime::AcquireThreadPool() {
+  MutexLock mu(Thread::Current(), *Locks::runtime_thread_pool_lock_);
+  ++thread_pool_ref_count_;
+  return thread_pool_.get();
+}
+
+void Runtime::ReleaseThreadPool() {
+  MutexLock mu(Thread::Current(), *Locks::runtime_thread_pool_lock_);
+  CHECK_GT(thread_pool_ref_count_, 0u);
+  --thread_pool_ref_count_;
+}
+
+void Runtime::WaitForThreadPoolWorkersToStart() {
+  // Need to make sure workers are created before deleting the pool.
+  ScopedThreadPoolUsage stpu;
+  if (stpu.GetThreadPool() != nullptr) {
+    stpu.GetThreadPool()->WaitForWorkersToBeCreated();
+  }
+}
+
 }  // namespace art
diff --git a/runtime/runtime.h b/runtime/runtime.h
index 76cfcd1..a2d519d 100644
--- a/runtime/runtime.h
+++ b/runtime/runtime.h
@@ -792,6 +792,28 @@
     return verifier_logging_threshold_ms_;
   }
 
+  // Atomically delete the thread pool if the reference count is 0.
+  bool DeleteThreadPool() REQUIRES(!Locks::runtime_thread_pool_lock_);
+
+  // Wait for all of the thread pool workers to be attached.
+  void WaitForThreadPoolWorkersToStart() REQUIRES(!Locks::runtime_thread_pool_lock_);
+
+  // Scoped usage of the runtime thread pool. Prevents the pool from being
+  // deleted while in use. Note that the pool is only used during startup and
+  // is deleted once startup completes.
+  class ScopedThreadPoolUsage {
+   public:
+    ScopedThreadPoolUsage();
+    ~ScopedThreadPoolUsage();
+
+    // Return the thread pool, or null if the pool has been deleted (or not created yet).
+    ThreadPool* GetThreadPool() const {
+      return thread_pool_;
+    }
+
+   private:
+    ThreadPool* const thread_pool_;
+  };
+
  private:
   static void InitPlatformSignalHandlers();
 
@@ -828,6 +850,9 @@
   //       friend).
   std::string GetFaultMessage();
 
+  ThreadPool* AcquireThreadPool() REQUIRES(!Locks::runtime_thread_pool_lock_);
+  void ReleaseThreadPool() REQUIRES(!Locks::runtime_thread_pool_lock_);
+
   // A pointer to the active runtime or null.
   static Runtime* instance_;
 
@@ -911,6 +936,10 @@
   std::unique_ptr<jit::JitCodeCache> jit_code_cache_;
   std::unique_ptr<jit::JitOptions> jit_options_;
 
+  // Runtime thread pool. Used only during startup; deleted once startup completes.
+  std::unique_ptr<ThreadPool> thread_pool_ GUARDED_BY(Locks::runtime_thread_pool_lock_);
+  size_t thread_pool_ref_count_ GUARDED_BY(Locks::runtime_thread_pool_lock_);
+
   // Fault message, printed when we get a SIGSEGV. Stored as a native-heap object and accessed
   // lock-free, so needs to be atomic.
   std::atomic<std::string*> fault_message_;
@@ -1115,6 +1144,7 @@
 
   // Note: See comments on GetFaultMessage.
   friend std::string GetFaultMessageForAbortLogging();
+  friend class ScopedThreadPoolUsage;
 
   DISALLOW_COPY_AND_ASSIGN(Runtime);
 };
diff --git a/runtime/runtime_callbacks_test.cc b/runtime/runtime_callbacks_test.cc
index f2e5012..d08be72 100644
--- a/runtime/runtime_callbacks_test.cc
+++ b/runtime/runtime_callbacks_test.cc
@@ -147,6 +147,8 @@
   self->TransitionFromSuspendedToRunnable();
   bool started = runtime_->Start();
   ASSERT_TRUE(started);
+  // Make sure the workers are done starting so we don't get callbacks for them.
+  runtime_->WaitForThreadPoolWorkersToStart();
 
   cb_.state = CallbackState::kBase;  // Ignore main thread attach.
 
diff --git a/test/1919-vminit-thread-start-timing/src/art/Test1919.java b/test/1919-vminit-thread-start-timing/src/art/Test1919.java
index 3d5c079..f6b770f 100644
--- a/test/1919-vminit-thread-start-timing/src/art/Test1919.java
+++ b/test/1919-vminit-thread-start-timing/src/art/Test1919.java
@@ -21,10 +21,12 @@
 
   public static void run() {
     for (Event e : getEvents()) {
-      if (PRINT_ALL_THREADS ||
-          e.thr.equals(Thread.currentThread()) ||
-          e.thr.getName().equals("JVMTI_THREAD-Test1919")) {
-        System.out.println(e.name + ": " + e.thr.getName());
+      if (e.thr != null) {
+        if (PRINT_ALL_THREADS ||
+            e.thr.equals(Thread.currentThread()) ||
+            e.thr.getName().equals("JVMTI_THREAD-Test1919")) {
+          System.out.println(e.name + ": " + e.thr.getName());
+        }
       }
     }
   }