Avoid waiting for threads to be created

Avoid blocking in the ThreadPool constructor until the worker threads
are created. This is safe because the destructor still blocks until
the threads have joined.

Also avoid creating the thread pool when the image has only one block,
since a single block leaves no work to parallelize.

Get the GC task processor to delete the thread pool asynchronously.
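
For reference, the new Barrier handshake, simplified from the diff
below (the comments are illustrative, not part of the change):

  // ThreadPool::CreateThreads: count the workers, but do not wait.
  creation_barier_.Init(self, max_active_workers_);

  // ThreadPoolWorker::Run: each worker checks in once it has attached.
  creation_barier_.Pass(self);

  // ThreadPool::WaitForWorkersToBeCreated: block until the count
  // reaches zero without changing it.
  creation_barier_.Increment(Thread::Current(), 0);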

Bug: 116052292
Test: test-art-host
Change-Id: I80399525caa0775eddade73c11e7ebc06e41416a
diff --git a/runtime/gc/space/image_space.cc b/runtime/gc/space/image_space.cc
index 5045888..6c55450 100644
--- a/runtime/gc/space/image_space.cc
+++ b/runtime/gc/space/image_space.cc
@@ -44,6 +44,7 @@
 #include "dex/dex_file_loader.h"
 #include "exec_utils.h"
 #include "gc/accounting/space_bitmap-inl.h"
+#include "gc/task_processor.h"
 #include "image-inl.h"
 #include "image_space_fs.h"
 #include "intern_table-inl.h"
@@ -682,30 +683,41 @@
       REQUIRES_SHARED(Locks::mutator_lock_) {
     TimingLogger logger(__PRETTY_FUNCTION__, /*precise=*/ true, VLOG_IS_ON(image));
 
-    const bool create_thread_pool = true;
     std::unique_ptr<ThreadPool> thread_pool;
-    if (create_thread_pool) {
-      TimingLogger::ScopedTiming timing("CreateThreadPool", &logger);
-      ScopedThreadStateChange stsc(Thread::Current(), kNative);
-      constexpr size_t kStackSize = 64 * KB;
-      constexpr size_t kMaxRuntimeWorkers = 4u;
-      const size_t num_workers =
-          std::min(static_cast<size_t>(std::thread::hardware_concurrency()), kMaxRuntimeWorkers);
-      thread_pool.reset(new ThreadPool("Runtime", num_workers, /*create_peers=*/false, kStackSize));
-      thread_pool->StartWorkers(Thread::Current());
-    }
-
     std::unique_ptr<ImageSpace> space = Init(image_filename,
                                              image_location,
                                              oat_file,
                                              &logger,
-                                             thread_pool.get(),
+                                             &thread_pool,
                                              image_reservation,
                                              error_msg);
     if (thread_pool != nullptr) {
-      TimingLogger::ScopedTiming timing("CreateThreadPool", &logger);
-      ScopedThreadStateChange stsc(Thread::Current(), kNative);
-      thread_pool.reset();
+      // Delay the thread pool deletion so that it does not slow down startup by causing
+      // preemption. TODO: Just do this in heap trim.
+      static constexpr uint64_t kThreadPoolDeleteDelay = MsToNs(5000);
+
+      class DeleteThreadPoolTask : public HeapTask {
+       public:
+        explicit DeleteThreadPoolTask(std::unique_ptr<ThreadPool>&& thread_pool)
+            : HeapTask(NanoTime() + kThreadPoolDeleteDelay), thread_pool_(std::move(thread_pool)) {}
+
+        void Run(Thread* self) override {
+          ScopedTrace trace("DestroyThreadPool");
+          ScopedThreadStateChange stsc(self, kNative);
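+          // Deleting the pool joins its workers; kNative keeps the join from blocking suspension.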
+          thread_pool_.reset();
+        }
+
+       private:
+        std::unique_ptr<ThreadPool> thread_pool_;
+      };
+      gc::TaskProcessor* const processor = Runtime::Current()->GetHeap()->GetTaskProcessor();
+      // Init has finished, so the thread pool is no longer in use. Delete it asynchronously,
+      // since the deletion takes a non-trivial amount of time. If the task processor does not
+      // exist yet, the pool is instead deleted synchronously when it goes out of scope below.
+      if (processor != nullptr) {
+        processor->AddTask(Thread::Current(), new DeleteThreadPoolTask(std::move(thread_pool)));
+      }
     }
     if (space != nullptr) {
       uint32_t expected_reservation_size =
@@ -767,7 +777,7 @@
                                           const char* image_location,
                                           const OatFile* oat_file,
                                           TimingLogger* logger,
-                                          ThreadPool* thread_pool,
+                                          std::unique_ptr<ThreadPool>* thread_pool,
                                           /*inout*/MemMap* image_reservation,
                                           /*out*/std::string* error_msg)
       REQUIRES_SHARED(Locks::mutator_lock_) {
@@ -844,6 +854,20 @@
       return nullptr;
     }
 
+    constexpr size_t kMinBlocks = 2;
+    if (thread_pool != nullptr && image_header->GetBlockCount() >= kMinBlocks) {
+      TimingLogger::ScopedTiming timing("CreateThreadPool", logger);
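+      // Thread creation can block, so enter kNative to avoid delaying suspension requests.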
+      ScopedThreadStateChange stsc(Thread::Current(), kNative);
+      constexpr size_t kStackSize = 64 * KB;
+      constexpr size_t kMaxRuntimeWorkers = 4u;
+      const size_t num_workers =
+          std::min(static_cast<size_t>(std::thread::hardware_concurrency()), kMaxRuntimeWorkers);
+      thread_pool->reset(new ThreadPool("Image", num_workers, /*create_peers=*/false, kStackSize));
+      thread_pool->get()->StartWorkers(Thread::Current());
+    }
+
     // GetImageBegin is the preferred address to map the image. If we manage to map the
     // image at the image begin, the amount of fixup work required is minimized.
     // If it is pic we will retry with error_msg for the failure case. Pass a null error_msg to
@@ -856,7 +878,7 @@
         *image_header,
         file->Fd(),
         logger,
-        thread_pool,
+        thread_pool != nullptr ? thread_pool->get() : nullptr,
         image_reservation,
         error_msg);
     if (!map.IsValid()) {
@@ -993,8 +1015,7 @@
 
       const uint64_t start = NanoTime();
       Thread* const self = Thread::Current();
-      const size_t kMinBlocks = 2;
-      const bool use_parallel = pool != nullptr && image_header.GetBlockCount() >= kMinBlocks;
+      const bool use_parallel = pool != nullptr;
       for (const ImageHeader::Block& block : image_header.GetBlocks(temp_map.Begin())) {
         auto function = [&](Thread*) {
           const uint64_t start2 = NanoTime();
diff --git a/runtime/jit/jit.cc b/runtime/jit/jit.cc
index e43d771..de4dd82 100644
--- a/runtime/jit/jit.cc
+++ b/runtime/jit/jit.cc
@@ -291,6 +291,12 @@
   return success;
 }
 
+void Jit::WaitForWorkersToBeCreated() {
+  if (thread_pool_ != nullptr) {
+    thread_pool_->WaitForWorkersToBeCreated();
+  }
+}
+
 void Jit::DeleteThreadPool() {
   Thread* self = Thread::Current();
   DCHECK(Runtime::Current()->IsShuttingDown(self));
diff --git a/runtime/jit/jit.h b/runtime/jit/jit.h
index 7ce5f07..1cfbb9c 100644
--- a/runtime/jit/jit.h
+++ b/runtime/jit/jit.h
@@ -174,6 +174,8 @@
 
   void CreateThreadPool();
   void DeleteThreadPool();
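+  // Blocks until all JIT worker threads have been created and attached.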
+  void WaitForWorkersToBeCreated();
 
   // Dump interesting info: #methods compiled, code vs data size, compile / verify cumulative
   // loggers.
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index d2c915e..a40ffbd 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -349,6 +349,9 @@
   }
 
   if (jit_ != nullptr) {
+    // Wait for the workers to be created, since threads cannot attach while the runtime
+    // is shutting down.
+    jit_->WaitForWorkersToBeCreated();
     // Stop the profile saver thread before marking the runtime as shutting down.
     // The saver will try to dump the profiles before being stopped and that
     // requires holding the mutator lock.
diff --git a/runtime/thread_pool.cc b/runtime/thread_pool.cc
index de698c2..e1c756d 100644
--- a/runtime/thread_pool.cc
+++ b/runtime/thread_pool.cc
@@ -86,7 +87,7 @@
 void ThreadPoolWorker::Run() {
   Thread* self = Thread::Current();
   Task* task = nullptr;
-  thread_pool_->creation_barier_.Wait(self);
+  thread_pool_->creation_barier_.Pass(self);
   while ((task = thread_pool_->GetTask(self)) != nullptr) {
     task->Run(self);
     task->Finalize();
@@ -150,7 +151,7 @@
     MutexLock mu(self, task_queue_lock_);
     shutting_down_ = false;
-    // Add one since the caller of constructor waits on the barrier too.
-    creation_barier_.Init(self, max_active_workers_ + 1);
+    // Count only the workers; the creating thread no longer waits on the barrier here.
+    creation_barier_.Init(self, max_active_workers_);
     while (GetThreadCount() < max_active_workers_) {
       const std::string worker_name = StringPrintf("%s worker thread %zu", name_.c_str(),
                                                    GetThreadCount());
@@ -158,8 +159,18 @@
           new ThreadPoolWorker(this, worker_name, worker_stack_size_));
     }
   }
-  // Wait for all of the threads to attach.
-  creation_barier_.Wait(Thread::Current());
+}
+
+void ThreadPool::WaitForWorkersToBeCreated() {
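+  // Increment(self, 0) leaves the count unchanged and blocks until it reaches zero,
+  // i.e. until every worker has called Pass() above.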
+  creation_barier_.Increment(Thread::Current(), 0);
+}
+
+const std::vector<ThreadPoolWorker*>& ThreadPool::GetWorkers() {
+  // Wait for all the workers to be created before returning them.
+  WaitForWorkersToBeCreated();
+  return threads_;
 }
 
 void ThreadPool::DeleteThreads() {
diff --git a/runtime/thread_pool.h b/runtime/thread_pool.h
index f55d72e..0a2a50c 100644
--- a/runtime/thread_pool.h
+++ b/runtime/thread_pool.h
@@ -101,9 +101,7 @@
     return threads_.size();
   }
 
-  const std::vector<ThreadPoolWorker*>& GetWorkers() const {
-    return threads_;
-  }
+  const std::vector<ThreadPoolWorker*>& GetWorkers();
 
   // Broadcast to the workers and tell them to empty out the work queue.
   void StartWorkers(Thread* self) REQUIRES(!task_queue_lock_);
@@ -154,6 +152,9 @@
   // Set the "nice" priority for threads in the pool.
   void SetPthreadPriority(int priority);
 
+  // Wait for workers to be created.
+  void WaitForWorkersToBeCreated();
+
  protected:
   // Get a task to run; blocks if there are no tasks left.
   virtual Task* GetTask(Thread* self) REQUIRES(!task_queue_lock_);