Parallel image decompression

Add a runtime thread pool to facilitate parallel app image loading.

Use the thread pool to decompress the image, this results in a ~1%
app startup speedup.

Test: test-art-host
Test: manual
Bug: 116052292

Change-Id: If35f71ff632ac58e67d11eed4b5f5b19656cc301
diff --git a/openjdkjvmti/ti_thread.cc b/openjdkjvmti/ti_thread.cc
index 2131120..051db4c 100644
--- a/openjdkjvmti/ti_thread.cc
+++ b/openjdkjvmti/ti_thread.cc
@@ -91,7 +91,8 @@
         self->GetThreadName(name);
         if (name != "JDWP" &&
             name != "Signal Catcher" &&
-            !android::base::StartsWith(name, "Jit thread pool")) {
+            !android::base::StartsWith(name, "Jit thread pool") &&
+            !android::base::StartsWith(name, "Runtime worker thread")) {
           LOG(FATAL) << "Unexpected thread before start: " << name << " id: "
                      << self->GetThreadId();
         }
diff --git a/runtime/gc/space/image_space.cc b/runtime/gc/space/image_space.cc
index c772bda..e494bd6 100644
--- a/runtime/gc/space/image_space.cc
+++ b/runtime/gc/space/image_space.cc
@@ -627,16 +627,36 @@
         return MemMap::Invalid();
       }
       memcpy(map.Begin(), &image_header, sizeof(ImageHeader));
+
       const uint64_t start = NanoTime();
+      ThreadPool* pool = Runtime::Current()->GetThreadPool();
+      Thread* const self = Thread::Current();
+      const size_t kMinBlocks = 2;
+      const bool use_parallel = pool != nullptr &&image_header.GetBlockCount() >= kMinBlocks;
       for (const ImageHeader::Block& block : image_header.GetBlocks(temp_map.Begin())) {
-        TimingLogger::ScopedTiming timing2("LZ4 decompress image", logger);
-        if (!block.Decompress(/*out_ptr=*/map.Begin(), /*in_ptr=*/temp_map.Begin(), error_msg)) {
-          if (error_msg != nullptr) {
-            *error_msg = "Failed to decompress image block " + *error_msg;
+        auto function = [&](Thread*) {
+          const uint64_t start2 = NanoTime();
+          ScopedTrace trace("LZ4 decompress block");
+          if (!block.Decompress(/*out_ptr=*/map.Begin(),
+                                /*in_ptr=*/temp_map.Begin(),
+                                error_msg)) {
+            if (error_msg != nullptr) {
+              *error_msg = "Failed to decompress image block " + *error_msg;
+            }
           }
-          return MemMap::Invalid();
+          VLOG(image) << "Decompress block " << block.GetDataSize() << " -> "
+                      << block.GetImageSize() << " in " << PrettyDuration(NanoTime() - start2);
+        };
+        if (use_parallel) {
+          pool->AddTask(self, new FunctionTask(std::move(function)));
+        } else {
+          function(self);
         }
       }
+      if (use_parallel) {
+        ScopedTrace trace("Waiting for workers");
+        pool->Wait(self, true, false);
+      }
       const uint64_t time = NanoTime() - start;
       // Add one 1 ns to prevent possible divide by 0.
       VLOG(image) << "Decompressing image took " << PrettyDuration(time) << " ("
diff --git a/runtime/image.h b/runtime/image.h
index 76fb3b7..9d98431 100644
--- a/runtime/image.h
+++ b/runtime/image.h
@@ -116,6 +116,14 @@
       return storage_mode_;
     }
 
+    uint32_t GetDataSize() const {
+      return data_size_;
+    }
+
+    uint32_t GetImageSize() const {
+      return image_size_;
+    }
+
    private:
     // Storage method for the image, the image may be compressed.
     StorageMode storage_mode_ = kDefaultStorageMode;
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index ab79b9e..0bf5967 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -34,6 +34,7 @@
 #include <cstdio>
 #include <cstdlib>
 #include <limits>
+#include <thread>
 #include <vector>
 
 #include "android-base/strings.h"
@@ -388,6 +389,11 @@
     jit_->DeleteThreadPool();
   }
 
+  // Thread pools must be deleted before the runtime shuts down to avoid hanging.
+  if (thread_pool_ != nullptr) {
+    thread_pool_.reset();
+  }
+
   // Make sure our internal threads are dead before we start tearing down things they're using.
   GetRuntimeCallbacks()->StopDebugger();
   delete signal_catcher_;
@@ -910,6 +916,14 @@
     jit_->CreateThreadPool();
   }
 
+  if (thread_pool_ == nullptr) {
+    constexpr size_t kMaxRuntimeThreads = 4u;
+    thread_pool_.reset(
+        new ThreadPool("Runtime", std::min(
+            static_cast<size_t>(std::thread::hardware_concurrency()), kMaxRuntimeThreads)));
+    thread_pool_->StartWorkers(Thread::Current());
+  }
+
   // Create the thread pools.
   heap_->CreateThreadPool();
   // Reset the gc performance data at zygote fork so that the GCs
diff --git a/runtime/runtime.h b/runtime/runtime.h
index 0ccc7b7..b8bc10d 100644
--- a/runtime/runtime.h
+++ b/runtime/runtime.h
@@ -99,6 +99,7 @@
 class StackOverflowHandler;
 class SuspensionHandler;
 class ThreadList;
+class ThreadPool;
 class Trace;
 struct TraceConfig;
 class Transaction;
@@ -789,6 +790,10 @@
     return verifier_logging_threshold_ms_;
   }
 
+  ThreadPool* GetThreadPool() {
+    return thread_pool_.get();
+  }
+
  private:
   static void InitPlatformSignalHandlers();
 
@@ -882,6 +887,9 @@
   // Shared linear alloc for now.
   std::unique_ptr<LinearAlloc> linear_alloc_;
 
+  // Thread pool
+  std::unique_ptr<ThreadPool> thread_pool_;
+
   // The number of spins that are done before thread suspension is used to forcibly inflate.
   size_t max_spins_before_thin_lock_inflation_;
   MonitorList* monitor_list_;
diff --git a/runtime/thread_pool.h b/runtime/thread_pool.h
index 98a1193..ca03bb6 100644
--- a/runtime/thread_pool.h
+++ b/runtime/thread_pool.h
@@ -18,6 +18,7 @@
 #define ART_RUNTIME_THREAD_POOL_H_
 
 #include <deque>
+#include <functional>
 #include <vector>
 
 #include "barrier.h"
@@ -48,6 +49,18 @@
   }
 };
 
+class FunctionTask : public SelfDeletingTask {
+ public:
+  explicit FunctionTask(std::function<void(Thread*)>&& func) : func_(std::move(func)) {}
+
+  void Run(Thread* self) override {
+    func_(self);
+  }
+
+ private:
+  std::function<void(Thread*)> func_;
+};
+
 class ThreadPoolWorker {
  public:
   static const size_t kDefaultStackSize = 1 * MB;