Parallel image decompression
Add a runtime thread pool to facilitate parallel app image loading.
Use the thread pool to decompress the image; this results in a ~1%
app startup speedup.
Test: test-art-host
Test: manual
Bug: 116052292
Change-Id: If35f71ff632ac58e67d11eed4b5f5b19656cc301
diff --git a/openjdkjvmti/ti_thread.cc b/openjdkjvmti/ti_thread.cc
index 2131120..051db4c 100644
--- a/openjdkjvmti/ti_thread.cc
+++ b/openjdkjvmti/ti_thread.cc
@@ -91,7 +91,8 @@
self->GetThreadName(name);
if (name != "JDWP" &&
name != "Signal Catcher" &&
- !android::base::StartsWith(name, "Jit thread pool")) {
+ !android::base::StartsWith(name, "Jit thread pool") &&
+ !android::base::StartsWith(name, "Runtime worker thread")) {
LOG(FATAL) << "Unexpected thread before start: " << name << " id: "
<< self->GetThreadId();
}
diff --git a/runtime/gc/space/image_space.cc b/runtime/gc/space/image_space.cc
index c772bda..e494bd6 100644
--- a/runtime/gc/space/image_space.cc
+++ b/runtime/gc/space/image_space.cc
@@ -627,16 +627,36 @@
return MemMap::Invalid();
}
memcpy(map.Begin(), &image_header, sizeof(ImageHeader));
+
const uint64_t start = NanoTime();
+ ThreadPool* pool = Runtime::Current()->GetThreadPool();
+ Thread* const self = Thread::Current();
+ const size_t kMinBlocks = 2;
+  const bool use_parallel = pool != nullptr && image_header.GetBlockCount() >= kMinBlocks;
for (const ImageHeader::Block& block : image_header.GetBlocks(temp_map.Begin())) {
- TimingLogger::ScopedTiming timing2("LZ4 decompress image", logger);
- if (!block.Decompress(/*out_ptr=*/map.Begin(), /*in_ptr=*/temp_map.Begin(), error_msg)) {
- if (error_msg != nullptr) {
- *error_msg = "Failed to decompress image block " + *error_msg;
+ auto function = [&](Thread*) {
+ const uint64_t start2 = NanoTime();
+ ScopedTrace trace("LZ4 decompress block");
+ if (!block.Decompress(/*out_ptr=*/map.Begin(),
+ /*in_ptr=*/temp_map.Begin(),
+ error_msg)) {
+ if (error_msg != nullptr) {
+ *error_msg = "Failed to decompress image block " + *error_msg;
+ }
}
- return MemMap::Invalid();
+ VLOG(image) << "Decompress block " << block.GetDataSize() << " -> "
+ << block.GetImageSize() << " in " << PrettyDuration(NanoTime() - start2);
+ };
+ if (use_parallel) {
+ pool->AddTask(self, new FunctionTask(std::move(function)));
+ } else {
+ function(self);
}
}
+ if (use_parallel) {
+ ScopedTrace trace("Waiting for workers");
+ pool->Wait(self, true, false);
+ }
const uint64_t time = NanoTime() - start;
  // Add 1 ns to prevent a possible divide by 0.
VLOG(image) << "Decompressing image took " << PrettyDuration(time) << " ("
diff --git a/runtime/image.h b/runtime/image.h
index 76fb3b7..9d98431 100644
--- a/runtime/image.h
+++ b/runtime/image.h
@@ -116,6 +116,14 @@
return storage_mode_;
}
+ uint32_t GetDataSize() const {
+ return data_size_;
+ }
+
+ uint32_t GetImageSize() const {
+ return image_size_;
+ }
+
private:
// Storage method for the image, the image may be compressed.
StorageMode storage_mode_ = kDefaultStorageMode;
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index ab79b9e..0bf5967 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -34,6 +34,7 @@
#include <cstdio>
#include <cstdlib>
#include <limits>
+#include <thread>
#include <vector>
#include "android-base/strings.h"
@@ -388,6 +389,11 @@
jit_->DeleteThreadPool();
}
+ // Thread pools must be deleted before the runtime shuts down to avoid hanging.
+ if (thread_pool_ != nullptr) {
+ thread_pool_.reset();
+ }
+
// Make sure our internal threads are dead before we start tearing down things they're using.
GetRuntimeCallbacks()->StopDebugger();
delete signal_catcher_;
@@ -910,6 +916,14 @@
jit_->CreateThreadPool();
}
+ if (thread_pool_ == nullptr) {
+ constexpr size_t kMaxRuntimeThreads = 4u;
+ thread_pool_.reset(
+ new ThreadPool("Runtime", std::min(
+ static_cast<size_t>(std::thread::hardware_concurrency()), kMaxRuntimeThreads)));
+ thread_pool_->StartWorkers(Thread::Current());
+ }
+
// Create the thread pools.
heap_->CreateThreadPool();
// Reset the gc performance data at zygote fork so that the GCs
diff --git a/runtime/runtime.h b/runtime/runtime.h
index 0ccc7b7..b8bc10d 100644
--- a/runtime/runtime.h
+++ b/runtime/runtime.h
@@ -99,6 +99,7 @@
class StackOverflowHandler;
class SuspensionHandler;
class ThreadList;
+class ThreadPool;
class Trace;
struct TraceConfig;
class Transaction;
@@ -789,6 +790,10 @@
return verifier_logging_threshold_ms_;
}
+ ThreadPool* GetThreadPool() {
+ return thread_pool_.get();
+ }
+
private:
static void InitPlatformSignalHandlers();
@@ -882,6 +887,9 @@
// Shared linear alloc for now.
std::unique_ptr<LinearAlloc> linear_alloc_;
+  // Thread pool used by the runtime for internal parallel tasks (e.g. app image decompression).
+ std::unique_ptr<ThreadPool> thread_pool_;
+
// The number of spins that are done before thread suspension is used to forcibly inflate.
size_t max_spins_before_thin_lock_inflation_;
MonitorList* monitor_list_;
diff --git a/runtime/thread_pool.h b/runtime/thread_pool.h
index 98a1193..ca03bb6 100644
--- a/runtime/thread_pool.h
+++ b/runtime/thread_pool.h
@@ -18,6 +18,7 @@
#define ART_RUNTIME_THREAD_POOL_H_
#include <deque>
+#include <functional>
#include <vector>
#include "barrier.h"
@@ -48,6 +49,18 @@
}
};
+class FunctionTask : public SelfDeletingTask {
+ public:
+ explicit FunctionTask(std::function<void(Thread*)>&& func) : func_(std::move(func)) {}
+
+ void Run(Thread* self) override {
+ func_(self);
+ }
+
+ private:
+ std::function<void(Thread*)> func_;
+};
+
class ThreadPoolWorker {
public:
static const size_t kDefaultStackSize = 1 * MB;