Add a HealthMonitor that detects hanging tasks.

Introduced watchdogs on SyncThread and RenderThread operations. For
more detail, see go/gfxstream-health-monitor

Test: Unit testing. Built and ran emulator
Bug: 232819676
Change-Id: I83db70d2d1efba999f0a4b92c4bc4b67b78ed70c
diff --git a/base/Android.bp b/base/Android.bp
index aa07214..289ac4b 100644
--- a/base/Android.bp
+++ b/base/Android.bp
@@ -23,6 +23,7 @@
         "FileUtils.cpp",
         "FunctorThread.cpp",
         "GLObjectCounter.cpp",
+        "HealthMonitor.cpp",
         "LayoutResolver.cpp",
         "MemStream.cpp",
         "StdioStream.cpp",
diff --git a/base/CMakeLists.txt b/base/CMakeLists.txt
index 40d3800..611f1f2 100644
--- a/base/CMakeLists.txt
+++ b/base/CMakeLists.txt
@@ -73,7 +73,11 @@
         ${gfxstream-base-platform-deps})
 
     target_include_directories(
-        gfxstream-base PUBLIC ${GFXSTREAM_REPO_ROOT})
+        gfxstream-base
+        PRIVATE
+        ${GFXSTREAM_REPO_ROOT}/include
+        PUBLIC
+        ${GFXSTREAM_REPO_ROOT})
     if (NOT MSVC)
         target_compile_options(
             gfxstream-base PRIVATE -fvisibility=default)
@@ -110,6 +114,7 @@
     add_executable(
         gfxstream-base_unittests
         AlignedBuf_unittest.cpp
+        HealthMonitor_unittest.cpp
         ArraySize_unittest.cpp
         LayoutResolver_unittest.cpp
         LruCache_unittest.cpp
diff --git a/base/HealthMonitor.cpp b/base/HealthMonitor.cpp
new file mode 100644
index 0000000..44c8481
--- /dev/null
+++ b/base/HealthMonitor.cpp
@@ -0,0 +1,237 @@
+/*

+ * Copyright (C) 2022 The Android Open Source Project

+ *

+ * Licensed under the Apache License, Version 2.0 (the "License");

+ * you may not use this file except in compliance with the License.

+ * You may obtain a copy of the License at

+ *

+ * http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+#include "HealthMonitor.h"

+

+#include <map>

+

+#include "base/System.h"

+#include "base/testing/TestClock.h"

+#include "host-common/logging.h"

+#include "host-common/GfxstreamFatalError.h"

+

+namespace emugl {

+

+using android::base::AutoLock;

+using android::base::MetricEventHang;

+using android::base::MetricEventUnHang;

+using android::base::TestClock;

+using std::chrono::duration_cast;

+using emugl::ABORT_REASON_OTHER;

+using emugl::FatalError;

+

+// Overload-set helper for std::visit: it inherits the call operator of every
+// callable it is constructed from, so one visitor object can handle each
+// alternative of a variant with its own lambda.
+template <class... Handlers>
+struct MonitoredEventVisitor : Handlers... {
+    using Handlers::operator()...;
+};
+// Deduction guide: lets `MonitoredEventVisitor{lambda1, lambda2, ...}` infer
+// the handler types from the constructor arguments.
+template <class... Handlers>
+MonitoredEventVisitor(Handlers...) -> MonitoredEventVisitor<Handlers...>;

+

+// Stores the heartbeat interval (given in milliseconds) and the logger, then
+// launches the monitor thread via start().
+template <class Clock>
+HealthMonitor<Clock>::HealthMonitor(MetricsLogger& metricsLogger, uint64_t heartbeatInterval)
+    : mInterval(Duration(std::chrono::milliseconds(heartbeatInterval))),
+      mLogger(metricsLogger) {
+    this->start();
+}

+

+// Shuts the monitor down: queues an EndMonitoring event, wakes the worker with
+// poll() so the event is picked up without waiting for the next heartbeat, and
+// then blocks in wait() until the thread is done.
+template <class Clock>
+HealthMonitor<Clock>::~HealthMonitor() {
+    auto shutdownRequest =
+        std::make_unique<MonitoredEvent>(typename MonitoredEventType::EndMonitoring{});
+    {
+        // The queue is shared with the worker thread; only touch it under mLock.
+        AutoLock lock(mLock);
+        mEventQueue.push(std::move(shutdownRequest));
+    }
+    // The returned future is intentionally dropped; wait() below is what
+    // synchronizes with the worker.
+    poll();
+    wait();
+}

+

+// Registers a new task with the monitor and returns the id used for later
+// touch/stop calls.
+// `metadata` describes the watchdog and is forwarded to the metrics logger.
+// `timeout` is in milliseconds; if it is below the heartbeat interval it is
+// raised to twice the heartbeat interval and a warning is logged.
+template <class Clock>
+typename HealthMonitor<Clock>::Id HealthMonitor<Clock>::startMonitoringTask(
+    std::unique_ptr<EventHangMetadata> metadata, uint64_t timeout) {
+    // Keep the interval unsigned so the comparison with `timeout` does not mix
+    // signed and unsigned operands.
+    const auto intervalMs =
+        static_cast<uint64_t>(duration_cast<std::chrono::milliseconds>(mInterval).count());
+    if (timeout < intervalMs) {
+        // 64-bit values must not be passed to %d; cast explicitly and use a
+        // conversion specifier that matches the argument type.
+        WARN("Timeout value %llu is too low (heartbeat is every %llu). Increasing to %llu",
+             static_cast<unsigned long long>(timeout),
+             static_cast<unsigned long long>(intervalMs),
+             static_cast<unsigned long long>(intervalMs * 2));
+        timeout = intervalMs * 2;
+    }
+
+    // mNextId and the event queue are both guarded by mLock.
+    AutoLock lock(mLock);
+    auto id = mNextId++;
+    auto event = std::make_unique<MonitoredEvent>(typename MonitoredEventType::Start{
+        .id = id,
+        .metadata = std::move(metadata),
+        .timeOccurred = Clock::now(),
+        .timeoutThreshold = Duration(std::chrono::milliseconds(timeout))});
+    mEventQueue.push(std::move(event));
+    return id;
+}

+

+// Queues a Touch event for task `id`, stamped with the current clock time.
+// The event is applied by the monitor thread the next time it drains the queue.
+template <class Clock>
+void HealthMonitor<Clock>::touchMonitoredTask(Id id) {
+    // Timestamp and allocate before taking the lock to keep the critical section short.
+    typename MonitoredEventType::Touch touch{.id = id, .timeOccurred = Clock::now()};
+    auto queued = std::make_unique<MonitoredEvent>(std::move(touch));
+
+    AutoLock lock(mLock);
+    mEventQueue.push(std::move(queued));
+}

+

+// Queues a Stop event for task `id`, stamped with the current clock time.
+// The task is dropped by the monitor thread once it processes the event.
+template <class Clock>
+void HealthMonitor<Clock>::stopMonitoringTask(Id id) {
+    // Timestamp and allocate before taking the lock to keep the critical section short.
+    typename MonitoredEventType::Stop stop{.id = id, .timeOccurred = Clock::now()};
+    auto queued = std::make_unique<MonitoredEvent>(std::move(stop));
+
+    AutoLock lock(mLock);
+    mEventQueue.push(std::move(queued));
+}

+

+// Queues a Poll event and signals the condition variable so the monitor thread
+// wakes up. The returned future becomes ready once the monitor thread has
+// finished the health check in which the Poll event was consumed.
+template <class Clock>
+std::future<void> HealthMonitor<Clock>::poll() {
+    // Grab the future before the promise is moved into the queued event.
+    typename MonitoredEventType::Poll request;
+    std::future<void> done = request.complete.get_future();
+    auto queued = std::make_unique<MonitoredEvent>(std::move(request));
+
+    AutoLock lock(mLock);
+    mEventQueue.push(std::move(queued));
+    mCv.signalAndUnlock(&lock);
+    return done;
+}

+

+// Thread's main loop.
+// Each iteration waits on mCv with a deadline one heartbeat interval away
+// (poll() signals it earlier), takes the whole event queue, applies the events
+// to mMonitoredTasks and then reports newly hung / resumed tasks through the
+// MetricsLogger. The loop ends after an EndMonitoring event has been consumed;
+// the remaining events of that batch are still processed first.
+template <class Clock>
+intptr_t HealthMonitor<Clock>::main() {
+    bool keepMonitoring = true;
+    std::queue<std::unique_ptr<MonitoredEvent>> events;
+
+    while (keepMonitoring) {
+        std::vector<std::promise<void>> pollPromises;
+        std::unordered_set<Id> tasksToRemove;
+        int newHungTasks = mHungTasks;
+        {
+            // `events` is empty here, so the swap hands us every queued event
+            // and leaves the shared queue empty; processing happens unlocked.
+            AutoLock lock(mLock);
+            mCv.timedWait(
+                &mLock,
+                android::base::getUnixTimeUs() +
+                    std::chrono::duration_cast<std::chrono::microseconds>(mInterval).count());
+            mEventQueue.swap(events);
+        }
+
+        Timestamp now = Clock::now();
+        while (!events.empty()) {
+            auto event(std::move(events.front()));
+            events.pop();
+
+            std::visit(
+                MonitoredEventVisitor{
+                    [](std::monostate&) {
+                        ERR("MonitoredEvent type not found");
+                        GFXSTREAM_ABORT(FatalError(ABORT_REASON_OTHER))
+                            << "MonitoredEvent type not found";
+                    },
+                    [this](typename MonitoredEventType::Start& start) {
+                        if (mMonitoredTasks.find(start.id) != mMonitoredTasks.end()) {
+                            // Ids are 64-bit: cast and print with %llu, not %d.
+                            ERR("Registered multiple start events for task %llu",
+                                static_cast<unsigned long long>(start.id));
+                            return;
+                        }
+                        mMonitoredTasks.emplace(
+                            start.id,
+                            MonitoredTask{
+                                .id = start.id,
+                                .timeoutTimestamp = start.timeOccurred + start.timeoutThreshold,
+                                .timeoutThreshold = start.timeoutThreshold,
+                                .hungTimestamp = std::nullopt,
+                                .metadata = std::move(start.metadata)});
+                    },
+                    [this](typename MonitoredEventType::Touch& touch) {
+                        auto it = mMonitoredTasks.find(touch.id);
+                        if (it == mMonitoredTasks.end()) {
+                            ERR("HealthMonitor has no task in progress for id %llu",
+                                static_cast<unsigned long long>(touch.id));
+                            return;
+                        }
+
+                        // A touch restarts the countdown from the time it occurred.
+                        auto& task = it->second;
+                        task.timeoutTimestamp = touch.timeOccurred + task.timeoutThreshold;
+                    },
+                    [this, &tasksToRemove](typename MonitoredEventType::Stop& stop) {
+                        auto it = mMonitoredTasks.find(stop.id);
+                        if (it == mMonitoredTasks.end()) {
+                            ERR("HealthMonitor has no task in progress for id %llu",
+                                static_cast<unsigned long long>(stop.id));
+                            return;
+                        }
+
+                        auto& task = it->second;
+                        task.timeoutTimestamp = stop.timeOccurred + task.timeoutThreshold;
+
+                        // Mark it for deletion, but retain it until the end of this
+                        // health check so that a pending unhang can still be reported
+                        // and concurrently hung tasks are counted correctly.
+                        tasksToRemove.insert(stop.id);
+                    },
+                    [&keepMonitoring](typename MonitoredEventType::EndMonitoring&) {
+                        keepMonitoring = false;
+                    },
+                    [&pollPromises](typename MonitoredEventType::Poll& poll) {
+                        // Fulfilled only after this health check has completed.
+                        pollPromises.push_back(std::move(poll.complete));
+                    }},
+                *event);
+        }
+
+        // Sort by what times out first. A multimap is required: several tasks
+        // can share the same timeout timestamp, and a plain map would keep only
+        // one of them, silently skipping the others (including their removal).
+        std::multimap<Timestamp, Id> sortedTasks;
+        for (const auto& [_, task] : mMonitoredTasks) {
+            sortedTasks.emplace(task.timeoutTimestamp, task.id);
+        }
+
+        for (const auto& [_, taskId] : sortedTasks) {
+            auto it = mMonitoredTasks.find(taskId);
+            if (it == mMonitoredTasks.end()) {
+                continue;
+            }
+            auto& task = it->second;
+            if (task.timeoutTimestamp < now) {
+                // Newly hung task
+                if (!task.hungTimestamp.has_value()) {
+                    mLogger.logMetricEvent(MetricEventHang{.metadata = task.metadata.get(),
+                                                           .otherHungTasks = newHungTasks});
+                    task.hungTimestamp = task.timeoutTimestamp;
+                    newHungTasks++;
+                }
+            } else {
+                // Task resumes
+                if (task.hungTimestamp.has_value()) {
+                    // The expression below reduces to (time of the touch/stop) minus
+                    // (time the task was due): hungTimestamp holds the old deadline and
+                    // timeoutTimestamp is the touch/stop time plus the threshold.
+                    auto hangTime = duration_cast<std::chrono::milliseconds>(
+                                        task.timeoutTimestamp -
+                                        (task.hungTimestamp.value() + task.timeoutThreshold))
+                                        .count();
+                    mLogger.logMetricEvent(
+                        MetricEventUnHang{.metadata = task.metadata.get(), .hung_ms = hangTime});
+                    task.hungTimestamp = std::nullopt;
+                    newHungTasks--;
+                }
+            }
+            if (tasksToRemove.find(taskId) != tasksToRemove.end()) {
+                mMonitoredTasks.erase(it);
+            }
+        }
+
+        if (mHungTasks != newHungTasks) {
+            ERR("HealthMonitor: Number of unresponsive tasks %s: %d -> %d",
+                mHungTasks < newHungTasks ? "increased" : "decreased", mHungTasks, newHungTasks);
+            mHungTasks = newHungTasks;
+        }
+
+        for (auto& complete : pollPromises) {
+            complete.set_value();
+        }
+    }
+
+    return 0;
+}

+

+template class HealthMonitor<steady_clock>;

+template class HealthMonitor<TestClock>;

+

+}  // namespace emugl
\ No newline at end of file
diff --git a/base/HealthMonitor.h b/base/HealthMonitor.h
new file mode 100644
index 0000000..c1c2259
--- /dev/null
+++ b/base/HealthMonitor.h
@@ -0,0 +1,169 @@
+/*

+ * Copyright (C) 2022 The Android Open Source Project

+ *

+ * Licensed under the Apache License, Version 2.0 (the "License");

+ * you may not use this file except in compliance with the License.

+ * You may obtain a copy of the License at

+ *

+ * http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+#pragma once

+

+#include <chrono>

+#include <future>

+#include <optional>

+#include <queue>

+#include <string>

+#include <unordered_map>

+#include <unordered_set>

+#include <variant>

+

+#include "base/ConditionVariable.h"

+#include "base/Lock.h"

+#include "base/Metrics.h"

+#include "base/Thread.h"

+#include "host-common/logging.h"

+

+using android::base::EventHangMetadata;

+

+// Builds the EventHangMetadata for a watchdog, capturing __FILE__, __func__ and
+// __LINE__ at the point of expansion. The type is fully qualified so the macro
+// expands correctly regardless of which using-declarations are visible there.
+#define WATCHDOG_DATA(msg, hangType, data)                                                   \
+    std::make_unique< ::android::base::EventHangMetadata>(__FILE__, __func__, msg, __LINE__, \
+                                                          hangType, data)

+

+namespace emugl {

+

+using android::base::ConditionVariable;

+using android::base::Lock;

+using android::base::MetricsLogger;

+using std::chrono::duration;

+using std::chrono::steady_clock;

+using std::chrono::time_point;

+

+// Default heartbeat interval and default task timeout, both in milliseconds.
+// constexpr keeps them immutable: a plain `static` variable in a header would
+// give every translation unit its own writable copy.
+static constexpr uint64_t kDefaultIntervalMs = 1'000;
+static constexpr uint64_t kDefaultTimeoutMs = 5'000;

+

+// HealthMonitor provides the ability to register arbitrary start/touch/stop events associated

+// with client defined tasks. At some pre-defined interval, it will periodically consume

+// all logged events to assess whether the system is hanging on any task. Via the

+// MetricsLogger, it will log hang and unhang events when it detects tasks hanging/resuming.

+// TODO: willho@ Integrate with crashpad to upload host dumps when a hang is detected.

+// Design doc: http://go/gfxstream-health-monitor

+template <class Clock = steady_clock>

+class HealthMonitor : public android::base::Thread {

+   public:

+    // Alias for task id.

+    using Id = uint64_t;

+

+    // Constructor

+    // `heartbeatInterval` is the interval, in milliseconds, that the thread will sleep for

+    // in between health checks.

+    HealthMonitor(MetricsLogger& metricsLogger, uint64_t heartbeatInterval = kDefaultIntervalMs);

+

+    // Destructor

+    // Enqueues an event to end monitoring and waits on thread to process remaining queued events.

+    ~HealthMonitor();

+

+    // Start monitoring a task. Returns an id that is used for touch and stop operations.

+    // `metadata` is a struct containing info on the task watchdog to be passed through to the

+    // metrics logger.

+    // `timeout` is the duration in milliseconds a task is allowed to run before it's

+    // considered "hung". A `timeout` shorter than the monitor's heartbeat interval could not

+    // be detected reliably, so when `timeout` is below the heartbeat interval this method logs

+    // a warning and uses twice the heartbeat interval as the actual timeout instead.

+    Id startMonitoringTask(std::unique_ptr<EventHangMetadata> metadata,

+                           uint64_t timeout = kDefaultTimeoutMs);

+

+    // Touch a monitored task. Resets the timeout countdown for that task.

+    void touchMonitoredTask(Id id);

+

+    // Stop monitoring a task.

+    void stopMonitoringTask(Id id);

+

+   private:

+    using Duration = typename Clock::duration;  // The clock's native duration type.

+    using Timestamp = time_point<Clock, Duration>;

+

+    // Allow test class access to private functions

+    friend class HealthMonitorTest;

+

+    struct MonitoredEventType {

+        struct Start {

+            Id id;

+            std::unique_ptr<EventHangMetadata> metadata;

+            Timestamp timeOccurred;

+            Duration timeoutThreshold;

+        };

+        struct Touch {

+            Id id;

+            Timestamp timeOccurred;

+        };

+        struct Stop {

+            Id id;

+            Timestamp timeOccurred;

+        };

+        struct EndMonitoring {};

+        struct Poll {

+            std::promise<void> complete;

+        };

+    };

+

+    using MonitoredEvent =

+        std::variant<std::monostate, typename MonitoredEventType::Start,

+                     typename MonitoredEventType::Touch, typename MonitoredEventType::Stop,

+                     typename MonitoredEventType::EndMonitoring, typename MonitoredEventType::Poll>;

+

+    struct MonitoredTask {

+        Id id;

+        Timestamp timeoutTimestamp;

+        Duration timeoutThreshold;

+        std::optional<Timestamp> hungTimestamp;

+        std::unique_ptr<EventHangMetadata> metadata;

+    };

+

+    // Thread's main loop

+    intptr_t main() override;

+

+    // Explicitly wake the monitor thread. Returns a future that can be used to wait until the

+    // poll event has been processed.

+    std::future<void> poll();

+

+    // Immutable. Multi-thread access is safe.

+    const Duration mInterval;

+

+    // Members accessed only on the worker thread. Not protected by mutex.

+    int mHungTasks = 0;

+    MetricsLogger& mLogger;

+    std::unordered_map<Id, MonitoredTask> mMonitoredTasks;

+

+    // Lock and cv control access to queue and id counter

+    android::base::ConditionVariable mCv;

+    Lock mLock;

+    Id mNextId = 0;

+    std::queue<std::unique_ptr<MonitoredEvent>> mEventQueue;

+};

+

+// This class provides an RAII mechanism for monitoring a task: construction
+// registers the task with the given HealthMonitor and destruction sends the
+// matching stop. Call touch() to reset the task's timeout countdown.
+// The watchdog keeps a reference to the monitor, so the monitor must outlive it.
+template <class Clock = steady_clock>
+class HealthWatchdog {
+   public:
+    // `metadata` and `timeout` (milliseconds) are forwarded unchanged to
+    // HealthMonitor::startMonitoringTask.
+    HealthWatchdog(HealthMonitor<Clock>& healthMonitor, std::unique_ptr<EventHangMetadata> metadata,
+                   uint64_t timeout = kDefaultTimeoutMs)
+        : mId(healthMonitor.startMonitoringTask(std::move(metadata), timeout)),
+          mHealthMonitor(healthMonitor) {}
+
+    ~HealthWatchdog() { mHealthMonitor.stopMonitoringTask(mId); }
+
+    // Not copyable: a copy would send a second stop event for the same task id
+    // when it is destroyed.
+    HealthWatchdog(const HealthWatchdog&) = delete;
+    HealthWatchdog& operator=(const HealthWatchdog&) = delete;
+
+    void touch() { mHealthMonitor.touchMonitoredTask(mId); }
+
+   private:
+    // Declared before mHealthMonitor; the initializer list above follows this order.
+    typename HealthMonitor<Clock>::Id mId;
+    HealthMonitor<Clock>& mHealthMonitor;
+};

+

+}  // namespace emugl

diff --git a/base/HealthMonitor_unittest.cpp b/base/HealthMonitor_unittest.cpp
new file mode 100644
index 0000000..c9423eb
--- /dev/null
+++ b/base/HealthMonitor_unittest.cpp
@@ -0,0 +1,291 @@
+// Copyright (C) 2022 The Android Open Source Project

+//

+// Licensed under the Apache License, Version 2.0 (the "License");

+// you may not use this file except in compliance with the License.

+// You may obtain a copy of the License at

+//

+// http://www.apache.org/licenses/LICENSE-2.0

+//

+// Unless required by applicable law or agreed to in writing, software

+// distributed under the License is distributed on an "AS IS" BASIS,

+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+// See the License for the specific language governing permissions and

+// limitations under the License.

+

+#include <gmock/gmock.h>

+#include <gtest/gtest.h>

+

+#include "base/HealthMonitor.h"

+

+#include <chrono>

+

+#include "base/Metrics.h"

+#include "base/testing/TestClock.h"

+

+namespace emugl {

+

+using android::base::MetricEventHang;

+using android::base::MetricEventType;

+using android::base::MetricEventUnHang;

+using android::base::MetricsLogger;

+using android::base::TestClock;

+using emugl::kDefaultIntervalMs;

+using emugl::kDefaultTimeoutMs;

+using ::testing::_;

+using ::testing::AllOf;

+using ::testing::Field;

+using ::testing::Ge;

+using ::testing::HasSubstr;

+using ::testing::InSequence;

+using ::testing::Le;

+using ::testing::MockFunction;

+using ::testing::Test;

+using ::testing::VariantWith;

+

+// Fixture that drives a HealthMonitor<TestClock> (1 second heartbeat) against a
+// mocked MetricsLogger. Time only moves when a test advances TestClock.
+class HealthMonitorTest : public Test {
+   protected:
+    class MockLogger : public MetricsLogger {
+       public:
+        MOCK_METHOD(void, logMetricEvent, (MetricEventType eventType), (override));
+    };
+
+    HealthMonitorTest() : healthMonitor(logger, SToMs(1)) { TestClock::reset(); }
+
+    // One final step so events queued at the end of a test are still processed.
+    ~HealthMonitorTest() { step(1); }
+
+    // Advances the test clock one second at a time, and after each second
+    // blocks until the monitor has completed a health check.
+    void step(int seconds) {
+        for (int remaining = seconds; remaining > 0; --remaining) {
+            TestClock::advance(1);
+            auto checked = healthMonitor.poll();
+            checked.wait();
+        }
+    }
+
+    // Seconds to milliseconds.
+    int SToMs(int seconds) { return 1'000 * seconds; }
+
+    int defaultHangThresholdS = 5;
+    // Declared before healthMonitor, which holds a reference to it.
+    MockLogger logger;
+    HealthMonitor<TestClock> healthMonitor;
+};

+

+// A 1 ms timeout is below the 1 s heartbeat, so the monitor is expected to use
+// a 2 s threshold; the task then hangs for about 5 s before it is stopped.
+TEST_F(HealthMonitorTest, badTimeoutTimeTest) {
+    const int expectedHangThresholdS = 2;
+    const int expectedHangDurationS = 5;
+    // Matches an unhang event whose duration is within one second of `seconds`.
+    const auto unhangAfterAbout = [this](int seconds) {
+        return VariantWith<MetricEventUnHang>(Field(
+            &MetricEventUnHang::hung_ms, AllOf(Ge(SToMs(seconds - 1)), Le(SToMs(seconds + 1)))));
+    };
+    {
+        InSequence s;
+        EXPECT_CALL(logger, logMetricEvent(VariantWith<MetricEventHang>(_))).Times(1);
+        EXPECT_CALL(logger, logMetricEvent(unhangAfterAbout(expectedHangDurationS))).Times(1);
+    }
+
+    const auto taskId =
+        healthMonitor.startMonitoringTask(std::make_unique<EventHangMetadata>(), 1);
+    step(expectedHangThresholdS + expectedHangDurationS);
+    healthMonitor.stopMonitoringTask(taskId);
+}

+

+// A task that is touched and then stopped before its timeout elapses must not
+// produce any metric event.
+TEST_F(HealthMonitorTest, startTouchEndEventsTest) {
+    const int justUnderThresholdS = defaultHangThresholdS - 1;
+    EXPECT_CALL(logger, logMetricEvent(_)).Times(0);
+
+    const auto taskId = healthMonitor.startMonitoringTask(std::make_unique<EventHangMetadata>());
+    step(justUnderThresholdS);
+    healthMonitor.touchMonitoredTask(taskId);
+    step(justUnderThresholdS);
+    healthMonitor.stopMonitoringTask(taskId);
+}

+

+// A task that is started and never touched or stopped is reported as hung
+// exactly once after the default threshold has passed.
+TEST_F(HealthMonitorTest, hangingStartEventTest) {
+    const int pastThresholdS = defaultHangThresholdS + 1;
+    EXPECT_CALL(logger, logMetricEvent(VariantWith<MetricEventHang>(_))).Times(1);
+
+    auto metadata = std::make_unique<EventHangMetadata>();
+    healthMonitor.startMonitoringTask(std::move(metadata));
+    step(pastThresholdS);
+}

+

+// A task stopped about 5 s after its timeout yields one hang event followed by
+// one unhang event carrying roughly that duration.
+TEST_F(HealthMonitorTest, lateEndEventTest) {
+    const int expectedHangDurationS = 5;
+    // Matches an unhang event whose duration is within one second of `seconds`.
+    const auto unhangAfterAbout = [this](int seconds) {
+        return VariantWith<MetricEventUnHang>(Field(
+            &MetricEventUnHang::hung_ms, AllOf(Ge(SToMs(seconds - 1)), Le(SToMs(seconds + 1)))));
+    };
+    {
+        InSequence s;
+        EXPECT_CALL(logger, logMetricEvent(VariantWith<MetricEventHang>(_))).Times(1);
+        EXPECT_CALL(logger, logMetricEvent(unhangAfterAbout(expectedHangDurationS))).Times(1);
+    }
+
+    const auto taskId = healthMonitor.startMonitoringTask(std::make_unique<EventHangMetadata>());
+    step(defaultHangThresholdS + expectedHangDurationS);
+    healthMonitor.stopMonitoringTask(taskId);
+}

+

+// The same task hangs, is revived by a touch, hangs again and is finally
+// stopped: two hang/unhang pairs are expected, in order.
+TEST_F(HealthMonitorTest, taskHangsTwiceTest) {
+    const int hangDurationsS[] = {3, 5};
+    // Matches an unhang event whose duration is within one second of `seconds`.
+    const auto unhangAfterAbout = [this](int seconds) {
+        return VariantWith<MetricEventUnHang>(Field(
+            &MetricEventUnHang::hung_ms, AllOf(Ge(SToMs(seconds - 1)), Le(SToMs(seconds + 1)))));
+    };
+    {
+        InSequence s;
+        for (const int hangS : hangDurationsS) {
+            EXPECT_CALL(logger, logMetricEvent(VariantWith<MetricEventHang>(_))).Times(1);
+            EXPECT_CALL(logger, logMetricEvent(unhangAfterAbout(hangS))).Times(1);
+        }
+    }
+
+    const auto taskId = healthMonitor.startMonitoringTask(std::make_unique<EventHangMetadata>());
+    step(defaultHangThresholdS + hangDurationsS[0]);
+    healthMonitor.touchMonitoredTask(taskId);
+    step(defaultHangThresholdS + hangDurationsS[1]);
+    healthMonitor.stopMonitoringTask(taskId);
+}

+

+// The same task hangs three times; the first two hangs are ended by a touch and
+// the last one by the stop. Three hang/unhang pairs are expected, in order.
+TEST_F(HealthMonitorTest, taskHangsThriceTest) {
+    const int hangDurationsS[] = {3, 5, 3};
+    // Matches an unhang event whose duration is within one second of `seconds`.
+    const auto unhangAfterAbout = [this](int seconds) {
+        return VariantWith<MetricEventUnHang>(Field(
+            &MetricEventUnHang::hung_ms, AllOf(Ge(SToMs(seconds - 1)), Le(SToMs(seconds + 1)))));
+    };
+    {
+        InSequence s;
+        for (const int hangS : hangDurationsS) {
+            EXPECT_CALL(logger, logMetricEvent(VariantWith<MetricEventHang>(_))).Times(1);
+            EXPECT_CALL(logger, logMetricEvent(unhangAfterAbout(hangS))).Times(1);
+        }
+    }
+
+    const auto taskId = healthMonitor.startMonitoringTask(std::make_unique<EventHangMetadata>());
+    step(defaultHangThresholdS + hangDurationsS[0]);
+    healthMonitor.touchMonitoredTask(taskId);
+    step(defaultHangThresholdS + hangDurationsS[1]);
+    healthMonitor.touchMonitoredTask(taskId);
+    step(defaultHangThresholdS + hangDurationsS[2]);
+    healthMonitor.stopMonitoringTask(taskId);
+}

+

+// Four tasks are started 0.2 s apart and all hang. Each hang event must report
+// how many tasks were already hung: 0, 1, 2 and 3, in that order.
+TEST_F(HealthMonitorTest, multipleHangingTasksTest) {
+    const int taskCount = 4;
+    {
+        InSequence s;
+        for (int alreadyHung = 0; alreadyHung < taskCount; ++alreadyHung) {
+            EXPECT_CALL(logger, logMetricEvent(VariantWith<MetricEventHang>(
+                                    Field(&MetricEventHang::otherHungTasks, alreadyHung))))
+                .Times(1);
+        }
+    }
+
+    for (int started = 0; started < taskCount; ++started) {
+        // Stagger the start times; no extra delay after the last task.
+        if (started > 0) {
+            TestClock::advance(0.2);
+        }
+        healthMonitor.startMonitoringTask(std::make_unique<EventHangMetadata>());
+    }
+    step(defaultHangThresholdS + 1);
+}

+

+// Two tasks start together; one is stopped in time and the other is left
+// running. Exactly one hang event is expected.
+TEST_F(HealthMonitorTest, oneHangingTaskOutOfTwoTest) {
+    EXPECT_CALL(logger, logMetricEvent(VariantWith<MetricEventHang>(_))).Times(1);
+
+    // The first task's id is not needed: it is never touched or stopped.
+    healthMonitor.startMonitoringTask(std::make_unique<EventHangMetadata>());
+    const auto stoppedTaskId =
+        healthMonitor.startMonitoringTask(std::make_unique<EventHangMetadata>());
+    const int justUnderThresholdS = defaultHangThresholdS - 1;
+    step(justUnderThresholdS);
+    healthMonitor.stopMonitoringTask(stoppedTaskId);
+    step(2);
+}

+

+// Two tasks hang one after the other (the second with a 10 s threshold); each
+// produces its own hang/unhang pair with the matching duration.
+TEST_F(HealthMonitorTest, twoTasksHangNonOverlappingTest) {
+    const int expectedHangDurationS1 = 5;
+    const int hangThresholdS2 = 10;
+    const int expectedHangDurationS2 = 2;
+    // Matches an unhang event whose duration is within one second of `seconds`.
+    const auto unhangAfterAbout = [this](int seconds) {
+        return VariantWith<MetricEventUnHang>(Field(
+            &MetricEventUnHang::hung_ms, AllOf(Ge(SToMs(seconds - 1)), Le(SToMs(seconds + 1)))));
+    };
+    {
+        InSequence s;
+        EXPECT_CALL(logger, logMetricEvent(VariantWith<MetricEventHang>(_))).Times(1);
+        EXPECT_CALL(logger, logMetricEvent(unhangAfterAbout(expectedHangDurationS1))).Times(1);
+        EXPECT_CALL(logger, logMetricEvent(VariantWith<MetricEventHang>(_))).Times(1);
+        EXPECT_CALL(logger, logMetricEvent(unhangAfterAbout(expectedHangDurationS2))).Times(1);
+    }
+
+    const auto firstId = healthMonitor.startMonitoringTask(std::make_unique<EventHangMetadata>());
+    step(defaultHangThresholdS + expectedHangDurationS1);
+    healthMonitor.stopMonitoringTask(firstId);
+    step(1);
+    const auto secondId = healthMonitor.startMonitoringTask(
+        std::make_unique<EventHangMetadata>(), SToMs(hangThresholdS2));
+    step(hangThresholdS2 + expectedHangDurationS2);
+    healthMonitor.stopMonitoringTask(secondId);
+}

+

+// Two tasks are hung at the same time. The second hang event must report one
+// other hung task, and each task gets an unhang event with its own duration.
+TEST_F(HealthMonitorTest, twoTasksHangOverlappingTest) {
+    const int expectedHangDurationS1 = 5;
+    const int expectedHangDurationS2 = 8;
+    // Matches a hang event reporting `count` tasks already hung.
+    const auto hangWithOthers = [](int count) {
+        return VariantWith<MetricEventHang>(Field(&MetricEventHang::otherHungTasks, count));
+    };
+    // Matches an unhang event whose duration is within one second of `seconds`.
+    const auto unhangAfterAbout = [this](int seconds) {
+        return VariantWith<MetricEventUnHang>(Field(
+            &MetricEventUnHang::hung_ms, AllOf(Ge(SToMs(seconds - 1)), Le(SToMs(seconds + 1)))));
+    };
+    {
+        InSequence s;
+        EXPECT_CALL(logger, logMetricEvent(hangWithOthers(0))).Times(1);
+        EXPECT_CALL(logger, logMetricEvent(hangWithOthers(1))).Times(1);
+        EXPECT_CALL(logger, logMetricEvent(unhangAfterAbout(expectedHangDurationS1))).Times(1);
+        EXPECT_CALL(logger, logMetricEvent(unhangAfterAbout(expectedHangDurationS2))).Times(1);
+    }
+
+    const auto firstId = healthMonitor.startMonitoringTask(std::make_unique<EventHangMetadata>());
+    step(3);
+    const auto secondId = healthMonitor.startMonitoringTask(std::make_unique<EventHangMetadata>());
+    step(7);
+    healthMonitor.stopMonitoringTask(firstId);
+    step(5);
+    healthMonitor.stopMonitoringTask(secondId);
+}

+

+}  // namespace emugl
\ No newline at end of file
diff --git a/base/include/base/Metrics.h b/base/include/base/Metrics.h
index 64852d3..b8d991e 100644
--- a/base/include/base/Metrics.h
+++ b/base/include/base/Metrics.h
@@ -15,16 +15,61 @@
 #pragma once
 
 #include <inttypes.h>
+
 #include <memory>
+#include <string>
+#include <unordered_map>
 #include <variant>
 
 // Library to log metrics.
 namespace android {
 namespace base {
 
+// Call-site and context information attached to hang/unhang metric events.
+// Instances are referenced from MetricEventHang and MetricEventUnHang.
+struct EventHangMetadata {
+    // TODO: willho@ replace this enum with a generic string field embedded in the
+    // proto and replace the individual event codes with a general hang event
+    // Requires a new callback to be passed from the vm to gfxstream_backend_init
+    enum class HangType { kRenderThread, kSyncThread };
+
+    // Source location of the watchdog plus a free-form message.
+    const char* file = nullptr;
+    const char* function = nullptr;
+    const char* msg = nullptr;
+    const int line = 0;
+    // Field for adding custom key value annotations
+    std::unique_ptr<std::unordered_map<std::string, std::string>> data;
+    // Category of the hang; see HangType.
+    HangType hangType = HangType::kRenderThread;
+
+    // Empty record: null strings, line 0, no annotations, kRenderThread.
+    EventHangMetadata() = default;
+
+    // Stores the given pointers as-is (the strings are not copied) and takes
+    // ownership of `data`.
+    EventHangMetadata(const char* file, const char* function, const char* msg, int line,
+                      HangType hangType,
+                      std::unique_ptr<std::unordered_map<std::string, std::string>> data)
+        : file(file),
+          function(function),
+          msg(msg),
+          line(line),
+          data(std::move(data)),
+          hangType(hangType) {}
+};
+
 // Events that can be logged.
 struct MetricEventFreeze {};
 struct MetricEventUnFreeze { int64_t frozen_ms; };
+struct MetricEventHang {
+    EventHangMetadata* metadata;  // Raw, non-owning pointer to the hung task's metadata.
+    int64_t otherHungTasks;       // Number of tasks already hung when this hang was detected.
+};
+struct MetricEventUnHang {
+    EventHangMetadata* metadata;  // Raw, non-owning pointer to the resumed task's metadata.
+    int64_t hung_ms;              // How long the task was hung, in milliseconds.
+};
 struct GfxstreamVkAbort {
     const char* file;
     const char* function;
@@ -33,8 +78,8 @@
     int64_t abort_reason;
 };
 
-using MetricEventType =
-    std::variant<std::monostate, MetricEventFreeze, MetricEventUnFreeze, GfxstreamVkAbort>;
+using MetricEventType = std::variant<std::monostate, MetricEventFreeze, MetricEventUnFreeze,
+                                     MetricEventHang, MetricEventUnHang, GfxstreamVkAbort>;
 
 class MetricsLogger {
 public:
diff --git a/base/testing/TestClock.h b/base/testing/TestClock.h
new file mode 100644
index 0000000..169e344
--- /dev/null
+++ b/base/testing/TestClock.h
@@ -0,0 +1,45 @@
+// Copyright (C) 2022 The Android Open Source Project

+//

+// Licensed under the Apache License, Version 2.0 (the "License");

+// you may not use this file except in compliance with the License.

+// You may obtain a copy of the License at

+//

+// http://www.apache.org/licenses/LICENSE-2.0

+//

+// Unless required by applicable law or agreed to in writing, software

+// distributed under the License is distributed on an "AS IS" BASIS,

+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+// See the License for the specific language governing permissions and

+// limitations under the License.

+#pragma once

+

+#include <chrono>

+

+namespace android {

+namespace base {

+

+// TestClock: a manually driven clock for tests; time changes only via advance() and reset().
+// Satisfies requirements of TrivialClock: https://en.cppreference.com/w/cpp/named_req/TrivialClock
+class TestClock {
+   public:
+    using rep = double;
+    using period = std::ratio<1, 1>; /* TestClock::duration(1) is 1 second */
+    using duration = std::chrono::duration<rep, period>;
+    using time_point = std::chrono::time_point<TestClock>;
+    static constexpr bool is_steady = false;  // Static, as Clock requires; reset() can rewind.
+    // Current fake time: the shared counter interpreted as seconds since the clock's epoch.
+    static time_point now() { return time_point(duration(getInternalTime())); }
+    // Adds |secondsPassed| seconds to the shared counter; the value is not validated.
+    static void advance(double secondsPassed) { getInternalTime() += secondsPassed; }
+    // Sets the shared counter back to zero.
+    static void reset() { getInternalTime() = 0.0; }
+
+   private:
+    static double& getInternalTime() {  // One counter for all callers; plain double, no locking.
+        static double internalTime = 0.0;
+        return internalTime;
+    }
+};

+

+}  // namespace base

+}  // namespace android
\ No newline at end of file
diff --git a/host-common/include/host-common/logging.h b/host-common/include/host-common/logging.h
index 243753b..cb0f61f 100644
--- a/host-common/include/host-common/logging.h
+++ b/host-common/include/host-common/logging.h
@@ -46,6 +46,11 @@
         GFXSTREAM_LOG(stderr, 'E', fmt, ##__VA_ARGS__); \
     } while (0)
 
+#define WARN(fmt, ...)                                  \
+    do {                                                \
+        GFXSTREAM_LOG(stderr, 'W', fmt, ##__VA_ARGS__); \
+    } while (0)  // Same shape as INFO below, but with tag 'W' and stderr as the stream.
+
 #define INFO(fmt, ...)                                  \
     do {                                                \
         GFXSTREAM_LOG(stdout, 'I', fmt, ##__VA_ARGS__); \
diff --git a/stream-servers/FrameBuffer.cpp b/stream-servers/FrameBuffer.cpp
index a0bee66..4f155fa 100644
--- a/stream-servers/FrameBuffer.cpp
+++ b/stream-servers/FrameBuffer.cpp
@@ -35,6 +35,7 @@
 #include "base/Lock.h"
 #include "base/Lookup.h"
 #include "base/MemoryTracker.h"
+#include "base/Metrics.h"
 #include "base/SharedLibrary.h"
 #include "base/StreamSerializing.h"
 #include "base/System.h"
@@ -900,7 +901,8 @@
     // Start up the single sync thread. If we are using Vulkan native
     // swapchain, then don't initialize SyncThread worker threads with EGL
     // contexts.
-    SyncThread::initialize(/* noGL */ fb->m_displayVk != nullptr);
+    SyncThread::initialize(
+        /* noGL */ fb->m_displayVk != nullptr, fb->getHealthMonitor());
 
     //
     // Keep the singleton framebuffer pointer
@@ -982,30 +984,26 @@
       m_windowHeight(p_height),
       m_useSubWindow(useSubWindow),
       m_fpsStats(getenv("SHOW_FPS_STATS") != nullptr),
-      m_perfStats(
-              !android::base::getEnvironmentVariable("SHOW_PERF_STATS").empty()),
+      m_perfStats(!android::base::getEnvironmentVariable("SHOW_PERF_STATS").empty()),
       m_perfThread(new PerfStatThread(&m_perfStats)),
       m_colorBufferHelper(new ColorBufferHelper(this)),
-      m_readbackThread([this](FrameBuffer::Readback&& readback) {
-          return sendReadbackWorkerCmd(readback);
-      }),
-      m_refCountPipeEnabled(feature_is_enabled(
-              kFeature_RefCountPipe)),
-      m_noDelayCloseColorBufferEnabled(feature_is_enabled(
-              kFeature_NoDelayCloseColorBuffer)),
-      m_postThread([this](Post&& post) {
-          return postWorkerFunc(post);
-      }) {
-     uint32_t displayId = 0;
-     if (createDisplay(&displayId) < 0) {
-         fprintf(stderr, "Failed to create default display\n");
-     }
+      m_readbackThread(
+          [this](FrameBuffer::Readback&& readback) { return sendReadbackWorkerCmd(readback); }),
+      m_refCountPipeEnabled(feature_is_enabled(kFeature_RefCountPipe)),
+      m_noDelayCloseColorBufferEnabled(feature_is_enabled(kFeature_NoDelayCloseColorBuffer)),
+      m_postThread([this](Post&& post) { return postWorkerFunc(post); }),
+      m_logger(CreateMetricsLogger()),
+      m_healthMonitor(*m_logger) {
+    uint32_t displayId = 0;
+    if (createDisplay(&displayId) < 0) {
+        fprintf(stderr, "Failed to create default display\n");
+    }
 
-     setDisplayPose(displayId, 0, 0, getWidth(), getHeight(), 0);
-     m_perfThread->start();
+    setDisplayPose(displayId, 0, 0, getWidth(), getHeight(), 0);
+    m_perfThread->start();
 
-     memset(m_vulkanUUID, 0x0, VK_UUID_SIZE);
-     memset(m_glesUUID, 0x0, GL_UUID_SIZE_EXT);
+    memset(m_vulkanUUID, 0x0, VK_UUID_SIZE);
+    memset(m_glesUUID, 0x0, GL_UUID_SIZE_EXT);
 }
 
 FrameBuffer::~FrameBuffer() {
@@ -3602,6 +3600,8 @@
     m_guestManagedColorBufferLifetime = guestManaged;
 }
 
+HealthMonitor<>& FrameBuffer::getHealthMonitor() { return m_healthMonitor; }  // Member, not a copy.
+
 bool FrameBuffer::platformImportResource(uint32_t handle, uint32_t info, void* resource) {
     if (!resource) {
         ERR("Error: resource was null");
diff --git a/stream-servers/FrameBuffer.h b/stream-servers/FrameBuffer.h
index 3a32398..fa3d4fc 100644
--- a/stream-servers/FrameBuffer.h
+++ b/stream-servers/FrameBuffer.h
@@ -28,9 +28,9 @@
 #include <unordered_map>
 #include <unordered_set>
 
-#include "CompositorGl.h"
 #include "ColorBuffer.h"
 #include "Compositor.h"
+#include "CompositorGl.h"
 #include "DisplayVk.h"
 #include "FbConfig.h"
 #include "GLESVersionDetector.h"
@@ -42,8 +42,10 @@
 #include "Renderer.h"
 #include "TextureDraw.h"
 #include "WindowSurface.h"
+#include "base/HealthMonitor.h"
 #include "base/Lock.h"
 #include "base/MessageChannel.h"
+#include "base/Metrics.h"
 #include "base/Stream.h"
 #include "base/Thread.h"
 #include "base/WorkerThread.h"
@@ -53,6 +55,10 @@
 #include "virtio_gpu_ops.h"
 #include "vulkan/vk_util.h"
 
+using android::base::CreateMetricsLogger;
+using emugl::HealthMonitor;
+using emugl::MetricsLogger;
+
 struct ColorBufferRef {
     ColorBufferPtr cb;
     uint32_t refcount;  // number of client-side references
@@ -596,6 +602,8 @@
                                                                        bool colorBufferIsTarget);
     std::unique_ptr<BorrowedImageInfo> borrowColorBufferForDisplay(uint32_t colorBufferHandle);
 
+    HealthMonitor<>& getHealthMonitor();
+
    private:
     FrameBuffer(int p_width, int p_height, bool useSubWindow);
     // Requires the caller to hold the m_colorBufferMapLock until the new handle is inserted into of
@@ -817,5 +825,8 @@
         EGLSurface surface;
     };
     std::unordered_map<void*, PlatformEglContextInfo> m_platformEglContexts;
+
+    std::unique_ptr<MetricsLogger> m_logger;
+    HealthMonitor<> m_healthMonitor;  // Constructed from *m_logger; keep declared after m_logger.
 };
 #endif
diff --git a/stream-servers/RenderThread.cpp b/stream-servers/RenderThread.cpp
index 32ca095..e262924 100644
--- a/stream-servers/RenderThread.cpp
+++ b/stream-servers/RenderThread.cpp
@@ -27,8 +27,10 @@
 #include "RendererImpl.h"
 #include "RingStream.h"
 #include "apigen-codec-common/ChecksumCalculatorThreadInfo.h"
+#include "base/HealthMonitor.h"
 #include "base/Lock.h"
 #include "base/MessageChannel.h"
+#include "base/Metrics.h"
 #include "base/StreamSerializing.h"
 #include "base/System.h"
 #include "base/Tracing.h"
@@ -47,6 +49,7 @@
 #include <string.h>
 
 using android::base::AutoLock;
+using android::base::EventHangMetadata;
 using android::base::MessageChannel;
 
 namespace emugl {
@@ -416,6 +419,10 @@
         bool progress;
 
         do {
+            HealthWatchdog watchdog(
+                FrameBuffer::getFB()->getHealthMonitor(),
+                WATCHDOG_DATA("RenderThread decode operation",
+                              EventHangMetadata::HangType::kRenderThread, nullptr));
 
             if (!seqnoPtr && tInfo.m_puid) {
                 seqnoPtr = FrameBuffer::getFB()->getProcessSequenceNumberPtr(tInfo.m_puid);
diff --git a/stream-servers/SyncThread.cpp b/stream-servers/SyncThread.cpp
index 8e24b41..80d236f 100644
--- a/stream-servers/SyncThread.cpp
+++ b/stream-servers/SyncThread.cpp
@@ -17,6 +17,7 @@
 #include "SyncThread.h"
 
 #include "OpenGLESDispatch/OpenGLDispatchLoader.h"
+#include "base/Metrics.h"
 #include "base/System.h"
 #include "base/Thread.h"
 #include "host-common/GfxstreamFatalError.h"
@@ -29,6 +30,7 @@
 #endif
 #include <memory>
 
+using android::base::EventHangMetadata;
 using emugl::ABORT_REASON_OTHER;
 using emugl::FatalError;
 
@@ -66,10 +68,10 @@
 public:
     GlobalSyncThread() = default;
 
-    void initialize(bool noGL) {
+    void initialize(bool noGL, HealthMonitor<>& healthMonitor) {
         AutoLock mutex(mLock);
         SYNC_THREAD_CHECK(!mSyncThread);
-        mSyncThread = std::make_unique<SyncThread>(noGL);
+        mSyncThread = std::make_unique<SyncThread>(noGL, healthMonitor);
     }
     SyncThread* syncThreadPtr() {
         AutoLock mutex(mLock);
@@ -96,10 +98,14 @@
 static const uint32_t kTimelineInterval = 1;
 static const uint64_t kDefaultTimeoutNsecs = 5ULL * 1000ULL * 1000ULL * 1000ULL;
 
-SyncThread::SyncThread(bool noGL)
+SyncThread::SyncThread(bool noGL, HealthMonitor<>& healthMonitor)
     : android::base::Thread(android::base::ThreadFlags::MaskSignals, 512 * 1024),
-      mWorkerThreadPool(kNumWorkerThreads, doSyncThreadCmd),
-      mNoGL(noGL) {
+      mWorkerThreadPool(kNumWorkerThreads,
+                        [this](Command&& command, ThreadPool::WorkerId id) {
+                            doSyncThreadCmd(std::move(command), id);
+                        }),
+      mNoGL(noGL),
+      mHealthMonitor(healthMonitor) {
     this->start();
     mWorkerThreadPool.start();
     if (!noGL) {
@@ -257,6 +263,13 @@
     DPRINT("exit");
 }
 
+void SyncThread::doSyncThreadCmd(Command&& command, WorkerId workerId) {  // Runs one pool task.
+    HealthWatchdog watchdog(mHealthMonitor,  // Local object: alive until this function returns.
+                            WATCHDOG_DATA("SyncThread task execution",
+                                          EventHangMetadata::HangType::kSyncThread, nullptr));
+    command.mTask(workerId);  // The task itself, executed while the watchdog above exists.
+}
+
 void SyncThread::initSyncEGLContext() {
     mWorkerThreadPool.broadcast([this] {
         return Command{
@@ -402,16 +415,14 @@
 }
 
 /* static */
-void SyncThread::doSyncThreadCmd(Command&& command, WorkerId workerId) { command.mTask(workerId); }
-
 SyncThread* SyncThread::get() {
     auto res = sGlobalSyncThread()->syncThreadPtr();
     SYNC_THREAD_CHECK(res);
     return res;
 }
 
-void SyncThread::initialize(bool noEGL) {
-    sGlobalSyncThread()->initialize(noEGL);
+void SyncThread::initialize(bool noEGL, HealthMonitor<>& healthMonitor) {
+    sGlobalSyncThread()->initialize(noEGL, healthMonitor);
 }
 
 void SyncThread::destroy() { sGlobalSyncThread()->destroy(); }
diff --git a/stream-servers/SyncThread.h b/stream-servers/SyncThread.h
index 0adbb0f..b892c97 100644
--- a/stream-servers/SyncThread.h
+++ b/stream-servers/SyncThread.h
@@ -1,18 +1,18 @@
 /*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 
 #pragma once
 
@@ -27,6 +27,7 @@
 
 #include "FenceSync.h"
 #include "base/ConditionVariable.h"
+#include "base/HealthMonitor.h"
 #include "base/Lock.h"
 #include "base/MessageChannel.h"
 #include "base/Optional.h"
@@ -35,18 +36,20 @@
 #include "virtio_gpu_ops.h"
 #include "vulkan/VkDecoderGlobalState.h"
 
+using emugl::HealthMonitor;
+using emugl::HealthWatchdog;
+
 // SyncThread///////////////////////////////////////////////////////////////////
 // The purpose of SyncThread is to track sync device timelines and give out +
 // signal FD's that correspond to the completion of host-side GL fence commands.
 
-
 struct RenderThreadInfo;
 class SyncThread : public android::base::Thread {
-public:
+   public:
     // - constructor: start up the sync worker threads for a given context.
     // The initialization of the sync threads is nonblocking.
     // - Triggers a |SyncThreadCmd| with op code |SYNC_THREAD_EGL_INIT|
-    SyncThread(bool noGL);
+    SyncThread(bool noGL, HealthMonitor<>& healthMonitor);
     ~SyncThread();
 
     // |triggerWait|: async wait with a given FenceSync object.
@@ -55,8 +58,7 @@
     // which should signal the guest-side fence FD.
     // This method is how the goldfish sync virtual device
     // knows when to increment timelines / signal native fence FD's.
-    void triggerWait(FenceSync* fenceSync,
-                     uint64_t timeline);
+    void triggerWait(FenceSync* fenceSync, uint64_t timeline);
 
     // |triggerWaitVk|: async wait with a given VkFence object.
     // The |vkFence| argument is a *boxed* host Vulkan handle of the fence.
@@ -72,7 +74,8 @@
     // while waiting.
     void triggerBlockedWaitNoTimeline(FenceSync* fenceSync);
 
-    // For use with virtio-gpu and async fence completion callback. This is async like triggerWait, but takes a fence completion callback instead of incrementing some timeline directly.
+    // For use with virtio-gpu and async fence completion callback. This is async like triggerWait,
+    // but takes a fence completion callback instead of incrementing some timeline directly.
     void triggerWaitWithCompletionCallback(FenceSync* fenceSync, FenceCompletionCallback);
     void triggerWaitVkWithCompletionCallback(VkFence fenceHandle, FenceCompletionCallback);
     void triggerWaitVkQsriWithCompletionCallback(VkImage image, FenceCompletionCallback);
@@ -86,7 +89,7 @@
     void cleanup();
 
     // Initialize the global sync thread.
-    static void initialize(bool noGL);
+    static void initialize(bool noGL, HealthMonitor<>& healthMonitor);
 
     // Obtains the global sync thread.
     static SyncThread* get();
@@ -121,7 +124,7 @@
     void sendAsync(std::function<void(WorkerId)> job, std::string description);
 
     // |doSyncThreadCmd| execute the actual task. These run on the sync thread.
-    static void doSyncThreadCmd(Command&& command, ThreadPool::WorkerId);
+    void doSyncThreadCmd(Command&& command, ThreadPool::WorkerId);
 
     void doSyncWait(FenceSync* fenceSync, std::function<void()> onComplete);
     static int doSyncWaitVk(VkFence, std::function<void()> onComplete);
@@ -139,5 +142,6 @@
     android::base::ConditionVariable mCv;
     ThreadPool mWorkerThreadPool;
     bool mNoGL;
-};
 
+    HealthMonitor<>& mHealthMonitor;  // Not owned; the monitor must outlive this SyncThread.
+};