HealthMonitor: handle edge cases of tasks timing out ...when the task ended long before the monitor thread processes the event queue. This isn't expected to happen, but has been observed in user logs. Bug: b/260907576. Test: Added unit tests, manually repro'ed bug Change-Id: Ib9d89fcdd8230819c7c6baec6e0114c54fc04d7b

commit: 8a6e20d8b6e51395178415675c52558ba1d6ac91 [log] [tgz]
author: William Ho <willho@google.com> Fri Dec 16 13:58:23 2022 -0800
committer: William Ho <willho@google.com> Wed Jan 04 12:43:03 2023 -0800
tree: f170160e8dfb0715ebf08b32ad0c9acfe18b3ddb
parent: 1613d88e1a50c295ce825d6f9c415963ee12fb71 [diff]
diff --git a/base/HealthMonitor.cpp b/base/HealthMonitor.cpp
index f900427..c3a7103 100644
--- a/base/HealthMonitor.cpp
+++ b/base/HealthMonitor.cpp

@@ -118,7 +118,6 @@
     while (keepMonitoring) {
         std::vector<std::promise<void>> pollPromises;
         std::unordered_set<Id> tasksToRemove;
-        int newHungTasks = mHungTasks;
         {
             AutoLock lock(mLock);
             if (mEventQueue.empty()) {
@@ -192,7 +191,7 @@
                                updateTaskParent(events, task, event.timeOccurred);
 
                                // Mark it for deletion, but retain it until the end of
-                               // the health check concurrent tasks hung
+                               // the health check to check concurrent tasks hung
                                tasksToRemove.insert(event.id);
                            },
                            [&keepMonitoring](typename MonitoredEventType::EndMonitoring& event) {
@@ -204,6 +203,8 @@
                        *event);
         }
 
+        int newHungTasks = mHungTasks;
+
         // Sort by what times out first. Identical timestamps are possible
         std::multimap<Timestamp, uint64_t> sortedTasks;
         for (auto& [_, task] : mMonitoredTasks) {
@@ -212,8 +213,8 @@
 
         for (auto& [_, task_id] : sortedTasks) {
             auto& task = mMonitoredTasks[task_id];
-            if (task.timeoutTimestamp < now) {
-                // Newly hung task
+            if (task.timeoutTimestamp < now && tasksToRemove.find(task_id) == tasksToRemove.end()) {
+                // Newly hung task which is still in progress
                 if (!task.hungTimestamp.has_value()) {
                     // Copy over additional annotations captured at hangTime
                     if (task.onHangAnnotationsCallback) {
@@ -227,7 +228,7 @@
                     newHungTasks++;
                 }
             } else {
-                // Task resumes
+                // Task resumes or ends
                 if (task.hungTimestamp.has_value()) {
                     newHungTasks--;
                     auto hangTime = duration_cast<std::chrono::milliseconds>(
@@ -241,6 +242,7 @@
                     task.hungTimestamp = std::nullopt;
                 }
             }
+
             if (tasksToRemove.find(task_id) != tasksToRemove.end()) {
                 mMonitoredTasks.erase(task_id);
             }

diff --git a/base/HealthMonitor_unittest.cpp b/base/HealthMonitor_unittest.cpp
index c678a03..1e15a20 100644
--- a/base/HealthMonitor_unittest.cpp
+++ b/base/HealthMonitor_unittest.cpp

@@ -80,6 +80,8 @@
 
     int SToMs(int seconds) { return seconds * 1'000; }
 
+    void poll() { healthMonitor.poll().wait(); }
+
     int defaultHangThresholdS = 5;
     MockLogger logger;
     HealthMonitor<TestClock> healthMonitor;
@@ -469,6 +471,38 @@
     healthMonitor.stopMonitoringTask(parent);
 }
 
+TEST_F(HealthMonitorTest, taskEndProcessedMuchLater) {
+    {
+        InSequence s;
+        EXPECT_CALL(logger, logMetricEvent(_)).Times(0);
+    }
+    auto taskId = healthMonitor.startMonitoringTask(std::make_unique<EventHangMetadata>());
+    step(1);
+    healthMonitor.stopMonitoringTask(taskId);
+    TestClock::advance(defaultHangThresholdS * 2);
+    poll();
+}
+
+TEST_F(HealthMonitorTest, hungTaskEndProcessedMuchLater) {
+    int expectedHangDurationS = 5;
+    {
+        InSequence s;
+        EXPECT_CALL(logger, logMetricEvent(VariantWith<MetricEventHang>(_))).Times(1);
+        EXPECT_CALL(
+            logger,
+            logMetricEvent(VariantWith<MetricEventUnHang>(AllOf(
+                Field(&MetricEventUnHang::taskId, 0),
+                Field(&MetricEventUnHang::hung_ms, AllOf(Ge(SToMs(expectedHangDurationS - 1)),
+                                                         Le(SToMs(expectedHangDurationS + 1))))))))
+            .Times(1);
+    }
+    auto taskId = healthMonitor.startMonitoringTask(std::make_unique<EventHangMetadata>());
+    step(defaultHangThresholdS + expectedHangDurationS);
+    healthMonitor.stopMonitoringTask(taskId);
+    TestClock::advance(defaultHangThresholdS * 2);
+    poll();
+}
+
 class MockHealthMonitor {
    public:
     using Id = uint32_t;
commit	8a6e20d8b6e51395178415675c52558ba1d6ac91	[log] [tgz]
author	William Ho <willho@google.com>	Fri Dec 16 13:58:23 2022 -0800
committer	William Ho <willho@google.com>	Wed Jan 04 12:43:03 2023 -0800
tree	f170160e8dfb0715ebf08b32ad0c9acfe18b3ddb
parent	1613d88e1a50c295ce825d6f9c415963ee12fb71 [diff]