HealthMonitor: handle edge cases of tasks timing out

...when the task ended long before the monitor thread
processes the event queue. This isn't expected to
happen, but has been observed in user logs.

Bug: b/260907576.
Test: Added unit tests, manually repro'ed bug
Change-Id: Ib9d89fcdd8230819c7c6baec6e0114c54fc04d7b
diff --git a/base/HealthMonitor.cpp b/base/HealthMonitor.cpp
index f900427..c3a7103 100644
--- a/base/HealthMonitor.cpp
+++ b/base/HealthMonitor.cpp
@@ -118,7 +118,6 @@
     while (keepMonitoring) {
         std::vector<std::promise<void>> pollPromises;
         std::unordered_set<Id> tasksToRemove;
-        int newHungTasks = mHungTasks;
         {
             AutoLock lock(mLock);
             if (mEventQueue.empty()) {
@@ -192,7 +191,7 @@
                                updateTaskParent(events, task, event.timeOccurred);
 
                                // Mark it for deletion, but retain it until the end of
-                               // the health check concurrent tasks hung
+                               // the health check to check concurrent tasks hung
                                tasksToRemove.insert(event.id);
                            },
                            [&keepMonitoring](typename MonitoredEventType::EndMonitoring& event) {
@@ -204,6 +203,8 @@
                        *event);
         }
 
+        int newHungTasks = mHungTasks;
+
         // Sort by what times out first. Identical timestamps are possible
         std::multimap<Timestamp, uint64_t> sortedTasks;
         for (auto& [_, task] : mMonitoredTasks) {
@@ -212,8 +213,8 @@
 
         for (auto& [_, task_id] : sortedTasks) {
             auto& task = mMonitoredTasks[task_id];
-            if (task.timeoutTimestamp < now) {
-                // Newly hung task
+            if (task.timeoutTimestamp < now && tasksToRemove.find(task_id) == tasksToRemove.end()) {
+                // Newly hung task which is still in progress
                 if (!task.hungTimestamp.has_value()) {
                     // Copy over additional annotations captured at hangTime
                     if (task.onHangAnnotationsCallback) {
@@ -227,7 +228,7 @@
                     newHungTasks++;
                 }
             } else {
-                // Task resumes
+                // Task resumes or ends
                 if (task.hungTimestamp.has_value()) {
                     newHungTasks--;
                     auto hangTime = duration_cast<std::chrono::milliseconds>(
@@ -241,6 +242,7 @@
                     task.hungTimestamp = std::nullopt;
                 }
             }
+
             if (tasksToRemove.find(task_id) != tasksToRemove.end()) {
                 mMonitoredTasks.erase(task_id);
             }
diff --git a/base/HealthMonitor_unittest.cpp b/base/HealthMonitor_unittest.cpp
index c678a03..1e15a20 100644
--- a/base/HealthMonitor_unittest.cpp
+++ b/base/HealthMonitor_unittest.cpp
@@ -80,6 +80,8 @@
 
     int SToMs(int seconds) { return seconds * 1'000; }
 
+    void poll() { healthMonitor.poll().wait(); }
+
     int defaultHangThresholdS = 5;
     MockLogger logger;
     HealthMonitor<TestClock> healthMonitor;
@@ -469,6 +471,38 @@
     healthMonitor.stopMonitoringTask(parent);
 }
 
+TEST_F(HealthMonitorTest, taskEndProcessedMuchLater) {
+    {
+        InSequence s;
+        EXPECT_CALL(logger, logMetricEvent(_)).Times(0);
+    }
+    auto taskId = healthMonitor.startMonitoringTask(std::make_unique<EventHangMetadata>());
+    step(1);
+    healthMonitor.stopMonitoringTask(taskId);
+    TestClock::advance(defaultHangThresholdS * 2);
+    poll();
+}
+
+TEST_F(HealthMonitorTest, hungTaskEndProcessedMuchLater) {
+    int expectedHangDurationS = 5;
+    {
+        InSequence s;
+        EXPECT_CALL(logger, logMetricEvent(VariantWith<MetricEventHang>(_))).Times(1);
+        EXPECT_CALL(
+            logger,
+            logMetricEvent(VariantWith<MetricEventUnHang>(AllOf(
+                Field(&MetricEventUnHang::taskId, 0),
+                Field(&MetricEventUnHang::hung_ms, AllOf(Ge(SToMs(expectedHangDurationS - 1)),
+                                                         Le(SToMs(expectedHangDurationS + 1))))))))
+            .Times(1);
+    }
+    auto taskId = healthMonitor.startMonitoringTask(std::make_unique<EventHangMetadata>());
+    step(defaultHangThresholdS + expectedHangDurationS);
+    healthMonitor.stopMonitoringTask(taskId);
+    TestClock::advance(defaultHangThresholdS * 2);
+    poll();
+}
+
 class MockHealthMonitor {
    public:
     using Id = uint32_t;