HealthMonitor: handle edge cases of tasks timing out
...when the task ended long before the monitor thread
processes the event queue. This isn't expected to
happen, but has been observed in user logs.
Bug: b/260907576.
Test: Added unit tests, manually repro'ed bug
Change-Id: Ib9d89fcdd8230819c7c6baec6e0114c54fc04d7b
diff --git a/base/HealthMonitor.cpp b/base/HealthMonitor.cpp
index f900427..c3a7103 100644
--- a/base/HealthMonitor.cpp
+++ b/base/HealthMonitor.cpp
@@ -118,7 +118,6 @@
while (keepMonitoring) {
std::vector<std::promise<void>> pollPromises;
std::unordered_set<Id> tasksToRemove;
- int newHungTasks = mHungTasks;
{
AutoLock lock(mLock);
if (mEventQueue.empty()) {
@@ -192,7 +191,7 @@
updateTaskParent(events, task, event.timeOccurred);
// Mark it for deletion, but retain it until the end of
- // the health check concurrent tasks hung
+ // the health check to check concurrent tasks hung
tasksToRemove.insert(event.id);
},
[&keepMonitoring](typename MonitoredEventType::EndMonitoring& event) {
@@ -204,6 +203,8 @@
*event);
}
+ int newHungTasks = mHungTasks;
+
// Sort by what times out first. Identical timestamps are possible
std::multimap<Timestamp, uint64_t> sortedTasks;
for (auto& [_, task] : mMonitoredTasks) {
@@ -212,8 +213,8 @@
for (auto& [_, task_id] : sortedTasks) {
auto& task = mMonitoredTasks[task_id];
- if (task.timeoutTimestamp < now) {
- // Newly hung task
+ if (task.timeoutTimestamp < now && tasksToRemove.find(task_id) == tasksToRemove.end()) {
+ // Newly hung task which is still in progress
if (!task.hungTimestamp.has_value()) {
// Copy over additional annotations captured at hangTime
if (task.onHangAnnotationsCallback) {
@@ -227,7 +228,7 @@
newHungTasks++;
}
} else {
- // Task resumes
+ // Task resumes or ends
if (task.hungTimestamp.has_value()) {
newHungTasks--;
auto hangTime = duration_cast<std::chrono::milliseconds>(
@@ -241,6 +242,7 @@
task.hungTimestamp = std::nullopt;
}
}
+
if (tasksToRemove.find(task_id) != tasksToRemove.end()) {
mMonitoredTasks.erase(task_id);
}
diff --git a/base/HealthMonitor_unittest.cpp b/base/HealthMonitor_unittest.cpp
index c678a03..1e15a20 100644
--- a/base/HealthMonitor_unittest.cpp
+++ b/base/HealthMonitor_unittest.cpp
@@ -80,6 +80,8 @@
int SToMs(int seconds) { return seconds * 1'000; }
+ void poll() { healthMonitor.poll().wait(); }
+
int defaultHangThresholdS = 5;
MockLogger logger;
HealthMonitor<TestClock> healthMonitor;
@@ -469,6 +471,38 @@
healthMonitor.stopMonitoringTask(parent);
}
+TEST_F(HealthMonitorTest, taskEndProcessedMuchLater) {
+ {
+ InSequence s;
+ EXPECT_CALL(logger, logMetricEvent(_)).Times(0);
+ }
+ auto taskId = healthMonitor.startMonitoringTask(std::make_unique<EventHangMetadata>());
+ step(1);
+ healthMonitor.stopMonitoringTask(taskId);
+ TestClock::advance(defaultHangThresholdS * 2);
+ poll();
+}
+
+TEST_F(HealthMonitorTest, hungTaskEndProcessedMuchLater) {
+ int expectedHangDurationS = 5;
+ {
+ InSequence s;
+ EXPECT_CALL(logger, logMetricEvent(VariantWith<MetricEventHang>(_))).Times(1);
+ EXPECT_CALL(
+ logger,
+ logMetricEvent(VariantWith<MetricEventUnHang>(AllOf(
+ Field(&MetricEventUnHang::taskId, 0),
+ Field(&MetricEventUnHang::hung_ms, AllOf(Ge(SToMs(expectedHangDurationS - 1)),
+ Le(SToMs(expectedHangDurationS + 1))))))))
+ .Times(1);
+ }
+ auto taskId = healthMonitor.startMonitoringTask(std::make_unique<EventHangMetadata>());
+ step(defaultHangThresholdS + expectedHangDurationS);
+ healthMonitor.stopMonitoringTask(taskId);
+ TestClock::advance(defaultHangThresholdS * 2);
+ poll();
+}
+
class MockHealthMonitor {
public:
using Id = uint32_t;