Restart psi_monitor on failure and retry on EINTR
On Linux, epoll.wait can fail with EINTR, which was causing the
psi_monitor thread to return an error, which shut the VM down.
Instead, check if the error is EINTR and wait again.
If the psi_monitor thread fails, start it again, until the VM is asked
to shut down, resulting in the epoll receiving the kill event.
Bug: b/424302767
Test: boot VM, interrupt epoll thread.
Flag: EXEMPT ...
Change-Id: I46eea96a7a3b989b630d089f3619d3c9b2f781f2
diff --git a/android/virtmgr/src/crosvm.rs b/android/virtmgr/src/crosvm.rs
index 0b87e39..94fe3ae 100644
--- a/android/virtmgr/src/crosvm.rs
+++ b/android/virtmgr/src/crosvm.rs
@@ -25,6 +25,7 @@
use log::{debug, error, info, warn};
use semver::{Version, VersionReq};
use nix::{
+ errno::Errno,
fcntl::OFlag,
unistd::{pipe2, Uid, User},
sys::epoll::{Epoll, EpollCreateFlags, EpollEvent, EpollFlags, EpollTimeout},
@@ -34,7 +35,7 @@
use rustutils::system_properties;
use shared_child::SharedChild;
use std::borrow::Cow;
-use std::cmp::max;
+use std::cmp::{max, min};
use std::ffi::CString;
use std::fmt;
use std::fs::{read_to_string, File};
@@ -364,17 +365,19 @@
let psi_monitor_kill_event_clone = psi_monitor_kill_event.clone();
let instance = instance.clone();
Some((
- thread::spawn(move || {
- if let Err(e) = psi_monitor(&instance, psi_monitor_kill_event_clone) {
- error!("psi monitor failed: {:#}", e);
- // Spawn thread to kill VM, avoiding a deadlock as this thread joins
- thread::spawn(move || {
- if let Err(e) = instance.kill() {
- error!("Error stopping VM with CID {}: {:?}", instance.cid, e);
- }
- });
- }
- }),
+ thread::Builder::new().name("virt_psi_monitor".to_string()).spawn(
+ move || {
+ let mut expo_bo = 1;
+ // TODO: add metrics to see how often we restart the thread
+ while let Err(e) = psi_monitor(&instance, &psi_monitor_kill_event_clone)
+ {
+ error!("psi monitor failed: {:#}", e);
+ thread::sleep(Duration::from_secs(expo_bo));
+ // Exponential backoff, capped at 60 seconds
+ expo_bo = min(expo_bo * 2, 60);
+ }
+ },
+ )?,
psi_monitor_kill_event,
))
} else {
@@ -411,7 +414,7 @@
}
}
-fn psi_monitor(instance: &Arc<VmInstance>, psi_monitor_kill_event: Arc<EventFd>) -> Result<()> {
+fn psi_monitor(instance: &Arc<VmInstance>, psi_monitor_kill_event: &Arc<EventFd>) -> Result<()> {
// monitor memory, inflate balloon if some contention exists
// This will initialize a PSI monitor that monitors memory contention in
// windows of 500_000us. If "Some" processes are stalled for a preiod of
@@ -430,7 +433,15 @@
loop {
// Set timeout to -1, blocking indefinitely
// https://man7.org/linux/man-pages/man2/epoll_wait.2.html
- epoll.wait(&mut events, EpollTimeout::NONE)?;
+ let epoll_res = epoll.wait(&mut events, EpollTimeout::NONE);
+ if let Err(e) = epoll_res {
+ if e == Errno::EINTR {
+ // Ignore interrupts and wait again
+ continue;
+ } else {
+ return Err(e.into());
+ }
+ }
match events[0].data() {
0 => {
let mut psi_info = String::new();