Restart psi_monitor on failure and retry on EINTR

On Linux, epoll.wait can fail with EINTR when it is interrupted, which
was causing the psi_monitor thread to return an error, resulting in the
VM shutting down.

Instead, check if the error is EINTR and wait again.

If the psi_monitor thread fails, start it again, until the VM is asked
to shut down, which results in the epoll receiving the kill event.

Bug: b/424302767
Test: boot VM, interrupt epoll thread.
Flag: EXEMPT ...
Change-Id: I46eea96a7a3b989b630d089f3619d3c9b2f781f2
diff --git a/android/virtmgr/src/crosvm.rs b/android/virtmgr/src/crosvm.rs
index 0b87e39..94fe3ae 100644
--- a/android/virtmgr/src/crosvm.rs
+++ b/android/virtmgr/src/crosvm.rs
@@ -25,6 +25,7 @@
 use log::{debug, error, info, warn};
 use semver::{Version, VersionReq};
 use nix::{
+    errno::Errno,
     fcntl::OFlag,
     unistd::{pipe2, Uid, User},
     sys::epoll::{Epoll, EpollCreateFlags, EpollEvent, EpollFlags, EpollTimeout},
@@ -34,7 +35,7 @@
 use rustutils::system_properties;
 use shared_child::SharedChild;
 use std::borrow::Cow;
-use std::cmp::max;
+use std::cmp::{max, min};
 use std::ffi::CString;
 use std::fmt;
 use std::fs::{read_to_string, File};
@@ -364,17 +365,19 @@
                 let psi_monitor_kill_event_clone = psi_monitor_kill_event.clone();
                 let instance = instance.clone();
                 Some((
-                    thread::spawn(move || {
-                        if let Err(e) = psi_monitor(&instance, psi_monitor_kill_event_clone) {
-                            error!("psi monitor failed: {:#}", e);
-                            // Spawn thread to kill VM, avoiding a deadlock as this thread joins
-                            thread::spawn(move || {
-                                if let Err(e) = instance.kill() {
-                                    error!("Error stopping VM with CID {}: {:?}", instance.cid, e);
-                                }
-                            });
-                        }
-                    }),
+                    thread::Builder::new().name("virt_psi_monitor".to_string()).spawn(
+                        move || {
+                            let mut expo_bo = 1;
+                            // TODO: add metrics to see how often we restart the thread
+                            while let Err(e) = psi_monitor(&instance, &psi_monitor_kill_event_clone)
+                            {
+                                error!("psi monitor failed: {:#}", e);
+                                thread::sleep(Duration::from_secs(expo_bo));
+                                // Exponential backoff, capped at 60 seconds
+                                expo_bo = min(expo_bo * 2, 60);
+                            }
+                        },
+                    )?,
                     psi_monitor_kill_event,
                 ))
             } else {
@@ -411,7 +414,7 @@
     }
 }
 
-fn psi_monitor(instance: &Arc<VmInstance>, psi_monitor_kill_event: Arc<EventFd>) -> Result<()> {
+fn psi_monitor(instance: &Arc<VmInstance>, psi_monitor_kill_event: &Arc<EventFd>) -> Result<()> {
     // monitor memory, inflate balloon if some contention exists
     // This will initialize a PSI monitor that monitors memory contention in
     // windows of 500_000us. If "Some" processes are stalled for a preiod of
@@ -430,7 +433,15 @@
     loop {
         // Set timeout to -1, blocking indefinitely
         // https://man7.org/linux/man-pages/man2/epoll_wait.2.html
-        epoll.wait(&mut events, EpollTimeout::NONE)?;
+        let epoll_res = epoll.wait(&mut events, EpollTimeout::NONE);
+        if let Err(e) = epoll_res {
+            if e == Errno::EINTR {
+                // Ignore interrupts and wait again
+                continue;
+            } else {
+                return Err(e.into());
+            }
+        }
         match events[0].data() {
             0 => {
                 let mut psi_info = String::new();