swap: fix deadlock on the vmm-swap enable operation

The deadlock could happen on this scenario:

1. User enables vmm-swap. The main process sends Command::Enable to the
   monitor process.
2. User request the current status at the same time. The main process
   sends Command::Status to the monitor process and wait for the
   response from the monitor process.
3. The monitor process start enabling vmm-swap. Sends
   VmSwapCommand::Suspend to the main process and wait for
   VmSwapResponse::SuspendCompleted.
4. The main process is blocked by the step 2 and the request from step 3
   is never consumed.

The root issue issue is that Command::Status can be inserted between
Command::Enable and VmSwapCommand::Suspend. This CL simplifies the
communication between the main and monitor processes on enabling
vmm-swap and resolve the deadlock.

VmSwapCommand and corresponding Tube was added by
https://crrev.com/c/4293656, but is now removed by this CL.

BUG=b:275671628
TEST=manual test

Change-Id: Ia838b1feddb4a3f41bd729e9147adc5e2df866aa
Reviewed-on: https://chromium-review.googlesource.com/c/crosvm/crosvm/+/4387662
Reviewed-by: David Stevens <stevensd@chromium.org>
Commit-Queue: Shin Kawamura <kawasin@google.com>
diff --git a/src/crosvm/sys/unix.rs b/src/crosvm/sys/unix.rs
index dcb3ab0..f8fa83a 100644
--- a/src/crosvm/sys/unix.rs
+++ b/src/crosvm/sys/unix.rs
@@ -170,10 +170,6 @@
 use smallvec::SmallVec;
 #[cfg(feature = "swap")]
 use swap::SwapController;
-#[cfg(feature = "swap")]
-use swap::VmSwapCommand;
-#[cfg(feature = "swap")]
-use swap::VmSwapResponse;
 use sync::Condvar;
 use sync::Mutex;
 use vm_control::*;
@@ -1525,7 +1521,7 @@
     mut vm: V,
     irq_chip: &mut dyn IrqChipArch,
     ioapic_host_tube: Option<Tube>,
-    #[cfg(feature = "swap")] swap_controller: Option<(SwapController, Tube)>,
+    #[cfg(feature = "swap")] swap_controller: Option<SwapController>,
 ) -> Result<ExitState>
 where
     Vcpu: VcpuArch + 'static,
@@ -1561,14 +1557,6 @@
     let mut control_tubes = Vec::new();
     let mut irq_control_tubes = Vec::new();
 
-    #[cfg(feature = "swap")]
-    let swap_controller = if let Some((swap_controller, tube)) = swap_controller {
-        control_tubes.push(TaggedControlTube::SwapMonitor(tube));
-        Some(swap_controller)
-    } else {
-        None
-    };
-
     #[cfg(all(any(target_arch = "x86_64", target_arch = "aarch64"), feature = "gdb"))]
     if let Some(port) = cfg.gdb {
         // GDB needs a control socket to interrupt vcpus.
@@ -2443,35 +2431,6 @@
     }
 }
 
-#[cfg(feature = "swap")]
-fn handle_swap_suspend_command(
-    tube: &Tube,
-    swap_controller: &SwapController,
-    kick_vcpus: impl Fn(VcpuControl),
-    vcpu_num: usize,
-) -> anyhow::Result<()> {
-    info!("suspending vcpus");
-    let _vcpu_guard = VcpuSuspendGuard::new(&kick_vcpus, vcpu_num).context("suspend vcpus")?;
-    info!("suspending devices");
-    // TODO(b/253386409): Use `devices::Suspendable::sleep()` instead of sending `SIGSTOP` signal.
-    let _devices_guard = swap_controller
-        .suspend_devices()
-        .context("suspend devices")?;
-
-    tube.send(&VmSwapResponse::SuspendCompleted)
-        .context("send completed")?;
-
-    // Wait for a resume command.
-    if !matches!(
-        tube.recv().context("wait for VmSwapCommand::Resume")?,
-        VmSwapCommand::Resume
-    ) {
-        anyhow::bail!("the vmm-swap command is not resume.");
-    }
-    info!("resuming vm");
-    Ok(())
-}
-
 fn run_control<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
     mut linux: RunnableLinuxVm<V, Vcpu>,
     sys_allocator: SystemAllocator,
@@ -3254,43 +3213,6 @@
                                     }
                                 }
                             },
-                            #[cfg(feature = "swap")]
-                            TaggedControlTube::SwapMonitor(tube) => {
-                                match tube.recv::<VmSwapCommand>() {
-                                    Ok(VmSwapCommand::Suspend) => {
-                                        if let Err(e) = handle_swap_suspend_command(
-                                            tube,
-                                            // swap_controller must be present if the tube exists.
-                                            swap_controller.as_ref().unwrap(),
-                                            |msg| {
-                                                vcpu::kick_all_vcpus(
-                                                    &vcpu_handles,
-                                                    linux.irq_chip.as_irq_chip(),
-                                                    msg,
-                                                )
-                                            },
-                                            vcpu_handles.len(),
-                                        ) {
-                                            error!("failed to suspend vm: {:?}", e);
-                                            if let Err(e) =
-                                                tube.send(&VmSwapResponse::SuspendFailed)
-                                            {
-                                                error!("failed to send SuspendFailed: {:?}", e);
-                                            }
-                                        }
-                                    }
-                                    Ok(VmSwapCommand::Resume) => {
-                                        // Ignore resume command.
-                                    }
-                                    Err(e) => {
-                                        if let TubeError::Disconnected = e {
-                                            vm_control_indices_to_remove.push(index);
-                                        } else {
-                                            error!("failed to recv VmSwapCommand: {}", e);
-                                        }
-                                    }
-                                }
-                            }
                         }
                     }
                     #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
diff --git a/src/crosvm/sys/unix/device_helpers.rs b/src/crosvm/sys/unix/device_helpers.rs
index 54eb056..9b4b443 100644
--- a/src/crosvm/sys/unix/device_helpers.rs
+++ b/src/crosvm/sys/unix/device_helpers.rs
@@ -88,8 +88,6 @@
         expose_with_viommu: bool,
     },
     VmMsync(Tube),
-    #[cfg(feature = "swap")]
-    SwapMonitor(Tube),
 }
 
 impl AsRef<Tube> for TaggedControlTube {
@@ -97,8 +95,6 @@
         use self::TaggedControlTube::*;
         match &self {
             Fs(tube) | Vm(tube) | VmMemory { tube, .. } | VmMsync(tube) => tube,
-            #[cfg(feature = "swap")]
-            SwapMonitor(tube) => tube,
         }
     }
 }
diff --git a/swap/src/lib.rs b/swap/src/lib.rs
index 9885582..e2ba5e4 100644
--- a/swap/src/lib.rs
+++ b/swap/src/lib.rs
@@ -201,6 +201,14 @@
             state_transition: *state_transition,
         }
     }
+
+    fn dummy() -> Self {
+        Status {
+            state: State::Pending,
+            metrics: Metrics::default(),
+            state_transition: StateTransition::default(),
+        }
+    }
 }
 
 /// Commands used in vmm-swap feature internally sent to the monitor process from the main and other
@@ -219,24 +227,6 @@
     ProcessForked(RawDescriptor),
 }
 
-/// Commands sent from the monitor process to the main process.
-#[derive(Serialize, Deserialize, Debug)]
-pub enum VmSwapCommand {
-    /// Suspend vCPUs and devices.
-    Suspend,
-    /// Resume vCPUs and devices.
-    Resume,
-}
-
-/// Response from the main process to the monitor process.
-#[derive(Serialize, Deserialize, Debug)]
-pub enum VmSwapResponse {
-    /// Suspend completes.
-    SuspendCompleted,
-    /// Failed to suspend vCPUs and devices.
-    SuspendFailed,
-}
-
 /// [SwapController] provides APIs to control vmm-swap.
 pub struct SwapController {
     child_process: Child,
@@ -258,7 +248,7 @@
         guest_memory: GuestMemory,
         swap_dir: &Path,
         jail_config: &Option<JailConfig>,
-    ) -> anyhow::Result<(Self, Tube)> {
+    ) -> anyhow::Result<Self> {
         info!("vmm-swap is enabled. launch monitor process.");
 
         let uffd_factory = UffdFactory::new();
@@ -280,10 +270,6 @@
         // to the monitor process. The response is `Status` only.
         let (command_tube_main, command_tube_monitor) =
             Tube::pair().context("create swap command tube")?;
-        // The tube in which `VmSwapCommand` is sent from the monitor process to the main process.
-        // The response is `VmSwapResponse`.
-        let (vm_tube_main, vm_tube_monitor) =
-            Tube::pair().context("create swap vm-request tube")?;
 
         // Allocate eventfd before creating sandbox.
         let bg_job_control = BackgroundJobControl::new().context("create background job event")?;
@@ -298,7 +284,6 @@
             uffd.as_raw_descriptor(),
             swap_file.as_raw_descriptor(),
             command_tube_monitor.as_raw_descriptor(),
-            vm_tube_monitor.as_raw_descriptor(),
             bg_job_control.get_completion_event().as_raw_descriptor(),
             #[cfg(feature = "log_page_fault")]
             page_fault_logger.as_raw_descriptor(),
@@ -334,7 +319,6 @@
             fork_process(jail, keep_rds, Some(String::from("swap monitor")), || {
                 if let Err(e) = monitor_process(
                     command_tube_monitor,
-                    vm_tube_monitor,
                     guest_memory,
                     uffd,
                     swap_file,
@@ -364,14 +348,11 @@
             }
         };
 
-        Ok((
-            Self {
-                child_process,
-                uffd_factory,
-                command_tube: command_tube_main,
-            },
-            vm_tube_main,
-        ))
+        Ok(Self {
+            child_process,
+            uffd_factory,
+            command_tube: command_tube_main,
+        })
     }
 
     /// Enable monitoring page faults and move guest memory to staging memory.
@@ -379,7 +360,10 @@
     /// The pages will be swapped in from the staging memory to the guest memory on page faults
     /// until pages are written into the swap file by [Self::swap_out()].
     ///
-    /// This returns as soon as it succeeds to send request to the monitor process.
+    /// This waits until enabling vmm-swap finishes on the monitor process.
+    ///
+    /// The caller must guarantee that any contents on the guest memory is not updated during
+    /// enabling vmm-swap.
     ///
     /// # Note
     ///
@@ -393,6 +377,11 @@
         self.command_tube
             .send(&Command::Enable)
             .context("send swap enable request")?;
+
+        let _ = self
+            .command_tube
+            .recv::<Status>()
+            .context("receive swap status")?;
         Ok(())
     }
 
@@ -557,7 +546,6 @@
 /// The main thread of the monitor process.
 fn monitor_process(
     command_tube: Tube,
-    vm_tube: Tube,
     guest_memory: GuestMemory,
     uffd: Userfaultfd,
     swap_file: File,
@@ -667,7 +655,6 @@
                                 &uffd_list,
                                 &guest_memory,
                                 &command_tube,
-                                &vm_tube,
                                 &worker,
                                 &mutex_transition,
                                 &bg_job_control,
@@ -732,58 +719,71 @@
     Failed,
 }
 
+fn handle_enable_command<'scope>(
+    state: SwapState,
+    bg_job_control: &BackgroundJobControl,
+    page_handler: &PageHandler,
+    guest_memory: &GuestMemory,
+    worker: &Worker<MoveToStaging>,
+    state_transition: &Mutex<StateTransition>,
+) -> anyhow::Result<SwapState<'scope>> {
+    match state {
+        SwapState::SwapInInProgress(join_handle) => {
+            info!("abort swap-in");
+            abort_background_job(join_handle, bg_job_control).context("abort swap-in")?;
+        }
+        SwapState::Trim(join_handle) => {
+            info!("abort trim");
+            abort_background_job(join_handle, bg_job_control).context("abort trim")?;
+        }
+        _ => {}
+    }
+
+    info!("start moving memory to staging");
+    match move_guest_to_staging(page_handler, guest_memory, worker) {
+        Ok(new_state_transition) => {
+            info!(
+                "move {} pages to staging in {} ms",
+                new_state_transition.pages, new_state_transition.time_ms
+            );
+            *state_transition.lock() = new_state_transition;
+            Ok(SwapState::SwapOutPending)
+        }
+        Err(e) => {
+            error!("failed to move memory to staging: {}", e);
+            *state_transition.lock() = StateTransition::default();
+            Ok(SwapState::Failed)
+        }
+    }
+}
+
 fn move_guest_to_staging(
     page_handler: &PageHandler,
     guest_memory: &GuestMemory,
-    vm_tube: &Tube,
     worker: &Worker<MoveToStaging>,
 ) -> anyhow::Result<StateTransition> {
     let start_time = std::time::Instant::now();
 
-    // Suspend vCPUs and devices from the main process.
-    vm_tube
-        .send(&VmSwapCommand::Suspend)
-        .context("request suspend")?;
-
     let mut pages = 0;
 
-    let result = match vm_tube.recv().context("recv suspend completed") {
-        Ok(VmSwapResponse::SuspendCompleted) => {
-            let result = guest_memory.with_regions::<_, anyhow::Error>(
-                |MemoryRegionInformation {
-                     host_addr,
-                     shm,
-                     shm_offset,
-                     ..
-                 }| {
-                    // safe because:
-                    // * all the regions are registered to all userfaultfd
-                    // * no process access the guest memory
-                    // * page fault events are handled by PageHandler
-                    // * wait for all the copy completed within _processes_guard
-                    pages += unsafe { page_handler.move_to_staging(host_addr, shm, shm_offset) }
-                        .context("move to staging")?;
-                    Ok(())
-                },
-            );
-            worker.channel.wait_complete();
-            result
-        }
-        Ok(VmSwapResponse::SuspendFailed) => Err(anyhow::anyhow!("failed to suspend vm")),
-        // When failed to receive suspend response, try resume the vm.
-        Err(e) => Err(e),
-    };
-
-    // Resume vCPUs and devices from the main process.
-    if let Err(e) = vm_tube
-        .send(&VmSwapCommand::Resume)
-        .context("request resume")
-    {
-        if let Err(e) = result {
-            error!("failed to move memory to staging: {:?}", e);
-        }
-        return Err(e);
-    }
+    let result = guest_memory.with_regions::<_, anyhow::Error>(
+        |MemoryRegionInformation {
+             host_addr,
+             shm,
+             shm_offset,
+             ..
+         }| {
+            // safe because:
+            // * all the regions are registered to all userfaultfd
+            // * no process access the guest memory
+            // * page fault events are handled by PageHandler
+            // * wait for all the copy completed within _processes_guard
+            pages += unsafe { page_handler.move_to_staging(host_addr, shm, shm_offset) }
+                .context("move to staging")?;
+            Ok(())
+        },
+    );
+    worker.channel.wait_complete();
 
     match result {
         Ok(()) => {
@@ -801,7 +801,7 @@
 }
 
 fn abort_background_job<T>(
-    join_handle: ScopedJoinHandle<'_, T>,
+    join_handle: ScopedJoinHandle<'_, anyhow::Result<T>>,
     bg_job_control: &BackgroundJobControl,
 ) -> anyhow::Result<T> {
     bg_job_control.abort();
@@ -810,7 +810,7 @@
         .join()
         .expect("panic on the background job thread");
     bg_job_control.reset().context("reset swap in event")?;
-    Ok(result)
+    result.context("failure on background job thread")
 }
 
 fn handle_vmm_swap<'scope, 'env>(
@@ -820,13 +820,12 @@
     uffd_list: &'env UffdList,
     guest_memory: &GuestMemory,
     command_tube: &Tube,
-    vm_tube: &Tube,
     worker: &Worker<MoveToStaging>,
     state_transition: &'env Mutex<StateTransition>,
     bg_job_control: &'env BackgroundJobControl,
     #[cfg(feature = "log_page_fault")] page_fault_logger: &mut PageFaultEventLogger,
 ) -> anyhow::Result<bool> {
-    let mut state = match move_guest_to_staging(page_handler, guest_memory, vm_tube, worker) {
+    let mut state = match move_guest_to_staging(page_handler, guest_memory, worker) {
         Ok(transition) => {
             info!(
                 "move {} pages to staging in {} ms",
@@ -841,6 +840,9 @@
             SwapState::Failed
         }
     };
+    command_tube
+        .send(&Status::dummy())
+        .context("send enable finish signal")?;
 
     loop {
         let events = match &state {
@@ -921,38 +923,18 @@
                         bail!("child process is forked while swap is enabled");
                     }
                     Command::Enable => {
-                        match state {
-                            SwapState::SwapInInProgress(join_handle) => {
-                                info!("abort swap-in");
-                                abort_background_job(join_handle, bg_job_control)
-                                    .context("abort swap in")?
-                                    .context("swap_in failure")?;
-                            }
-                            SwapState::Trim(join_handle) => {
-                                info!("abort trim");
-                                abort_background_job(join_handle, bg_job_control)
-                                    .context("abort trim")?
-                                    .context("trim failure")?;
-                            }
-                            _ => {}
-                        }
-
-                        info!("start moving memory to staging");
-                        match move_guest_to_staging(page_handler, guest_memory, vm_tube, worker) {
-                            Ok(new_state_transition) => {
-                                info!(
-                                    "move {} pages to staging in {} ms",
-                                    new_state_transition.pages, new_state_transition.time_ms
-                                );
-                                state = SwapState::SwapOutPending;
-                                *state_transition.lock() = new_state_transition;
-                            }
-                            Err(e) => {
-                                error!("failed to move memory to staging: {}", e);
-                                state = SwapState::Failed;
-                                *state_transition.lock() = StateTransition::default();
-                            }
-                        }
+                        let result = handle_enable_command(
+                            state,
+                            bg_job_control,
+                            page_handler,
+                            guest_memory,
+                            worker,
+                            state_transition,
+                        );
+                        command_tube
+                            .send(&Status::dummy())
+                            .context("send enable finish signal")?;
+                        state = result?;
                     }
                     Command::Trim => match &state {
                         SwapState::SwapOutPending => {
@@ -1011,8 +993,7 @@
                             SwapState::Trim(join_handle) => {
                                 info!("abort trim");
                                 abort_background_job(join_handle, bg_job_control)
-                                    .context("abort trim")?
-                                    .context("trim failure")?;
+                                    .context("abort trim")?;
                             }
                             SwapState::SwapOutInProgress { .. } => {
                                 info!("swap out is aborted");
@@ -1065,8 +1046,7 @@
                             }
                             SwapState::Trim(join_handle) => {
                                 abort_background_job(join_handle, bg_job_control)
-                                    .context("abort trim")?
-                                    .context("trim failure")?;
+                                    .context("abort trim")?;
                             }
                             _ => {}
                         }
diff --git a/vm_control/src/lib.rs b/vm_control/src/lib.rs
index 02eb422..a2918b0 100644
--- a/vm_control/src/lib.rs
+++ b/vm_control/src/lib.rs
@@ -1373,6 +1373,26 @@
             VmRequest::Swap(SwapCommand::Enable) => {
                 #[cfg(feature = "swap")]
                 if let Some(swap_controller) = swap_controller {
+                    // Suspend all vcpus and devices while vmm-swap is enabling (move the guest
+                    // memory contents to the staging memory) to guarantee no processes other than
+                    // the swap monitor process access the guest memory.
+                    let _vcpu_guard = match VcpuSuspendGuard::new(&kick_vcpus, vcpu_size) {
+                        Ok(guard) => guard,
+                        Err(e) => {
+                            error!("failed to suspend vcpus: {:?}", e);
+                            return VmResponse::Err(SysError::new(EINVAL));
+                        }
+                    };
+                    // TODO(b/253386409): Use `devices::Suspendable::sleep()` instead of sending
+                    // `SIGSTOP` signal.
+                    let _devices_guard = match swap_controller.suspend_devices() {
+                        Ok(guard) => guard,
+                        Err(e) => {
+                            error!("failed to suspend devices: {:?}", e);
+                            return VmResponse::Err(SysError::new(EINVAL));
+                        }
+                    };
+
                     return match swap_controller.enable() {
                         Ok(()) => VmResponse::Ok,
                         Err(e) => {