swap: fix deadlock on the vmm-swap enable operation
The deadlock could happen on this scenario:
1. User enables vmm-swap. The main process sends Command::Enable to the
monitor process.
2. User request the current status at the same time. The main process
sends Command::Status to the monitor process and wait for the
response from the monitor process.
3. The monitor process start enabling vmm-swap. Sends
VmSwapCommand::Suspend to the main process and wait for
VmSwapResponse::SuspendCompleted.
4. The main process is blocked by the step 2 and the request from step 3
is never consumed.
The root issue issue is that Command::Status can be inserted between
Command::Enable and VmSwapCommand::Suspend. This CL simplifies the
communication between the main and monitor processes on enabling
vmm-swap and resolve the deadlock.
VmSwapCommand and corresponding Tube was added by
https://crrev.com/c/4293656, but is now removed by this CL.
BUG=b:275671628
TEST=manual test
Change-Id: Ia838b1feddb4a3f41bd729e9147adc5e2df866aa
Reviewed-on: https://chromium-review.googlesource.com/c/crosvm/crosvm/+/4387662
Reviewed-by: David Stevens <stevensd@chromium.org>
Commit-Queue: Shin Kawamura <kawasin@google.com>
diff --git a/src/crosvm/sys/unix.rs b/src/crosvm/sys/unix.rs
index dcb3ab0..f8fa83a 100644
--- a/src/crosvm/sys/unix.rs
+++ b/src/crosvm/sys/unix.rs
@@ -170,10 +170,6 @@
use smallvec::SmallVec;
#[cfg(feature = "swap")]
use swap::SwapController;
-#[cfg(feature = "swap")]
-use swap::VmSwapCommand;
-#[cfg(feature = "swap")]
-use swap::VmSwapResponse;
use sync::Condvar;
use sync::Mutex;
use vm_control::*;
@@ -1525,7 +1521,7 @@
mut vm: V,
irq_chip: &mut dyn IrqChipArch,
ioapic_host_tube: Option<Tube>,
- #[cfg(feature = "swap")] swap_controller: Option<(SwapController, Tube)>,
+ #[cfg(feature = "swap")] swap_controller: Option<SwapController>,
) -> Result<ExitState>
where
Vcpu: VcpuArch + 'static,
@@ -1561,14 +1557,6 @@
let mut control_tubes = Vec::new();
let mut irq_control_tubes = Vec::new();
- #[cfg(feature = "swap")]
- let swap_controller = if let Some((swap_controller, tube)) = swap_controller {
- control_tubes.push(TaggedControlTube::SwapMonitor(tube));
- Some(swap_controller)
- } else {
- None
- };
-
#[cfg(all(any(target_arch = "x86_64", target_arch = "aarch64"), feature = "gdb"))]
if let Some(port) = cfg.gdb {
// GDB needs a control socket to interrupt vcpus.
@@ -2443,35 +2431,6 @@
}
}
-#[cfg(feature = "swap")]
-fn handle_swap_suspend_command(
- tube: &Tube,
- swap_controller: &SwapController,
- kick_vcpus: impl Fn(VcpuControl),
- vcpu_num: usize,
-) -> anyhow::Result<()> {
- info!("suspending vcpus");
- let _vcpu_guard = VcpuSuspendGuard::new(&kick_vcpus, vcpu_num).context("suspend vcpus")?;
- info!("suspending devices");
- // TODO(b/253386409): Use `devices::Suspendable::sleep()` instead of sending `SIGSTOP` signal.
- let _devices_guard = swap_controller
- .suspend_devices()
- .context("suspend devices")?;
-
- tube.send(&VmSwapResponse::SuspendCompleted)
- .context("send completed")?;
-
- // Wait for a resume command.
- if !matches!(
- tube.recv().context("wait for VmSwapCommand::Resume")?,
- VmSwapCommand::Resume
- ) {
- anyhow::bail!("the vmm-swap command is not resume.");
- }
- info!("resuming vm");
- Ok(())
-}
-
fn run_control<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
mut linux: RunnableLinuxVm<V, Vcpu>,
sys_allocator: SystemAllocator,
@@ -3254,43 +3213,6 @@
}
}
},
- #[cfg(feature = "swap")]
- TaggedControlTube::SwapMonitor(tube) => {
- match tube.recv::<VmSwapCommand>() {
- Ok(VmSwapCommand::Suspend) => {
- if let Err(e) = handle_swap_suspend_command(
- tube,
- // swap_controller must be present if the tube exists.
- swap_controller.as_ref().unwrap(),
- |msg| {
- vcpu::kick_all_vcpus(
- &vcpu_handles,
- linux.irq_chip.as_irq_chip(),
- msg,
- )
- },
- vcpu_handles.len(),
- ) {
- error!("failed to suspend vm: {:?}", e);
- if let Err(e) =
- tube.send(&VmSwapResponse::SuspendFailed)
- {
- error!("failed to send SuspendFailed: {:?}", e);
- }
- }
- }
- Ok(VmSwapCommand::Resume) => {
- // Ignore resume command.
- }
- Err(e) => {
- if let TubeError::Disconnected = e {
- vm_control_indices_to_remove.push(index);
- } else {
- error!("failed to recv VmSwapCommand: {}", e);
- }
- }
- }
- }
}
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
diff --git a/src/crosvm/sys/unix/device_helpers.rs b/src/crosvm/sys/unix/device_helpers.rs
index 54eb056..9b4b443 100644
--- a/src/crosvm/sys/unix/device_helpers.rs
+++ b/src/crosvm/sys/unix/device_helpers.rs
@@ -88,8 +88,6 @@
expose_with_viommu: bool,
},
VmMsync(Tube),
- #[cfg(feature = "swap")]
- SwapMonitor(Tube),
}
impl AsRef<Tube> for TaggedControlTube {
@@ -97,8 +95,6 @@
use self::TaggedControlTube::*;
match &self {
Fs(tube) | Vm(tube) | VmMemory { tube, .. } | VmMsync(tube) => tube,
- #[cfg(feature = "swap")]
- SwapMonitor(tube) => tube,
}
}
}
diff --git a/swap/src/lib.rs b/swap/src/lib.rs
index 9885582..e2ba5e4 100644
--- a/swap/src/lib.rs
+++ b/swap/src/lib.rs
@@ -201,6 +201,14 @@
state_transition: *state_transition,
}
}
+
+ fn dummy() -> Self {
+ Status {
+ state: State::Pending,
+ metrics: Metrics::default(),
+ state_transition: StateTransition::default(),
+ }
+ }
}
/// Commands used in vmm-swap feature internally sent to the monitor process from the main and other
@@ -219,24 +227,6 @@
ProcessForked(RawDescriptor),
}
-/// Commands sent from the monitor process to the main process.
-#[derive(Serialize, Deserialize, Debug)]
-pub enum VmSwapCommand {
- /// Suspend vCPUs and devices.
- Suspend,
- /// Resume vCPUs and devices.
- Resume,
-}
-
-/// Response from the main process to the monitor process.
-#[derive(Serialize, Deserialize, Debug)]
-pub enum VmSwapResponse {
- /// Suspend completes.
- SuspendCompleted,
- /// Failed to suspend vCPUs and devices.
- SuspendFailed,
-}
-
/// [SwapController] provides APIs to control vmm-swap.
pub struct SwapController {
child_process: Child,
@@ -258,7 +248,7 @@
guest_memory: GuestMemory,
swap_dir: &Path,
jail_config: &Option<JailConfig>,
- ) -> anyhow::Result<(Self, Tube)> {
+ ) -> anyhow::Result<Self> {
info!("vmm-swap is enabled. launch monitor process.");
let uffd_factory = UffdFactory::new();
@@ -280,10 +270,6 @@
// to the monitor process. The response is `Status` only.
let (command_tube_main, command_tube_monitor) =
Tube::pair().context("create swap command tube")?;
- // The tube in which `VmSwapCommand` is sent from the monitor process to the main process.
- // The response is `VmSwapResponse`.
- let (vm_tube_main, vm_tube_monitor) =
- Tube::pair().context("create swap vm-request tube")?;
// Allocate eventfd before creating sandbox.
let bg_job_control = BackgroundJobControl::new().context("create background job event")?;
@@ -298,7 +284,6 @@
uffd.as_raw_descriptor(),
swap_file.as_raw_descriptor(),
command_tube_monitor.as_raw_descriptor(),
- vm_tube_monitor.as_raw_descriptor(),
bg_job_control.get_completion_event().as_raw_descriptor(),
#[cfg(feature = "log_page_fault")]
page_fault_logger.as_raw_descriptor(),
@@ -334,7 +319,6 @@
fork_process(jail, keep_rds, Some(String::from("swap monitor")), || {
if let Err(e) = monitor_process(
command_tube_monitor,
- vm_tube_monitor,
guest_memory,
uffd,
swap_file,
@@ -364,14 +348,11 @@
}
};
- Ok((
- Self {
- child_process,
- uffd_factory,
- command_tube: command_tube_main,
- },
- vm_tube_main,
- ))
+ Ok(Self {
+ child_process,
+ uffd_factory,
+ command_tube: command_tube_main,
+ })
}
/// Enable monitoring page faults and move guest memory to staging memory.
@@ -379,7 +360,10 @@
/// The pages will be swapped in from the staging memory to the guest memory on page faults
/// until pages are written into the swap file by [Self::swap_out()].
///
- /// This returns as soon as it succeeds to send request to the monitor process.
+ /// This waits until enabling vmm-swap finishes on the monitor process.
+ ///
+ /// The caller must guarantee that any contents on the guest memory is not updated during
+ /// enabling vmm-swap.
///
/// # Note
///
@@ -393,6 +377,11 @@
self.command_tube
.send(&Command::Enable)
.context("send swap enable request")?;
+
+ let _ = self
+ .command_tube
+ .recv::<Status>()
+ .context("receive swap status")?;
Ok(())
}
@@ -557,7 +546,6 @@
/// The main thread of the monitor process.
fn monitor_process(
command_tube: Tube,
- vm_tube: Tube,
guest_memory: GuestMemory,
uffd: Userfaultfd,
swap_file: File,
@@ -667,7 +655,6 @@
&uffd_list,
&guest_memory,
&command_tube,
- &vm_tube,
&worker,
&mutex_transition,
&bg_job_control,
@@ -732,58 +719,71 @@
Failed,
}
+fn handle_enable_command<'scope>(
+ state: SwapState,
+ bg_job_control: &BackgroundJobControl,
+ page_handler: &PageHandler,
+ guest_memory: &GuestMemory,
+ worker: &Worker<MoveToStaging>,
+ state_transition: &Mutex<StateTransition>,
+) -> anyhow::Result<SwapState<'scope>> {
+ match state {
+ SwapState::SwapInInProgress(join_handle) => {
+ info!("abort swap-in");
+ abort_background_job(join_handle, bg_job_control).context("abort swap-in")?;
+ }
+ SwapState::Trim(join_handle) => {
+ info!("abort trim");
+ abort_background_job(join_handle, bg_job_control).context("abort trim")?;
+ }
+ _ => {}
+ }
+
+ info!("start moving memory to staging");
+ match move_guest_to_staging(page_handler, guest_memory, worker) {
+ Ok(new_state_transition) => {
+ info!(
+ "move {} pages to staging in {} ms",
+ new_state_transition.pages, new_state_transition.time_ms
+ );
+ *state_transition.lock() = new_state_transition;
+ Ok(SwapState::SwapOutPending)
+ }
+ Err(e) => {
+ error!("failed to move memory to staging: {}", e);
+ *state_transition.lock() = StateTransition::default();
+ Ok(SwapState::Failed)
+ }
+ }
+}
+
fn move_guest_to_staging(
page_handler: &PageHandler,
guest_memory: &GuestMemory,
- vm_tube: &Tube,
worker: &Worker<MoveToStaging>,
) -> anyhow::Result<StateTransition> {
let start_time = std::time::Instant::now();
- // Suspend vCPUs and devices from the main process.
- vm_tube
- .send(&VmSwapCommand::Suspend)
- .context("request suspend")?;
-
let mut pages = 0;
- let result = match vm_tube.recv().context("recv suspend completed") {
- Ok(VmSwapResponse::SuspendCompleted) => {
- let result = guest_memory.with_regions::<_, anyhow::Error>(
- |MemoryRegionInformation {
- host_addr,
- shm,
- shm_offset,
- ..
- }| {
- // safe because:
- // * all the regions are registered to all userfaultfd
- // * no process access the guest memory
- // * page fault events are handled by PageHandler
- // * wait for all the copy completed within _processes_guard
- pages += unsafe { page_handler.move_to_staging(host_addr, shm, shm_offset) }
- .context("move to staging")?;
- Ok(())
- },
- );
- worker.channel.wait_complete();
- result
- }
- Ok(VmSwapResponse::SuspendFailed) => Err(anyhow::anyhow!("failed to suspend vm")),
- // When failed to receive suspend response, try resume the vm.
- Err(e) => Err(e),
- };
-
- // Resume vCPUs and devices from the main process.
- if let Err(e) = vm_tube
- .send(&VmSwapCommand::Resume)
- .context("request resume")
- {
- if let Err(e) = result {
- error!("failed to move memory to staging: {:?}", e);
- }
- return Err(e);
- }
+ let result = guest_memory.with_regions::<_, anyhow::Error>(
+ |MemoryRegionInformation {
+ host_addr,
+ shm,
+ shm_offset,
+ ..
+ }| {
+ // safe because:
+ // * all the regions are registered to all userfaultfd
+ // * no process access the guest memory
+ // * page fault events are handled by PageHandler
+ // * wait for all the copy completed within _processes_guard
+ pages += unsafe { page_handler.move_to_staging(host_addr, shm, shm_offset) }
+ .context("move to staging")?;
+ Ok(())
+ },
+ );
+ worker.channel.wait_complete();
match result {
Ok(()) => {
@@ -801,7 +801,7 @@
}
fn abort_background_job<T>(
- join_handle: ScopedJoinHandle<'_, T>,
+ join_handle: ScopedJoinHandle<'_, anyhow::Result<T>>,
bg_job_control: &BackgroundJobControl,
) -> anyhow::Result<T> {
bg_job_control.abort();
@@ -810,7 +810,7 @@
.join()
.expect("panic on the background job thread");
bg_job_control.reset().context("reset swap in event")?;
- Ok(result)
+ result.context("failure on background job thread")
}
fn handle_vmm_swap<'scope, 'env>(
@@ -820,13 +820,12 @@
uffd_list: &'env UffdList,
guest_memory: &GuestMemory,
command_tube: &Tube,
- vm_tube: &Tube,
worker: &Worker<MoveToStaging>,
state_transition: &'env Mutex<StateTransition>,
bg_job_control: &'env BackgroundJobControl,
#[cfg(feature = "log_page_fault")] page_fault_logger: &mut PageFaultEventLogger,
) -> anyhow::Result<bool> {
- let mut state = match move_guest_to_staging(page_handler, guest_memory, vm_tube, worker) {
+ let mut state = match move_guest_to_staging(page_handler, guest_memory, worker) {
Ok(transition) => {
info!(
"move {} pages to staging in {} ms",
@@ -841,6 +840,9 @@
SwapState::Failed
}
};
+ command_tube
+ .send(&Status::dummy())
+ .context("send enable finish signal")?;
loop {
let events = match &state {
@@ -921,38 +923,18 @@
bail!("child process is forked while swap is enabled");
}
Command::Enable => {
- match state {
- SwapState::SwapInInProgress(join_handle) => {
- info!("abort swap-in");
- abort_background_job(join_handle, bg_job_control)
- .context("abort swap in")?
- .context("swap_in failure")?;
- }
- SwapState::Trim(join_handle) => {
- info!("abort trim");
- abort_background_job(join_handle, bg_job_control)
- .context("abort trim")?
- .context("trim failure")?;
- }
- _ => {}
- }
-
- info!("start moving memory to staging");
- match move_guest_to_staging(page_handler, guest_memory, vm_tube, worker) {
- Ok(new_state_transition) => {
- info!(
- "move {} pages to staging in {} ms",
- new_state_transition.pages, new_state_transition.time_ms
- );
- state = SwapState::SwapOutPending;
- *state_transition.lock() = new_state_transition;
- }
- Err(e) => {
- error!("failed to move memory to staging: {}", e);
- state = SwapState::Failed;
- *state_transition.lock() = StateTransition::default();
- }
- }
+ let result = handle_enable_command(
+ state,
+ bg_job_control,
+ page_handler,
+ guest_memory,
+ worker,
+ state_transition,
+ );
+ command_tube
+ .send(&Status::dummy())
+ .context("send enable finish signal")?;
+ state = result?;
}
Command::Trim => match &state {
SwapState::SwapOutPending => {
@@ -1011,8 +993,7 @@
SwapState::Trim(join_handle) => {
info!("abort trim");
abort_background_job(join_handle, bg_job_control)
- .context("abort trim")?
- .context("trim failure")?;
+ .context("abort trim")?;
}
SwapState::SwapOutInProgress { .. } => {
info!("swap out is aborted");
@@ -1065,8 +1046,7 @@
}
SwapState::Trim(join_handle) => {
abort_background_job(join_handle, bg_job_control)
- .context("abort trim")?
- .context("trim failure")?;
+ .context("abort trim")?;
}
_ => {}
}
diff --git a/vm_control/src/lib.rs b/vm_control/src/lib.rs
index 02eb422..a2918b0 100644
--- a/vm_control/src/lib.rs
+++ b/vm_control/src/lib.rs
@@ -1373,6 +1373,26 @@
VmRequest::Swap(SwapCommand::Enable) => {
#[cfg(feature = "swap")]
if let Some(swap_controller) = swap_controller {
+ // Suspend all vcpus and devices while vmm-swap is enabling (move the guest
+ // memory contents to the staging memory) to guarantee no processes other than
+ // the swap monitor process access the guest memory.
+ let _vcpu_guard = match VcpuSuspendGuard::new(&kick_vcpus, vcpu_size) {
+ Ok(guard) => guard,
+ Err(e) => {
+ error!("failed to suspend vcpus: {:?}", e);
+ return VmResponse::Err(SysError::new(EINVAL));
+ }
+ };
+ // TODO(b/253386409): Use `devices::Suspendable::sleep()` instead of sending
+ // `SIGSTOP` signal.
+ let _devices_guard = match swap_controller.suspend_devices() {
+ Ok(guard) => guard,
+ Err(e) => {
+ error!("failed to suspend devices: {:?}", e);
+ return VmResponse::Err(SysError::new(EINVAL));
+ }
+ };
+
return match swap_controller.enable() {
Ok(()) => VmResponse::Ok,
Err(e) => {