swap/src/lib.rs - platform/external/crosvm - Git at Google

 // Copyright 2022 The ChromiumOS Authors
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 //! crate for the vmm-swap feature.

 #![cfg(unix)]
 #![deny(missing_docs)]

 mod file;
 mod logger;
 mod pagesize;
 mod present_list;
 // this is public only for integration tests.
 pub mod page_handler;
 mod processes;
 mod staging;
 // this is public only for integration tests.
 pub mod userfaultfd;
 // this is public only for integration tests.
 pub mod worker;

 use std::fs::File;
 use std::fs::OpenOptions;
 use std::io::stderr;
 use std::io::stdout;
 use std::ops::Range;
 use std::os::unix::fs::OpenOptionsExt;
 use std::path::Path;
 use std::thread::Scope;
 use std::thread::ScopedJoinHandle;
 use std::time::Duration;
 use std::time::Instant;

 use anyhow::bail;
 use anyhow::Context;
 use base::debug;
 use base::error;
 use base::info;
 use base::syslog;
 use base::unix::process::fork_process;
 use base::unix::process::Child;
 use base::warn;
 use base::AsRawDescriptor;
 use base::AsRawDescriptors;
 use base::EventToken;
 use base::FromRawDescriptor;
 use base::RawDescriptor;
 use base::SharedMemory;
 use base::Tube;
 use base::WaitContext;
 use jail::create_base_minijail;
 use jail::create_sandbox_minijail;
 use jail::JailConfig;
 use jail::SandboxConfig;
 use jail::MAX_OPEN_FILES_DEFAULT;
 use once_cell::sync::Lazy;
 use serde::Deserialize;
 use serde::Serialize;
 use sync::Mutex;
 use vm_memory::GuestMemory;
 use vm_memory::MemoryRegionInformation;

 #[cfg(feature = "log_page_fault")]
 use crate::logger::PageFaultEventLogger;
 use crate::page_handler::MoveToStaging;
 use crate::page_handler::PageHandler;
 use crate::page_handler::MLOCK_BUDGET;
 use crate::pagesize::THP_SIZE;
 use crate::processes::freeze_child_processes;
 use crate::processes::ProcessesGuard;
 use crate::userfaultfd::register_regions;
 use crate::userfaultfd::unregister_regions;
 use crate::userfaultfd::Factory as UffdFactory;
 use crate::userfaultfd::UffdEvent;
 use crate::userfaultfd::Userfaultfd;
 use crate::worker::BackgroundJobControl;
 use crate::worker::Worker;

 /// The max size, in bytes, of chunks to swap out/in at once.
 ///
 /// This value is passed to `PageHandler::swap_out()` for every step of the swap-out loop.
 const MAX_SWAP_CHUNK_SIZE: usize = 2 * 1024 * 1024; // = 2 MiB
 /// The max number of pages to trim at once.
 const MAX_TRIM_PAGES: usize = 1024;

 /// Current state of vmm-swap.
 ///
 /// This should not contain fields but be a plain enum because this will be displayed to user using
 /// `serde_json` crate.
 ///
 /// `Ready` is reported while vmm-swap is disabled. The other variants mirror the internal
 /// `SwapState` of the monitor process (see the `From<&SwapState>` implementation).
 #[derive(Serialize, Deserialize, Debug, Clone)]
 pub enum State {
     /// vmm-swap is ready. userfaultfd is disabled until vmm-swap is enabled.
     Ready,
     /// Pages in guest memory are moved to the staging memory.
     Pending,
     /// Trimming staging memory.
     TrimInProgress,
     /// swap-out is in progress.
     SwapOutInProgress,
     /// swap out succeeded.
     Active,
     /// swap-in is in progress.
     SwapInInProgress,
     /// A vmm-swap operation failed: either moving guest memory to the staging memory or swap out.
     Failed,
 }

 /// Projects the internal [SwapState] of the monitor process onto the user-visible [State].
 ///
 /// Any payload carried by the internal state (join handles, timestamps) is ignored.
 impl From<&SwapState<'_>> for State {
     fn from(swap_state: &SwapState<'_>) -> Self {
         match swap_state {
             SwapState::SwapOutPending => Self::Pending,
             SwapState::Trim(_) => Self::TrimInProgress,
             SwapState::SwapOutInProgress { .. } => Self::SwapOutInProgress,
             SwapState::SwapOutCompleted => Self::Active,
             SwapState::SwapInInProgress(_) => Self::SwapInInProgress,
             SwapState::Failed => Self::Failed,
         }
     }
 }

 /// Latency and number of pages of swap operations (move to staging, swap out, swap in).
 ///
 /// The meaning of `StateTransition` depends on `State`.
 ///
 /// | `State`             | `StateTransition`                            |
 /// |---------------------|----------------------------------------------|
 /// | `Ready`             | empty or transition record of `swap disable` |
 /// | `Pending`           | transition record of `swap enable`           |
 /// | `SwapOutInProgress` | transition record of `swap out`              |
 /// | `Active`            | transition record of `swap out`              |
 /// | `SwapInInProgress`  | transition record of `swap disable`          |
 /// | `Failed`            | empty                                        |
 ///
 /// NOTE(review): `TrimInProgress` is not listed in the table above; confirm which record is
 /// reported in that state and add a row.
 #[derive(Serialize, Deserialize, Debug, Clone, Copy, Default)]
 pub struct StateTransition {
     /// The number of pages moved for the state transition.
     ///
     /// During swap out this is accumulated chunk by chunk.
     pages: usize,
     /// Time taken for the state transition, in milliseconds.
     time_ms: u128,
 }

 /// Current metrics of vmm-swap.
 ///
 /// This is only available while vmm-swap is enabled. Each field is filled from the
 /// `PageHandler::compute_*()` method of the same name; while vmm-swap is disabled the default
 /// (all zero) value is reported instead.
 #[derive(Serialize, Deserialize, Debug, Clone, Default)]
 pub struct Metrics {
     /// count of pages on RAM.
     resident_pages: usize,
     /// count of pages copied from the vmm-swap file.
     copied_from_file_pages: usize,
     /// count of pages copied from the staging memory.
     copied_from_staging_pages: usize,
     /// count of pages initialized with zero.
     zeroed_pages: usize,
     /// count of pages which were already initialized on page faults. This can happen when several
     /// threads/processes access the uninitialized/removed page at the same time.
     redundant_pages: usize,
     /// count of pages in staging memory.
     staging_pages: usize,
     /// count of pages in swap files.
     swap_pages: usize,
 }

 impl Metrics {
     /// Takes a snapshot of all the page counters exposed by `page_handler`.
     fn new(page_handler: &PageHandler) -> Self {
         let resident_pages = page_handler.compute_resident_pages();
         let copied_from_file_pages = page_handler.compute_copied_from_file_pages();
         let copied_from_staging_pages = page_handler.compute_copied_from_staging_pages();
         let zeroed_pages = page_handler.compute_zeroed_pages();
         let redundant_pages = page_handler.compute_redundant_pages();
         let staging_pages = page_handler.compute_staging_pages();
         let swap_pages = page_handler.compute_swap_pages();

         Self {
             resident_pages,
             copied_from_file_pages,
             copied_from_staging_pages,
             zeroed_pages,
             redundant_pages,
             staging_pages,
             swap_pages,
         }
     }
 }

 /// The response to `crosvm swap status` command.
 ///
 /// This is also the only message type the monitor process sends back over the command tube.
 #[derive(Serialize, Deserialize, Debug, Clone)]
 pub struct Status {
     /// Current state of vmm-swap.
     state: State,
     /// Page counters. All zero while vmm-swap is disabled.
     metrics: Metrics,
     /// Record of the latest state transition. Its meaning depends on `state`.
     state_transition: StateTransition,
 }

 impl Status {
     /// Builds the status reported while vmm-swap is enabled.
     ///
     /// The user-visible state is derived from `state` and the metrics are sampled from
     /// `page_handler`.
     fn new(
         state: &SwapState,
         state_transition: StateTransition,
         page_handler: &PageHandler,
     ) -> Self {
         let state = State::from(state);
         let metrics = Metrics::new(page_handler);
         Self {
             state,
             metrics,
             state_transition,
         }
     }

     /// Builds the status reported while vmm-swap is disabled: `Ready` state with empty metrics and
     /// a copy of the given transition record.
     fn disabled(state_transition: &StateTransition) -> Self {
         let state_transition = *state_transition;
         Self {
             state: State::Ready,
             metrics: Metrics::default(),
             state_transition,
         }
     }

     /// Builds a placeholder status (`Pending` state, empty metrics and transition record) used
     /// only as a completion signal.
     fn dummy() -> Self {
         Self {
             state: State::Pending,
             ..Self::disabled(&StateTransition::default())
         }
     }
 }

 /// Commands used in vmm-swap feature internally sent to the monitor process from the main and other
 /// processes.
 ///
 /// This is mainly originated from the `crosvm swap <command>` command line.
 #[derive(Serialize, Deserialize, Debug)]
 enum Command {
     /// Enable vmm-swap. The sender waits for a `Status` response.
     Enable,
     /// Trim the staging memory. Only logged as a warning while vmm-swap is disabled.
     Trim,
     /// Write the staging memory to the swap file. Only logged as a warning while vmm-swap is
     /// disabled.
     SwapOut,
     /// Disable vmm-swap. Only logged as a warning while vmm-swap is already disabled.
     Disable,
     /// Make the monitor process return from its main loop.
     Exit,
     /// Request the current `Status`. The monitor process replies with a `Status` message.
     Status,
     /// A process was forked. The payload is the userfaultfd created for it, which the monitor
     /// process takes ownership of and adds to its userfaultfd list.
     #[serde(with = "base::platform::with_raw_descriptor")]
     ProcessForked(RawDescriptor),
 }

 /// [SwapController] provides APIs to control vmm-swap.
 ///
 /// It is created by [SwapController::launch()], which forks the monitor process. All the control
 /// methods talk to that process over an internal command tube.
 pub struct SwapController {
     /// Handle of the forked monitor process.
     child_process: Child,
     /// Factory used to create a userfaultfd for every forked process.
     uffd_factory: UffdFactory,
     /// Main-process end of the tube on which `Command`s are sent and `Status` is received.
     command_tube: Tube,
 }

 impl SwapController {
     /// Launch a monitor process for vmm-swap and return a controller.
     ///
     /// Pages on the [GuestMemory] are registered to userfaultfd to track pagefault events.
     ///
     /// # Arguments
     ///
     /// * `guest_memory` - fresh new [GuestMemory]. Any pages on the [GuestMemory] must not be
     ///   touched.
     /// * `swap_dir` - directory to store swap files.
     pub fn launch(
         guest_memory: GuestMemory,
         swap_dir: &Path,
         jail_config: &Option<JailConfig>,
     ) -> anyhow::Result<Self> {
         info!("vmm-swap is enabled. launch monitor process.");

         let uffd_factory = UffdFactory::new();
         let uffd = uffd_factory.create().context("create userfaultfd")?;

         // The swap file is created as `O_TMPFILE` from the specified directory. As benefits:
         //
         // * it has no chance to conflict.
         // * it has a security benefit that no one (except root) can access the swap file.
         // * it will be automatically deleted by the kernel when crosvm exits/dies or on reboot if
         //   the device panics/hard-resets while crosvm is running.
         let swap_file = OpenOptions::new()
             .read(true)
             .write(true)
             .custom_flags(libc::O_TMPFILE | libc::O_EXCL)
             .mode(0o000) // other processes with the same uid can't open the file
             .open(swap_dir)?;
         // The internal tube in which [Command]s sent from other processes than the monitor process
         // to the monitor process. The response is `Status` only.
         let (command_tube_main, command_tube_monitor) =
             Tube::pair().context("create swap command tube")?;

         // Allocate eventfd before creating sandbox.
         let bg_job_control = BackgroundJobControl::new().context("create background job event")?;

         #[cfg(feature = "log_page_fault")]
         let page_fault_logger = PageFaultEventLogger::create(&swap_dir, &guest_memory)
             .context("create page fault logger")?;

         let mut keep_rds = vec![
             stdout().as_raw_descriptor(),
             stderr().as_raw_descriptor(),
             uffd.as_raw_descriptor(),
             swap_file.as_raw_descriptor(),
             command_tube_monitor.as_raw_descriptor(),
             bg_job_control.get_completion_event().as_raw_descriptor(),
             #[cfg(feature = "log_page_fault")]
             page_fault_logger.as_raw_descriptor(),
         ];

         syslog::push_descriptors(&mut keep_rds);
         cros_tracing::push_descriptors!(&mut keep_rds);
         keep_rds.extend(guest_memory.as_raw_descriptors());

         keep_rds.extend(uffd_factory.as_raw_descriptors());

         // Load and cache transparent hugepage size from sysfs before jumping into sandbox.
         Lazy::force(&THP_SIZE);

         let mut jail = if let Some(jail_config) = jail_config {
             let config = SandboxConfig::new(jail_config, "swap_monitor");
             create_sandbox_minijail(&jail_config.pivot_root, MAX_OPEN_FILES_DEFAULT, &config)
                 .context("create sandbox jail")?
         } else {
             create_base_minijail(Path::new("/"), MAX_OPEN_FILES_DEFAULT)
                 .context("create minijail")?
         };
         jail.set_rlimit(
             libc::RLIMIT_MEMLOCK as libc::c_int,
             MLOCK_BUDGET as u64,
             MLOCK_BUDGET as u64,
         )
         .context("error setting RLIMIT_MEMLOCK")?;

         // Start a page fault monitoring process (this will be the first child process of the
         // current process)
         let child_process =
             fork_process(jail, keep_rds, Some(String::from("swap monitor")), || {
                 if let Err(e) = monitor_process(
                     command_tube_monitor,
                     guest_memory,
                     uffd,
                     swap_file,
                     bg_job_control,
                     #[cfg(feature = "log_page_fault")]
                     page_fault_logger,
                 ) {
                     panic!("page_fault_handler_thread exited with error: {:?}", e)
                 }
             })
             .context("fork monitor process")?;

         // send first status request to the monitor process and wait for the response until setup on
         // the monitor process completes.
         command_tube_main.send(&Command::Status)?;
         match command_tube_main
             .recv::<Status>()
             .context("recv initial status")?
             .state
         {
             State::Ready => {
                 // The initial state of swap status is Ready and this is a signal that the
                 // monitoring process completes setup and is running.
             }
             status => {
                 bail!("initial state is not Ready, but {:?}", status);
             }
         };

         Ok(Self {
             child_process,
             uffd_factory,
             command_tube: command_tube_main,
         })
     }

     /// Enable monitoring page faults and move guest memory to staging memory.
     ///
     /// The pages will be swapped in from the staging memory to the guest memory on page faults
     /// until pages are written into the swap file by [Self::swap_out()].
     ///
     /// This waits until enabling vmm-swap finishes on the monitor process.
     ///
     /// The caller must guarantee that any contents on the guest memory is not updated during
     /// enabling vmm-swap.
     ///
     /// # Note
     ///
     /// Enabling does not write pages to the swap file. User should call [Self::swap_out()]
     /// after a suitable time.
     ///
     /// Just after enabling vmm-swap, some amount of pages are swapped in as soon as guest resumes.
     /// By splitting the enable/swap_out operation and by delaying write to the swap file operation,
     /// it has a benefit of reducing file I/O for hot pages.
     pub fn enable(&self) -> anyhow::Result<()> {
         self.command_tube
             .send(&Command::Enable)
             .context("send swap enable request")?;

         let _ = self
             .command_tube
             .recv::<Status>()
             .context("receive swap status")?;
         Ok(())
     }

     /// Trim pages in the staging memory which are needless to be written back to the swap file.
     ///
     /// * zero pages
     /// * pages which are the same as the pages in the swap file.
     pub fn trim(&self) -> anyhow::Result<()> {
         self.command_tube
             .send(&Command::Trim)
             .context("send swap trim request")?;
         Ok(())
     }

     /// Swap out all the pages in the staging memory to the swap files.
     ///
     /// This returns as soon as it succeeds to send request to the monitor process.
     ///
     /// Users should call [Self::enable()] before this. See the comment of [Self::enable()] as well.
     pub fn swap_out(&self) -> anyhow::Result<()> {
         self.command_tube
             .send(&Command::SwapOut)
             .context("send swap out request")?;
         Ok(())
     }

     /// Swap in all the guest memory and disable monitoring page faults.
     ///
     /// This returns as soon as it succeeds to send request to the monitor process.
     pub fn disable(&self) -> anyhow::Result<()> {
         self.command_tube
             .send(&Command::Disable)
             .context("send swap disable request")?;
         Ok(())
     }

     /// Return current swap status.
     ///
     /// This blocks until response from the monitor process arrives to the main process.
     pub fn status(&self) -> anyhow::Result<Status> {
         self.command_tube
             .send(&Command::Status)
             .context("send swap status request")?;
         let status = self.command_tube.recv().context("receive swap status")?;
         Ok(status)
     }

     /// Shutdown the monitor process.
     ///
     /// This blocks until the monitor process exits.
     ///
     /// This should be called once.
     pub fn exit(self) -> anyhow::Result<()> {
         self.command_tube
             .send(&Command::Exit)
             .context("send exit command")?;
         self.child_process
             .wait()
             .context("wait monitor process shutdown")?;
         Ok(())
     }

     /// Create a new userfaultfd and send it to the monitor process.
     ///
     /// This must be called as soon as a child process which may touch the guest memory is forked.
     ///
     /// Userfaultfd(2) originally has `UFFD_FEATURE_EVENT_FORK`. But it is not applicable to crosvm
     /// since it does not support non-root user namespace.
     pub fn on_process_forked(&self) -> anyhow::Result<()> {
         let uffd = self.uffd_factory.create().context("create userfaultfd")?;
         self.command_tube
             .send(&Command::ProcessForked(uffd.as_raw_descriptor()))
             .context("send forked event")?;
         // The fd for Userfaultfd in this process is droped when this method exits, but the
         // userfaultfd keeps alive in the monitor process which it is sent to.
         Ok(())
     }

     /// Suspend device processes using `SIGSTOP` signal.
     ///
     /// When the returned `ProcessesGuard` is dropped, the devices resume.
     ///
     /// This must be called from the main process.
     pub fn suspend_devices(&self) -> anyhow::Result<ProcessesGuard> {
         freeze_child_processes(self.child_process.pid)
     }
 }

 impl AsRawDescriptors for SwapController {
     /// Returns the descriptors of the userfaultfd factory followed by the descriptor of the
     /// command tube.
     fn as_raw_descriptors(&self) -> Vec<RawDescriptor> {
         self.uffd_factory
             .as_raw_descriptors()
             .into_iter()
             .chain(std::iter::once(self.command_tube.as_raw_descriptor()))
             .collect()
     }
 }

 /// Identifies the event sources the monitor process waits on with its `WaitContext`.
 #[derive(EventToken)]
 enum Token {
     /// A userfaultfd has events to read. The payload is the id of the userfaultfd, i.e. its index
     /// in `UffdList`.
     UffdEvents(u32),
     /// A `Command` arrived on the command tube.
     Command,
     /// The completion event of `BackgroundJobControl` was signaled.
     BackgroundJobCompleted,
 }

 /// The list of userfaultfds known to the monitor process.
 ///
 /// The index of a userfaultfd in the list is its id, which is the payload of
 /// `Token::UffdEvents`. The first entry is the userfaultfd of the main process.
 struct UffdList<'a> {
     /// All the userfaultfds, indexed by id.
     list: Vec<Userfaultfd>,
     /// The wait context to which userfaultfds registered later are added.
     wait_ctx: &'a WaitContext<Token>,
 }

 impl<'a> UffdList<'a> {
     /// The id of the userfaultfd of the main process.
     const ID_MAIN_UFFD: u32 = 0;

     /// Creates a list holding only `main_uffd`.
     ///
     /// `main_uffd` itself is not added to `wait_ctx` here.
     fn new(main_uffd: Userfaultfd, wait_ctx: &'a WaitContext<Token>) -> Self {
         let list = vec![main_uffd];
         Self { list, wait_ctx }
     }

     /// Adds `uffd` to the wait context and appends it to the list.
     ///
     /// The id assigned to the userfaultfd is its position in the list.
     fn register(&mut self, uffd: Userfaultfd) -> anyhow::Result<()> {
         let next_id = u32::try_from(self.list.len()).context("too many userfaultfd forked")?;
         let token = Token::UffdEvents(next_id);

         self.wait_ctx
             .add(&uffd, token)
             .context("add to wait context")?;
         self.list.push(uffd);

         Ok(())
     }

     /// Looks up the userfaultfd by its id.
     fn get(&self, id: u32) -> Option<&Userfaultfd> {
         let index = id as usize;
         self.list.get(index)
     }

     /// Returns the userfaultfd of the main process.
     fn main_uffd(&self) -> &Userfaultfd {
         let index = Self::ID_MAIN_UFFD as usize;
         &self.list[index]
     }

     /// Returns all the userfaultfds ordered by id.
     fn get_list(&self) -> &[Userfaultfd] {
         self.list.as_slice()
     }
 }

 /// Collects the host address range of every region of `guest_memory`.
 fn regions_from_guest_memory(guest_memory: &GuestMemory) -> Vec<Range<usize>> {
     let mut host_ranges = Vec::new();
     let walk_result = guest_memory.with_regions::<_, ()>(
         |MemoryRegionInformation {
              host_addr: start,
              size: len,
              ..
          }| {
             let end = start + len;
             host_ranges.push(start..end);
             Ok(())
         },
     );
     // the callback never return error.
     walk_result.unwrap();
     host_ranges
 }

 /// The main thread of the monitor process.
 ///
 /// While vmm-swap is disabled this loop:
 ///
 /// * drains events of the userfaultfds (only page remove events are tolerated),
 /// * takes ownership of userfaultfds sent by `Command::ProcessForked`,
 /// * answers `Command::Status` with the disabled status, and
 /// * on `Command::Enable`, registers the guest memory regions to all the userfaultfds and hands
 ///   control over to `handle_vmm_swap()` until vmm-swap is disabled again.
 ///
 /// Every `Command::Enable` is answered with a `Status`, even when enabling fails, because the
 /// sender blocks until the response arrives.
 ///
 /// Returns `Ok(())` on `Command::Exit` or when `handle_vmm_swap()` reports that the process
 /// should exit. Any error terminates the loop and is returned to the caller.
 fn monitor_process(
     command_tube: Tube,
     guest_memory: GuestMemory,
     uffd: Userfaultfd,
     swap_file: File,
     bg_job_control: BackgroundJobControl,
     #[cfg(feature = "log_page_fault")] mut page_fault_logger: PageFaultEventLogger,
 ) -> anyhow::Result<()> {
     info!("monitor_process started");

     let wait_ctx = WaitContext::build_with(&[
         (&command_tube, Token::Command),
         // Even though swap isn't enabled until the enable command is received, it's necessary to
         // start waiting on the main uffd here so that uffd fork events can be processed, because
         // child processes will block until their corresponding uffd fork event is read.
         (&uffd, Token::UffdEvents(UffdList::ID_MAIN_UFFD)),
         (
             bg_job_control.get_completion_event(),
             Token::BackgroundJobCompleted,
         ),
     ])
     .context("create wait context")?;

     let n_worker = num_cpus::get();
     info!("start {} workers for staging memory move", n_worker);
     // The worker threads are killed when the main thread of the monitor process dies.
     let worker = Worker::new(n_worker, n_worker);

     let mut uffd_list = UffdList::new(uffd, &wait_ctx);
     // The record of the latest state transition, reported in the status while disabled.
     let mut state_transition = StateTransition::default();

     loop {
         let events = wait_ctx.wait().context("wait poll events")?;

         for event in events.iter() {
             match event.token {
                 Token::UffdEvents(id_uffd) => {
                     let uffd = uffd_list
                         .get(id_uffd)
                         .with_context(|| format!("uffd is not found for idx: {}", id_uffd))?;
                     // Userfaultfd does not work as level triggered but as edge triggered. We need
                     // to read all the events in the userfaultfd here.
                     while let Some(event) = uffd.read_event().context("read userfaultfd event")? {
                         match event {
                             UffdEvent::Remove { .. } => {
                                 // BUG(b/272620051): This is a bug of userfaultfd that
                                 // UFFD_EVENT_REMOVE can be read even after unregistering memory
                                 // from the userfaultfd.
                                 warn!("page remove event while vmm-swap disabled");
                             }
                             event => {
                                 bail!("unexpected uffd event: {:?}", event);
                             }
                         }
                     }
                 }
                 Token::Command => match command_tube
                     .recv::<Command>()
                     .context("recv swap command")?
                 {
                     Command::ProcessForked(raw_descriptor) => {
                         debug!("new fork uffd: {:?}", raw_descriptor);
                         // Safe because the raw_descriptor is sent from another process via Tube and
                         // no one in this process owns it.
                         let uffd = unsafe { Userfaultfd::from_raw_descriptor(raw_descriptor) };
                         uffd_list.register(uffd).context("register forked uffd")?;
                     }
                     Command::Enable => {
                         info!("enabling vmm-swap");

                         let staging_shmem =
                             SharedMemory::new("swap staging memory", guest_memory.memory_size())
                                 .context("create staging shmem")?;

                         let regions = regions_from_guest_memory(&guest_memory);

                         let page_handler = match PageHandler::create(
                             &swap_file,
                             &staging_shmem,
                             &regions,
                             worker.channel.clone(),
                         ) {
                             Ok(page_handler) => page_handler,
                             Err(e) => {
                                 error!("failed to create swap handler: {:?}", e);
                                 // `SwapController::enable()` blocks until a `Status` arrives.
                                 // Reply on this failure path as well, otherwise the sender would
                                 // wait forever. vmm-swap stays disabled.
                                 command_tube
                                     .send(&Status::disabled(&state_transition))
                                     .context("send enable failure response")?;
                                 continue;
                             }
                         };

                         // TODO(b/272634283): Should just disable vmm-swap without crash.
                         // Safe because the regions are from guest memory and uffd_list contains all
                         // the processes of crosvm.
                         unsafe { register_regions(&regions, uffd_list.get_list()) }
                             .context("register regions")?;

                         // events may contain unprocessed entries, but those pending events will be
                         // immediately re-created when handle_vmm_swap checks wait_ctx because
                         // WaitContext is level triggered.
                         drop(events);

                         // Shared with the threads spawned in the scope below.
                         let mutex_transition = Mutex::new(state_transition);

                         bg_job_control
                             .reset()
                             .context("reset background job event")?;
                         let exit = std::thread::scope(|scope| {
                             let exit = handle_vmm_swap(
                                 scope,
                                 &wait_ctx,
                                 &page_handler,
                                 &uffd_list,
                                 &guest_memory,
                                 &command_tube,
                                 &worker,
                                 &mutex_transition,
                                 &bg_job_control,
                                 #[cfg(feature = "log_page_fault")]
                                 &mut page_fault_logger,
                             );
                             // Abort background jobs to unblock ScopedJoinHandle earlier on a
                             // failure.
                             bg_job_control.abort();
                             exit
                         })?;
                         if exit {
                             return Ok(());
                         }
                         state_transition = mutex_transition.into_inner();

                         unregister_regions(&regions, uffd_list.get_list())
                             .context("unregister regions")?;

                         // Truncate the swap file to hold minimum resources while disabled.
                         if let Err(e) = swap_file.set_len(0) {
                             error!("failed to clear swap file: {:?}", e);
                         };

                         info!("vmm-swap is disabled");
                         // events are obsolete. Run `WaitContext::wait()` again
                         break;
                     }
                     Command::Trim => {
                         warn!("swap trim while disabled");
                     }
                     Command::SwapOut => {
                         warn!("swap out while disabled");
                     }
                     Command::Disable => {
                         warn!("swap is already disabled");
                     }
                     Command::Exit => {
                         return Ok(());
                     }
                     Command::Status => {
                         let status = Status::disabled(&state_transition);
                         command_tube.send(&status).context("send status response")?;
                         info!("swap status: {:?}", status);
                     }
                 },
                 Token::BackgroundJobCompleted => {
                     error!("unexpected background job completed event while swap is disabled");
                     bg_job_control
                         .reset()
                         .context("reset background job event")?;
                 }
             };
         }
     }
 }

 /// Internal state of the monitor process while vmm-swap is enabled.
 ///
 /// `'scope` is the lifetime of the thread scope in which background job threads are spawned.
 enum SwapState<'scope> {
     /// Guest memory was moved to the staging memory. Nothing is written to the swap file yet.
     SwapOutPending,
     /// A trim job is running on the background thread of the join handle.
     Trim(ScopedJoinHandle<'scope, anyhow::Result<()>>),
     /// Swap out is in progress. `started_time` is used to compute the elapsed time of the swap
     /// out.
     SwapOutInProgress { started_time: Instant },
     /// Swap out finished.
     SwapOutCompleted,
     /// A swap-in job is running on the background thread of the join handle.
     SwapInInProgress(ScopedJoinHandle<'scope, anyhow::Result<()>>),
     /// Moving memory to staging or swap out failed.
     Failed,
 }

 fn handle_enable_command<'scope>(
     state: SwapState,
     bg_job_control: &BackgroundJobControl,
     page_handler: &PageHandler,
     guest_memory: &GuestMemory,
     worker: &Worker<MoveToStaging>,
     state_transition: &Mutex<StateTransition>,
 ) -> anyhow::Result<SwapState<'scope>> {
     match state {
         SwapState::SwapInInProgress(join_handle) => {
             info!("abort swap-in");
             abort_background_job(join_handle, bg_job_control).context("abort swap-in")?;
         }
         SwapState::Trim(join_handle) => {
             info!("abort trim");
             abort_background_job(join_handle, bg_job_control).context("abort trim")?;
         }
         _ => {}
     }

     info!("start moving memory to staging");
     match move_guest_to_staging(page_handler, guest_memory, worker) {
         Ok(new_state_transition) => {
             info!(
                 "move {} pages to staging in {} ms",
                 new_state_transition.pages, new_state_transition.time_ms
             );
             *state_transition.lock() = new_state_transition;
             Ok(SwapState::SwapOutPending)
         }
         Err(e) => {
             error!("failed to move memory to staging: {}", e);
             *state_transition.lock() = StateTransition::default();
             Ok(SwapState::Failed)
         }
     }
 }

 fn move_guest_to_staging(
     page_handler: &PageHandler,
     guest_memory: &GuestMemory,
     worker: &Worker<MoveToStaging>,
 ) -> anyhow::Result<StateTransition> {
     let start_time = std::time::Instant::now();

     let mut pages = 0;

     let result = guest_memory.with_regions::<_, anyhow::Error>(
         |MemoryRegionInformation {
              host_addr,
              shm,
              shm_offset,
              ..
          }| {
             // safe because:
             // * all the regions are registered to all userfaultfd
             // * no process access the guest memory
             // * page fault events are handled by PageHandler
             // * wait for all the copy completed within _processes_guard
             pages += unsafe { page_handler.move_to_staging(host_addr, shm, shm_offset) }
                 .context("move to staging")?;
             Ok(())
         },
     );
     worker.channel.wait_complete();

     match result {
         Ok(()) => {
             if page_handler.compute_resident_pages() > 0 {
                 error!(
                     "active page is not zero just after swap out but {} pages",
                     page_handler.compute_resident_pages()
                 );
             }
             let time_ms = start_time.elapsed().as_millis();
             Ok(StateTransition { pages, time_ms })
         }
         Err(e) => Err(e),
     }
 }

 /// Aborts the running background job and returns its result.
 ///
 /// This blocks until the background thread finishes, then resets `bg_job_control`. A failure to
 /// reset takes precedence over the result of the job.
 ///
 /// # Panics
 ///
 /// Panics if the background thread panicked.
 fn abort_background_job<T>(
     join_handle: ScopedJoinHandle<'_, anyhow::Result<T>>,
     bg_job_control: &BackgroundJobControl,
 ) -> anyhow::Result<T> {
     bg_job_control.abort();
     // Wait until the background job is aborted and the thread finishes.
     let job_outcome = join_handle
         .join()
         .expect("panic on the background job thread");
     bg_job_control
         .reset()
         .context("reset swap in event")
         .and_then(|_| job_outcome.context("failure on background job thread"))
 }

 /// Runs the event loop that serves userfaultfd events and swap commands while vmm-swap is
 /// enabled.
 ///
 /// The guest memory is first moved to the staging memory with `move_guest_to_staging()` and a
 /// `Status::dummy()` is sent on `command_tube` as the "enable finished" signal, regardless of
 /// whether the move succeeded. After that the function loops over the events reported by
 /// `wait_ctx` and dispatches them by token:
 ///
 /// * `Token::UffdEvents`: drains the userfaultfd and handles page fault / remove events.
 /// * `Token::Command`: handles a `Command` received from `command_tube`.
 /// * `Token::BackgroundJobCompleted`: joins the finished trim or swap-in thread.
 ///
 /// Trim and swap-in jobs are spawned on `scope` and report their progress through
 /// `state_transition`. Swap-out runs inline on this thread in chunks of `MAX_SWAP_CHUNK_SIZE`,
 /// only while no other event is pending.
 ///
 /// # Returns
 ///
 /// * `Ok(true)` after `Command::Exit` is handled and all pages are swapped in.
 /// * `Ok(false)` once the background swap-in started by `Command::Disable` has completed.
 ///
 /// NOTE(review): presumably the boolean tells the caller whether the monitor should exit; confirm
 /// against the caller.
 ///
 /// # Errors
 ///
 /// Returns an error if waiting on `wait_ctx` fails, a tube operation fails, a userfaultfd event
 /// can't be read or handled, an unsupported `UffdEvent` is received, a process is forked while
 /// swap is enabled, or a background job fails.
 fn handle_vmm_swap<'scope, 'env>(
     scope: &'scope Scope<'scope, 'env>,
     wait_ctx: &WaitContext<Token>,
     page_handler: &'env PageHandler<'env>,
     uffd_list: &'env UffdList,
     guest_memory: &GuestMemory,
     command_tube: &Tube,
     worker: &Worker<MoveToStaging>,
     state_transition: &'env Mutex<StateTransition>,
     bg_job_control: &'env BackgroundJobControl,
     #[cfg(feature = "log_page_fault")] page_fault_logger: &mut PageFaultEventLogger,
 ) -> anyhow::Result<bool> {
     let mut state = match move_guest_to_staging(page_handler, guest_memory, worker) {
         Ok(transition) => {
             info!(
                 "move {} pages to staging in {} ms",
                 transition.pages, transition.time_ms
             );
             *state_transition.lock() = transition;
             SwapState::SwapOutPending
         }
         Err(e) => {
             // A failure here is not fatal for the loop: it is recorded as `SwapState::Failed`
             // and the event loop below still runs.
             error!("failed to move memory to staging: {}", e);
             *state_transition.lock() = StateTransition::default();
             SwapState::Failed
         }
     };
     // The finish signal is sent in both the success and the failure case.
     command_tube
         .send(&Status::dummy())
         .context("send enable finish signal")?;

     loop {
         let events = match &state {
             SwapState::SwapOutInProgress { started_time } => {
                 // Poll without blocking so that swap-out can proceed while nothing is pending.
                 let events = wait_ctx
                     .wait_timeout(Duration::ZERO)
                     .context("wait poll events")?;

                 // TODO(b/273129441): swap out on a background thread.
                 // Proceed swap out only when there is no page fault (or other) events.
                 if events.is_empty() {
                     match page_handler.swap_out(MAX_SWAP_CHUNK_SIZE) {
                         Ok(num_pages) => {
                             let mut state_transition = state_transition.lock();
                             state_transition.pages += num_pages;
                             state_transition.time_ms = started_time.elapsed().as_millis();
                             // A chunk of zero pages is treated as the end of swap-out.
                             if num_pages == 0 {
                                 info!(
                                     "swap out all {} pages to file in {} ms",
                                     state_transition.pages, state_transition.time_ms
                                 );
                                 state = SwapState::SwapOutCompleted;
                             }
                         }
                         Err(e) => {
                             error!("failed to swap out: {:?}", e);
                             state = SwapState::Failed;
                             *state_transition.lock() = StateTransition::default();
                         }
                     }
                     continue;
                 }

                 events
             }
             // In any other state there is nothing to do inline; block until an event arrives.
             _ => wait_ctx.wait().context("wait poll events")?,
         };

         for event in events.iter() {
             match event.token {
                 Token::UffdEvents(id_uffd) => {
                     let uffd = uffd_list
                         .get(id_uffd)
                         .with_context(|| format!("uffd is not found for idx: {}", id_uffd))?;
                     // Userfaultfd does not work as level triggered but as edge triggered. We need
                     // to read all the events in the userfaultfd here.
                     // TODO(kawasin): Use [userfaultfd::Uffd::read_events()] for performance.
                     while let Some(event) = uffd.read_event().context("read userfaultfd event")? {
                         match event {
                             UffdEvent::Pagefault { addr, .. } => {
                                 #[cfg(feature = "log_page_fault")]
                                 page_fault_logger.log_page_fault(addr as usize, id_uffd);
                                 page_handler
                                     .handle_page_fault(uffd, addr as usize)
                                     .context("handle fault")?;
                             }
                             UffdEvent::Remove { start, end } => {
                                 page_handler
                                     .handle_page_remove(start as usize, end as usize)
                                     .context("handle remove")?;
                             }
                             event => {
                                 bail!("unsupported UffdEvent: {:?}", event);
                             }
                         }
                     }
                 }
                 Token::Command => match command_tube
                     .recv::<Command>()
                     .context("recv swap command")?
                 {
                     Command::ProcessForked(raw_descriptor) => {
                         debug!("new fork uffd: {:?}", raw_descriptor);
                         // TODO(b/266898615): The forked processes must wait running until the
                         // regions are registered to the new uffd if vmm-swap is already enabled.
                         // There are currently no use cases for swap + hotplug, so this is currently
                         // not implemented.
                         bail!("child process is forked while swap is enabled");
                     }
                     Command::Enable => {
                         let result = handle_enable_command(
                             state,
                             bg_job_control,
                             page_handler,
                             guest_memory,
                             worker,
                             state_transition,
                         );
                         // Send the finish signal before propagating a failure of the enable
                         // command, so the signal is sent in both cases.
                         command_tube
                             .send(&Status::dummy())
                             .context("send enable finish signal")?;
                         state = result?;
                     }
                     Command::Trim => match &state {
                         SwapState::SwapOutPending => {
                             *state_transition.lock() = StateTransition::default();
                             let join_handle = scope.spawn(|| {
                                 let mut ctx = page_handler.start_trim();
                                 let job = bg_job_control.new_job();
                                 let start_time = Instant::now();

                                 while !job.is_aborted() {
                                     if let Some(trimmed_pages) =
                                         ctx.trim_pages(MAX_TRIM_PAGES).context("trim pages")?
                                     {
                                         let mut state_transition = state_transition.lock();
                                         state_transition.pages += trimmed_pages;
                                         state_transition.time_ms = start_time.elapsed().as_millis();
                                     } else {
                                         // Traversed all pages.
                                         break;
                                     }
                                 }

                                 if job.is_aborted() {
                                     info!("trim is aborted");
                                 } else {
                                     info!(
                                         "trimmed {} clean pages and {} zero pages",
                                         ctx.trimmed_clean_pages(),
                                         ctx.trimmed_zero_pages()
                                     );
                                 }
                                 Ok(())
                             });

                             state = SwapState::Trim(join_handle);
                             info!("start trimming staging memory");
                         }
                         state => {
                             warn!("swap trim is not ready. state: {:?}", State::from(state));
                         }
                     },
                     Command::SwapOut => match &state {
                         SwapState::SwapOutPending => {
                             state = SwapState::SwapOutInProgress {
                                 started_time: Instant::now(),
                             };
                             *state_transition.lock() = StateTransition::default();
                             info!("start swapping out");
                         }
                         state => {
                             warn!("swap out is not ready. state: {:?}", State::from(state));
                         }
                     },
                     Command::Disable => {
                         match state {
                             SwapState::Trim(join_handle) => {
                                 info!("abort trim");
                                 abort_background_job(join_handle, bg_job_control)
                                     .context("abort trim")?;
                             }
                             SwapState::SwapOutInProgress { .. } => {
                                 info!("swap out is aborted");
                             }
                             SwapState::SwapInInProgress(_) => {
                                 // A swap-in job is already running; don't spawn another one.
                                 info!("swap in is in progress");
                                 continue;
                             }
                             _ => {}
                         }
                         *state_transition.lock() = StateTransition::default();

                         let join_handle = scope.spawn(|| {
                             let mut ctx = page_handler.start_swap_in();
                             let uffd = uffd_list.main_uffd();
                             let job = bg_job_control.new_job();
                             let start_time = Instant::now();
                             while !job.is_aborted() {
                                 match ctx.swap_in(uffd, MAX_SWAP_CHUNK_SIZE) {
                                     Ok(num_pages) => {
                                         // A chunk of zero pages is treated as the end of swap-in.
                                         if num_pages == 0 {
                                             break;
                                         }
                                         let mut state_transition = state_transition.lock();
                                         state_transition.pages += num_pages;
                                         state_transition.time_ms = start_time.elapsed().as_millis();
                                     }
                                     Err(e) => {
                                         bail!("failed to swap in: {:?}", e);
                                     }
                                 }
                             }
                             if job.is_aborted() {
                                 info!("swap in is aborted");
                             }
                             Ok(())
                         });
                         state = SwapState::SwapInInProgress(join_handle);

                         info!("start swapping in");
                     }
                     Command::Exit => {
                         match state {
                             SwapState::SwapInInProgress(join_handle) => {
                                 // Wait until swap-in finishes.
                                 match join_handle.join() {
                                     // Propagate a failure of the swap-in job itself, as the
                                     // `Token::BackgroundJobCompleted` path does, instead of
                                     // reporting a failed swap-in as a successful exit.
                                     Ok(result) => result.context("swap in finish")?,
                                     Err(e) => {
                                         bail!("failed to join swap in thread: {:?}", e);
                                     }
                                 }
                                 return Ok(true);
                             }
                             SwapState::Trim(join_handle) => {
                                 abort_background_job(join_handle, bg_job_control)
                                     .context("abort trim")?;
                             }
                             _ => {}
                         }
                         let mut ctx = page_handler.start_swap_in();
                         let uffd = uffd_list.main_uffd();
                         // Swap-in all before exit.
                         while ctx.swap_in(uffd, MAX_SWAP_CHUNK_SIZE).context("swap in")? > 0 {}
                         return Ok(true);
                     }
                     Command::Status => {
                         let status = Status::new(&state, *state_transition.lock(), page_handler);
                         command_tube.send(&status).context("send status response")?;
                         info!("swap status: {:?}", status);
                     }
                 },
                 Token::BackgroundJobCompleted => {
                     // Reset the completed event.
                     if !bg_job_control
                         .reset()
                         .context("reset background job event")?
                     {
                         // When the job is aborted and the event is consumed by reset(), the token
                         // `Token::BackgroundJobCompleted` may remain in the `events`. Just ignore
                         // the obsolete token here.
                         continue;
                     }
                     match state {
                         SwapState::SwapInInProgress(join_handle) => {
                             join_handle
                                 .join()
                                 .expect("panic on the background job thread")
                                 .context("swap in finish")?;
                             let state_transition = state_transition.lock();
                             info!(
                                 "swap in all {} pages in {} ms.",
                                 state_transition.pages, state_transition.time_ms
                             );
                             return Ok(false);
                         }
                         SwapState::Trim(join_handle) => {
                             join_handle
                                 .join()
                                 .expect("panic on the background job thread")
                                 .context("trim finish")?;
                             let state_transition = state_transition.lock();
                             info!(
                                 "trimmed {} pages in {} ms.",
                                 state_transition.pages, state_transition.time_ms
                             );
                             // Trimming is done; swap-out can be requested again.
                             state = SwapState::SwapOutPending;
                         }
                         state => {
                             bail!(
                                 "background job completed but the actual state is {:?}",
                                 State::from(&state)
                             );
                         }
                     }
                 }
             };
         }
     }
 }