blob: 0c6710affbce98b86ffd3b1ae5b06aacd0169701 [file] [log] [blame]
// Copyright 2022 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//! PageHandler manages the page states of multiple regions.
#![deny(missing_docs)]
use std::ops::Range;
use std::path::Path;
use base::error;
use base::unix::FileDataIterator;
use base::AsRawDescriptor;
use data_model::VolatileSlice;
use thiserror::Error as ThisError;
use crate::file::Error as FileError;
use crate::file::SwapFile;
use crate::pagesize::addr_to_page_idx;
use crate::pagesize::bytes_to_pages;
use crate::pagesize::is_page_aligned;
use crate::pagesize::page_base_addr;
use crate::pagesize::page_idx_to_addr;
use crate::pagesize::pages_to_bytes;
use crate::userfaultfd::UffdError;
use crate::userfaultfd::Userfaultfd;
/// Result for PageHandler. The error type is fixed to [Error].
pub type Result<T> = std::result::Result<T, Error>;
/// Errors for PageHandler
#[derive(ThisError, Debug)]
pub enum Error {
    /// the address is invalid: it is not page aligned where alignment is required, it does not
    /// belong to any registered region, or it is not the head address of a region.
    #[error("the address is invalid {0:#018X}")]
    InvalidAddress(usize),
    /// the region being registered (first range) overlaps an already registered region (second
    /// range).
    #[error("the regions {0:?} and {1:?} overlap")]
    RegionOverlap(Range<usize>, Range<usize>),
    /// file operation failed
    #[error("file operation failed : {0:?}")]
    File(FileError),
    /// userfaultfd operation failed
    #[error("userfaultfd failed : {0:?}")]
    Userfaultfd(UffdError),
}
impl From<UffdError> for Error {
fn from(e: UffdError) -> Self {
Self::Userfaultfd(e)
}
}
impl From<FileError> for Error {
fn from(e: FileError) -> Self {
Self::File(e)
}
}
/// [Region] represents a memory region and corresponding [SwapFile].
///
/// It also keeps per-region page counters which are updated on page faults and reset on every
/// swap out.
struct Region {
    /// the head page index of the region.
    head_page_idx: usize,
    /// the swap file holding the swapped out page contents of the region.
    file: SwapFile,
    /// the amount of pages which were filled with the content of the swap file on page faults.
    copied_pages: usize,
    /// the amount of pages which were filled with zero on page faults.
    zeroed_pages: usize,
    /// the amount of pages which were already initialized on page faults.
    redundant_pages: usize,
    /// whether the region has been swapped out at least once. It starts as `false` and is set to
    /// `true` at the end of swap out.
    swap_active: bool,
}
/// PageHandler manages the page states of multiple regions.
///
/// Handles multiple events derived from userfaultfd and swap out requests.
/// All the addresses and sizes in bytes are converted to page id internally.
pub struct PageHandler {
    /// the registered regions. Registering a region which overlaps an existing one is rejected.
    regions: Vec<Region>,
}
impl PageHandler {
/// Creates [PageHandler] for the given regions.
///
/// The regions are registered one by one in the given order. The first failure aborts the
/// creation and is returned as the error.
///
/// # Arguments
///
/// * `swap_dir` - path to the directory to create a swap file from.
/// * `regions` - the list of the region. the start address must align with page. the size must
///   be multiple of pagesize.
pub fn create(swap_dir: &Path, regions: &[Range<usize>]) -> Result<Self> {
    let mut handler = Self {
        regions: Vec::new(),
    };
    regions
        .iter()
        .try_for_each(|address_range| handler.add_region(swap_dir, address_range))?;
    Ok(handler)
}
/// Returns the position in `self.regions` of the region containing `page_idx`, or `None` if no
/// registered region contains the page.
fn find_region_position(&self, page_idx: usize) -> Option<usize> {
    // A linear scan is fast enough because there are only a few regions (usually only 1).
    for (position, region) in self.regions.iter().enumerate() {
        let first_page_idx = region.head_page_idx;
        if page_idx < first_page_idx {
            continue;
        }
        if page_idx < first_page_idx + region.file.num_pages() {
            return Some(position);
        }
    }
    None
}
/// Returns the mutable reference to the region containing `page_idx`, or `None` if no
/// registered region contains the page.
fn find_region(&mut self, page_idx: usize) -> Option<&mut Region> {
    let position = self.find_region_position(page_idx)?;
    Some(&mut self.regions[position])
}
/// Create a new internal context to handle userfaultfd events and swap in/out request.
///
/// If the regions overlaps an existing region, it returns [Error::RegionOverlap].
///
/// # Arguments
///
/// * `swap_dir` - path to the directory to create a swap file from.
/// * `address_range` - the range of the region. the start address must align with page. the
/// size must be multiple of pagesize.
fn add_region(&mut self, swap_dir: &Path, address_range: &Range<usize>) -> Result<()> {
let head_page_idx = addr_to_page_idx(address_range.start);
let region_size = address_range.end - address_range.start;
let num_of_pages = bytes_to_pages(region_size);
// find an overlaping region
match self.regions.iter().position(|region| {
if region.head_page_idx < head_page_idx {
region.head_page_idx + region.file.num_pages() > head_page_idx
} else {
region.head_page_idx < head_page_idx + num_of_pages
}
}) {
Some(i) => {
let region = &self.regions[i];
Err(Error::RegionOverlap(
address_range.clone(),
page_idx_to_addr(region.head_page_idx)
..(page_idx_to_addr(region.head_page_idx + region.file.num_pages())),
))
}
None => {
let base_addr = address_range.start;
assert!(is_page_aligned(base_addr));
assert!(is_page_aligned(region_size));
let file = SwapFile::new(swap_dir, num_of_pages)?;
self.regions.push(Region {
head_page_idx,
file,
copied_pages: 0,
zeroed_pages: 0,
redundant_pages: 0,
swap_active: false,
});
Ok(())
}
}
}
/// Copies the whole `data_slice` to the memory starting at `page_addr` through `uffd`.
///
/// When the copy is reported as partially done, the destination address and the source slice
/// are advanced by the copied size and the copy is retried until it either completes or fails
/// with another error.
fn copy_all(
    uffd: &Userfaultfd,
    mut page_addr: usize,
    mut data_slice: VolatileSlice,
    wake: bool,
) -> std::result::Result<(), UffdError> {
    loop {
        match uffd.copy(page_addr, data_slice.size(), data_slice.as_ptr(), wake) {
            Ok(_) => return Ok(()),
            Err(UffdError::PartiallyCopied(copied)) => {
                // continue with the rest which has not been copied yet.
                page_addr += copied;
                data_slice.advance(copied);
            }
            // Even EEXIST for copy operation should be an error for page fault handling. If
            // the page was swapped in before, the page should be cleared from the swap file
            // and do `Userfaultfd::zero()` instead.
            Err(e) => return Err(e),
        }
    }
}
/// Resolves a page fault on a registered region.
///
/// The faulted page is filled with the content in the swap file if the page is swapped out,
/// with zero otherwise. Returns [Error::InvalidAddress] if the address does not belong to any
/// registered region.
///
/// # Arguments
///
/// * `uffd` - the reference to the [Userfaultfd] for the faulting process.
/// * `address` - the address that triggered the page fault.
pub fn handle_page_fault(&mut self, uffd: &Userfaultfd, address: usize) -> Result<()> {
    let page_idx = addr_to_page_idx(address);
    // the head address of the faulted page.
    let page_addr = page_base_addr(address);
    let page_size = pages_to_bytes(1);
    let region = self
        .find_region(page_idx)
        .ok_or(Error::InvalidAddress(address))?;
    let idx_in_region = page_idx - region.head_page_idx;

    let swapped_content = region.file.page_content(idx_in_region)?;
    if let Some(page_slice) = swapped_content {
        // The page is in the swap file. Copy it back and drop it from the swap file.
        Self::copy_all(uffd, page_addr, page_slice, true)?;
        region.file.clear(idx_in_region)?;
        region.copied_pages += 1;
        return Ok(());
    }

    // Map a zero page since the swap file has no content for the page but the fault happened.
    // safe because the fault page is notified by uffd.
    match uffd.zero(page_addr, page_size, true) {
        Ok(_) => {
            region.zeroed_pages += 1;
        }
        Err(UffdError::ZeropageFailed(errno)) if errno as i32 == libc::EEXIST => {
            // zeroing fails with EEXIST if the page is already filled. This case can
            // happen if page faults on the same page happen on different processes.
            uffd.wake(page_addr, page_size)?;
            region.redundant_pages += 1;
        }
        Err(e) => return Err(e.into()),
    }
    Ok(())
}
/// Clear the internal state for the pages.
///
/// When pages are removed by madvise with `MADV_DONTNEED` or `MADV_REMOVE`, userfaultfd
/// notifies the event as `UFFD_EVENT_REMOVE`. This handles the remove event.
///
/// In crosvm, balloon frees the guest memory and cause `UFFD_EVENT_REMOVE`.
///
/// # Arguments
///
/// * `start_addr` - the head address of the memory area to be freed.
/// * `end_addr` - the end address of the memory area to be freed. `UFFD_EVENT_REMOVE` tells the
/// head address of the next memory area of the freed area. (i.e. the exact tail address of
/// the memory area is `end_addr - 1`.)
pub fn handle_page_remove(&mut self, start_addr: usize, end_addr: usize) -> Result<()> {
if !is_page_aligned(start_addr) {
return Err(Error::InvalidAddress(start_addr));
} else if !is_page_aligned(end_addr) {
return Err(Error::InvalidAddress(end_addr));
}
let start_page_idx = addr_to_page_idx(start_addr);
let last_page_idx = addr_to_page_idx(end_addr);
for page_idx in start_page_idx..(last_page_idx) {
let page_addr = page_idx_to_addr(page_idx);
let region = self
.find_region(page_idx)
.ok_or(Error::InvalidAddress(page_addr))?;
if let Err(e) = region.file.clear(page_idx - region.head_page_idx) {
error!("failed to clear removed page: {:?}", e);
}
}
Ok(())
}
/// Write active pages in the memory region to the swap file.
///
/// It only writes active contents in the guest memory to the swap file and skips empty
/// pages (e.g. pages not touched, freed by balloon) using `lseek(2)` + `SEEK_HOLE/DATA`.
///
/// The memory must be protected not to be updated during swapped out.
///
/// After the contents are written, the whole region is released with `madvise(2)` +
/// `MADV_REMOVE`. A failure of that call is logged but not returned as an error.
///
/// Returns the count of swapped out pages. Returns [Error::InvalidAddress] if `base_addr` is
/// not the head address of a registered region.
///
/// # Arguments
///
/// * `base_addr` - the head address of the memory region to swap out.
/// * `memfd` - the file descriptor of the memfd backing the guest memory region.
/// * `base_offset` - the offset of the memory region in the memfd.
///
/// # Panics
///
/// Panics if an offset or a size of a data range reported for the memfd is not page aligned.
///
/// # Safety
///
/// The region must have been registered to all userfaultfd of processes which may touch the
/// region.
///
/// The page fault events for the region from the userfaultfd must be handled by
/// [PageHandler::handle_page_fault].
#[deny(unsafe_op_in_unsafe_fn)]
pub unsafe fn swap_out<T>(
    &mut self,
    base_addr: usize,
    memfd: &T,
    base_offset: u64,
) -> Result<usize>
where
    T: AsRawDescriptor,
{
    let head_page_idx = addr_to_page_idx(base_addr);
    // use find_region_position instead of find_region() due to borrow checker.
    let region_position = self
        .find_region_position(head_page_idx)
        .ok_or(Error::InvalidAddress(base_addr))?;
    // the address must point to the head of the region, not to the middle of it.
    if self.regions[region_position].head_page_idx != head_page_idx {
        return Err(Error::InvalidAddress(base_addr));
    }
    let region_size = pages_to_bytes(self.regions[region_position].file.num_pages());
    let file_data = FileDataIterator::new(memfd, base_offset, region_size as u64);

    let mut swapped_size = 0;
    for data_range in file_data {
        // assert offset is page aligned
        let offset = (data_range.start - base_offset) as usize;
        assert!(is_page_aligned(offset));
        let addr = base_addr + offset;
        let page_idx = addr_to_page_idx(addr);
        let size = (data_range.end - data_range.start) as usize;
        assert!(is_page_aligned(size));
        // safe because the page is within the range of the guest memory.
        let mem_slice = unsafe { std::slice::from_raw_parts(addr as *const u8, size) };
        self.regions[region_position]
            .file
            .write_to_file(page_idx - head_page_idx, mem_slice)?;
        swapped_size += size;
        // TODO(kawasin): periodically MADV_REMOVE the guest memory. if the pages are in zram,
        // it increases the RAM usage during swap_out.
        // TODO(kawasin): free the page cache of the swap file. or direct I/O.
    }

    // safe because the region is already backed by the file and the content will be swapped in
    // on a page fault.
    let madvise_result = unsafe {
        libc::madvise(
            base_addr as *mut libc::c_void,
            region_size,
            libc::MADV_REMOVE,
        )
    };
    if madvise_result != 0 {
        // Do not ignore the failure silently: the memory was not released in this case. The
        // OS error is read right after the failed call so that it is not overwritten.
        error!(
            "failed to MADV_REMOVE the swapped out region: {:?}",
            std::io::Error::last_os_error()
        );
    }

    let swapped_pages = bytes_to_pages(swapped_size);
    let region = &mut self.regions[region_position];
    // Suppress error log on the first swap_out, since page counts are not initialized but
    // zero.
    if region.swap_active && swapped_pages != (region.copied_pages + region.zeroed_pages) {
        error!(
            "swapped pages ({}) does not match with resident pages (copied: {}, zeroed: {}).",
            swapped_pages, region.copied_pages, region.zeroed_pages
        );
    }
    // restart counting the page faults from this swap out.
    region.copied_pages = 0;
    region.zeroed_pages = 0;
    region.redundant_pages = 0;
    region.swap_active = true;
    Ok(swapped_pages)
}
/// Swap in all the content.
///
/// Copies every page present in the swap files of all the regions back to its address through
/// `uffd`, with `wake` disabled. The handler is consumed. The first copy failure aborts the
/// operation and is returned as the error.
///
/// Returns the count of pages copied back from the swap files.
///
/// # Arguments
///
/// * `uffd` - the main [Userfaultfd].
pub fn swap_in(self, uffd: &Userfaultfd) -> Result<usize> {
    let mut restored_bytes = 0;
    for region in &self.regions {
        let region_head = region.head_page_idx;
        for pages in region.file.all_present_pages() {
            let content_size = pages.content.size();
            let dest_addr = page_idx_to_addr(region_head + pages.base_idx);
            Self::copy_all(uffd, dest_addr, pages.content, false)?;
            restored_bytes += content_size;
        }
    }
    Ok(bytes_to_pages(restored_bytes))
}
/// Returns count of pages active on the memory.
///
/// This is the total of the pages copied from the swap files and the pages filled with zero
/// over all the regions.
pub fn compute_resident_pages(&self) -> usize {
    let mut resident_pages = 0;
    for region in &self.regions {
        resident_pages += region.copied_pages + region.zeroed_pages;
    }
    resident_pages
}
/// Returns count of pages copied from vmm-swap file on the memory, summed over all the regions.
pub fn compute_copied_pages(&self) -> usize {
    self.regions
        .iter()
        .fold(0, |total, region| total + region.copied_pages)
}
/// Returns count of pages initialized with zero, summed over all the regions.
pub fn compute_zeroed_pages(&self) -> usize {
    let mut zeroed_pages = 0;
    for region in &self.regions {
        zeroed_pages += region.zeroed_pages;
    }
    zeroed_pages
}
/// Returns count of pages which were already initialized on page faults, summed over all the
/// regions.
pub fn compute_redundant_pages(&self) -> usize {
    self.regions
        .iter()
        .fold(0, |total, region| total + region.redundant_pages)
}
/// Returns count of pages present in the swap files.
///
/// The sizes in bytes of all the present contents are summed over all the regions first and
/// the total is converted to a page count at the end.
pub fn compute_swap_pages(&self) -> usize {
    let present_bytes: usize = self
        .regions
        .iter()
        .flat_map(|region| region.file.all_present_pages())
        .map(|pages| pages.content.size())
        .sum();
    bytes_to_pages(present_bytes)
}
}