// Copyright 2021, The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Functions for running instances of `crosvm`.
use crate::aidl;
use crate::atom::{get_num_cpus, write_vm_exited_stats_sync};
use crate::composite;
use crate::debug_config::DebugConfig;
use crate::virtualmachine::{self, Cid, VirtualMachineCallbacks};
use anyhow::{anyhow, bail, Context, Error, Result};
use avflog::LogResult;
use binder::{DeathRecipient, ParcelFileDescriptor, Strong};
use command_fds::CommandFdExt;
use libc::{sysconf, _SC_CLK_TCK};
use log::{debug, error, info, warn};
use nix::{
errno::Errno,
fcntl::OFlag,
sys::epoll::{Epoll, EpollCreateFlags, EpollEvent, EpollFlags, EpollTimeout},
sys::eventfd::EventFd,
sys::wait::{waitid, Id, WaitPidFlag, WaitStatus},
unistd::{pipe2, Pid, Uid, User},
};
use psi_rs::{init_psi_monitor, parse_psi_line, register_psi_monitor, PsiResource, PsiStallType};
use regex::{Captures, Regex};
use rpcbinder::RpcServer;
use rustutils::system_properties;
use semver::{Version, VersionReq};
use shared_child::SharedChild;
use std::borrow::Cow;
use std::cmp::min;
use std::collections::HashMap;
use std::ffi::{CString, OsStr, OsString};
use std::fmt;
use std::fs::{read_to_string, File, OpenOptions};
use std::io::{self, Read, Seek};
use std::mem;
use std::num::{NonZeroU16, NonZeroU32};
use std::os::unix::io::{AsFd, AsRawFd, OwnedFd};
use std::os::unix::process::CommandExt;
use std::os::unix::process::ExitStatusExt;
use std::path::{Path, PathBuf};
use std::process::{Command, ExitStatus};
use std::sync::mpsc;
use std::sync::{Arc, Condvar, LazyLock, Mutex};
use std::thread::{self, JoinHandle};
use std::time::Instant;
use std::time::{Duration, SystemTime};
use tombstoned_client::{DebuggerdDumpType, TombstonedConnection};
const CROSVM_PATH: &str = "/apex/com.android.virt/bin/crosvm";
/// Version of the platform that crosvm currently implements. The format follows SemVer. This
/// should be updated when there is a platform change in the crosvm side. Having this value here is
/// fine because virtualizationservice and crosvm are supposed to be updated together in the virt
/// APEX.
const CROSVM_PLATFORM_VERSION: &str = "1.0.0";
/// The exit status which crosvm returns when it has an error starting a VM.
const CROSVM_START_ERROR_STATUS: i32 = 1;
/// The exit status which crosvm returns when a VM requests a reboot.
const CROSVM_REBOOT_STATUS: i32 = 32;
/// The exit status which crosvm returns when it crashes due to an error.
const CROSVM_CRASH_STATUS: i32 = 33;
/// The exit status which crosvm returns when a vCPU is stalled.
const CROSVM_WATCHDOG_REBOOT_STATUS: i32 = 36;
/// The size of memory (in MiB) reserved for ramdump
const RAMDUMP_RESERVED_MIB: u32 = 17;
const MILLIS_PER_SEC: i64 = 1000;
const SYSPROP_CUSTOM_PVMFW_PATH: &str = "hypervisor.pvmfw.path";
/// Serial device for VM console input.
/// Hypervisor (virtio-console)
const CONSOLE_HVC0: &str = "hvc0";
/// Serial (emulated uart)
const CONSOLE_TTYS0: &str = "ttyS0";
/// virtio-console input usage is uncommon in AVF and its RX queue consumes a lot of memory (one
/// page per entry), so make the RX queue as small as possible.
/// The `virtio_drivers` crate requires a size of at least 2.
const CONSOLE_RX_QUEUE_SIZE: u32 = 2;
const CONSOLE_TX_QUEUE_SIZE: u32 = 32;
/// If the VM doesn't move to the Started state within this amount of time, a hang-up error is
/// triggered.
static BOOT_HANGUP_TIMEOUT: LazyLock<Duration> = LazyLock::new(|| {
if nested_virt::is_nested_virtualization().unwrap() {
// Nested virtualization is slow, so we need a longer timeout.
Duration::from_secs(300)
} else {
Duration::from_secs(60)
}
});
/// Configuration for a VM to run with crosvm.
pub struct CrosvmConfig {
pub cid: Cid,
pub name: String,
pub shared_paths: Vec<SharedPathConfig>,
pub protected: bool,
pub detect_hangup: bool,
pub gdb_port: Option<NonZeroU16>,
pub vfio_devices: Vec<VfioDevice>,
pub dtbo: Option<File>,
pub device_tree_overlays: Vec<File>,
pub hugepages: bool,
pub boost_uclamp: bool,
pub balloon: bool,
pub dump_dt_fd: Option<File>,
pub enable_hypervisor_specific_auth_method: bool,
pub instance_id: [u8; 64],
pub start_suspended: bool,
pub enable_guest_ffa: bool,
pub command: CrosvmCommand,
}
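/// Converts a positive `i32` (e.g. a dimension or rate from an AIDL config) into a `NonZeroU32`,
/// failing on zero or negative values.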
fn try_into_non_zero_u32(value: i32) -> Result<NonZeroU32> {
let u32_value = value.try_into()?;
NonZeroU32::new(u32_value).ok_or(anyhow!("value should be greater than 0"))
}
/// Shared path between host and guest VM.
#[derive(Debug)]
pub struct SharedPathConfig {
pub path: String,
pub host_uid: i32,
pub host_gid: i32,
pub guest_uid: i32,
pub guest_gid: i32,
pub mask: i32,
pub tag: String,
pub socket_path: String,
pub socket_fd: Option<File>,
pub app_domain: bool,
}
type VfioDevice = Strong<dyn aidl::IBoundDevice>;
/// All information needed for running crosvm
pub struct RunContext<'a> {
pub config: &'a aidl::VirtualMachineRawConfig,
pub debug_config: &'a DebugConfig,
pub cid: Cid,
pub temp_dir: &'a Path,
pub console_out: Option<&'a ParcelFileDescriptor>,
pub console_in: Option<&'a ParcelFileDescriptor>,
pub log_out: Option<&'a ParcelFileDescriptor>,
}
/// Raw arguments, parsed from a `RunContext`, which will be used to construct a crosvm command.
/// The parsing is done when the virtual machine is created, and the construction of the crosvm
/// command is done when the virtual machine is started.
pub struct CrosvmCommand {
arg0: OsString,
args: Vec<OsString>,
preserved_fds: Vec<OwnedFd>,
// List of lambdas which need to run after crosvm exits. Option is added since this will be
// moved out of this struct when the VM gets run. Box is needed because `dyn FnOnce` is unsized
// and the HashMap requires fixed-size values.
cleaners: Option<HashMap<String, Box<Cleaner>>>,
}
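/// A deferred action which runs after crosvm exits, e.g. joining a logger thread or deleting a
/// TAP interface. Each cleaner receives the shared `CleanerContext` so it can report back.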
type Cleaner = dyn FnOnce(&CleanerContext) -> Result<()> + Send;
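/// State shared with the cleaners. The failure-pipe cleaner stores the failure reason reported
/// by the VM here so that `monitor_vm_exit` can map it to a death reason.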
struct CleanerContext {
failure_reason: Mutex<String>,
}
impl CrosvmCommand {
pub fn build_from(context: &RunContext) -> Result<Self> {
Self::check_platform_version(context)?;
let mut command = Self {
arg0: OsString::new(),
args: Vec::new(),
preserved_fds: Vec::new(),
cleaners: Some(HashMap::new()),
};
command
.arg("--extended-status")
// Logs are further filtered per process in logcat; debug logs won't show unless
// crosvm is configured to show debug logs.
.args(["--log-level", "debug,disk=warn"])
.arg("run")
.arg("--disable-sandbox"); // TODO(qwandor): Remove --disable-sandbox.
command.add_name_arg(context);
command.add_kernel_arg(context)?;
command.add_cpu_arg(context)?;
command.add_memory_arg(context);
command.add_console_arg(context)?;
command.add_log_arg(context)?;
command.add_failure_pipe()?;
command.add_ramdump_arg(context)?;
command.add_disk_arg(context)?;
command.add_gpu_arg(context)?;
command.add_input_devices_arg(context)?;
command.add_audio_arg(context);
command.add_usb_arg(context);
command.add_network_arg(context)?;
command.add_file_backed_mapping_arg(context)?;
Ok(command)
}
fn arg<S: AsRef<OsStr>>(&mut self, arg: S) -> &mut Self {
self.args.push(arg.as_ref().into());
self
}
fn args<I: IntoIterator<Item = S>, S: AsRef<OsStr>>(&mut self, args: I) -> &mut Self {
for arg in args {
self.arg(arg.as_ref());
}
self
}
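// Takes ownership of the file, keeps its FD alive in `preserved_fds` until crosvm is spawned,
// and returns a `/proc/self/fd/<N>` path that the crosvm command line can reference.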
#[allow(unused)]
fn add_preserved_fd<F: Into<OwnedFd>>(&mut self, file: F) -> String {
let fd = file.into();
let raw_fd = fd.as_raw_fd();
self.preserved_fds.push(fd);
format!("/proc/self/fd/{}", raw_fd)
}
fn add_cleaner(&mut self, name: &str, cleaner: Box<Cleaner>) -> Result<()> {
if self.cleaners.as_mut().unwrap().insert(name.to_owned(), cleaner).is_some() {
Err(anyhow!("cleaner with name {name} already exists."))
} else {
Ok(())
}
}
fn check_platform_version(context: &RunContext) -> Result<()> {
let ver = &context.config.platformVersion;
let requested = VersionReq::parse(ver)
.context(format!("Invalid platform version requirement {ver}"))?;
let supported = Version::parse(CROSVM_PLATFORM_VERSION).unwrap();
if !requested.matches(&supported) {
bail!(
"Incompatible platform version. The config is compatible with platform version(s) \
{}, but the actual platform version is {}",
requested,
supported
);
}
Ok(())
}
fn add_name_arg(&mut self, context: &RunContext) {
let name = "crosvm_".to_owned() + &context.config.name;
self.arg0 = OsString::from(name.clone());
self.args(["--name", &name]);
}
fn add_kernel_arg(&mut self, context: &RunContext) -> Result<()> {
let config = context.config;
if config.bootloader.is_none() && config.kernel.is_none() {
bail!("VM must have either a bootloader or a kernel image.");
}
if config.bootloader.is_some() && (config.kernel.is_some() || config.initrd.is_some()) {
bail!("Can't have both bootloader and kernel/initrd image.");
}
if let Some(bootloader) = &config.bootloader {
let file = self.add_preserved_fd(bootloader.as_ref().try_clone()?);
self.args(["--bios", &file]);
}
if let Some(kernel) = &config.kernel {
let file = self.add_preserved_fd(kernel.as_ref().try_clone()?);
self.arg(file);
}
if let Some(params) = &config.params {
self.args(["--params", params]);
}
if let Some(initrd) = &config.initrd {
let file = self.add_preserved_fd(initrd.as_ref().try_clone()?);
self.args(["--initrd", &file]);
}
Ok(())
}
fn add_cpu_arg(&mut self, context: &RunContext) -> Result<()> {
let config = context.config;
let num_cores: Option<usize> = match &config.cpuOptions.cpuTopology {
aidl::CpuTopology::MatchHost(_) => {
if check_if_all_cpus_allowed()? {
None
} else {
Some(get_num_cpus().context("can't get number of CPUs")?)
}
}
aidl::CpuTopology::CpuCount(count) => Some((*count).try_into().unwrap()),
};
let mut cpu_args = Vec::new();
if let Some(num_cores) = num_cores {
cpu_args.push(format!("num-cores={num_cores}"));
} else {
self.arg("--host-cpu-topology");
#[cfg(target_arch = "aarch64")]
{
if cfg!(virt_cpufreq_upstream) {
self.arg("--virt-cpufreq-upstream");
} else {
self.arg("--virt-cpufreq");
}
}
}
#[cfg(target_arch = "aarch64")]
cpu_args.push("sve=[auto=true]".to_string());
if !cpu_args.is_empty() {
self.args(["--cpus", &cpu_args.join(",")]);
}
Ok(())
}
fn add_memory_arg(&mut self, context: &RunContext) {
let config = context.config;
let mut memory_mib = config
.memoryMib
.try_into()
.ok()
.and_then(NonZeroU32::new)
.unwrap_or(NonZeroU32::new(256).unwrap());
let swiotlb_size_mib = Self::get_swiotlb_mib(config);
// b/346770542 for consistent "usable" memory across protected and non-protected VMs.
memory_mib = memory_mib.saturating_add(swiotlb_size_mib);
self.args(["--mem", &memory_mib.get().to_string()]);
if swiotlb_size_mib > 0 {
self.args(["--swiotlb", &swiotlb_size_mib.to_string()]);
}
}
// A note on serial devices. We have five serial devices:
// 1. uart device: used as the output device by bootloaders and as early console by linux
// 2. uart device: used to report the reason for the VM failing.
// 3. virtio-console device: used as the console device where kmsg is redirected to
// 4. virtio-console device: used as the ramdump output
// 5. virtio-console device: used as the logcat output
//
// #1 and #3 are added via add_console_arg()
// #2 is added via add_failure_pipe()
// #4 is added via add_ramdump_arg()
// #5 is added via add_log_arg()
//
// When [console|log]_out is not specified, the devices are attached to a sink, which means
// whatever is written there is discarded.
//
// Warning: Adding more serial devices requires you to shift the PCI device ID of the boot
// disks in bootconfig.x86_64. This is because x86 crosvm puts the serial devices and the block
// devices on the same PCI bus, and the serial devices come before the block devices. Arm crosvm
// doesn't have this issue.
fn add_console_arg(&mut self, context: &RunContext) -> Result<()> {
// If user has provided an FD for console_out, let them read from it. Otherwise, we read
// the console output from the VM and emit it over to logcat.
let (out_fd, read_file) = match context.console_out {
Some(pfd) => (Some(pfd.as_ref().try_clone()?), None),
None => {
let (read_fd, write_fd) = create_pipe()?;
(Some(write_fd.into()), Some(read_fd))
}
};
let in_fd = context.console_in.map(|pfd| pfd.as_ref().try_clone()).transpose()?;
let in_device = context.config.consoleInputDevice.as_deref().unwrap_or(CONSOLE_HVC0);
match in_device {
CONSOLE_HVC0 | CONSOLE_TTYS0 => {}
_ => bail!("Unsupported serial device {in_device}"),
};
if context.debug_config.debug_level == aidl::DebugLevel::NONE
&& context.debug_config.should_prepare_console_output()
{
// bootconfig.normal will be used, but we still need the console log.
self.args(["--params", "printk.devkmsg=on"]);
self.args(["--params", "console=hvc0"]);
}
let out_args = out_fd.map_or("type=sink".to_string(), |fd| {
format!("type=file,path={}", self.add_preserved_fd(fd))
});
let in_args =
in_fd.map_or("".to_string(), |fd| format!(",input={}", self.add_preserved_fd(fd)));
// /dev/ttyS0
self.arg(format!(
"--serial={out_args}{},hardware=serial,num=1",
if in_device == CONSOLE_TTYS0 { &in_args } else { "" }
));
// /dev/hvc0
self.arg(format!(
"--serial={out_args}{},hardware=virtio-console,num=1,\
max-queue-sizes=[{CONSOLE_RX_QUEUE_SIZE},{CONSOLE_TX_QUEUE_SIZE}]",
if in_device == CONSOLE_HVC0 { &in_args } else { "" }
));
let thread = read_file.map(|f| Self::logger_thread(f, format!("Console({})", context.cid)));
let cleaner = move |_: &CleanerContext| {
thread.map(JoinHandle::join);
Ok(())
};
self.add_cleaner("console", Box::new(cleaner))?;
Ok(())
}
fn add_log_arg(&mut self, context: &RunContext) -> Result<()> {
let (out_fd, read_file) = match context.log_out {
Some(pfd) => (Some(pfd.as_ref().try_clone()?), None),
None => {
let (read_fd, write_fd) = create_pipe()?;
(Some(write_fd.into()), Some(read_fd))
}
};
let out_args = out_fd.map_or("type=sink".to_string(), |fd| {
format!("type=file,path={}", self.add_preserved_fd(fd))
});
// /dev/hvc2
self.arg(format!(
"--serial={out_args},hardware=virtio-console,num=3,\
max-queue-sizes=[{CONSOLE_RX_QUEUE_SIZE},{CONSOLE_TX_QUEUE_SIZE}]"
));
let thread = read_file.map(|f| Self::logger_thread(f, format!("Log({})", context.cid)));
let cleaner = move |_: &CleanerContext| {
thread.map(JoinHandle::join);
Ok(())
};
self.add_cleaner("log", Box::new(cleaner))?;
Ok(())
}
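// Spawns a thread that reads lines from `read_from` and forwards them to the Android log,
// prefixed with `tag`, until EOF or a read error.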
fn logger_thread(read_from: File, tag: String) -> JoinHandle<()> {
std::thread::spawn(move || {
use std::io::BufRead;
let mut reader = std::io::BufReader::new(read_from);
let mut buf = vec![];
loop {
buf.clear();
buf.shrink_to(1024);
match reader.read_until(b'\n', &mut buf) {
Ok(0) => {
info!("{}: EOF", &tag);
return;
}
Ok(_size) => {
if buf.last() == Some(&b'\n') {
buf.pop();
// Logs sent via TTY usually end lines with "\r\n".
if buf.last() == Some(&b'\r') {
buf.pop();
}
}
info!("{}: {}", &tag, &String::from_utf8_lossy(&buf));
}
Err(e) => {
error!("Could not read console pipe: {e:?}");
return;
}
};
}
})
}
fn add_failure_pipe(&mut self) -> Result<()> {
let (reader, writer) = create_pipe()?;
let writer = self.add_preserved_fd(writer);
// This becomes /dev/ttyS1
self.arg(format!("--serial=type=file,path={writer},hardware=serial,num=2"));
let read_thread = std::thread::spawn(move || {
// Read the pipe to see if any failure reason is written
let mut failure_reason = String::new();
// Arbitrary max size in case of misbehaving guest.
const MAX_SIZE: u64 = 50_000;
match reader.take(MAX_SIZE).read_to_string(&mut failure_reason) {
Err(e) => error!("Error reading VM failure reason from pipe: {}", e),
Ok(len) if len > 0 => {
error!("VM returned failure reason '{}'", failure_reason.trim())
}
_ => (),
};
failure_reason.trim().to_owned()
});
let cleaner = move |context: &CleanerContext| {
let failure_reason = read_thread.join().expect("Failed to wait for fail reason");
*context.failure_reason.lock().unwrap() = failure_reason;
Ok(())
};
self.add_cleaner("failure_pipe", Box::new(cleaner))?;
Ok(())
}
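// Returns the swiotlb (bounce buffer) size in MiB. Only protected VMs need a swiotlb; an
// explicit swiotlbMib from the config wins, otherwise the size is estimated from the number of
// block and console devices and whether the balloon is enabled.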
fn get_swiotlb_mib(config: &aidl::VirtualMachineRawConfig) -> u32 {
if !config.protectedVm {
0
} else if config.swiotlbMib > 0 {
config.swiotlbMib.try_into().unwrap()
} else {
estimate_swiotlb_usage_mib(SwiotlbEstimateInputs {
guest_page_size: 4096, // TODO: Use real page size.
block_count: config.disks.len().try_into().unwrap(),
console_count: 3,
balloon: config.balloon,
})
}
}
fn add_ramdump_arg(&mut self, context: &RunContext) -> Result<()> {
let config = context.config;
let using_gki =
if !cfg!(vendor_module) { false } else { config.osName.starts_with("microdroid_gki-") };
if context.debug_config.is_ramdump_needed() && !using_gki {
// This file is passed to crosvm and will be the backing store for /dev/hvc1, where the
// VM will emit its ramdump. After the VM exits, handle_ramdump() below forwards any
// non-empty dump to tombstoned.
let file = File::create(context.temp_dir.join("ramdump"))?;
let path = self.add_preserved_fd(file);
// This becomes /dev/hvc1 (see num=2 below)
self.arg(format!(
"--serial=type=file,path={path},hardware=virtio-console,num=2,\
max-queue-sizes=[{CONSOLE_RX_QUEUE_SIZE},{CONSOLE_TX_QUEUE_SIZE}]"
));
let reserve = RAMDUMP_RESERVED_MIB + Self::get_swiotlb_mib(config);
self.args(["--params", &format!("crashkernel={reserve}M")]);
} else {
self.arg(format!(
"--serial=type=sink,hardware=virtio-console,num=2,\
max-queue-sizes=[{CONSOLE_RX_QUEUE_SIZE},{CONSOLE_TX_QUEUE_SIZE}]"
));
}
Ok(())
}
fn add_disk_arg(&mut self, context: &RunContext) -> Result<()> {
/// The size of zero.img.
/// Gaps in composite disk images are filled with a shared zero.img.
const ZERO_FILLER_SIZE: u64 = 4096;
let temp_dir = context.temp_dir;
let zero_filler = temp_dir.join("zero.img");
OpenOptions::new()
.create_new(true)
.read(true)
.write(true)
.open(&zero_filler)
.context(format!("Failed to create {:?}", zero_filler))?
.set_len(ZERO_FILLER_SIZE)?;
for (index, disk) in context.config.disks.iter().enumerate() {
let image = if !disk.partitions.is_empty() {
if disk.image.is_some() {
bail!("DiskImage {:?} contains both image and partitions.", disk);
}
let composite = temp_dir.join(format!("composite-{}.img", index));
let header = temp_dir.join(format!("composite-{}-header.img", index));
let footer = temp_dir.join(format!("composite-{}-footer.img", index));
let (image, partition_files) = composite::make_composite_image(
&disk.partitions,
&zero_filler,
&composite,
&header,
&footer,
)
.with_context(|| {
format!("Failed to make composite disk image with config {:?}", disk)
})
.with_log()?;
// These partition files don't appear directly on the command line; they are referenced
// indirectly via the composite disk file, so we still need to preserve their FDs.
partition_files.into_iter().for_each(|f| {
self.add_preserved_fd(f);
});
image
} else if let Some(image) = &disk.image {
image.as_ref().try_clone()?.into()
} else {
bail!("DiskImage {:?} didn't contain image or partitions.", disk);
};
let path = self.add_preserved_fd(image);
self.args(["--block", &format!("path={},ro={},lock=false", path, !disk.writable)]);
}
Ok(())
}
fn add_gpu_arg(&mut self, context: &RunContext) -> Result<()> {
let config = context.config;
let mut gpu_args = Vec::new();
if let Some(config) = &config.gpuConfig {
if !cfg!(paravirtualized_devices) {
warn!("GPU configuration not supported. Ignoring");
return Ok(());
}
if let Some(b) = &config.backend {
gpu_args.push(format!("backend={}", b));
}
if let Some(t) = &config.contextTypes {
// flatten is to convert Vec<Option<String>> into Vec<String>
let t: Vec<_> = t.clone().into_iter().flatten().collect();
gpu_args.push(format!("context-types={}", t.join(":")));
}
if let Some(a) = &config.pciAddress {
gpu_args.push(format!("pci-address={}", a));
}
if let Some(f) = &config.rendererFeatures {
gpu_args.push(format!("renderer-features={}", f));
}
if config.rendererUseEgl {
gpu_args.push("egl=true".to_string());
}
if config.rendererUseGles {
gpu_args.push("gles=true".to_string());
}
if config.rendererUseGlx {
gpu_args.push("glx=true".to_string());
}
if config.rendererUseSurfaceless {
gpu_args.push("surfaceless=true".to_string());
}
if config.rendererUseVulkan {
gpu_args.push("vulkan=true".to_string());
}
}
let name = &config.name;
if let Some(config) = &config.displayConfig {
if !cfg!(paravirtualized_devices) {
warn!("Display configuration not supported. Ignoring");
return Ok(());
}
gpu_args.push(format!(
"displays=[[mode=windowed[{},{}],dpi=[{},{}],refresh-rate={}]]",
try_into_non_zero_u32(config.width)?,
try_into_non_zero_u32(config.height)?,
try_into_non_zero_u32(config.horizontalDpi)?,
try_into_non_zero_u32(config.verticalDpi)?,
try_into_non_zero_u32(config.refreshRate)?,
));
self.arg(format!("--android-display-service={}", name));
}
if !gpu_args.is_empty() {
self.arg(format!("--gpu={}", gpu_args.join(",")));
}
Ok(())
}
fn add_input_devices_arg(&mut self, context: &RunContext) -> Result<()> {
let config = context.config;
if !cfg!(paravirtualized_devices) && !config.inputDevices.is_empty() {
warn!("Input device configuration not supported. Ignoring");
return Ok(());
}
for dev in &config.inputDevices {
self.arg("--input");
match dev {
aidl::InputDevice::SingleTouch(dev) => {
let mut params = Vec::new();
let pfd = dev.pfd.as_ref().ok_or(anyhow!("pfd should have value"))?;
let file = self.add_preserved_fd(pfd.as_ref().try_clone()?);
params.push(format!("path={}", file));
params.push(format!("width={}", u32::try_from(dev.width)?));
params.push(format!("height={}", u32::try_from(dev.height)?));
if !dev.name.is_empty() {
params.push(format!("name={}", dev.name));
}
self.arg(format!("single-touch[{}]", params.join(",")));
}
aidl::InputDevice::MultiTouch(dev) => {
let mut params = Vec::new();
let pfd = dev.pfd.as_ref().ok_or(anyhow!("pfd should have value"))?;
let file = self.add_preserved_fd(pfd.as_ref().try_clone()?);
params.push(format!("path={}", file));
params.push(format!("width={}", u32::try_from(dev.width)?));
params.push(format!("height={}", u32::try_from(dev.height)?));
if !dev.name.is_empty() {
params.push(format!("name={}", dev.name));
}
self.arg(format!("multi-touch[{}]", params.join(",")));
}
aidl::InputDevice::Trackpad(dev) => {
let mut params = Vec::new();
let pfd = dev.pfd.as_ref().ok_or(anyhow!("pfd should have value"))?;
let file = self.add_preserved_fd(pfd.as_ref().try_clone()?);
params.push(format!("path={}", file));
params.push(format!("width={}", u32::try_from(dev.width)?));
params.push(format!("height={}", u32::try_from(dev.height)?));
if !dev.name.is_empty() {
params.push(format!("name={}", dev.name));
}
self.arg(format!("multi-touch-trackpad[{}]", params.join(",")));
}
aidl::InputDevice::EvDev(dev) => {
let pfd = dev.pfd.as_ref().ok_or(anyhow!("pfd should have value"))?;
let file = self.add_preserved_fd(pfd.as_ref().try_clone()?);
self.arg(format!("evdev[path={}]", file));
}
aidl::InputDevice::Keyboard(dev) => {
let pfd = dev.pfd.as_ref().ok_or(anyhow!("pfd should have value"))?;
let file = self.add_preserved_fd(pfd.as_ref().try_clone()?);
self.arg(format!("keyboard[path={}]", file));
}
aidl::InputDevice::Mouse(dev) => {
let pfd = dev.pfd.as_ref().ok_or(anyhow!("pfd should have value"))?;
let file = self.add_preserved_fd(pfd.as_ref().try_clone()?);
self.arg(format!("mouse[path={}]", file));
}
aidl::InputDevice::Switches(dev) => {
let pfd = dev.pfd.as_ref().ok_or(anyhow!("pfd should have value"))?;
let file = self.add_preserved_fd(pfd.as_ref().try_clone()?);
self.arg(format!("switches[path={}]", file));
}
}
}
Ok(())
}
fn add_audio_arg(&mut self, context: &RunContext) {
let config = context.config;
if let Some(config) = &config.audioConfig {
if !cfg!(paravirtualized_devices) {
warn!("Audio configuration not supported. Ignoring");
return;
}
self.arg("--virtio-snd");
self.arg(format!(
"backend=aaudio,num_input_devices={},num_output_devices={}",
if config.useMicrophone { 1 } else { 0 },
if config.useSpeaker { 1 } else { 0 },
));
}
}
fn add_usb_arg(&mut self, context: &RunContext) {
let config = context.config;
let use_usb = if let Some(config) = &config.usbConfig { config.controller } else { false };
if !use_usb {
self.arg("--no-usb");
}
}
fn add_network_arg(&mut self, context: &RunContext) -> Result<()> {
let config = context.config;
if config.networkSupported {
if !cfg!(network) {
warn!("Networking not supported. Ignoring");
return Ok(());
}
if config.protectedVm {
bail!("Network feature is not supported for pVM yet");
}
let tap_fd = {
let iface_suffix = std::process::id().to_string();
let pfd =
virtualmachine::global_service().createTapInterface(&iface_suffix).context(
format!("Failed to create a TAP interface with suffix {iface_suffix}"),
)?;
pfd.as_ref().try_clone()?
};
let tap_fd_cloned = tap_fd.try_clone()?;
let path = self.add_preserved_fd(tap_fd);
let fd_num = path.split('/').last().unwrap();
self.args(["--net", &format!("tap-fd={fd_num}")]);
let cleaner = move |_: &CleanerContext| {
let pfd = ParcelFileDescriptor::new(tap_fd_cloned);
virtualmachine::global_service()
.deleteTapInterface(&pfd)
.context("Error deleting TAP interface")?;
Ok(())
};
self.add_cleaner("network", Box::new(cleaner))?;
}
Ok(())
}
fn add_file_backed_mapping_arg(&mut self, context: &RunContext) -> Result<()> {
for bf in &context.config.customMemoryBackingFiles {
let pfd = bf.file.as_ref().ok_or(anyhow!("missing CustomMemoryBackingFile FD"))?;
let mem_fd = pfd.as_ref().try_clone()?;
let path = self.add_preserved_fd(mem_fd);
let addr = bf.rangeStart as u64;
let size = bf.size as u64;
self.args([
"--file-backed-mapping",
&format!("{path},addr={addr:#0x},size={size:#0x},rw,ram"),
]);
}
Ok(())
}
}
/// The lifecycle state which the payload in the VM has reported itself to be in.
///
/// Note that the order of enum variants is significant; only forward transitions are allowed by
/// [`VmInstance::update_payload_state`].
#[derive(Copy, Clone, Debug, Eq, Ord, PartialEq, PartialOrd)]
pub enum PayloadState {
Starting,
Started,
Ready,
Finished,
Hangup, // Hasn't reached Ready before the timeout expired
}
/// The current state of the VM itself.
pub enum VmState {
/// The VM has not yet tried to start.
NotStarted {
/// The configuration needed to start the VM, if it has not yet been started.
config: Box<CrosvmConfig>,
},
/// The VM has been started.
Running {
/// The crosvm child process.
child: Arc<SharedChild>,
/// The thread waiting for crosvm to finish.
monitor_vm_exit_thread: JoinHandle<()>,
},
/// The VM is being shut down.
ShuttingDown {
// The receiver half of this channel will be closed when shutdown is finished.
shutdown_finished_tx: mpsc::SyncSender<()>,
},
/// The VM died or was killed.
Dead,
/// The VM failed to start.
Failed,
}
impl std::fmt::Debug for VmState {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::NotStarted { .. } => f.write_str("not started"),
Self::Running { .. } => f.write_str("running"),
Self::ShuttingDown { .. } => f.write_str("shutting down"),
Self::Dead => f.write_str("dead"),
Self::Failed => f.write_str("failed"),
}
}
}
/// Metrics regarding the VM.
#[derive(Debug, Default)]
pub struct VmMetric {
/// Recorded timestamp when the VM is started.
pub start_timestamp: Option<SystemTime>,
/// Cumulative guest CPU time measured before the VM is killed
pub cpu_guest_time: Option<i64>,
/// RSS high watermark measured before the VM is killed
pub rss: Option<i64>,
}
impl VmState {
/// Tries to start the VM, if it is in the `NotStarted` state.
///
/// Returns an error if the VM is in the wrong state, or fails to start.
fn start(&mut self, instance: Arc<VmInstance>) -> Result<(), Error> {
let state = mem::replace(self, VmState::Failed);
if let VmState::NotStarted { config } = state {
let mut config = *config;
let cleaners = config.command.cleaners.take().unwrap();
let detect_hangup = config.detect_hangup;
let vfio_devices = config.vfio_devices.clone();
let vhost_fs_devices = run_virtiofs(&config)?;
// If this fails and returns an error, `self` will be left in the `Failed` state.
let child = Arc::new(run_vm(config, &instance.crosvm_control_socket_path)?);
let psi_thread_and_evt_fd = if instance.trim_under_pressure {
let psi_monitor_kill_event = Arc::new(EventFd::new()?);
let psi_monitor_kill_event_clone = psi_monitor_kill_event.clone();
let instance = instance.clone();
Some((
thread::Builder::new().name("virt_psi_monitor".to_string()).spawn(
move || {
let mut expo_bo = 1;
// TODO: add metrics to see how often we restart the thread
while let Err(e) = psi_monitor(&instance, &psi_monitor_kill_event_clone)
{
error!("psi monitor failed: {:#}", e);
thread::sleep(Duration::from_secs(expo_bo));
// Exponential backoff, capped at 60 seconds. This number is
// arbitrary
expo_bo = min(expo_bo * 2, 60);
}
},
)?,
psi_monitor_kill_event,
))
} else {
None
};
let child_clone = child.clone();
let instance_clone = instance.clone();
let monitor_vm_exit_thread = thread::spawn(move || {
instance_clone.monitor_vm_exit(
child_clone,
vfio_devices,
vhost_fs_devices,
psi_thread_and_evt_fd,
cleaners,
);
});
if detect_hangup {
let child_clone = child.clone();
thread::spawn(move || {
instance.monitor_payload_hangup(child_clone);
});
}
// If it started correctly, update the state.
*self = VmState::Running { child, monitor_vm_exit_thread };
Ok(())
} else {
*self = state;
bail!("VM already started or failed")
}
}
}
fn trigger_trim(instance: &Arc<VmInstance>) {
// The host is under memory pressure; ask the guest agent to trim its memory.
if let Some(guest_agent) = &*instance.guest_agent.lock().unwrap() {
if let Err(e) = guest_agent.trimAsync() {
error!("IGuestAgent::trimAsync failed: {e:#}");
}
}
}
fn psi_monitor(instance: &Arc<VmInstance>, psi_monitor_kill_event: &Arc<EventFd>) -> Result<()> {
// Monitor memory and send a trim request to the guest when contention exists.
// This initializes a PSI monitor that watches memory contention in windows of 1_000_000us.
// If "Some" processes are stalled for a period of 50_000us within the window, an event gets
// fired.
let mut memory_pressure_file =
init_psi_monitor(PsiStallType::Some, 50000, 1000000, PsiResource::Memory)?;
let epoll = Epoll::new(EpollCreateFlags::EPOLL_CLOEXEC)?;
register_psi_monitor(&epoll, memory_pressure_file.as_fd(), 0)?;
epoll
.add(psi_monitor_kill_event.as_fd(), EpollEvent::new(EpollFlags::EPOLLIN, 1))
.context("failed to register psi eventfd")?;
// Wait on event
let mut events = [EpollEvent::empty()];
let mut rate_limiter: Option<Instant> = None;
let mut last_was_full = false;
loop {
// EpollTimeout::NONE corresponds to a timeout of -1, blocking indefinitely.
// https://man7.org/linux/man-pages/man2/epoll_wait.2.html
let epoll_res = epoll.wait(&mut events, EpollTimeout::NONE);
if let Err(e) = epoll_res {
if e == Errno::EINTR {
// Ignore interrupts and wait again
continue;
} else {
return Err(e.into());
}
}
match events[0].data() {
0 => {
let mut psi_info = String::new();
memory_pressure_file.rewind().context("failed to rewind file")?;
memory_pressure_file
.read_to_string(&mut psi_info)
.context("Failed to read PSI monitor to buffer")?;
// Monitor both Some and Full contention monitors.
// If the system was not under memory contention, and then becomes under memory
// contention, send a trim request directly.
// If the system was under "Some" contention and went to "Full" contention, send a
// trim request directly.
// If the system was under memory contention and detected new contention, check if
// timeout was hit.
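// Example contents of the PSI memory pressure file (the standard Linux /proc/pressure
// format):
//   some avg10=0.00 avg60=0.00 avg300=0.00 total=0
//   full avg10=0.00 avg60=0.00 avg300=0.00 total=0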
let full_stats = psi_info
.lines()
.filter_map(|l| parse_psi_line(l, PsiStallType::Full).ok())
.next();
let some_stats = psi_info
.lines()
.filter_map(|l| parse_psi_line(l, PsiStallType::Some).ok())
.next();
let full_triggered = full_stats.is_some() && full_stats.unwrap().avg10 > 0.0;
let some_triggered = some_stats.is_some() && some_stats.unwrap().avg10 > 0.0;
let is_rate_limited = rate_limiter.is_some()
&& rate_limiter.unwrap().elapsed() > Duration::from_secs(22);
let should_trim = if is_rate_limited {
!last_was_full && full_triggered
} else {
full_triggered || some_triggered
};
last_was_full = full_triggered;
if should_trim {
rate_limiter = Some(Instant::now());
trigger_trim(instance);
}
}
1 => {
info!("psi_monitor: Epoll kill event triggered");
// EventFD triggered, return
return Ok(());
}
_ => {
return Err(anyhow!("Unknown event received: {:?}", events[0]));
}
}
}
}
/// Information about a particular instance of a VM which may be running.
pub struct VmInstance {
/// The current state of the VM.
pub vm_state: Mutex<VmState>,
/// Condvar that is notified when `vm_state` becomes `Dead`.
vm_dead_convar: Condvar,
/// Whether this VmInstance requires VirtualMachineService
pub requires_vm_service: bool,
/// Hold the reference to RpcServer running VirtualMachineService
pub vm_service: Mutex<Option<RpcServer>>,
/// The CID assigned to the VM for vsock communication.
pub cid: Cid,
/// Path to crosvm control socket
crosvm_control_socket_path: PathBuf,
/// The name of the VM.
pub name: String,
/// Whether the VM is a protected VM.
pub protected: bool,
/// Directory of temporary files used by the VM while it is running.
pub temporary_directory: PathBuf,
/// The UID of the process which requested the VM.
pub requester_uid: u32,
/// The PID of the process which requested the VM. Note that this process may no longer exist
/// and the PID may have been reused for a different process, so this should not be trusted.
pub requester_debug_pid: i32,
/// Callbacks to clients of the VM.
pub callbacks: VirtualMachineCallbacks,
/// Guest agent running on the VM
pub guest_agent: Mutex<Option<Strong<dyn aidl::IGuestAgent>>>,
/// Recorded metrics of VM such as timestamp or cpu / memory usage.
pub vm_metric: Mutex<VmMetric>,
/// Whether virtio-balloon is enabled
pub balloon_enabled: bool,
// Whether to send a trim request on app idle.
trim_under_pressure: bool,
/// List of vendor tee services this VM might access.
pub vendor_tee_services: Vec<String>,
/// List of host services this VM might access.
pub host_services: Vec<String>,
/// Represents a Key Encryption Key (KEK) stored in the app's private data directory. This KEK
/// is used to set up the guest's encrypted store.
pub encrypted_store_kek: Option<Strong<dyn aidl::IEncryptedStoreKEK>>,
/// The latest lifecycle state which the payload reported itself to be in.
payload_state: Mutex<PayloadState>,
/// Condvar that is notified when `payload_state` is updated
payload_state_updated: Condvar,
/// The human readable name of requester_uid
requester_uid_name: String,
/// Death recipient for the global service. (this doesn't implement Debug trait)
pub global_service_death_recipient: Mutex<Option<DeathRecipient>>,
/// Host console name
pub host_console_name: Mutex<Option<String>>,
}
impl fmt::Display for VmInstance {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let adj = if self.protected { "Protected" } else { "Non-protected" };
write!(
f,
"{} virtual machine \"{}\" (owner: {}, cid: {})",
adj, self.name, self.requester_uid_name, self.cid
)
}
}
impl VmInstance {
/// Validates the given config and creates a new `VmInstance` but doesn't start running it.
#[allow(clippy::too_many_arguments)]
pub fn new(
config: CrosvmConfig,
temporary_directory: PathBuf,
requester_uid: u32,
requester_debug_pid: i32,
requires_vm_service: bool,
trim_under_pressure: bool,
vendor_tee_services: Vec<String>,
host_services: Vec<String>,
encrypted_store_kek: Option<Strong<dyn aidl::IEncryptedStoreKEK>>,
) -> Result<VmInstance, Error> {
let cid = config.cid;
let name = config.name.clone();
let protected = config.protected;
let balloon_enabled = config.balloon;
let requester_uid_name = User::from_uid(Uid::from_raw(requester_uid))
.ok()
.flatten()
.map_or_else(|| format!("{}", requester_uid), |u| u.name);
let instance = VmInstance {
vm_state: Mutex::new(VmState::NotStarted { config: Box::new(config) }),
vm_dead_convar: Condvar::new(),
requires_vm_service,
vm_service: Mutex::new(None),
cid,
crosvm_control_socket_path: temporary_directory.join("crosvm.sock"),
name,
protected,
temporary_directory,
requester_uid,
requester_debug_pid,
callbacks: Default::default(),
guest_agent: Mutex::new(None),
vm_metric: Mutex::new(Default::default()),
payload_state: Mutex::new(PayloadState::Starting),
payload_state_updated: Condvar::new(),
requester_uid_name,
balloon_enabled,
trim_under_pressure,
vendor_tee_services,
host_services,
encrypted_store_kek,
global_service_death_recipient: Mutex::new(None),
host_console_name: Mutex::new(None),
};
info!("{} created", &instance);
Ok(instance)
}
/// Starts an instance of `crosvm` to manage the VM. The `crosvm` instance will be killed when
/// the `VmInstance` is dropped.
pub fn start(self: &Arc<Self>) -> Result<(), Error> {
let mut vm_metric = self.vm_metric.lock().unwrap();
vm_metric.start_timestamp = Some(SystemTime::now());
let ret = self.vm_state.lock().unwrap().start(self.clone());
if ret.is_ok() {
info!("{} started", &self);
}
ret.with_context(|| format!("{} failed to start", &self))
}
/// Monitors the exit of the VM (i.e. termination of the `child` process). When that happens,
/// handles the event by updating the state, notifying clients of the event via callbacks,
/// and removing temporary files for the VM.
fn monitor_vm_exit(
&self,
child: Arc<SharedChild>,
vfio_devices: Vec<VfioDevice>,
vhost_user_devices: Vec<SharedChild>,
psi_thread_and_evt_fd: Option<(JoinHandle<()>, Arc<EventFd>)>,
cleaners: HashMap<String, Box<Cleaner>>,
) {
// Wait for the EXIT of the crosvm process, but thanks to WNOWAIT it remains in the
// waitable state so that we can inspect /proc/<pid>/stat or status. Note however that we
// can only measure guest runtime, but not maximum RSS, because VmHWM is not available for a
// zombie process.
let pid = Pid::from_raw(child.id() as i32);
let result = waitid(Id::Pid(pid), WaitPidFlag::WEXITED | WaitPidFlag::WNOWAIT);
match &result {
Err(e) => error!("Error waiting for crosvm({}) instance to die: {}", child.id(), e),
Ok(WaitStatus::Exited(..)) | Ok(WaitStatus::Signaled(..)) => {
self.measure_vm_status(child.id());
}
Ok(wait_status) => {
error!("Unexpected wait status from crosvm({}): {:?}", child.id(), wait_status);
}
}
// Then we really reap the process.
let result = child.wait();
match &result {
Err(e) => error!("Error waiting for crosvm({}) instance to die: {}", child.id(), e),
Ok(status) => {
info!("crosvm({}) exited with status {}", child.id(), status);
if let Some(exit_status_code) = status.code() {
if exit_status_code == CROSVM_WATCHDOG_REBOOT_STATUS {
info!("detected vcpu stall on crosvm");
}
}
}
}
let cleaner_context = CleanerContext { failure_reason: Mutex::new(String::new()) };
cleaners.into_iter().for_each(|(name, cleaner)| {
// Failure in a cleaner shouldn't stop running other cleaners.
cleaner(&cleaner_context)
.unwrap_or_else(|e| error!("Failed to run cleaner {name}: {e:?}"));
});
// In crosvm, when the vhost_user frontend dies, the vhost_user backend devices detect it and
// exit. We can therefore safely wait() for the vhost_user devices after waiting for the
// crosvm main process.
for device in vhost_user_devices {
match device.wait() {
Ok(status) => {
info!("Vhost user device({}) exited with status {}", device.id(), status);
if !status.success() {
if let Some(code) = status.code() {
// vhost_user backend device exit with error code
error!(
"vhost user device({}) exited with error code: {}",
device.id(),
code
);
} else {
// The spawned child process of vhost_user backend device is
// killed by signal
error!("vhost user device({}) killed by signal", device.id());
}
}
}
Err(e) => {
error!("Error waiting for vhost user device({}) to die: {}", device.id(), e);
}
}
}
let failure_reason = cleaner_context.failure_reason.lock().unwrap();
*self.vm_state.lock().unwrap() = VmState::Dead;
self.vm_dead_convar.notify_all();
info!("{} exited", &self);
// In case of hangup, the pipe doesn't give us any information because the hangup can't be
// detected on the VM side (otherwise, it wouldn't be a hangup). Instead, it is detected by the
// monitor_payload_hangup function below, which updates the payload state to Hangup.
let failure_reason =
if failure_reason.is_empty() && self.payload_state() == PayloadState::Hangup {
Cow::from("HANGUP")
} else {
Cow::from(failure_reason.clone())
};
self.handle_ramdump().unwrap_or_else(|e| error!("Error handling ramdump: {}", e));
let death_reason = death_reason(&result, &failure_reason);
let exit_signal = exit_signal(&result);
self.callbacks.callback_on_died(self.cid, death_reason);
let vm_metric = self.vm_metric.lock().unwrap();
write_vm_exited_stats_sync(
self.requester_uid as i32,
&self.name,
death_reason,
exit_signal,
&vm_metric,
);
if let Some((psi_thread, evt_fd)) = psi_thread_and_evt_fd {
evt_fd.write(1).expect("failed to stop PSI thread");
psi_thread.join().unwrap();
}
// Delete temporary files. The folder itself is removed by VirtualizationServiceInternal.
virtualmachine::remove_temporary_files(&self.temporary_directory).unwrap_or_else(|e| {
error!("Error removing temporary files from {:?}: {}", self.temporary_directory, e);
});
drop(vfio_devices); // Cleanup devices.
// Now that the VM is gone, shut down the VirtualMachineService server to eagerly free up
// the server threads.
let vm_service = self.vm_service.lock().unwrap();
if let Some(service) = &*vm_service {
if let Err(e) = service.shutdown() {
error!("Failed to shutdown VirtualMachineService RPC Binder server: {e:#}");
}
}
}
/// Waits until the payload is started, or the timeout expires. On timeout, kills the VM to
/// prevent an indefinite hangup and updates the payload_state accordingly.
fn monitor_payload_hangup(&self, child: Arc<SharedChild>) {
debug!("Starting to monitor hangup for Microdroid({})", child.id());
let (state, result) = self
.payload_state_updated
.wait_timeout_while(self.payload_state.lock().unwrap(), *BOOT_HANGUP_TIMEOUT, |s| {
*s < PayloadState::Started
})
.unwrap();
drop(state); // we are not interested in state
let child_still_running = child.try_wait().ok() == Some(None);
if result.timed_out() && child_still_running {
error!(
"Microdroid({}) failed to start payload within {} secs timeout. Shutting down.",
child.id(),
BOOT_HANGUP_TIMEOUT.as_secs()
);
self.update_payload_state(PayloadState::Hangup).unwrap();
if let Err(e) = self.kill() {
error!("Error stopping timed-out VM with CID {}: {:?}", child.id(), e);
}
}
}
fn measure_vm_status(&self, pid: u32) {
match get_guest_time(pid) {
Ok(guest_time) => self.vm_metric.lock().unwrap().cpu_guest_time = Some(guest_time),
Err(e) => warn!("Failed to get guest CPU time: {}", e),
}
match get_rss(pid) {
Ok(rss) => self.vm_metric.lock().unwrap().rss = Some(rss),
Err(e) => warn!("Failed to get guest RSS: {}", e),
}
}
/// Tells whether the VM is running
pub fn is_vm_running(&self) -> bool {
matches!(&*self.vm_state.lock().unwrap(), VmState::Running { .. })
}
/// Returns the last reported state of the VM payload.
pub fn payload_state(&self) -> PayloadState {
*self.payload_state.lock().unwrap()
}
/// Updates the payload state to the given value, if it is a valid state transition.
pub fn update_payload_state(&self, new_state: PayloadState) -> Result<(), Error> {
if new_state == PayloadState::Finished {
if let VmState::Running { child, .. } = &*self.vm_state.lock().unwrap() {
self.measure_vm_status(child.id());
}
}
let mut state_locked = self.payload_state.lock().unwrap();
// Only allow forward transitions, e.g. from starting to started or finished, not back in
// the other direction.
if new_state > *state_locked {
*state_locked = new_state;
self.payload_state_updated.notify_all();
Ok(())
} else {
bail!("Invalid payload state transition from {:?} to {:?}", *state_locked, new_state)
}
}
fn try_shutdown(&self) -> bool {
if let Some(guest_agent) = &*self.guest_agent.lock().unwrap() {
info!("Asking VM (name: {}, cid: {}) to shut down", self.name, self.cid);
return guest_agent
.shutdownAsync()
.map_err(|e| error!("Failed to ask shut down: {e:?}"))
.is_ok();
}
false
}
/// Kills the crosvm instance, if it is running. We try to shut it down gracefully if a guest
/// agent is installed there. If not, or if the shutdown doesn't finish in time, the VM is
/// forcibly shut down. In-flight data in the VM may be affected!
pub fn kill(&self) -> Result<(), Error> {
// VirtualizationServiceInternal has a strong reference to IVirtualMachine. Don't forget to
// delete it. Otherwise there'll be a memory leak.
scopeguard::defer! {
let cid = self.cid.try_into().unwrap();
if let Err(e) = virtualmachine::global_service().unregisterVirtualMachine(cid) {
error!("Failed to unregister virtual machine ({cid}): {e:?}");
}
}
let mut vm_state_mg = self.vm_state.lock().unwrap();
match &*vm_state_mg {
VmState::Running { .. } => {
// We use an `mpsc` in a backwards way as a poor man's broadcast channel. The
// buffer is set to 0 to make this into a "rendezvous channel". Code that wants to
// wait for shutdown to finish will `send` on the channel, which will block until
// we `recv` (we never do) or `drop`.
let (shutdown_finished_tx, shutdown_finished_rx) = mpsc::sync_channel(0);
let vm_state = std::mem::replace(
&mut *vm_state_mg,
VmState::ShuttingDown { shutdown_finished_tx },
);
drop(vm_state_mg); // make sure self.vm_state is not held
let VmState::Running { child, monitor_vm_exit_thread } = vm_state else {
unreachable!();
};
self.measure_vm_status(child.id());
if !self.try_shutdown() {
let id = child.id();
warn!(
"Killing VM (name: {}, cid: {}) forcibly. Data might be corrupted!!!",
self.name, self.cid
);
child.kill().with_context(|| format!("Error killing crosvm({id}) instance"))?;
}
// Wait until the VM moves out of the ShuttingDown state. When the VM is shut down
// or killed, the state is set to Dead. See monitor_vm_exit_thread.
let shutdown_timeout = Duration::from_secs(5);
let result = self
.vm_dead_convar
.wait_timeout_while(self.vm_state.lock().unwrap(), shutdown_timeout, |state| {
matches!(state, VmState::ShuttingDown { .. })
})
.unwrap();
if result.1.timed_out() {
warn!(
"Failed to shut down the VM in {:?}. Killing. Data might be corrupted!.",
shutdown_timeout
);
child.kill().unwrap();
}
drop(result); // unlock self.vm_state to avoid deadlock with the vm_exit thread.
// Wait once again. If the graceful shutdown was successful, this will return
// immediately.
monitor_vm_exit_thread.join().unwrap();
// Drop the channel to signal shutdown is finished.
// Done explicitly just for code visibility.
drop(shutdown_finished_rx);
}
VmState::ShuttingDown { shutdown_finished_tx } => {
let shutdown_finished_tx = shutdown_finished_tx.clone();
drop(vm_state_mg); // make sure self.vm_state is not held
// Wait for the shutdown to finish.
//
// We might consider adding a timeout here just in case, but note that, if this has
// a case where it blocks indefinitely, then the `Running` branch above must have
// such a case as well (because it never dropped the other half).
#[allow(clippy::single_match)]
match shutdown_finished_tx.send(()) {
Ok(()) => unreachable!(),
Err(mpsc::SendError(())) => {} // success!
}
}
VmState::NotStarted { .. } | VmState::Dead | VmState::Failed => {
drop(vm_state_mg); // make sure self.vm_state is not held
// TODO: if it were ever running, we may still need to join
// logging handles, in monitor_vm_exit.
bail!("VM is not running")
}
}
Ok(())
}
/// Returns current virtio-balloon size.
pub fn get_actual_memory_balloon_bytes(&self) -> Result<u64, Error> {
Ok(self.get_balloon_stats()?.0)
}
fn get_balloon_stats(&self) -> Result<(u64, crosvm_control::BalloonStatsFfi), Error> {
if !self.is_vm_running() {
bail!("get_actual_memory_balloon_bytes when VM is not running");
}
if !self.balloon_enabled {
bail!("virtio-balloon is not enabled");
}
let socket_path_cstring = path_to_cstring(&self.crosvm_control_socket_path);
let mut stats = crosvm_control::BalloonStatsFfi {
swap_in: 0,
swap_out: 0,
major_faults: 0,
minor_faults: 0,
free_memory: 0,
total_memory: 0,
available_memory: 0,
disk_caches: 0,
hugetlb_allocations: 0,
hugetlb_failures: 0,
shared_memory: 0,
unevictable_memory: 0,
};
let mut balloon_actual = 0u64;
// SAFETY: Pointers are valid for the lifetime of the call.
let success = unsafe {
crosvm_control::crosvm_client_balloon_stats(
socket_path_cstring.as_ptr(),
&mut stats,
&mut balloon_actual,
)
};
if !success {
bail!("Error requesting balloon stats");
}
Ok((balloon_actual, stats))
}
/// Inflates the virtio-balloon to `num_bytes` to reclaim guest memory. Called in response to
/// memory-trimming notifications.
pub fn set_memory_balloon(&self, num_bytes: u64) -> Result<(), Error> {
if !self.is_vm_running() {
bail!("set_memory_balloon when VM is not running");
}
if !self.balloon_enabled {
bail!("virtio-balloon is not enabled");
}
let socket_path_cstring = path_to_cstring(&self.crosvm_control_socket_path);
// SAFETY: Pointer is valid for the lifetime of the call.
let success = unsafe {
crosvm_control::crosvm_client_balloon_vms(socket_path_cstring.as_ptr(), num_bytes)
};
if !success {
bail!("Error sending balloon adjustment");
}
Ok(())
}
/// Checks if ramdump has been created. If so, send it to tombstoned.
fn handle_ramdump(&self) -> Result<(), Error> {
let ramdump_path = self.temporary_directory.join("ramdump");
if !ramdump_path.as_path().try_exists()? {
return Ok(());
}
if std::fs::metadata(&ramdump_path)?.len() > 0 {
Self::send_ramdump_to_tombstoned(&ramdump_path)?;
}
Ok(())
}
fn send_ramdump_to_tombstoned(ramdump_path: &Path) -> Result<(), Error> {
let mut input = File::open(ramdump_path)
.context(format!("Failed to open ramdump {:?} for reading", ramdump_path))?;
let pid = std::process::id() as i32;
let conn = TombstonedConnection::connect(pid, DebuggerdDumpType::Tombstone)
.context("Failed to connect to tombstoned")?;
let mut output = conn
.text_output
.as_ref()
.ok_or_else(|| anyhow!("Could not get file to write the tombstones on"))?;
std::io::copy(&mut input, &mut output).context("Failed to send ramdump to tombstoned")?;
info!("Ramdump {:?} sent to tombstoned", ramdump_path);
conn.notify_completion()?;
Ok(())
}
/// Suspends the VM's vCPUs.
pub fn suspend(&self) -> Result<(), Error> {
let socket_path_cstring = path_to_cstring(&self.crosvm_control_socket_path);
// SAFETY: Pointer is valid for the lifetime of the call.
let success =
unsafe { crosvm_control::crosvm_client_suspend_vm(socket_path_cstring.as_ptr()) };
if !success {
bail!("Failed to suspend VM");
}
Ok(())
}
/// Resumes the VM's vCPUs.
pub fn resume(&self) -> Result<(), Error> {
let socket_path_cstring = path_to_cstring(&self.crosvm_control_socket_path);
// SAFETY: Pointer is valid for the lifetime of the call.
let success =
unsafe { crosvm_control::crosvm_client_resume_vm(socket_path_cstring.as_ptr()) };
if !success {
bail!("Failed to resume VM");
}
Ok(())
}
/// Performs full resume of VM.
pub fn resume_full(&self) -> Result<(), Error> {
let socket_path_cstring = path_to_cstring(&self.crosvm_control_socket_path);
// SAFETY: Pointer is valid for the lifetime of the call.
let success =
unsafe { crosvm_control::crosvm_client_resume_vm_full(socket_path_cstring.as_ptr()) };
if !success {
bail!("Failed to resume VM");
}
Ok(())
}
}
// Checks whether this process is allowed to run on all host CPUs, based on the
// Cpus_allowed_list field of /proc/self/status.
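// Example line from /proc/self/status on an 8-CPU host with no affinity restriction:
//   Cpus_allowed_list:   0-7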
fn check_if_all_cpus_allowed() -> Result<bool> {
let file = read_to_string("/proc/self/status")?;
let lines: Vec<_> = file.split('\n').collect();
for line in lines {
if line.contains("Cpus_allowed_list") {
let prop: Vec<_> = line.split_whitespace().collect();
if prop.len() != 2 {
return Ok(false);
}
let cpu_list: Vec<_> = prop[1].split('-').collect();
// Only a contiguous CPU list is allowed
if cpu_list.len() != 2 {
return Ok(false);
}
if let Some(cpus) = get_num_cpus() {
let max_cpu = cpu_list[1].parse::<usize>()?;
if max_cpu == cpus - 1 {
return Ok(true);
} else {
return Ok(false);
}
}
}
}
Ok(false)
}
// Get guest time from /proc/[crosvm pid]/stat
fn get_guest_time(pid: u32) -> Result<i64> {
let file = read_to_string(format!("/proc/{}/stat", pid))?;
let data_list: Vec<_> = file.split_whitespace().collect();
// guest_time is the 43rd field when the file is split on whitespace.
// Example of /proc/[pid]/stat :
// 6603 (kworker/104:1H-kblockd) I 2 0 0 0 -1 69238880 0 0 0 0 0 88 0 0 0 -20 1 0 1845 0 0
// 18446744073709551615 0 0 0 0 0 0 0 2147483647 0 0 0 0 17 104 0 0 0 0 0 0 0 0 0 0 0 0 0
if data_list.len() < 43 {
bail!("Failed to parse command result for getting guest time : {}", file);
}
let guest_time_ticks = data_list[42].parse::<i64>()?;
if guest_time_ticks == 0 {
bail!("zero value is measured on elapsed CPU guest_time");
}
// SAFETY: It just returns an integer about CPU tick information.
let ticks_per_sec = unsafe { sysconf(_SC_CLK_TCK) };
Ok(guest_time_ticks * MILLIS_PER_SEC / ticks_per_sec)
}
// Get rss from VmHWM of /proc/[crosvm pid]/status
fn get_rss(pid: u32) -> Result<i64> {
let file = read_to_string(format!("/proc/{}/status", pid))?;
let lines: Vec<_> = file.split('\n').collect();
for line in lines {
// VmHWM: 12345 kB
if line.starts_with("VmHWM:") {
let parts: Vec<&str> = line.split_whitespace().collect();
if parts.len() != 3 {
bail!("Failed to parse line: {}", line);
}
let rss = parts[1].parse::<i64>()?;
// We no longer distinguish between memory used by the VM itself and by the containing
// crosvm process. The former is not available because /proc/<pid>/smaps is not available
// when the process is in the zombie state.
return Ok(rss);
}
}
bail!("can't find VmHWM in the status file");
}
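// Maps the crosvm exit status, together with the failure reason reported over the failure
// pipe, to an AIDL DeathReason for the on-died callbacks.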
fn death_reason(
result: &Result<ExitStatus, io::Error>,
mut failure_reason: &str,
) -> aidl::DeathReason {
use aidl::DeathReason;
if let Some((reason, info)) = failure_reason.split_once('|') {
// Separator indicates extra context information is present after the failure name.
error!("Failure info: {info}");
failure_reason = reason;
}
if let Ok(status) = result {
match failure_reason {
"PVM_FIRMWARE_PUBLIC_KEY_MISMATCH" => {
return DeathReason::PVM_FIRMWARE_PUBLIC_KEY_MISMATCH
}
"PVM_FIRMWARE_INSTANCE_IMAGE_CHANGED" => {
return DeathReason::PVM_FIRMWARE_INSTANCE_IMAGE_CHANGED
}
"MICRODROID_FAILED_TO_CONNECT_TO_VIRTUALIZATION_SERVICE" => {
return DeathReason::MICRODROID_FAILED_TO_CONNECT_TO_VIRTUALIZATION_SERVICE
}
"MICRODROID_PAYLOAD_HAS_CHANGED" => return DeathReason::MICRODROID_PAYLOAD_HAS_CHANGED,
"MICRODROID_PAYLOAD_VERIFICATION_FAILED" => {
return DeathReason::MICRODROID_PAYLOAD_VERIFICATION_FAILED
}
"MICRODROID_INVALID_PAYLOAD_CONFIG" => {
return DeathReason::MICRODROID_INVALID_PAYLOAD_CONFIG
}
"MICRODROID_UNKNOWN_RUNTIME_ERROR" => {
return DeathReason::MICRODROID_UNKNOWN_RUNTIME_ERROR
}
"HANGUP" => return DeathReason::HANGUP,
_ => {}
}
match status.code() {
None => DeathReason::KILLED,
Some(0) => DeathReason::SHUTDOWN,
Some(CROSVM_START_ERROR_STATUS) => DeathReason::START_FAILED,
Some(CROSVM_REBOOT_STATUS) => DeathReason::REBOOT,
Some(CROSVM_CRASH_STATUS) => DeathReason::CRASH,
Some(CROSVM_WATCHDOG_REBOOT_STATUS) => DeathReason::WATCHDOG_REBOOT,
Some(_) => DeathReason::UNKNOWN,
}
} else {
DeathReason::INFRASTRUCTURE_ERROR
}
}
fn exit_signal(result: &Result<ExitStatus, io::Error>) -> Option<i32> {
match result {
Ok(status) => status.signal(),
Err(_) => None,
}
}
const SYSFS_PLATFORM_DEVICES_PATH: &str = "/sys/devices/platform/";
const VFIO_PLATFORM_DRIVER_PATH: &str = "/sys/bus/platform/drivers/vfio-platform";
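// Builds the --vfio argument for a platform device, after verifying that the device exists
// under /sys/devices/platform/ and is bound to the vfio-platform driver.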
fn vfio_argument_for_platform_device(device: &VfioDevice) -> Result<String, Error> {
// Check platform device exists
let path = Path::new(&device.getSysfsPath()?).canonicalize()?;
if !path.starts_with(SYSFS_PLATFORM_DEVICES_PATH) {
bail!("{path:?} is not a platform device");
}
// Check platform device is bound to VFIO driver
let dev_driver_path = path.join("driver").canonicalize()?;
if dev_driver_path != Path::new(VFIO_PLATFORM_DRIVER_PATH) {
bail!("{path:?} is not bound to VFIO-platform driver");
}
if let Some(p) = path.to_str() {
Ok(format!("--vfio={p},iommu=pkvm-iommu,dt-symbol={0}", device.getDtboLabel()?))
} else {
bail!("invalid path {path:?}");
}
}
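// Spawns one `crosvm device fs` (vhost-user virtiofs backend) process per shared path that
// isn't handled in the app domain. The returned children are reaped in monitor_vm_exit after
// the main crosvm process exits.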
fn run_virtiofs(config: &CrosvmConfig) -> io::Result<Vec<SharedChild>> {
let mut devices: Vec<SharedChild> = Vec::new();
for shared_path in &config.shared_paths {
if shared_path.app_domain {
continue;
}
let ugid_map_value = format!(
"{} {} {} {} {} /",
shared_path.guest_uid,
shared_path.guest_gid,
shared_path.host_uid,
shared_path.host_gid,
shared_path.mask,
);
let cfg_arg = format!("ugid_map='{}'", ugid_map_value);
let mut command = Command::new(CROSVM_PATH);
command
.arg("device")
.arg("fs")
.arg(format!("--socket={}", &shared_path.socket_path))
.arg(format!("--tag={}", &shared_path.tag))
.arg(format!("--shared-dir={}", &shared_path.path))
.arg("--cfg")
.arg(cfg_arg.as_str())
.arg("--disable-sandbox")
.arg("--skip-pivot-root=true");
print_crosvm_args(&command);
let result = SharedChild::spawn(&mut command)?;
info!("Spawned virtiofs crosvm({})", result.id());
devices.push(result);
}
Ok(devices)
}
/// Starts an instance of `crosvm` to manage a new VM.
fn run_vm(config: CrosvmConfig, crosvm_control_socket_path: &Path) -> Result<SharedChild, Error> {
let mut command = Command::new(CROSVM_PATH);
command.arg0(config.command.arg0);
command.args(config.command.args);
command.arg("--cid").arg(config.cid.to_string());
if config.balloon {
command.arg("--balloon-page-reporting");
} else {
command.arg("--no-balloon");
}
if config.enable_hypervisor_specific_auth_method && !config.protected {
bail!("hypervisor specific auth method only supported for protected VMs");
}
if config.protected {
if config.enable_hypervisor_specific_auth_method {
if !hypervisor_props::is_gunyah()? {
bail!("hypervisor specific auth method not supported for current hypervisor");
}
// "QCOM Trusted VM" compatibility mode.
//
// When this mode is enabled, two hypervisor specific IDs are expected to be packed
// into the instance ID. We extract them here and pass along to crosvm so they can be
// given to the hypervisor driver via an ioctl.
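// Byte layout (little-endian): instance_id[58..60] holds the u16 VM ID and
// instance_id[60..64] holds the u32 PAS ID.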
let pas_id = u32::from_le_bytes(config.instance_id[60..64].try_into().unwrap());
let vm_id = u16::from_le_bytes(config.instance_id[58..60].try_into().unwrap());
command.arg("--hypervisor").arg(
format!("gunyah[device=/dev/gunyah,qcom_trusted_vm_id={vm_id},qcom_trusted_vm_pas_id={pas_id}]"),
);
// Put the FDT close to the payload (the default is the end of RAM) so that CMA can be used
// without bloating memory usage.
command.arg("--fdt-position").arg("after-payload");
}
match system_properties::read(SYSPROP_CUSTOM_PVMFW_PATH)? {
Some(pvmfw_path) if !pvmfw_path.is_empty() => {
if pvmfw_path == "none" {
command.arg("--protected-vm-without-firmware")
} else {
command.arg("--protected-vm-with-firmware").arg(pvmfw_path)
}
}
_ => command.arg("--protected-vm"),
};
// Workaround to keep crash_dump from trying to read protected guest memory.
// Context in b/238324526.
command.arg("--unmap-guest-memory-on-fork");
// Lock the guest memory to improve memory accounting. More context in b/407786138
//
// Note that this uses MLOCK_ONFAULT underneath, so we still only pay for memory as it is
// used. Also depends on MADV_DONTNEED_LOCKED, which requires Linux v5.18+.
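// E.g. a release string such as "6.1.57-android14-11" (hypothetical) parses to (6, 1).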
fn kernel_version() -> Option<(u32, u32)> {
let release = nix::sys::utsname::uname().ok()?.release().to_string_lossy().into_owned();
let mut release_iter = release.splitn(3, ".");
Some((release_iter.next()?.parse().ok()?, release_iter.next()?.parse().ok()?))
}
if kernel_version().context("bad uname")? >= (5, 18) {
command.arg("--lock-guest-memory-dontneed");
} else {
warn!("kernel is too old enable --lock-guest-memory-dontneed");
}
}
// Move the PCI MMIO regions to near the end of the low-MMIO space.
// This is done to accommodate a limitation in a partner's hypervisor.
#[cfg(target_arch = "aarch64")]
command
.arg("--pci")
.arg("mem=[start=0x2c000000,size=0x2000000],cam=[start=0x2e000000,size=0x1000000]");
if let Some(gdb_port) = config.gdb_port {
command.arg("--gdb").arg(gdb_port.to_string());
command.arg("-p").arg("nokaslr");
}
// Keep track of what file descriptors should be mapped to the crosvm process.
let mut preserved_fds = Vec::new();
preserved_fds.extend(config.command.preserved_fds);
if let Some(dump_dt_fd) = config.dump_dt_fd {
let dump_dt_fd = add_preserved_fd(&mut preserved_fds, dump_dt_fd);
command.arg("--dump-device-tree-blob").arg(dump_dt_fd);
}
#[cfg(target_arch = "aarch64")]
command.arg("--no-pmu");
let control_sock = create_crosvm_control_listener(crosvm_control_socket_path)
.context("failed to create control listener")?;
command.arg("--socket").arg(add_preserved_fd(&mut preserved_fds, control_sock));
config.device_tree_overlays.into_iter().for_each(|dt_overlay| {
let arg = add_preserved_fd(&mut preserved_fds, dt_overlay);
command.arg("--device-tree-overlay").arg(arg);
});
if config.hugepages {
command.arg("--hugepages");
}
if config.boost_uclamp {
command.arg("--boost-uclamp");
}
if !config.vfio_devices.is_empty() {
if let Some(dtbo) = config.dtbo {
command.arg(format!(
"--device-tree-overlay={},filter",
add_preserved_fd(&mut preserved_fds, dtbo)
));
} else {
bail!("VFIO devices assigned but no DTBO available");
}
};
for device in config.vfio_devices {
command.arg(vfio_argument_for_platform_device(&device)?);
}
for shared_path in &config.shared_paths {
if shared_path.app_domain {
if let Some(socket_fd) = &shared_path.socket_fd {
let socket_path =
add_preserved_fd(&mut preserved_fds, socket_fd.try_clone().unwrap());
command.arg("--vhost-user").arg(format!("fs,socket={}", socket_path));
}
} else {
if let Err(e) = wait_for_file(&shared_path.socket_path, 5) {
bail!("Error waiting for file: {}", e);
}
command.arg("--vhost-user").arg(format!("fs,socket={}", shared_path.socket_path));
}
}
debug!("Preserving FDs {:?}", preserved_fds);
command.preserved_fds(preserved_fds);
if config.start_suspended {
command.arg("--suspended");
}
if config.enable_guest_ffa {
command.arg("--ffa=auto");
}
print_crosvm_args(&command);
let result = SharedChild::spawn(&mut command)?;
debug!("Spawned crosvm({}).", result.id());
Ok(result)
}
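/// Polls every 100ms for `path` to exist, failing with `NotFound` once `timeout_secs` seconds
/// have elapsed.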
fn wait_for_file(path: &str, timeout_secs: u64) -> Result<(), std::io::Error> {
let start_time = std::time::Instant::now();
let timeout = Duration::from_secs(timeout_secs);
while start_time.elapsed() < timeout {
if std::fs::metadata(path).is_ok() {
return Ok(()); // File exists
}
thread::sleep(Duration::from_millis(100));
}
Err(std::io::Error::new(
std::io::ErrorKind::NotFound,
format!("File not found within {} seconds: {}", timeout_secs, path),
))
}
/// Print arguments of the crosvm command. In doing so, /proc/self/fd/XX is annotated with the
/// actual file path if the FD is backed by a regular file. If not, the /proc path is printed
/// unmodified.
fn print_crosvm_args(command: &Command) {
let re = Regex::new(r"/proc/self/fd/[\d]+").unwrap();
info!(
"Running crosvm with args: {:?}",
command
.get_args()
.map(|s| s.to_string_lossy())
.map(|s| {
re.replace_all(&s, |caps: &Captures| {
let path = &caps[0];
if let Ok(realpath) = std::fs::canonicalize(path) {
format!("{} ({})", path, realpath.to_string_lossy())
} else {
path.to_owned()
}
})
.into_owned()
})
.collect::<Vec<_>>()
);
}
/// Adds the file descriptor for `file` to `preserved_fds`, and returns a string of the form
/// "/proc/self/fd/N" where N is the file descriptor.
fn add_preserved_fd<F: Into<OwnedFd>>(preserved_fds: &mut Vec<OwnedFd>, file: F) -> String {
let fd = file.into();
let raw_fd = fd.as_raw_fd();
preserved_fds.push(fd);
format!("/proc/self/fd/{}", raw_fd)
}
/// Creates a new pipe with the `O_CLOEXEC` flag set, and returns the read side and write side.
fn create_pipe() -> Result<(File, File), Error> {
let (read_fd, write_fd) = pipe2(OFlag::O_CLOEXEC)?;
Ok((read_fd.into(), write_fd.into()))
}
/// Creates and binds a unix seqpacket listening socket to be passed as crosvm's `--socket`
/// argument. See `UnixSeqpacketListener::bind` in crosvm's code for reference.
fn create_crosvm_control_listener(crosvm_control_socket_path: &Path) -> Result<OwnedFd> {
use nix::sys::socket;
let fd = socket::socket(
socket::AddressFamily::Unix,
socket::SockType::SeqPacket,
socket::SockFlag::empty(),
None,
)
.context("socket failed")?;
socket::bind(fd.as_raw_fd(), &socket::UnixAddr::new(crosvm_control_socket_path)?)
.context("bind failed")?;
// The exact backlog size isn't important. crosvm uses 128 internally. We use 127 here
// because of a `nix` bug.
socket::listen(&fd, socket::Backlog::new(127).unwrap()).context("listen failed")?;
Ok(fd)
}
fn path_to_cstring(path: &Path) -> CString {
if let Some(s) = path.to_str() {
if let Ok(s) = CString::new(s) {
return s;
}
}
// The path contains invalid utf8 or a null, which should never happen.
panic!("bad path: {path:?}");
}
struct SwiotlbEstimateInputs {
guest_page_size: u32,
block_count: u32,
console_count: u32,
balloon: bool,
}
/// Estimate needed size of SWIOTLB based on crosvm and Linux kernel implementation details and
/// workload guesses.
///
/// Since it is based on implementation details of other projects, it is bound to go stale.
///
/// Optimized for microdroid. Custom VMs may want to set an explicit swiotlb size in their
/// configs.
fn estimate_swiotlb_usage_mib(inputs: SwiotlbEstimateInputs) -> u32 {
fn align(x: u32, alignment: u32) -> u32 {
// Round `x` up to the next multiple of `alignment`.
x.div_ceil(alignment) * alignment
}
// virtio split queue data structure size, based on virtio spec.
let virtq_size = |entries: u32| -> u32 {
// Assume any extra space in the last page is wasted.
align(
align(16 * entries, 16) + align(6 + 2 * entries, 2) + align(6 + 8 * entries, 4),
inputs.guest_page_size,
)
};
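// E.g. for a 256-entry queue on 4 KiB pages: 4096 (descriptor table) + 518 (available ring)
// + 2056 (used ring) = 6670 bytes, which rounds up to two 4 KiB pages.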
let mut total = 0;
// virtio vsock.
total += [
// event queue.
virtq_size(256),
// tx queue.
virtq_size(256),
// rx queue.
virtq_size(256),
// Linux eagerly fills the rx queue with requests, one page each.
256 * inputs.guest_page_size,
]
.iter()
.sum::<u32>();
// virtio console.
total += inputs.console_count
* [
// tx queue.
virtq_size(CONSOLE_TX_QUEUE_SIZE),
// rx queue.
virtq_size(CONSOLE_RX_QUEUE_SIZE),
// Linux eagerly fills the rx queue with requests, one page each.
CONSOLE_RX_QUEUE_SIZE * inputs.guest_page_size,
]
.iter()
.sum::<u32>();
// virtio block.
total += inputs.block_count
* [
// crosvm gives 16 queues.
16 * virtq_size(256),
]
.iter()
.sum::<u32>();
// virtio balloon.
if inputs.balloon {
// Expected queues: inflate, deflate, stats, reporting
total += 4 * virtq_size(128);
}
// Guess at workload-dependent peak memory needs.
//
// This was chosen by making it just large enough to boot Microdroid, then adding 2 MiB. It may
// need to grow with vCPU count and/or page size.
total += 4 * 1024 * 1024;
total.div_ceil(1024 * 1024)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_estimate_swiotlb() {
// Basic microdroid configuration.
assert_eq!(
estimate_swiotlb_usage_mib(SwiotlbEstimateInputs {
guest_page_size: 4096,
block_count: 3,
console_count: 3,
balloon: true,
}),
6
);
// Basic 16k microdroid configuration.
assert_eq!(
estimate_swiotlb_usage_mib(SwiotlbEstimateInputs {
guest_page_size: 16 * 1024,
block_count: 3,
console_count: 3,
balloon: true,
}),
10
);
}
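#[test]
fn test_path_to_cstring() {
// Valid UTF-8 paths with no interior NUL convert losslessly.
assert_eq!(path_to_cstring(Path::new("/tmp")), CString::new("/tmp").unwrap());
}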
}