blob: ec00664e3cd6baf73e23bdbdac03a47eeb84c0ba [file] [log] [blame]
// Copyright 2022 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#[cfg(target_os = "android")]
mod android;
pub mod cmdline;
pub mod config;
mod device_helpers;
#[cfg(feature = "gpu")]
pub(crate) mod gpu;
#[cfg(feature = "pci-hotplug")]
pub(crate) mod jail_warden;
#[cfg(feature = "pci-hotplug")]
pub(crate) mod pci_hotplug_helpers;
#[cfg(feature = "pci-hotplug")]
pub(crate) mod pci_hotplug_manager;
mod vcpu;
use std::cmp::max;
use std::collections::BTreeMap;
use std::collections::BTreeSet;
#[cfg(feature = "registered_events")]
use std::collections::HashMap;
#[cfg(feature = "registered_events")]
use std::collections::HashSet;
use std::convert::TryInto;
use std::ffi::CString;
use std::fs::File;
use std::fs::OpenOptions;
#[cfg(feature = "registered_events")]
use std::hash::Hash;
use std::io::prelude::*;
use std::io::stdin;
use std::iter;
use std::mem;
#[cfg(target_arch = "x86_64")]
use std::ops::RangeInclusive;
use std::os::unix::prelude::OpenOptionsExt;
use std::os::unix::process::ExitStatusExt;
use std::path::Path;
use std::process;
#[cfg(feature = "registered_events")]
use std::rc::Rc;
use std::sync::mpsc;
use std::sync::Arc;
use std::sync::Barrier;
use std::thread::JoinHandle;
#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
use aarch64::AArch64 as Arch;
use acpi_tables::sdt::SDT;
use anyhow::anyhow;
use anyhow::bail;
use anyhow::Context;
use anyhow::Result;
use arch::DtbOverlay;
use arch::IrqChipArch;
use arch::LinuxArch;
use arch::RunnableLinuxVm;
use arch::VcpuAffinity;
use arch::VcpuArch;
use arch::VirtioDeviceStub;
use arch::VmArch;
use arch::VmComponents;
use arch::VmImage;
use argh::FromArgs;
use base::ReadNotifier;
#[cfg(feature = "balloon")]
use base::UnixSeqpacket;
use base::UnixSeqpacketListener;
use base::UnlinkUnixSeqpacketListener;
use base::*;
use cros_async::Executor;
use device_helpers::*;
use devices::create_devices_worker_thread;
use devices::serial_device::SerialHardware;
#[cfg(feature = "pvclock")]
use devices::tsc::get_tsc_sync_mitigations;
use devices::vfio::VfioCommonSetup;
use devices::vfio::VfioCommonTrait;
#[cfg(feature = "gpu")]
use devices::virtio;
#[cfg(any(feature = "video-decoder", feature = "video-encoder"))]
use devices::virtio::device_constants::video::VideoDeviceType;
#[cfg(feature = "gpu")]
use devices::virtio::gpu::EventDevice;
#[cfg(target_arch = "x86_64")]
use devices::virtio::memory_mapper::MemoryMapper;
use devices::virtio::memory_mapper::MemoryMapperTrait;
use devices::virtio::vhost::user::VhostUserListener;
use devices::virtio::vhost::user::VhostUserListenerTrait;
#[cfg(feature = "balloon")]
use devices::virtio::BalloonFeatures;
#[cfg(feature = "balloon")]
use devices::virtio::BalloonMode;
#[cfg(feature = "pci-hotplug")]
use devices::virtio::NetParameters;
#[cfg(feature = "pci-hotplug")]
use devices::virtio::NetParametersMode;
use devices::virtio::VirtioDevice;
use devices::virtio::VirtioDeviceType;
use devices::virtio::VirtioTransportType;
use devices::Bus;
use devices::BusDeviceObj;
use devices::BusType;
use devices::CoIommuDev;
#[cfg(feature = "usb")]
use devices::DeviceProvider;
#[cfg(target_arch = "x86_64")]
use devices::HotPlugBus;
#[cfg(target_arch = "x86_64")]
use devices::HotPlugKey;
use devices::IommuDevType;
use devices::IrqEventIndex;
use devices::IrqEventSource;
#[cfg(feature = "pci-hotplug")]
use devices::NetResourceCarrier;
#[cfg(target_arch = "x86_64")]
use devices::PciAddress;
#[cfg(target_arch = "x86_64")]
use devices::PciBridge;
use devices::PciDevice;
#[cfg(target_arch = "x86_64")]
use devices::PciMmioMapper;
#[cfg(target_arch = "x86_64")]
use devices::PciRoot;
#[cfg(target_arch = "x86_64")]
use devices::PciRootCommand;
#[cfg(target_arch = "x86_64")]
use devices::PcieDownstreamPort;
#[cfg(target_arch = "x86_64")]
use devices::PcieHostPort;
#[cfg(target_arch = "x86_64")]
use devices::PcieRootPort;
#[cfg(target_arch = "x86_64")]
use devices::PcieUpstreamPort;
use devices::PvPanicCode;
use devices::PvPanicPciDevice;
#[cfg(feature = "pci-hotplug")]
use devices::ResourceCarrier;
use devices::StubPciDevice;
use devices::VirtioMmioDevice;
use devices::VirtioPciDevice;
#[cfg(feature = "usb")]
use devices::XhciController;
#[cfg(feature = "gpu")]
use gpu::*;
#[cfg(target_arch = "riscv64")]
use hypervisor::CpuConfigRiscv64;
#[cfg(target_arch = "x86_64")]
use hypervisor::CpuConfigX86_64;
use hypervisor::Hypervisor;
use hypervisor::HypervisorCap;
use hypervisor::MemCacheType;
use hypervisor::ProtectionType;
use hypervisor::Vm;
use hypervisor::VmCap;
use jail::*;
#[cfg(feature = "pci-hotplug")]
use jail_warden::JailWarden;
#[cfg(feature = "pci-hotplug")]
use jail_warden::JailWardenImpl;
#[cfg(feature = "pci-hotplug")]
use jail_warden::PermissiveJailWarden;
use libc;
use metrics::MetricsController;
use minijail::Minijail;
#[cfg(feature = "pci-hotplug")]
use pci_hotplug_manager::PciHotPlugManager;
use resources::AddressRange;
use resources::Alloc;
use resources::SystemAllocator;
#[cfg(target_arch = "riscv64")]
use riscv64::Riscv64 as Arch;
use rutabaga_gfx::RutabagaGralloc;
use smallvec::SmallVec;
#[cfg(feature = "swap")]
use swap::SwapController;
use sync::Condvar;
use sync::Mutex;
use vm_control::api::VmMemoryClient;
use vm_control::*;
use vm_memory::GuestAddress;
use vm_memory::GuestMemory;
use vm_memory::MemoryPolicy;
use vm_memory::MemoryRegionOptions;
#[cfg(target_arch = "x86_64")]
use x86_64::X8664arch as Arch;
use crate::crosvm::config::Config;
use crate::crosvm::config::Executable;
use crate::crosvm::config::FileBackedMappingParameters;
use crate::crosvm::config::HypervisorKind;
use crate::crosvm::config::InputDeviceOption;
use crate::crosvm::config::IrqChipKind;
use crate::crosvm::config::DEFAULT_TOUCH_DEVICE_HEIGHT;
use crate::crosvm::config::DEFAULT_TOUCH_DEVICE_WIDTH;
#[cfg(feature = "gdb")]
use crate::crosvm::gdb::gdb_thread;
#[cfg(feature = "gdb")]
use crate::crosvm::gdb::GdbStub;
#[cfg(target_arch = "x86_64")]
use crate::crosvm::ratelimit::Ratelimit;
use crate::crosvm::sys::cmdline::DevicesCommand;
use crate::crosvm::sys::config::SharedDir;
use crate::crosvm::sys::config::SharedDirKind;
/// Path of the KVM device node.
const KVM_PATH: &str = "/dev/kvm";
/// Path of the GenieZone device node (arm/aarch64 builds with the `geniezone` feature only).
#[cfg(all(any(target_arch = "arm", target_arch = "aarch64"), feature = "geniezone"))]
const GENIEZONE_PATH: &str = "/dev/gzvm";
/// Path of the Gunyah device node (arm/aarch64 builds with the `gunyah` feature only).
#[cfg(all(any(target_arch = "arm", target_arch = "aarch64"), feature = "gunyah"))]
const GUNYAH_PATH: &str = "/dev/gunyah";
/// Builds every virtio device requested by `cfg`, each paired with its optional jail.
///
/// Devices are appended to the returned vector in a fixed order: wayland, the GPU window
/// input devices and the GPU itself, virtio consoles, block, scsi, pmem, rng, pvclock, vtpm
/// proxy, virtio-input, balloon, net, virtio-snd, video decoder/encoder, vsock, vhost-scmi,
/// vhost-user-fs, shared directories, sound, and finally the generic vhost-user frontends.
/// Most of these are additionally gated on a cargo feature and/or a `cfg` option.
///
/// `disk_device_tubes`, `pmem_device_tubes` and `fs_device_tubes` are consumed from the front:
/// one tube per configured disk, pmem device and `SharedDirKind::FS` shared directory. The
/// function panics (via `Vec::remove`) if one of these vectors holds fewer tubes than needed.
///
/// # Errors
///
/// Returns an error as soon as any tube, device or jail cannot be created.
fn create_virtio_devices(
    cfg: &Config,
    vm: &mut impl Vm,
    resources: &mut SystemAllocator,
    #[cfg_attr(not(feature = "gpu"), allow(unused_variables))] vm_evt_wrtube: &SendTube,
    #[cfg(feature = "balloon")] balloon_device_tube: Option<Tube>,
    #[cfg(feature = "balloon")] balloon_inflate_tube: Option<Tube>,
    #[cfg(feature = "balloon")] init_balloon_size: u64,
    #[cfg(feature = "balloon")] dynamic_mapping_device_tube: Option<Tube>,
    disk_device_tubes: &mut Vec<Tube>,
    pmem_device_tubes: &mut Vec<Tube>,
    fs_device_tubes: &mut Vec<Tube>,
    #[cfg(feature = "gpu")] gpu_control_tube: Tube,
    #[cfg(feature = "gpu")] render_server_fd: Option<SafeDescriptor>,
    #[cfg(feature = "gpu")] has_vfio_gfx_device: bool,
    #[cfg(feature = "registered_events")] registered_evt_q: &SendTube,
    #[cfg(feature = "pvclock")] pvclock_device_tube: Option<Tube>,
) -> DeviceResult<Vec<VirtioDeviceStub>> {
    let mut devs = Vec::new();

    // Tube ends collected here are handed to the GPU device when it is created below.
    #[cfg(any(feature = "gpu", feature = "video-decoder", feature = "video-encoder"))]
    let mut resource_bridges = Vec::<Tube>::new();

    if !cfg.wayland_socket_paths.is_empty() {
        #[cfg_attr(not(feature = "gpu"), allow(unused_mut))]
        let mut wl_resource_bridge = None::<Tube>;

        #[cfg(feature = "gpu")]
        {
            if cfg.gpu_parameters.is_some() {
                let (wl_socket, gpu_socket) = Tube::pair().context("failed to create tube")?;
                resource_bridges.push(gpu_socket);
                wl_resource_bridge = Some(wl_socket);
            }
        }

        devs.push(create_wayland_device(
            cfg.protection_type,
            &cfg.jail_config,
            &cfg.wayland_socket_paths,
            wl_resource_bridge,
        )?);
    }

    // The video tubes must exist before the GPU device is created, because the GPU device
    // takes ownership of `resource_bridges`. A tube creation failure is reported to the caller
    // like every other failure in this function instead of panicking.
    #[cfg(feature = "video-decoder")]
    let video_dec_cfg = {
        let mut tubes = Vec::new();
        for config in cfg.video_dec.iter() {
            let (video_tube, gpu_tube) =
                Tube::pair().context("failed to create tube for video decoder")?;
            resource_bridges.push(gpu_tube);
            tubes.push((video_tube, config.backend));
        }
        tubes
    };

    #[cfg(feature = "video-encoder")]
    let video_enc_cfg = {
        let mut tubes = Vec::new();
        for config in cfg.video_enc.iter() {
            let (video_tube, gpu_tube) =
                Tube::pair().context("failed to create tube for video encoder")?;
            resource_bridges.push(gpu_tube);
            tubes.push((video_tube, config.backend));
        }
        tubes
    };

    #[cfg(feature = "gpu")]
    {
        if let Some(gpu_parameters) = &cfg.gpu_parameters {
            let mut event_devices = Vec::new();
            if cfg.display_window_mouse {
                // Use the first configured display, or the default display parameters when
                // none were given.
                let display_param = gpu_parameters
                    .display_params
                    .first()
                    .cloned()
                    .unwrap_or_default();
                let (gpu_display_w, gpu_display_h) = display_param.get_virtual_display_size();

                let (event_device_socket, virtio_dev_socket) =
                    StreamChannel::pair(BlockingMode::Nonblocking, FramingMode::Byte)
                        .context("failed to create socket")?;
                let mut multi_touch_width = gpu_display_w;
                let mut multi_touch_height = gpu_display_h;
                let mut multi_touch_name = None;
                // Only the first multi-touch option is consulted; any field it leaves unset
                // keeps the value derived from the display.
                for input in &cfg.virtio_input {
                    if let InputDeviceOption::MultiTouch {
                        width,
                        height,
                        name,
                        ..
                    } = input
                    {
                        if let Some(width) = width {
                            multi_touch_width = *width;
                        }
                        if let Some(height) = height {
                            multi_touch_height = *height;
                        }
                        if let Some(name) = name {
                            multi_touch_name = Some(name.as_str());
                        }
                        break;
                    }
                }
                let dev = virtio::input::new_multi_touch(
                    // u32::MAX is the least likely to collide with the indices generated below
                    // for the multi_touch options, which begin at 0.
                    u32::MAX,
                    virtio_dev_socket,
                    multi_touch_width,
                    multi_touch_height,
                    multi_touch_name,
                    virtio::base_features(cfg.protection_type),
                )
                .context("failed to set up mouse device")?;
                devs.push(VirtioDeviceStub {
                    dev: Box::new(dev),
                    jail: simple_jail(&cfg.jail_config, "input_device")?,
                });
                event_devices.push(EventDevice::touchscreen(event_device_socket));
            }
            if cfg.display_window_keyboard {
                let (event_device_socket, virtio_dev_socket) =
                    StreamChannel::pair(BlockingMode::Nonblocking, FramingMode::Byte)
                        .context("failed to create socket")?;
                let dev = virtio::input::new_keyboard(
                    // u32::MAX is the least likely to collide with the indices generated below
                    // for the keyboard options, which begin at 0.
                    u32::MAX,
                    virtio_dev_socket,
                    virtio::base_features(cfg.protection_type),
                )
                .context("failed to set up keyboard device")?;
                devs.push(VirtioDeviceStub {
                    dev: Box::new(dev),
                    jail: simple_jail(&cfg.jail_config, "input_device")?,
                });
                event_devices.push(EventDevice::keyboard(event_device_socket));
            }

            devs.push(create_gpu_device(
                cfg,
                vm_evt_wrtube,
                gpu_control_tube,
                resource_bridges,
                render_server_fd,
                has_vfio_gfx_device,
                event_devices,
            )?);
        }
    }

    // Serial ports backed by a virtio console.
    for (_, param) in cfg.serial_parameters.iter().filter(|(_k, v)| {
        v.hardware == SerialHardware::VirtioConsole
            || v.hardware == SerialHardware::LegacyVirtioConsole
    }) {
        let dev = param.create_virtio_device_and_jail(cfg.protection_type, &cfg.jail_config)?;
        devs.push(dev);
    }

    for disk in &cfg.disks {
        let disk_config = DiskConfig::new(disk, Some(disk_device_tubes.remove(0)));
        devs.push(
            disk_config.create_virtio_device_and_jail(cfg.protection_type, &cfg.jail_config)?,
        );
    }

    // All scsi entries are handed to a single `ScsiConfig`.
    if !cfg.scsis.is_empty() {
        let scsi_config = ScsiConfig(&cfg.scsis);
        devs.push(
            scsi_config.create_virtio_device_and_jail(cfg.protection_type, &cfg.jail_config)?,
        );
    }

    for (index, pmem_disk) in cfg.pmem_devices.iter().enumerate() {
        let pmem_device_tube = pmem_device_tubes.remove(0);
        devs.push(create_pmem_device(
            cfg.protection_type,
            &cfg.jail_config,
            vm,
            resources,
            pmem_disk,
            index,
            pmem_device_tube,
        )?);
    }

    if cfg.rng {
        devs.push(create_rng_device(cfg.protection_type, &cfg.jail_config)?);
    }

    #[cfg(feature = "pvclock")]
    if let Some(suspend_tube) = pvclock_device_tube {
        let tsc_state = devices::tsc::tsc_state()?;
        let tsc_sync_mitigations =
            get_tsc_sync_mitigations(&tsc_state, cfg.vcpu_count.unwrap_or(1));
        if tsc_state.core_grouping.size() > 1 {
            // Host TSCs are not in sync. Log what mitigations are applied.
            warn!(
                "Host TSCs are not in sync, applying the following mitigations: {:?}",
                tsc_sync_mitigations
            );
        }
        devs.push(create_pvclock_device(
            cfg.protection_type,
            &cfg.jail_config,
            tsc_state.frequency,
            suspend_tube,
        )?);
        info!("virtio-pvclock is enabled for this vm");
    }

    #[cfg(feature = "vtpm")]
    {
        if cfg.vtpm_proxy {
            devs.push(create_vtpm_proxy_device(
                cfg.protection_type,
                &cfg.jail_config,
            )?);
        }
    }

    // Each kind of input device is numbered independently, starting at 0.
    let mut keyboard_idx = 0;
    let mut mouse_idx = 0;
    let mut rotary_idx = 0;
    let mut switches_idx = 0;
    let mut multi_touch_idx = 0;
    let mut single_touch_idx = 0;
    let mut trackpad_idx = 0;
    for input in &cfg.virtio_input {
        let input_dev = match input {
            InputDeviceOption::Evdev { path } => {
                create_vinput_device(cfg.protection_type, &cfg.jail_config, path.as_path())?
            }
            InputDeviceOption::Keyboard { path } => {
                let dev = create_keyboard_device(
                    cfg.protection_type,
                    &cfg.jail_config,
                    path.as_path(),
                    keyboard_idx,
                )?;
                keyboard_idx += 1;
                dev
            }
            InputDeviceOption::Mouse { path } => {
                let dev = create_mouse_device(
                    cfg.protection_type,
                    &cfg.jail_config,
                    path.as_path(),
                    mouse_idx,
                )?;
                mouse_idx += 1;
                dev
            }
            InputDeviceOption::MultiTouch {
                path,
                width,
                height,
                name,
            } => {
                let mut width = *width;
                let mut height = *height;
                // Only the first multi-touch device falls back to the display input size.
                if multi_touch_idx == 0 {
                    if width.is_none() {
                        width = cfg.display_input_width;
                    }
                    if height.is_none() {
                        height = cfg.display_input_height;
                    }
                }
                let dev = create_multi_touch_device(
                    cfg.protection_type,
                    &cfg.jail_config,
                    path.as_path(),
                    width.unwrap_or(DEFAULT_TOUCH_DEVICE_WIDTH),
                    height.unwrap_or(DEFAULT_TOUCH_DEVICE_HEIGHT),
                    name.as_deref(),
                    multi_touch_idx,
                )?;
                multi_touch_idx += 1;
                dev
            }
            InputDeviceOption::Rotary { path } => {
                let dev = create_rotary_device(
                    cfg.protection_type,
                    &cfg.jail_config,
                    path.as_path(),
                    rotary_idx,
                )?;
                rotary_idx += 1;
                dev
            }
            InputDeviceOption::SingleTouch {
                path,
                width,
                height,
                name,
            } => {
                let mut width = *width;
                let mut height = *height;
                // Only the first single-touch device falls back to the display input size.
                if single_touch_idx == 0 {
                    if width.is_none() {
                        width = cfg.display_input_width;
                    }
                    if height.is_none() {
                        height = cfg.display_input_height;
                    }
                }
                let dev = create_single_touch_device(
                    cfg.protection_type,
                    &cfg.jail_config,
                    path.as_path(),
                    width.unwrap_or(DEFAULT_TOUCH_DEVICE_WIDTH),
                    height.unwrap_or(DEFAULT_TOUCH_DEVICE_HEIGHT),
                    name.as_deref(),
                    single_touch_idx,
                )?;
                single_touch_idx += 1;
                dev
            }
            InputDeviceOption::Switches { path } => {
                let dev = create_switches_device(
                    cfg.protection_type,
                    &cfg.jail_config,
                    path.as_path(),
                    switches_idx,
                )?;
                switches_idx += 1;
                dev
            }
            InputDeviceOption::Trackpad {
                path,
                width,
                height,
                name,
            } => {
                let dev = create_trackpad_device(
                    cfg.protection_type,
                    &cfg.jail_config,
                    path.as_path(),
                    width.unwrap_or(DEFAULT_TOUCH_DEVICE_WIDTH),
                    height.unwrap_or(DEFAULT_TOUCH_DEVICE_HEIGHT),
                    name.as_deref(),
                    trackpad_idx,
                )?;
                trackpad_idx += 1;
                dev
            }
        };
        devs.push(input_dev);
    }

    // The balloon device is only created when both of its tubes were supplied.
    #[cfg(feature = "balloon")]
    if let (Some(balloon_device_tube), Some(dynamic_mapping_device_tube)) =
        (balloon_device_tube, dynamic_mapping_device_tube)
    {
        let balloon_features = (cfg.balloon_page_reporting as u64)
            << BalloonFeatures::PageReporting as u64
            | (cfg.balloon_ws_reporting as u64) << BalloonFeatures::WSReporting as u64;
        devs.push(create_balloon_device(
            cfg.protection_type,
            &cfg.jail_config,
            if cfg.strict_balloon {
                BalloonMode::Strict
            } else {
                BalloonMode::Relaxed
            },
            balloon_device_tube,
            balloon_inflate_tube,
            init_balloon_size,
            dynamic_mapping_device_tube,
            balloon_features,
            #[cfg(feature = "registered_events")]
            Some(
                registered_evt_q
                    .try_clone()
                    .context("failed to clone registered_evt_q tube")?,
            ),
            cfg.balloon_ws_num_bins,
        )?);
    }

    #[cfg(feature = "net")]
    for opt in &cfg.net {
        let dev = opt.create_virtio_device_and_jail(cfg.protection_type, &cfg.jail_config)?;
        devs.push(dev);
    }

    #[cfg(feature = "audio")]
    {
        for virtio_snd in &cfg.virtio_snds {
            devs.push(create_virtio_snd_device(
                cfg.protection_type,
                &cfg.jail_config,
                virtio_snd.clone(),
            )?);
        }
    }

    #[cfg(feature = "video-decoder")]
    {
        for (tube, backend) in video_dec_cfg {
            register_video_device(
                backend,
                &mut devs,
                tube,
                cfg.protection_type,
                &cfg.jail_config,
                VideoDeviceType::Decoder,
            )?;
        }
    }

    #[cfg(feature = "video-encoder")]
    {
        for (tube, backend) in video_enc_cfg {
            register_video_device(
                backend,
                &mut devs,
                tube,
                cfg.protection_type,
                &cfg.jail_config,
                VideoDeviceType::Encoder,
            )?;
        }
    }

    if let Some(vsock_config) = &cfg.vsock {
        devs.push(
            vsock_config.create_virtio_device_and_jail(cfg.protection_type, &cfg.jail_config)?,
        );
    }

    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    {
        if cfg.vhost_scmi {
            devs.push(create_vhost_scmi_device(
                cfg.protection_type,
                &cfg.jail_config,
                cfg.vhost_scmi_device.clone(),
            )?);
        }
    }

    for vhost_user_fs in &cfg.vhost_user_fs {
        devs.push(create_vhost_user_fs_device(
            cfg.protection_type,
            vhost_user_fs,
        )?);
    }

    for shared_dir in &cfg.shared_dirs {
        let SharedDir {
            src,
            tag,
            kind,
            ugid,
            uid_map,
            gid_map,
            fs_cfg,
            p9_cfg,
        } = shared_dir;

        let dev = match kind {
            SharedDirKind::FS => {
                // Only FS shared directories consume a control tube.
                let device_tube = fs_device_tubes.remove(0);
                create_fs_device(
                    cfg.protection_type,
                    &cfg.jail_config,
                    *ugid,
                    uid_map,
                    gid_map,
                    src,
                    tag,
                    fs_cfg.clone(),
                    device_tube,
                )?
            }
            SharedDirKind::P9 => create_9p_device(
                cfg.protection_type,
                &cfg.jail_config,
                *ugid,
                uid_map,
                gid_map,
                src,
                tag,
                p9_cfg.clone(),
            )?,
        };
        devs.push(dev);
    }

    #[cfg(feature = "audio")]
    if let Some(path) = &cfg.sound {
        devs.push(create_sound_device(
            path,
            cfg.protection_type,
            &cfg.jail_config,
        )?);
    }

    for opt in &cfg.vhost_user {
        devs.push(create_vhost_user_frontend(cfg.protection_type, opt)?);
    }

    Ok(devs)
}
fn create_devices(
cfg: &Config,
vm: &mut impl Vm,
resources: &mut SystemAllocator,
vm_evt_wrtube: &SendTube,
iommu_attached_endpoints: &mut BTreeMap<u32, Arc<Mutex<Box<dyn MemoryMapperTrait>>>>,
irq_control_tubes: &mut Vec<Tube>,
vm_memory_control_tubes: &mut Vec<VmMemoryTube>,
control_tubes: &mut Vec<TaggedControlTube>,
#[cfg(feature = "balloon")] balloon_device_tube: Option<Tube>,
#[cfg(feature = "balloon")] init_balloon_size: u64,
#[cfg(feature = "balloon")] dynamic_mapping_device_tube: Option<Tube>,
disk_device_tubes: &mut Vec<Tube>,
pmem_device_tubes: &mut Vec<Tube>,
fs_device_tubes: &mut Vec<Tube>,
#[cfg(feature = "usb")] usb_provider: DeviceProvider,
#[cfg(feature = "gpu")] gpu_control_tube: Tube,
#[cfg(feature = "gpu")] render_server_fd: Option<SafeDescriptor>,
iova_max_addr: &mut Option<u64>,
#[cfg(feature = "registered_events")] registered_evt_q: &SendTube,
#[cfg(feature = "pvclock")] pvclock_device_tube: Option<Tube>,
) -> DeviceResult<Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>> {
let mut devices: Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)> = Vec::new();
#[cfg(feature = "balloon")]
let mut balloon_inflate_tube: Option<Tube> = None;
#[cfg(feature = "gpu")]
let mut has_vfio_gfx_device = false;
if !cfg.vfio.is_empty() {
let mut coiommu_attached_endpoints = Vec::new();
for vfio_dev in &cfg.vfio {
let (dev, jail, viommu_mapper) = create_vfio_device(
&cfg.jail_config,
vm,
resources,
irq_control_tubes,
vm_memory_control_tubes,
control_tubes,
&vfio_dev.path,
false,
None,
vfio_dev.guest_address,
Some(&mut coiommu_attached_endpoints),
vfio_dev.iommu,
vfio_dev.dt_symbol.clone(),
)?;
match dev {
VfioDeviceVariant::Pci(vfio_pci_device) => {
*iova_max_addr = Some(max(
vfio_pci_device.get_max_iova(),
iova_max_addr.unwrap_or(0),
));
#[cfg(feature = "gpu")]
if vfio_pci_device.is_gfx() {
has_vfio_gfx_device = true;
}
if let Some(viommu_mapper) = viommu_mapper {
iommu_attached_endpoints.insert(
vfio_pci_device
.pci_address()
.context("not initialized")?
.to_u32(),
Arc::new(Mutex::new(Box::new(viommu_mapper))),
);
}
devices.push((Box::new(vfio_pci_device), jail));
}
VfioDeviceVariant::Platform(vfio_plat_dev) => {
devices.push((Box::new(vfio_plat_dev), jail));
}
}
}
if !coiommu_attached_endpoints.is_empty() || !iommu_attached_endpoints.is_empty() {
let mut buf = mem::MaybeUninit::<libc::rlimit64>::zeroed();
// SAFETY: trivially safe
let res = unsafe { libc::getrlimit64(libc::RLIMIT_MEMLOCK, buf.as_mut_ptr()) };
if res == 0 {
// SAFETY: safe because getrlimit64 has returned success.
let limit = unsafe { buf.assume_init() };
let rlim_new = limit.rlim_cur.saturating_add(vm.get_memory().memory_size());
let rlim_max = max(limit.rlim_max, rlim_new);
if limit.rlim_cur < rlim_new {
let limit_arg = libc::rlimit64 {
rlim_cur: rlim_new,
rlim_max,
};
// SAFETY: trivially safe
let res = unsafe { libc::setrlimit64(libc::RLIMIT_MEMLOCK, &limit_arg) };
if res != 0 {
bail!("Set rlimit failed");
}
}
} else {
bail!("Get rlimit failed");
}
}
#[cfg(feature = "balloon")]
let coiommu_tube: Option<Tube>;
#[cfg(not(feature = "balloon"))]
let coiommu_tube: Option<Tube> = None;
if !coiommu_attached_endpoints.is_empty() {
let vfio_container =
VfioCommonSetup::vfio_get_container(IommuDevType::CoIommu, None as Option<&Path>)
.context("failed to get vfio container")?;
let (coiommu_host_tube, coiommu_device_tube) =
Tube::pair().context("failed to create coiommu tube")?;
vm_memory_control_tubes.push(VmMemoryTube {
tube: coiommu_host_tube,
expose_with_viommu: false,
});
let vcpu_count = cfg.vcpu_count.unwrap_or(1) as u64;
#[cfg(feature = "balloon")]
match Tube::pair() {
Ok((x, y)) => {
coiommu_tube = Some(x);
balloon_inflate_tube = Some(y);
}
Err(x) => return Err(x).context("failed to create coiommu tube"),
}
let dev = CoIommuDev::new(
vm.get_memory().clone(),
vfio_container,
VmMemoryClient::new(coiommu_device_tube),
coiommu_tube,
coiommu_attached_endpoints,
vcpu_count,
cfg.coiommu_param.unwrap_or_default(),
)
.context("failed to create coiommu device")?;
devices.push((
Box::new(dev),
simple_jail(&cfg.jail_config, "coiommu_device")?,
));
}
}
let stubs = create_virtio_devices(
cfg,
vm,
resources,
vm_evt_wrtube,
#[cfg(feature = "balloon")]
balloon_device_tube,
#[cfg(feature = "balloon")]
balloon_inflate_tube,
#[cfg(feature = "balloon")]
init_balloon_size,
#[cfg(feature = "balloon")]
dynamic_mapping_device_tube,
disk_device_tubes,
pmem_device_tubes,
fs_device_tubes,
#[cfg(feature = "gpu")]
gpu_control_tube,
#[cfg(feature = "gpu")]
render_server_fd,
#[cfg(feature = "gpu")]
has_vfio_gfx_device,
#[cfg(feature = "registered_events")]
registered_evt_q,
#[cfg(feature = "pvclock")]
pvclock_device_tube,
)?;
for stub in stubs {
match stub.dev.transport_type() {
VirtioTransportType::Pci => {
let (msi_host_tube, msi_device_tube) =
Tube::pair().context("failed to create tube")?;
irq_control_tubes.push(msi_host_tube);
let shared_memory_tube = if stub.dev.get_shared_memory_region().is_some() {
let (host_tube, device_tube) =
Tube::pair().context("failed to create shared memory tube")?;
vm_memory_control_tubes.push(VmMemoryTube {
tube: host_tube,
expose_with_viommu: stub.dev.expose_shmem_descriptors_with_viommu(),
});
Some(device_tube)
} else {
None
};
let (ioevent_host_tube, ioevent_device_tube) =
Tube::pair().context("failed to create ioevent tube")?;
vm_memory_control_tubes.push(VmMemoryTube {
tube: ioevent_host_tube,
expose_with_viommu: false,
});
let (host_tube, device_tube) =
Tube::pair().context("failed to create device control tube")?;
control_tubes.push(TaggedControlTube::Vm(host_tube));
let dev = VirtioPciDevice::new(
vm.get_memory().clone(),
stub.dev,
msi_device_tube,
cfg.disable_virtio_intx,
shared_memory_tube.map(VmMemoryClient::new),
VmMemoryClient::new(ioevent_device_tube),
device_tube,
)
.context("failed to create virtio pci dev")?;
devices.push((Box::new(dev) as Box<dyn BusDeviceObj>, stub.jail));
}
VirtioTransportType::Mmio => {
let dev = VirtioMmioDevice::new(vm.get_memory().clone(), stub.dev, false)
.context("failed to create virtio mmio dev")?;
devices.push((Box::new(dev) as Box<dyn BusDeviceObj>, stub.jail));
}
}
}
#[cfg(feature = "usb")]
if cfg.usb {
// Create xhci controller.
let usb_controller = Box::new(XhciController::new(
vm.get_memory().clone(),
Box::new(usb_provider),
));
devices.push((
usb_controller,
simple_jail(&cfg.jail_config, "xhci_device")?,
));
}
for params in &cfg.stub_pci_devices {
// Stub devices don't need jailing since they don't do anything.
devices.push((Box::new(StubPciDevice::new(params)), None));
}
devices.push((
Box::new(PvPanicPciDevice::new(vm_evt_wrtube.try_clone()?)),
None,
));
Ok(devices)
}
fn create_file_backed_mappings(
cfg: &Config,
vm: &mut impl Vm,
resources: &mut SystemAllocator,
) -> Result<()> {
for mapping in &cfg.file_backed_mappings {
let file = OpenOptions::new()
.read(true)
.write(mapping.writable)
.custom_flags(if mapping.sync { libc::O_SYNC } else { 0 })
.open(&mapping.path)
.context("failed to open file for file-backed mapping")?;
let prot = if mapping.writable {
Protection::read_write()
} else {
Protection::read()
};
let size = mapping
.size
.try_into()
.context("Invalid size for file-backed mapping")?;
let memory_mapping = MemoryMappingBuilder::new(size)
.from_file(&file)
.offset(mapping.offset)
.protection(prot)
.build()
.context("failed to map backing file for file-backed mapping")?;
let mapping_range = AddressRange::from_start_and_size(mapping.address, mapping.size)
.context("failed to convert to AddressRange")?;
match resources.mmio_allocator_any().allocate_at(
mapping_range,
Alloc::FileBacked(mapping.address),
"file-backed mapping".to_owned(),
) {
// OutOfSpace just means that this mapping is not in the MMIO regions at all, so don't
// consider it an error.
// TODO(b/222769529): Reserve this region in a global memory address space allocator
// once we have that so nothing else can accidentally overlap with it.
Ok(()) | Err(resources::Error::OutOfSpace) => {}
e => e.context("failed to allocate guest address for file-backed mapping")?,
}
vm.add_memory_region(
GuestAddress(mapping.address),
Box::new(memory_mapping),
!mapping.writable,
/* log_dirty_pages = */ false,
MemCacheType::CacheCoherent,
)
.context("failed to configure file-backed mapping")?;
}
Ok(())
}
#[cfg(target_arch = "x86_64")]
/// Collection of devices related to PCI hotplug.
///
/// Only compiled for x86_64. `create_pure_virtual_pcie_root_port` in this file returns one of
/// these with `hotplug_buses`, `iommu_bus_ranges` and `pme_notify_devs` filled in.
struct HotPlugStub {
/// Map from bus index to hotplug bus.
hotplug_buses: BTreeMap<u8, Arc<Mutex<dyn HotPlugBus>>>,
/// Bus ranges of devices for virtio-iommu.
/// Each range is inclusive and its bounds are `PciAddress::to_u32` values.
iommu_bus_ranges: Vec<RangeInclusive<u32>>,
/// Map from gpe index to GpeNotify devices.
gpe_notify_devs: BTreeMap<u32, Arc<Mutex<dyn GpeNotify>>>,
/// Map from bus index to PmeNotify devices.
pme_notify_devs: BTreeMap<u8, Arc<Mutex<dyn PmeNotify>>>,
}
#[cfg(target_arch = "x86_64")]
impl HotPlugStub {
/// Constructs empty HotPlugStub.
fn new() -> Self {
Self {
hotplug_buses: BTreeMap::new(),
iommu_bus_ranges: Vec::new(),
gpe_notify_devs: BTreeMap::new(),
pme_notify_devs: BTreeMap::new(),
}
}
}
#[cfg(target_arch = "x86_64")]
/// Creates PCIE root port with only virtual devices.
///
/// user doesn't specify host pcie root port which link to this virtual pcie rp,
/// find the empty bus and create a total virtual pcie rp
fn create_pure_virtual_pcie_root_port(
sys_allocator: &mut SystemAllocator,
irq_control_tubes: &mut Vec<Tube>,
devices: &mut Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>,
hp_bus_count: u8,
) -> Result<HotPlugStub> {
let mut hp_sec_buses = Vec::new();
let mut hp_stub = HotPlugStub::new();
// Create Pcie Root Port for non-root buses, each non-root bus device will be
// connected behind a virtual pcie root port.
for i in 1..255 {
if sys_allocator.pci_bus_empty(i) {
if hp_sec_buses.len() < hp_bus_count.into() {
hp_sec_buses.push(i);
}
continue;
}
let pcie_root_port = Arc::new(Mutex::new(PcieRootPort::new(i, false)));
hp_stub
.pme_notify_devs
.insert(i, pcie_root_port.clone() as Arc<Mutex<dyn PmeNotify>>);
let (msi_host_tube, msi_device_tube) = Tube::pair().context("failed to create tube")?;
irq_control_tubes.push(msi_host_tube);
let pci_bridge = Box::new(PciBridge::new(pcie_root_port.clone(), msi_device_tube));
// no ipc is used if the root port disables hotplug
devices.push((pci_bridge, None));
}
// Create Pcie Root Port for hot-plug
if hp_sec_buses.len() < hp_bus_count.into() {
return Err(anyhow!("no more addresses are available"));
}
for hp_sec_bus in hp_sec_buses {
let pcie_root_port = Arc::new(Mutex::new(PcieRootPort::new(hp_sec_bus, true)));
hp_stub.pme_notify_devs.insert(
hp_sec_bus,
pcie_root_port.clone() as Arc<Mutex<dyn PmeNotify>>,
);
let (msi_host_tube, msi_device_tube) = Tube::pair().context("failed to create tube")?;
irq_control_tubes.push(msi_host_tube);
let pci_bridge = Box::new(PciBridge::new(pcie_root_port.clone(), msi_device_tube));
hp_stub.iommu_bus_ranges.push(RangeInclusive::new(
PciAddress {
bus: pci_bridge.get_secondary_num(),
dev: 0,
func: 0,
}
.to_u32(),
PciAddress {
bus: pci_bridge.get_subordinate_num(),
dev: 32,
func: 8,
}
.to_u32(),
));
devices.push((pci_bridge, None));
hp_stub
.hotplug_buses
.insert(hp_sec_bus, pcie_root_port as Arc<Mutex<dyn HotPlugBus>>);
}
Ok(hp_stub)
}
fn setup_vm_components(cfg: &Config) -> Result<VmComponents> {
let initrd_image = if let Some(initrd_path) = &cfg.initrd_path {
Some(
open_file_or_duplicate(initrd_path, OpenOptions::new().read(true))
.with_context(|| format!("failed to open initrd {}", initrd_path.display()))?,
)
} else {
None
};
let pvm_fw_image = if let Some(pvm_fw_path) = &cfg.pvm_fw {
Some(
open_file_or_duplicate(pvm_fw_path, OpenOptions::new().read(true))
.with_context(|| format!("failed to open pvm_fw {}", pvm_fw_path.display()))?,
)
} else {
None
};
let vm_image = match cfg.executable_path {
Some(Executable::Kernel(ref kernel_path)) => VmImage::Kernel(
open_file_or_duplicate(kernel_path, OpenOptions::new().read(true)).with_context(
|| format!("failed to open kernel image {}", kernel_path.display()),
)?,
),
Some(Executable::Bios(ref bios_path)) => VmImage::Bios(
open_file_or_duplicate(bios_path, OpenOptions::new().read(true))
.with_context(|| format!("failed to open bios {}", bios_path.display()))?,
),
_ => panic!("Did not receive a bios or kernel, should be impossible."),
};
let swiotlb = if let Some(size) = cfg.swiotlb {
Some(
size.checked_mul(1024 * 1024)
.ok_or_else(|| anyhow!("requested swiotlb size too large"))?,
)
} else if matches!(cfg.protection_type, ProtectionType::Unprotected) {
None
} else {
Some(64 * 1024 * 1024)
};
let (pflash_image, pflash_block_size) = if let Some(pflash_parameters) = &cfg.pflash_parameters
{
(
Some(
open_file_or_duplicate(
&pflash_parameters.path,
OpenOptions::new().read(true).write(true),
)
.with_context(|| {
format!("failed to open pflash {}", pflash_parameters.path.display())
})?,
),
pflash_parameters.block_size,
)
} else {
(None, 0)
};
#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
let mut cpu_frequencies = BTreeMap::new();
#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
let mut virt_cpufreq_socket = None;
#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
if cfg.virt_cpufreq {
let host_cpu_frequencies = Arch::get_host_cpu_frequencies_khz()?;
for cpu_id in 0..cfg.vcpu_count.unwrap_or(1) {
let vcpu_affinity = match cfg.vcpu_affinity.clone() {
Some(VcpuAffinity::Global(v)) => v,
Some(VcpuAffinity::PerVcpu(mut m)) => m.remove(&cpu_id).unwrap_or_default(),
None => {
panic!("There must be some vcpu_affinity setting with VirtCpufreq enabled!")
}
};
// Check that the physical CPUs that the vCPU is affined to all share the same
// frequency domain.
if let Some(freq_domain) = host_cpu_frequencies.get(&vcpu_affinity[0]) {
for cpu in vcpu_affinity.iter() {
if let Some(frequencies) = host_cpu_frequencies.get(cpu) {
if frequencies != freq_domain {
panic!("Affined CPUs do not share a frequency domain!");
}
}
}
cpu_frequencies.insert(cpu_id, freq_domain.clone());
} else {
panic!("No frequency domain for cpu:{}", cpu_id);
}
}
virt_cpufreq_socket = if let Some(path) = &cfg.virt_cpufreq_socket {
let file = base::open_file_or_duplicate(path, OpenOptions::new().write(true))
.with_context(|| {
format!("failed to open virt_cpufreq_socket {}", path.display())
})?;
let fd: std::os::fd::OwnedFd = file.into();
let socket: std::os::unix::net::UnixStream = fd.into();
Some(socket)
} else {
None
};
}
// if --enable-fw-cfg or --fw-cfg was given, we want to enable fw_cfg
let fw_cfg_enable = cfg.enable_fw_cfg || !cfg.fw_cfg_parameters.is_empty();
let (cpu_clusters, cpu_capacity) = if cfg.host_cpu_topology {
(
Arch::get_host_cpu_clusters()?,
Arch::get_host_cpu_capacity()?,
)
} else {
(cfg.cpu_clusters.clone(), cfg.cpu_capacity.clone())
};
Ok(VmComponents {
#[cfg(target_arch = "x86_64")]
ac_adapter: cfg.ac_adapter,
#[cfg(target_arch = "x86_64")]
break_linux_pci_config_io: cfg.break_linux_pci_config_io,
memory_size: cfg
.memory
.unwrap_or(256)
.checked_mul(1024 * 1024)
.ok_or_else(|| anyhow!("requested memory size too large"))?,
swiotlb,
fw_cfg_enable,
bootorder_fw_cfg_blob: Vec::new(),
vcpu_count: cfg.vcpu_count.unwrap_or(1),
vcpu_affinity: cfg.vcpu_affinity.clone(),
#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
cpu_frequencies,
#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
virt_cpufreq_socket,
fw_cfg_parameters: cfg.fw_cfg_parameters.clone(),
cpu_clusters,
cpu_capacity,
no_smt: cfg.no_smt,
hugepages: cfg.hugepages,
hv_cfg: hypervisor::Config {
#[cfg(target_arch = "aarch64")]
mte: cfg.mte,
protection_type: cfg.protection_type,
},
vm_image,
android_fstab: cfg
.android_fstab
.as_ref()
.map(|x| {
File::open(x)
.with_context(|| format!("failed to open android fstab file {}", x.display()))
})
.map_or(Ok(None), |v| v.map(Some))?,
pstore: cfg.pstore.clone(),
pflash_block_size,
pflash_image,
initrd_image,
extra_kernel_params: cfg.params.clone(),
acpi_sdts: cfg
.acpi_tables
.iter()
.map(|path| {
SDT::from_file(path)
.with_context(|| format!("failed to open ACPI file {}", path.display()))
})
.collect::<Result<Vec<SDT>>>()?,
rt_cpus: cfg.rt_cpus.clone(),
delay_rt: cfg.delay_rt,
#[cfg(feature = "gdb")]
gdb: None,
no_i8042: cfg.no_i8042,
no_rtc: cfg.no_rtc,
#[cfg(target_arch = "x86_64")]
smbios: cfg.smbios.clone(),
host_cpu_topology: cfg.host_cpu_topology,
itmt: cfg.itmt,
#[cfg(target_arch = "x86_64")]
force_s2idle: cfg.force_s2idle,
pvm_fw: pvm_fw_image,
#[cfg(target_arch = "x86_64")]
pcie_ecam: cfg.pcie_ecam,
#[cfg(target_arch = "x86_64")]
pci_low_start: cfg.pci_low_start,
dynamic_power_coefficient: cfg.dynamic_power_coefficient.clone(),
boot_cpu: cfg.boot_cpu,
})
}
/// Final state of a VM run; this is the success value returned by `run_config` and the
/// per-hypervisor `run_*` functions in this file.
// NOTE(review): the exact condition that produces each variant is decided outside this part of
// the file; the variant names suggest guest reset, clean stop, crash, guest kernel panic and
// watchdog-triggered reset respectively — confirm against the code that constructs them.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum ExitState {
Reset,
Stop,
Crash,
GuestPanic,
WatchdogReset,
}
/// Carves every file-backed mapping out of the guest memory layout.
///
/// Any part of a `guest_mem_layout` region that overlaps a range listed in
/// `file_backed_mappings` is dropped; whatever remains of the region on either side of the
/// mapping is kept with the region's original options. Returns the updated layout.
fn punch_holes_in_guest_mem_layout_for_mappings(
    guest_mem_layout: Vec<(GuestAddress, u64, MemoryRegionOptions)>,
    file_backed_mappings: &[FileBackedMappingParameters],
) -> Vec<(GuestAddress, u64, MemoryRegionOptions)> {
    // Track regions as half-open `[start, end)` intervals, where `end` is `start + size` and the
    // byte at `end` does not belong to the region.
    let mut regions: BTreeSet<_> = guest_mem_layout
        .iter()
        .map(|(addr, size, options)| (addr.offset(), addr.offset() + size, *options))
        .collect();

    for mapping in file_backed_mappings {
        let hole_start = mapping.address;
        let hole_end = hole_start + mapping.size;

        // Keep splitting until no region intersects this mapping any more. Each pass removes one
        // intersecting region and re-inserts the pieces (if any) that lie outside the mapping.
        while let Some(region) = regions
            .iter()
            .copied()
            .find(|&(start, end, _)| hole_start < end && hole_end > start)
        {
            let (start, end, options) = region;
            regions.remove(&region);

            // Piece below the mapping.
            if start < hole_start {
                regions.insert((start, hole_start, options));
            }
            // Piece above the mapping.
            if end > hole_end {
                regions.insert((hole_end, end, options));
            }
        }
    }

    // Convert the intervals back into (address, size, options) triples.
    regions
        .into_iter()
        .map(|(start, end, options)| (GuestAddress(start), end - start, options))
        .collect()
}
/// Creates the guest memory object for a VM and applies the configured memory policy.
///
/// The layout comes from `Arch::guest_memory_layout`, with the ranges of
/// `cfg.file_backed_mappings` removed from it before the memory is created.
///
/// # Errors
///
/// Fails if the layout cannot be computed, if the guest memory cannot be created, or if
/// `use_dontfork` fails when `cfg.unmap_guest_memory_on_fork` is set.
fn create_guest_memory(
    cfg: &Config,
    components: &VmComponents,
    hypervisor: &impl Hypervisor,
) -> Result<GuestMemory> {
    let layout = punch_holes_in_guest_mem_layout_for_mappings(
        Arch::guest_memory_layout(components, hypervisor)
            .context("failed to create guest memory layout")?,
        &cfg.file_backed_mappings,
    );
    let guest_mem =
        GuestMemory::new_with_options(&layout).context("failed to create guest memory")?;

    let hugepages_policy = if components.hugepages {
        MemoryPolicy::USE_HUGEPAGES
    } else {
        MemoryPolicy::empty()
    };
    let lock_policy = if cfg.lock_guest_memory {
        MemoryPolicy::LOCK_GUEST_MEMORY
    } else {
        MemoryPolicy::empty()
    };
    guest_mem.set_memory_policy(hugepages_policy | lock_policy);

    if !cfg.unmap_guest_memory_on_fork {
        return Ok(guest_mem);
    }

    // This is not compatible with sandboxing. It could potentially be made compatible by delaying
    // the call until after the sandboxed devices have been forked. However, the main use is
    // together with protected VMs, where most of the guest memory has been unshared with the
    // host. We would need to be confident that the guest memory is unshared with the host only
    // after the `use_dontfork` call, and those details vary by hypervisor. So, for now, things
    // are kept simple to be safe.
    guest_mem.use_dontfork().context("use_dontfork failed")?;
    Ok(guest_mem)
}
/// Runs the VM described by `cfg` and `components` on the GenieZone hypervisor.
///
/// `device_path` selects the GenieZone device node; `GENIEZONE_PATH` is used when it is `None`.
/// Only the kernel irqchip mode is accepted; split and userspace modes are rejected with an
/// error.
#[cfg(all(target_arch = "aarch64", feature = "geniezone"))]
fn run_gz(device_path: Option<&Path>, cfg: Config, components: VmComponents) -> Result<ExitState> {
    use devices::GeniezoneKernelIrqChip;
    use hypervisor::geniezone::Geniezone;
    use hypervisor::geniezone::GeniezoneVcpu;
    use hypervisor::geniezone::GeniezoneVm;

    let device_path = match device_path {
        Some(path) => path,
        None => Path::new(GENIEZONE_PATH),
    };
    let gzvm = Geniezone::new_with_path(device_path)
        .with_context(|| format!("failed to open GenieZone device {}", device_path.display()))?;

    let guest_mem = create_guest_memory(&cfg, &components, &gzvm)?;

    // The swap monitor is launched only when a swap directory is configured.
    #[cfg(feature = "swap")]
    let swap_controller = cfg
        .swap_dir
        .as_ref()
        .map(|swap_dir| {
            SwapController::launch(guest_mem.clone(), swap_dir, &cfg.jail_config)
                .context("launch vmm-swap monitor process")
        })
        .transpose()?;

    let vm =
        GeniezoneVm::new(&gzvm, guest_mem, components.hv_cfg).context("failed to create vm")?;

    // Check that the VM was actually created in protected mode as expected.
    if cfg.protection_type.isolates_memory() {
        if !vm.check_capability(VmCap::Protected) {
            bail!("Failed to create protected VM");
        }
    }

    let vm_clone = vm.try_clone().context("failed to clone vm")?;

    let mut irq_chip = match cfg.irq_chip.unwrap_or(IrqChipKind::Kernel) {
        IrqChipKind::Kernel => GeniezoneKernelIrqChip::new(vm_clone, components.vcpu_count)
            .context("failed to create IRQ chip")?,
        IrqChipKind::Split => bail!("Geniezone does not support split irqchip mode"),
        IrqChipKind::Userspace => bail!("Geniezone does not support userspace irqchip mode"),
    };

    // The kernel irqchip needs no ioapic host tube, so `None` is passed for it.
    run_vm::<GeniezoneVcpu, GeniezoneVm>(
        cfg,
        components,
        vm,
        &mut irq_chip,
        None,
        #[cfg(feature = "swap")]
        swap_controller,
    )
}
/// Runs the VM described by `cfg` and `components` on KVM.
///
/// `device_path` selects the KVM device node; `KVM_PATH` is used when it is `None`. The irqchip
/// defaults to kernel mode; split mode is only available on x86_64 and userspace mode is
/// rejected with an error.
fn run_kvm(device_path: Option<&Path>, cfg: Config, components: VmComponents) -> Result<ExitState> {
    use devices::KvmKernelIrqChip;
    #[cfg(target_arch = "x86_64")]
    use devices::KvmSplitIrqChip;
    use hypervisor::kvm::Kvm;
    use hypervisor::kvm::KvmVcpu;
    use hypervisor::kvm::KvmVm;

    /// The irqchip flavours this function can create, so that either can be handed to
    /// `run_vm` as a `&mut dyn IrqChipArch`.
    enum KvmIrqChip {
        #[cfg(target_arch = "x86_64")]
        Split(KvmSplitIrqChip),
        Kernel(KvmKernelIrqChip),
    }

    impl KvmIrqChip {
        fn as_mut(&mut self) -> &mut dyn IrqChipArch {
            match self {
                #[cfg(target_arch = "x86_64")]
                KvmIrqChip::Split(chip) => chip,
                KvmIrqChip::Kernel(chip) => chip,
            }
        }
    }

    let device_path = match device_path {
        Some(path) => path,
        None => Path::new(KVM_PATH),
    };
    let kvm = Kvm::new_with_path(device_path)
        .with_context(|| format!("failed to open KVM device {}", device_path.display()))?;

    let guest_mem = create_guest_memory(&cfg, &components, &kvm)?;

    // The swap monitor is launched only when a swap directory is configured.
    #[cfg(feature = "swap")]
    let swap_controller = cfg
        .swap_dir
        .as_ref()
        .map(|swap_dir| {
            SwapController::launch(guest_mem.clone(), swap_dir, &cfg.jail_config)
                .context("launch vmm-swap monitor process")
        })
        .transpose()?;

    let vm = KvmVm::new(&kvm, guest_mem, components.hv_cfg).context("failed to create vm")?;

    #[cfg(target_arch = "x86_64")]
    if cfg.itmt {
        vm.set_platform_info_read_access(false)
            .context("failed to disable MSR_PLATFORM_INFO read access")?;
    }

    // Check that the VM was actually created in protected mode as expected.
    if cfg.protection_type.isolates_memory() {
        if !vm.check_capability(VmCap::Protected) {
            bail!("Failed to create protected VM");
        }
    }

    let vm_clone = vm.try_clone().context("failed to clone vm")?;

    // Build the irqchip together with the host end of the ioapic tube; only the split irqchip
    // has such a tube.
    let (mut irq_chip, ioapic_host_tube) = match cfg.irq_chip.unwrap_or(IrqChipKind::Kernel) {
        IrqChipKind::Kernel => {
            let chip = KvmKernelIrqChip::new(vm_clone, components.vcpu_count)
                .context("failed to create IRQ chip")?;
            (KvmIrqChip::Kernel(chip), None)
        }
        IrqChipKind::Split => {
            #[cfg(not(target_arch = "x86_64"))]
            bail!("KVM split irqchip mode only supported on x86 processors");
            #[cfg(target_arch = "x86_64")]
            {
                let (host_tube, ioapic_device_tube) =
                    Tube::pair().context("failed to create tube")?;
                let chip = KvmSplitIrqChip::new(
                    vm_clone,
                    components.vcpu_count,
                    ioapic_device_tube,
                    Some(24),
                )
                .context("failed to create IRQ chip")?;
                (KvmIrqChip::Split(chip), Some(host_tube))
            }
        }
        IrqChipKind::Userspace => {
            bail!("KVM userspace irqchip mode not implemented");
        }
    };

    run_vm::<KvmVcpu, KvmVm>(
        cfg,
        components,
        vm,
        irq_chip.as_mut(),
        ioapic_host_tube,
        #[cfg(feature = "swap")]
        swap_controller,
    )
}
/// Runs the VM described by `cfg` and `components` on the Gunyah hypervisor.
///
/// `device_path` selects the Gunyah device node; `GUNYAH_PATH` is used when it is `None`.
///
/// # Errors
///
/// Returns an error if the device cannot be opened, if guest memory, the VM, the VM clone or
/// the IRQ chip cannot be created, if a protected VM was requested but the created VM does not
/// report the `Protected` capability, or if `run_vm` itself fails.
#[cfg(all(any(target_arch = "arm", target_arch = "aarch64"), feature = "gunyah"))]
fn run_gunyah(
    device_path: Option<&Path>,
    cfg: Config,
    components: VmComponents,
) -> Result<ExitState> {
    use devices::GunyahIrqChip;
    use hypervisor::gunyah::Gunyah;
    use hypervisor::gunyah::GunyahVcpu;
    use hypervisor::gunyah::GunyahVm;

    let device_path = device_path.unwrap_or(Path::new(GUNYAH_PATH));
    let gunyah = Gunyah::new_with_path(device_path)
        .with_context(|| format!("failed to open Gunyah device {}", device_path.display()))?;

    let guest_mem = create_guest_memory(&cfg, &components, &gunyah)?;

    // The swap monitor is launched only when a swap directory is configured.
    #[cfg(feature = "swap")]
    let swap_controller = if let Some(swap_dir) = cfg.swap_dir.as_ref() {
        Some(
            SwapController::launch(guest_mem.clone(), swap_dir, &cfg.jail_config)
                .context("launch vmm-swap monitor process")?,
        )
    } else {
        None
    };

    let vm = GunyahVm::new(&gunyah, guest_mem, components.hv_cfg).context("failed to create vm")?;

    // Check that the VM was actually created in protected mode as expected.
    if cfg.protection_type.isolates_memory() && !vm.check_capability(VmCap::Protected) {
        bail!("Failed to create protected VM");
    }

    // Attach the same error context as the KVM and GenieZone paths so that a failure here is
    // identifiable in the error chain instead of surfacing as a bare low-level error.
    let vm_clone = vm.try_clone().context("failed to clone vm")?;
    let mut irq_chip = GunyahIrqChip::new(vm_clone).context("failed to create IRQ chip")?;

    run_vm::<GunyahVcpu, GunyahVm>(
        cfg,
        components,
        vm,
        &mut irq_chip,
        None,
        #[cfg(feature = "swap")]
        swap_controller,
    )
}
/// Picks the hypervisor to use when no `--hypervisor` option was specified.
///
/// Each candidate is probed by checking whether its device node exists. KVM is checked first,
/// followed by GenieZone and then Gunyah when those are compiled in. Returns `None` if none of
/// the device nodes is present.
fn get_default_hypervisor() -> Option<HypervisorKind> {
    let kvm_device = Path::new(KVM_PATH);
    if kvm_device.exists() {
        let device = Some(kvm_device.to_path_buf());
        return Some(HypervisorKind::Kvm { device });
    }

    #[cfg(all(
        any(target_arch = "arm", target_arch = "aarch64"),
        feature = "geniezone"
    ))]
    {
        let gz_device = Path::new(GENIEZONE_PATH);
        if gz_device.exists() {
            let device = Some(gz_device.to_path_buf());
            return Some(HypervisorKind::Geniezone { device });
        }
    }

    #[cfg(all(
        unix,
        any(target_arch = "arm", target_arch = "aarch64"),
        feature = "gunyah"
    ))]
    {
        let gunyah_device = Path::new(GUNYAH_PATH);
        if gunyah_device.exists() {
            let device = Some(gunyah_device.to_path_buf());
            return Some(HypervisorKind::Gunyah { device });
        }
    }

    None
}
pub fn run_config(cfg: Config) -> Result<ExitState> {
if let Some(async_executor) = cfg.async_executor {
Executor::set_default_executor_kind(async_executor)
.context("Failed to set the default async executor")?;
}
let components = setup_vm_components(&cfg)?;
let hypervisor = cfg
.hypervisor
.clone()
.or_else(get_default_hypervisor)
.context("no enabled hypervisor")?;
debug!("creating hypervisor: {:?}", hypervisor);
match hypervisor {
HypervisorKind::Kvm { device } => run_kvm(device.as_deref(), cfg, components),
#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
#[cfg(feature = "geniezone")]
HypervisorKind::Geniezone { device } => run_gz(device.as_deref(), cfg, components),
#[cfg(all(
unix,
any(target_arch = "arm", target_arch = "aarch64"),
feature = "gunyah"
))]
HypervisorKind::Gunyah { device } => run_gunyah(device.as_deref(), cfg, components),
}
}
/// Builds the VM on an already-created hypervisor VM object and runs it.
///
/// Creates the control tubes and devices described by `cfg`, hands everything to
/// `Arch::build_vm`, and finally calls `run_control`, whose result is returned.
///
/// * `cfg` - the VM configuration.
/// * `components` - the VM components; `gdb`, `bootorder_fw_cfg_blob`, `fw_cfg_enable` and (on
///   x86_64) `acpi_sdts` are updated here before the components are passed to `Arch::build_vm`.
/// * `vm` - the hypervisor VM object.
/// * `irq_chip` - the interrupt controller passed to `Arch::build_vm`.
/// * `ioapic_host_tube` - if present, it is added to the IRQ control tubes.
/// * `swap_controller` - (feature "swap") optional swap controller, passed on to
///   `Arch::build_vm` and `run_control`.
fn run_vm<Vcpu, V>(
cfg: Config,
#[allow(unused_mut)] mut components: VmComponents,
mut vm: V,
irq_chip: &mut dyn IrqChipArch,
ioapic_host_tube: Option<Tube>,
#[cfg(feature = "swap")] mut swap_controller: Option<SwapController>,
) -> Result<ExitState>
where
Vcpu: VcpuArch + 'static,
V: VmArch + 'static,
{
if cfg.jail_config.is_some() {
// Printing something to the syslog before entering minijail so that libc's syslogger has a
// chance to open files necessary for its operation, like `/etc/localtime`. After jailing,
// access to those files will not be possible.
info!("crosvm entering multiprocess mode");
}
// The sending end is given to the metrics module; the receiving end is passed to
// `run_control` at the end of this function.
let (metrics_send, metrics_recv) = Tube::directional_pair().context("metrics tube")?;
metrics::initialize(metrics_send);
#[cfg(all(feature = "pci-hotplug", feature = "swap"))]
let swap_device_helper = match &swap_controller {
Some(swap_controller) => Some(swap_controller.create_device_helper()?),
None => None,
};
// pci-hotplug is only implemented for x86_64 for now, attempting to use it on other platform
// would crash.
#[cfg(all(feature = "pci-hotplug", not(target_arch = "x86_64")))]
if cfg.pci_hotplug_slots.is_some() {
bail!("pci-hotplug is not implemented for non x86_64 architecture");
}
// hotplug_manager must be created before vm is started since it forks jail warden process.
#[cfg(feature = "pci-hotplug")]
// TODO(293801301): Remove unused_mut after aarch64 support
#[allow(unused_mut)]
let mut hotplug_manager = if cfg.pci_hotplug_slots.is_some() {
Some(PciHotPlugManager::new(
vm.get_memory().clone(),
&cfg,
#[cfg(feature = "swap")]
swap_device_helper,
)?)
} else {
None
};
#[cfg(feature = "gpu")]
let (gpu_control_host_tube, gpu_control_device_tube) =
Tube::pair().context("failed to create gpu tube")?;
#[cfg(feature = "usb")]
let (usb_control_tube, usb_provider) =
DeviceProvider::new().context("failed to create usb provider")?;
// Masking signals is inherently dangerous, since this can persist across clones/execs. Do this
// before any jailed devices have been spawned, so that we can catch any of them that fail very
// quickly.
let sigchld_fd = SignalFd::new(libc::SIGCHLD).context("failed to create signalfd")?;
// The control server socket is only created when a socket path is configured.
let control_server_socket = match &cfg.socket_path {
Some(path) => Some(UnlinkUnixSeqpacketListener(
UnixSeqpacketListener::bind(path).context("failed to create control server")?,
)),
None => None,
};
// Host-side ends of the tubes created below are collected in these vectors, which are handed
// to `run_control` at the end of this function.
let mut control_tubes = Vec::new();
let mut irq_control_tubes = Vec::new();
let mut vm_memory_control_tubes = Vec::new();
#[cfg(feature = "gdb")]
if let Some(port) = cfg.gdb {
// GDB needs a control socket to interrupt vcpus.
let (gdb_host_tube, gdb_control_tube) = Tube::pair().context("failed to create tube")?;
control_tubes.push(TaggedControlTube::Vm(gdb_host_tube));
components.gdb = Some((port, gdb_control_tube));
}
// When an external balloon control socket is configured, the device end connects to it and
// there is no host end in this process.
#[cfg(feature = "balloon")]
let (balloon_host_tube, balloon_device_tube) = if cfg.balloon {
if let Some(ref path) = cfg.balloon_control {
(
None,
Some(Tube::new_from_unix_seqpacket(
UnixSeqpacket::connect(path).with_context(|| {
format!(
"failed to connect to balloon control socket {}",
path.display(),
)
})?,
)?),
)
} else {
// Balloon gets a special socket so balloon requests can be forwarded
// from the main process.
let (host, device) = Tube::pair().context("failed to create tube")?;
(Some(host), Some(device))
}
} else {
(None, None)
};
// The balloon device also needs a tube to communicate back to the main process to
// handle remapping memory dynamically.
#[cfg(feature = "balloon")]
let dynamic_mapping_device_tube = if cfg.balloon {
let (dynamic_mapping_host_tube, dynamic_mapping_device_tube) =
Tube::pair().context("failed to create tube")?;
vm_memory_control_tubes.push(VmMemoryTube {
tube: dynamic_mapping_host_tube,
expose_with_viommu: false,
});
Some(dynamic_mapping_device_tube)
} else {
None
};
// Create one control socket per disk.
let mut disk_device_tubes = Vec::new();
let mut disk_host_tubes = Vec::new();
let disk_count = cfg.disks.len();
for _ in 0..disk_count {
let (disk_host_tub, disk_device_tube) = Tube::pair().context("failed to create tube")?;
disk_host_tubes.push(disk_host_tub);
disk_device_tubes.push(disk_device_tube);
}
// Create one control tube per pmem device; the host end is registered as a `VmMsync` control
// tube.
let mut pmem_device_tubes = Vec::new();
let pmem_count = cfg.pmem_devices.len();
for _ in 0..pmem_count {
let (pmem_host_tube, pmem_device_tube) = Tube::pair().context("failed to create tube")?;
pmem_device_tubes.push(pmem_device_tube);
control_tubes.push(TaggedControlTube::VmMsync(pmem_host_tube));
}
if let Some(ioapic_host_tube) = ioapic_host_tube {
irq_control_tubes.push(ioapic_host_tube);
}
// `battery` is a (battery type, jail) pair: the type is taken from the battery config, if any,
// and a jail is only created when both a battery and a jail config are present.
let battery = if cfg.battery_config.is_some() {
#[cfg_attr(
not(feature = "power-monitor-powerd"),
allow(clippy::manual_map, clippy::needless_match, unused_mut)
)]
let jail = if let Some(jail_config) = &cfg.jail_config {
let mut config = SandboxConfig::new(jail_config, "battery");
#[cfg(feature = "power-monitor-powerd")]
{
config.bind_mounts = true;
}
let mut jail =
create_sandbox_minijail(&jail_config.pivot_root, MAX_OPEN_FILES_DEFAULT, &config)?;
// Setup a bind mount to the system D-Bus socket if the powerd monitor is used.
#[cfg(feature = "power-monitor-powerd")]
{
let system_bus_socket_path = Path::new("/run/dbus/system_bus_socket");
jail.mount_bind(system_bus_socket_path, system_bus_socket_path, true)?;
}
Some(jail)
} else {
None
};
(cfg.battery_config.as_ref().map(|c| c.type_), jail)
} else {
(cfg.battery_config.as_ref().map(|c| c.type_), None)
};
// Create one control tube per shared directory of kind `FS`.
let fs_count = cfg
.shared_dirs
.iter()
.filter(|sd| sd.kind == SharedDirKind::FS)
.count();
let mut fs_device_tubes = Vec::with_capacity(fs_count);
for _ in 0..fs_count {
let (fs_host_tube, fs_device_tube) = Tube::pair().context("failed to create tube")?;
control_tubes.push(TaggedControlTube::Fs(fs_host_tube));
fs_device_tubes.push(fs_device_tube);
}
let (vm_evt_wrtube, vm_evt_rdtube) =
Tube::directional_pair().context("failed to create vm event tube")?;
let pstore_size = components.pstore.as_ref().map(|pstore| pstore.size as u64);
let mut sys_allocator = SystemAllocator::new(
Arch::get_system_allocator_config(&vm),
pstore_size,
&cfg.mmio_address_ranges,
)
.context("failed to create system allocator")?;
// The pstore memory region is placed in the allocator's reserved region.
// NOTE(review): the `unwrap()` presumably relies on the allocator reserving a region whenever
// `pstore_size` is `Some` — confirm against `SystemAllocator::new`.
let ramoops_region = match &components.pstore {
Some(pstore) => Some(
arch::pstore::create_memory_region(
&mut vm,
sys_allocator.reserved_region().unwrap(),
pstore,
)
.context("failed to allocate pstore region")?,
),
None => None,
};
create_file_backed_mappings(&cfg, &mut vm, &mut sys_allocator)?;
#[cfg(feature = "gpu")]
// Hold on to the render server jail so it keeps running until we exit run_vm()
let (_render_server_jail, render_server_fd) =
if let Some(parameters) = &cfg.gpu_render_server_parameters {
let (jail, fd) = start_gpu_render_server(&cfg, parameters)?;
(Some(ScopedMinijail(jail)), Some(fd))
} else {
(None, None)
};
// The initial balloon size is the total memory size minus `init_memory` (given in MiB). It is
// zero when `init_memory` is not set, and an error if `init_memory` exceeds the memory size.
#[cfg(feature = "balloon")]
let init_balloon_size = components
.memory_size
.checked_sub(cfg.init_memory.map_or(components.memory_size, |m| {
m.checked_mul(1024 * 1024).unwrap_or(u64::MAX)
}))
.context("failed to calculate init balloon size")?;
let mut iommu_attached_endpoints: BTreeMap<u32, Arc<Mutex<Box<dyn MemoryMapperTrait>>>> =
BTreeMap::new();
let mut iova_max_addr: Option<u64> = None;
// pvclock gets a tube for handling suspend/resume requests from the main thread.
#[cfg(feature = "pvclock")]
let (pvclock_host_tube, pvclock_device_tube) = if cfg.pvclock {
let (host, device) = Tube::pair().context("failed to create tube")?;
(Some(host), Some(device))
} else {
(None, None)
};
#[cfg(not(feature = "pvclock"))]
if cfg.pvclock {
bail!("pvclock device is only supported when crosvm is built with a feature 'pvclock'");
}
#[cfg(feature = "registered_events")]
let (reg_evt_wrtube, reg_evt_rdtube) =
Tube::directional_pair().context("failed to create registered event tube")?;
let mut devices = create_devices(
&cfg,
&mut vm,
&mut sys_allocator,
&vm_evt_wrtube,
&mut iommu_attached_endpoints,
&mut irq_control_tubes,
&mut vm_memory_control_tubes,
&mut control_tubes,
#[cfg(feature = "balloon")]
balloon_device_tube,
#[cfg(feature = "balloon")]
init_balloon_size,
#[cfg(feature = "balloon")]
dynamic_mapping_device_tube,
&mut disk_device_tubes,
&mut pmem_device_tubes,
&mut fs_device_tubes,
#[cfg(feature = "usb")]
usb_provider,
#[cfg(feature = "gpu")]
gpu_control_device_tube,
#[cfg(feature = "gpu")]
render_server_fd,
&mut iova_max_addr,
#[cfg(feature = "registered_events")]
&reg_evt_wrtube,
#[cfg(feature = "pvclock")]
pvclock_device_tube,
)?;
#[cfg(feature = "pci-hotplug")]
// TODO(293801301): Remove unused_variables after aarch64 support
#[allow(unused_variables)]
let pci_hotplug_slots = cfg.pci_hotplug_slots;
#[cfg(not(feature = "pci-hotplug"))]
#[allow(unused_variables)]
let pci_hotplug_slots: Option<u8> = None;
// On x86_64 the pure virtual PCIe root ports are always created, with one slot when no slot
// count is configured.
#[cfg(target_arch = "x86_64")]
let hp_stub = create_pure_virtual_pcie_root_port(
&mut sys_allocator,
&mut irq_control_tubes,
&mut devices,
pci_hotplug_slots.unwrap_or(1),
)?;
arch::assign_pci_addresses(&mut devices, &mut sys_allocator)?;
// Collect every virtio PCI device together with its PCI address so that the firmware boot
// order list can be built from them.
let pci_devices: Vec<&dyn PciDevice> = devices
.iter()
.filter_map(|d| (d.0).as_pci_device())
.collect();
let virtio_devices: Vec<(&dyn VirtioDevice, devices::PciAddress)> = pci_devices
.into_iter()
.flat_map(|s| {
if let Some(virtio_pci_device) = s.as_virtio_pci_device() {
std::iter::zip(
Some(virtio_pci_device.virtio_device()),
virtio_pci_device.pci_address(),
)
.next()
} else {
None
}
})
.collect();
let mut open_firmware_device_paths: Vec<(Vec<u8>, usize)> = virtio_devices
.iter()
.flat_map(|s| (s.0).bootorder_fw_cfg(s.1.dev))
.collect();
// order the OpenFirmware device paths, in ascending order, by their boot_index
open_firmware_device_paths.sort_by(|a, b| (a.1).cmp(&(b.1)));
// "/pci@i0cf8/" is x86 specific and represents the root at the system bus port
let mut bootorder_fw_cfg_blob =
open_firmware_device_paths
.into_iter()
.fold(Vec::new(), |a, b| {
a.into_iter()
.chain("/pci@i0cf8/".as_bytes().iter().copied())
.chain(b.0)
.chain("\n".as_bytes().iter().copied())
.collect()
});
// the "bootorder" file is expected to end with a null terminator
bootorder_fw_cfg_blob.push(0);
components.bootorder_fw_cfg_blob = bootorder_fw_cfg_blob;
// if the bootindex argument was given, we want to make sure that fw_cfg is enabled so the
// "bootorder" file can be accessed by the guest.
components.fw_cfg_enable |= components.bootorder_fw_cfg_blob.len() > 1;
let (translate_response_senders, request_rx) = setup_virtio_access_platform(
&mut sys_allocator,
&mut iommu_attached_endpoints,
&mut devices,
)?;
#[cfg(target_arch = "x86_64")]
let iommu_bus_ranges = hp_stub.iommu_bus_ranges;
#[cfg(not(target_arch = "x86_64"))]
let iommu_bus_ranges = Vec::new();
// An IOMMU device is created when at least one endpoint is attached to it, or when hotplugged
// VFIO devices are to be isolated and there are hotplug bus ranges to cover.
let iommu_host_tube = if !iommu_attached_endpoints.is_empty()
|| (cfg.vfio_isolate_hotplug && !iommu_bus_ranges.is_empty())
{
let (iommu_host_tube, iommu_device_tube) = Tube::pair().context("failed to create tube")?;
let iommu_dev = create_iommu_device(
cfg.protection_type,
&cfg.jail_config,
iova_max_addr.unwrap_or(u64::MAX),
iommu_attached_endpoints,
iommu_bus_ranges,
translate_response_senders,
request_rx,
iommu_device_tube,
)?;
let (msi_host_tube, msi_device_tube) = Tube::pair().context("failed to create tube")?;
irq_control_tubes.push(msi_host_tube);
let (ioevent_host_tube, ioevent_device_tube) =
Tube::pair().context("failed to create ioevent tube")?;
vm_memory_control_tubes.push(VmMemoryTube {
tube: ioevent_host_tube,
expose_with_viommu: false,
});
let (host_tube, device_tube) =
Tube::pair().context("failed to create device control tube")?;
control_tubes.push(TaggedControlTube::Vm(host_tube));
let mut dev = VirtioPciDevice::new(
vm.get_memory().clone(),
iommu_dev.dev,
msi_device_tube,
cfg.disable_virtio_intx,
None,
VmMemoryClient::new(ioevent_device_tube),
device_tube,
)
.context("failed to create virtio pci dev")?;
// early reservation for viommu.
dev.allocate_address(&mut sys_allocator)
.context("failed to allocate resources early for virtio pci dev")?;
let dev = Box::new(dev);
devices.push((dev, iommu_dev.jail));
Some(iommu_host_tube)
} else {
None
};
// Let every PCI device amend the ACPI tables: the current tables are passed to the device and
// replaced by what it returns; a `None` result aborts with an error.
#[cfg(target_arch = "x86_64")]
for device in devices
.iter_mut()
.filter_map(|(dev, _)| dev.as_pci_device_mut())
{
let sdts = device
.generate_acpi(components.acpi_sdts)
.or_else(|| {
error!("ACPI table generation error");
None
})
.ok_or_else(|| anyhow!("failed to generate ACPI table"))?;
components.acpi_sdts = sdts;
}
// KVM_CREATE_VCPU uses apic id for x86 and uses cpu id for others.
let mut vcpu_ids = Vec::new();
// Shared flag/condvar pair, created only when `force_s2idle` is set; it is given to both
// `Arch::build_vm` and `run_control`.
let guest_suspended_cvar = if cfg.force_s2idle {
Some(Arc::new((Mutex::new(false), Condvar::new())))
} else {
None
};
// Open every configured device tree overlay file up front.
let dt_overlays = cfg
.device_tree_overlay
.iter()
.map(|o| {
Ok(DtbOverlay {
file: open_file_or_duplicate(o.path.as_path(), OpenOptions::new().read(true))
.with_context(|| {
format!("failed to open device tree overlay {}", o.path.display())
})?,
do_filter: o.filter_devs,
})
})
.collect::<Result<Vec<DtbOverlay>>>()?;
let mut linux = Arch::build_vm::<V, Vcpu>(
components,
&vm_evt_wrtube,
&mut sys_allocator,
&cfg.serial_parameters,
simple_jail(&cfg.jail_config, "serial_device")?,
battery,
vm,
ramoops_region,
devices,
irq_chip,
&mut vcpu_ids,
cfg.dump_device_tree_blob.clone(),
// NOTE(review): a second jail built from the "serial_device" policy; which device it is for
// is determined by `Arch::build_vm`'s parameter list, which is not visible here.
simple_jail(&cfg.jail_config, "serial_device")?,
#[cfg(target_arch = "x86_64")]
simple_jail(&cfg.jail_config, "block_device")?,
#[cfg(target_arch = "x86_64")]
simple_jail(&cfg.jail_config, "fw_cfg_device")?,
#[cfg(feature = "swap")]
&mut swap_controller,
guest_suspended_cvar.clone(),
dt_overlays,
)
.context("the architecture failed to build the vm")?;
if let Some(tube) = linux.vm_request_tube.take() {
control_tubes.push(TaggedControlTube::Vm(tube));
}
// Channel used to send `PciRootCommand`s to the PCI root worker thread spawned below.
#[cfg(target_arch = "x86_64")]
let (hp_control_tube, hp_worker_tube) = mpsc::channel();
#[cfg(all(feature = "pci-hotplug", target_arch = "x86_64"))]
if let Some(hotplug_manager) = &mut hotplug_manager {
hotplug_manager.set_rootbus_controller(hp_control_tube.clone())?;
}
#[cfg(target_arch = "x86_64")]
let hp_thread = {
// Hotplug buses go to the hotplug manager when there is one, and are otherwise registered
// directly on the VM.
for (bus_num, hp_bus) in hp_stub.hotplug_buses.into_iter() {
#[cfg(feature = "pci-hotplug")]
if let Some(hotplug_manager) = &mut hotplug_manager {
hotplug_manager.add_port(hp_bus)?;
} else {
linux.hotplug_bus.insert(bus_num, hp_bus);
}
#[cfg(not(feature = "pci-hotplug"))]
linux.hotplug_bus.insert(bus_num, hp_bus);
}
if let Some(pm) = &linux.pm {
for (gpe, notify_dev) in hp_stub.gpe_notify_devs.into_iter() {
pm.lock().register_gpe_notify_dev(gpe, notify_dev);
}
for (bus, notify_dev) in hp_stub.pme_notify_devs.into_iter() {
pm.lock().register_pme_notify_dev(bus, notify_dev);
}
}
let (hp_vm_mem_host_tube, hp_vm_mem_worker_tube) =
Tube::pair().context("failed to create tube")?;
vm_memory_control_tubes.push(VmMemoryTube {
tube: hp_vm_mem_host_tube,
expose_with_viommu: false,
});
let supports_readonly_mapping = linux.vm.supports_readonly_mapping();
let pci_root = linux.root_config.clone();
std::thread::Builder::new()
.name("pci_root".to_string())
.spawn(move || {
start_pci_root_worker(
supports_readonly_mapping,
pci_root,
hp_worker_tube,
hp_vm_mem_worker_tube,
)
})?
};
let gralloc = RutabagaGralloc::new().context("failed to create gralloc")?;
// Hand everything over to the control loop; its result is this function's result.
run_control(
linux,
sys_allocator,
cfg,
control_server_socket,
irq_control_tubes,
vm_memory_control_tubes,
control_tubes,
#[cfg(feature = "balloon")]
balloon_host_tube,
&disk_host_tubes,
#[cfg(feature = "gpu")]
gpu_control_host_tube,
#[cfg(feature = "usb")]
usb_control_tube,
vm_evt_rdtube,
vm_evt_wrtube,
sigchld_fd,
gralloc,
vcpu_ids,
iommu_host_tube,
#[cfg(target_arch = "x86_64")]
hp_control_tube,
#[cfg(target_arch = "x86_64")]
hp_thread,
#[cfg(feature = "pci-hotplug")]
hotplug_manager,
#[cfg(feature = "swap")]
swap_controller,
#[cfg(feature = "registered_events")]
reg_evt_rdtube,
guest_suspended_cvar,
#[cfg(feature = "pvclock")]
pvclock_host_tube,
metrics_recv,
)
}
// Body of the worker thread that performs all work requiring the PCI root lock.
//
// Handling hotplug commands directly on the VM control thread can deadlock. The VM control
// thread (thread A) handles a hotplug command and tries to take the PCI root lock, while that
// lock is already held by a device on another thread (thread B) which has sent a VM control
// request to thread A and is waiting for the response. Thread A is blocked on the lock and so
// never answers, and neither thread makes progress. To avoid this, every operation that locks
// the PCI root is pushed to this worker thread instead.
//
// The loop runs until a `PciRootCommand::Kill` is received or the command channel is closed.
#[cfg(target_arch = "x86_64")]
fn start_pci_root_worker(
    supports_readonly_mapping: bool,
    pci_root: Arc<Mutex<PciRoot>>,
    hp_device_tube: mpsc::Receiver<PciRootCommand>,
    vm_control_tube: Tube,
) {
    // `PciMmioMapper` implementation that registers mappings by sending `VmMemoryRequest`s over
    // a `Tube` and remembers the region id returned for each one.
    struct PciMmioMapperTube {
        supports_readonly_mapping: bool,
        vm_control_tube: Tube,
        registered_regions: BTreeMap<u32, VmMemoryRegionId>,
        next_id: u32,
    }

    impl PciMmioMapper for PciMmioMapperTube {
        fn supports_readonly_mapping(&self) -> bool {
            self.supports_readonly_mapping
        }

        fn add_mapping(&mut self, addr: GuestAddress, shmem: &SharedMemory) -> anyhow::Result<u32> {
            let shmem_copy = shmem
                .try_clone()
                .context("failed to create new SharedMemory")?;
            let request = VmMemoryRequest::RegisterMemory {
                source: VmMemorySource::SharedMemory(shmem_copy),
                dest: VmMemoryDestination::GuestPhysicalAddress(addr.0),
                prot: Protection::read(),
                cache: MemCacheType::CacheCoherent,
            };
            self.vm_control_tube
                .send(&request)
                .context("failed to send request")?;

            match self.vm_control_tube.recv::<VmMemoryResponse>() {
                Ok(VmMemoryResponse::RegisterMemory(region)) => {
                    // Hand out ids sequentially and remember which region each one refers to.
                    let id = self.next_id;
                    self.next_id += 1;
                    self.registered_regions.insert(id, region);
                    Ok(id)
                }
                unexpected => bail!("Bad response: {:?}", unexpected),
            }
        }
    }

    let mut mapper = PciMmioMapperTube {
        supports_readonly_mapping,
        vm_control_tube,
        registered_regions: BTreeMap::new(),
        next_id: 0,
    };

    loop {
        let command = match hp_device_tube.recv() {
            Ok(command) => command,
            Err(e) => {
                error!("Error: pci root worker channel closed: {}", e);
                break;
            }
        };

        match command {
            PciRootCommand::Kill => break,
            PciRootCommand::Remove(addr) => {
                pci_root.lock().remove_device(addr);
            }
            PciRootCommand::AddBridge(pci_bus) => {
                if let Err(e) = pci_root.lock().add_bridge(pci_bus) {
                    error!("failed to add hotplugged bridge to PCI root port: {}", e);
                }
            }
            PciRootCommand::Add(addr, device) => {
                if let Err(e) = pci_root.lock().add_device(addr, device, &mut mapper) {
                    error!("failed to add hotplugged device to PCI root port: {}", e);
                }
            }
        }
    }
}
/// Returns the first hotplug bus of `linux` that reports a match for `host_addr`.
///
/// # Errors
///
/// Returns an error if none of the VM's hotplug buses matches the address.
#[cfg(target_arch = "x86_64")]
fn get_hp_bus<V: VmArch, Vcpu: VcpuArch>(
    linux: &RunnableLinuxVm<V, Vcpu>,
    host_addr: PciAddress,
) -> Result<Arc<Mutex<dyn HotPlugBus>>> {
    linux
        .hotplug_bus
        .iter()
        .find(|(_, bus)| bus.lock().is_match(host_addr).is_some())
        .map(|(_, bus)| bus.clone())
        .ok_or_else(|| anyhow!("Failed to find a suitable hotplug bus"))
}
/// Hot-plugs the host PCI device described by `device` into the running VM.
///
/// The host PCI address is parsed from `device.path` and used to pick a matching hotplug bus.
/// Depending on `device.device_type`, either an emulated PCI bridge backed by the host PCIe
/// upstream/downstream port or a VFIO PCI endpoint is created and registered through
/// `Arch::register_pci_device`. Host ends of the tubes created for a bridge are appended to
/// `control_tubes` and `irq_control_tubes`; for an endpoint the three tube vectors are handed to
/// `create_vfio_device`. When `iommu_host_tube` is provided, a new endpoint is additionally
/// announced over it with a `VfioDeviceAdd` request. Finally the device is recorded on the
/// hotplug bus and, if `device.hp_interrupt` is set, `hot_plug` is invoked for it.
///
/// Returns an error as soon as any step fails. Nothing done before the failure (tubes pushed to
/// the output vectors, a bridge inserted into `linux.hotplug_bus`) is rolled back here.
#[cfg(target_arch = "x86_64")]
fn add_hotplug_device<V: VmArch, Vcpu: VcpuArch>(
linux: &mut RunnableLinuxVm<V, Vcpu>,
sys_allocator: &mut SystemAllocator,
cfg: &Config,
irq_control_tubes: &mut Vec<Tube>,
vm_memory_control_tubes: &mut Vec<VmMemoryTube>,
control_tubes: &mut Vec<TaggedControlTube>,
hp_control_tube: &mpsc::Sender<PciRootCommand>,
iommu_host_tube: Option<&Tube>,
device: &HotPlugDeviceInfo,
#[cfg(feature = "swap")] swap_controller: &mut Option<SwapController>,
) -> Result<()> {
let host_addr = PciAddress::from_path(&device.path)
.context("failed to parse hotplug device's PCI address")?;
// The bus is looked up before anything is created, so an unmatched address fails early.
let hp_bus = get_hp_bus(linux, host_addr)?;
let (hotplug_key, pci_address) = match device.device_type {
HotPlugDeviceType::UpstreamPort | HotPlugDeviceType::DownstreamPort => {
// Both port kinds share the tube setup; only the port type wrapped by the bridge differs.
let (vm_host_tube, vm_device_tube) = Tube::pair().context("failed to create tube")?;
control_tubes.push(TaggedControlTube::Vm(vm_host_tube));
let (msi_host_tube, msi_device_tube) = Tube::pair().context("failed to create tube")?;
irq_control_tubes.push(msi_host_tube);
let pcie_host = PcieHostPort::new(device.path.as_path(), vm_device_tube)?;
let (hotplug_key, pci_bridge) = match device.device_type {
HotPlugDeviceType::UpstreamPort => {
let hotplug_key = HotPlugKey::HostUpstreamPort { host_addr };
let pcie_upstream_port = Arc::new(Mutex::new(PcieUpstreamPort::new_from_host(
pcie_host, true,
)?));
let pci_bridge =
Box::new(PciBridge::new(pcie_upstream_port.clone(), msi_device_tube));
// The new port becomes a hotplug bus itself, keyed by the bridge's secondary bus number.
linux
.hotplug_bus
.insert(pci_bridge.get_secondary_num(), pcie_upstream_port);
(hotplug_key, pci_bridge)
}
HotPlugDeviceType::DownstreamPort => {
let hotplug_key = HotPlugKey::HostDownstreamPort { host_addr };
let pcie_downstream_port = Arc::new(Mutex::new(
PcieDownstreamPort::new_from_host(pcie_host, true)?,
));
let pci_bridge = Box::new(PciBridge::new(
pcie_downstream_port.clone(),
msi_device_tube,
));
// The new port becomes a hotplug bus itself, keyed by the bridge's secondary bus number.
linux
.hotplug_bus
.insert(pci_bridge.get_secondary_num(), pcie_downstream_port);
(hotplug_key, pci_bridge)
}
// The enclosing arm only admits the two port variants, so no other type can get here.
_ => {
bail!("Impossible to reach here")
}
};
// Bridges are registered without a jail (`None`).
let pci_address = Arch::register_pci_device(
linux,
pci_bridge,
None,
sys_allocator,
hp_control_tube,
#[cfg(feature = "swap")]
swap_controller,
)?;
(hotplug_key, pci_address)
}
HotPlugDeviceType::EndPoint => {
let hotplug_key = HotPlugKey::HostVfio { host_addr };
let (vfio_device, jail, viommu_mapper) = create_vfio_device(
&cfg.jail_config,
&linux.vm,
sys_allocator,
irq_control_tubes,
vm_memory_control_tubes,
control_tubes,
&device.path,
true,
None,
None,
None,
// The IOMMU device type follows from whether a virtio-iommu tube was supplied.
if iommu_host_tube.is_some() {
IommuDevType::VirtioIommu
} else {
IommuDevType::NoIommu
},
None,
)?;
let vfio_pci_device = match vfio_device {
VfioDeviceVariant::Pci(pci) => Box::new(pci),
VfioDeviceVariant::Platform(_) => bail!("vfio platform hotplug not supported"),
};
let pci_address = Arch::register_pci_device(
linux,
vfio_pci_device,
jail,
sys_allocator,
hp_control_tube,
#[cfg(feature = "swap")]
swap_controller,
)?;
if let Some(iommu_host_tube) = iommu_host_tube {
// Announce the freshly registered endpoint over the IOMMU tube, identified by its guest
// PCI address.
let endpoint_addr = pci_address.to_u32();
let vfio_wrapper = viommu_mapper.context("expected mapper")?;
let descriptor = vfio_wrapper.clone_as_raw_descriptor()?;
let request =
VirtioIOMMURequest::VfioCommand(VirtioIOMMUVfioCommand::VfioDeviceAdd {
endpoint_addr,
wrapper_id: vfio_wrapper.id(),
container: {
// SAFETY:
// Safe because the descriptor is uniquely owned by `descriptor`.
unsafe { File::from_raw_descriptor(descriptor) }
},
});
// Any transport failure is reported as `SocketFailed`; any reply other than `Ok` is an error.
match virtio_iommu_request(iommu_host_tube, &request)
.map_err(|_| VirtioIOMMUVfioError::SocketFailed)?
{
VirtioIOMMUResponse::VfioResponse(VirtioIOMMUVfioResult::Ok) => (),
resp => bail!("Unexpected message response: {:?}", resp),
}
}
(hotplug_key, pci_address)
}
};
// Record the device on its hotplug bus; the bus lock is taken separately for each call.
hp_bus.lock().add_hotplug_device(hotplug_key, pci_address);
if device.hp_interrupt {
hp_bus.lock().hot_plug(pci_address)?;
}
Ok(())
}
/// Hot-plugs a virtio-net device configured by `net_param` into the running VM.
///
/// Three tube pairs are created for the new device: one for MSI, one for ioevent / VM memory
/// requests and one for VM control. The host end of each pair is appended to the matching
/// output vector, while the device ends travel with the `NetResourceCarrier` handed to
/// `hotplug_manager`.
///
/// Returns whatever `PciHotPlugManager::hotplug_device` returns (presumably the PCI bus number
/// the device was plugged into — confirm against the manager), or the first error encountered.
#[cfg(feature = "pci-hotplug")]
fn add_hotplug_net<V: VmArch, Vcpu: VcpuArch>(
    linux: &mut RunnableLinuxVm<V, Vcpu>,
    sys_allocator: &mut SystemAllocator,
    irq_control_tubes: &mut Vec<Tube>,
    vm_memory_control_tubes: &mut Vec<VmMemoryTube>,
    vm_control_tubes: &mut Vec<TaggedControlTube>,
    hotplug_manager: &mut PciHotPlugManager,
    net_param: NetParameters,
) -> Result<u8> {
    // MSI tube: host end goes to the IRQ handler.
    let (msi_host, msi_device) = Tube::pair().context("create tube")?;
    irq_control_tubes.push(msi_host);

    // Ioevent tube: host end goes to the VM memory handler, device end is wrapped in a client.
    let (ioevent_host, ioevent_device) = Tube::pair().context("create tube")?;
    let ioevent_client = VmMemoryClient::new(ioevent_device);
    let ioevent_memory_tube = VmMemoryTube {
        tube: ioevent_host,
        expose_with_viommu: false,
    };
    vm_memory_control_tubes.push(ioevent_memory_tube);

    // VM control tube: host end joins the control loop.
    let (control_host, control_device) = Tube::pair().context("create tube")?;
    vm_control_tubes.push(TaggedControlTube::Vm(control_host));

    let carriers = vec![ResourceCarrier::VirtioNet(NetResourceCarrier::new(
        net_param,
        msi_device,
        ioevent_client,
        control_device,
    ))];
    hotplug_manager.hotplug_device(carriers, linux, sys_allocator)
}
/// Dispatches a network hotplug control command to the matching add/remove handler.
///
/// `AddTap` plugs a new tap-backed net device, `RemoveTap` unplugs the device on the given bus.
/// The handler's `VmResponse` is returned unchanged.
#[cfg(feature = "pci-hotplug")]
fn handle_hotplug_net_command<V: VmArch, Vcpu: VcpuArch>(
    net_cmd: NetControlCommand,
    linux: &mut RunnableLinuxVm<V, Vcpu>,
    sys_allocator: &mut SystemAllocator,
    irq_control_tubes: &mut Vec<Tube>,
    vm_memory_control_tubes: &mut Vec<VmMemoryTube>,
    vm_control_tubes: &mut Vec<TaggedControlTube>,
    hotplug_manager: &mut PciHotPlugManager,
) -> VmResponse {
    match net_cmd {
        NetControlCommand::RemoveTap(bus_num) => {
            handle_hotplug_net_remove(linux, sys_allocator, hotplug_manager, bus_num)
        }
        NetControlCommand::AddTap(name) => handle_hotplug_net_add(
            linux,
            sys_allocator,
            irq_control_tubes,
            vm_memory_control_tubes,
            vm_control_tubes,
            hotplug_manager,
            &name,
        ),
    }
}
/// Hot-plugs a net device backed by the tap interface `tap_name`.
///
/// The device is created with default-ish parameters: no explicit MAC, no vhost-net, no queue
/// pair override, split (non-packed) queues and no fixed PCI address.
///
/// Returns `VmResponse::PciHotPlugResponse` carrying the bus on success, or
/// `VmResponse::ErrString` with the debug-formatted error on failure.
#[cfg(feature = "pci-hotplug")]
fn handle_hotplug_net_add<V: VmArch, Vcpu: VcpuArch>(
    linux: &mut RunnableLinuxVm<V, Vcpu>,
    sys_allocator: &mut SystemAllocator,
    irq_control_tubes: &mut Vec<Tube>,
    vm_memory_control_tubes: &mut Vec<VmMemoryTube>,
    vm_control_tubes: &mut Vec<TaggedControlTube>,
    hotplug_manager: &mut PciHotPlugManager,
    tap_name: &str,
) -> VmResponse {
    let net_param = NetParameters {
        mode: NetParametersMode::TapName {
            tap_name: tap_name.to_owned(),
            mac: None,
        },
        vhost_net: None,
        vq_pairs: None,
        packed_queue: false,
        pci_address: None,
    };
    add_hotplug_net(
        linux,
        sys_allocator,
        irq_control_tubes,
        vm_memory_control_tubes,
        vm_control_tubes,
        hotplug_manager,
        net_param,
    )
    .map_or_else(
        |e| VmResponse::ErrString(format!("{:?}", e)),
        |bus| VmResponse::PciHotPlugResponse { bus },
    )
}
/// Unplugs the hot-plugged device sitting on PCI bus `bus`.
///
/// Returns `VmResponse::Ok` on success, or `VmResponse::ErrString` with the debug-formatted
/// error reported by the hotplug manager.
#[cfg(feature = "pci-hotplug")]
fn handle_hotplug_net_remove<V: VmArch, Vcpu: VcpuArch>(
    linux: &mut RunnableLinuxVm<V, Vcpu>,
    sys_allocator: &mut SystemAllocator,
    hotplug_manager: &mut PciHotPlugManager,
    bus: u8,
) -> VmResponse {
    if let Err(e) = hotplug_manager.remove_hotplug_device(bus, linux, sys_allocator) {
        return VmResponse::ErrString(format!("{:?}", e));
    }
    VmResponse::Ok
}
/// Unplugs the emulated bridge identified by `hotplug_key` from the hotplug bus that holds it.
///
/// The bus holding the bridge is searched for among `linux.hotplug_bus`. Once found, the
/// bridge's PCI address is released, a hot-unplug is issued for it and `child_bus` is queued in
/// `buses_to_remove`. If the holding bus is empty afterwards and has a hotplug key of its own,
/// the function recurses to remove that bus' bridge as well.
///
/// Returns an error if no bus knows `hotplug_key` or if a hot-unplug fails.
#[cfg(target_arch = "x86_64")]
fn remove_hotplug_bridge<V: VmArch, Vcpu: VcpuArch>(
    linux: &RunnableLinuxVm<V, Vcpu>,
    sys_allocator: &mut SystemAllocator,
    buses_to_remove: &mut Vec<u8>,
    hotplug_key: HotPlugKey,
    child_bus: u8,
) -> Result<()> {
    for (bus_num, hp_bus) in linux.hotplug_bus.iter() {
        let mut bus_guard = hp_bus.lock();
        let Some(pci_addr) = bus_guard.get_hotplug_device(hotplug_key) else {
            continue;
        };

        sys_allocator.release_pci(pci_addr.bus, pci_addr.dev, pci_addr.func);
        bus_guard.hot_unplug(pci_addr)?;
        buses_to_remove.push(child_bus);

        // This bus lost its last device: walk one level up and remove its own bridge too.
        // Note that `bus_guard` stays locked across the recursive call.
        if bus_guard.is_empty() {
            if let Some(parent_key) = bus_guard.get_hotplug_key() {
                remove_hotplug_bridge(
                    linux,
                    sys_allocator,
                    buses_to_remove,
                    parent_key,
                    *bus_num,
                )?;
            }
        }
        return Ok(());
    }
    Err(anyhow!(
        "Can not find device {:?} on hotplug buses",
        hotplug_key
    ))
}
/// Removes a previously hot-plugged host PCI device from the running VM.
///
/// A `HotPlugKey` is built from `device.device_type` and the host PCI address parsed from
/// `device.path`, and the hotplug bus holding that key is looked up. When `iommu_host_tube` is
/// provided, a `VfioDeviceDel` request is sent over it first. The device's PCI address is then
/// released and, unless all sibling downstream ports are empty, a hot-unplug is issued for it.
/// Emulated bridges that end up without children are unplugged through
/// `remove_hotplug_bridge`, and the buses collected on the way are dropped from
/// `linux.hotplug_bus` at the end.
///
/// Returns an error if the address cannot be parsed, no bus holds the device, the IOMMU request
/// fails or a hot-unplug fails.
#[cfg(target_arch = "x86_64")]
fn remove_hotplug_device<V: VmArch, Vcpu: VcpuArch>(
linux: &mut RunnableLinuxVm<V, Vcpu>,
sys_allocator: &mut SystemAllocator,
iommu_host_tube: Option<&Tube>,
device: &HotPlugDeviceInfo,
) -> Result<()> {
let host_addr = PciAddress::from_path(&device.path)?;
let hotplug_key = match device.device_type {
HotPlugDeviceType::UpstreamPort => HotPlugKey::HostUpstreamPort { host_addr },
HotPlugDeviceType::DownstreamPort => HotPlugKey::HostDownstreamPort { host_addr },
HotPlugDeviceType::EndPoint => HotPlugKey::HostVfio { host_addr },
};
// Find the bus that currently holds the device; keep its number and a cloned handle so the
// map is no longer borrowed by the search.
let hp_bus = linux
.hotplug_bus
.iter()
.find(|(_, hp_bus)| {
let hp_bus = hp_bus.lock();
hp_bus.get_hotplug_device(hotplug_key).is_some()
})
.map(|(bus_num, hp_bus)| (*bus_num, hp_bus.clone()));
if let Some((bus_num, hp_bus)) = hp_bus {
// Bus numbers to drop from `linux.hotplug_bus` once all unplugging is done.
let mut buses_to_remove = Vec::new();
// Hotplug key of this bus, set only if its bridge gets removed below.
let mut removed_key = None;
// This guard stays alive until the end of the enclosing block.
let mut hp_bus_lock = hp_bus.lock();
if let Some(pci_addr) = hp_bus_lock.get_hotplug_device(hotplug_key) {
if let Some(iommu_host_tube) = iommu_host_tube {
// Send the endpoint removal over the IOMMU tube before unplugging the device.
let request =
VirtioIOMMURequest::VfioCommand(VirtioIOMMUVfioCommand::VfioDeviceDel {
endpoint_addr: pci_addr.to_u32(),
});
match virtio_iommu_request(iommu_host_tube, &request)
.map_err(|_| VirtioIOMMUVfioError::SocketFailed)?
{
VirtioIOMMUResponse::VfioResponse(VirtioIOMMUVfioResult::Ok) => (),
resp => bail!("Unexpected message response: {:?}", resp),
}
}
// Stays true unless this bus is a host downstream port and another downstream port with
// the same host bus number still has devices.
let mut empty_simbling = true;
if let Some(HotPlugKey::HostDownstreamPort { host_addr }) =
hp_bus_lock.get_hotplug_key()
{
let addr_alias = host_addr;
for (simbling_bus_num, hp_bus) in linux.hotplug_bus.iter() {
// Skip our own bus: it is already locked by `hp_bus_lock` above.
if *simbling_bus_num != bus_num {
let hp_bus_lock = hp_bus.lock();
let hotplug_key = hp_bus_lock.get_hotplug_key();
if let Some(HotPlugKey::HostDownstreamPort { host_addr }) = hotplug_key {
if addr_alias.bus == host_addr.bus && !hp_bus_lock.is_empty() {
empty_simbling = false;
break;
}
}
}
}
}
// If all sibling downstream ports are empty, do not send a hot unplug event for this
// downstream port. The root port will send one plug-out interrupt and remove all
// the remaining devices.
if !empty_simbling {
hp_bus_lock.hot_unplug(pci_addr)?;
}
sys_allocator.release_pci(pci_addr.bus, pci_addr.dev, pci_addr.func);
if empty_simbling || hp_bus_lock.is_empty() {
if let Some(hotplug_key) = hp_bus_lock.get_hotplug_key() {
removed_key = Some(hotplug_key);
remove_hotplug_bridge(
linux,
sys_allocator,
&mut buses_to_remove,
hotplug_key,
bus_num,
)?;
}
}
}
// Some types of TBT devices have a few empty downstream ports. The emulated bridges
// of these ports won't be removed since no vfio device is connected to our emulated
// bridges. So we explicitly check all sibling bridges of the removed bridge here,
// and remove them if the bridge has no child device connected.
if let Some(HotPlugKey::HostDownstreamPort { host_addr }) = removed_key {
let addr_alias = host_addr;
for (simbling_bus_num, hp_bus) in linux.hotplug_bus.iter() {
if *simbling_bus_num != bus_num {
let hp_bus_lock = hp_bus.lock();
let hotplug_key = hp_bus_lock.get_hotplug_key();
if let Some(HotPlugKey::HostDownstreamPort { host_addr }) = hotplug_key {
if addr_alias.bus == host_addr.bus && hp_bus_lock.is_empty() {
// `hotplug_key` was just matched as `Some(..)`, so the `unwrap()` below cannot panic.
remove_hotplug_bridge(
linux,
sys_allocator,
&mut buses_to_remove,
hotplug_key.unwrap(),
*simbling_bus_num,
)?;
}
}
}
}
}
// Drop every bus collected above from the hotplug bus map.
for bus in buses_to_remove.iter() {
linux.hotplug_bus.remove(bus);
}
return Ok(());
}
Err(anyhow!(
"Can not find device {:?} on hotplug buses",
hotplug_key
))
}
/// Asks the guest to suspend, waits for it to report suspension, then finishes the VM suspend.
///
/// The shared "guest suspended" flag is cleared and, when `pm` is available, a sleep button
/// event is emulated (a guest running e.g. acpid can react to it by suspending). The function
/// then waits up to 15 seconds on the condition variable, signals `suspend_evt` regardless of
/// the outcome and finally sends `response` over `tube`. Failures to signal or send are only
/// logged.
pub fn trigger_vm_suspend_and_wait_for_entry(
    guest_suspended_cvar: Arc<(Mutex<bool>, Condvar)>,
    tube: &SendTube,
    response: vm_control::VmResponse,
    suspend_evt: Event,
    pm: Option<Arc<Mutex<dyn PmResource + Send>>>,
) {
    // How long to wait for the guest to report that it entered suspend.
    const SUSPEND_ENTRY_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(15);

    let (flag_lock, suspended_cvar) = &*guest_suspended_cvar;
    let mut suspended_flag = flag_lock.lock();
    *suspended_flag = false;

    // During suspend also emulate sleepbtn, which allows suspending the VM when the guest
    // reacts to sleep button events.
    match pm {
        Some(pm) => {
            pm.lock().slpbtn_evt();
        }
        None => {
            error!("generating sleepbtn during suspend not supported");
        }
    }

    // Wait for the guest suspension notification; proceed anyway once the timeout expires.
    let (suspended_flag, wait_outcome) =
        suspended_cvar.wait_timeout(suspended_flag, SUSPEND_ENTRY_TIMEOUT);
    if wait_outcome.timed_out() {
        warn!("Guest suspension timeout - proceeding anyway");
    } else if *suspended_flag {
        info!("Guest suspended");
    }

    if let Err(e) = suspend_evt.signal() {
        error!("failed to trigger suspend event: {}", e);
    }

    // Now we are ready to report over the tube that the VM suspend has finished.
    if let Err(e) = tube.send(&response) {
        error!("failed to send VmResponse: {}", e);
    }
}
/// Sends `command` to the pvclock device over `tube` and waits for its reply.
///
/// Returns an error if sending or receiving fails, or if the device answers with
/// `PvClockCommandResponse::Err`. A `DeviceInactive` reply is only logged as a warning; any
/// other reply is logged at info level.
#[cfg(feature = "pvclock")]
fn send_pvclock_cmd(tube: &Tube, command: PvClockCommand) -> Result<()> {
    tube.send(&command)
        .with_context(|| format!("failed to send pvclock command {:?}", command))?;
    let resp: PvClockCommandResponse = tube
        .recv()
        .context("failed to receive pvclock command response")?;
    match &resp {
        PvClockCommandResponse::Err(e) => {
            bail!("pvclock encountered error on {:?}: {}", command, e)
        }
        PvClockCommandResponse::DeviceInactive => {
            warn!("Tried to send {command:?} but pvclock device was inactive");
        }
        _ => {
            info!("{command:?} completed with {resp:?}");
        }
    }
    Ok(())
}
#[cfg(target_arch = "x86_64")]
fn handle_hotplug_command<V: VmArch, Vcpu: VcpuArch>(
linux: &mut RunnableLinuxVm<V, Vcpu>,
sys_allocator: &mut SystemAllocator,
cfg: &Config,
add_irq_control_tubes: &mut Vec<Tube>,
add_vm_memory_control_tubes: &mut Vec<VmMemoryTube>,
add_tubes: &mut Vec<TaggedControlTube>,
hp_control_tube: &mpsc::Sender<PciRootCommand>,
iommu_host_tube: Option<&Tube>,
device: &HotPlugDeviceInfo,
add: bool,
#[cfg(feature = "swap")] swap_controller: &mut Option<SwapController>,
) -> VmResponse {
let iommu_host_tube = if cfg.vfio_isolate_hotplug {
iommu_host_tube
} else {
None
};
let ret = if add {
add_hotplug_device(
linux,
sys_allocator,
cfg,
add_irq_control_tubes,
add_vm_memory_control_tubes,
add_tubes,
hp_control_tube,
iommu_host_tube,
device,
#[cfg(feature = "swap")]
swap_controller,
)
} else {
remove_hotplug_device(linux, sys_allocator, iommu_host_tube, device)
};
match ret {
Ok(()) => VmResponse::Ok,
Err(e) => {
error!("hanlde_hotplug_command failure: {}", e);
add_tubes.clear();
VmResponse::Err(base::Error::new(libc::EINVAL))
}
}
}
/// Borrowed state that the VM control loop hands to its request handlers
/// (`process_vm_request` and `process_vm_control_event`).
///
/// Most fields are references into values owned elsewhere — presumably by `run_control`
/// (TODO confirm) — and which fields exist depends on the target architecture and the
/// enabled cargo features.
struct ControlLoopState<'a, V: VmArch, Vcpu: VcpuArch> {
/// The running VM (VM handle, IRQ chip, power management, hotplug buses, ...).
linux: &'a mut RunnableLinuxVm<V, Vcpu>,
/// VM configuration; handlers read e.g. `force_s2idle` and `vfio_isolate_hotplug`.
cfg: &'a Config,
/// Shared system resource allocator; locked while handling hotplug and fs mapping requests.
sys_allocator: &'a Arc<Mutex<SystemAllocator>>,
/// Registered control tubes keyed by id; used to route balloon responses that belong to a
/// different requester.
control_tubes: &'a BTreeMap<usize, TaggedControlTube>,
/// Tubes passed to `VmRequest::execute` for disk requests.
disk_host_tubes: &'a [Tube],
/// Tube passed to `VmRequest::execute` for GPU requests.
#[cfg(feature = "gpu")]
gpu_control_tube: &'a Tube,
/// Tube passed to `VmRequest::execute` for USB requests.
#[cfg(feature = "usb")]
usb_control_tube: &'a Tube,
/// Optional tube to the virtio-iommu device, used when hot-plugging VFIO devices.
#[cfg(target_arch = "x86_64")]
iommu_host_tube: &'a Option<Arc<Mutex<Tube>>>,
/// Sender for `PciRootCommand`s, forwarded to the VFIO hotplug handler.
#[cfg(target_arch = "x86_64")]
hp_control_tube: &'a mpsc::Sender<PciRootCommand>,
/// Flag and condition variable the s2idle wait thread blocks on; must be `Some` when a
/// suspend is requested with `force_s2idle` (it is unwrapped there).
guest_suspended_cvar: &'a Option<Arc<(Mutex<bool>, Condvar)>>,
/// PCI hotplug manager; `None` makes net hotplug requests fail with an error string.
#[cfg(feature = "pci-hotplug")]
hotplug_manager: &'a mut Option<PciHotPlugManager>,
/// Optional swap controller, forwarded to hotplug handling and `VmRequest::execute`.
#[cfg(feature = "swap")]
swap_controller: &'a mut Option<SwapController>,
/// Join handle and control channel of every vCPU thread; used to kick the vCPUs.
vcpu_handles: &'a [(JoinHandle<()>, mpsc::Sender<vm_control::VcpuControl>)],
/// Optional balloon tube; balloon commands fail with `ENOTSUP` when it is `None`.
#[cfg(feature = "balloon")]
balloon_tube: Option<&'a mut BalloonTube>,
/// Control tube passed to `VmRequest::execute` for device-level commands.
device_ctrl_tube: &'a Tube,
/// Control tube of the IRQ handler; newly created IRQ control tubes are sent over it.
irq_handler_control: &'a Tube,
/// Control tube of the VM memory handler; newly created VM memory tubes are sent over it.
#[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))]
vm_memory_handler_control: &'a Tube,
/// Listener tubes registered per event kind.
#[cfg(feature = "registered_events")]
registered_evt_tubes: &'a mut HashMap<RegisteredEvent, HashSet<AddressedProtoTube>>,
/// Optional tube to the pvclock device, used to announce suspend/resume to the guest.
#[cfg(feature = "pvclock")]
pvclock_host_tube: Option<Arc<Tube>>,
}
/// Executes a single `VmRequest` received on the control tube `tube` (registered under `id`).
///
/// Hotplug, registered-event and balloon requests are handled here directly; every other
/// request is delegated to `VmRequest::execute`. Control tubes created while handling the
/// request are appended to `add_tubes`; new IRQ and VM memory tubes are forwarded to their
/// handler threads before returning.
///
/// Returns a tuple of:
/// * the response to send back, or `None` when no response must be sent by the caller (the
///   balloon command produced none, or it was already routed to another control tube),
/// * whether a suspend was requested in `force_s2idle` mode, in which case the spawned
///   `s2idle_wait` thread sends the response later,
/// * the new run mode, if the request changed it.
fn process_vm_request<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
state: &mut ControlLoopState<V, Vcpu>,
id: usize,
tube: &Tube,
request: VmRequest,
// `add_tubes` is only used by the hotplug paths, which are compiled out on some configurations.
#[cfg_attr(
not(any(target_arch = "x86_64", feature = "pci-hotplug")),
allow(unused_variables, clippy::ptr_arg)
)]
add_tubes: &mut Vec<TaggedControlTube>,
) -> Result<(Option<VmResponse>, bool, Option<VmRunMode>)> {
let mut suspend_requested = false;
let mut run_mode_opt = None;
// Tubes created by hotplug handling; forwarded to the handler threads at the end.
#[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))]
let mut add_irq_control_tubes = Vec::new();
#[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))]
let mut add_vm_memory_control_tubes = Vec::new();
let response = match request {
VmRequest::HotPlugVfioCommand { device, add } => {
#[cfg(target_arch = "x86_64")]
{
handle_hotplug_command(
state.linux,
&mut state.sys_allocator.lock(),
state.cfg,
&mut add_irq_control_tubes,
&mut add_vm_memory_control_tubes,
add_tubes,
state.hp_control_tube,
// Lock the IOMMU tube (if any) and pass it as a plain `&Tube` for this call.
state.iommu_host_tube.as_ref().map(|t| t.lock()).as_deref(),
&device,
add,
#[cfg(feature = "swap")]
state.swap_controller,
)
}
#[cfg(not(target_arch = "x86_64"))]
{
// Suppress warnings.
let _ = (device, add);
VmResponse::Ok
}
}
#[cfg(feature = "pci-hotplug")]
VmRequest::HotPlugNetCommand(net_cmd) => {
if let Some(hotplug_manager) = state.hotplug_manager.as_mut() {
handle_hotplug_net_command(
net_cmd,
state.linux,
&mut state.sys_allocator.lock(),
&mut add_irq_control_tubes,
&mut add_vm_memory_control_tubes,
add_tubes,
hotplug_manager,
)
} else {
VmResponse::ErrString("PCI hotplug is not enabled.".to_owned())
}
}
#[cfg(feature = "registered_events")]
VmRequest::RegisterListener { socket_addr, event } => {
// Reuse an existing tube for this address if there is one; do nothing if the address is
// already registered for this event.
let (registered_tube, already_registered) =
find_registered_tube(state.registered_evt_tubes, &socket_addr, event);
if !already_registered {
let addr_tube = make_addr_tube_from_maybe_existing(registered_tube, socket_addr)?;
if let Some(tubes) = state.registered_evt_tubes.get_mut(&event) {
tubes.insert(addr_tube);
} else {
state
.registered_evt_tubes
.insert(event, vec![addr_tube].into_iter().collect());
}
}
VmResponse::Ok
}
#[cfg(feature = "registered_events")]
VmRequest::UnregisterListener { socket_addr, event } => {
if let Some(tubes) = state.registered_evt_tubes.get_mut(&event) {
tubes.retain(|t| t.socket_addr != socket_addr);
}
// Drop events that no longer have any listener.
state
.registered_evt_tubes
.retain(|_, tubes| !tubes.is_empty());
VmResponse::Ok
}
#[cfg(feature = "registered_events")]
VmRequest::Unregister { socket_addr } => {
// Remove the address from every event it is registered for.
for (_, tubes) in state.registered_evt_tubes.iter_mut() {
tubes.retain(|t| t.socket_addr != socket_addr);
}
state
.registered_evt_tubes
.retain(|_, tubes| !tubes.is_empty());
VmResponse::Ok
}
#[cfg(feature = "balloon")]
VmRequest::BalloonCommand(cmd) => {
if let Some(tube) = state.balloon_tube.as_mut() {
// `send_cmd` may yield no response at all; then nothing is sent back for now.
let Some((r, key)) = tube.send_cmd(cmd, Some(id)) else {
return Ok((None, false, None));
};
// The response carries the key of the control tube it belongs to; if that is not
// the current requester, deliver it there directly.
if key != id {
let Some(TaggedControlTube::Vm(tube)) = state.control_tubes.get(&key) else {
return Ok((None, false, None));
};
if let Err(e) = tube.send(&r) {
error!("failed to send VmResponse: {}", e);
}
return Ok((None, false, None));
}
r
} else {
VmResponse::Err(base::Error::new(libc::ENOTSUP))
}
}
// Everything else is handled generically by `VmRequest::execute`.
_ => {
let response = request.execute(
&state.linux.vm,
&mut run_mode_opt,
state.disk_host_tubes,
&mut state.linux.pm,
#[cfg(feature = "gpu")]
Some(state.gpu_control_tube),
#[cfg(not(feature = "gpu"))]
None,
#[cfg(feature = "usb")]
Some(state.usb_control_tube),
#[cfg(not(feature = "usb"))]
None,
&mut state.linux.bat_control,
|msg| {
vcpu::kick_all_vcpus(
state.vcpu_handles,
state.linux.irq_chip.as_irq_chip(),
msg,
)
},
state.cfg.force_s2idle,
#[cfg(feature = "swap")]
state.swap_controller.as_ref(),
state.device_ctrl_tube,
state.vcpu_handles.len(),
state.irq_handler_control,
|| state.linux.irq_chip.snapshot(state.linux.vcpu_count),
);
if state.cfg.force_s2idle {
if let VmRequest::SuspendVcpus = request {
suspend_requested = true;
// Spawn s2idle wait thread. It sends the (cloned) response itself once the guest
// has suspended or the wait timed out.
let send_tube = tube.try_clone_send_tube().unwrap();
let suspend_evt = state.linux.suspend_evt.try_clone().unwrap();
let guest_suspended_cvar = state.guest_suspended_cvar.clone();
let delayed_response = response.clone();
let pm = state.linux.pm.clone();
std::thread::Builder::new()
.name("s2idle_wait".to_owned())
.spawn(move || {
trigger_vm_suspend_and_wait_for_entry(
guest_suspended_cvar.unwrap(),
&send_tube,
delayed_response,
suspend_evt,
pm,
)
})
.context("failed to spawn s2idle_wait thread")?;
}
} else {
// If not doing s2idle, the guest clock should
// behave as the host does, so let the guest
// know about the suspend / resume via
// virtio-pvclock.
#[cfg(feature = "pvclock")]
if let Some(ref pvclock_host_tube) = state.pvclock_host_tube {
let cmd = match request {
VmRequest::SuspendVcpus => Some(PvClockCommand::Suspend),
VmRequest::ResumeVcpus => Some(PvClockCommand::Resume),
_ => None,
};
if let Some(cmd) = cmd {
if let Err(e) = send_pvclock_cmd(pvclock_host_tube, cmd.clone()) {
error!("{:?} command failed: {:#}", cmd, e);
} else {
info!("{:?} command successfully processed", cmd);
}
}
}
}
response
}
};
// Hand tubes created during hotplug to the IRQ and VM memory handler threads.
cfg_if::cfg_if! {
if #[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))] {
if !add_irq_control_tubes.is_empty() {
state
.irq_handler_control
.send(&IrqHandlerRequest::AddIrqControlTubes(
add_irq_control_tubes,
))?;
}
if !add_vm_memory_control_tubes.is_empty() {
state
.vm_memory_handler_control
.send(&VmMemoryHandlerRequest::AddControlTubes(
add_vm_memory_control_tubes,
))?;
}
}
}
Ok((Some(response), suspend_requested, run_mode_opt))
}
/// Services one readable event on the control tube `socket` registered under `id`.
///
/// A request is received from the tube, executed according to the tube's kind and answered on
/// the same tube. A disconnected tube is reported for removal; other receive errors are logged.
///
/// Returns a tuple of:
/// * `true` if the request switched the VM to `VmRunMode::Exiting`,
/// * the ids of control tubes that should be removed,
/// * control tubes created while processing the request that should be added.
fn process_vm_control_event<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
    state: &mut ControlLoopState<V, Vcpu>,
    id: usize,
    socket: &TaggedControlTube,
) -> Result<(bool, Vec<usize>, Vec<TaggedControlTube>)> {
    let mut disconnected_ids = Vec::new();
    let mut new_tubes = Vec::new();
    match socket {
        TaggedControlTube::Vm(tube) => match tube.recv::<VmRequest>() {
            Ok(request) => {
                let (response, suspend_requested, run_mode_opt) =
                    process_vm_request(state, id, tube, request, &mut new_tubes)?;

                // When a suspend was requested the response is sent by the s2idle_wait
                // thread once suspension actually happens, so it is skipped here.
                if let Some(response) = response.filter(|_| !suspend_requested) {
                    if let Err(e) = tube.send(&response) {
                        error!("failed to send VmResponse: {}", e);
                    }
                }

                if let Some(run_mode) = run_mode_opt {
                    info!("control socket changed run mode to {}", run_mode);
                    if run_mode == VmRunMode::Exiting {
                        return Ok((true, Vec::new(), Vec::new()));
                    }
                    if run_mode == VmRunMode::Running {
                        for dev in &state.linux.resume_notify_devices {
                            dev.lock().resume_imminent();
                        }
                    }
                    // When a suspend was requested the vCPUs are kicked by the s2idle_wait
                    // thread when needed, so it is skipped here.
                    if !suspend_requested {
                        vcpu::kick_all_vcpus(
                            state.vcpu_handles,
                            state.linux.irq_chip.as_irq_chip(),
                            VcpuControl::RunState(run_mode),
                        );
                    }
                }
            }
            Err(TubeError::Disconnected) => disconnected_ids.push(id),
            Err(e) => {
                error!("failed to recv VmRequest: {}", e);
            }
        },
        TaggedControlTube::VmMsync(tube) => match tube.recv::<VmMsyncRequest>() {
            Ok(request) => {
                let response = request.execute(&mut state.linux.vm);
                if let Err(e) = tube.send(&response) {
                    error!("failed to send VmMsyncResponse: {}", e);
                }
            }
            Err(TubeError::Disconnected) => disconnected_ids.push(id),
            Err(e) => {
                error!("failed to recv VmMsyncRequest: {}", e);
            }
        },
        TaggedControlTube::Fs(tube) => match tube.recv::<FsMappingRequest>() {
            Ok(request) => {
                let response =
                    request.execute(&mut state.linux.vm, &mut state.sys_allocator.lock());
                if let Err(e) = tube.send(&response) {
                    error!("failed to send VmResponse: {}", e);
                }
            }
            Err(TubeError::Disconnected) => disconnected_ids.push(id),
            Err(e) => {
                error!("failed to recv VmResponse: {}", e);
            }
        },
    }
    Ok((false, disconnected_ids, new_tubes))
}
/// A `ProtoTube` paired with the socket address of the listener it belongs to.
///
/// Equality and hashing (see the trait impls in this file) consider `socket_addr` only.
#[cfg(feature = "registered_events")]
struct AddressedProtoTube {
/// Reference-counted tube, so several `AddressedProtoTube`s can share one `ProtoTube`.
tube: Rc<ProtoTube>,
/// Address of the listening socket; the sole identity of this value.
socket_addr: String,
}
/// Two addressed tubes are equal when their socket addresses are equal; the tube itself is
/// not compared.
#[cfg(feature = "registered_events")]
impl PartialEq for AddressedProtoTube {
    fn eq(&self, rhs: &Self) -> bool {
        let (lhs_addr, rhs_addr) = (&self.socket_addr, &rhs.socket_addr);
        lhs_addr == rhs_addr
    }
}
// Marker impl: `PartialEq` compares `socket_addr` (a `String`), so equality is total.
#[cfg(feature = "registered_events")]
impl Eq for AddressedProtoTube {}
/// Hashes only the socket address, matching the `PartialEq` implementation.
#[cfg(feature = "registered_events")]
impl Hash for AddressedProtoTube {
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        Hash::hash(&self.socket_addr, state)
    }
}
#[cfg(feature = "registered_events")]
impl AddressedProtoTube {
    /// Sends the protobuf message `msg` over the underlying tube.
    ///
    /// Returns the tube's error unchanged if sending fails.
    pub fn send<M: protobuf::Message>(&self, msg: &M) -> Result<(), base::TubeError> {
        let proto_tube: &ProtoTube = &self.tube;
        proto_tube.send_proto(msg)
    }
}
/// Looks for tubes already registered for `socket_addr`.
///
/// Returns a pair of:
/// * a tube registered for `socket_addr` under some other event, if one was seen before the
///   search stopped (all tubes of one address should share the same underlying tube, so it
///   does not matter which one is picked),
/// * `true` if `socket_addr` is already registered for `event` itself; the search stops as
///   soon as this is detected.
#[cfg(feature = "registered_events")]
fn find_registered_tube<'a>(
    registered_tubes: &'a HashMap<RegisteredEvent, HashSet<AddressedProtoTube>>,
    socket_addr: &str,
    event: RegisteredEvent,
) -> (Option<&'a Rc<ProtoTube>>, bool) {
    let mut shared_tube: Option<&'a Rc<ProtoTube>> = None;
    // Every (event, tube) registration whose address matches, in map iteration order.
    let same_addr = registered_tubes
        .iter()
        .flat_map(|(evt, tubes)| tubes.iter().map(move |addr_tube| (evt, addr_tube)))
        .filter(|(_, addr_tube)| addr_tube.socket_addr == socket_addr);
    for (evt, addr_tube) in same_addr {
        if *evt == event {
            return (shared_tube, true);
        }
        // Remember the tube, but keep looking: a registration for the requested event may
        // still follow.
        shared_tube = Some(&addr_tube.tube);
    }
    (shared_tube, false)
}
/// Builds an `AddressedProtoTube` for `addr`.
///
/// When `tube` is `Some`, the existing tube is shared (its `Rc` is cloned). Otherwise a new
/// seqpacket connection to `addr` is opened and wrapped in a fresh `ProtoTube`.
///
/// Returns an error if connecting to `addr` or creating the tube fails.
#[cfg(feature = "registered_events")]
fn make_addr_tube_from_maybe_existing(
    tube: Option<&Rc<ProtoTube>>,
    addr: String,
) -> Result<AddressedProtoTube> {
    let shared_tube = match tube {
        Some(existing) => Rc::clone(existing),
        None => {
            let sock = UnixSeqpacket::connect(addr.clone()).with_context(|| {
                format!("failed to connect to registered listening socket {}", addr)
            })?;
            Rc::new(ProtoTube::new_from_unix_seqpacket(sock)?)
        }
    };
    Ok(AddressedProtoTube {
        tube: shared_tube,
        socket_addr: addr,
    })
}
fn run_control<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
mut linux: RunnableLinuxVm<V, Vcpu>,
sys_allocator: SystemAllocator,
cfg: Config,
control_server_socket: Option<UnlinkUnixSeqpacketListener>,
irq_control_tubes: Vec<Tube>,
vm_memory_control_tubes: Vec<VmMemoryTube>,
control_tubes: Vec<TaggedControlTube>,
#[cfg(feature = "balloon")] balloon_host_tube: Option<Tube>,
disk_host_tubes: &[Tube],
#[cfg(feature = "gpu")] gpu_control_tube: Tube,
#[cfg(feature = "usb")] usb_control_tube: Tube,
vm_evt_rdtube: RecvTube,
vm_evt_wrtube: SendTube,
sigchld_fd: SignalFd,
gralloc: RutabagaGralloc,
vcpu_ids: Vec<usize>,
iommu_host_tube: Option<Tube>,
#[cfg(target_arch = "x86_64")] hp_control_tube: mpsc::Sender<PciRootCommand>,
#[cfg(target_arch = "x86_64")] hp_thread: std::thread::JoinHandle<()>,
#[cfg(feature = "pci-hotplug")] mut hotplug_manager: Option<PciHotPlugManager>,
#[allow(unused_mut)] // mut is required x86 only
#[cfg(feature = "swap")]
mut swap_controller: Option<SwapController>,
#[cfg(feature = "registered_events")] reg_evt_rdtube: RecvTube,
guest_suspended_cvar: Option<Arc<(Mutex<bool>, Condvar)>>,
#[cfg(feature = "pvclock")] pvclock_host_tube: Option<Tube>,
metrics_tube: RecvTube,
) -> Result<ExitState> {
#[derive(EventToken)]
enum Token {
VmEvent,
Suspend,
ChildSignal,
VmControlServer,
VmControl {
id: usize,
},
#[cfg(feature = "registered_events")]
RegisteredEvent,
#[cfg(feature = "balloon")]
BalloonTube,
}
stdin()
.set_raw_mode()
.expect("failed to set terminal raw mode");
let sys_allocator_mutex = Arc::new(Mutex::new(sys_allocator));
let iommu_host_tube = iommu_host_tube.map(|t| Arc::new(Mutex::new(t)));
let wait_ctx = WaitContext::build_with(&[
(&linux.suspend_evt, Token::Suspend),
(&sigchld_fd, Token::ChildSignal),
(&vm_evt_rdtube, Token::VmEvent),
#[cfg(feature = "registered_events")]
(&reg_evt_rdtube, Token::RegisteredEvent),
])
.context("failed to build wait context")?;
if let Some(socket_server) = &control_server_socket {
wait_ctx
.add(socket_server, Token::VmControlServer)
.context("failed to add descriptor to wait context")?;
}
let mut control_tubes = BTreeMap::from_iter(control_tubes.into_iter().enumerate());
let mut next_control_id = control_tubes.len();
for (id, socket) in control_tubes.iter() {
wait_ctx
.add(socket.as_ref(), Token::VmControl { id: *id })
.context("failed to add descriptor to wait context")?;
}
#[cfg(feature = "balloon")]
let mut balloon_tube = balloon_host_tube
.map(|tube| -> Result<BalloonTube> {
wait_ctx
.add(&tube, Token::BalloonTube)
.context("failed to add descriptor to wait context")?;
Ok(BalloonTube::new(tube))
})
.transpose()
.context("failed to create balloon tube")?;
if cfg.jail_config.is_some() {
// Before starting VCPUs, in case we started with some capabilities, drop them all.
drop_capabilities().context("failed to drop process capabilities")?;
}
#[cfg(feature = "gdb")]
// Create a channel for GDB thread.
let (to_gdb_channel, from_vcpu_channel) = if linux.gdb.is_some() {
let (s, r) = mpsc::channel();
(Some(s), Some(r))
} else {
(None, None)
};
let (device_ctrl_tube, device_ctrl_resp) = Tube::pair().context("failed to create tube")?;
// Create devices thread, and restore if a restore file exists.
linux.devices_thread = match create_devices_worker_thread(
linux.vm.get_memory().clone(),
linux.io_bus.clone(),
linux.mmio_bus.clone(),
device_ctrl_resp,
) {
Ok(join_handle) => Some(join_handle),
Err(e) => {
return Err(anyhow!("Failed to start devices thread: {}", e));
}
};
let mut vcpu_handles = Vec::with_capacity(linux.vcpu_count);
let vcpu_thread_barrier = Arc::new(Barrier::new(linux.vcpu_count + 1));
if !linux
.vm
.get_hypervisor()
.check_capability(HypervisorCap::ImmediateExit)
{
return Err(anyhow!(
"missing required hypervisor capability ImmediateExit"
));
}
vcpu::setup_vcpu_signal_handler()?;
let vcpus: Vec<Option<_>> = match linux.vcpus.take() {
Some(vec) => vec.into_iter().map(Some).collect(),
None => iter::repeat_with(|| None).take(linux.vcpu_count).collect(),
};
// Enable core scheduling before creating vCPUs so that the cookie will be
// shared by all vCPU threads.
// TODO(b/199312402): Avoid enabling core scheduling for the crosvm process
// itself for even better performance. Only vCPUs need the feature.
if cfg.core_scheduling && cfg.per_vm_core_scheduling {
if let Err(e) = enable_core_scheduling() {
error!("Failed to enable core scheduling: {}", e);
}
}
let vcpu_cgroup_tasks_file = match &cfg.vcpu_cgroup_path {
None => None,
Some(cgroup_path) => {
// Move main process to cgroup_path
let mut f = File::create(&cgroup_path.join("tasks")).with_context(|| {
format!(
"failed to create vcpu-cgroup-path {}",
cgroup_path.display(),
)
})?;
f.write_all(process::id().to_string().as_bytes())?;
Some(f)
}
};
#[cfg(target_arch = "x86_64")]
let bus_lock_ratelimit_ctrl: Arc<Mutex<Ratelimit>> = Arc::new(Mutex::new(Ratelimit::new()));
#[cfg(target_arch = "x86_64")]
if cfg.bus_lock_ratelimit > 0 {
let bus_lock_ratelimit = cfg.bus_lock_ratelimit;
if linux.vm.check_capability(VmCap::BusLockDetect) {
info!("Hypervisor support bus lock detect");
linux
.vm
.enable_capability(VmCap::BusLockDetect, 0)
.expect("kvm: Failed to enable bus lock detection cap");
info!("Hypervisor enabled bus lock detect");
bus_lock_ratelimit_ctrl
.lock()
.ratelimit_set_speed(bus_lock_ratelimit);
} else {
bail!("Kvm: bus lock detection unsuported");
}
}
#[cfg(target_os = "android")]
android::set_process_profiles(&cfg.task_profiles)?;
#[allow(unused_mut)]
let mut run_mode = if cfg.suspended {
// Sleep devices before creating vcpus.
device_ctrl_tube
.send(&DeviceControlCommand::SleepDevices)
.context("send command to devices control socket")?;
match device_ctrl_tube
.recv()
.context("receive from devices control socket")?
{
VmResponse::Ok => (),
resp => bail!("device sleep failed: {}", resp),
}
VmRunMode::Suspending
} else {
VmRunMode::Running
};
#[cfg(feature = "gdb")]
if to_gdb_channel.is_some() {
// Wait until a GDB client attaches
run_mode = VmRunMode::Breakpoint;
}
// If we are restoring from a snapshot, then start suspended.
let (run_mode, post_restore_run_mode) = if cfg.restore_path.is_some() {
(VmRunMode::Suspending, run_mode)
} else {
(run_mode, run_mode)
};
#[cfg(feature = "pvclock")]
let pvclock_host_tube = pvclock_host_tube.map(Arc::new);
// Architecture-specific code must supply a vcpu_init element for each VCPU.
assert_eq!(vcpus.len(), linux.vcpu_init.len());
for ((cpu_id, vcpu), vcpu_init) in vcpus.into_iter().enumerate().zip(linux.vcpu_init.drain(..))
{
let (to_vcpu_channel, from_main_channel) = mpsc::channel();
let vcpu_affinity = match linux.vcpu_affinity.clone() {
Some(VcpuAffinity::Global(v)) => v,
Some(VcpuAffinity::PerVcpu(mut m)) => m.remove(&cpu_id).unwrap_or_default(),
None => Default::default(),
};
#[cfg(target_arch = "x86_64")]
let vcpu_hybrid_type = if !cfg.vcpu_hybrid_type.is_empty() {
Some(*cfg.vcpu_hybrid_type.get(&cpu_id).unwrap())
} else {
None
};
#[cfg(target_arch = "x86_64")]
let cpu_config = Some(CpuConfigX86_64::new(
cfg.force_calibrated_tsc_leaf,
cfg.host_cpu_topology,
cfg.enable_hwp,
cfg.no_smt,
cfg.itmt,
vcpu_hybrid_type,
));
#[cfg(target_arch = "x86_64")]
let bus_lock_ratelimit_ctrl = Arc::clone(&bus_lock_ratelimit_ctrl);
#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
let cpu_config = None;
#[cfg(target_arch = "riscv64")]
let cpu_config = Some(CpuConfigRiscv64::new(vcpu_init.fdt_address));
let handle = vcpu::run_vcpu(
cpu_id,
vcpu_ids[cpu_id],
vcpu,
vcpu_init,
linux.vm.try_clone().context("failed to clone vm")?,
linux
.irq_chip
.try_box_clone()
.context("failed to clone irqchip")?,
linux.vcpu_count,
linux.rt_cpus.contains(&cpu_id),
vcpu_affinity,
linux.delay_rt,
vcpu_thread_barrier.clone(),
(*linux.io_bus).clone(),
(*linux.mmio_bus).clone(),
vm_evt_wrtube
.try_clone()
.context("failed to clone vm event tube")?,
from_main_channel,
#[cfg(feature = "gdb")]
to_gdb_channel.clone(),
cfg.core_scheduling,
cfg.per_vm_core_scheduling,
cpu_config,
match vcpu_cgroup_tasks_file {
None => None,
Some(ref f) => Some(
f.try_clone()
.context("failed to clone vcpu cgroup tasks file")?,
),
},
#[cfg(target_arch = "x86_64")]
bus_lock_ratelimit_ctrl,
run_mode,
)?;
vcpu_handles.push((handle, to_vcpu_channel));
}
#[cfg(feature = "gdb")]
// Spawn GDB thread.
if let Some((gdb_port_num, gdb_control_tube)) = linux.gdb.take() {
let to_vcpu_channels = vcpu_handles
.iter()
.map(|(_handle, channel)| channel.clone())
.collect();
let target = GdbStub::new(
gdb_control_tube,
to_vcpu_channels,
from_vcpu_channel.unwrap(), // Must succeed to unwrap()
);
std::thread::Builder::new()
.name("gdb".to_owned())
.spawn(move || gdb_thread(target, gdb_port_num))
.context("failed to spawn GDB thread")?;
};
let (irq_handler_control, irq_handler_control_for_thread) = Tube::pair()?;
let sys_allocator_for_thread = sys_allocator_mutex.clone();
let irq_chip_for_thread = linux.irq_chip.try_box_clone()?;
let irq_handler_thread = std::thread::Builder::new()
.name("irq_handler_thread".into())
.spawn(move || {
irq_handler_thread(
irq_control_tubes,
irq_chip_for_thread,
sys_allocator_for_thread,
irq_handler_control_for_thread,
)
})
.unwrap();
let (vm_memory_handler_control, vm_memory_handler_control_for_thread) = Tube::pair()?;
let vm_memory_handler_thread = std::thread::Builder::new()
.name("vm_memory_handler_thread".into())
.spawn({
let vm = linux.vm.try_clone().context("failed to clone Vm")?;
let sys_allocator_mutex = sys_allocator_mutex.clone();
let iommu_client = iommu_host_tube
.as_ref()
.map(|t| VmMemoryRequestIommuClient::new(t.clone()));
move || {
vm_memory_handler_thread(
vm_memory_control_tubes,
vm,
sys_allocator_mutex,
gralloc,
iommu_client,
vm_memory_handler_control_for_thread,
)
}
})
.unwrap();
vcpu_thread_barrier.wait();
// Restore VM (if applicable).
// Must happen after the vCPU barrier to avoid deadlock.
if let Some(path) = &cfg.restore_path {
vm_control::do_restore(
path.clone(),
&linux.vm,
|msg| vcpu::kick_all_vcpus(&vcpu_handles, linux.irq_chip.as_irq_chip(), msg),
|msg, index| {
vcpu::kick_vcpu(&vcpu_handles.get(index), linux.irq_chip.as_irq_chip(), msg)
},
&irq_handler_control,
&device_ctrl_tube,
linux.vcpu_count,
|image| {
linux
.irq_chip
.try_box_clone()?
.restore(image, linux.vcpu_count)
},
/* require_encrypted= */ false,
)?;
// Allow the vCPUs to start for real.
vcpu::kick_all_vcpus(
&vcpu_handles,
linux.irq_chip.as_irq_chip(),
VcpuControl::RunState(post_restore_run_mode),
)
}
#[cfg(feature = "swap")]
if let Some(swap_controller) = &swap_controller {
swap_controller
.on_static_devices_setup_complete()
.context("static device setup complete")?;
}
let metrics_thread = if metrics::is_initialized() {
Some(
std::thread::Builder::new()
.name("metrics_thread".into())
.spawn(move || {
if let Err(e) = MetricsController::new(vec![metrics_tube]).run() {
error!("Metrics controller error: {:?}", e);
}
})
.context("metrics thread failed")?,
)
} else {
None
};
let mut exit_state = ExitState::Stop;
let mut pvpanic_code = PvPanicCode::Unknown;
#[cfg(feature = "registered_events")]
let mut registered_evt_tubes: HashMap<RegisteredEvent, HashSet<AddressedProtoTube>> =
HashMap::new();
'wait: loop {
let events = {
match wait_ctx.wait() {
Ok(v) => v,
Err(e) => {
error!("failed to poll: {}", e);
break;
}
}
};
let mut vm_control_ids_to_remove = Vec::new();
for event in events.iter().filter(|e| e.is_readable) {
match event.token {
#[cfg(feature = "registered_events")]
Token::RegisteredEvent => match reg_evt_rdtube.recv::<RegisteredEventWithData>() {
Ok(reg_evt) => {
let evt = reg_evt.into_event();
let mut tubes_to_remove: Vec<String> = Vec::new();
if let Some(tubes) = registered_evt_tubes.get_mut(&evt) {
for tube in tubes.iter() {
if let Err(e) = tube.send(&reg_evt.into_proto()) {
warn!(
"failed to send registered event {:?} to {}, removing from \
registrations: {}",
reg_evt, tube.socket_addr, e
);
tubes_to_remove.push(tube.socket_addr.clone());
}
}
}
for tube_addr in tubes_to_remove {
for tubes in registered_evt_tubes.values_mut() {
tubes.retain(|t| t.socket_addr != tube_addr);
}
}
registered_evt_tubes.retain(|_, tubes| !tubes.is_empty());
}
Err(e) => {
warn!("failed to recv RegisteredEvent: {}", e);
}
},
Token::VmEvent => {
let mut break_to_wait: bool = true;
match vm_evt_rdtube.recv::<VmEventType>() {
Ok(vm_event) => match vm_event {
VmEventType::Exit => {
info!("vcpu requested shutdown");
exit_state = ExitState::Stop;
}
VmEventType::Reset => {
info!("vcpu requested reset");
exit_state = ExitState::Reset;
}
VmEventType::Crash => {
info!("vcpu crashed");
exit_state = ExitState::Crash;
}
VmEventType::Panic(panic_code) => {
pvpanic_code = PvPanicCode::from_u8(panic_code);
info!("Guest reported panic [Code: {}]", pvpanic_code);
break_to_wait = false;
}
VmEventType::WatchdogReset => {
info!("vcpu stall detected");
exit_state = ExitState::WatchdogReset;
}
},
Err(e) => {
warn!("failed to recv VmEvent: {}", e);
}
}
if break_to_wait {
if pvpanic_code == PvPanicCode::Panicked {
exit_state = ExitState::GuestPanic;
}
break 'wait;
}
}
Token::Suspend => {
info!("VM requested suspend");
linux.suspend_evt.wait().unwrap();
vcpu::kick_all_vcpus(
&vcpu_handles,
linux.irq_chip.as_irq_chip(),
VcpuControl::RunState(VmRunMode::Suspending),
);
}
Token::ChildSignal => {
// Print all available siginfo structs, then exit the loop if child process has
// been exited except CLD_STOPPED and CLD_CONTINUED. the two should be ignored
// here since they are used by the vmm-swap feature.
let mut do_exit = false;
while let Some(siginfo) =
sigchld_fd.read().context("failed to read signalfd")?
{
let pid = siginfo.ssi_pid;
let pid_label = match linux.pid_debug_label_map.get(&pid) {
Some(label) => format!("{} (pid {})", label, pid),
None => format!("pid {}", pid),
};
// TODO(kawasin): this is a temporary exception until device suspension.
#[cfg(feature = "swap")]
if siginfo.ssi_code == libc::CLD_STOPPED
|| siginfo.ssi_code == libc::CLD_CONTINUED
{
continue;
}
// Ignore clean exits of non-tracked child processes when running without
// sandboxing. The virtio gpu process launches a render server for
// pass-through graphics. Host GPU drivers have been observed to fork
// child processes that exit cleanly which should not be considered a
// crash. When running with sandboxing, this should be handled by the
// device's process handler.
if cfg.jail_config.is_none()
&& !linux.pid_debug_label_map.contains_key(&pid)
&& siginfo.ssi_signo == libc::SIGCHLD as u32
&& siginfo.ssi_code == libc::CLD_EXITED
&& siginfo.ssi_status == 0
{
continue;
}
error!(
"child {} exited: signo {}, status {}, code {}",
pid_label, siginfo.ssi_signo, siginfo.ssi_status, siginfo.ssi_code
);
do_exit = true;
}
if do_exit {
exit_state = ExitState::Crash;
break 'wait;
}
}
Token::VmControlServer => {
if let Some(socket_server) = &control_server_socket {
match socket_server.accept() {
Ok(socket) => {
let id = next_control_id;
next_control_id += 1;
wait_ctx
.add(&socket, Token::VmControl { id })
.context("failed to add descriptor to wait context")?;
control_tubes.insert(
id,
TaggedControlTube::Vm(Tube::new_from_unix_seqpacket(socket)?),
);
}
Err(e) => error!("failed to accept socket: {}", e),
}
}
}
Token::VmControl { id } => {
if let Some(socket) = control_tubes.get(&id) {
let mut state = ControlLoopState {
linux: &mut linux,
cfg: &cfg,
sys_allocator: &sys_allocator_mutex,
control_tubes: &control_tubes,
disk_host_tubes,
#[cfg(feature = "gpu")]
gpu_control_tube: &gpu_control_tube,
#[cfg(feature = "usb")]
usb_control_tube: &usb_control_tube,
#[cfg(target_arch = "x86_64")]
iommu_host_tube: &iommu_host_tube,
#[cfg(target_arch = "x86_64")]
hp_control_tube: &hp_control_tube,
guest_suspended_cvar: &guest_suspended_cvar,
#[cfg(feature = "pci-hotplug")]
hotplug_manager: &mut hotplug_manager,
#[cfg(feature = "swap")]
swap_controller: &mut swap_controller,
vcpu_handles: &vcpu_handles,
#[cfg(feature = "balloon")]
balloon_tube: balloon_tube.as_mut(),
device_ctrl_tube: &device_ctrl_tube,
irq_handler_control: &irq_handler_control,
#[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))]
vm_memory_handler_control: &vm_memory_handler_control,
#[cfg(feature = "registered_events")]
registered_evt_tubes: &mut registered_evt_tubes,
#[cfg(feature = "pvclock")]
pvclock_host_tube: pvclock_host_tube.clone(),
};
let (exit_requested, mut ids_to_remove, add_tubes) =
process_vm_control_event(&mut state, id, socket)?;
if exit_requested {
break 'wait;
}
vm_control_ids_to_remove.append(&mut ids_to_remove);
for socket in add_tubes {
let id = next_control_id;
next_control_id += 1;
wait_ctx
.add(socket.as_ref(), Token::VmControl { id })
.context(
"failed to add hotplug vfio-pci descriptor to wait context",
)?;
control_tubes.insert(id, socket);
}
}
}
#[cfg(feature = "balloon")]
Token::BalloonTube => {
match balloon_tube.as_mut().expect("missing balloon tube").recv() {
Ok(resp) => {
for (resp, idx) in resp {
if let Some(TaggedControlTube::Vm(tube)) = control_tubes.get(&idx) {
if let Err(e) = tube.send(&resp) {
error!("failed to send VmResponse: {}", e);
}
} else {
error!("Bad tube index {}", idx);
}
}
}
Err(err) => {
error!("Error processing balloon tube {:?}", err)
}
}
}
}
}
remove_hungup_and_drained_tubes(
&events,
&wait_ctx,
&mut control_tubes,
vm_control_ids_to_remove,
|token: &Token| {
if let Token::VmControl { id } = token {
return Some(*id);
}
None
},
)?;
}
vcpu::kick_all_vcpus(
&vcpu_handles,
linux.irq_chip.as_irq_chip(),
VcpuControl::RunState(VmRunMode::Exiting),
);
for (handle, _) in vcpu_handles {
if let Err(e) = handle.join() {
error!("failed to join vcpu thread: {:?}", e);
}
}
// After joining all vcpu threads, unregister the process-wide signal handler.
if let Err(e) = vcpu::remove_vcpu_signal_handler() {
error!("failed to remove vcpu thread signal handler: {:#}", e);
}
// Stop the vmm-swap monitor process.
#[cfg(feature = "swap")]
drop(swap_controller);
// Stop pci root worker thread
#[cfg(target_arch = "x86_64")]
{
let _ = hp_control_tube.send(PciRootCommand::Kill);
if let Err(e) = hp_thread.join() {
error!("failed to join hotplug thread: {:?}", e);
}
}
if linux.devices_thread.is_some() {
if let Err(e) = device_ctrl_tube.send(&DeviceControlCommand::Exit) {
error!("failed to stop device control loop: {}", e);
};
if let Some(thread) = linux.devices_thread.take() {
if let Err(e) = thread.join() {
error!("failed to exit devices thread: {:?}", e);
}
}
}
// Shut down the VM Memory handler thread.
if let Err(e) = vm_memory_handler_control.send(&VmMemoryHandlerRequest::Exit) {
error!(
"failed to request exit from VM Memory handler thread: {}",
e
);
}
if let Err(e) = vm_memory_handler_thread.join() {
error!("failed to exit VM Memory handler thread: {:?}", e);
}
// Shut down the IRQ handler thread.
if let Err(e) = irq_handler_control.send(&IrqHandlerRequest::Exit) {
error!("failed to request exit from IRQ handler thread: {}", e);
}
if let Err(e) = irq_handler_thread.join() {
error!("failed to exit irq handler thread: {:?}", e);
}
// At this point, the only remaining `Arc` references to the `Bus` objects should be the ones
// inside `linux`. If the checks below fail, then some other thread is probably still running
// and needs to be explicitly stopped before dropping `linux` to ensure devices actually get
// cleaned up.
match Arc::try_unwrap(std::mem::replace(
&mut linux.mmio_bus,
Arc::new(Bus::new(BusType::Mmio)),
)) {
Ok(_) => {}
Err(_) => panic!("internal error: mmio_bus had more than one reference at shutdown"),
}
match Arc::try_unwrap(std::mem::replace(
&mut linux.io_bus,
Arc::new(Bus::new(BusType::Io)),
)) {
Ok(_) => {}
Err(_) => panic!("internal error: io_bus had more than one reference at shutdown"),
}
// Explicitly drop the VM structure here to allow the devices to clean up before the
// control sockets are closed when this function exits.
mem::drop(linux);
// Drop the hotplug manager to tell the warden process to exit before we try to join
// the metrics thread.
#[cfg(feature = "pci-hotplug")]
mem::drop(hotplug_manager);
// All our children should have exited by now, so closing our fd should
// terminate metrics. Then join so that everything gets flushed.
metrics::get_destructor().cleanup();
if let Some(metrics_thread) = metrics_thread {
if let Err(e) = metrics_thread.join() {
error!("failed to exit irq handler thread: {:?}", e);
}
}
stdin()
.set_canon_mode()
.expect("failed to restore canonical mode for terminal");
Ok(exit_state)
}
/// Event tokens registered with the wait context polled by `irq_handler_thread`.
#[derive(EventToken)]
enum IrqHandlerToken {
/// An IRQ event is ready; `index` is the value handed to the irqchip's `service_irq_event`.
IrqFd { index: IrqEventIndex },
/// A request is readable on the IRQ control tube stored under `id`.
VmIrq { id: usize },
/// The irqchip's delayed-IRQ trigger event is ready.
DelayedIrqFd,
/// A request is readable on the tube that controls the handler thread itself.
HandlerControl,
}
/// Handles IRQs and requests from devices to add additional IRQ lines.
///
/// The thread polls a single wait context that contains:
/// * the read notifier of `handler_control` (commands for this thread),
/// * the irqchip's delayed IRQ trigger, when the irqchip provides one,
/// * every IRQ event token reported by the irqchip, and
/// * the read notifier of every tube in `irq_control_tubes`.
///
/// The loop ends when `IrqHandlerRequest::Exit` is received, when polling fails, or when the
/// control tube hangs up without having requested an exit.
///
/// # Errors
///
/// Returns an error if the wait context cannot be built or modified, or if the irqchip fails to
/// report its event tokens.
///
/// # Panics
///
/// Panics if receiving from `handler_control` fails with `TubeError::Disconnected`.
fn irq_handler_thread(
irq_control_tubes: Vec<Tube>,
mut irq_chip: Box<dyn IrqChipArch + 'static>,
sys_allocator_mutex: Arc<Mutex<SystemAllocator>>,
handler_control: Tube,
) -> anyhow::Result<()> {
// Start with only the handler control tube; everything else is added below.
let wait_ctx = WaitContext::build_with(&[(
handler_control.get_read_notifier(),
IrqHandlerToken::HandlerControl,
)])
.context("failed to build wait context")?;
// The delayed IRQ trigger is optional and depends on the irqchip implementation.
if let Some(delayed_ioapic_irq_trigger) = irq_chip.irq_delayed_event_token()? {
wait_ctx
.add(&delayed_ioapic_irq_trigger, IrqHandlerToken::DelayedIrqFd)
.context("failed to add descriptor to wait context")?;
}
// Kept around (and mutable) so the tokens can be unregistered and re-fetched on
// `RefreshIrqEventTokens`.
let mut irq_event_tokens = irq_chip
.irq_event_tokens()
.context("failed get event tokens from irqchip")?;
for (index, _gsi, evt) in irq_event_tokens.iter() {
wait_ctx
.add(evt, IrqHandlerToken::IrqFd { index: *index })
.context("failed to add irq chip event tokens to wait context")?;
}
// Key the initial control tubes by their position; ids for tubes added later continue from
// `next_control_id`.
let mut irq_control_tubes = BTreeMap::from_iter(irq_control_tubes.into_iter().enumerate());
let mut next_control_id = irq_control_tubes.len();
for (id, socket) in irq_control_tubes.iter() {
wait_ctx
.add(
socket.get_read_notifier(),
IrqHandlerToken::VmIrq { id: *id },
)
.context("irq control tubes to wait context")?;
}
'wait: loop {
let events = {
match wait_ctx.wait() {
Ok(v) => v,
Err(e) => {
error!("failed to poll: {}", e);
break 'wait;
}
}
};
// Number of events returned by this wait, including ones that are not readable.
let token_count = events.len();
let mut vm_irq_tubes_to_remove = Vec::new();
// Set by `WakeAndNotifyIteration`; the reply is sent only after all readable events of this
// iteration have been processed.
let mut notify_control_on_iteration_end = false;
for event in events.iter().filter(|e| e.is_readable) {
match event.token {
IrqHandlerToken::HandlerControl => {
match handler_control.recv::<IrqHandlerRequest>() {
Ok(request) => {
match request {
IrqHandlerRequest::Exit => break 'wait,
IrqHandlerRequest::AddIrqControlTubes(tubes) => {
// Register each new tube under a fresh id.
for socket in tubes {
let id = next_control_id;
next_control_id += 1;
wait_ctx
.add(
socket.get_read_notifier(),
IrqHandlerToken::VmIrq { id },
)
.context("failed to add new IRQ control Tube to wait context")?;
irq_control_tubes.insert(id, socket);
}
}
IrqHandlerRequest::RefreshIrqEventTokens => {
// Drop the currently registered irqchip tokens, fetch the current set
// from the irqchip, register it, then acknowledge on the control tube.
for (_index, _gsi, evt) in irq_event_tokens.iter() {
wait_ctx.delete(evt).context(
"failed to remove irq chip event \
token from wait context",
)?;
}
irq_event_tokens = irq_chip
.irq_event_tokens()
.context("failed get event tokens from irqchip")?;
for (index, _gsi, evt) in irq_event_tokens.iter() {
wait_ctx
.add(evt, IrqHandlerToken::IrqFd { index: *index })
.context(
"failed to add irq chip event \
tokens to wait context",
)?;
}
if let Err(e) = handler_control
.send(&IrqHandlerResponse::IrqEventTokenRefreshComplete)
{
error!(
"failed to notify IRQ event token refresh \
was completed: {}",
e
);
}
}
IrqHandlerRequest::WakeAndNotifyIteration => {
notify_control_on_iteration_end = true;
}
}
}
Err(e) => {
if let TubeError::Disconnected = e {
panic!("irq handler control tube disconnected.");
} else {
error!("failed to recv IrqHandlerRequest: {}", e);
}
}
}
}
IrqHandlerToken::VmIrq { id } => {
// Events for ids that are no longer in the map are ignored.
if let Some(tube) = irq_control_tubes.get(&id) {
handle_irq_tube_request(
&sys_allocator_mutex,
&mut irq_chip,
&mut vm_irq_tubes_to_remove,
&wait_ctx,
tube,
id,
);
}
}
IrqHandlerToken::IrqFd { index } => {
if let Err(e) = irq_chip.service_irq_event(index) {
error!("failed to signal irq {}: {}", index, e);
}
}
IrqHandlerToken::DelayedIrqFd => {
if let Err(e) = irq_chip.process_delayed_irq_events() {
warn!("can't deliver delayed irqs: {}", e);
}
}
}
}
if notify_control_on_iteration_end {
// NOTE(review): the `- 1` presumably excludes the control-tube event that carried the
// `WakeAndNotifyIteration` request itself — confirm against the receiver.
if let Err(e) = handler_control.send(&IrqHandlerResponse::HandlerIterationComplete(
token_count - 1,
)) {
error!(
"failed to notify on iteration completion (snapshotting may fail): {}",
e
);
}
}
// Remove tubes that disconnected on read or that hung up with nothing left to read.
remove_hungup_and_drained_tubes(
&events,
&wait_ctx,
&mut irq_control_tubes,
vm_irq_tubes_to_remove,
|token: &IrqHandlerToken| {
if let IrqHandlerToken::VmIrq { id } = token {
return Some(*id);
}
None
},
)?;
// A hung-up control tube with nothing left to read can never deliver `Exit`, so stop here.
if events.iter().any(|e| {
e.is_hungup && !e.is_readable && matches!(e.token, IrqHandlerToken::HandlerControl)
}) {
error!("IRQ handler control hung up but did not request an exit.");
break 'wait;
}
}
Ok(())
}
/// Services one pending request on an IRQ control tube.
///
/// A single `VmIrqRequest` is read from `tube`, executed against `irq_chip` and the system
/// allocator, and the resulting response is sent back on the same tube. When an edge IRQ event is
/// registered and the irqchip returns an event index for it, the event is also added to
/// `wait_ctx` as an `IrqFd` token.
///
/// If receiving fails with `TubeError::Disconnected`, `tube_index` is appended to
/// `vm_irq_tubes_to_remove`; any other receive error is only logged.
fn handle_irq_tube_request(
    sys_allocator_mutex: &Arc<Mutex<SystemAllocator>>,
    irq_chip: &mut Box<dyn IrqChipArch + 'static>,
    vm_irq_tubes_to_remove: &mut Vec<usize>,
    wait_ctx: &WaitContext<IrqHandlerToken>,
    tube: &Tube,
    tube_index: usize,
) {
    let request = match tube.recv::<VmIrqRequest>() {
        Ok(request) => request,
        Err(TubeError::Disconnected) => {
            // The peer is gone; let the caller drop this tube.
            vm_irq_tubes_to_remove.push(tube_index);
            return;
        }
        Err(e) => {
            error!("failed to recv VmIrqRequest: {}", e);
            return;
        }
    };

    // The allocator lock is a temporary of this statement, so it is released before the
    // response is sent.
    let response = request.execute(
        |setup| match setup {
            IrqSetup::Event(irq, ev, device_id, queue_id, device_name) => {
                let irq_evt = devices::IrqEdgeEvent::from_event(ev.try_clone()?);
                let source = IrqEventSource {
                    device_id: device_id.try_into().expect("Invalid device_id"),
                    queue_id,
                    device_name,
                };
                match irq_chip.register_edge_irq_event(irq, &irq_evt, source)? {
                    // The irqchip wants this event polled by the handler thread.
                    Some(event_index) => wait_ctx
                        .add(ev, IrqHandlerToken::IrqFd { index: event_index })
                        .map_err(|e| {
                            warn!("failed to add IrqFd to poll context: {}", e);
                            e
                        }),
                    None => Ok(()),
                }
            }
            IrqSetup::Route(route) => irq_chip.route_irq(route),
            IrqSetup::UnRegister(irq, ev) => {
                let irq_evt = devices::IrqEdgeEvent::from_event(ev.try_clone()?);
                irq_chip.unregister_edge_irq_event(irq, &irq_evt)
            }
        },
        &mut sys_allocator_mutex.lock(),
    );

    if let Err(e) = tube.send(&response) {
        error!("failed to send VmIrqResponse: {}", e);
    }
}
/// Commands to control the VM Memory handler thread.
///
/// These are received by `vm_memory_handler_thread` on its handler control tube.
#[derive(serde::Serialize, serde::Deserialize)]
pub enum VmMemoryHandlerRequest {
/// Registers additional VM memory control tubes with the handler thread; each one is added to
/// the thread's wait context under a freshly assigned id.
///
/// No response is sent for this command.
AddControlTubes(Vec<VmMemoryTube>),
/// Makes the handler thread leave its event loop and return.
///
/// No response is sent for this command.
Exit,
}
fn vm_memory_handler_thread(
control_tubes: Vec<VmMemoryTube>,
mut vm: impl Vm,
sys_allocator_mutex: Arc<Mutex<SystemAllocator>>,
mut gralloc: RutabagaGralloc,
mut iommu_client: Option<VmMemoryRequestIommuClient>,
handler_control: Tube,
) -> anyhow::Result<()> {
#[derive(EventToken)]
enum Token {
VmControl { id: usize },
HandlerControl,
}
let wait_ctx =
WaitContext::build_with(&[(handler_control.get_read_notifier(), Token::HandlerControl)])
.context("failed to build wait context")?;
let mut control_tubes = BTreeMap::from_iter(control_tubes.into_iter().enumerate());
let mut next_control_id = control_tubes.len();
for (id, socket) in control_tubes.iter() {
wait_ctx
.add(socket.as_ref(), Token::VmControl { id: *id })
.context("failed to add descriptor to wait context")?;
}
let mut region_state = VmMemoryRegionState::new();
'wait: loop {
let events = {
match wait_ctx.wait() {
Ok(v) => v,
Err(e) => {
error!("failed to poll: {}", e);
break;
}
}
};
let mut vm_control_ids_to_remove = Vec::new();
for event in events.iter().filter(|e| e.is_readable) {
match event.token {
Token::HandlerControl => match handler_control.recv::<VmMemoryHandlerRequest>() {
Ok(request) => match request {
VmMemoryHandlerRequest::Exit => break 'wait,
VmMemoryHandlerRequest::AddControlTubes(tubes) => {
for socket in tubes {
let id = next_control_id;
next_control_id += 1;
wait_ctx
.add(socket.get_read_notifier(), Token::VmControl { id })
.context(
"failed to add new vm memory control Tube to wait context",
)?;
control_tubes.insert(id, socket);
}
}
},
Err(e) => {
if let TubeError::Disconnected = e {
panic!("vm memory control tube disconnected.");
} else {
error!("failed to recv VmMemoryHandlerRequest: {}", e);
}
}
},
Token::VmControl { id } => {
if let Some(VmMemoryTube {
tube,
expose_with_viommu,
}) = control_tubes.get(&id)
{
match tube.recv::<VmMemoryRequest>() {
Ok(request) => {
let response = request.execute(
&mut vm,
&mut sys_allocator_mutex.lock(),
&mut gralloc,
if *expose_with_viommu {
iommu_client.as_mut()
} else {
None
},
&mut region_state,
);
if let Err(e) = tube.send(&response) {
error!("failed to send VmMemoryControlResponse: {}", e);
}
}
Err(e) => {
if let TubeError::Disconnected = e {
vm_control_ids_to_remove.push(id);
} else {
error!("failed to recv VmMemoryControlRequest: {}", e);
}
}
}
}
}
}
}
remove_hungup_and_drained_tubes(
&events,
&wait_ctx,
&mut control_tubes,
vm_control_ids_to_remove,
|token: &Token| {
if let Token::VmControl { id } = token {
return Some(*id);
}
None
},
)?;
if events
.iter()
.any(|e| e.is_hungup && !e.is_readable && matches!(e.token, Token::HandlerControl))
{
error!("vm memory handler control hung up but did not request an exit.");
break 'wait;
}
}
Ok(())
}
/// When control tubes hang up, we want to make sure that we've fully drained
/// the underlying socket before removing it. This function also handles
/// removing closed sockets in such a way that avoids phantom events.
///
/// `events` is the set of events returned by the latest wait on `wait_ctx`, `tubes` maps tube ids
/// to the tubes registered in `wait_ctx`, and `get_tube_id` extracts the tube id from an event
/// token (returning `None` for tokens that do not belong to a tube in `tubes`).
///
/// `tube_ids_to_remove` is the set of ids that we already know should
/// be removed (e.g. from getting a disconnect error on read). It may contain duplicates and ids
/// that are not present in `tubes`; both are tolerated.
///
/// # Errors
///
/// Returns an error if a tube's read notifier cannot be deleted from `wait_ctx`.
fn remove_hungup_and_drained_tubes<T, U>(
    events: &SmallVec<[TriggeredEvent<T>; 16]>,
    wait_ctx: &WaitContext<T>,
    tubes: &mut BTreeMap<usize, U>,
    mut tube_ids_to_remove: Vec<usize>,
    get_tube_id: fn(token: &T) -> Option<usize>,
) -> anyhow::Result<()>
where
    T: EventToken,
    U: ReadNotifier,
{
    // It's possible more data is readable and buffered while the socket is hungup,
    // so don't delete the tube from the poll context until we're sure all the
    // data is read.
    // Below case covers a condition where we have received a hungup event and the tube is not
    // readable.
    // In case of readable tube, once all data is read, any attempt to read more data on hungup
    // tube should fail. On such failure, we get Disconnected error and ids gets added to
    // tube_ids_to_remove by the time we reach here.
    tube_ids_to_remove.extend(
        events
            .iter()
            .filter(|e| e.is_hungup && !e.is_readable)
            .filter_map(|e| get_tube_id(&e.token)),
    );

    // `dedup` only removes *adjacent* duplicates, so the ids must be sorted first for every
    // duplicate to be eliminated.
    tube_ids_to_remove.sort_unstable();
    tube_ids_to_remove.dedup();

    for id in tube_ids_to_remove {
        // Delete the socket from the `wait_ctx` synchronously. Otherwise, the kernel will do
        // this automatically when the FD inserted into the `wait_ctx` is closed after this
        // if-block, but this removal can be deferred unpredictably. In some instances where the
        // system is under heavy load, we can even get events returned by `wait_ctx` for an FD
        // that has already been closed. Because the token associated with that spurious event
        // now belongs to a different socket, the control loop will start to interact with
        // sockets that might not be ready to use. This can cause incorrect hangup detection or
        // blocking on a socket that will never be ready. See also: crbug.com/1019986
        if let Some(socket) = tubes.remove(&id) {
            wait_ctx
                .delete(socket.get_read_notifier())
                .context("failed to remove descriptor from wait context")?;
        }
    }
    Ok(())
}
/// Start and jail a vhost-user device according to its configuration and a vhost listener string.
///
/// The device and its listener are created in the calling process, which then forks through a
/// minijail. The child runs the device loop and exits the process with status 0 on success or 1 on
/// error; it never returns from this function.
///
/// The jailing business is nasty and potentially unsafe if done from the wrong context - do not
/// call outside of `start_devices`!
///
/// Returns the pid of the jailed device process together with the listener's parent-process
/// resources (if any), which the caller is expected to hold on to and drop itself.
///
/// # Errors
///
/// Returns an error if the jail, the device, the listener or the executor cannot be created, or if
/// forking fails.
fn jail_and_start_vu_device<T: VirtioDeviceBuilder>(
    jail_config: &Option<JailConfig>,
    params: T,
    vhost: &str,
    name: &str,
) -> anyhow::Result<(libc::pid_t, Option<Box<dyn std::any::Any>>)> {
    // Descriptors that must stay open in the child process.
    let mut keep_rds = Vec::new();
    base::syslog::push_descriptors(&mut keep_rds);
    cros_tracing::push_descriptors!(&mut keep_rds);
    metrics::push_descriptors(&mut keep_rds);
    let jail_type = VirtioDeviceType::VhostUser;
    // Create a jail from the configuration. If the configuration is `None`, `create_jail` will also
    // return `None` so fall back to an empty (i.e. non-constrained) Minijail.
    let jail = match params
        .create_jail(jail_config, jail_type)
        .with_context(|| format!("failed to create jail for {}", name))?
    {
        Some(jail) => jail,
        None => Minijail::new()
            .with_context(|| format!("failed to create empty jail for {}", name))?,
    };
    // Create the device in the parent process, so the child does not need any privileges necessary
    // to do it (only runtime capabilities are required).
    let device = params
        .create_vhost_user_device(&mut keep_rds)
        .context("failed to create vhost-user device")?;
    let mut listener = VhostUserListener::new(vhost, Some(&mut keep_rds))
        .context("failed to create the vhost listener")?;
    let parent_resources = listener.take_parent_process_resources();
    // Executor must be created before jail in order to prevent the jailed process from creating
    // unrestricted io_urings.
    let ex = Executor::new().context("Failed to create an Executor")?;
    keep_rds.extend(ex.as_raw_descriptors());
    // Deduplicate the FDs since minijail expects them to be unique.
    keep_rds.sort_unstable();
    keep_rds.dedup();
    // SAFETY:
    // Safe because we are keeping all the descriptors needed for the child to function.
    match unsafe { jail.fork(Some(&keep_rds)).context("error while forking")? } {
        0 => {
            // In the child process.
            // Free memory for the resources managed by the parent, without running drop() on them.
            // The parent will do it as we exit.
            let _ = std::mem::ManuallyDrop::new(parent_resources);
            // Make sure the child process does not survive its parent.
            // SAFETY: trivially safe
            if unsafe { libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGKILL) } < 0 {
                panic!("call to prctl(PR_SET_PDEATHSIG, SIGKILL) failed. Aborting child process.");
            }
            // Set the name for the thread.
            const MAX_LEN: usize = 15; // pthread_setname_np() limit on Linux
            let debug_label_trimmed = &name.as_bytes()[..std::cmp::min(MAX_LEN, name.len())];
            let thread_name = CString::new(debug_label_trimmed).unwrap();
            // SAFETY:
            // Safe because we trimmed the name to 15 characters (and pthread_setname_np will return
            // an error if we don't anyway).
            let _ = unsafe { libc::pthread_setname_np(libc::pthread_self(), thread_name.as_ptr()) };
            // Run the device loop and terminate the child process once it exits.
            let res = match listener.run_device(ex, device) {
                Ok(()) => 0,
                Err(e) => {
                    error!("error while running device {}: {:#}", name, e);
                    1
                }
            };
            // SAFETY: trivially safe
            unsafe { libc::exit(res) };
        }
        pid => {
            // In the parent process. We will drop the device and listener when exiting this method.
            // This is fine as ownership for both has been transferred to the child process and they
            // will keep living there. We just retain `parent_resources` for things we are supposed
            // to clean up ourselves.
            info!("process for device {} (PID {}) started", &name, pid);
            #[cfg(feature = "seccomp_trace")]
            debug!(
                "seccomp_trace {{\"event\": \"minijail_fork\", \"pid\": {}, \"name\": \"{}\", \"jail_addr\": \"0x{:x}\"}}",
                pid,
                &name,
                read_jail_addr(&jail)
            );
            Ok((pid, parent_resources))
        }
    }
}
fn process_vhost_user_control_request(tube: Tube, disk_host_tubes: &[Tube]) -> Result<()> {
let command = tube
.recv::<VmRequest>()
.context("failed to receive VmRequest")?;
let resp = match command {
VmRequest::DiskCommand {
disk_index,
ref command,
} => match &disk_host_tubes.get(disk_index) {
Some(tube) => handle_disk_command(command, tube),
None => VmResponse::Err(base::Error::new(libc::ENODEV)),
},
request => {
error!(
"Request {:?} currently not supported in vhost user backend",
request
);
VmResponse::Err(base::Error::new(libc::EPERM))
}
};
tube.send(&resp).context("failed to send VmResponse")?;
Ok(())
}
/// Serves control connections for the vhost-user device processes.
///
/// Accepts connections on `control_server_socket` in a loop and processes one request per
/// connection via `process_vhost_user_control_request`. Failures to accept a connection or to
/// process a request are logged and the loop keeps going; a failure to wrap an accepted socket in
/// a `Tube` is logged and ends the server.
fn start_vhost_user_control_server(
    control_server_socket: UnlinkUnixSeqpacketListener,
    disk_host_tubes: Vec<Tube>,
) {
    info!("Start vhost-user control server");
    loop {
        let socket = match control_server_socket.accept() {
            Ok(socket) => socket,
            Err(e) => {
                error!("failed to establish connection: {}", e);
                continue;
            }
        };

        let tube = match Tube::new_from_unix_seqpacket(socket) {
            Ok(tube) => tube,
            Err(e) => {
                error!("failed to open tube: {:#}", e);
                return;
            }
        };

        if let Err(e) = process_vhost_user_control_request(tube, &disk_host_tubes) {
            error!("failed to process control request: {:#}", e);
        }
    }
}
pub fn start_devices(opts: DevicesCommand) -> anyhow::Result<()> {
if let Some(async_executor) = opts.async_executor {
Executor::set_default_executor_kind(async_executor)
.context("Failed to set the default async executor")?;
}
struct DeviceJailInfo {
// Unique name for the device, in the form `foomatic-0`.
name: String,
_drop_resources: Option<Box<dyn std::any::Any>>,
}
fn add_device<T: VirtioDeviceBuilder>(
i: usize,
device_params: T,
vhost: &str,
jail_config: &Option<JailConfig>,
devices_jails: &mut BTreeMap<libc::pid_t, DeviceJailInfo>,
) -> anyhow::Result<()> {
let name = format!("{}-{}", T::NAME, i);
let (pid, _drop_resources) =
jail_and_start_vu_device::<T>(jail_config, device_params, vhost, &name)?;
devices_jails.insert(
pid,
DeviceJailInfo {
name,
_drop_resources,
},
);
Ok(())
}
let mut devices_jails: BTreeMap<libc::pid_t, DeviceJailInfo> = BTreeMap::new();
let jail = if opts.disable_sandbox {
None
} else {
Some(opts.jail)
};
// Create control server socket
let control_server_socket = opts.control_socket.map(|path| {
UnlinkUnixSeqpacketListener(
UnixSeqpacketListener::bind(path).expect("Could not bind socket"),
)
});
// Create serial devices.
for (i, params) in opts.serial.iter().enumerate() {
let serial_config = &params.device;
add_device(i, serial_config, &params.vhost, &jail, &mut devices_jails)?;
}
let mut disk_host_tubes = Vec::new();
let control_socket_exists = control_server_socket.is_some();
// Create block devices.
for (i, params) in opts.block.iter().enumerate() {
let tube = if control_socket_exists {
let (host_tube, device_tube) = Tube::pair().context("failed to create tube")?;
disk_host_tubes.push(host_tube);
Some(device_tube)
} else {
None
};
let disk_config = DiskConfig::new(&params.device, tube);
add_device(i, disk_config, &params.vhost, &jail, &mut devices_jails)?;
}
// Create vsock devices.
for (i, params) in opts.vsock.iter().enumerate() {
add_device(i, &params.device, &params.vhost, &jail, &mut devices_jails)?;
}
// Create network devices.
#[cfg(feature = "net")]
for (i, params) in opts.net.iter().enumerate() {
add_device(i, &params.device, &params.vhost, &jail, &mut devices_jails)?;
}
// No device created, that's probably not intended - print the help in that case.
if devices_jails.is_empty() {
let err = DevicesCommand::from_args(
&[&std::env::args().next().unwrap_or(String::from("crosvm"))],
&["--help"],
)
.unwrap_err();
println!("{}", err.output);
return Ok(());
}
let ex = Executor::new()?;
if let Some(control_server_socket) = control_server_socket {
// Start the control server in the parent process.
ex.spawn_blocking(move || {
start_vhost_user_control_server(control_server_socket, disk_host_tubes)
})
.detach();
}
// Now wait for all device processes to return.
while !devices_jails.is_empty() {
match base::linux::wait_for_pid(-1, 0) {
Err(e) => panic!("error waiting for child process to complete: {:#}", e),
Ok((Some(pid), wait_status)) => match devices_jails.remove_entry(&pid) {
Some((_, info)) => {
if let Some(status) = wait_status.code() {
info!(
"process for device {} (PID {}) exited with code {}",
&info.name, pid, status
);
} else if let Some(signal) = wait_status.signal() {
warn!(
"process for device {} (PID {}) has been killed by signal {:?}",
&info.name, pid, signal,
);
}
}
None => error!("pid {} is not one of our device processes", pid),
},
// `wait_for_pid` will necessarily return a PID because we asked to it wait for one to
// complete.
Ok((None, _)) => unreachable!(),
}
}
info!("all device processes have exited");
Ok(())
}
/// Sets up crash reporting for the emulator process.
///
/// Each process MUST provide a unique `product_type` to avoid making crash reports
/// incomprehensible; this one registers as `"emulator"` and leaves every other attribute unset.
#[cfg(feature = "crash-report")]
pub fn setup_emulator_crash_reporting(_cfg: &Config) -> anyhow::Result<String> {
    let attributes = crash_report::CrashReportAttributes {
        product_type: String::from("emulator"),
        pipe_name: None,
        report_uuid: None,
        product_name: None,
        product_version: None,
    };
    crash_report::setup_crash_reporting(attributes)
}
#[cfg(test)]
mod tests {
    use std::path::PathBuf;

    use super::*;

    /// Builds file-backed mapping parameters that start at `address` and span `size` bytes.
    ///
    /// All remaining fields get fixed values: an empty path, a zero offset, and every flag
    /// switched off.
    fn test_file_backed_mapping(address: u64, size: u64) -> FileBackedMappingParameters {
        FileBackedMappingParameters {
            path: PathBuf::new(),
            address,
            size,
            offset: 0,
            writable: false,
            sync: false,
            align: false,
        }
    }

    /// Checks how `punch_holes_in_guest_mem_layout_for_mappings` reshapes a two-region guest
    /// memory layout for file-backed mappings placed at various positions relative to it.
    #[test]
    fn guest_mem_file_backed_mappings_overlap() {
        // Every case starts from the same layout: a low region at address 0 and a high region
        // at 4 GiB. These closures rebuild the untouched regions on demand.
        let low_region = || (GuestAddress(0), 0xD000_0000, Default::default());
        let high_region = || (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default());
        let full_layout = || vec![low_region(), high_region()];

        // Base case: without any file mappings the layout must come back unchanged.
        let punched = punch_holes_in_guest_mem_layout_for_mappings(full_layout(), &[]);
        assert_eq!(punched, full_layout());

        // A mapping that touches no guest memory region leaves the layout unchanged.
        let punched = punch_holes_in_guest_mem_layout_for_mappings(
            full_layout(),
            &[test_file_backed_mapping(0xD000_0000, 0x1000)],
        );
        assert_eq!(punched, full_layout());

        // Mapping at the very start of the low region: the region's start moves up.
        let punched = punch_holes_in_guest_mem_layout_for_mappings(
            full_layout(),
            &[test_file_backed_mapping(0, 0x2000)],
        );
        assert_eq!(
            punched,
            vec![
                (
                    GuestAddress(0x2000),
                    0xD000_0000 - 0x2000,
                    Default::default()
                ),
                high_region(),
            ]
        );

        // Mapping at the very end of the low region: the region is shortened.
        let punched = punch_holes_in_guest_mem_layout_for_mappings(
            full_layout(),
            &[test_file_backed_mapping(0xD000_0000 - 0x2000, 0x2000)],
        );
        assert_eq!(
            punched,
            vec![
                (GuestAddress(0), 0xD000_0000 - 0x2000, Default::default()),
                high_region(),
            ]
        );

        // Mapping strictly inside the low region: the region is split in two.
        let punched = punch_holes_in_guest_mem_layout_for_mappings(
            full_layout(),
            &[test_file_backed_mapping(0x1000, 0x2000)],
        );
        assert_eq!(
            punched,
            vec![
                (GuestAddress(0), 0x1000, Default::default()),
                (
                    GuestAddress(0x3000),
                    0xD000_0000 - 0x3000,
                    Default::default()
                ),
                high_region(),
            ]
        );

        // Mapping at the very start of the high region: the region's start moves up.
        let punched = punch_holes_in_guest_mem_layout_for_mappings(
            full_layout(),
            &[test_file_backed_mapping(0x1_0000_0000, 0x2000)],
        );
        assert_eq!(
            punched,
            vec![
                low_region(),
                (
                    GuestAddress(0x1_0000_2000),
                    0x8_0000 - 0x2000,
                    Default::default()
                ),
            ]
        );

        // Mapping at the very end of the high region: the region is shortened.
        let punched = punch_holes_in_guest_mem_layout_for_mappings(
            full_layout(),
            &[test_file_backed_mapping(0x1_0008_0000 - 0x2000, 0x2000)],
        );
        assert_eq!(
            punched,
            vec![
                low_region(),
                (
                    GuestAddress(0x1_0000_0000),
                    0x8_0000 - 0x2000,
                    Default::default()
                ),
            ]
        );

        // Mapping strictly inside the high region: the region is split in two.
        let punched = punch_holes_in_guest_mem_layout_for_mappings(
            full_layout(),
            &[test_file_backed_mapping(0x1_0000_1000, 0x2000)],
        );
        assert_eq!(
            punched,
            vec![
                low_region(),
                (GuestAddress(0x1_0000_0000), 0x1000, Default::default()),
                (
                    GuestAddress(0x1_0000_3000),
                    0x8_0000 - 0x3000,
                    Default::default()
                ),
            ]
        );

        // Mapping that spans from inside the low region into the high region: both regions
        // are trimmed.
        let punched = punch_holes_in_guest_mem_layout_for_mappings(
            full_layout(),
            &[test_file_backed_mapping(0xA000_0000, 0x6000_2000)],
        );
        assert_eq!(
            punched,
            vec![
                (GuestAddress(0), 0xA000_0000, Default::default()),
                (
                    GuestAddress(0x1_0000_2000),
                    0x8_0000 - 0x2000,
                    Default::default()
                ),
            ]
        );
    }
}