// Copyright 2022 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// TODO(b:240716507): There is a huge chunk of code that depends on haxm, whpx, or gvm being
// enabled, but isn't marked as such. Remove this allow once it is.
#![allow(dead_code, unused_imports, unused_variables, unreachable_code)]
pub(crate) mod control_server;
pub(crate) mod irq_wait;
pub(crate) mod main;
#[cfg(not(feature = "crash-report"))]
mod panic_hook;
mod generic;
use generic as product;
pub(crate) mod run_vcpu;
#[cfg(feature = "whpx")]
use std::arch::x86_64::__cpuid;
#[cfg(feature = "whpx")]
use std::arch::x86_64::__cpuid_count;
use std::cmp::Reverse;
use std::collections::BTreeMap;
use std::collections::HashMap;
use std::fs::File;
use std::fs::OpenOptions;
use std::io::stdin;
use std::iter;
use std::mem;
use std::os::windows::fs::OpenOptionsExt;
use std::path::PathBuf;
use std::sync::mpsc;
use std::sync::Arc;
#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
use aarch64::AArch64 as Arch;
use acpi_tables::sdt::SDT;
use anyhow::anyhow;
use anyhow::bail;
use anyhow::Context;
use anyhow::Result;
use arch::CpuConfigArch;
use arch::DtbOverlay;
use arch::IrqChipArch;
use arch::LinuxArch;
use arch::RunnableLinuxVm;
use arch::VcpuArch;
use arch::VirtioDeviceStub;
use arch::VmArch;
use arch::VmComponents;
use arch::VmImage;
use base::enable_high_res_timers;
use base::error;
use base::info;
use base::open_file_or_duplicate;
use base::warn;
use base::AsRawDescriptor;
#[cfg(feature = "gpu")]
use base::BlockingMode;
use base::CloseNotifier;
use base::Event;
use base::EventToken;
use base::EventType;
use base::FlushOnDropTube;
#[cfg(feature = "gpu")]
use base::FramingMode;
use base::FromRawDescriptor;
use base::ProtoTube;
use base::RawDescriptor;
use base::ReadNotifier;
use base::RecvTube;
use base::SendTube;
#[cfg(feature = "gpu")]
use base::StreamChannel;
use base::Terminal;
use base::TriggeredEvent;
use base::Tube;
use base::TubeError;
use base::VmEventType;
use base::WaitContext;
use broker_ipc::common_child_setup;
use broker_ipc::CommonChildStartupArgs;
use control_server::ControlServer;
use crosvm_cli::sys::windows::exit::Exit;
use crosvm_cli::sys::windows::exit::ExitContext;
use crosvm_cli::sys::windows::exit::ExitContextAnyhow;
use crosvm_cli::sys::windows::exit::ExitContextOption;
use devices::create_devices_worker_thread;
use devices::serial_device::SerialHardware;
use devices::serial_device::SerialParameters;
use devices::tsc::get_tsc_sync_mitigations;
use devices::tsc::standard_deviation;
use devices::tsc::TscSyncMitigations;
use devices::virtio;
use devices::virtio::block::DiskOption;
#[cfg(feature = "audio")]
use devices::virtio::snd::common_backend::VirtioSnd;
#[cfg(feature = "audio")]
use devices::virtio::snd::parameters::Parameters as SndParameters;
#[cfg(feature = "gpu")]
use devices::virtio::vhost::user::device::gpu::sys::windows::GpuVmmConfig;
#[cfg(feature = "gpu")]
use devices::virtio::vhost::user::device::gpu::sys::windows::InputEventSplitConfig;
#[cfg(feature = "gpu")]
use devices::virtio::vhost::user::device::gpu::sys::windows::InputEventVmmConfig;
#[cfg(feature = "gpu")]
use devices::virtio::vhost::user::gpu::sys::windows::product::GpuBackendConfig as GpuBackendConfigProduct;
#[cfg(feature = "gpu")]
use devices::virtio::vhost::user::gpu::sys::windows::run_gpu_device_worker;
#[cfg(feature = "audio")]
use devices::virtio::vhost::user::snd::sys::windows::product::SndBackendConfig as SndBackendConfigProduct;
#[cfg(feature = "balloon")]
use devices::virtio::BalloonFeatures;
#[cfg(feature = "balloon")]
use devices::virtio::BalloonMode;
use devices::virtio::Console;
#[cfg(feature = "gpu")]
use devices::virtio::GpuParameters;
use devices::BusDeviceObj;
#[cfg(feature = "gvm")]
use devices::GvmIrqChip;
#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
use devices::IrqChip;
use devices::UserspaceIrqChip;
use devices::VcpuRunState;
use devices::VirtioPciDevice;
#[cfg(feature = "whpx")]
use devices::WhpxSplitIrqChip;
#[cfg(feature = "gpu")]
use gpu_display::EventDevice;
#[cfg(feature = "gpu")]
use gpu_display::WindowProcedureThread;
#[cfg(feature = "gpu")]
use gpu_display::WindowProcedureThreadBuilder;
#[cfg(feature = "gvm")]
use hypervisor::gvm::Gvm;
#[cfg(feature = "gvm")]
use hypervisor::gvm::GvmVcpu;
#[cfg(feature = "gvm")]
use hypervisor::gvm::GvmVersion;
#[cfg(feature = "gvm")]
use hypervisor::gvm::GvmVm;
#[cfg(feature = "haxm")]
use hypervisor::haxm::get_use_ghaxm;
#[cfg(feature = "haxm")]
use hypervisor::haxm::set_use_ghaxm;
#[cfg(feature = "haxm")]
use hypervisor::haxm::Haxm;
#[cfg(feature = "haxm")]
use hypervisor::haxm::HaxmVcpu;
#[cfg(feature = "haxm")]
use hypervisor::haxm::HaxmVm;
#[cfg(feature = "whpx")]
use hypervisor::whpx::Whpx;
#[cfg(feature = "whpx")]
use hypervisor::whpx::WhpxFeature;
#[cfg(feature = "whpx")]
use hypervisor::whpx::WhpxVcpu;
#[cfg(feature = "whpx")]
use hypervisor::whpx::WhpxVm;
use hypervisor::Hypervisor;
#[cfg(feature = "whpx")]
use hypervisor::HypervisorCap;
#[cfg(feature = "whpx")]
use hypervisor::HypervisorX86_64;
use hypervisor::ProtectionType;
use hypervisor::Vm;
use irq_wait::IrqWaitWorker;
use jail::FakeMinijailStub as Minijail;
#[cfg(not(feature = "crash-report"))]
pub(crate) use panic_hook::set_panic_hook;
use product::create_snd_mute_tube_pair;
#[cfg(any(feature = "haxm", feature = "gvm", feature = "whpx"))]
use product::create_snd_state_tube;
use product::handle_pvclock_request;
use product::merge_session_invariants;
use product::run_ime_thread;
use product::set_package_name;
pub(crate) use product::setup_metrics_reporting;
use product::start_service_ipc_listener;
use product::RunControlArgs;
use product::ServiceVmState;
use product::Token;
use resources::SystemAllocator;
use run_vcpu::run_all_vcpus;
use run_vcpu::VcpuRunMode;
use rutabaga_gfx::RutabagaGralloc;
use smallvec::SmallVec;
use sync::Mutex;
use tube_transporter::TubeToken;
use tube_transporter::TubeTransporterReader;
use vm_control::api::VmMemoryClient;
#[cfg(feature = "balloon")]
use vm_control::BalloonControlCommand;
#[cfg(feature = "balloon")]
use vm_control::BalloonTube;
use vm_control::DeviceControlCommand;
use vm_control::IrqHandlerRequest;
use vm_control::PvClockCommand;
use vm_control::VcpuControl;
use vm_control::VmMemoryRegionState;
use vm_control::VmMemoryRequest;
use vm_control::VmRequest;
use vm_control::VmResponse;
use vm_control::VmRunMode;
use vm_memory::GuestAddress;
use vm_memory::GuestMemory;
use win_util::ProcessType;
#[cfg(feature = "whpx")]
use x86_64::cpuid::adjust_cpuid;
#[cfg(feature = "whpx")]
use x86_64::cpuid::CpuIdContext;
#[cfg(all(target_arch = "x86_64", feature = "haxm"))]
use x86_64::get_cpu_manufacturer;
#[cfg(all(target_arch = "x86_64", feature = "haxm"))]
use x86_64::CpuManufacturer;
#[cfg(target_arch = "x86_64")]
use x86_64::X8664arch as Arch;
use crate::crosvm::config::Config;
use crate::crosvm::config::Executable;
use crate::crosvm::config::InputDeviceOption;
#[cfg(any(feature = "gvm", feature = "whpx"))]
use crate::crosvm::config::IrqChipKind;
#[cfg(feature = "gpu")]
use crate::crosvm::config::TouchDeviceOption;
use crate::crosvm::config::DEFAULT_TOUCH_DEVICE_HEIGHT;
use crate::crosvm::config::DEFAULT_TOUCH_DEVICE_WIDTH;
use crate::crosvm::sys::config::HypervisorKind;
use crate::crosvm::sys::windows::broker::BrokerTubes;
#[cfg(feature = "stats")]
use crate::crosvm::sys::windows::stats::StatisticsCollector;
#[cfg(feature = "gpu")]
pub(crate) use crate::sys::windows::product::get_gpu_product_configs;
#[cfg(feature = "audio")]
pub(crate) use crate::sys::windows::product::get_snd_product_configs;
#[cfg(feature = "gpu")]
pub(crate) use crate::sys::windows::product::get_window_procedure_thread_product_configs;
use crate::sys::windows::product::log_descriptor;
#[cfg(feature = "audio")]
pub(crate) use crate::sys::windows::product::num_input_sound_devices;
#[cfg(feature = "audio")]
pub(crate) use crate::sys::windows::product::num_input_sound_streams;
use crate::sys::windows::product::spawn_anti_tamper_thread;
use crate::sys::windows::product::MetricEventType;
const DEFAULT_GUEST_CID: u64 = 3;
// By default, if enabled, the balloon working set (WS) feature will use 4 bins.
const VIRTIO_BALLOON_WS_DEFAULT_NUM_BINS: u8 = 4;
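/// A control tube serviced by the main run loop, tagged by its origin so that incoming messages
/// can be routed to the correct handler.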
enum TaggedControlTube {
Vm(FlushOnDropTube),
Product(product::TaggedControlTube),
}
impl ReadNotifier for TaggedControlTube {
fn get_read_notifier(&self) -> &dyn AsRawDescriptor {
match self {
Self::Vm(tube) => tube.0.get_read_notifier(),
Self::Product(tube) => tube.get_read_notifier(),
}
}
}
impl CloseNotifier for TaggedControlTube {
fn get_close_notifier(&self) -> &dyn AsRawDescriptor {
match self {
Self::Vm(tube) => tube.0.get_close_notifier(),
Self::Product(tube) => tube.get_close_notifier(),
}
}
}
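/// The reason the VM stopped running. Returned by `run_control` when the main loop exits.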
pub enum ExitState {
Reset,
Stop,
Crash,
#[allow(dead_code)]
GuestPanic,
WatchdogReset,
}
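/// Result type used by the device creation helpers below; defaults to producing a
/// `VirtioDeviceStub`.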
type DeviceResult<T = VirtioDeviceStub> = Result<T>;
fn create_vhost_user_block_device(cfg: &Config, disk_device_tube: Tube) -> DeviceResult {
let dev = virtio::VhostUserFrontend::new(
virtio::DeviceType::Block,
virtio::base_features(cfg.protection_type),
disk_device_tube,
None,
None,
)
.exit_context(
Exit::VhostUserBlockDeviceNew,
"failed to set up vhost-user block device",
)?;
Ok(VirtioDeviceStub {
dev: Box::new(dev),
jail: None,
})
}
fn create_block_device(cfg: &Config, disk: &DiskOption, disk_device_tube: Tube) -> DeviceResult {
let features = virtio::base_features(cfg.protection_type);
let dev = virtio::BlockAsync::new(
features,
disk.open()?,
disk,
Some(disk_device_tube),
None,
None,
)
.exit_context(Exit::BlockDeviceNew, "failed to create block device")?;
Ok(VirtioDeviceStub {
dev: Box::new(dev),
jail: None,
})
}
#[cfg(feature = "gpu")]
fn create_vhost_user_gpu_device(base_features: u64, vhost_user_tube: Tube) -> DeviceResult {
let dev = virtio::VhostUserFrontend::new(
virtio::DeviceType::Gpu,
base_features,
vhost_user_tube,
None,
None,
)
.exit_context(
Exit::VhostUserGpuDeviceNew,
"failed to set up vhost-user gpu device",
)?;
Ok(VirtioDeviceStub {
dev: Box::new(dev),
jail: None,
})
}
#[cfg(feature = "audio")]
fn create_snd_device(
cfg: &Config,
parameters: SndParameters,
_product_args: SndBackendConfigProduct,
) -> DeviceResult {
let features = virtio::base_features(cfg.protection_type);
let dev = VirtioSnd::new(features, parameters)
.exit_context(Exit::VirtioSoundDeviceNew, "failed to create snd device")?;
Ok(VirtioDeviceStub {
dev: Box::new(dev),
jail: None,
})
}
#[cfg(feature = "audio")]
fn create_vhost_user_snd_device(base_features: u64, vhost_user_tube: Tube) -> DeviceResult {
let dev = virtio::VhostUserFrontend::new(
virtio::DeviceType::Sound,
base_features,
vhost_user_tube,
None,
None,
)
.exit_context(
Exit::VhostUserSndDeviceNew,
"failed to set up vhost-user snd device",
)?;
Ok(VirtioDeviceStub {
dev: Box::new(dev),
jail: None,
})
}
#[cfg(feature = "gpu")]
fn create_multi_touch_device(
cfg: &Config,
event_pipe: StreamChannel,
width: u32,
height: u32,
name: Option<&str>,
idx: u32,
) -> DeviceResult {
let dev = virtio::input::new_multi_touch(
idx,
event_pipe,
width,
height,
name,
virtio::base_features(cfg.protection_type),
)
.exit_context(Exit::InputDeviceNew, "failed to set up input device")?;
Ok(VirtioDeviceStub {
dev: Box::new(dev),
jail: None,
})
}
#[cfg(feature = "gpu")]
fn create_mouse_device(cfg: &Config, event_pipe: StreamChannel, idx: u32) -> DeviceResult {
let dev = virtio::input::new_mouse(idx, event_pipe, virtio::base_features(cfg.protection_type))
.exit_context(Exit::InputDeviceNew, "failed to set up input device")?;
Ok(VirtioDeviceStub {
dev: Box::new(dev),
jail: None,
})
}
#[cfg(feature = "slirp")]
fn create_vhost_user_net_device(cfg: &Config, net_device_tube: Tube) -> DeviceResult {
let features = virtio::base_features(cfg.protection_type);
let dev = virtio::VhostUserFrontend::new(
virtio::DeviceType::Net,
features,
net_device_tube,
None,
None,
)
.exit_context(
Exit::VhostUserNetDeviceNew,
"failed to set up vhost-user net device",
)?;
Ok(VirtioDeviceStub {
dev: Box::new(dev),
jail: None,
})
}
fn create_rng_device(cfg: &Config) -> DeviceResult {
let dev = virtio::Rng::new(virtio::base_features(cfg.protection_type))
.exit_context(Exit::RngDeviceNew, "failed to set up rng")?;
Ok(VirtioDeviceStub {
dev: Box::new(dev),
jail: None,
})
}
fn create_console_device(cfg: &Config, param: &SerialParameters) -> DeviceResult {
let mut keep_rds = Vec::new();
let evt = Event::new().exit_context(Exit::CreateEvent, "failed to create event")?;
let dev = param
.create_serial_device::<Console>(cfg.protection_type, &evt, &mut keep_rds)
.exit_context(Exit::CreateConsole, "failed to create console device")?;
Ok(VirtioDeviceStub {
dev: Box::new(dev),
jail: None,
})
}
#[cfg(feature = "balloon")]
fn create_balloon_device(
cfg: &Config,
balloon_device_tube: Tube,
dynamic_mapping_device_tube: Tube,
inflate_tube: Option<Tube>,
init_balloon_size: u64,
) -> DeviceResult {
let balloon_features =
(cfg.balloon_page_reporting as u64) << BalloonFeatures::PageReporting as u64;
let dev = virtio::Balloon::new(
virtio::base_features(cfg.protection_type),
balloon_device_tube,
VmMemoryClient::new(dynamic_mapping_device_tube),
inflate_tube,
init_balloon_size,
if cfg.strict_balloon {
BalloonMode::Strict
} else {
BalloonMode::Relaxed
},
balloon_features,
#[cfg(feature = "registered_events")]
None,
VIRTIO_BALLOON_WS_DEFAULT_NUM_BINS,
)
.exit_context(Exit::BalloonDeviceNew, "failed to create balloon")?;
Ok(VirtioDeviceStub {
dev: Box::new(dev),
jail: None,
})
}
fn create_vsock_device(cfg: &Config) -> DeviceResult {
// We only support a single guest, so we can confidently assign a default
// CID if one isn't provided. We choose the lowest non-reserved value.
let dev = virtio::vsock::Vsock::new(
cfg.vsock
.as_ref()
.map(|cfg| cfg.cid)
.unwrap_or(DEFAULT_GUEST_CID),
cfg.host_guid.clone(),
virtio::base_features(cfg.protection_type),
)
.exit_context(
Exit::UserspaceVsockDeviceNew,
"failed to create userspace vsock device",
)?;
Ok(VirtioDeviceStub {
dev: Box::new(dev),
jail: None,
})
}
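/// Creates all of the virtio devices requested by `cfg` (block, console, sound, pvclock, rng,
/// net, balloon, vsock, input, and GPU, as enabled), returning their stubs.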
fn create_virtio_devices(
cfg: &mut Config,
vm_evt_wrtube: &SendTube,
#[allow(clippy::ptr_arg)] control_tubes: &mut Vec<TaggedControlTube>,
disk_device_tubes: &mut Vec<Tube>,
balloon_device_tube: Option<Tube>,
pvclock_device_tube: Option<Tube>,
dynamic_mapping_device_tube: Option<Tube>,
inflate_tube: Option<Tube>,
init_balloon_size: u64,
tsc_frequency: u64,
virtio_snd_state_device_tube: Option<Tube>,
virtio_snd_control_device_tube: Option<Tube>,
) -> DeviceResult<Vec<VirtioDeviceStub>> {
let mut devs = Vec::new();
if cfg.block_vhost_user_tube.is_empty() {
// Disk devices must precede virtio-console devices or the kernel does not boot.
// TODO(b/171215421): figure out why this ordering is required and fix it.
for disk in &cfg.disks {
let disk_device_tube = disk_device_tubes.remove(0);
devs.push(create_block_device(cfg, disk, disk_device_tube)?);
}
} else {
info!("Starting up vhost user block backends...");
for _disk in &cfg.disks {
let disk_device_tube = cfg.block_vhost_user_tube.remove(0);
devs.push(create_vhost_user_block_device(cfg, disk_device_tube)?);
}
}
for (_, param) in cfg
.serial_parameters
.iter()
.filter(|(_k, v)| v.hardware == SerialHardware::VirtioConsole)
{
let dev = create_console_device(cfg, param)?;
devs.push(dev);
}
#[cfg(feature = "audio")]
if product::virtio_sound_enabled() {
let snd_split_config = cfg
.snd_split_config
.as_mut()
.expect("snd_split_config must exist");
let snd_vmm_config = snd_split_config
.vmm_config
.as_mut()
.expect("snd_vmm_config must exist");
product::push_snd_control_tubes(control_tubes, snd_vmm_config);
match snd_split_config.backend_config.take() {
None => {
// No backend config present means the backend is running in another process.
devs.push(create_vhost_user_snd_device(
virtio::base_features(cfg.protection_type),
snd_vmm_config
.main_vhost_user_tube
.take()
.expect("Snd VMM vhost-user tube should be set"),
)?);
}
Some(backend_config) => {
// Backend config present, so initialize Snd in this process.
devs.push(create_snd_device(
cfg,
backend_config.parameters,
backend_config.product_config,
)?);
}
}
}
if let Some(tube) = pvclock_device_tube {
product::push_pvclock_device(cfg, &mut devs, tsc_frequency, tube);
}
devs.push(create_rng_device(cfg)?);
#[cfg(feature = "slirp")]
if let Some(net_vhost_user_tube) = cfg.net_vhost_user_tube.take() {
devs.push(create_vhost_user_net_device(cfg, net_vhost_user_tube)?);
}
#[cfg(feature = "balloon")]
if let (Some(balloon_device_tube), Some(dynamic_mapping_device_tube)) =
(balloon_device_tube, dynamic_mapping_device_tube)
{
devs.push(create_balloon_device(
cfg,
balloon_device_tube,
dynamic_mapping_device_tube,
inflate_tube,
init_balloon_size,
)?);
}
devs.push(create_vsock_device(cfg)?);
#[cfg(feature = "gpu")]
let event_devices = if let Some(InputEventSplitConfig {
backend_config,
vmm_config,
}) = cfg.input_event_split_config.take()
{
devs.extend(
create_virtio_input_event_devices(cfg, vmm_config)
.context("create input event devices")?,
);
backend_config.map(|cfg| cfg.event_devices)
} else {
None
};
#[cfg(feature = "gpu")]
if let Some(wndproc_thread_vmm_config) = cfg
.window_procedure_thread_split_config
.as_mut()
.map(|split_cfg| &mut split_cfg.vmm_config)
{
product::push_window_procedure_thread_control_tubes(
control_tubes,
wndproc_thread_vmm_config,
);
}
#[cfg(feature = "gpu")]
let mut wndproc_thread = cfg
.window_procedure_thread_split_config
.as_mut()
.and_then(|cfg| cfg.wndproc_thread_builder.take())
.map(WindowProcedureThreadBuilder::start_thread)
.transpose()
.context("Failed to start the window procedure thread.")?;
#[cfg(feature = "gpu")]
if let Some(gpu_vmm_config) = cfg.gpu_vmm_config.take() {
devs.push(create_virtio_gpu_device(
cfg,
gpu_vmm_config,
event_devices,
&mut wndproc_thread,
control_tubes,
)?);
}
Ok(devs)
}
#[cfg(feature = "gpu")]
fn create_virtio_input_event_devices(
cfg: &Config,
mut input_event_vmm_config: InputEventVmmConfig,
) -> DeviceResult<Vec<VirtioDeviceStub>> {
let mut devs = Vec::new();
// Iterate event devices, create the VMM end.
let mut multi_touch_pipes = input_event_vmm_config
.multi_touch_pipes
.drain(..)
.enumerate();
for input in &cfg.virtio_input {
match input {
InputDeviceOption::SingleTouch { .. } => {
unimplemented!("--single-touch is no longer supported. Use --multi-touch instead.");
}
InputDeviceOption::MultiTouch {
width,
height,
name,
..
} => {
let Some((idx, pipe)) = multi_touch_pipes.next() else {
break;
};
let mut width = *width;
let mut height = *height;
if idx == 0 {
if width.is_none() {
width = cfg.display_input_width;
}
if height.is_none() {
height = cfg.display_input_height;
}
}
devs.push(create_multi_touch_device(
cfg,
pipe,
width.unwrap_or(DEFAULT_TOUCH_DEVICE_WIDTH),
height.unwrap_or(DEFAULT_TOUCH_DEVICE_HEIGHT),
name.as_deref(),
idx as u32,
)?);
}
_ => {}
}
}
drop(multi_touch_pipes);
product::push_mouse_device(cfg, &mut input_event_vmm_config, &mut devs)?;
for (idx, pipe) in input_event_vmm_config.mouse_pipes.drain(..).enumerate() {
devs.push(create_mouse_device(cfg, pipe, idx as u32)?);
}
let keyboard_pipe = input_event_vmm_config
.keyboard_pipes
.pop()
.expect("at least one keyboard should be in GPU VMM config");
let dev = virtio::input::new_keyboard(
/* idx= */ 0,
keyboard_pipe,
virtio::base_features(cfg.protection_type),
)
.exit_context(Exit::InputDeviceNew, "failed to set up input device")?;
devs.push(VirtioDeviceStub {
dev: Box::new(dev),
jail: None,
});
Ok(devs)
}
#[cfg(feature = "gpu")]
fn create_virtio_gpu_device(
cfg: &mut Config,
mut gpu_vmm_config: GpuVmmConfig,
event_devices: Option<Vec<EventDevice>>,
wndproc_thread: &mut Option<WindowProcedureThread>,
#[allow(clippy::ptr_arg)] control_tubes: &mut Vec<TaggedControlTube>,
) -> DeviceResult<VirtioDeviceStub> {
let resource_bridges = Vec::<Tube>::new();
product::push_gpu_control_tubes(control_tubes, &mut gpu_vmm_config);
// If the GPU backend is passed, start up the vhost-user worker in the main process.
if let Some(backend_config) = cfg.gpu_backend_config.take() {
let event_devices = event_devices.ok_or_else(|| {
anyhow!("event devices are missing when creating virtio-gpu in the current process.")
})?;
let wndproc_thread = wndproc_thread
.take()
.ok_or_else(|| anyhow!("Window procedure thread is missing."))?;
std::thread::spawn(move || {
run_gpu_device_worker(backend_config, event_devices, wndproc_thread)
});
}
// The GPU is always vhost-user, even if running in the main process.
create_vhost_user_gpu_device(
virtio::base_features(cfg.protection_type),
gpu_vmm_config
.main_vhost_user_tube
.take()
.expect("GPU VMM vhost-user tube should be set"),
)
.context("create vhost-user GPU device")
}
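/// Creates the virtio devices from `cfg` and wraps each one in a `VirtioPciDevice`, pairing it
/// with the MSI, ioevent, shared memory, and VM control tubes it needs.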
fn create_devices(
cfg: &mut Config,
mem: &GuestMemory,
exit_evt_wrtube: &SendTube,
irq_control_tubes: &mut Vec<Tube>,
vm_memory_control_tubes: &mut Vec<Tube>,
control_tubes: &mut Vec<TaggedControlTube>,
disk_device_tubes: &mut Vec<Tube>,
balloon_device_tube: Option<Tube>,
pvclock_device_tube: Option<Tube>,
dynamic_mapping_device_tube: Option<Tube>,
inflate_tube: Option<Tube>,
init_balloon_size: u64,
tsc_frequency: u64,
virtio_snd_state_device_tube: Option<Tube>,
virtio_snd_control_device_tube: Option<Tube>,
) -> DeviceResult<Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>> {
let stubs = create_virtio_devices(
cfg,
exit_evt_wrtube,
control_tubes,
disk_device_tubes,
balloon_device_tube,
pvclock_device_tube,
dynamic_mapping_device_tube,
inflate_tube,
init_balloon_size,
tsc_frequency,
virtio_snd_state_device_tube,
virtio_snd_control_device_tube,
)?;
let mut pci_devices = Vec::new();
for stub in stubs {
let (msi_host_tube, msi_device_tube) =
Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
irq_control_tubes.push(msi_host_tube);
let shared_memory_tube = if stub.dev.get_shared_memory_region().is_some() {
let (host_tube, device_tube) =
Tube::pair().context("failed to create VVU proxy tube")?;
vm_memory_control_tubes.push(host_tube);
Some(device_tube)
} else {
None
};
let (ioevent_host_tube, ioevent_device_tube) =
Tube::pair().context("failed to create ioevent tube")?;
vm_memory_control_tubes.push(ioevent_host_tube);
let (vm_control_host_tube, vm_control_device_tube) =
Tube::pair().context("failed to create vm_control tube")?;
control_tubes.push(TaggedControlTube::Vm(FlushOnDropTube::from(
vm_control_host_tube,
)));
let dev = Box::new(
VirtioPciDevice::new(
mem.clone(),
stub.dev,
msi_device_tube,
cfg.disable_virtio_intx,
shared_memory_tube.map(VmMemoryClient::new),
VmMemoryClient::new(ioevent_device_tube),
vm_control_device_tube,
)
.exit_context(Exit::VirtioPciDev, "failed to create virtio pci dev")?,
) as Box<dyn BusDeviceObj>;
pci_devices.push((dev, stub.jail));
}
Ok(pci_devices)
}
#[derive(Debug)]
struct PvClockError(String);
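/// Handles a single readable event from the main run loop's `WaitContext`.
///
/// Returns `Ok(Some(exit_state))` if the event requires the VM to stop running, or `Ok(None)` if
/// the run loop should continue.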
fn handle_readable_event<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
event: &TriggeredEvent<Token>,
vm_control_ids_to_remove: &mut Vec<usize>,
next_control_id: &mut usize,
service_vm_state: &mut ServiceVmState,
disk_host_tubes: &[Tube],
ipc_main_loop_tube: Option<&Tube>,
#[cfg(feature = "gpu")] gpu_control_tube: Option<&Tube>,
vm_evt_rdtube: &RecvTube,
control_tubes: &mut BTreeMap<usize, TaggedControlTube>,
guest_os: &mut RunnableLinuxVm<V, Vcpu>,
sys_allocator_mutex: &Arc<Mutex<SystemAllocator>>,
virtio_snd_host_mute_tube: &mut Option<Tube>,
proto_main_loop_tube: Option<&ProtoTube>,
anti_tamper_main_thread_tube: &Option<ProtoTube>,
#[cfg(feature = "balloon")] mut balloon_tube: Option<&mut BalloonTube>,
memory_size_mb: u64,
vcpu_boxes: &Mutex<Vec<Box<dyn VcpuArch>>>,
pvclock_host_tube: &Option<Tube>,
run_mode_arc: &VcpuRunMode,
region_state: &mut VmMemoryRegionState,
vm_control_server: Option<&mut ControlServer>,
irq_handler_control: &Tube,
device_ctrl_tube: &Tube,
wait_ctx: &WaitContext<Token>,
force_s2idle: bool,
vcpu_control_channels: &[mpsc::Sender<VcpuControl>],
) -> Result<Option<ExitState>> {
let execute_vm_request = |request: VmRequest, guest_os: &mut RunnableLinuxVm<V, Vcpu>| {
let mut run_mode_opt = None;
let vcpu_size = vcpu_boxes.lock().len();
let resp = request.execute(
&guest_os.vm,
&mut run_mode_opt,
disk_host_tubes,
&mut guest_os.pm,
#[cfg(feature = "gpu")]
gpu_control_tube,
#[cfg(not(feature = "gpu"))]
None,
None,
&mut None,
|msg| {
kick_all_vcpus(
run_mode_arc,
vcpu_control_channels,
vcpu_boxes,
guest_os.irq_chip.as_ref(),
pvclock_host_tube,
msg,
);
},
force_s2idle,
#[cfg(feature = "swap")]
None,
device_ctrl_tube,
vcpu_size,
irq_handler_control,
|| guest_os.irq_chip.as_ref().snapshot(vcpu_size),
);
(resp, run_mode_opt)
};
match event.token {
Token::VmEvent => match vm_evt_rdtube.recv::<VmEventType>() {
Ok(vm_event) => {
let exit_state = match vm_event {
VmEventType::Exit => {
info!("vcpu requested shutdown");
Some(ExitState::Stop)
}
VmEventType::Reset => {
info!("vcpu requested reset");
Some(ExitState::Reset)
}
VmEventType::Crash => {
info!("vcpu crashed");
Some(ExitState::Crash)
}
VmEventType::Panic(_) => {
error!("got pvpanic event. this event is not expected on Windows.");
None
}
VmEventType::WatchdogReset => {
info!("vcpu stall detected");
Some(ExitState::WatchdogReset)
}
};
return Ok(exit_state);
}
Err(e) => {
warn!("failed to recv VmEvent: {}", e);
}
},
Token::BrokerShutdown => {
info!("main loop got broker shutdown event");
return Ok(Some(ExitState::Stop));
}
Token::VmControlServer => {
let server =
vm_control_server.expect("control server must exist if this event triggers");
let client = server.accept();
let id = *next_control_id;
*next_control_id += 1;
wait_ctx
.add(client.0.get_read_notifier(), Token::VmControl { id })
.exit_context(
Exit::WaitContextAdd,
"failed to add trigger to wait context",
)?;
wait_ctx
.add(client.0.get_close_notifier(), Token::VmControl { id })
.exit_context(
Exit::WaitContextAdd,
"failed to add trigger to wait context",
)?;
control_tubes.insert(id, TaggedControlTube::Vm(client));
}
#[allow(clippy::collapsible_match)]
Token::VmControl { id } => {
if let Some(tube) = control_tubes.get(&id) {
#[allow(clippy::single_match)]
match tube {
TaggedControlTube::Product(product_tube) => {
product::handle_tagged_control_tube_event(
product_tube,
virtio_snd_host_mute_tube,
service_vm_state,
ipc_main_loop_tube,
)
}
TaggedControlTube::Vm(tube) => match tube.0.recv::<VmRequest>() {
Ok(request) => {
let mut run_mode_opt = None;
let response = match request {
VmRequest::HotPlugVfioCommand { device, add } => {
// Suppress warnings.
let _ = (device, add);
unimplemented!("not implemented on Windows");
}
#[cfg(feature = "registered_events")]
VmRequest::RegisterListener { socket_addr, event } => {
unimplemented!("not implemented on Windows");
}
#[cfg(feature = "registered_events")]
VmRequest::UnregisterListener { socket_addr, event } => {
unimplemented!("not implemented on Windows");
}
#[cfg(feature = "registered_events")]
VmRequest::Unregister { socket_addr } => {
unimplemented!("not implemented on Windows");
}
#[cfg(feature = "balloon")]
VmRequest::BalloonCommand(cmd) => {
if let Some(balloon_tube) = balloon_tube {
if let Some((r, key)) = balloon_tube.send_cmd(cmd, Some(id))
{
if key != id {
unimplemented!("not implemented on Windows");
}
Some(r)
} else {
None
}
} else {
error!("balloon not enabled");
None
}
}
_ => {
let (resp, run_mode_ret) =
execute_vm_request(request, guest_os);
run_mode_opt = run_mode_ret;
Some(resp)
}
};
if let Some(response) = response {
if let Err(e) = tube.0.send(&response) {
error!("failed to send VmResponse: {}", e);
}
}
if let Some(exit_state) =
handle_run_mode_change_for_vm_request(&run_mode_opt, guest_os)
{
return Ok(Some(exit_state));
}
}
Err(e) => {
if let TubeError::Disconnected = e {
vm_control_ids_to_remove.push(id);
} else {
error!("failed to recv VmRequest: {}", e);
}
}
},
}
}
}
#[cfg(feature = "balloon")]
Token::BalloonTube => match balloon_tube.as_mut().expect("missing balloon tube").recv() {
Ok(resp) => {
for (resp, idx) in resp {
if let Some(TaggedControlTube::Vm(tube)) = control_tubes.get(&idx) {
if let Err(e) = tube.0.send(&resp) {
error!("failed to send VmResponse: {}", e);
}
} else {
error!("Bad tube index {}", idx);
}
}
}
Err(err) => {
error!("Error processing balloon tube {:?}", err)
}
},
#[cfg(not(feature = "balloon"))]
Token::BalloonTube => unreachable!("balloon tube not registered"),
#[allow(unreachable_patterns)]
_ => {
let run_mode_opt = product::handle_received_token(
&event.token,
anti_tamper_main_thread_tube,
#[cfg(feature = "balloon")]
balloon_tube,
control_tubes,
guest_os,
ipc_main_loop_tube,
memory_size_mb,
proto_main_loop_tube,
pvclock_host_tube,
run_mode_arc,
service_vm_state,
vcpu_boxes,
virtio_snd_host_mute_tube,
execute_vm_request,
);
if let Some(exit_state) = handle_run_mode_change_for_vm_request(&run_mode_opt, guest_os)
{
return Ok(Some(exit_state));
}
}
};
Ok(None)
}
/// Handles a run mode change, if one is pending as a result of a VmRequest. The parameter,
/// run_mode_opt, is the run mode change proposed by the VmRequest's execution.
///
/// Returns the exit state if it changed due to the run mode change, or None otherwise.
fn handle_run_mode_change_for_vm_request<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
run_mode_opt: &Option<VmRunMode>,
guest_os: &mut RunnableLinuxVm<V, Vcpu>,
) -> Option<ExitState> {
if let Some(run_mode) = run_mode_opt {
info!("control socket changed run mode to {}", run_mode);
match run_mode {
VmRunMode::Exiting => return Some(ExitState::Stop),
other => {
if other == &VmRunMode::Running {
for dev in &guest_os.resume_notify_devices {
dev.lock().resume_imminent();
}
}
}
}
}
// No exit state change.
None
}
/// Commands to control the VM Memory handler thread.
#[derive(serde::Serialize, serde::Deserialize)]
pub enum VmMemoryHandlerRequest {
/// No response is sent for this command.
Exit,
}
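/// Worker thread that services `VmMemoryRequest`s arriving on `control_tubes` until it receives
/// `VmMemoryHandlerRequest::Exit` on `handler_control`.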
fn vm_memory_handler_thread(
control_tubes: Vec<Tube>,
mut vm: impl Vm,
sys_allocator_mutex: Arc<Mutex<SystemAllocator>>,
mut gralloc: RutabagaGralloc,
handler_control: Tube,
) -> anyhow::Result<()> {
#[derive(EventToken)]
enum Token {
VmControl { id: usize },
HandlerControl,
}
let wait_ctx =
WaitContext::build_with(&[(handler_control.get_read_notifier(), Token::HandlerControl)])
.context("failed to build wait context")?;
let mut control_tubes = BTreeMap::from_iter(control_tubes.into_iter().enumerate());
for (id, socket) in control_tubes.iter() {
wait_ctx
.add(socket.get_read_notifier(), Token::VmControl { id: *id })
.context("failed to add descriptor to wait context")?;
}
let mut region_state = VmMemoryRegionState::new();
'wait: loop {
let events = {
match wait_ctx.wait() {
Ok(v) => v,
Err(e) => {
error!("failed to poll: {}", e);
break;
}
}
};
let mut vm_control_ids_to_remove = Vec::new();
for event in events.iter().filter(|e| e.is_readable) {
match event.token {
Token::HandlerControl => match handler_control.recv::<VmMemoryHandlerRequest>() {
Ok(request) => match request {
VmMemoryHandlerRequest::Exit => break 'wait,
},
Err(e) => {
if let TubeError::Disconnected = e {
panic!("vm memory control tube disconnected.");
} else {
error!("failed to recv VmMemoryHandlerRequest: {}", e);
}
}
},
Token::VmControl { id } => {
if let Some(tube) = control_tubes.get(&id) {
match tube.recv::<VmMemoryRequest>() {
Ok(request) => {
let response = request.execute(
&mut vm,
&mut sys_allocator_mutex.lock(),
&mut gralloc,
None,
&mut region_state,
);
if let Err(e) = tube.send(&response) {
error!("failed to send VmMemoryControlResponse: {}", e);
}
}
Err(e) => {
if let TubeError::Disconnected = e {
vm_control_ids_to_remove.push(id);
} else {
error!("failed to recv VmMemoryControlRequest: {}", e);
}
}
}
}
}
}
}
remove_closed_tubes(&wait_ctx, &mut control_tubes, vm_control_ids_to_remove)?;
if events
.iter()
.any(|e| e.is_hungup && !e.is_readable && matches!(e.token, Token::HandlerControl))
{
error!("vm memory handler control hung up but did not request an exit.");
break 'wait;
}
}
Ok(())
}
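/// Creates the VM control server at `control_server_path` (if one was supplied) and registers it
/// with `wait_ctx`. In prod builds the control server is compiled out, so this always returns
/// `Ok(None)`.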
fn create_control_server(
control_server_path: Option<PathBuf>,
wait_ctx: &WaitContext<Token>,
) -> Result<Option<ControlServer>> {
#[cfg(not(feature = "prod-build"))]
{
if let Some(path) = control_server_path {
let server =
ControlServer::new(path.to_str().expect("control socket path must be a string"))
.exit_context(
Exit::FailedToCreateControlServer,
"failed to create control server",
)?;
wait_ctx
.add(server.client_waiting(), Token::VmControlServer)
.exit_context(
Exit::WaitContextAdd,
"failed to add control server to wait context",
)?;
return Ok(Some(server));
}
}
Ok::<Option<ControlServer>, anyhow::Error>(None)
}
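/// The main run loop of the VM: spawns the IRQ, VM memory, device, and vCPU worker threads,
/// services VM and control tube events until an exit is requested, and then tears everything
/// down, returning the final `ExitState`.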
fn run_control<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
mut guest_os: RunnableLinuxVm<V, Vcpu>,
sys_allocator: SystemAllocator,
control_tubes: Vec<TaggedControlTube>,
irq_control_tubes: Vec<Tube>,
vm_memory_control_tubes: Vec<Tube>,
vm_evt_rdtube: RecvTube,
vm_evt_wrtube: SendTube,
#[cfg(feature = "gpu")] gpu_control_tube: Option<Tube>,
broker_shutdown_evt: Option<Event>,
balloon_host_tube: Option<Tube>,
pvclock_host_tube: Option<Tube>,
disk_host_tubes: Vec<Tube>,
gralloc: RutabagaGralloc,
#[cfg(feature = "stats")] stats: Option<Arc<Mutex<StatisticsCollector>>>,
service_pipe_name: Option<String>,
memory_size_mb: u64,
host_cpu_topology: bool,
tsc_sync_mitigations: TscSyncMitigations,
force_calibrated_tsc_leaf: bool,
mut product_args: RunControlArgs,
mut virtio_snd_host_mute_tube: Option<Tube>,
restore_path: Option<PathBuf>,
control_server_path: Option<PathBuf>,
force_s2idle: bool,
suspended: bool,
) -> Result<ExitState> {
let (ipc_main_loop_tube, proto_main_loop_tube, _service_ipc) =
start_service_ipc_listener(service_pipe_name)?;
let mut service_vm_state = product::create_service_vm_state(memory_size_mb);
let sys_allocator_mutex = Arc::new(Mutex::new(sys_allocator));
let exit_evt = Event::new().exit_context(Exit::CreateEvent, "failed to create event")?;
let (irq_handler_control, irq_handler_control_for_worker) = Tube::pair().exit_context(
Exit::CreateTube,
"failed to create IRQ handler control Tube",
)?;
// Create a separate thread to wait on IRQ events. This is a natural division
// because IRQ interrupts have no dependencies on other events, and this lets
// us avoid approaching the Windows WaitForMultipleObjects 64-object limit.
let irq_join_handle = IrqWaitWorker::start(
irq_handler_control_for_worker,
guest_os
.irq_chip
.try_box_clone()
.exit_context(Exit::CloneEvent, "failed to clone irq chip")?,
irq_control_tubes,
sys_allocator_mutex.clone(),
);
let mut triggers = vec![(vm_evt_rdtube.get_read_notifier(), Token::VmEvent)];
product::push_triggers(&mut triggers, &ipc_main_loop_tube, &proto_main_loop_tube);
let wait_ctx = WaitContext::build_with(&triggers).exit_context(
Exit::WaitContextAdd,
"failed to add trigger to wait context",
)?;
#[cfg(feature = "balloon")]
let mut balloon_tube = balloon_host_tube
.map(|tube| -> Result<BalloonTube> {
wait_ctx
.add(tube.get_read_notifier(), Token::BalloonTube)
.context("failed to add trigger to wait context")?;
Ok(BalloonTube::new(tube))
})
.transpose()
.context("failed to create balloon tube")?;
let (vm_memory_handler_control, vm_memory_handler_control_for_thread) = Tube::pair()?;
let vm_memory_handler_thread_join_handle = std::thread::Builder::new()
.name("vm_memory_handler_thread".into())
.spawn({
let vm = guest_os.vm.try_clone().context("failed to clone Vm")?;
let sys_allocator_mutex = sys_allocator_mutex.clone();
move || {
vm_memory_handler_thread(
vm_memory_control_tubes,
vm,
sys_allocator_mutex,
gralloc,
vm_memory_handler_control_for_thread,
)
}
})
.unwrap();
if let Some(evt) = broker_shutdown_evt.as_ref() {
wait_ctx.add(evt, Token::BrokerShutdown).exit_context(
Exit::WaitContextAdd,
"failed to add trigger to wait context",
)?;
}
let mut control_tubes = BTreeMap::from_iter(control_tubes.into_iter().enumerate());
let mut next_control_id = control_tubes.len();
for (id, control_tube) in control_tubes.iter() {
#[allow(clippy::single_match)]
match control_tube {
TaggedControlTube::Product(product_tube) => wait_ctx
.add(
product_tube.get_read_notifier(),
Token::VmControl { id: *id },
)
.exit_context(
Exit::WaitContextAdd,
"failed to add trigger to wait context",
)?,
_ => (),
}
}
let (device_ctrl_tube, device_ctrl_resp) = Tube::pair().context("failed to create tube")?;
guest_os.devices_thread = match create_devices_worker_thread(
guest_os.vm.get_memory().clone(),
guest_os.io_bus.clone(),
guest_os.mmio_bus.clone(),
device_ctrl_resp,
) {
Ok(join_handle) => Some(join_handle),
Err(e) => {
return Err(anyhow!("Failed to start devices thread: {}", e));
}
};
let vcpus: Vec<Option<_>> = match guest_os.vcpus.take() {
Some(vec) => vec.into_iter().map(|vcpu| Some(vcpu)).collect(),
None => iter::repeat_with(|| None)
.take(guest_os.vcpu_count)
.collect(),
};
let anti_tamper_main_thread_tube = spawn_anti_tamper_thread(&wait_ctx);
let mut vm_control_server = create_control_server(control_server_path, &wait_ctx)?;
let ime_thread = run_ime_thread(&mut product_args, &exit_evt)?;
let original_terminal_mode = stdin().set_raw_mode().ok();
let vcpu_boxes: Arc<Mutex<Vec<Box<dyn VcpuArch>>>> = Arc::new(Mutex::new(Vec::new()));
let run_mode_arc = Arc::new(VcpuRunMode::default());
let run_mode_state = if suspended {
// Sleep devices before creating vcpus.
device_ctrl_tube
.send(&DeviceControlCommand::SleepDevices)
.context("send command to devices control socket")?;
match device_ctrl_tube
.recv()
.context("receive from devices control socket")?
{
VmResponse::Ok => (),
resp => bail!("device sleep failed: {}", resp),
}
run_mode_arc.set_and_notify(VmRunMode::Suspending);
VmRunMode::Suspending
} else {
VmRunMode::Running
};
// If we are restoring from a snapshot, then start suspended.
if restore_path.is_some() {
run_mode_arc.set_and_notify(VmRunMode::Suspending);
}
let (vcpu_threads, vcpu_control_channels) = run_all_vcpus(
vcpus,
vcpu_boxes.clone(),
&guest_os,
&exit_evt,
&vm_evt_wrtube,
#[cfg(feature = "stats")]
&stats,
host_cpu_topology,
run_mode_arc.clone(),
tsc_sync_mitigations,
force_calibrated_tsc_leaf,
)?;
// Restore VM (if applicable).
if let Some(path) = restore_path {
vm_control::do_restore(
path,
&guest_os.vm,
|msg| {
kick_all_vcpus(
run_mode_arc.as_ref(),
&vcpu_control_channels,
vcpu_boxes.as_ref(),
guest_os.irq_chip.as_ref(),
&pvclock_host_tube,
msg,
)
},
|msg, index| {
kick_vcpu(
run_mode_arc.as_ref(),
&vcpu_control_channels,
vcpu_boxes.as_ref(),
guest_os.irq_chip.as_ref(),
&pvclock_host_tube,
index,
msg,
)
},
&irq_handler_control,
&device_ctrl_tube,
guest_os.vcpu_count,
|image| {
guest_os
.irq_chip
.try_box_clone()?
.restore(image, guest_os.vcpu_count)
},
/* require_encrypted= */ false,
)?;
// Allow the vCPUs to start for real.
kick_all_vcpus(
run_mode_arc.as_ref(),
&vcpu_control_channels,
vcpu_boxes.as_ref(),
guest_os.irq_chip.as_ref(),
&pvclock_host_tube,
// Other platforms (unix) have multiple modes they could start in (e.g. starting for
// guest kernel debugging, etc). If/when we support those modes on Windows, we'll need
// to enter that mode here rather than VmRunMode::Running.
VcpuControl::RunState(run_mode_state),
);
}
let mut exit_state = ExitState::Stop;
let mut region_state = VmMemoryRegionState::new();
'poll: loop {
let events = {
match wait_ctx.wait() {
Ok(v) => v,
Err(e) => {
error!("failed to wait: {}", e);
break;
}
}
};
let mut vm_control_ids_to_remove = Vec::new();
for event in events.iter().filter(|e| e.is_readable) {
let state = handle_readable_event(
event,
&mut vm_control_ids_to_remove,
&mut next_control_id,
&mut service_vm_state,
disk_host_tubes.as_slice(),
ipc_main_loop_tube.as_ref(),
#[cfg(feature = "gpu")]
gpu_control_tube.as_ref(),
&vm_evt_rdtube,
&mut control_tubes,
&mut guest_os,
&sys_allocator_mutex,
&mut virtio_snd_host_mute_tube,
proto_main_loop_tube.as_ref(),
&anti_tamper_main_thread_tube,
#[cfg(feature = "balloon")]
balloon_tube.as_mut(),
memory_size_mb,
vcpu_boxes.as_ref(),
&pvclock_host_tube,
run_mode_arc.as_ref(),
&mut region_state,
vm_control_server.as_mut(),
&irq_handler_control,
&device_ctrl_tube,
&wait_ctx,
force_s2idle,
&vcpu_control_channels,
)?;
if let Some(state) = state {
exit_state = state;
break 'poll;
}
}
remove_closed_tubes(&wait_ctx, &mut control_tubes, vm_control_ids_to_remove)?;
}
info!("run_control poll loop completed, forcing vCPUs to exit...");
// VCPU threads MUST see the VmRunMode flag, otherwise they may re-enter the VM.
run_mode_arc.set_and_notify(VmRunMode::Exiting);
// Force all vcpus to exit from the hypervisor
for vcpu in vcpu_boxes.lock().iter() {
vcpu.set_immediate_exit(true);
}
let mut res = Ok(exit_state);
guest_os.irq_chip.kick_halted_vcpus();
let _ = exit_evt.signal();
if guest_os.devices_thread.is_some() {
if let Err(e) = device_ctrl_tube.send(&DeviceControlCommand::Exit) {
error!("failed to stop device control loop: {}", e);
};
if let Some(thread) = guest_os.devices_thread.take() {
if let Err(e) = thread.join() {
error!("failed to exit devices thread: {:?}", e);
}
}
}
// Shut down the VM memory handler thread.
if let Err(e) = vm_memory_handler_control.send(&VmMemoryHandlerRequest::Exit) {
error!(
"failed to request exit from VM memory handler thread: {}",
e
);
}
if let Err(e) = vm_memory_handler_thread_join_handle.join() {
error!("failed to exit VM Memory handler thread: {:?}", e);
}
// Shut down the IRQ handler thread.
if let Err(e) = irq_handler_control.send(&IrqHandlerRequest::Exit) {
error!("failed to request exit from IRQ handler thread: {}", e);
}
// Make sure any child threads have ended by sending the Exit vm event (possibly again) so that
// their run loops are aborted.
let _ = vm_evt_wrtube.send::<VmEventType>(&VmEventType::Exit);
for (i, thread) in vcpu_threads.into_iter().enumerate() {
// Wait until all the threads exit so that the guest_os.vm Arc's reference count drops to 1;
// otherwise, we will leak memory if we force kill the thread with terminate.
match thread.join() {
Ok(Err(e)) => {
error!("vcpu thread {} exited with an error: {}", i, e);
res = Err(e);
}
Ok(_) => {}
Err(e) => error!("vcpu thread {} panicked: {:?}", i, e),
}
}
info!("vCPU threads have exited.");
if let Some(ime) = ime_thread {
match ime.join() {
Ok(Err(e)) => {
error!("ime thread exited with an error: {}", e);
if res.is_ok() {
// Prioritize past errors, but return this error if it is the only one;
// otherwise it has already been logged above.
res = Err(e)
}
}
Ok(_) => {}
Err(e) => error!("ime thread panicked: {:?}", e),
}
}
info!("IME thread has exited.");
// This cancels all outstanding and any future blocking operations.
// TODO(b/196911556): Shut down the executor for a cleaner shutdown. Since we are using the
// global executor, for a cleaner shutdown we have to call disarm so that all incoming requests
// are run and then cancelled. If we called shutdown, all blocking threads would go away,
// incoming operations would not be scheduled to run, and they would be dropped, leading to a
// panic. The ideal place to call shutdown is when we drop a non-global executor.
cros_async::unblock_disarm();
info!("blocking async pool has shut down.");
let _ = irq_join_handle.join();
info!("IrqWaitWorker has shut down.");
#[cfg(feature = "stats")]
if let Some(stats) = stats {
println!("Statistics Collected:\n{}", stats.lock());
println!("Statistics JSON:\n{}", stats.lock().json());
}
if let Some(mode) = original_terminal_mode {
if let Err(e) = stdin().restore_mode(mode) {
warn!("failed to restore terminal mode: {}", e);
}
}
// Explicitly drop the VM structure here to allow the devices to clean up before the
// control tubes are closed when this function exits.
mem::drop(guest_os);
info!("guest_os dropped, run_control is done.");
res
}
/// Remove Tubes that have been closed from the WaitContext.
fn remove_closed_tubes<T, U>(
wait_ctx: &WaitContext<T>,
tubes: &mut BTreeMap<usize, U>,
mut tube_ids_to_remove: Vec<usize>,
) -> anyhow::Result<()>
where
T: EventToken,
U: ReadNotifier + CloseNotifier,
{
tube_ids_to_remove.dedup();
for id in tube_ids_to_remove {
if let Some(socket) = tubes.remove(&id) {
wait_ctx
.delete(socket.get_read_notifier())
.context("failed to remove descriptor from wait context")?;
// There may be a close notifier registered for this Tube. If there isn't one
// registered, we just ignore the error.
let _ = wait_ctx.delete(socket.get_close_notifier());
}
}
Ok(())
}
/// Sends a message to all VCPUs.
fn kick_all_vcpus(
run_mode: &VcpuRunMode,
vcpu_control_channels: &[mpsc::Sender<VcpuControl>],
vcpu_boxes: &Mutex<Vec<Box<dyn VcpuArch>>>,
irq_chip: &dyn IrqChipArch,
pvclock_host_tube: &Option<Tube>,
msg: VcpuControl,
) {
// On Windows, we handle run mode switching directly rather than delegating to the VCPU thread
// like unix does.
match &msg {
VcpuControl::RunState(VmRunMode::Suspending) => {
suspend_all_vcpus(run_mode, vcpu_boxes, irq_chip, pvclock_host_tube);
return;
}
VcpuControl::RunState(VmRunMode::Running) => {
resume_all_vcpus(run_mode, vcpu_boxes, irq_chip, pvclock_host_tube);
return;
}
_ => (),
}
// For non-RunState commands, we dispatch just like unix would.
for vcpu in vcpu_control_channels {
if let Err(e) = vcpu.send(msg.clone()) {
error!("failed to send VcpuControl message: {}", e);
}
}
// Now that we've sent a message, we need VCPUs to exit so they can process it.
for vcpu in vcpu_boxes.lock().iter() {
vcpu.set_immediate_exit(true);
}
irq_chip.kick_halted_vcpus();
// If the VCPU isn't running, we have to notify the run_mode condvar to wake it so it processes
// the control message.
let current_run_mode = run_mode.get_mode();
if current_run_mode != VmRunMode::Running {
run_mode.set_and_notify(current_run_mode);
}
}
/// Sends a message to a single VCPU. On Windows, `VcpuControl::RunState` cannot be sent to a single
/// VCPU.
fn kick_vcpu(
run_mode: &VcpuRunMode,
vcpu_control_channels: &[mpsc::Sender<VcpuControl>],
vcpu_boxes: &Mutex<Vec<Box<dyn VcpuArch>>>,
irq_chip: &dyn IrqChipArch,
pvclock_host_tube: &Option<Tube>,
index: usize,
msg: VcpuControl,
) {
assert!(
!matches!(msg, VcpuControl::RunState(_)),
"Windows does not support RunState changes on a per VCPU basis"
);
let vcpu = vcpu_control_channels
.get(index)
.expect("invalid vcpu index specified");
if let Err(e) = vcpu.send(msg) {
error!("failed to send VcpuControl message: {}", e);
}
// Now that we've sent a message, we need the VCPU to exit so it can
// process the message.
vcpu_boxes
.lock()
.get(index)
.expect("invalid vcpu index specified")
.set_immediate_exit(true);
irq_chip.kick_halted_vcpus();
// If the VCPU isn't running, we have to notify the run_mode condvar to wake it so it processes
// the control message. (Technically this wakes all VCPUs, but those without messages will go
// back to sleep.)
let current_run_mode = run_mode.get_mode();
if current_run_mode != VmRunMode::Running {
run_mode.set_and_notify(current_run_mode);
}
}
/// Suspends all VCPUs. The VM will be effectively frozen in time once this function is called,
/// though devices on the host will continue to run.
pub(crate) fn suspend_all_vcpus(
run_mode: &VcpuRunMode,
vcpu_boxes: &Mutex<Vec<Box<dyn VcpuArch>>>,
irq_chip: &dyn IrqChipArch,
pvclock_host_tube: &Option<Tube>,
) {
// VCPU threads MUST see the VmRunMode::Suspending flag first, otherwise
// they may re-enter the VM.
run_mode.set_and_notify(VmRunMode::Suspending);
// Force all vcpus to exit from the hypervisor
for vcpu in vcpu_boxes.lock().iter() {
vcpu.set_immediate_exit(true);
}
irq_chip.kick_halted_vcpus();
handle_pvclock_request(pvclock_host_tube, PvClockCommand::Suspend)
.unwrap_or_else(|e| error!("Error handling pvclock suspend: {:?}", e));
}
/// Resumes all VCPUs.
pub(crate) fn resume_all_vcpus(
run_mode: &VcpuRunMode,
vcpu_boxes: &Mutex<Vec<Box<dyn VcpuArch>>>,
irq_chip: &dyn IrqChipArch,
pvclock_host_tube: &Option<Tube>,
) {
handle_pvclock_request(pvclock_host_tube, PvClockCommand::Resume)
.unwrap_or_else(|e| error!("Error handling pvclock resume: {:?}", e));
// Make sure any immediate exit bits are disabled
for vcpu in vcpu_boxes.lock().iter() {
vcpu.set_immediate_exit(false);
}
run_mode.set_and_notify(VmRunMode::Running);
}
#[cfg(feature = "gvm")]
const GVM_MINIMUM_VERSION: GvmVersion = GvmVersion {
major: 1,
minor: 4,
patch: 1,
};
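/// Creates a GVM-backed VM, first verifying that the installed GVM driver meets
/// `GVM_MINIMUM_VERSION`.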
#[cfg(feature = "gvm")]
fn create_gvm_vm(gvm: Gvm, mem: GuestMemory) -> Result<GvmVm> {
match gvm.get_full_version() {
Ok(version) => {
if version < GVM_MINIMUM_VERSION {
error!(
"GVM version {} is below minimum version {}",
version, GVM_MINIMUM_VERSION
);
return Err(base::Error::new(libc::ENXIO).into());
} else {
info!("Using GVM version {}.", version)
}
}
Err(e) => {
error!("unable to determine gvm version: {}", e);
return Err(base::Error::new(libc::ENXIO).into());
}
}
let vm = GvmVm::new(&gvm, mem)?;
Ok(vm)
}
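/// Creates a HAXM-backed VM, registering a kernel log file if one was requested and the HAXM
/// version supports it.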
#[cfg(feature = "haxm")]
fn create_haxm_vm(
haxm: Haxm,
mem: GuestMemory,
kernel_log_file: &Option<String>,
) -> Result<HaxmVm> {
let vm = HaxmVm::new(&haxm, mem)?;
if let Some(path) = kernel_log_file {
use hypervisor::haxm::HAX_CAP_VM_LOG;
if vm.check_raw_capability(HAX_CAP_VM_LOG) {
match vm.register_log_file(path) {
Ok(_) => {}
Err(e) => match e.errno() {
libc::E2BIG => {
error!(
"kernel_log_file path is too long, kernel log file will not be written"
);
}
_ => return Err(e.into()),
},
}
} else {
warn!(
"kernel_log_file specified but this version of HAXM does not support kernel log \
files"
);
}
}
Ok(vm)
}
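/// Creates a WHPX-backed VM, pre-setting the cpuid entries (adjusted for crosvm) that the guest
/// should see.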
#[cfg(feature = "whpx")]
#[cfg(target_arch = "x86_64")]
fn create_whpx_vm(
whpx: Whpx,
mem: GuestMemory,
cpu_count: usize,
no_smt: bool,
apic_emulation: bool,
force_calibrated_tsc_leaf: bool,
vm_evt_wrtube: SendTube,
) -> Result<WhpxVm> {
let cpu_config = hypervisor::CpuConfigX86_64::new(
force_calibrated_tsc_leaf,
false, /* host_cpu_topology */
false, /* enable_hwp */
no_smt,
false, /* itmt */
None, /* hybrid_type */
);
// context for non-cpu-specific cpuid results
let ctx = CpuIdContext::new(
0,
cpu_count,
None,
cpu_config,
whpx.check_capability(HypervisorCap::CalibratedTscLeafRequired),
__cpuid_count,
__cpuid,
);
// Get all cpuid entries that we should pre-set
let mut cpuid = whpx.get_supported_cpuid()?;
// Adjust them for crosvm
for entry in cpuid.cpu_id_entries.iter_mut() {
adjust_cpuid(entry, &ctx);
}
let vm = WhpxVm::new(
&whpx,
cpu_count,
mem,
cpuid,
apic_emulation,
Some(vm_evt_wrtube),
)
.exit_context(Exit::WhpxSetupError, "failed to create WHPX vm")?;
Ok(vm)
}
#[cfg(feature = "gvm")]
fn create_gvm_irq_chip(vm: &GvmVm, vcpu_count: usize) -> base::Result<GvmIrqChip> {
info!("Creating GVM irqchip");
let irq_chip = GvmIrqChip::new(vm.try_clone()?, vcpu_count)?;
Ok(irq_chip)
}
#[cfg(feature = "whpx")]
#[cfg(target_arch = "x86_64")]
fn create_whpx_split_irq_chip(
vm: &WhpxVm,
ioapic_device_tube: Tube,
) -> base::Result<WhpxSplitIrqChip> {
info!("Creating WHPX split irqchip");
WhpxSplitIrqChip::new(
vm.try_clone()?,
ioapic_device_tube,
None, // ioapic_pins
)
}
fn create_userspace_irq_chip<Vcpu>(
vcpu_count: usize,
ioapic_device_tube: Tube,
) -> base::Result<UserspaceIrqChip<Vcpu>>
where
Vcpu: VcpuArch + 'static,
{
info!("Creating userspace irqchip");
let irq_chip =
UserspaceIrqChip::new(vcpu_count, ioapic_device_tube, /* ioapic_pins: */ None)?;
Ok(irq_chip)
}
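/// Returns the preferred hypervisor available on this system (WHPX, then HAXM/GHAXM on Intel
/// CPUs, then GVM), or `None` if no supported hypervisor can be initialized.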
pub fn get_default_hypervisor() -> Option<HypervisorKind> {
// The ordering here matters: hypervisors are checked from most preferable to least.
#[cfg(feature = "whpx")]
match hypervisor::whpx::Whpx::is_enabled() {
true => return Some(HypervisorKind::Whpx),
false => warn!("Whpx not enabled."),
};
#[cfg(feature = "haxm")]
if get_cpu_manufacturer() == CpuManufacturer::Intel {
// Make sure Haxm device can be opened before selecting it.
match Haxm::new() {
Ok(_) => return Some(HypervisorKind::Ghaxm),
Err(e) => warn!("Cannot initialize HAXM: {}", e),
};
}
#[cfg(feature = "gvm")]
// Make sure Gvm device can be opened before selecting it.
match Gvm::new() {
Ok(_) => return Some(HypervisorKind::Gvm),
Err(e) => warn!("Cannot initialize GVM: {}", e),
};
None
}
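/// Builds the `VmComponents` used to construct the VM from the parsed `Config`: kernel/BIOS and
/// initrd images, memory and swiotlb sizes, CPU topology, pflash, and ACPI tables.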
fn setup_vm_components(cfg: &Config) -> Result<VmComponents> {
let initrd_image = if let Some(initrd_path) = &cfg.initrd_path {
Some(
File::open(initrd_path).with_exit_context(Exit::OpenInitrd, || {
format!("failed to open initrd {}", initrd_path.display())
})?,
)
} else {
None
};
let vm_image = match cfg.executable_path {
Some(Executable::Kernel(ref kernel_path)) => VmImage::Kernel(
File::open(kernel_path).with_exit_context(Exit::OpenKernel, || {
format!("failed to open kernel image {}", kernel_path.display(),)
})?,
),
Some(Executable::Bios(ref bios_path)) => {
VmImage::Bios(File::open(bios_path).with_exit_context(Exit::OpenBios, || {
format!("failed to open bios {}", bios_path.display())
})?)
}
_ => panic!("Did not receive a bios or kernel, should be impossible."),
};
let swiotlb = if let Some(size) = cfg.swiotlb {
Some(
size.checked_mul(1024 * 1024)
.ok_or_else(|| anyhow!("requested swiotlb size too large"))?,
)
} else if matches!(cfg.protection_type, ProtectionType::Unprotected) {
None
} else {
Some(64 * 1024 * 1024)
};
let (pflash_image, pflash_block_size) = if let Some(pflash_parameters) = &cfg.pflash_parameters
{
(
Some(
open_file_or_duplicate(
&pflash_parameters.path,
OpenOptions::new().read(true).write(true),
)
.with_context(|| {
format!("failed to open pflash {}", pflash_parameters.path.display())
})?,
),
pflash_parameters.block_size,
)
} else {
(None, 0)
};
Ok(VmComponents {
memory_size: cfg
.memory
.unwrap_or(256)
.checked_mul(1024 * 1024)
.ok_or_else(|| anyhow!("requested memory size too large"))?,
swiotlb,
vcpu_count: cfg.vcpu_count.unwrap_or(1),
fw_cfg_enable: false,
bootorder_fw_cfg_blob: Vec::new(),
vcpu_affinity: cfg.vcpu_affinity.clone(),
cpu_clusters: cfg.cpu_clusters.clone(),
cpu_capacity: cfg.cpu_capacity.clone(),
no_smt: cfg.no_smt,
hugepages: cfg.hugepages,
hv_cfg: hypervisor::Config {
protection_type: cfg.protection_type,
},
vm_image,
android_fstab: cfg
.android_fstab
.as_ref()
.map(|x| {
File::open(x).with_exit_context(Exit::OpenAndroidFstab, || {
format!("failed to open android fstab file {}", x.display())
})
})
.map_or(Ok(None), |v| v.map(Some))?,
pstore: cfg.pstore.clone(),
pflash_block_size,
pflash_image,
initrd_image,
extra_kernel_params: cfg.params.clone(),
acpi_sdts: cfg
.acpi_tables
.iter()
.map(|path| {
SDT::from_file(path).with_exit_context(Exit::OpenAcpiTable, || {
format!("failed to open ACPI file {}", path.display())
})
})
.collect::<Result<Vec<SDT>>>()?,
rt_cpus: cfg.rt_cpus.clone(),
delay_rt: cfg.delay_rt,
no_i8042: cfg.no_i8042,
no_rtc: cfg.no_rtc,
host_cpu_topology: cfg.host_cpu_topology,
#[cfg(target_arch = "x86_64")]
force_s2idle: cfg.force_s2idle,
fw_cfg_parameters: cfg.fw_cfg_parameters.clone(),
itmt: false,
pvm_fw: None,
#[cfg(target_arch = "x86_64")]
pci_low_start: cfg.pci_low_start,
#[cfg(target_arch = "x86_64")]
pcie_ecam: cfg.pcie_ecam,
#[cfg(target_arch = "x86_64")]
smbios: cfg.smbios.clone(),
dynamic_power_coefficient: cfg.dynamic_power_coefficient.clone(),
#[cfg(target_arch = "x86_64")]
break_linux_pci_config_io: cfg.break_linux_pci_config_io,
boot_cpu: cfg.boot_cpu,
})
}
// Enum that allows us to assign a variable to what is essentially a &dyn IrqChipArch.
enum WindowsIrqChip<V: VcpuArch> {
Userspace(UserspaceIrqChip<V>),
#[cfg(feature = "gvm")]
Gvm(GvmIrqChip),
#[cfg(feature = "whpx")]
WhpxSplit(WhpxSplitIrqChip),
}
impl<V: VcpuArch> WindowsIrqChip<V> {
// Convert our enum to a &mut dyn IrqChipArch
fn as_mut(&mut self) -> &mut dyn IrqChipArch {
match self {
WindowsIrqChip::Userspace(i) => i,
#[cfg(feature = "gvm")]
WindowsIrqChip::Gvm(i) => i,
#[cfg(feature = "whpx")]
WindowsIrqChip::WhpxSplit(i) => i,
}
}
}
/// Storage for the VM TSC offset for each vcpu. Stored in a static because the tracing thread will
/// need access to it when tracing is enabled.
static TSC_OFFSETS: sync::Mutex<Vec<Option<u64>>> = sync::Mutex::new(Vec::new());
/// Save the TSC offset for a particular vcpu.
///
/// After setting the TSC offset for a vcpu, this function checks the standard deviation of offsets
/// for all the VCPUs and logs this information. If the TSC offsets differ too much between vcpus
/// it can cause clock issues in the guest.
pub fn save_vcpu_tsc_offset(offset: u64, vcpu_id: usize) {
let offsets_copy = {
let mut offsets = TSC_OFFSETS.lock();
// make sure offsets vec is large enough before inserting
let newlen = std::cmp::max(offsets.len(), vcpu_id + 1);
offsets.resize(newlen, None);
offsets[vcpu_id] = Some(offset);
offsets.clone()
};
// do statistics on a clone of the offsets so we don't hold up other vcpus at this point
info!(
"TSC offset standard deviation is: {}",
standard_deviation(
&offsets_copy
.iter()
.filter(|x| x.is_some())
.map(|x| x.unwrap() as u128)
.collect::<Vec<u128>>()
)
);
}
/// Get the TSC offset of any vcpu. It will pick the first non-None offset it finds in TSC_OFFSETS.
#[cfg(feature = "perfetto")]
pub fn get_vcpu_tsc_offset() -> u64 {
if let Some(offset) = TSC_OFFSETS.lock().iter().flatten().next() {
return *offset;
}
0
}
/// Callback that is registered with tracing crate, and will be called by the tracing thread when
/// tracing is enabled or disabled. Regardless of whether tracing is being enabled or disabled for
/// a given category or instance, we just emit a clock snapshot that maps the guest TSC to the
/// host TSC. Redundant snapshots should not be a problem for perfetto.
#[cfg(feature = "perfetto")]
fn set_tsc_clock_snapshot() {
let freq = match devices::tsc::tsc_frequency() {
Err(e) => {
error!(
"Could not determine tsc frequency, unable to snapshot tsc offset: {}",
e
);
return;
}
Ok(freq) => freq,
};
// The offset maps the host TSC to the guest TSC (guest TSC = host TSC + offset).
let offset = get_vcpu_tsc_offset();
// SAFETY: Safe because _rdtsc takes no arguments.
let host_tsc = unsafe { std::arch::x86_64::_rdtsc() };
perfetto::snapshot_clock(perfetto::ClockSnapshot::new(
// Technically our multiplier should be freq/1_000_000_000, but perfetto doesn't
// support floating point multipliers yet. So for now we set the freq in Hz and rely
// on the merge tool to fix it.
perfetto::Clock::new(
perfetto::BuiltinClock::Tsc as u32,
host_tsc.wrapping_add(offset),
)
.set_multiplier(freq as u64),
perfetto::Clock::new(
// The host builtin clock ids are all offset from the guest ids by
// HOST_GUEST_CLOCK_ID_OFFSET when the traces are merged. Because this snapshot
// contains both a guest and host clock, we need to offset it before merge.
perfetto::BuiltinClock::Tsc as u32 + cros_tracing::HOST_GUEST_CLOCK_ID_OFFSET,
host_tsc,
)
.set_multiplier(freq as u64),
));
}
/// Launches run_config for the broker, reading configuration from a TubeTransporter.
pub fn run_config_for_broker(raw_tube_transporter: RawDescriptor) -> Result<ExitState> {
let tube_transporter =
// SAFETY:
// Safe because we know that raw_tube_transporter is valid (passed by inheritance), and that
// the blocking & framing modes are accurate because we create them ourselves in the broker.
unsafe { TubeTransporterReader::from_raw_descriptor(raw_tube_transporter) };
let mut tube_data_list = tube_transporter
.read_tubes()
.exit_context(Exit::TubeTransporterInit, "failed to init tube transporter")?;
let bootstrap_tube = tube_data_list
.get_tube(TubeToken::Bootstrap)
.exit_context(Exit::TubeFailure, "failed to get bootstrap tube")?;
let mut cfg: Config = bootstrap_tube
.recv::<Config>()
.exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?;
let startup_args: CommonChildStartupArgs = bootstrap_tube
.recv::<CommonChildStartupArgs>()
.exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?;
let _child_cleanup = common_child_setup(startup_args).exit_context(
Exit::CommonChildSetupError,
"failed to perform common child setup",
)?;
cfg.broker_shutdown_event = Some(
bootstrap_tube
.recv::<Event>()
.exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?,
);
#[cfg(feature = "crash-report")]
let crash_tube_map = bootstrap_tube
.recv::<HashMap<ProcessType, Vec<SendTube>>>()
.exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?;
#[cfg(feature = "crash-report")]
crash_report::set_crash_tube_map(crash_tube_map);
let BrokerTubes {
vm_evt_wrtube,
vm_evt_rdtube,
} = bootstrap_tube
.recv::<BrokerTubes>()
.exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?;
run_config_inner(cfg, vm_evt_wrtube, vm_evt_rdtube)
}
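/// Launches a VM directly from a `Config` when crosvm is run without the broker.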
pub fn run_config(cfg: Config) -> Result<ExitState> {
let _raise_timer_resolution = enable_high_res_timers()
.exit_context(Exit::EnableHighResTimer, "failed to enable high res timer")?;
// There is no broker when using run_config(), so the vm_evt tubes need to be created.
let (vm_evt_wrtube, vm_evt_rdtube) =
Tube::directional_pair().context("failed to create vm event tube")?;
run_config_inner(cfg, vm_evt_wrtube, vm_evt_rdtube)
}
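/// Creates the guest memory for the VM using the architecture-specific memory layout for the
/// given components and hypervisor.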
fn create_guest_memory(
components: &VmComponents,
hypervisor: &impl Hypervisor,
) -> Result<GuestMemory> {
let guest_mem_layout = Arch::guest_memory_layout(components, hypervisor).exit_context(
Exit::GuestMemoryLayout,
"failed to create guest memory layout",
)?;
GuestMemory::new_with_options(&guest_mem_layout)
.exit_context(Exit::CreateGuestMemory, "failed to create guest memory")
}
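/// Shared setup for the broker and non-broker entry points: selects a hypervisor, creates the
/// guest memory, VM, and irq chip for it, and hands everything off to `run_vm`.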
fn run_config_inner(
cfg: Config,
vm_evt_wrtube: SendTube,
vm_evt_rdtube: RecvTube,
) -> Result<ExitState> {
product::setup_common_metric_invariants(&cfg);
#[cfg(feature = "perfetto")]
cros_tracing::add_per_trace_callback(set_tsc_clock_snapshot);
let components: VmComponents = setup_vm_components(&cfg)?;
#[allow(unused_mut)]
let mut hypervisor = cfg
.hypervisor
.or_else(get_default_hypervisor)
.exit_context(Exit::NoDefaultHypervisor, "no enabled hypervisor")?;
#[cfg(feature = "whpx")]
if hypervisor::whpx::Whpx::is_enabled() {
// If WHPX is enabled, no other hypervisor can be used, so just override it
hypervisor = HypervisorKind::Whpx;
}
match hypervisor {
#[cfg(feature = "haxm")]
HypervisorKind::Haxm | HypervisorKind::Ghaxm => {
if hypervisor == HypervisorKind::Haxm {
set_use_ghaxm(false);
}
info!("Creating HAXM ghaxm={}", get_use_ghaxm());
let haxm = Haxm::new()?;
let guest_mem = create_guest_memory(&components, &haxm)?;
let vm = create_haxm_vm(haxm, guest_mem, &cfg.kernel_log_file)?;
let (ioapic_host_tube, ioapic_device_tube) =
Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
let irq_chip =
create_userspace_irq_chip::<HaxmVcpu>(components.vcpu_count, ioapic_device_tube)?;
run_vm::<HaxmVcpu, HaxmVm>(
cfg,
components,
vm,
WindowsIrqChip::Userspace(irq_chip).as_mut(),
Some(ioapic_host_tube),
vm_evt_wrtube,
vm_evt_rdtube,
)
}
#[cfg(feature = "whpx")]
HypervisorKind::Whpx => {
let apic_emulation_supported =
Whpx::check_whpx_feature(WhpxFeature::LocalApicEmulation)
.exit_context(Exit::WhpxSetupError, "failed to set up whpx")?;
let no_smt = cfg.no_smt;
// Default to WhpxSplitIrqChip if it's supported because it's more performant
let irq_chip = cfg.irq_chip.unwrap_or(if apic_emulation_supported {
IrqChipKind::Split
} else {
IrqChipKind::Userspace
});
// Both WHPX irq chips use a userspace IOAPIC
let (ioapic_host_tube, ioapic_device_tube) =
Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
info!("Creating Whpx");
let whpx = Whpx::new()?;
let guest_mem = create_guest_memory(&components, &whpx)?;
let vm = create_whpx_vm(
whpx,
guest_mem,
components.vcpu_count,
no_smt,
apic_emulation_supported && irq_chip == IrqChipKind::Split,
cfg.force_calibrated_tsc_leaf,
vm_evt_wrtube
.try_clone()
.expect("could not clone vm_evt_wrtube"),
)?;
let mut irq_chip = match irq_chip {
IrqChipKind::Kernel => unimplemented!("Kernel irqchip mode not supported by WHPX"),
IrqChipKind::Split => {
if !apic_emulation_supported {
panic!(
"split irqchip specified but your WHPX version does not support \
local apic emulation"
);
}
WindowsIrqChip::WhpxSplit(create_whpx_split_irq_chip(&vm, ioapic_device_tube)?)
}
IrqChipKind::Userspace => {
WindowsIrqChip::Userspace(create_userspace_irq_chip::<WhpxVcpu>(
components.vcpu_count,
ioapic_device_tube,
)?)
}
};
run_vm::<WhpxVcpu, WhpxVm>(
cfg,
components,
vm,
irq_chip.as_mut(),
Some(ioapic_host_tube),
vm_evt_wrtube,
vm_evt_rdtube,
)
}
#[cfg(feature = "gvm")]
HypervisorKind::Gvm => {
info!("Creating GVM");
let gvm = Gvm::new()?;
let guest_mem = create_guest_memory(&components, &gvm)?;
let vm = create_gvm_vm(gvm, guest_mem)?;
let ioapic_host_tube;
let mut irq_chip = match cfg.irq_chip.unwrap_or(IrqChipKind::Kernel) {
IrqChipKind::Split => unimplemented!("Split irqchip mode not supported by GVM"),
IrqChipKind::Kernel => {
ioapic_host_tube = None;
WindowsIrqChip::Gvm(create_gvm_irq_chip(&vm, components.vcpu_count)?)
}
IrqChipKind::Userspace => {
let (host_tube, ioapic_device_tube) =
Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
ioapic_host_tube = Some(host_tube);
WindowsIrqChip::Userspace(create_userspace_irq_chip::<GvmVcpu>(
components.vcpu_count,
ioapic_device_tube,
)?)
}
};
run_vm::<GvmVcpu, GvmVm>(
cfg,
components,
vm,
irq_chip.as_mut(),
ioapic_host_tube,
vm_evt_wrtube,
vm_evt_rdtube,
)
}
}
}
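/// Creates the devices and control tubes, builds the VM for the architecture, and then runs the
/// main control loop (`run_control`) until the guest exits.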
#[cfg(any(feature = "haxm", feature = "gvm", feature = "whpx"))]
fn run_vm<Vcpu, V>(
#[allow(unused_mut)] mut cfg: Config,
#[allow(unused_mut)] mut components: VmComponents,
mut vm: V,
irq_chip: &mut dyn IrqChipArch,
ioapic_host_tube: Option<Tube>,
vm_evt_wrtube: SendTube,
vm_evt_rdtube: RecvTube,
) -> Result<ExitState>
where
Vcpu: VcpuArch + 'static,
V: VmArch + 'static,
{
let vm_memory_size_mb = components.memory_size / (1024 * 1024);
let mut control_tubes = Vec::new();
let mut irq_control_tubes = Vec::new();
let mut vm_memory_control_tubes = Vec::new();
// Create one control tube per disk.
let mut disk_device_tubes = Vec::new();
let mut disk_host_tubes = Vec::new();
let disk_count = cfg.disks.len();
for _ in 0..disk_count {
let (disk_host_tube, disk_device_tube) =
Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
disk_host_tubes.push(disk_host_tube);
disk_device_tubes.push(disk_device_tube);
}
if let Some(ioapic_host_tube) = ioapic_host_tube {
irq_control_tubes.push(ioapic_host_tube);
}
// Balloon gets a special tube so balloon requests can be forwarded from the main process.
let (balloon_host_tube, balloon_device_tube) = if cfg.balloon {
let (balloon_host_tube, balloon_device_tube) =
Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
(Some(balloon_host_tube), Some(balloon_device_tube))
} else {
(None, None)
};
// The balloon device also needs a tube to communicate back to the main process to
// handle remapping memory dynamically.
let dynamic_mapping_device_tube = if cfg.balloon {
let (dynamic_mapping_host_tube, dynamic_mapping_device_tube) =
Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
vm_memory_control_tubes.push(dynamic_mapping_host_tube);
Some(dynamic_mapping_device_tube)
} else {
None
};
// PvClock gets a tube for handling suspend/resume requests from the main thread.
let (pvclock_host_tube, pvclock_device_tube) = if cfg.pvclock {
let (host, device) =
Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
(Some(host), Some(device))
} else {
(None, None)
};
let gralloc =
RutabagaGralloc::new().exit_context(Exit::CreateGralloc, "failed to create gralloc")?;
let pstore_size = components.pstore.as_ref().map(|pstore| pstore.size as u64);
let mut sys_allocator = SystemAllocator::new(
Arch::get_system_allocator_config(&vm),
pstore_size,
&cfg.mmio_address_ranges,
)
.context("failed to create system allocator")?;
// Allocate the ramoops region first.
let ramoops_region = match &components.pstore {
Some(pstore) => Some(
arch::pstore::create_memory_region(
&mut vm,
sys_allocator.reserved_region().unwrap(),
pstore,
)
.exit_context(
Exit::Pstore,
format!("failed to allocate pstore region {:?}", &components.pstore),
)?,
),
None => None,
};
let init_balloon_size = components
.memory_size
.checked_sub(cfg.init_memory.map_or(components.memory_size, |m| {
m.checked_mul(1024 * 1024).unwrap_or(u64::MAX)
}))
.context("failed to calculate init balloon size")?;
let tsc_state = devices::tsc::tsc_state().exit_code(Exit::TscCalibrationFailed)?;
let tsc_sync_mitigations = get_tsc_sync_mitigations(&tsc_state, components.vcpu_count);
if tsc_state.core_grouping.size() > 1 {
// Host TSCs are not in sync, log a metric about it.
warn!(
"Host TSCs are not in sync, applying the following mitigations: {:?}",
tsc_sync_mitigations
);
log_descriptor(
MetricEventType::TscCoresOutOfSync,
// casting u64 as i64 is a no-op, so we don't lose any part of the bitmask
tsc_state.core_grouping.core_grouping_bitmask() as i64,
);
}
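// Take the GPU control tube out of the GPU VMM config so the main control loop can service GPU
// control requests.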
#[cfg(feature = "gpu")]
let gpu_control_tube = cfg
.gpu_vmm_config
.as_mut()
.and_then(|config| config.gpu_control_host_tube.take());
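// Gather product-specific arguments to pass through to run_control.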
let product_args = product::get_run_control_args(&mut cfg);
// We open these files before lowering the token, as in the future a stricter policy may
// prevent it.
let dt_overlays = cfg
.device_tree_overlay
.iter()
.map(|o| {
Ok(DtbOverlay {
file: open_file_or_duplicate(o.path.as_path(), OpenOptions::new().read(true))
.with_context(|| {
format!("failed to open device tree overlay {}", o.path.display())
})?,
})
})
.collect::<Result<Vec<DtbOverlay>>>()?;
// Lower the token, locking the main process down to a stricter security policy.
//
// WARNING:
//
// Windows system calls can behave in unusual ways if they happen concurrently with the token
// lowering. For example, access denied can happen if Tube pairs are created in another thread
// (b/281108137), and lower_token happens right before the client pipe is connected. Tubes are
// not privileged resources, but can be broken due to the token changing unexpectedly.
//
// We explicitly lower the token here and *then* call run_control to make it clear that any
// resources that require a privileged token should be created on the main thread & passed into
// run_control, to follow the correct order:
// - Privileged resources are created.
// - Token is lowered.
// - Threads are spawned & may create more non-privileged resources (without fear of the token
// changing at an undefined time).
//
// Recommendation: If you find your code doesn't work in run_control because of the sandbox, you
// should split any resource creation to before this token lowering & pass the resources into
// run_control. Don't move the token lowering somewhere else without considering multi-threaded
// effects.
#[cfg(feature = "sandbox")]
if sandbox::is_sandbox_target() {
sandbox::TargetServices::get()
.exit_code_from_err("failed to create sandbox")?
.expect("Could not create sandbox!")
.lower_token();
}
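// Create the virtio-snd control tubes (device state and mute control) before creating devices.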
let virtio_snd_state_device_tube = create_snd_state_tube(&mut control_tubes)?;
let (virtio_snd_host_mute_tube, virtio_snd_device_mute_tube) = create_snd_mute_tube_pair()?;
let pci_devices = create_devices(
&mut cfg,
vm.get_memory(),
&vm_evt_wrtube,
&mut irq_control_tubes,
&mut vm_memory_control_tubes,
&mut control_tubes,
&mut disk_device_tubes,
balloon_device_tube,
pvclock_device_tube,
dynamic_mapping_device_tube,
/* inflate_tube= */ None,
init_balloon_size,
tsc_state.frequency,
virtio_snd_state_device_tube,
virtio_snd_device_mute_tube,
)?;
let mut vcpu_ids = Vec::new();
let windows = Arch::build_vm::<V, Vcpu>(
components,
&vm_evt_wrtube,
&mut sys_allocator,
&cfg.serial_parameters,
None,
(cfg.battery_config.as_ref().map(|t| t.type_), None),
vm,
ramoops_region,
pci_devices,
irq_chip,
&mut vcpu_ids,
cfg.dump_device_tree_blob.clone(),
/* debugcon_jail= */ None,
None,
None,
dt_overlays,
)
.exit_context(Exit::BuildVm, "the architecture failed to build the vm")?;
#[cfg(feature = "stats")]
let stats = if cfg.exit_stats {
Some(Arc::new(Mutex::new(StatisticsCollector::new())))
} else {
None
};
run_control(
windows,
sys_allocator,
control_tubes,
irq_control_tubes,
vm_memory_control_tubes,
vm_evt_rdtube,
vm_evt_wrtube,
#[cfg(feature = "gpu")]
gpu_control_tube,
cfg.broker_shutdown_event.take(),
balloon_host_tube,
pvclock_host_tube,
disk_host_tubes,
gralloc,
#[cfg(feature = "stats")]
stats,
cfg.service_pipe_name,
vm_memory_size_mb,
cfg.host_cpu_topology,
tsc_sync_mitigations,
cfg.force_calibrated_tsc_leaf,
product_args,
virtio_snd_host_mute_tube,
cfg.restore_path,
cfg.socket_path,
cfg.force_s2idle,
cfg.suspended,
)
}
#[cfg(test)]
mod tests {
use tempfile::TempDir;
use super::*;
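// Builds a default `Config` whose kernel path points at a dummy file created under `test_dir`.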
fn create_config(test_dir: &TempDir) -> Config {
let mut config = Config::default();
let dummy_kernel_path = test_dir.path().join("dummy_kernel.txt");
OpenOptions::new()
.create(true)
.write(true)
.open(&dummy_kernel_path)
.expect("Could not open file!");
config.executable_path = Some(Executable::Kernel(dummy_kernel_path));
config
}
#[test]
#[should_panic(expected = "Did not receive a bios or kernel")]
fn setup_vm_components_panics_when_no_kernel_provided() {
let mut config =
create_config(&TempDir::new().expect("Could not create temporary directory!"));
config.executable_path = None;
let _ = setup_vm_components(&config);
}
#[test]
fn setup_vm_components_stores_memory_in_bytes() {
let tempdir = TempDir::new().expect("Could not create temporary directory!");
let mut config = create_config(&tempdir);
config.memory = Some(1);
let vm_components = setup_vm_components(&config).expect("failed to setup vm components");
assert_eq!(vm_components.memory_size, 1024 * 1024);
}
#[test]
fn setup_vm_components_fails_when_memory_too_large() {
let tempdir = TempDir::new().expect("Could not create temporary directory!");
let mut config = create_config(&tempdir);
// One MiB more than the largest memory size (in MiB) that can be expressed in bytes as a u64.
config.memory = Some((u64::MAX / 1024 / 1024) + 1);
setup_vm_components(&config).err().expect("expected error");
}
}