| // Copyright 2018 The Chromium OS Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| //! Virtual machine architecture support code. |
| |
| pub mod android; |
| pub mod fdt; |
| pub mod pstore; |
| pub mod serial; |
| |
| pub mod sys; |
| |
| use std::collections::BTreeMap; |
| use std::error::Error as StdError; |
| use std::fs::File; |
| use std::io; |
| use std::io::Read; |
| use std::io::Seek; |
| use std::io::SeekFrom; |
| use std::path::PathBuf; |
| use std::sync::mpsc; |
| use std::sync::mpsc::SendError; |
| use std::sync::Arc; |
| |
| use acpi_tables::sdt::SDT; |
| use base::syslog; |
| use base::AsRawDescriptor; |
| use base::AsRawDescriptors; |
| use base::Event; |
| use base::SendTube; |
| #[cfg(feature = "gdb")] |
| use base::Tube; |
| use devices::virtio::VirtioDevice; |
| use devices::BarRange; |
| use devices::Bus; |
| use devices::BusDevice; |
| use devices::BusDeviceObj; |
| use devices::BusError; |
| use devices::BusResumeDevice; |
| use devices::HotPlugBus; |
| use devices::IrqChip; |
| #[cfg(any(target_arch = "arm", target_arch = "aarch64"))] |
| use devices::IrqChipAArch64 as IrqChipArch; |
| #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] |
| use devices::IrqChipX86_64 as IrqChipArch; |
| use devices::IrqEventSource; |
| #[cfg(windows)] |
| use devices::Minijail; |
| use devices::PciAddress; |
| use devices::PciBus; |
| use devices::PciDevice; |
| use devices::PciDeviceError; |
| use devices::PciInterruptPin; |
| use devices::PciRoot; |
| use devices::PciRootCommand; |
| use devices::PreferredIrq; |
| #[cfg(unix)] |
| use devices::ProxyDevice; |
| use devices::SerialHardware; |
| use devices::SerialParameters; |
| use devices::VirtioMmioDevice; |
| #[cfg(all(target_arch = "x86_64", feature = "gdb"))] |
| use gdbstub::arch::Arch; |
| #[cfg(all(target_arch = "x86_64", feature = "gdb"))] |
| use gdbstub_arch::x86::X86_64_SSE as GdbArch; |
| #[cfg(any(target_arch = "arm", target_arch = "aarch64"))] |
| use hypervisor::CpuConfigAArch64 as CpuConfigArch; |
| #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] |
| use hypervisor::CpuConfigX86_64 as CpuConfigArch; |
| #[cfg(any(target_arch = "arm", target_arch = "aarch64"))] |
| use hypervisor::Hypervisor as HypervisorArch; |
| #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] |
| use hypervisor::HypervisorX86_64 as HypervisorArch; |
| use hypervisor::IoEventAddress; |
| #[cfg(any(target_arch = "arm", target_arch = "aarch64"))] |
| use hypervisor::VcpuAArch64 as VcpuArch; |
| #[cfg(any(target_arch = "arm", target_arch = "aarch64"))] |
| use hypervisor::VcpuInitAArch64 as VcpuInitArch; |
| #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] |
| use hypervisor::VcpuInitX86_64 as VcpuInitArch; |
| #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] |
| use hypervisor::VcpuX86_64 as VcpuArch; |
| use hypervisor::Vm; |
| #[cfg(any(target_arch = "arm", target_arch = "aarch64"))] |
| use hypervisor::VmAArch64 as VmArch; |
| #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] |
| use hypervisor::VmX86_64 as VmArch; |
| #[cfg(unix)] |
| use minijail::Minijail; |
| use remain::sorted; |
| #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] |
| use resources::AddressRange; |
| use resources::SystemAllocator; |
| use resources::SystemAllocatorConfig; |
| use serde::Deserialize; |
| use serde::Serialize; |
| pub use serial::add_serial_devices; |
| pub use serial::get_serial_cmdline; |
| pub use serial::set_default_serial_parameters; |
| pub use serial::GetSerialCmdlineError; |
| pub use serial::SERIAL_ADDR; |
| use sync::Mutex; |
| use thiserror::Error; |
| use vm_control::BatControl; |
| use vm_control::BatteryType; |
| use vm_control::PmResource; |
| use vm_memory::GuestAddress; |
| use vm_memory::GuestMemory; |
| use vm_memory::GuestMemoryError; |
| |
/// The image a VM boots from: either a kernel or a BIOS, each held as an open file.
pub enum VmImage {
    /// Boot from a kernel image.
    Kernel(File),
    /// Boot from a BIOS image.
    Bios(File),
}
| |
| #[derive(Clone, Deserialize, Serialize)] |
| pub struct Pstore { |
| pub path: PathBuf, |
| pub size: u32, |
| } |
| |
| /// Mapping of guest VCPU threads to host CPU cores. |
| #[derive(Clone, Debug, Deserialize, PartialEq, Serialize)] |
| pub enum VcpuAffinity { |
| /// All VCPU threads will be pinned to the same set of host CPU cores. |
| Global(Vec<usize>), |
| /// Each VCPU may be pinned to a set of host CPU cores. |
| /// The map key is a guest VCPU index, and the corresponding value is the set of |
| /// host CPU indices that the VCPU thread will be allowed to run on. |
| /// If a VCPU index is not present in the map, its affinity will not be set. |
| PerVcpu(BTreeMap<usize, Vec<usize>>), |
| } |
| |
| /// Holds the pieces needed to build a VM. Passed to `build_vm` in the `LinuxArch` trait below to |
| /// create a `RunnableLinuxVm`. |
| #[sorted] |
| pub struct VmComponents { |
| pub acpi_sdts: Vec<SDT>, |
| pub android_fstab: Option<File>, |
| pub cpu_capacity: BTreeMap<usize, u32>, |
| pub cpu_clusters: Vec<Vec<usize>>, |
| pub delay_rt: bool, |
| #[cfg(feature = "direct")] |
| pub direct_fixed_evts: Vec<devices::ACPIPMFixedEvent>, |
| #[cfg(feature = "direct")] |
| pub direct_gpe: Vec<u32>, |
| pub dmi_path: Option<PathBuf>, |
| pub extra_kernel_params: Vec<String>, |
| #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] |
| pub force_s2idle: bool, |
| #[cfg(all(target_arch = "x86_64", feature = "gdb"))] |
| pub gdb: Option<(u32, Tube)>, // port and control tube. |
| pub host_cpu_topology: bool, |
| pub hugepages: bool, |
| pub hv_cfg: hypervisor::Config, |
| pub initrd_image: Option<File>, |
| pub itmt: bool, |
| pub memory_size: u64, |
| pub no_i8042: bool, |
| pub no_rtc: bool, |
| pub no_smt: bool, |
| #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] |
| pub oem_strings: Vec<String>, |
| #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] |
| pub pci_low_start: Option<u64>, |
| #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] |
| pub pcie_ecam: Option<AddressRange>, |
| pub pflash_block_size: u32, |
| pub pflash_image: Option<File>, |
| pub pstore: Option<Pstore>, |
| /// A file to load as pVM firmware. Must be `Some` iff |
| /// `hv_cfg.protection_type == ProtectionType::UnprotectedWithFirmware`. |
| pub pvm_fw: Option<File>, |
| pub rt_cpus: Vec<usize>, |
| pub swiotlb: Option<u64>, |
| pub vcpu_affinity: Option<VcpuAffinity>, |
| pub vcpu_count: usize, |
| pub vm_image: VmImage, |
| } |
| |
| /// Holds the elements needed to run a Linux VM. Created by `build_vm`. |
| #[sorted] |
| pub struct RunnableLinuxVm<V: VmArch, Vcpu: VcpuArch> { |
| pub bat_control: Option<BatControl>, |
| pub delay_rt: bool, |
| #[cfg(all(target_arch = "x86_64", feature = "gdb"))] |
| pub gdb: Option<(u32, Tube)>, |
| pub has_bios: bool, |
| pub hotplug_bus: BTreeMap<u8, Arc<Mutex<dyn HotPlugBus>>>, |
| pub io_bus: Arc<Bus>, |
| pub irq_chip: Box<dyn IrqChipArch>, |
| pub mmio_bus: Arc<Bus>, |
| pub no_smt: bool, |
| pub pid_debug_label_map: BTreeMap<u32, String>, |
| #[cfg(unix)] |
| pub platform_devices: Vec<Arc<Mutex<dyn BusDevice>>>, |
| pub pm: Option<Arc<Mutex<dyn PmResource>>>, |
| /// Devices to be notified before the system resumes from the S3 suspended state. |
| pub resume_notify_devices: Vec<Arc<Mutex<dyn BusResumeDevice>>>, |
| pub root_config: Arc<Mutex<PciRoot>>, |
| pub rt_cpus: Vec<usize>, |
| pub suspend_evt: Event, |
| pub vcpu_affinity: Option<VcpuAffinity>, |
| pub vcpu_count: usize, |
| pub vcpu_init: Vec<VcpuInitArch>, |
| /// If vcpus is None, then it's the responsibility of the vcpu thread to create vcpus. |
| /// If it's Some, then `build_vm` already created the vcpus. |
| pub vcpus: Option<Vec<Vcpu>>, |
| pub vm: V, |
| } |
| |
| /// The device and optional jail. |
| pub struct VirtioDeviceStub { |
| pub dev: Box<dyn VirtioDevice>, |
| pub jail: Option<Minijail>, |
| } |
| |
| /// Trait which is implemented for each Linux Architecture in order to |
| /// set up the memory, cpus, and system devices and to boot the kernel. |
| pub trait LinuxArch { |
| type Error: StdError; |
| |
| /// Returns a Vec of the valid memory addresses as pairs of address and length. These should be |
| /// used to configure the `GuestMemory` structure for the platform. |
| /// |
| /// # Arguments |
| /// |
| /// * `components` - Parts used to determine the memory layout. |
| fn guest_memory_layout( |
| components: &VmComponents, |
| ) -> std::result::Result<Vec<(GuestAddress, u64)>, Self::Error>; |
| |
| /// Gets the configuration for a new `SystemAllocator` that fits the given `Vm`'s memory layout. |
| /// |
| /// This is the per-architecture template for constructing the `SystemAllocator`. Platform |
| /// agnostic modifications may be made to this configuration, but the final `SystemAllocator` |
| /// will be at least as strict as this configuration. |
| /// |
| /// # Arguments |
| /// |
| /// * `vm` - The virtual machine to be used as a template for the `SystemAllocator`. |
| fn get_system_allocator_config<V: Vm>(vm: &V) -> SystemAllocatorConfig; |
| |
| /// Takes `VmComponents` and generates a `RunnableLinuxVm`. |
| /// |
| /// # Arguments |
| /// |
| /// * `components` - Parts to use to build the VM. |
| /// * `vm_evt_wrtube` - Tube used by sub-devices to request that crosvm exit because guest |
| /// wants to stop/shut down or requested reset. |
| /// * `system_allocator` - Allocator created by this trait's implementation of |
| /// `get_system_allocator_config`. |
| /// * `serial_parameters` - Definitions for how the serial devices should be configured. |
| /// * `serial_jail` - Jail used for serial devices created here. |
| /// * `battery` - Defines what battery device will be created. |
| /// * `vm` - A VM implementation to build upon. |
| /// * `ramoops_region` - Region allocated for ramoops. |
| /// * `devices` - The devices to be built into the VM. |
| /// * `irq_chip` - The IRQ chip implemention for the VM. |
| /// * `debugcon_jail` - Jail used for debugcon devices created here. |
| /// * `pflash_jail` - Jail used for pflash device created here. |
| fn build_vm<V, Vcpu>( |
| components: VmComponents, |
| vm_evt_wrtube: &SendTube, |
| system_allocator: &mut SystemAllocator, |
| serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>, |
| serial_jail: Option<Minijail>, |
| battery: (Option<BatteryType>, Option<Minijail>), |
| vm: V, |
| ramoops_region: Option<pstore::RamoopsRegion>, |
| devices: Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>, |
| irq_chip: &mut dyn IrqChipArch, |
| vcpu_ids: &mut Vec<usize>, |
| debugcon_jail: Option<Minijail>, |
| #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pflash_jail: Option<Minijail>, |
| ) -> std::result::Result<RunnableLinuxVm<V, Vcpu>, Self::Error> |
| where |
| V: VmArch, |
| Vcpu: VcpuArch; |
| |
| /// Configures the vcpu and should be called once per vcpu from the vcpu's thread. |
| /// |
| /// # Arguments |
| /// |
| /// * `vm` - The virtual machine object. |
| /// * `hypervisor` - The `Hypervisor` that created the vcpu. |
| /// * `irq_chip` - The `IrqChip` associated with this vm. |
| /// * `vcpu` - The VCPU object to configure. |
| /// * `vcpu_init` - The data required to initialize VCPU registers and other state. |
| /// * `vcpu_id` - The id of the given `vcpu`. |
| /// * `num_cpus` - Number of virtual CPUs the guest will have. |
| /// * `has_bios` - Whether the `VmImage` is a `Bios` image |
| /// * `cpu_config` - CPU feature configurations. |
| fn configure_vcpu<V: Vm>( |
| vm: &V, |
| hypervisor: &dyn HypervisorArch, |
| irq_chip: &mut dyn IrqChipArch, |
| vcpu: &mut dyn VcpuArch, |
| vcpu_init: VcpuInitArch, |
| vcpu_id: usize, |
| num_cpus: usize, |
| has_bios: bool, |
| cpu_config: Option<CpuConfigArch>, |
| ) -> Result<(), Self::Error>; |
| |
| /// Configures and add a pci device into vm |
| fn register_pci_device<V: VmArch, Vcpu: VcpuArch>( |
| linux: &mut RunnableLinuxVm<V, Vcpu>, |
| device: Box<dyn PciDevice>, |
| #[cfg(unix)] minijail: Option<Minijail>, |
| resources: &mut SystemAllocator, |
| hp_control_tube: &mpsc::Sender<PciRootCommand>, |
| ) -> Result<PciAddress, Self::Error>; |
| } |
| |
/// Operations a GDB stub needs from a vCPU of type `T`.
///
/// Only available on x86_64 builds with the `gdb` feature enabled.
#[cfg(all(target_arch = "x86_64", feature = "gdb"))]
pub trait GdbOps<T: VcpuArch> {
    /// Error type returned by the debugging operations.
    type Error: StdError;

    /// Fetches the registers of the given vCPU.
    fn read_registers(vcpu: &T) -> Result<<GdbArch as Arch>::Registers, Self::Error>;

    /// Stores `regs` into the registers of the given vCPU.
    fn write_registers(vcpu: &T, regs: &<GdbArch as Arch>::Registers) -> Result<(), Self::Error>;

    /// Reads `len` bytes of guest memory starting at `vaddr`.
    fn read_memory(
        vcpu: &T,
        guest_mem: &GuestMemory,
        vaddr: GuestAddress,
        len: usize,
    ) -> Result<Vec<u8>, Self::Error>;

    /// Writes the bytes of `buf` to guest memory starting at `vaddr`.
    fn write_memory(
        vcpu: &T,
        guest_mem: &GuestMemory,
        vaddr: GuestAddress,
        buf: &[u8],
    ) -> Result<(), Self::Error>;

    /// Makes the next run of the vCPU a single step.
    fn enable_singlestep(vcpu: &T) -> Result<(), Self::Error>;

    /// Returns the maximum number of hardware breakpoints.
    fn get_max_hw_breakpoints(vcpu: &T) -> Result<usize, Self::Error>;

    /// Installs hardware breakpoints at the given addresses.
    fn set_hw_breakpoints(vcpu: &T, breakpoints: &[GuestAddress]) -> Result<(), Self::Error>;
}
| |
| /// Errors for device manager. |
| #[sorted] |
| #[derive(Error, Debug)] |
| pub enum DeviceRegistrationError { |
| /// No more MMIO space available. |
| #[error("no more addresses are available")] |
| AddrsExhausted, |
| /// Could not allocate device address space for the device. |
| #[error("Allocating device addresses: {0}")] |
| AllocateDeviceAddrs(PciDeviceError), |
| /// Could not allocate IO space for the device. |
| #[error("Allocating IO addresses: {0}")] |
| AllocateIoAddrs(PciDeviceError), |
| /// Could not allocate MMIO or IO resource for the device. |
| #[error("Allocating IO resource: {0}")] |
| AllocateIoResource(resources::Error), |
| /// Could not allocate an IRQ number. |
| #[error("Allocating IRQ number")] |
| AllocateIrq, |
| /// Could not allocate IRQ resource for the device. |
| #[cfg(unix)] |
| #[error("Allocating IRQ resource: {0}")] |
| AllocateIrqResource(devices::vfio::VfioError), |
| /// Broken pci topology |
| #[error("pci topology is broken")] |
| BrokenPciTopology, |
| /// Unable to clone a jail for the device. |
| #[cfg(unix)] |
| #[error("failed to clone jail: {0}")] |
| CloneJail(minijail::Error), |
| /// Appending to kernel command line failed. |
| #[error("unable to add device to kernel command line: {0}")] |
| Cmdline(kernel_cmdline::Error), |
| /// Configure window size failed. |
| #[error("failed to configure window size: {0}")] |
| ConfigureWindowSize(PciDeviceError), |
| // Unable to create a pipe. |
| #[error("failed to create pipe: {0}")] |
| CreatePipe(base::Error), |
| // Unable to create serial device from serial parameters |
| #[error("failed to create serial device: {0}")] |
| CreateSerialDevice(devices::SerialError), |
| // Unable to create tube |
| #[error("failed to create tube: {0}")] |
| CreateTube(base::TubeError), |
| /// Could not clone an event. |
| #[error("failed to clone event: {0}")] |
| EventClone(base::Error), |
| /// Could not create an event. |
| #[error("failed to create event: {0}")] |
| EventCreate(base::Error), |
| /// Failed to generate ACPI content. |
| #[error("failed to generate ACPI content")] |
| GenerateAcpi, |
| /// No more IRQs are available. |
| #[error("no more IRQs are available")] |
| IrqsExhausted, |
| /// Missing a required serial device. |
| #[error("missing required serial device {0}")] |
| MissingRequiredSerialDevice(u8), |
| /// Could not add a device to the mmio bus. |
| #[error("failed to add to mmio bus: {0}")] |
| MmioInsert(BusError), |
| #[cfg(unix)] |
| /// Failed to initialize proxy device for jailed device. |
| #[error("failed to create proxy device: {0}")] |
| ProxyDeviceCreation(devices::ProxyError), |
| #[cfg(unix)] |
| /// Failed to register battery device. |
| #[error("failed to register battery device to VM: {0}")] |
| RegisterBattery(devices::BatteryError), |
| /// Could not register PCI device to pci root bus |
| #[error("failed to register PCI device to pci root bus")] |
| RegisterDevice(SendError<PciRootCommand>), |
| /// Could not register PCI device capabilities. |
| #[error("could not register PCI device capabilities: {0}")] |
| RegisterDeviceCapabilities(PciDeviceError), |
| /// Failed to register ioevent with VM. |
| #[error("failed to register ioevent to VM: {0}")] |
| RegisterIoevent(base::Error), |
| /// Failed to register irq event with VM. |
| #[error("failed to register irq event to VM: {0}")] |
| RegisterIrqfd(base::Error), |
| /// Could not setup VFIO platform IRQ for the device. |
| #[error("Setting up VFIO platform IRQ: {0}")] |
| SetupVfioPlatformIrq(anyhow::Error), |
| } |
| |
| /// Config a PCI device for used by this vm. |
| pub fn configure_pci_device<V: VmArch, Vcpu: VcpuArch>( |
| linux: &mut RunnableLinuxVm<V, Vcpu>, |
| mut device: Box<dyn PciDevice>, |
| #[cfg(unix)] jail: Option<Minijail>, |
| resources: &mut SystemAllocator, |
| hp_control_tube: &mpsc::Sender<PciRootCommand>, |
| ) -> Result<PciAddress, DeviceRegistrationError> { |
| // Allocate PCI device address before allocating BARs. |
| let pci_address = device |
| .allocate_address(resources) |
| .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?; |
| |
| // Allocate ranges that may need to be in the low MMIO region (MmioType::Low). |
| let mmio_ranges = device |
| .allocate_io_bars(resources) |
| .map_err(DeviceRegistrationError::AllocateIoAddrs)?; |
| |
| // Allocate device ranges that may be in low or high MMIO after low-only ranges. |
| let device_ranges = device |
| .allocate_device_bars(resources) |
| .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?; |
| |
| // If device is a pcie bridge, add its pci bus to pci root |
| if let Some(pci_bus) = device.get_new_pci_bus() { |
| hp_control_tube |
| .send(PciRootCommand::AddBridge(pci_bus)) |
| .map_err(DeviceRegistrationError::RegisterDevice)?; |
| let bar_ranges = Vec::new(); |
| device |
| .configure_bridge_window(resources, &bar_ranges) |
| .map_err(DeviceRegistrationError::ConfigureWindowSize)?; |
| } |
| |
| // Do not suggest INTx for hot-plug devices. |
| let intx_event = devices::IrqLevelEvent::new().map_err(DeviceRegistrationError::EventCreate)?; |
| |
| if let PreferredIrq::Fixed { pin, gsi } = device.preferred_irq() { |
| resources.reserve_irq(gsi); |
| |
| device.assign_irq( |
| intx_event |
| .try_clone() |
| .map_err(DeviceRegistrationError::EventClone)?, |
| pin, |
| gsi, |
| ); |
| |
| linux |
| .irq_chip |
| .as_irq_chip_mut() |
| .register_level_irq_event(gsi, &intx_event, IrqEventSource::from_device(&device)) |
| .map_err(DeviceRegistrationError::RegisterIrqfd)?; |
| } |
| |
| let mut keep_rds = device.keep_rds(); |
| syslog::push_descriptors(&mut keep_rds); |
| |
| device |
| .register_device_capabilities() |
| .map_err(DeviceRegistrationError::RegisterDeviceCapabilities)?; |
| for (event, addr, datamatch) in device.ioevents() { |
| let io_addr = IoEventAddress::Mmio(addr); |
| linux |
| .vm |
| .register_ioevent(event, io_addr, datamatch) |
| .map_err(DeviceRegistrationError::RegisterIoevent)?; |
| keep_rds.push(event.as_raw_descriptor()); |
| } |
| |
| #[cfg(unix)] |
| let arced_dev: Arc<Mutex<dyn BusDevice>> = if let Some(jail) = jail { |
| let proxy = ProxyDevice::new(device, jail, keep_rds) |
| .map_err(DeviceRegistrationError::ProxyDeviceCreation)?; |
| linux |
| .pid_debug_label_map |
| .insert(proxy.pid() as u32, proxy.debug_label()); |
| Arc::new(Mutex::new(proxy)) |
| } else { |
| device.on_sandboxed(); |
| Arc::new(Mutex::new(device)) |
| }; |
| |
| #[cfg(windows)] |
| let arced_dev = { |
| device.on_sandboxed(); |
| Arc::new(Mutex::new(device)) |
| }; |
| |
| #[cfg(unix)] |
| hp_control_tube |
| .send(PciRootCommand::Add(pci_address, arced_dev.clone())) |
| .map_err(DeviceRegistrationError::RegisterDevice)?; |
| |
| for range in &mmio_ranges { |
| linux |
| .mmio_bus |
| .insert(arced_dev.clone(), range.addr, range.size) |
| .map_err(DeviceRegistrationError::MmioInsert)?; |
| } |
| |
| for range in &device_ranges { |
| linux |
| .mmio_bus |
| .insert(arced_dev.clone(), range.addr, range.size) |
| .map_err(DeviceRegistrationError::MmioInsert)?; |
| } |
| |
| Ok(pci_address) |
| } |
| |
| /// Creates a Virtio MMIO devices for use by this Vm. |
| pub fn generate_virtio_mmio_bus( |
| devices: Vec<(VirtioMmioDevice, Option<Minijail>)>, |
| irq_chip: &mut dyn IrqChip, |
| mmio_bus: &Bus, |
| resources: &mut SystemAllocator, |
| vm: &mut impl Vm, |
| mut sdts: Vec<SDT>, |
| ) -> Result<(BTreeMap<u32, String>, Vec<SDT>), DeviceRegistrationError> { |
| let mut pid_labels = BTreeMap::new(); |
| |
| for dev_value in devices.into_iter() { |
| #[cfg(unix)] |
| let (mut device, jail) = dev_value; |
| #[cfg(windows)] |
| let (mut device, _) = dev_value; |
| |
| let ranges = device |
| .allocate_regions(resources) |
| .map_err(DeviceRegistrationError::AllocateIoResource)?; |
| |
| let mut keep_rds = device.keep_rds(); |
| syslog::push_descriptors(&mut keep_rds); |
| |
| let irq_num = resources |
| .allocate_irq() |
| .ok_or(DeviceRegistrationError::AllocateIrq)?; |
| let irq_evt = devices::IrqEdgeEvent::new().map_err(DeviceRegistrationError::EventCreate)?; |
| irq_chip |
| .register_edge_irq_event(irq_num, &irq_evt, IrqEventSource::from_device(&device)) |
| .map_err(DeviceRegistrationError::RegisterIrqfd)?; |
| device.assign_irq(&irq_evt, irq_num); |
| keep_rds.extend(irq_evt.as_raw_descriptors()); |
| |
| for (event, addr, datamatch) in device.ioevents() { |
| let io_addr = IoEventAddress::Mmio(addr); |
| vm.register_ioevent(event, io_addr, datamatch) |
| .map_err(DeviceRegistrationError::RegisterIoevent)?; |
| keep_rds.push(event.as_raw_descriptor()); |
| } |
| |
| #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] |
| { |
| sdts = device |
| .generate_acpi(sdts) |
| .ok_or(DeviceRegistrationError::GenerateAcpi)?; |
| } |
| |
| #[cfg(unix)] |
| let arced_dev: Arc<Mutex<dyn BusDevice>> = if let Some(jail) = jail { |
| let proxy = ProxyDevice::new(device, jail, keep_rds) |
| .map_err(DeviceRegistrationError::ProxyDeviceCreation)?; |
| pid_labels.insert(proxy.pid() as u32, proxy.debug_label()); |
| Arc::new(Mutex::new(proxy)) |
| } else { |
| device.on_sandboxed(); |
| Arc::new(Mutex::new(device)) |
| }; |
| |
| #[cfg(windows)] |
| let arced_dev = { |
| device.on_sandboxed(); |
| Arc::new(Mutex::new(device)) |
| }; |
| |
| for range in &ranges { |
| mmio_bus |
| .insert(arced_dev.clone(), range.0, range.1) |
| .map_err(DeviceRegistrationError::MmioInsert)?; |
| } |
| } |
| Ok((pid_labels, sdts)) |
| } |
| |
| // Generate pci topology starting from parent bus |
| pub fn generate_pci_topology( |
| parent_bus: Arc<Mutex<PciBus>>, |
| resources: &mut SystemAllocator, |
| io_ranges: &mut BTreeMap<usize, Vec<BarRange>>, |
| device_ranges: &mut BTreeMap<usize, Vec<BarRange>>, |
| device_addrs: &[PciAddress], |
| devices: &mut Vec<(Box<dyn PciDevice>, Option<Minijail>)>, |
| ) -> Result<(Vec<BarRange>, u8), DeviceRegistrationError> { |
| let mut bar_ranges = Vec::new(); |
| let bus_num = parent_bus.lock().get_bus_num(); |
| let mut subordinate_bus = bus_num; |
| for (dev_idx, addr) in device_addrs.iter().enumerate() { |
| // Only target for devices that located on this bus |
| if addr.bus == bus_num { |
| // If this device is a pci bridge (a.k.a., it has a pci bus structure), |
| // create its topology recursively |
| if let Some(child_bus) = devices[dev_idx].0.get_new_pci_bus() { |
| let (child_bar_ranges, child_sub_bus) = generate_pci_topology( |
| child_bus.clone(), |
| resources, |
| io_ranges, |
| device_ranges, |
| device_addrs, |
| devices, |
| )?; |
| let device = &mut devices[dev_idx].0; |
| parent_bus |
| .lock() |
| .add_child_bus(child_bus.clone()) |
| .map_err(|_| DeviceRegistrationError::BrokenPciTopology)?; |
| let bridge_window = device |
| .configure_bridge_window(resources, &child_bar_ranges) |
| .map_err(DeviceRegistrationError::ConfigureWindowSize)?; |
| bar_ranges.extend(bridge_window); |
| |
| let ranges = device |
| .allocate_io_bars(resources) |
| .map_err(DeviceRegistrationError::AllocateIoAddrs)?; |
| io_ranges.insert(dev_idx, ranges.clone()); |
| bar_ranges.extend(ranges); |
| |
| let ranges = device |
| .allocate_device_bars(resources) |
| .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?; |
| device_ranges.insert(dev_idx, ranges.clone()); |
| bar_ranges.extend(ranges); |
| |
| device.set_subordinate_bus(child_sub_bus); |
| |
| subordinate_bus = std::cmp::max(subordinate_bus, child_sub_bus); |
| } |
| } |
| } |
| |
| for (dev_idx, addr) in device_addrs.iter().enumerate() { |
| if addr.bus == bus_num { |
| let device = &mut devices[dev_idx].0; |
| // Allocate MMIO for non-bridge devices |
| if device.get_new_pci_bus().is_none() { |
| let ranges = device |
| .allocate_io_bars(resources) |
| .map_err(DeviceRegistrationError::AllocateIoAddrs)?; |
| io_ranges.insert(dev_idx, ranges.clone()); |
| bar_ranges.extend(ranges); |
| |
| let ranges = device |
| .allocate_device_bars(resources) |
| .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?; |
| device_ranges.insert(dev_idx, ranges.clone()); |
| bar_ranges.extend(ranges); |
| } |
| } |
| } |
| Ok((bar_ranges, subordinate_bus)) |
| } |
| |
| /// Ensure all PCI devices have an assigned PCI address. |
| pub fn assign_pci_addresses( |
| devices: &mut [(Box<dyn BusDeviceObj>, Option<Minijail>)], |
| resources: &mut SystemAllocator, |
| ) -> Result<(), DeviceRegistrationError> { |
| // First allocate devices with a preferred address. |
| for pci_device in devices |
| .iter_mut() |
| .filter_map(|(device, _jail)| device.as_pci_device_mut()) |
| .filter(|pci_device| pci_device.preferred_address().is_some()) |
| { |
| let _ = pci_device |
| .allocate_address(resources) |
| .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?; |
| } |
| |
| // Then allocate addresses for the remaining devices. |
| for pci_device in devices |
| .iter_mut() |
| .filter_map(|(device, _jail)| device.as_pci_device_mut()) |
| .filter(|pci_device| pci_device.preferred_address().is_none()) |
| { |
| let _ = pci_device |
| .allocate_address(resources) |
| .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?; |
| } |
| |
| Ok(()) |
| } |
| |
| /// Creates a root PCI device for use by this Vm. |
| pub fn generate_pci_root( |
| mut devices: Vec<(Box<dyn PciDevice>, Option<Minijail>)>, |
| irq_chip: &mut dyn IrqChip, |
| mmio_bus: Arc<Bus>, |
| io_bus: Arc<Bus>, |
| resources: &mut SystemAllocator, |
| vm: &mut impl Vm, |
| max_irqs: usize, |
| vcfg_base: Option<u64>, |
| ) -> Result< |
| ( |
| PciRoot, |
| Vec<(PciAddress, u32, PciInterruptPin)>, |
| BTreeMap<u32, String>, |
| BTreeMap<PciAddress, Vec<u8>>, |
| ), |
| DeviceRegistrationError, |
| > { |
| let mut device_addrs = Vec::new(); |
| |
| for (device, _jail) in devices.iter_mut() { |
| let address = device |
| .allocate_address(resources) |
| .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?; |
| device_addrs.push(address); |
| } |
| |
| let mut device_ranges = BTreeMap::new(); |
| let mut io_ranges = BTreeMap::new(); |
| let root_bus = Arc::new(Mutex::new(PciBus::new(0, 0, false))); |
| |
| generate_pci_topology( |
| root_bus.clone(), |
| resources, |
| &mut io_ranges, |
| &mut device_ranges, |
| &device_addrs, |
| &mut devices, |
| )?; |
| |
| let mut root = PciRoot::new(Arc::downgrade(&mmio_bus), Arc::downgrade(&io_bus), root_bus); |
| #[cfg_attr(windows, allow(unused_mut))] |
| let mut pid_labels = BTreeMap::new(); |
| |
| // Allocate legacy INTx |
| let mut pci_irqs = Vec::new(); |
| let mut irqs: Vec<u32> = Vec::new(); |
| |
| // Mapping of (bus, dev, pin) -> IRQ number. |
| let mut dev_pin_irq = BTreeMap::new(); |
| |
| for (dev_idx, (device, _jail)) in devices.iter_mut().enumerate() { |
| let pci_address = device_addrs[dev_idx]; |
| |
| let irq = match device.preferred_irq() { |
| PreferredIrq::Fixed { pin, gsi } => { |
| // The device reported a preferred IRQ, so use that rather than allocating one. |
| resources.reserve_irq(gsi); |
| Some((pin, gsi)) |
| } |
| PreferredIrq::Any => { |
| // The device did not provide a preferred IRQ but requested one, so allocate one. |
| |
| // Choose a pin based on the slot's function number. Function 0 must always use |
| // INTA# for single-function devices per the PCI spec, and we choose to use INTA# |
| // for function 0 on multifunction devices and distribute the remaining functions |
| // evenly across the other pins. |
| let pin = match pci_address.func % 4 { |
| 0 => PciInterruptPin::IntA, |
| 1 => PciInterruptPin::IntB, |
| 2 => PciInterruptPin::IntC, |
| _ => PciInterruptPin::IntD, |
| }; |
| |
| // If an IRQ number has already been assigned for a different function with this |
| // (bus, device, pin) combination, use it. Otherwise allocate a new one and insert |
| // it into the map. |
| let pin_key = (pci_address.bus, pci_address.dev, pin); |
| let irq_num = if let Some(irq_num) = dev_pin_irq.get(&pin_key) { |
| *irq_num |
| } else { |
| // If we have allocated fewer than `max_irqs` total, add a new irq to the `irqs` |
| // pool. Otherwise, share one of the existing `irqs`. |
| let irq_num = if irqs.len() < max_irqs { |
| let irq_num = resources |
| .allocate_irq() |
| .ok_or(DeviceRegistrationError::AllocateIrq)?; |
| irqs.push(irq_num); |
| irq_num |
| } else { |
| // Pick one of the existing IRQs to share, using `dev_idx` to distribute IRQ |
| // sharing evenly across devices. |
| irqs[dev_idx % max_irqs] |
| }; |
| |
| dev_pin_irq.insert(pin_key, irq_num); |
| irq_num |
| }; |
| Some((pin, irq_num)) |
| } |
| PreferredIrq::None => { |
| // The device does not want an INTx# IRQ. |
| None |
| } |
| }; |
| |
| if let Some((pin, gsi)) = irq { |
| let intx_event = |
| devices::IrqLevelEvent::new().map_err(DeviceRegistrationError::EventCreate)?; |
| |
| device.assign_irq( |
| intx_event |
| .try_clone() |
| .map_err(DeviceRegistrationError::EventClone)?, |
| pin, |
| gsi, |
| ); |
| |
| irq_chip |
| .register_level_irq_event(gsi, &intx_event, IrqEventSource::from_device(device)) |
| .map_err(DeviceRegistrationError::RegisterIrqfd)?; |
| |
| pci_irqs.push((pci_address, gsi, pin)); |
| } |
| } |
| |
| // To prevent issues where device's on_sandbox may spawn thread before all |
| // sandboxed devices are sandboxed we partition iterator to go over sandboxed |
| // first. This is needed on linux platforms. On windows, this is a no-op since |
| // jails are always None, even for sandboxed devices. |
| let devices = { |
| let (sandboxed, non_sandboxed): (Vec<_>, Vec<_>) = devices |
| .into_iter() |
| .enumerate() |
| .partition(|(_, (_, jail))| jail.is_some()); |
| sandboxed.into_iter().chain(non_sandboxed.into_iter()) |
| }; |
| let mut amls = BTreeMap::new(); |
| for (dev_idx, dev_value) in devices { |
| #[cfg(unix)] |
| let (mut device, jail) = dev_value; |
| #[cfg(windows)] |
| let (mut device, _) = dev_value; |
| let address = device_addrs[dev_idx]; |
| |
| let mut keep_rds = device.keep_rds(); |
| syslog::push_descriptors(&mut keep_rds); |
| keep_rds.append(&mut vm.get_memory().as_raw_descriptors()); |
| |
| let ranges = io_ranges.remove(&dev_idx).unwrap_or_default(); |
| let device_ranges = device_ranges.remove(&dev_idx).unwrap_or_default(); |
| device |
| .register_device_capabilities() |
| .map_err(DeviceRegistrationError::RegisterDeviceCapabilities)?; |
| for (event, addr, datamatch) in device.ioevents() { |
| let io_addr = IoEventAddress::Mmio(addr); |
| vm.register_ioevent(event, io_addr, datamatch) |
| .map_err(DeviceRegistrationError::RegisterIoevent)?; |
| keep_rds.push(event.as_raw_descriptor()); |
| } |
| |
| if let Some(vcfg_base) = vcfg_base { |
| let (methods, shm) = device.generate_acpi_methods(); |
| if !methods.is_empty() { |
| amls.insert(address, methods); |
| } |
| if let Some((offset, mmap)) = shm { |
| let _ = vm.add_memory_region( |
| GuestAddress(vcfg_base + offset as u64), |
| Box::new(mmap), |
| false, |
| false, |
| ); |
| } |
| } |
| |
| #[cfg(unix)] |
| let arced_dev: Arc<Mutex<dyn BusDevice>> = if let Some(jail) = jail { |
| let proxy = ProxyDevice::new(device, jail, keep_rds) |
| .map_err(DeviceRegistrationError::ProxyDeviceCreation)?; |
| pid_labels.insert(proxy.pid() as u32, proxy.debug_label()); |
| Arc::new(Mutex::new(proxy)) |
| } else { |
| device.on_sandboxed(); |
| Arc::new(Mutex::new(device)) |
| }; |
| #[cfg(windows)] |
| let arced_dev = { |
| device.on_sandboxed(); |
| Arc::new(Mutex::new(device)) |
| }; |
| root.add_device(address, arced_dev.clone()); |
| for range in &ranges { |
| mmio_bus |
| .insert(arced_dev.clone(), range.addr, range.size) |
| .map_err(DeviceRegistrationError::MmioInsert)?; |
| } |
| |
| for range in &device_ranges { |
| mmio_bus |
| .insert(arced_dev.clone(), range.addr, range.size) |
| .map_err(DeviceRegistrationError::MmioInsert)?; |
| } |
| } |
| |
| Ok((root, pci_irqs, pid_labels, amls)) |
| } |
| |
| /// Errors for image loading. |
| #[sorted] |
| #[derive(Error, Debug)] |
| pub enum LoadImageError { |
| #[error("Alignment not a power of two: {0}")] |
| BadAlignment(u64), |
| #[error("Image size too large: {0}")] |
| ImageSizeTooLarge(u64), |
| #[error("Reading image into memory failed: {0}")] |
| ReadToMemory(GuestMemoryError), |
| #[error("Seek failed: {0}")] |
| Seek(io::Error), |
| } |
| |
| /// Load an image from a file into guest memory. |
| /// |
| /// # Arguments |
| /// |
| /// * `guest_mem` - The memory to be used by the guest. |
| /// * `guest_addr` - The starting address to load the image in the guest memory. |
| /// * `max_size` - The amount of space in bytes available in the guest memory for the image. |
| /// * `image` - The file containing the image to be loaded. |
| /// |
| /// The size in bytes of the loaded image is returned. |
| pub fn load_image<F>( |
| guest_mem: &GuestMemory, |
| image: &mut F, |
| guest_addr: GuestAddress, |
| max_size: u64, |
| ) -> Result<usize, LoadImageError> |
| where |
| F: Read + Seek + AsRawDescriptor, |
| { |
| let size = image.seek(SeekFrom::End(0)).map_err(LoadImageError::Seek)?; |
| |
| if size > usize::max_value() as u64 || size > max_size { |
| return Err(LoadImageError::ImageSizeTooLarge(size)); |
| } |
| |
| // This is safe due to the bounds check above. |
| let size = size as usize; |
| |
| image |
| .seek(SeekFrom::Start(0)) |
| .map_err(LoadImageError::Seek)?; |
| |
| guest_mem |
| .read_to_memory(guest_addr, image, size) |
| .map_err(LoadImageError::ReadToMemory)?; |
| |
| Ok(size) |
| } |
| |
| /// Load an image from a file into guest memory at the highest possible address. |
| /// |
| /// # Arguments |
| /// |
| /// * `guest_mem` - The memory to be used by the guest. |
| /// * `image` - The file containing the image to be loaded. |
| /// * `min_guest_addr` - The minimum address of the start of the image. |
| /// * `max_guest_addr` - The address to load the last byte of the image. |
| /// * `align` - The minimum alignment of the start address of the image in bytes |
| /// (must be a power of two). |
| /// |
| /// The guest address and size in bytes of the loaded image are returned. |
| pub fn load_image_high<F>( |
| guest_mem: &GuestMemory, |
| image: &mut F, |
| min_guest_addr: GuestAddress, |
| max_guest_addr: GuestAddress, |
| align: u64, |
| ) -> Result<(GuestAddress, usize), LoadImageError> |
| where |
| F: Read + Seek + AsRawDescriptor, |
| { |
| if !align.is_power_of_two() { |
| return Err(LoadImageError::BadAlignment(align)); |
| } |
| |
| let max_size = max_guest_addr.offset_from(min_guest_addr) & !(align - 1); |
| let size = image.seek(SeekFrom::End(0)).map_err(LoadImageError::Seek)?; |
| |
| if size > usize::max_value() as u64 || size > max_size { |
| return Err(LoadImageError::ImageSizeTooLarge(size)); |
| } |
| |
| image |
| .seek(SeekFrom::Start(0)) |
| .map_err(LoadImageError::Seek)?; |
| |
| // Load image at the maximum aligned address allowed. |
| // The subtraction cannot underflow because of the size checks above. |
| let guest_addr = GuestAddress((max_guest_addr.offset() - size) & !(align - 1)); |
| |
| // This is safe due to the bounds check above. |
| let size = size as usize; |
| |
| guest_mem |
| .read_to_memory(guest_addr, image, size) |
| .map_err(LoadImageError::ReadToMemory)?; |
| |
| Ok((guest_addr, size)) |
| } |
| |
| /// Read and write permissions setting |
| /// |
| /// Wrap read_allow and write_allow to store them in MsrHandlers level. |
| #[derive(Clone, Copy, Debug, Deserialize, PartialEq, Serialize)] |
| pub enum MsrRWType { |
| ReadOnly, |
| WriteOnly, |
| ReadWrite, |
| } |
| |
| /// Handler types for userspace-msr |
| #[derive(Clone, Copy, Debug, Deserialize, PartialEq, Serialize)] |
| pub enum MsrAction { |
| /// Read and write from host directly, and the control of MSR will |
| /// take effect on host. |
| MsrPassthrough, |
| /// Store the dummy value for msr (copy from host or custom values), |
| /// and the control(WRMSR) of MSR won't take effect on host. |
| MsrEmulate, |
| } |
| |
| /// Source CPU of MSR value |
| /// |
| /// Indicate which CPU that user get/set MSRs from/to. |
| #[derive(Clone, Copy, Debug, Deserialize, PartialEq, Serialize)] |
| pub enum MsrValueFrom { |
| /// Read/write MSR value from/into CPU 0. |
| /// The MSR source CPU always be CPU 0. |
| RWFromCPU0, |
| /// Read/write MSR value from/into the running CPU. |
| /// If vCPU migrates to another pcpu, the MSR source CPU will also change. |
| RWFromRunningCPU, |
| } |
| |
| /// Whether to force KVM-filtered MSRs. |
| #[derive(Clone, Copy, Debug, Deserialize, PartialEq, Serialize)] |
| pub enum MsrFilter { |
| /// Leave it to hypervisor (KVM) default. |
| Default, |
| /// Don't let KVM do the default thing and use our userspace MSR |
| /// implementation. |
| Override, |
| } |
| |
| /// Config option for userspace-msr handing |
| /// |
| /// MsrConfig will be collected with its corresponding MSR's index. |
| /// eg, (msr_index, msr_config) |
| #[derive(Clone, Serialize, Deserialize)] |
| pub struct MsrConfig { |
| /// If support RDMSR/WRMSR emulation in crosvm? |
| pub rw_type: MsrRWType, |
| /// Handlers should be used to handling MSR. |
| pub action: MsrAction, |
| /// MSR source CPU. |
| pub from: MsrValueFrom, |
| /// Whether to override KVM MSR emulation. |
| pub filter: MsrFilter, |
| } |
| |
| #[sorted] |
| #[derive(Error, Debug)] |
| pub enum MsrExitHandlerError { |
| #[error("Fail to create MSR handler")] |
| HandlerCreateFailed, |
| } |