blob: a912eddb59f906ba8dfe47156a1fe44c1e1148ac [file] [log] [blame]
// Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Device tree generation; `fdt::create_fdt` is called from `setup_system_memory`.
mod fdt;
// e820 memory-map entry type passed to `add_e820_entry` for every region this file adds.
const E820_RAM: u32 = 1;
// NOTE(review): not referenced in this file; presumably the `setup_data` type tag consumed by the
// `fdt` module -- confirm.
const SETUP_DTB: u32 = 2;
// Maximum size in bytes passed to `fdt::create_fdt` for the generated device tree blob.
const X86_64_FDT_MAX_SIZE: u64 = 0x200000;
// NOTE(review): the lint allowances below suggest generated bindings that do not follow Rust
// naming conventions -- confirm.
#[allow(dead_code)]
#[allow(non_upper_case_globals)]
#[allow(non_camel_case_types)]
#[allow(non_snake_case)]
mod bootparam;
// SAFETY: boot_params is just a series of ints, so it is safe to initialize it from raw data.
unsafe impl data_model::DataInit for bootparam::boot_params {}
#[allow(dead_code)]
#[allow(non_upper_case_globals)]
mod msr_index;
#[allow(dead_code)]
#[allow(non_upper_case_globals)]
#[allow(non_camel_case_types)]
#[allow(clippy::all)]
mod mpspec;
// SAFETY: these mpspec types are only data, so reading them from raw data is a safe
// initialization.
unsafe impl data_model::DataInit for mpspec::mpc_bus {}
unsafe impl data_model::DataInit for mpspec::mpc_cpu {}
unsafe impl data_model::DataInit for mpspec::mpc_intsrc {}
unsafe impl data_model::DataInit for mpspec::mpc_ioapic {}
unsafe impl data_model::DataInit for mpspec::mpc_table {}
unsafe impl data_model::DataInit for mpspec::mpc_lintsrc {}
unsafe impl data_model::DataInit for mpspec::mpf_intel {}
// Submodules implementing the individual pieces of x86_64 setup; most of them are called from
// the functions further down in this file.
mod acpi;
mod bzimage;
mod cpuid;
mod gdt;
mod interrupts;
mod mptable;
mod regs;
mod smbios;
use std::collections::BTreeMap;
use std::error::Error as StdError;
use std::ffi::{CStr, CString};
use std::fmt::{self, Display};
use std::fs::File;
use std::io::{self, Seek};
use std::mem;
use std::sync::Arc;
use crate::bootparam::boot_params;
use arch::{RunnableLinuxVm, VmComponents, VmImage};
use devices::split_irqchip_common::GsiRelay;
use devices::{
get_serial_tty_string, Ioapic, PciConfigIo, PciDevice, PciInterruptPin, Pic, SerialParameters,
IOAPIC_BASE_ADDRESS, IOAPIC_MEM_LENGTH_BYTES,
};
use io_jail::Minijail;
use kvm::*;
use remain::sorted;
use resources::SystemAllocator;
use sync::Mutex;
use sys_util::{Clock, EventFd, GuestAddress, GuestMemory, GuestMemoryError};
use vm_control::VmIrqRequestSocket;
#[sorted]
#[derive(Debug)]
/// Errors that can occur while building or configuring an x86_64 VM.
///
/// The variants are kept in sorted order (checked by the `#[sorted]` attribute). Variants that
/// carry a payload wrap the underlying error of the operation that failed; the human-readable
/// message for each variant is produced by the `Display` implementation of this type.
pub enum Error {
    AllocateIrq,
    CloneEventFd(sys_util::Error),
    Cmdline(kernel_cmdline::Error),
    ConfigureSystem,
    CreateDevices(Box<dyn StdError>),
    CreateEventFd(sys_util::Error),
    CreateFdt(arch::fdt::Error),
    CreateIoapicDevice(sys_util::Error),
    CreateIrqChip(sys_util::Error),
    CreateKvm(sys_util::Error),
    CreatePciRoot(arch::DeviceRegistrationError),
    CreatePit(sys_util::Error),
    CreatePitDevice(devices::PitError),
    CreateSerialDevices(arch::DeviceRegistrationError),
    CreateSocket(io::Error),
    CreateVcpu(sys_util::Error),
    CreateVm(sys_util::Error),
    E820Configuration,
    EnableSplitIrqchip(sys_util::Error),
    KernelOffsetPastEnd,
    LoadBios(io::Error),
    LoadBzImage(bzimage::Error),
    LoadCmdline(kernel_loader::Error),
    LoadInitrd(arch::LoadImageError),
    LoadKernel(kernel_loader::Error),
    Pstore(arch::pstore::Error),
    RegisterIrqfd(sys_util::Error),
    RegisterVsock(arch::DeviceRegistrationError),
    SetLint(interrupts::Error),
    SetTssAddr(sys_util::Error),
    SetupCpuid(cpuid::Error),
    SetupFpu(regs::Error),
    SetupGuestMemory(GuestMemoryError),
    SetupMptable(mptable::Error),
    SetupMsrs(regs::Error),
    SetupRegs(regs::Error),
    SetupSmbios(smbios::Error),
    SetupSregs(regs::Error),
    ZeroPagePastRamEnd,
    ZeroPageSetup,
}
// Renders each `Error` variant as a one-line, human-readable message. Variants that wrap an
// underlying error append that error's own `Display` output after a colon.
impl Display for Error {
    // `#[remain::check]` is what allows the `#[sorted]` attribute on the match expression below.
    #[remain::check]
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // Bring the variants into scope so the match arms can name them without the `Error::`
        // prefix.
        use self::Error::*;
        // Keep the arms in the same sorted order as the enum definition.
        #[sorted]
        match self {
            AllocateIrq => write!(f, "error allocating a single irq"),
            CloneEventFd(e) => write!(f, "unable to clone an EventFd: {}", e),
            Cmdline(e) => write!(f, "the given kernel command line was invalid: {}", e),
            ConfigureSystem => write!(f, "error configuring the system"),
            CreateDevices(e) => write!(f, "error creating devices: {}", e),
            CreateEventFd(e) => write!(f, "unable to make an EventFd: {}", e),
            CreateFdt(e) => write!(f, "failed to create fdt: {}", e),
            CreateIoapicDevice(e) => write!(f, "failed to create IOAPIC device: {}", e),
            CreateIrqChip(e) => write!(f, "failed to create irq chip: {}", e),
            CreateKvm(e) => write!(f, "failed to open /dev/kvm: {}", e),
            CreatePciRoot(e) => write!(f, "failed to create a PCI root hub: {}", e),
            CreatePit(e) => write!(f, "unable to create PIT: {}", e),
            CreatePitDevice(e) => write!(f, "unable to make PIT device: {}", e),
            CreateSerialDevices(e) => write!(f, "unable to create serial devices: {}", e),
            CreateSocket(e) => write!(f, "failed to create socket: {}", e),
            CreateVcpu(e) => write!(f, "failed to create VCPU: {}", e),
            CreateVm(e) => write!(f, "failed to create VM: {}", e),
            E820Configuration => write!(f, "invalid e820 setup params"),
            EnableSplitIrqchip(e) => write!(f, "failed to enable split irqchip: {}", e),
            KernelOffsetPastEnd => write!(f, "the kernel extends past the end of RAM"),
            LoadBios(e) => write!(f, "error loading bios: {}", e),
            LoadBzImage(e) => write!(f, "error loading kernel bzImage: {}", e),
            LoadCmdline(e) => write!(f, "error loading command line: {}", e),
            LoadInitrd(e) => write!(f, "error loading initrd: {}", e),
            LoadKernel(e) => write!(f, "error loading Kernel: {}", e),
            Pstore(e) => write!(f, "failed to allocate pstore region: {}", e),
            RegisterIrqfd(e) => write!(f, "error registering an IrqFd: {}", e),
            RegisterVsock(e) => write!(f, "error registering virtual socket device: {}", e),
            SetLint(e) => write!(f, "failed to set interrupts: {}", e),
            SetTssAddr(e) => write!(f, "failed to set tss addr: {}", e),
            SetupCpuid(e) => write!(f, "failed to set up cpuid: {}", e),
            SetupFpu(e) => write!(f, "failed to set up FPU: {}", e),
            SetupGuestMemory(e) => write!(f, "failed to set up guest memory: {}", e),
            SetupMptable(e) => write!(f, "failed to set up mptable: {}", e),
            SetupMsrs(e) => write!(f, "failed to set up MSRs: {}", e),
            SetupRegs(e) => write!(f, "failed to set up registers: {}", e),
            SetupSmbios(e) => write!(f, "failed to set up SMBIOS: {}", e),
            SetupSregs(e) => write!(f, "failed to set up sregs: {}", e),
            ZeroPagePastRamEnd => write!(f, "the zero page extends past the end of guest_mem"),
            ZeroPageSetup => write!(f, "error writing the zero page of guest memory"),
        }
    }
}
/// Result type whose error is this module's [`Error`].
pub type Result<T> = std::result::Result<T, Error>;
// `Error` already implements `Debug` and `Display`, so the trait's default methods suffice.
impl std::error::Error for Error {}
/// Unit type on which the x86_64 `arch::LinuxArch` implementation and its helpers are defined.
pub struct X8664arch;
/// Initial stack pointer handed to `regs::setup_regs` when a vcpu is configured.
const BOOT_STACK_POINTER: u64 = 0x8000;
/// Size of the hole kept free of RAM directly below 4 GiB.
/// Keep it a multiple of 256 MiB so that the gap is convenient to describe with MTRRs.
const MEM_32BIT_GAP_SIZE: u64 = 768 << 20;
/// First address that no longer fits in 32 bits (4 GiB).
const FIRST_ADDR_PAST_32BITS: u64 = 1 << 32;
/// Start of the gap below 4 GiB; low guest RAM ends here at the latest.
const END_ADDR_BEFORE_32BITS: u64 = FIRST_ADDR_PAST_32BITS - MEM_32BIT_GAP_SIZE;
/// Size of the low MMIO window given to the resource allocator, starting at
/// `END_ADDR_BEFORE_32BITS`. The last 0x8000000 bytes of the gap are not part of the window.
const MMIO_SIZE: u64 = MEM_32BIT_GAP_SIZE - 0x8000000;
/// Offset of the 64-bit entry point from the kernel load address.
const KERNEL_64BIT_ENTRY_OFFSET: u64 = 0x200;
/// Guest physical address at which the zero page (`boot_params`) is written.
const ZERO_PAGE_OFFSET: u64 = 0x7000;
/// The x86 reset vector for i386+ and x86_64 puts the processor into an "unreal mode" where it
/// can access the last 1 MB of the 32-bit address space in 16-bit mode, and starts the instruction
/// pointer at the effective physical address 0xFFFFFFF0.
const BIOS_LEN: usize = 1 << 20;
/// Guest physical address of the BIOS image: the last `BIOS_LEN` bytes below 4 GiB.
const BIOS_START: u64 = FIRST_ADDR_PAST_32BITS - (BIOS_LEN as u64);
/// Guest physical address at which the kernel image is loaded.
const KERNEL_START_OFFSET: u64 = 0x200000;
/// Guest physical address at which the kernel command line is written.
const CMDLINE_OFFSET: u64 = 0x20000;
/// Space available for the command line: everything between its start and the kernel.
const CMDLINE_MAX_SIZE: u64 = KERNEL_START_OFFSET - CMDLINE_OFFSET;
/// IRQ registered for the event shared by serial ports 1 and 3.
const X86_64_SERIAL_1_3_IRQ: u32 = 4;
/// IRQ registered for the event shared by serial ports 2 and 4.
const X86_64_SERIAL_2_4_IRQ: u32 = 3;
/// IRQ base passed to the system resource allocator when it is created.
const X86_64_IRQ_BASE: u32 = 5;
// NOTE(review): not referenced in this file; presumably the base of the window the `acpi` module
// uses for the RSDP -- confirm.
const ACPI_HI_RSDP_WINDOW_BASE: u64 = 0x000E0000;
fn configure_system(
guest_mem: &GuestMemory,
_mem_size: u64,
kernel_addr: GuestAddress,
cmdline_addr: GuestAddress,
cmdline_size: usize,
num_cpus: u8,
pci_irqs: Vec<(u32, PciInterruptPin)>,
setup_data: Option<GuestAddress>,
initrd: Option<(GuestAddress, usize)>,
mut params: boot_params,
sci_irq: u32,
) -> Result<()> {
const EBDA_START: u64 = 0x0009fc00;
const KERNEL_BOOT_FLAG_MAGIC: u16 = 0xaa55;
const KERNEL_HDR_MAGIC: u32 = 0x53726448;
const KERNEL_LOADER_OTHER: u8 = 0xff;
const KERNEL_MIN_ALIGNMENT_BYTES: u32 = 0x1000000; // Must be non-zero.
let first_addr_past_32bits = GuestAddress(FIRST_ADDR_PAST_32BITS);
let end_32bit_gap_start = GuestAddress(END_ADDR_BEFORE_32BITS);
// Note that this puts the mptable at 0x0 in guest physical memory.
mptable::setup_mptable(guest_mem, num_cpus, pci_irqs).map_err(Error::SetupMptable)?;
smbios::setup_smbios(guest_mem).map_err(Error::SetupSmbios)?;
params.hdr.type_of_loader = KERNEL_LOADER_OTHER;
params.hdr.boot_flag = KERNEL_BOOT_FLAG_MAGIC;
params.hdr.header = KERNEL_HDR_MAGIC;
params.hdr.cmd_line_ptr = cmdline_addr.offset() as u32;
params.hdr.cmdline_size = cmdline_size as u32;
params.hdr.kernel_alignment = KERNEL_MIN_ALIGNMENT_BYTES;
if let Some(setup_data) = setup_data {
params.hdr.setup_data = setup_data.offset();
}
if let Some((initrd_addr, initrd_size)) = initrd {
params.hdr.ramdisk_image = initrd_addr.offset() as u32;
params.hdr.ramdisk_size = initrd_size as u32;
}
add_e820_entry(&mut params, 0, EBDA_START, E820_RAM)?;
let mem_end = guest_mem.end_addr();
if mem_end < end_32bit_gap_start {
add_e820_entry(
&mut params,
kernel_addr.offset() as u64,
mem_end.offset_from(kernel_addr) as u64,
E820_RAM,
)?;
} else {
add_e820_entry(
&mut params,
kernel_addr.offset() as u64,
end_32bit_gap_start.offset_from(kernel_addr) as u64,
E820_RAM,
)?;
if mem_end > first_addr_past_32bits {
add_e820_entry(
&mut params,
first_addr_past_32bits.offset() as u64,
mem_end.offset_from(first_addr_past_32bits) as u64,
E820_RAM,
)?;
}
}
let zero_page_addr = GuestAddress(ZERO_PAGE_OFFSET);
guest_mem
.checked_offset(zero_page_addr, mem::size_of::<boot_params>() as u64)
.ok_or(Error::ZeroPagePastRamEnd)?;
guest_mem
.write_obj_at_addr(params, zero_page_addr)
.map_err(|_| Error::ZeroPageSetup)?;
let rsdp_addr = acpi::create_acpi_tables(guest_mem, num_cpus, sci_irq);
params.acpi_rsdp_addr = rsdp_addr.0;
Ok(())
}
/// Appends one region to the e820 memory map stored in `params`.
///
/// The region covers `size` bytes starting at `addr` and is tagged with `mem_type`. On success
/// the entry counter in `params` is advanced by one.
///
/// # Errors
///
/// Returns `Error::E820Configuration` when the table has no free slot left.
fn add_e820_entry(params: &mut boot_params, addr: u64, size: u64, mem_type: u32) -> Result<()> {
    let capacity = params.e820_table.len() as u8;
    let used = params.e820_entries;
    if used >= capacity {
        return Err(Error::E820Configuration);
    }

    // Fill the first unused slot, then account for it.
    let slot = used as usize;
    params.e820_table[slot].addr = addr;
    params.e820_table[slot].size = size;
    params.e820_table[slot].type_ = mem_type;
    params.e820_entries = used + 1;
    Ok(())
}
/// Returns a Vec of the valid memory regions as `(start address, size in bytes)` pairs.
/// These should be used to configure the GuestMemory structure for the platform.
///
/// For x86_64 all addresses are valid from address 0 up to a carve out at the end of the 32-bit
/// address space. RAM that does not fit below the carve out continues at 4 GiB, so the sizes of
/// the RAM regions always add up to `size`.
///
/// # Arguments
///
/// * `size` - Total amount of guest RAM in bytes.
/// * `has_bios` - When true, an extra `BIOS_LEN`-byte region at `BIOS_START` is added for the
///   BIOS image.
///
/// The region above 4 GiB is only emitted when it is non-empty: a `size` of exactly
/// `END_ADDR_BEFORE_32BITS` must not produce a zero-length region, which cannot be mapped.
fn arch_memory_regions(size: u64, has_bios: bool) -> Vec<(GuestAddress, u64)> {
    // Low RAM starts at 0 and stops at the start of the 32-bit gap at the latest.
    let low_size = std::cmp::min(size, END_ADDR_BEFORE_32BITS);
    // Whatever did not fit below the gap is placed above 4 GiB.
    let high_size = size - low_size;

    let mut regions = Vec::with_capacity(3);
    regions.push((GuestAddress(0), low_size));
    if has_bios {
        regions.push((GuestAddress(BIOS_START), BIOS_LEN as u64));
    }
    if high_size > 0 {
        regions.push((GuestAddress(FIRST_ADDR_PAST_32BITS), high_size));
    }
    regions
}
impl arch::LinuxArch for X8664arch {
    type Error = Error;

    /// Builds a runnable x86_64 Linux VM from `components`.
    ///
    /// In order, this sets up guest memory and the resource allocator, opens KVM and creates the
    /// VM and its vcpus, creates the (optional) split irqchip devices, the PCI root, the IO bus,
    /// the serial devices and the optional pstore region, and finally loads either the BIOS or
    /// the kernel together with its boot data.
    ///
    /// # Arguments
    ///
    /// * `components` - Description of the VM: memory size, vcpu count, image, initrd, etc.
    /// * `split_irqchip` - Whether to use a split IRQ chip (userspace PIT/PIC/IOAPIC).
    /// * `ioapic_device_socket` - Socket handed to the userspace IOAPIC; only used with the
    ///   split IRQ chip.
    /// * `serial_parameters` - Definitions for how the serial devices should be configured.
    /// * `serial_jail` - Optional jail passed on to the serial device setup.
    /// * `create_devices` - Callback that creates the PCI devices for this VM.
    ///
    /// # Errors
    ///
    /// Returns the `Error` of the first setup step that fails.
    fn build_vm<F, E>(
        mut components: VmComponents,
        split_irqchip: bool,
        ioapic_device_socket: VmIrqRequestSocket,
        serial_parameters: &BTreeMap<u8, SerialParameters>,
        serial_jail: Option<Minijail>,
        create_devices: F,
    ) -> Result<RunnableLinuxVm>
    where
        F: FnOnce(
            &GuestMemory,
            &mut Vm,
            &mut SystemAllocator,
            &EventFd,
        ) -> std::result::Result<Vec<(Box<dyn PciDevice>, Option<Minijail>)>, E>,
        E: StdError + 'static,
    {
        let has_bios = match components.vm_image {
            VmImage::Bios(_) => true,
            _ => false,
        };
        let mem = Self::setup_memory(components.memory_size, has_bios)?;
        let mut resources = Self::get_resource_allocator(&mem, components.wayland_dmabuf);
        let kvm = Kvm::new().map_err(Error::CreateKvm)?;
        let mut vm = Self::create_vm(&kvm, split_irqchip, mem.clone())?;

        let vcpu_count = components.vcpu_count;
        let mut vcpus = Vec::with_capacity(vcpu_count as usize);
        for cpu_id in 0..vcpu_count {
            let vcpu = Vcpu::new(cpu_id as libc::c_ulong, &kvm, &vm).map_err(Error::CreateVcpu)?;
            // Only a directly booted kernel needs its vcpu state prepared here.
            if let VmImage::Kernel(_) = components.vm_image {
                Self::configure_vcpu(
                    vm.get_memory(),
                    &kvm,
                    &vm,
                    &vcpu,
                    cpu_id as u64,
                    vcpu_count as u64,
                )?;
            }
            vcpus.push(vcpu);
        }
        let vcpu_affinity = components.vcpu_affinity;

        let irq_chip = Self::create_irq_chip(&vm)?;
        let mut mmio_bus = devices::Bus::new();
        let exit_evt = EventFd::new().map_err(Error::CreateEventFd)?;

        // From here on `split_irqchip` holds the userspace PIC/IOAPIC pair (if any) instead of
        // the boolean flag; `gsi_relay` is Some exactly when `split_irqchip` is Some.
        let (split_irqchip, mut gsi_relay) = if split_irqchip {
            let gsi_relay = GsiRelay::new();
            let pic = Arc::new(Mutex::new(Pic::new()));
            let ioapic = Arc::new(Mutex::new(
                Ioapic::new(&mut vm, ioapic_device_socket).map_err(Error::CreateIoapicDevice)?,
            ));
            mmio_bus
                .insert(
                    ioapic.clone(),
                    IOAPIC_BASE_ADDRESS,
                    IOAPIC_MEM_LENGTH_BYTES,
                    false,
                )
                .unwrap();
            (Some((pic, ioapic)), Some(gsi_relay))
        } else {
            (None, None)
        };

        let pci_devices = create_devices(&mem, &mut vm, &mut resources, &exit_evt)
            .map_err(|e| Error::CreateDevices(Box::new(e)))?;
        let (pci, pci_irqs, pid_debug_label_map) = arch::generate_pci_root(
            pci_devices,
            &mut gsi_relay,
            &mut mmio_bus,
            &mut resources,
            &mut vm,
        )
        .map_err(Error::CreatePciRoot)?;
        let pci_bus = Arc::new(Mutex::new(PciConfigIo::new(pci)));

        // Event used to notify crosvm that guest OS is trying to suspend.
        let suspend_evt = EventFd::new().map_err(Error::CreateEventFd)?;
        // Allocate sci_irq to fill the ACPI FACP table.
        let sci_irq = resources.allocate_irq().ok_or(Error::AllocateIrq)?;

        let mut io_bus = Self::setup_io_bus(
            &mut vm,
            &mut gsi_relay,
            exit_evt.try_clone().map_err(Error::CloneEventFd)?,
            Some(pci_bus.clone()),
            components.memory_size,
            suspend_evt.try_clone().map_err(Error::CloneEventFd)?,
        )?;
        let stdio_serial_num = Self::setup_serial_devices(
            &mut vm,
            &mut io_bus,
            &mut gsi_relay,
            serial_parameters,
            serial_jail,
        )?;

        let ramoops_region = match components.pstore {
            Some(pstore) => Some(
                arch::pstore::create_memory_region(&mut vm, &mut resources, &pstore)
                    .map_err(Error::Pstore)?,
            ),
            None => None,
        };

        let gsi_relay = if let Some((pic, ioapic)) = &split_irqchip {
            io_bus.insert(pic.clone(), 0x20, 0x2, true).unwrap();
            io_bus.insert(pic.clone(), 0xa0, 0x2, true).unwrap();
            io_bus.insert(pic.clone(), 0x4d0, 0x2, true).unwrap();

            // Keep allocating (and discarding) IRQ numbers until the allocator hands out one
            // that is not an IOAPIC pin. Running out of IRQs is reported as an error instead of
            // panicking, matching the sci_irq allocation above.
            let mut irq_num = resources.allocate_irq().ok_or(Error::AllocateIrq)?;
            while irq_num < kvm::NUM_IOAPIC_PINS as u32 {
                irq_num = resources.allocate_irq().ok_or(Error::AllocateIrq)?;
            }

            // This will never fail because gsi_relay is Some iff split_irqchip is Some.
            let gsi_relay = Arc::new(gsi_relay.unwrap());
            pic.lock().register_relay(gsi_relay.clone());
            ioapic.lock().register_relay(gsi_relay.clone());
            Some(gsi_relay)
        } else {
            None
        };

        match components.vm_image {
            VmImage::Bios(ref mut bios) => Self::load_bios(&mem, bios)?,
            VmImage::Kernel(ref mut kernel_image) => {
                let mut cmdline = Self::get_base_linux_cmdline(stdio_serial_num);
                for param in components.extra_kernel_params {
                    cmdline.insert_str(&param).map_err(Error::Cmdline)?;
                }
                // It seems that the default record_size is only 4096 bytes even if crosvm
                // allocates more memory, which means that one crash record can only hold 4096
                // bytes. Set record_size and console_size to 1/4 of the allocated memory size.
                // This configuration is the same as the host's.
                if let Some(ramoops_region) = ramoops_region {
                    let ramoops_opts = [
                        ("mem_address", ramoops_region.address),
                        ("mem_size", ramoops_region.size as u64),
                        ("console_size", (ramoops_region.size / 4) as u64),
                        ("record_size", (ramoops_region.size / 4) as u64),
                        ("dump_oops", 1_u64),
                    ];
                    for (name, val) in &ramoops_opts {
                        cmdline
                            .insert_str(format!("ramoops.{}={:#x}", name, val))
                            .map_err(Error::Cmdline)?;
                    }
                }
                // Separate out load_kernel from other setup to get a specific error for
                // kernel loading.
                let (params, kernel_end) = Self::load_kernel(&mem, kernel_image)?;
                Self::setup_system_memory(
                    &mem,
                    components.memory_size,
                    vcpu_count,
                    &CString::new(cmdline).unwrap(),
                    components.initrd_image,
                    pci_irqs,
                    components.android_fstab,
                    kernel_end,
                    params,
                    sci_irq,
                )?;
            }
        }

        Ok(RunnableLinuxVm {
            vm,
            kvm,
            resources,
            exit_evt,
            vcpus,
            vcpu_affinity,
            irq_chip,
            split_irqchip,
            gsi_relay,
            io_bus,
            mmio_bus,
            pid_debug_label_map,
            suspend_evt,
        })
    }
}
impl X8664arch {
/// Loads the bios from an open file.
///
/// # Arguments
///
/// * `mem` - The memory to be used by the guest.
/// * `bios_image` - the File object for the specified bios
fn load_bios(mem: &GuestMemory, bios_image: &mut File) -> Result<()> {
let bios_image_length = bios_image
.seek(io::SeekFrom::End(0))
.map_err(Error::LoadBios)?;
if bios_image_length != BIOS_LEN as u64 {
return Err(Error::LoadBios(io::Error::new(
io::ErrorKind::InvalidData,
format!(
"bios was {} bytes, expected {}",
bios_image_length, BIOS_LEN
),
)));
}
bios_image
.seek(io::SeekFrom::Start(0))
.map_err(Error::LoadBios)?;
mem.read_to_memory(GuestAddress(BIOS_START), bios_image, BIOS_LEN)
.map_err(Error::SetupGuestMemory)?;
Ok(())
}
/// Loads the kernel from an open file.
///
/// # Arguments
///
/// * `mem` - The memory to be used by the guest.
/// * `kernel_image` - the File object for the specified kernel.
fn load_kernel(mem: &GuestMemory, kernel_image: &mut File) -> Result<(boot_params, u64)> {
let elf_result =
kernel_loader::load_kernel(mem, GuestAddress(KERNEL_START_OFFSET), kernel_image);
if elf_result == Err(kernel_loader::Error::InvalidElfMagicNumber) {
bzimage::load_bzimage(mem, GuestAddress(KERNEL_START_OFFSET), kernel_image)
.map_err(Error::LoadBzImage)
} else {
let kernel_end = elf_result.map_err(Error::LoadKernel)?;
Ok((Default::default(), kernel_end))
}
}
/// Fills guest memory with everything the kernel needs besides its own image: the command
/// line, an optional device tree blob, an optional initrd and the system tables written by
/// `configure_system`. Should be called once per vm before starting vcpu threads.
///
/// # Arguments
///
/// * `mem` - The memory to be used by the guest.
/// * `mem_size` - Forwarded unchanged to `configure_system`.
/// * `vcpu_count` - Number of virtual CPUs the guest will have.
/// * `cmdline` - the kernel commandline
/// * `initrd_file` - an initial ramdisk image
/// * `pci_irqs` - PCI interrupt assignments, forwarded to `configure_system`.
/// * `android_fstab` - When present, a device tree blob is created from it after the kernel.
/// * `kernel_end` - First free address after the loaded kernel.
/// * `params` - Boot parameters returned by the kernel loader.
/// * `sci_irq` - IRQ forwarded to `configure_system`.
fn setup_system_memory(
    mem: &GuestMemory,
    mem_size: u64,
    vcpu_count: u32,
    cmdline: &CStr,
    initrd_file: Option<File>,
    pci_irqs: Vec<(u32, PciInterruptPin)>,
    android_fstab: Option<File>,
    kernel_end: u64,
    params: boot_params,
    sci_irq: u32,
) -> Result<()> {
    kernel_loader::load_cmdline(mem, GuestAddress(CMDLINE_OFFSET), cmdline)
        .map_err(Error::LoadCmdline)?;

    // `next_free` is the first unused address after the kernel; the device tree blob and the
    // initrd are placed from here on.
    let mut next_free = kernel_end;

    let setup_data = match android_fstab {
        Some(fstab) => {
            // Round up to the next multiple of 64 and then skip a further 64 bytes.
            let aligned = (((next_free + 64 - 1) / 64) * 64) + 64;
            let dtb_start = GuestAddress(aligned);
            let dtb_size = fdt::create_fdt(
                X86_64_FDT_MAX_SIZE as usize,
                mem,
                dtb_start.offset(),
                fstab,
            )
            .map_err(Error::CreateFdt)?;
            next_free = dtb_start.offset() + dtb_size as u64;
            Some(dtb_start)
        }
        None => None,
    };

    let initrd = if let Some(mut image) = initrd_file {
        // Default initrd_addr_max for old kernels (see Documentation/x86/boot.txt).
        let header_limit = match u64::from(params.hdr.initrd_addr_max) {
            0 => 0x37FFFFFF,
            limit => limit,
        };
        // Never place the initrd beyond the last byte of guest memory.
        let mem_max = mem.end_addr().offset() - 1;
        let initrd_addr_max = std::cmp::min(header_limit, mem_max);

        let placement = arch::load_image_high(
            mem,
            &mut image,
            GuestAddress(next_free),
            GuestAddress(initrd_addr_max),
            sys_util::pagesize() as u64,
        )
        .map_err(Error::LoadInitrd)?;
        Some(placement)
    } else {
        None
    };

    configure_system(
        mem,
        mem_size,
        GuestAddress(KERNEL_START_OFFSET),
        GuestAddress(CMDLINE_OFFSET),
        // The reported size includes the terminating NUL byte.
        cmdline.to_bytes_with_nul().len(),
        vcpu_count as u8,
        pci_irqs,
        setup_data,
        initrd,
        params,
        sci_irq,
    )
}
/// Creates a new VM object and initializes architecture specific devices
///
/// # Arguments
///
/// * `kvm` - The opened /dev/kvm object.
/// * `split_irqchip` - Whether to use a split IRQ chip.
/// * `mem` - The memory to be used by the guest.
fn create_vm(kvm: &Kvm, split_irqchip: bool, mem: GuestMemory) -> Result<Vm> {
let mut vm = Vm::new(&kvm, mem).map_err(Error::CreateVm)?;
let tss_addr = GuestAddress(0xfffbd000);
vm.set_tss_addr(tss_addr).map_err(Error::SetTssAddr)?;
if !split_irqchip {
vm.create_pit().map_err(Error::CreatePit)?;
vm.create_irq_chip().map_err(Error::CreateIrqChip)?;
} else {
vm.enable_split_irqchip()
.map_err(Error::EnableSplitIrqchip)?;
for i in 0..kvm::NUM_IOAPIC_PINS {
// Add dummy MSI routes to replace the default IRQChip routes.
let route = IrqRoute {
gsi: i as u32,
source: IrqSource::Msi {
address: 0,
data: 0,
},
};
// Safe to ignore errors because errors are caused by the default routes and dummy
// MSI routes will always be registered.
let _ = vm.add_irq_route_entry(route);
}
}
Ok(vm)
}
/// This creates a GuestMemory object for this VM
///
/// * `mem_size` - Desired physical memory size in bytes for this VM
fn setup_memory(mem_size: u64, has_bios: bool) -> Result<GuestMemory> {
let arch_mem_regions = arch_memory_regions(mem_size, has_bios);
let mem = GuestMemory::new(&arch_mem_regions).map_err(Error::SetupGuestMemory)?;
Ok(mem)
}
/// This creates the interrupt controller device and optionally returns the fd for it.
/// Some architectures may not have a separate descriptor for the interrupt
/// controller, so they would return None even on success.
///
/// On x86_64 this function does no work and always returns `Ok(None)`; the in-kernel IRQ chip
/// is created earlier, in `create_vm`.
///
/// # Arguments
///
/// * `_vm` - the vm object (unused on this architecture)
fn create_irq_chip(_vm: &kvm::Vm) -> Result<Option<File>> {
    // Unfortunately X86 and ARM have to do this in completely different order
    // X86 needs to create the irq chip before creating cpus and
    // ARM needs to do it afterwards.
    Ok(None)
}
/// Computes where the high MMIO window starts.
///
/// Device memory begins at the end of guest RAM rounded up to a 2 MiB boundary, or at 4 GiB,
/// whichever is greater.
///
/// # Arguments
///
/// * mem: The memory to be used by the guest
fn get_high_mmio_base(mem: &GuestMemory) -> u64 {
    const TWO_MIB: u64 = 2 << 20;
    const FOUR_GIB: u64 = 4 << 30;

    let ram_end = mem.end_addr().offset();
    let ram_end_rounded = (ram_end + TWO_MIB - 1) / TWO_MIB * TWO_MIB;
    ram_end_rounded.max(FOUR_GIB)
}
/// This returns a minimal kernel command for this architecture
fn get_base_linux_cmdline(stdio_serial_num: Option<u8>) -> kernel_cmdline::Cmdline {
let mut cmdline = kernel_cmdline::Cmdline::new(CMDLINE_MAX_SIZE as usize);
if let Some(stdio_serial_num) = stdio_serial_num {
let tty_string = get_serial_tty_string(stdio_serial_num);
cmdline.insert("console", &tty_string).unwrap();
}
cmdline.insert_str("acpi=off reboot=k panic=-1").unwrap();
cmdline
}
/// Creates the system resource allocator for this platform.
///
/// The allocator manages IO ports starting at 0xc000, the low MMIO window that begins at the
/// 32-bit gap, and a high MMIO window that runs from `get_high_mmio_base` to the top of the
/// 64-bit address space. IRQ allocation uses `X86_64_IRQ_BASE`.
///
/// Panics if the allocator cannot be created from these ranges.
fn get_resource_allocator(mem: &GuestMemory, gpu_allocation: bool) -> SystemAllocator {
    let high_mmio_base = Self::get_high_mmio_base(mem);
    let high_mmio_len = u64::max_value() - high_mmio_base;
    SystemAllocator::builder()
        .add_io_addresses(0xc000, 0x10000)
        .add_low_mmio_addresses(END_ADDR_BEFORE_32BITS, MMIO_SIZE)
        .add_high_mmio_addresses(high_mmio_base, high_mmio_len)
        .create_allocator(X86_64_IRQ_BASE, gpu_allocation)
        .unwrap()
}
/// Sets up the IO bus for this platform
///
/// The bus gets a CMOS device, an i8042 device, either a userspace PIT (split IRQ chip) or a
/// placeholder on the PIT ports, placeholders on a few legacy ports, the PCI configuration
/// ports and the ACPI PM resource.
///
/// # Arguments
///
/// * - `vm` the vm object
/// * - `gsi_relay`: only valid for split IRQ chip (i.e. userspace PIT/PIC/IOAPIC)
/// * - `exit_evt` - the event fd object which should receive exit events
/// * - `pci` - the PCI configuration device; when `None` the PCI ports get a placeholder
/// * - `mem_size` - the size in bytes of physical ram for the guest
/// * - `suspend_evt` - the event fd object which used to suspend the vm
fn setup_io_bus(
    _vm: &mut Vm,
    gsi_relay: &mut Option<GsiRelay>,
    exit_evt: EventFd,
    pci: Option<Arc<Mutex<devices::PciConfigIo>>>,
    mem_size: u64,
    suspend_evt: EventFd,
) -> Result<devices::Bus> {
    // Placeholder device used to claim ports that should be ignored.
    struct NoDevice;
    impl devices::BusDevice for NoDevice {
        fn debug_label(&self) -> String {
            "no device".to_owned()
        }
    }
    let mut io_bus = devices::Bus::new();

    // Split the RAM size the same way `arch_memory_regions` lays it out: everything up to the
    // start of the 32-bit gap lives below 4 GiB and the remainder is placed above 4 GiB.
    // Subtracting 4 GiB instead would under-report the high memory by the size of the gap.
    let mem_gap_start = END_ADDR_BEFORE_32BITS;
    let mem_below_4g = std::cmp::min(mem_gap_start, mem_size);
    let mem_above_4g = mem_size - mem_below_4g;
    io_bus
        .insert(
            Arc::new(Mutex::new(devices::Cmos::new(mem_below_4g, mem_above_4g))),
            0x70,
            0x2,
            false,
        )
        .unwrap();

    let nul_device = Arc::new(Mutex::new(NoDevice));
    let i8042 = Arc::new(Mutex::new(devices::I8042Device::new(
        exit_evt.try_clone().map_err(Error::CloneEventFd)?,
    )));

    if let Some(gsi_relay) = gsi_relay {
        // Split IRQ chip: the PIT is emulated in userspace and raises IRQ 0 through the relay.
        let pit_evt = EventFd::new().map_err(Error::CreateEventFd)?;
        let pit = Arc::new(Mutex::new(
            devices::Pit::new(
                pit_evt.try_clone().map_err(Error::CloneEventFd)?,
                Arc::new(Mutex::new(Clock::new())),
            )
            .map_err(Error::CreatePitDevice)?,
        ));
        io_bus.insert(pit.clone(), 0x040, 0x8, true).unwrap();
        io_bus.insert(pit.clone(), 0x061, 0x1, true).unwrap();
        io_bus.insert(i8042, 0x062, 0x3, true).unwrap();
        gsi_relay.register_irqfd(pit_evt, 0);
    } else {
        io_bus
            .insert(nul_device.clone(), 0x040, 0x8, false)
            .unwrap(); // ignore pit
        io_bus.insert(i8042, 0x061, 0x4, true).unwrap();
    }

    io_bus
        .insert(nul_device.clone(), 0x0ed, 0x1, false)
        .unwrap(); // most likely this one does nothing
    io_bus
        .insert(nul_device.clone(), 0x0f0, 0x2, false)
        .unwrap(); // ignore fpu

    if let Some(pci_root) = pci {
        io_bus.insert(pci_root, 0xcf8, 0x8, false).unwrap();
    } else {
        // ignore pci.
        io_bus
            .insert(nul_device.clone(), 0xcf8, 0x8, false)
            .unwrap();
    }

    let pm = Arc::new(Mutex::new(devices::ACPIPMResource::new(suspend_evt)));
    io_bus
        .insert(
            pm.clone(),
            devices::acpi::ACPIPM_RESOURCE_BASE,
            devices::acpi::ACPIPM_RESOURCE_LEN as u64,
            false,
        )
        .unwrap();
    io_bus.notify_on_resume(pm);

    Ok(io_bus)
}
/// Adds the serial devices for this platform to `io_bus` and hooks up their interrupts.
///
/// Returns the value produced by `arch::add_serial_devices`: the number of the serial port to
/// be used for stdout, if any.
///
/// # Arguments
///
/// * - `vm` the vm object
/// * - `io_bus` the I/O bus to add the devices to
/// * - `gsi_relay`: only valid for split IRQ chip (i.e. userspace PIT/PIC/IOAPIC)
/// * - `serial_parameters` - definitions for how the serial devices should be configured
/// * - `serial_jail` - optional jail passed on to `arch::add_serial_devices`
fn setup_serial_devices(
    vm: &mut Vm,
    io_bus: &mut devices::Bus,
    gsi_relay: &mut Option<GsiRelay>,
    serial_parameters: &BTreeMap<u8, SerialParameters>,
    serial_jail: Option<Minijail>,
) -> Result<Option<u8>> {
    // One event per IRQ line: ports 1 and 3 share one, ports 2 and 4 share the other.
    let com_evt_1_3 = EventFd::new().map_err(Error::CreateEventFd)?;
    let com_evt_2_4 = EventFd::new().map_err(Error::CreateEventFd)?;

    let stdio_serial_num = arch::add_serial_devices(
        io_bus,
        &com_evt_1_3,
        &com_evt_2_4,
        &serial_parameters,
        serial_jail,
    )
    .map_err(Error::CreateSerialDevices)?;

    match gsi_relay {
        // Split IRQ chip: the relay takes ownership of the events.
        Some(relay) => {
            relay.register_irqfd(com_evt_1_3, X86_64_SERIAL_1_3_IRQ as usize);
            relay.register_irqfd(com_evt_2_4, X86_64_SERIAL_2_4_IRQ as usize);
        }
        // Otherwise the events are registered with the VM directly.
        None => {
            vm.register_irqfd(&com_evt_1_3, X86_64_SERIAL_1_3_IRQ)
                .map_err(Error::RegisterIrqfd)?;
            vm.register_irqfd(&com_evt_2_4, X86_64_SERIAL_2_4_IRQ)
                .map_err(Error::RegisterIrqfd)?;
        }
    }

    Ok(stdio_serial_num)
}
/// Prepares one vcpu for booting a kernel loaded at `KERNEL_START_OFFSET`.
///
/// Sets up CPUID, MSRs, the general purpose registers (entry point, boot stack pointer and
/// zero page address), the FPU, the special registers and the local interrupt lines, in that
/// order.
///
/// # Arguments
///
/// * `guest_mem` - The memory to be used by the guest.
/// * `kvm` - The /dev/kvm object that created vcpu.
/// * `_vm` - The VM object associated with this VCPU (unused).
/// * `vcpu` - The VCPU object to configure.
/// * `cpu_id` - The id of the given `vcpu`.
/// * `num_cpus` - Number of virtual CPUs the guest will have.
///
/// # Errors
///
/// Returns `Error::KernelOffsetPastEnd` if the kernel entry point lies outside guest memory,
/// or the error of the first register setup step that fails.
fn configure_vcpu(
    guest_mem: &GuestMemory,
    kvm: &Kvm,
    _vm: &Vm,
    vcpu: &Vcpu,
    cpu_id: u64,
    num_cpus: u64,
) -> Result<()> {
    cpuid::setup_cpuid(kvm, vcpu, cpu_id, num_cpus).map_err(Error::SetupCpuid)?;
    regs::setup_msrs(vcpu, END_ADDR_BEFORE_32BITS).map_err(Error::SetupMsrs)?;

    // The 64-bit entry point sits at a fixed offset from the kernel load address.
    let entry_point = guest_mem
        .checked_offset(GuestAddress(KERNEL_START_OFFSET), KERNEL_64BIT_ENTRY_OFFSET)
        .ok_or(Error::KernelOffsetPastEnd)?;
    regs::setup_regs(
        vcpu,
        entry_point.offset(),
        BOOT_STACK_POINTER,
        ZERO_PAGE_OFFSET,
    )
    .map_err(Error::SetupRegs)?;

    regs::setup_fpu(vcpu).map_err(Error::SetupFpu)?;
    regs::setup_sregs(guest_mem, vcpu).map_err(Error::SetupSregs)?;
    interrupts::set_lint(vcpu).map_err(Error::SetLint)?;
    Ok(())
}
}
#[cfg(test)]
mod tests {
    use super::*;

    // Guest RAM sizes exercised below: one well below the 32-bit gap and one just past 4 GiB.
    const SMALL_RAM: u64 = 1u64 << 29;
    const LARGE_RAM: u64 = (1u64 << 32) + 0x8000;
    const FOUR_GIB: u64 = 1u64 << 32;

    /// Small guest without BIOS: a single region covering all of RAM from address 0.
    #[test]
    fn regions_lt_4gb_nobios() {
        let regions = arch_memory_regions(SMALL_RAM, /* has_bios */ false);
        assert_eq!(1, regions.len());
        let low = &regions[0];
        assert_eq!(GuestAddress(0), low.0);
        assert_eq!(SMALL_RAM, low.1);
    }

    /// Large guest without BIOS: one region at address 0 and one starting at 4 GiB.
    #[test]
    fn regions_gt_4gb_nobios() {
        let regions = arch_memory_regions(LARGE_RAM, /* has_bios */ false);
        assert_eq!(2, regions.len());
        let (low, high) = (&regions[0], &regions[1]);
        assert_eq!(GuestAddress(0), low.0);
        assert_eq!(GuestAddress(FOUR_GIB), high.0);
    }

    /// Small guest with BIOS: the RAM region followed by the BIOS region.
    #[test]
    fn regions_lt_4gb_bios() {
        let regions = arch_memory_regions(SMALL_RAM, /* has_bios */ true);
        assert_eq!(2, regions.len());
        let (low, bios) = (&regions[0], &regions[1]);
        assert_eq!(GuestAddress(0), low.0);
        assert_eq!(SMALL_RAM, low.1);
        assert_eq!(GuestAddress(BIOS_START), bios.0);
        assert_eq!(BIOS_LEN as u64, bios.1);
    }

    /// Large guest with BIOS: low RAM, then the BIOS region, then RAM starting at 4 GiB.
    #[test]
    fn regions_gt_4gb_bios() {
        let regions = arch_memory_regions(LARGE_RAM, /* has_bios */ true);
        assert_eq!(3, regions.len());
        let (low, bios, high) = (&regions[0], &regions[1], &regions[2]);
        assert_eq!(GuestAddress(0), low.0);
        assert_eq!(GuestAddress(BIOS_START), bios.0);
        assert_eq!(BIOS_LEN as u64, bios.1);
        assert_eq!(GuestAddress(FOUR_GIB), high.0);
    }
}