kvm/src/lib.rs - platform/external/crosvm - Git at Google

 // Copyright 2017 The ChromiumOS Authors
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 //! A safe wrapper around the kernel's KVM interface.
 //!
 //! New code should use the `hypervisor` crate instead.

 #![cfg(any(target_os = "android", target_os = "linux"))]

 mod cap;

 use std::cell::RefCell;
 use std::cmp::min;
 use std::cmp::Ordering;
 use std::collections::BTreeMap;
 use std::collections::BinaryHeap;
 use std::ffi::CString;
 use std::fs::File;
 use std::mem::size_of;
 use std::ops::Deref;
 use std::ops::DerefMut;
 use std::os::raw::*;
 use std::os::unix::prelude::OsStrExt;
 use std::path::Path;
 use std::path::PathBuf;
 use std::ptr::copy_nonoverlapping;
 use std::sync::Arc;

 #[allow(unused_imports)]
 use base::ioctl;
 #[allow(unused_imports)]
 use base::ioctl_with_mut_ptr;
 #[allow(unused_imports)]
 use base::ioctl_with_mut_ref;
 #[allow(unused_imports)]
 use base::ioctl_with_ptr;
 #[allow(unused_imports)]
 use base::ioctl_with_ref;
 #[allow(unused_imports)]
 use base::ioctl_with_val;
 #[allow(unused_imports)]
 use base::pagesize;
 #[allow(unused_imports)]
 use base::signal;
 use base::sys::BlockedSignal;
 #[allow(unused_imports)]
 use base::unblock_signal;
 #[allow(unused_imports)]
 use base::warn;
 use base::AsRawDescriptor;
 #[allow(unused_imports)]
 use base::Error;
 #[allow(unused_imports)]
 use base::Event;
 use base::FromRawDescriptor;
 #[allow(unused_imports)]
 use base::IoctlNr;
 #[allow(unused_imports)]
 use base::MappedRegion;
 #[allow(unused_imports)]
 use base::MemoryMapping;
 #[allow(unused_imports)]
 use base::MemoryMappingBuilder;
 #[allow(unused_imports)]
 use base::MmapError;
 use base::RawDescriptor;
 #[allow(unused_imports)]
 use base::Result;
 #[allow(unused_imports)]
 use base::SIGRTMIN;
 use data_model::vec_with_array_field;
 #[cfg(target_arch = "x86_64")]
 use data_model::FlexibleArrayWrapper;
 use kvm_sys::*;
 use libc::open64;
 use libc::sigset_t;
 use libc::EBUSY;
 use libc::EINVAL;
 use libc::ENOENT;
 use libc::ENOSPC;
 use libc::EOVERFLOW;
 use libc::O_CLOEXEC;
 use libc::O_RDWR;
 use sync::Mutex;
 use vm_memory::GuestAddress;
 use vm_memory::GuestMemory;

 pub use crate::cap::*;

 fn errno_result<T>() -> Result<T> {
     Err(Error::last())
 }

 /// Issues `KVM_SET_USER_MEMORY_REGION` on `fd` for the memory slot `slot`.
 ///
 /// # Arguments
 ///
 /// * `fd` - Descriptor the ioctl is issued on (callers in this file pass the VM fd).
 /// * `slot` - Memory slot number being configured.
 /// * `read_only` - When true, `KVM_MEM_READONLY` is set in the region flags.
 /// * `log_dirty_pages` - When true, `KVM_MEM_LOG_DIRTY_PAGES` is set in the region flags.
 /// * `guest_addr` - Guest physical address of the region.
 /// * `memory_size` - Size of the region in bytes (`remove_memory_region` passes 0).
 /// * `userspace_addr` - Host address of the memory backing the region.
 ///
 /// # Safety
 ///
 /// The caller must make sure that `userspace_addr` and `memory_size` describe host memory that
 /// stays mapped for as long as the slot is registered, and that the region does not overlap
 /// another slot (see the safety comments at the call sites).
 unsafe fn set_user_memory_region<F: AsRawDescriptor>(
     fd: &F,
     slot: u32,
     read_only: bool,
     log_dirty_pages: bool,
     guest_addr: u64,
     memory_size: u64,
     userspace_addr: *mut u8,
 ) -> Result<()> {
     let read_only_flag = if read_only { KVM_MEM_READONLY } else { 0 };
     let dirty_log_flag = if log_dirty_pages {
         KVM_MEM_LOG_DIRTY_PAGES
     } else {
         0
     };
     let region = kvm_userspace_memory_region {
         slot,
         flags: read_only_flag | dirty_log_flag,
         guest_phys_addr: guest_addr,
         memory_size,
         userspace_addr: userspace_addr as u64,
     };

     match ioctl_with_ref(fd, KVM_SET_USER_MEMORY_REGION(), &region) {
         0 => Ok(()),
         _ => errno_result(),
     }
 }

 /// Computes how many bytes a dirty log bitmap needs for a memory region of `size` bytes.
 ///
 /// The bitmap holds one bit per page: `size` is rounded up to whole pages (using `pagesize()`),
 /// and the page count is then rounded up to whole bytes.
 ///
 /// # Arguments
 ///
 /// * `size` - Number of bytes in the memory region being queried.
 pub fn dirty_log_bitmap_size(size: usize) -> usize {
     let page_size = pagesize();
     // Number of pages needed to cover `size`, rounding up.
     let page_count = (size + page_size - 1) / page_size;
     // One bit per page, rounded up to a whole number of bytes.
     (page_count + 7) / 8
 }

 /// A wrapper around opening and using `/dev/kvm`.
 ///
 /// Useful for querying extensions and basic values from the KVM backend. A `Kvm` is required to
 /// create a `Vm` object.
 pub struct Kvm {
     // Open handle to the KVM device node; its descriptor is what `Kvm`'s ioctls are issued on.
     kvm: File,
 }

 impl Kvm {
     /// Opens `/dev/kvm/` and returns a Kvm object on success.
     pub fn new() -> Result<Kvm> {
         Kvm::new_with_path(&PathBuf::from("/dev/kvm"))
     }

     /// Opens a KVM device at `device_path` and returns a Kvm object on success.
     pub fn new_with_path(device_path: &Path) -> Result<Kvm> {
         let c_path = CString::new(device_path.as_os_str().as_bytes()).unwrap();
         // SAFETY:
         // Open calls are safe because we give a nul-terminated string and verify the result.
         let ret = unsafe { open64(c_path.as_ptr(), O_RDWR | O_CLOEXEC) };
         if ret < 0 {
             return errno_result();
         }
         Ok(Kvm {
             kvm: {
                 // SAFETY:
                 // Safe because we verify that ret is valid and we own the fd.
                 unsafe { File::from_raw_descriptor(ret) }
             },
         })
     }

     fn check_extension_int(&self, c: Cap) -> i32 {
         // SAFETY:
         // Safe because we know that our file is a KVM fd and that the extension is one of the ones
         // defined by kernel.
         unsafe { ioctl_with_val(self, KVM_CHECK_EXTENSION(), c as c_ulong) }
     }

     /// Checks if a particular `Cap` is available.
     pub fn check_extension(&self, c: Cap) -> bool {
         self.check_extension_int(c) == 1
     }

     /// Gets the size of the mmap required to use vcpu's `kvm_run` structure.
     pub fn get_vcpu_mmap_size(&self) -> Result<usize> {
         // SAFETY:
         // Safe because we know that our file is a KVM fd and we verify the return result.
         let res = unsafe { ioctl(self, KVM_GET_VCPU_MMAP_SIZE()) };
         if res > 0 {
             Ok(res as usize)
         } else {
             errno_result()
         }
     }

     #[cfg(target_arch = "x86_64")]
     fn get_cpuid(&self, kind: IoctlNr) -> Result<CpuId> {
         const MAX_KVM_CPUID_ENTRIES: usize = 256;
         let mut cpuid = CpuId::new(MAX_KVM_CPUID_ENTRIES);

         // SAFETY:
         // ioctl is unsafe. The kernel is trusted not to write beyond the bounds of the memory
         // allocated for the struct. The limit is read from nent, which is set to the allocated
         // size(MAX_KVM_CPUID_ENTRIES) above.
         let ret = unsafe { ioctl_with_mut_ptr(self, kind, cpuid.as_mut_ptr()) };
         if ret < 0 {
             return errno_result();
         }

         Ok(cpuid)
     }

     /// X86 specific call to get the system supported CPUID values
     #[cfg(target_arch = "x86_64")]
     pub fn get_supported_cpuid(&self) -> Result<CpuId> {
         self.get_cpuid(KVM_GET_SUPPORTED_CPUID())
     }

     /// X86 specific call to get the system emulated CPUID values
     #[cfg(target_arch = "x86_64")]
     pub fn get_emulated_cpuid(&self) -> Result<CpuId> {
         self.get_cpuid(KVM_GET_EMULATED_CPUID())
     }

     /// X86 specific call to get list of supported MSRS
     ///
     /// See the documentation for KVM_GET_MSR_INDEX_LIST.
     #[cfg(target_arch = "x86_64")]
     pub fn get_msr_index_list(&self) -> Result<Vec<u32>> {
         const MAX_KVM_MSR_ENTRIES: usize = 256;

         let mut msr_list = vec_with_array_field::<kvm_msr_list, u32>(MAX_KVM_MSR_ENTRIES);
         msr_list[0].nmsrs = MAX_KVM_MSR_ENTRIES as u32;

         // SAFETY:
         // ioctl is unsafe. The kernel is trusted not to write beyond the bounds of the memory
         // allocated for the struct. The limit is read from nmsrs, which is set to the allocated
         // size (MAX_KVM_MSR_ENTRIES) above.
         let ret = unsafe { ioctl_with_mut_ref(self, KVM_GET_MSR_INDEX_LIST(), &mut msr_list[0]) };
         if ret < 0 {
             return errno_result();
         }

         let mut nmsrs = msr_list[0].nmsrs;

         // SAFETY:
         // Mapping the unsized array to a slice is unsafe because the length isn't known.  Using
         // the length we originally allocated with eliminates the possibility of overflow.
         let indices: &[u32] = unsafe {
             if nmsrs > MAX_KVM_MSR_ENTRIES as u32 {
                 nmsrs = MAX_KVM_MSR_ENTRIES as u32;
             }
             msr_list[0].indices.as_slice(nmsrs as usize)
         };

         Ok(indices.to_vec())
     }

     #[cfg(any(target_arch = "x86_64", target_arch = "riscv64"))]
     // The x86 and riscv machine type is always 0
     pub fn get_vm_type(&self) -> c_ulong {
         0
     }

     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
     // Compute the machine type, which should be the IPA range for the VM
     // Ideally, this would take a description of the memory map and return
     // the closest machine type for this VM. Here, we just return the maximum
     // the kernel support.
     #[allow(clippy::useless_conversion)]
     pub fn get_vm_type(&self) -> c_ulong {
         // SAFETY:
         // Safe because we know self is a real kvm fd
         match unsafe { ioctl_with_val(self, KVM_CHECK_EXTENSION(), KVM_CAP_ARM_VM_IPA_SIZE.into()) }
         {
             // Not supported? Use 0 as the machine type, which implies 40bit IPA
             ret if ret < 0 => 0,
             // Use the lower 8 bits representing the IPA space as the machine type
             ipa => (ipa & 0xff) as c_ulong,
         }
     }
 }

 /// Exposes the descriptor of the underlying KVM device file.
 impl AsRawDescriptor for Kvm {
     fn as_raw_descriptor(&self) -> RawDescriptor {
         let device = &self.kvm;
         device.as_raw_descriptor()
     }
 }

 /// An address either in programmable I/O space or in memory mapped I/O space.
 #[derive(Copy, Clone, Debug)]
 pub enum IoeventAddress {
     /// Address in programmable I/O space; ioevents on it are registered with the PIO flag set.
     Pio(u64),
     /// Address in memory mapped I/O space.
     Mmio(u64),
 }

 /// Used in `Vm::register_ioevent` to indicate a size and optionally value to match.
 ///
 /// For the sized variants, `Some(value)` requests a match on `value`, while `None` only fixes the
 /// length of the access.
 pub enum Datamatch {
     /// No value match; a length of 0 is passed to the kernel.
     AnyLength,
     /// 1-byte access, optionally matching the given value.
     U8(Option<u8>),
     /// 2-byte access, optionally matching the given value.
     U16(Option<u16>),
     /// 4-byte access, optionally matching the given value.
     U32(Option<u32>),
     /// 8-byte access, optionally matching the given value.
     U64(Option<u64>),
 }

 /// A source of IRQs in an `IrqRoute`.
 pub enum IrqSource {
     /// A pin on an interrupt controller chip; routed as a `KVM_IRQ_ROUTING_IRQCHIP` entry with
     /// `chip` as the irqchip index and `pin` as the pin number.
     Irqchip { chip: u32, pin: u32 },
     /// A message signaled interrupt; routed as a `KVM_IRQ_ROUTING_MSI` entry with the 64-bit
     /// `address` split into low/high halves and `data` passed through.
     Msi { address: u64, data: u32 },
 }

 /// A single route for an IRQ.
 ///
 /// Consumed by `Vm::set_gsi_routing`, which turns each route into one kernel routing entry.
 pub struct IrqRoute {
     /// GSI number written to the routing entry.
     pub gsi: u32,
     /// The irqchip pin or MSI associated with `gsi`.
     pub source: IrqSource,
 }

 /// Interrupt controller IDs
 ///
 /// The discriminant is used as the `chip_id` by `Vm::get_pic_state` and `Vm::set_pic_state`.
 pub enum PicId {
     /// Primary PIC (`chip_id` 0).
     Primary = 0,
     /// Secondary PIC (`chip_id` 1).
     Secondary = 1,
 }

 /// Number of pins on the IOAPIC (the I/O APIC interrupt controller).
 pub const NUM_IOAPIC_PINS: usize = 24;

 // Newtype around a memory slot number whose ordering is reversed, so that a max-heap
 // (`BinaryHeap`) hands out the numerically smallest slot first.
 #[derive(Copy, Clone, Eq, PartialEq)]
 struct MemSlot(u32);

 impl Ord for MemSlot {
     fn cmp(&self, other: &MemSlot) -> Ordering {
         // Operands are deliberately swapped: a smaller slot number compares as "greater", which
         // gives it the highest priority in a max-heap.
         let (mine, theirs) = (self.0, other.0);
         Ord::cmp(&theirs, &mine)
     }
 }

 impl PartialOrd for MemSlot {
     fn partial_cmp(&self, other: &MemSlot) -> Option<Ordering> {
         // Total order: always defer to `Ord::cmp`.
         Some(Ord::cmp(self, other))
     }
 }

 /// A wrapper around creating and using a VM.
 pub struct Vm {
     // VM file created from the descriptor returned by `KVM_CREATE_VM`.
     vm: File,
     // Guest memory whose regions were registered with KVM when the VM was constructed.
     guest_mem: GuestMemory,
     // Mappings added through `add_memory_region`, keyed by the memory slot they occupy.
     mem_regions: Arc<Mutex<BTreeMap<u32, Box<dyn MappedRegion>>>>,
     // Slot numbers released by `remove_memory_region` (or by a failed add); the lowest one is
     // reused first.
     mem_slot_gaps: Arc<Mutex<BinaryHeap<MemSlot>>>,
 }

 impl Vm {
     /// Constructs a new `Vm` using the given `Kvm` instance.
     ///
     /// Every region of `guest_mem` is registered with the new VM as a writable memory slot
     /// without dirty logging, using the region's index as the slot number. Any failure, either
     /// from `KVM_CREATE_VM` or from registering a region, is returned as an error.
     pub fn new(kvm: &Kvm, guest_mem: GuestMemory) -> Result<Vm> {
         // SAFETY:
         // Safe because we know kvm is a real kvm fd as this module is the only one that can make
         // Kvm objects.
         let ret = unsafe { ioctl_with_val(kvm, KVM_CREATE_VM(), kvm.get_vm_type()) };
         if ret < 0 {
             return errno_result();
         }

         // SAFETY:
         // Safe because we verify the value of ret and we are the owners of the fd.
         let vm_file = unsafe { File::from_raw_descriptor(ret) };
         for region in guest_mem.regions() {
             let slot = region.index as u32;
             let guest_addr = region.guest_addr.offset();
             let size = region.size as u64;
             let host_addr = region.host_addr as *mut u8;
             // SAFETY:
             // Safe because the guest regions are guaranteed not to overlap.
             unsafe {
                 set_user_memory_region(
                     &vm_file, slot, false, false, guest_addr, size, host_addr,
                 )
             }?;
         }

         Ok(Vm {
             vm: vm_file,
             guest_mem,
             mem_regions: Arc::new(Mutex::new(BTreeMap::new())),
             mem_slot_gaps: Arc::new(Mutex::new(BinaryHeap::new())),
         })
     }

     /// Checks if a particular `Cap` is available.
     ///
     /// This is distinct from the `Kvm` version of this method because some extensions depend on
     /// the existence of the particular `Vm`. This method is encouraged by the kernel because it
     /// more accurately reflects the usable capabilities.
     ///
     /// Only a `KVM_CHECK_EXTENSION` result of exactly 1 is treated as "available".
     pub fn check_extension(&self, c: Cap) -> bool {
         // SAFETY:
         // Safe because we know that our file is a KVM fd and that the extension is one of the ones
         // defined by kernel.
         let ret = unsafe { ioctl_with_val(self, KVM_CHECK_EXTENSION(), c as c_ulong) };
         ret == 1
     }

     /// Inserts the given `mem` into the VM's address space at `guest_addr`.
     ///
     /// The slot that was assigned the kvm memory mapping is returned on success. The slot can be
     /// given to `Vm::remove_memory_region` to remove the memory from the VM's address space and
     /// take back ownership of `mem`.
     ///
     /// Note that memory inserted into the VM's address space must not overlap with any other memory
     /// slot's region.
     ///
     /// If `read_only` is true, the guest will be able to read the memory as normal, but attempts to
     /// write will trigger a mmio VM exit, leaving the memory untouched.
     ///
     /// If `log_dirty_pages` is true, the slot number can be used to retrieve the pages written to
     /// by the guest with `get_dirty_log`.
     pub fn add_memory_region(
         &mut self,
         guest_addr: GuestAddress,
         mem: Box<dyn MappedRegion>,
         read_only: bool,
         log_dirty_pages: bool,
     ) -> Result<u32> {
         let size = mem.size() as u64;
         let end_addr = guest_addr
             .checked_add(size)
             .ok_or_else(|| Error::new(EOVERFLOW))?;
         if self.guest_mem.range_overlap(guest_addr, end_addr) {
             return Err(Error::new(ENOSPC));
         }
         let mut regions = self.mem_regions.lock();
         let mut gaps = self.mem_slot_gaps.lock();
         let slot = match gaps.pop() {
             Some(gap) => gap.0,
             None => (regions.len() + self.guest_mem.num_regions() as usize) as u32,
         };

         // SAFETY:
         // Safe because we check that the given guest address is valid and has no overlaps. We also
         // know that the pointer and size are correct because the MemoryMapping interface ensures
         // this. We take ownership of the memory mapping so that it won't be unmapped until the slot
         // is removed.
         let res = unsafe {
             set_user_memory_region(
                 &self.vm,
                 slot,
                 read_only,
                 log_dirty_pages,
                 guest_addr.offset(),
                 size,
                 mem.as_ptr(),
             )
         };

         if let Err(e) = res {
             gaps.push(MemSlot(slot));
             return Err(e);
         }
         regions.insert(slot, mem);
         Ok(slot)
     }

     /// Removes memory that was previously added at the given slot.
     ///
     /// On success, ownership of the host memory mapping associated with `slot` is handed back to
     /// the caller and the slot number becomes available for reuse. Returns `ENOENT` if no region
     /// was added at `slot`.
     pub fn remove_memory_region(&mut self, slot: u32) -> Result<Box<dyn MappedRegion>> {
         let mut regions = self.mem_regions.lock();
         let slot_in_use = regions.contains_key(&slot);
         if !slot_in_use {
             return Err(Error::new(ENOENT));
         }

         // SAFETY:
         // Safe because the slot is checked against the list of memory slots.
         let cleared =
             unsafe { set_user_memory_region(&self.vm, slot, false, false, 0, 0, std::ptr::null_mut()) };
         cleared?;

         self.mem_slot_gaps.lock().push(MemSlot(slot));
         // This remove will always succeed because of the contains_key check above.
         let mem = regions.remove(&slot).unwrap();
         Ok(mem)
     }

     /// Gets the bitmap of dirty pages since the last call to `get_dirty_log` for the memory at
     /// `slot`.
     ///
     /// The size of `dirty_log` must be at least as many bits as there are pages in the memory
     /// region `slot` represents. For example, if the size of `slot` is 16 pages, `dirty_log` must
     /// be 2 bytes or greater.
     ///
     /// # Errors
     ///
     /// Returns `ENOENT` if no region was added at `slot`, `EINVAL` if `dirty_log` is too small
     /// for the region, or the error reported by the `KVM_GET_DIRTY_LOG` ioctl.
     pub fn get_dirty_log(&self, slot: u32, dirty_log: &mut [u8]) -> Result<()> {
         match self.mem_regions.lock().get(&slot) {
             Some(mem) => {
                 // Ensures that there are as many bytes in dirty_log as there are pages in the mmap.
                 if dirty_log_bitmap_size(mem.size()) > dirty_log.len() {
                     return Err(Error::new(EINVAL));
                 }
                 let mut dirty_log_kvm = kvm_dirty_log {
                     slot,
                     ..Default::default()
                 };
                 // The kernel writes the bitmap through this pointer, so it must be derived from
                 // the mutable borrow (`as_mut_ptr`), not from a shared one (`as_ptr`).
                 dirty_log_kvm.__bindgen_anon_1.dirty_bitmap = dirty_log.as_mut_ptr() as *mut c_void;
                 // SAFETY:
                 // Safe because the `dirty_bitmap` pointer assigned above is guaranteed to be valid
                 // (because it's from a slice) and we checked that it will be large enough to hold
                 // the entire log.
                 let ret = unsafe { ioctl_with_ref(self, KVM_GET_DIRTY_LOG(), &dirty_log_kvm) };
                 if ret == 0 {
                     Ok(())
                 } else {
                     errno_result()
                 }
             }
             _ => Err(Error::new(ENOENT)),
         }
     }

     /// Gets a reference to the guest memory owned by this VM.
     ///
     /// Note that `GuestMemory` does not include any mmio memory that may have been added after
     /// this VM was constructed.
     pub fn get_memory(&self) -> &GuestMemory {
         &self.guest_mem
     }

     /// Sets the address of a one-page region in the VM's address space.
     ///
     /// See the documentation on the KVM_SET_IDENTITY_MAP_ADDR ioctl.
     #[cfg(target_arch = "x86_64")]
     pub fn set_identity_map_addr(&self, addr: GuestAddress) -> Result<()> {
         let map_addr = addr.offset();
         // SAFETY:
         // Safe because we know that our file is a VM fd and we verify the return result.
         let ret = unsafe { ioctl_with_ref(self, KVM_SET_IDENTITY_MAP_ADDR(), &map_addr) };
         match ret {
             0 => Ok(()),
             _ => errno_result(),
         }
     }

     /// Retrieves the current timestamp of kvmclock as seen by the current guest.
     ///
     /// See the documentation on the KVM_GET_CLOCK ioctl.
     #[cfg(target_arch = "x86_64")]
     pub fn get_clock(&self) -> Result<kvm_clock_data> {
         // SAFETY: trivially safe
         let mut clock_data: kvm_clock_data = unsafe { std::mem::zeroed() };
         // SAFETY:
         // Safe because we know that our file is a VM fd, we know the kernel will only write
         // correct amount of memory to our pointer, and we verify the return result.
         let ret = unsafe { ioctl_with_mut_ref(self, KVM_GET_CLOCK(), &mut clock_data) };
         if ret != 0 {
             return errno_result();
         }
         Ok(clock_data)
     }

     /// Sets the current timestamp of kvmclock to the specified value.
     ///
     /// See the documentation on the KVM_SET_CLOCK ioctl.
     #[cfg(target_arch = "x86_64")]
     pub fn set_clock(&self, clock_data: &kvm_clock_data) -> Result<()> {
         // SAFETY:
         // Safe because we know that our file is a VM fd, we know the kernel will only read
         // correct amount of memory from our pointer, and we verify the return result.
         let ret = unsafe { ioctl_with_ref(self, KVM_SET_CLOCK(), clock_data) };
         match ret {
             0 => Ok(()),
             _ => errno_result(),
         }
     }

     /// Creates an in-kernel interrupt controller.
     ///
     /// See the documentation on the KVM_CREATE_IRQCHIP ioctl.
     #[cfg(any(target_arch = "x86_64", target_arch = "arm", target_arch = "aarch64"))]
     pub fn create_irq_chip(&self) -> Result<()> {
         // SAFETY:
         // Safe because we know that our file is a VM fd and we verify the return result.
         let ret = unsafe { ioctl(self, KVM_CREATE_IRQCHIP()) };
         match ret {
             0 => Ok(()),
             _ => errno_result(),
         }
     }

     /// Retrieves the state of the interrupt controller `id` by issuing the KVM_GET_IRQCHIP ioctl.
     ///
     /// Note that this call can only succeed after a call to `Vm::create_irq_chip`.
     #[cfg(target_arch = "x86_64")]
     pub fn get_pic_state(&self, id: PicId) -> Result<kvm_pic_state> {
         let mut request = kvm_irqchip {
             chip_id: id as u32,
             ..Default::default()
         };
         // SAFETY:
         // Safe because we know our file is a VM fd, we know the kernel will only write
         // correct amount of memory to our pointer, and we verify the return result.
         let ret = unsafe { ioctl_with_mut_ref(self, KVM_GET_IRQCHIP(), &mut request) };
         if ret != 0 {
             return errno_result();
         }
         // SAFETY:
         // Safe as we know that we are retrieving data related to the
         // PIC (primary or secondary) and not IOAPIC.
         let pic_state = unsafe { request.chip.pic };
         Ok(pic_state)
     }

     /// Sets the state of the interrupt controller `id` by issuing the KVM_SET_IRQCHIP ioctl.
     ///
     /// Note that this call can only succeed after a call to `Vm::create_irq_chip`.
     #[cfg(target_arch = "x86_64")]
     pub fn set_pic_state(&self, id: PicId, state: &kvm_pic_state) -> Result<()> {
         let mut request = kvm_irqchip {
             chip_id: id as u32,
             ..Default::default()
         };
         request.chip.pic = *state;
         // SAFETY:
         // Safe because we know that our file is a VM fd, we know the kernel will only read
         // correct amount of memory from our pointer, and we verify the return result.
         let ret = unsafe { ioctl_with_ref(self, KVM_SET_IRQCHIP(), &request) };
         match ret {
             0 => Ok(()),
             _ => errno_result(),
         }
     }

     /// Retrieves the state of the IOAPIC by issuing the KVM_GET_IRQCHIP ioctl with chip id 2.
     ///
     /// Note that this call can only succeed after a call to `Vm::create_irq_chip`.
     #[cfg(target_arch = "x86_64")]
     pub fn get_ioapic_state(&self) -> Result<kvm_ioapic_state> {
         let mut request = kvm_irqchip {
             chip_id: 2,
             ..Default::default()
         };
         // SAFETY:
         // Safe because we know our file is a VM fd, we know the kernel will only write
         // correct amount of memory to our pointer, and we verify the return result.
         let ret = unsafe { ioctl_with_mut_ref(self, KVM_GET_IRQCHIP(), &mut request) };
         if ret != 0 {
             return errno_result();
         }
         // SAFETY:
         // Safe as we know that we are retrieving data related to the
         // IOAPIC and not PIC.
         let ioapic_state = unsafe { request.chip.ioapic };
         Ok(ioapic_state)
     }

     /// Sets the state of the IOAPIC by issuing the KVM_SET_IRQCHIP ioctl with chip id 2.
     ///
     /// Note that this call can only succeed after a call to `Vm::create_irq_chip`.
     #[cfg(target_arch = "x86_64")]
     pub fn set_ioapic_state(&self, state: &kvm_ioapic_state) -> Result<()> {
         let mut request = kvm_irqchip {
             chip_id: 2,
             ..Default::default()
         };
         request.chip.ioapic = *state;
         // SAFETY:
         // Safe because we know that our file is a VM fd, we know the kernel will only read
         // correct amount of memory from our pointer, and we verify the return result.
         let ret = unsafe { ioctl_with_ref(self, KVM_SET_IRQCHIP(), &request) };
         match ret {
             0 => Ok(()),
             _ => errno_result(),
         }
     }

     /// Drives the given irq line: level 1 when `active` is true, level 0 otherwise.
     #[cfg(any(target_arch = "x86_64", target_arch = "arm", target_arch = "aarch64"))]
     pub fn set_irq_line(&self, irq: u32, active: bool) -> Result<()> {
         let mut request = kvm_irq_level::default();
         request.level = active.into();
         request.__bindgen_anon_1.irq = irq;

         // SAFETY:
         // Safe because we know that our file is a VM fd, we know the kernel will only read the
         // correct amount of memory from our pointer, and we verify the return result.
         let ret = unsafe { ioctl_with_ref(self, KVM_IRQ_LINE(), &request) };
         match ret {
             0 => Ok(()),
             _ => errno_result(),
         }
     }

     /// Creates a PIT as per the KVM_CREATE_PIT2 ioctl, using a default `kvm_pit_config`.
     ///
     /// Note that this call can only succeed after a call to `Vm::create_irq_chip`.
     #[cfg(target_arch = "x86_64")]
     pub fn create_pit(&self) -> Result<()> {
         let config = kvm_pit_config::default();
         // SAFETY:
         // Safe because we know that our file is a VM fd, we know the kernel will only read the
         // correct amount of memory from our pointer, and we verify the return result.
         let ret = unsafe { ioctl_with_ref(self, KVM_CREATE_PIT2(), &config) };
         match ret {
             0 => Ok(()),
             _ => errno_result(),
         }
     }

     /// Retrieves the state of the PIT by issuing the KVM_GET_PIT2 ioctl.
     ///
     /// Note that this call can only succeed after a call to `Vm::create_pit`.
     #[cfg(target_arch = "x86_64")]
     pub fn get_pit_state(&self) -> Result<kvm_pit_state2> {
         // SAFETY: trivially safe
         let mut pit_state: kvm_pit_state2 = unsafe { std::mem::zeroed() };
         // SAFETY:
         // Safe because we know that our file is a VM fd, we know the kernel will only write
         // correct amount of memory to our pointer, and we verify the return result.
         let ret = unsafe { ioctl_with_mut_ref(self, KVM_GET_PIT2(), &mut pit_state) };
         if ret != 0 {
             return errno_result();
         }
         Ok(pit_state)
     }

     /// Sets the state of the PIT by issuing the KVM_SET_PIT2 ioctl.
     ///
     /// Note that this call can only succeed after a call to `Vm::create_pit`.
     #[cfg(target_arch = "x86_64")]
     pub fn set_pit_state(&self, pit_state: &kvm_pit_state2) -> Result<()> {
         // SAFETY:
         // Safe because we know that our file is a VM fd, we know the kernel will only read
         // correct amount of memory from our pointer, and we verify the return result.
         let ret = unsafe { ioctl_with_ref(self, KVM_SET_PIT2(), pit_state) };
         match ret {
             0 => Ok(()),
             _ => errno_result(),
         }
     }

     /// Registers an event to be signaled whenever a certain address is written to.
     ///
     /// `datamatch` can restrict signaling of `evt` to writes whose value equals the one given.
     /// The size of `datamatch` matters: it must match the expected size of the guest's write.
     ///
     /// In all cases where `evt` is signaled, the ordinary vmexit to userspace that would be
     /// triggered is prevented.
     pub fn register_ioevent(
         &self,
         evt: &Event,
         addr: IoeventAddress,
         datamatch: Datamatch,
     ) -> Result<()> {
         let deassign = false;
         self.ioeventfd(evt, addr, datamatch, deassign)
     }

     /// Unregisters an event previously registered with `register_ioevent`.
     ///
     /// The `evt`, `addr`, and `datamatch` set must be the same as the ones passed into
     /// `register_ioevent`.
     pub fn unregister_ioevent(
         &self,
         evt: &Event,
         addr: IoeventAddress,
         datamatch: Datamatch,
     ) -> Result<()> {
         let deassign = true;
         self.ioeventfd(evt, addr, datamatch, deassign)
     }

     /// Shared implementation of `register_ioevent` and `unregister_ioevent`.
     ///
     /// Builds a `kvm_ioeventfd` request from the arguments and issues the `KVM_IOEVENTFD` ioctl.
     /// The deassign flag is set when `deassign` is true, the datamatch flag when `datamatch`
     /// carries a value, and the pio flag when `addr` is a `Pio` address.
     fn ioeventfd(
         &self,
         evt: &Event,
         addr: IoeventAddress,
         datamatch: Datamatch,
         deassign: bool,
     ) -> Result<()> {
         // (match on value?, value to match, access length in bytes)
         let (do_datamatch, datamatch_value, datamatch_len) = match datamatch {
             Datamatch::AnyLength => (false, 0, 0),
             Datamatch::U8(Some(u)) => (true, u as u64, 1),
             Datamatch::U8(None) => (false, 0, 1),
             Datamatch::U16(Some(u)) => (true, u as u64, 2),
             Datamatch::U16(None) => (false, 0, 2),
             Datamatch::U32(Some(u)) => (true, u as u64, 4),
             Datamatch::U32(None) => (false, 0, 4),
             Datamatch::U64(Some(u)) => (true, u, 8),
             Datamatch::U64(None) => (false, 0, 8),
         };
         let is_pio = matches!(addr, IoeventAddress::Pio(_));
         let raw_addr = match addr {
             IoeventAddress::Pio(a) | IoeventAddress::Mmio(a) => a,
         };

         let mut flags = 0;
         if is_pio {
             flags |= 1 << kvm_ioeventfd_flag_nr_pio;
         }
         if do_datamatch {
             flags |= 1 << kvm_ioeventfd_flag_nr_datamatch;
         }
         if deassign {
             flags |= 1 << kvm_ioeventfd_flag_nr_deassign;
         }

         let request = kvm_ioeventfd {
             datamatch: datamatch_value,
             len: datamatch_len,
             addr: raw_addr,
             fd: evt.as_raw_descriptor(),
             flags,
             ..Default::default()
         };
         // SAFETY:
         // Safe because we know that our file is a VM fd, we know the kernel will only read the
         // correct amount of memory from our pointer, and we verify the return result.
         let ret = unsafe { ioctl_with_ref(self, KVM_IOEVENTFD(), &request) };
         match ret {
             0 => Ok(()),
             _ => errno_result(),
         }
     }

     /// Registers an event that will, when signalled, trigger the `gsi` irq; `resample_evt` will
     /// get triggered when the irqchip is resampled.
     ///
     /// Issues the `KVM_IRQFD` ioctl with `KVM_IRQFD_FLAG_RESAMPLE` set.
     #[cfg(any(target_arch = "x86_64", target_arch = "arm", target_arch = "aarch64"))]
     pub fn register_irqfd_resample(
         &self,
         evt: &Event,
         resample_evt: &Event,
         gsi: u32,
     ) -> Result<()> {
         let trigger_fd = evt.as_raw_descriptor() as u32;
         let resample_fd = resample_evt.as_raw_descriptor() as u32;
         let request = kvm_irqfd {
             fd: trigger_fd,
             gsi,
             flags: KVM_IRQFD_FLAG_RESAMPLE,
             resamplefd: resample_fd,
             ..Default::default()
         };
         // SAFETY:
         // Safe because we know that our file is a VM fd, we know the kernel will only read the
         // correct amount of memory from our pointer, and we verify the return result.
         let ret = unsafe { ioctl_with_ref(self, KVM_IRQFD(), &request) };
         match ret {
             0 => Ok(()),
             _ => errno_result(),
         }
     }

     /// Unregisters an event that was previously registered with
     /// `register_irqfd`/`register_irqfd_resample`.
     ///
     /// The `evt` and `gsi` pair must be the same as the ones passed into
     /// `register_irqfd`/`register_irqfd_resample`. Issues the `KVM_IRQFD` ioctl with
     /// `KVM_IRQFD_FLAG_DEASSIGN` set.
     #[cfg(any(target_arch = "x86_64", target_arch = "arm", target_arch = "aarch64"))]
     pub fn unregister_irqfd(&self, evt: &Event, gsi: u32) -> Result<()> {
         let trigger_fd = evt.as_raw_descriptor() as u32;
         let request = kvm_irqfd {
             flags: KVM_IRQFD_FLAG_DEASSIGN,
             fd: trigger_fd,
             gsi,
             ..Default::default()
         };
         // SAFETY:
         // Safe because we know that our file is a VM fd, we know the kernel will only read the
         // correct amount of memory from our pointer, and we verify the return result.
         let ret = unsafe { ioctl_with_ref(self, KVM_IRQFD(), &request) };
         match ret {
             0 => Ok(()),
             _ => errno_result(),
         }
     }

     /// Sets the GSI routing table, replacing any table set with previous calls to
     /// `set_gsi_routing`.
     #[cfg(target_arch = "x86_64")]
     pub fn set_gsi_routing(&self, routes: &[IrqRoute]) -> Result<()> {
         let mut irq_routing =
             vec_with_array_field::<kvm_irq_routing, kvm_irq_routing_entry>(routes.len());
         irq_routing[0].nr = routes.len() as u32;

         // SAFETY:
         // Safe because we ensured there is enough space in irq_routing to hold the number of
         // route entries.
         let irq_routes = unsafe { irq_routing[0].entries.as_mut_slice(routes.len()) };
         for (route, irq_route) in routes.iter().zip(irq_routes.iter_mut()) {
             irq_route.gsi = route.gsi;
             match route.source {
                 IrqSource::Irqchip { chip, pin } => {
                     irq_route.type_ = KVM_IRQ_ROUTING_IRQCHIP;
                     irq_route.u.irqchip = kvm_irq_routing_irqchip { irqchip: chip, pin }
                 }
                 IrqSource::Msi { address, data } => {
                     irq_route.type_ = KVM_IRQ_ROUTING_MSI;
                     irq_route.u.msi = kvm_irq_routing_msi {
                         address_lo: address as u32,
                         address_hi: (address >> 32) as u32,
                         data,
                         ..Default::default()
                     }
                 }
             }
         }

         // TODO(b/315998194): Add safety comment
         #[allow(clippy::undocumented_unsafe_blocks)]
         let ret = unsafe { ioctl_with_ref(self, KVM_SET_GSI_ROUTING(), &irq_routing[0]) };
         if ret == 0 {
             Ok(())
         } else {
             errno_result()
         }
     }

     /// Enables the capability described by `cap` on this VM.
     ///
     /// See the documentation for KVM_ENABLE_CAP.
     ///
     /// # Safety
     ///
     /// `cap` may contain values which the kernel interprets as pointers; the caller must make
     /// sure any such values are valid.
     pub unsafe fn kvm_enable_cap(&self, cap: &kvm_enable_cap) -> Result<()> {
         // SAFETY:
         // The struct is borrowed for the duration of the call and the kernel reads exactly the
         // size of the struct; the contents are covered by this function's safety contract.
         if ioctl_with_ref(self, KVM_ENABLE_CAP(), cap) < 0 {
             return errno_result();
         }
         Ok(())
     }
 }

 // Exposes the descriptor of the inner `vm` field, which lets a `Vm` be passed directly as the
 // descriptor argument of the `ioctl_with_*` helpers (as the methods above do with `self`).
 impl AsRawDescriptor for Vm {
     fn as_raw_descriptor(&self) -> RawDescriptor {
         self.vm.as_raw_descriptor()
     }
 }

 /// A reason why a VCPU exited. One of these returns every time `Vcpu::run` is called.
 #[derive(Debug)]
 pub enum VcpuExit {
     /// An out port instruction was run on the given port with the given data.
     ///
     /// `size` is the kernel-reported `count * size` of the access; at most the first 8 bytes of
     /// the written data are copied into `data`.
     IoOut {
         port: u16,
         size: usize,
         data: [u8; 8],
     },
     /// An in port instruction was run on the given port.
     ///
     /// The data that the instruction receives should be set with `set_data` before `Vcpu::run` is
     /// called again.
     IoIn {
         port: u16,
         size: usize,
     },
     /// A read instruction was run against the given MMIO address.
     ///
     /// The data that the instruction receives should be set with `set_data` before `Vcpu::run` is
     /// called again.
     MmioRead {
         address: u64,
         size: usize,
     },
     /// A write instruction was run against the given MMIO address with the given data.
     ///
     /// `size` is the access length clamped to the 8-byte `data` buffer.
     MmioWrite {
         address: u64,
         size: usize,
         data: [u8; 8],
     },
     /// A `KVM_EXIT_IOAPIC_EOI` exit, carrying the `vector` reported by the kernel.
     IoapicEoi {
         vector: u8,
     },
     /// A `KVM_EXIT_HYPERV` exit of type `KVM_EXIT_HYPERV_SYNIC`; the fields are copied from the
     /// kernel's `synic` exit data.
     HypervSynic {
         msr: u32,
         control: u64,
         evt_page: u64,
         msg_page: u64,
     },
     /// A `KVM_EXIT_HYPERV` exit of type `KVM_EXIT_HYPERV_HCALL`.
     ///
     /// The 8-byte hypercall result should be set with `set_data` before `Vcpu::run` is called
     /// again.
     HypervHcall {
         input: u64,
         params: [u64; 2],
     },
     // Payload-free exits. `RunnableVcpu::run` maps `KVM_EXIT_UNKNOWN`, `KVM_EXIT_EXCEPTION`,
     // `KVM_EXIT_HYPERCALL`, `KVM_EXIT_DEBUG` and `KVM_EXIT_HLT` to the first five variants; the
     // remaining ones presumably follow the same `KVM_EXIT_*` naming (confirm against `run`).
     Unknown,
     Exception,
     Hypercall,
     Debug,
     Hlt,
     IrqWindowOpen,
     Shutdown,
     FailEntry {
         hardware_entry_failure_reason: u64,
     },
     Intr,
     SetTpr,
     TprAccess,
     S390Sieic,
     S390Reset,
     Dcr,
     Nmi,
     InternalError,
     Osi,
     PaprHcall,
     S390Ucontrol,
     Watchdog,
     S390Tsch,
     Epr,
     /// The cpu triggered a system level event which is specified by the type field.
     /// The first field is the event type and the second field is flags.
     /// The possible event types are shutdown, reset, or crash.  So far there
     /// are not any flags defined.
     SystemEvent(u32 /* event_type */, u64 /* flags */),
 }

 /// A wrapper around creating and using a VCPU.
 /// `Vcpu` provides all functionality except for running. To run, `to_runnable` must be called to
 /// lock the vcpu to a thread. Then the returned `RunnableVcpu` can be used for running.
 pub struct Vcpu {
     // File that owns the vcpu fd returned by the `KVM_CREATE_VCPU` ioctl.
     vcpu: File,
     // Mapping of the vcpu fd, sized by `Kvm::get_vcpu_mmap_size`; it is accessed as the
     // `kvm_run` structure.
     run_mmap: MemoryMapping,
 }

 /// Per-thread record of the vcpu that `Vcpu::to_runnable` bound to the current thread.
 pub struct VcpuThread {
     // Pointer to the vcpu's `kvm_run` mapping, taken from `Vcpu::run_mmap` in `to_runnable`.
     run: *mut kvm_run,
     // The signal number that was passed to `to_runnable`, if any.
     signal_num: Option<c_int>,
 }

 // Thread-local slot holding the vcpu bound to this thread. It starts out as `None`, is filled in
 // by `Vcpu::to_runnable`, and is read by `Vcpu::set_local_immediate_exit`.
 thread_local!(static VCPU_THREAD: RefCell<Option<VcpuThread>> = RefCell::new(None));

 impl Vcpu {
     /// Creates VCPU number `id` on `vm` and maps its `kvm_run` area.
     ///
     /// The `id` argument is the CPU number between [0, max vcpus). The size of the mapping is
     /// queried from `kvm`.
     pub fn new(id: c_ulong, kvm: &Kvm, vm: &Vm) -> Result<Vcpu> {
         let mmap_len = kvm.get_vcpu_mmap_size()?;

         // SAFETY:
         // Safe because `vm` is a VM fd and the returned value is validated before use.
         let raw_fd = match unsafe { ioctl_with_val(vm, KVM_CREATE_VCPU(), id) } {
             fd if fd < 0 => return errno_result(),
             fd => fd,
         };

         // SAFETY:
         // The fd was just checked to be valid and nothing else owns it. It is wrapped right away
         // so that it gets closed if the mapping below fails and `?` returns early.
         let vcpu = unsafe { File::from_raw_descriptor(raw_fd) };

         // Any failure to build the mapping is reported as ENOSPC.
         let run_mmap = MemoryMappingBuilder::new(mmap_len)
             .from_file(&vcpu)
             .build()
             .map_err(|_| Error::new(ENOSPC))?;

         Ok(Vcpu { vcpu, run_mmap })
     }

     /// Consumes `self` and returns a `RunnableVcpu`. A `RunnableVcpu` is required to run the
     /// guest.
     /// Assigns a vcpu to the current thread and stores it in a hash map that can be used by signal
     /// handlers to call set_local_immediate_exit(). An optional signal number will be temporarily
     /// blocked while assigning the vcpu to the thread and later blocked when `RunnableVcpu` is
     /// destroyed.
     ///
     /// Returns an error, `EBUSY`, if the current thread already contains a Vcpu.
     #[allow(clippy::cast_ptr_alignment)]
     pub fn to_runnable(self, signal_num: Option<c_int>) -> Result<RunnableVcpu> {
         // Block signal while we add -- if a signal fires (very unlikely,
         // as this means something is trying to pause the vcpu before it has
         // even started) it'll try to grab the read lock while this write
         // lock is grabbed and cause a deadlock.
         // Assuming that a failure to block means it's already blocked.
         let _blocked_signal = signal_num.map(BlockedSignal::new);

         VCPU_THREAD.with(|v| {
             if v.borrow().is_none() {
                 *v.borrow_mut() = Some(VcpuThread {
                     run: self.run_mmap.as_ptr() as *mut kvm_run,
                     signal_num,
                 });
                 Ok(())
             } else {
                 Err(Error::new(EBUSY))
             }
         })?;

         Ok(RunnableVcpu {
             vcpu: self,
             phantom: Default::default(),
         })
     }

     /// Sets the data received by a mmio read, ioport in, or hypercall instruction.
     ///
     /// This function should be called after `Vcpu::run` returns an `VcpuExit::IoIn`,
     /// `VcpuExit::MmioRead`, or 'VcpuExit::HypervHcall`.
     #[allow(clippy::cast_ptr_alignment)]
     pub fn set_data(&self, data: &[u8]) -> Result<()> {
         // SAFETY:
         // Safe because we know we mapped enough memory to hold the kvm_run struct because the
         // kernel told us how large it was. The pointer is page aligned so casting to a different
         // type is well defined, hence the clippy allow attribute.
         let run = unsafe { &mut *(self.run_mmap.as_ptr() as *mut kvm_run) };
         match run.exit_reason {
             KVM_EXIT_IO => {
                 let run_start = run as *mut kvm_run as *mut u8;
                 // SAFETY:
                 // Safe because the exit_reason (which comes from the kernel) told us which
                 // union field to use.
                 let io = unsafe { run.__bindgen_anon_1.io };
                 if io.direction as u32 != KVM_EXIT_IO_IN {
                     return Err(Error::new(EINVAL));
                 }
                 let data_size = (io.count as usize) * (io.size as usize);
                 if data_size != data.len() {
                     return Err(Error::new(EINVAL));
                 }
                 // SAFETY:
                 // The data_offset is defined by the kernel to be some number of bytes into the
                 // kvm_run structure, which we have fully mmap'd.
                 unsafe {
                     let data_ptr = run_start.offset(io.data_offset as isize);
                     copy_nonoverlapping(data.as_ptr(), data_ptr, data_size);
                 }
                 Ok(())
             }
             KVM_EXIT_MMIO => {
                 // SAFETY:
                 // Safe because the exit_reason (which comes from the kernel) told us which
                 // union field to use.
                 let mmio = unsafe { &mut run.__bindgen_anon_1.mmio };
                 if mmio.is_write != 0 {
                     return Err(Error::new(EINVAL));
                 }
                 let len = mmio.len as usize;
                 if len != data.len() {
                     return Err(Error::new(EINVAL));
                 }
                 mmio.data[..len].copy_from_slice(data);
                 Ok(())
             }
             KVM_EXIT_HYPERV => {
                 // SAFETY:
                 // Safe because the exit_reason (which comes from the kernel) told us which
                 // union field to use.
                 let hyperv = unsafe { &mut run.__bindgen_anon_1.hyperv };
                 if hyperv.type_ != KVM_EXIT_HYPERV_HCALL {
                     return Err(Error::new(EINVAL));
                 }
                 // TODO(b/315998194): Add safety comment
                 #[allow(clippy::undocumented_unsafe_blocks)]
                 let hcall = unsafe { &mut hyperv.u.hcall };
                 match data.try_into() {
                     Ok(data) => {
                         hcall.result = u64::from_ne_bytes(data);
                     }
                     _ => return Err(Error::new(EINVAL)),
                 }
                 Ok(())
             }
             _ => Err(Error::new(EINVAL)),
         }
     }

     /// Sets or clears the `immediate_exit` field of this vcpu's `kvm_run` structure, which
     /// requests an immediate exit.
     #[allow(clippy::cast_ptr_alignment)]
     pub fn set_immediate_exit(&self, exit: bool) {
         let run_ptr = self.run_mmap.as_ptr() as *mut kvm_run;
         // SAFETY:
         // The mapping is large enough to hold a kvm_run struct because the kernel told us its
         // size, and the pointer is page aligned so the cast to `kvm_run` is well defined (hence
         // the clippy allow attribute).
         let run = unsafe { &mut *run_ptr };
         run.immediate_exit = u8::from(exit);
     }

     /// Sets/clears the bit for immediate exit for the vcpu on the current thread.
     pub fn set_local_immediate_exit(exit: bool) {
         VCPU_THREAD.with(|v| {
             if let Some(state) = &(*v.borrow()) {
                 // TODO(b/315998194): Add safety comment
                 #[allow(clippy::undocumented_unsafe_blocks)]
                 unsafe {
                     (*state.run).immediate_exit = exit.into();
                 };
             }
         });
     }

     /// Reads the VCPU registers with `KVM_GET_REGS`.
     #[cfg(not(any(target_arch = "arm", target_arch = "aarch64")))]
     pub fn get_regs(&self) -> Result<kvm_regs> {
         // SAFETY: trivially safe
         let mut regs: kvm_regs = unsafe { std::mem::zeroed() };
         // SAFETY:
         // `self` wraps a VCPU fd, the kernel only writes a `kvm_regs` worth of memory through
         // the reference, and the return value is checked right here.
         match unsafe { ioctl_with_mut_ref(self, KVM_GET_REGS(), &mut regs) } {
             0 => Ok(regs),
             _ => errno_result(),
         }
     }

     /// Writes `regs` to the VCPU registers with `KVM_SET_REGS`.
     #[cfg(not(any(target_arch = "arm", target_arch = "aarch64")))]
     pub fn set_regs(&self, regs: &kvm_regs) -> Result<()> {
         // SAFETY:
         // `self` wraps a VCPU fd, the kernel only reads a `kvm_regs` worth of memory through the
         // reference, and the return value is checked right here.
         match unsafe { ioctl_with_ref(self, KVM_SET_REGS(), regs) } {
             0 => Ok(()),
             _ => errno_result(),
         }
     }

     /// Reads the VCPU special registers with `KVM_GET_SREGS`.
     #[cfg(target_arch = "x86_64")]
     pub fn get_sregs(&self) -> Result<kvm_sregs> {
         // SAFETY: trivially safe
         let mut sregs: kvm_sregs = unsafe { std::mem::zeroed() };
         // SAFETY:
         // `self` wraps a VCPU fd, the kernel only writes a `kvm_sregs` worth of memory through
         // the reference, and the return value is checked right here.
         match unsafe { ioctl_with_mut_ref(self, KVM_GET_SREGS(), &mut sregs) } {
             0 => Ok(sregs),
             _ => errno_result(),
         }
     }

     /// Writes `sregs` to the VCPU special registers with `KVM_SET_SREGS`.
     #[cfg(target_arch = "x86_64")]
     pub fn set_sregs(&self, sregs: &kvm_sregs) -> Result<()> {
         // SAFETY:
         // `self` wraps a VCPU fd, the kernel only reads a `kvm_sregs` worth of memory through
         // the reference, and the return value is checked right here.
         match unsafe { ioctl_with_ref(self, KVM_SET_SREGS(), sregs) } {
             0 => Ok(()),
             _ => errno_result(),
         }
     }

     /// Reads the VCPU FPU registers with `KVM_GET_FPU`.
     #[cfg(target_arch = "x86_64")]
     pub fn get_fpu(&self) -> Result<kvm_fpu> {
         // SAFETY: trivially safe
         let mut fpu: kvm_fpu = unsafe { std::mem::zeroed() };
         // SAFETY:
         // `self` wraps a VCPU fd, the kernel only writes a `kvm_fpu` worth of memory through the
         // reference, and the return value is checked right here.
         match unsafe { ioctl_with_mut_ref(self, KVM_GET_FPU(), &mut fpu) } {
             0 => Ok(fpu),
             _ => errno_result(),
         }
     }

     /// X86 specific call that loads `fpu` into the VCPU's FPU state.
     ///
     /// See the documentation for KVM_SET_FPU.
     #[cfg(target_arch = "x86_64")]
     pub fn set_fpu(&self, fpu: &kvm_fpu) -> Result<()> {
         // SAFETY:
         // Here we trust the kernel not to read past the end of the kvm_fpu struct.
         let ret = unsafe { ioctl_with_ref(self, KVM_SET_FPU(), fpu) };
         if ret >= 0 {
             Ok(())
         } else {
             errno_result()
         }
     }

     /// Reads the VCPU debug registers with `KVM_GET_DEBUGREGS`.
     #[cfg(target_arch = "x86_64")]
     pub fn get_debugregs(&self) -> Result<kvm_debugregs> {
         // SAFETY: trivially safe
         let mut dregs: kvm_debugregs = unsafe { std::mem::zeroed() };
         // SAFETY:
         // `self` wraps a VCPU fd, the kernel only writes a `kvm_debugregs` worth of memory
         // through the reference, and the return value is checked right here.
         match unsafe { ioctl_with_mut_ref(self, KVM_GET_DEBUGREGS(), &mut dregs) } {
             0 => Ok(dregs),
             _ => errno_result(),
         }
     }

     /// Writes `dregs` to the VCPU debug registers with `KVM_SET_DEBUGREGS`.
     #[cfg(target_arch = "x86_64")]
     pub fn set_debugregs(&self, dregs: &kvm_debugregs) -> Result<()> {
         // SAFETY:
         // Here we trust the kernel not to read past the end of the kvm_debugregs struct.
         let ret = unsafe { ioctl_with_ref(self, KVM_SET_DEBUGREGS(), dregs) };
         if ret >= 0 {
             Ok(())
         } else {
             errno_result()
         }
     }

     /// Reads the VCPU extended control registers with `KVM_GET_XCRS`.
     #[cfg(target_arch = "x86_64")]
     pub fn get_xcrs(&self) -> Result<kvm_xcrs> {
         // SAFETY: trivially safe
         let mut xcrs: kvm_xcrs = unsafe { std::mem::zeroed() };
         // SAFETY:
         // `self` wraps a VCPU fd, the kernel only writes a `kvm_xcrs` worth of memory through
         // the reference, and the return value is checked right here.
         match unsafe { ioctl_with_mut_ref(self, KVM_GET_XCRS(), &mut xcrs) } {
             0 => Ok(xcrs),
             _ => errno_result(),
         }
     }

     /// Writes `xcrs` to the VCPU extended control registers with `KVM_SET_XCRS`.
     #[cfg(target_arch = "x86_64")]
     pub fn set_xcrs(&self, xcrs: &kvm_xcrs) -> Result<()> {
         // SAFETY:
         // Here we trust the kernel not to read past the end of the kvm_xcrs struct.
         let ret = unsafe { ioctl_with_ref(self, KVM_SET_XCRS(), xcrs) };
         if ret >= 0 {
             Ok(())
         } else {
             errno_result()
         }
     }

     /// X86 specific call to get the MSRS
     ///
     /// See the documentation for KVM_SET_MSRS.
     #[cfg(target_arch = "x86_64")]
     pub fn get_msrs(&self, msr_entries: &mut Vec<kvm_msr_entry>) -> Result<()> {
         let mut msrs = vec_with_array_field::<kvm_msrs, kvm_msr_entry>(msr_entries.len());
         {
             // SAFETY:
             // Mapping the unsized array to a slice is unsafe because the length isn't known.
             // Providing the length used to create the struct guarantees the entire slice is valid.
             unsafe {
                 let entries: &mut [kvm_msr_entry] = msrs[0].entries.as_mut_slice(msr_entries.len());
                 entries.copy_from_slice(msr_entries);
             }
         }
         msrs[0].nmsrs = msr_entries.len() as u32;
         let ret = {
             // SAFETY:
             // Here we trust the kernel not to read or write past the end of the kvm_msrs struct.
             unsafe { ioctl_with_ref(self, KVM_GET_MSRS(), &msrs[0]) }
         };
         if ret < 0 {
             // KVM_SET_MSRS actually returns the number of msr entries written.
             return errno_result();
         }
         // TODO(b/315998194): Add safety comment
         #[allow(clippy::undocumented_unsafe_blocks)]
         unsafe {
             let count = ret as usize;
             assert!(count <= msr_entries.len());
             let entries: &mut [kvm_msr_entry] = msrs[0].entries.as_mut_slice(count);
             msr_entries.truncate(count);
             msr_entries.copy_from_slice(entries);
         }
         Ok(())
     }

     /// X86 specific call that writes the MSR entries in `msrs` to the VCPU.
     ///
     /// See the documentation for KVM_SET_MSRS.
     #[cfg(target_arch = "x86_64")]
     pub fn set_msrs(&self, msrs: &kvm_msrs) -> Result<()> {
         // SAFETY:
         // Here we trust the kernel not to read past the end of the kvm_msrs struct.
         let ret = unsafe { ioctl_with_ref(self, KVM_SET_MSRS(), msrs) };
         // KVM_SET_MSRS actually returns the number of msr entries written, so only a negative
         // value is treated as a failure.
         if ret >= 0 {
             Ok(())
         } else {
             errno_result()
         }
     }

     /// X86 specific call that loads the CPUID entries in `cpuid` into the VCPU.
     ///
     /// See the documentation for KVM_SET_CPUID2.
     #[cfg(target_arch = "x86_64")]
     pub fn set_cpuid2(&self, cpuid: &CpuId) -> Result<()> {
         // SAFETY:
         // Here we trust the kernel not to read past the end of the structure behind
         // `cpuid.as_ptr()`.
         let ret = unsafe { ioctl_with_ptr(self, KVM_SET_CPUID2(), cpuid.as_ptr()) };
         if ret >= 0 {
             Ok(())
         } else {
             errno_result()
         }
     }

     /// X86 specific call that queries the system emulated hyper-v CPUID values
     /// (`KVM_GET_SUPPORTED_HV_CPUID`).
     #[cfg(target_arch = "x86_64")]
     pub fn get_hyperv_cpuid(&self) -> Result<CpuId> {
         const MAX_KVM_CPUID_ENTRIES: usize = 256;
         let mut cpuid = CpuId::new(MAX_KVM_CPUID_ENTRIES);

         // SAFETY:
         // ioctl is unsafe. The kernel is trusted not to write beyond the bounds of the memory
         // allocated for the struct. The limit is read from nent, which is set to the allocated
         // size(MAX_KVM_CPUID_ENTRIES) above.
         let ret =
             unsafe { ioctl_with_mut_ptr(self, KVM_GET_SUPPORTED_HV_CPUID(), cpuid.as_mut_ptr()) };
         if ret >= 0 {
             Ok(cpuid)
         } else {
             errno_result()
         }
     }

     /// X86 specific call to get the state of the "Local Advanced Programmable Interrupt
     /// Controller".
     ///
     /// See the documentation for KVM_GET_LAPIC.
     #[cfg(target_arch = "x86_64")]
     pub fn get_lapic(&self) -> Result<kvm_lapic_state> {
         let mut klapic: kvm_lapic_state = Default::default();

         let ret = {
             // SAFETY:
             // The ioctl is unsafe unless you trust the kernel not to write past the end of the
             // local_apic struct.
             unsafe { ioctl_with_mut_ref(self, KVM_GET_LAPIC(), &mut klapic) }
         };
         if ret < 0 {
             return errno_result();
         }
         Ok(klapic)
     }

     /// X86 specific call that writes `klapic` as the state of the "Local Advanced Programmable
     /// Interrupt Controller".
     ///
     /// See the documentation for KVM_SET_LAPIC.
     #[cfg(target_arch = "x86_64")]
     pub fn set_lapic(&self, klapic: &kvm_lapic_state) -> Result<()> {
         // SAFETY:
         // The ioctl is safe because the kernel will only read from the klapic struct.
         let ret = unsafe { ioctl_with_ref(self, KVM_SET_LAPIC(), klapic) };
         if ret >= 0 {
             Ok(())
         } else {
             errno_result()
         }
     }

     /// Reads the vcpu's current "multiprocessing state".
     ///
     /// See the documentation for KVM_GET_MP_STATE. This call can only succeed after
     /// a call to `Vm::create_irq_chip`.
     ///
     /// Note that KVM defines the call for both x86 and s390 but we do not expect anyone
     /// to run crosvm on s390.
     #[cfg(target_arch = "x86_64")]
     pub fn get_mp_state(&self) -> Result<kvm_mp_state> {
         // SAFETY: trivially safe
         let mut mp_state: kvm_mp_state = unsafe { std::mem::zeroed() };
         // SAFETY:
         // `self` wraps a VCPU fd, the kernel only writes a `kvm_mp_state` worth of memory
         // through the reference, and the return value is checked below.
         let ret = unsafe { ioctl_with_mut_ref(self, KVM_GET_MP_STATE(), &mut mp_state) };
         if ret >= 0 {
             Ok(mp_state)
         } else {
             errno_result()
         }
     }

     /// Writes `state` as the vcpu's current "multiprocessing state".
     ///
     /// See the documentation for KVM_SET_MP_STATE. This call can only succeed after
     /// a call to `Vm::create_irq_chip`.
     ///
     /// Note that KVM defines the call for both x86 and s390 but we do not expect anyone
     /// to run crosvm on s390.
     #[cfg(target_arch = "x86_64")]
     pub fn set_mp_state(&self, state: &kvm_mp_state) -> Result<()> {
         // SAFETY:
         // The ioctl is safe because the kernel will only read from the kvm_mp_state struct.
         let ret = unsafe { ioctl_with_ref(self, KVM_SET_MP_STATE(), state) };
         if ret >= 0 {
             Ok(())
         } else {
             errno_result()
         }
     }

     /// Reads the vcpu's currently pending exceptions, interrupts, NMIs, etc
     ///
     /// See the documentation for KVM_GET_VCPU_EVENTS.
     #[cfg(target_arch = "x86_64")]
     pub fn get_vcpu_events(&self) -> Result<kvm_vcpu_events> {
         // SAFETY: trivially safe
         let mut pending: kvm_vcpu_events = unsafe { std::mem::zeroed() };
         // SAFETY:
         // `self` wraps a VCPU fd, the kernel only writes a `kvm_vcpu_events` worth of memory
         // through the reference, and the return value is checked below.
         let ret = unsafe { ioctl_with_mut_ref(self, KVM_GET_VCPU_EVENTS(), &mut pending) };
         if ret >= 0 {
             Ok(pending)
         } else {
             errno_result()
         }
     }

     /// Writes `events` as the vcpu's currently pending exceptions, interrupts, NMIs, etc
     ///
     /// See the documentation for KVM_SET_VCPU_EVENTS.
     #[cfg(target_arch = "x86_64")]
     pub fn set_vcpu_events(&self, events: &kvm_vcpu_events) -> Result<()> {
         // SAFETY:
         // The ioctl is safe because the kernel will only read from the
         // kvm_vcpu_events.
         let ret = unsafe { ioctl_with_ref(self, KVM_SET_VCPU_EVENTS(), events) };
         if ret >= 0 {
             Ok(())
         } else {
             errno_result()
         }
     }

     /// Enables the capability described by `cap` on this VCPU.
     ///
     /// See the documentation for KVM_ENABLE_CAP.
     ///
     /// # Safety
     ///
     /// `cap` may contain values which the kernel interprets as pointers; the caller must make
     /// sure any such values are valid.
     pub unsafe fn kvm_enable_cap(&self, cap: &kvm_enable_cap) -> Result<()> {
         // SAFETY:
         // The struct is borrowed for the duration of the call and the kernel reads exactly the
         // size of the struct; the contents are covered by this function's safety contract.
         let ret = ioctl_with_ref(self, KVM_ENABLE_CAP(), cap);
         if ret >= 0 {
             Ok(())
         } else {
             errno_result()
         }
     }

     /// Installs `signals` as the set of signals that are blocked during execution of KVM_RUN.
     /// Signals that are not blocked will cause KVM_RUN to return with -EINTR.
     ///
     /// See the documentation for KVM_SET_SIGNAL_MASK
     pub fn set_signal_mask(&self, signals: &[c_int]) -> Result<()> {
         // Rust definition of sigset_t takes 128 bytes, but the kernel only expects an 8-byte
         // structure, so `size_of::<sigset_t>()` cannot be used as the length.
         const KERNEL_SIGSET_BYTES: usize = 8;
         // Ensure the length is not too big: this constant fails to evaluate if `sigset_t` is
         // smaller than the number of bytes copied out of it below.
         const _ASSERT: usize = size_of::<sigset_t>() - KERNEL_SIGSET_BYTES;

         let sigset = signal::create_sigset(signals)?;

         let mut kvm_sigmask = vec_with_array_field::<kvm_signal_mask, sigset_t>(1);
         kvm_sigmask[0].len = KERNEL_SIGSET_BYTES as u32;

         let src = &sigset as *const sigset_t as *const u8;
         // SAFETY:
         // The source is at least KERNEL_SIGSET_BYTES long (see _ASSERT) and the destination was
         // allocated with room for a whole `sigset_t` after the header.
         unsafe {
             copy_nonoverlapping(src, kvm_sigmask[0].sigset.as_mut_ptr(), KERNEL_SIGSET_BYTES);
         }

         // SAFETY:
         // The ioctl is safe because the kernel will only read from the
         // kvm_signal_mask structure.
         let ret = unsafe { ioctl_with_ref(self, KVM_SET_SIGNAL_MASK(), &kvm_sigmask[0]) };
         if ret >= 0 {
             Ok(())
         } else {
             errno_result()
         }
     }

     /// Writes `data` to a single register of this VCPU.  The id of the register is
     /// encoded as specified in the kernel documentation for KVM_SET_ONE_REG.
     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
     pub fn set_one_reg(&self, reg_id: u64, data: u64) -> Result<()> {
         // The kernel receives the address of the value rather than the value itself.
         let request = kvm_one_reg {
             id: reg_id,
             addr: &data as *const u64 as u64,
         };
         // SAFETY:
         // safe because we allocated the struct and we know the kernel will read
         // exactly the size of the struct
         let ret = unsafe { ioctl_with_ref(self, KVM_SET_ONE_REG(), &request) };
         if ret >= 0 {
             Ok(())
         } else {
             errno_result()
         }
     }
 }

 // Exposes the descriptor of the inner `vcpu` file, which lets a `Vcpu` be passed directly as the
 // descriptor argument of the `ioctl_with_*` helpers (as the methods above do with `self`).
 impl AsRawDescriptor for Vcpu {
     fn as_raw_descriptor(&self) -> RawDescriptor {
         self.vcpu.as_raw_descriptor()
     }
 }

 /// A Vcpu that has a thread and can be run. Created by calling `to_runnable` on a `Vcpu`.
 /// Implements `Deref` to a `Vcpu` so all `Vcpu` methods are usable, with the addition of the `run`
 /// function to execute the guest.
 pub struct RunnableVcpu {
     // The vcpu that `to_runnable` consumed to build this value.
     vcpu: Vcpu,
     // vcpus must stay on the same thread once they start.
     // Add the PhantomData pointer to ensure RunnableVcpu is not `Send`.
     phantom: std::marker::PhantomData<*mut u8>,
 }

 impl RunnableVcpu {
     /// Runs the VCPU until it exits, returning the reason for the exit.
     ///
     /// Note that the state of the VCPU and associated VM must be setup first for this to do
     /// anything useful.
     #[allow(clippy::cast_ptr_alignment)]
     // The pointer is page aligned so casting to a different type is well defined, hence the clippy
     // allow attribute.
     pub fn run(&self) -> Result<VcpuExit> {
         // SAFETY:
         // Safe because we know that our file is a VCPU fd and we verify the return result.
         let ret = unsafe { ioctl(self, KVM_RUN()) };
         if ret == 0 {
             // SAFETY:
             // Safe because we know we mapped enough memory to hold the kvm_run struct because the
             // kernel told us how large it was.
             let run = unsafe { &*(self.run_mmap.as_ptr() as *const kvm_run) };
             match run.exit_reason {
                 KVM_EXIT_IO => {
                     // SAFETY:
                     // Safe because the exit_reason (which comes from the kernel) told us which
                     // union field to use.
                     let io = unsafe { run.__bindgen_anon_1.io };
                     let port = io.port;
                     let size = (io.count as usize) * (io.size as usize);
                     match io.direction as u32 {
                         KVM_EXIT_IO_IN => Ok(VcpuExit::IoIn { port, size }),
                         KVM_EXIT_IO_OUT => {
                             let mut data = [0; 8];
                             let run_start = run as *const kvm_run as *const u8;
                             // SAFETY:
                             // The data_offset is defined by the kernel to be some number of bytes
                             // into the kvm_run structure, which we have fully mmap'd.
                             unsafe {
                                 let data_ptr = run_start.offset(io.data_offset as isize);
                                 copy_nonoverlapping(
                                     data_ptr,
                                     data.as_mut_ptr(),
                                     min(size, data.len()),
                                 );
                             }
                             Ok(VcpuExit::IoOut { port, size, data })
                         }
                         _ => Err(Error::new(EINVAL)),
                     }
                 }
                 KVM_EXIT_MMIO => {
                     // SAFETY:
                     // Safe because the exit_reason (which comes from the kernel) told us which
                     // union field to use.
                     let mmio = unsafe { &run.__bindgen_anon_1.mmio };
                     let address = mmio.phys_addr;
                     let size = min(mmio.len as usize, mmio.data.len());
                     if mmio.is_write != 0 {
                         Ok(VcpuExit::MmioWrite {
                             address,
                             size,
                             data: mmio.data,
                         })
                     } else {
                         Ok(VcpuExit::MmioRead { address, size })
                     }
                 }
                 KVM_EXIT_IOAPIC_EOI => {
                     // SAFETY:
                     // Safe because the exit_reason (which comes from the kernel) told us which
                     // union field to use.
                     let vector = unsafe { run.__bindgen_anon_1.eoi.vector };
                     Ok(VcpuExit::IoapicEoi { vector })
                 }
                 KVM_EXIT_HYPERV => {
                     // SAFETY:
                     // Safe because the exit_reason (which comes from the kernel) told us which
                     // union field to use.
                     let hyperv = unsafe { &run.__bindgen_anon_1.hyperv };
                     match hyperv.type_ {
                         KVM_EXIT_HYPERV_SYNIC => {
                             // TODO(b/315998194): Add safety comment
                             #[allow(clippy::undocumented_unsafe_blocks)]
                             let synic = unsafe { &hyperv.u.synic };
                             Ok(VcpuExit::HypervSynic {
                                 msr: synic.msr,
                                 control: synic.control,
                                 evt_page: synic.evt_page,
                                 msg_page: synic.msg_page,
                             })
                         }
                         KVM_EXIT_HYPERV_HCALL => {
                             // TODO(b/315998194): Add safety comment
                             #[allow(clippy::undocumented_unsafe_blocks)]
                             let hcall = unsafe { &hyperv.u.hcall };
                             Ok(VcpuExit::HypervHcall {
                                 input: hcall.input,
                                 params: hcall.params,
                             })
                         }
                         _ => Err(Error::new(EINVAL)),
                     }
                 }
                 KVM_EXIT_UNKNOWN => Ok(VcpuExit::Unknown),
                 KVM_EXIT_EXCEPTION => Ok(VcpuExit::Exception),
                 KVM_EXIT_HYPERCALL => Ok(VcpuExit::Hypercall),
                 KVM_EXIT_DEBUG => Ok(VcpuExit::Debug),
                 KVM_EXIT_HLT => Ok(VcpuExit::Hlt),
                 KVM_EXIT_IRQ_WINDOW_OPEN => Ok(VcpuExit::IrqWindowOpen),
                 KVM_EXIT_SHUTDOWN => Ok(VcpuExit::Shutdown),
                 KVM_EXIT_FAIL_ENTRY => {
                     // SAFETY:
                     // Safe because the exit_reason (which comes from the kernel) told us which
                     // union field to use.
                     let hardware_entry_failure_reason = unsafe {
                         run.__bindgen_anon_1
                             .fail_entry
                             .hardware_entry_failure_reason
                     };
                     Ok(VcpuExit::FailEntry {
                         hardware_entry_failure_reason,
                     })
                 }
                 KVM_EXIT_INTR => Ok(VcpuExit::Intr),
                 KVM_EXIT_SET_TPR => Ok(VcpuExit::SetTpr),
                 KVM_EXIT_TPR_ACCESS => Ok(VcpuExit::TprAccess),
                 KVM_EXIT_S390_SIEIC => Ok(VcpuExit::S390Sieic),
                 KVM_EXIT_S390_RESET => Ok(VcpuExit::S390Reset),
                 KVM_EXIT_DCR => Ok(VcpuExit::Dcr),
                 KVM_EXIT_NMI => Ok(VcpuExit::Nmi),
                 KVM_EXIT_INTERNAL_ERROR => Ok(VcpuExit::InternalError),
                 KVM_EXIT_OSI => Ok(VcpuExit::Osi),
                 KVM_EXIT_PAPR_HCALL => Ok(VcpuExit::PaprHcall),
                 KVM_EXIT_S390_UCONTROL => Ok(VcpuExit::S390Ucontrol),
                 KVM_EXIT_WATCHDOG => Ok(VcpuExit::Watchdog),
                 KVM_EXIT_S390_TSCH => Ok(VcpuExit::S390Tsch),
                 KVM_EXIT_EPR => Ok(VcpuExit::Epr),
                 KVM_EXIT_SYSTEM_EVENT => {
                     let event_type = {
                         // SAFETY:
                         // Safe because we know the exit reason told us this union
                         // field is valid
                         unsafe { run.__bindgen_anon_1.system_event.type_ }
                     };
                     // TODO(b/315998194): Add safety comment
                     #[allow(clippy::undocumented_unsafe_blocks)]
                     let event_flags =
                         unsafe { run.__bindgen_anon_1.system_event.__bindgen_anon_1.flags };
                     Ok(VcpuExit::SystemEvent(event_type, event_flags))
                 }
                 r => panic!("unknown kvm exit reason: {}", r),
             }
         } else {
             errno_result()
         }
     }
 }

 impl Deref for RunnableVcpu {
     type Target = Vcpu;

     /// Borrows the wrapped `Vcpu`, so its `&self` methods are callable on a `RunnableVcpu`.
     fn deref(&self) -> &Self::Target {
         let Self { vcpu, .. } = self;
         vcpu
     }
 }

 impl DerefMut for RunnableVcpu {
     /// Mutably borrows the wrapped `Vcpu`, so its `&mut self` methods are callable on a
     /// `RunnableVcpu`.
     fn deref_mut(&mut self) -> &mut Self::Target {
         let Self { vcpu, .. } = self;
         vcpu
     }
 }

 impl AsRawDescriptor for RunnableVcpu {
     /// Returns the raw descriptor reported by the wrapped `Vcpu`.
     fn as_raw_descriptor(&self) -> RawDescriptor {
         let Self { vcpu, .. } = self;
         vcpu.as_raw_descriptor()
     }
 }

 impl Drop for RunnableVcpu {
     /// Resets this thread's `VCPU_THREAD` slot to `None`.
     ///
     /// If the stored state carries a signal number, a `BlockedSignal` is constructed for it
     /// first and kept alive until the slot has been cleared.
     fn drop(&mut self) {
         VCPU_THREAD.with(|cell| {
             // Copy the signal number out so that the shared borrow of the cell ends with this
             // statement, before the mutable borrow below.
             let signal_num = cell.borrow().as_ref().and_then(|state| state.signal_num);

             // This assumes that a failure in `BlockedSignal::new` means the signal is already
             // blocked and therefore it should not be unblocked on exit.
             let _blocked_signal = signal_num.map(BlockedSignal::new);

             *cell.borrow_mut() = None;
         });
     }
 }

 /// Wrapper for `kvm_cpuid2`, which has a zero-length array at the end.
 ///
 /// Hides the zero-length array behind a bounds check.
 #[cfg(target_arch = "x86_64")]
 pub type CpuId = FlexibleArrayWrapper<kvm_cpuid2, kvm_cpuid_entry2>;