| // Copyright 2018 The Chromium OS Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| mod process; |
| mod vcpu; |
| |
| use std::fmt::{self, Display}; |
| use std::fs::File; |
| use std::io; |
| use std::os::unix::io::{AsRawFd, FromRawFd}; |
| use std::os::unix::net::UnixDatagram; |
| use std::path::Path; |
| use std::result; |
| use std::sync::atomic::{AtomicBool, Ordering}; |
| use std::sync::{Arc, Barrier}; |
| use std::thread; |
| use std::time::{Duration, Instant}; |
| |
| use libc::{ |
| c_int, c_ulong, fcntl, ioctl, socketpair, AF_UNIX, EAGAIN, EBADF, EDEADLK, EEXIST, EINTR, |
| EINVAL, ENOENT, EOVERFLOW, EPERM, FIOCLEX, F_SETPIPE_SZ, MS_NODEV, MS_NOEXEC, MS_NOSUID, |
| MS_RDONLY, SIGCHLD, SOCK_SEQPACKET, |
| }; |
| |
| use protobuf::ProtobufError; |
| use remain::sorted; |
| |
| use base::{ |
| block_signal, clear_signal, drop_capabilities, error, getegid, geteuid, info, pipe, |
| register_rt_signal_handler, validate_raw_fd, warn, Error as SysError, Event, Killable, |
| MmapError, PollContext, PollToken, Result as SysResult, SignalFd, SignalFdError, SIGRTMIN, |
| }; |
| use kvm::{Cap, Datamatch, IoeventAddress, Kvm, Vcpu, VcpuExit, Vm}; |
| use minijail::{self, Minijail}; |
| use net_util::{Error as TapError, Tap, TapT}; |
| use vm_memory::GuestMemory; |
| |
| use self::process::*; |
| use self::vcpu::*; |
| use crate::{Config, Executable}; |
| |
// Size limit, in bytes, for a single datagram. It is not referenced in this part of the file;
// presumably the `process`/`vcpu` submodules consume it -- TODO confirm.
const MAX_DATAGRAM_SIZE: usize = 4096;
// Pipe capacity, in bytes (0x40000 = 256 KiB), that `new_pipe_pair` requests for the vcpu pipes.
const MAX_VCPU_DATAGRAM_SIZE: usize = 0x40000;
| |
/// An error that occurs during the lifetime of a plugin process.
///
/// Variants are kept in alphabetical order (checked by the `#[sorted]` attribute). Variants that
/// wrap another error carry the underlying cause, which `Display` appends to the message.
#[sorted]
pub enum Error {
    /// Cloning an event failed.
    CloneEvent(SysError),
    /// Cloning a vcpu pipe failed.
    CloneVcpuPipe(io::Error),
    /// Creating an event failed.
    CreateEvent(SysError),
    /// Creating the KVM irqchip failed.
    CreateIrqChip(SysError),
    /// Creating the minijail for the plugin failed.
    CreateJail(minijail::Error),
    /// Creating the `Kvm` object failed.
    CreateKvm(SysError),
    /// Creating the main request socket failed.
    CreateMainSocket(SysError),
    /// Creating the KVM PIT failed.
    CreatePIT(SysError),
    /// Creating a poll context failed.
    CreatePollContext(SysError),
    /// Creating the signalfd failed.
    CreateSignalFd(SignalFdError),
    /// Creating a socket pair failed.
    CreateSocketPair(io::Error),
    /// Wrapping a raw fd as a tap device failed.
    CreateTapFd(TapError),
    /// Creating a vcpu failed.
    CreateVcpu(SysError),
    /// Creating a vcpu request socket failed.
    CreateVcpuSocket(SysError),
    /// Creating the VM failed.
    CreateVm(SysError),
    /// A request from the plugin could not be decoded.
    DecodeRequest(ProtobufError),
    /// Dropping the process capabilities failed.
    DropCapabilities(SysError),
    /// A response to the plugin could not be encoded.
    EncodeResponse(ProtobufError),
    /// A mount requested for the jail failed.
    Mount(minijail::Error),
    /// Mounting a device node into the jail failed.
    MountDev(minijail::Error),
    /// A jail mount failed (reported as "failed to mount").
    MountLib(minijail::Error),
    /// A jail mount failed (reported as "failed to mount").
    MountLib64(minijail::Error),
    /// A jail mount failed (reported as "failed to mount").
    MountPlugin(minijail::Error),
    /// A jail mount failed (reported as "failed to mount").
    MountPluginLib(minijail::Error),
    /// Mounting `/proc` inside the jail failed.
    MountProc(minijail::Error),
    /// Mounting the jail's root filesystem failed.
    MountRoot(minijail::Error),
    /// The root directory for the jailed process to pivot into does not exist.
    NoRootDir,
    /// Setting the jail's pivot root failed.
    ParsePivotRoot(minijail::Error),
    /// Parsing the jail's seccomp filter failed.
    ParseSeccomp(minijail::Error),
    /// The plugin exited with the contained error code.
    PluginFailed(i32),
    /// Sending the kill signal to the plugin failed.
    PluginKill(SysError),
    /// The plugin exited because of the contained signal.
    PluginKilled(i32),
    /// Running the jail failed.
    PluginRunJail(minijail::Error),
    /// A plugin request socket was hung up.
    PluginSocketHup,
    /// Polling the plugin request sockets failed.
    PluginSocketPoll(SysError),
    /// Receiving from a plugin request socket failed.
    PluginSocketRecv(SysError),
    /// Sending to a plugin request socket failed.
    PluginSocketSend(SysError),
    /// Spawning the plugin failed.
    PluginSpawn(io::Error),
    /// The plugin did not exit within the timeout.
    PluginTimeout,
    /// Waiting for the plugin to exit failed.
    PluginWait(SysError),
    /// Polling all the file descriptors failed.
    Poll(SysError),
    /// Adding a file descriptor to the poll context failed.
    PollContextAdd(SysError),
    /// The path to the root directory is not absolute.
    RootNotAbsolute,
    /// The specified root directory is not a directory.
    RootNotDir,
    /// Setting the jail's gid map failed.
    SetGidMap(minijail::Error),
    /// Setting the jail's uid map failed.
    SetUidMap(minijail::Error),
    /// A SIGCHLD was received for a process; the fields are copied from the signal info.
    SigChild {
        pid: u32,
        signo: u32,
        status: i32,
        code: i32,
    },
    /// Reading from the signalfd failed.
    SignalFd(SignalFdError),
    /// Spawning a vcpu thread failed.
    SpawnVcpu(io::Error),
    /// Enabling the tap device failed.
    TapEnable(TapError),
    /// Opening the tap device failed.
    TapOpen(TapError),
    /// Setting the tap device's IP address failed.
    TapSetIp(TapError),
    /// Setting the tap device's MAC address failed.
    TapSetMacAddress(TapError),
    /// Setting the tap device's netmask failed.
    TapSetNetmask(TapError),
    /// A raw tap fd supplied by the caller failed validation.
    ValidateTapFd(SysError),
}
| |
| impl Display for Error { |
| #[remain::check] |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| use self::Error::*; |
| |
| #[sorted] |
| match self { |
| CloneEvent(e) => write!(f, "failed to clone event: {}", e), |
| CloneVcpuPipe(e) => write!(f, "failed to clone vcpu pipe: {}", e), |
| CreateEvent(e) => write!(f, "failed to create event: {}", e), |
| CreateIrqChip(e) => write!(f, "failed to create kvm irqchip: {}", e), |
| CreateJail(e) => write!(f, "failed to create jail: {}", e), |
| CreateKvm(e) => write!(f, "error creating Kvm: {}", e), |
| CreateMainSocket(e) => write!(f, "error creating main request socket: {}", e), |
| CreatePIT(e) => write!(f, "failed to create kvm PIT: {}", e), |
| CreatePollContext(e) => write!(f, "failed to create poll context: {}", e), |
| CreateSignalFd(e) => write!(f, "failed to create signalfd: {}", e), |
| CreateSocketPair(e) => write!(f, "failed to create socket pair: {}", e), |
| CreateTapFd(e) => write!(f, "failed to create tap device from raw fd: {}", e), |
| CreateVcpu(e) => write!(f, "error creating vcpu: {}", e), |
| CreateVcpuSocket(e) => write!(f, "error creating vcpu request socket: {}", e), |
| CreateVm(e) => write!(f, "error creating vm: {}", e), |
| DecodeRequest(e) => write!(f, "failed to decode plugin request: {}", e), |
| DropCapabilities(e) => write!(f, "failed to drop process capabilities: {}", e), |
| EncodeResponse(e) => write!(f, "failed to encode plugin response: {}", e), |
| Mount(e) | MountDev(e) | MountLib(e) | MountLib64(e) | MountPlugin(e) |
| | MountPluginLib(e) | MountProc(e) | MountRoot(e) => { |
| write!(f, "failed to mount: {}", e) |
| } |
| NoRootDir => write!(f, "no root directory for jailed process to pivot root into"), |
| ParsePivotRoot(e) => write!(f, "failed to set jail pivot root: {}", e), |
| ParseSeccomp(e) => write!(f, "failed to parse jail seccomp filter: {}", e), |
| PluginFailed(e) => write!(f, "plugin exited with error: {}", e), |
| PluginKill(e) => write!(f, "error sending kill signal to plugin: {}", e), |
| PluginKilled(e) => write!(f, "plugin exited with signal {}", e), |
| PluginRunJail(e) => write!(f, "failed to run jail: {}", e), |
| PluginSocketHup => write!(f, "plugin request socket has been hung up"), |
| PluginSocketPoll(e) => write!(f, "failed to poll plugin request sockets: {}", e), |
| PluginSocketRecv(e) => write!(f, "failed to recv from plugin request socket: {}", e), |
| PluginSocketSend(e) => write!(f, "failed to send to plugin request socket: {}", e), |
| PluginSpawn(e) => write!(f, "failed to spawn plugin: {}", e), |
| PluginTimeout => write!(f, "plugin did not exit within timeout"), |
| PluginWait(e) => write!(f, "error waiting for plugin to exit: {}", e), |
| Poll(e) => write!(f, "failed to poll all FDs: {}", e), |
| PollContextAdd(e) => write!(f, "failed to add fd to poll context: {}", e), |
| RootNotAbsolute => write!(f, "path to the root directory must be absolute"), |
| RootNotDir => write!(f, "specified root directory is not a directory"), |
| SetGidMap(e) => write!(f, "failed to set gidmap for jail: {}", e), |
| SetUidMap(e) => write!(f, "failed to set uidmap for jail: {}", e), |
| SigChild { |
| pid, |
| signo, |
| status, |
| code, |
| } => write!( |
| f, |
| "process {} died with signal {}, status {}, and code {}", |
| pid, signo, status, code |
| ), |
| SignalFd(e) => write!(f, "failed to read signal fd: {}", e), |
| SpawnVcpu(e) => write!(f, "error spawning vcpu thread: {}", e), |
| TapEnable(e) => write!(f, "error enabling tap device: {}", e), |
| TapOpen(e) => write!(f, "error opening tap device: {}", e), |
| TapSetIp(e) => write!(f, "error setting tap ip: {}", e), |
| TapSetMacAddress(e) => write!(f, "error setting tap mac address: {}", e), |
| TapSetNetmask(e) => write!(f, "error setting tap netmask: {}", e), |
| ValidateTapFd(e) => write!(f, "failed to validate raw tap fd: {}", e), |
| } |
| } |
| } |
| |
/// Shorthand for a `result::Result` whose error type is this module's `Error`.
type Result<T> = result::Result<T, Error>;
| |
| fn new_seqpacket_pair() -> SysResult<(UnixDatagram, UnixDatagram)> { |
| let mut fds = [0, 0]; |
| unsafe { |
| let ret = socketpair(AF_UNIX, SOCK_SEQPACKET, 0, fds.as_mut_ptr()); |
| if ret == 0 { |
| ioctl(fds[0], FIOCLEX); |
| Ok(( |
| UnixDatagram::from_raw_fd(fds[0]), |
| UnixDatagram::from_raw_fd(fds[1]), |
| )) |
| } else { |
| Err(SysError::last()) |
| } |
| } |
| } |
| |
// The four ends of the two pipes created by `new_pipe_pair`: one pipe carrying data towards
// crosvm and one carrying data towards the plugin.
struct VcpuPipe {
    // First element of the "to crosvm" pipe (presumably its read end -- confirm with `pipe`).
    crosvm_read: File,
    // Second element of the "to crosvm" pipe.
    plugin_write: File,
    // First element of the "to plugin" pipe (presumably its read end -- confirm with `pipe`).
    plugin_read: File,
    // Second element of the "to plugin" pipe.
    crosvm_write: File,
}
| |
| fn new_pipe_pair() -> SysResult<VcpuPipe> { |
| let to_crosvm = pipe(true)?; |
| let to_plugin = pipe(true)?; |
| // Increasing the pipe size can be a nice-to-have to make sure that |
| // messages get across atomically (and made sure that writes don't block), |
| // though it's not necessary a hard requirement for things to work. |
| let flags = unsafe { |
| fcntl( |
| to_crosvm.0.as_raw_fd(), |
| F_SETPIPE_SZ, |
| MAX_VCPU_DATAGRAM_SIZE as c_int, |
| ) |
| }; |
| if flags < 0 || flags != MAX_VCPU_DATAGRAM_SIZE as i32 { |
| warn!( |
| "Failed to adjust size of crosvm pipe (result {}): {}", |
| flags, |
| SysError::last() |
| ); |
| } |
| let flags = unsafe { |
| fcntl( |
| to_plugin.0.as_raw_fd(), |
| F_SETPIPE_SZ, |
| MAX_VCPU_DATAGRAM_SIZE as c_int, |
| ) |
| }; |
| if flags < 0 || flags != MAX_VCPU_DATAGRAM_SIZE as i32 { |
| warn!( |
| "Failed to adjust size of plugin pipe (result {}): {}", |
| flags, |
| SysError::last() |
| ); |
| } |
| Ok(VcpuPipe { |
| crosvm_read: to_crosvm.0, |
| plugin_write: to_crosvm.1, |
| plugin_read: to_plugin.0, |
| crosvm_write: to_plugin.1, |
| }) |
| } |
| |
| fn proto_to_sys_err(e: ProtobufError) -> SysError { |
| match e { |
| ProtobufError::IoError(e) => SysError::new(e.raw_os_error().unwrap_or(EINVAL)), |
| _ => SysError::new(EINVAL), |
| } |
| } |
| |
| fn io_to_sys_err(e: io::Error) -> SysError { |
| SysError::new(e.raw_os_error().unwrap_or(EINVAL)) |
| } |
| |
| fn mmap_to_sys_err(e: MmapError) -> SysError { |
| match e { |
| MmapError::SystemCallFailed(e) => e, |
| _ => SysError::new(EINVAL), |
| } |
| } |
| |
| fn create_plugin_jail(root: &Path, log_failures: bool, seccomp_policy: &Path) -> Result<Minijail> { |
| // All child jails run in a new user namespace without any users mapped, |
| // they run as nobody unless otherwise configured. |
| let mut j = Minijail::new().map_err(Error::CreateJail)?; |
| j.namespace_pids(); |
| j.namespace_user(); |
| j.uidmap(&format!("0 {0} 1", geteuid())) |
| .map_err(Error::SetUidMap)?; |
| j.gidmap(&format!("0 {0} 1", getegid())) |
| .map_err(Error::SetGidMap)?; |
| j.namespace_user_disable_setgroups(); |
| // Don't need any capabilities. |
| j.use_caps(0); |
| // Create a new mount namespace with an empty root FS. |
| j.namespace_vfs(); |
| j.enter_pivot_root(root).map_err(Error::ParsePivotRoot)?; |
| // Run in an empty network namespace. |
| j.namespace_net(); |
| j.no_new_privs(); |
| // By default we'll prioritize using the pre-compiled .bpf over the .policy |
| // file (the .bpf is expected to be compiled using "trap" as the failure |
| // behavior instead of the default "kill" behavior). |
| // Refer to the code comment for the "seccomp-log-failures" |
| // command-line parameter for an explanation about why the |log_failures| |
| // flag forces the use of .policy files (and the build-time alternative to |
| // this run-time flag). |
| let bpf_policy_file = seccomp_policy.with_extension("bpf"); |
| if bpf_policy_file.exists() && !log_failures { |
| j.parse_seccomp_program(&bpf_policy_file) |
| .map_err(Error::ParseSeccomp)?; |
| } else { |
| // Use TSYNC only for the side effect of it using SECCOMP_RET_TRAP, |
| // which will correctly kill the entire device process if a worker |
| // thread commits a seccomp violation. |
| j.set_seccomp_filter_tsync(); |
| if log_failures { |
| j.log_seccomp_filter_failures(); |
| } |
| j.parse_seccomp_filters(&seccomp_policy.with_extension("policy")) |
| .map_err(Error::ParseSeccomp)?; |
| } |
| j.use_seccomp_filter(); |
| // Don't do init setup. |
| j.run_as_init(); |
| |
| // Create a tmpfs in the plugin's root directory so that we can bind mount it's executable |
| // file into it. The size=67108864 is size=64*1024*1024 or size=64MB. |
| j.mount_with_data( |
| Path::new("none"), |
| Path::new("/"), |
| "tmpfs", |
| (MS_NOSUID | MS_NODEV | MS_NOEXEC) as usize, |
| "size=67108864", |
| ) |
| .map_err(Error::MountRoot)?; |
| |
| // Because we requested to "run as init", minijail will not mount /proc for us even though |
| // plugin will be running in its own PID namespace, so we have to mount it ourselves. |
| j.mount( |
| Path::new("proc"), |
| Path::new("/proc"), |
| "proc", |
| (MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RDONLY) as usize, |
| ) |
| .map_err(Error::MountProc)?; |
| |
| Ok(j) |
| } |
| |
/// Each `PluginObject` represents one object that was instantiated by the guest using the `Create`
/// request.
///
/// Each such object has an ID associated with it that exists in an ID space shared by every variant
/// of `PluginObject`. This allows all the objects to be indexed in a single map, and allows for a
/// common destroy method.
///
/// In addition to the destroy method, each object may have methods specific to its variant type.
/// These variant methods must be done by matching the variant to the expected type for that method.
/// For example, getting the dirty log from a `Memory` object starting with an ID:
///
/// ```ignore
/// match objects.get(&request_id) {
///     Some(&PluginObject::Memory { slot, length }) => vm.get_dirty_log(slot, &mut dirty_log[..]),
///     _ => return Err(SysError::new(ENOENT)),
/// }
/// ```
enum PluginObject {
    /// An event registered with the VM as an ioevent at `addr`.
    IoEvent {
        evt: Event,
        addr: IoeventAddress,
        /// Width in bytes of the data match: 0 (any length), 1, 2, 4 or 8.
        length: u32,
        /// Value to match; truncated to `length` bytes when the ioevent is unregistered.
        datamatch: u64,
    },
    /// A memory region occupying the VM memory slot `slot`.
    Memory {
        slot: u32,
        length: usize,
    },
    /// An event registered with the VM as the irqfd for `irq_id`.
    IrqEvent {
        irq_id: u32,
        evt: Event,
    },
}
| |
| impl PluginObject { |
| fn destroy(self, vm: &mut Vm) -> SysResult<()> { |
| match self { |
| PluginObject::IoEvent { |
| evt, |
| addr, |
| length, |
| datamatch, |
| } => match length { |
| 0 => vm.unregister_ioevent(&evt, addr, Datamatch::AnyLength), |
| 1 => vm.unregister_ioevent(&evt, addr, Datamatch::U8(Some(datamatch as u8))), |
| 2 => vm.unregister_ioevent(&evt, addr, Datamatch::U16(Some(datamatch as u16))), |
| 4 => vm.unregister_ioevent(&evt, addr, Datamatch::U32(Some(datamatch as u32))), |
| 8 => vm.unregister_ioevent(&evt, addr, Datamatch::U64(Some(datamatch as u64))), |
| _ => Err(SysError::new(EINVAL)), |
| }, |
| PluginObject::Memory { slot, .. } => vm.remove_memory_region(slot).and(Ok(())), |
| PluginObject::IrqEvent { irq_id, evt } => vm.unregister_irqfd(&evt, irq_id), |
| } |
| } |
| } |
| |
| pub fn run_vcpus( |
| kvm: &Kvm, |
| vm: &Vm, |
| plugin: &Process, |
| vcpu_count: u32, |
| kill_signaled: &Arc<AtomicBool>, |
| exit_evt: &Event, |
| vcpu_handles: &mut Vec<thread::JoinHandle<()>>, |
| ) -> Result<()> { |
| let vcpu_thread_barrier = Arc::new(Barrier::new((vcpu_count) as usize)); |
| let use_kvm_signals = !kvm.check_extension(Cap::ImmediateExit); |
| |
| // If we need to force a vcpu to exit from a VM then a SIGRTMIN signal is sent |
| // to that vcpu's thread. If KVM is running the VM then it'll return -EINTR. |
| // An issue is what to do when KVM isn't running the VM (where we could be |
| // in the kernel or in the app). |
| // |
| // If KVM supports "immediate exit" then we set a signal handler that will |
| // set the |immediate_exit| flag that tells KVM to return -EINTR before running |
| // the VM. |
| // |
| // If KVM doesn't support immediate exit then we'll block SIGRTMIN in the app |
| // and tell KVM to unblock SIGRTMIN before running the VM (at which point a blocked |
| // signal might get asserted). There's overhead to have KVM unblock and re-block |
| // SIGRTMIN each time it runs the VM, so this mode should be avoided. |
| |
| if use_kvm_signals { |
| unsafe { |
| extern "C" fn handle_signal() {} |
| // Our signal handler does nothing and is trivially async signal safe. |
| // We need to install this signal handler even though we do block |
| // the signal below, to ensure that this signal will interrupt |
| // execution of KVM_RUN (this is implementation issue). |
| register_rt_signal_handler(SIGRTMIN() + 0, handle_signal) |
| .expect("failed to register vcpu signal handler"); |
| } |
| // We do not really want the signal handler to run... |
| block_signal(SIGRTMIN() + 0).expect("failed to block signal"); |
| } else { |
| unsafe { |
| extern "C" fn handle_signal() { |
| Vcpu::set_local_immediate_exit(true); |
| } |
| register_rt_signal_handler(SIGRTMIN() + 0, handle_signal) |
| .expect("failed to register vcpu signal handler"); |
| } |
| } |
| |
| for cpu_id in 0..vcpu_count { |
| let kill_signaled = kill_signaled.clone(); |
| let vcpu_thread_barrier = vcpu_thread_barrier.clone(); |
| let vcpu_exit_evt = exit_evt.try_clone().map_err(Error::CloneEvent)?; |
| let vcpu_plugin = plugin.create_vcpu(cpu_id)?; |
| let vcpu = Vcpu::new(cpu_id as c_ulong, kvm, vm).map_err(Error::CreateVcpu)?; |
| |
| vcpu_handles.push( |
| thread::Builder::new() |
| .name(format!("crosvm_vcpu{}", cpu_id)) |
| .spawn(move || { |
| if use_kvm_signals { |
| // Tell KVM to not block anything when entering kvm run |
| // because we will be using first RT signal to kick the VCPU. |
| vcpu.set_signal_mask(&[]) |
| .expect("failed to set up KVM VCPU signal mask"); |
| } |
| |
| #[cfg(feature = "chromeos")] |
| if let Err(e) = base::sched::enable_core_scheduling() { |
| error!("Failed to enable core scheduling: {}", e); |
| } |
| |
| let vcpu = vcpu |
| .to_runnable(Some(SIGRTMIN() + 0)) |
| .expect("Failed to set thread id"); |
| |
| let res = vcpu_plugin.init(&vcpu); |
| vcpu_thread_barrier.wait(); |
| if let Err(e) = res { |
| error!("failed to initialize vcpu {}: {}", cpu_id, e); |
| } else { |
| loop { |
| let mut interrupted_by_signal = false; |
| let run_res = vcpu.run(); |
| match run_res { |
| Ok(run) => match run { |
| VcpuExit::IoIn { port, mut size } => { |
| let mut data = [0; 256]; |
| if size > data.len() { |
| error!("unsupported IoIn size of {} bytes", size); |
| size = data.len(); |
| } |
| vcpu_plugin.io_read(port as u64, &mut data[..size], &vcpu); |
| if let Err(e) = vcpu.set_data(&data[..size]) { |
| error!("failed to set return data for IoIn: {}", e); |
| } |
| } |
| VcpuExit::IoOut { |
| port, |
| mut size, |
| data, |
| } => { |
| if size > data.len() { |
| error!("unsupported IoOut size of {} bytes", size); |
| size = data.len(); |
| } |
| vcpu_plugin.io_write(port as u64, &data[..size], &vcpu); |
| } |
| VcpuExit::MmioRead { address, size } => { |
| let mut data = [0; 8]; |
| vcpu_plugin.mmio_read( |
| address as u64, |
| &mut data[..size], |
| &vcpu, |
| ); |
| // Setting data for mmio can not fail. |
| let _ = vcpu.set_data(&data[..size]); |
| } |
| VcpuExit::MmioWrite { |
| address, |
| size, |
| data, |
| } => { |
| vcpu_plugin.mmio_write( |
| address as u64, |
| &data[..size], |
| &vcpu, |
| ); |
| } |
| VcpuExit::HypervHcall { input, params } => { |
| let mut data = [0; 8]; |
| vcpu_plugin.hyperv_call(input, params, &mut data, &vcpu); |
| // Setting data for hyperv call can not fail. |
| let _ = vcpu.set_data(&data); |
| } |
| VcpuExit::HypervSynic { |
| msr, |
| control, |
| evt_page, |
| msg_page, |
| } => { |
| vcpu_plugin |
| .hyperv_synic(msr, control, evt_page, msg_page, &vcpu); |
| } |
| VcpuExit::Hlt => break, |
| VcpuExit::Shutdown => break, |
| VcpuExit::InternalError => { |
| error!("vcpu {} has internal error", cpu_id); |
| break; |
| } |
| r => warn!("unexpected vcpu exit: {:?}", r), |
| }, |
| Err(e) => match e.errno() { |
| EINTR => interrupted_by_signal = true, |
| EAGAIN => {} |
| _ => { |
| error!("vcpu hit unknown error: {}", e); |
| break; |
| } |
| }, |
| } |
| if kill_signaled.load(Ordering::SeqCst) { |
| break; |
| } |
| |
| // Only handle the pause request if kvm reported that it was |
| // interrupted by a signal. This helps to entire that KVM has had a chance |
| // to finish emulating any IO that may have immediately happened. |
| // If we eagerly check pre_run() then any IO that we |
| // just reported to the plugin won't have been processed yet by KVM. |
| // Not eagerly calling pre_run() also helps to reduce |
| // any overhead from checking if a pause request is pending. |
| // The assumption is that pause requests aren't common |
| // or frequent so it's better to optimize for the non-pause execution paths. |
| if interrupted_by_signal { |
| if use_kvm_signals { |
| clear_signal(SIGRTMIN() + 0) |
| .expect("failed to clear pending signal"); |
| } else { |
| vcpu.set_immediate_exit(false); |
| } |
| |
| if let Err(e) = vcpu_plugin.pre_run(&vcpu) { |
| error!("failed to process pause on vcpu {}: {}", cpu_id, e); |
| break; |
| } |
| } |
| } |
| } |
| vcpu_exit_evt |
| .write(1) |
| .expect("failed to signal vcpu exit event"); |
| }) |
| .map_err(Error::SpawnVcpu)?, |
| ); |
| } |
| Ok(()) |
| } |
| |
/// Identifies which source woke up the poll loop in `run_config`.
#[derive(PollToken)]
enum Token {
    /// The exit event, which vcpu threads write to when they end, became readable.
    Exit,
    /// The SIGCHLD signalfd became readable.
    ChildSignal,
    /// The plugin socket at position `index` in `plugin.sockets()` became readable.
    Plugin { index: usize },
}
| |
| /// Run a VM with a plugin process specified by `cfg`. |
| /// |
| /// Not every field of `cfg` will be used. In particular, most field that pertain to a specific |
| /// device are ignored because the plugin is responsible for emulating hardware. |
| pub fn run_config(cfg: Config) -> Result<()> { |
| info!("crosvm starting plugin process"); |
| |
| // Masking signals is inherently dangerous, since this can persist across clones/execs. Do this |
| // before any jailed devices have been spawned, so that we can catch any of them that fail very |
| // quickly. |
| let sigchld_fd = SignalFd::new(SIGCHLD).map_err(Error::CreateSignalFd)?; |
| |
| let jail = if cfg.sandbox { |
| // An empty directory for jailed plugin pivot root. |
| let root_path = match &cfg.plugin_root { |
| Some(dir) => dir, |
| None => Path::new(option_env!("DEFAULT_PIVOT_ROOT").unwrap_or("/var/empty")), |
| }; |
| |
| if root_path.is_relative() { |
| return Err(Error::RootNotAbsolute); |
| } |
| |
| if !root_path.exists() { |
| return Err(Error::NoRootDir); |
| } |
| |
| if !root_path.is_dir() { |
| return Err(Error::RootNotDir); |
| } |
| |
| let policy_path = cfg.seccomp_policy_dir.join("plugin"); |
| let mut jail = create_plugin_jail(root_path, cfg.seccomp_log_failures, &policy_path)?; |
| |
| // Update gid map of the jail if caller provided supplemental groups. |
| if !cfg.plugin_gid_maps.is_empty() { |
| let map = format!("0 {} 1", getegid()) |
| + &cfg |
| .plugin_gid_maps |
| .into_iter() |
| .map(|m| format!(",{} {} {}", m.inner, m.outer, m.count)) |
| .collect::<String>(); |
| jail.gidmap(&map).map_err(Error::SetGidMap)?; |
| } |
| |
| // Mount minimal set of devices (full, zero, urandom, etc). We can not use |
| // jail.mount_dev() here because crosvm may not be running with CAP_SYS_ADMIN. |
| let device_names = ["full", "null", "urandom", "zero"]; |
| for name in &device_names { |
| let device = Path::new("/dev").join(&name); |
| jail.mount_bind(&device, &device, true) |
| .map_err(Error::MountDev)?; |
| } |
| |
| for bind_mount in &cfg.plugin_mounts { |
| jail.mount_bind(&bind_mount.src, &bind_mount.dst, bind_mount.writable) |
| .map_err(Error::Mount)?; |
| } |
| |
| Some(jail) |
| } else { |
| None |
| }; |
| |
| let mut tap_interfaces: Vec<Tap> = Vec::new(); |
| if let Some(host_ip) = cfg.host_ip { |
| if let Some(netmask) = cfg.netmask { |
| if let Some(mac_address) = cfg.mac_address { |
| let tap = Tap::new(false, false).map_err(Error::TapOpen)?; |
| tap.set_ip_addr(host_ip).map_err(Error::TapSetIp)?; |
| tap.set_netmask(netmask).map_err(Error::TapSetNetmask)?; |
| tap.set_mac_address(mac_address) |
| .map_err(Error::TapSetMacAddress)?; |
| |
| tap.enable().map_err(Error::TapEnable)?; |
| tap_interfaces.push(tap); |
| } |
| } |
| } |
| for tap_fd in cfg.tap_fd { |
| // Safe because we ensure that we get a unique handle to the fd. |
| let tap = unsafe { |
| Tap::from_raw_fd(validate_raw_fd(tap_fd).map_err(Error::ValidateTapFd)?) |
| .map_err(Error::CreateTapFd)? |
| }; |
| tap_interfaces.push(tap); |
| } |
| |
| let plugin_args: Vec<&str> = cfg.params.iter().map(|s| &s[..]).collect(); |
| |
| let plugin_path = match cfg.executable_path { |
| Some(Executable::Plugin(ref plugin_path)) => plugin_path.as_path(), |
| _ => panic!("Executable was not a plugin"), |
| }; |
| let vcpu_count = cfg.vcpu_count.unwrap_or(1) as u32; |
| let mem = GuestMemory::new(&[]).unwrap(); |
| let kvm = Kvm::new().map_err(Error::CreateKvm)?; |
| let mut vm = Vm::new(&kvm, mem).map_err(Error::CreateVm)?; |
| vm.create_irq_chip().map_err(Error::CreateIrqChip)?; |
| vm.create_pit().map_err(Error::CreatePIT)?; |
| |
| let mut plugin = Process::new(vcpu_count, plugin_path, &plugin_args, jail)?; |
| // Now that the jail for the plugin has been created and we had a chance to adjust gids there, |
| // we can drop all our capabilities in case we had any. |
| drop_capabilities().map_err(Error::DropCapabilities)?; |
| |
| let mut res = Ok(()); |
| // If Some, we will exit after enough time is passed to shutdown cleanly. |
| let mut dying_instant: Option<Instant> = None; |
| let duration_to_die = Duration::from_millis(1000); |
| |
| let exit_evt = Event::new().map_err(Error::CreateEvent)?; |
| let kill_signaled = Arc::new(AtomicBool::new(false)); |
| let mut vcpu_handles = Vec::with_capacity(vcpu_count as usize); |
| |
| let poll_ctx = |
| PollContext::build_with(&[(&exit_evt, Token::Exit), (&sigchld_fd, Token::ChildSignal)]) |
| .map_err(Error::PollContextAdd)?; |
| |
| let mut sockets_to_drop = Vec::new(); |
| let mut redo_poll_ctx_sockets = true; |
| // In this loop, make every attempt to not return early. If an error is encountered, set `res` |
| // to the error, set `dying_instant` to now, and signal the plugin that it will be killed soon. |
| // If the plugin cannot be signaled because it is dead of `signal_kill` failed, simply break |
| // from the poll loop so that the VCPU threads can be cleaned up. |
| 'poll: loop { |
| // After we have waited long enough, it's time to give up and exit. |
| if dying_instant |
| .map(|i| i.elapsed() >= duration_to_die) |
| .unwrap_or(false) |
| { |
| break; |
| } |
| |
| if redo_poll_ctx_sockets { |
| for (index, socket) in plugin.sockets().iter().enumerate() { |
| poll_ctx |
| .add(socket, Token::Plugin { index }) |
| .map_err(Error::PollContextAdd)?; |
| } |
| } |
| |
| let plugin_socket_count = plugin.sockets().len(); |
| let events = { |
| let poll_res = match dying_instant { |
| Some(inst) => poll_ctx.wait_timeout(duration_to_die - inst.elapsed()), |
| None => poll_ctx.wait(), |
| }; |
| match poll_res { |
| Ok(v) => v, |
| Err(e) => { |
| // Polling no longer works, time to break and cleanup, |
| if res.is_ok() { |
| res = Err(Error::Poll(e)); |
| } |
| break; |
| } |
| } |
| }; |
| for event in events.iter_readable() { |
| match event.token() { |
| Token::Exit => { |
| // No need to check the exit event if we are already doing cleanup. |
| let _ = poll_ctx.delete(&exit_evt); |
| dying_instant.get_or_insert(Instant::now()); |
| let sig_res = plugin.signal_kill(); |
| if res.is_ok() && sig_res.is_err() { |
| res = sig_res.map_err(Error::PluginKill); |
| } |
| } |
| Token::ChildSignal => { |
| // Print all available siginfo structs, then exit the loop. |
| loop { |
| match sigchld_fd.read() { |
| Ok(Some(siginfo)) => { |
| // If the plugin process has ended, there is no need to continue |
| // processing plugin connections, so we break early. |
| if siginfo.ssi_pid == plugin.pid() as u32 { |
| break 'poll; |
| } |
| // Because SIGCHLD is not expected from anything other than the |
| // plugin process, report it as an error. |
| if res.is_ok() { |
| res = Err(Error::SigChild { |
| pid: siginfo.ssi_pid, |
| signo: siginfo.ssi_signo, |
| status: siginfo.ssi_status, |
| code: siginfo.ssi_code, |
| }) |
| } |
| } |
| Ok(None) => break, // No more signals to read. |
| Err(e) => { |
| // Something really must be messed up for this to happen, continue |
| // processing connections for a limited time. |
| if res.is_ok() { |
| res = Err(Error::SignalFd(e)); |
| } |
| break; |
| } |
| } |
| } |
| // As we only spawn the plugin process, getting a SIGCHLD can only mean |
| // something went wrong. |
| dying_instant.get_or_insert(Instant::now()); |
| let sig_res = plugin.signal_kill(); |
| if res.is_ok() && sig_res.is_err() { |
| res = sig_res.map_err(Error::PluginKill); |
| } |
| } |
| Token::Plugin { index } => { |
| match plugin.handle_socket(index, &kvm, &mut vm, &vcpu_handles, &tap_interfaces) |
| { |
| Ok(_) => {} |
| // A HUP is an expected event for a socket, so don't bother warning about |
| // it. |
| Err(Error::PluginSocketHup) => sockets_to_drop.push(index), |
| // Only one connection out of potentially many is broken. Drop it, but don't |
| // start cleaning up. Because the error isn't returned, we will warn about |
| // it here. |
| Err(e) => { |
| warn!("error handling plugin socket: {}", e); |
| sockets_to_drop.push(index); |
| } |
| } |
| } |
| } |
| } |
| |
| if vcpu_handles.is_empty() && dying_instant.is_none() && plugin.is_started() { |
| let res = run_vcpus( |
| &kvm, |
| &vm, |
| &plugin, |
| vcpu_count, |
| &kill_signaled, |
| &exit_evt, |
| &mut vcpu_handles, |
| ); |
| if let Err(e) = res { |
| dying_instant.get_or_insert(Instant::now()); |
| error!("failed to start vcpus: {}", e); |
| } |
| } |
| |
| redo_poll_ctx_sockets = |
| !sockets_to_drop.is_empty() || plugin.sockets().len() != plugin_socket_count; |
| |
| // Cleanup all of the sockets that we have determined were disconnected or suffered some |
| // other error. |
| plugin.drop_sockets(&mut sockets_to_drop); |
| sockets_to_drop.clear(); |
| |
| if redo_poll_ctx_sockets { |
| for socket in plugin.sockets() { |
| let _ = poll_ctx.delete(socket); |
| } |
| } |
| } |
| |
| // vcpu threads MUST see the kill signaled flag, otherwise they may re-enter the VM. |
| kill_signaled.store(true, Ordering::SeqCst); |
| // Depending on how we ended up here, the plugin process, or a VCPU thread waiting for requests |
| // might be stuck. The `signal_kill` call will unstick all the VCPU threads by closing their |
| // blocked connections. |
| plugin.signal_kill().map_err(Error::PluginKill)?; |
| for handle in vcpu_handles { |
| match handle.kill(SIGRTMIN() + 0) { |
| Ok(_) => { |
| if let Err(e) = handle.join() { |
| error!("failed to join vcpu thread: {:?}", e); |
| } |
| } |
| Err(e) => error!("failed to kill vcpu thread: {}", e), |
| } |
| } |
| |
| match plugin.try_wait() { |
| // The plugin has run out of time by now |
| Ok(ProcessStatus::Running) => Err(Error::PluginTimeout), |
| // Return an error discovered earlier in this function. |
| Ok(ProcessStatus::Success) => res, |
| Ok(ProcessStatus::Fail(code)) => Err(Error::PluginFailed(code)), |
| Ok(ProcessStatus::Signal(code)) => Err(Error::PluginKilled(code)), |
| Err(e) => Err(Error::PluginWait(e)), |
| } |
| } |