blob: b6700b1753a996dd339b6ea5be4f3ad3e708e869 [file] [log] [blame]
// Copyright 2019 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
use std::borrow::Cow;
use std::cell::RefCell;
use std::collections::btree_map;
use std::collections::BTreeMap;
use std::ffi::{c_void, CStr, CString};
use std::fs::File;
use std::io;
use std::mem::{self, size_of, MaybeUninit};
use std::os::raw::{c_int, c_long};
use std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
use std::ptr;
use std::str::FromStr;
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
use std::sync::Arc;
use std::time::Duration;
use base::{error, ioctl_ior_nr, ioctl_iow_nr, ioctl_with_mut_ptr, ioctl_with_ptr, warn};
use data_model::DataInit;
use rand_ish::SimpleRng;
use sync::Mutex;
use crate::virtio::fs::filesystem::{
Context, DirectoryIterator, Entry, FileSystem, FsOptions, GetxattrReply, IoctlFlags,
IoctlIovec, IoctlReply, ListxattrReply, OpenOptions, SetattrValid, ZeroCopyReader,
ZeroCopyWriter,
};
use crate::virtio::fs::fuse;
use crate::virtio::fs::multikey::MultikeyBTreeMap;
use crate::virtio::fs::read_dir::ReadDir;
const EMPTY_CSTR: &[u8] = b"\0";
const ROOT_CSTR: &[u8] = b"/\0";
const PROC_CSTR: &[u8] = b"/proc\0";
const UNLABELED: &[u8] = b"unlabeled";
const USER_VIRTIOFS_XATTR: &[u8] = b"user.virtiofs.";
const SECURITY_XATTR: &[u8] = b"security.";
const SELINUX_XATTR: &[u8] = b"security.selinux";
const FSCRYPT_KEY_DESCRIPTOR_SIZE: usize = 8;
#[repr(C)]
#[derive(Debug, Clone, Copy)]
struct fscrypt_policy_v1 {
_version: u8,
_contents_encryption_mode: u8,
_filenames_encryption_mode: u8,
_flags: u8,
_master_key_descriptor: [u8; FSCRYPT_KEY_DESCRIPTOR_SIZE],
}
unsafe impl DataInit for fscrypt_policy_v1 {}
ioctl_ior_nr!(FS_IOC_SET_ENCRYPTION_POLICY, 0x66, 19, fscrypt_policy_v1);
ioctl_iow_nr!(FS_IOC_GET_ENCRYPTION_POLICY, 0x66, 21, fscrypt_policy_v1);
#[repr(C)]
#[derive(Clone, Copy)]
struct fsxattr {
_fsx_xflags: u32, /* xflags field value (get/set) */
_fsx_extsize: u32, /* extsize field value (get/set)*/
_fsx_nextents: u32, /* nextents field value (get) */
_fsx_projid: u32, /* project identifier (get/set) */
_fsx_cowextsize: u32, /* CoW extsize field value (get/set)*/
_fsx_pad: [u8; 8],
}
unsafe impl DataInit for fsxattr {}
ioctl_ior_nr!(FS_IOC_FSGETXATTR, 'X' as u32, 31, fsxattr);
ioctl_iow_nr!(FS_IOC_FSSETXATTR, 'X' as u32, 32, fsxattr);
ioctl_ior_nr!(FS_IOC_GETFLAGS, 'f' as u32, 1, c_long);
ioctl_iow_nr!(FS_IOC_SETFLAGS, 'f' as u32, 2, c_long);
ioctl_ior_nr!(FS_IOC32_GETFLAGS, 'f' as u32, 1, u32);
ioctl_iow_nr!(FS_IOC32_SETFLAGS, 'f' as u32, 2, u32);
ioctl_ior_nr!(FS_IOC64_GETFLAGS, 'f' as u32, 1, u64);
ioctl_iow_nr!(FS_IOC64_SETFLAGS, 'f' as u32, 2, u64);
type Inode = u64;
type Handle = u64;
#[derive(Clone, Copy, PartialOrd, Ord, PartialEq, Eq)]
struct InodeAltKey {
ino: libc::ino64_t,
dev: libc::dev_t,
}
#[derive(PartialEq, Eq)]
enum FileType {
Regular,
Directory,
Other,
}
impl From<libc::mode_t> for FileType {
fn from(mode: libc::mode_t) -> Self {
match mode & libc::S_IFMT {
libc::S_IFREG => FileType::Regular,
libc::S_IFDIR => FileType::Directory,
_ => FileType::Other,
}
}
}
struct InodeData {
inode: Inode,
// Most of these aren't actually files but ¯\_(ツ)_/¯.
file: File,
refcount: AtomicU64,
filetype: FileType,
}
struct HandleData {
inode: Inode,
file: Mutex<File>,
}
macro_rules! scoped_cred {
($name:ident, $ty:ty, $syscall_nr:expr) => {
#[derive(Debug)]
struct $name {
old: $ty,
}
impl $name {
// Changes the effective uid/gid of the current thread to `val`. Changes the thread's
// credentials back to `old` when the returned struct is dropped.
fn new(val: $ty, old: $ty) -> io::Result<Option<$name>> {
if val == old {
// Nothing to do since we already have the correct value.
return Ok(None);
}
// We want credential changes to be per-thread because otherwise
// we might interfere with operations being carried out on other
// threads with different uids/gids. However, posix requires that
// all threads in a process share the same credentials. To do this
// libc uses signals to ensure that when one thread changes its
// credentials the other threads do the same thing.
//
// So instead we invoke the syscall directly in order to get around
// this limitation. Another option is to use the setfsuid and
// setfsgid systems calls. However since those calls have no way to
// return an error, it's preferable to do this instead.
// This call is safe because it doesn't modify any memory and we
// check the return value.
let res = unsafe { libc::syscall($syscall_nr, -1, val, -1) };
if res == 0 {
Ok(Some($name { old }))
} else {
Err(io::Error::last_os_error())
}
}
}
impl Drop for $name {
fn drop(&mut self) {
let res = unsafe { libc::syscall($syscall_nr, -1, self.old, -1) };
if res < 0 {
error!(
"failed to change credentials back to {}: {}",
self.old,
io::Error::last_os_error(),
);
}
}
}
};
}
#[cfg(not(target_arch = "arm"))]
scoped_cred!(ScopedUid, libc::uid_t, libc::SYS_setresuid);
#[cfg(target_arch = "arm")]
scoped_cred!(ScopedUid, libc::uid_t, libc::SYS_setresuid32);
#[cfg(not(target_arch = "arm"))]
scoped_cred!(ScopedGid, libc::gid_t, libc::SYS_setresgid);
#[cfg(target_arch = "arm")]
scoped_cred!(ScopedGid, libc::gid_t, libc::SYS_setresgid32);
#[cfg(not(target_arch = "arm"))]
const SYS_GETEUID: libc::c_long = libc::SYS_geteuid;
#[cfg(target_arch = "arm")]
const SYS_GETEUID: libc::c_long = libc::SYS_geteuid32;
#[cfg(not(target_arch = "arm"))]
const SYS_GETEGID: libc::c_long = libc::SYS_getegid;
#[cfg(target_arch = "arm")]
const SYS_GETEGID: libc::c_long = libc::SYS_getegid32;
thread_local! {
// Both these calls are safe because they take no parameters, and only return an integer value.
// The kernel also guarantees that they can never fail.
static THREAD_EUID: libc::uid_t = unsafe { libc::syscall(SYS_GETEUID) as libc::uid_t };
static THREAD_EGID: libc::gid_t = unsafe { libc::syscall(SYS_GETEGID) as libc::gid_t };
}
fn set_creds(
uid: libc::uid_t,
gid: libc::gid_t,
) -> io::Result<(Option<ScopedUid>, Option<ScopedGid>)> {
let olduid = THREAD_EUID.with(|uid| *uid);
let oldgid = THREAD_EGID.with(|gid| *gid);
// We have to change the gid before we change the uid because if we change the uid first then we
// lose the capability to change the gid. However changing back can happen in any order.
ScopedGid::new(gid, oldgid).and_then(|gid| Ok((ScopedUid::new(uid, olduid)?, gid)))
}
thread_local!(static THREAD_FSCREATE: RefCell<Option<File>> = RefCell::new(None));
// Opens and returns a write-only handle to /proc/thread-self/attr/fscreate. Panics if it fails to
// open the file.
fn open_fscreate(proc: &File) -> File {
// Safe because this is a valid c-string.
let fscreate = unsafe { CStr::from_bytes_with_nul_unchecked(b"thread-self/attr/fscreate\0") };
// Safe because this doesn't modify any memory and we check the return value.
let fd = unsafe {
libc::openat(
proc.as_raw_fd(),
fscreate.as_ptr(),
libc::O_CLOEXEC | libc::O_WRONLY,
)
};
// We don't expect this to fail and we're not in a position to return an error here so just
// panic.
if fd < 0 {
panic!(
"Failed to open /proc/thread-self/attr/fscreate: {}",
io::Error::last_os_error()
);
}
// Safe because we just opened this fd.
unsafe { File::from_raw_fd(fd) }
}
struct ScopedSecurityContext;
impl ScopedSecurityContext {
fn new(proc: &File, ctx: &CStr) -> io::Result<ScopedSecurityContext> {
THREAD_FSCREATE.with(|thread_fscreate| {
let mut fscreate = thread_fscreate.borrow_mut();
let file = fscreate.get_or_insert_with(|| open_fscreate(proc));
// Safe because this doesn't modify any memory and we check the return value.
let ret = unsafe {
libc::write(
file.as_raw_fd(),
ctx.as_ptr() as *const libc::c_void,
ctx.to_bytes_with_nul().len(),
)
};
if ret < 0 {
Err(io::Error::last_os_error())
} else {
Ok(ScopedSecurityContext)
}
})
}
}
impl Drop for ScopedSecurityContext {
fn drop(&mut self) {
THREAD_FSCREATE.with(|thread_fscreate| {
// expect is safe here because the thread local would have been initialized by the call
// to `new` above.
let fscreate = thread_fscreate.borrow();
let file = fscreate
.as_ref()
.expect("Uninitialized thread-local when dropping ScopedSecurityContext");
// Safe because this doesn't modify any memory and we check the return value.
let ret = unsafe { libc::write(file.as_raw_fd(), ptr::null(), 0) };
if ret < 0 {
warn!(
"Failed to restore security context: {}",
io::Error::last_os_error()
);
}
})
}
}
fn ebadf() -> io::Error {
io::Error::from_raw_os_error(libc::EBADF)
}
fn stat(f: &File) -> io::Result<libc::stat64> {
let mut st = MaybeUninit::<libc::stat64>::zeroed();
// Safe because this is a constant value and a valid C string.
let pathname = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) };
// Safe because the kernel will only write data in `st` and we check the return
// value.
let res = unsafe {
libc::fstatat64(
f.as_raw_fd(),
pathname.as_ptr(),
st.as_mut_ptr(),
libc::AT_EMPTY_PATH | libc::AT_SYMLINK_NOFOLLOW,
)
};
if res >= 0 {
// Safe because the kernel guarantees that the struct is now fully initialized.
Ok(unsafe { st.assume_init() })
} else {
Err(io::Error::last_os_error())
}
}
/// The caching policy that the file system should report to the FUSE client. By default the FUSE
/// protocol uses close-to-open consistency. This means that any cached contents of the file are
/// invalidated the next time that file is opened.
#[derive(Debug, Clone)]
pub enum CachePolicy {
/// The client should never cache file data and all I/O should be directly forwarded to the
/// server. This policy must be selected when file contents may change without the knowledge of
/// the FUSE client (i.e., the file system does not have exclusive access to the directory).
Never,
/// The client is free to choose when and how to cache file data. This is the default policy and
/// uses close-to-open consistency as described in the enum documentation.
Auto,
/// The client should always cache file data. This means that the FUSE client will not
/// invalidate any cached data that was returned by the file system the last time the file was
/// opened. This policy should only be selected when the file system has exclusive access to the
/// directory.
Always,
}
impl FromStr for CachePolicy {
type Err = &'static str;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"never" | "Never" | "NEVER" => Ok(CachePolicy::Never),
"auto" | "Auto" | "AUTO" => Ok(CachePolicy::Auto),
"always" | "Always" | "ALWAYS" => Ok(CachePolicy::Always),
_ => Err("invalid cache policy"),
}
}
}
impl Default for CachePolicy {
fn default() -> Self {
CachePolicy::Auto
}
}
/// Options that configure the behavior of the file system.
#[derive(Debug, Clone)]
pub struct Config {
/// How long the FUSE client should consider directory entries to be valid. If the contents of a
/// directory can only be modified by the FUSE client (i.e., the file system has exclusive
/// access), then this should be a large value.
///
/// The default value for this option is 5 seconds.
pub entry_timeout: Duration,
/// How long the FUSE client should consider file and directory attributes to be valid. If the
/// attributes of a file or directory can only be modified by the FUSE client (i.e., the file
/// system has exclusive access), then this should be set to a large value.
///
/// The default value for this option is 5 seconds.
pub attr_timeout: Duration,
/// The caching policy the file system should use. See the documentation of `CachePolicy` for
/// more details.
pub cache_policy: CachePolicy,
/// Whether the file system should enabled writeback caching. This can improve performance as it
/// allows the FUSE client to cache and coalesce multiple writes before sending them to the file
/// system. However, enabling this option can increase the risk of data corruption if the file
/// contents can change without the knowledge of the FUSE client (i.e., the server does **NOT**
/// have exclusive access). Additionally, the file system should have read access to all files
/// in the directory it is serving as the FUSE client may send read requests even for files
/// opened with `O_WRONLY`.
///
/// Therefore callers should only enable this option when they can guarantee that: 1) the file
/// system has exclusive access to the directory and 2) the file system has read permissions for
/// all files in that directory.
///
/// The default value for this option is `false`.
pub writeback: bool,
/// Controls whether security.* xattrs (except for security.selinux) are re-written. When this
/// is set to true, the server will add a "user.virtiofs" prefix to xattrs in the security
/// namespace. Setting these xattrs requires CAP_SYS_ADMIN in the namespace where the file
/// system was mounted and since the server usually runs in an unprivileged user namespace, it's
/// unlikely to have that capability.
///
/// The default value for this option is `false`.
pub rewrite_security_xattrs: bool,
/// Use case-insensitive lookups for directory entries (ASCII only).
///
/// The default value for this option is `false`.
pub ascii_casefold: bool,
}
impl Default for Config {
fn default() -> Self {
Config {
entry_timeout: Duration::from_secs(5),
attr_timeout: Duration::from_secs(5),
cache_policy: Default::default(),
writeback: false,
rewrite_security_xattrs: false,
ascii_casefold: false,
}
}
}
/// A file system that simply "passes through" all requests it receives to the underlying file
/// system. To keep the implementation simple it servers the contents of its root directory. Users
/// that wish to serve only a specific directory should set up the environment so that that
/// directory ends up as the root of the file system process. One way to accomplish this is via a
/// combination of mount namespaces and the pivot_root system call.
pub struct PassthroughFs {
// File descriptors for various points in the file system tree. These fds are always opened with
// the `O_PATH` option so they cannot be used for reading or writing any data. See the
// documentation of the `O_PATH` flag in `open(2)` for more details on what one can and cannot
// do with an fd opened with this flag.
inodes: Mutex<MultikeyBTreeMap<Inode, InodeAltKey, Arc<InodeData>>>,
next_inode: AtomicU64,
// File descriptors for open files and directories. Unlike the fds in `inodes`, these _can_ be
// used for reading and writing data.
handles: Mutex<BTreeMap<Handle, Arc<HandleData>>>,
next_handle: AtomicU64,
// File descriptor pointing to the `/proc` directory. This is used to convert an fd from
// `inodes` into one that can go into `handles`. This is accomplished by reading the
// `self/fd/{}` symlink. We keep an open fd here in case the file system tree that we are meant
// to be serving doesn't have access to `/proc`.
proc: File,
// Whether writeback caching is enabled for this directory. This will only be true when
// `cfg.writeback` is true and `init` was called with `FsOptions::WRITEBACK_CACHE`.
writeback: AtomicBool,
// Used to ensure that only one thread at a time uses chdir(). Since chdir() affects the
// process-wide CWD, we cannot allow more than one thread to do it at the same time.
chdir_mutex: Mutex<()>,
cfg: Config,
}
impl PassthroughFs {
pub fn new(cfg: Config) -> io::Result<PassthroughFs> {
// Safe because this is a constant value and a valid C string.
let proc_cstr = unsafe { CStr::from_bytes_with_nul_unchecked(PROC_CSTR) };
// Safe because this doesn't modify any memory and we check the return value.
let fd = unsafe {
libc::openat(
libc::AT_FDCWD,
proc_cstr.as_ptr(),
libc::O_PATH | libc::O_NOFOLLOW | libc::O_CLOEXEC,
)
};
if fd < 0 {
return Err(io::Error::last_os_error());
}
// Safe because we just opened this fd.
let proc = unsafe { File::from_raw_fd(fd) };
Ok(PassthroughFs {
inodes: Mutex::new(MultikeyBTreeMap::new()),
next_inode: AtomicU64::new(fuse::ROOT_ID + 1),
handles: Mutex::new(BTreeMap::new()),
next_handle: AtomicU64::new(0),
proc,
writeback: AtomicBool::new(false),
chdir_mutex: Mutex::new(()),
cfg,
})
}
pub fn keep_fds(&self) -> Vec<RawFd> {
vec![self.proc.as_raw_fd()]
}
fn rewrite_xattr_name<'xattr>(&self, name: &'xattr CStr) -> Cow<'xattr, CStr> {
if !self.cfg.rewrite_security_xattrs {
return Cow::Borrowed(name);
}
// Does not include nul-terminator.
let buf = name.to_bytes();
if !buf.starts_with(SECURITY_XATTR) || buf == SELINUX_XATTR {
return Cow::Borrowed(name);
}
let mut newname = USER_VIRTIOFS_XATTR.to_vec();
newname.extend_from_slice(buf);
// The unwrap is safe here because the prefix doesn't contain any interior nul-bytes and the
// to_bytes() call above will not return a byte slice with any interior nul-bytes either.
Cow::Owned(CString::new(newname).expect("Failed to re-write xattr name"))
}
fn find_inode(&self, inode: Inode) -> io::Result<Arc<InodeData>> {
self.inodes
.lock()
.get(&inode)
.map(Arc::clone)
.ok_or_else(ebadf)
}
fn find_handle(&self, handle: Handle, inode: Inode) -> io::Result<Arc<HandleData>> {
self.handles
.lock()
.get(&handle)
.filter(|hd| hd.inode == inode)
.map(Arc::clone)
.ok_or_else(ebadf)
}
fn open_inode(&self, inode: &InodeData, mut flags: i32) -> io::Result<File> {
let pathname = CString::new(format!("self/fd/{}", inode.file.as_raw_fd()))
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
// When writeback caching is enabled, the kernel may send read requests even if the
// userspace program opened the file write-only. So we need to ensure that we have opened
// the file for reading as well as writing.
let writeback = self.writeback.load(Ordering::Relaxed);
if writeback && flags & libc::O_ACCMODE == libc::O_WRONLY {
flags &= !libc::O_ACCMODE;
flags |= libc::O_RDWR;
}
// When writeback caching is enabled the kernel is responsible for handling `O_APPEND`.
// However, this breaks atomicity as the file may have changed on disk, invalidating the
// cached copy of the data in the kernel and the offset that the kernel thinks is the end of
// the file. Just allow this for now as it is the user's responsibility to enable writeback
// caching only for directories that are not shared. It also means that we need to clear the
// `O_APPEND` flag.
if writeback && flags & libc::O_APPEND != 0 {
flags &= !libc::O_APPEND;
}
// Safe because this doesn't modify any memory and we check the return value. We don't
// really check `flags` because if the kernel can't handle poorly specified flags then we
// have much bigger problems. Also, clear the `O_NOFOLLOW` flag if it is set since we need
// to follow the `/proc/self/fd` symlink to get the file.
let fd = unsafe {
libc::openat(
self.proc.as_raw_fd(),
pathname.as_ptr(),
(flags | libc::O_CLOEXEC) & !(libc::O_NOFOLLOW | libc::O_DIRECT),
)
};
if fd < 0 {
return Err(io::Error::last_os_error());
}
// Safe because we just opened this fd.
Ok(unsafe { File::from_raw_fd(fd) })
}
// Performs an ascii case insensitive lookup and returns an O_PATH fd for the entry, if found.
fn ascii_casefold_lookup(&self, dir: &InodeData, name: &[u8]) -> io::Result<RawFd> {
let parent = self.open_inode(dir, libc::O_RDONLY | libc::O_DIRECTORY)?;
let mut buf = [0u8; 1024];
let mut offset = 0;
loop {
let mut read_dir = ReadDir::new(&parent, offset, &mut buf[..])?;
if read_dir.remaining() == 0 {
break;
}
while let Some(entry) = read_dir.next() {
offset = entry.offset as libc::off64_t;
if name.eq_ignore_ascii_case(entry.name.to_bytes()) {
return Ok(unsafe {
libc::openat(
parent.as_raw_fd(),
entry.name.as_ptr(),
libc::O_PATH | libc::O_NOFOLLOW | libc::O_CLOEXEC,
)
});
}
}
}
Err(io::Error::from_raw_os_error(libc::ENOENT))
}
fn do_lookup(&self, parent: &InodeData, name: &CStr) -> io::Result<Entry> {
let fd = {
// Safe because this doesn't modify any memory and we check the return value.
let fd = unsafe {
libc::openat(
parent.file.as_raw_fd(),
name.as_ptr(),
libc::O_PATH | libc::O_NOFOLLOW | libc::O_CLOEXEC,
)
};
if fd < 0 && self.cfg.ascii_casefold {
// Ignore any errors during casefold lookup.
self.ascii_casefold_lookup(parent, name.to_bytes())
.unwrap_or(fd)
} else {
fd
}
};
if fd < 0 {
return Err(io::Error::last_os_error());
}
// Safe because we just opened this fd.
let f = unsafe { File::from_raw_fd(fd) };
let st = stat(&f)?;
let altkey = InodeAltKey {
ino: st.st_ino,
dev: st.st_dev,
};
let data = self.inodes.lock().get_alt(&altkey).map(Arc::clone);
let inode = if let Some(data) = data {
// Matches with the release store in `forget`.
data.refcount.fetch_add(1, Ordering::Acquire);
data.inode
} else {
// There is a possible race here where 2 threads end up adding the same file
// into the inode list. However, since each of those will get a unique Inode
// value and unique file descriptors this shouldn't be that much of a problem.
let inode = self.next_inode.fetch_add(1, Ordering::Relaxed);
self.inodes.lock().insert(
inode,
InodeAltKey {
ino: st.st_ino,
dev: st.st_dev,
},
Arc::new(InodeData {
inode,
file: f,
refcount: AtomicU64::new(1),
filetype: st.st_mode.into(),
}),
);
inode
};
Ok(Entry {
inode,
generation: 0,
attr: st,
attr_timeout: self.cfg.attr_timeout.clone(),
entry_timeout: self.cfg.entry_timeout.clone(),
})
}
fn do_open(&self, inode: Inode, flags: u32) -> io::Result<(Option<Handle>, OpenOptions)> {
let inode_data = self.find_inode(inode)?;
let file = Mutex::new(self.open_inode(&inode_data, flags as i32)?);
let handle = self.next_handle.fetch_add(1, Ordering::Relaxed);
let data = HandleData { inode, file };
self.handles.lock().insert(handle, Arc::new(data));
let mut opts = OpenOptions::empty();
match self.cfg.cache_policy {
// We only set the direct I/O option on files.
CachePolicy::Never => opts.set(
OpenOptions::DIRECT_IO,
flags & (libc::O_DIRECTORY as u32) == 0,
),
CachePolicy::Always => {
opts |= if flags & (libc::O_DIRECTORY as u32) == 0 {
OpenOptions::KEEP_CACHE
} else {
OpenOptions::CACHE_DIR
}
}
_ => {}
};
Ok((Some(handle), opts))
}
fn do_release(&self, inode: Inode, handle: Handle) -> io::Result<()> {
let mut handles = self.handles.lock();
if let btree_map::Entry::Occupied(e) = handles.entry(handle) {
if e.get().inode == inode {
// We don't need to close the file here because that will happen automatically when
// the last `Arc` is dropped.
e.remove();
return Ok(());
}
}
Err(ebadf())
}
fn do_getattr(&self, inode: &InodeData) -> io::Result<(libc::stat64, Duration)> {
let st = stat(&inode.file)?;
Ok((st, self.cfg.attr_timeout.clone()))
}
fn do_unlink(&self, parent: &InodeData, name: &CStr, flags: libc::c_int) -> io::Result<()> {
// Safe because this doesn't modify any memory and we check the return value.
let res = unsafe { libc::unlinkat(parent.file.as_raw_fd(), name.as_ptr(), flags) };
if res == 0 {
Ok(())
} else {
Err(io::Error::last_os_error())
}
}
// Changes the CWD to `self.proc`, runs `f`, and then changes the CWD back to the root
// directory. This effectively emulates an *at syscall starting at /proc, which is useful when
// there is no *at syscall available. Panics if any of the fchdir calls fail or if there is no
// root inode.
fn with_proc_chdir<F, T>(&self, f: F) -> T
where
F: FnOnce() -> T,
{
let root = self
.find_inode(fuse::ROOT_ID)
.expect("failed to find root inode");
let chdir_lock = self.chdir_mutex.lock();
// Safe because this doesn't modify any memory and we check the return value. Since the
// fchdir should never fail we just use debug_asserts.
let proc_cwd = unsafe { libc::fchdir(self.proc.as_raw_fd()) };
debug_assert_eq!(
proc_cwd,
0,
"failed to fchdir to /proc: {}",
io::Error::last_os_error()
);
let res = f();
// Safe because this doesn't modify any memory and we check the return value. Since the
// fchdir should never fail we just use debug_asserts.
let root_cwd = unsafe { libc::fchdir(root.file.as_raw_fd()) };
debug_assert_eq!(
root_cwd,
0,
"failed to fchdir back to root directory: {}",
io::Error::last_os_error()
);
mem::drop(chdir_lock);
res
}
fn do_getxattr(&self, inode: &InodeData, name: &CStr, value: &mut [u8]) -> io::Result<usize> {
let res = if inode.filetype == FileType::Other {
// For non-regular files and directories, we cannot open the fd normally. Instead we
// emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
// and then setting the CWD back to the root directory.
let path = CString::new(format!("self/fd/{}", inode.file.as_raw_fd()))
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
// Safe because this will only modify `value` and we check the return value.
self.with_proc_chdir(|| unsafe {
libc::getxattr(
path.as_ptr(),
name.as_ptr(),
value.as_mut_ptr() as *mut libc::c_void,
value.len() as libc::size_t,
)
})
} else {
// For regular files and directories, we can just open an fd and use fgetxattr.
let dir = if inode.filetype == FileType::Directory {
libc::O_DIRECTORY
} else {
0
};
let f = self.open_inode(inode, libc::O_RDONLY | dir)?;
// Safe because this will only write to `value` and we check the return value.
unsafe {
libc::fgetxattr(
f.as_raw_fd(),
name.as_ptr(),
value.as_mut_ptr() as *mut libc::c_void,
value.len() as libc::size_t,
)
}
};
if res < 0 {
Err(io::Error::last_os_error())
} else {
Ok(res as usize)
}
}
// Checks whether `inode` has a default posix acl xattr.
fn has_default_posix_acl(&self, inode: &InodeData) -> io::Result<bool> {
// Safe because this is a valid c string with no interior nul-bytes.
let acl = unsafe { CStr::from_bytes_with_nul_unchecked(b"system.posix_acl_default\0") };
if let Err(e) = self.do_getxattr(inode, acl, &mut []) {
match e.raw_os_error() {
Some(libc::ENODATA) | Some(libc::EOPNOTSUPP) => Ok(false),
_ => Err(e),
}
} else {
Ok(true)
}
}
fn get_encryption_policy(&self, handle: Handle) -> io::Result<IoctlReply> {
let data = self
.handles
.lock()
.get(&handle)
.map(Arc::clone)
.ok_or_else(ebadf)?;
let mut buf = MaybeUninit::<fscrypt_policy_v1>::zeroed();
let file = data.file.lock();
// Safe because the kernel will only write to `buf` and we check the return value.
let res =
unsafe { ioctl_with_mut_ptr(&*file, FS_IOC_GET_ENCRYPTION_POLICY(), buf.as_mut_ptr()) };
if res < 0 {
Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
} else {
// Safe because the kernel guarantees that the policy is now initialized.
let policy = unsafe { buf.assume_init() };
Ok(IoctlReply::Done(Ok(policy.as_slice().to_vec())))
}
}
fn set_encryption_policy<R: io::Read>(&self, handle: Handle, r: R) -> io::Result<IoctlReply> {
let data = self
.handles
.lock()
.get(&handle)
.map(Arc::clone)
.ok_or_else(ebadf)?;
let policy = fscrypt_policy_v1::from_reader(r)?;
let file = data.file.lock();
// Safe because the kernel will only read from `policy` and we check the return value.
let res = unsafe { ioctl_with_ptr(&*file, FS_IOC_SET_ENCRYPTION_POLICY(), &policy) };
if res < 0 {
Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
} else {
Ok(IoctlReply::Done(Ok(Vec::new())))
}
}
fn get_fsxattr(&self, handle: Handle) -> io::Result<IoctlReply> {
let data = self
.handles
.lock()
.get(&handle)
.map(Arc::clone)
.ok_or_else(ebadf)?;
let mut buf = MaybeUninit::<fsxattr>::zeroed();
let file = data.file.lock();
// Safe because the kernel will only write to `buf` and we check the return value.
let res = unsafe { ioctl_with_mut_ptr(&*file, FS_IOC_FSGETXATTR(), buf.as_mut_ptr()) };
if res < 0 {
Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
} else {
// Safe because the kernel guarantees that the policy is now initialized.
let xattr = unsafe { buf.assume_init() };
Ok(IoctlReply::Done(Ok(xattr.as_slice().to_vec())))
}
}
fn set_fsxattr<R: io::Read>(&self, handle: Handle, r: R) -> io::Result<IoctlReply> {
let data = self
.handles
.lock()
.get(&handle)
.map(Arc::clone)
.ok_or_else(ebadf)?;
let attr = fsxattr::from_reader(r)?;
let file = data.file.lock();
// Safe because this doesn't modify any memory and we check the return value.
let res = unsafe { ioctl_with_ptr(&*file, FS_IOC_FSSETXATTR(), &attr) };
if res < 0 {
Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
} else {
Ok(IoctlReply::Done(Ok(Vec::new())))
}
}
fn get_flags(&self, handle: Handle) -> io::Result<IoctlReply> {
let data = self
.handles
.lock()
.get(&handle)
.map(Arc::clone)
.ok_or_else(ebadf)?;
// The ioctl encoding is a long but the parameter is actually an int.
let mut flags: c_int = 0;
let file = data.file.lock();
// Safe because the kernel will only write to `flags` and we check the return value.
let res = unsafe { ioctl_with_mut_ptr(&*file, FS_IOC_GETFLAGS(), &mut flags) };
if res < 0 {
Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
} else {
Ok(IoctlReply::Done(Ok(flags.to_ne_bytes().to_vec())))
}
}
fn set_flags<R: io::Read>(&self, handle: Handle, r: R) -> io::Result<IoctlReply> {
let data = self
.handles
.lock()
.get(&handle)
.map(Arc::clone)
.ok_or_else(ebadf)?;
// The ioctl encoding is a long but the parameter is actually an int.
let flags = c_int::from_reader(r)?;
let file = data.file.lock();
// Safe because this doesn't modify any memory and we check the return value.
let res = unsafe { ioctl_with_ptr(&*file, FS_IOC_SETFLAGS(), &flags) };
if res < 0 {
Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
} else {
Ok(IoctlReply::Done(Ok(Vec::new())))
}
}
}
fn forget_one(
inodes: &mut MultikeyBTreeMap<Inode, InodeAltKey, Arc<InodeData>>,
inode: Inode,
count: u64,
) {
if let Some(data) = inodes.get(&inode) {
// Acquiring the write lock on the inode map prevents new lookups from incrementing the
// refcount but there is the possibility that a previous lookup already acquired a
// reference to the inode data and is in the process of updating the refcount so we need
// to loop here until we can decrement successfully.
loop {
let refcount = data.refcount.load(Ordering::Relaxed);
// Saturating sub because it doesn't make sense for a refcount to go below zero and
// we don't want misbehaving clients to cause integer overflow.
let new_count = refcount.saturating_sub(count);
// Synchronizes with the acquire load in `do_lookup`.
if data
.refcount
.compare_and_swap(refcount, new_count, Ordering::Release)
== refcount
{
if new_count == 0 {
// We just removed the last refcount for this inode. There's no need for an
// acquire fence here because we hold a write lock on the inode map and any
// thread that is waiting to do a forget on the same inode will have to wait
// until we release the lock. So there's is no other release store for us to
// synchronize with before deleting the entry.
inodes.remove(&inode);
}
break;
}
}
}
}
// Strips any `user.virtiofs.` prefix from `buf`. If buf contains one or more nul-bytes, each
// nul-byte-separated slice is treated as a C string and the prefix is stripped from each one.
fn strip_xattr_prefix(buf: &mut Vec<u8>) {
fn next_cstr(b: &[u8], start: usize) -> Option<&[u8]> {
if start >= b.len() {
return None;
}
let end = b[start..]
.iter()
.position(|&c| c == b'\0')
.map(|p| start + p + 1)
.unwrap_or(b.len());
Some(&b[start..end])
}
let mut pos = 0;
while let Some(name) = next_cstr(&buf, pos) {
if !name.starts_with(USER_VIRTIOFS_XATTR) {
pos += name.len();
continue;
}
let newlen = name.len() - USER_VIRTIOFS_XATTR.len();
buf.drain(pos..pos + USER_VIRTIOFS_XATTR.len());
pos += newlen;
}
}
// Like mkdtemp but also takes a mode parameter rather than always using 0o700. This is needed
// because if the parent has a default posix acl set then the meaning of the mode parameter in the
// mkdir call completely changes: the actual mode is inherited from the default acls set in the
// parent and the mode is treated like a umask (the real umask is ignored in this case).
// Additionally, this only happens when the inode is first created and not on subsequent fchmod
// calls so we really need to use the requested mode from the very beginning and not the default
// 0o700 mode that mkdtemp uses.
fn create_temp_dir(parent: &File, mode: libc::mode_t) -> io::Result<CString> {
const MAX_ATTEMPTS: usize = 64;
let mut seed = 0u64.to_ne_bytes();
// Safe because this will only modify `seed` and we check the return value.
let ret = unsafe {
libc::syscall(
libc::SYS_getrandom,
seed.as_mut_ptr() as *mut c_void,
seed.len(),
0,
)
};
if ret < 0 {
return Err(io::Error::last_os_error());
}
let mut rng = SimpleRng::new(u64::from_ne_bytes(seed));
// Set an upper bound so that we don't end up spinning here forever.
for _ in 0..MAX_ATTEMPTS {
let mut name = String::from(".");
name.push_str(&rng.str(6));
let name = CString::new(name).expect("SimpleRng produced string with nul-bytes");
// Safe because this doesn't modify any memory and we check the return value.
let ret = unsafe { libc::mkdirat(parent.as_raw_fd(), name.as_ptr(), mode) };
if ret == 0 {
return Ok(name);
}
let e = io::Error::last_os_error();
if let Some(libc::EEXIST) = e.raw_os_error() {
continue;
} else {
return Err(e);
}
}
Err(io::Error::from_raw_os_error(libc::EAGAIN))
}
// A temporary directory that is automatically deleted when dropped unless `into_inner()` is called.
// This isn't a general-purpose temporary directory and is only intended to be used to ensure that
// there are no leaks when initializing a newly created directory with the correct metadata (see the
// implementation of `mkdir()` below). The directory is removed via a call to `unlinkat` so callers
// are not allowed to actually populate this temporary directory with any entries (or else deleting
// the directory will fail).
struct TempDir<'a> {
parent: &'a File,
name: CString,
file: File,
}
impl<'a> TempDir<'a> {
// Creates a new temporary directory in `parent` with a randomly generated name. `parent` must
// be a directory.
fn new(parent: &File, mode: libc::mode_t) -> io::Result<TempDir> {
let name = create_temp_dir(parent, mode)?;
// Safe because this doesn't modify any memory and we check the return value.
let fd = unsafe {
libc::openat(
parent.as_raw_fd(),
name.as_ptr(),
libc::O_DIRECTORY | libc::O_CLOEXEC,
)
};
if fd < 0 {
return Err(io::Error::last_os_error());
}
Ok(TempDir {
parent,
name,
// Safe because we just opened this fd.
file: unsafe { File::from_raw_fd(fd) },
})
}
fn basename(&self) -> &CStr {
&self.name
}
// Consumes the `TempDir`, returning the inner `File` without deleting the temporary
// directory.
fn into_inner(self) -> (CString, File) {
// Safe because this is a valid pointer and we are going to call `mem::forget` on `self` so
// we will not be aliasing memory.
let _parent = unsafe { ptr::read(&self.parent) };
let name = unsafe { ptr::read(&self.name) };
let file = unsafe { ptr::read(&self.file) };
mem::forget(self);
(name, file)
}
}
impl<'a> AsRawFd for TempDir<'a> {
fn as_raw_fd(&self) -> RawFd {
self.file.as_raw_fd()
}
}
impl<'a> Drop for TempDir<'a> {
fn drop(&mut self) {
// Safe because this doesn't modify any memory and we check the return value.
let ret = unsafe {
libc::unlinkat(
self.parent.as_raw_fd(),
self.name.as_ptr(),
libc::AT_REMOVEDIR,
)
};
if ret < 0 {
println!("Failed to remove tempdir: {}", io::Error::last_os_error());
error!("Failed to remove tempdir: {}", io::Error::last_os_error());
}
}
}
impl FileSystem for PassthroughFs {
type Inode = Inode;
type Handle = Handle;
type DirIter = ReadDir<Box<[u8]>>;
fn init(&self, capable: FsOptions) -> io::Result<FsOptions> {
// Safe because this is a constant value and a valid C string.
let root = unsafe { CStr::from_bytes_with_nul_unchecked(ROOT_CSTR) };
// Safe because this doesn't modify any memory and we check the return value.
// We use `O_PATH` because we just want this for traversing the directory tree
// and not for actually reading the contents.
let fd = unsafe {
libc::openat(
libc::AT_FDCWD,
root.as_ptr(),
libc::O_PATH | libc::O_NOFOLLOW | libc::O_CLOEXEC,
)
};
if fd < 0 {
return Err(io::Error::last_os_error());
}
// Safe because we just opened this fd above.
let f = unsafe { File::from_raw_fd(fd) };
let st = stat(&f)?;
// Safe because this doesn't modify any memory and there is no need to check the return
// value because this system call always succeeds. We need to clear the umask here because
// we want the client to be able to set all the bits in the mode.
unsafe { libc::umask(0o000) };
let mut inodes = self.inodes.lock();
// Not sure why the root inode gets a refcount of 2 but that's what libfuse does.
inodes.insert(
fuse::ROOT_ID,
InodeAltKey {
ino: st.st_ino,
dev: st.st_dev,
},
Arc::new(InodeData {
inode: fuse::ROOT_ID,
file: f,
refcount: AtomicU64::new(2),
filetype: st.st_mode.into(),
}),
);
let mut opts = FsOptions::DO_READDIRPLUS
| FsOptions::READDIRPLUS_AUTO
| FsOptions::EXPORT_SUPPORT
| FsOptions::DONT_MASK
| FsOptions::SECURITY_CONTEXT
| FsOptions::POSIX_ACL;
if self.cfg.writeback && capable.contains(FsOptions::WRITEBACK_CACHE) {
opts |= FsOptions::WRITEBACK_CACHE;
self.writeback.store(true, Ordering::Relaxed);
}
Ok(opts)
}
fn destroy(&self) {
self.handles.lock().clear();
self.inodes.lock().clear();
}
fn statfs(&self, _ctx: Context, inode: Inode) -> io::Result<libc::statvfs64> {
let data = self.find_inode(inode)?;
let mut out = MaybeUninit::<libc::statvfs64>::zeroed();
// Safe because this will only modify `out` and we check the return value.
let res = unsafe { libc::fstatvfs64(data.file.as_raw_fd(), out.as_mut_ptr()) };
if res == 0 {
// Safe because the kernel guarantees that `out` has been initialized.
Ok(unsafe { out.assume_init() })
} else {
Err(io::Error::last_os_error())
}
}
fn lookup(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<Entry> {
let data = self.find_inode(parent)?;
self.do_lookup(&data, name)
}
fn forget(&self, _ctx: Context, inode: Inode, count: u64) {
let mut inodes = self.inodes.lock();
forget_one(&mut inodes, inode, count)
}
fn batch_forget(&self, _ctx: Context, requests: Vec<(Inode, u64)>) {
let mut inodes = self.inodes.lock();
for (inode, count) in requests {
forget_one(&mut inodes, inode, count)
}
}
fn opendir(
&self,
_ctx: Context,
inode: Inode,
flags: u32,
) -> io::Result<(Option<Handle>, OpenOptions)> {
self.do_open(inode, flags | (libc::O_DIRECTORY as u32))
}
fn releasedir(
&self,
_ctx: Context,
inode: Inode,
_flags: u32,
handle: Handle,
) -> io::Result<()> {
self.do_release(inode, handle)
}
fn mkdir(
&self,
ctx: Context,
parent: Inode,
name: &CStr,
mut mode: u32,
umask: u32,
security_ctx: Option<&CStr>,
) -> io::Result<Entry> {
// This method has the same issues as `create()`: namely that the kernel may have allowed a
// process to make a directory due to one of its supplementary groups but that information
// is not forwarded to us. However, there is no `O_TMPDIR` equivalent for directories so
// instead we create a "hidden" directory with a randomly generated name in the parent
// directory, modify the uid/gid and mode to the proper values, and then rename it to the
// requested name. This ensures that even in the case of a power loss the directory is not
// visible in the filesystem with the requested name but incorrect metadata. The only thing
// left would be a empty hidden directory with a random name.
let data = self.find_inode(parent)?;
let _ctx = security_ctx
.filter(|ctx| ctx.to_bytes() != UNLABELED)
.map(|ctx| ScopedSecurityContext::new(&self.proc, ctx))
.transpose()?;
// The presence of a default posix acl xattr in the parent directory completely changes the
// meaning of the mode parameter so only apply the umask if it doesn't have one.
if !self.has_default_posix_acl(&data)? {
mode &= !umask;
}
let tmpdir = TempDir::new(&data.file, mode)?;
// We need to respect the setgid bit in the parent directory if it is set.
let st = stat(&data.file)?;
let gid = if st.st_mode & libc::S_ISGID != 0 {
st.st_gid
} else {
ctx.gid
};
// Set the uid and gid for the directory. Safe because this doesn't modify any memory and we
// check the return value.
let ret = unsafe { libc::fchown(tmpdir.as_raw_fd(), ctx.uid, gid) };
if ret < 0 {
return Err(io::Error::last_os_error());
}
// Now rename it into place. Safe because this doesn't modify any memory and we check the
// return value. TODO: Switch to libc::renameat2 once
// https://github.com/rust-lang/libc/pull/1508 lands and we have glibc 2.28.
let ret = unsafe {
libc::syscall(
libc::SYS_renameat2,
data.file.as_raw_fd(),
tmpdir.basename().as_ptr(),
data.file.as_raw_fd(),
name.as_ptr(),
libc::RENAME_NOREPLACE,
)
};
if ret < 0 {
return Err(io::Error::last_os_error());
}
// Now that we've moved the directory make sure we don't try to delete the now non-existent
// `tmpdir`.
tmpdir.into_inner();
self.do_lookup(&data, name)
}
fn rmdir(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> {
let data = self.find_inode(parent)?;
self.do_unlink(&data, name, libc::AT_REMOVEDIR)
}
fn readdir(
&self,
_ctx: Context,
inode: Inode,
handle: Handle,
size: u32,
offset: u64,
) -> io::Result<Self::DirIter> {
let data = self.find_handle(handle, inode)?;
let mut buf = Vec::with_capacity(size as usize);
buf.resize(size as usize, 0);
let dir = data.file.lock();
ReadDir::new(&*dir, offset as libc::off64_t, buf.into_boxed_slice())
}
fn open(
&self,
_ctx: Context,
inode: Inode,
flags: u32,
) -> io::Result<(Option<Handle>, OpenOptions)> {
self.do_open(inode, flags)
}
fn release(
&self,
_ctx: Context,
inode: Inode,
_flags: u32,
handle: Handle,
_flush: bool,
_flock_release: bool,
_lock_owner: Option<u64>,
) -> io::Result<()> {
self.do_release(inode, handle)
}
fn create(
&self,
ctx: Context,
parent: Inode,
name: &CStr,
mut mode: u32,
flags: u32,
umask: u32,
security_ctx: Option<&CStr>,
) -> io::Result<(Entry, Option<Handle>, OpenOptions)> {
// The `Context` may not contain all the information we need to create the file here. For
// example, a process may be part of several groups, one of which gives it permission to
// create a file in `parent`, but is not the gid of the process. This information is not
// forwarded to the server so we don't know when this is happening. Instead, we just rely on
// the access checks in the kernel driver: if we received this request then the kernel has
// determined that the process is allowed to create the file and we shouldn't reject it now
// based on acls.
//
// To ensure that the file is created atomically with the proper uid/gid we use `O_TMPFILE`
// + `linkat` as described in the `open(2)` manpage.
let data = self.find_inode(parent)?;
let _ctx = security_ctx
.filter(|ctx| ctx.to_bytes() != UNLABELED)
.map(|ctx| ScopedSecurityContext::new(&self.proc, ctx))
.transpose()?;
// We don't want to use `O_EXCL` with `O_TMPFILE` as it has a different meaning when used in
// that combination.
let mut tmpflags = (flags as i32 | libc::O_TMPFILE | libc::O_CLOEXEC | libc::O_NOFOLLOW)
& !(libc::O_EXCL | libc::O_CREAT);
// O_TMPFILE requires that we use O_RDWR or O_WRONLY.
if flags as i32 & libc::O_ACCMODE == libc::O_RDONLY {
tmpflags &= !libc::O_ACCMODE;
tmpflags |= libc::O_RDWR;
}
// The presence of a default posix acl xattr in the parent directory completely changes the
// meaning of the mode parameter so only apply the umask if it doesn't have one.
if !self.has_default_posix_acl(&data)? {
mode &= !umask;
}
// Safe because this is a valid c string.
let current_dir = unsafe { CStr::from_bytes_with_nul_unchecked(b".\0") };
// Safe because this doesn't modify any memory and we check the return value.
let fd =
unsafe { libc::openat(data.file.as_raw_fd(), current_dir.as_ptr(), tmpflags, mode) };
if fd < 0 {
return Err(io::Error::last_os_error());
}
// Safe because we just opened this fd.
let tmpfile = unsafe { File::from_raw_fd(fd) };
// We need to respect the setgid bit in the parent directory if it is set.
let st = stat(&data.file)?;
let gid = if st.st_mode & libc::S_ISGID != 0 {
st.st_gid
} else {
ctx.gid
};
// Now set the uid and gid for the file. Safe because this doesn't modify any memory and we
// check the return value.
let ret = unsafe { libc::fchown(tmpfile.as_raw_fd(), ctx.uid, gid) };
if ret < 0 {
return Err(io::Error::last_os_error());
}
let proc_path = CString::new(format!("self/fd/{}", tmpfile.as_raw_fd()))
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
// Finally link it into the file system tree so that it's visible to other processes. Safe
// because this doesn't modify any memory and we check the return value.
let ret = unsafe {
libc::linkat(
self.proc.as_raw_fd(),
proc_path.as_ptr(),
data.file.as_raw_fd(),
name.as_ptr(),
libc::AT_SYMLINK_FOLLOW,
)
};
if ret < 0 {
return Err(io::Error::last_os_error());
}
// We no longer need the tmpfile.
mem::drop(tmpfile);
let entry = self.do_lookup(&data, name)?;
let (handle, opts) = self
.do_open(
entry.inode,
flags & !((libc::O_CREAT | libc::O_EXCL | libc::O_NOCTTY) as u32),
)
.map_err(|e| {
// Don't leak the entry.
self.forget(ctx, entry.inode, 1);
e
})?;
Ok((entry, handle, opts))
}
fn unlink(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> {
let data = self.find_inode(parent)?;
self.do_unlink(&data, name, 0)
}
fn read<W: io::Write + ZeroCopyWriter>(
&self,
_ctx: Context,
inode: Inode,
handle: Handle,
mut w: W,
size: u32,
offset: u64,
_lock_owner: Option<u64>,
_flags: u32,
) -> io::Result<usize> {
let data = self.find_handle(handle, inode)?;
let mut f = data.file.lock();
w.write_from(&mut f, size as usize, offset)
}
fn write<R: io::Read + ZeroCopyReader>(
&self,
ctx: Context,
inode: Inode,
handle: Handle,
mut r: R,
size: u32,
offset: u64,
_lock_owner: Option<u64>,
_delayed_write: bool,
_flags: u32,
) -> io::Result<usize> {
// We need to change credentials during a write so that the kernel will remove setuid or
// setgid bits from the file if it was written to by someone other than the owner.
let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
let data = self.find_handle(handle, inode)?;
let mut f = data.file.lock();
r.read_to(&mut f, size as usize, offset)
}
fn getattr(
&self,
_ctx: Context,
inode: Inode,
_handle: Option<Handle>,
) -> io::Result<(libc::stat64, Duration)> {
let data = self.find_inode(inode)?;
self.do_getattr(&data)
}
fn setattr(
&self,
_ctx: Context,
inode: Inode,
attr: libc::stat64,
handle: Option<Handle>,
valid: SetattrValid,
) -> io::Result<(libc::stat64, Duration)> {
let inode_data = self.find_inode(inode)?;
enum Data {
Handle(Arc<HandleData>, RawFd),
ProcPath(CString),
}
// If we have a handle then use it otherwise get a new fd from the inode.
let data = if let Some(handle) = handle {
let hd = self.find_handle(handle, inode)?;
let fd = hd.file.lock().as_raw_fd();
Data::Handle(hd, fd)
} else {
let pathname = CString::new(format!("self/fd/{}", inode_data.file.as_raw_fd()))
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
Data::ProcPath(pathname)
};
if valid.contains(SetattrValid::MODE) {
// Safe because this doesn't modify any memory and we check the return value.
let res = unsafe {
match data {
Data::Handle(_, fd) => libc::fchmod(fd, attr.st_mode),
Data::ProcPath(ref p) => {
libc::fchmodat(self.proc.as_raw_fd(), p.as_ptr(), attr.st_mode, 0)
}
}
};
if res < 0 {
return Err(io::Error::last_os_error());
}
}
if valid.intersects(SetattrValid::UID | SetattrValid::GID) {
let uid = if valid.contains(SetattrValid::UID) {
attr.st_uid
} else {
// Cannot use -1 here because these are unsigned values.
::std::u32::MAX
};
let gid = if valid.contains(SetattrValid::GID) {
attr.st_gid
} else {
// Cannot use -1 here because these are unsigned values.
::std::u32::MAX
};
// Safe because this is a constant value and a valid C string.
let empty = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) };
// Safe because this doesn't modify any memory and we check the return value.
let res = unsafe {
libc::fchownat(
inode_data.file.as_raw_fd(),
empty.as_ptr(),
uid,
gid,
libc::AT_EMPTY_PATH | libc::AT_SYMLINK_NOFOLLOW,
)
};
if res < 0 {
return Err(io::Error::last_os_error());
}
}
if valid.contains(SetattrValid::SIZE) {
// Safe because this doesn't modify any memory and we check the return value.
let res = match data {
Data::Handle(_, fd) => unsafe { libc::ftruncate64(fd, attr.st_size) },
_ => {
// There is no `ftruncateat` so we need to get a new fd and truncate it.
let f = self.open_inode(&inode_data, libc::O_NONBLOCK | libc::O_RDWR)?;
unsafe { libc::ftruncate64(f.as_raw_fd(), attr.st_size) }
}
};
if res < 0 {
return Err(io::Error::last_os_error());
}
}
if valid.intersects(SetattrValid::ATIME | SetattrValid::MTIME) {
let mut tvs = [
libc::timespec {
tv_sec: 0,
tv_nsec: libc::UTIME_OMIT,
},
libc::timespec {
tv_sec: 0,
tv_nsec: libc::UTIME_OMIT,
},
];
if valid.contains(SetattrValid::ATIME_NOW) {
tvs[0].tv_nsec = libc::UTIME_NOW;
} else if valid.contains(SetattrValid::ATIME) {
tvs[0].tv_sec = attr.st_atime;
tvs[0].tv_nsec = attr.st_atime_nsec;
}
if valid.contains(SetattrValid::MTIME_NOW) {
tvs[1].tv_nsec = libc::UTIME_NOW;
} else if valid.contains(SetattrValid::MTIME) {
tvs[1].tv_sec = attr.st_mtime;
tvs[1].tv_nsec = attr.st_mtime_nsec;
}
// Safe because this doesn't modify any memory and we check the return value.
let res = match data {
Data::Handle(_, fd) => unsafe { libc::futimens(fd, tvs.as_ptr()) },
Data::ProcPath(ref p) => unsafe {
libc::utimensat(self.proc.as_raw_fd(), p.as_ptr(), tvs.as_ptr(), 0)
},
};
if res < 0 {
return Err(io::Error::last_os_error());
}
}
self.do_getattr(&inode_data)
}
fn rename(
&self,
_ctx: Context,
olddir: Inode,
oldname: &CStr,
newdir: Inode,
newname: &CStr,
flags: u32,
) -> io::Result<()> {
let old_inode = self.find_inode(olddir)?;
let new_inode = self.find_inode(newdir)?;
// Safe because this doesn't modify any memory and we check the return value.
// TODO: Switch to libc::renameat2 once https://github.com/rust-lang/libc/pull/1508 lands
// and we have glibc 2.28.
let res = unsafe {
libc::syscall(
libc::SYS_renameat2,
old_inode.file.as_raw_fd(),
oldname.as_ptr(),
new_inode.file.as_raw_fd(),
newname.as_ptr(),
flags,
)
};
if res == 0 {
Ok(())
} else {
Err(io::Error::last_os_error())
}
}
fn mknod(
&self,
ctx: Context,
parent: Inode,
name: &CStr,
mut mode: u32,
rdev: u32,
umask: u32,
security_ctx: Option<&CStr>,
) -> io::Result<Entry> {
let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
let data = self.find_inode(parent)?;
// The presence of a default posix acl xattr in the parent directory completely changes the
// meaning of the mode parameter so only apply the umask if it doesn't have one.
if !self.has_default_posix_acl(&data)? {
mode &= !umask;
}
let _ctx = security_ctx
.filter(|ctx| ctx.to_bytes() != UNLABELED)
.map(|ctx| ScopedSecurityContext::new(&self.proc, ctx))
.transpose()?;
// Safe because this doesn't modify any memory and we check the return value.
let res = unsafe {
libc::mknodat(
data.file.as_raw_fd(),
name.as_ptr(),
mode as libc::mode_t,
rdev as libc::dev_t,
)
};
if res < 0 {
Err(io::Error::last_os_error())
} else {
self.do_lookup(&data, name)
}
}
fn link(
&self,
_ctx: Context,
inode: Inode,
newparent: Inode,
newname: &CStr,
) -> io::Result<Entry> {
let data = self.find_inode(inode)?;
let new_inode = self.find_inode(newparent)?;
let path = CString::new(format!("self/fd/{}", data.file.as_raw_fd()))
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
// Safe because this doesn't modify any memory and we check the return value.
let res = unsafe {
libc::linkat(
self.proc.as_raw_fd(),
path.as_ptr(),
new_inode.file.as_raw_fd(),
newname.as_ptr(),
libc::AT_SYMLINK_FOLLOW,
)
};
if res == 0 {
self.do_lookup(&new_inode, newname)
} else {
Err(io::Error::last_os_error())
}
}
fn symlink(
&self,
ctx: Context,
linkname: &CStr,
parent: Inode,
name: &CStr,
security_ctx: Option<&CStr>,
) -> io::Result<Entry> {
let data = self.find_inode(parent)?;
let _ctx = security_ctx
.filter(|ctx| ctx.to_bytes() != UNLABELED)
.map(|ctx| ScopedSecurityContext::new(&self.proc, ctx))
.transpose()?;
let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
// Safe because this doesn't modify any memory and we check the return value.
let res =
unsafe { libc::symlinkat(linkname.as_ptr(), data.file.as_raw_fd(), name.as_ptr()) };
if res == 0 {
self.do_lookup(&data, name)
} else {
Err(io::Error::last_os_error())
}
}
fn readlink(&self, _ctx: Context, inode: Inode) -> io::Result<Vec<u8>> {
let data = self.find_inode(inode)?;
let mut buf = Vec::with_capacity(libc::PATH_MAX as usize);
buf.resize(libc::PATH_MAX as usize, 0);
// Safe because this is a constant value and a valid C string.
let empty = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) };
// Safe because this will only modify the contents of `buf` and we check the return value.
let res = unsafe {
libc::readlinkat(
data.file.as_raw_fd(),
empty.as_ptr(),
buf.as_mut_ptr() as *mut libc::c_char,
buf.len(),
)
};
if res < 0 {
return Err(io::Error::last_os_error());
}
buf.resize(res as usize, 0);
Ok(buf)
}
fn flush(
&self,
_ctx: Context,
inode: Inode,
handle: Handle,
_lock_owner: u64,
) -> io::Result<()> {
let data = self.find_handle(handle, inode)?;
// Since this method is called whenever an fd is closed in the client, we can emulate that
// behavior by doing the same thing (dup-ing the fd and then immediately closing it). Safe
// because this doesn't modify any memory and we check the return values.
unsafe {
let newfd = libc::dup(data.file.lock().as_raw_fd());
if newfd < 0 {
return Err(io::Error::last_os_error());
}
if libc::close(newfd) < 0 {
Err(io::Error::last_os_error())
} else {
Ok(())
}
}
}
fn fsync(&self, _ctx: Context, inode: Inode, datasync: bool, handle: Handle) -> io::Result<()> {
let data = self.find_handle(handle, inode)?;
let fd = data.file.lock().as_raw_fd();
// Safe because this doesn't modify any memory and we check the return value.
let res = unsafe {
if datasync {
libc::fdatasync(fd)
} else {
libc::fsync(fd)
}
};
if res == 0 {
Ok(())
} else {
Err(io::Error::last_os_error())
}
}
fn fsyncdir(
&self,
ctx: Context,
inode: Inode,
datasync: bool,
handle: Handle,
) -> io::Result<()> {
self.fsync(ctx, inode, datasync, handle)
}
fn access(&self, ctx: Context, inode: Inode, mask: u32) -> io::Result<()> {
let data = self.find_inode(inode)?;
let st = stat(&data.file)?;
let mode = mask as i32 & (libc::R_OK | libc::W_OK | libc::X_OK);
if mode == libc::F_OK {
// The file exists since we were able to call `stat(2)` on it.
return Ok(());
}
if (mode & libc::R_OK) != 0 {
if ctx.uid != 0
&& (st.st_uid != ctx.uid || st.st_mode & 0o400 == 0)
&& (st.st_gid != ctx.gid || st.st_mode & 0o040 == 0)
&& st.st_mode & 0o004 == 0
{
return Err(io::Error::from_raw_os_error(libc::EACCES));
}
}
if (mode & libc::W_OK) != 0 {
if ctx.uid != 0
&& (st.st_uid != ctx.uid || st.st_mode & 0o200 == 0)
&& (st.st_gid != ctx.gid || st.st_mode & 0o020 == 0)
&& st.st_mode & 0o002 == 0
{
return Err(io::Error::from_raw_os_error(libc::EACCES));
}
}
// root can only execute something if it is executable by one of the owner, the group, or
// everyone.
if (mode & libc::X_OK) != 0 {
if (ctx.uid != 0 || st.st_mode & 0o111 == 0)
&& (st.st_uid != ctx.uid || st.st_mode & 0o100 == 0)
&& (st.st_gid != ctx.gid || st.st_mode & 0o010 == 0)
&& st.st_mode & 0o001 == 0
{
return Err(io::Error::from_raw_os_error(libc::EACCES));
}
}
Ok(())
}
fn setxattr(
&self,
_ctx: Context,
inode: Inode,
name: &CStr,
value: &[u8],
flags: u32,
) -> io::Result<()> {
// We can't allow the VM to set this xattr because an unprivileged process may use it to set
// a privileged xattr.
if self.cfg.rewrite_security_xattrs && name.to_bytes().starts_with(USER_VIRTIOFS_XATTR) {
return Err(io::Error::from_raw_os_error(libc::EPERM));
}
let data = self.find_inode(inode)?;
let name = self.rewrite_xattr_name(name);
let res = if data.filetype == FileType::Other {
// For non-regular files and directories, we cannot open the fd normally. Instead we
// emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
// and then setting the CWD back to the root directory.
let path = CString::new(format!("self/fd/{}", data.file.as_raw_fd()))
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
// Safe because this doesn't modify any memory and we check the return value.
self.with_proc_chdir(|| unsafe {
libc::setxattr(
path.as_ptr(),
name.as_ptr(),
value.as_ptr() as *const libc::c_void,
value.len() as libc::size_t,
flags as c_int,
)
})
} else {
// For regular files and directories, we can just open an fd and use fsetxattr.
let dir = if data.filetype == FileType::Directory {
libc::O_DIRECTORY
} else {
0
};
let f = self.open_inode(&data, libc::O_RDONLY | dir)?;
// Safe because this doesn't modify any memory and we check the return value.
unsafe {
libc::fsetxattr(
f.as_raw_fd(),
name.as_ptr(),
value.as_ptr() as *const libc::c_void,
value.len() as libc::size_t,
flags as c_int,
)
}
};
if res < 0 {
Err(io::Error::last_os_error())
} else {
Ok(())
}
}
fn getxattr(
&self,
_ctx: Context,
inode: Inode,
name: &CStr,
size: u32,
) -> io::Result<GetxattrReply> {
// We don't allow the VM to set this xattr so we also pretend there is no value associated
// with it.
if self.cfg.rewrite_security_xattrs && name.to_bytes().starts_with(USER_VIRTIOFS_XATTR) {
return Err(io::Error::from_raw_os_error(libc::ENODATA));
}
let data = self.find_inode(inode)?;
let name = self.rewrite_xattr_name(name);
let mut buf = vec![0u8; size as usize];
// Safe because this will only modify the contents of `buf`.
let res = self.do_getxattr(&data, &name, &mut buf[..])?;
if size == 0 {
Ok(GetxattrReply::Count(res as u32))
} else {
buf.truncate(res as usize);
Ok(GetxattrReply::Value(buf))
}
}
fn listxattr(&self, _ctx: Context, inode: Inode, size: u32) -> io::Result<ListxattrReply> {
let data = self.find_inode(inode)?;
let mut buf = vec![0u8; size as usize];
let res = if data.filetype == FileType::Other {
// For non-regular files and directories, we cannot open the fd normally. Instead we
// emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
// and then setting the CWD back to the root directory.
let path = CString::new(format!("self/fd/{}", data.file.as_raw_fd()))
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
// Safe because this will only modify `buf` and we check the return value.
self.with_proc_chdir(|| unsafe {
libc::listxattr(
path.as_ptr(),
buf.as_mut_ptr() as *mut libc::c_char,
buf.len() as libc::size_t,
)
})
} else {
// For regular files and directories, we can just open an fd and use fsetxattr.
let dir = if data.filetype == FileType::Directory {
libc::O_DIRECTORY
} else {
0
};
let f = self.open_inode(&data, libc::O_RDONLY | dir)?;
// Safe because this will only write to `buf` and we check the return value.
unsafe {
libc::flistxattr(
f.as_raw_fd(),
buf.as_mut_ptr() as *mut libc::c_char,
buf.len() as libc::size_t,
)
}
};
if res < 0 {
return Err(io::Error::last_os_error());
}
if size == 0 {
Ok(ListxattrReply::Count(res as u32))
} else {
buf.truncate(res as usize);
if self.cfg.rewrite_security_xattrs {
strip_xattr_prefix(&mut buf);
}
Ok(ListxattrReply::Names(buf))
}
}
fn removexattr(&self, _ctx: Context, inode: Inode, name: &CStr) -> io::Result<()> {
// We don't allow the VM to set this xattr so we also pretend there is no value associated
// with it.
if self.cfg.rewrite_security_xattrs && name.to_bytes().starts_with(USER_VIRTIOFS_XATTR) {
return Err(io::Error::from_raw_os_error(libc::ENODATA));
}
let data = self.find_inode(inode)?;
let name = self.rewrite_xattr_name(name);
let res = if data.filetype == FileType::Other {
// For non-regular files and directories, we cannot open the fd normally. Instead we
// emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
// and then setting the CWD back to the root directory.
let path = CString::new(format!("self/fd/{}", data.file.as_raw_fd()))
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
// Safe because this doesn't modify any memory and we check the return value.
self.with_proc_chdir(|| unsafe { libc::removexattr(path.as_ptr(), name.as_ptr()) })
} else {
// For regular files and directories, we can just open an fd and use fsetxattr.
let dir = if data.filetype == FileType::Directory {
libc::O_DIRECTORY
} else {
0
};
let f = self.open_inode(&data, libc::O_RDONLY | dir)?;
// Safe because this doesn't modify any memory and we check the return value.
unsafe { libc::fremovexattr(f.as_raw_fd(), name.as_ptr()) }
};
if res == 0 {
Ok(())
} else {
Err(io::Error::last_os_error())
}
}
fn fallocate(
&self,
_ctx: Context,
inode: Inode,
handle: Handle,
mode: u32,
offset: u64,
length: u64,
) -> io::Result<()> {
let data = self.find_handle(handle, inode)?;
let fd = data.file.lock().as_raw_fd();
// Safe because this doesn't modify any memory and we check the return value.
let res = unsafe {
libc::fallocate64(
fd,
mode as libc::c_int,
offset as libc::off64_t,
length as libc::off64_t,
)
};
if res == 0 {
Ok(())
} else {
Err(io::Error::last_os_error())
}
}
fn ioctl<R: io::Read>(
&self,
_ctx: Context,
handle: Handle,
_flags: IoctlFlags,
cmd: u32,
arg: u64,
in_size: u32,
out_size: u32,
r: R,
) -> io::Result<IoctlReply> {
const GET_ENCRYPTION_POLICY: u32 = FS_IOC_GET_ENCRYPTION_POLICY() as u32;
const SET_ENCRYPTION_POLICY: u32 = FS_IOC_SET_ENCRYPTION_POLICY() as u32;
const GET_FSXATTR: u32 = FS_IOC_FSGETXATTR() as u32;
const SET_FSXATTR: u32 = FS_IOC_FSSETXATTR() as u32;
const GET_FLAGS32: u32 = FS_IOC32_GETFLAGS() as u32;
const SET_FLAGS32: u32 = FS_IOC32_SETFLAGS() as u32;
const GET_FLAGS64: u32 = FS_IOC64_GETFLAGS() as u32;
const SET_FLAGS64: u32 = FS_IOC64_SETFLAGS() as u32;
// Normally, we wouldn't need to retry the FS_IOC_GET_ENCRYPTION_POLICY and
// FS_IOC_SET_ENCRYPTION_POLICY ioctls. Unfortunately, the I/O directions for both of them
// are encoded backwards so they can only be handled as unrestricted fuse ioctls.
match cmd {
GET_ENCRYPTION_POLICY => {
if out_size < size_of::<fscrypt_policy_v1>() as u32 {
let input = Vec::new();
let output = vec![IoctlIovec {
base: arg,
len: size_of::<fscrypt_policy_v1>() as u64,
}];
Ok(IoctlReply::Retry { input, output })
} else {
self.get_encryption_policy(handle)
}
}
SET_ENCRYPTION_POLICY => {
if in_size < size_of::<fscrypt_policy_v1>() as u32 {
let input = vec![IoctlIovec {
base: arg,
len: size_of::<fscrypt_policy_v1>() as u64,
}];
let output = Vec::new();
Ok(IoctlReply::Retry { input, output })
} else {
self.set_encryption_policy(handle, r)
}
}
GET_FSXATTR => {
if out_size < size_of::<fsxattr>() as u32 {
Err(io::Error::from_raw_os_error(libc::ENOMEM))
} else {
self.get_fsxattr(handle)
}
}
SET_FSXATTR => {
if in_size < size_of::<fsxattr>() as u32 {
Err(io::Error::from_raw_os_error(libc::EINVAL))
} else {
self.set_fsxattr(handle, r)
}
}
GET_FLAGS32 | GET_FLAGS64 => {
if out_size < size_of::<c_int>() as u32 {
Err(io::Error::from_raw_os_error(libc::ENOMEM))
} else {
self.get_flags(handle)
}
}
SET_FLAGS32 | SET_FLAGS64 => {
if in_size < size_of::<c_int>() as u32 {
Err(io::Error::from_raw_os_error(libc::ENOMEM))
} else {
self.set_flags(handle, r)
}
}
_ => Err(io::Error::from_raw_os_error(libc::ENOTTY)),
}
}
fn copy_file_range(
&self,
ctx: Context,
inode_src: Inode,
handle_src: Handle,
offset_src: u64,
inode_dst: Inode,
handle_dst: Handle,
offset_dst: u64,
length: u64,
flags: u64,
) -> io::Result<usize> {
// We need to change credentials during a write so that the kernel will remove setuid or
// setgid bits from the file if it was written to by someone other than the owner.
let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
let src_data = self.find_handle(handle_src, inode_src)?;
let dst_data = self.find_handle(handle_dst, inode_dst)?;
let src = src_data.file.lock().as_raw_fd();
let dst = dst_data.file.lock().as_raw_fd();
let res = unsafe {
libc::syscall(
libc::SYS_copy_file_range,
src,
&offset_src,
dst,
&offset_dst,
length,
flags,
)
};
if res >= 0 {
Ok(res as usize)
} else {
Err(io::Error::last_os_error())
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::env;
use std::os::unix::ffi::OsStringExt;
use std::path::PathBuf;
#[test]
fn create_temp_dir() {
let testdir = CString::new(env::temp_dir().into_os_string().into_vec())
.expect("env::temp_dir() is not a valid c-string");
let fd = unsafe {
libc::openat(
libc::AT_FDCWD,
testdir.as_ptr(),
libc::O_PATH | libc::O_CLOEXEC,
)
};
assert!(fd >= 0, "Failed to open env::temp_dir()");
let parent = unsafe { File::from_raw_fd(fd) };
let t = TempDir::new(&parent, 0o755).expect("Failed to create temporary directory");
let basename = t.basename().to_string_lossy();
let path = PathBuf::from(env::temp_dir()).join(&*basename);
assert!(path.exists());
assert!(path.is_dir());
}
#[test]
fn remove_temp_dir() {
let testdir = CString::new(env::temp_dir().into_os_string().into_vec())
.expect("env::temp_dir() is not a valid c-string");
let fd = unsafe {
libc::openat(
libc::AT_FDCWD,
testdir.as_ptr(),
libc::O_PATH | libc::O_CLOEXEC,
)
};
assert!(fd >= 0, "Failed to open env::temp_dir()");
let parent = unsafe { File::from_raw_fd(fd) };
let t = TempDir::new(&parent, 0o755).expect("Failed to create temporary directory");
let basename = t.basename().to_string_lossy();
let path = PathBuf::from(env::temp_dir()).join(&*basename);
mem::drop(t);
assert!(!path.exists());
}
#[test]
fn temp_dir_into_inner() {
let testdir = CString::new(env::temp_dir().into_os_string().into_vec())
.expect("env::temp_dir() is not a valid c-string");
let fd = unsafe {
libc::openat(
libc::AT_FDCWD,
testdir.as_ptr(),
libc::O_PATH | libc::O_CLOEXEC,
)
};
assert!(fd >= 0, "Failed to open env::temp_dir()");
let parent = unsafe { File::from_raw_fd(fd) };
let t = TempDir::new(&parent, 0o755).expect("Failed to create temporary directory");
let (basename_cstr, _) = t.into_inner();
let basename = basename_cstr.to_string_lossy();
let path = PathBuf::from(env::temp_dir()).join(&*basename);
assert!(path.exists());
}
#[test]
fn rewrite_xattr_names() {
let mut cfg = Config::default();
cfg.rewrite_security_xattrs = true;
let p = PassthroughFs::new(cfg).expect("Failed to create PassthroughFs");
// Selinux shouldn't get overwritten.
let selinux = unsafe { CStr::from_bytes_with_nul_unchecked(b"security.selinux\0") };
assert_eq!(p.rewrite_xattr_name(selinux).to_bytes(), selinux.to_bytes());
// user, trusted, and system should not be changed either.
let user = unsafe { CStr::from_bytes_with_nul_unchecked(b"user.foobar\0") };
assert_eq!(p.rewrite_xattr_name(user).to_bytes(), user.to_bytes());
let trusted = unsafe { CStr::from_bytes_with_nul_unchecked(b"trusted.foobar\0") };
assert_eq!(p.rewrite_xattr_name(trusted).to_bytes(), trusted.to_bytes());
let system = unsafe { CStr::from_bytes_with_nul_unchecked(b"system.foobar\0") };
assert_eq!(p.rewrite_xattr_name(system).to_bytes(), system.to_bytes());
// sehash should be re-written.
let sehash = unsafe { CStr::from_bytes_with_nul_unchecked(b"security.sehash\0") };
assert_eq!(
p.rewrite_xattr_name(sehash).to_bytes(),
b"user.virtiofs.security.sehash"
);
}
#[test]
fn strip_xattr_names() {
let only_nuls = b"\0\0\0\0\0";
let mut actual = only_nuls.to_vec();
strip_xattr_prefix(&mut actual);
assert_eq!(&actual[..], &only_nuls[..]);
let no_nuls = b"security.sehashuser.virtiofs";
let mut actual = no_nuls.to_vec();
strip_xattr_prefix(&mut actual);
assert_eq!(&actual[..], &no_nuls[..]);
let empty = b"";
let mut actual = empty.to_vec();
strip_xattr_prefix(&mut actual);
assert_eq!(&actual[..], &empty[..]);
let no_strippable_names = b"security.selinux\0user.foobar\0system.test\0";
let mut actual = no_strippable_names.to_vec();
strip_xattr_prefix(&mut actual);
assert_eq!(&actual[..], &no_strippable_names[..]);
let only_strippable_names = b"user.virtiofs.security.sehash\0user.virtiofs.security.wtf\0";
let mut actual = only_strippable_names.to_vec();
strip_xattr_prefix(&mut actual);
assert_eq!(&actual[..], b"security.sehash\0security.wtf\0");
let mixed_names = b"user.virtiofs.security.sehash\0security.selinux\0user.virtiofs.security.wtf\0user.foobar\0";
let mut actual = mixed_names.to_vec();
strip_xattr_prefix(&mut actual);
let expected = b"security.sehash\0security.selinux\0security.wtf\0user.foobar\0";
assert_eq!(&actual[..], &expected[..]);
let no_nul_with_prefix = b"user.virtiofs.security.sehash";
let mut actual = no_nul_with_prefix.to_vec();
strip_xattr_prefix(&mut actual);
assert_eq!(&actual[..], b"security.sehash");
}
}