blob: cc581100a8e345692b29dab8e2c6a11b3bf7aa6e [file] [log] [blame]
// Copyright 2019 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
use std::borrow::Cow;
use std::cell::RefCell;
use std::cmp;
use std::collections::btree_map;
use std::collections::BTreeMap;
use std::ffi::CStr;
use std::ffi::CString;
use std::fs::File;
use std::io;
use std::mem;
use std::mem::size_of;
use std::mem::MaybeUninit;
use std::os::raw::c_int;
use std::os::raw::c_long;
use std::ptr;
use std::ptr::addr_of;
use std::ptr::addr_of_mut;
use std::sync::atomic::AtomicBool;
use std::sync::atomic::AtomicU64;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::sync::MutexGuard;
use std::time::Duration;
use base::error;
use base::ioctl_ior_nr;
use base::ioctl_iow_nr;
use base::ioctl_iowr_nr;
use base::ioctl_with_mut_ptr;
use base::ioctl_with_ptr;
use base::syscall;
use base::unix::FileFlags;
use base::warn;
use base::AsRawDescriptor;
use base::FromRawDescriptor;
use base::Protection;
use base::RawDescriptor;
use fuse::filesystem::Context;
use fuse::filesystem::DirectoryIterator;
use fuse::filesystem::Entry;
use fuse::filesystem::FileSystem;
use fuse::filesystem::FsOptions;
use fuse::filesystem::GetxattrReply;
use fuse::filesystem::IoctlFlags;
use fuse::filesystem::IoctlReply;
use fuse::filesystem::ListxattrReply;
use fuse::filesystem::OpenOptions;
use fuse::filesystem::RemoveMappingOne;
use fuse::filesystem::SetattrValid;
use fuse::filesystem::ZeroCopyReader;
use fuse::filesystem::ZeroCopyWriter;
use fuse::filesystem::ROOT_ID;
use fuse::sys::WRITE_KILL_PRIV;
use fuse::Mapper;
#[cfg(feature = "arc_quota")]
use protobuf::Message;
use sync::Mutex;
#[cfg(feature = "arc_quota")]
use system_api::client::OrgChromiumSpaced;
#[cfg(feature = "arc_quota")]
use system_api::spaced::SetProjectIdReply;
#[cfg(feature = "arc_quota")]
use system_api::spaced::SetProjectInheritanceFlagReply;
use zerocopy::AsBytes;
use zerocopy::FromBytes;
use zerocopy::FromZeroes;
use crate::virtio::fs::caps::Capability;
use crate::virtio::fs::caps::Caps;
use crate::virtio::fs::caps::Set as CapSet;
use crate::virtio::fs::caps::Value as CapValue;
use crate::virtio::fs::config::CachePolicy;
use crate::virtio::fs::config::Config;
use crate::virtio::fs::expiring_map::ExpiringMap;
use crate::virtio::fs::multikey::MultikeyBTreeMap;
use crate::virtio::fs::read_dir::ReadDir;
// Nul-terminated byte strings and xattr name prefixes used throughout this file.
//
// Empty path; `stat()` passes it together with `AT_EMPTY_PATH` so that `fstatat64` operates on
// the descriptor itself.
const EMPTY_CSTR: &[u8] = b"\0";
// Path of the root directory (nul-terminated).
const ROOT_CSTR: &[u8] = b"/\0";
// Path opened by `PassthroughFs::new` to obtain the `proc` descriptor (nul-terminated).
const PROC_CSTR: &[u8] = b"/proc\0";
const UNLABELED_CSTR: &[u8] = b"unlabeled\0";
// Prefix that `rewrite_xattr_name` prepends to rewritten xattr names. Not nul-terminated.
const USER_VIRTIOFS_XATTR: &[u8] = b"user.virtiofs.";
// Prefix that selects the xattr names eligible for rewriting. Not nul-terminated.
const SECURITY_XATTR: &[u8] = b"security.";
// The one `security.` xattr name that `rewrite_xattr_name` never rewrites. Not nul-terminated.
const SELINUX_XATTR: &[u8] = b"security.selinux";
// Length in bytes of `fscrypt_policy_v1::_master_key_descriptor`.
const FSCRYPT_KEY_DESCRIPTOR_SIZE: usize = 8;
// Length in bytes of `fscrypt_policy_v2::master_key_identifier`.
const FSCRYPT_KEY_IDENTIFIER_SIZE: usize = 16;
// NOTE(review): presumably mirrors the kernel's FS_PROJINHERIT_FL inode flag — confirm against
// <linux/fs.h>.
#[cfg(feature = "arc_quota")]
const FS_PROJINHERIT_FL: c_int = 0x20000000;
// 25 seconds is the default timeout for dbus-send.
#[cfg(feature = "arc_quota")]
const DEFAULT_DBUS_TIMEOUT: Duration = Duration::from_secs(25);
/// Internal utility wrapper for `cros_tracing::trace_event!()` macro with VirtioFS calls.
///
/// Takes the tag first, then the event name, then one or more extra arguments. The expansion
/// always uses the `VirtioFs` category and passes `$name` before `$tag`, followed by the extra
/// arguments in order.
macro_rules! fs_trace {
($tag:expr, $name:expr, $($arg:expr),+) => {
cros_tracing::trace_event!(VirtioFs, $name, $tag, $($arg),*)
};
}
/// Fixed-layout record for a version 1 fscrypt policy: four single-byte fields followed by a
/// `FSCRYPT_KEY_DESCRIPTOR_SIZE`-byte master key descriptor.
///
/// NOTE(review): presumably mirrors the kernel UAPI `struct fscrypt_policy_v1` — confirm against
/// <linux/fscrypt.h>.
#[repr(C)]
#[derive(Clone, Copy, AsBytes, FromZeroes, FromBytes)]
struct fscrypt_policy_v1 {
_version: u8,
_contents_encryption_mode: u8,
_filenames_encryption_mode: u8,
_flags: u8,
_master_key_descriptor: [u8; FSCRYPT_KEY_DESCRIPTOR_SIZE],
}
/// Fixed-layout record for a version 2 fscrypt policy: four single-byte fields, four reserved
/// bytes, and a `FSCRYPT_KEY_IDENTIFIER_SIZE`-byte master key identifier.
///
/// NOTE(review): presumably mirrors the kernel UAPI `struct fscrypt_policy_v2` — confirm against
/// <linux/fscrypt.h>.
#[repr(C)]
#[derive(Clone, Copy, AsBytes, FromZeroes, FromBytes)]
struct fscrypt_policy_v2 {
_version: u8,
_contents_encryption_mode: u8,
_filenames_encryption_mode: u8,
_flags: u8,
__reserved: [u8; 4],
master_key_identifier: [u8; FSCRYPT_KEY_IDENTIFIER_SIZE],
}
/// Storage large enough for either policy version. All three members start at offset zero, so
/// `_version` overlays the leading `_version` byte of both `_v1` and `_v2`.
#[repr(C)]
#[derive(Copy, Clone, FromZeroes, FromBytes)]
union fscrypt_policy {
_version: u8,
_v1: fscrypt_policy_v1,
_v2: fscrypt_policy_v2,
}
/// Argument record associated with `FS_IOC_GET_ENCRYPTION_POLICY_EX`: a 64-bit size followed by
/// the policy storage. The `From` conversion to `&[u8]` below exposes the first
/// `size_of::<u64>() + policy_size` bytes of this struct.
#[repr(C)]
#[derive(Copy, Clone, FromZeroes, FromBytes)]
struct fscrypt_get_policy_ex_arg {
policy_size: u64, /* input/output */
policy: fscrypt_policy, /* output */
}
/// Views the leading bytes of a `fscrypt_get_policy_ex_arg` as a byte slice.
///
/// The slice covers the `policy_size` field itself plus the first `policy_size` bytes of the
/// policy. The returned slice borrows from `value` and therefore cannot outlive it; the two
/// lifetimes are tied together explicitly because `slice::from_raw_parts` would otherwise let
/// the caller pick an arbitrary (possibly dangling) lifetime for the result.
///
/// # Panics
///
/// Panics if `policy_size` is larger than `size_of::<fscrypt_policy>()`.
impl<'a> From<&'a fscrypt_get_policy_ex_arg> for &'a [u8] {
    fn from(value: &'a fscrypt_get_policy_ex_arg) -> Self {
        assert!(value.policy_size <= size_of::<fscrypt_policy>() as u64);
        let data_raw: *const fscrypt_get_policy_ex_arg = value;
        // SAFETY: the length of the output slice is asserted to be within the struct it points
        // to, and the slice's lifetime `'a` is the lifetime of the borrow of that struct.
        unsafe {
            std::slice::from_raw_parts(
                data_raw.cast(),
                value.policy_size as usize + size_of::<u64>(),
            )
        }
    }
}
// Read/write ioctl number 22 in the 'f' group. The size is encoded from `[u8; 9]` rather than
// from `fscrypt_get_policy_ex_arg`.
// NOTE(review): presumably this matches the kernel's own definition of the request — confirm
// against <linux/fscrypt.h>.
ioctl_iowr_nr!(FS_IOC_GET_ENCRYPTION_POLICY_EX, 'f' as u32, 22, [u8; 9]);
/// Fixed-layout argument record for the `FS_IOC_FSGETXATTR` / `FS_IOC_FSSETXATTR` ioctls declared
/// right after it: five 32-bit fields followed by eight bytes of padding.
#[repr(C)]
#[derive(Clone, Copy, AsBytes, FromZeroes, FromBytes)]
struct fsxattr {
fsx_xflags: u32, /* xflags field value (get/set) */
fsx_extsize: u32, /* extsize field value (get/set) */
fsx_nextents: u32, /* nextents field value (get) */
fsx_projid: u32, /* project identifier (get/set) */
fsx_cowextsize: u32, /* CoW extsize field value (get/set) */
fsx_pad: [u8; 8],
}
// Get/set ioctls taking an `fsxattr` argument ('X' group, numbers 31 and 32).
ioctl_ior_nr!(FS_IOC_FSGETXATTR, 'X' as u32, 31, fsxattr);
ioctl_iow_nr!(FS_IOC_FSSETXATTR, 'X' as u32, 32, fsxattr);
// Get/set flag ioctls ('f' group, numbers 1 and 2). The three pairs differ only in the argument
// type used to encode the size: `c_long`, `u32` and `u64`.
ioctl_ior_nr!(FS_IOC_GETFLAGS, 'f' as u32, 1, c_long);
ioctl_iow_nr!(FS_IOC_SETFLAGS, 'f' as u32, 2, c_long);
ioctl_ior_nr!(FS_IOC32_GETFLAGS, 'f' as u32, 1, u32);
ioctl_iow_nr!(FS_IOC32_SETFLAGS, 'f' as u32, 2, u32);
ioctl_ior_nr!(FS_IOC64_GETFLAGS, 'f' as u32, 1, u64);
ioctl_iow_nr!(FS_IOC64_SETFLAGS, 'f' as u32, 2, u64);
/// Fixed-layout argument record for `FS_IOC_ENABLE_VERITY`. The salt and signature are described
/// by (size, 64-bit pointer) pairs rather than being embedded in the struct.
///
/// NOTE(review): presumably mirrors the kernel UAPI `struct fsverity_enable_arg` — confirm against
/// <linux/fsverity.h>.
#[repr(C)]
#[derive(Clone, Copy, AsBytes, FromZeroes, FromBytes)]
struct fsverity_enable_arg {
_version: u32,
_hash_algorithm: u32,
_block_size: u32,
salt_size: u32,
salt_ptr: u64,
sig_size: u32,
__reserved1: u32,
sig_ptr: u64,
__reserved2: [u64; 11],
}
/// Fixed-size header for `FS_IOC_MEASURE_VERITY`. The commented-out trailing member indicates
/// that the digest bytes follow this header; they are not part of the Rust struct.
#[repr(C)]
#[derive(Clone, Copy, AsBytes, FromZeroes, FromBytes)]
struct fsverity_digest {
_digest_algorithm: u16,
digest_size: u16,
// __u8 digest[];
}
// fs-verity ioctls ('f' group, numbers 133 and 134), sized from the two structs above.
ioctl_iow_nr!(FS_IOC_ENABLE_VERITY, 'f' as u32, 133, fsverity_enable_arg);
ioctl_iowr_nr!(FS_IOC_MEASURE_VERITY, 'f' as u32, 134, fsverity_digest);
/// Identifier under which an `InodeData` is stored in `PassthroughFs::inodes`. Values are handed
/// out from the `next_inode` counter.
pub type Inode = u64;
/// Identifier under which a `HandleData` is stored in `PassthroughFs::handles`. Values are handed
/// out from the `next_handle` counter.
type Handle = u64;
/// Alternate key for the inode map, built from the `st_ino` and `st_dev` fields of a `stat64`
/// result (see `add_entry` and `do_lookup`).
#[derive(Clone, Copy, Debug, PartialOrd, Ord, PartialEq, Eq)]
struct InodeAltKey {
ino: libc::ino64_t,
dev: libc::dev_t,
}
/// Coarse classification of a file derived from its `st_mode`: regular file, directory, or
/// anything else.
#[derive(PartialEq, Eq, Debug)]
enum FileType {
Regular,
Directory,
Other,
}
impl From<libc::mode_t> for FileType {
    /// Classifies `mode` by its file-type bits (`mode & S_IFMT`).
    fn from(mode: libc::mode_t) -> Self {
        let kind = mode & libc::S_IFMT;
        if kind == libc::S_IFREG {
            FileType::Regular
        } else if kind == libc::S_IFDIR {
            FileType::Directory
        } else {
            FileType::Other
        }
    }
}
/// Per-inode bookkeeping stored in `PassthroughFs::inodes`.
#[derive(Debug)]
struct InodeData {
// Identifier under which this entry is registered.
inode: Inode,
// (File, open_flags)
file: Mutex<(File, libc::c_int)>,
// Starts at 1 in `add_entry` and is bumped by `increase_inode_refcount`.
refcount: AtomicU64,
// Derived from `st_mode` when the entry is created.
filetype: FileType,
// Built in `do_lookup` as "<parent path>/<name>".
path: String,
}
impl AsRawDescriptor for InodeData {
    /// Returns the raw descriptor of the file held by this inode. The mutex is only held for
    /// the duration of the call.
    fn as_raw_descriptor(&self) -> RawDescriptor {
        let guard = self.file.lock();
        let (file, _open_flags) = &*guard;
        file.as_raw_descriptor()
    }
}
/// Per-handle bookkeeping stored in `PassthroughFs::handles`.
#[derive(Debug)]
struct HandleData {
// Inode this handle was opened for; `find_handle` and `do_release` check it against the
// inode supplied by the caller.
inode: Inode,
file: Mutex<File>,
}
impl AsRawDescriptor for HandleData {
    /// Returns the raw descriptor of the file behind this handle. The mutex is only held for
    /// the duration of the call.
    fn as_raw_descriptor(&self) -> RawDescriptor {
        let file = self.file.lock();
        file.as_raw_descriptor()
    }
}
// Generates a guard type `$name` that changes one credential of the calling thread through the
// raw syscall `$syscall_nr` and changes it back to the remembered `old` value when dropped.
//
// `$ty` is the credential's integer type. Both `new` and `drop` pass `-1` for the first and third
// syscall arguments and the credential value as the second one.
macro_rules! scoped_cred {
($name:ident, $ty:ty, $syscall_nr:expr) => {
#[derive(Debug)]
struct $name {
old: $ty,
}
impl $name {
// Changes the effective uid/gid of the current thread to `val`. Changes the thread's
// credentials back to `old` when the returned struct is dropped.
//
// Returns `Ok(None)` without making a syscall when `val == old`, and the OS error when
// the syscall fails.
fn new(val: $ty, old: $ty) -> io::Result<Option<$name>> {
if val == old {
// Nothing to do since we already have the correct value.
return Ok(None);
}
// We want credential changes to be per-thread because otherwise
// we might interfere with operations being carried out on other
// threads with different uids/gids. However, posix requires that
// all threads in a process share the same credentials. To do this
// libc uses signals to ensure that when one thread changes its
// credentials the other threads do the same thing.
//
// So instead we invoke the syscall directly in order to get around
// this limitation. Another option is to use the setfsuid and
// setfsgid systems calls. However since those calls have no way to
// return an error, it's preferable to do this instead.
// SAFETY: this call is safe because it doesn't modify any memory and we
// check the return value.
let res = unsafe { libc::syscall($syscall_nr, -1, val, -1) };
if res == 0 {
Ok(Some($name { old }))
} else {
Err(io::Error::last_os_error())
}
}
}
impl Drop for $name {
// Restores the old credential. A failure cannot be propagated from `drop`, so it is only
// logged.
fn drop(&mut self) {
// SAFETY: trivially safe
let res = unsafe { libc::syscall($syscall_nr, -1, self.old, -1) };
if res < 0 {
error!(
"failed to change credentials back to {}: {}",
self.old,
io::Error::last_os_error(),
);
}
}
}
};
}
// Instantiate the uid/gid guards and pick the matching "get effective id" syscall numbers. On
// `arm` targets the `*32` variants of the syscalls are selected; every other target uses the
// unsuffixed ones.
#[cfg(not(target_arch = "arm"))]
scoped_cred!(ScopedUid, libc::uid_t, libc::SYS_setresuid);
#[cfg(target_arch = "arm")]
scoped_cred!(ScopedUid, libc::uid_t, libc::SYS_setresuid32);
#[cfg(not(target_arch = "arm"))]
scoped_cred!(ScopedGid, libc::gid_t, libc::SYS_setresgid);
#[cfg(target_arch = "arm")]
scoped_cred!(ScopedGid, libc::gid_t, libc::SYS_setresgid32);
#[cfg(not(target_arch = "arm"))]
const SYS_GETEUID: libc::c_long = libc::SYS_geteuid;
#[cfg(target_arch = "arm")]
const SYS_GETEUID: libc::c_long = libc::SYS_geteuid32;
#[cfg(not(target_arch = "arm"))]
const SYS_GETEGID: libc::c_long = libc::SYS_getegid;
#[cfg(target_arch = "arm")]
const SYS_GETEGID: libc::c_long = libc::SYS_getegid32;
// Per-thread snapshot of the effective uid and gid, taken via raw syscalls the first time each
// value is accessed on a thread. `set_creds` uses these as the "old" values to restore.
thread_local! {
// SAFETY: both calls take no parameters and only return an integer value. The kernel also
// guarantees that they can never fail.
static THREAD_EUID: libc::uid_t = unsafe { libc::syscall(SYS_GETEUID) as libc::uid_t };
// SAFETY: both calls take no parameters and only return an integer value. The kernel also
// guarantees that they can never fail.
static THREAD_EGID: libc::gid_t = unsafe { libc::syscall(SYS_GETEGID) as libc::gid_t };
}
/// Switches the calling thread to the given `uid` and `gid` and returns the guards that switch
/// back when dropped. A guard is `None` when the corresponding credential already had the
/// requested value.
fn set_creds(
    uid: libc::uid_t,
    gid: libc::gid_t,
) -> io::Result<(Option<ScopedUid>, Option<ScopedGid>)> {
    let current_uid = THREAD_EUID.with(|euid| *euid);
    let current_gid = THREAD_EGID.with(|egid| *egid);
    // The gid has to be changed first: once the uid has been changed we no longer have the
    // capability to change the gid. Restoring may happen in any order.
    let gid_guard = ScopedGid::new(gid, current_gid)?;
    let uid_guard = ScopedUid::new(uid, current_uid)?;
    Ok((uid_guard, gid_guard))
}
// Per-thread, lazily opened handle to the fscreate attribute file. It stays `None` until
// `ScopedSecurityContext::new` first runs on the thread.
thread_local!(static THREAD_FSCREATE: RefCell<Option<File>> = RefCell::new(None));
// Opens `thread-self/attr/fscreate` relative to the `proc` directory descriptor and returns it as
// a write-only, close-on-exec `File`.
//
// Panics if the file cannot be opened: this is not expected to fail and the callers have no way
// of reporting an error.
fn open_fscreate(proc: &File) -> File {
    // SAFETY: This string is nul-terminated and does not contain any interior nul bytes
    let path = unsafe { CStr::from_bytes_with_nul_unchecked(b"thread-self/attr/fscreate\0") };
    let open_flags = libc::O_CLOEXEC | libc::O_WRONLY;
    // SAFETY: this doesn't modify any memory and we check the return value.
    let fd = unsafe { libc::openat(proc.as_raw_descriptor(), path.as_ptr(), open_flags) };
    if fd >= 0 {
        // SAFETY: safe because we just opened this descriptor.
        return unsafe { File::from_raw_descriptor(fd) };
    }
    panic!(
        "Failed to open /proc/thread-self/attr/fscreate: {}",
        io::Error::last_os_error()
    );
}
/// Guard that writes a security context to the calling thread's fscreate file on construction and
/// resets it with a zero-length write when dropped.
struct ScopedSecurityContext;

impl ScopedSecurityContext {
    /// Writes `ctx` (including its nul terminator) to the thread's fscreate file, opening that
    /// file through `proc` on first use. Returns the OS error if the write fails.
    fn new(proc: &File, ctx: &CStr) -> io::Result<ScopedSecurityContext> {
        THREAD_FSCREATE.with(|cell| {
            let mut slot = cell.borrow_mut();
            let fscreate = slot.get_or_insert_with(|| open_fscreate(proc));
            let payload = ctx.to_bytes_with_nul();
            // SAFETY: this doesn't modify any memory and we check the return value.
            let written = unsafe {
                libc::write(
                    fscreate.as_raw_descriptor(),
                    payload.as_ptr() as *const libc::c_void,
                    payload.len(),
                )
            };
            if written < 0 {
                return Err(io::Error::last_os_error());
            }
            Ok(ScopedSecurityContext)
        })
    }
}

impl Drop for ScopedSecurityContext {
    /// Issues a zero-length write to the fscreate file. A failure is only logged as a warning.
    fn drop(&mut self) {
        THREAD_FSCREATE.with(|cell| {
            let slot = cell.borrow();
            // expect is safe here because the thread local would have been initialized by the
            // call to `new` that created this guard.
            let fscreate = slot
                .as_ref()
                .expect("Uninitialized thread-local when dropping ScopedSecurityContext");
            // SAFETY: this doesn't modify any memory and we check the return value.
            let written = unsafe { libc::write(fscreate.as_raw_descriptor(), ptr::null(), 0) };
            if written < 0 {
                warn!(
                    "Failed to restore security context: {}",
                    io::Error::last_os_error()
                );
            }
        })
    }
}
/// Guard that installs a umask on construction and puts the previous one back when dropped.
struct ScopedUmask {
    // The umask that was in effect before this guard was created.
    old: libc::mode_t,
    // The umask installed by this guard; checked on drop in debug builds.
    mask: libc::mode_t,
}

impl ScopedUmask {
    /// Sets the umask to `mask`, remembering the value it replaces.
    fn new(mask: libc::mode_t) -> ScopedUmask {
        // SAFETY: this doesn't modify any memory and always succeeds.
        let old = unsafe { libc::umask(mask) };
        ScopedUmask { old, mask }
    }
}

impl Drop for ScopedUmask {
    /// Restores the remembered umask. In debug builds, asserts that the umask being replaced is
    /// still the one this guard installed.
    fn drop(&mut self) {
        // SAFETY: this doesn't modify any memory and always succeeds.
        let replaced = unsafe { libc::umask(self.old) };
        debug_assert_eq!(
            replaced, self.mask,
            "umask changed while holding ScopedUmask"
        );
    }
}
struct ScopedFsetid(Caps);
impl Drop for ScopedFsetid {
fn drop(&mut self) {
if let Err(e) = raise_cap_fsetid(&mut self.0) {
error!(
"Failed to restore CAP_FSETID: {}. Some operations may be broken.",
e
)
}
}
}
// Sets `Capability::Fsetid` in the effective set of `c` and applies the result.
fn raise_cap_fsetid(c: &mut Caps) -> io::Result<()> {
    let wanted = [Capability::Fsetid];
    c.update(&wanted, CapSet::Effective, CapValue::Set)?;
    c.apply()
}
// Clears CAP_FSETID in the effective set of the current thread. The returned RAII guard raises
// the capability again when it goes out of scope.
fn drop_cap_fsetid() -> io::Result<ScopedFsetid> {
    let unwanted = [Capability::Fsetid];
    let mut thread_caps = Caps::for_current_thread()?;
    thread_caps.update(&unwanted, CapSet::Effective, CapValue::Clear)?;
    thread_caps.apply()?;
    Ok(ScopedFsetid(thread_caps))
}
// Builds the `io::Error` for `EBADF`; returned when an inode or handle id is unknown.
fn ebadf() -> io::Error {
    let errno = libc::EBADF;
    io::Error::from_raw_os_error(errno)
}
// Builds the `io::Error` for `EEXIST`.
fn eexist() -> io::Error {
    let errno = libc::EEXIST;
    io::Error::from_raw_os_error(errno)
}
// Stats the file referred to by the descriptor `f` itself, using an empty path together with
// `AT_EMPTY_PATH`. Symlinks are not followed.
fn stat<F>(f: &F) -> io::Result<libc::stat64>
where
    F: AsRawDescriptor + ?Sized,
{
    let mut buf = MaybeUninit::<libc::stat64>::zeroed();
    // SAFETY: this is a constant value that is a nul-terminated string without interior nul bytes.
    let empty_path = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) };
    let at_flags = libc::AT_EMPTY_PATH | libc::AT_SYMLINK_NOFOLLOW;
    // SAFETY: the kernel will only write data in `buf` and we check the return value.
    syscall!(unsafe {
        libc::fstatat64(
            f.as_raw_descriptor(),
            empty_path.as_ptr(),
            buf.as_mut_ptr(),
            at_flags,
        )
    })?;
    // SAFETY: the kernel guarantees that the struct is now fully initialized.
    let st = unsafe { buf.assume_init() };
    Ok(st)
}
// Stats the entry `name` relative to the directory descriptor `dir`. Symlinks are not followed.
fn statat<D>(dir: &D, name: &CStr) -> io::Result<libc::stat64>
where
    D: AsRawDescriptor,
{
    let mut buf = MaybeUninit::<libc::stat64>::zeroed();
    let at_flags = libc::AT_SYMLINK_NOFOLLOW;
    // SAFETY: the kernel will only write data in `buf` and we check the return value.
    syscall!(unsafe {
        libc::fstatat64(
            dir.as_raw_descriptor(),
            name.as_ptr(),
            buf.as_mut_ptr(),
            at_flags,
        )
    })?;
    // SAFETY: the kernel guarantees that the struct is now fully initialized.
    let st = unsafe { buf.assume_init() };
    Ok(st)
}
#[cfg(feature = "arc_quota")]
/// Returns whether `project_id` falls into one of the project ID ranges used by Android.
///
/// The range limits are taken from android_filesystem_config.h in the Android codebase.
fn is_android_project_id(project_id: u32) -> bool {
    // Android files on external storage: 100 IDs reserved starting at PROJECT_ID_EXT_DEFAULT
    // (1000).
    const EXT_FILES_FIRST: u32 = 1000;
    const EXT_FILES_LAST: u32 = 1099;
    // Android apps: starts at PROJECT_ID_EXT_DATA_START. The upper limit differs before and
    // after T; the one of T (PROJECT_ID_APP_CACHE_END) is used because it is larger.
    const APPS_FIRST: u32 = 20000;
    const APPS_LAST: u32 = 69999;
    matches!(
        project_id,
        EXT_FILES_FIRST..=EXT_FILES_LAST | APPS_FIRST..=APPS_LAST
    )
}
/// Per-directory cache for `PassthroughFs::ascii_casefold_lookup()`.
///
/// Keys of the underlying `BTreeMap` are ASCII-lower-cased file names in the directory; values
/// are the case-sensitive names as stored in the host file system.
/// We assume that if PassthroughFs has exclusive access to the filesystem, this cache exhaustively
/// covers all file names that exist within the directory, so every `PassthroughFs` handler that
/// adds or removes files in the directory is expected to update this cache.
struct CasefoldCache(BTreeMap<Vec<u8>, CString>);

impl CasefoldCache {
    /// Builds the cache by reading every entry of `dir` in batches through a 1024-byte buffer.
    fn new(dir: &InodeData) -> io::Result<Self> {
        let mut names = BTreeMap::new();
        let mut dirent_buf = [0u8; 1024];
        let mut next_offset = 0;
        loop {
            let mut batch = ReadDir::new(dir, next_offset, &mut dirent_buf[..])?;
            // An empty batch means the whole directory has been consumed.
            if batch.remaining() == 0 {
                return Ok(Self(names));
            }
            while let Some(dirent) = batch.next() {
                // Remember where the next batch has to resume.
                next_offset = dirent.offset as libc::off64_t;
                let folded = dirent.name.to_bytes().to_ascii_lowercase();
                names.insert(folded, dirent.name.to_owned());
            }
        }
    }

    /// Records `name` under its lower-cased form, replacing any previous entry for that key.
    fn insert(&mut self, name: &CStr) {
        let folded = name.to_bytes().to_ascii_lowercase();
        self.0.insert(folded, name.to_owned());
    }

    /// Returns the stored case-sensitive name whose lower-cased form equals lower-cased `name`.
    fn lookup(&self, name: &[u8]) -> Option<CString> {
        let folded = name.to_ascii_lowercase();
        match self.0.get(&folded) {
            Some(actual) => Some(actual.clone()),
            None => None,
        }
    }

    /// Drops the entry stored under the lower-cased form of `name`, if any.
    fn remove(&mut self, name: &CStr) {
        let folded = name.to_bytes().to_ascii_lowercase();
        self.0.remove(&folded);
    }
}
/// Time-expiring mapping from the inode of a directory to the `CasefoldCache` of that directory.
/// Each entry expires after `timeout`.
/// When ascii_casefold is disabled, this struct does nothing.
struct ExpiringCasefoldLookupCaches {
    inner: ExpiringMap<Inode, CasefoldCache>,
}

impl ExpiringCasefoldLookupCaches {
    /// Creates an empty set of caches whose entries expire after `timeout`.
    fn new(timeout: Duration) -> Self {
        let inner = ExpiringMap::new(timeout);
        Self { inner }
    }

    /// Adds `name` to the cache of directory `parent`. Does nothing if `parent` has no cache.
    fn insert(&mut self, parent: Inode, name: &CStr) {
        match self.inner.get_mut(&parent) {
            Some(dir_cache) => dir_cache.insert(name),
            None => {}
        }
    }

    /// Removes `name` from the cache of directory `parent`. Does nothing if `parent` has no
    /// cache.
    fn remove(&mut self, parent: Inode, name: &CStr) {
        match self.inner.get_mut(&parent) {
            Some(dir_cache) => dir_cache.remove(name),
            None => {}
        }
    }

    /// Discards the whole cache of directory `parent`.
    fn forget(&mut self, parent: Inode) {
        self.inner.remove(&parent);
    }

    /// Get `CasefoldCache` for the given directory.
    /// If the cache doesn't exist, generate it by fetching directory information with
    /// `getdents64()`.
    fn get(&mut self, parent: &InodeData) -> io::Result<&CasefoldCache> {
        let build = || CasefoldCache::new(parent);
        self.inner.get_or_insert_with(&parent.inode, build)
    }

    /// Test helper: whether `name` is present in the cache of directory `parent`.
    #[cfg(test)]
    fn exists_in_cache(&mut self, parent: Inode, name: &CStr) -> bool {
        match self.inner.get(&parent) {
            Some(dir_cache) => dir_cache.lookup(name.to_bytes()).is_some(),
            None => false,
        }
    }
}
/// A file system that simply "passes through" all requests it receives to the underlying file
/// system. To keep the implementation simple it serves the contents of its root directory. Users
/// that wish to serve only a specific directory should set up the environment so that that
/// directory ends up as the root of the file system process. One way to accomplish this is via a
/// combination of mount namespaces and the pivot_root system call.
pub struct PassthroughFs {
// Mutex that must be acquired before executing a process-wide operation such as fchdir.
process_lock: Mutex<()>,
// virtio-fs tag that the guest uses when mounting. This is only used for debugging
// when tracing is enabled.
#[cfg_attr(not(feature = "trace_marker"), allow(dead_code))]
tag: String,
// File descriptors for various points in the file system tree.
inodes: Mutex<MultikeyBTreeMap<Inode, InodeAltKey, Arc<InodeData>>>,
// Next inode number to hand out; initialized to `ROOT_ID + 1`.
next_inode: AtomicU64,
// File descriptors for open files and directories. Unlike the fds in `inodes`, these _can_ be
// used for reading and writing data.
handles: Mutex<BTreeMap<Handle, Arc<HandleData>>>,
// Next handle number to hand out; initialized to 1.
next_handle: AtomicU64,
// File descriptor pointing to the `/proc` directory. This is used to convert an fd from
// `inodes` into one that can go into `handles`. This is accomplished by reading the
// `self/fd/{}` symlink. We keep an open fd here in case the file system tree that we are meant
// to be serving doesn't have access to `/proc`.
proc: File,
// Whether writeback caching is enabled for this directory. This will only be true when
// `cfg.writeback` is true and `init` was called with `FsOptions::WRITEBACK_CACHE`.
writeback: AtomicBool,
// Whether zero message opens are supported by the kernel driver.
zero_message_open: AtomicBool,
// Whether zero message opendir is supported by the kernel driver.
zero_message_opendir: AtomicBool,
// Used to communicate with other processes using D-Bus.
#[cfg(feature = "arc_quota")]
dbus_connection: Option<Mutex<dbus::blocking::Connection>>,
// Watch fd of the D-Bus channel; reported by `keep_rds()` alongside the `/proc` fd.
#[cfg(feature = "arc_quota")]
dbus_fd: Option<std::os::unix::io::RawFd>,
// Time-expiring cache for `ascii_casefold_lookup()`.
// The key is an inode of a directory, and the value is a cache for the directory.
// Each value will be expired `cfg.timeout` after it's created.
//
// TODO(b/267748212): Instead of per-device Mutex, we might want to have per-directory Mutex
// if we use PassthroughFs in multi-threaded environments.
expiring_casefold_lookup_caches: Option<Mutex<ExpiringCasefoldLookupCaches>>,
cfg: Config,
}
impl std::fmt::Debug for PassthroughFs {
    /// Formats a subset of the fields: the tag, the id counters, the `/proc` descriptor, the
    /// negotiated feature flags and the configuration.
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        let mut out = f.debug_struct("PassthroughFs");
        out.field("tag", &self.tag);
        out.field("next_inode", &self.next_inode);
        out.field("next_handle", &self.next_handle);
        out.field("proc", &self.proc);
        out.field("writeback", &self.writeback);
        out.field("zero_message_open", &self.zero_message_open);
        out.field("zero_message_opendir", &self.zero_message_opendir);
        out.field("cfg", &self.cfg);
        out.finish()
    }
}
impl PassthroughFs {
/// Creates a new `PassthroughFs` identified by `tag` and configured by `cfg`.
///
/// Opens `/proc` as an `O_PATH` descriptor, optionally connects to the system D-Bus (only with
/// the `arc_quota` feature and a non-empty `cfg.privileged_quota_uids`), and sets up the casefold
/// lookup caches when `cfg.ascii_casefold` is enabled.
///
/// # Errors
///
/// Returns an error if `/proc` cannot be opened or the D-Bus connection cannot be established.
pub fn new(tag: &str, cfg: Config) -> io::Result<PassthroughFs> {
    // SAFETY: this is a constant value that is a nul-terminated string without interior
    // nul bytes.
    let proc_cstr = unsafe { CStr::from_bytes_with_nul_unchecked(PROC_CSTR) };
    // SAFETY: this doesn't modify any memory and we check the return value.
    let raw_descriptor = syscall!(unsafe {
        libc::openat64(
            libc::AT_FDCWD,
            proc_cstr.as_ptr(),
            libc::O_PATH | libc::O_NOFOLLOW | libc::O_CLOEXEC,
        )
    })?;
    // Take ownership of the descriptor right away so that it gets closed if one of the
    // fallible steps below returns early.
    // SAFETY: safe because we just opened this descriptor.
    let proc = unsafe { File::from_raw_descriptor(raw_descriptor) };
    // Privileged UIDs can use D-Bus to perform some operations.
    #[cfg(feature = "arc_quota")]
    let (dbus_connection, dbus_fd) = if cfg.privileged_quota_uids.is_empty() {
        (None, None)
    } else {
        let mut channel = dbus::channel::Channel::get_private(dbus::channel::BusType::System)
            .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
        // The watch is only enabled long enough to read the channel's fd.
        channel.set_watch_enabled(true);
        let dbus_fd = channel.watch().fd;
        channel.set_watch_enabled(false);
        (
            Some(Mutex::new(dbus::blocking::Connection::from(channel))),
            Some(dbus_fd),
        )
    };
    let expiring_casefold_lookup_caches = if cfg.ascii_casefold {
        Some(Mutex::new(ExpiringCasefoldLookupCaches::new(cfg.timeout)))
    } else {
        None
    };
    let passthroughfs = PassthroughFs {
        process_lock: Mutex::new(()),
        tag: tag.to_string(),
        inodes: Mutex::new(MultikeyBTreeMap::new()),
        next_inode: AtomicU64::new(ROOT_ID + 1),
        handles: Mutex::new(BTreeMap::new()),
        next_handle: AtomicU64::new(1),
        proc,
        writeback: AtomicBool::new(false),
        zero_message_open: AtomicBool::new(false),
        zero_message_opendir: AtomicBool::new(false),
        #[cfg(feature = "arc_quota")]
        dbus_connection,
        #[cfg(feature = "arc_quota")]
        dbus_fd,
        expiring_casefold_lookup_caches,
        cfg,
    };
    cros_tracing::trace_simple_print!(
        VirtioFs,
        "New PassthroughFS initialized: {:?}",
        passthroughfs
    );
    Ok(passthroughfs)
}
pub fn cfg(&self) -> &Config {
&self.cfg
}
/// Returns the raw descriptors held by this file system: the `/proc` descriptor and, with the
/// `arc_quota` feature, the D-Bus fd if a connection was set up.
pub fn keep_rds(&self) -> Vec<RawDescriptor> {
    let mut rds = Vec::new();
    rds.push(self.proc.as_raw_descriptor());
    #[cfg(feature = "arc_quota")]
    rds.extend(self.dbus_fd);
    rds
}
/// Maps an xattr name to the name that is actually used for the xattr call.
///
/// When `cfg.rewrite_security_xattrs` is set, every `security.*` name except `security.selinux`
/// is prefixed with `user.virtiofs.`. All other names are returned unchanged (borrowed).
fn rewrite_xattr_name<'xattr>(&self, name: &'xattr CStr) -> Cow<'xattr, CStr> {
    // Does not include nul-terminator.
    let raw = name.to_bytes();
    let needs_rewrite = self.cfg.rewrite_security_xattrs
        && raw.starts_with(SECURITY_XATTR)
        && raw != SELINUX_XATTR;
    if !needs_rewrite {
        return Cow::Borrowed(name);
    }
    let mut prefixed = Vec::with_capacity(USER_VIRTIOFS_XATTR.len() + raw.len());
    prefixed.extend_from_slice(USER_VIRTIOFS_XATTR);
    prefixed.extend_from_slice(raw);
    // The expect cannot fire: neither the prefix nor the bytes returned by `to_bytes()` contain
    // interior nul-bytes.
    Cow::Owned(CString::new(prefixed).expect("Failed to re-write xattr name"))
}
fn find_inode(&self, inode: Inode) -> io::Result<Arc<InodeData>> {
self.inodes
.lock()
.get(&inode)
.map(Arc::clone)
.ok_or_else(ebadf)
}
fn find_handle(&self, handle: Handle, inode: Inode) -> io::Result<Arc<HandleData>> {
self.handles
.lock()
.get(&handle)
.filter(|hd| hd.inode == inode)
.map(Arc::clone)
.ok_or_else(ebadf)
}
fn open_fd(&self, fd: RawDescriptor, flags: i32) -> io::Result<File> {
let pathname = CString::new(format!("self/fd/{}", fd))
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
// SAFETY: this doesn't modify any memory and we check the return value. We don't really
// check `flags` because if the kernel can't handle poorly specified flags then we have
// much bigger problems. Also, clear the `O_NOFOLLOW` flag if it is set since we need
// to follow the `/proc/self/fd` symlink to get the file.
let raw_descriptor = syscall!(unsafe {
libc::openat64(
self.proc.as_raw_descriptor(),
pathname.as_ptr(),
(flags | libc::O_CLOEXEC) & !(libc::O_NOFOLLOW | libc::O_DIRECT),
)
})?;
// SAFETY: safe because we just opened this descriptor.
Ok(unsafe { File::from_raw_descriptor(raw_descriptor) })
}
/// Adjusts open flags for the writeback caching configuration and returns the result.
/// Without writeback caching the flags are returned untouched.
fn update_open_flags(&self, mut flags: i32) -> i32 {
    if !self.writeback.load(Ordering::Relaxed) {
        return flags;
    }
    // With writeback caching the kernel may send read requests even if the userspace program
    // opened the file write-only, so a write-only open is upgraded to read-write.
    if flags & libc::O_ACCMODE == libc::O_WRONLY {
        flags = (flags & !libc::O_ACCMODE) | libc::O_RDWR;
    }
    // With writeback caching the kernel is responsible for handling `O_APPEND`. This breaks
    // atomicity as the file may have changed on disk, invalidating the cached copy of the data
    // in the kernel and the offset that the kernel thinks is the end of the file. Just allow
    // this for now as it is the user's responsibility to enable writeback caching only for
    // directories that are not shared. It also means that `O_APPEND` has to be cleared.
    flags & !libc::O_APPEND
}
/// Opens a fresh `File` for `inode`, after adjusting `flags` for writeback caching.
fn open_inode(&self, inode: &InodeData, flags: i32) -> io::Result<File> {
    let effective_flags = self.update_open_flags(flags);
    let fd = inode.as_raw_descriptor();
    self.open_fd(fd, effective_flags)
}
// Increases the inode refcount and returns the inode.
fn increase_inode_refcount(&self, inode_data: &InodeData) -> Inode {
// Matches with the release store in `forget`.
inode_data.refcount.fetch_add(1, Ordering::Acquire);
inode_data.inode
}
// Registers `f` as a new inode, or, if an inode with the same (st_ino, st_dev) pair is already
// known, increases the refcount of that inode instead (`f` is then simply dropped).
// The inodes mutex lock must not be already taken by the same thread otherwise this
// will deadlock.
fn add_entry(&self, f: File, st: libc::stat64, open_flags: libc::c_int, path: String) -> Entry {
    let altkey = InodeAltKey {
        ino: st.st_ino,
        dev: st.st_dev,
    };
    let mut inodes = self.inodes.lock();
    let inode = match inodes.get_alt(&altkey) {
        Some(existing) => self.increase_inode_refcount(existing),
        None => {
            let fresh = self.next_inode.fetch_add(1, Ordering::Relaxed);
            let data = InodeData {
                inode: fresh,
                file: Mutex::new((f, open_flags)),
                refcount: AtomicU64::new(1),
                filetype: st.st_mode.into(),
                path,
            };
            inodes.insert(fresh, altkey, Arc::new(data));
            fresh
        }
    };
    // We use the same timeout for the attribute and the entry.
    let timeout = self.cfg.timeout;
    Entry {
        inode,
        generation: 0,
        attr: st,
        attr_timeout: timeout,
        entry_timeout: timeout,
    }
}
/// Locks `expiring_casefold_lookup_caches` and returns the guard, or `None` when the caches
/// were not created (i.e. `ascii_casefold` is disabled).
fn lock_casefold_lookup_caches(&self) -> Option<MutexGuard<'_, ExpiringCasefoldLookupCaches>> {
    let caches = self.expiring_casefold_lookup_caches.as_ref()?;
    Some(caches.lock())
}
// Resolves `name` case-insensitively (ASCII only) within `parent` and returns the actual
// case-sensitive file name.
// Returns `Ok(None)` if no file matches the given `name`.
// This function will panic if casefold is not enabled.
fn get_case_unfolded_name(
    &self,
    parent: &InodeData,
    name: &[u8],
) -> io::Result<Option<CString>> {
    let mut guard = self
        .lock_casefold_lookup_caches()
        .expect("casefold must be enabled");
    let actual_name = guard.get(parent)?.lookup(name);
    Ok(actual_name)
}
// Performs an ascii case insensitive lookup.
fn ascii_casefold_lookup(&self, parent: &InodeData, name: &[u8]) -> io::Result<Entry> {
match self.get_case_unfolded_name(parent, name)? {
None => Err(io::Error::from_raw_os_error(libc::ENOENT)),
Some(actual_name) => self.do_lookup(parent, &actual_name),
}
}
// Test helper: whether `name` is present in the casefold cache of directory `parent`.
// Panics if casefold is not enabled.
#[cfg(test)]
fn exists_in_casefold_cache(&self, parent: Inode, name: &CStr) -> bool {
    self.lock_casefold_lookup_caches()
        .expect("casefold must be enabled")
        .exists_in_cache(parent, name)
}
/// Looks up `name` inside the directory `parent` and returns the `Entry` for it.
///
/// If the target is already known (same `st_ino`/`st_dev`), its refcount is increased and the
/// existing inode number is returned. Otherwise the target is opened and registered through
/// `add_entry`.
///
/// # Errors
///
/// Returns the error of the underlying `fstatat64`/`openat64` call if either of them fails.
fn do_lookup(&self, parent: &InodeData, name: &CStr) -> io::Result<Entry> {
    let st = statat(parent, name)?;
    let altkey = InodeAltKey {
        ino: st.st_ino,
        dev: st.st_dev,
    };
    // Check if we already have an entry before opening a new file.
    if let Some(data) = self.inodes.lock().get_alt(&altkey) {
        // Return the same inode with the reference counter increased.
        return Ok(Entry {
            inode: self.increase_inode_refcount(data),
            generation: 0,
            attr: st,
            // We use the same timeout for the attribute and the entry.
            attr_timeout: self.cfg.timeout,
            entry_timeout: self.cfg.timeout,
        });
    }
    // Open a regular file with O_RDONLY to store in `InodeData` so explicit open requests can
    // be skipped later if the ZERO_MESSAGE_{OPEN,OPENDIR} features are enabled.
    // If the crosvm process doesn't have a read permission, fall back to O_PATH below.
    let mut flags = libc::O_RDONLY | libc::O_NOFOLLOW | libc::O_CLOEXEC;
    match FileType::from(st.st_mode) {
        FileType::Regular => {}
        FileType::Directory => flags |= libc::O_DIRECTORY,
        FileType::Other => flags |= libc::O_PATH,
    };
    // SAFETY: this doesn't modify any memory and we check the return value.
    let fd = match unsafe {
        syscall!(libc::openat64(
            parent.as_raw_descriptor(),
            name.as_ptr(),
            flags
        ))
    } {
        Ok(fd) => fd,
        Err(e) if e.errno() == libc::EACCES => {
            // If O_RDONLY is unavailable, fall back to O_PATH to get an FD to store in
            // `InodeData`.
            // Note that some operations which should be allowed without read permissions
            // require syscalls that don't support O_PATH fds. For those syscalls, we will
            // need to fall back to their path-based equivalents with /self/fd/${FD}.
            // e.g. `fgetxattr()` for an O_PATH FD fails while `getxattr()` for /self/fd/${FD}
            // works.
            flags |= libc::O_PATH;
            // SAFETY: this doesn't modify any memory and we check the return value.
            unsafe {
                syscall!(libc::openat64(
                    parent.as_raw_descriptor(),
                    name.as_ptr(),
                    flags
                ))
            }?
        }
        Err(e) => {
            return Err(e.into());
        }
    };
    // SAFETY: safe because we own the fd.
    let f = unsafe { File::from_raw_descriptor(fd) };
    // `format!` only needs to borrow the parent path, so no clone is required here.
    let path = format!(
        "{}/{}",
        parent.path,
        name.to_str().unwrap_or("<non UTF-8 str>")
    );
    // We made sure the lock acquired for `self.inodes` is released automatically when
    // the if block above is exited, so a call to `self.add_entry()` should not cause a deadlock
    // here. This would not be the case if this were executed in an else block instead.
    Ok(self.add_entry(f, st, flags, path))
}
/// Derives the cache-related `OpenOptions` for an open with the given `flags` from the
/// configured cache policy.
fn get_cache_open_options(&self, flags: u32) -> OpenOptions {
    let is_dir = flags & (libc::O_DIRECTORY as u32) != 0;
    let mut opts = OpenOptions::empty();
    match self.cfg.cache_policy {
        // We only set the direct I/O option on files.
        CachePolicy::Never => opts.set(OpenOptions::DIRECT_IO, !is_dir),
        CachePolicy::Always if is_dir => opts |= OpenOptions::CACHE_DIR,
        CachePolicy::Always => opts |= OpenOptions::KEEP_CACHE,
        _ => {}
    }
    opts
}
// Looks `name` up as given first. If that fails and ascii_casefold is enabled, retries with a
// case-insensitive lookup and returns the outcome of the retry.
fn do_lookup_with_casefold_fallback(
    &self,
    parent: &InodeData,
    name: &CStr,
) -> io::Result<Entry> {
    match self.do_lookup(parent, name) {
        Err(_) if self.cfg.ascii_casefold => {
            self.ascii_casefold_lookup(parent, name.to_bytes())
        }
        outcome => outcome,
    }
}
/// Opens `inode` with `flags`, registers the resulting file in the handle table and returns
/// the new handle together with the cache options for this open.
fn do_open(&self, inode: Inode, flags: u32) -> io::Result<(Option<Handle>, OpenOptions)> {
    let target = self.find_inode(inode)?;
    let opened = self.open_inode(&target, flags as i32)?;
    // Handle ids come from a monotonically increasing counter.
    let id = self.next_handle.fetch_add(1, Ordering::Relaxed);
    let entry = HandleData {
        inode,
        file: Mutex::new(opened),
    };
    self.handles.lock().insert(id, Arc::new(entry));
    Ok((Some(id), self.get_cache_open_options(flags)))
}
fn do_open_at(
&self,
parent_data: Arc<InodeData>,
name: &CStr,
inode: Inode,
flags: u32,
) -> io::Result<(Option<Handle>, OpenOptions)> {
let open_flags = self.update_open_flags(flags as i32);
let fd_open = syscall!(
// SAFETY: return value is checked.
unsafe {
libc::openat64(
parent_data.as_raw_descriptor(),
name.as_ptr(),
(open_flags | libc::O_CLOEXEC) & !(libc::O_NOFOLLOW | libc::O_DIRECT),
)
}
)?;
// SAFETY: fd_open is valid
let file_open = unsafe { File::from_raw_descriptor(fd_open) };
let handle = self.next_handle.fetch_add(1, Ordering::Relaxed);
let data = HandleData {
inode,
file: Mutex::new(file_open),
};
self.handles.lock().insert(handle, Arc::new(data));
let opts = self.get_cache_open_options(open_flags as u32);
Ok((Some(handle), opts))
}
/// Removes `handle` from the handle table, provided it exists and belongs to `inode`.
/// Any other situation is reported with the error produced by `ebadf()`.
fn do_release(&self, inode: Inode, handle: Handle) -> io::Result<()> {
    let mut handles = self.handles.lock();
    match handles.entry(handle) {
        btree_map::Entry::Occupied(slot) if slot.get().inode == inode => {
            // Dropping the table's reference is enough: the file itself is closed
            // automatically once the last `Arc` goes away.
            slot.remove();
            Ok(())
        }
        _ => Err(ebadf()),
    }
}
/// Stats `inode` and pairs the attributes with the configured timeout.
fn do_getattr(&self, inode: &InodeData) -> io::Result<(libc::stat64, Duration)> {
    Ok((stat(inode)?, self.cfg.timeout))
}
/// Removes the directory entry `name` below `parent` with `unlinkat`; `flags` is handed to
/// the syscall unchanged.
fn do_unlink(&self, parent: &InodeData, name: &CStr, flags: libc::c_int) -> io::Result<()> {
    let dir_fd = parent.as_raw_descriptor();
    // SAFETY: this doesn't modify any memory and we check the return value.
    let _ = syscall!(unsafe { libc::unlinkat(dir_fd, name.as_ptr(), flags) })?;
    Ok(())
}
fn do_fsync<F: AsRawDescriptor>(&self, file: &F, datasync: bool) -> io::Result<()> {
// SAFETY: this doesn't modify any memory and we check the return value.
syscall!(unsafe {
if datasync {
libc::fdatasync(file.as_raw_descriptor())
} else {
libc::fsync(file.as_raw_descriptor())
}
})?;
Ok(())
}
// Runs `f` with the CWD temporarily switched to `self.proc`, then switches the CWD to the
// root inode's directory. This emulates an *at syscall anchored at /proc for syscalls that
// have no *at variant.
//
// Panics if there is no root inode. The two fchdir results are only checked with
// `debug_assert_eq!`, i.e. in builds with debug assertions enabled.
//
// NOTE: `self.process_lock` is held for the whole call. If `f` acquires any locks itself,
// care must be taken to avoid deadlocks.
fn with_proc_chdir<F, T>(&self, f: F) -> T
where
    F: FnOnce() -> T,
{
    let root = self.find_inode(ROOT_ID).expect("failed to find root inode");
    // The CWD is process-wide state, so serialize everybody who changes it.
    let _cwd_guard = self.process_lock.lock();
    // SAFETY: this doesn't modify any memory and we check the return value. Since the
    // fchdir should never fail we just use debug_asserts.
    let entered = unsafe { libc::fchdir(self.proc.as_raw_descriptor()) };
    debug_assert_eq!(
        entered,
        0,
        "failed to fchdir to /proc: {}",
        io::Error::last_os_error()
    );
    let output = f();
    // SAFETY: this doesn't modify any memory and we check the return value. Since the
    // fchdir should never fail we just use debug_asserts.
    let restored = unsafe { libc::fchdir(root.as_raw_descriptor()) };
    debug_assert_eq!(
        restored,
        0,
        "failed to fchdir back to root directory: {}",
        io::Error::last_os_error()
    );
    output
}
/// Reads the extended attribute `name` of `inode` into `value`.
///
/// Returns the non-negative value reported by the underlying `getxattr`/`fgetxattr` call.
/// On failure the OS error of that call is returned.
fn do_getxattr(&self, inode: &InodeData, name: &CStr, value: &mut [u8]) -> io::Result<usize> {
    // Converts the raw return value of a `getxattr`-family call into a result. This must run
    // right after the syscall: later successful calls (the `fchdir` back to the root directory
    // and the unlock performed by `with_proc_chdir`) are allowed to overwrite `errno`.
    fn check(ret: libc::ssize_t) -> io::Result<usize> {
        if ret < 0 {
            Err(io::Error::last_os_error())
        } else {
            Ok(ret as usize)
        }
    }

    let file = inode.file.lock();
    let o_path_file = (file.1 & libc::O_PATH) != 0;
    if o_path_file {
        // For FDs opened with `O_PATH`, we cannot call `fgetxattr` normally. Instead we
        // emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
        // and then setting the CWD back to the root directory.
        let path = CString::new(format!("self/fd/{}", file.0.as_raw_descriptor()))
            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
        self.with_proc_chdir(|| {
            // SAFETY: this will only modify `value` and we check the return value.
            let ret = unsafe {
                libc::getxattr(
                    path.as_ptr(),
                    name.as_ptr(),
                    value.as_mut_ptr() as *mut libc::c_void,
                    value.len() as libc::size_t,
                )
            };
            // Capture `errno` while the CWD is still /proc and the lock is still held.
            check(ret)
        })
    } else {
        // For regular files and directories, we can just use fgetxattr.
        // SAFETY: this will only write to `value` and we check the return value.
        let ret = unsafe {
            libc::fgetxattr(
                file.0.as_raw_descriptor(),
                name.as_ptr(),
                value.as_mut_ptr() as *mut libc::c_void,
                value.len() as libc::size_t,
            )
        };
        check(ret)
    }
}
fn get_encryption_policy_ex<R: io::Read>(
&self,
inode: Inode,
handle: Handle,
mut r: R,
) -> io::Result<IoctlReply> {
let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
self.find_inode(inode)?
} else {
self.find_handle(handle, inode)?
};
// SAFETY: this struct only has integer fields and any value is valid.
let mut arg = unsafe { MaybeUninit::<fscrypt_get_policy_ex_arg>::zeroed().assume_init() };
r.read_exact(arg.policy_size.as_bytes_mut())?;
let policy_size = cmp::min(arg.policy_size, size_of::<fscrypt_policy>() as u64);
arg.policy_size = policy_size;
let res =
// SAFETY: the kernel will only write to `arg` and we check the return value.
unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_GET_ENCRYPTION_POLICY_EX(), &mut arg) };
if res < 0 {
Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
} else {
let len = size_of::<u64>() + arg.policy_size as usize;
Ok(IoctlReply::Done(Ok(<&[u8]>::from(&arg)[..len].to_vec())))
}
}
fn get_fsxattr(&self, inode: Inode, handle: Handle) -> io::Result<IoctlReply> {
let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
self.find_inode(inode)?
} else {
self.find_handle(handle, inode)?
};
let mut buf = MaybeUninit::<fsxattr>::zeroed();
// SAFETY: the kernel will only write to `buf` and we check the return value.
let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_FSGETXATTR(), buf.as_mut_ptr()) };
if res < 0 {
Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
} else {
// SAFETY: the kernel guarantees that the policy is now initialized.
let xattr = unsafe { buf.assume_init() };
Ok(IoctlReply::Done(Ok(xattr.as_bytes().to_vec())))
}
}
/// Handles `FS_IOC_FSSETXATTR`.
///
/// Reads an `fsxattr` from `r` and applies it with the ioctl, using the inode's descriptor
/// when zero-message open is active and the handle's descriptor otherwise.
///
/// With the `arc_quota` feature, a caller that owns the file or is listed in
/// `privileged_quota_uids` and requests a different project ID first has that change routed
/// through the Spaced D-Bus service; the ioctl is still issued afterwards. `ctx` is only
/// used by that feature.
///
/// A failing ioctl (or an unsuccessful Spaced reply) is reported inside the `IoctlReply`;
/// read, D-Bus transport and validation failures are returned as `Err`.
fn set_fsxattr<R: io::Read>(
&self,
#[cfg_attr(not(feature = "arc_quota"), allow(unused_variables))] ctx: Context,
inode: Inode,
handle: Handle,
mut r: R,
) -> io::Result<IoctlReply> {
// With zero-message open there are no handles, so the inode's own descriptor is used.
let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
self.find_inode(inode)?
} else {
self.find_handle(handle, inode)?
};
// The requested attributes, as sent by the client.
let mut in_attr = fsxattr::new_zeroed();
r.read_exact(in_attr.as_bytes_mut())?;
#[cfg(feature = "arc_quota")]
let st = stat(&*data)?;
// Changing quota project ID requires CAP_FOWNER or being file owner.
// Here we use privileged_quota_uids because we cannot perform a CAP_FOWNER check.
#[cfg(feature = "arc_quota")]
if ctx.uid == st.st_uid || self.cfg.privileged_quota_uids.contains(&ctx.uid) {
// Get the current fsxattr.
let mut buf = MaybeUninit::<fsxattr>::zeroed();
// SAFETY: the kernel will only write to `buf` and we check the return value.
let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_FSGETXATTR(), buf.as_mut_ptr()) };
if res < 0 {
return Ok(IoctlReply::Done(Err(io::Error::last_os_error())));
}
// SAFETY: `buf` started out zeroed and the successful ioctl filled it in.
let current_attr = unsafe { buf.assume_init() };
// Project ID cannot be changed inside a user namespace.
// Use Spaced to avoid this restriction.
if current_attr.fsx_projid != in_attr.fsx_projid {
// NOTE(review): this unwrap panics if `dbus_connection` is `None`; presumably it is
// always set when `arc_quota` is in use -- confirm.
let connection = self.dbus_connection.as_ref().unwrap().lock();
let proxy = connection.with_proxy(
"org.chromium.Spaced",
"/org/chromium/Spaced",
DEFAULT_DBUS_TIMEOUT,
);
let project_id = in_attr.fsx_projid;
// Project IDs that `is_android_project_id` rejects are refused with EINVAL.
if !is_android_project_id(project_id) {
return Err(io::Error::from_raw_os_error(libc::EINVAL));
}
// Spaced gets its own descriptor for the file.
let file_clone = base::SafeDescriptor::try_from(&*data)?;
match proxy.set_project_id(file_clone.into(), project_id) {
Ok(r) => {
let r = SetProjectIdReply::parse_from_bytes(&r)
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
// Spaced reported a failure: forward its error code to the client.
if !r.success {
return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
r.error,
))));
}
}
Err(e) => {
return Err(io::Error::new(io::ErrorKind::Other, e));
}
};
}
}
// SAFETY: this doesn't modify any memory and we check the return value.
let res = unsafe { ioctl_with_ptr(&*data, FS_IOC_FSSETXATTR(), &in_attr) };
if res < 0 {
Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
} else {
Ok(IoctlReply::Done(Ok(Vec::new())))
}
}
/// Handles `FS_IOC_GETFLAGS`: replies with the file's flags as native-endian bytes. A failing
/// ioctl is reported inside the reply rather than as an `Err`.
fn get_flags(&self, inode: Inode, handle: Handle) -> io::Result<IoctlReply> {
    // With zero-message open there are no handles, so the inode's own descriptor is used.
    let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
        self.find_inode(inode)?
    } else {
        self.find_handle(handle, inode)?
    };
    // The ioctl encoding is a long but the parameter is actually an int.
    let mut current: c_int = 0;
    // SAFETY: the kernel will only write to `current` and we check the return value.
    let ret = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_GETFLAGS(), &mut current) };
    let reply = if ret < 0 {
        Err(io::Error::last_os_error())
    } else {
        Ok(current.to_ne_bytes().to_vec())
    };
    Ok(IoctlReply::Done(reply))
}
/// Handles `FS_IOC_SETFLAGS`.
///
/// Reads the requested flags (a C `int`) from `r` and applies them with the ioctl, using the
/// inode's descriptor when zero-message open is active and the handle's descriptor otherwise.
///
/// With the `arc_quota` feature, a caller that owns the file or is listed in
/// `privileged_quota_uids` and requests a change of `FS_PROJINHERIT_FL` first has that change
/// routed through the Spaced D-Bus service; the ioctl is still issued afterwards. `ctx` is
/// only used by that feature.
///
/// A failing ioctl (or an unsuccessful Spaced reply) is reported inside the `IoctlReply`;
/// read and D-Bus transport failures are returned as `Err`.
fn set_flags<R: io::Read>(
&self,
#[cfg_attr(not(feature = "arc_quota"), allow(unused_variables))] ctx: Context,
inode: Inode,
handle: Handle,
mut r: R,
) -> io::Result<IoctlReply> {
// With zero-message open there are no handles, so the inode's own descriptor is used.
let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
self.find_inode(inode)?
} else {
self.find_handle(handle, inode)?
};
// The ioctl encoding is a long but the parameter is actually an int.
let mut in_flags: c_int = 0;
r.read_exact(in_flags.as_bytes_mut())?;
#[cfg(feature = "arc_quota")]
let st = stat(&*data)?;
// Only the file owner or a privileged uid can perform FS_IOC_SETFLAGS through cryptohome.
#[cfg(feature = "arc_quota")]
if ctx.uid == st.st_uid || self.cfg.privileged_quota_uids.contains(&ctx.uid) {
// Get the current flag.
let mut buf = MaybeUninit::<c_int>::zeroed();
// SAFETY: the kernel will only write to `buf` and we check the return value.
let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_GETFLAGS(), buf.as_mut_ptr()) };
if res < 0 {
return Ok(IoctlReply::Done(Err(io::Error::last_os_error())));
}
// SAFETY: `buf` started out zeroed and the successful ioctl filled it in.
let current_flags = unsafe { buf.assume_init() };
// Project inheritance flag cannot be changed inside a user namespace.
// Use Spaced to avoid this restriction.
if (in_flags & FS_PROJINHERIT_FL) != (current_flags & FS_PROJINHERIT_FL) {
// NOTE(review): this unwrap panics if `dbus_connection` is `None`; presumably it is
// always set when `arc_quota` is in use -- confirm.
let connection = self.dbus_connection.as_ref().unwrap().lock();
let proxy = connection.with_proxy(
"org.chromium.Spaced",
"/org/chromium/Spaced",
DEFAULT_DBUS_TIMEOUT,
);
// If the input flags contain FS_PROJINHERIT_FL, then it is a set. Otherwise it is a
// reset.
let enable = (in_flags & FS_PROJINHERIT_FL) == FS_PROJINHERIT_FL;
// Spaced gets its own descriptor for the file.
let file_clone = base::SafeDescriptor::try_from(&*data)?;
match proxy.set_project_inheritance_flag(file_clone.into(), enable) {
Ok(r) => {
let r = SetProjectInheritanceFlagReply::parse_from_bytes(&r)
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
// Spaced reported a failure: forward its error code to the client.
if !r.success {
return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
r.error,
))));
}
}
Err(e) => {
return Err(io::Error::new(io::ErrorKind::Other, e));
}
};
}
}
// SAFETY: this doesn't modify any memory and we check the return value.
let res = unsafe { ioctl_with_ptr(&*data, FS_IOC_SETFLAGS(), &in_flags) };
if res < 0 {
Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
} else {
Ok(IoctlReply::Done(Ok(Vec::new())))
}
}
/// Handles `FS_IOC_ENABLE_VERITY`.
///
/// Only regular files are accepted (`EISDIR` for directories, `EINVAL` for everything else).
/// Any writable descriptor this server holds for the file -- the inode's and, when handles
/// are in use, the handle's -- is replaced by a read-only one before the ioctl is issued.
/// The `fsverity_enable_arg` header is read from `r`, followed by the optional salt and
/// signature payloads whose sizes the header announces.
///
/// Oversized payloads (`ENOMEM`) and a failing ioctl are reported inside the `IoctlReply`;
/// lookup, re-open and read failures are returned as `Err`.
fn enable_verity<R: io::Read>(
&self,
inode: Inode,
handle: Handle,
mut r: R,
) -> io::Result<IoctlReply> {
let inode_data = self.find_inode(inode)?;
// These match the return codes from `fsverity_ioctl_enable` in the kernel.
match inode_data.filetype {
FileType::Regular => {}
FileType::Directory => return Err(io::Error::from_raw_os_error(libc::EISDIR)),
FileType::Other => return Err(io::Error::from_raw_os_error(libc::EINVAL)),
}
{
// We cannot enable verity while holding a writable fd so get a new one, if necessary.
let mut file = inode_data.file.lock();
let mut flags = file.1;
match flags & libc::O_ACCMODE {
libc::O_WRONLY | libc::O_RDWR => {
// Record the new access mode next to the re-opened file.
flags &= !libc::O_ACCMODE;
flags |= libc::O_RDONLY;
// We need to get a read-only handle for this file.
let newfile = self.open_fd(file.0.as_raw_descriptor(), libc::O_RDONLY)?;
*file = (newfile, flags);
}
libc::O_RDONLY => {}
// Any other access mode value is treated as a broken invariant.
_ => panic!("Unexpected flags: {:#x}", flags),
}
}
// The descriptor the ioctl is issued on: the inode's with zero-message open, the
// handle's otherwise.
let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
inode_data
} else {
let data = self.find_handle(handle, inode)?;
{
// We can't enable verity while holding a writable fd. We don't know whether the
// file was opened for writing so check it here. We don't expect
// this to be a frequent operation so the extra latency should be
// fine.
let mut file = data.file.lock();
let flags = FileFlags::from_file(&*file).map_err(io::Error::from)?;
match flags {
FileFlags::ReadWrite | FileFlags::Write => {
// We need to get a read-only handle for this file.
*file = self.open_fd(file.as_raw_descriptor(), libc::O_RDONLY)?;
}
FileFlags::Read => {}
}
}
data
};
let mut arg = fsverity_enable_arg::new_zeroed();
r.read_exact(arg.as_bytes_mut())?;
// `salt` is declared out here because `arg.salt_ptr` stores a raw pointer into it; the
// buffer has to stay alive until the ioctl below has run.
let mut salt;
if arg.salt_size > 0 {
// Refuse salts larger than `max_buffer_size()` before allocating.
if arg.salt_size > self.max_buffer_size() {
return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
libc::ENOMEM,
))));
}
salt = vec![0; arg.salt_size as usize];
r.read_exact(&mut salt)?;
arg.salt_ptr = salt.as_ptr() as usize as u64;
} else {
// Never forward a pointer value supplied by the client.
arg.salt_ptr = 0;
}
// Same lifetime requirement as for `salt`: `arg.sig_ptr` points into `sig`.
let mut sig;
if arg.sig_size > 0 {
// Refuse signatures larger than `max_buffer_size()` before allocating.
if arg.sig_size > self.max_buffer_size() {
return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
libc::ENOMEM,
))));
}
sig = vec![0; arg.sig_size as usize];
r.read_exact(&mut sig)?;
arg.sig_ptr = sig.as_ptr() as usize as u64;
} else {
// Never forward a pointer value supplied by the client.
arg.sig_ptr = 0;
}
// SAFETY: this doesn't modify any memory and we check the return value.
let res = unsafe { ioctl_with_ptr(&*data, FS_IOC_ENABLE_VERITY(), &arg) };
if res < 0 {
Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
} else {
Ok(IoctlReply::Done(Ok(Vec::new())))
}
}
/// Handles `FS_IOC_MEASURE_VERITY`.
///
/// Reads the caller's `fsverity_digest` header from `r`, runs the ioctl against a local
/// buffer with room for `DIGEST_SIZE` digest bytes, and replies with the header plus the
/// digest bytes the kernel reported.
///
/// `EOVERFLOW` is reported inside the reply when the caller's `digest_size` or `out_size`
/// is too small for the result; a failing ioctl is reported inside the reply as well.
fn measure_verity<R: io::Read>(
&self,
inode: Inode,
handle: Handle,
mut r: R,
out_size: u32,
) -> io::Result<IoctlReply> {
// With zero-message open there are no handles, so the inode's own descriptor is used.
let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
self.find_inode(inode)?
} else {
self.find_handle(handle, inode)?
};
// The caller's header; its `digest_size` is compared against the kernel's result below.
let mut digest = fsverity_digest::new_zeroed();
r.read_exact(digest.as_bytes_mut())?;
// Taken from fs/verity/fsverity_private.h.
const FS_VERITY_MAX_DIGEST_SIZE: u16 = 64;
// This digest size is what the fsverity command line utility uses.
const DIGEST_SIZE: u16 = FS_VERITY_MAX_DIGEST_SIZE * 2 + 1;
// Header plus digest payload, in bytes.
const BUFLEN: usize = size_of::<fsverity_digest>() + DIGEST_SIZE as usize;
// `BUFLEN` expressed as a number of `fsverity_digest` elements, rounded up.
const ROUNDED_LEN: usize =
(BUFLEN + size_of::<fsverity_digest>() - 1) / size_of::<fsverity_digest>();
// Make sure we get a properly aligned allocation.
let mut buf = [MaybeUninit::<fsverity_digest>::uninit(); ROUNDED_LEN];
// Set the header's `digest_size` field to `DIGEST_SIZE` before calling into the kernel.
// SAFETY: we are only writing data and not reading uninitialized memory.
unsafe {
// TODO: Replace with `MaybeUninit::slice_as_mut_ptr` once it is stabilized.
addr_of_mut!((*(buf.as_mut_ptr() as *mut fsverity_digest)).digest_size)
.write(DIGEST_SIZE)
};
// SAFETY: this will only modify `buf` and we check the return value.
let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_MEASURE_VERITY(), buf.as_mut_ptr()) };
if res < 0 {
Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
} else {
let digest_size =
// SAFETY: this value was initialized by us already and then overwritten by the kernel.
// TODO: Replace with `MaybeUninit::slice_as_ptr` once it is stabilized.
unsafe { addr_of!((*(buf.as_ptr() as *const fsverity_digest)).digest_size).read() };
// Number of bytes to hand back: header plus the reported digest.
let outlen = size_of::<fsverity_digest>() as u32 + u32::from(digest_size);
// The kernel guarantees this but it doesn't hurt to be paranoid.
debug_assert!(outlen <= (ROUNDED_LEN * size_of::<fsverity_digest>()) as u32);
// The result must fit both the digest capacity the caller announced and its output buffer.
if digest.digest_size < digest_size || out_size < outlen {
return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
libc::EOVERFLOW,
))));
}
// Reinterpret the element array as plain (possibly uninitialized) bytes.
let buf: [MaybeUninit<u8>; ROUNDED_LEN * size_of::<fsverity_digest>()] =
// SAFETY: any bit pattern is valid for `MaybeUninit<u8>` and `fsverity_digest`
// doesn't contain any references.
unsafe { mem::transmute(buf) };
let buf =
// SAFETY: Casting to `*const [u8]` is safe because the kernel guarantees that the
// first `outlen` bytes of `buf` are initialized and `MaybeUninit<u8>` is guaranteed
// to have the same layout as `u8`.
// TODO: Replace with `MaybeUninit::slice_assume_init_ref` once it is stabilized.
unsafe { &*(&buf[..outlen as usize] as *const [MaybeUninit<u8>] as *const [u8]) };
Ok(IoctlReply::Done(Ok(buf.to_vec())))
}
}
}
/// Drops `count` references from `inode`, removing it from `inodes` once none are left.
/// Returns `true` if the refcount became 0 (and the entry was removed); returns `false`
/// otherwise, including when `inode` is not in the map.
fn forget_one(
    inodes: &mut MultikeyBTreeMap<Inode, InodeAltKey, Arc<InodeData>>,
    inode: Inode,
    count: u64,
) -> bool {
    let previous = match inodes.get(&inode) {
        // Acquiring the write lock on the inode map prevents new lookups from incrementing the
        // refcount, but a previous lookup may already hold a reference to the inode data and be
        // in the middle of updating the refcount. `fetch_update` therefore retries its
        // compare-exchange until the decrement succeeds.
        //
        // The subtraction saturates because a refcount below zero makes no sense and
        // misbehaving clients must not be able to cause integer overflow. The `Release` store
        // synchronizes with the acquire load in `do_lookup`.
        Some(data) => data
            .refcount
            .fetch_update(Ordering::Release, Ordering::Relaxed, |current| {
                Some(current.saturating_sub(count))
            })
            // The closure never returns `None`, so both variants carry the previous value.
            .unwrap_or_else(|unchanged| unchanged),
        None => return false,
    };
    if previous.saturating_sub(count) != 0 {
        return false;
    }
    // We just removed the last refcount for this inode. There's no need for an acquire fence
    // here because we hold a write lock on the inode map and any thread that is waiting to do
    // a forget on the same inode will have to wait until we release the lock. So there is no
    // other release store for us to synchronize with before deleting the entry.
    inodes.remove(&inode);
    true
}
// Strips any `user.virtiofs.` prefix from `buf`. If buf contains one or more nul-bytes, each
// nul-byte-separated slice is treated as a C string and the prefix is stripped from each one.
fn strip_xattr_prefix(buf: &mut Vec<u8>) {
fn next_cstr(b: &[u8], start: usize) -> Option<&[u8]> {
if start >= b.len() {
return None;
}
let end = b[start..]
.iter()
.position(|&c| c == b'\0')
.map(|p| start + p + 1)
.unwrap_or(b.len());
Some(&b[start..end])
}
let mut pos = 0;
while let Some(name) = next_cstr(buf, pos) {
if !name.starts_with(USER_VIRTIOFS_XATTR) {
pos += name.len();
continue;
}
let newlen = name.len() - USER_VIRTIOFS_XATTR.len();
buf.drain(pos..pos + USER_VIRTIOFS_XATTR.len());
pos += newlen;
}
}
impl FileSystem for PassthroughFs {
type Inode = Inode;
type Handle = Handle;
type DirIter = ReadDir<Box<[u8]>>;
/// Opens the root directory, registers it as `ROOT_ID`, clears the process umask and
/// returns the FUSE options this server wants to use.
///
/// Write-back caching and the zero-message open/opendir features are only enabled when the
/// configuration asks for them and `capable` shows that the client supports them.
fn init(&self, capable: FsOptions) -> io::Result<FsOptions> {
    // SAFETY: this is a constant value that is a nul-terminated string without interior
    // nul bytes.
    let root = unsafe { CStr::from_bytes_with_nul_unchecked(ROOT_CSTR) };
    let flags = libc::O_DIRECTORY | libc::O_NOFOLLOW | libc::O_CLOEXEC;
    // SAFETY: this doesn't modify any memory and we check the return value.
    let root_fd = unsafe { libc::openat64(libc::AT_FDCWD, root.as_ptr(), flags) };
    if root_fd < 0 {
        return Err(io::Error::last_os_error());
    }
    // SAFETY: safe because we just opened this descriptor above.
    let root_dir = unsafe { File::from_raw_descriptor(root_fd) };
    let st = stat(&root_dir)?;

    // SAFETY: this doesn't modify any memory and there is no need to check the return
    // value because this system call always succeeds. We need to clear the umask here because
    // we want the client to be able to set all the bits in the mode.
    unsafe { libc::umask(0o000) };

    let mut inodes = self.inodes.lock();
    let root_key = InodeAltKey {
        ino: st.st_ino,
        dev: st.st_dev,
    };
    // Not sure why the root inode gets a refcount of 2 but that's what libfuse does.
    let root_data = InodeData {
        inode: ROOT_ID,
        file: Mutex::new((root_dir, flags)),
        refcount: AtomicU64::new(2),
        filetype: st.st_mode.into(),
        path: "".to_string(),
    };
    inodes.insert(ROOT_ID, root_key, Arc::new(root_data));

    let mut opts = FsOptions::DO_READDIRPLUS
        | FsOptions::READDIRPLUS_AUTO
        | FsOptions::EXPORT_SUPPORT
        | FsOptions::DONT_MASK
        | FsOptions::CACHE_SYMLINKS
        | FsOptions::SECURITY_CONTEXT;
    if self.cfg.posix_acl {
        opts |= FsOptions::POSIX_ACL;
    }
    if self.cfg.writeback && capable.contains(FsOptions::WRITEBACK_CACHE) {
        opts |= FsOptions::WRITEBACK_CACHE;
        self.writeback.store(true, Ordering::Relaxed);
    }
    if self.cfg.cache_policy == CachePolicy::Always {
        // Each zero-message feature is negotiated on its own and remembered in its own flag.
        let zero_message = [
            (FsOptions::ZERO_MESSAGE_OPEN, &self.zero_message_open),
            (FsOptions::ZERO_MESSAGE_OPENDIR, &self.zero_message_opendir),
        ];
        for &(feature, enabled) in zero_message.iter() {
            if capable.contains(feature) {
                opts |= feature;
                enabled.store(true, Ordering::Relaxed);
            }
        }
    }
    Ok(opts)
}
/// Forgets every open handle and then every known inode.
fn destroy(&self) {
    cros_tracing::trace_simple_print!(VirtioFs, "{:?}: destroy", self);
    {
        let mut open_handles = self.handles.lock();
        open_handles.clear();
    }
    {
        let mut known_inodes = self.inodes.lock();
        known_inodes.clear();
    }
}
/// Returns the `fstatvfs64` result for the descriptor of `inode`.
fn statfs(&self, _ctx: Context, inode: Inode) -> io::Result<libc::statvfs64> {
    let _trace = fs_trace!(self.tag, "statfs", inode);
    let data = self.find_inode(inode)?;
    let mut stats = MaybeUninit::<libc::statvfs64>::zeroed();
    let fd = data.as_raw_descriptor();
    // SAFETY: this will only modify `stats` and we check the return value.
    syscall!(unsafe { libc::fstatvfs64(fd, stats.as_mut_ptr()) })?;
    // SAFETY: the kernel guarantees that `stats` has been initialized.
    let stats = unsafe { stats.assume_init() };
    Ok(stats)
}
/// Looks up `name` below `parent`, with the case-folding fallback when it is enabled.
///
/// When a negative timeout is configured, a "not found" error is turned into a successful
/// negative entry so the client can cache the miss.
fn lookup(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<Entry> {
    let data = self.find_inode(parent)?;
    // Only used for tracing.
    #[allow(unused_variables)]
    let path = format!(
        "{}/{}",
        data.path,
        name.to_str().unwrap_or("<non UTF-8 path>")
    );
    let _trace = fs_trace!(self.tag, "lookup", parent, path);

    let negative_timeout = self.cfg.negative_timeout;
    match self.do_lookup_with_casefold_fallback(&data, name) {
        // FUSE takes a inode=0 as a request to do negative dentry cache.
        // So, if `negative_timeout` is set, return success with the timeout value and
        // inode=0 as a response.
        Err(e) if e.kind() == std::io::ErrorKind::NotFound && !negative_timeout.is_zero() => {
            Ok(Entry::new_negative(negative_timeout))
        }
        outcome => outcome,
    }
}
/// Drops `count` lookup references from `inode`. When that removes the inode, it is also
/// evicted from the case-folding lookup caches (if those are present).
fn forget(&self, _ctx: Context, inode: Inode, count: u64) {
    let _trace = fs_trace!(self.tag, "forget", inode, count);
    let mut inodes = self.inodes.lock();
    let caches = self.lock_casefold_lookup_caches();
    let removed = forget_one(&mut inodes, inode, count);
    if !removed {
        return;
    }
    if let Some(mut cache) = caches {
        cache.forget(inode);
    }
}
/// Applies every `(inode, count)` forget request in order while holding the inode map lock
/// and the case-folding cache lock for the whole batch.
fn batch_forget(&self, _ctx: Context, requests: Vec<(Inode, u64)>) {
    let mut inodes = self.inodes.lock();
    let mut caches = self.lock_casefold_lookup_caches();
    requests
        .into_iter()
        // Keep only the requests that actually removed their inode.
        .filter(|&(inode, count)| forget_one(&mut inodes, inode, count))
        .for_each(|(inode, _)| {
            if let Some(cache) = caches.as_mut() {
                cache.forget(inode);
            }
        });
}
fn opendir(
&self,
_ctx: Context,
inode: Inode,
flags: u32,
) -> io::Result<(Option<Handle>, OpenOptions)> {
let _trace = fs_trace!(self.tag, "opendir", inode, flags);
if self.zero_message_opendir.load(Ordering::Relaxed) {
Err(io::Error::from_raw_os_error(libc::ENOSYS))
} else {
self.do_open(inode, flags | (libc::O_DIRECTORY as u32))
}
}
/// Releases a directory handle. With zero-message opendir negotiated there are no directory
/// handles, so the call succeeds without doing anything.
fn releasedir(
    &self,
    _ctx: Context,
    inode: Inode,
    _flags: u32,
    handle: Handle,
) -> io::Result<()> {
    let _trace = fs_trace!(self.tag, "releasedir", inode, handle);
    if self.zero_message_opendir.load(Ordering::Relaxed) {
        return Ok(());
    }
    self.do_release(inode, handle)
}
/// Creates directory `name` below `parent` with the caller's credentials, `umask` and
/// (unless it is the "unlabeled" marker) security context, then looks the new directory up.
fn mkdir(
    &self,
    ctx: Context,
    parent: Inode,
    name: &CStr,
    mode: u32,
    umask: u32,
    security_ctx: Option<&CStr>,
) -> io::Result<Entry> {
    let _trace = fs_trace!(self.tag, "mkdir", parent, name, mode, umask, security_ctx);
    let data = self.find_inode(parent)?;
    // The scoped guards below stay in place until this function returns.
    let _ctx = security_ctx
        .filter(|label| label.to_bytes_with_nul() != UNLABELED_CSTR)
        .map(|label| ScopedSecurityContext::new(&self.proc, label))
        .transpose()?;
    let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
    {
        // Hold the case-folding cache lock across the syscall and the cache update.
        let casefold_cache = self.lock_casefold_lookup_caches();
        let _scoped_umask = ScopedUmask::new(umask);
        // SAFETY: this doesn't modify any memory and we check the return value.
        syscall!(unsafe { libc::mkdirat(data.as_raw_descriptor(), name.as_ptr(), mode) })?;
        if let Some(mut cache) = casefold_cache {
            cache.insert(data.inode, name);
        }
    }
    self.do_lookup(&data, name)
}
/// Removes directory `name` below `parent` and drops it from the case-folding cache.
fn rmdir(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> {
    let _trace = fs_trace!(self.tag, "rmdir", parent, name);
    let data = self.find_inode(parent)?;
    // Taken before the unlink so the removal and the cache update happen under one lock.
    let casefold_cache = self.lock_casefold_lookup_caches();
    // TODO(b/278691962): If ascii_casefold is enabled, we need to call
    // `get_case_unfolded_name()` to get the actual name to be unlinked.
    self.do_unlink(&data, name, libc::AT_REMOVEDIR)?;
    if let Some(mut cache) = casefold_cache {
        cache.remove(data.inode, name);
    }
    Ok(())
}
/// Creates a directory iterator that starts at `offset` and uses a zeroed buffer of `size`
/// bytes. The directory is read through the handle, or through the inode's own descriptor
/// when zero-message opendir is active.
fn readdir(
    &self,
    _ctx: Context,
    inode: Inode,
    handle: Handle,
    size: u32,
    offset: u64,
) -> io::Result<Self::DirIter> {
    let _trace = fs_trace!(self.tag, "readdir", inode, handle, size, offset);
    let buf = vec![0; size as usize].into_boxed_slice();
    let start = offset as libc::off64_t;
    if !self.zero_message_opendir.load(Ordering::Relaxed) {
        let data = self.find_handle(handle, inode)?;
        let dir = data.file.lock();
        return ReadDir::new(&*dir, start, buf);
    }
    let data = self.find_inode(inode)?;
    ReadDir::new(&*data, start, buf)
}
/// Opens a handle for `inode`. With zero-message open negotiated, `ENOSYS` is returned
/// instead and no handle is created.
fn open(
    &self,
    _ctx: Context,
    inode: Inode,
    flags: u32,
) -> io::Result<(Option<Handle>, OpenOptions)> {
    if !self.zero_message_open.load(Ordering::Relaxed) {
        let _trace = fs_trace!(self.tag, "open", inode, flags);
        return self.do_open(inode, flags);
    }
    let _trace = fs_trace!(self.tag, "open (zero-message)", inode, flags);
    Err(io::Error::from_raw_os_error(libc::ENOSYS))
}
/// Releases a file handle. With zero-message open negotiated there are no file handles, so
/// the call succeeds without doing anything.
fn release(
    &self,
    _ctx: Context,
    inode: Inode,
    _flags: u32,
    handle: Handle,
    _flush: bool,
    _flock_release: bool,
    _lock_owner: Option<u64>,
) -> io::Result<()> {
    if !self.zero_message_open.load(Ordering::Relaxed) {
        let _trace = fs_trace!(self.tag, "release", inode, handle);
        return self.do_release(inode, handle);
    }
    let _trace = fs_trace!(self.tag, "release (zero-message)", inode, handle);
    Ok(())
}
/// Creates an anonymous (`O_TMPFILE`) file inside `parent` with the caller's credentials,
/// `umask` and (unless it is the "unlabeled" marker) security context, and registers it as a
/// new entry.
fn chromeos_tmpfile(
    &self,
    ctx: Context,
    parent: Self::Inode,
    mode: u32,
    umask: u32,
    security_ctx: Option<&CStr>,
) -> io::Result<Entry> {
    let _trace = fs_trace!(
        self.tag,
        "chromeos_tempfile",
        parent,
        mode,
        umask,
        security_ctx
    );
    let data = self.find_inode(parent)?;
    // The scoped guards below stay in place until this function returns.
    let _ctx = security_ctx
        .filter(|label| label.to_bytes_with_nul() != UNLABELED_CSTR)
        .map(|label| ScopedSecurityContext::new(&self.proc, label))
        .transpose()?;
    let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;

    let tmpflags = libc::O_RDWR | libc::O_TMPFILE | libc::O_CLOEXEC | libc::O_NOFOLLOW;
    // SAFETY: This string is nul-terminated and does not contain any interior nul bytes
    let current_dir = unsafe { CStr::from_bytes_with_nul_unchecked(b".\0") };
    let raw = {
        // The umask override only needs to cover the open call itself.
        let _scoped_umask = ScopedUmask::new(umask);
        // SAFETY: this doesn't modify any memory and we check the return value.
        syscall!(unsafe {
            libc::openat64(
                data.as_raw_descriptor(),
                current_dir.as_ptr(),
                tmpflags,
                mode,
            )
        })?
    };
    // The file is anonymous, so there is nothing to add to the case-folding cache.
    // SAFETY: safe because we just opened this fd.
    let tmpfile = unsafe { File::from_raw_descriptor(raw) };
    let st = stat(&tmpfile)?;
    let dir_name = current_dir.to_str().unwrap_or("<non UTF-8 str>");
    let path = format!("{}/{}", data.path, dir_name);
    Ok(self.add_entry(tmpfile, st, tmpflags, path))
}
/// Creates (or opens, depending on `flags`) file `name` below `parent` with the caller's
/// credentials, `umask` and (unless it is the "unlabeled" marker) security context.
///
/// Returns the new entry plus, unless zero-message open is active, a handle opened with the
/// caller's flags minus `O_CREAT`, `O_EXCL` and `O_NOCTTY`. If opening that handle fails, the
/// entry's lookup reference is dropped again before the error is returned.
fn create(
    &self,
    ctx: Context,
    parent: Inode,
    name: &CStr,
    mode: u32,
    flags: u32,
    umask: u32,
    security_ctx: Option<&CStr>,
) -> io::Result<(Entry, Option<Handle>, OpenOptions)> {
    let _trace = fs_trace!(
        self.tag,
        "create",
        parent,
        name,
        mode,
        flags,
        umask,
        security_ctx
    );
    let data = self.find_inode(parent)?;
    // The scoped guards below stay in place until this function returns.
    let _ctx = security_ctx
        .filter(|label| label.to_bytes_with_nul() != UNLABELED_CSTR)
        .map(|label| ScopedSecurityContext::new(&self.proc, label))
        .transpose()?;
    let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;

    let create_flags =
        (flags as i32 | libc::O_CREAT | libc::O_CLOEXEC | libc::O_NOFOLLOW) & !libc::O_DIRECT;
    let raw = {
        let _scoped_umask = ScopedUmask::new(umask);
        let casefold_cache = self.lock_casefold_lookup_caches();
        // SAFETY: this doesn't modify any memory and we check the return value. We don't really
        // check `flags` because if the kernel can't handle poorly specified flags then we have
        // much bigger problems.
        // TODO(b/278691962): If ascii_casefold is enabled, we need to call
        // `get_case_unfolded_name()` to get the actual name to be created.
        let raw = syscall!(unsafe {
            libc::openat64(data.as_raw_descriptor(), name.as_ptr(), create_flags, mode)
        })?;
        if let Some(mut cache) = casefold_cache {
            cache.insert(parent, name);
        }
        raw
    };
    // SAFETY: safe because we just opened this fd.
    let file = unsafe { File::from_raw_descriptor(raw) };
    let st = stat(&file)?;
    let path = format!(
        "{}/{}",
        data.path,
        name.to_str().unwrap_or("<non UTF-8 str>")
    );
    let entry = self.add_entry(file, st, create_flags, path);

    if self.zero_message_open.load(Ordering::Relaxed) {
        return Ok((entry, None, OpenOptions::KEEP_CACHE));
    }
    // The file exists by now, so the creation-only flags must not be passed again.
    let open_flags = flags & !((libc::O_CREAT | libc::O_EXCL | libc::O_NOCTTY) as u32);
    match self.do_open_at(data, name, entry.inode, open_flags) {
        Ok((handle, opts)) => Ok((entry, handle, opts)),
        Err(e) => {
            // Don't leak the entry.
            self.forget(ctx, entry.inode, 1);
            Err(e)
        }
    }
}
/// Removes file `name` below `parent` and drops it from the case-folding cache.
fn unlink(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> {
    let _trace = fs_trace!(self.tag, "unlink", parent, name);
    let data = self.find_inode(parent)?;
    // Taken before the unlink so the removal and the cache update happen under one lock.
    let casefold_cache = self.lock_casefold_lookup_caches();
    // TODO(b/278691962): If ascii_casefold is enabled, we need to call
    // `get_case_unfolded_name()` to get the actual name to be unlinked.
    self.do_unlink(&data, name, 0)?;
    if let Some(mut cache) = casefold_cache {
        cache.remove(data.inode, name);
    }
    Ok(())
}
/// Copies up to `size` bytes starting at `offset` from the file into `w`.
///
/// With handles, the handle's file is used. With zero-message open, the inode's own file is
/// used; if that file is currently write-only it is first re-opened read-write and the
/// stored flags are updated to match.
///
/// # Panics
///
/// Panics in zero-message mode if the stored access mode is none of `O_RDONLY`, `O_WRONLY`
/// or `O_RDWR`.
fn read<W: io::Write + ZeroCopyWriter>(
    &self,
    _ctx: Context,
    inode: Inode,
    handle: Handle,
    mut w: W,
    size: u32,
    offset: u64,
    _lock_owner: Option<u64>,
    _flags: u32,
) -> io::Result<usize> {
    if !self.zero_message_open.load(Ordering::Relaxed) {
        let _trace = fs_trace!(self.tag, "read", inode, handle, size, offset);
        let data = self.find_handle(handle, inode)?;
        let mut f = data.file.lock();
        return w.write_from(&mut f, size as usize, offset);
    }

    let _trace = fs_trace!(self.tag, "read (zero-message)", inode, handle, size, offset);
    let data = self.find_inode(inode)?;
    let mut file = data.file.lock();
    let open_flags = file.1;
    match open_flags & libc::O_ACCMODE {
        libc::O_RDONLY | libc::O_RDWR => {}
        libc::O_WRONLY => {
            // We need to get a readable handle for this file.
            let upgraded = (open_flags & !libc::O_WRONLY) | libc::O_RDWR;
            let newfile = self.open_fd(file.0.as_raw_descriptor(), libc::O_RDWR)?;
            *file = (newfile, upgraded);
        }
        _ => panic!("Unexpected flags: {:#x}", open_flags),
    }
    w.write_from(&mut file.0, size as usize, offset)
}
/// Copies up to `size` bytes from `r` into the file at `offset`.
///
/// With handles, the handle's file is used. With zero-message open, the inode's own file is
/// used; if that file is currently read-only it is first re-opened read-write and the stored
/// flags are updated to match.
///
/// # Panics
///
/// Panics in zero-message mode if the stored access mode is none of `O_RDONLY`, `O_WRONLY`
/// or `O_RDWR`.
fn write<R: io::Read + ZeroCopyReader>(
    &self,
    _ctx: Context,
    inode: Inode,
    handle: Handle,
    mut r: R,
    size: u32,
    offset: u64,
    _lock_owner: Option<u64>,
    _delayed_write: bool,
    flags: u32,
) -> io::Result<usize> {
    // When the WRITE_KILL_PRIV flag is set, drop CAP_FSETID so that the kernel will
    // automatically clear the setuid and setgid bits for us. The guard must stay alive until
    // the write is done.
    let _fsetid = match flags & WRITE_KILL_PRIV {
        0 => None,
        _ => Some(drop_cap_fsetid()?),
    };

    if !self.zero_message_open.load(Ordering::Relaxed) {
        let _trace = fs_trace!(self.tag, "write", inode, handle, size, offset);
        let data = self.find_handle(handle, inode)?;
        let mut f = data.file.lock();
        return r.read_to(&mut f, size as usize, offset);
    }

    let _trace = fs_trace!(
        self.tag,
        "write (zero-message)",
        inode,
        handle,
        size,
        offset
    );
    let data = self.find_inode(inode)?;
    let mut file = data.file.lock();
    let open_flags = file.1;
    match open_flags & libc::O_ACCMODE {
        libc::O_WRONLY | libc::O_RDWR => {}
        libc::O_RDONLY => {
            // We need to get a writable handle for this file.
            let upgraded = (open_flags & !libc::O_RDONLY) | libc::O_RDWR;
            let newfile = self.open_fd(file.0.as_raw_descriptor(), libc::O_RDWR)?;
            *file = (newfile, upgraded);
        }
        _ => panic!("Unexpected flags: {:#x}", open_flags),
    }
    r.read_to(&mut file.0, size as usize, offset)
}
/// Returns the attributes of `inode` together with their validity timeout.
///
/// The optional handle is only recorded in the trace; the lookup always goes through the
/// inode.
fn getattr(
    &self,
    _ctx: Context,
    inode: Inode,
    _handle: Option<Handle>,
) -> io::Result<(libc::stat64, Duration)> {
    let _trace = fs_trace!(self.tag, "getattr", inode, _handle);
    self.find_inode(inode)
        .and_then(|inode_data| self.do_getattr(&inode_data))
}
/// Applies the attribute changes selected by `valid` to `inode` and returns the resulting
/// attributes.
///
/// Each selected group (mode, owner/group, size, access/modification times) is applied with
/// its own syscall, in that order; the first failing syscall aborts the operation. When a
/// non-zero `handle` is supplied its descriptor is used where possible, otherwise the file is
/// addressed through a `self/fd/<n>` path relative to `self.proc`.
fn setattr(
&self,
_ctx: Context,
inode: Inode,
attr: libc::stat64,
handle: Option<Handle>,
valid: SetattrValid,
) -> io::Result<(libc::stat64, Duration)> {
let _trace = fs_trace!(self.tag, "setattr", inode, handle);
let inode_data = self.find_inode(inode)?;
// How the target file is addressed by the syscalls below.
enum Data {
// The `Arc` is carried along with the raw descriptor taken from it.
Handle(Arc<HandleData>, RawDescriptor),
// Path of the form `self/fd/<n>`, resolved relative to `self.proc`.
ProcPath(CString),
}
// If we have a handle then use it otherwise get a new fd from the inode.
// A handle value of 0 is treated the same as no handle.
let data = if let Some(handle) = handle.filter(|&h| h != 0) {
let hd = self.find_handle(handle, inode)?;
let fd = hd.file.lock().as_raw_descriptor();
Data::Handle(hd, fd)
} else {
let pathname = CString::new(format!("self/fd/{}", inode_data.as_raw_descriptor()))
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
Data::ProcPath(pathname)
};
if valid.contains(SetattrValid::MODE) {
// SAFETY: this doesn't modify any memory and we check the return value.
syscall!(unsafe {
match data {
Data::Handle(_, fd) => libc::fchmod(fd, attr.st_mode),
Data::ProcPath(ref p) => {
libc::fchmodat(self.proc.as_raw_descriptor(), p.as_ptr(), attr.st_mode, 0)
}
}
})?;
}
if valid.intersects(SetattrValid::UID | SetattrValid::GID) {
// An id that is not being changed is passed as `u32::MAX`.
let uid = if valid.contains(SetattrValid::UID) {
attr.st_uid
} else {
// Cannot use -1 here because these are unsigned values.
::std::u32::MAX
};
let gid = if valid.contains(SetattrValid::GID) {
attr.st_gid
} else {
// Cannot use -1 here because these are unsigned values.
::std::u32::MAX
};
// SAFETY: this is a constant value that is a nul-terminated string without interior
// nul bytes.
let empty = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) };
// Ownership is always changed through the inode's descriptor with an empty path.
// SAFETY: this doesn't modify any memory and we check the return value.
syscall!(unsafe {
libc::fchownat(
inode_data.as_raw_descriptor(),
empty.as_ptr(),
uid,
gid,
libc::AT_EMPTY_PATH | libc::AT_SYMLINK_NOFOLLOW,
)
})?;
}
if valid.contains(SetattrValid::SIZE) {
syscall!(match data {
Data::Handle(_, fd) => {
// SAFETY: this doesn't modify any memory and we check the return value.
unsafe { libc::ftruncate64(fd, attr.st_size) }
}
_ => {
// There is no `ftruncateat` so we need to get a new fd and truncate it.
let f = self.open_inode(&inode_data, libc::O_NONBLOCK | libc::O_RDWR)?;
// SAFETY: this doesn't modify any memory and we check the return value.
unsafe { libc::ftruncate64(f.as_raw_descriptor(), attr.st_size) }
}
})?;
}
if valid.intersects(SetattrValid::ATIME | SetattrValid::MTIME) {
// tvs[0] is the access time and tvs[1] the modification time; both start out as
// UTIME_OMIT and are only filled in when the corresponding flag is set.
let mut tvs = [
libc::timespec {
tv_sec: 0,
tv_nsec: libc::UTIME_OMIT,
},
libc::timespec {
tv_sec: 0,
tv_nsec: libc::UTIME_OMIT,
},
];
// The *_NOW flags take precedence over the explicit timestamps in `attr`.
if valid.contains(SetattrValid::ATIME_NOW) {
tvs[0].tv_nsec = libc::UTIME_NOW;
} else if valid.contains(SetattrValid::ATIME) {
tvs[0].tv_sec = attr.st_atime;
tvs[0].tv_nsec = attr.st_atime_nsec;
}
if valid.contains(SetattrValid::MTIME_NOW) {
tvs[1].tv_nsec = libc::UTIME_NOW;
} else if valid.contains(SetattrValid::MTIME) {
tvs[1].tv_sec = attr.st_mtime;
tvs[1].tv_nsec = attr.st_mtime_nsec;
}
// SAFETY: this doesn't modify any memory and we check the return value.
syscall!(unsafe {
match data {
Data::Handle(_, fd) => libc::futimens(fd, tvs.as_ptr()),
Data::ProcPath(ref p) => {
libc::utimensat(self.proc.as_raw_descriptor(), p.as_ptr(), tvs.as_ptr(), 0)
}
}
})?;
}
// Report the attributes as they are after all requested changes were applied.
self.do_getattr(&inode_data)
}
/// Renames `oldname` in directory `olddir` to `newname` in directory `newdir` using the
/// `renameat2` syscall with the given `flags`.
///
/// The casefold lookup cache (if any) is locked before the syscall is issued and updated
/// only after the syscall succeeded.
fn rename(
&self,
_ctx: Context,
olddir: Inode,
oldname: &CStr,
newdir: Inode,
newname: &CStr,
flags: u32,
) -> io::Result<()> {
let _trace = fs_trace!(self.tag, "rename", olddir, oldname, newdir, newname, flags);
let old_inode = self.find_inode(olddir)?;
let new_inode = self.find_inode(newdir)?;
{
// Taken before the syscall and held until the cache update below is done.
let casefold_cache = self.lock_casefold_lookup_caches();
// SAFETY: this doesn't modify any memory and we check the return value.
// TODO: Switch to libc::renameat2 once https://github.com/rust-lang/libc/pull/1508 lands
// and we have glibc 2.28.
syscall!(unsafe {
libc::syscall(
libc::SYS_renameat2,
old_inode.as_raw_descriptor(),
oldname.as_ptr(),
new_inode.as_raw_descriptor(),
newname.as_ptr(),
flags,
)
})?;
// Mirror the rename in the cache: the old name is dropped and the new one recorded.
if let Some(mut c) = casefold_cache {
c.remove(olddir, oldname);
c.insert(newdir, newname);
}
}
Ok(())
}
/// Creates a filesystem node called `name` in directory `parent` via `mknodat` and returns
/// the entry obtained by looking the new node up.
///
/// `security_ctx` is ignored when it equals `UNLABELED_CSTR`. The node is created after
/// `set_creds(ctx.uid, ctx.gid)` and while a `ScopedUmask` for `umask` is alive.
fn mknod(
&self,
ctx: Context,
parent: Inode,
name: &CStr,
mode: u32,
rdev: u32,
umask: u32,
security_ctx: Option<&CStr>,
) -> io::Result<Entry> {
let _trace = fs_trace!(
self.tag,
"mknod",
parent,
name,
mode,
rdev,
umask,
security_ctx
);
let data = self.find_inode(parent)?;
// NOTE(review): presumably the scoped value applies the security context until it is
// dropped at the end of this function; confirm against `ScopedSecurityContext`.
let _ctx = security_ctx
.filter(|ctx| ctx.to_bytes_with_nul() != UNLABELED_CSTR)
.map(|ctx| ScopedSecurityContext::new(&self.proc, ctx))
.transpose()?;
// NOTE(review): the returned values are kept alive until the end of the function,
// presumably to restore the previous credentials on drop; confirm against `set_creds`.
let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
{
// The umask guard and the cache lock only cover the node creation itself.
let _scoped_umask = ScopedUmask::new(umask);
let casefold_cache = self.lock_casefold_lookup_caches();
// SAFETY: this doesn't modify any memory and we check the return value.
syscall!(unsafe {
libc::mknodat(
data.as_raw_descriptor(),
name.as_ptr(),
mode as libc::mode_t,
rdev as libc::dev_t,
)
})?;
// Record the new name only after the node was created successfully.
if let Some(mut c) = casefold_cache {
c.insert(parent, name);
}
}
self.do_lookup(&data, name)
}
fn link(
&self,
_ctx: Context,
inode: Inode,
newparent: Inode,
newname: &CStr,
) -> io::Result<Entry> {
let _trace = fs_trace!(self.tag, "link", inode, newparent, newname);
let data = self.find_inode(inode)?;
let new_inode = self.find_inode(newparent)?;
let path = CString::new(format!("self/fd/{}", data.as_raw_descriptor()))
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
{
let casefold_cache = self.lock_casefold_lookup_caches();
// SAFETY: this doesn't modify any memory and we check the return value.
syscall!(unsafe {
libc::linkat(
self.proc.as_raw_descriptor(),
path.as_ptr(),
new_inode.as_raw_descriptor(),
newname.as_ptr(),
libc::AT_SYMLINK_FOLLOW,
)
})?;
if let Some(mut c) = casefold_cache {
c.insert(newparent, newname);
}
}
self.do_lookup(&new_inode, newname)
}
/// Creates a symbolic link called `name` in directory `parent` whose target is `linkname`,
/// then returns the entry obtained by looking the new link up.
///
/// `security_ctx` is ignored when it equals `UNLABELED_CSTR`. The link is created after
/// `set_creds(ctx.uid, ctx.gid)`.
fn symlink(
&self,
ctx: Context,
linkname: &CStr,
parent: Inode,
name: &CStr,
security_ctx: Option<&CStr>,
) -> io::Result<Entry> {
let _trace = fs_trace!(self.tag, "symlink", parent, linkname, name, security_ctx);
let data = self.find_inode(parent)?;
// NOTE(review): presumably the scoped value applies the security context until it is
// dropped at the end of this function; confirm against `ScopedSecurityContext`.
let _ctx = security_ctx
.filter(|ctx| ctx.to_bytes_with_nul() != UNLABELED_CSTR)
.map(|ctx| ScopedSecurityContext::new(&self.proc, ctx))
.transpose()?;
// NOTE(review): the returned values are kept alive until the end of the function,
// presumably to restore the previous credentials on drop; confirm against `set_creds`.
let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
{
// Taken before the syscall and released when this scope ends.
let casefold_cache = self.lock_casefold_lookup_caches();
// SAFETY: this doesn't modify any memory and we check the return value.
syscall!(unsafe {
libc::symlinkat(linkname.as_ptr(), data.as_raw_descriptor(), name.as_ptr())
})?;
// Record the new name only after the link was created successfully.
if let Some(mut c) = casefold_cache {
c.insert(parent, name);
}
}
self.do_lookup(&data, name)
}
/// Returns the target of the symbolic link `inode` as raw bytes (no trailing nul).
fn readlink(&self, _ctx: Context, inode: Inode) -> io::Result<Vec<u8>> {
    let _trace = fs_trace!(self.tag, "readlink", inode);
    let link = self.find_inode(inode)?;
    // A PATH_MAX-sized scratch buffer; it is cut down to the real length afterwards.
    let mut target = vec![0u8; libc::PATH_MAX as usize];
    // SAFETY: this is a constant value that is a nul-terminated string without interior nul
    // bytes.
    let empty = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) };
    let out_ptr = target.as_mut_ptr() as *mut libc::c_char;
    let out_len = target.len();
    // SAFETY: this will only modify the contents of `target` and we check the return value.
    let written = syscall!(unsafe {
        libc::readlinkat(link.as_raw_descriptor(), empty.as_ptr(), out_ptr, out_len)
    })?;
    target.resize(written as usize, 0);
    Ok(target)
}
/// Emulates the guest closing a descriptor for the file by duplicating the backing
/// descriptor and closing the duplicate right away.
///
/// The inode's descriptor is used when zero-message open is enabled, the handle's otherwise.
fn flush(
    &self,
    _ctx: Context,
    inode: Inode,
    handle: Handle,
    _lock_owner: u64,
) -> io::Result<()> {
    let _trace = fs_trace!(self.tag, "flush", inode, handle);
    let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
        self.find_inode(inode)?
    } else {
        self.find_handle(handle, inode)?
    };
    let fd = data.as_raw_descriptor();
    // Since this method is called whenever an fd is closed in the client, we can emulate that
    // behavior by doing the same thing (dup-ing the fd and then immediately closing it).
    // SAFETY: this doesn't modify any memory and we check the return value.
    let duplicate = syscall!(unsafe { libc::fcntl(fd, libc::F_DUPFD_CLOEXEC, 0) })?;
    // SAFETY: `duplicate` was just returned by `fcntl` and is closed exactly once; the return
    // value is checked.
    syscall!(unsafe { libc::close(duplicate) })?;
    Ok(())
}
/// Synchronizes the file with storage via `do_fsync`, using the inode's descriptor when
/// zero-message open is enabled and the handle's descriptor otherwise.
fn fsync(&self, _ctx: Context, inode: Inode, datasync: bool, handle: Handle) -> io::Result<()> {
    if !self.zero_message_open.load(Ordering::Relaxed) {
        let _trace = fs_trace!(self.tag, "fsync", inode, datasync, handle);
        let handle_data = self.find_handle(handle, inode)?;
        // The handle's file stays locked for the duration of the sync.
        let guard = handle_data.file.lock();
        return self.do_fsync(&*guard, datasync);
    }

    let _trace = fs_trace!(self.tag, "fsync (zero-message)", inode, datasync, handle);
    let inode_data = self.find_inode(inode)?;
    self.do_fsync(&*inode_data, datasync)
}
/// Synchronizes a directory with storage via `do_fsync`, using the inode's descriptor when
/// zero-message opendir is enabled and the handle's descriptor otherwise.
fn fsyncdir(
    &self,
    _ctx: Context,
    inode: Inode,
    datasync: bool,
    handle: Handle,
) -> io::Result<()> {
    if !self.zero_message_opendir.load(Ordering::Relaxed) {
        let _trace = fs_trace!(self.tag, "fsyncdir", inode, datasync, handle);
        let handle_data = self.find_handle(handle, inode)?;
        // The handle's file stays locked for the duration of the sync.
        let guard = handle_data.file.lock();
        return self.do_fsync(&*guard, datasync);
    }

    let _trace = fs_trace!(self.tag, "fsyncdir (zero-message)", inode, datasync, handle);
    let inode_data = self.find_inode(inode)?;
    self.do_fsync(&*inode_data, datasync)
}
fn access(&self, ctx: Context, inode: Inode, mask: u32) -> io::Result<()> {
let _trace = fs_trace!(self.tag, "access", inode, mask);
let data = self.find_inode(inode)?;
let st = stat(&*data)?;
let mode = mask as i32 & (libc::R_OK | libc::W_OK | libc::X_OK);
if mode == libc::F_OK {
// The file exists since we were able to call `stat(2)` on it.
return Ok(());
}
if (mode & libc::R_OK) != 0 {
if ctx.uid != 0
&& (st.st_uid != ctx.uid || st.st_mode & 0o400 == 0)
&& (st.st_gid != ctx.gid || st.st_mode & 0o040 == 0)
&& st.st_mode & 0o004 == 0
{
return Err(io::Error::from_raw_os_error(libc::EACCES));
}
}
if (mode & libc::W_OK) != 0 {
if ctx.uid != 0
&& (st.st_uid != ctx.uid || st.st_mode & 0o200 == 0)
&& (st.st_gid != ctx.gid || st.st_mode & 0o020 == 0)
&& st.st_mode & 0o002 == 0
{
return Err(io::Error::from_raw_os_error(libc::EACCES));
}
}
// root can only execute something if it is executable by one of the owner, the group, or
// everyone.
if (mode & libc::X_OK) != 0 {
if (ctx.uid != 0 || st.st_mode & 0o111 == 0)
&& (st.st_uid != ctx.uid || st.st_mode & 0o100 == 0)
&& (st.st_gid != ctx.gid || st.st_mode & 0o010 == 0)
&& st.st_mode & 0o001 == 0
{
return Err(io::Error::from_raw_os_error(libc::EACCES));
}
}
Ok(())
}
/// Sets the extended attribute `name` of `inode` to `value`.
///
/// When security xattr rewriting is enabled, names starting with `USER_VIRTIOFS_XATTR` are
/// rejected with `EPERM`. The name is passed through `rewrite_xattr_name` before use.
fn setxattr(
&self,
_ctx: Context,
inode: Inode,
name: &CStr,
value: &[u8],
flags: u32,
) -> io::Result<()> {
let _trace = fs_trace!(self.tag, "setxattr", inode, name, flags);
// We can't allow the VM to set this xattr because an unprivileged process may use it to set
// a privileged xattr.
if self.cfg.rewrite_security_xattrs && name.to_bytes().starts_with(USER_VIRTIOFS_XATTR) {
return Err(io::Error::from_raw_os_error(libc::EPERM));
}
let data = self.find_inode(inode)?;
let name = self.rewrite_xattr_name(name);
// The inode's file stays locked until the attribute has been written.
let file = data.file.lock();
let o_path_file = (file.1 & libc::O_PATH) != 0;
if o_path_file {
// For FDs opened with `O_PATH`, we cannot call `fsetxattr` normally. Instead we emulate
// an _at syscall by changing the CWD to /proc, running the path based syscall, and then
// setting the CWD back to the root directory.
let path = CString::new(format!("self/fd/{}", file.0.as_raw_descriptor()))
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
syscall!(self.with_proc_chdir(|| {
// SAFETY: this doesn't modify any memory and we check the return value.
unsafe {
libc::setxattr(
path.as_ptr(),
name.as_ptr(),
value.as_ptr() as *const libc::c_void,
value.len() as libc::size_t,
flags as c_int,
)
}
}))?;
} else {
syscall!(
// For regular files and directories, we can just use fsetxattr.
// SAFETY: this doesn't modify any memory and we check the return value.
unsafe {
libc::fsetxattr(
file.0.as_raw_descriptor(),
name.as_ptr(),
value.as_ptr() as *const libc::c_void,
value.len() as libc::size_t,
flags as c_int,
)
}
)?;
}
Ok(())
}
/// Reads the extended attribute `name` of `inode`.
///
/// With `size == 0` only the length reported by `do_getxattr` is returned
/// (`GetxattrReply::Count`); otherwise the value itself is returned, read into a buffer of
/// `size` bytes. When security xattr rewriting is enabled, names starting with
/// `USER_VIRTIOFS_XATTR` are reported as `ENODATA`.
fn getxattr(
    &self,
    _ctx: Context,
    inode: Inode,
    name: &CStr,
    size: u32,
) -> io::Result<GetxattrReply> {
    let _trace = fs_trace!(self.tag, "getxattr", inode, name, size);
    // We don't allow the VM to set this xattr so we also pretend there is no value associated
    // with it.
    let hidden =
        self.cfg.rewrite_security_xattrs && name.to_bytes().starts_with(USER_VIRTIOFS_XATTR);
    if hidden {
        return Err(io::Error::from_raw_os_error(libc::ENODATA));
    }

    let data = self.find_inode(inode)?;
    let name = self.rewrite_xattr_name(name);
    let mut value = vec![0u8; size as usize];
    let len = self.do_getxattr(&data, &name, &mut value[..])?;
    match size {
        0 => Ok(GetxattrReply::Count(len as u32)),
        _ => {
            // Keep only the bytes that were actually filled in.
            value.truncate(len);
            Ok(GetxattrReply::Value(value))
        }
    }
}
/// Lists the extended attribute names of `inode`.
///
/// With `size == 0` only the syscall's return value is reported (`ListxattrReply::Count`);
/// otherwise the names are read into a buffer of `size` bytes and returned. When security
/// xattr rewriting is enabled the returned names are passed through `strip_xattr_prefix`.
fn listxattr(&self, _ctx: Context, inode: Inode, size: u32) -> io::Result<ListxattrReply> {
let _trace = fs_trace!(self.tag, "listxattr", inode, size);
let data = self.find_inode(inode)?;
let mut buf = vec![0u8; size as usize];
// The inode's file stays locked until the names have been read.
let file = data.file.lock();
let o_path_file = (file.1 & libc::O_PATH) != 0;
let res = if o_path_file {
// For FDs opened with `O_PATH`, we cannot call `flistxattr` normally. Instead we
// emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
// and then setting the CWD back to the root directory.
let path = CString::new(format!("self/fd/{}", file.0.as_raw_descriptor()))
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
// SAFETY: this will only modify `buf` and we check the return value.
syscall!(self.with_proc_chdir(|| unsafe {
libc::listxattr(
path.as_ptr(),
buf.as_mut_ptr() as *mut libc::c_char,
buf.len() as libc::size_t,
)
}))?
} else {
// For regular files and directories, we can just flistxattr.
// SAFETY: this will only write to `buf` and we check the return value.
syscall!(unsafe {
libc::flistxattr(
file.0.as_raw_descriptor(),
buf.as_mut_ptr() as *mut libc::c_char,
buf.len() as libc::size_t,
)
})?
};
if size == 0 {
Ok(ListxattrReply::Count(res as u32))
} else {
// Keep only the bytes the syscall reported as written.
buf.truncate(res as usize);
if self.cfg.rewrite_security_xattrs {
strip_xattr_prefix(&mut buf);
}
Ok(ListxattrReply::Names(buf))
}
}
/// Removes the extended attribute `name` from `inode`.
///
/// When security xattr rewriting is enabled, names starting with `USER_VIRTIOFS_XATTR` are
/// reported as `ENODATA`. The name is passed through `rewrite_xattr_name` before use.
fn removexattr(&self, _ctx: Context, inode: Inode, name: &CStr) -> io::Result<()> {
let _trace = fs_trace!(self.tag, "removexattr", inode, name);
// We don't allow the VM to set this xattr so we also pretend there is no value associated
// with it.
if self.cfg.rewrite_security_xattrs && name.to_bytes().starts_with(USER_VIRTIOFS_XATTR) {
return Err(io::Error::from_raw_os_error(libc::ENODATA));
}
let data = self.find_inode(inode)?;
let name = self.rewrite_xattr_name(name);
// The inode's file stays locked until the attribute has been removed.
let file = data.file.lock();
let o_path_file = (file.1 & libc::O_PATH) != 0;
if o_path_file {
// For files opened with `O_PATH`, we cannot call `fremovexattr` normally. Instead we
// emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
// and then setting the CWD back to the root directory.
let path = CString::new(format!("self/fd/{}", file.0.as_raw_descriptor()))
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
syscall!(self.with_proc_chdir(||
// SAFETY: this doesn't modify any memory and we check the return value.
unsafe { libc::removexattr(path.as_ptr(), name.as_ptr()) }))?;
} else {
// For regular files and directories, we can just use fremovexattr.
syscall!(
// SAFETY: this doesn't modify any memory and we check the return value.
unsafe { libc::fremovexattr(file.0.as_raw_descriptor(), name.as_ptr()) }
)?;
}
Ok(())
}
/// Calls `fallocate64` with `mode`, `offset` and `length` on the file.
///
/// When zero-message open is enabled the inode's own descriptor is used (re-opened as
/// `O_RDWR` first if it is currently read-only); otherwise the descriptor that belongs to
/// `handle` is used.
fn fallocate(
&self,
_ctx: Context,
inode: Inode,
handle: Handle,
mode: u32,
offset: u64,
length: u64,
) -> io::Result<()> {
let _trace = fs_trace!(self.tag, "fallocate", inode, handle, mode, offset, length);
let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
let data = self.find_inode(inode)?;
{
// fallocate needs a writable fd
// The lock is scoped so that it is released before `data` is used below.
let mut file = data.file.lock();
let mut flags = file.1;
match flags & libc::O_ACCMODE {
libc::O_RDONLY => {
flags &= !libc::O_RDONLY;
flags |= libc::O_RDWR;
// We need to get a writable handle for this file.
let newfile = self.open_fd(file.0.as_raw_descriptor(), libc::O_RDWR)?;
*file = (newfile, flags);
}
libc::O_WRONLY | libc::O_RDWR => {}
_ => panic!("Unexpected flags: {:#x}", flags),
}
}
data
} else {
self.find_handle(handle, inode)?
};
let fd = data.as_raw_descriptor();
// SAFETY: this doesn't modify any memory and we check the return value.
syscall!(unsafe {
libc::fallocate64(
fd,
mode as libc::c_int,
offset as libc::off64_t,
length as libc::off64_t,
)
})?;
Ok(())
}
/// Dispatches the supported ioctl commands to their dedicated handlers.
///
/// Before a handler runs, the guest-provided `in_size` / `out_size` is checked against the
/// size of the structure the command exchanges; a buffer that is too small is rejected
/// (`EINVAL` for `FS_IOC_FSSETXATTR`, `ENOMEM` for every other command). Unknown commands
/// yield `ENOTTY`.
// NOTE(review): the lint is presumably silenced because the `as u32` casts below are only
// redundant on some targets; confirm against the definitions of the `FS_IOC_*` helpers.
#[allow(clippy::unnecessary_cast)]
fn ioctl<R: io::Read>(
    &self,
    ctx: Context,
    inode: Inode,
    handle: Handle,
    _flags: IoctlFlags,
    cmd: u32,
    _arg: u64,
    in_size: u32,
    out_size: u32,
    r: R,
) -> io::Result<IoctlReply> {
    let _trace = fs_trace!(self.tag, "ioctl", inode, handle, cmd, in_size, out_size);
    const GET_ENCRYPTION_POLICY_EX: u32 = FS_IOC_GET_ENCRYPTION_POLICY_EX() as u32;
    const GET_FSXATTR: u32 = FS_IOC_FSGETXATTR() as u32;
    const SET_FSXATTR: u32 = FS_IOC_FSSETXATTR() as u32;
    const GET_FLAGS32: u32 = FS_IOC32_GETFLAGS() as u32;
    const SET_FLAGS32: u32 = FS_IOC32_SETFLAGS() as u32;
    const GET_FLAGS64: u32 = FS_IOC64_GETFLAGS() as u32;
    const SET_FLAGS64: u32 = FS_IOC64_SETFLAGS() as u32;
    const ENABLE_VERITY: u32 = FS_IOC_ENABLE_VERITY() as u32;
    const MEASURE_VERITY: u32 = FS_IOC_MEASURE_VERITY() as u32;

    // For every command the guarded arm rejects an undersized buffer and the unguarded arm
    // that follows performs the operation.
    match cmd {
        GET_ENCRYPTION_POLICY_EX => self.get_encryption_policy_ex(inode, handle, r),

        GET_FSXATTR if out_size < size_of::<fsxattr>() as u32 => {
            Err(io::Error::from_raw_os_error(libc::ENOMEM))
        }
        GET_FSXATTR => self.get_fsxattr(inode, handle),

        SET_FSXATTR if in_size < size_of::<fsxattr>() as u32 => {
            Err(io::Error::from_raw_os_error(libc::EINVAL))
        }
        SET_FSXATTR => self.set_fsxattr(ctx, inode, handle, r),

        GET_FLAGS32 | GET_FLAGS64 if out_size < size_of::<c_int>() as u32 => {
            Err(io::Error::from_raw_os_error(libc::ENOMEM))
        }
        GET_FLAGS32 | GET_FLAGS64 => self.get_flags(inode, handle),

        SET_FLAGS32 | SET_FLAGS64 if in_size < size_of::<c_int>() as u32 => {
            Err(io::Error::from_raw_os_error(libc::ENOMEM))
        }
        SET_FLAGS32 | SET_FLAGS64 => self.set_flags(ctx, inode, handle, r),

        ENABLE_VERITY if in_size < size_of::<fsverity_enable_arg>() as u32 => {
            Err(io::Error::from_raw_os_error(libc::ENOMEM))
        }
        ENABLE_VERITY => self.enable_verity(inode, handle, r),

        MEASURE_VERITY
            if in_size < size_of::<fsverity_digest>() as u32
                || out_size < size_of::<fsverity_digest>() as u32 =>
        {
            Err(io::Error::from_raw_os_error(libc::ENOMEM))
        }
        MEASURE_VERITY => self.measure_verity(inode, handle, r, out_size),

        _ => Err(io::Error::from_raw_os_error(libc::ENOTTY)),
    }
}
fn copy_file_range(
&self,
ctx: Context,
inode_src: Inode,
handle_src: Handle,
offset_src: u64,
inode_dst: Inode,
handle_dst: Handle,
offset_dst: u64,
length: u64,
flags: u64,
) -> io::Result<usize> {
let _trace = fs_trace!(
self.tag,
"copy_file_range",
inode_src,
handle_src,
offset_src,
inode_dst,
handle_dst,
offset_dst,
length,
flags
);
// We need to change credentials during a write so that the kernel will remove setuid or
// setgid bits from the file if it was written to by someone other than the owner.
let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
let (src_data, dst_data): (Arc<dyn AsRawDescriptor>, Arc<dyn AsRawDescriptor>) =
if self.zero_message_open.load(Ordering::Relaxed) {
(self.find_inode(inode_src)?, self.find_inode(inode_dst)?)
} else {
(
self.find_handle(handle_src, inode_src)?,
self.find_handle(handle_dst, inode_dst)?,
)
};
let src = src_data.as_raw_descriptor();
let dst = dst_data.as_raw_descriptor();
Ok(syscall!(
// SAFETY: this call is safe because it doesn't modify any memory and we
// check the return value.
unsafe {
libc::syscall(
libc::SYS_copy_file_range,
src,
&offset_src,
dst,
&offset_dst,
length,
flags,
)
}
)? as usize)
}
/// Maps `size` bytes of the file behind `inode`, starting at `file_offset`, at `mem_offset`
/// through `mapper`.
///
/// Returns `ENOSYS` when DAX is disabled in the configuration and `EINVAL` when `prot`
/// requests neither read nor write access. With zero-message open enabled the inode's own
/// descriptor is mapped (after upgrading it to `O_RDWR` if its access mode is insufficient);
/// otherwise a fresh descriptor is opened for the mapping.
fn set_up_mapping<M: Mapper>(
&self,
_ctx: Context,
inode: Self::Inode,
_handle: Self::Handle,
file_offset: u64,
mem_offset: u64,
size: usize,
prot: u32,
mapper: M,
) -> io::Result<()> {
let _trace = fs_trace!(
self.tag,
"set_up_mapping",
inode,
file_offset,
mem_offset,
size,
prot
);
if !self.cfg.use_dax {
return Err(io::Error::from_raw_os_error(libc::ENOSYS));
}
let read = prot & libc::PROT_READ as u32 != 0;
let write = prot & libc::PROT_WRITE as u32 != 0;
// Translate the requested protection into the open mode the descriptor needs and the
// protection handed to the mapper.
let (mmap_flags, prot) = match (read, write) {
(true, true) => (libc::O_RDWR, Protection::read_write()),
(true, false) => (libc::O_RDONLY, Protection::read()),
// Write-only is mapped to O_RDWR since mmap always requires an fd opened for reading.
(false, true) => (libc::O_RDWR, Protection::write()),
(false, false) => return Err(io::Error::from_raw_os_error(libc::EINVAL)),
};
let data = self.find_inode(inode)?;
if self.zero_message_open.load(Ordering::Relaxed) {
// The inode's file stays locked until the mapping has been set up.
let mut file = data.file.lock();
let mut open_flags = file.1;
// First tuple element: access mode the mapping needs; second: mode the fd has now.
match (mmap_flags, open_flags & libc::O_ACCMODE) {
(libc::O_RDONLY, libc::O_WRONLY)
| (libc::O_RDWR, libc::O_RDONLY)
| (libc::O_RDWR, libc::O_WRONLY) => {
// We have a read-only or write-only fd and we need to upgrade it.
open_flags &= !libc::O_ACCMODE;
open_flags |= libc::O_RDWR;
let newfile = self.open_fd(file.0.as_raw_descriptor(), libc::O_RDWR)?;
*file = (newfile, open_flags);
}
// The current fd already allows everything the mapping needs.
(libc::O_RDONLY, libc::O_RDONLY)
| (libc::O_RDONLY, libc::O_RDWR)
| (libc::O_RDWR, libc::O_RDWR) => {}
(m, o) => panic!(
"Unexpected combination of access flags: ({:#x}, {:#x})",
m, o
),
}
mapper.map(mem_offset, size, &file.0, file_offset, prot)
} else {
// Open a dedicated descriptor with exactly the access mode the mapping needs.
let file = self.open_inode(&data, mmap_flags | libc::O_NONBLOCK)?;
mapper.map(mem_offset, size, &file, file_offset, prot)
}
}
/// Unmaps every region described by `msgs` through `mapper`, stopping at the first failure.
///
/// Returns `ENOSYS` when DAX is disabled in the configuration.
fn remove_mapping<M: Mapper>(&self, msgs: &[RemoveMappingOne], mapper: M) -> io::Result<()> {
    let _trace = fs_trace!(self.tag, "remove_mapping", msgs);
    if !self.cfg.use_dax {
        return Err(io::Error::from_raw_os_error(libc::ENOSYS));
    }
    msgs.iter()
        .try_for_each(|msg| mapper.unmap(msg.moffset, msg.len))
}
/// Looks `name` up in `parent` and opens it in a single operation, creating the file first
/// when it does not exist and `O_CREAT` is set.
///
/// Outcomes:
/// * missing + `O_CREAT`: the file is created and `FILE_CREATED` is added to the options;
/// * missing, no `O_CREAT`, non-zero negative timeout: a negative entry is returned;
/// * existing symlink: the entry is returned without a handle;
/// * existing + `O_CREAT | O_EXCL`: the error produced by `eexist()`;
/// * otherwise the file is opened, unless zero-message open is enabled, in which case no
///   handle is returned and `KEEP_CACHE` is reported.
fn atomic_open(
    &self,
    ctx: Context,
    parent: Self::Inode,
    name: &CStr,
    mode: u32,
    flags: u32,
    umask: u32,
    security_ctx: Option<&CStr>,
) -> io::Result<(Entry, Option<Self::Handle>, OpenOptions)> {
    let _trace = fs_trace!(
        self.tag,
        "atomic_open",
        parent,
        name,
        mode,
        flags,
        umask,
        security_ctx
    );
    let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
    // Perform lookup but not create negative dentry
    let data = self.find_inode(parent)?;
    // This lookup serves two purposes:
    // 1. If the O_CREATE flag is not set, it retrieves the d_entry for the file.
    // 2. If the O_CREATE flag is set, it checks whether the file exists.
    let entry = match self.do_lookup_with_casefold_fallback(&data, name) {
        Ok(found) => found,
        Err(e) => {
            let missing = e.kind() == std::io::ErrorKind::NotFound;
            if missing && (flags as i32 & libc::O_CREAT) != 0 {
                // If the file did not exist & O_CREAT is set,
                // create file & set FILE_CREATED bits in open options
                let (entry, handler, mut opts) =
                    self.create(ctx, parent, name, mode, flags, umask, security_ctx)?;
                opts |= OpenOptions::FILE_CREATED;
                return Ok((entry, handler, opts));
            }
            if missing && !self.cfg.negative_timeout.is_zero() {
                return Ok((
                    Entry::new_negative(self.cfg.negative_timeout),
                    None,
                    OpenOptions::empty(),
                ));
            }
            return Err(e);
        }
    };

    // Symlinks are reported without being opened.
    if entry.attr.st_mode & libc::S_IFMT == libc::S_IFLNK {
        return Ok((entry, None, OpenOptions::empty()));
    }
    // The file exists, so an exclusive create must fail.
    let exclusive_create = libc::O_CREAT | libc::O_EXCL;
    if (flags as i32 & exclusive_create) == exclusive_create {
        return Err(eexist());
    }

    let (handler, opts) = if self.zero_message_open.load(Ordering::Relaxed) {
        (None, OpenOptions::KEEP_CACHE)
    } else {
        self.do_open(entry.inode, flags)?
    };
    Ok((entry, handler, opts))
}
}
#[cfg(test)]
mod tests {
use std::path::Path;
use named_lock::NamedLock;
use tempfile::TempDir;
use super::*;
// Name of the `NamedLock` that every test creating a `PassthroughFs` acquires before it
// starts.
const UNITTEST_LOCK_NAME: &str = "passthroughfs_unittest_lock";
// Builds a `Context` carrying the effective uid and gid of the test process and its pid.
// The correct ids are necessary for test cases where new files are created.
fn get_context() -> Context {
    Context {
        // SAFETY: the call takes no parameters and only returns an integer value. The kernel
        // also guarantees that it can never fail.
        uid: unsafe { libc::syscall(SYS_GETEUID) as libc::uid_t },
        // SAFETY: the call takes no parameters and only returns an integer value. The kernel
        // also guarantees that it can never fail.
        gid: unsafe { libc::syscall(SYS_GETEGID) as libc::gid_t },
        pid: std::process::id() as libc::pid_t,
    }
}
/// Populates `temp_dir` with the given directories first and then the given (empty) files.
/// All paths are relative to `temp_dir`; any failure panics.
fn create_test_data(temp_dir: &TempDir, dirs: &[&str], files: &[&str]) {
    let root = temp_dir.path();
    dirs.iter()
        .for_each(|dir| std::fs::create_dir_all(root.join(dir)).unwrap());
    files.iter().for_each(|file| {
        File::create(root.join(file)).unwrap();
    });
}
/// Resolves `path` in `fs` one component at a time, starting from inode 1, and returns the
/// inode of the final component. The first failing component lookup is returned as the error.
fn lookup(fs: &PassthroughFs, path: &Path) -> io::Result<Inode> {
    let ctx = get_context();
    path.iter().try_fold(1, |parent, component| {
        let name = CString::new(component.to_str().unwrap()).unwrap();
        fs.lookup(ctx, parent, &name).map(|ent| ent.inode)
    })
}
/// Creates the file named by `path` (mode `0o666`, opened `O_RDWR`, umask 0, no security
/// context) inside its already existing parent directory and returns the new entry.
fn create(fs: &PassthroughFs, path: &Path) -> io::Result<Entry> {
    let dir = path.parent().unwrap();
    let leaf = CString::new(path.file_name().unwrap().to_str().unwrap()).unwrap();
    let dir_inode = lookup(fs, dir)?;
    let ctx = get_context();
    let (entry, _handle, _opts) = fs.create(
        ctx,
        dir_inode,
        &leaf,
        0o666,
        libc::O_RDWR as u32,
        0,
        None,
    )?;
    Ok(entry)
}
/// Unlinks the file named by `path` from its parent directory.
fn unlink(fs: &PassthroughFs, path: &Path) -> io::Result<()> {
    let dir = path.parent().unwrap();
    let leaf = CString::new(path.file_name().unwrap().to_str().unwrap()).unwrap();
    let dir_inode = lookup(fs, dir)?;
    fs.unlink(get_context(), dir_inode, &leaf)
}
/// Looks `path` up and then makes `fs` forget the resulting inode.
fn forget(fs: &PassthroughFs, path: &Path) -> io::Result<()> {
    let ctx = get_context();
    lookup(fs, path).map(|inode| {
        // Pass `u64::MAX` to ensure that the refcount goes to 0 and we forget inode.
        fs.forget(ctx, inode, u64::MAX);
    })
}
/// Looks up every component of `path` except the last one and then calls `atomic_open` on
/// the last component inside the directory reached that way.
fn atomic_open(
    fs: &PassthroughFs,
    path: &Path,
    mode: u32,
    flags: u32,
    umask: u32,
    security_ctx: Option<&CStr>,
) -> io::Result<(Entry, Option<Handle>, OpenOptions)> {
    let mut parent = 1;
    let ctx = get_context();
    let components: Vec<_> = path.iter().collect();
    let last = components.len() - 1;
    // Do plain lookups for everything before the final path component. This simulates the
    // behavior of the VFS, which only calls atomic_open for the last lookup.
    for component in &components[..last] {
        let name = CString::new(component.to_str().unwrap()).unwrap();
        parent = fs.lookup(ctx, parent, &name)?.inode;
    }
    let leaf = CString::new(components[last].to_str().unwrap()).unwrap();
    fs.atomic_open(ctx, parent, &leaf, mode, flags, umask, security_ctx)
}
/// Creates a symlink called `name` directly under inode 1 that points at `linkname`.
fn symlink(
    fs: &PassthroughFs,
    linkname: &Path,
    name: &Path,
    security_ctx: Option<&CStr>,
) -> io::Result<Entry> {
    let parent = 1;
    let ctx = get_context();
    let link_name = CString::new(name.to_str().unwrap()).unwrap();
    let target = CString::new(linkname.to_str().unwrap()).unwrap();
    fs.symlink(ctx, &target, parent, &link_name, security_ctx)
}
#[test]
fn rewrite_xattr_names() {
    // Since PassthroughFs may execute process-wide operations such as `fchdir`, acquire
    // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
    let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
    let _guard = lock.lock().expect("acquire named lock");
    let cfg = Config {
        rewrite_security_xattrs: true,
        ..Default::default()
    };
    let p = PassthroughFs::new("tag", cfg).expect("Failed to create PassthroughFs");

    // Selinux shouldn't get overwritten; user, trusted, and system should not be changed
    // either.
    let unchanged: [&[u8]; 4] = [
        b"security.selinux\0",
        b"user.foobar\0",
        b"trusted.foobar\0",
        b"system.foobar\0",
    ];
    for &raw in unchanged.iter() {
        // SAFETY: every entry is a literal that ends in its only nul byte.
        let name = unsafe { CStr::from_bytes_with_nul_unchecked(raw) };
        assert_eq!(p.rewrite_xattr_name(name).to_bytes(), name.to_bytes());
    }

    // sehash should be re-written.
    // SAFETY: the literal ends in its only nul byte.
    let sehash = unsafe { CStr::from_bytes_with_nul_unchecked(b"security.sehash\0") };
    assert_eq!(
        p.rewrite_xattr_name(sehash).to_bytes(),
        b"user.virtiofs.security.sehash"
    );
}
#[test]
fn strip_xattr_names() {
    /// Runs `strip_xattr_prefix` on a copy of `input` and compares the result to `expected`.
    fn check(input: &[u8], expected: &[u8]) {
        let mut actual = input.to_vec();
        strip_xattr_prefix(&mut actual);
        assert_eq!(&actual[..], expected);
    }

    // Inputs that must come back unchanged.
    let only_nuls = b"\0\0\0\0\0";
    check(only_nuls, only_nuls);
    let no_nuls = b"security.sehashuser.virtiofs";
    check(no_nuls, no_nuls);
    let empty = b"";
    check(empty, empty);
    let no_strippable_names = b"security.selinux\0user.foobar\0system.test\0";
    check(no_strippable_names, no_strippable_names);

    // Inputs from which the prefix must be removed.
    check(
        b"user.virtiofs.security.sehash\0user.virtiofs.security.wat\0",
        b"security.sehash\0security.wat\0",
    );
    check(
        b"user.virtiofs.security.sehash\0security.selinux\0user.virtiofs.security.wat\0user.foobar\0",
        b"security.sehash\0security.selinux\0security.wat\0user.foobar\0",
    );
    check(b"user.virtiofs.security.sehash", b"security.sehash");
}
#[test]
fn lookup_files() {
    // Since PassthroughFs may execute process-wide operations such as `fchdir`, acquire
    // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
    let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
    let _guard = lock.lock().expect("acquire named lock");
    let temp_dir = TempDir::new().unwrap();
    create_test_data(&temp_dir, &["dir"], &["a.txt", "dir/b.txt"]);
    let fs = PassthroughFs::new("tag", Config::default()).unwrap();
    fs.init(FsOptions::empty()).unwrap();

    let root = temp_dir.path();
    let find = |rel: &str| lookup(&fs, &root.join(rel));

    // Everything that was created must be found.
    for existing in &["a.txt", "dir", "dir/b.txt"] {
        assert!(find(existing).is_ok());
    }
    // A name that was never created, and "A.txt", which is different from "a.txt", must
    // both be reported as missing.
    for missing in &["nonexistent-file", "A.txt"] {
        assert_eq!(
            find(missing).expect_err("file must not exist").kind(),
            io::ErrorKind::NotFound
        );
    }
}
#[test]
fn lookup_files_ascii_casefold() {
    // Since PassthroughFs may execute process-wide operations such as `fchdir`, acquire
    // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
    let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
    let _guard = lock.lock().expect("acquire named lock");
    let temp_dir = TempDir::new().unwrap();
    create_test_data(&temp_dir, &["dir"], &["a.txt", "dir/b.txt"]);
    let cfg = Config {
        ascii_casefold: true,
        ..Default::default()
    };
    let fs = PassthroughFs::new("tag", cfg).unwrap();
    fs.init(FsOptions::empty()).unwrap();

    let root = temp_dir.path();
    let find = |rel: &str| lookup(&fs, &root.join(rel));

    // Ensure that "A.txt" is equated with "a.txt".
    let a_inode = find("a.txt").expect("a.txt must be found");
    assert_eq!(find("A.txt").expect("A.txt must exist"), a_inode);

    // Directory names are matched without regard to ASCII case as well.
    let dir_inode = find("dir").expect("dir must be found");
    assert_eq!(find("DiR").expect("DiR must exist"), dir_inode);

    // The same holds when several path components differ in case.
    let b_inode = find("dir/b.txt").expect("dir/b.txt must be found");
    assert_eq!(find("dIr/B.TxT").expect("dIr/B.TxT must exist"), b_inode);

    // A name that was never created is still reported as missing.
    assert_eq!(
        find("nonexistent-file")
            .expect_err("file must not exist")
            .kind(),
        io::ErrorKind::NotFound
    );
}
/// Shared body of the create-and-remove tests.
///
/// Creates `a.txt` and `b.txt` through the filesystem, unlinks only `a.txt`, and then checks
/// that `a.txt` (and its upper-case spelling `A.TXT`) can no longer be looked up while
/// `b.txt` is still present on the host. `ascii_casefold` is forwarded to the `Config`.
fn test_create_and_remove(ascii_casefold: bool) {
    // `PassthroughFs` may perform process-wide operations such as `fchdir`, so every unit test
    // that creates an instance holds the shared `NamedLock` for its whole duration.
    let named_lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
    let _guard = named_lock.lock().expect("acquire named lock");

    let temp_dir = TempDir::new().unwrap();
    let root = temp_dir.path();

    let fs = PassthroughFs::new(
        "tag",
        Config {
            timeout: Duration::from_millis(10),
            cache_policy: CachePolicy::Auto,
            ascii_casefold,
            ..Default::default()
        },
    )
    .unwrap();
    fs.init(FsOptions::empty()).unwrap();

    // Create both files, then confirm each is visible under the inode `create` returned.
    let a_path = root.join("a.txt");
    let b_path = root.join("b.txt");
    let a_entry = create(&fs, &a_path).expect("create a.txt");
    let b_entry = create(&fs, &b_path).expect("create b.txt");

    let a_inode = lookup(&fs, &a_path).expect("lookup a.txt");
    assert_eq!(
        a_entry.inode, a_inode,
        "Created file 'a.txt' must be looked up"
    );
    let b_inode = lookup(&fs, &b_path).expect("lookup b.txt");
    assert_eq!(
        b_entry.inode, b_inode,
        "Created file 'b.txt' must be looked up"
    );

    // Unlink a.txt and leave b.txt alone.
    unlink(&fs, &a_path).expect("Remove");
    let removed_err = lookup(&fs, &a_path).expect_err("file must not exist");
    assert_eq!(
        removed_err.kind(),
        io::ErrorKind::NotFound,
        "a.txt must be removed"
    );

    // The upper-case spelling must be gone too, whether or not case folding is enabled.
    let upper_a_path = root.join("A.TXT");
    let upper_err = lookup(&fs, &upper_a_path).expect_err("file must not exist");
    assert_eq!(
        upper_err.kind(),
        io::ErrorKind::NotFound,
        "A.txt must be removed"
    );

    // Cross-check against the host filesystem directly.
    assert!(!a_path.exists(), "a.txt must be removed");
    assert!(b_path.exists(), "b.txt must exist");
}
/// Runs the create-and-remove scenario with ASCII case folding disabled.
#[test]
fn create_and_remove() {
    let ascii_casefold = false;
    test_create_and_remove(ascii_casefold);
}
/// Runs the create-and-remove scenario with ASCII case folding enabled.
#[test]
fn create_and_remove_casefold() {
    let ascii_casefold = true;
    test_create_and_remove(ascii_casefold);
}
/// Shared body of the create-and-forget tests.
///
/// Creates `a.txt`, forgets it via the `forget` helper, and checks that a subsequent lookup
/// yields a different inode number. With `ascii_casefold` set, the first lookup after the
/// forget uses the upper-case spelling `A.TXT`, and both spellings must then agree.
fn test_create_and_forget(ascii_casefold: bool) {
    // `PassthroughFs` may perform process-wide operations such as `fchdir`, so every unit test
    // that creates an instance holds the shared `NamedLock` for its whole duration.
    let named_lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
    let _guard = named_lock.lock().expect("acquire named lock");

    let temp_dir = TempDir::new().unwrap();
    let root = temp_dir.path();

    let fs = PassthroughFs::new(
        "tag",
        Config {
            timeout: Duration::from_millis(10),
            cache_policy: CachePolicy::Auto,
            ascii_casefold,
            ..Default::default()
        },
    )
    .unwrap();
    fs.init(FsOptions::empty()).unwrap();

    // Create a.txt and confirm it is visible under the inode `create` returned.
    let a_path = root.join("a.txt");
    let a_entry = create(&fs, &a_path).expect("create a.txt");
    let first_inode = lookup(&fs, &a_path).expect("lookup a.txt");
    assert_eq!(
        a_entry.inode, first_inode,
        "Created file 'a.txt' must be looked up"
    );

    // Drop a.txt's inode from PassthroughFs's internal cache.
    forget(&fs, &a_path).expect("forget a.txt");

    if !ascii_casefold {
        // A fresh lookup must hand out a new inode number.
        let relooked_inode = lookup(&fs, &a_path).expect("lookup a.txt");
        assert_ne!(
            a_entry.inode, relooked_inode,
            "inode must be changed after forget()"
        );
        return;
    }

    // With case folding, look the file up by its upper-case spelling first.
    let upper_a_path = root.join("A.TXT");
    let new_a_inode = lookup(&fs, &upper_a_path).expect("lookup a.txt");
    assert_ne!(
        a_entry.inode, new_a_inode,
        "inode must be changed after forget()"
    );

    // Both spellings must now resolve to the same inode.
    let lower_inode = lookup(&fs, &a_path).expect("lookup a.txt");
    assert_eq!(
        new_a_inode, lower_inode,
        "inode must be same for a.txt and A.TXT"
    );
}
/// Runs the create-and-forget scenario with ASCII case folding disabled.
#[test]
fn create_and_forget() {
    let ascii_casefold = false;
    test_create_and_forget(ascii_casefold);
}
/// Runs the create-and-forget scenario with ASCII case folding enabled.
#[test]
fn create_and_forget_casefold() {
    let ascii_casefold = true;
    test_create_and_forget(ascii_casefold);
}
/// Checks that the casefold cache tracks lookups, creations and removals.
///
/// With `ascii_casefold` enabled, looking up `A.TXT` (backed by the pre-existing `a.txt`) must
/// leave an `A.TXT` entry in the cache for the parent directory, creating `b.txt` must make
/// `B.TXT` visible in the cache, and unlinking `b.txt` must remove it again.
#[test]
fn casefold_lookup_cache() {
    // Since PassthroughFs may execute process-wide operations such as `fchdir`, acquire
    // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
    let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
    let _guard = lock.lock().expect("acquire named lock");

    let temp_dir = TempDir::new().unwrap();
    // Prepare `a.txt` before starting the test.
    create_test_data(&temp_dir, &[], &["a.txt"]);

    let cfg = Config {
        ascii_casefold: true,
        ..Default::default()
    };
    let fs = PassthroughFs::new("tag", cfg).unwrap();
    let capable = FsOptions::empty();
    fs.init(capable).unwrap();

    // Inode of the temporary directory; the cache is queried relative to it.
    let parent = lookup(&fs, temp_dir.path()).expect("lookup temp_dir");

    // Since `a.txt` exists, "A.TXT" must exist.
    let large_a_path = temp_dir.path().join("A.TXT");
    // Looking up "A.TXT" must create a CasefoldCache entry.
    lookup(&fs, &large_a_path).expect("A.TXT must exist");
    assert!(fs.exists_in_casefold_cache(parent, &CString::new("A.TXT").unwrap()));

    // Create b.txt.
    let b_path = temp_dir.path().join("b.txt");
    create(&fs, &b_path).expect("create b.txt");
    // Then, b.txt must exist in the cache.
    assert!(fs.exists_in_casefold_cache(parent, &CString::new("B.TXT").unwrap()));

    // When removing b.txt, it must be removed from the cache as well.
    unlink(&fs, &b_path).expect("remove b.txt");
    assert!(!fs.exists_in_casefold_cache(parent, &CString::new("B.TXT").unwrap()));
}
/// Checks lookups of a missing file when `negative_timeout` is configured.
///
/// With a non-zero `negative_timeout`, looking up a file that does not exist is expected to
/// succeed with inode 0 rather than fail. The test checks this both before the file is
/// created and after it has been created and unlinked again.
#[test]
fn lookup_negative_cache() {
    // Since PassthroughFs may execute process-wide operations such as `fchdir`, acquire
    // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
    let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
    let _guard = lock.lock().expect("acquire named lock");

    let temp_dir = TempDir::new().unwrap();
    // No directories or files are prepared up front; `a.txt` is created later by the test.
    create_test_data(&temp_dir, &[], &[]);

    let cfg = Config {
        negative_timeout: Duration::from_secs(5),
        ..Default::default()
    };
    let fs = PassthroughFs::new("tag", cfg).unwrap();
    let capable = FsOptions::empty();
    fs.init(capable).unwrap();

    let a_path = temp_dir.path().join("a.txt");
    // a.txt does not exist yet.
    // Since negative_timeout is enabled, success with inode=0 is expected.
    assert_eq!(
        0,
        lookup(&fs, &a_path).expect("lookup a.txt"),
        "Entry with inode=0 is expected for non-existing file 'a.txt'"
    );

    // Create a.txt; it must now resolve to the inode returned by `create`.
    let a_entry = create(&fs, &a_path).expect("create a.txt");
    assert_eq!(
        a_entry.inode,
        lookup(&fs, &a_path).expect("lookup a.txt"),
        "Created file 'a.txt' must be looked up"
    );

    // Remove a.txt; the lookup must go back to reporting inode=0.
    unlink(&fs, &a_path).expect("Remove");
    assert_eq!(
        0,
        lookup(&fs, &a_path).expect("lookup a.txt"),
        "Entry with inode=0 is expected for the removed file 'a.txt'"
    );
}
/// Runs the atomic-open-on-existing-files scenario with `CachePolicy::Auto`.
#[test]
fn test_atomic_open_existing_file() {
    let zero_message_open = false;
    atomic_open_existing_file(zero_message_open);
}
/// Runs the atomic-open-on-existing-files scenario with `CachePolicy::Always`.
#[test]
fn test_atomic_open_existing_file_zero_message() {
    let zero_message_open = true;
    atomic_open_existing_file(zero_message_open);
}
/// Shared body of the atomic-open tests that target files which already exist.
///
/// `zero_message_open` selects `CachePolicy::Always` (true) or `CachePolicy::Auto` (false);
/// the filesystem is always initialised with `FsOptions::ZERO_MESSAGE_OPEN`. Three opens are
/// exercised: `O_RDWR`, `O_RDWR | O_CREAT`, and `O_RDWR | O_CREAT | O_EXCL`. The first two
/// must succeed without reporting `FILE_CREATED`; the exclusive create must fail with
/// `AlreadyExists`.
fn atomic_open_existing_file(zero_message_open: bool) {
    // `PassthroughFs` may perform process-wide operations such as `fchdir`, so every unit test
    // that creates an instance holds the shared `NamedLock` for its whole duration.
    let named_lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
    let _guard = named_lock.lock().expect("acquire named lock");

    let temp_dir = TempDir::new().unwrap();
    create_test_data(&temp_dir, &["dir"], &["a.txt", "dir/b.txt", "dir/c.txt"]);
    let root = temp_dir.path();

    let cache_policy = if zero_message_open {
        CachePolicy::Always
    } else {
        CachePolicy::Auto
    };
    let fs = PassthroughFs::new(
        "tag",
        Config {
            cache_policy,
            ..Default::default()
        },
    )
    .unwrap();
    fs.init(FsOptions::ZERO_MESSAGE_OPEN).unwrap();

    // Case 1: plain O_RDWR on an existing file yields a positive dentry. A handle is returned
    // unless the zero-message-open configuration is in effect.
    let rdwr = libc::O_RDWR as u32;
    let res = atomic_open(&fs, &root.join("a.txt"), 0o666, rdwr, 0, None);
    assert!(res.is_ok());
    let (entry, handler, open_options) = res.unwrap();
    assert_ne!(entry.inode, 0);
    if zero_message_open {
        assert!(handler.is_none());
        assert_eq!(open_options, OpenOptions::KEEP_CACHE);
    } else {
        assert!(handler.is_some());
        // The file already existed, so it must not be reported as created.
        assert_ne!(
            open_options & OpenOptions::FILE_CREATED,
            OpenOptions::FILE_CREATED
        );
    }

    // Case 2: O_RDWR | O_CREAT on an existing file behaves like case 1.
    let rdwr_creat = (libc::O_RDWR | libc::O_CREAT) as u32;
    let res = atomic_open(&fs, &root.join("dir/b.txt"), 0o666, rdwr_creat, 0, None);
    assert!(res.is_ok());
    let (entry, handler, open_options) = res.unwrap();
    assert_ne!(entry.inode, 0);
    if zero_message_open {
        assert!(handler.is_none());
        assert_eq!(open_options, OpenOptions::KEEP_CACHE);
    } else {
        assert!(handler.is_some());
        assert_ne!(
            open_options & OpenOptions::FILE_CREATED,
            OpenOptions::FILE_CREATED
        );
    }

    // Case 3: O_RDWR | O_CREAT | O_EXCL on an existing file must be rejected with
    // `AlreadyExists`.
    let rdwr_creat_excl = (libc::O_RDWR | libc::O_CREAT | libc::O_EXCL) as u32;
    let res = atomic_open(
        &fs,
        &root.join("dir/c.txt"),
        0o666,
        rdwr_creat_excl,
        0,
        None,
    );
    assert!(res.is_err());
    let err_kind = res.unwrap_err().kind();
    assert_eq!(err_kind, io::ErrorKind::AlreadyExists);
}
/// Runs the atomic-open-on-missing-files scenario with `CachePolicy::Auto`.
#[test]
fn test_atomic_open_non_existing_file() {
    let zero_message_open = false;
    atomic_open_non_existing_file(zero_message_open);
}
/// Runs the atomic-open-on-missing-files scenario with `CachePolicy::Always`.
#[test]
fn test_atomic_open_non_existing_file_zero_message() {
    let zero_message_open = true;
    atomic_open_non_existing_file(zero_message_open);
}
/// Shared body of the atomic-open tests that target files which do not exist yet.
///
/// `zero_message_open` selects `CachePolicy::Always` (true) or `CachePolicy::Auto` (false);
/// the filesystem is always initialised with `FsOptions::ZERO_MESSAGE_OPEN`. Opening a
/// missing file with `O_RDWR` must fail with `NotFound`, while `O_RDWR | O_CREAT` must
/// succeed and report `FILE_CREATED`.
fn atomic_open_non_existing_file(zero_message_open: bool) {
    // `PassthroughFs` may perform process-wide operations such as `fchdir`, so every unit test
    // that creates an instance holds the shared `NamedLock` for its whole duration.
    let named_lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
    let _guard = named_lock.lock().expect("acquire named lock");

    let temp_dir = TempDir::new().unwrap();
    let root = temp_dir.path();

    let cache_policy = if zero_message_open {
        CachePolicy::Always
    } else {
        CachePolicy::Auto
    };
    let fs = PassthroughFs::new(
        "tag",
        Config {
            cache_policy,
            ..Default::default()
        },
    )
    .unwrap();
    fs.init(FsOptions::ZERO_MESSAGE_OPEN).unwrap();

    // Case 1: O_RDWR without O_CREAT on a missing file must fail with `NotFound`.
    let rdwr = libc::O_RDWR as u32;
    let res = atomic_open(&fs, &root.join("a.txt"), 0o666, rdwr, 0, None);
    assert!(res.is_err());
    let err_kind = res.unwrap_err().kind();
    assert_eq!(err_kind, io::ErrorKind::NotFound);

    // Case 2: O_RDWR | O_CREAT creates the file and yields a positive dentry. A handle is
    // returned unless the zero-message-open configuration is in effect.
    let rdwr_creat = (libc::O_RDWR | libc::O_CREAT) as u32;
    let res = atomic_open(&fs, &root.join("b.txt"), 0o666, rdwr_creat, 0, None);
    assert!(res.is_ok());
    let (entry, handler, open_options) = res.unwrap();
    assert_ne!(entry.inode, 0);
    if zero_message_open {
        assert!(handler.is_none());
        assert_eq!(
            open_options & OpenOptions::KEEP_CACHE,
            OpenOptions::KEEP_CACHE
        );
    } else {
        assert!(handler.is_some());
    }
    // In both configurations the file must be reported as newly created.
    assert_eq!(
        open_options & OpenOptions::FILE_CREATED,
        OpenOptions::FILE_CREATED
    );
}
/// Checks how `atomic_open` treats a symbolic link.
///
/// After opening the regular file `a.txt` and creating the link `blink` that points at it,
/// an atomic open of `blink` must return the link's own entry with no handle and empty
/// `OpenOptions`. The same must hold after the link's target has been unlinked.
#[test]
fn atomic_open_symbol_link() {
    // `PassthroughFs` may perform process-wide operations such as `fchdir`, so every unit test
    // that creates an instance holds the shared `NamedLock` for its whole duration.
    let named_lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
    let _guard = named_lock.lock().expect("acquire named lock");

    let temp_dir = TempDir::new().unwrap();
    create_test_data(&temp_dir, &["dir"], &["a.txt"]);
    let target_path = temp_dir.path().join("a.txt");
    let link_path = temp_dir.path().join("blink");

    let fs = PassthroughFs::new("tag", Default::default()).unwrap();
    fs.init(FsOptions::empty()).unwrap();

    let rdwr = libc::O_RDWR as u32;

    // Opening the future link target directly yields a real entry and a handle.
    let res_dst = atomic_open(&fs, &target_path, 0o666, rdwr, 0, None);
    assert!(res_dst.is_ok());
    let (entry_dst, handler_dst, _) = res_dst.unwrap();
    assert_ne!(entry_dst.inode, 0);
    assert!(handler_dst.is_some());

    // Create a single-level symbolic link `blink` -> `a.txt`.
    let sym1_res = symlink(&fs, &target_path, &link_path, None);
    assert!(sym1_res.is_ok());
    let sym1_entry = sym1_res.unwrap();
    assert_ne!(sym1_entry.inode, 0);

    // Atomic open of the link returns the link's dentry without a handle.
    let res = atomic_open(&fs, &link_path, 0o666, rdwr, 0, None);
    assert!(res.is_ok());
    let (entry, handler, open_options) = res.unwrap();
    assert_eq!(entry.inode, sym1_entry.inode);
    assert!(handler.is_none());
    assert_eq!(open_options, OpenOptions::empty());

    // Remove the link target and confirm it is gone.
    unlink(&fs, &target_path).expect("Remove");
    let removed_err = lookup(&fs, &target_path).expect_err("file must not exist");
    assert_eq!(
        removed_err.kind(),
        io::ErrorKind::NotFound,
        "a.txt must be removed"
    );

    // Even with a dangling target, the link itself must still open to a valid dentry.
    let res = atomic_open(&fs, &link_path, 0o666, rdwr, 0, None);
    assert!(res.is_ok());
    let (entry, handler, open_options) = res.unwrap();
    assert_eq!(entry.inode, sym1_entry.inode);
    assert!(handler.is_none());
    assert_eq!(open_options, OpenOptions::empty());
}
}