blob: 89712c2ebc78207e4e21c840baa84461c3147455 [file] [log] [blame]
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Eugene Zemtsov <ezemtsov@google.com>
Date: Mon, 18 Nov 2019 20:21:06 -0800
Subject: ANDROID: Initial commit of Incremental FS
Fully working incremental fs filesystem
[CPNOTE: 20/07/21] Lee: Asked Paul to open an OoT bug to follow progress
Bug: 133435829
Signed-off-by: Eugene Zemtsov <ezemtsov@google.com>
Signed-off-by: Paul Lawrence <paullawrence@google.com>
[Lee: Squashed all subsequent changes into this initial patch]
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Change-Id: I02cce0b654d0ef74de0a190d30907410b23ab160
Signed-off-by: Lee Jones <joneslee@google.com>
---
Documentation/ABI/testing/sysfs-fs-incfs | 70 +
Documentation/filesystems/incfs.rst | 85 +
MAINTAINERS | 7 +
fs/Kconfig | 1 +
fs/Makefile | 1 +
fs/incfs/Kconfig | 15 +
fs/incfs/Makefile | 13 +
fs/incfs/data_mgmt.c | 1889 ++++++++++++++++++++
fs/incfs/data_mgmt.h | 551 ++++++
fs/incfs/format.c | 752 ++++++++
fs/incfs/format.h | 408 +++++
fs/incfs/integrity.c | 235 +++
fs/incfs/integrity.h | 56 +
fs/incfs/internal.h | 23 +
fs/incfs/main.c | 48 +
fs/incfs/pseudo_files.c | 1394 +++++++++++++++
fs/incfs/pseudo_files.h | 20 +
fs/incfs/sysfs.c | 205 +++
fs/incfs/sysfs.h | 22 +
fs/incfs/verity.c | 821 +++++++++
fs/incfs/verity.h | 49 +
fs/incfs/vfs.c | 1994 ++++++++++++++++++++++
fs/incfs/vfs.h | 33 +
include/uapi/linux/incrementalfs.h | 590 +++++++
24 files changed, 9282 insertions(+)
create mode 100644 Documentation/ABI/testing/sysfs-fs-incfs
create mode 100644 Documentation/filesystems/incfs.rst
create mode 100644 fs/incfs/Kconfig
create mode 100644 fs/incfs/Makefile
create mode 100644 fs/incfs/data_mgmt.c
create mode 100644 fs/incfs/data_mgmt.h
create mode 100644 fs/incfs/format.c
create mode 100644 fs/incfs/format.h
create mode 100644 fs/incfs/integrity.c
create mode 100644 fs/incfs/integrity.h
create mode 100644 fs/incfs/internal.h
create mode 100644 fs/incfs/main.c
create mode 100644 fs/incfs/pseudo_files.c
create mode 100644 fs/incfs/pseudo_files.h
create mode 100644 fs/incfs/sysfs.c
create mode 100644 fs/incfs/sysfs.h
create mode 100644 fs/incfs/verity.c
create mode 100644 fs/incfs/verity.h
create mode 100644 fs/incfs/vfs.c
create mode 100644 fs/incfs/vfs.h
create mode 100644 include/uapi/linux/incrementalfs.h
diff --git a/Documentation/ABI/testing/sysfs-fs-incfs b/Documentation/ABI/testing/sysfs-fs-incfs
new file mode 100644
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-fs-incfs
@@ -0,0 +1,70 @@
+What: /sys/fs/incremental-fs/features/corefs
+Date: 2019
+Contact: Paul Lawrence <paullawrence@google.com>
+Description: Reads 'supported'. Always present.
+
+What: /sys/fs/incremental-fs/features/v2
+Date: April 2021
+Contact: Paul Lawrence <paullawrence@google.com>
+Description: Reads 'supported'. Present if all v2 features of incfs are
+ supported.
+
+What: /sys/fs/incremental-fs/features/zstd
+Date: April 2021
+Contact: Paul Lawrence <paullawrence@google.com>
+Description: Reads 'supported'. Present if zstd compression is supported
+ for data blocks.
+
+What: /sys/fs/incremental-fs/features/bugfix_throttling
+Date: January 2023
+Contact: Paul Lawrence <paullawrence@google.com>
+Description: Reads 'supported'. Present if the throttling lock bug is fixed
+		https://android-review.googlesource.com/c/kernel/common/+/2381827
+
+What: /sys/fs/incremental-fs/instances/[name]
+Date: April 2021
+Contact: Paul Lawrence <paullawrence@google.com>
+Description: Folder created when incfs is mounted with the sysfs_name=[name]
+ option. If this option is used, the following values are created
+ in this folder.
+
+What: /sys/fs/incremental-fs/instances/[name]/reads_delayed_min
+Date: April 2021
+Contact: Paul Lawrence <paullawrence@google.com>
+Description: Returns a count of the number of reads that were delayed as a
+ result of the per UID read timeouts min time setting.
+
+What: /sys/fs/incremental-fs/instances/[name]/reads_delayed_min_us
+Date: April 2021
+Contact: Paul Lawrence <paullawrence@google.com>
+Description: Returns total delay time for all files since first mount as a
+ result of the per UID read timeouts min time setting.
+
+What: /sys/fs/incremental-fs/instances/[name]/reads_delayed_pending
+Date: April 2021
+Contact: Paul Lawrence <paullawrence@google.com>
+Description: Returns a count of the number of reads that were delayed as a
+ result of waiting for a pending read.
+
+What: /sys/fs/incremental-fs/instances/[name]/reads_delayed_pending_us
+Date: April 2021
+Contact: Paul Lawrence <paullawrence@google.com>
+Description: Returns total delay time for all files since first mount as a
+ result of waiting for a pending read.
+
+What: /sys/fs/incremental-fs/instances/[name]/reads_failed_hash_verification
+Date: April 2021
+Contact: Paul Lawrence <paullawrence@google.com>
+Description: Returns number of reads that failed because of hash verification
+ failures.
+
+What: /sys/fs/incremental-fs/instances/[name]/reads_failed_other
+Date: April 2021
+Contact: Paul Lawrence <paullawrence@google.com>
+Description: Returns number of reads that failed for reasons other than
+ timing out or hash failures.
+
+What: /sys/fs/incremental-fs/instances/[name]/reads_failed_timed_out
+Date: April 2021
+Contact: Paul Lawrence <paullawrence@google.com>
+Description: Returns number of reads that timed out.
diff --git a/Documentation/filesystems/incfs.rst b/Documentation/filesystems/incfs.rst
new file mode 100644
--- /dev/null
+++ b/Documentation/filesystems/incfs.rst
@@ -0,0 +1,85 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=================================================
+incfs: A stacked incremental filesystem for Linux
+=================================================
+
+/sys/fs interface
+=================
+
+Please update Documentation/ABI/testing/sysfs-fs-incfs if you update this
+section.
+
+incfs creates the following files in /sys/fs.
+
+Features
+--------
+
+/sys/fs/incremental-fs/features/corefs
+ Reads 'supported'. Always present.
+
+/sys/fs/incremental-fs/features/v2
+ Reads 'supported'. Present if all v2 features of incfs are supported. These
+ are:
+ fs-verity support
+ inotify support
+    ioctls:
+ INCFS_IOC_SET_READ_TIMEOUTS
+ INCFS_IOC_GET_READ_TIMEOUTS
+ INCFS_IOC_GET_BLOCK_COUNT
+ INCFS_IOC_CREATE_MAPPED_FILE
+ .incomplete folder
+ .blocks_written pseudo file
+ report_uid mount option
+
+/sys/fs/incremental-fs/features/zstd
+ Reads 'supported'. Present if zstd compression is supported for data blocks.
+
+/sys/fs/incremental-fs/features/bugfix_throttling
+ Reads 'supported'. Present if the throttling lock bug is fixed
+
+Optional per mount
+------------------
+
+For each incfs mount, the mount option sysfs_name=[name] creates a /sys/fs
+node called:
+
+/sys/fs/incremental-fs/instances/[name]
+
+This will contain the following files:
+
+/sys/fs/incremental-fs/instances/[name]/reads_delayed_min
+ Returns a count of the number of reads that were delayed as a result of the
+ per UID read timeouts min time setting.
+
+/sys/fs/incremental-fs/instances/[name]/reads_delayed_min_us
+ Returns total delay time for all files since first mount as a result of the
+ per UID read timeouts min time setting.
+
+/sys/fs/incremental-fs/instances/[name]/reads_delayed_pending
+ Returns a count of the number of reads that were delayed as a result of
+ waiting for a pending read.
+
+/sys/fs/incremental-fs/instances/[name]/reads_delayed_pending_us
+ Returns total delay time for all files since first mount as a result of
+ waiting for a pending read.
+
+/sys/fs/incremental-fs/instances/[name]/reads_failed_hash_verification
+ Returns number of reads that failed because of hash verification failures.
+
+/sys/fs/incremental-fs/instances/[name]/reads_failed_other
+ Returns number of reads that failed for reasons other than timing out or
+ hash failures.
+
+/sys/fs/incremental-fs/instances/[name]/reads_failed_timed_out
+ Returns number of reads that timed out.
+
+For reads_delayed_*** settings, note that a file can count for both
+reads_delayed_min and reads_delayed_pending if incfs first waits for a pending
+read then has to wait further for the min time. In that case, the time spent
+waiting is split between reads_delayed_pending_us, which is increased by the
+time spent waiting for the pending read, and reads_delayed_min_us, which is
+increased by the remainder of the time spent waiting.
+
+Reads that timed out are not added to the reads_delayed_pending or the
+reads_delayed_pending_us counters.
diff --git a/MAINTAINERS b/MAINTAINERS
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10373,6 +10373,13 @@ F: Documentation/hwmon/ina2xx.rst
F: drivers/hwmon/ina2xx.c
F: include/linux/platform_data/ina2xx.h
+INCREMENTAL FILE SYSTEM
+M: Paul Lawrence <paullawrence@google.com>
+L: linux-unionfs@vger.kernel.org
+S: Supported
+F: fs/incfs/
+F: tools/testing/selftests/filesystems/incfs/
+
INDEX OF FURTHER KERNEL DOCUMENTATION
M: Carlos Bilbao <carlos.bilbao@amd.com>
S: Maintained
diff --git a/fs/Kconfig b/fs/Kconfig
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -136,6 +136,7 @@ source "fs/quota/Kconfig"
source "fs/autofs/Kconfig"
source "fs/fuse/Kconfig"
source "fs/overlayfs/Kconfig"
+source "fs/incfs/Kconfig"
menu "Caches"
diff --git a/fs/Makefile b/fs/Makefile
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -106,6 +106,7 @@ obj-$(CONFIG_ADFS_FS) += adfs/
obj-$(CONFIG_FUSE_FS) += fuse/
obj-$(CONFIG_OVERLAY_FS) += overlayfs/
obj-$(CONFIG_ORANGEFS_FS) += orangefs/
+obj-$(CONFIG_INCREMENTAL_FS) += incfs/
obj-$(CONFIG_UDF_FS) += udf/
obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/
obj-$(CONFIG_OMFS_FS) += omfs/
diff --git a/fs/incfs/Kconfig b/fs/incfs/Kconfig
new file mode 100644
--- /dev/null
+++ b/fs/incfs/Kconfig
@@ -0,0 +1,15 @@
+config INCREMENTAL_FS
+ tristate "Incremental file system support"
+ depends on BLOCK
+ # incfs does not verify fsverity builtin signatures.
+ depends on !CONFIG_FS_VERITY_BUILTIN_SIGNATURES
+ select DECOMPRESS_LZ4
+ select DECOMPRESS_ZSTD
+ select CRYPTO_SHA256
+ help
+ Incremental FS is a read-only virtual file system that facilitates execution
+ of programs while their binaries are still being lazily downloaded over the
+ network, USB or pigeon post.
+
+ To compile this file system support as a module, choose M here: the
+ module will be called incrementalfs.
diff --git a/fs/incfs/Makefile b/fs/incfs/Makefile
new file mode 100644
--- /dev/null
+++ b/fs/incfs/Makefile
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_INCREMENTAL_FS) += incrementalfs.o
+
+incrementalfs-y := \
+ data_mgmt.o \
+ format.o \
+ integrity.o \
+ main.o \
+ pseudo_files.o \
+ sysfs.o \
+ vfs.o
+
+incrementalfs-$(CONFIG_FS_VERITY) += verity.o
diff --git a/fs/incfs/data_mgmt.c b/fs/incfs/data_mgmt.c
new file mode 100644
--- /dev/null
+++ b/fs/incfs/data_mgmt.c
@@ -0,0 +1,1889 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2019 Google LLC
+ */
+#include <linux/crc32.h>
+#include <linux/file.h>
+#include <linux/fsverity.h>
+#include <linux/gfp.h>
+#include <linux/kobject.h>
+#include <linux/ktime.h>
+#include <linux/lz4.h>
+#include <linux/mm.h>
+#include <linux/namei.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+
+#include "data_mgmt.h"
+#include "format.h"
+#include "integrity.h"
+#include "sysfs.h"
+#include "verity.h"
+
+static int incfs_scan_metadata_chain(struct data_file *df);
+
+static void log_wake_up_all(struct work_struct *work)
+{
+ struct delayed_work *dw = container_of(work, struct delayed_work, work);
+ struct read_log *rl = container_of(dw, struct read_log, ml_wakeup_work);
+ wake_up_all(&rl->ml_notif_wq);
+}
+
+static void zstd_free_workspace(struct work_struct *work)
+{
+ struct delayed_work *dw = container_of(work, struct delayed_work, work);
+ struct mount_info *mi =
+ container_of(dw, struct mount_info, mi_zstd_cleanup_work);
+
+ mutex_lock(&mi->mi_zstd_workspace_mutex);
+ kvfree(mi->mi_zstd_workspace);
+ mi->mi_zstd_workspace = NULL;
+ mi->mi_zstd_stream = NULL;
+ mutex_unlock(&mi->mi_zstd_workspace_mutex);
+}
+
+struct mount_info *incfs_alloc_mount_info(struct super_block *sb,
+ struct mount_options *options,
+ struct path *backing_dir_path)
+{
+ struct mount_info *mi = NULL;
+ int error = 0;
+ struct incfs_sysfs_node *node;
+
+ mi = kzalloc(sizeof(*mi), GFP_NOFS);
+ if (!mi)
+ return ERR_PTR(-ENOMEM);
+
+ mi->mi_sb = sb;
+ mi->mi_backing_dir_path = *backing_dir_path;
+ mi->mi_owner = get_current_cred();
+ path_get(&mi->mi_backing_dir_path);
+ mutex_init(&mi->mi_dir_struct_mutex);
+ init_waitqueue_head(&mi->mi_pending_reads_notif_wq);
+ init_waitqueue_head(&mi->mi_log.ml_notif_wq);
+ init_waitqueue_head(&mi->mi_blocks_written_notif_wq);
+ atomic_set(&mi->mi_blocks_written, 0);
+ INIT_DELAYED_WORK(&mi->mi_log.ml_wakeup_work, log_wake_up_all);
+ spin_lock_init(&mi->mi_log.rl_lock);
+ spin_lock_init(&mi->pending_read_lock);
+ INIT_LIST_HEAD(&mi->mi_reads_list_head);
+ spin_lock_init(&mi->mi_per_uid_read_timeouts_lock);
+ mutex_init(&mi->mi_zstd_workspace_mutex);
+ INIT_DELAYED_WORK(&mi->mi_zstd_cleanup_work, zstd_free_workspace);
+ mutex_init(&mi->mi_le_mutex);
+
+ node = incfs_add_sysfs_node(options->sysfs_name, mi);
+ if (IS_ERR(node)) {
+ error = PTR_ERR(node);
+ goto err;
+ }
+ mi->mi_sysfs_node = node;
+
+ error = incfs_realloc_mount_info(mi, options);
+ if (error)
+ goto err;
+
+ return mi;
+
+err:
+ incfs_free_mount_info(mi);
+ return ERR_PTR(error);
+}
+
+int incfs_realloc_mount_info(struct mount_info *mi,
+ struct mount_options *options)
+{
+ void *new_buffer = NULL;
+ void *old_buffer;
+ size_t new_buffer_size = 0;
+
+ if (options->read_log_pages != mi->mi_options.read_log_pages) {
+ struct read_log_state log_state;
+ /*
+ * Even though having two buffers allocated at once isn't
+ * usually good, allocating a multipage buffer under a spinlock
+ * is even worse, so let's optimize for the shorter lock
+ * duration. It's not end of the world if we fail to increase
+ * the buffer size anyway.
+ */
+ if (options->read_log_pages > 0) {
+ new_buffer_size = PAGE_SIZE * options->read_log_pages;
+ new_buffer = kzalloc(new_buffer_size, GFP_NOFS);
+ if (!new_buffer)
+ return -ENOMEM;
+ }
+
+ spin_lock(&mi->mi_log.rl_lock);
+ old_buffer = mi->mi_log.rl_ring_buf;
+ mi->mi_log.rl_ring_buf = new_buffer;
+ mi->mi_log.rl_size = new_buffer_size;
+ log_state = (struct read_log_state){
+ .generation_id = mi->mi_log.rl_head.generation_id + 1,
+ };
+ mi->mi_log.rl_head = log_state;
+ mi->mi_log.rl_tail = log_state;
+ spin_unlock(&mi->mi_log.rl_lock);
+
+ kfree(old_buffer);
+ }
+
+ if (options->sysfs_name && !mi->mi_sysfs_node)
+ mi->mi_sysfs_node = incfs_add_sysfs_node(options->sysfs_name,
+ mi);
+ else if (!options->sysfs_name && mi->mi_sysfs_node) {
+ incfs_free_sysfs_node(mi->mi_sysfs_node);
+ mi->mi_sysfs_node = NULL;
+ } else if (options->sysfs_name &&
+ strcmp(options->sysfs_name,
+ kobject_name(&mi->mi_sysfs_node->isn_sysfs_node))) {
+ incfs_free_sysfs_node(mi->mi_sysfs_node);
+ mi->mi_sysfs_node = incfs_add_sysfs_node(options->sysfs_name,
+ mi);
+ }
+
+ if (IS_ERR(mi->mi_sysfs_node)) {
+ int err = PTR_ERR(mi->mi_sysfs_node);
+
+ mi->mi_sysfs_node = NULL;
+ return err;
+ }
+
+ mi->mi_options = *options;
+ return 0;
+}
+
+void incfs_free_mount_info(struct mount_info *mi)
+{
+ int i;
+ if (!mi)
+ return;
+
+ flush_delayed_work(&mi->mi_log.ml_wakeup_work);
+ flush_delayed_work(&mi->mi_zstd_cleanup_work);
+
+ dput(mi->mi_index_dir);
+ dput(mi->mi_incomplete_dir);
+ path_put(&mi->mi_backing_dir_path);
+ mutex_destroy(&mi->mi_dir_struct_mutex);
+ mutex_destroy(&mi->mi_zstd_workspace_mutex);
+ put_cred(mi->mi_owner);
+ kfree(mi->mi_log.rl_ring_buf);
+ for (i = 0; i < ARRAY_SIZE(mi->pseudo_file_xattr); ++i)
+ kfree(mi->pseudo_file_xattr[i].data);
+ kfree(mi->mi_per_uid_read_timeouts);
+ incfs_free_sysfs_node(mi->mi_sysfs_node);
+ kfree(mi);
+}
+
+static void data_file_segment_init(struct data_file_segment *segment)
+{
+ init_waitqueue_head(&segment->new_data_arrival_wq);
+ init_rwsem(&segment->rwsem);
+ INIT_LIST_HEAD(&segment->reads_list_head);
+}
+
+char *file_id_to_str(incfs_uuid_t id)
+{
+ char *result = kmalloc(1 + sizeof(id.bytes) * 2, GFP_NOFS);
+ char *end;
+
+ if (!result)
+ return NULL;
+
+ end = bin2hex(result, id.bytes, sizeof(id.bytes));
+ *end = 0;
+ return result;
+}
+
+struct dentry *incfs_lookup_dentry(struct dentry *parent, const char *name)
+{
+ struct inode *inode;
+ struct dentry *result = NULL;
+
+ if (!parent)
+ return ERR_PTR(-EFAULT);
+
+ inode = d_inode(parent);
+ inode_lock_nested(inode, I_MUTEX_PARENT);
+ result = lookup_one_len(name, parent, strlen(name));
+ inode_unlock(inode);
+
+ if (IS_ERR(result))
+ pr_warn("%s err:%ld\n", __func__, PTR_ERR(result));
+
+ return result;
+}
+
+static struct data_file *handle_mapped_file(struct mount_info *mi,
+ struct data_file *df)
+{
+ char *file_id_str;
+ struct dentry *index_file_dentry;
+ struct path path;
+ struct file *bf;
+ struct data_file *result = NULL;
+ const struct cred *old_cred;
+
+ file_id_str = file_id_to_str(df->df_id);
+ if (!file_id_str)
+ return ERR_PTR(-ENOENT);
+
+ index_file_dentry = incfs_lookup_dentry(mi->mi_index_dir,
+ file_id_str);
+ kfree(file_id_str);
+ if (!index_file_dentry)
+ return ERR_PTR(-ENOENT);
+ if (IS_ERR(index_file_dentry))
+ return ERR_CAST(index_file_dentry);
+ if (!d_really_is_positive(index_file_dentry)) {
+ result = ERR_PTR(-ENOENT);
+ goto out;
+ }
+
+ path = (struct path) {
+ .mnt = mi->mi_backing_dir_path.mnt,
+ .dentry = index_file_dentry
+ };
+
+ old_cred = override_creds(mi->mi_owner);
+ bf = dentry_open(&path, O_RDWR | O_NOATIME | O_LARGEFILE,
+ current_cred());
+ revert_creds(old_cred);
+
+ if (IS_ERR(bf)) {
+ result = ERR_CAST(bf);
+ goto out;
+ }
+
+ result = incfs_open_data_file(mi, bf);
+ fput(bf);
+ if (IS_ERR(result))
+ goto out;
+
+ result->df_mapped_offset = df->df_metadata_off;
+
+out:
+ dput(index_file_dentry);
+ return result;
+}
+
+struct data_file *incfs_open_data_file(struct mount_info *mi, struct file *bf)
+{
+ struct data_file *df = NULL;
+ struct backing_file_context *bfc = NULL;
+ int md_records;
+ u64 size;
+ int error = 0;
+ int i;
+
+ if (!bf || !mi)
+ return ERR_PTR(-EFAULT);
+
+ if (!S_ISREG(bf->f_inode->i_mode))
+ return ERR_PTR(-EBADF);
+
+ bfc = incfs_alloc_bfc(mi, bf);
+ if (IS_ERR(bfc))
+ return ERR_CAST(bfc);
+
+ df = kzalloc(sizeof(*df), GFP_NOFS);
+ if (!df) {
+ error = -ENOMEM;
+ goto out;
+ }
+
+ mutex_init(&df->df_enable_verity);
+
+ df->df_backing_file_context = bfc;
+ df->df_mount_info = mi;
+ for (i = 0; i < ARRAY_SIZE(df->df_segments); i++)
+ data_file_segment_init(&df->df_segments[i]);
+
+ error = incfs_read_file_header(bfc, &df->df_metadata_off, &df->df_id,
+ &size, &df->df_header_flags);
+
+ if (error)
+ goto out;
+
+ df->df_size = size;
+ if (size > 0)
+ df->df_data_block_count = get_blocks_count_for_size(size);
+
+ if (df->df_header_flags & INCFS_FILE_MAPPED) {
+ struct data_file *mapped_df = handle_mapped_file(mi, df);
+
+ incfs_free_data_file(df);
+ return mapped_df;
+ }
+
+ md_records = incfs_scan_metadata_chain(df);
+ if (md_records < 0)
+ error = md_records;
+
+out:
+ if (error) {
+ incfs_free_bfc(bfc);
+ if (df)
+ df->df_backing_file_context = NULL;
+ incfs_free_data_file(df);
+ return ERR_PTR(error);
+ }
+ return df;
+}
+
+void incfs_free_data_file(struct data_file *df)
+{
+ u32 data_blocks_written, hash_blocks_written;
+
+ if (!df)
+ return;
+
+ data_blocks_written = atomic_read(&df->df_data_blocks_written);
+ hash_blocks_written = atomic_read(&df->df_hash_blocks_written);
+
+ if (data_blocks_written != df->df_initial_data_blocks_written ||
+ hash_blocks_written != df->df_initial_hash_blocks_written) {
+ struct backing_file_context *bfc = df->df_backing_file_context;
+ int error = -1;
+
+ if (bfc && !mutex_lock_interruptible(&bfc->bc_mutex)) {
+ error = incfs_write_status_to_backing_file(
+ df->df_backing_file_context,
+ df->df_status_offset,
+ data_blocks_written,
+ hash_blocks_written);
+ mutex_unlock(&bfc->bc_mutex);
+ }
+
+ if (error)
+ /* Nothing can be done, just warn */
+ pr_warn("incfs: failed to write status to backing file\n");
+ }
+
+ incfs_free_mtree(df->df_hash_tree);
+ incfs_free_bfc(df->df_backing_file_context);
+ kfree(df->df_signature);
+ kfree(df->df_verity_file_digest.data);
+ kfree(df->df_verity_signature);
+ mutex_destroy(&df->df_enable_verity);
+ kfree(df);
+}
+
+int make_inode_ready_for_data_ops(struct mount_info *mi,
+ struct inode *inode,
+ struct file *backing_file)
+{
+ struct inode_info *node = get_incfs_node(inode);
+ struct data_file *df = NULL;
+ int err = 0;
+
+ inode_lock(inode);
+ if (S_ISREG(inode->i_mode)) {
+ if (!node->n_file) {
+ df = incfs_open_data_file(mi, backing_file);
+
+ if (IS_ERR(df))
+ err = PTR_ERR(df);
+ else
+ node->n_file = df;
+ }
+ } else
+ err = -EBADF;
+ inode_unlock(inode);
+ return err;
+}
+
+struct dir_file *incfs_open_dir_file(struct mount_info *mi, struct file *bf)
+{
+ struct dir_file *dir = NULL;
+
+ if (!S_ISDIR(bf->f_inode->i_mode))
+ return ERR_PTR(-EBADF);
+
+ dir = kzalloc(sizeof(*dir), GFP_NOFS);
+ if (!dir)
+ return ERR_PTR(-ENOMEM);
+
+ dir->backing_dir = get_file(bf);
+ dir->mount_info = mi;
+ return dir;
+}
+
+void incfs_free_dir_file(struct dir_file *dir)
+{
+ if (!dir)
+ return;
+ if (dir->backing_dir)
+ fput(dir->backing_dir);
+ kfree(dir);
+}
+
+static ssize_t zstd_decompress_safe(struct mount_info *mi,
+ struct mem_range src, struct mem_range dst)
+{
+ ssize_t result;
+ ZSTD_inBuffer inbuf = {.src = src.data, .size = src.len};
+ ZSTD_outBuffer outbuf = {.dst = dst.data, .size = dst.len};
+
+ result = mutex_lock_interruptible(&mi->mi_zstd_workspace_mutex);
+ if (result)
+ return result;
+
+ if (!mi->mi_zstd_stream) {
+ unsigned int workspace_size = zstd_dstream_workspace_bound(
+ INCFS_DATA_FILE_BLOCK_SIZE);
+ void *workspace = kvmalloc(workspace_size, GFP_NOFS);
+ ZSTD_DStream *stream;
+
+ if (!workspace) {
+ result = -ENOMEM;
+ goto out;
+ }
+
+ stream = zstd_init_dstream(INCFS_DATA_FILE_BLOCK_SIZE, workspace,
+ workspace_size);
+ if (!stream) {
+ kvfree(workspace);
+ result = -EIO;
+ goto out;
+ }
+
+ mi->mi_zstd_workspace = workspace;
+ mi->mi_zstd_stream = stream;
+ }
+
+ result = zstd_decompress_stream(mi->mi_zstd_stream, &outbuf, &inbuf) ?
+ -EBADMSG : outbuf.pos;
+
+ mod_delayed_work(system_wq, &mi->mi_zstd_cleanup_work,
+ msecs_to_jiffies(5000));
+
+out:
+ mutex_unlock(&mi->mi_zstd_workspace_mutex);
+ return result;
+}
+
+static ssize_t decompress(struct mount_info *mi,
+ struct mem_range src, struct mem_range dst, int alg)
+{
+ int result;
+
+ switch (alg) {
+ case INCFS_BLOCK_COMPRESSED_LZ4:
+ result = LZ4_decompress_safe(src.data, dst.data, src.len,
+ dst.len);
+ if (result < 0)
+ return -EBADMSG;
+ return result;
+
+ case INCFS_BLOCK_COMPRESSED_ZSTD:
+ return zstd_decompress_safe(mi, src, dst);
+
+ default:
+ WARN_ON(true);
+ return -EOPNOTSUPP;
+ }
+}
+
+static void log_read_one_record(struct read_log *rl, struct read_log_state *rs)
+{
+ union log_record *record =
+ (union log_record *)((u8 *)rl->rl_ring_buf + rs->next_offset);
+ size_t record_size;
+
+ switch (record->full_record.type) {
+ case FULL:
+ rs->base_record = record->full_record;
+ record_size = sizeof(record->full_record);
+ break;
+
+ case SAME_FILE:
+ rs->base_record.block_index =
+ record->same_file.block_index;
+ rs->base_record.absolute_ts_us +=
+ record->same_file.relative_ts_us;
+ rs->base_record.uid = record->same_file.uid;
+ record_size = sizeof(record->same_file);
+ break;
+
+ case SAME_FILE_CLOSE_BLOCK:
+ rs->base_record.block_index +=
+ record->same_file_close_block.block_index_delta;
+ rs->base_record.absolute_ts_us +=
+ record->same_file_close_block.relative_ts_us;
+ record_size = sizeof(record->same_file_close_block);
+ break;
+
+ case SAME_FILE_CLOSE_BLOCK_SHORT:
+ rs->base_record.block_index +=
+ record->same_file_close_block_short.block_index_delta;
+ rs->base_record.absolute_ts_us +=
+ record->same_file_close_block_short.relative_ts_tens_us * 10;
+ record_size = sizeof(record->same_file_close_block_short);
+ break;
+
+ case SAME_FILE_NEXT_BLOCK:
+ ++rs->base_record.block_index;
+ rs->base_record.absolute_ts_us +=
+ record->same_file_next_block.relative_ts_us;
+ record_size = sizeof(record->same_file_next_block);
+ break;
+
+ case SAME_FILE_NEXT_BLOCK_SHORT:
+ ++rs->base_record.block_index;
+ rs->base_record.absolute_ts_us +=
+ record->same_file_next_block_short.relative_ts_tens_us * 10;
+ record_size = sizeof(record->same_file_next_block_short);
+ break;
+ }
+
+ rs->next_offset += record_size;
+ if (rs->next_offset > rl->rl_size - sizeof(*record)) {
+ rs->next_offset = 0;
+ ++rs->current_pass_no;
+ }
+ ++rs->current_record_no;
+}
+
+static void log_block_read(struct mount_info *mi, incfs_uuid_t *id,
+ int block_index)
+{
+ struct read_log *log = &mi->mi_log;
+ struct read_log_state *head, *tail;
+ s64 now_us;
+ s64 relative_us;
+ union log_record record;
+ size_t record_size;
+ uid_t uid = current_uid().val;
+ int block_delta;
+ bool same_file, same_uid;
+ bool next_block, close_block, very_close_block;
+ bool close_time, very_close_time, very_very_close_time;
+
+ /*
+ * This may read the old value, but it's OK to delay the logging start
+ * right after the configuration update.
+ */
+ if (READ_ONCE(log->rl_size) == 0)
+ return;
+
+ now_us = ktime_to_us(ktime_get());
+
+ spin_lock(&log->rl_lock);
+ if (log->rl_size == 0) {
+ spin_unlock(&log->rl_lock);
+ return;
+ }
+
+ head = &log->rl_head;
+ tail = &log->rl_tail;
+ relative_us = now_us - head->base_record.absolute_ts_us;
+
+ same_file = !memcmp(id, &head->base_record.file_id,
+ sizeof(incfs_uuid_t));
+ same_uid = uid == head->base_record.uid;
+
+ block_delta = block_index - head->base_record.block_index;
+ next_block = block_delta == 1;
+ very_close_block = block_delta >= S8_MIN && block_delta <= S8_MAX;
+ close_block = block_delta >= S16_MIN && block_delta <= S16_MAX;
+
+ very_very_close_time = relative_us < (1 << 5) * 10;
+ very_close_time = relative_us < (1 << 13);
+ close_time = relative_us < (1 << 16);
+
+ if (same_file && same_uid && next_block && very_very_close_time) {
+ record.same_file_next_block_short =
+ (struct same_file_next_block_short){
+ .type = SAME_FILE_NEXT_BLOCK_SHORT,
+ .relative_ts_tens_us = div_s64(relative_us, 10),
+ };
+ record_size = sizeof(struct same_file_next_block_short);
+ } else if (same_file && same_uid && next_block && very_close_time) {
+ record.same_file_next_block = (struct same_file_next_block){
+ .type = SAME_FILE_NEXT_BLOCK,
+ .relative_ts_us = relative_us,
+ };
+ record_size = sizeof(struct same_file_next_block);
+ } else if (same_file && same_uid && very_close_block &&
+ very_very_close_time) {
+ record.same_file_close_block_short =
+ (struct same_file_close_block_short){
+ .type = SAME_FILE_CLOSE_BLOCK_SHORT,
+ .relative_ts_tens_us = div_s64(relative_us, 10),
+ .block_index_delta = block_delta,
+ };
+ record_size = sizeof(struct same_file_close_block_short);
+ } else if (same_file && same_uid && close_block && very_close_time) {
+ record.same_file_close_block = (struct same_file_close_block){
+ .type = SAME_FILE_CLOSE_BLOCK,
+ .relative_ts_us = relative_us,
+ .block_index_delta = block_delta,
+ };
+ record_size = sizeof(struct same_file_close_block);
+ } else if (same_file && close_time) {
+ record.same_file = (struct same_file){
+ .type = SAME_FILE,
+ .block_index = block_index,
+ .relative_ts_us = relative_us,
+ .uid = uid,
+ };
+ record_size = sizeof(struct same_file);
+ } else {
+ record.full_record = (struct full_record){
+ .type = FULL,
+ .block_index = block_index,
+ .file_id = *id,
+ .absolute_ts_us = now_us,
+ .uid = uid,
+ };
+ head->base_record.file_id = *id;
+ record_size = sizeof(struct full_record);
+ }
+
+ head->base_record.block_index = block_index;
+ head->base_record.absolute_ts_us = now_us;
+
+ /* Advance tail beyond area we are going to overwrite */
+ while (tail->current_pass_no < head->current_pass_no &&
+ tail->next_offset < head->next_offset + record_size)
+ log_read_one_record(log, tail);
+
+ memcpy(((u8 *)log->rl_ring_buf) + head->next_offset, &record,
+ record_size);
+ head->next_offset += record_size;
+ if (head->next_offset > log->rl_size - sizeof(record)) {
+ head->next_offset = 0;
+ ++head->current_pass_no;
+ }
+ ++head->current_record_no;
+
+ spin_unlock(&log->rl_lock);
+ schedule_delayed_work(&log->ml_wakeup_work, msecs_to_jiffies(16));
+}
+
+static int validate_hash_tree(struct backing_file_context *bfc, struct file *f,
+ int block_index, struct mem_range data, u8 *buf)
+{
+ struct data_file *df = get_incfs_data_file(f);
+ u8 stored_digest[INCFS_MAX_HASH_SIZE] = {};
+ u8 calculated_digest[INCFS_MAX_HASH_SIZE] = {};
+ struct mtree *tree = NULL;
+ struct incfs_df_signature *sig = NULL;
+ int digest_size;
+ int hash_block_index = block_index;
+ int lvl;
+ int res;
+ loff_t hash_block_offset[INCFS_MAX_MTREE_LEVELS];
+ size_t hash_offset_in_block[INCFS_MAX_MTREE_LEVELS];
+ int hash_per_block;
+ pgoff_t file_pages;
+
+ /*
+ * Memory barrier to make sure tree is fully present if added via enable
+ * verity
+ */
+ tree = smp_load_acquire(&df->df_hash_tree);
+ sig = df->df_signature;
+ if (!tree || !sig)
+ return 0;
+
+ digest_size = tree->alg->digest_size;
+ hash_per_block = INCFS_DATA_FILE_BLOCK_SIZE / digest_size;
+ for (lvl = 0; lvl < tree->depth; lvl++) {
+ loff_t lvl_off = tree->hash_level_suboffset[lvl];
+
+ hash_block_offset[lvl] =
+ lvl_off + round_down(hash_block_index * digest_size,
+ INCFS_DATA_FILE_BLOCK_SIZE);
+ hash_offset_in_block[lvl] = hash_block_index * digest_size %
+ INCFS_DATA_FILE_BLOCK_SIZE;
+ hash_block_index /= hash_per_block;
+ }
+
+ memcpy(stored_digest, tree->root_hash, digest_size);
+
+ file_pages = DIV_ROUND_UP(df->df_size, INCFS_DATA_FILE_BLOCK_SIZE);
+ for (lvl = tree->depth - 1; lvl >= 0; lvl--) {
+ pgoff_t hash_page =
+ file_pages +
+ hash_block_offset[lvl] / INCFS_DATA_FILE_BLOCK_SIZE;
+ struct page *page = find_get_page_flags(
+ f->f_inode->i_mapping, hash_page, FGP_ACCESSED);
+
+ if (page && PageChecked(page)) {
+ u8 *addr = kmap_atomic(page);
+
+ memcpy(stored_digest, addr + hash_offset_in_block[lvl],
+ digest_size);
+
+ kunmap_atomic(addr);
+ put_page(page);
+ continue;
+ }
+
+ if (page)
+ put_page(page);
+
+ res = incfs_kread(bfc, buf, INCFS_DATA_FILE_BLOCK_SIZE,
+ hash_block_offset[lvl] + sig->hash_offset);
+ if (res < 0)
+ return res;
+ if (res != INCFS_DATA_FILE_BLOCK_SIZE)
+ return -EIO;
+ res = incfs_calc_digest(tree->alg,
+ range(buf, INCFS_DATA_FILE_BLOCK_SIZE),
+ range(calculated_digest, digest_size));
+ if (res)
+ return res;
+
+ if (memcmp(stored_digest, calculated_digest, digest_size)) {
+ int i;
+ bool zero = true;
+
+ pr_warn("incfs: Hash mismatch lvl:%d blk:%d\n",
+ lvl, block_index);
+ for (i = 0; i < digest_size; i++)
+ if (stored_digest[i]) {
+ zero = false;
+ break;
+ }
+
+ if (zero)
+ pr_debug("Note saved_digest all zero - did you forget to load the hashes?\n");
+ return -EBADMSG;
+ }
+
+ memcpy(stored_digest, buf + hash_offset_in_block[lvl],
+ digest_size);
+
+ page = grab_cache_page(f->f_inode->i_mapping, hash_page);
+ if (page) {
+ u8 *addr = kmap_atomic(page);
+
+ memcpy(addr, buf, INCFS_DATA_FILE_BLOCK_SIZE);
+ kunmap_atomic(addr);
+ SetPageChecked(page);
+ SetPageUptodate(page);
+ unlock_page(page);
+ put_page(page);
+ }
+ }
+
+ res = incfs_calc_digest(tree->alg, data,
+ range(calculated_digest, digest_size));
+ if (res)
+ return res;
+
+ if (memcmp(stored_digest, calculated_digest, digest_size)) {
+ pr_debug("Leaf hash mismatch blk:%d\n", block_index);
+ return -EBADMSG;
+ }
+
+ return 0;
+}
+
+static struct data_file_segment *get_file_segment(struct data_file *df,
+ int block_index)
+{
+ int seg_idx = block_index % ARRAY_SIZE(df->df_segments);
+
+ return &df->df_segments[seg_idx];
+}
+
+static bool is_data_block_present(struct data_file_block *block)
+{
+ return (block->db_backing_file_data_offset != 0) &&
+ (block->db_stored_size != 0);
+}
+
+/* Decode an on-disk (little-endian) blockmap entry into in-memory form. */
+static void convert_data_file_block(struct incfs_blockmap_entry *bme,
+				    struct data_file_block *res_block)
+{
+	u64 offset_hi = le16_to_cpu(bme->me_data_offset_hi);
+	u64 offset_lo = le32_to_cpu(bme->me_data_offset_lo);
+
+	res_block->db_backing_file_data_offset = (offset_hi << 32) | offset_lo;
+	res_block->db_stored_size = le16_to_cpu(bme->me_data_size);
+	res_block->db_comp_alg = le16_to_cpu(bme->me_flags) &
+				 INCFS_BLOCK_COMPRESSED_MASK;
+}
+
+/*
+ * Read the blockmap entry for @index and decode it into @res_block.
+ * Returns 0 on success or a negative errno.
+ */
+static int get_data_file_block(struct data_file *df, int index,
+			       struct data_file_block *res_block)
+{
+	struct incfs_blockmap_entry bme = {};
+	int error;
+
+	if (!df || !res_block)
+		return -EFAULT;
+
+	if (index < 0 || df->df_blockmap_off == 0)
+		return -EINVAL;
+
+	error = incfs_read_blockmap_entry(df->df_backing_file_context, index,
+					  df->df_blockmap_off, &bme);
+	if (error)
+		return error;
+
+	convert_data_file_block(&bme, res_block);
+	return 0;
+}
+
+/*
+ * Check that the output buffer of @size bytes has room for one more
+ * struct incfs_filled_range beyond the @size_out bytes already used.
+ * Written overflow-safely: the naive "size_out + sizeof(...) > size"
+ * comparison could wrap around u32 for a pathological size_out.
+ */
+static int check_room_for_one_range(u32 size, u32 size_out)
+{
+	if (size_out > size ||
+	    size - size_out < sizeof(struct incfs_filled_range))
+		return -ERANGE;
+	return 0;
+}
+
+/*
+ * Append one filled range to the user buffer and advance *size_out.
+ * Returns -ERANGE when the buffer is full, -EFAULT on a faulting copy.
+ */
+static int copy_one_range(struct incfs_filled_range *range, void __user *buffer,
+			  u32 size, u32 *size_out)
+{
+	int error = check_room_for_one_range(size, *size_out);
+
+	if (error)
+		return error;
+
+	if (copy_to_user(((char __user *)buffer) + *size_out, range,
+			 sizeof(*range)))
+		return -EFAULT;
+
+	*size_out += sizeof(*range);
+	return 0;
+}
+
+#define READ_BLOCKMAP_ENTRIES 512
+/*
+ * Collect the ranges of already-present blocks (data and hash) into the
+ * user buffer described by @arg, starting at arg->start_index.
+ *
+ * Outputs:
+ *   arg->range_buffer_size_out - bytes of incfs_filled_range written
+ *   arg->index_out             - first block index not yet examined
+ * Also advances @fd's incremental scan position; once a scan covering
+ * the whole file completes, the cached filled-block counters on @df
+ * are refreshed from the gathered counts.
+ * Returns 0 on success or a negative errno (-ERANGE when the buffer
+ * fills up; the caller is expected to continue from index_out).
+ */
+int incfs_get_filled_blocks(struct data_file *df,
+			    struct incfs_file_data *fd,
+			    struct incfs_get_filled_blocks_args *arg)
+{
+	int error = 0;
+	bool in_range = false;
+	struct incfs_filled_range range;
+	void __user *buffer = u64_to_user_ptr(arg->range_buffer);
+	u32 size = arg->range_buffer_size;
+	u32 end_index =
+		arg->end_index ? arg->end_index : df->df_total_block_count;
+	u32 *size_out = &arg->range_buffer_size_out;
+	/* Start "full" so the first iteration triggers a batch read. */
+	int i = READ_BLOCKMAP_ENTRIES - 1;
+	int entries_read = 0;
+	struct incfs_blockmap_entry *bme;
+	int data_blocks_filled = 0;
+	int hash_blocks_filled = 0;
+
+	*size_out = 0;
+	if (end_index > df->df_total_block_count)
+		end_index = df->df_total_block_count;
+	arg->total_blocks_out = df->df_total_block_count;
+	arg->data_blocks_out = df->df_data_block_count;
+
+	/*
+	 * Fast path: every data block is known to be written, so the
+	 * requested region is a single filled range - no need to walk
+	 * the blockmap.
+	 */
+	if (atomic_read(&df->df_data_blocks_written) ==
+	    df->df_data_block_count) {
+		pr_debug("File marked full, fast get_filled_blocks");
+		if (arg->start_index > end_index) {
+			arg->index_out = arg->start_index;
+			return 0;
+		}
+		arg->index_out = arg->start_index;
+
+		error = check_room_for_one_range(size, *size_out);
+		if (error)
+			return error;
+
+		range = (struct incfs_filled_range){
+			.begin = arg->start_index,
+			.end = end_index,
+		};
+
+		error = copy_one_range(&range, buffer, size, size_out);
+		if (error)
+			return error;
+		arg->index_out = end_index;
+		return 0;
+	}
+
+	/* Batch blockmap-entry reads to amortize backing-file I/O. */
+	bme = kzalloc(sizeof(*bme) * READ_BLOCKMAP_ENTRIES,
+		      GFP_NOFS | __GFP_COMP);
+	if (!bme)
+		return -ENOMEM;
+
+	for (arg->index_out = arg->start_index; arg->index_out < end_index;
+	     ++arg->index_out) {
+		struct data_file_block dfb;
+
+		/* Refill the batch buffer every READ_BLOCKMAP_ENTRIES. */
+		if (++i == READ_BLOCKMAP_ENTRIES) {
+			entries_read = incfs_read_blockmap_entries(
+				df->df_backing_file_context, bme,
+				arg->index_out, READ_BLOCKMAP_ENTRIES,
+				df->df_blockmap_off);
+			if (entries_read < 0) {
+				error = entries_read;
+				break;
+			}
+
+			i = 0;
+		}
+
+		if (i >= entries_read) {
+			error = -EIO;
+			break;
+		}
+
+		convert_data_file_block(bme + i, &dfb);
+
+		/* Indexes past the data blocks belong to the hash tree. */
+		if (is_data_block_present(&dfb)) {
+			if (arg->index_out >= df->df_data_block_count)
+				++hash_blocks_filled;
+			else
+				++data_blocks_filled;
+		}
+
+		/* Still inside (or outside) the current run - keep going. */
+		if (is_data_block_present(&dfb) == in_range)
+			continue;
+
+		if (!in_range) {
+			error = check_room_for_one_range(size, *size_out);
+			if (error)
+				break;
+			in_range = true;
+			range.begin = arg->index_out;
+		} else {
+			range.end = arg->index_out;
+			error = copy_one_range(&range, buffer, size, size_out);
+			if (error) {
+				/* there will be another try out of the loop,
+				 * it will reset the index_out if it fails too
+				 */
+				break;
+			}
+			in_range = false;
+		}
+	}
+
+	/* Flush a run that was still open when the loop ended. */
+	if (in_range) {
+		range.end = arg->index_out;
+		error = copy_one_range(&range, buffer, size, size_out);
+		if (error)
+			arg->index_out = range.begin;
+	}
+
+	/* Restarting from index 0 resets the incremental scan state. */
+	if (arg->start_index == 0) {
+		fd->fd_get_block_pos = 0;
+		fd->fd_filled_data_blocks = 0;
+		fd->fd_filled_hash_blocks = 0;
+	}
+
+	/* Contiguous continuation: advance the scan position/counters. */
+	if (arg->start_index == fd->fd_get_block_pos) {
+		fd->fd_get_block_pos = arg->index_out + 1;
+		fd->fd_filled_data_blocks += data_blocks_filled;
+		fd->fd_filled_hash_blocks += hash_blocks_filled;
+	}
+
+	/*
+	 * A full pass over the file has completed - adopt the freshly
+	 * gathered counts if they exceed the cached counters.
+	 */
+	if (fd->fd_get_block_pos == df->df_total_block_count + 1) {
+		if (fd->fd_filled_data_blocks >
+		    atomic_read(&df->df_data_blocks_written))
+			atomic_set(&df->df_data_blocks_written,
+				   fd->fd_filled_data_blocks);
+
+		if (fd->fd_filled_hash_blocks >
+		    atomic_read(&df->df_hash_blocks_written))
+			atomic_set(&df->df_hash_blocks_written,
+				   fd->fd_filled_hash_blocks);
+	}
+
+	kfree(bme);
+	return error;
+}
+
+/*
+ * Acquire/release pair around pending_read->done: set_read_done() is
+ * called by the writer after the block has landed; is_read_done() is
+ * polled by waiters, with acquire ordering ensuring the block data
+ * written before the release is visible afterwards.
+ */
+static bool is_read_done(struct pending_read *read)
+{
+	return atomic_read_acquire(&read->done) != 0;
+}
+
+static void set_read_done(struct pending_read *read)
+{
+	atomic_set_release(&read->done, 1);
+}
+
+/*
+ * Notifies a given data file about pending read from a given block.
+ * Returns a new pending read entry.
+ */
+static struct pending_read *add_pending_read(struct data_file *df,
+ int block_index)
+{
+ struct pending_read *result = NULL;
+ struct data_file_segment *segment = NULL;
+ struct mount_info *mi = NULL;
+
+ segment = get_file_segment(df, block_index);
+ mi = df->df_mount_info;
+
+ result = kzalloc(sizeof(*result), GFP_NOFS);
+ if (!result)
+ return NULL;
+
+ result->file_id = df->df_id;
+ result->block_index = block_index;
+ result->timestamp_us = ktime_to_us(ktime_get());
+ result->uid = current_uid().val;
+
+ spin_lock(&mi->pending_read_lock);
+
+ result->serial_number = ++mi->mi_last_pending_read_number;
+ mi->mi_pending_reads_count++;
+
+ list_add_rcu(&result->mi_reads_list, &mi->mi_reads_list_head);
+ list_add_rcu(&result->segment_reads_list, &segment->reads_list_head);
+
+ spin_unlock(&mi->pending_read_lock);
+
+ wake_up_all(&mi->mi_pending_reads_notif_wq);
+ return result;
+}
+
+/* RCU callback: reclaim a pending_read after all readers have drained. */
+static void free_pending_read_entry(struct rcu_head *entry)
+{
+	kfree(container_of(entry, struct pending_read, rcu));
+}
+
+/*
+ * Notifies a given data file that pending read is completed.
+ * Unlinks the entry from both RCU lists under pending_read_lock and
+ * defers freeing until concurrent lockless readers are done.
+ */
+static void remove_pending_read(struct data_file *df, struct pending_read *read)
+{
+	struct mount_info *mi = NULL;
+
+	if (!df || !read) {
+		WARN_ON(!df);
+		WARN_ON(!read);
+		return;
+	}
+
+	mi = df->df_mount_info;
+
+	spin_lock(&mi->pending_read_lock);
+
+	list_del_rcu(&read->mi_reads_list);
+	list_del_rcu(&read->segment_reads_list);
+
+	mi->mi_pending_reads_count--;
+
+	spin_unlock(&mi->pending_read_lock);
+
+	/* Don't free. Wait for readers */
+	call_rcu(&read->rcu, free_pending_read_entry);
+}
+
+/*
+ * Mark as done every pending read waiting on block @index in @segment,
+ * then wake both the segment's waiters and the mount-wide
+ * blocks-written waiters.
+ */
+static void notify_pending_reads(struct mount_info *mi,
+				 struct data_file_segment *segment,
+				 int index)
+{
+	struct pending_read *entry = NULL;
+
+	/* Notify pending reads waiting for this block. */
+	rcu_read_lock();
+	list_for_each_entry_rcu(entry, &segment->reads_list_head,
+				segment_reads_list) {
+		if (entry->block_index == index)
+			set_read_done(entry);
+	}
+	rcu_read_unlock();
+	wake_up_all(&segment->new_data_arrival_wq);
+
+	/* Global progress counter used by mount-wide waiters. */
+	atomic_inc(&mi->mi_blocks_written);
+	wake_up_all(&mi->mi_blocks_written_notif_wq);
+}
+
+/*
+ * Look up @block_index in the file's blockmap; if the block is absent,
+ * register a pending read and wait (up to timeouts->max_pending_time_us)
+ * for userspace to supply it.
+ *
+ * On success fills *res_block and may set *delayed_min_us to an extra
+ * artificial delay the caller must apply before returning the data.
+ * Returns 0, -ETIME on timeout, -ENODATA, or another negative errno.
+ *
+ * NOTE(review): *delayed_min_us is stored through without a NULL check
+ * in two places here, while the tail of this function guards with
+ * "if (delayed_min_us && ...)" - presumably every caller passes a valid
+ * pointer whenever timeouts are in use; confirm at the call sites.
+ */
+static int wait_for_data_block(struct data_file *df, int block_index,
+			       struct data_file_block *res_block,
+			       struct incfs_read_data_file_timeouts *timeouts,
+			       unsigned int *delayed_min_us)
+{
+	struct data_file_block block = {};
+	struct data_file_segment *segment = NULL;
+	struct pending_read *read = NULL;
+	struct mount_info *mi = NULL;
+	int error;
+	int wait_res = 0;
+	unsigned int delayed_pending_us = 0;
+	bool delayed_pending = false;
+
+	if (!df || !res_block)
+		return -EFAULT;
+
+	if (block_index < 0 || block_index >= df->df_data_block_count)
+		return -EINVAL;
+
+	if (df->df_blockmap_off <= 0 || !df->df_mount_info)
+		return -ENODATA;
+
+	mi = df->df_mount_info;
+	segment = get_file_segment(df, block_index);
+
+	error = down_read_killable(&segment->rwsem);
+	if (error)
+		return error;
+
+	/* Look up the given block */
+	error = get_data_file_block(df, block_index, &block);
+
+	up_read(&segment->rwsem);
+
+	if (error)
+		return error;
+
+	/* If the block was found, just return it. No need to wait. */
+	if (is_data_block_present(&block)) {
+		*res_block = block;
+		if (timeouts && timeouts->min_time_us) {
+			*delayed_min_us = timeouts->min_time_us;
+			goto out;
+		}
+		return 0;
+	} else {
+		/* If it's not found, create a pending read */
+		if (timeouts && timeouts->max_pending_time_us) {
+			read = add_pending_read(df, block_index);
+			if (!read)
+				return -ENOMEM;
+		} else {
+			log_block_read(mi, &df->df_id, block_index);
+			return -ETIME;
+		}
+	}
+
+	/* Rest of function only applies if timeouts != NULL */
+	if (!timeouts) {
+		pr_warn("incfs: timeouts unexpectedly NULL\n");
+		return -EFSCORRUPTED;
+	}
+
+	/* Wait for notifications about block's arrival */
+	wait_res =
+		wait_event_interruptible_timeout(segment->new_data_arrival_wq,
+			(is_read_done(read)),
+			usecs_to_jiffies(timeouts->max_pending_time_us));
+
+	/* Woke up, the pending read is no longer needed. */
+	remove_pending_read(df, read);
+
+	if (wait_res == 0) {
+		/* Wait has timed out */
+		log_block_read(mi, &df->df_id, block_index);
+		return -ETIME;
+	}
+	if (wait_res < 0) {
+		/*
+		 * Only ERESTARTSYS is really expected here when a signal
+		 * comes while we wait.
+		 */
+		return wait_res;
+	}
+
+	delayed_pending = true;
+	delayed_pending_us = timeouts->max_pending_time_us -
+		jiffies_to_usecs(wait_res);
+	/* Top up the delay so at least min_pending_time_us elapses. */
+	if (timeouts->min_pending_time_us > delayed_pending_us)
+		*delayed_min_us = timeouts->min_pending_time_us -
+			delayed_pending_us;
+
+	error = down_read_killable(&segment->rwsem);
+	if (error)
+		return error;
+
+	/*
+	 * Re-read blocks info now, it has just arrived and
+	 * should be available.
+	 */
+	error = get_data_file_block(df, block_index, &block);
+	if (!error) {
+		if (is_data_block_present(&block))
+			*res_block = block;
+		else {
+			/*
+			 * Somehow wait finished successfully but block still
+			 * can't be found. It's not normal.
+			 */
+			pr_warn("incfs: Wait succeeded but block not found.\n");
+			error = -ENODATA;
+		}
+	}
+	up_read(&segment->rwsem);
+
+out:
+	if (error)
+		return error;
+
+	/* Account any delays in the mount-wide statistics. */
+	if (delayed_pending) {
+		mi->mi_reads_delayed_pending++;
+		mi->mi_reads_delayed_pending_us +=
+			delayed_pending_us;
+	}
+
+	if (delayed_min_us && *delayed_min_us) {
+		mi->mi_reads_delayed_min++;
+		mi->mi_reads_delayed_min_us += *delayed_min_us;
+	}
+
+	return 0;
+}
+
+/*
+ * Record a failed read (@result < 0) in the per-mount "last error"
+ * fields exported through sysfs. Successful reads are a no-op.
+ * Returns 0, or a negative errno if taking the mutex was interrupted.
+ */
+static int incfs_update_sysfs_error(struct file *file, int index, int result,
+				    struct mount_info *mi, struct data_file *df)
+{
+	int error;
+
+	if (result >= 0)
+		return 0;
+
+	error = mutex_lock_interruptible(&mi->mi_le_mutex);
+	if (error)
+		return error;
+
+	mi->mi_le_file_id = df->df_id;
+	mi->mi_le_time_us = ktime_to_us(ktime_get());
+	mi->mi_le_page = index;
+	mi->mi_le_errno = result;
+	mi->mi_le_uid = current_uid().val;
+	mutex_unlock(&mi->mi_le_mutex);
+
+	return 0;
+}
+
+/*
+ * Read data block @index of the file behind @f into @dst, waiting for
+ * the block to arrive if necessary (per @timeouts). @tmp is scratch
+ * space for compressed blocks and must hold at least two data blocks.
+ * *delayed_min_us returns any extra delay the caller must still apply.
+ * Returns the number of bytes placed in @dst or a negative errno;
+ * failures are also accounted in mount statistics and sysfs state.
+ */
+ssize_t incfs_read_data_file_block(struct mem_range dst, struct file *f,
+			int index, struct mem_range tmp,
+			struct incfs_read_data_file_timeouts *timeouts,
+			unsigned int *delayed_min_us)
+{
+	loff_t pos;
+	ssize_t result;
+	size_t bytes_to_read;
+	struct mount_info *mi = NULL;
+	struct backing_file_context *bfc = NULL;
+	struct data_file_block block = {};
+	struct data_file *df = get_incfs_data_file(f);
+
+	if (!dst.data || !df || !tmp.data)
+		return -EFAULT;
+
+	if (tmp.len < 2 * INCFS_DATA_FILE_BLOCK_SIZE)
+		return -ERANGE;
+
+	mi = df->df_mount_info;
+	bfc = df->df_backing_file_context;
+
+	result = wait_for_data_block(df, index, &block, timeouts,
+				     delayed_min_us);
+	if (result < 0)
+		goto out;
+
+	pos = block.db_backing_file_data_offset;
+	if (block.db_comp_alg == COMPRESSION_NONE) {
+		/* Uncompressed: read straight into the destination. */
+		bytes_to_read = min(dst.len, block.db_stored_size);
+		result = incfs_kread(bfc, dst.data, bytes_to_read, pos);
+
+		/* Some data was read, but not enough */
+		if (result >= 0 && result != bytes_to_read)
+			result = -EIO;
+	} else {
+		/* Compressed: read into scratch, then decompress. */
+		bytes_to_read = min(tmp.len, block.db_stored_size);
+		result = incfs_kread(bfc, tmp.data, bytes_to_read, pos);
+		if (result == bytes_to_read) {
+			result =
+				decompress(mi, range(tmp.data, bytes_to_read),
+					   dst, block.db_comp_alg);
+			if (result < 0) {
+				const char *name =
+					bfc->bc_file->f_path.dentry->d_name.name;
+
+				pr_warn_once("incfs: Decompression error. %s",
+					     name);
+			}
+		} else if (result >= 0) {
+			/* Some data was read, but not enough */
+			result = -EIO;
+		}
+	}
+
+	/* Verify against the Merkle tree before exposing the data. */
+	if (result > 0) {
+		int err = validate_hash_tree(bfc, f, index, dst, tmp.data);
+
+		if (err < 0)
+			result = err;
+	}
+
+	if (result >= 0)
+		log_block_read(mi, &df->df_id, index);
+
+out:
+	/* Classify failures for the per-mount statistics counters. */
+	if (result == -ETIME)
+		mi->mi_reads_failed_timed_out++;
+	else if (result == -EBADMSG)
+		mi->mi_reads_failed_hash_verification++;
+	else if (result < 0)
+		mi->mi_reads_failed_other++;
+
+	incfs_update_sysfs_error(f, index, result, mi, df);
+
+	return result;
+}
+
+/*
+ * Read up to @dst.len bytes of the file's Merkle tree area, starting
+ * at @offset within that area.
+ * Returns the number of bytes read or a negative errno.
+ */
+ssize_t incfs_read_merkle_tree_blocks(struct mem_range dst,
+				      struct data_file *df, size_t offset)
+{
+	struct backing_file_context *bfc = NULL;
+	struct incfs_df_signature *sig = NULL;
+	size_t to_read = dst.len;
+
+	if (!dst.data || !df)
+		return -EFAULT;
+
+	sig = df->df_signature;
+	/*
+	 * A file without signature metadata has no hash tree; bail out
+	 * instead of dereferencing a NULL sig below.
+	 */
+	if (!sig)
+		return -ENODATA;
+
+	bfc = df->df_backing_file_context;
+
+	if (offset > sig->hash_size)
+		return -ERANGE;
+
+	if (offset + to_read > sig->hash_size)
+		to_read = sig->hash_size - offset;
+
+	return incfs_kread(bfc, dst.data, to_read, sig->hash_offset + offset);
+}
+
+/*
+ * Store one loader-supplied data block into the backing file.
+ * Sets *complete when this write brings the file to fully-written.
+ * Returns 0 on success (including when the block was already present)
+ * or a negative errno.
+ */
+int incfs_process_new_data_block(struct data_file *df,
+				 struct incfs_fill_block *block, u8 *data,
+				 bool *complete)
+{
+	struct mount_info *mi = NULL;
+	struct backing_file_context *bfc = NULL;
+	struct data_file_segment *segment = NULL;
+	struct data_file_block existing_block = {};
+	u16 flags = 0;
+	int error = 0;
+
+	if (!df || !block)
+		return -EFAULT;
+
+	bfc = df->df_backing_file_context;
+	mi = df->df_mount_info;
+
+	if (block->block_index >= df->df_data_block_count)
+		return -ERANGE;
+
+	segment = get_file_segment(df, block->block_index);
+	if (!segment)
+		return -EFAULT;
+
+	/* Translate the uapi compression id into on-disk block flags. */
+	if (block->compression == COMPRESSION_LZ4)
+		flags |= INCFS_BLOCK_COMPRESSED_LZ4;
+	else if (block->compression == COMPRESSION_ZSTD)
+		flags |= INCFS_BLOCK_COMPRESSED_ZSTD;
+	else if (block->compression)
+		return -EINVAL;
+
+	/* Cheap existence check under the shared lock first. */
+	error = down_read_killable(&segment->rwsem);
+	if (error)
+		return error;
+
+	error = get_data_file_block(df, block->block_index, &existing_block);
+
+	up_read(&segment->rwsem);
+
+	if (error)
+		return error;
+	if (is_data_block_present(&existing_block))
+		/* Block is already present, nothing to do here */
+		return 0;
+
+	error = down_write_killable(&segment->rwsem);
+	if (error)
+		return error;
+
+	/* Recheck inside write lock */
+	error = get_data_file_block(df, block->block_index, &existing_block);
+	if (error)
+		goto out_up_write;
+
+	if (is_data_block_present(&existing_block))
+		goto out_up_write;
+
+	error = mutex_lock_interruptible(&bfc->bc_mutex);
+	if (error)
+		goto out_up_write;
+
+	error = incfs_write_data_block_to_backing_file(bfc,
+			range(data, block->data_len), block->block_index,
+			df->df_blockmap_off, flags);
+	if (error)
+		goto out_mutex_unlock;
+
+	if (atomic_inc_return(&df->df_data_blocks_written)
+			>= df->df_data_block_count)
+		*complete = true;
+
+out_mutex_unlock:
+	mutex_unlock(&bfc->bc_mutex);
+	/* Wake anyone blocked in wait_for_data_block() on this block. */
+	if (!error)
+		notify_pending_reads(mi, segment, block->block_index);
+
+out_up_write:
+	up_write(&segment->rwsem);
+
+	if (error)
+		pr_debug("%d error: %d\n", block->block_index, error);
+	return error;
+}
+
+/*
+ * Copy the file's stored signature blob into @dst.
+ * Returns the number of bytes copied, 0 when the file has no
+ * signature, -E2BIG when @dst is too small, or a negative read error.
+ */
+int incfs_read_file_signature(struct data_file *df, struct mem_range dst)
+{
+	struct backing_file_context *bfc = df->df_backing_file_context;
+	struct incfs_df_signature *sig;
+	int read_res;
+
+	if (!dst.data)
+		return -EFAULT;
+
+	sig = df->df_signature;
+	if (!sig)
+		return 0;
+
+	if (dst.len < sig->sig_size)
+		return -E2BIG;
+
+	read_res = incfs_kread(bfc, dst.data, sig->sig_size, sig->sig_offset);
+	if (read_res < 0)
+		return read_res;
+	if (read_res != sig->sig_size)
+		return -EIO;
+
+	return read_res;
+}
+
+/*
+ * Store one loader-supplied hash (Merkle tree) block into the backing
+ * file's dedicated hash area.
+ * Returns 0 on success or a negative errno.
+ *
+ * Cleanups versus the previous version: the dead "if (!df)" recheck
+ * (df was already validated and dereferenced above) and the unused
+ * local mount_info pointer are gone.
+ */
+int incfs_process_new_hash_block(struct data_file *df,
+				 struct incfs_fill_block *block, u8 *data)
+{
+	struct backing_file_context *bfc = NULL;
+	struct mtree *hash_tree = NULL;
+	struct incfs_df_signature *sig = NULL;
+	loff_t hash_area_base = 0;
+	loff_t hash_area_size = 0;
+	int error = 0;
+
+	if (!df || !block)
+		return -EFAULT;
+
+	if (!(block->flags & INCFS_BLOCK_FLAGS_HASH))
+		return -EINVAL;
+
+	bfc = df->df_backing_file_context;
+
+	hash_tree = df->df_hash_tree;
+	sig = df->df_signature;
+	if (!hash_tree || !sig || sig->hash_offset == 0)
+		return -ENOTSUPP;
+
+	hash_area_base = sig->hash_offset;
+	hash_area_size = sig->hash_size;
+	/*
+	 * Compute the end of the block in 64 bits: a 32-bit
+	 * block_index * block-size product could overflow before the
+	 * comparison and defeat the bounds check.
+	 */
+	if (hash_area_size < (loff_t)block->block_index *
+			INCFS_DATA_FILE_BLOCK_SIZE + block->data_len) {
+		/* Hash block goes beyond dedicated hash area of this file. */
+		return -ERANGE;
+	}
+
+	error = mutex_lock_interruptible(&bfc->bc_mutex);
+	if (!error) {
+		error = incfs_write_hash_block_to_backing_file(
+			bfc, range(data, block->data_len), block->block_index,
+			hash_area_base, df->df_blockmap_off, df->df_size);
+		mutex_unlock(&bfc->bc_mutex);
+	}
+	if (!error)
+		atomic_inc(&df->df_hash_blocks_written);
+
+	return error;
+}
+
+/*
+ * Metadata handler: record the blockmap location and total block count.
+ * Rejects a blockmap smaller than the already-known data block count.
+ * (The always-zero "error" local of the previous version is removed.)
+ */
+static int process_blockmap_md(struct incfs_blockmap *bm,
+			       struct metadata_handler *handler)
+{
+	struct data_file *df = handler->context;
+	u32 block_count = le32_to_cpu(bm->m_block_count);
+
+	if (!df)
+		return -EFAULT;
+
+	if (df->df_data_block_count > block_count)
+		return -EBADMSG;
+
+	df->df_total_block_count = block_count;
+	df->df_blockmap_off = le64_to_cpu(bm->m_base_offset);
+	return 0;
+}
+
+/*
+ * Metadata handler: load a signature record, read the signature blob
+ * from the backing file and build the in-memory Merkle tree for it.
+ * On success transfers ownership of both to the data_file.
+ */
+static int process_file_signature_md(struct incfs_file_signature *sg,
+				     struct metadata_handler *handler)
+{
+	struct data_file *df = handler->context;
+	struct mtree *hash_tree = NULL;
+	int error = 0;
+	struct incfs_df_signature *signature =
+		kzalloc(sizeof(*signature), GFP_NOFS);
+	void *buf = NULL;
+	ssize_t read;
+
+	if (!signature)
+		return -ENOMEM;
+
+	if (!df || !df->df_backing_file_context ||
+	    !df->df_backing_file_context->bc_file) {
+		error = -ENOENT;
+		goto out;
+	}
+
+	signature->hash_offset = le64_to_cpu(sg->sg_hash_tree_offset);
+	signature->hash_size = le32_to_cpu(sg->sg_hash_tree_size);
+	signature->sig_offset = le64_to_cpu(sg->sg_sig_offset);
+	signature->sig_size = le32_to_cpu(sg->sg_sig_size);
+
+	buf = kzalloc(signature->sig_size, GFP_NOFS);
+	if (!buf) {
+		error = -ENOMEM;
+		goto out;
+	}
+
+	read = incfs_kread(df->df_backing_file_context, buf,
+			   signature->sig_size, signature->sig_offset);
+	if (read < 0) {
+		error = read;
+		goto out;
+	}
+
+	if (read != signature->sig_size) {
+		error = -EINVAL;
+		goto out;
+	}
+
+	hash_tree = incfs_alloc_mtree(range(buf, signature->sig_size),
+				      df->df_data_block_count);
+	if (IS_ERR(hash_tree)) {
+		error = PTR_ERR(hash_tree);
+		hash_tree = NULL;
+		goto out;
+	}
+	/* The computed tree size must match what the record claims. */
+	if (hash_tree->hash_tree_area_size != signature->hash_size) {
+		error = -EINVAL;
+		goto out;
+	}
+	/*
+	 * Reject layouts where the hash or signature area does not lie
+	 * strictly before this metadata record in the backing file.
+	 */
+	if (signature->hash_size > 0 &&
+	    handler->md_record_offset <= signature->hash_offset) {
+		error = -EINVAL;
+		goto out;
+	}
+	if (handler->md_record_offset <= signature->sig_offset) {
+		error = -EINVAL;
+		goto out;
+	}
+	/* Success: hand ownership to df; NULL so cleanup won't free. */
+	df->df_hash_tree = hash_tree;
+	hash_tree = NULL;
+	df->df_signature = signature;
+	signature = NULL;
+out:
+	incfs_free_mtree(hash_tree);
+	kfree(signature);
+	kfree(buf);
+
+	return error;
+}
+
+/*
+ * Metadata handler: restore the persisted data/hash fill counters from
+ * an incfs_status record and remember where the record lives.
+ * A NULL-context guard is added for consistency with the sibling
+ * handlers (process_blockmap_md etc.).
+ */
+static int process_status_md(struct incfs_status *is,
+			     struct metadata_handler *handler)
+{
+	struct data_file *df = handler->context;
+
+	if (!df)
+		return -EFAULT;
+
+	df->df_initial_data_blocks_written =
+		le32_to_cpu(is->is_data_blocks_written);
+	atomic_set(&df->df_data_blocks_written,
+		   df->df_initial_data_blocks_written);
+
+	df->df_initial_hash_blocks_written =
+		le32_to_cpu(is->is_hash_blocks_written);
+	atomic_set(&df->df_hash_blocks_written,
+		   df->df_initial_hash_blocks_written);
+
+	df->df_status_offset = handler->md_record_offset;
+	return 0;
+}
+
+/*
+ * Metadata handler: record the location and size of an fs-verity
+ * signature blob stored in the backing file.
+ *
+ * NOTE(review): an oversized vs_size yields -EFAULT, which normally
+ * means "bad address"; -EBADMSG or -EINVAL would better describe
+ * corrupt on-disk metadata - confirm nothing depends on this value
+ * before changing it.
+ */
+static int process_file_verity_signature_md(
+		struct incfs_file_verity_signature *vs,
+		struct metadata_handler *handler)
+{
+	struct data_file *df = handler->context;
+	struct incfs_df_verity_signature *verity_signature;
+
+	if (!df)
+		return -EFAULT;
+
+	verity_signature = kzalloc(sizeof(*verity_signature), GFP_NOFS);
+	if (!verity_signature)
+		return -ENOMEM;
+
+	verity_signature->offset = le64_to_cpu(vs->vs_offset);
+	verity_signature->size = le32_to_cpu(vs->vs_size);
+	if (verity_signature->size > FS_VERITY_MAX_SIGNATURE_SIZE) {
+		kfree(verity_signature);
+		return -EFAULT;
+	}
+
+	df->df_verity_signature = verity_signature;
+	return 0;
+}
+
+/*
+ * Walk the backing file's chain of metadata records, dispatching each
+ * record to the handlers above, then cross-check the resulting block
+ * counts against the hash tree size.
+ * Returns the number of records processed or a negative errno.
+ */
+static int incfs_scan_metadata_chain(struct data_file *df)
+{
+	struct metadata_handler *handler = NULL;
+	int result = 0;
+	int records_count = 0;
+	int error = 0;
+	struct backing_file_context *bfc = NULL;
+	int nondata_block_count;
+
+	if (!df || !df->df_backing_file_context)
+		return -EFAULT;
+
+	bfc = df->df_backing_file_context;
+
+	handler = kzalloc(sizeof(*handler), GFP_NOFS);
+	if (!handler)
+		return -ENOMEM;
+
+	handler->md_record_offset = df->df_metadata_off;
+	handler->context = df;
+	handler->handle_blockmap = process_blockmap_md;
+	handler->handle_signature = process_file_signature_md;
+	handler->handle_status = process_status_md;
+	handler->handle_verity_signature = process_file_verity_signature_md;
+
+	/* md_record_offset is advanced by each successfully read record. */
+	while (handler->md_record_offset > 0) {
+		error = incfs_read_next_metadata_record(bfc, handler);
+		if (error) {
+			pr_warn("incfs: Error during reading incfs-metadata record. Offset: %lld Record #%d Error code: %d\n",
+				handler->md_record_offset, records_count + 1,
+				-error);
+			break;
+		}
+		records_count++;
+	}
+	if (error) {
+		pr_warn("incfs: Error %d after reading %d incfs-metadata records.\n",
+			-error, records_count);
+		result = error;
+	} else
+		result = records_count;
+
+	nondata_block_count = df->df_total_block_count -
+		df->df_data_block_count;
+	if (df->df_hash_tree) {
+		int hash_block_count = get_blocks_count_for_size(
+			df->df_hash_tree->hash_tree_area_size);
+
+		/*
+		 * Files that were created with a hash tree have the hash tree
+		 * included in the block map, i.e. nondata_block_count ==
+		 * hash_block_count. Files whose hash tree was added by
+		 * FS_IOC_ENABLE_VERITY will still have the original block
+		 * count, i.e. nondata_block_count == 0.
+		 */
+		if (nondata_block_count != hash_block_count &&
+		    nondata_block_count != 0)
+			result = -EINVAL;
+	} else if (nondata_block_count != 0) {
+		result = -EINVAL;
+	}
+
+	kfree(handler);
+	return result;
+}
+
+/*
+ * Quickly checks if there are pending reads with a serial number larger
+ * than a given one.
+ */
+/*
+ * Quickly checks if there are pending reads with a serial number larger
+ * than a given one.
+ */
+bool incfs_fresh_pending_reads_exist(struct mount_info *mi, int last_number)
+{
+	bool result;
+
+	spin_lock(&mi->pending_read_lock);
+	result = mi->mi_pending_reads_count > 0 &&
+		 mi->mi_last_pending_read_number > last_number;
+	spin_unlock(&mi->pending_read_lock);
+
+	return result;
+}
+
+/*
+ * Copy up to @reads_size pending reads with serial numbers above
+ * @sn_lowerbound into @reads and/or @reads2 (either may be NULL).
+ * *new_max_sn is raised to the largest serial number reported.
+ * Returns the number of reads copied, or a negative errno.
+ */
+int incfs_collect_pending_reads(struct mount_info *mi, int sn_lowerbound,
+				struct incfs_pending_read_info *reads,
+				struct incfs_pending_read_info2 *reads2,
+				int reads_size, int *new_max_sn)
+{
+	int reported_reads = 0;
+	struct pending_read *entry = NULL;
+
+	if (!mi)
+		return -EFAULT;
+
+	if (reads_size <= 0)
+		return 0;
+
+	/* Cheap early-out before taking the RCU read lock. */
+	if (!incfs_fresh_pending_reads_exist(mi, sn_lowerbound))
+		return 0;
+
+	rcu_read_lock();
+
+	/* Lockless walk; pairs with list_add_rcu() in add_pending_read(). */
+	list_for_each_entry_rcu(entry, &mi->mi_reads_list_head, mi_reads_list) {
+		if (entry->serial_number <= sn_lowerbound)
+			continue;
+
+		if (reads) {
+			reads[reported_reads].file_id = entry->file_id;
+			reads[reported_reads].block_index = entry->block_index;
+			reads[reported_reads].serial_number =
+				entry->serial_number;
+			reads[reported_reads].timestamp_us =
+				entry->timestamp_us;
+		}
+
+		if (reads2) {
+			reads2[reported_reads].file_id = entry->file_id;
+			reads2[reported_reads].block_index = entry->block_index;
+			reads2[reported_reads].serial_number =
+				entry->serial_number;
+			reads2[reported_reads].timestamp_us =
+				entry->timestamp_us;
+			reads2[reported_reads].uid = entry->uid;
+		}
+
+		if (entry->serial_number > *new_max_sn)
+			*new_max_sn = entry->serial_number;
+
+		reported_reads++;
+		if (reported_reads >= reads_size)
+			break;
+	}
+
+	rcu_read_unlock();
+
+	return reported_reads;
+}
+
+/* Snapshot the log head under rl_lock so the copy is self-consistent. */
+struct read_log_state incfs_get_log_state(struct mount_info *mi)
+{
+	struct read_log *log = &mi->mi_log;
+	struct read_log_state head_copy;
+
+	spin_lock(&log->rl_lock);
+	head_copy = log->rl_head;
+	spin_unlock(&log->rl_lock);
+
+	return head_copy;
+}
+
+/*
+ * Number of log records the reader at @state has not yet consumed.
+ * If the log generation changed, the reader's position is stale and
+ * everything currently buffered counts as uncollected.
+ */
+int incfs_get_uncollected_logs_count(struct mount_info *mi,
+				     const struct read_log_state *state)
+{
+	struct read_log *log = &mi->mi_log;
+	u64 head_no, tail_no;
+	u32 generation;
+
+	spin_lock(&log->rl_lock);
+	tail_no = log->rl_tail.current_record_no;
+	head_no = log->rl_head.current_record_no;
+	generation = log->rl_head.generation_id;
+	spin_unlock(&log->rl_lock);
+
+	if (generation == state->generation_id)
+		return head_no - max_t(u64, tail_no, state->current_record_no);
+
+	return head_no - tail_no;
+}
+
+/*
+ * Drain up to @reads_size records from the read log into @reads and/or
+ * @reads2 (either may be NULL), advancing the reader cursor in *state.
+ * A generation mismatch (the log was reset or resized) restarts the
+ * cursor; a reader that fell behind the tail is snapped forward to it.
+ * Returns the number of records copied.
+ */
+int incfs_collect_logged_reads(struct mount_info *mi,
+			       struct read_log_state *state,
+			       struct incfs_pending_read_info *reads,
+			       struct incfs_pending_read_info2 *reads2,
+			       int reads_size)
+{
+	int dst_idx;
+	struct read_log *log = &mi->mi_log;
+	struct read_log_state *head, *tail;
+
+	spin_lock(&log->rl_lock);
+	head = &log->rl_head;
+	tail = &log->rl_tail;
+
+	if (state->generation_id != head->generation_id) {
+		pr_debug("read ptr is wrong generation: %u/%u",
+			 state->generation_id, head->generation_id);
+
+		*state = (struct read_log_state){
+			.generation_id = head->generation_id,
+		};
+	}
+
+	if (state->current_record_no < tail->current_record_no) {
+		pr_debug("read ptr is behind, moving: %u/%u -> %u/%u\n",
+			 (u32)state->next_offset,
+			 (u32)state->current_pass_no,
+			 (u32)tail->next_offset, (u32)tail->current_pass_no);
+
+		*state = *tail;
+	}
+
+	for (dst_idx = 0; dst_idx < reads_size; dst_idx++) {
+		/* Caught up with the head: nothing left to collect. */
+		if (state->current_record_no == head->current_record_no)
+			break;
+
+		/* Decode the next record into state->base_record. */
+		log_read_one_record(log, state);
+
+		if (reads)
+			reads[dst_idx] = (struct incfs_pending_read_info) {
+				.file_id = state->base_record.file_id,
+				.block_index = state->base_record.block_index,
+				.serial_number = state->current_record_no,
+				.timestamp_us =
+					state->base_record.absolute_ts_us,
+			};
+
+		if (reads2)
+			reads2[dst_idx] = (struct incfs_pending_read_info2) {
+				.file_id = state->base_record.file_id,
+				.block_index = state->base_record.block_index,
+				.serial_number = state->current_record_no,
+				.timestamp_us =
+					state->base_record.absolute_ts_us,
+				.uid = state->base_record.uid,
+			};
+	}
+
+	spin_unlock(&log->rl_lock);
+	return dst_idx;
+}
+
diff --git a/fs/incfs/data_mgmt.h b/fs/incfs/data_mgmt.h
new file mode 100644
--- /dev/null
+++ b/fs/incfs/data_mgmt.h
@@ -0,0 +1,551 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2019 Google LLC
+ */
+#ifndef _INCFS_DATA_MGMT_H
+#define _INCFS_DATA_MGMT_H
+
+#include <linux/cred.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+#include <linux/rcupdate.h>
+#include <linux/completion.h>
+#include <linux/wait.h>
+#include <linux/zstd.h>
+#include <crypto/hash.h>
+#include <linux/rwsem.h>
+
+#include <uapi/linux/incrementalfs.h>
+
+#include "internal.h"
+#include "pseudo_files.h"
+
+#define SEGMENTS_PER_FILE 3
+
+/*
+ * Read-log record encodings. The log is delta-compressed: a FULL record
+ * carries complete information, while the SAME_FILE* variants encode only
+ * the differences from the current base record (see
+ * read_log_state.base_record), in progressively smaller formats.
+ */
+enum LOG_RECORD_TYPE {
+	FULL,
+	SAME_FILE,
+	SAME_FILE_CLOSE_BLOCK,
+	SAME_FILE_CLOSE_BLOCK_SHORT,
+	SAME_FILE_NEXT_BLOCK,
+	SAME_FILE_NEXT_BLOCK_SHORT,
+};
+
+struct full_record {
+	enum LOG_RECORD_TYPE type : 3; /* FULL */
+	u32 block_index : 29;
+	incfs_uuid_t file_id;
+	u64 absolute_ts_us;
+	uid_t uid;
+} __packed; /* 32 bytes */
+
+struct same_file {
+	enum LOG_RECORD_TYPE type : 3; /* SAME_FILE */
+	u32 block_index : 29;
+	uid_t uid;
+	u16 relative_ts_us; /* max 2^16 us ~= 64 ms */
+} __packed; /* 10 bytes */
+
+struct same_file_close_block {
+	enum LOG_RECORD_TYPE type : 3; /* SAME_FILE_CLOSE_BLOCK */
+	u16 relative_ts_us : 13; /* max 2^13 us ~= 8 ms */
+	s16 block_index_delta;
+} __packed; /* 4 bytes */
+
+struct same_file_close_block_short {
+	enum LOG_RECORD_TYPE type : 3; /* SAME_FILE_CLOSE_BLOCK_SHORT */
+	u8 relative_ts_tens_us : 5; /* max 2^5*10 us ~= 320 us */
+	s8 block_index_delta;
+} __packed; /* 2 bytes */
+
+struct same_file_next_block {
+	enum LOG_RECORD_TYPE type : 3; /* SAME_FILE_NEXT_BLOCK */
+	u16 relative_ts_us : 13; /* max 2^13 us ~= 8 ms */
+} __packed; /* 2 bytes */
+
+struct same_file_next_block_short {
+	enum LOG_RECORD_TYPE type : 3; /* SAME_FILE_NEXT_BLOCK_SHORT */
+	u8 relative_ts_tens_us : 5; /* max 2^5*10 us ~= 320 us */
+} __packed; /* 1 byte */
+
+/* Any one of the record encodings above; discriminated by the type field. */
+union log_record {
+	struct full_record full_record;
+	struct same_file same_file;
+	struct same_file_close_block same_file_close_block;
+	struct same_file_close_block_short same_file_close_block_short;
+	struct same_file_next_block same_file_next_block;
+	struct same_file_next_block_short same_file_next_block_short;
+};
+
+/* A cursor into the read log; used both for the writer and for readers. */
+struct read_log_state {
+	/* Log buffer generation id, incremented on configuration changes */
+	u32 generation_id;
+
+	/* Offset in rl_ring_buf to write into. */
+	u32 next_offset;
+
+	/* Current number of writer passes over rl_ring_buf */
+	u32 current_pass_no;
+
+	/* Current full_record to diff against */
+	struct full_record base_record;
+
+	/* Current record number counting from configuration change */
+	u64 current_record_no;
+};
+
+/* A ring buffer to save records about data blocks which were recently read. */
+struct read_log {
+	void *rl_ring_buf;
+
+	/* Size of rl_ring_buf */
+	int rl_size;
+
+	/* Writer cursor: the position of the newest record */
+	struct read_log_state rl_head;
+
+	/* Cursor of the oldest record still present in the buffer */
+	struct read_log_state rl_tail;
+
+	/* A lock to protect the above fields */
+	spinlock_t rl_lock;
+
+	/* A queue of waiters who want to be notified about reads */
+	wait_queue_head_t ml_notif_wq;
+
+	/* A work item to wake up those waiters without slowing down readers */
+	struct delayed_work ml_wakeup_work;
+};
+
+/* Options supplied at mount (or remount) time. */
+struct mount_options {
+	unsigned int read_timeout_ms;
+	unsigned int readahead_pages;
+	unsigned int read_log_pages;
+	unsigned int read_log_wakeup_count;
+	bool report_uid;
+	char *sysfs_name;
+};
+
+/* Per-mount state of an incfs instance. */
+struct mount_info {
+	/* Superblock of this incfs mount */
+	struct super_block *mi_sb;
+
+	/* Path to the backing directory this mount is layered over */
+	struct path mi_backing_dir_path;
+
+	struct dentry *mi_index_dir;
+	/* For stacking mounts, if true, this indicates if the index dir needs
+	 * to be freed for this SB otherwise it was created by lower level SB */
+	bool mi_index_free;
+
+	struct dentry *mi_incomplete_dir;
+	/* For stacking mounts, if true, this indicates if the incomplete dir
+	 * needs to be freed for this SB. Similar to mi_index_free */
+	bool mi_incomplete_free;
+
+	/* Credentials of the mount owner; used when doing backing file I/O */
+	const struct cred *mi_owner;
+
+	struct mount_options mi_options;
+
+	/* This mutex is to be taken before create, rename, delete */
+	struct mutex mi_dir_struct_mutex;
+
+	/*
+	 * A queue of waiters who want to be notified about new pending reads.
+	 */
+	wait_queue_head_t mi_pending_reads_notif_wq;
+
+	/*
+	 * Protects - RCU safe:
+	 *		- reads_list_head
+	 *		- mi_pending_reads_count
+	 *		- mi_last_pending_read_number
+	 *		- data_file_segment.reads_list_head
+	 */
+	spinlock_t pending_read_lock;
+
+	/* List of active pending_read objects */
+	struct list_head mi_reads_list_head;
+
+	/* Total number of items in reads_list_head */
+	int mi_pending_reads_count;
+
+	/*
+	 * Last serial number that was assigned to a pending read.
+	 * 0 means no pending reads have been seen yet.
+	 */
+	int mi_last_pending_read_number;
+
+	/* Temporary buffer for read logger. */
+	struct read_log mi_log;
+
+	/* SELinux needs special xattrs on our pseudo files */
+	struct mem_range pseudo_file_xattr[PSEUDO_FILE_COUNT];
+
+	/* A queue of waiters who want to be notified about blocks_written */
+	wait_queue_head_t mi_blocks_written_notif_wq;
+
+	/* Number of blocks written since mount */
+	atomic_t mi_blocks_written;
+
+	/* Per UID read timeouts */
+	spinlock_t mi_per_uid_read_timeouts_lock;
+	struct incfs_per_uid_read_timeouts *mi_per_uid_read_timeouts;
+	int mi_per_uid_read_timeouts_size;
+
+	/* zstd workspace */
+	struct mutex mi_zstd_workspace_mutex;
+	void *mi_zstd_workspace;
+	ZSTD_DStream *mi_zstd_stream;
+	struct delayed_work mi_zstd_cleanup_work;
+
+	/* sysfs node */
+	struct incfs_sysfs_node *mi_sysfs_node;
+
+	/* Last error information */
+	struct mutex mi_le_mutex;
+	incfs_uuid_t mi_le_file_id;
+	u64 mi_le_time_us;
+	u32 mi_le_page;
+	u32 mi_le_errno;
+	uid_t mi_le_uid;
+
+	/* Number of reads timed out */
+	u32 mi_reads_failed_timed_out;
+
+	/* Number of reads failed because hash verification failed */
+	u32 mi_reads_failed_hash_verification;
+
+	/* Number of reads failed for another reason */
+	u32 mi_reads_failed_other;
+
+	/* Number of reads delayed because page had to be fetched */
+	u32 mi_reads_delayed_pending;
+
+	/* Total time waiting for pages to be fetched */
+	u64 mi_reads_delayed_pending_us;
+
+	/*
+	 * Number of reads delayed because of per-uid min_time_us or
+	 * min_pending_time_us settings
+	 */
+	u32 mi_reads_delayed_min;
+
+	/* Total time waiting because of per-uid min_time_us or
+	 * min_pending_time_us settings.
+	 *
+	 * Note that if a read is initially delayed because we have to wait for
+	 * the page, then further delayed because of min_pending_time_us
+	 * setting, this counter gets incremented by only the further delay
+	 * time.
+	 */
+	u64 mi_reads_delayed_min_us;
+};
+
+/* Location and size of a single data block within the backing file. */
+struct data_file_block {
+	/* Offset of the stored block data in the backing file */
+	loff_t db_backing_file_data_offset;
+
+	/* Number of bytes actually stored for the block */
+	size_t db_stored_size;
+
+	/* Compression algorithm the block was stored with */
+	enum incfs_compression_alg db_comp_alg;
+};
+
+/* A read waiting for its data block to be supplied by the data loader. */
+struct pending_read {
+	incfs_uuid_t file_id;
+
+	/* When the read was issued */
+	s64 timestamp_us;
+
+	/* Non-zero once the read has been satisfied */
+	atomic_t done;
+
+	int block_index;
+
+	int serial_number;
+
+	uid_t uid;
+
+	/* Link in mount_info.mi_reads_list_head */
+	struct list_head mi_reads_list;
+
+	/* Link in data_file_segment.reads_list_head */
+	struct list_head segment_reads_list;
+
+	struct rcu_head rcu;
+};
+
+/* Per-segment state of a data file; see data_file.df_segments. */
+struct data_file_segment {
+	wait_queue_head_t new_data_arrival_wq;
+
+	/* Protects reads and writes from the blockmap */
+	struct rw_semaphore rwsem;
+
+	/* List of active pending_read objects belonging to this segment */
+	/* Protected by mount_info.pending_reads_mutex */
+	struct list_head reads_list_head;
+};
+
+/*
+ * Extra info associated with a file. Just a few bytes set by a user.
+ */
+struct file_attr {
+	loff_t fa_value_offset;
+
+	size_t fa_value_size;
+
+	u32 fa_crc;
+};
+
+
+/* In-memory representation of one incfs data file. */
+struct data_file {
+	/* Context of the backing file holding this file's data and metadata */
+	struct backing_file_context *df_backing_file_context;
+
+	struct mount_info *df_mount_info;
+
+	/* Unique ID of this file */
+	incfs_uuid_t df_id;
+
+	/*
+	 * Array of segments used to reduce lock contention for the file.
+	 * Segment is chosen for a block depends on the block's index.
+	 */
+	struct data_file_segment df_segments[SEGMENTS_PER_FILE];
+
+	/* Base offset of the first metadata record. */
+	loff_t df_metadata_off;
+
+	/* Base offset of the block map. */
+	loff_t df_blockmap_off;
+
+	/* File size in bytes */
+	loff_t df_size;
+
+	/* File header flags */
+	u32 df_header_flags;
+
+	/* File size in DATA_FILE_BLOCK_SIZE blocks */
+	int df_data_block_count;
+
+	/* Total number of blocks, data + hash */
+	int df_total_block_count;
+
+	/* For mapped files, the offset into the actual file */
+	loff_t df_mapped_offset;
+
+	/* Number of data blocks written to file */
+	atomic_t df_data_blocks_written;
+
+	/* Number of data blocks in the status block */
+	u32 df_initial_data_blocks_written;
+
+	/* Number of hash blocks written to file */
+	atomic_t df_hash_blocks_written;
+
+	/* Number of hash blocks in the status block */
+	u32 df_initial_hash_blocks_written;
+
+	/* Offset to status metadata header */
+	loff_t df_status_offset;
+
+	/*
+	 * Mutex acquired while enabling verity. Note that df_hash_tree is set
+	 * by enable verity.
+	 *
+	 * The backing file mutex bc_mutex may be taken while this mutex is
+	 * held.
+	 */
+	struct mutex df_enable_verity;
+
+	/*
+	 * Set either at construction time or during enabling verity. In the
+	 * latter case, set via smp_store_release, so use smp_load_acquire to
+	 * read it.
+	 */
+	struct mtree *df_hash_tree;
+
+	/* Guaranteed set if df_hash_tree is set. */
+	struct incfs_df_signature *df_signature;
+
+	/*
+	 * The verity file digest, set when verity is enabled and the file has
+	 * been opened
+	 */
+	struct mem_range df_verity_file_digest;
+
+	struct incfs_df_verity_signature *df_verity_signature;
+};
+
+/* Per-open state of a directory on the incfs mount. */
+struct dir_file {
+	struct mount_info *mount_info;
+
+	/* Opened file of the corresponding backing directory */
+	struct file *backing_dir;
+};
+
+/* incfs inode private data; embeds the VFS inode (n_vfs_inode). */
+struct inode_info {
+	struct mount_info *n_mount_info; /* A mount, this file belongs to */
+
+	struct inode *n_backing_inode;
+
+	/* Lazily initialized; see make_inode_ready_for_data_ops() */
+	struct data_file *n_file;
+
+	struct inode n_vfs_inode;
+};
+
+/* Per-dentry private data (d_fsdata): path of the backing dentry. */
+struct dentry_info {
+	struct path backing_path;
+};
+
+enum FILL_PERMISSION {
+	CANT_FILL = 0,
+	CAN_FILL = 1,
+};
+
+/* Per-file-descriptor state for an opened data file. */
+struct incfs_file_data {
+	/* Does this file handle have INCFS_IOC_FILL_BLOCKS permission */
+	enum FILL_PERMISSION fd_fill_permission;
+
+	/* If INCFS_IOC_GET_FILLED_BLOCKS has been called, where are we */
+	int fd_get_block_pos;
+
+	/* And how many filled blocks are there up to that point */
+	int fd_filled_data_blocks;
+	int fd_filled_hash_blocks;
+};
+
+struct mount_info *incfs_alloc_mount_info(struct super_block *sb,
+ struct mount_options *options,
+ struct path *backing_dir_path);
+
+int incfs_realloc_mount_info(struct mount_info *mi,
+ struct mount_options *options);
+
+void incfs_free_mount_info(struct mount_info *mi);
+
+char *file_id_to_str(incfs_uuid_t id);
+struct dentry *incfs_lookup_dentry(struct dentry *parent, const char *name);
+struct data_file *incfs_open_data_file(struct mount_info *mi, struct file *bf);
+void incfs_free_data_file(struct data_file *df);
+
+struct dir_file *incfs_open_dir_file(struct mount_info *mi, struct file *bf);
+void incfs_free_dir_file(struct dir_file *dir);
+
+struct incfs_read_data_file_timeouts {
+ u32 min_time_us;
+ u32 min_pending_time_us;
+ u32 max_pending_time_us;
+};
+
+ssize_t incfs_read_data_file_block(struct mem_range dst, struct file *f,
+ int index, struct mem_range tmp,
+ struct incfs_read_data_file_timeouts *timeouts,
+ unsigned int *delayed_min_us);
+
+ssize_t incfs_read_merkle_tree_blocks(struct mem_range dst,
+ struct data_file *df, size_t offset);
+
+int incfs_get_filled_blocks(struct data_file *df,
+ struct incfs_file_data *fd,
+ struct incfs_get_filled_blocks_args *arg);
+
+int incfs_read_file_signature(struct data_file *df, struct mem_range dst);
+
+int incfs_process_new_data_block(struct data_file *df,
+ struct incfs_fill_block *block, u8 *data,
+ bool *complete);
+
+int incfs_process_new_hash_block(struct data_file *df,
+ struct incfs_fill_block *block, u8 *data);
+
+bool incfs_fresh_pending_reads_exist(struct mount_info *mi, int last_number);
+
+/*
+ * Collects pending reads and saves them into the array (reads/reads_size).
+ * Only reads with serial_number > sn_lowerbound are reported.
+ * Returns how many reads were saved into the array.
+ */
+int incfs_collect_pending_reads(struct mount_info *mi, int sn_lowerbound,
+ struct incfs_pending_read_info *reads,
+ struct incfs_pending_read_info2 *reads2,
+ int reads_size, int *new_max_sn);
+
+int incfs_collect_logged_reads(struct mount_info *mi,
+ struct read_log_state *start_state,
+ struct incfs_pending_read_info *reads,
+ struct incfs_pending_read_info2 *reads2,
+ int reads_size);
+struct read_log_state incfs_get_log_state(struct mount_info *mi);
+int incfs_get_uncollected_logs_count(struct mount_info *mi,
+ const struct read_log_state *state);
+
+/*
+ * Map a VFS inode to its containing incfs inode_info.
+ * Returns NULL for a NULL inode or for an inode that does not belong to
+ * an incfs mount (detected via the superblock magic number).
+ */
+static inline struct inode_info *get_incfs_node(struct inode *inode)
+{
+	if (!inode)
+		return NULL;
+
+	if (inode->i_sb->s_magic != INCFS_MAGIC_NUMBER) {
+		/* This inode doesn't belong to us. */
+		pr_warn_once("incfs: %s on an alien inode.", __func__);
+		return NULL;
+	}
+
+	return container_of(inode, struct inode_info, n_vfs_inode);
+}
+
+/*
+ * Return the data_file behind an opened incfs regular file, or NULL if
+ * f is NULL, not a regular file, or not an incfs inode.
+ */
+static inline struct data_file *get_incfs_data_file(struct file *f)
+{
+	struct inode_info *info;
+
+	if (!f || !S_ISREG(f->f_inode->i_mode))
+		return NULL;
+
+	info = get_incfs_node(f->f_inode);
+	return info ? info->n_file : NULL;
+}
+
+/*
+ * Return the dir_file stored in an opened incfs directory's private_data,
+ * or NULL if f is NULL or not a directory.
+ */
+static inline struct dir_file *get_incfs_dir_file(struct file *f)
+{
+	if (f && S_ISDIR(f->f_inode->i_mode))
+		return (struct dir_file *)f->private_data;
+
+	return NULL;
+}
+
+/*
+ * Make sure that inode_info.n_file is initialized and inode can be used
+ * for reading and writing data from/to the backing file.
+ */
+int make_inode_ready_for_data_ops(struct mount_info *mi,
+ struct inode *inode,
+ struct file *backing_file);
+
+/* Return the dentry_info stored in d_fsdata, or NULL for a NULL dentry. */
+static inline struct dentry_info *get_incfs_dentry(const struct dentry *d)
+{
+	return d ? (struct dentry_info *)d->d_fsdata : NULL;
+}
+
+/*
+ * Copy the dentry's backing path into *path, taking a reference on it.
+ * If the dentry has no incfs private data, *path is zeroed and no
+ * reference is taken.
+ */
+static inline void get_incfs_backing_path(const struct dentry *d,
+					  struct path *path)
+{
+	struct dentry_info *info = get_incfs_dentry(d);
+
+	if (info) {
+		*path = info->backing_path;
+		path_get(path);
+	} else {
+		*path = (struct path) {};
+	}
+}
+
+/* Number of INCFS_DATA_FILE_BLOCK_SIZE blocks needed to hold size bytes. */
+static inline int get_blocks_count_for_size(u64 size)
+{
+	return size ? 1 + (size - 1) / INCFS_DATA_FILE_BLOCK_SIZE : 0;
+}
+
+#endif /* _INCFS_DATA_MGMT_H */
diff --git a/fs/incfs/format.c b/fs/incfs/format.c
new file mode 100644
--- /dev/null
+++ b/fs/incfs/format.c
@@ -0,0 +1,752 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2018 Google LLC
+ */
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/types.h>
+#include <linux/mutex.h>
+#include <linux/mm.h>
+#include <linux/falloc.h>
+#include <linux/slab.h>
+#include <linux/crc32.h>
+#include <linux/kernel.h>
+
+#include "format.h"
+#include "data_mgmt.h"
+
+/*
+ * Allocate a backing file context for the given backing file, pinning a
+ * reference to the file and borrowing the mount owner's credentials for
+ * later I/O. Returns ERR_PTR(-ENOMEM) on allocation failure.
+ */
+struct backing_file_context *incfs_alloc_bfc(struct mount_info *mi,
+					     struct file *backing_file)
+{
+	struct backing_file_context *bfc;
+
+	bfc = kzalloc(sizeof(*bfc), GFP_NOFS);
+	if (!bfc)
+		return ERR_PTR(-ENOMEM);
+
+	bfc->bc_file = get_file(backing_file);
+	bfc->bc_cred = mi->mi_owner;
+	mutex_init(&bfc->bc_mutex);
+	return bfc;
+}
+
+/* Release a backing file context: drop the file reference and free it. */
+void incfs_free_bfc(struct backing_file_context *bfc)
+{
+	if (!bfc)
+		return;
+
+	if (bfc->bc_file)
+		fput(bfc->bc_file);
+	mutex_destroy(&bfc->bc_mutex);
+	kfree(bfc);
+}
+
+/* Current end offset of the backing file, read from its inode size. */
+static loff_t incfs_get_end_offset(struct file *f)
+{
+	/*
+	 * This function assumes that file size and the end-offset
+	 * are the same. This is not always true.
+	 */
+	return i_size_read(file_inode(f));
+}
+
+/*
+ * Truncate the tail of the file to the given length.
+ * Used to rollback partially successful multistep writes.
+ *
+ * Must be called with bfc->bc_mutex held. Only shrinking is allowed:
+ * returns -EINVAL if new_end is past the current end, 0 if the size is
+ * already new_end.
+ */
+static int truncate_backing_file(struct backing_file_context *bfc,
+				loff_t new_end)
+{
+	struct inode *inode = NULL;
+	struct dentry *dentry = NULL;
+	loff_t old_end = 0;
+	struct iattr attr;
+	int result = 0;
+
+	if (!bfc)
+		return -EFAULT;
+
+	LOCK_REQUIRED(bfc->bc_mutex);
+
+	if (!bfc->bc_file)
+		return -EFAULT;
+
+	old_end = incfs_get_end_offset(bfc->bc_file);
+	if (old_end == new_end)
+		return 0;
+	if (old_end < new_end)
+		return -EINVAL;
+
+	inode = bfc->bc_file->f_inode;
+	dentry = bfc->bc_file->f_path.dentry;
+
+	attr.ia_size = new_end;
+	attr.ia_valid = ATTR_SIZE;
+
+	/* notify_change() requires the inode lock to be held. */
+	inode_lock(inode);
+	result = notify_change(&nop_mnt_idmap, dentry, &attr, NULL);
+	inode_unlock(inode);
+
+	return result;
+}
+
+/*
+ * Write exactly count bytes of buf at offset pos in the backing file.
+ * Returns 0 on success, a negative errno on failure, -EIO on short write.
+ */
+static int write_to_bf(struct backing_file_context *bfc, const void *buf,
+			size_t count, loff_t pos)
+{
+	ssize_t written = incfs_kwrite(bfc, buf, count, pos);
+
+	if (written < 0)
+		return written;
+	return written == count ? 0 : -EIO;
+}
+
+/*
+ * Fallback zero-fill used when the backing filesystem does not support
+ * fallocate: write len zero bytes starting at file_size, in chunks.
+ */
+static int append_zeros_no_fallocate(struct backing_file_context *bfc,
+				     size_t file_size, size_t len)
+{
+	u8 zeros[256] = {};
+	size_t done = 0;
+
+	while (done < len) {
+		size_t chunk = len - done;
+		int err;
+
+		if (chunk > sizeof(zeros))
+			chunk = sizeof(zeros);
+
+		err = write_to_bf(bfc, zeros, chunk, file_size + done);
+		if (err)
+			return err;
+
+		done += chunk;
+	}
+
+	return 0;
+}
+
+/*
+ * Append a given number of zero bytes to the end of the backing file.
+ * Must be called with bfc->bc_mutex held. Tries fallocate first and
+ * falls back to explicit zero writes if the filesystem doesn't support it.
+ */
+static int append_zeros(struct backing_file_context *bfc, size_t len)
+{
+	loff_t file_size = 0;
+	loff_t new_last_byte_offset = 0;
+	int result;
+
+	if (!bfc)
+		return -EFAULT;
+
+	if (len == 0)
+		return 0;
+
+	LOCK_REQUIRED(bfc->bc_mutex);
+
+	/*
+	 * Allocate only one byte at the new desired end of the file.
+	 * It will increase file size and create a zeroed area of
+	 * a given size.
+	 */
+	file_size = incfs_get_end_offset(bfc->bc_file);
+	new_last_byte_offset = file_size + len - 1;
+	result = vfs_fallocate(bfc->bc_file, 0, new_last_byte_offset, 1);
+	if (result != -EOPNOTSUPP)
+		return result;
+
+	/* Filesystem lacks fallocate support; write the zeros by hand. */
+	return append_zeros_no_fallocate(bfc, file_size, len);
+}
+
+/*
+ * Append a given metadata record to the backing file and update a previous
+ * record to add the new record to the metadata list.
+ *
+ * Must be called with bfc->bc_mutex held. On success,
+ * bfc->bc_last_md_record_offset points at the newly appended record.
+ */
+static int append_md_to_backing_file(struct backing_file_context *bfc,
+				     struct incfs_md_header *record)
+{
+	int result = 0;
+	loff_t record_offset;
+	loff_t file_pos;
+	__le64 new_md_offset;
+	size_t record_size;
+
+	if (!bfc || !record)
+		return -EFAULT;
+
+	if (bfc->bc_last_md_record_offset < 0)
+		return -EINVAL;
+
+	LOCK_REQUIRED(bfc->bc_mutex);
+
+	record_size = le16_to_cpu(record->h_record_size);
+	file_pos = incfs_get_end_offset(bfc->bc_file);
+	record->h_next_md_offset = 0;
+
+	/* Write the metadata record to the end of the backing file */
+	record_offset = file_pos;
+	new_md_offset = cpu_to_le64(record_offset);
+	result = write_to_bf(bfc, record, record_size, file_pos);
+	if (result)
+		return result;
+
+	/* Update next metadata offset in a previous record or a superblock. */
+	if (bfc->bc_last_md_record_offset) {
+		/*
+		 * Find a place in the previous md record where new record's
+		 * offset needs to be saved.
+		 */
+		file_pos = bfc->bc_last_md_record_offset +
+			offsetof(struct incfs_md_header, h_next_md_offset);
+	} else {
+		/*
+		 * No metadata yet, find the place to update in the
+		 * file_header.
+		 */
+		file_pos = offsetof(struct incfs_file_header,
+				    fh_first_md_offset);
+	}
+	result = write_to_bf(bfc, &new_md_offset, sizeof(new_md_offset),
+			     file_pos);
+	if (result)
+		return result;
+
+	bfc->bc_last_md_record_offset = record_offset;
+	return result;
+}
+
+/*
+ * Reserve 0-filled space for the blockmap body, and append
+ * incfs_blockmap metadata record pointing to it.
+ *
+ * Must be called with bfc->bc_mutex held. On failure the backing file
+ * is truncated back to its original length.
+ */
+int incfs_write_blockmap_to_backing_file(struct backing_file_context *bfc,
+					 u32 block_count)
+{
+	struct incfs_blockmap blockmap = {};
+	int result = 0;
+	loff_t file_end = 0;
+	size_t map_size = block_count * sizeof(struct incfs_blockmap_entry);
+
+	if (!bfc)
+		return -EFAULT;
+
+	blockmap.m_header.h_md_entry_type = INCFS_MD_BLOCK_MAP;
+	blockmap.m_header.h_record_size = cpu_to_le16(sizeof(blockmap));
+	blockmap.m_header.h_next_md_offset = cpu_to_le64(0);
+	blockmap.m_block_count = cpu_to_le32(block_count);
+
+	LOCK_REQUIRED(bfc->bc_mutex);
+
+	/* Reserve 0-filled space for the blockmap body in the backing file. */
+	file_end = incfs_get_end_offset(bfc->bc_file);
+	result = append_zeros(bfc, map_size);
+	if (result)
+		return result;
+
+	/* Write blockmap metadata record pointing to the body written above. */
+	blockmap.m_base_offset = cpu_to_le64(file_end);
+	result = append_md_to_backing_file(bfc, &blockmap.m_header);
+	if (result)
+		/* Error, rollback file changes */
+		truncate_backing_file(bfc, file_end);
+
+	return result;
+}
+
+/*
+ * Append the file signature blob and a 0-filled hash tree area to the
+ * backing file, followed by an incfs_file_signature metadata record that
+ * points at both.
+ *
+ * Must be called with bfc->bc_mutex held. On success, *tree_offset and
+ * *sig_offset (each optional) receive the on-file positions of the hash
+ * tree area and the signature blob. On failure the backing file is
+ * truncated back to its original length.
+ */
+int incfs_write_signature_to_backing_file(struct backing_file_context *bfc,
+					struct mem_range sig, u32 tree_size,
+					loff_t *tree_offset, loff_t *sig_offset)
+{
+	struct incfs_file_signature sg = {};
+	int result = 0;
+	loff_t rollback_pos = 0;
+	loff_t tree_area_pos = 0;
+	size_t alignment = 0;
+
+	if (!bfc)
+		return -EFAULT;
+
+	LOCK_REQUIRED(bfc->bc_mutex);
+
+	rollback_pos = incfs_get_end_offset(bfc->bc_file);
+
+	sg.sg_header.h_md_entry_type = INCFS_MD_SIGNATURE;
+	sg.sg_header.h_record_size = cpu_to_le16(sizeof(sg));
+	sg.sg_header.h_next_md_offset = cpu_to_le64(0);
+	if (sig.data != NULL && sig.len > 0) {
+		sg.sg_sig_size = cpu_to_le32(sig.len);
+		sg.sg_sig_offset = cpu_to_le64(rollback_pos);
+
+		result = write_to_bf(bfc, sig.data, sig.len, rollback_pos);
+		if (result)
+			goto err;
+	}
+
+	tree_area_pos = incfs_get_end_offset(bfc->bc_file);
+	if (tree_size > 0) {
+		if (tree_size > 5 * INCFS_DATA_FILE_BLOCK_SIZE) {
+			/*
+			 * If hash tree is big enough, it makes sense to
+			 * align in the backing file for faster access.
+			 */
+			loff_t offset = round_up(tree_area_pos, PAGE_SIZE);
+
+			alignment = offset - tree_area_pos;
+			tree_area_pos = offset;
+		}
+
+		/*
+		 * If root hash is not the only hash in the tree,
+		 * reserve 0-filled space for the tree.
+		 */
+		result = append_zeros(bfc, tree_size + alignment);
+		if (result)
+			goto err;
+
+		sg.sg_hash_tree_size = cpu_to_le32(tree_size);
+		sg.sg_hash_tree_offset = cpu_to_le64(tree_area_pos);
+	}
+
+	/* Write a hash tree metadata record pointing to the hash tree above. */
+	result = append_md_to_backing_file(bfc, &sg.sg_header);
+err:
+	if (result)
+		/* Error, rollback file changes */
+		truncate_backing_file(bfc, rollback_pos);
+	else {
+		if (tree_offset)
+			*tree_offset = tree_area_pos;
+		if (sig_offset)
+			*sig_offset = rollback_pos;
+	}
+
+	return result;
+}
+
+/*
+ * Append a brand-new status metadata record carrying the written-block
+ * counters. Used when the backing file has no status record yet.
+ * Must be called with bfc->bc_mutex held; rolls the file back on failure.
+ */
+static int write_new_status_to_backing_file(struct backing_file_context *bfc,
+				       u32 data_blocks_written,
+				       u32 hash_blocks_written)
+{
+	int result;
+	loff_t rollback_pos;
+	struct incfs_status is = {
+		.is_header = {
+			.h_md_entry_type = INCFS_MD_STATUS,
+			.h_record_size = cpu_to_le16(sizeof(is)),
+		},
+		.is_data_blocks_written = cpu_to_le32(data_blocks_written),
+		.is_hash_blocks_written = cpu_to_le32(hash_blocks_written),
+	};
+
+	LOCK_REQUIRED(bfc->bc_mutex);
+	rollback_pos = incfs_get_end_offset(bfc->bc_file);
+	result = append_md_to_backing_file(bfc, &is.is_header);
+	if (result)
+		truncate_backing_file(bfc, rollback_pos);
+
+	return result;
+}
+
+/*
+ * Persist the written-block counters. If status_offset is 0 (no status
+ * record exists yet), a new record is appended; otherwise the existing
+ * record at status_offset is updated in place via read-modify-write.
+ */
+int incfs_write_status_to_backing_file(struct backing_file_context *bfc,
+				       loff_t status_offset,
+				       u32 data_blocks_written,
+				       u32 hash_blocks_written)
+{
+	struct incfs_status is;
+	int result;
+
+	if (!bfc)
+		return -EFAULT;
+
+	if (status_offset == 0)
+		return write_new_status_to_backing_file(bfc,
+				data_blocks_written, hash_blocks_written);
+
+	result = incfs_kread(bfc, &is, sizeof(is), status_offset);
+	if (result != sizeof(is))
+		return -EIO;
+
+	is.is_data_blocks_written = cpu_to_le32(data_blocks_written);
+	is.is_hash_blocks_written = cpu_to_le32(hash_blocks_written);
+	result = incfs_kwrite(bfc, &is, sizeof(is), status_offset);
+	if (result != sizeof(is))
+		return -EIO;
+
+	return 0;
+}
+
+/*
+ * Append the fs-verity signature blob and an incfs_file_verity_signature
+ * metadata record pointing at it. On success, *offset receives the blob's
+ * position in the backing file; on failure the file is rolled back.
+ *
+ * Must be called with bfc->bc_mutex held, like the other metadata writers.
+ */
+int incfs_write_verity_signature_to_backing_file(
+		struct backing_file_context *bfc, struct mem_range signature,
+		loff_t *offset)
+{
+	struct incfs_file_verity_signature vs = {};
+	int result;
+	loff_t pos;
+
+	if (!bfc)
+		return -EFAULT;
+
+	/* No verity signature section is equivalent to an empty section */
+	if (signature.data == NULL || signature.len == 0)
+		return 0;
+
+	LOCK_REQUIRED(bfc->bc_mutex);
+
+	pos = incfs_get_end_offset(bfc->bc_file);
+
+	vs = (struct incfs_file_verity_signature) {
+		.vs_header = (struct incfs_md_header) {
+			.h_md_entry_type = INCFS_MD_VERITY_SIGNATURE,
+			.h_record_size = cpu_to_le16(sizeof(vs)),
+			.h_next_md_offset = cpu_to_le64(0),
+		},
+		.vs_size = cpu_to_le32(signature.len),
+		.vs_offset = cpu_to_le64(pos),
+	};
+
+	result = write_to_bf(bfc, signature.data, signature.len, pos);
+	if (result)
+		goto err;
+
+	result = append_md_to_backing_file(bfc, &vs.vs_header);
+	if (result)
+		goto err;
+
+	*offset = pos;
+err:
+	if (result)
+		/* Error, rollback file changes */
+		truncate_backing_file(bfc, pos);
+	return result;
+}
+
+/*
+ * Write a backing file header
+ * It should always be called only on empty file.
+ * fh.fh_first_md_offset is 0 for now, but will be updated
+ * once first metadata record is added.
+ *
+ * Must be called with bfc->bc_mutex held; returns -EEXIST if the backing
+ * file already has content.
+ */
+int incfs_write_fh_to_backing_file(struct backing_file_context *bfc,
+				   incfs_uuid_t *uuid, u64 file_size)
+{
+	struct incfs_file_header fh = {};
+	loff_t file_pos = 0;
+
+	if (!bfc)
+		return -EFAULT;
+
+	fh.fh_magic = cpu_to_le64(INCFS_MAGIC_NUMBER);
+	fh.fh_version = cpu_to_le64(INCFS_FORMAT_CURRENT_VER);
+	fh.fh_header_size = cpu_to_le16(sizeof(fh));
+	fh.fh_first_md_offset = cpu_to_le64(0);
+	fh.fh_data_block_size = cpu_to_le16(INCFS_DATA_FILE_BLOCK_SIZE);
+
+	fh.fh_file_size = cpu_to_le64(file_size);
+	fh.fh_uuid = *uuid;
+
+	LOCK_REQUIRED(bfc->bc_mutex);
+
+	file_pos = incfs_get_end_offset(bfc->bc_file);
+	if (file_pos != 0)
+		return -EEXIST;
+
+	return write_to_bf(bfc, &fh, sizeof(fh), file_pos);
+}
+
+/*
+ * Write a backing file header for a mapping file
+ * It should always be called only on empty file.
+ *
+ * Must be called with bfc->bc_mutex held; returns -EEXIST if the backing
+ * file already has content. The header records the original file's uuid,
+ * the mapped size, and the offset into the original file, and sets the
+ * INCFS_FILE_MAPPED flag.
+ */
+int incfs_write_mapping_fh_to_backing_file(struct backing_file_context *bfc,
+				incfs_uuid_t *uuid, u64 file_size, u64 offset)
+{
+	struct incfs_file_header fh = {};
+	loff_t file_pos = 0;
+
+	if (!bfc)
+		return -EFAULT;
+
+	fh.fh_magic = cpu_to_le64(INCFS_MAGIC_NUMBER);
+	fh.fh_version = cpu_to_le64(INCFS_FORMAT_CURRENT_VER);
+	fh.fh_header_size = cpu_to_le16(sizeof(fh));
+	fh.fh_original_offset = cpu_to_le64(offset);
+	fh.fh_data_block_size = cpu_to_le16(INCFS_DATA_FILE_BLOCK_SIZE);
+
+	fh.fh_mapped_file_size = cpu_to_le64(file_size);
+	fh.fh_original_uuid = *uuid;
+	fh.fh_flags = cpu_to_le32(INCFS_FILE_MAPPED);
+
+	LOCK_REQUIRED(bfc->bc_mutex);
+
+	file_pos = incfs_get_end_offset(bfc->bc_file);
+	if (file_pos != 0)
+		return -EEXIST;
+
+	return write_to_bf(bfc, &fh, sizeof(fh), file_pos);
+}
+
+/*
+ * Write a given data block and update file's blockmap to point it.
+ *
+ * Must be called with bfc->bc_mutex held. block.len must fit in the u16
+ * me_data_size blockmap field, hence the < 2^16 limit. The blockmap area
+ * (at bm_base_off) must already have been reserved.
+ */
+int incfs_write_data_block_to_backing_file(struct backing_file_context *bfc,
+				     struct mem_range block, int block_index,
+				     loff_t bm_base_off, u16 flags)
+{
+	struct incfs_blockmap_entry bm_entry = {};
+	int result = 0;
+	loff_t data_offset = 0;
+	loff_t bm_entry_off =
+		bm_base_off + sizeof(struct incfs_blockmap_entry) * block_index;
+
+	if (!bfc)
+		return -EFAULT;
+
+	if (block.len >= (1 << 16) || block_index < 0)
+		return -EINVAL;
+
+	LOCK_REQUIRED(bfc->bc_mutex);
+
+	data_offset = incfs_get_end_offset(bfc->bc_file);
+	if (data_offset <= bm_entry_off) {
+		/* Blockmap entry is beyond the file's end. It is not normal. */
+		return -EINVAL;
+	}
+
+	/* Write the block data at the end of the backing file. */
+	result = write_to_bf(bfc, block.data, block.len, data_offset);
+	if (result)
+		return result;
+
+	/* Update the blockmap to point to the newly written data. */
+	bm_entry.me_data_offset_lo = cpu_to_le32((u32)data_offset);
+	bm_entry.me_data_offset_hi = cpu_to_le16((u16)(data_offset >> 32));
+	bm_entry.me_data_size = cpu_to_le16((u16)block.len);
+	bm_entry.me_flags = cpu_to_le16(flags);
+
+	return write_to_bf(bfc, &bm_entry, sizeof(bm_entry),
+				bm_entry_off);
+}
+
+/*
+ * Write a hash block into the pre-reserved hash area and update the
+ * file's blockmap to point at it.
+ *
+ * Must be called with bfc->bc_mutex held. Hash blocks live at fixed
+ * block-sized offsets from hash_area_off; their blockmap entries follow
+ * the data-block entries (hence the get_blocks_count_for_size() shift).
+ */
+int incfs_write_hash_block_to_backing_file(struct backing_file_context *bfc,
+					   struct mem_range block,
+					   int block_index,
+					   loff_t hash_area_off,
+					   loff_t bm_base_off,
+					   loff_t file_size)
+{
+	struct incfs_blockmap_entry bm_entry = {};
+	int result;
+	loff_t data_offset = 0;
+	loff_t file_end = 0;
+	loff_t bm_entry_off =
+		bm_base_off +
+		sizeof(struct incfs_blockmap_entry) *
+			(block_index + get_blocks_count_for_size(file_size));
+
+	if (!bfc)
+		return -EFAULT;
+
+	LOCK_REQUIRED(bfc->bc_mutex);
+
+	data_offset = hash_area_off + block_index * INCFS_DATA_FILE_BLOCK_SIZE;
+	file_end = incfs_get_end_offset(bfc->bc_file);
+	if (data_offset + block.len > file_end) {
+		/* Block is located beyond the file's end. It is not normal. */
+		return -EINVAL;
+	}
+
+	result = write_to_bf(bfc, block.data, block.len, data_offset);
+	if (result)
+		return result;
+
+	bm_entry.me_data_offset_lo = cpu_to_le32((u32)data_offset);
+	bm_entry.me_data_offset_hi = cpu_to_le16((u16)(data_offset >> 32));
+	bm_entry.me_data_size = cpu_to_le16(INCFS_DATA_FILE_BLOCK_SIZE);
+
+	return write_to_bf(bfc, &bm_entry, sizeof(bm_entry), bm_entry_off);
+}
+
+/*
+ * Read a single blockmap entry for block_index into *bm_entry.
+ * Returns 0 on success, -EIO if the entry could not be read, or a
+ * negative errno propagated from the bulk reader.
+ */
+int incfs_read_blockmap_entry(struct backing_file_context *bfc, int block_index,
+			      loff_t bm_base_off,
+			      struct incfs_blockmap_entry *bm_entry)
+{
+	int read = incfs_read_blockmap_entries(bfc, bm_entry, block_index, 1,
+					       bm_base_off);
+
+	if (read < 0)
+		return read;
+
+	switch (read) {
+	case 1:
+		return 0;
+	case 0:
+		return -EIO;
+	default:
+		return -EFAULT;
+	}
+}
+
+/*
+ * Read blocks_number consecutive blockmap entries starting at start_index
+ * into the entries array. Returns the number of whole entries read, or a
+ * negative errno on failure.
+ */
+int incfs_read_blockmap_entries(struct backing_file_context *bfc,
+		struct incfs_blockmap_entry *entries,
+		int start_index, int blocks_number,
+		loff_t bm_base_off)
+{
+	loff_t bm_entry_off =
+		bm_base_off + sizeof(struct incfs_blockmap_entry) * start_index;
+	const size_t bytes_to_read = sizeof(struct incfs_blockmap_entry)
+					* blocks_number;
+	int result = 0;
+
+	if (!bfc || !entries)
+		return -EFAULT;
+
+	if (start_index < 0 || bm_base_off <= 0)
+		return -ENODATA;
+
+	result = incfs_kread(bfc, entries, bytes_to_read, bm_entry_off);
+	if (result < 0)
+		return result;
+	return result / sizeof(*entries);
+}
+
+/*
+ * Read and validate the backing file header.
+ *
+ * Checks magic, format version, block size and header size, returning
+ * -EBADMSG for a short read and -EILSEQ for a malformed header. On
+ * success, *first_md_off receives the first metadata record offset;
+ * uuid, file_size and flags are optional out-parameters.
+ */
+int incfs_read_file_header(struct backing_file_context *bfc,
+			   loff_t *first_md_off, incfs_uuid_t *uuid,
+			   u64 *file_size, u32 *flags)
+{
+	ssize_t bytes_read = 0;
+	struct incfs_file_header fh = {};
+
+	if (!bfc || !first_md_off)
+		return -EFAULT;
+
+	bytes_read = incfs_kread(bfc, &fh, sizeof(fh), 0);
+	if (bytes_read < 0)
+		return bytes_read;
+
+	if (bytes_read < sizeof(fh))
+		return -EBADMSG;
+
+	if (le64_to_cpu(fh.fh_magic) != INCFS_MAGIC_NUMBER)
+		return -EILSEQ;
+
+	if (le64_to_cpu(fh.fh_version) > INCFS_FORMAT_CURRENT_VER)
+		return -EILSEQ;
+
+	if (le16_to_cpu(fh.fh_data_block_size) != INCFS_DATA_FILE_BLOCK_SIZE)
+		return -EILSEQ;
+
+	if (le16_to_cpu(fh.fh_header_size) != sizeof(fh))
+		return -EILSEQ;
+
+	/* first_md_off was already validated non-NULL above. */
+	*first_md_off = le64_to_cpu(fh.fh_first_md_offset);
+	if (uuid)
+		*uuid = fh.fh_uuid;
+	if (file_size)
+		*file_size = le64_to_cpu(fh.fh_file_size);
+	if (flags)
+		*flags = le32_to_cpu(fh.fh_flags);
+	return 0;
+}
+
+/*
+ * Read through metadata records from the backing file one by one
+ * and call provided metadata handlers.
+ *
+ * Reads the record at handler->md_record_offset, validates its size and
+ * forward-only chaining, dispatches it to the matching handler callback,
+ * and on success advances the handler's offsets to the next record.
+ * A zero next-record offset marks the end of the metadata list.
+ */
+int incfs_read_next_metadata_record(struct backing_file_context *bfc,
+			      struct metadata_handler *handler)
+{
+	const ssize_t max_md_size = INCFS_MAX_METADATA_RECORD_SIZE;
+	ssize_t bytes_read = 0;
+	size_t md_record_size = 0;
+	loff_t next_record = 0;
+	int res = 0;
+	struct incfs_md_header *md_hdr = NULL;
+
+	if (!bfc || !handler)
+		return -EFAULT;
+
+	if (handler->md_record_offset == 0)
+		return -EPERM;
+
+	memset(&handler->md_buffer, 0, max_md_size);
+	bytes_read = incfs_kread(bfc, &handler->md_buffer, max_md_size,
+				 handler->md_record_offset);
+	if (bytes_read < 0)
+		return bytes_read;
+	if (bytes_read < sizeof(*md_hdr))
+		return -EBADMSG;
+
+	md_hdr = &handler->md_buffer.md_header;
+	next_record = le64_to_cpu(md_hdr->h_next_md_offset);
+	md_record_size = le16_to_cpu(md_hdr->h_record_size);
+
+	if (md_record_size > max_md_size) {
+		pr_warn("incfs: The record is too large. Size: %zu",
+				md_record_size);
+		return -EBADMSG;
+	}
+
+	if (bytes_read < md_record_size) {
+		pr_warn("incfs: The record hasn't been fully read.");
+		return -EBADMSG;
+	}
+
+	/* The list is append-only; a backwards link would mean corruption. */
+	if (next_record <= handler->md_record_offset && next_record != 0) {
+		pr_warn("incfs: Next record (%lld) points back in file.",
+			next_record);
+		return -EBADMSG;
+	}
+
+	switch (md_hdr->h_md_entry_type) {
+	case INCFS_MD_NONE:
+		break;
+	case INCFS_MD_BLOCK_MAP:
+		if (handler->handle_blockmap)
+			res = handler->handle_blockmap(
+				&handler->md_buffer.blockmap, handler);
+		break;
+	case INCFS_MD_FILE_ATTR:
+		/*
+		 * File attrs no longer supported, ignore section for
+		 * compatibility
+		 */
+		break;
+	case INCFS_MD_SIGNATURE:
+		if (handler->handle_signature)
+			res = handler->handle_signature(
+				&handler->md_buffer.signature, handler);
+		break;
+	case INCFS_MD_STATUS:
+		if (handler->handle_status)
+			res = handler->handle_status(
+				&handler->md_buffer.status, handler);
+		break;
+	case INCFS_MD_VERITY_SIGNATURE:
+		if (handler->handle_verity_signature)
+			res = handler->handle_verity_signature(
+				&handler->md_buffer.verity_signature, handler);
+		break;
+	default:
+		res = -ENOTSUPP;
+		break;
+	}
+
+	if (!res) {
+		if (next_record == 0) {
+			/*
+			 * Zero offset for the next record means that the last
+			 * metadata record has just been processed.
+			 */
+			bfc->bc_last_md_record_offset =
+				handler->md_record_offset;
+		}
+		handler->md_prev_record_offset = handler->md_record_offset;
+		handler->md_record_offset = next_record;
+	}
+	return res;
+}
+
+/*
+ * kernel_read() from the backing file under the mount owner's credentials.
+ * Returns the number of bytes read or a negative errno.
+ *
+ * Use ssize_t for the intermediate result: kernel_read() returns ssize_t
+ * and storing it in an int would truncate counts >= 2GiB before the
+ * value is returned as ssize_t.
+ */
+ssize_t incfs_kread(struct backing_file_context *bfc, void *buf, size_t size,
+		    loff_t pos)
+{
+	const struct cred *old_cred = override_creds(bfc->bc_cred);
+	ssize_t ret = kernel_read(bfc->bc_file, buf, size, &pos);
+
+	revert_creds(old_cred);
+	return ret;
+}
+
+/*
+ * kernel_write() to the backing file under the mount owner's credentials.
+ * Returns the number of bytes written or a negative errno.
+ *
+ * Use ssize_t for the intermediate result: kernel_write() returns ssize_t
+ * and storing it in an int would truncate counts >= 2GiB before the
+ * value is returned as ssize_t.
+ */
+ssize_t incfs_kwrite(struct backing_file_context *bfc, const void *buf,
+		     size_t size, loff_t pos)
+{
+	const struct cred *old_cred = override_creds(bfc->bc_cred);
+	ssize_t ret = kernel_write(bfc->bc_file, buf, size, &pos);
+
+	revert_creds(old_cred);
+	return ret;
+}
diff --git a/fs/incfs/format.h b/fs/incfs/format.h
new file mode 100644
--- /dev/null
+++ b/fs/incfs/format.h
@@ -0,0 +1,408 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2018 Google LLC
+ */
+
+/*
+ * Overview
+ * --------
+ * The backbone of the incremental-fs ondisk format is an append only linked
+ * list of metadata blocks. Each metadata block contains an offset of the next
+ * one. These blocks describe files and directories on the
+ * file system. They also represent actions of adding and removing file names
+ * (hard links).
+ *
+ * Every time incremental-fs instance is mounted, it reads through this list
+ * to recreate filesystem's state in memory. An offset of the first record in
+ * the metadata list is stored in the superblock at the beginning of the backing
+ * file.
+ *
+ * Most of the backing file is taken by data areas and blockmaps.
+ * Since data blocks can be compressed and have different sizes,
+ * single per-file data area can't be pre-allocated. That's why blockmaps are
+ * needed in order to find a location and size of each data block in
+ * the backing file. Each time a file is created, a corresponding block map is
+ * allocated to store future offsets of data blocks.
+ *
+ * Whenever a data block is given by data loader to incremental-fs:
+ * - A data area with the given block is appended to the end of
+ * the backing file.
+ * - A record in the blockmap for the given block index is updated to reflect
+ * its location, size, and compression algorithm.
+ *
+ * Metadata records
+ * ----------------
+ * incfs_blockmap - metadata record that specifies size and location
+ * of a blockmap area for a given file. This area
+ * contains an array of incfs_blockmap_entry-s.
+ * incfs_file_signature - metadata record that specifies where file signature
+ * and its hash tree can be found in the backing file.
+ *
+ * incfs_file_attr - metadata record that specifies where additional file
+ * attributes blob can be found.
+ *
+ * Metadata header
+ * ---------------
+ * incfs_md_header - header of a metadata record. It's always a part
+ * of other structures and serves the purpose of metadata
+ * bookkeeping.
+ *
+ * +-----------------------------------------------+ ^
+ * | incfs_md_header | |
+ * | 1. type of body(BLOCKMAP, FILE_ATTR..) | |
+ * | 2. size of the whole record header + body | |
+ * | 3. CRC the whole record header + body | |
+ * | 4. offset of the previous md record |]------+
+ * | 5. offset of the next md record (md link) |]---+
+ * +-----------------------------------------------+ |
+ * | Metadata record body with useful data | |
+ * +-----------------------------------------------+ |
+ * +--->
+ *
+ * Other ondisk structures
+ * -----------------------
+ * incfs_super_block - backing file header
+ * incfs_blockmap_entry - a record in a blockmap area that describes size
+ * and location of a data block.
+ * Data blocks don't have any particular structure; they are written to the
+ * backing file in a raw form as they come from a data loader.
+ *
+ * Backing file layout
+ * -------------------
+ *
+ *
+ * +-------------------------------------------+
+ * | incfs_file_header |]---+
+ * +-------------------------------------------+ |
+ * | metadata |<---+
+ * | incfs_file_signature |]---+
+ * +-------------------------------------------+ |
+ * ......................... |
+ * +-------------------------------------------+ | metadata
+ * +------->| blockmap area | | list links
+ * | | [incfs_blockmap_entry] | |
+ * | | [incfs_blockmap_entry] | |
+ * | | [incfs_blockmap_entry] | |
+ * | +--[| [incfs_blockmap_entry] | |
+ * | | | [incfs_blockmap_entry] | |
+ * | | | [incfs_blockmap_entry] | |
+ * | | +-------------------------------------------+ |
+ * | | ......................... |
+ * | | +-------------------------------------------+ |
+ * | | | metadata |<---+
+ * +----|--[| incfs_blockmap |]---+
+ * | +-------------------------------------------+ |
+ * | ......................... |
+ * | +-------------------------------------------+ |
+ * +-->| data block | |
+ * +-------------------------------------------+ |
+ * ......................... |
+ * +-------------------------------------------+ |
+ * | metadata |<---+
+ * | incfs_file_attr |
+ * +-------------------------------------------+
+ */
+#ifndef _INCFS_FORMAT_H
+#define _INCFS_FORMAT_H
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <uapi/linux/incrementalfs.h>
+
+#include "internal.h"
+
+#define INCFS_MAX_NAME_LEN 255
+#define INCFS_FORMAT_V1 1
+#define INCFS_FORMAT_CURRENT_VER INCFS_FORMAT_V1
+
+enum incfs_metadata_type {
+ INCFS_MD_NONE = 0,
+ INCFS_MD_BLOCK_MAP = 1,
+ INCFS_MD_FILE_ATTR = 2,
+ INCFS_MD_SIGNATURE = 3,
+ INCFS_MD_STATUS = 4,
+ INCFS_MD_VERITY_SIGNATURE = 5,
+};
+
+enum incfs_file_header_flags {
+ INCFS_FILE_MAPPED = 1 << 1,
+};
+
+/* Header included at the beginning of all metadata records on the disk. */
+struct incfs_md_header {
+ __u8 h_md_entry_type;
+
+ /*
+ * Size of the metadata record.
+ * (e.g. inode, dir entry etc) not just this struct.
+ */
+ __le16 h_record_size;
+
+ /*
+ * Was: CRC32 of the metadata record.
+ * (e.g. inode, dir entry etc) not just this struct.
+ */
+ __le32 h_unused1;
+
+ /* Offset of the next metadata entry if any */
+ __le64 h_next_md_offset;
+
+ /* Was: Offset of the previous metadata entry if any */
+ __le64 h_unused2;
+
+} __packed;
+
+/* Backing file header */
+struct incfs_file_header {
+ /* Magic number: INCFS_MAGIC_NUMBER */
+ __le64 fh_magic;
+
+ /* Format version: INCFS_FORMAT_CURRENT_VER */
+ __le64 fh_version;
+
+ /* sizeof(incfs_file_header) */
+ __le16 fh_header_size;
+
+ /* INCFS_DATA_FILE_BLOCK_SIZE */
+ __le16 fh_data_block_size;
+
+ /* File flags, from incfs_file_header_flags */
+ __le32 fh_flags;
+
+ union {
+ /* Standard incfs file */
+ struct {
+ /* Offset of the first metadata record */
+ __le64 fh_first_md_offset;
+
+ /* Full size of the file's content */
+ __le64 fh_file_size;
+
+ /* File uuid */
+ incfs_uuid_t fh_uuid;
+ };
+
+ /* Mapped file - INCFS_FILE_MAPPED set in fh_flags */
+ struct {
+ /* Offset in original file */
+ __le64 fh_original_offset;
+
+ /* Full size of the file's content */
+ __le64 fh_mapped_file_size;
+
+ /* Original file's uuid */
+ incfs_uuid_t fh_original_uuid;
+ };
+ };
+} __packed;
+
+enum incfs_block_map_entry_flags {
+ INCFS_BLOCK_COMPRESSED_LZ4 = 1,
+ INCFS_BLOCK_COMPRESSED_ZSTD = 2,
+
+ /* Reserve 3 bits for compression alg */
+ INCFS_BLOCK_COMPRESSED_MASK = 7,
+};
+
+/* Block map entry pointing to an actual location of the data block. */
+struct incfs_blockmap_entry {
+ /* Offset of the actual data block. Lower 32 bits */
+ __le32 me_data_offset_lo;
+
+ /* Offset of the actual data block. Higher 16 bits */
+ __le16 me_data_offset_hi;
+
+ /* How many bytes the data actually occupies in the backing file */
+ __le16 me_data_size;
+
+ /* Block flags from incfs_block_map_entry_flags */
+ __le16 me_flags;
+} __packed;
+
+/* Metadata record for locations of file blocks. Type = INCFS_MD_BLOCK_MAP */
+struct incfs_blockmap {
+ struct incfs_md_header m_header;
+
+ /* Base offset of the array of incfs_blockmap_entry */
+ __le64 m_base_offset;
+
+ /* Size of the map entry array in blocks */
+ __le32 m_block_count;
+} __packed;
+
+/*
+ * Metadata record for file signature. Type = INCFS_MD_SIGNATURE
+ *
+ * The signature stored here is the APK V4 signature data blob. See the
+ * definition of incfs_new_file_args::signature_info for an explanation of this
+ * blob. Specifically, it contains the root hash, but it does *not* contain
+ * anything that the kernel treats as a signature.
+ *
+ * When FS_IOC_ENABLE_VERITY is called on a file without this record, an APK V4
+ * signature blob and a hash tree are added to the file, and then this metadata
+ * record is created to record their locations.
+ */
+struct incfs_file_signature {
+ struct incfs_md_header sg_header;
+
+ __le32 sg_sig_size; /* The size of the signature. */
+
+ __le64 sg_sig_offset; /* Signature's offset in the backing file */
+
+ __le32 sg_hash_tree_size; /* The size of the hash tree. */
+
+ __le64 sg_hash_tree_offset; /* Hash tree offset in the backing file */
+} __packed;
+
+/* In memory version of above */
+struct incfs_df_signature {
+ u32 sig_size;
+ u64 sig_offset;
+ u32 hash_size;
+ u64 hash_offset;
+};
+
+struct incfs_status {
+ struct incfs_md_header is_header;
+
+ __le32 is_data_blocks_written; /* Number of data blocks written */
+
+ __le32 is_hash_blocks_written; /* Number of hash blocks written */
+
+ __le32 is_dummy[6]; /* Spare fields */
+} __packed;
+
+/*
+ * Metadata record for verity signature. Type = INCFS_MD_VERITY_SIGNATURE
+ *
+ * This record will only exist for verity-enabled files with signatures. Verity
+ * enabled files without signatures do not have this record.
+ *
+ * This is obsolete, as incfs no longer checks this type of signature.
+ */
+struct incfs_file_verity_signature {
+ struct incfs_md_header vs_header;
+
+ /* The size of the signature */
+ __le32 vs_size;
+
+ /* Signature's offset in the backing file */
+ __le64 vs_offset;
+} __packed;
+
+/* In memory version of above */
+struct incfs_df_verity_signature {
+ u32 size;
+ u64 offset;
+};
+
+/* State of the backing file. */
+struct backing_file_context {
+ /* Protects writes to bc_file */
+ struct mutex bc_mutex;
+
+ /* File object to read data from */
+ struct file *bc_file;
+
+ /*
+ * Offset of the last known metadata record in the backing file.
+ * 0 means there are no metadata records.
+ */
+ loff_t bc_last_md_record_offset;
+
+ /*
+ * Credentials to set before reads/writes
+ * Note that this is a pointer to the mount_info mi_owner field so
+ * there is no need to get/put the creds
+ */
+ const struct cred *bc_cred;
+};
+
+struct metadata_handler {
+ loff_t md_record_offset;
+ loff_t md_prev_record_offset;
+ void *context;
+
+ union {
+ struct incfs_md_header md_header;
+ struct incfs_blockmap blockmap;
+ struct incfs_file_signature signature;
+ struct incfs_status status;
+ struct incfs_file_verity_signature verity_signature;
+ } md_buffer;
+
+ int (*handle_blockmap)(struct incfs_blockmap *bm,
+ struct metadata_handler *handler);
+ int (*handle_signature)(struct incfs_file_signature *sig,
+ struct metadata_handler *handler);
+ int (*handle_status)(struct incfs_status *sig,
+ struct metadata_handler *handler);
+ int (*handle_verity_signature)(struct incfs_file_verity_signature *s,
+ struct metadata_handler *handler);
+};
+#define INCFS_MAX_METADATA_RECORD_SIZE \
+ sizeof_field(struct metadata_handler, md_buffer)
+
+/* Backing file context management */
+struct mount_info;
+struct backing_file_context *incfs_alloc_bfc(struct mount_info *mi,
+ struct file *backing_file);
+
+void incfs_free_bfc(struct backing_file_context *bfc);
+
+/* Writing stuff */
+int incfs_write_blockmap_to_backing_file(struct backing_file_context *bfc,
+ u32 block_count);
+
+int incfs_write_fh_to_backing_file(struct backing_file_context *bfc,
+ incfs_uuid_t *uuid, u64 file_size);
+
+int incfs_write_mapping_fh_to_backing_file(struct backing_file_context *bfc,
+ incfs_uuid_t *uuid, u64 file_size, u64 offset);
+
+int incfs_write_data_block_to_backing_file(struct backing_file_context *bfc,
+ struct mem_range block,
+ int block_index, loff_t bm_base_off,
+ u16 flags);
+
+int incfs_write_hash_block_to_backing_file(struct backing_file_context *bfc,
+ struct mem_range block,
+ int block_index,
+ loff_t hash_area_off,
+ loff_t bm_base_off,
+ loff_t file_size);
+
+int incfs_write_signature_to_backing_file(struct backing_file_context *bfc,
+ struct mem_range sig, u32 tree_size,
+ loff_t *tree_offset, loff_t *sig_offset);
+
+int incfs_write_status_to_backing_file(struct backing_file_context *bfc,
+ loff_t status_offset,
+ u32 data_blocks_written,
+ u32 hash_blocks_written);
+int incfs_write_verity_signature_to_backing_file(
+ struct backing_file_context *bfc, struct mem_range signature,
+ loff_t *offset);
+
+/* Reading stuff */
+int incfs_read_file_header(struct backing_file_context *bfc,
+ loff_t *first_md_off, incfs_uuid_t *uuid,
+ u64 *file_size, u32 *flags);
+
+int incfs_read_blockmap_entry(struct backing_file_context *bfc, int block_index,
+ loff_t bm_base_off,
+ struct incfs_blockmap_entry *bm_entry);
+
+int incfs_read_blockmap_entries(struct backing_file_context *bfc,
+ struct incfs_blockmap_entry *entries,
+ int start_index, int blocks_number,
+ loff_t bm_base_off);
+
+int incfs_read_next_metadata_record(struct backing_file_context *bfc,
+ struct metadata_handler *handler);
+
+ssize_t incfs_kread(struct backing_file_context *bfc, void *buf, size_t size,
+ loff_t pos);
+ssize_t incfs_kwrite(struct backing_file_context *bfc, const void *buf,
+ size_t size, loff_t pos);
+
+#endif /* _INCFS_FORMAT_H */
diff --git a/fs/incfs/integrity.c b/fs/incfs/integrity.c
new file mode 100644
--- /dev/null
+++ b/fs/incfs/integrity.c
@@ -0,0 +1,235 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2019 Google LLC
+ */
+#include <crypto/sha2.h>
+#include <crypto/hash.h>
+#include <linux/err.h>
+#include <linux/version.h>
+
+#include "integrity.h"
+
+/*
+ * Return the hash-algorithm descriptor for @id, lazily allocating its
+ * crypto_shash transform on first use.  The lazy init is lock-free: a
+ * racing loser frees its own transform.  Only SHA-256 is supported;
+ * anything else returns ERR_PTR(-ENOENT).
+ */
+struct incfs_hash_alg *incfs_get_hash_alg(enum incfs_hash_tree_algorithm id)
+{
+	static struct incfs_hash_alg sha256 = {
+		.name = "sha256",
+		.digest_size = SHA256_DIGEST_SIZE,
+		.id = INCFS_HASH_TREE_SHA256
+	};
+	struct incfs_hash_alg *result = NULL;
+	struct crypto_shash *shash;
+
+	if (id == INCFS_HASH_TREE_SHA256) {
+		BUILD_BUG_ON(INCFS_MAX_HASH_SIZE < SHA256_DIGEST_SIZE);
+		result = &sha256;
+	}
+
+	if (result == NULL)
+		return ERR_PTR(-ENOENT);
+
+	/* pairs with cmpxchg_release() below */
+	shash = smp_load_acquire(&result->shash);
+	if (shash)
+		return result;
+
+	shash = crypto_alloc_shash(result->name, 0, 0);
+	if (IS_ERR(shash)) {
+		int err = PTR_ERR(shash);
+
+		/* "incfs: " prefix + "\n" for consistency with other logs */
+		pr_err("incfs: Can't allocate hash alg %s, error code:%d\n",
+			result->name, err);
+		return ERR_PTR(err);
+	}
+
+	/* pairs with smp_load_acquire() above; if we lost the race, another
+	 * thread's transform is installed and ours is freed.
+	 */
+	if (cmpxchg_release(&result->shash, NULL, shash) != NULL)
+		crypto_free_shash(shash);
+
+	return result;
+}
+
+/*
+ * Parsed, in-memory view of the on-disk signature blob (filled in by
+ * incfs_parse_signature()).  salt/root_hash point into the original blob.
+ */
+struct signature_info {
+	u32 version;
+	enum incfs_hash_tree_algorithm hash_algorithm;
+	u8 log2_blocksize;
+	struct mem_range salt;
+	struct mem_range root_hash;
+};
+
+/*
+ * Consume a little-endian u32 from the parse cursor *@p, bounded by @top.
+ * Returns false (cursor unchanged) if fewer than 4 bytes remain.
+ */
+static bool read_u32(u8 **p, u8 *top, u32 *result)
+{
+	/*
+	 * Check the remaining length by subtraction: the previous form
+	 * "*p + sizeof(u32) > top" can wrap around the address space when
+	 * *p is near its end, defeating the bound check.
+	 */
+	if (top - *p < (ptrdiff_t)sizeof(u32))
+		return false;
+
+	*result = le32_to_cpu(*(__le32 *)*p);
+	*p += sizeof(u32);
+	return true;
+}
+
+/*
+ * Consume one byte from the parse cursor *@p, bounded by @top.
+ * Returns false (cursor unchanged) if no bytes remain.
+ */
+static bool read_u8(u8 **p, u8 *top, u8 *result)
+{
+	/* Equivalent to "*p + 1 > top" but cannot wrap at the address end. */
+	if (*p >= top)
+		return false;
+
+	*result = *(u8 *)*p;
+	*p += sizeof(u8);
+	return true;
+}
+
+/*
+ * Consume a length-prefixed byte range (u32 length, then payload) from the
+ * parse cursor *@p, bounded by @top.  @range ends up pointing into the
+ * source buffer; no copy is made.
+ */
+static bool read_mem_range(u8 **p, u8 *top, struct mem_range *range)
+{
+	u32 len;
+
+	/*
+	 * @len comes from untrusted input; the previous check
+	 * "*p + len > top" can wrap (notably on 32-bit), bypassing the
+	 * bound.  Compare against the bytes actually remaining instead;
+	 * after a successful read_u32(), *p <= top so top - *p >= 0.
+	 */
+	if (!read_u32(p, top, &len) || len > (size_t)(top - *p))
+		return false;
+
+	range->len = len;
+	range->data = *p;
+	*p += len;
+	return true;
+}
+
+/*
+ * Parse and validate a signature blob into @si.
+ *
+ * Wire layout (little-endian):
+ *   u32 version            -- must equal INCFS_SIGNATURE_VERSION
+ *   u32 hash_section_size  -- the rest of the blob, checked to fit
+ *   then, within the hash section:
+ *     u32 hash_algorithm   -- only INCFS_HASH_TREE_SHA256 accepted
+ *     u8  log2_blocksize   -- only 12 (4096-byte blocks) accepted
+ *     u32-length-prefixed salt
+ *     u32-length-prefixed root hash
+ *
+ * si->salt / si->root_hash point into @signature, so the caller must keep
+ * that buffer alive while @si is in use.  Returns 0 or -EINVAL.
+ */
+static int incfs_parse_signature(struct mem_range signature,
+				 struct signature_info *si)
+{
+	u8 *p = signature.data;
+	u8 *top = signature.data + signature.len;
+	u32 hash_section_size;
+
+	if (signature.len > INCFS_MAX_SIGNATURE_SIZE)
+		return -EINVAL;
+
+	if (!read_u32(&p, top, &si->version) ||
+	    si->version != INCFS_SIGNATURE_VERSION)
+		return -EINVAL;
+
+	/* NOTE(review): passes &si->hash_algorithm (an enum) as u32 *;
+	 * assumes the enum is 32-bit here -- confirm on all targets.
+	 */
+	if (!read_u32(&p, top, &hash_section_size) ||
+	    p + hash_section_size > top)
+		return -EINVAL;
+	top = p + hash_section_size;
+
+	if (!read_u32(&p, top, &si->hash_algorithm) ||
+	    si->hash_algorithm != INCFS_HASH_TREE_SHA256)
+		return -EINVAL;
+
+	if (!read_u8(&p, top, &si->log2_blocksize) || si->log2_blocksize != 12)
+		return -EINVAL;
+
+	if (!read_mem_range(&p, top, &si->salt))
+		return -EINVAL;
+
+	if (!read_mem_range(&p, top, &si->root_hash))
+		return -EINVAL;
+
+	/* No trailing bytes allowed: the hash section must be consumed. */
+	if (p != top)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Build the Merkle-tree geometry for a file of @data_block_count blocks
+ * from its signature blob: number of levels, per-level offsets within the
+ * hash area, total hash area size, and the root hash.
+ *
+ * Returns the allocated mtree (caller frees with incfs_free_mtree()) or
+ * an ERR_PTR: -EINVAL for bad input, -E2BIG if the tree would exceed
+ * INCFS_MAX_MTREE_LEVELS or INCFS_MAX_HASH_AREA_SIZE.
+ */
+struct mtree *incfs_alloc_mtree(struct mem_range signature,
+				int data_block_count)
+{
+	int error;
+	struct signature_info si;
+	struct mtree *result = NULL;
+	struct incfs_hash_alg *hash_alg = NULL;
+	int hash_per_block;
+	int lvl;
+	int total_blocks = 0;
+	int blocks_in_level[INCFS_MAX_MTREE_LEVELS];
+	int blocks = data_block_count;
+
+	if (data_block_count <= 0)
+		return ERR_PTR(-EINVAL);
+
+	error = incfs_parse_signature(signature, &si);
+	if (error)
+		return ERR_PTR(error);
+
+	hash_alg = incfs_get_hash_alg(si.hash_algorithm);
+	if (IS_ERR(hash_alg))
+		return ERR_CAST(hash_alg);
+
+	if (si.root_hash.len < hash_alg->digest_size)
+		return ERR_PTR(-EINVAL);
+
+	result = kzalloc(sizeof(*result), GFP_NOFS);
+	if (!result)
+		return ERR_PTR(-ENOMEM);
+
+	result->alg = hash_alg;
+	hash_per_block = INCFS_DATA_FILE_BLOCK_SIZE / result->alg->digest_size;
+
+	/* Calculating tree geometry. */
+	/* First pass: calculate how many blocks in each tree level. */
+	for (lvl = 0; blocks > 1; lvl++) {
+		if (lvl >= INCFS_MAX_MTREE_LEVELS) {
+			pr_err("incfs: too much data in mtree\n");
+			goto err;
+		}
+
+		blocks = (blocks + hash_per_block - 1) / hash_per_block;
+		blocks_in_level[lvl] = blocks;
+		total_blocks += blocks;
+	}
+	result->depth = lvl;
+	result->hash_tree_area_size = total_blocks * INCFS_DATA_FILE_BLOCK_SIZE;
+	if (result->hash_tree_area_size > INCFS_MAX_HASH_AREA_SIZE)
+		goto err;
+
+	blocks = 0;
+	/* Second pass: calculate offset of each level. 0th level goes last. */
+	for (lvl = 0; lvl < result->depth; lvl++) {
+		u32 suboffset;
+
+		blocks += blocks_in_level[lvl];
+		suboffset = (total_blocks - blocks)
+					* INCFS_DATA_FILE_BLOCK_SIZE;
+
+		result->hash_level_suboffset[lvl] = suboffset;
+	}
+
+	/* Root hash is stored separately from the rest of the tree. */
+	memcpy(result->root_hash, si.root_hash.data, hash_alg->digest_size);
+	return result;
+
+err:
+	kfree(result);
+	return ERR_PTR(-E2BIG);
+}
+
+/* Free a tree from incfs_alloc_mtree(); kfree(NULL) is a no-op. */
+void incfs_free_mtree(struct mtree *tree)
+{
+	kfree(tree);
+}
+
+/*
+ * Compute the digest of @data into @digest using @alg.  Blocks shorter
+ * than INCFS_DATA_FILE_BLOCK_SIZE are hashed as if zero-padded to a full
+ * block.  Returns 0 or a negative error.
+ */
+int incfs_calc_digest(struct incfs_hash_alg *alg, struct mem_range data,
+			struct mem_range digest)
+{
+	/*
+	 * Validate arguments BEFORE declaring the descriptor:
+	 * SHASH_DESC_ON_STACK(desc, alg->shash) evaluates
+	 * crypto_shash_descsize(alg->shash) at the point of declaration,
+	 * which dereferenced a possibly-NULL alg/shash ahead of these
+	 * checks in the previous version.
+	 */
+	if (!alg || !alg->shash || !data.data || !digest.data)
+		return -EFAULT;
+
+	if (alg->digest_size > digest.len)
+		return -EINVAL;
+
+	{
+		SHASH_DESC_ON_STACK(desc, alg->shash);
+
+		desc->tfm = alg->shash;
+
+		if (data.len < INCFS_DATA_FILE_BLOCK_SIZE) {
+			int err;
+			void *buf = kzalloc(INCFS_DATA_FILE_BLOCK_SIZE,
+					    GFP_NOFS);
+
+			if (!buf)
+				return -ENOMEM;
+
+			/* Zero-pad the short block, then hash a full block */
+			memcpy(buf, data.data, data.len);
+			err = crypto_shash_digest(desc, buf,
+						  INCFS_DATA_FILE_BLOCK_SIZE,
+						  digest.data);
+			kfree(buf);
+			return err;
+		}
+		return crypto_shash_digest(desc, data.data, data.len,
+					   digest.data);
+	}
+}
+
diff --git a/fs/incfs/integrity.h b/fs/incfs/integrity.h
new file mode 100644
--- /dev/null
+++ b/fs/incfs/integrity.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2019 Google LLC
+ */
+#ifndef _INCFS_INTEGRITY_H
+#define _INCFS_INTEGRITY_H
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <crypto/hash.h>
+
+#include <uapi/linux/incrementalfs.h>
+
+#include "internal.h"
+
+#define INCFS_MAX_MTREE_LEVELS 8
+#define INCFS_MAX_HASH_AREA_SIZE (1280 * 1024 * 1024)
+
+struct incfs_hash_alg {
+ const char *name;
+ int digest_size;
+ enum incfs_hash_tree_algorithm id;
+
+ struct crypto_shash *shash;
+};
+
+/* Merkle tree structure. */
+struct mtree {
+ struct incfs_hash_alg *alg;
+
+ u8 root_hash[INCFS_MAX_HASH_SIZE];
+
+ /* Offset of each hash level in the hash area. */
+ u32 hash_level_suboffset[INCFS_MAX_MTREE_LEVELS];
+
+ u32 hash_tree_area_size;
+
+ /* Number of levels in hash_level_suboffset */
+ int depth;
+};
+
+struct incfs_hash_alg *incfs_get_hash_alg(enum incfs_hash_tree_algorithm id);
+
+struct mtree *incfs_alloc_mtree(struct mem_range signature,
+ int data_block_count);
+
+void incfs_free_mtree(struct mtree *tree);
+
+size_t incfs_get_mtree_depth(enum incfs_hash_tree_algorithm alg, loff_t size);
+
+size_t incfs_get_mtree_hash_count(enum incfs_hash_tree_algorithm alg,
+ loff_t size);
+
+int incfs_calc_digest(struct incfs_hash_alg *alg, struct mem_range data,
+ struct mem_range digest);
+
+#endif /* _INCFS_INTEGRITY_H */
diff --git a/fs/incfs/internal.h b/fs/incfs/internal.h
new file mode 100644
--- /dev/null
+++ b/fs/incfs/internal.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2018 Google LLC
+ */
+#ifndef _INCFS_INTERNAL_H
+#define _INCFS_INTERNAL_H
+#include <linux/types.h>
+
+/* A (pointer, length) pair describing a span of bytes in memory. */
+struct mem_range {
+	u8 *data;
+	size_t len;
+};
+
+/* Convenience constructor for struct mem_range. */
+static inline struct mem_range range(u8 *data, size_t len)
+{
+	return (struct mem_range){ .data = data, .len = len };
+}
+
+/*
+ * Non-fatal assertion that @lock is held.  Note: mutex_is_locked() only
+ * reports that *someone* holds the mutex, not that the caller does.
+ */
+#define LOCK_REQUIRED(lock) WARN_ON_ONCE(!mutex_is_locked(&lock))
+
+/* incfs reports a corrupted backing file as EUCLEAN. */
+#define EFSCORRUPTED EUCLEAN
+
+#endif /* _INCFS_INTERNAL_H */
diff --git a/fs/incfs/main.c b/fs/incfs/main.c
new file mode 100644
--- /dev/null
+++ b/fs/incfs/main.c
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2018 Google LLC
+ */
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/module.h>
+
+#include <uapi/linux/incrementalfs.h>
+
+#include "sysfs.h"
+#include "vfs.h"
+
+static struct file_system_type incfs_fs_type = {
+ .owner = THIS_MODULE,
+ .name = INCFS_NAME,
+ .mount = incfs_mount_fs,
+ .kill_sb = incfs_kill_sb,
+ .fs_flags = 0
+};
+
+/* Module init: bring up sysfs first, then register the filesystem type. */
+static int __init init_incfs_module(void)
+{
+	int err;
+
+	err = incfs_init_sysfs();
+	if (err)
+		goto out;
+
+	err = register_filesystem(&incfs_fs_type);
+	if (err)
+		goto out_cleanup_sysfs;
+
+	return 0;
+
+out_cleanup_sysfs:
+	/* Undo sysfs setup if filesystem registration failed. */
+	incfs_cleanup_sysfs();
+out:
+	return err;
+}
+
+static void __exit cleanup_incfs_module(void)
+{
+	/* Tear down in reverse order of init_incfs_module(): drop the
+	 * filesystem type first, then remove its sysfs entries.
+	 */
+	unregister_filesystem(&incfs_fs_type);
+	incfs_cleanup_sysfs();
+}
+
+module_init(init_incfs_module);
+module_exit(cleanup_incfs_module);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Eugene Zemtsov <ezemtsov@google.com>");
+MODULE_DESCRIPTION("Incremental File System");
diff --git a/fs/incfs/pseudo_files.c b/fs/incfs/pseudo_files.c
new file mode 100644
--- /dev/null
+++ b/fs/incfs/pseudo_files.c
@@ -0,0 +1,1394 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2020 Google LLC
+ */
+
+#include <linux/fdtable.h>
+#include <linux/file.h>
+#include <linux/filelock.h>
+#include <linux/fs.h>
+#include <linux/fsnotify.h>
+#include <linux/namei.h>
+#include <linux/poll.h>
+#include <linux/string.h>
+#include <linux/syscalls.h>
+
+#include <uapi/linux/incrementalfs.h>
+
+#include "pseudo_files.h"
+
+#include "data_mgmt.h"
+#include "format.h"
+#include "integrity.h"
+#include "vfs.h"
+
+#define READ_WRITE_FILE_MODE 0666
+
+static bool is_pseudo_filename(struct mem_range name);
+
+/*******************************************************************************
+ * .pending_reads pseudo file definition
+ ******************************************************************************/
+#define INCFS_PENDING_READS_INODE 2
+static const char pending_reads_file_name[] = INCFS_PENDING_READS_FILENAME;
+
+/* State of an open .pending_reads file, unique for each file descriptor. */
+struct pending_reads_state {
+ /* A serial number of the last pending read obtained from this file. */
+ int last_pending_read_sn;
+};
+
+/*
+ * read() for .pending_reads: copy out records for reads that have a
+ * serial number newer than this descriptor's cursor
+ * (last_pending_read_sn), then advance the cursor.  Record format is
+ * incfs_pending_read_info, or incfs_pending_read_info2 when the mount
+ * uses report_uid.  Returns bytes copied, 0 if nothing fresh, or a
+ * negative error.
+ */
+static ssize_t pending_reads_read(struct file *f, char __user *buf, size_t len,
+			    loff_t *ppos)
+{
+	struct pending_reads_state *pr_state = f->private_data;
+	struct mount_info *mi = get_mount_info(file_superblock(f));
+	bool report_uid;
+	unsigned long page = 0;
+	struct incfs_pending_read_info *reads_buf = NULL;
+	struct incfs_pending_read_info2 *reads_buf2 = NULL;
+	size_t record_size;
+	size_t reads_to_collect;
+	int last_known_read_sn = READ_ONCE(pr_state->last_pending_read_sn);
+	int new_max_sn = last_known_read_sn;
+	int reads_collected = 0;
+	ssize_t result = 0;
+
+	if (!mi)
+		return -EFAULT;
+
+	/* Record size depends on whether this mount reports UIDs. */
+	report_uid = mi->mi_options.report_uid;
+	record_size = report_uid ? sizeof(*reads_buf2) : sizeof(*reads_buf);
+	reads_to_collect = len / record_size;
+
+	if (!incfs_fresh_pending_reads_exist(mi, last_known_read_sn))
+		return 0;
+
+	/* Stage records in one zeroed page, then copy to userspace. */
+	page = get_zeroed_page(GFP_NOFS);
+	if (!page)
+		return -ENOMEM;
+
+	if (report_uid)
+		reads_buf2 = (struct incfs_pending_read_info2 *) page;
+	else
+		reads_buf = (struct incfs_pending_read_info *) page;
+
+	reads_to_collect =
+		min_t(size_t, PAGE_SIZE / record_size, reads_to_collect);
+
+	reads_collected = incfs_collect_pending_reads(mi, last_known_read_sn,
+				reads_buf, reads_buf2, reads_to_collect,
+				&new_max_sn);
+
+	if (reads_collected < 0) {
+		result = reads_collected;
+		goto out;
+	}
+
+	/*
+	 * Just to make sure that we don't accidentally copy more data
+	 * to reads buffer than userspace can handle.
+	 */
+	reads_collected = min_t(size_t, reads_collected, reads_to_collect);
+	result = reads_collected * record_size;
+
+	/* Copy reads info to the userspace buffer */
+	if (copy_to_user(buf, (void *)page, result)) {
+		result = -EFAULT;
+		goto out;
+	}
+
+	/* Only advance the cursor once the copy-out has succeeded. */
+	WRITE_ONCE(pr_state->last_pending_read_sn, new_max_sn);
+	*ppos = 0;
+
+out:
+	free_page(page);
+	return result;
+}
+
+/*
+ * poll() for .pending_reads: readable whenever reads newer than this
+ * descriptor's cursor exist.
+ */
+static __poll_t pending_reads_poll(struct file *file, poll_table *wait)
+{
+	struct pending_reads_state *state = file->private_data;
+	struct mount_info *mi = get_mount_info(file_superblock(file));
+
+	poll_wait(file, &mi->mi_pending_reads_notif_wq, wait);
+
+	if (!incfs_fresh_pending_reads_exist(mi, state->last_pending_read_sn))
+		return 0;
+
+	return EPOLLIN | EPOLLRDNORM;
+}
+
+/*
+ * open() for .pending_reads: each descriptor gets its own zeroed cursor
+ * state (freed in pending_reads_release()).
+ */
+static int pending_reads_open(struct inode *inode, struct file *file)
+{
+	struct pending_reads_state *state =
+		kzalloc(sizeof(*state), GFP_NOFS);
+
+	if (!state)
+		return -ENOMEM;
+
+	file->private_data = state;
+	return 0;
+}
+
+/* Free the per-descriptor state allocated in pending_reads_open(). */
+static int pending_reads_release(struct inode *inode, struct file *file)
+{
+	kfree(file->private_data);
+	return 0;
+}
+
+/*
+ * INCFS_IOC_PERMIT_FILL: grant the file referred to by the user-supplied
+ * descriptor permission to be filled.  The target must be an incfs file
+ * (checked via f_op) on the same superblock as @f.
+ */
+static long ioctl_permit_fill(struct file *f, void __user *arg)
+{
+	struct incfs_permit_fill __user *usr_permit_fill = arg;
+	struct incfs_permit_fill permit_fill;
+	long error = 0;
+	struct file *file = NULL;
+	struct incfs_file_data *fd;
+
+	if (copy_from_user(&permit_fill, usr_permit_fill, sizeof(permit_fill)))
+		return -EFAULT;
+
+	/*
+	 * NOTE(review): fget() returns NULL on a bad fd, never an ERR_PTR,
+	 * so the PTR_ERR branch below looks unreachable -- confirm.
+	 */
+	file = fget(permit_fill.file_descriptor);
+	if (IS_ERR_OR_NULL(file)) {
+		if (!file)
+			return -ENOENT;
+
+		return PTR_ERR(file);
+	}
+
+	if (file->f_op != &incfs_file_ops) {
+		error = -EPERM;
+		goto out;
+	}
+
+	if (file->f_inode->i_sb != f->f_inode->i_sb) {
+		error = -EPERM;
+		goto out;
+	}
+
+	fd = file->private_data;
+
+	switch (fd->fd_fill_permission) {
+	case CANT_FILL:
+		fd->fd_fill_permission = CAN_FILL;
+		break;
+
+	case CAN_FILL:
+		/* Granting twice is harmless; just note it. */
+		pr_debug("CAN_FILL already set");
+		break;
+
+	default:
+		pr_warn("Invalid file private data");
+		error = -EFAULT;
+		goto out;
+	}
+
+out:
+	fput(file);
+	return error;
+}
+
+/*
+ * Change mode bits on @dentry's inode via notify_change(), retrying after
+ * breaking a file delegation if one is outstanding (the same pattern as
+ * the VFS chmod path).
+ */
+static int chmod(struct dentry *dentry, umode_t mode)
+{
+	struct inode *inode = dentry->d_inode;
+	struct inode *delegated_inode = NULL;
+	struct iattr newattrs;
+	int error;
+
+retry_deleg:
+	inode_lock(inode);
+	/* Keep non-permission bits (file type etc.) from the inode. */
+	newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
+	newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
+	error = notify_change(&nop_mnt_idmap, dentry, &newattrs, &delegated_inode);
+	inode_unlock(inode);
+	if (delegated_inode) {
+		/* Wait for the delegation to be broken, then try again. */
+		error = break_deleg_wait(&delegated_inode);
+		if (!error)
+			goto retry_deleg;
+	}
+	return error;
+}
+
+/* Byte-wise equality of two memory ranges (lengths must match). */
+static bool incfs_equal_ranges(struct mem_range lhs, struct mem_range rhs)
+{
+	return lhs.len == rhs.len &&
+	       memcmp(lhs.data, rhs.data, lhs.len) == 0;
+}
+
+/*
+ * Validate a user-supplied file name: length-limited, not one of the
+ * reserved pseudo-file names, and a single path component (no '/').
+ * Returns 0, -ENAMETOOLONG or -EINVAL.
+ */
+static int validate_name(char *file_name)
+{
+	struct mem_range name = range(file_name, strlen(file_name));
+
+	if (name.len > INCFS_MAX_NAME_LEN)
+		return -ENAMETOOLONG;
+
+	if (is_pseudo_filename(name))
+		return -EINVAL;
+
+	/* A name must be a single path component. */
+	if (strchr(file_name, '/'))
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Resolve @relative_path against @base_path (or the mount's backing dir
+ * when @base_path is NULL) into @result_path.
+ *
+ * A temporary fd is installed for the base directory purely so that
+ * user_path_at_empty() can resolve relative to it; it is closed before
+ * returning.  When @relative_path is NULL the base dir itself is
+ * returned (with an extra path reference).  Caller path_put()s
+ * @result_path on success.
+ */
+static int dir_relative_path_resolve(
+			struct mount_info *mi,
+			const char __user *relative_path,
+			struct path *result_path,
+			struct path *base_path)
+{
+	int dir_fd = get_unused_fd_flags(0);
+	struct file *dir_f = NULL;
+	int error = 0;
+
+	if (!base_path)
+		base_path = &mi->mi_backing_dir_path;
+
+	if (dir_fd < 0)
+		return dir_fd;
+
+	dir_f = dentry_open(base_path, O_RDONLY | O_NOATIME, current_cred());
+
+	if (IS_ERR(dir_f)) {
+		error = PTR_ERR(dir_f);
+		goto out;
+	}
+	fd_install(dir_fd, dir_f);
+
+	if (!relative_path) {
+		/* No relative path given, just return the base dir. */
+		*result_path = *base_path;
+		path_get(result_path);
+		goto out;
+	}
+
+	error = user_path_at_empty(dir_fd, relative_path,
+		LOOKUP_FOLLOW | LOOKUP_DIRECTORY, result_path, NULL);
+
+out:
+	/* close_fd() also drops dir_f once it has been installed. */
+	close_fd(dir_fd);
+	if (error)
+		pr_debug("Error: %d\n", error);
+	return error;
+}
+
+/*
+ * Copy a user-space signature blob of @size bytes into a fresh kernel
+ * buffer.  Returns a range owned by the caller (kfree data), a
+ * (NULL, 0) range when no blob was supplied, or an ERR_PTR-carrying
+ * range on failure (-EFAULT on oversize/copy error, -ENOMEM on OOM).
+ */
+static struct mem_range incfs_copy_signature_info_from_user(u8 __user *original,
+							    u64 size)
+{
+	u8 *result;
+
+	if (!original)
+		return range(NULL, 0);
+
+	if (size > INCFS_MAX_SIGNATURE_SIZE)
+		return range(ERR_PTR(-EFAULT), 0);
+
+	result = kzalloc(size, GFP_NOFS | __GFP_COMP);
+	if (!result)
+		return range(ERR_PTR(-ENOMEM), 0);
+
+	if (copy_from_user(result, original, size)) {
+		kfree(result);
+		return range(ERR_PTR(-EFAULT), 0);
+	}
+
+	return range(result, size);
+}
+
+/*
+ * Initialize the backing file for a newly created incfs file: write the
+ * file header, optionally the signature (building the hash tree from it
+ * to size the hash area), and the blockmap for data + hash blocks.
+ *
+ * NOTE(review): @attr is not referenced in this body -- presumably kept
+ * for interface compatibility; confirm with callers.
+ */
+static int init_new_file(struct mount_info *mi, struct dentry *dentry,
+			 incfs_uuid_t *uuid, u64 size, struct mem_range attr,
+			 u8 __user *user_signature_info, u64 signature_size)
+{
+	struct path path = {};
+	struct file *new_file;
+	int error = 0;
+	struct backing_file_context *bfc = NULL;
+	u32 block_count;
+	struct mem_range raw_signature = { NULL };
+	struct mtree *hash_tree = NULL;
+
+	if (!mi || !dentry || !uuid)
+		return -EFAULT;
+
+	/* Resize newly created file to its true size. */
+	path = (struct path) {
+		.mnt = mi->mi_backing_dir_path.mnt,
+		.dentry = dentry
+	};
+
+	new_file = dentry_open(&path, O_RDWR | O_NOATIME | O_LARGEFILE,
+			       current_cred());
+
+	if (IS_ERR(new_file)) {
+		error = PTR_ERR(new_file);
+		goto out;
+	}
+
+	/* bfc keeps its own reference to the file; drop ours right away. */
+	bfc = incfs_alloc_bfc(mi, new_file);
+	fput(new_file);
+	if (IS_ERR(bfc)) {
+		error = PTR_ERR(bfc);
+		bfc = NULL;
+		goto out;
+	}
+
+	mutex_lock(&bfc->bc_mutex);
+	error = incfs_write_fh_to_backing_file(bfc, uuid, size);
+	if (error)
+		goto out;
+
+	block_count = (u32)get_blocks_count_for_size(size);
+
+	if (user_signature_info) {
+		raw_signature = incfs_copy_signature_info_from_user(
+			user_signature_info, signature_size);
+
+		if (IS_ERR(raw_signature.data)) {
+			error = PTR_ERR(raw_signature.data);
+			raw_signature.data = NULL;
+			goto out;
+		}
+
+		/* Validates the signature and sizes the hash area. */
+		hash_tree = incfs_alloc_mtree(raw_signature, block_count);
+		if (IS_ERR(hash_tree)) {
+			error = PTR_ERR(hash_tree);
+			hash_tree = NULL;
+			goto out;
+		}
+
+		error = incfs_write_signature_to_backing_file(bfc,
+				raw_signature, hash_tree->hash_tree_area_size,
+				NULL, NULL);
+		if (error)
+			goto out;
+
+		/* The blockmap must cover hash blocks as well as data. */
+		block_count += get_blocks_count_for_size(
+			hash_tree->hash_tree_area_size);
+	}
+
+	if (block_count)
+		error = incfs_write_blockmap_to_backing_file(bfc, block_count);
+
+	if (error)
+		goto out;
+
+out:
+	if (bfc) {
+		mutex_unlock(&bfc->bc_mutex);
+		incfs_free_bfc(bfc);
+	}
+	incfs_free_mtree(hash_tree);
+	kfree(raw_signature.data);
+
+	if (error)
+		pr_debug("incfs: %s error: %d\n", __func__, error);
+	return error;
+}
+
+/*
+ * Emit fsnotify create events for a file just created via the pseudo
+ * files interface: for the named entry in its directory, for its .index
+ * entry when @file_id_str is given, and for its .incomplete entry when
+ * the file still has missing blocks.  Failures are logged, not returned.
+ */
+static void notify_create(struct file *pending_reads_file,
+			  const char __user *dir_name, const char *file_name,
+			  const char *file_id_str, bool incomplete_file)
+{
+	struct mount_info *mi =
+		get_mount_info(file_superblock(pending_reads_file));
+	struct path base_path = {
+		.mnt = pending_reads_file->f_path.mnt,
+		.dentry = pending_reads_file->f_path.dentry->d_parent,
+	};
+	struct path dir_path = {};
+	struct dentry *file = NULL;
+	struct dentry *dir = NULL;
+	int error;
+
+	error = dir_relative_path_resolve(mi, dir_name, &dir_path, &base_path);
+	if (error)
+		goto out;
+
+	file = incfs_lookup_dentry(dir_path.dentry, file_name);
+	if (IS_ERR(file)) {
+		error = PTR_ERR(file);
+		file = NULL;
+		goto out;
+	}
+
+	/* Event for the user-visible name in its parent directory. */
+	fsnotify_create(d_inode(dir_path.dentry), file);
+
+	if (file_id_str) {
+		/* Event for the per-id entry under .index. */
+		dir = incfs_lookup_dentry(base_path.dentry, INCFS_INDEX_NAME);
+		if (IS_ERR(dir)) {
+			error = PTR_ERR(dir);
+			dir = NULL;
+			goto out;
+		}
+
+		dput(file);
+		file = incfs_lookup_dentry(dir, file_id_str);
+		if (IS_ERR(file)) {
+			error = PTR_ERR(file);
+			file = NULL;
+			goto out;
+		}
+
+		fsnotify_create(d_inode(dir), file);
+
+		if (incomplete_file) {
+			/* Event for the per-id entry under .incomplete. */
+			dput(dir);
+			dir = incfs_lookup_dentry(base_path.dentry,
+						  INCFS_INCOMPLETE_NAME);
+			if (IS_ERR(dir)) {
+				error = PTR_ERR(dir);
+				dir = NULL;
+				goto out;
+			}
+
+			dput(file);
+			file = incfs_lookup_dentry(dir, file_id_str);
+			if (IS_ERR(file)) {
+				error = PTR_ERR(file);
+				file = NULL;
+				goto out;
+			}
+
+			fsnotify_create(d_inode(dir), file);
+		}
+	}
+out:
+	if (error)
+		pr_warn("%s failed with error %d\n", __func__, error);
+
+	/* dput()/path_put() are NULL-safe. */
+	dput(dir);
+	dput(file);
+	path_put(&dir_path);
+}
+
+/*
+ * Handler for INCFS_IOC_CREATE_FILE.
+ *
+ * Creates the backing file under .index, stamps it with id/size/attr
+ * xattrs, writes the initial backing-file metadata, then hard-links it to
+ * its requested name and (for non-empty files) into .incomplete.  On any
+ * failure every link made so far is undone before returning.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static long ioctl_create_file(struct file *file,
+ struct incfs_new_file_args __user *usr_args)
+{
+ struct mount_info *mi = get_mount_info(file_superblock(file));
+ struct incfs_new_file_args args;
+ char *file_id_str = NULL;
+ struct dentry *index_file_dentry = NULL;
+ struct dentry *named_file_dentry = NULL;
+ struct dentry *incomplete_file_dentry = NULL;
+ struct path parent_dir_path = {};
+ struct inode *index_dir_inode = NULL;
+ __le64 size_attr_value = 0;
+ char *file_name = NULL;
+ char *attr_value = NULL;
+ int error = 0;
+ bool locked = false;
+ /* Rollback flags: which links must be undone on failure. */
+ bool index_linked = false;
+ bool name_linked = false;
+ bool incomplete_linked = false;
+
+ if (!mi || !mi->mi_index_dir || !mi->mi_incomplete_dir) {
+ error = -EFAULT;
+ goto out;
+ }
+
+ if (copy_from_user(&args, usr_args, sizeof(args)) > 0) {
+ error = -EFAULT;
+ goto out;
+ }
+
+ file_name = strndup_user(u64_to_user_ptr(args.file_name), PATH_MAX);
+ if (IS_ERR(file_name)) {
+ error = PTR_ERR(file_name);
+ file_name = NULL;
+ goto out;
+ }
+
+ error = validate_name(file_name);
+ if (error)
+ goto out;
+
+ file_id_str = file_id_to_str(args.file_id);
+ if (!file_id_str) {
+ error = -ENOMEM;
+ goto out;
+ }
+
+ /* Serialize all directory-structure changes on this mount. */
+ error = mutex_lock_interruptible(&mi->mi_dir_struct_mutex);
+ if (error)
+ goto out;
+ locked = true;
+
+ /* Find a directory to put the file into. */
+ error = dir_relative_path_resolve(mi,
+ u64_to_user_ptr(args.directory_path),
+ &parent_dir_path, NULL);
+ if (error)
+ goto out;
+
+ if (parent_dir_path.dentry == mi->mi_index_dir) {
+ /* Can't create a file directly inside .index */
+ error = -EBUSY;
+ goto out;
+ }
+
+ if (parent_dir_path.dentry == mi->mi_incomplete_dir) {
+ /* Can't create a file directly inside .incomplete */
+ error = -EBUSY;
+ goto out;
+ }
+
+ /* Look up a dentry in the parent dir. It should be negative. */
+ named_file_dentry = incfs_lookup_dentry(parent_dir_path.dentry,
+ file_name);
+ if (!named_file_dentry) {
+ error = -EFAULT;
+ goto out;
+ }
+ if (IS_ERR(named_file_dentry)) {
+ error = PTR_ERR(named_file_dentry);
+ named_file_dentry = NULL;
+ goto out;
+ }
+ if (d_really_is_positive(named_file_dentry)) {
+ /* File with this path already exists. */
+ error = -EEXIST;
+ goto out;
+ }
+
+ /* Look up a dentry in the incomplete dir. It should be negative. */
+ incomplete_file_dentry = incfs_lookup_dentry(mi->mi_incomplete_dir,
+ file_id_str);
+ if (!incomplete_file_dentry) {
+ error = -EFAULT;
+ goto out;
+ }
+ if (IS_ERR(incomplete_file_dentry)) {
+ error = PTR_ERR(incomplete_file_dentry);
+ incomplete_file_dentry = NULL;
+ goto out;
+ }
+ if (d_really_is_positive(incomplete_file_dentry)) {
+ /* File with this path already exists. */
+ error = -EEXIST;
+ goto out;
+ }
+
+ /* Look up a dentry in the .index dir. It should be negative. */
+ index_file_dentry = incfs_lookup_dentry(mi->mi_index_dir, file_id_str);
+ if (!index_file_dentry) {
+ error = -EFAULT;
+ goto out;
+ }
+ if (IS_ERR(index_file_dentry)) {
+ error = PTR_ERR(index_file_dentry);
+ index_file_dentry = NULL;
+ goto out;
+ }
+ if (d_really_is_positive(index_file_dentry)) {
+ /* File with this ID already exists in index. */
+ error = -EEXIST;
+ goto out;
+ }
+
+ /* Creating a file in the .index dir. */
+ index_dir_inode = d_inode(mi->mi_index_dir);
+ inode_lock_nested(index_dir_inode, I_MUTEX_PARENT);
+ /* 0222 keeps the backing file writable so blocks can be filled. */
+ error = vfs_create(&nop_mnt_idmap, index_dir_inode, index_file_dentry,
+ args.mode | 0222, true);
+ inode_unlock(index_dir_inode);
+
+ if (error)
+ goto out;
+ if (!d_really_is_positive(index_file_dentry)) {
+ error = -EINVAL;
+ goto out;
+ }
+
+ /* NOTE(review): presumably re-applied to undo umask masking done by
+ * vfs_create() - confirm.
+ */
+ error = chmod(index_file_dentry, args.mode | 0222);
+ if (error) {
+ pr_debug("incfs: chmod err: %d\n", error);
+ goto out;
+ }
+
+ /* Save the file's ID as an xattr for easy fetching in future. */
+ error = vfs_setxattr(&nop_mnt_idmap, index_file_dentry, INCFS_XATTR_ID_NAME,
+ file_id_str, strlen(file_id_str), XATTR_CREATE);
+ if (error) {
+ pr_debug("incfs: vfs_setxattr err:%d\n", error);
+ goto out;
+ }
+
+ /* Save the file's size as an xattr for easy fetching in future. */
+ size_attr_value = cpu_to_le64(args.size);
+ error = vfs_setxattr(&nop_mnt_idmap, index_file_dentry, INCFS_XATTR_SIZE_NAME,
+ (char *)&size_attr_value, sizeof(size_attr_value),
+ XATTR_CREATE);
+ if (error) {
+ pr_debug("incfs: vfs_setxattr err:%d\n", error);
+ goto out;
+ }
+
+ /* Save the file's attribute as an xattr */
+ if (args.file_attr_len && args.file_attr) {
+ if (args.file_attr_len > INCFS_MAX_FILE_ATTR_SIZE) {
+ error = -E2BIG;
+ goto out;
+ }
+
+ attr_value = kmalloc(args.file_attr_len, GFP_NOFS);
+ if (!attr_value) {
+ error = -ENOMEM;
+ goto out;
+ }
+
+ if (copy_from_user(attr_value,
+ u64_to_user_ptr(args.file_attr),
+ args.file_attr_len) > 0) {
+ error = -EFAULT;
+ goto out;
+ }
+
+ error = vfs_setxattr(&nop_mnt_idmap, index_file_dentry,
+ INCFS_XATTR_METADATA_NAME,
+ attr_value, args.file_attr_len,
+ XATTR_CREATE);
+
+ if (error)
+ goto out;
+ }
+
+ /* Initializing a newly created file. */
+ error = init_new_file(mi, index_file_dentry, &args.file_id, args.size,
+ range(attr_value, args.file_attr_len),
+ u64_to_user_ptr(args.signature_info),
+ args.signature_size);
+ if (error)
+ goto out;
+ index_linked = true;
+
+ /* Linking a file with its real name from the requested dir. */
+ error = incfs_link(index_file_dentry, named_file_dentry);
+ if (error)
+ goto out;
+ name_linked = true;
+
+ if (args.size) {
+ /* Linking a file with its incomplete entry */
+ error = incfs_link(index_file_dentry, incomplete_file_dentry);
+ if (error)
+ goto out;
+ incomplete_linked = true;
+ }
+
+ notify_create(file, u64_to_user_ptr(args.directory_path), file_name,
+ file_id_str, args.size != 0);
+
+out:
+ if (error) {
+ pr_debug("incfs: %s err:%d\n", __func__, error);
+ /* Undo, in order, whatever links were established. */
+ if (index_linked)
+ incfs_unlink(index_file_dentry);
+ if (name_linked)
+ incfs_unlink(named_file_dentry);
+ if (incomplete_linked)
+ incfs_unlink(incomplete_file_dentry);
+ }
+
+ kfree(file_id_str);
+ kfree(file_name);
+ kfree(attr_value);
+ dput(named_file_dentry);
+ dput(index_file_dentry);
+ dput(incomplete_file_dentry);
+ path_put(&parent_dir_path);
+ if (locked)
+ mutex_unlock(&mi->mi_dir_struct_mutex);
+
+ return error;
+}
+
+/*
+ * Write the mapping file-header (source uuid, size, offset into the
+ * source) into the freshly created backing file at @dentry.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int init_new_mapped_file(struct mount_info *mi, struct dentry *dentry,
+ incfs_uuid_t *uuid, u64 size, u64 offset)
+{
+ struct path path = {};
+ struct file *new_file;
+ int error = 0;
+ struct backing_file_context *bfc = NULL;
+
+ if (!mi || !dentry || !uuid)
+ return -EFAULT;
+
+ /* Resize newly created file to its true size. */
+ /* NOTE(review): the comment above looks stale - the code only opens
+ * the backing file and writes the mapping header; no resize is
+ * visible here.  Confirm and fix the comment upstream.
+ */
+ path = (struct path) {
+ .mnt = mi->mi_backing_dir_path.mnt,
+ .dentry = dentry
+ };
+ new_file = dentry_open(&path, O_RDWR | O_NOATIME | O_LARGEFILE,
+ current_cred());
+
+ if (IS_ERR(new_file)) {
+ error = PTR_ERR(new_file);
+ goto out;
+ }
+
+ /* bfc takes its own reference on the file, so drop ours right away. */
+ bfc = incfs_alloc_bfc(mi, new_file);
+ fput(new_file);
+ if (IS_ERR(bfc)) {
+ error = PTR_ERR(bfc);
+ bfc = NULL;
+ goto out;
+ }
+
+ mutex_lock(&bfc->bc_mutex);
+ error = incfs_write_mapping_fh_to_backing_file(bfc, uuid, size, offset);
+ if (error)
+ goto out;
+
+out:
+ if (bfc) {
+ mutex_unlock(&bfc->bc_mutex);
+ incfs_free_bfc(bfc);
+ }
+
+ if (error)
+ pr_debug("incfs: %s error: %d\n", __func__, error);
+ return error;
+}
+
+/*
+ * Handler for INCFS_IOC_CREATE_MAPPED_FILE.
+ *
+ * Creates a file that exposes the byte range
+ * [source_offset, source_offset + size) of an existing incfs file
+ * identified by source_file_id.  The range is validated against the
+ * source file's recorded size before anything is created.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static long ioctl_create_mapped_file(struct file *file, void __user *arg)
+{
+ struct mount_info *mi = get_mount_info(file_superblock(file));
+ struct incfs_create_mapped_file_args __user *args_usr_ptr = arg;
+ struct incfs_create_mapped_file_args args = {};
+ char *file_name;
+ int error = 0;
+ struct path parent_dir_path = {};
+ char *source_file_name = NULL;
+ struct dentry *source_file_dentry = NULL;
+ u64 source_file_size;
+ struct dentry *file_dentry = NULL;
+ struct inode *parent_inode;
+ __le64 size_attr_value;
+
+ if (copy_from_user(&args, args_usr_ptr, sizeof(args)) > 0)
+ return -EINVAL;
+
+ file_name = strndup_user(u64_to_user_ptr(args.file_name), PATH_MAX);
+ if (IS_ERR(file_name)) {
+ error = PTR_ERR(file_name);
+ file_name = NULL;
+ goto out;
+ }
+
+ error = validate_name(file_name);
+ if (error)
+ goto out;
+
+ /* Mappings must start on a data-block boundary. */
+ if (args.source_offset % INCFS_DATA_FILE_BLOCK_SIZE) {
+ error = -EINVAL;
+ goto out;
+ }
+
+ /* Validate file mapping is in range */
+ source_file_name = file_id_to_str(args.source_file_id);
+ if (!source_file_name) {
+ pr_warn("Failed to alloc source_file_name\n");
+ error = -ENOMEM;
+ goto out;
+ }
+
+ source_file_dentry = incfs_lookup_dentry(mi->mi_index_dir,
+ source_file_name);
+ if (!source_file_dentry) {
+ pr_warn("Source file does not exist\n");
+ error = -EINVAL;
+ goto out;
+ }
+ if (IS_ERR(source_file_dentry)) {
+ pr_warn("Error opening source file\n");
+ error = PTR_ERR(source_file_dentry);
+ source_file_dentry = NULL;
+ goto out;
+ }
+ if (!d_really_is_positive(source_file_dentry)) {
+ pr_warn("Source file dentry negative\n");
+ error = -EINVAL;
+ goto out;
+ }
+
+ /* vfs_getxattr returns the attribute size on success. */
+ error = vfs_getxattr(&nop_mnt_idmap, source_file_dentry, INCFS_XATTR_SIZE_NAME,
+ (char *)&size_attr_value, sizeof(size_attr_value));
+ if (error < 0)
+ goto out;
+
+ if (error != sizeof(size_attr_value)) {
+ pr_warn("Mapped file has no size attr\n");
+ error = -EINVAL;
+ goto out;
+ }
+
+ source_file_size = le64_to_cpu(size_attr_value);
+
+ /*
+ * Check the mapped range fits inside the source file.  Written with
+ * a subtraction so that a huge source_offset cannot wrap the u64
+ * addition that "source_offset + size > source_file_size" would
+ * perform and thereby bypass the check.
+ */
+ if (args.source_offset > source_file_size ||
+ args.size > source_file_size - args.source_offset) {
+ pr_warn("Mapped file out of range\n");
+ error = -EINVAL;
+ goto out;
+ }
+
+ /* Find a directory to put the file into. */
+ error = dir_relative_path_resolve(mi,
+ u64_to_user_ptr(args.directory_path),
+ &parent_dir_path, NULL);
+ if (error)
+ goto out;
+
+ if (parent_dir_path.dentry == mi->mi_index_dir) {
+ /* Can't create a file directly inside .index */
+ error = -EBUSY;
+ goto out;
+ }
+
+ /* Look up a dentry in the parent dir. It should be negative. */
+ file_dentry = incfs_lookup_dentry(parent_dir_path.dentry,
+ file_name);
+ if (!file_dentry) {
+ error = -EFAULT;
+ goto out;
+ }
+ if (IS_ERR(file_dentry)) {
+ error = PTR_ERR(file_dentry);
+ file_dentry = NULL;
+ goto out;
+ }
+ if (d_really_is_positive(file_dentry)) {
+ error = -EEXIST;
+ goto out;
+ }
+
+ parent_inode = d_inode(parent_dir_path.dentry);
+ inode_lock_nested(parent_inode, I_MUTEX_PARENT);
+ /* 0222 keeps the backing file writable for block filling. */
+ error = vfs_create(&nop_mnt_idmap, parent_inode, file_dentry,
+ args.mode | 0222, true);
+ inode_unlock(parent_inode);
+ if (error)
+ goto out;
+
+ error = chmod(file_dentry, args.mode | 0222);
+ if (error) {
+ pr_debug("incfs: chmod err: %d\n", error);
+ goto delete_file;
+ }
+
+ /* Save the file's size as an xattr for easy fetching in future. */
+ size_attr_value = cpu_to_le64(args.size);
+ error = vfs_setxattr(&nop_mnt_idmap, file_dentry, INCFS_XATTR_SIZE_NAME,
+ (char *)&size_attr_value, sizeof(size_attr_value),
+ XATTR_CREATE);
+ if (error) {
+ pr_debug("incfs: vfs_setxattr err:%d\n", error);
+ goto delete_file;
+ }
+
+ error = init_new_mapped_file(mi, file_dentry, &args.source_file_id,
+ args.size, args.source_offset);
+ if (error)
+ goto delete_file;
+
+ notify_create(file, u64_to_user_ptr(args.directory_path), file_name,
+ NULL, false);
+
+ goto out;
+
+delete_file:
+ /* Partial creation - remove the half-initialized file. */
+ incfs_unlink(file_dentry);
+
+out:
+ dput(file_dentry);
+ dput(source_file_dentry);
+ path_put(&parent_dir_path);
+ kfree(file_name);
+ kfree(source_file_name);
+ return error;
+}
+
+/*
+ * Handler for INCFS_IOC_GET_READ_TIMEOUTS.
+ *
+ * Copies the current per-uid read-timeout table to userspace.  When the
+ * caller's buffer is too small, -E2BIG is returned but the required size
+ * is still reported via timeouts_array_size_out.
+ */
+static long ioctl_get_read_timeouts(struct mount_info *mi, void __user *arg)
+{
+ struct incfs_get_read_timeouts_args __user *args_usr_ptr = arg;
+ struct incfs_get_read_timeouts_args args = {};
+ int error = 0;
+ struct incfs_per_uid_read_timeouts *buffer;
+ int size;
+
+ if (copy_from_user(&args, args_usr_ptr, sizeof(args)))
+ return -EINVAL;
+
+ if (args.timeouts_array_size > INCFS_DATA_FILE_BLOCK_SIZE)
+ return -EINVAL;
+
+ buffer = kzalloc(args.timeouts_array_size, GFP_NOFS);
+ if (!buffer)
+ return -ENOMEM;
+
+ /* Snapshot the table under the lock; copy_to_user happens outside. */
+ spin_lock(&mi->mi_per_uid_read_timeouts_lock);
+ size = mi->mi_per_uid_read_timeouts_size;
+ if (args.timeouts_array_size < size)
+ error = -E2BIG;
+ else if (size)
+ memcpy(buffer, mi->mi_per_uid_read_timeouts, size);
+ spin_unlock(&mi->mi_per_uid_read_timeouts_lock);
+
+ args.timeouts_array_size_out = size;
+ if (!error && size)
+ if (copy_to_user(u64_to_user_ptr(args.timeouts_array), buffer,
+ size))
+ error = -EFAULT;
+
+ /* On -E2BIG still report the needed size back to the caller. */
+ if (!error || error == -E2BIG)
+ if (copy_to_user(args_usr_ptr, &args, sizeof(args)) > 0)
+ error = -EFAULT;
+
+ kfree(buffer);
+ return error;
+}
+
+/*
+ * Handler for INCFS_IOC_SET_READ_TIMEOUTS.
+ *
+ * Validates and installs a new per-uid read-timeout table, replacing the
+ * old one atomically under the spinlock.  An empty array clears the
+ * table.
+ */
+static long ioctl_set_read_timeouts(struct mount_info *mi, void __user *arg)
+{
+ struct incfs_set_read_timeouts_args __user *args_usr_ptr = arg;
+ struct incfs_set_read_timeouts_args args = {};
+ int error = 0;
+ /* NOTE(review): size is a signed int fed from a userspace field;
+ * values above INT_MAX turn negative here.  The %/kzalloc paths below
+ * appear to reject them, but confirm - an unsigned local would be
+ * safer.
+ */
+ int size;
+ struct incfs_per_uid_read_timeouts *buffer = NULL, *tmp;
+ int i;
+
+ if (copy_from_user(&args, args_usr_ptr, sizeof(args)))
+ return -EINVAL;
+
+ size = args.timeouts_array_size;
+ if (size) {
+ /* Must be a whole number of entries and fit one block. */
+ if (size > INCFS_DATA_FILE_BLOCK_SIZE ||
+ size % sizeof(*buffer) != 0)
+ return -EINVAL;
+
+ buffer = kzalloc(size, GFP_NOFS);
+ if (!buffer)
+ return -ENOMEM;
+
+ if (copy_from_user(buffer, u64_to_user_ptr(args.timeouts_array),
+ size)) {
+ error = -EINVAL;
+ goto out;
+ }
+
+ /* Reject entries whose min exceeds their max. */
+ for (i = 0; i < size / sizeof(*buffer); ++i) {
+ struct incfs_per_uid_read_timeouts *t = &buffer[i];
+
+ if (t->min_pending_time_us > t->max_pending_time_us) {
+ error = -EINVAL;
+ goto out;
+ }
+ }
+ }
+
+ /* Swap in the new table; the old one is freed below via 'buffer'. */
+ spin_lock(&mi->mi_per_uid_read_timeouts_lock);
+ mi->mi_per_uid_read_timeouts_size = size;
+ tmp = mi->mi_per_uid_read_timeouts;
+ mi->mi_per_uid_read_timeouts = buffer;
+ buffer = tmp;
+ spin_unlock(&mi->mi_per_uid_read_timeouts_lock);
+
+out:
+ kfree(buffer);
+ return error;
+}
+
+/*
+ * Handler for INCFS_IOC_GET_LAST_READ_ERROR.
+ *
+ * Copies the most recently recorded read error (file id, time, page,
+ * errno, uid) to userspace.  The snapshot is taken under mi_le_mutex so
+ * the fields are mutually consistent.
+ */
+static long ioctl_get_last_read_error(struct mount_info *mi, void __user *arg)
+{
+ struct incfs_get_last_read_error_args __user *args_usr_ptr = arg;
+ struct incfs_get_last_read_error_args args = {};
+ int error;
+
+ error = mutex_lock_interruptible(&mi->mi_le_mutex);
+ if (error)
+ return error;
+
+ args.file_id_out = mi->mi_le_file_id;
+ args.time_us_out = mi->mi_le_time_us;
+ args.page_out = mi->mi_le_page;
+ args.errno_out = mi->mi_le_errno;
+ args.uid_out = mi->mi_le_uid;
+
+ mutex_unlock(&mi->mi_le_mutex);
+ if (copy_to_user(args_usr_ptr, &args, sizeof(args)) > 0)
+ error = -EFAULT;
+
+ return error;
+}
+
+/*
+ * ioctl dispatcher for the .pending_reads pseudo file.  All incfs control
+ * ioctls enter here; unknown requests get -EINVAL.
+ */
+static long pending_reads_dispatch_ioctl(struct file *f, unsigned int req,
+ unsigned long arg)
+{
+ struct mount_info *mi = get_mount_info(file_superblock(f));
+
+ switch (req) {
+ case INCFS_IOC_CREATE_FILE:
+ return ioctl_create_file(f, (void __user *)arg);
+ case INCFS_IOC_PERMIT_FILL:
+ return ioctl_permit_fill(f, (void __user *)arg);
+ case INCFS_IOC_CREATE_MAPPED_FILE:
+ return ioctl_create_mapped_file(f, (void __user *)arg);
+ case INCFS_IOC_GET_READ_TIMEOUTS:
+ return ioctl_get_read_timeouts(mi, (void __user *)arg);
+ case INCFS_IOC_SET_READ_TIMEOUTS:
+ return ioctl_set_read_timeouts(mi, (void __user *)arg);
+ case INCFS_IOC_GET_LAST_READ_ERROR:
+ return ioctl_get_last_read_error(mi, (void __user *)arg);
+ default:
+ return -EINVAL;
+ }
+}
+
+static const struct file_operations incfs_pending_reads_file_ops = {
+ .read = pending_reads_read,
+ .poll = pending_reads_poll,
+ .open = pending_reads_open,
+ .release = pending_reads_release,
+ .llseek = noop_llseek,
+ .unlocked_ioctl = pending_reads_dispatch_ioctl,
+ .compat_ioctl = pending_reads_dispatch_ioctl
+};
+
+/*******************************************************************************
+ * .log pseudo file definition
+ ******************************************************************************/
+#define INCFS_LOG_INODE 3
+static const char log_file_name[] = INCFS_LOG_FILENAME;
+
+/* State of an open .log file, unique for each file descriptor. */
+struct log_file_state {
+ struct read_log_state state;
+};
+
+/*
+ * read() for the .log pseudo file.
+ *
+ * Streams logged read records to userspace through a one-page staging
+ * buffer.  The record layout (and size) depends on the mount's
+ * report_uid option.  The per-fd cursor in log_state is only advanced
+ * after a successful copy_to_user, so a failed copy does not lose
+ * records.  Returns bytes copied, 0 at end of log, or a negative errno.
+ */
+static ssize_t log_read(struct file *f, char __user *buf, size_t len,
+ loff_t *ppos)
+{
+ struct log_file_state *log_state = f->private_data;
+ struct mount_info *mi = get_mount_info(file_superblock(f));
+ int total_reads_collected = 0;
+ int rl_size;
+ ssize_t result = 0;
+ bool report_uid;
+ unsigned long page = 0;
+ struct incfs_pending_read_info *reads_buf = NULL;
+ struct incfs_pending_read_info2 *reads_buf2 = NULL;
+ size_t record_size;
+ ssize_t reads_to_collect;
+ ssize_t reads_per_page;
+
+ if (!mi)
+ return -EFAULT;
+
+ report_uid = mi->mi_options.report_uid;
+ record_size = report_uid ? sizeof(*reads_buf2) : sizeof(*reads_buf);
+ reads_to_collect = len / record_size;
+ reads_per_page = PAGE_SIZE / record_size;
+
+ rl_size = READ_ONCE(mi->mi_log.rl_size);
+ if (rl_size == 0)
+ return 0;
+
+ page = __get_free_page(GFP_NOFS);
+ if (!page)
+ return -ENOMEM;
+
+ /* Only one of the two staging pointers is used per mount option. */
+ if (report_uid)
+ reads_buf2 = (struct incfs_pending_read_info2 *)page;
+ else
+ reads_buf = (struct incfs_pending_read_info *)page;
+
+ reads_to_collect = min_t(ssize_t, rl_size, reads_to_collect);
+ while (reads_to_collect > 0) {
+ struct read_log_state next_state;
+ int reads_collected;
+
+ /* Work on a copy; commit only after the user copy succeeds. */
+ memcpy(&next_state, &log_state->state, sizeof(next_state));
+ reads_collected = incfs_collect_logged_reads(
+ mi, &next_state, reads_buf, reads_buf2,
+ min_t(ssize_t, reads_to_collect, reads_per_page));
+ if (reads_collected <= 0) {
+ /* Report what was already copied, else the error/EOF. */
+ result = total_reads_collected ?
+ total_reads_collected * record_size :
+ reads_collected;
+ goto out;
+ }
+ if (copy_to_user(buf, (void *)page,
+ reads_collected * record_size)) {
+ result = total_reads_collected ?
+ total_reads_collected * record_size :
+ -EFAULT;
+ goto out;
+ }
+
+ memcpy(&log_state->state, &next_state, sizeof(next_state));
+ total_reads_collected += reads_collected;
+ buf += reads_collected * record_size;
+ reads_to_collect -= reads_collected;
+ }
+
+ result = total_reads_collected * record_size;
+ /* Position is meaningless for this stream; keep it pinned at 0. */
+ *ppos = 0;
+out:
+ free_page(page);
+ return result;
+}
+
+/*
+ * poll() for the .log pseudo file: readable once at least
+ * read_log_wakeup_count records are waiting for this descriptor.
+ */
+static __poll_t log_poll(struct file *file, poll_table *wait)
+{
+ struct log_file_state *log_state = file->private_data;
+ struct mount_info *mi = get_mount_info(file_superblock(file));
+ int count;
+ __poll_t ret = 0;
+
+ poll_wait(file, &mi->mi_log.ml_notif_wq, wait);
+ count = incfs_get_uncollected_logs_count(mi, &log_state->state);
+ if (count >= mi->mi_options.read_log_wakeup_count)
+ ret = EPOLLIN | EPOLLRDNORM;
+
+ return ret;
+}
+
+/*
+ * open() for the .log pseudo file: allocate a per-descriptor cursor
+ * initialized to the mount's current log position.
+ */
+static int log_open(struct inode *inode, struct file *file)
+{
+ struct log_file_state *log_state = NULL;
+ struct mount_info *mi = get_mount_info(file_superblock(file));
+
+ log_state = kzalloc(sizeof(*log_state), GFP_NOFS);
+ if (!log_state)
+ return -ENOMEM;
+
+ log_state->state = incfs_get_log_state(mi);
+ file->private_data = log_state;
+ return 0;
+}
+
+/* release() for the .log pseudo file: free the per-descriptor cursor. */
+static int log_release(struct inode *inode, struct file *file)
+{
+ kfree(file->private_data);
+ return 0;
+}
+
+static const struct file_operations incfs_log_file_ops = {
+ .read = log_read,
+ .poll = log_poll,
+ .open = log_open,
+ .release = log_release,
+ .llseek = noop_llseek,
+};
+
+/*******************************************************************************
+ * .blocks_written pseudo file definition
+ ******************************************************************************/
+#define INCFS_BLOCKS_WRITTEN_INODE 4
+static const char blocks_written_file_name[] = INCFS_BLOCKS_WRITTEN_FILENAME;
+
+/* State of an open .blocks_written file, unique for each file descriptor. */
+struct blocks_written_file_state {
+ unsigned long blocks_written;
+};
+
+/*
+ * read() for the .blocks_written pseudo file.
+ *
+ * Returns the current written-block counter as a decimal string, but only
+ * when it changed since this descriptor last read it (otherwise 0/EOF),
+ * making the file usable as a change indicator together with poll().
+ *
+ * NOTE(review): on a short read (len smaller than the number) the string
+ * is truncated yet state->blocks_written is still updated, and *ppos is
+ * never advanced - confirm this is the intended contract.
+ */
+static ssize_t blocks_written_read(struct file *f, char __user *buf, size_t len,
+ loff_t *ppos)
+{
+ struct mount_info *mi = get_mount_info(file_superblock(f));
+ struct blocks_written_file_state *state = f->private_data;
+ unsigned long blocks_written;
+ char string[21];
+ int result = 0;
+
+ if (!mi)
+ return -EFAULT;
+
+ blocks_written = atomic_read(&mi->mi_blocks_written);
+ if (state->blocks_written == blocks_written)
+ return 0;
+
+ result = snprintf(string, sizeof(string), "%lu", blocks_written);
+ if (result > len)
+ result = len;
+ if (copy_to_user(buf, string, result))
+ return -EFAULT;
+
+ state->blocks_written = blocks_written;
+ return result;
+}
+
+/*
+ * poll() for the .blocks_written pseudo file: readable when the counter
+ * differs from what this descriptor last read.
+ */
+static __poll_t blocks_written_poll(struct file *f, poll_table *wait)
+{
+ struct mount_info *mi = get_mount_info(file_superblock(f));
+ struct blocks_written_file_state *state = f->private_data;
+ unsigned long blocks_written;
+
+ if (!mi)
+ return 0;
+
+ poll_wait(f, &mi->mi_blocks_written_notif_wq, wait);
+ blocks_written = atomic_read(&mi->mi_blocks_written);
+ if (state->blocks_written == blocks_written)
+ return 0;
+
+ return EPOLLIN | EPOLLRDNORM;
+}
+
+/*
+ * open() for the .blocks_written pseudo file.  The per-fd counter starts
+ * at -1 (never-matching) so the first read always reports a value.
+ */
+static int blocks_written_open(struct inode *inode, struct file *file)
+{
+ struct blocks_written_file_state *state =
+ kzalloc(sizeof(*state), GFP_NOFS);
+
+ if (!state)
+ return -ENOMEM;
+
+ state->blocks_written = -1;
+ file->private_data = state;
+ return 0;
+}
+
+/* release() for .blocks_written: free the per-descriptor state. */
+static int blocks_written_release(struct inode *inode, struct file *file)
+{
+ kfree(file->private_data);
+ return 0;
+}
+
+static const struct file_operations incfs_blocks_written_file_ops = {
+ .read = blocks_written_read,
+ .poll = blocks_written_poll,
+ .open = blocks_written_open,
+ .release = blocks_written_release,
+ .llseek = noop_llseek,
+};
+
+/*******************************************************************************
+ * Generic inode lookup functionality
+ ******************************************************************************/
+
+const struct mem_range incfs_pseudo_file_names[] = {
+ { .data = (u8 *)pending_reads_file_name,
+ .len = ARRAY_SIZE(pending_reads_file_name) - 1 },
+ { .data = (u8 *)log_file_name, .len = ARRAY_SIZE(log_file_name) - 1 },
+ { .data = (u8 *)blocks_written_file_name,
+ .len = ARRAY_SIZE(blocks_written_file_name) - 1 }
+};
+
+const unsigned long incfs_pseudo_file_inodes[] = { INCFS_PENDING_READS_INODE,
+ INCFS_LOG_INODE,
+ INCFS_BLOCKS_WRITTEN_INODE };
+
+static const struct file_operations *const pseudo_file_operations[] = {
+ &incfs_pending_reads_file_ops, &incfs_log_file_ops,
+ &incfs_blocks_written_file_ops
+};
+
+/* Return true when @name matches one of the incfs pseudo file names. */
+static bool is_pseudo_filename(struct mem_range name)
+{
+ size_t idx;
+
+ for (idx = 0; idx < ARRAY_SIZE(incfs_pseudo_file_names); idx++) {
+ if (incfs_equal_ranges(incfs_pseudo_file_names[idx], name))
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * If @ino is one of the known pseudo-file inode numbers, initialize
+ * @inode as that pseudo file (zero size/times, matching fops) and return
+ * true; otherwise return false and leave @inode untouched.
+ */
+static bool get_pseudo_inode(int ino, struct inode *inode)
+{
+ int i = 0;
+
+ for (; i < ARRAY_SIZE(incfs_pseudo_file_inodes); ++i)
+ if (ino == incfs_pseudo_file_inodes[i])
+ break;
+ if (i == ARRAY_SIZE(incfs_pseudo_file_inodes))
+ return false;
+
+ inode_set_mtime(inode, 0, 0);
+ inode_set_atime(inode, 0, 0);
+ inode_set_ctime(inode, 0, 0);
+ inode->i_size = 0;
+ inode->i_ino = ino;
+ inode->i_private = NULL;
+ inode_init_owner(&nop_mnt_idmap, inode, NULL, S_IFREG | READ_WRITE_FILE_MODE);
+ inode->i_op = &incfs_file_inode_ops;
+ /* Index i still selects the matching fops table entry here. */
+ inode->i_fop = pseudo_file_operations[i];
+ return true;
+}
+
+struct inode_search {
+ unsigned long ino;
+};
+
+/* iget5_locked() test callback: match on the searched inode number. */
+static int inode_test(struct inode *inode, void *opaque)
+{
+ const struct inode_search *key = opaque;
+
+ return inode->i_ino == key->ino;
+}
+
+/*
+ * iget5_locked() init callback: populate a fresh inode as the requested
+ * pseudo file, or fail with -EINVAL for an unknown inode number.
+ */
+static int inode_set(struct inode *inode, void *opaque)
+{
+ struct inode_search *search = opaque;
+
+ if (get_pseudo_inode(search->ino, inode))
+ return 0;
+
+ /* Unknown inode requested. */
+ return -EINVAL;
+}
+
+/*
+ * Look up (or create and initialize) the pseudo-file inode @ino on @sb.
+ * Returns the inode or an ERR_PTR.
+ */
+static struct inode *fetch_inode(struct super_block *sb, unsigned long ino)
+{
+ struct inode_search search = {
+ .ino = ino
+ };
+ struct inode *inode = iget5_locked(sb, search.ino, inode_test,
+ inode_set, &search);
+
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ /* Newly created inodes come back locked; publish them. */
+ if (inode->i_state & I_NEW)
+ unlock_new_inode(inode);
+
+ return inode;
+}
+
+/*
+ * If @dentry names one of the incfs pseudo files, instantiate it with the
+ * matching pseudo inode.  Returns 0 on success, -ENOENT when the name is
+ * not a pseudo file, or a negative errno from inode setup.
+ */
+int dir_lookup_pseudo_files(struct super_block *sb, struct dentry *dentry)
+{
+ struct mem_range wanted =
+ range((u8 *)dentry->d_name.name, dentry->d_name.len);
+ int idx;
+
+ for (idx = 0; idx < ARRAY_SIZE(incfs_pseudo_file_names); idx++) {
+ struct inode *inode;
+
+ if (!incfs_equal_ranges(incfs_pseudo_file_names[idx], wanted))
+ continue;
+
+ inode = fetch_inode(sb, incfs_pseudo_file_inodes[idx]);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ d_add(dentry, inode);
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+/*
+ * Emit directory entries for the pseudo files during readdir, resuming
+ * from ctx->pos.  Returns 0 when all remaining entries were emitted,
+ * -EINVAL when the emit callback asked to stop.
+ */
+int emit_pseudo_files(struct dir_context *ctx)
+{
+ loff_t i = ctx->pos;
+
+ for (; i < ARRAY_SIZE(incfs_pseudo_file_names); ++i) {
+ if (!dir_emit(ctx, incfs_pseudo_file_names[i].data,
+ incfs_pseudo_file_names[i].len,
+ incfs_pseudo_file_inodes[i], DT_REG))
+ return -EINVAL;
+
+ ctx->pos++;
+ }
+ return 0;
+}
diff --git a/fs/incfs/pseudo_files.h b/fs/incfs/pseudo_files.h
new file mode 100644
--- /dev/null
+++ b/fs/incfs/pseudo_files.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2020 Google LLC
+ */
+
+#ifndef _INCFS_PSEUDO_FILES_H
+#define _INCFS_PSEUDO_FILES_H
+
+#include "internal.h"
+
+#define PSEUDO_FILE_COUNT 3
+#define INCFS_START_INO_RANGE 10
+
+extern const struct mem_range incfs_pseudo_file_names[PSEUDO_FILE_COUNT];
+extern const unsigned long incfs_pseudo_file_inodes[PSEUDO_FILE_COUNT];
+
+int dir_lookup_pseudo_files(struct super_block *sb, struct dentry *dentry);
+int emit_pseudo_files(struct dir_context *ctx);
+
+#endif
diff --git a/fs/incfs/sysfs.c b/fs/incfs/sysfs.c
new file mode 100644
--- /dev/null
+++ b/fs/incfs/sysfs.c
@@ -0,0 +1,205 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2021 Google LLC
+ */
+#include <linux/fs.h>
+#include <linux/kobject.h>
+
+#include <uapi/linux/incrementalfs.h>
+
+#include "sysfs.h"
+#include "data_mgmt.h"
+#include "vfs.h"
+
+/******************************************************************************
+ * Define sys/fs/incrementalfs & sys/fs/incrementalfs/features
+ *****************************************************************************/
+#define INCFS_NODE_FEATURES "features"
+#define INCFS_NODE_INSTANCES "instances"
+
+static struct kobject *sysfs_root;
+static struct kobject *features_node;
+static struct kobject *instances_node;
+
+#define DECLARE_FEATURE_FLAG(name) \
+ static ssize_t name##_show(struct kobject *kobj, \
+ struct kobj_attribute *attr, char *buff) \
+{ \
+ return sysfs_emit(buff, "supported\n"); \
+} \
+ \
+static struct kobj_attribute name##_attr = __ATTR_RO(name)
+
+DECLARE_FEATURE_FLAG(corefs);
+DECLARE_FEATURE_FLAG(zstd);
+DECLARE_FEATURE_FLAG(v2);
+DECLARE_FEATURE_FLAG(bugfix_throttling);
+DECLARE_FEATURE_FLAG(bugfix_inode_eviction);
+
+static struct attribute *attributes[] = {
+ &corefs_attr.attr,
+ &zstd_attr.attr,
+ &v2_attr.attr,
+ &bugfix_throttling_attr.attr,
+ &bugfix_inode_eviction_attr.attr,
+ NULL,
+};
+
+static const struct attribute_group attr_group = {
+ .attrs = attributes,
+};
+
+/*
+ * Create /sys/fs/incrementalfs with its "instances" and "features"
+ * subdirectories and the feature-flag attribute group.  On any failure
+ * the kobjects created so far are released in reverse order.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int __init incfs_init_sysfs(void)
+{
+ int res = -ENOMEM;
+
+ sysfs_root = kobject_create_and_add(INCFS_NAME, fs_kobj);
+ if (!sysfs_root)
+ return -ENOMEM;
+
+ instances_node = kobject_create_and_add(INCFS_NODE_INSTANCES,
+ sysfs_root);
+ if (!instances_node)
+ goto err_put_root;
+
+ features_node = kobject_create_and_add(INCFS_NODE_FEATURES,
+ sysfs_root);
+ if (!features_node)
+ goto err_put_instances;
+
+ res = sysfs_create_group(features_node, &attr_group);
+ if (res)
+ goto err_put_features;
+
+ return 0;
+
+err_put_features:
+ kobject_put(features_node);
+err_put_instances:
+ kobject_put(instances_node);
+err_put_root:
+ kobject_put(sysfs_root);
+
+ return res;
+}
+
+/*
+ * Tear down the sysfs hierarchy built by incfs_init_sysfs().  Safe to
+ * call with partially (or never) initialized nodes - kobject_put(NULL)
+ * is a no-op.
+ */
+void incfs_cleanup_sysfs(void)
+{
+ if (features_node) {
+ sysfs_remove_group(features_node, &attr_group);
+ kobject_put(features_node);
+ }
+
+ kobject_put(instances_node);
+ kobject_put(sysfs_root);
+}
+
+/******************************************************************************
+ * Define sys/fs/incrementalfs/instances/<name>/
+ *****************************************************************************/
+#define __DECLARE_STATUS_FLAG(name) \
+static ssize_t name##_show(struct kobject *kobj, \
+ struct kobj_attribute *attr, char *buff) \
+{ \
+ struct incfs_sysfs_node *node = container_of(kobj, \
+ struct incfs_sysfs_node, isn_sysfs_node); \
+ \
+ return sysfs_emit(buff, "%d\n", node->isn_mi->mi_##name); \
+} \
+ \
+static struct kobj_attribute name##_attr = __ATTR_RO(name)
+
+#define __DECLARE_STATUS_FLAG64(name) \
+static ssize_t name##_show(struct kobject *kobj, \
+ struct kobj_attribute *attr, char *buff) \
+{ \
+ struct incfs_sysfs_node *node = container_of(kobj, \
+ struct incfs_sysfs_node, isn_sysfs_node); \
+ \
+ return sysfs_emit(buff, "%lld\n", node->isn_mi->mi_##name); \
+} \
+ \
+static struct kobj_attribute name##_attr = __ATTR_RO(name)
+
+__DECLARE_STATUS_FLAG(reads_failed_timed_out);
+__DECLARE_STATUS_FLAG(reads_failed_hash_verification);
+__DECLARE_STATUS_FLAG(reads_failed_other);
+__DECLARE_STATUS_FLAG(reads_delayed_pending);
+__DECLARE_STATUS_FLAG64(reads_delayed_pending_us);
+__DECLARE_STATUS_FLAG(reads_delayed_min);
+__DECLARE_STATUS_FLAG64(reads_delayed_min_us);
+
+static struct attribute *mount_attributes[] = {
+ &reads_failed_timed_out_attr.attr,
+ &reads_failed_hash_verification_attr.attr,
+ &reads_failed_other_attr.attr,
+ &reads_delayed_pending_attr.attr,
+ &reads_delayed_pending_us_attr.attr,
+ &reads_delayed_min_attr.attr,
+ &reads_delayed_min_us_attr.attr,
+ NULL,
+};
+
+/*
+ * kobject release callback: signal incfs_free_sysfs_node() that the
+ * last reference is gone so it may free the containing node.
+ */
+static void incfs_sysfs_release(struct kobject *kobj)
+{
+ struct incfs_sysfs_node *node = container_of(kobj,
+ struct incfs_sysfs_node, isn_sysfs_node);
+
+ complete(&node->isn_completion);
+}
+
+static const struct attribute_group mount_attr_group = {
+ .attrs = mount_attributes,
+};
+
+static struct kobj_type incfs_kobj_node_ktype = {
+ .sysfs_ops = &kobj_sysfs_ops,
+ .release = &incfs_sysfs_release,
+};
+
+/*
+ * Create /sys/fs/incrementalfs/instances/<name>/ with the per-mount
+ * statistics attribute group.
+ *
+ * NOTE(review): returns NULL (not an ERR_PTR) when name is NULL but
+ * ERR_PTR on every other failure - callers must cope with both; confirm
+ * this mixed convention is intentional.
+ */
+struct incfs_sysfs_node *incfs_add_sysfs_node(const char *name,
+ struct mount_info *mi)
+{
+ struct incfs_sysfs_node *node = NULL;
+ int error;
+
+ if (!name)
+ return NULL;
+
+ node = kzalloc(sizeof(*node), GFP_NOFS);
+ if (!node)
+ return ERR_PTR(-ENOMEM);
+
+ node->isn_mi = mi;
+
+ init_completion(&node->isn_completion);
+ kobject_init(&node->isn_sysfs_node, &incfs_kobj_node_ktype);
+ error = kobject_add(&node->isn_sysfs_node, instances_node, "%s", name);
+ if (error)
+ goto err;
+
+ error = sysfs_create_group(&node->isn_sysfs_node, &mount_attr_group);
+ if (error)
+ goto err;
+
+ return node;
+
+err:
+ /*
+ * Note kobject_put always calls release, so incfs_sysfs_release will
+ * free node
+ */
+ kobject_put(&node->isn_sysfs_node);
+ return ERR_PTR(error);
+}
+
+/*
+ * Remove the per-mount sysfs node and free it once the kobject's release
+ * callback has completed.
+ *
+ * NOTE(review): the wait is interruptible - if a signal cuts it short,
+ * node is freed while a still-pending release could later complete() on
+ * freed memory.  Confirm the completion is guaranteed to have fired by
+ * this point.
+ */
+void incfs_free_sysfs_node(struct incfs_sysfs_node *node)
+{
+ if (!node)
+ return;
+
+ sysfs_remove_group(&node->isn_sysfs_node, &mount_attr_group);
+ kobject_put(&node->isn_sysfs_node);
+ wait_for_completion_interruptible(&node->isn_completion);
+ kfree(node);
+}
diff --git a/fs/incfs/sysfs.h b/fs/incfs/sysfs.h
new file mode 100644
--- /dev/null
+++ b/fs/incfs/sysfs.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2021 Google LLC
+ */
+#ifndef _INCFS_SYSFS_H
+#define _INCFS_SYSFS_H
+
+struct incfs_sysfs_node {
+ struct kobject isn_sysfs_node;
+
+ struct completion isn_completion;
+
+ struct mount_info *isn_mi;
+};
+
+int incfs_init_sysfs(void);
+void incfs_cleanup_sysfs(void);
+struct incfs_sysfs_node *incfs_add_sysfs_node(const char *name,
+ struct mount_info *mi);
+void incfs_free_sysfs_node(struct incfs_sysfs_node *node);
+
+#endif
diff --git a/fs/incfs/verity.c b/fs/incfs/verity.c
new file mode 100644
--- /dev/null
+++ b/fs/incfs/verity.c
@@ -0,0 +1,821 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2020 Google LLC
+ */
+
+/*
+ * fs-verity integration into incfs
+ *
+ * Since incfs has its own merkle tree implementation, most of fs/verity/ is not
+ * needed. incfs also only needs to support the case where
+ * CONFIG_FS_VERITY_BUILTIN_SIGNATURES=n. Therefore, the integration consists of
+ * the following modifications:
+ *
+ * 1. Add the (optional) verity signature to the incfs file format. (Not really
+ * needed anymore, but this is kept around since this is the behavior of
+ * fs/verity/ even when CONFIG_FS_VERITY_BUILTIN_SIGNATURES=n.)
+ * 2. Add a pointer to the digest of the fs-verity descriptor struct to the
+ * data_file struct that incfs attaches to each file inode.
+ * 3. Add the following ioctls:
+ * - FS_IOC_ENABLE_VERITY
+ * - FS_IOC_GETFLAGS
+ * - FS_IOC_MEASURE_VERITY
+ * 4. When FS_IOC_ENABLE_VERITY is called on a non-verity file, the
+ * fs-verity descriptor struct is populated and digested. Then the S_VERITY
+ * flag is set and the xattr incfs.verity is set. If the signature is
+ * non-NULL, an INCFS_MD_VERITY_SIGNATURE is added to the backing file
+ * containing the signature.
+ * 5. When a file with an incfs.verity xattr's inode is initialized, the
+ * inode’s S_VERITY flag is set.
+ * 6. When a file with the S_VERITY flag set on its inode is opened, the
+ * data_file is checked for its verity digest. If the file doesn’t have a
+ * digest, the file’s digest is calculated as above, checked, and set, or the
+ * open is denied if it is not valid.
+ * 7. FS_IOC_GETFLAGS simply returns the value of the S_VERITY flag
+ * 8. FS_IOC_MEASURE_VERITY simply returns the cached digest
+ * 9. The final complication is that if FS_IOC_ENABLE_VERITY is called on a file
+ * which doesn’t have a merkle tree, the merkle tree is calculated before the
+ * rest of the process is completed.
+ */
+
+#include <crypto/hash.h>
+#include <crypto/sha2.h>
+#include <linux/fsverity.h>
+#include <linux/mount.h>
+
+#include "verity.h"
+
+#include "data_mgmt.h"
+#include "format.h"
+#include "integrity.h"
+#include "vfs.h"
+
+#define FS_VERITY_MAX_SIGNATURE_SIZE 16128
+
+/*
+ * Copy the Merkle-tree root hash of @filp into @root_hash.
+ * Returns -EINVAL if the file has no incfs data or no hash tree.
+ */
+static int incfs_get_root_hash(struct file *filp, u8 *root_hash)
+{
+	struct data_file *df = get_incfs_data_file(filp);
+
+	/*
+	 * Also guard df_hash_tree: a file without a signature record has
+	 * no tree yet, and dereferencing it here would be a NULL deref.
+	 */
+	if (!df || !df->df_hash_tree)
+		return -EINVAL;
+
+	memcpy(root_hash, df->df_hash_tree->root_hash,
+	       df->df_hash_tree->alg->digest_size);
+
+	return 0;
+}
+
+/*
+ * Finish enabling verity: persist the (optional) signature to the
+ * backing file, set the incfs.verity xattr, and raise S_VERITY on the
+ * inode. @sig may be NULL with @sig_size 0 when no signature was given.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int incfs_end_enable_verity(struct file *filp, u8 *sig, size_t sig_size)
+{
+	struct inode *inode = file_inode(filp);
+	struct mem_range signature = {
+		.data = sig,
+		.len = sig_size,
+	};
+	struct data_file *df = get_incfs_data_file(filp);
+	struct backing_file_context *bfc;
+	int error;
+	struct incfs_df_verity_signature *vs = NULL;
+	loff_t offset;
+
+	if (!df || !df->df_backing_file_context)
+		return -EFSCORRUPTED;
+
+	/* Allocate before taking bc_mutex so the locked section stays short */
+	if (sig) {
+		vs = kzalloc(sizeof(*vs), GFP_NOFS);
+		if (!vs)
+			return -ENOMEM;
+	}
+
+	bfc = df->df_backing_file_context;
+	error = mutex_lock_interruptible(&bfc->bc_mutex);
+	if (error)
+		goto out;
+
+	error = incfs_write_verity_signature_to_backing_file(bfc, signature,
+							     &offset);
+	mutex_unlock(&bfc->bc_mutex);
+	if (error)
+		goto out;
+
+	/*
+	 * Set verity xattr so we can set S_VERITY without opening backing file
+	 */
+	error = vfs_setxattr(&nop_mnt_idmap, bfc->bc_file->f_path.dentry,
+			     INCFS_XATTR_VERITY_NAME, NULL, 0, XATTR_CREATE);
+	if (error) {
+		pr_warn("incfs: error setting verity xattr: %d\n", error);
+		goto out;
+	}
+
+	if (sig) {
+		/* Record where the signature was written in the backing file */
+		*vs = (struct incfs_df_verity_signature) {
+			.size = signature.len,
+			.offset = offset,
+		};
+
+		df->df_verity_signature = vs;
+		vs = NULL;	/* ownership transferred to df */
+	}
+
+	inode_set_flags(inode, S_VERITY, S_VERITY);
+
+out:
+	kfree(vs);
+	return error;
+}
+
+/* Digest the fs-verity descriptor @desc with @alg's shash into @digest. */
+static int incfs_compute_file_digest(struct incfs_hash_alg *alg,
+				     struct fsverity_descriptor *desc,
+				     u8 *digest)
+{
+	SHASH_DESC_ON_STACK(shash_desc, alg->shash);
+
+	shash_desc->tfm = alg->shash;
+	return crypto_shash_digest(shash_desc, (u8 *)desc, sizeof(*desc),
+				   digest);
+}
+
+/* Map an fs-verity hash algorithm id to incfs's id; -EINVAL if unknown. */
+static enum incfs_hash_tree_algorithm incfs_convert_fsverity_hash_alg(
+	int hash_alg)
+{
+	/* SHA-256 is the only fs-verity algorithm incfs supports. */
+	if (hash_alg == FS_VERITY_HASH_ALG_SHA256)
+		return INCFS_HASH_TREE_SHA256;
+
+	return -EINVAL;
+}
+
+/*
+ * Fetch the cached fs-verity file digest for @inode.
+ *
+ * Returns range(NULL, 0) for non-incfs inodes; otherwise .data is NULL
+ * until a digest has been published, so callers must check .data.
+ */
+static struct mem_range incfs_get_verity_digest(struct inode *inode)
+{
+	struct inode_info *node = get_incfs_node(inode);
+	struct data_file *df;
+	struct mem_range verity_file_digest;
+
+	if (!node) {
+		pr_warn("Invalid inode\n");
+		return range(NULL, 0);
+	}
+
+	df = node->n_file;
+
+	/*
+	 * Pairs with the cmpxchg_release() in incfs_set_verity_digest().
+	 * I.e., another task may publish ->df_verity_file_digest concurrently,
+	 * executing a RELEASE barrier. We need to use smp_load_acquire() here
+	 * to safely ACQUIRE the memory the other task published.
+	 */
+	verity_file_digest.data = smp_load_acquire(
+		&df->df_verity_file_digest.data);
+	verity_file_digest.len = df->df_verity_file_digest.len;
+	return verity_file_digest;
+}
+
+/*
+ * Publish the fs-verity file digest on @inode's data_file.
+ *
+ * Takes ownership of verity_file_digest.data: it is freed here if the
+ * inode is invalid or if another task won the publication race.
+ */
+static void incfs_set_verity_digest(struct inode *inode,
+				    struct mem_range verity_file_digest)
+{
+	struct inode_info *node = get_incfs_node(inode);
+	struct data_file *df;
+
+	if (!node) {
+		pr_warn("Invalid inode\n");
+		kfree(verity_file_digest.data);
+		return;
+	}
+
+	df = node->n_file;
+	df->df_verity_file_digest.len = verity_file_digest.len;
+
+	/*
+	 * Multiple tasks may race to set ->df_verity_file_digest.data, so use
+	 * cmpxchg_release(). This pairs with the smp_load_acquire() in
+	 * incfs_get_verity_digest(). I.e., here we publish
+	 * ->df_verity_file_digest.data, with a RELEASE barrier so that other
+	 * tasks can ACQUIRE it.
+	 */
+	if (cmpxchg_release(&df->df_verity_file_digest.data, NULL,
+			    verity_file_digest.data) != NULL)
+		/* Lost the race, so free the file_digest we allocated. */
+		kfree(verity_file_digest.data);
+}
+
+/*
+ * Calculate the digest of the fsverity_descriptor.
+ *
+ * Returns a kmalloc'd mem_range the caller must free, or an ERR_PTR in
+ * .data (with .len == 0) on failure.
+ */
+static struct mem_range incfs_calc_verity_digest_from_desc(
+	const struct inode *inode,
+	struct fsverity_descriptor *desc)
+{
+	enum incfs_hash_tree_algorithm incfs_hash_alg;
+	struct mem_range verity_file_digest;
+	int err;
+	struct incfs_hash_alg *hash_alg;
+
+	incfs_hash_alg = incfs_convert_fsverity_hash_alg(desc->hash_algorithm);
+	if (incfs_hash_alg < 0)
+		return range(ERR_PTR(incfs_hash_alg), 0);
+
+	hash_alg = incfs_get_hash_alg(incfs_hash_alg);
+	if (IS_ERR(hash_alg))
+		return range((u8 *)hash_alg, 0);
+
+	verity_file_digest = range(kzalloc(hash_alg->digest_size, GFP_KERNEL),
+				   hash_alg->digest_size);
+	if (!verity_file_digest.data)
+		return range(ERR_PTR(-ENOMEM), 0);
+
+	err = incfs_compute_file_digest(hash_alg, desc,
+					verity_file_digest.data);
+	if (err) {
+		pr_err("Error %d computing file digest", err);
+		kfree(verity_file_digest.data);
+		return range(ERR_PTR(err), 0);
+	}
+	pr_debug("Computed file digest: %s:%*phN\n",
+		 hash_alg->name, (int) verity_file_digest.len,
+		 verity_file_digest.data);
+	return verity_file_digest;
+}
+
+/*
+ * Build an fs-verity descriptor (version 1) for @filp from the file
+ * size and the incfs Merkle-tree root hash.
+ *
+ * Returns a kzalloc'd descriptor the caller must kfree, or an ERR_PTR.
+ */
+static struct fsverity_descriptor *incfs_get_fsverity_descriptor(
+	struct file *filp, int hash_algorithm)
+{
+	struct inode *inode = file_inode(filp);
+	struct fsverity_descriptor *desc = kzalloc(sizeof(*desc), GFP_KERNEL);
+	int err;
+
+	if (!desc)
+		return ERR_PTR(-ENOMEM);
+
+	*desc = (struct fsverity_descriptor) {
+		.version = 1,
+		.hash_algorithm = hash_algorithm,
+		.log_blocksize = ilog2(INCFS_DATA_FILE_BLOCK_SIZE),
+		.data_size = cpu_to_le64(inode->i_size),
+	};
+
+	err = incfs_get_root_hash(filp, desc->root_hash);
+	if (err) {
+		kfree(desc);
+		return ERR_PTR(err);
+	}
+
+	return desc;
+}
+
+/* Build the fs-verity descriptor for @filp and return its digest. */
+static struct mem_range incfs_calc_verity_digest(
+	struct inode *inode, struct file *filp,
+	int hash_algorithm)
+{
+	struct mem_range digest;
+	struct fsverity_descriptor *desc;
+
+	desc = incfs_get_fsverity_descriptor(filp, hash_algorithm);
+	if (IS_ERR(desc))
+		return range((u8 *)desc, 0);
+
+	digest = incfs_calc_verity_digest_from_desc(inode, desc);
+	kfree(desc);
+	return digest;
+}
+
+/*
+ * Populate the on-disk Merkle tree for @df level by level: level 0 is
+ * hashed from the file's data blocks, each higher level from the level
+ * below it (read back from the backing file), and the final iteration
+ * leaves the root hash in @hash for the caller to store.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int incfs_build_merkle_tree(struct file *f, struct data_file *df,
+				   struct backing_file_context *bfc,
+				   struct mtree *hash_tree, loff_t hash_offset,
+				   struct incfs_hash_alg *alg, struct mem_range hash)
+{
+	int error = 0;
+	int limit, lvl, i, result;
+	struct mem_range buf = {.len = INCFS_DATA_FILE_BLOCK_SIZE};
+	struct mem_range tmp = {.len = 2 * INCFS_DATA_FILE_BLOCK_SIZE};
+
+	buf.data = (u8 *)__get_free_pages(GFP_NOFS, get_order(buf.len));
+	tmp.data = (u8 *)__get_free_pages(GFP_NOFS, get_order(tmp.len));
+	if (!buf.data || !tmp.data) {
+		error = -ENOMEM;
+		goto out;
+	}
+
+	/*
+	 * lvl - 1 is the level we are reading, lvl the level we are writing
+	 * lvl == -1 means actual blocks
+	 * lvl == hash_tree->depth means root hash
+	 */
+	limit = df->df_data_block_count;
+	for (lvl = 0; lvl <= hash_tree->depth; lvl++) {
+		for (i = 0; i < limit; ++i) {
+			loff_t hash_level_offset;
+			struct mem_range partial_buf = buf;
+
+			if (lvl == 0)
+				result = incfs_read_data_file_block(partial_buf,
+						f, i, tmp, NULL, NULL);
+			else {
+				hash_level_offset = hash_offset +
+					hash_tree->hash_level_suboffset[lvl - 1];
+
+				result = incfs_kread(bfc, partial_buf.data,
+						partial_buf.len,
+						hash_level_offset + i *
+						INCFS_DATA_FILE_BLOCK_SIZE);
+			}
+
+			if (result < 0) {
+				error = result;
+				goto out;
+			}
+
+			/* Short final block: hash only the bytes present */
+			partial_buf.len = result;
+			error = incfs_calc_digest(alg, partial_buf, hash);
+			if (error)
+				goto out;
+
+			/*
+			 * last level - only one hash to take and it is stored
+			 * in the incfs signature record
+			 */
+			if (lvl == hash_tree->depth)
+				break;
+
+			hash_level_offset = hash_offset +
+				hash_tree->hash_level_suboffset[lvl];
+
+			result = incfs_kwrite(bfc, hash.data, hash.len,
+					hash_level_offset + hash.len * i);
+
+			if (result < 0) {
+				error = result;
+				goto out;
+			}
+
+			if (result != hash.len) {
+				error = -EIO;
+				goto out;
+			}
+		}
+		/* Next level has one hash per full block of this level */
+		limit = DIV_ROUND_UP(limit,
+				     INCFS_DATA_FILE_BLOCK_SIZE / hash.len);
+	}
+
+out:
+	free_pages((unsigned long)tmp.data, get_order(tmp.len));
+	free_pages((unsigned long)buf.data, get_order(buf.len));
+	return error;
+}
+
+/*
+ * incfs files have a signature record that is separate from the
+ * verity_signature record. The signature record does not actually contain a
+ * signature, rather it contains the size/offset of the hash tree, and a binary
+ * blob which contains the root hash and potentially a signature.
+ *
+ * If the file was created with a signature record, then this function simply
+ * returns.
+ *
+ * Otherwise it will create a signature record with a minimal binary blob as
+ * defined by the structure below, create space for the hash tree and then
+ * populate it using incfs_build_merkle_tree
+ */
+static int incfs_add_signature_record(struct file *f)
+{
+	/* See incfs_parse_signature */
+	struct {
+		__le32 version;
+		__le32 size_of_hash_info_section;
+		struct {
+			__le32 hash_algorithm;
+			u8 log2_blocksize;
+			__le32 salt_size;
+			u8 salt[0];
+			__le32 hash_size;
+			u8 root_hash[32];
+		} __packed hash_section;
+		__le32 size_of_signing_info_section;
+		u8 signing_info_section[0];
+	} __packed sig = {
+		.version = cpu_to_le32(INCFS_SIGNATURE_VERSION),
+		.size_of_hash_info_section =
+			cpu_to_le32(sizeof(sig.hash_section)),
+		.hash_section = {
+			.hash_algorithm = cpu_to_le32(INCFS_HASH_TREE_SHA256),
+			.log2_blocksize = ilog2(INCFS_DATA_FILE_BLOCK_SIZE),
+			.hash_size = cpu_to_le32(SHA256_DIGEST_SIZE),
+		},
+	};
+
+	struct data_file *df = get_incfs_data_file(f);
+	struct mtree *hash_tree = NULL;
+	struct backing_file_context *bfc;
+	int error;
+	loff_t hash_offset, sig_offset;
+	/*
+	 * NOTE(review): assumes incfs_get_hash_alg(INCFS_HASH_TREE_SHA256)
+	 * never returns an error pointer - alg is dereferenced below
+	 * before any check. Confirm against incfs_get_hash_alg().
+	 */
+	struct incfs_hash_alg *alg = incfs_get_hash_alg(INCFS_HASH_TREE_SHA256);
+	u8 hash_buf[INCFS_MAX_HASH_SIZE];
+	int hash_size = alg->digest_size;
+	struct mem_range hash = range(hash_buf, hash_size);
+	int result;
+	struct incfs_df_signature *signature = NULL;
+
+	if (!df)
+		return -EINVAL;
+
+	/* Mapped (sub-range) files share a backing file; cannot be signed */
+	if (df->df_header_flags & INCFS_FILE_MAPPED)
+		return -EINVAL;
+
+	/* Already signed? */
+	if (df->df_signature && df->df_hash_tree)
+		return 0;
+
+	/* One without the other means the backing file is inconsistent */
+	if (df->df_signature || df->df_hash_tree)
+		return -EFSCORRUPTED;
+
+	/* Add signature metadata record to file */
+	hash_tree = incfs_alloc_mtree(range((u8 *)&sig, sizeof(sig)),
+				      df->df_data_block_count);
+	if (IS_ERR(hash_tree))
+		return PTR_ERR(hash_tree);
+
+	bfc = df->df_backing_file_context;
+	if (!bfc) {
+		error = -EFSCORRUPTED;
+		goto out;
+	}
+
+	error = mutex_lock_interruptible(&bfc->bc_mutex);
+	if (error)
+		goto out;
+
+	error = incfs_write_signature_to_backing_file(bfc,
+				range((u8 *)&sig, sizeof(sig)),
+				hash_tree->hash_tree_area_size,
+				&hash_offset, &sig_offset);
+	mutex_unlock(&bfc->bc_mutex);
+	if (error)
+		goto out;
+
+	/* Populate merkle tree */
+	error = incfs_build_merkle_tree(f, df, bfc, hash_tree, hash_offset, alg,
+					hash);
+	if (error)
+		goto out;
+
+	/* Update signature metadata record */
+	memcpy(sig.hash_section.root_hash, hash.data, alg->digest_size);
+	result = incfs_kwrite(bfc, &sig, sizeof(sig), sig_offset);
+	if (result < 0) {
+		error = result;
+		goto out;
+	}
+
+	if (result != sizeof(sig)) {
+		error = -EIO;
+		goto out;
+	}
+
+	/* Update in-memory records */
+	memcpy(hash_tree->root_hash, hash.data, alg->digest_size);
+	signature = kzalloc(sizeof(*signature), GFP_NOFS);
+	if (!signature) {
+		error = -ENOMEM;
+		goto out;
+	}
+	*signature = (struct incfs_df_signature) {
+		.hash_offset = hash_offset,
+		.hash_size = hash_tree->hash_tree_area_size,
+		.sig_offset = sig_offset,
+		.sig_size = sizeof(sig),
+	};
+	df->df_signature = signature;
+	signature = NULL;
+
+	/*
+	 * Use memory barrier to prevent readpage seeing the hash tree until
+	 * it's fully there
+	 */
+	smp_store_release(&df->df_hash_tree, hash_tree);
+	hash_tree = NULL;
+
+out:
+	kfree(signature);
+	kfree(hash_tree);
+	return error;
+}
+
+/*
+ * Core of FS_IOC_ENABLE_VERITY: ensure the file has a signature record
+ * and Merkle tree, copy in the user's optional signature, compute and
+ * cache the fs-verity file digest, and mark the inode S_VERITY.
+ *
+ * Serialized per-file by df_enable_verity; -EEXIST if already enabled.
+ */
+static int incfs_enable_verity(struct file *filp,
+			       const struct fsverity_enable_arg *arg)
+{
+	struct inode *inode = file_inode(filp);
+	struct data_file *df = get_incfs_data_file(filp);
+	u8 *signature = NULL;
+	struct mem_range verity_file_digest = range(NULL, 0);
+	int err;
+
+	if (!df)
+		return -EFSCORRUPTED;
+
+	err = mutex_lock_interruptible(&df->df_enable_verity);
+	if (err)
+		return err;
+
+	if (IS_VERITY(inode)) {
+		err = -EEXIST;
+		goto out;
+	}
+
+	/* Builds the Merkle tree if the file was created without one */
+	err = incfs_add_signature_record(filp);
+	if (err)
+		goto out;
+
+	/* Get the signature if the user provided one */
+	if (arg->sig_size) {
+		signature = memdup_user(u64_to_user_ptr(arg->sig_ptr),
+					arg->sig_size);
+		if (IS_ERR(signature)) {
+			err = PTR_ERR(signature);
+			signature = NULL;
+			goto out;
+		}
+	}
+
+	verity_file_digest = incfs_calc_verity_digest(inode, filp,
+						      arg->hash_algorithm);
+	if (IS_ERR(verity_file_digest.data)) {
+		err = PTR_ERR(verity_file_digest.data);
+		verity_file_digest.data = NULL;
+		goto out;
+	}
+
+	err = incfs_end_enable_verity(filp, signature, arg->sig_size);
+	if (err)
+		goto out;
+
+	/* Successfully enabled verity */
+	incfs_set_verity_digest(inode, verity_file_digest);
+	verity_file_digest.data = NULL;	/* ownership passed on */
+out:
+	mutex_unlock(&df->df_enable_verity);
+	kfree(signature);
+	kfree(verity_file_digest.data);
+	if (err)
+		pr_err("%s failed with err %d\n", __func__, err);
+	return err;
+}
+
+/*
+ * FS_IOC_ENABLE_VERITY entry point: validate the user's
+ * fsverity_enable_arg (only version 1, SHA-256, PAGE_SIZE blocks and no
+ * salt are accepted), then hand off to incfs_enable_verity().
+ */
+int incfs_ioctl_enable_verity(struct file *filp, const void __user *uarg)
+{
+	struct inode *inode = file_inode(filp);
+	struct fsverity_enable_arg arg;
+
+	if (copy_from_user(&arg, uarg, sizeof(arg)))
+		return -EFAULT;
+
+	if (arg.version != 1)
+		return -EINVAL;
+
+	/* Reserved fields must be zero for forward compatibility */
+	if (arg.__reserved1 ||
+	    memchr_inv(arg.__reserved2, 0, sizeof(arg.__reserved2)))
+		return -EINVAL;
+
+	if (arg.hash_algorithm != FS_VERITY_HASH_ALG_SHA256)
+		return -EINVAL;
+
+	if (arg.block_size != PAGE_SIZE)
+		return -EINVAL;
+
+	if (arg.salt_size)
+		return -EINVAL;
+
+	if (arg.sig_size > FS_VERITY_MAX_SIGNATURE_SIZE)
+		return -EMSGSIZE;
+
+	if (S_ISDIR(inode->i_mode))
+		return -EISDIR;
+
+	if (!S_ISREG(inode->i_mode))
+		return -EINVAL;
+
+	return incfs_enable_verity(filp, &arg);
+}
+
+/*
+ * Read the stored verity signature from the backing file.
+ *
+ * Returns a kzalloc'd buffer (caller frees) with *sig_size set, NULL
+ * with *sig_size == 0 if the file has no signature, or an ERR_PTR.
+ */
+static u8 *incfs_get_verity_signature(struct file *filp, size_t *sig_size)
+{
+	struct data_file *df = get_incfs_data_file(filp);
+	struct incfs_df_verity_signature *vs;
+	u8 *signature;
+	int res;
+
+	if (!df || !df->df_backing_file_context)
+		return ERR_PTR(-EFSCORRUPTED);
+
+	vs = df->df_verity_signature;
+	if (!vs) {
+		*sig_size = 0;
+		return NULL;
+	}
+
+	/* A signature record with zero size is corrupt metadata */
+	if (!vs->size) {
+		*sig_size = 0;
+		return ERR_PTR(-EFSCORRUPTED);
+	}
+
+	signature = kzalloc(vs->size, GFP_KERNEL);
+	if (!signature)
+		return ERR_PTR(-ENOMEM);
+
+	res = incfs_kread(df->df_backing_file_context,
+			  signature, vs->size, vs->offset);
+
+	if (res < 0)
+		goto err_out;
+
+	/* Short read means the backing file does not match the record */
+	if (res != vs->size) {
+		res = -EINVAL;
+		goto err_out;
+	}
+
+	*sig_size = vs->size;
+	return signature;
+
+err_out:
+	kfree(signature);
+	return ERR_PTR(res);
+}
+
+/* Ensure data_file->df_verity_file_digest is populated */
+static int ensure_verity_info(struct inode *inode, struct file *filp)
+{
+	struct mem_range digest = incfs_get_verity_digest(inode);
+
+	/* Already computed and published - nothing to do. */
+	if (digest.data)
+		return 0;
+
+	digest = incfs_calc_verity_digest(inode, filp,
+					  FS_VERITY_HASH_ALG_SHA256);
+	if (IS_ERR(digest.data))
+		return PTR_ERR(digest.data);
+
+	incfs_set_verity_digest(inode, digest);
+	return 0;
+}
+
+/**
+ * incfs_fsverity_file_open() - prepare to open a file that may be
+ * verity-enabled
+ * @inode: the inode being opened
+ * @filp: the struct file being set up
+ *
+ * When opening a verity file, set up data_file->df_verity_file_digest if not
+ * already done. Note that incfs does not allow opening for writing, so there is
+ * no need for that check.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int incfs_fsverity_file_open(struct inode *inode, struct file *filp)
+{
+	if (!IS_VERITY(inode))
+		return 0;
+
+	return ensure_verity_info(inode, filp);
+}
+
+/*
+ * FS_IOC_MEASURE_VERITY: copy the cached fs-verity file digest to the
+ * caller. Returns -ENODATA if the file is not a verity file and
+ * -EOVERFLOW if the user's buffer is too small.
+ */
+int incfs_ioctl_measure_verity(struct file *filp, void __user *_uarg)
+{
+	struct inode *inode = file_inode(filp);
+	struct mem_range verity_file_digest = incfs_get_verity_digest(inode);
+	struct fsverity_digest __user *uarg = _uarg;
+	struct fsverity_digest arg;
+
+	if (!verity_file_digest.data || !verity_file_digest.len)
+		return -ENODATA; /* not a verity file */
+
+	/*
+	 * The user specifies the digest_size their buffer has space for; we can
+	 * return the digest if it fits in the available space. We write back
+	 * the actual size, which may be shorter than the user-specified size.
+	 */
+
+	if (get_user(arg.digest_size, &uarg->digest_size))
+		return -EFAULT;
+	if (arg.digest_size < verity_file_digest.len)
+		return -EOVERFLOW;
+
+	/* Re-zero arg: only the fields set below are returned to the user */
+	memset(&arg, 0, sizeof(arg));
+	arg.digest_algorithm = FS_VERITY_HASH_ALG_SHA256;
+	arg.digest_size = verity_file_digest.len;
+
+	if (copy_to_user(uarg, &arg, sizeof(arg)))
+		return -EFAULT;
+
+	/* Digest bytes follow the header in the user's fsverity_digest */
+	if (copy_to_user(uarg->digest, verity_file_digest.data,
+			 verity_file_digest.len))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * Copy up to @length bytes of the Merkle tree, starting at
+ * @start_offset, into the user buffer @buf.
+ *
+ * Returns the number of bytes copied, or a negative errno if nothing
+ * could be copied.
+ */
+static int incfs_read_merkle_tree(struct file *filp, void __user *buf,
+				  u64 start_offset, int length)
+{
+	struct mem_range tmp_buf;
+	size_t offset;
+	int retval = 0;
+	int err = 0;
+	struct data_file *df = get_incfs_data_file(filp);
+
+	if (!df)
+		return -EINVAL;
+
+	tmp_buf = (struct mem_range) {
+		.data = kzalloc(INCFS_DATA_FILE_BLOCK_SIZE, GFP_NOFS),
+		.len = INCFS_DATA_FILE_BLOCK_SIZE,
+	};
+	if (!tmp_buf.data)
+		return -ENOMEM;
+
+	for (offset = start_offset; offset < start_offset + length;
+	     offset += tmp_buf.len) {
+		err = incfs_read_merkle_tree_blocks(tmp_buf, df, offset);
+
+		if (err < 0)
+			break;
+
+		/* Short read: reached the end of the hash tree area */
+		if (err != tmp_buf.len)
+			break;
+
+		if (copy_to_user(buf, tmp_buf.data, tmp_buf.len)) {
+			/*
+			 * Report -EFAULT instead of leaking the positive
+			 * byte count left in err by the read above.
+			 */
+			err = -EFAULT;
+			break;
+		}
+
+		buf += tmp_buf.len;
+		retval += tmp_buf.len;
+	}
+
+	kfree(tmp_buf.data);
+	return retval ? retval : err;
+}
+
+/*
+ * Copy the fs-verity descriptor to user space. Returns the number of
+ * bytes copied or a negative errno.
+ *
+ * NOTE(review): @offset is ignored, so reads always start at the
+ * beginning of the descriptor - confirm callers only pass offset 0.
+ */
+static int incfs_read_descriptor(struct file *filp,
+				 void __user *buf, u64 offset, int length)
+{
+	int err = 0;
+	struct fsverity_descriptor *desc = incfs_get_fsverity_descriptor(filp,
+						FS_VERITY_HASH_ALG_SHA256);
+
+	if (IS_ERR(desc))
+		return PTR_ERR(desc);
+	length = min_t(u64, length, sizeof(*desc));
+	/* copy_to_user() returns uncopied bytes, not -errno: convert it */
+	if (copy_to_user(buf, desc, length))
+		err = -EFAULT;
+	kfree(desc);
+	return err ? err : length;
+}
+
+/*
+ * Copy the stored verity signature to user space. Returns the number of
+ * bytes copied, -ENODATA if the file has no signature, or a negative
+ * errno.
+ */
+static int incfs_read_signature(struct file *filp,
+				void __user *buf, u64 offset, int length)
+{
+	size_t sig_size;
+	/*
+	 * Must be an ordinary local: declaring this pointer 'static' made
+	 * concurrent ioctls share it, racing kfree() against reassignment
+	 * (potential double-free / leak).
+	 */
+	u8 *signature;
+	int err = 0;
+
+	signature = incfs_get_verity_signature(filp, &sig_size);
+	if (IS_ERR(signature))
+		return PTR_ERR(signature);
+
+	if (!signature)
+		return -ENODATA;
+
+	length = min_t(u64, length, sig_size);
+	/* copy_to_user() returns uncopied bytes, not -errno: convert it */
+	if (copy_to_user(buf, signature, length))
+		err = -EFAULT;
+	kfree(signature);
+	return err ? err : length;
+}
+
+/*
+ * FS_IOC_READ_VERITY_METADATA: dispatch to the Merkle-tree, descriptor
+ * or signature reader based on the requested metadata_type.
+ */
+int incfs_ioctl_read_verity_metadata(struct file *filp,
+				     const void __user *uarg)
+{
+	struct fsverity_read_metadata_arg arg;
+	int length;
+	void __user *buf;
+
+	if (copy_from_user(&arg, uarg, sizeof(arg)))
+		return -EFAULT;
+
+	if (arg.__reserved)
+		return -EINVAL;
+
+	/* offset + length must not overflow. */
+	if (arg.offset + arg.length < arg.offset)
+		return -EINVAL;
+
+	/* Ensure that the return value will fit in INT_MAX. */
+	length = min_t(u64, arg.length, INT_MAX);
+
+	buf = u64_to_user_ptr(arg.buf_ptr);
+
+	switch (arg.metadata_type) {
+	case FS_VERITY_METADATA_TYPE_MERKLE_TREE:
+		return incfs_read_merkle_tree(filp, buf, arg.offset, length);
+	case FS_VERITY_METADATA_TYPE_DESCRIPTOR:
+		return incfs_read_descriptor(filp, buf, arg.offset, length);
+	case FS_VERITY_METADATA_TYPE_SIGNATURE:
+		return incfs_read_signature(filp, buf, arg.offset, length);
+	default:
+		return -EINVAL;
+	}
+}
diff --git a/fs/incfs/verity.h b/fs/incfs/verity.h
new file mode 100644
--- /dev/null
+++ b/fs/incfs/verity.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2020 Google LLC
+ */
+
+#ifndef _INCFS_VERITY_H
+#define _INCFS_VERITY_H
+
+/* Arbitrary limit to bound the kmalloc() size. Can be changed. */
+#define FS_VERITY_MAX_SIGNATURE_SIZE 16128
+
+#ifdef CONFIG_FS_VERITY
+
+int incfs_ioctl_enable_verity(struct file *filp, const void __user *uarg);
+int incfs_ioctl_measure_verity(struct file *filp, void __user *_uarg);
+
+int incfs_fsverity_file_open(struct inode *inode, struct file *filp);
+int incfs_ioctl_read_verity_metadata(struct file *filp,
+ const void __user *uarg);
+
+#else /* !CONFIG_FS_VERITY */
+
+/* Stubs: without CONFIG_FS_VERITY every verity operation is unsupported. */
+static inline int incfs_ioctl_enable_verity(struct file *filp,
+					    const void __user *uarg)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int incfs_ioctl_measure_verity(struct file *filp,
+					     void __user *_uarg)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int incfs_fsverity_file_open(struct inode *inode,
+					   struct file *filp)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int incfs_ioctl_read_verity_metadata(struct file *filp,
+						   const void __user *uarg)
+{
+	return -EOPNOTSUPP;
+}
+
+#endif /* !CONFIG_FS_VERITY */
+
+#endif
diff --git a/fs/incfs/vfs.c b/fs/incfs/vfs.c
new file mode 100644
--- /dev/null
+++ b/fs/incfs/vfs.c
@@ -0,0 +1,1994 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2018 Google LLC
+ */
+
+#include <linux/blkdev.h>
+#include <linux/compat.h>
+#include <linux/delay.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/fs_stack.h>
+#include <linux/fsnotify.h>
+#include <linux/fsverity.h>
+#include <linux/mmap_lock.h>
+#include <linux/namei.h>
+#include <linux/pagemap.h>
+#include <linux/parser.h>
+#include <linux/seq_file.h>
+#include <linux/backing-dev-defs.h>
+
+#include <uapi/linux/incrementalfs.h>
+
+#include "vfs.h"
+
+#include "data_mgmt.h"
+#include "format.h"
+#include "internal.h"
+#include "pseudo_files.h"
+#include "sysfs.h"
+#include "verity.h"
+
+static int incfs_remount_fs(struct super_block *sb, int *flags, char *data);
+
+static int dentry_revalidate(struct dentry *dentry, unsigned int flags);
+static void dentry_release(struct dentry *d);
+
+static int iterate_incfs_dir(struct file *file, struct dir_context *ctx);
+static struct dentry *dir_lookup(struct inode *dir_inode,
+ struct dentry *dentry, unsigned int flags);
+static int dir_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode);
+static int dir_unlink(struct inode *dir, struct dentry *dentry);
+static int dir_link(struct dentry *old_dentry, struct inode *dir,
+ struct dentry *new_dentry);
+static int dir_rmdir(struct inode *dir, struct dentry *dentry);
+static int dir_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags);
+
+static int file_open(struct inode *inode, struct file *file);
+static int file_release(struct inode *inode, struct file *file);
+static int read_folio(struct file *f, struct folio *folio);
+static long dispatch_ioctl(struct file *f, unsigned int req, unsigned long arg);
+
+#ifdef CONFIG_COMPAT
+static long incfs_compat_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg);
+#endif
+
+static struct inode *alloc_inode(struct super_block *sb);
+static void free_inode(struct inode *inode);
+static void evict_inode(struct inode *inode);
+
+static int incfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct iattr *ia);
+static int incfs_getattr(struct mnt_idmap *idmap, const struct path *path,
+ struct kstat *stat, u32 request_mask,
+ unsigned int query_flags);
+static ssize_t incfs_getxattr(struct dentry *d, const char *name,
+ void *value, size_t size);
+static ssize_t incfs_setxattr(struct mnt_idmap *idmap, struct dentry *d,
+ const char *name, void *value, size_t size,
+ int flags);
+static ssize_t incfs_listxattr(struct dentry *d, char *list, size_t size);
+
+static int show_options(struct seq_file *, struct dentry *);
+
+/* Superblock operations for incfs mounts. */
+static const struct super_operations incfs_super_ops = {
+	.statfs = simple_statfs,
+	.remount_fs = incfs_remount_fs,
+	.alloc_inode	= alloc_inode,
+	.destroy_inode	= free_inode,
+	.evict_inode = evict_inode,
+	.show_options = show_options
+};
+
+/* Adapter for ->rename: drops the unused idmap and calls dir_rename(). */
+static int dir_rename_wrap(struct mnt_idmap *idmap, struct inode *old_dir,
+			   struct dentry *old_dentry, struct inode *new_dir,
+			   struct dentry *new_dentry, unsigned int flags)
+{
+	return dir_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
+}
+
+/* Inode operations for incfs directories. */
+static const struct inode_operations incfs_dir_inode_ops = {
+	.lookup = dir_lookup,
+	.mkdir = dir_mkdir,
+	.rename = dir_rename_wrap,
+	.unlink = dir_unlink,
+	.link = dir_link,
+	.rmdir = dir_rmdir,
+	.setattr = incfs_setattr,
+};
+
+/* Wrap iterate_incfs_dir for the iterate_shared API. */
+WRAP_DIR_ITER(iterate_incfs_dir) // FIXME!
+/* File operations for incfs directories. */
+static const struct file_operations incfs_dir_fops = {
+	.llseek = generic_file_llseek,
+	.read = generic_read_dir,
+	.iterate_shared = shared_iterate_incfs_dir,
+	.open = file_open,
+	.release = file_release,
+};
+
+/* Dentry operations for incfs dentries. */
+static const struct dentry_operations incfs_dentry_ops = {
+	.d_revalidate = dentry_revalidate,
+	.d_release = dentry_release
+};
+
+/* Address-space operations; only read_folio is implemented. */
+static const struct address_space_operations incfs_address_space_ops = {
+	.read_folio = read_folio,
+	/* .readpages = readpages */
+};
+
+/*
+ * Page-fault handler: clears FAULT_FLAG_ALLOW_RETRY before delegating
+ * to filemap_fault() - presumably so the fault waits for incfs data
+ * instead of retrying (NOTE(review): confirm intent).
+ */
+static vm_fault_t incfs_fault(struct vm_fault *vmf)
+{
+	vmf->flags &= ~FAULT_FLAG_ALLOW_RETRY;
+	return filemap_fault(vmf);
+}
+
+/* VM operations for mmapped incfs files. */
+static const struct vm_operations_struct incfs_file_vm_ops = {
+	.fault = incfs_fault,
+	.map_pages = filemap_map_pages,
+	.page_mkwrite = filemap_page_mkwrite,
+};
+
+/* This is used for a general mmap of a disk file */
+
+static int incfs_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	const struct address_space_operations *a_ops =
+		file->f_mapping->a_ops;
+
+	/* Faulting in pages requires a read_folio implementation. */
+	if (!a_ops->read_folio)
+		return -ENOEXEC;
+
+	file_accessed(file);
+	vma->vm_ops = &incfs_file_vm_ops;
+	return 0;
+}
+
+/* File operations for regular incfs files (read-only access paths). */
+const struct file_operations incfs_file_ops = {
+	.open = file_open,
+	.release = file_release,
+	.read_iter = generic_file_read_iter,
+	.mmap = incfs_file_mmap,
+	.splice_read = filemap_splice_read,
+	.llseek = generic_file_llseek,
+	.unlocked_ioctl = dispatch_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl = incfs_compat_ioctl,
+#endif
+};
+
+/* Inode operations for regular incfs files. */
+const struct inode_operations incfs_file_inode_ops = {
+	.setattr = incfs_setattr,
+	.getattr = incfs_getattr,
+	.listxattr = incfs_listxattr
+};
+
+/* xattr-handler shim: forwards get requests to incfs_getxattr(). */
+static int incfs_handler_getxattr(const struct xattr_handler *xh,
+				  struct dentry *d, struct inode *inode,
+				  const char *name, void *buffer, size_t size)
+{
+	return incfs_getxattr(d, name, buffer, size);
+}
+
+/* xattr-handler shim: forwards set requests to incfs_setxattr(). */
+static int incfs_handler_setxattr(const struct xattr_handler *xh,
+				  struct mnt_idmap *idmap,
+				  struct dentry *d, struct inode *inode,
+				  const char *name, const void *buffer,
+				  size_t size, int flags)
+{
+	return incfs_setxattr(idmap, d, name, (void *)buffer, size, flags);
+}
+
+/* Catch-all xattr handler: empty prefix matches every attribute name. */
+static const struct xattr_handler incfs_xattr_handler = {
+	.prefix = "",	/* AKA all attributes */
+	.get = incfs_handler_getxattr,
+	.set = incfs_handler_setxattr,
+};
+
+/* NULL-terminated handler table installed on the superblock. */
+static const struct xattr_handler *incfs_xattr_ops[] = {
+	&incfs_xattr_handler,
+	NULL,
+};
+
+/*
+ * Search key passed to iget5_locked() via inode_test()/inode_set():
+ * identifies an incfs inode by backing dentry and inode number and
+ * carries attributes pre-read from the backing file's xattrs.
+ */
+struct inode_search {
+	unsigned long ino;	/* expected inode number */
+
+	struct dentry *backing_dentry;	/* dentry in the backing filesystem */
+
+	size_t size;	/* file size read from the size xattr */
+
+	bool verity;	/* true if the verity xattr is present */
+};
+
+/* Token ids for the mount options parsed by parse_options(). */
+enum parse_parameter {
+	Opt_read_timeout,
+	Opt_readahead_pages,
+	Opt_rlog_pages,
+	Opt_rlog_wakeup_cnt,
+	Opt_report_uid,
+	Opt_sysfs_name,
+	Opt_err
+};
+
+/* match_token() patterns corresponding to parse_parameter. */
+static const match_table_t option_tokens = {
+	{ Opt_read_timeout, "read_timeout_ms=%u" },
+	{ Opt_readahead_pages, "readahead=%u" },
+	{ Opt_rlog_pages, "rlog_pages=%u" },
+	{ Opt_rlog_wakeup_cnt, "rlog_wakeup_cnt=%u" },
+	{ Opt_report_uid, "report_uid" },
+	{ Opt_sysfs_name, "sysfs_name=%s" },
+	{ Opt_err, NULL }
+};
+
+/* Release heap-allocated mount options; safe to call more than once. */
+static void free_options(struct mount_options *opts)
+{
+	kfree(opts->sysfs_name);
+	opts->sysfs_name = NULL;
+}
+
+/*
+ * Parse the comma-separated mount option string @str into @opts,
+ * starting from built-in defaults. Returns 0 on success or -EINVAL on a
+ * malformed/unknown option (freeing any allocated options first).
+ */
+static int parse_options(struct mount_options *opts, char *str)
+{
+	substring_t args[MAX_OPT_ARGS];
+	int value;
+	char *position;
+
+	if (opts == NULL)
+		return -EFAULT;
+
+	/* Defaults applied before any option is parsed */
+	*opts = (struct mount_options) {
+		.read_timeout_ms = 1000, /* Default: 1s */
+		.readahead_pages = 10,
+		.read_log_pages = 2,
+		.read_log_wakeup_count = 10,
+	};
+
+	if (str == NULL || *str == 0)
+		return 0;
+
+	while ((position = strsep(&str, ",")) != NULL) {
+		int token;
+
+		if (!*position)
+			continue;
+
+		token = match_token(position, option_tokens, args);
+
+		switch (token) {
+		case Opt_read_timeout:
+			if (match_int(&args[0], &value))
+				return -EINVAL;
+			/* Cap at one hour */
+			if (value > 3600000)
+				return -EINVAL;
+			opts->read_timeout_ms = value;
+			break;
+		case Opt_readahead_pages:
+			if (match_int(&args[0], &value))
+				return -EINVAL;
+			opts->readahead_pages = value;
+			break;
+		case Opt_rlog_pages:
+			if (match_int(&args[0], &value))
+				return -EINVAL;
+			opts->read_log_pages = value;
+			break;
+		case Opt_rlog_wakeup_cnt:
+			if (match_int(&args[0], &value))
+				return -EINVAL;
+			opts->read_log_wakeup_count = value;
+			break;
+		case Opt_report_uid:
+			opts->report_uid = true;
+			break;
+		case Opt_sysfs_name:
+			/*
+			 * NOTE(review): match_strdup() may return NULL on
+			 * allocation failure - confirm consumers of
+			 * sysfs_name tolerate NULL.
+			 */
+			opts->sysfs_name = match_strdup(&args[0]);
+			break;
+		default:
+			free_options(opts);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+/* Read file size from the attribute. Quicker than reading the header */
+static u64 read_size_attr(struct dentry *backing_dentry)
+{
+	__le64 raw_size;
+	ssize_t read_res = vfs_getxattr(&nop_mnt_idmap, backing_dentry,
+					INCFS_XATTR_SIZE_NAME,
+					(char *)&raw_size, sizeof(raw_size));
+
+	/* Missing or short attribute: treat the file as zero-length. */
+	return read_res == sizeof(raw_size) ? le64_to_cpu(raw_size) : 0;
+}
+
+/* Read verity flag from the attribute. Quicker than reading the header */
+static bool read_verity_attr(struct dentry *backing_dentry)
+{
+	ssize_t res = vfs_getxattr(&nop_mnt_idmap, backing_dentry,
+				   INCFS_XATTR_VERITY_NAME, NULL, 0);
+
+	/* Mere presence of the xattr marks the file as a verity file. */
+	return res >= 0;
+}
+
+/* iget5_locked() match callback: does @inode correspond to @opaque? */
+static int inode_test(struct inode *inode, void *opaque)
+{
+	struct inode_search *search = opaque;
+	struct inode_info *info = get_incfs_node(inode);
+
+	/* Inodes without incfs private data can never match. */
+	if (!info)
+		return 0;
+
+	if (inode->i_ino != search->ino)
+		return 0;
+
+	return info->n_backing_inode == d_inode(search->backing_dentry);
+}
+
+/*
+ * iget5_locked() init callback: populate a freshly allocated incfs
+ * inode from its backing inode and the pre-read search attributes.
+ */
+static int inode_set(struct inode *inode, void *opaque)
+{
+	struct inode_search *search = opaque;
+	struct inode_info *node = get_incfs_node(inode);
+	struct dentry *backing_dentry = search->backing_dentry;
+	struct inode *backing_inode = d_inode(backing_dentry);
+
+	fsstack_copy_attr_all(inode, backing_inode);
+	if (S_ISREG(inode->i_mode)) {
+		u64 size = search->size;
+
+		inode->i_size = size;
+		inode->i_blocks = get_blocks_count_for_size(size);
+		inode->i_mapping->a_ops = &incfs_address_space_ops;
+		inode->i_op = &incfs_file_inode_ops;
+		inode->i_fop = &incfs_file_ops;
+		/* incfs files are never writable through the mount */
+		inode->i_mode &= ~0222;
+		if (search->verity)
+			inode_set_flags(inode, S_VERITY, S_VERITY);
+	} else if (S_ISDIR(inode->i_mode)) {
+		inode->i_size = 0;
+		inode->i_blocks = 1;
+		inode->i_mapping->a_ops = &incfs_address_space_ops;
+		inode->i_op = &incfs_dir_inode_ops;
+		inode->i_fop = &incfs_dir_fops;
+	} else {
+		/* Only regular files and directories are supported */
+		pr_warn_once("incfs: Unexpected inode type\n");
+		return -EBADF;
+	}
+
+	/* Hold the backing inode for the lifetime of this inode */
+	ihold(backing_inode);
+	node->n_backing_inode = backing_inode;
+	node->n_mount_info = get_mount_info(inode->i_sb);
+	inode_set_ctime_to_ts(inode, inode_get_ctime(backing_inode));
+	inode_set_mtime_to_ts(inode, inode_get_mtime(backing_inode));
+	inode_set_atime_to_ts(inode, inode_get_atime(backing_inode));
+	/* Mirror the backing inode number; low numbers are reserved */
+	inode->i_ino = backing_inode->i_ino;
+	if (backing_inode->i_ino < INCFS_START_INO_RANGE) {
+		pr_warn("incfs: ino conflict with backing FS %ld\n",
+			backing_inode->i_ino);
+	}
+
+	return 0;
+}
+
+/*
+ * Find or create the incfs inode corresponding to @backing_dentry.
+ * Size and verity state are pre-read from xattrs so inode_set() can
+ * initialize a new inode without touching the backing-file header.
+ * Returns the inode or ERR_PTR(-ENOMEM).
+ */
+static struct inode *fetch_regular_inode(struct super_block *sb,
+					 struct dentry *backing_dentry)
+{
+	struct inode *result;
+	struct inode_search search = {
+		.ino = d_inode(backing_dentry)->i_ino,
+		.backing_dentry = backing_dentry,
+		.size = read_size_attr(backing_dentry),
+		.verity = read_verity_attr(backing_dentry),
+	};
+
+	result = iget5_locked(sb, search.ino, inode_test, inode_set, &search);
+	if (!result)
+		return ERR_PTR(-ENOMEM);
+
+	if (result->i_state & I_NEW)
+		unlock_new_inode(result);
+
+	return result;
+}
+
+/*
+ * ->iterate handler for incfs directories.  In the mount root the pseudo
+ * files are emitted first; around the backing iterate_dir() call,
+ * ctx->pos is shifted down by PSEUDO_FILE_COUNT so the backing directory
+ * sees positions that do not include the pseudo entries, then shifted
+ * back so userspace offsets stay monotonic across both name spaces.
+ */
+static int iterate_incfs_dir(struct file *file, struct dir_context *ctx)
+{
+	struct dir_file *dir = get_incfs_dir_file(file);
+	int error = 0;
+	struct mount_info *mi = get_mount_info(file_superblock(file));
+	bool root;
+
+	if (!dir) {
+		error = -EBADF;
+		goto out;
+	}
+
+	/* Pseudo files only exist at the root of the mount */
+	root = dir->backing_dir->f_inode
+			== d_inode(mi->mi_backing_dir_path.dentry);
+
+	if (root) {
+		error = emit_pseudo_files(ctx);
+		if (error)
+			goto out;
+	}
+
+	ctx->pos -= PSEUDO_FILE_COUNT;
+	error = iterate_dir(dir->backing_dir, ctx);
+	ctx->pos += PSEUDO_FILE_COUNT;
+	/* Keep our f_pos in sync with the backing directory's position */
+	file->f_pos = dir->backing_dir->f_pos;
+out:
+	if (error)
+		pr_warn("incfs: %s %s %d\n", __func__,
+			file->f_path.dentry->d_name.name, error);
+	return error;
+}
+
+/*
+ * Attach a dentry_info holding a counted reference to @path as the
+ * backing path of @dentry (stored in d_fsdata, released in
+ * dentry_release()).  Returns 0, -EFAULT on NULL args, or -ENOMEM.
+ */
+static int incfs_init_dentry(struct dentry *dentry, struct path *path)
+{
+	struct dentry_info *info;
+
+	if (!dentry || !path)
+		return -EFAULT;
+
+	info = kzalloc(sizeof(*info), GFP_NOFS);
+	if (!info)
+		return -ENOMEM;
+
+	info->backing_path = *path;
+	path_get(path);
+	dentry->d_fsdata = info;
+
+	return 0;
+}
+
+/*
+ * Look up @name under @backing_dir, creating it as a directory (mode
+ * 0777) if it does not exist.  Used for the special .index/.incomplete
+ * directories.  *created reports whether the directory was newly made.
+ * Returns the (referenced) dentry or an ERR_PTR.
+ */
+static struct dentry *open_or_create_special_dir(struct dentry *backing_dir,
+						 const char *name,
+						 bool *created)
+{
+	struct dentry *index_dentry;
+	struct inode *backing_inode = d_inode(backing_dir);
+	int err = 0;
+
+	index_dentry = incfs_lookup_dentry(backing_dir, name);
+	if (!index_dentry) {
+		return ERR_PTR(-EINVAL);
+	} else if (IS_ERR(index_dentry)) {
+		return index_dentry;
+	} else if (d_really_is_positive(index_dentry)) {
+		/* Index already exists. */
+		*created = false;
+		return index_dentry;
+	}
+
+	/* Index needs to be created. */
+	inode_lock_nested(backing_inode, I_MUTEX_PARENT);
+	err = vfs_mkdir(&nop_mnt_idmap, backing_inode, index_dentry, 0777);
+	inode_unlock(backing_inode);
+
+	if (err) {
+		dput(index_dentry);
+		return ERR_PTR(err);
+	}
+
+	/* mkdir succeeded but the dentry is unusable: treat as failure */
+	if (!d_really_is_positive(index_dentry) ||
+		unlikely(d_unhashed(index_dentry))) {
+		dput(index_dentry);
+		return ERR_PTR(-EINVAL);
+	}
+
+	*created = true;
+	return index_dentry;
+}
+
+/*
+ * Read one block of @df applying the read timeouts configured for the
+ * current uid (if any entry matches in mi_per_uid_read_timeouts),
+ * otherwise falling back to the mount-wide read_timeout_ms option.
+ * *delayed_min_us is filled by the lower layer with any artificial
+ * delay still owed to the caller.
+ */
+static int read_single_page_timeouts(struct data_file *df, struct file *f,
+				     int block_index, struct mem_range range,
+				     struct mem_range tmp,
+				     unsigned int *delayed_min_us)
+{
+	struct mount_info *mi = df->df_mount_info;
+	struct incfs_read_data_file_timeouts timeouts = {
+		.max_pending_time_us = U32_MAX,
+	};
+	int uid = current_uid().val;
+	int i;
+
+	spin_lock(&mi->mi_per_uid_read_timeouts_lock);
+	for (i = 0; i < mi->mi_per_uid_read_timeouts_size /
+		sizeof(*mi->mi_per_uid_read_timeouts); ++i) {
+		struct incfs_per_uid_read_timeouts *t =
+			&mi->mi_per_uid_read_timeouts[i];
+
+		if (t->uid == uid) {
+			timeouts.min_time_us = t->min_time_us;
+			timeouts.min_pending_time_us = t->min_pending_time_us;
+			timeouts.max_pending_time_us = t->max_pending_time_us;
+			break;
+		}
+	}
+	spin_unlock(&mi->mi_per_uid_read_timeouts_lock);
+	/* No per-uid override: derive the cap from the mount option */
+	if (timeouts.max_pending_time_us == U32_MAX) {
+		u64 read_timeout_us = (u64)mi->mi_options.read_timeout_ms *
+					1000;
+
+		timeouts.max_pending_time_us = read_timeout_us <= U32_MAX ?
+					read_timeout_us : U32_MAX;
+	}
+
+	return incfs_read_data_file_block(range, f, block_index, tmp,
+					  &timeouts, delayed_min_us);
+}
+
+/*
+ * Sleep for @us microseconds choosing the mechanism appropriate to the
+ * duration (busy-wait, hrtimer range sleep, or interruptible msleep).
+ * Returns 0, or the nonzero remainder from msleep_interruptible() if
+ * the long sleep was interrupted by a signal.
+ */
+static int usleep_interruptible(u32 us)
+{
+	/* See:
+	 * https://www.kernel.org/doc/Documentation/timers/timers-howto.txt
+	 * for explanation
+	 */
+	if (us < 10) {
+		udelay(us);
+		return 0;
+	} else if (us < 20000) {
+		usleep_range(us, us + us / 10);
+		return 0;
+	} else {
+		return msleep_interruptible(us / 1000);
+	}
+}
+
+/*
+ * ->read_folio handler.  Maps the page, reads the covering data block
+ * via read_single_page_timeouts() (using a 2-block scratch buffer for
+ * the lower layer), zero-fills any tail past what was read, and sets
+ * the page Uptodate/Error accordingly.  Any artificial delay requested
+ * by the timeout machinery is slept off only after the page is
+ * unlocked, so the reader is throttled without holding the page lock.
+ */
+static int read_folio(struct file *f, struct folio *folio)
+{
+	struct page *page = &folio->page;
+	loff_t offset = 0;
+	loff_t size = 0;
+	ssize_t bytes_to_read = 0;
+	ssize_t read_result = 0;
+	struct data_file *df = get_incfs_data_file(f);
+	int result = 0;
+	void *page_start;
+	int block_index;
+	unsigned int delayed_min_us = 0;
+
+	if (!df) {
+		SetPageError(page);
+		unlock_page(page);
+		return -EBADF;
+	}
+
+	page_start = kmap(page);
+	offset = page_offset(page);
+	/* df_mapped_offset shifts file offsets into backing-block space */
+	block_index = (offset + df->df_mapped_offset) /
+		INCFS_DATA_FILE_BLOCK_SIZE;
+	size = df->df_size;
+
+	if (offset < size) {
+		struct mem_range tmp = {
+			.len = 2 * INCFS_DATA_FILE_BLOCK_SIZE
+		};
+		tmp.data = (u8 *)__get_free_pages(GFP_NOFS, get_order(tmp.len));
+		if (!tmp.data) {
+			read_result = -ENOMEM;
+			goto err;
+		}
+		bytes_to_read = min_t(loff_t, size - offset, PAGE_SIZE);
+
+		read_result = read_single_page_timeouts(df, f, block_index,
+					range(page_start, bytes_to_read), tmp,
+					&delayed_min_us);
+
+		free_pages((unsigned long)tmp.data, get_order(tmp.len));
+	} else {
+		/* Page entirely past EOF: present it as all zeroes */
+		bytes_to_read = 0;
+		read_result = 0;
+	}
+
+err:
+	if (read_result < 0)
+		result = read_result;
+	else if (read_result < PAGE_SIZE)
+		zero_user(page, read_result, PAGE_SIZE - read_result);
+
+	if (result == 0)
+		SetPageUptodate(page);
+	else
+		SetPageError(page);
+
+	flush_dcache_page(page);
+	kunmap(page);
+	unlock_page(page);
+	/* Apply any mandated read delay after releasing the page lock */
+	if (delayed_min_us)
+		usleep_interruptible(delayed_min_us);
+	return result;
+}
+
+/*
+ * Hard-link @what to @where in the backing fs, taking the parent's
+ * inode lock (I_MUTEX_PARENT) around vfs_link().  Returns 0 or the
+ * vfs_link() error.
+ */
+int incfs_link(struct dentry *what, struct dentry *where)
+{
+	struct dentry *parent = dget_parent(where);
+	struct inode *parent_inode = d_inode(parent);
+	int err;
+
+	inode_lock_nested(parent_inode, I_MUTEX_PARENT);
+	err = vfs_link(what, &nop_mnt_idmap, parent_inode, where, NULL);
+	inode_unlock(parent_inode);
+
+	dput(parent);
+	return err;
+}
+
+/*
+ * Unlink @dentry from the backing fs, taking its parent's inode lock
+ * (I_MUTEX_PARENT) around vfs_unlink().  Returns 0 or the vfs_unlink()
+ * error.
+ */
+int incfs_unlink(struct dentry *dentry)
+{
+	struct dentry *parent = dget_parent(dentry);
+	struct inode *parent_inode = d_inode(parent);
+	int err;
+
+	inode_lock_nested(parent_inode, I_MUTEX_PARENT);
+	err = vfs_unlink(&nop_mnt_idmap, parent_inode, dentry, NULL);
+	inode_unlock(parent_inode);
+
+	dput(parent);
+	return err;
+}
+
+/*
+ * Remove backing directory @dentry, taking its parent's inode lock
+ * (I_MUTEX_PARENT) around vfs_rmdir().  Returns 0 or the vfs_rmdir()
+ * error.
+ */
+static int incfs_rmdir(struct dentry *dentry)
+{
+	struct dentry *parent = dget_parent(dentry);
+	struct inode *parent_inode = d_inode(parent);
+	int err;
+
+	inode_lock_nested(parent_inode, I_MUTEX_PARENT);
+	err = vfs_rmdir(&nop_mnt_idmap, parent_inode, dentry);
+	inode_unlock(parent_inode);
+
+	dput(parent);
+	return err;
+}
+
+/*
+ * After a backing link of file @file_id_str has been removed, emit an
+ * fsnotify unlink event and drop the cached dentry for its copy under
+ * @special_directory (.index or .incomplete) at the mount root.
+ *
+ * NOTE(review): take_lock distinguishes how deep @dentry sits below the
+ * root; presumably incfs_lookup_dentry() takes the parent lock itself
+ * while lookup_one_len() must be used when that lock is already held —
+ * confirm against incfs_lookup_dentry()'s definition.
+ *
+ * Fix: on lookup failure the dentry variables must be reset to NULL
+ * before "goto out", otherwise dput() is called on an ERR_PTR (the
+ * same pattern file_delete() already follows).
+ */
+static void notify_unlink(struct dentry *dentry, const char *file_id_str,
+			  const char *special_directory)
+{
+	struct dentry *root = dentry;
+	struct dentry *file = NULL;
+	struct dentry *dir = NULL;
+	int error = 0;
+	bool take_lock = root->d_parent != root->d_parent->d_parent;
+
+	while (root != root->d_parent)
+		root = root->d_parent;
+
+	if (take_lock)
+		dir = incfs_lookup_dentry(root, special_directory);
+	else
+		dir = lookup_one_len(special_directory, root,
+				     strlen(special_directory));
+
+	if (IS_ERR(dir)) {
+		error = PTR_ERR(dir);
+		/* Never hand an ERR_PTR to dput() below */
+		dir = NULL;
+		goto out;
+	}
+	if (d_is_negative(dir)) {
+		error = -ENOENT;
+		goto out;
+	}
+
+	file = incfs_lookup_dentry(dir, file_id_str);
+	if (IS_ERR(file)) {
+		error = PTR_ERR(file);
+		/* Never hand an ERR_PTR to dput() below */
+		file = NULL;
+		goto out;
+	}
+	if (d_is_negative(file)) {
+		error = -ENOENT;
+		goto out;
+	}
+
+	fsnotify_unlink(d_inode(dir), file);
+	d_delete(file);
+
+out:
+	if (error)
+		pr_warn("%s failed with error %d\n", __func__, error);
+
+	dput(dir);
+	dput(file);
+}
+
+/*
+ * Called once the last data block of @df has been filled: truncate the
+ * backing file to drop preallocated space, then (best effort) fsync and
+ * remove the file's hardlink from the .incomplete directory.  Runs with
+ * the mount owner's credentials.
+ *
+ * Fixes vs. original: the inner backing file variable no longer shadows
+ * parameter @f, and the fsync path re-checks the backing-file context
+ * instead of dereferencing df->df_backing_file_context unconditionally
+ * (the truncate path above already treats it as possibly NULL).
+ */
+static void handle_file_completed(struct file *f, struct data_file *df)
+{
+	struct backing_file_context *bfc;
+	struct mount_info *mi = df->df_mount_info;
+	char *file_id_str = NULL;
+	struct dentry *incomplete_file_dentry = NULL;
+	const struct cred *old_cred = override_creds(mi->mi_owner);
+	int error;
+
+	/* Truncate file to remove any preallocated space */
+	bfc = df->df_backing_file_context;
+	if (bfc) {
+		struct file *backing_file = bfc->bc_file;
+
+		if (backing_file) {
+			loff_t size = i_size_read(file_inode(backing_file));
+
+			error = vfs_truncate(&backing_file->f_path, size);
+			if (error)
+				/* No useful action on failure */
+				pr_warn("incfs: Failed to truncate complete file: %d\n",
+					error);
+		}
+	}
+
+	/* This is best effort - there is no useful action to take on failure */
+	file_id_str = file_id_to_str(df->df_id);
+	if (!file_id_str)
+		goto out;
+
+	incomplete_file_dentry = incfs_lookup_dentry(
+					df->df_mount_info->mi_incomplete_dir,
+					file_id_str);
+	if (!incomplete_file_dentry || IS_ERR(incomplete_file_dentry)) {
+		incomplete_file_dentry = NULL;
+		goto out;
+	}
+
+	if (!d_really_is_positive(incomplete_file_dentry))
+		goto out;
+
+	/* Persist the filled data before dropping the .incomplete link */
+	if (bfc && bfc->bc_file)
+		vfs_fsync(bfc->bc_file, 0);
+	error = incfs_unlink(incomplete_file_dentry);
+	if (error) {
+		pr_warn("incfs: Deleting incomplete file failed: %d\n", error);
+		goto out;
+	}
+
+	notify_unlink(f->f_path.dentry, file_id_str, INCFS_INCOMPLETE_NAME);
+
+out:
+	dput(incomplete_file_dentry);
+	kfree(file_id_str);
+	revert_creds(old_cred);
+}
+
+/*
+ * INCFS_IOC_FILL_BLOCKS: copy an array of incfs_fill_block descriptors
+ * from userspace and feed each block (data or hash, per
+ * INCFS_BLOCK_FLAGS_HASH) into the data file through a bounce buffer.
+ * Requires the fd to have fill permission.  Returns the number of
+ * blocks processed, or the error if none were.  If the last data block
+ * was filled, finish the file via handle_file_completed().
+ *
+ * (Removed the dead "if (data_buf)" guard before free_pages(): the
+ * function returns -ENOMEM earlier when the allocation fails.)
+ */
+static long ioctl_fill_blocks(struct file *f, void __user *arg)
+{
+	struct incfs_fill_blocks __user *usr_fill_blocks = arg;
+	struct incfs_fill_blocks fill_blocks;
+	struct incfs_fill_block __user *usr_fill_block_array;
+	struct data_file *df = get_incfs_data_file(f);
+	struct incfs_file_data *fd = f->private_data;
+	const ssize_t data_buf_size = 2 * INCFS_DATA_FILE_BLOCK_SIZE;
+	u8 *data_buf = NULL;
+	ssize_t error = 0;
+	int i = 0;
+	bool complete = false;
+
+	if (!df)
+		return -EBADF;
+
+	if (!fd || fd->fd_fill_permission != CAN_FILL)
+		return -EPERM;
+
+	if (copy_from_user(&fill_blocks, usr_fill_blocks, sizeof(fill_blocks)))
+		return -EFAULT;
+
+	usr_fill_block_array = u64_to_user_ptr(fill_blocks.fill_blocks);
+	data_buf = (u8 *)__get_free_pages(GFP_NOFS | __GFP_COMP,
+					  get_order(data_buf_size));
+	if (!data_buf)
+		return -ENOMEM;
+
+	for (i = 0; i < fill_blocks.count; i++) {
+		struct incfs_fill_block fill_block = {};
+
+		if (copy_from_user(&fill_block, &usr_fill_block_array[i],
+				   sizeof(fill_block)) > 0) {
+			error = -EFAULT;
+			break;
+		}
+
+		if (fill_block.data_len > data_buf_size) {
+			error = -E2BIG;
+			break;
+		}
+
+		if (copy_from_user(data_buf, u64_to_user_ptr(fill_block.data),
+				   fill_block.data_len) > 0) {
+			error = -EFAULT;
+			break;
+		}
+		fill_block.data = 0; /* To make sure nobody uses it. */
+		if (fill_block.flags & INCFS_BLOCK_FLAGS_HASH) {
+			error = incfs_process_new_hash_block(df, &fill_block,
+							     data_buf);
+		} else {
+			error = incfs_process_new_data_block(df, &fill_block,
+							data_buf, &complete);
+		}
+		if (error)
+			break;
+	}
+
+	free_pages((unsigned long)data_buf, get_order(data_buf_size));
+
+	if (complete)
+		handle_file_completed(f, df);
+
+	/*
+	 * Only report the error if no records were processed, otherwise
+	 * just return how many were processed successfully.
+	 */
+	if (i == 0)
+		return error;
+
+	return i;
+}
+
+/*
+ * INCFS_IOC_READ_FILE_SIGNATURE: read the file's signature blob into a
+ * kernel buffer sized by the caller (capped at INCFS_MAX_SIGNATURE_SIZE)
+ * and copy it plus the actual length back to userspace.
+ */
+static long ioctl_read_file_signature(struct file *f, void __user *arg)
+{
+	struct incfs_get_file_sig_args __user *args_usr_ptr = arg;
+	struct incfs_get_file_sig_args args = {};
+	u8 *sig_buffer = NULL;
+	size_t sig_buf_size = 0;
+	int error = 0;
+	int read_result = 0;
+	struct data_file *df = get_incfs_data_file(f);
+
+	if (!df)
+		return -EINVAL;
+
+	if (copy_from_user(&args, args_usr_ptr, sizeof(args)) > 0)
+		return -EINVAL;
+
+	sig_buf_size = args.file_signature_buf_size;
+	if (sig_buf_size > INCFS_MAX_SIGNATURE_SIZE)
+		return -E2BIG;
+
+	sig_buffer = kzalloc(sig_buf_size, GFP_NOFS | __GFP_COMP);
+	if (!sig_buffer)
+		return -ENOMEM;
+
+	read_result = incfs_read_file_signature(df,
+			range(sig_buffer, sig_buf_size));
+
+	if (read_result < 0) {
+		error = read_result;
+		goto out;
+	}
+
+	if (copy_to_user(u64_to_user_ptr(args.file_signature), sig_buffer,
+			read_result)) {
+		error = -EFAULT;
+		goto out;
+	}
+
+	/* Report how many signature bytes were actually produced */
+	args.file_signature_len_out = read_result;
+	if (copy_to_user(args_usr_ptr, &args, sizeof(args)))
+		error = -EFAULT;
+
+out:
+	kfree(sig_buffer);
+
+	return error;
+}
+
+/*
+ * INCFS_IOC_GET_FILLED_BLOCKS: report which blocks of the file have
+ * been filled.  Only permitted on an fd opened with fill permission.
+ */
+static long ioctl_get_filled_blocks(struct file *f, void __user *arg)
+{
+	struct incfs_get_filled_blocks_args __user *args_usr_ptr = arg;
+	struct incfs_get_filled_blocks_args args = {};
+	struct data_file *df = get_incfs_data_file(f);
+	struct incfs_file_data *fd = f->private_data;
+	int err;
+
+	if (!df || !fd)
+		return -EINVAL;
+	if (fd->fd_fill_permission != CAN_FILL)
+		return -EPERM;
+
+	if (copy_from_user(&args, args_usr_ptr, sizeof(args)) > 0)
+		return -EINVAL;
+
+	err = incfs_get_filled_blocks(df, fd, &args);
+
+	/* Always write results back, even when the query itself failed */
+	if (copy_to_user(args_usr_ptr, &args, sizeof(args)))
+		return -EFAULT;
+
+	return err;
+}
+
+/*
+ * INCFS_IOC_GET_BLOCK_COUNT: return total/filled counts for data and
+ * hash blocks of the file (hash total = total blocks - data blocks).
+ */
+static long ioctl_get_block_count(struct file *f, void __user *arg)
+{
+	struct incfs_get_block_count_args __user *args_usr_ptr = arg;
+	struct incfs_get_block_count_args args = {};
+	struct data_file *df = get_incfs_data_file(f);
+
+	if (!df)
+		return -EINVAL;
+
+	args.total_data_blocks_out = df->df_data_block_count;
+	args.filled_data_blocks_out = atomic_read(&df->df_data_blocks_written);
+	args.total_hash_blocks_out =
+		df->df_total_block_count - df->df_data_block_count;
+	args.filled_hash_blocks_out = atomic_read(&df->df_hash_blocks_written);
+
+	return copy_to_user(args_usr_ptr, &args, sizeof(args)) ? -EFAULT : 0;
+}
+
+/* FS_IOC_GETFLAGS: only FS_VERITY_FL is ever reported by incfs. */
+static int incfs_ioctl_get_flags(struct file *f, void __user *arg)
+{
+	u32 flags = 0;
+
+	if (IS_VERITY(file_inode(f)))
+		flags = FS_VERITY_FL;
+
+	return put_user(flags, (int __user *) arg);
+}
+
+/*
+ * ->unlocked_ioctl entry point: route each supported request (incfs
+ * fill/query ioctls plus the fs-verity and flag ioctls) to its handler;
+ * anything else is -EINVAL.
+ */
+static long dispatch_ioctl(struct file *f, unsigned int req, unsigned long arg)
+{
+	switch (req) {
+	case INCFS_IOC_FILL_BLOCKS:
+		return ioctl_fill_blocks(f, (void __user *)arg);
+	case INCFS_IOC_READ_FILE_SIGNATURE:
+		return ioctl_read_file_signature(f, (void __user *)arg);
+	case INCFS_IOC_GET_FILLED_BLOCKS:
+		return ioctl_get_filled_blocks(f, (void __user *)arg);
+	case INCFS_IOC_GET_BLOCK_COUNT:
+		return ioctl_get_block_count(f, (void __user *)arg);
+	case FS_IOC_ENABLE_VERITY:
+		return incfs_ioctl_enable_verity(f, (const void __user *)arg);
+	case FS_IOC_GETFLAGS:
+		return incfs_ioctl_get_flags(f, (void __user *) arg);
+	case FS_IOC_MEASURE_VERITY:
+		return incfs_ioctl_measure_verity(f, (void __user *)arg);
+	case FS_IOC_READ_VERITY_METADATA:
+		return incfs_ioctl_read_verity_metadata(f, (void __user *)arg);
+	default:
+		return -EINVAL;
+	}
+}
+
+#ifdef CONFIG_COMPAT
+/*
+ * ->compat_ioctl: translate FS_IOC32_GETFLAGS to its native number,
+ * pass the remaining known commands through unchanged, and convert the
+ * 32-bit user pointer with compat_ptr() before dispatching.
+ */
+static long incfs_compat_ioctl(struct file *file, unsigned int cmd,
+				unsigned long arg)
+{
+	switch (cmd) {
+	case FS_IOC32_GETFLAGS:
+		cmd = FS_IOC_GETFLAGS;
+		break;
+	case INCFS_IOC_FILL_BLOCKS:
+	case INCFS_IOC_READ_FILE_SIGNATURE:
+	case INCFS_IOC_GET_FILLED_BLOCKS:
+	case INCFS_IOC_GET_BLOCK_COUNT:
+	case FS_IOC_ENABLE_VERITY:
+	case FS_IOC_MEASURE_VERITY:
+	case FS_IOC_READ_VERITY_METADATA:
+		break;
+	default:
+		return -ENOIOCTLCMD;
+	}
+	return dispatch_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
+}
+#endif
+
+/*
+ * ->lookup for incfs directories.  In the mount root, pseudo files are
+ * tried first.  Otherwise the name is looked up in the backing
+ * directory; the incfs dentry is bound to the backing path, and a
+ * positive backing dentry produces (or finds) the matching incfs inode
+ * via fetch_regular_inode().  A missing backing entry becomes a
+ * negative dentry.  Crossing into a different backing superblock is
+ * rejected with -EXDEV.
+ */
+static struct dentry *dir_lookup(struct inode *dir_inode, struct dentry *dentry,
+				 unsigned int flags)
+{
+	struct mount_info *mi = get_mount_info(dir_inode->i_sb);
+	struct dentry *dir_dentry = NULL;
+	struct dentry *backing_dentry = NULL;
+	struct path dir_backing_path = {};
+	struct inode_info *dir_info = get_incfs_node(dir_inode);
+	int err = 0;
+
+	if (!mi || !dir_info || !dir_info->n_backing_inode)
+		return ERR_PTR(-EBADF);
+
+	if (d_inode(mi->mi_backing_dir_path.dentry) ==
+		dir_info->n_backing_inode) {
+		/* We do lookup in the FS root. Show pseudo files. */
+		err = dir_lookup_pseudo_files(dir_inode->i_sb, dentry);
+		if (err != -ENOENT)
+			goto out;
+		err = 0;
+	}
+
+	dir_dentry = dget_parent(dentry);
+	get_incfs_backing_path(dir_dentry, &dir_backing_path);
+	backing_dentry = incfs_lookup_dentry(dir_backing_path.dentry,
+						dentry->d_name.name);
+
+	if (!backing_dentry || IS_ERR(backing_dentry)) {
+		err = IS_ERR(backing_dentry)
+			? PTR_ERR(backing_dentry)
+			: -EFAULT;
+		backing_dentry = NULL;
+		goto out;
+	} else {
+		struct inode *inode = NULL;
+		struct path backing_path = {
+			.mnt = dir_backing_path.mnt,
+			.dentry = backing_dentry
+		};
+
+		/* Bind this dentry to its backing path (refcounted) */
+		err = incfs_init_dentry(dentry, &backing_path);
+		if (err)
+			goto out;
+
+		if (!d_really_is_positive(backing_dentry)) {
+			/*
+			 * No such entry found in the backing dir.
+			 * Create a negative entry.
+			 */
+			d_add(dentry, NULL);
+			err = 0;
+			goto out;
+		}
+
+		if (d_inode(backing_dentry)->i_sb !=
+				dir_info->n_backing_inode->i_sb) {
+			/*
+			 * Somehow after the path lookup we ended up in a
+			 * different fs mount. If we keep going it's going
+			 * to end badly.
+			 */
+			err = -EXDEV;
+			goto out;
+		}
+
+		inode = fetch_regular_inode(dir_inode->i_sb, backing_dentry);
+		if (IS_ERR(inode)) {
+			err = PTR_ERR(inode);
+			goto out;
+		}
+
+		d_add(dentry, inode);
+	}
+
+out:
+	dput(dir_dentry);
+	dput(backing_dentry);
+	path_put(&dir_backing_path);
+	if (err)
+		pr_debug("incfs: %s %s %d\n", __func__,
+			dentry->d_name.name, err);
+	return ERR_PTR(err);
+}
+
+/*
+ * ->mkdir: create the directory in the backing fs (with write bits
+ * forced on, since backing objects must stay writable for incfs), then
+ * instantiate the matching incfs inode.  Creating inside the special
+ * .index/.incomplete directories is refused.  Serialized by
+ * mi_dir_struct_mutex.
+ */
+static int dir_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	struct mount_info *mi = get_mount_info(dir->i_sb);
+	struct inode_info *dir_node = get_incfs_node(dir);
+	struct dentry *backing_dentry = NULL;
+	struct path backing_path = {};
+	int err = 0;
+
+
+	if (!mi || !dir_node || !dir_node->n_backing_inode)
+		return -EBADF;
+
+	err = mutex_lock_interruptible(&mi->mi_dir_struct_mutex);
+	if (err)
+		return err;
+
+	get_incfs_backing_path(dentry, &backing_path);
+	backing_dentry = backing_path.dentry;
+
+	if (!backing_dentry) {
+		err = -EBADF;
+		goto path_err;
+	}
+
+	if (backing_dentry->d_parent == mi->mi_index_dir) {
+		/* Can't create a subdir inside .index */
+		err = -EBUSY;
+		goto out;
+	}
+
+	if (backing_dentry->d_parent == mi->mi_incomplete_dir) {
+		/* Can't create a subdir inside .incomplete */
+		err = -EBUSY;
+		goto out;
+	}
+	inode_lock_nested(dir_node->n_backing_inode, I_MUTEX_PARENT);
+	/* Backing dirs keep write permission even if the incfs view is RO */
+	err = vfs_mkdir(idmap, dir_node->n_backing_inode, backing_dentry, mode | 0222);
+	inode_unlock(dir_node->n_backing_inode);
+	if (!err) {
+		struct inode *inode = NULL;
+
+		if (d_really_is_negative(backing_dentry) ||
+			unlikely(d_unhashed(backing_dentry))) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		inode = fetch_regular_inode(dir->i_sb, backing_dentry);
+		if (IS_ERR(inode)) {
+			err = PTR_ERR(inode);
+			goto out;
+		}
+		d_instantiate(dentry, inode);
+	}
+
+out:
+	if (d_really_is_negative(dentry))
+		d_drop(dentry);
+	path_put(&backing_path);
+
+path_err:
+	mutex_unlock(&mi->mi_dir_struct_mutex);
+	if (err)
+		pr_debug("incfs: %s err:%d\n", __func__, err);
+	return err;
+}
+
+/*
+ * Delete file referenced by backing_dentry and if appropriate its hardlink
+ * from .index and .incomplete
+ *
+ * The hardlink accounting works down from @nlink: links held by .index
+ * and .incomplete are subtracted when present; the special copies are
+ * only removed when this unlink would leave no other user-visible link.
+ * Caller must hold mi_dir_struct_mutex.
+ */
+static int file_delete(struct mount_info *mi, struct dentry *dentry,
+			struct dentry *backing_dentry, int nlink)
+{
+	struct dentry *index_file_dentry = NULL;
+	struct dentry *incomplete_file_dentry = NULL;
+	/* 2 chars per byte of file ID + 1 char for \0 */
+	char file_id_str[2 * sizeof(incfs_uuid_t) + 1] = {0};
+	ssize_t uuid_size = 0;
+	int error = 0;
+
+	WARN_ON(!mutex_is_locked(&mi->mi_dir_struct_mutex));
+
+	/* More than 3 links: other names remain either way, just unlink */
+	if (nlink > 3)
+		goto just_unlink;
+
+	uuid_size = vfs_getxattr(&nop_mnt_idmap, backing_dentry, INCFS_XATTR_ID_NAME,
+			file_id_str, 2 * sizeof(incfs_uuid_t));
+	if (uuid_size < 0) {
+		error = uuid_size;
+		goto out;
+	}
+
+	if (uuid_size != 2 * sizeof(incfs_uuid_t)) {
+		error = -EBADMSG;
+		goto out;
+	}
+
+	index_file_dentry = incfs_lookup_dentry(mi->mi_index_dir, file_id_str);
+	if (IS_ERR(index_file_dentry)) {
+		error = PTR_ERR(index_file_dentry);
+		index_file_dentry = NULL;
+		goto out;
+	}
+
+	/* Discount the link held by .index, if any */
+	if (d_really_is_positive(index_file_dentry) && nlink > 0)
+		nlink--;
+
+	if (nlink > 2)
+		goto just_unlink;
+
+	incomplete_file_dentry = incfs_lookup_dentry(mi->mi_incomplete_dir,
+						     file_id_str);
+	if (IS_ERR(incomplete_file_dentry)) {
+		error = PTR_ERR(incomplete_file_dentry);
+		incomplete_file_dentry = NULL;
+		goto out;
+	}
+
+	/* Discount the link held by .incomplete, if any */
+	if (d_really_is_positive(incomplete_file_dentry) && nlink > 0)
+		nlink--;
+
+	if (nlink > 1)
+		goto just_unlink;
+
+	/* Last user-visible link: also remove the special-dir copies */
+	if (d_really_is_positive(index_file_dentry)) {
+		error = incfs_unlink(index_file_dentry);
+		if (error)
+			goto out;
+		notify_unlink(dentry, file_id_str, INCFS_INDEX_NAME);
+	}
+
+	if (d_really_is_positive(incomplete_file_dentry)) {
+		error = incfs_unlink(incomplete_file_dentry);
+		if (error)
+			goto out;
+		notify_unlink(dentry, file_id_str, INCFS_INCOMPLETE_NAME);
+	}
+
+just_unlink:
+	error = incfs_unlink(backing_dentry);
+
+out:
+	dput(index_file_dentry);
+	dput(incomplete_file_dentry);
+	if (error)
+		pr_debug("incfs: delete_file_from_index err:%d\n", error);
+	return error;
+}
+
+/*
+ * ->unlink: refuse direct unlinks inside .index/.incomplete, read the
+ * backing link count, and delegate to file_delete() which also cleans
+ * up the special-dir hardlinks when appropriate.  Serialized by
+ * mi_dir_struct_mutex.
+ */
+static int dir_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct mount_info *mi = get_mount_info(dir->i_sb);
+	struct path backing_path = {};
+	struct kstat stat;
+	int err = 0;
+
+	if (!mi)
+		return -EBADF;
+
+	err = mutex_lock_interruptible(&mi->mi_dir_struct_mutex);
+	if (err)
+		return err;
+
+	get_incfs_backing_path(dentry, &backing_path);
+	if (!backing_path.dentry) {
+		err = -EBADF;
+		goto path_err;
+	}
+
+	if (backing_path.dentry->d_parent == mi->mi_index_dir) {
+		/* Direct unlink from .index are not allowed. */
+		err = -EBUSY;
+		goto out;
+	}
+
+	if (backing_path.dentry->d_parent == mi->mi_incomplete_dir) {
+		/* Direct unlink from .incomplete are not allowed. */
+		err = -EBUSY;
+		goto out;
+	}
+
+	/* file_delete() needs the current backing hardlink count */
+	err = vfs_getattr(&backing_path, &stat, STATX_NLINK,
+			  AT_STATX_SYNC_AS_STAT);
+	if (err)
+		goto out;
+
+	err = file_delete(mi, dentry, backing_path.dentry, stat.nlink);
+
+	d_drop(dentry);
+out:
+	path_put(&backing_path);
+path_err:
+	if (err)
+		pr_debug("incfs: %s err:%d\n", __func__, err);
+	mutex_unlock(&mi->mi_dir_struct_mutex);
+	return err;
+}
+
+/*
+ * ->link: create the hardlink in the backing fs and instantiate the
+ * incfs inode for the new name.  Linking into .index/.incomplete is
+ * refused.  Serialized by mi_dir_struct_mutex.
+ *
+ * Fix: check both backing paths for NULL after get_incfs_backing_path()
+ * (as dir_unlink()/dir_rmdir() do) before dereferencing them.
+ */
+static int dir_link(struct dentry *old_dentry, struct inode *dir,
+			 struct dentry *new_dentry)
+{
+	struct mount_info *mi = get_mount_info(dir->i_sb);
+	struct path backing_old_path = {};
+	struct path backing_new_path = {};
+	int error = 0;
+
+	if (!mi)
+		return -EBADF;
+
+	error = mutex_lock_interruptible(&mi->mi_dir_struct_mutex);
+	if (error)
+		return error;
+
+	get_incfs_backing_path(old_dentry, &backing_old_path);
+	get_incfs_backing_path(new_dentry, &backing_new_path);
+
+	if (!backing_old_path.dentry || !backing_new_path.dentry) {
+		error = -EBADF;
+		goto out;
+	}
+
+	if (backing_new_path.dentry->d_parent == mi->mi_index_dir) {
+		/* Can't link to .index */
+		error = -EBUSY;
+		goto out;
+	}
+
+	if (backing_new_path.dentry->d_parent == mi->mi_incomplete_dir) {
+		/* Can't link to .incomplete */
+		error = -EBUSY;
+		goto out;
+	}
+
+	error = incfs_link(backing_old_path.dentry, backing_new_path.dentry);
+	if (!error) {
+		struct inode *inode = NULL;
+		struct dentry *bdentry = backing_new_path.dentry;
+
+		if (d_really_is_negative(bdentry)) {
+			error = -EINVAL;
+			goto out;
+		}
+
+		inode = fetch_regular_inode(dir->i_sb, bdentry);
+		if (IS_ERR(inode)) {
+			error = PTR_ERR(inode);
+			goto out;
+		}
+		d_instantiate(new_dentry, inode);
+	}
+
+out:
+	path_put(&backing_old_path);
+	path_put(&backing_new_path);
+	if (error)
+		pr_debug("incfs: %s err:%d\n", __func__, error);
+	mutex_unlock(&mi->mi_dir_struct_mutex);
+	return error;
+}
+
+/*
+ * ->rmdir: remove the backing directory unless it is one of the special
+ * .index/.incomplete directories.  Serialized by mi_dir_struct_mutex.
+ */
+static int dir_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct mount_info *mi = get_mount_info(dir->i_sb);
+	struct path backing_path = {};
+	int err = 0;
+
+	if (!mi)
+		return -EBADF;
+
+	err = mutex_lock_interruptible(&mi->mi_dir_struct_mutex);
+	if (err)
+		return err;
+
+	get_incfs_backing_path(dentry, &backing_path);
+	if (!backing_path.dentry) {
+		err = -EBADF;
+		goto path_err;
+	}
+
+	if (backing_path.dentry == mi->mi_index_dir) {
+		/* Can't delete .index */
+		err = -EBUSY;
+		goto out;
+	}
+
+	if (backing_path.dentry == mi->mi_incomplete_dir) {
+		/* Can't delete .incomplete */
+		err = -EBUSY;
+		goto out;
+	}
+
+	err = incfs_rmdir(backing_path.dentry);
+	if (!err)
+		d_drop(dentry);
+out:
+	path_put(&backing_path);
+
+path_err:
+	if (err)
+		pr_debug("incfs: %s err:%d\n", __func__, err);
+	mutex_unlock(&mi->mi_dir_struct_mutex);
+	return err;
+}
+
+/*
+ * ->rename: perform the rename on the backing dentries under
+ * lock_rename(), refusing moves of or out of .index/.incomplete, then
+ * refresh the stacked attributes of the affected inodes.  Serialized by
+ * mi_dir_struct_mutex.
+ *
+ * Fix: guard against a NULL mount_info before dereferencing it, as all
+ * the other directory operations (dir_unlink, dir_link, dir_rmdir) do.
+ */
+static int dir_rename(struct inode *old_dir, struct dentry *old_dentry,
+		struct inode *new_dir, struct dentry *new_dentry,
+		unsigned int flags)
+{
+	struct mount_info *mi = get_mount_info(old_dir->i_sb);
+	struct dentry *backing_old_dentry;
+	struct dentry *backing_new_dentry;
+	struct dentry *backing_old_dir_dentry;
+	struct dentry *backing_new_dir_dentry;
+	struct inode *target_inode;
+	struct dentry *trap;
+	struct renamedata rd = {};
+	int error = 0;
+
+	if (!mi)
+		return -EBADF;
+
+	error = mutex_lock_interruptible(&mi->mi_dir_struct_mutex);
+	if (error)
+		return error;
+
+	backing_old_dentry = get_incfs_dentry(old_dentry)->backing_path.dentry;
+
+	if (!backing_old_dentry || backing_old_dentry == mi->mi_index_dir ||
+	    backing_old_dentry == mi->mi_incomplete_dir) {
+		/* Renaming .index or .incomplete not allowed */
+		error = -EBUSY;
+		goto exit;
+	}
+
+	backing_new_dentry = get_incfs_dentry(new_dentry)->backing_path.dentry;
+	dget(backing_old_dentry);
+	dget(backing_new_dentry);
+
+	backing_old_dir_dentry = dget_parent(backing_old_dentry);
+	backing_new_dir_dentry = dget_parent(backing_new_dentry);
+	target_inode = d_inode(new_dentry);
+
+	if (backing_old_dir_dentry == mi->mi_index_dir ||
+	    backing_old_dir_dentry == mi->mi_incomplete_dir) {
+		/* Direct moves from .index or .incomplete are not allowed. */
+		error = -EBUSY;
+		goto out;
+	}
+
+	trap = lock_rename(backing_old_dir_dentry, backing_new_dir_dentry);
+
+	/* Moving a directory into its own descendant is not allowed */
+	if (trap == backing_old_dentry) {
+		error = -EINVAL;
+		goto unlock_out;
+	}
+	if (trap == backing_new_dentry) {
+		error = -ENOTEMPTY;
+		goto unlock_out;
+	}
+
+	rd.old_dir	= d_inode(backing_old_dir_dentry);
+	rd.old_dentry	= backing_old_dentry;
+	rd.new_dir	= d_inode(backing_new_dir_dentry);
+	rd.new_dentry	= backing_new_dentry;
+	rd.flags	= flags;
+	rd.old_mnt_idmap = &nop_mnt_idmap;
+	rd.new_mnt_idmap = &nop_mnt_idmap;
+	rd.delegated_inode = NULL;
+
+	error = vfs_rename(&rd);
+	if (error)
+		goto unlock_out;
+	/* Re-sync stacked attributes after the backing rename */
+	if (target_inode)
+		fsstack_copy_attr_all(target_inode,
+			get_incfs_node(target_inode)->n_backing_inode);
+	fsstack_copy_attr_all(new_dir, d_inode(backing_new_dir_dentry));
+	if (new_dir != old_dir)
+		fsstack_copy_attr_all(old_dir, d_inode(backing_old_dir_dentry));
+
+unlock_out:
+	unlock_rename(backing_old_dir_dentry, backing_new_dir_dentry);
+
+out:
+	dput(backing_new_dir_dentry);
+	dput(backing_old_dir_dentry);
+	dput(backing_new_dentry);
+	dput(backing_old_dentry);
+
+exit:
+	mutex_unlock(&mi->mi_dir_struct_mutex);
+	if (error)
+		pr_debug("incfs: %s err:%d\n", __func__, error);
+	return error;
+}
+
+
+/*
+ * ->open: open the backing file with the mount owner's credentials
+ * (read-write for regular files — fills are written through the
+ * backing file — read-only for directories), then set up per-open
+ * state: incfs_file_data for regular files (fill permission starts as
+ * CANT_FILL) or a dir_file wrapper for directories.  On error all
+ * partially created state is released.
+ */
+static int file_open(struct inode *inode, struct file *file)
+{
+	struct mount_info *mi = get_mount_info(inode->i_sb);
+	struct file *backing_file = NULL;
+	struct path backing_path = {};
+	int err = 0;
+	int flags = O_NOATIME | O_LARGEFILE |
+		(S_ISDIR(inode->i_mode) ? O_RDONLY : O_RDWR);
+	const struct cred *old_cred;
+
+	WARN_ON(file->private_data);
+
+	if (!mi)
+		return -EBADF;
+
+	get_incfs_backing_path(file->f_path.dentry, &backing_path);
+	if (!backing_path.dentry)
+		return -EBADF;
+
+	old_cred = override_creds(mi->mi_owner);
+	backing_file = dentry_open(&backing_path, flags, current_cred());
+	revert_creds(old_cred);
+	path_put(&backing_path);
+
+	if (IS_ERR(backing_file)) {
+		err = PTR_ERR(backing_file);
+		backing_file = NULL;
+		goto out;
+	}
+
+	if (S_ISREG(inode->i_mode)) {
+		struct incfs_file_data *fd = kzalloc(sizeof(*fd), GFP_NOFS);
+
+		if (!fd) {
+			err = -ENOMEM;
+			goto out;
+		}
+
+		*fd = (struct incfs_file_data) {
+			.fd_fill_permission = CANT_FILL,
+		};
+		file->private_data = fd;
+
+		err = make_inode_ready_for_data_ops(mi, inode, backing_file);
+		if (err)
+			goto out;
+
+		err = incfs_fsverity_file_open(inode, file);
+		if (err)
+			goto out;
+	} else if (S_ISDIR(inode->i_mode)) {
+		struct dir_file *dir = NULL;
+
+		dir = incfs_open_dir_file(mi, backing_file);
+		if (IS_ERR(dir))
+			err = PTR_ERR(dir);
+		else
+			file->private_data = dir;
+	} else
+		err = -EBADF;
+
+out:
+	if (err) {
+		pr_debug("name:%s err: %d\n",
+			 file->f_path.dentry->d_name.name, err);
+		/* Undo whichever private_data type was set up above */
+		if (S_ISREG(inode->i_mode))
+			kfree(file->private_data);
+		else if (S_ISDIR(inode->i_mode))
+			incfs_free_dir_file(file->private_data);
+
+		file->private_data = NULL;
+	}
+
+	/* dir_file/backing context hold their own references if needed */
+	if (backing_file)
+		fput(backing_file);
+	return err;
+}
+
+/*
+ * ->release: free the per-open state created by file_open() — the
+ * incfs_file_data of a regular file or the dir_file of a directory.
+ */
+static int file_release(struct inode *inode, struct file *file)
+{
+	if (S_ISDIR(inode->i_mode)) {
+		incfs_free_dir_file(get_incfs_dir_file(file));
+	} else if (S_ISREG(inode->i_mode)) {
+		kfree(file->private_data);
+		file->private_data = NULL;
+	}
+
+	return 0;
+}
+
+/*
+ * ->d_revalidate: a dentry is valid only if its backing dentry still
+ * resolves to the same backing inode that this incfs inode was built
+ * from; otherwise the backing dir was changed behind incfs's back and
+ * the dentry must be invalidated.  Defers to the backing fs's own
+ * d_revalidate when it has one.  Cannot run in RCU-walk mode.
+ */
+static int dentry_revalidate(struct dentry *d, unsigned int flags)
+{
+	struct path backing_path = {};
+	struct inode_info *info = get_incfs_node(d_inode(d));
+	struct inode *binode = (info == NULL) ? NULL : info->n_backing_inode;
+	struct dentry *backing_dentry = NULL;
+	int result = 0;
+
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	get_incfs_backing_path(d, &backing_path);
+	backing_dentry = backing_path.dentry;
+	if (!backing_dentry)
+		goto out;
+
+	if (d_inode(backing_dentry) != binode) {
+		/*
+		 * Backing inodes obtained via dentry and inode don't match.
+		 * It indicates that most likely backing dir has changed
+		 * directly bypassing Incremental FS interface.
+		 */
+		goto out;
+	}
+
+	if (backing_dentry->d_flags & DCACHE_OP_REVALIDATE) {
+		result = backing_dentry->d_op->d_revalidate(backing_dentry,
+				flags);
+	} else
+		result = 1;
+
+out:
+	path_put(&backing_path);
+	return result;
+}
+
+/*
+ * ->d_release: drop the backing-path reference taken in
+ * incfs_init_dentry() and free the dentry_info.
+ */
+static void dentry_release(struct dentry *d)
+{
+	struct dentry_info *info = get_incfs_dentry(d);
+
+	if (info)
+		path_put(&info->backing_path);
+
+	kfree(d->d_fsdata);
+	d->d_fsdata = NULL;
+}
+
+/*
+ * ->alloc_inode: allocate an inode_info wrapper and hand back its
+ * embedded VFS inode.
+ */
+static struct inode *alloc_inode(struct super_block *sb)
+{
+	/* TODO: add a slab-based cache here. */
+	struct inode_info *node = kzalloc(sizeof(*node), GFP_NOFS);
+
+	if (node == NULL)
+		return NULL;
+
+	inode_init_once(&node->n_vfs_inode);
+	return &node->n_vfs_inode;
+}
+
+/* ->free_inode: release the inode_info wrapper around @inode. */
+static void free_inode(struct inode *inode)
+{
+	kfree(get_incfs_node(inode));
+}
+
+/*
+ * ->evict_inode: drop the reference on the backing inode taken in
+ * inode_set() and free the attached data_file, then tear down the page
+ * cache and clear the inode.
+ */
+static void evict_inode(struct inode *inode)
+{
+	struct inode_info *node = get_incfs_node(inode);
+
+	if (node) {
+		if (node->n_backing_inode) {
+			iput(node->n_backing_inode);
+			node->n_backing_inode = NULL;
+		}
+		if (node->n_file) {
+			incfs_free_data_file(node->n_file);
+			node->n_file = NULL;
+		}
+	}
+
+	truncate_inode_pages(&inode->i_data, 0);
+	clear_inode(inode);
+}
+
+/*
+ * ->setattr: apply the change to the backing dentry first, then to the
+ * incfs inode.  Size changes are rejected.  For regular files the
+ * user-visible mode may never gain write bits, while the backing file
+ * must keep them: 0222 is forced on for the backing notify_change()
+ * and stripped again before simple_setattr() on the incfs side.
+ */
+static int incfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+			 struct iattr *ia)
+{
+	struct dentry_info *di = get_incfs_dentry(dentry);
+	struct dentry *backing_dentry;
+	struct inode *backing_inode;
+	int error;
+
+	if (ia->ia_valid & ATTR_SIZE)
+		return -EINVAL;
+
+	if ((ia->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) &&
+	    (ia->ia_valid & ATTR_MODE))
+		return -EINVAL;
+
+	if (!di)
+		return -EINVAL;
+	backing_dentry = di->backing_path.dentry;
+	if (!backing_dentry)
+		return -EINVAL;
+
+	backing_inode = d_inode(backing_dentry);
+
+	/* incfs files are readonly, but the backing files must be writeable */
+	if (S_ISREG(backing_inode->i_mode)) {
+		if ((ia->ia_valid & ATTR_MODE) && (ia->ia_mode & 0222))
+			return -EINVAL;
+
+		ia->ia_mode |= 0222;
+	}
+
+	inode_lock(d_inode(backing_dentry));
+	error = notify_change(idmap, backing_dentry, ia, NULL);
+	inode_unlock(d_inode(backing_dentry));
+
+	if (error)
+		return error;
+
+	/* Restore the read-only view before updating the incfs inode */
+	if (S_ISREG(backing_inode->i_mode))
+		ia->ia_mode &= ~0222;
+
+	return simple_setattr(idmap, dentry, ia);
+}
+
+
+/*
+ * ->getattr handler.  Fills stats from the incfs inode, then overrides:
+ * the verity flag comes from the incfs inode, and the block count is taken
+ * from the backing file, where the data actually lives.
+ */
+static int incfs_getattr(struct mnt_idmap *idmap, const struct path *path,
+			 struct kstat *stat, u32 request_mask,
+			 unsigned int query_flags)
+{
+	struct inode *inode = d_inode(path->dentry);
+
+	generic_fillattr(idmap, request_mask, inode, stat);
+
+	/*
+	 * Inodes below INCFS_START_INO_RANGE get no verity/blocks handling;
+	 * presumably these are the pseudo files (.pending_reads etc.) —
+	 * TODO confirm against the inode-number allocation scheme.
+	 */
+	if (inode->i_ino < INCFS_START_INO_RANGE)
+		return 0;
+
+	stat->attributes &= ~STATX_ATTR_VERITY;
+	if (IS_VERITY(inode))
+		stat->attributes |= STATX_ATTR_VERITY;
+	stat->attributes_mask |= STATX_ATTR_VERITY;
+
+	if (request_mask & STATX_BLOCKS) {
+		struct kstat backing_kstat;
+		struct dentry_info *di = get_incfs_dentry(path->dentry);
+		int error = 0;
+		struct path *backing_path;
+
+		if (!di)
+			return -EFSCORRUPTED;
+		backing_path = &di->backing_path;
+		error = vfs_getattr(backing_path, &backing_kstat, STATX_BLOCKS,
+				    AT_STATX_SYNC_AS_STAT);
+		if (error)
+			return error;
+
+		/* Report the backing file's block usage as our own. */
+		stat->blocks = backing_kstat.blocks;
+	}
+
+	return 0;
+}
+
+/*
+ * ->getxattr.  For regular incfs files the request is forwarded to the
+ * backing file.  Pseudo files (no backing dentry) support exactly one
+ * attribute, "security.selinux", stored per mount in
+ * mount_info::pseudo_file_xattr.
+ *
+ * NOTE(review): a too-small buffer returns -E2BIG, whereas getxattr(2)
+ * convention is -ERANGE, and a size==0 "probe for length" call is not
+ * supported either.  Userspace-visible behavior — do not change silently.
+ */
+static ssize_t incfs_getxattr(struct dentry *d, const char *name,
+			void *value, size_t size)
+{
+	struct dentry_info *di = get_incfs_dentry(d);
+	struct mount_info *mi = get_mount_info(d->d_sb);
+	char *stored_value;
+	size_t stored_size;
+	int i;
+
+	if (di && di->backing_path.dentry)
+		return vfs_getxattr(&nop_mnt_idmap, di->backing_path.dentry, name, value, size);
+
+	if (strcmp(name, "security.selinux"))
+		return -ENODATA;
+
+	/* Identify which pseudo file this dentry refers to. */
+	for (i = 0; i < PSEUDO_FILE_COUNT; ++i)
+		if (!strcmp(d->d_iname, incfs_pseudo_file_names[i].data))
+			break;
+	if (i == PSEUDO_FILE_COUNT)
+		return -ENODATA;
+
+	stored_value = mi->pseudo_file_xattr[i].data;
+	stored_size = mi->pseudo_file_xattr[i].len;
+	if (!stored_value)
+		return -ENODATA;
+
+	if (stored_size > size)
+		return -E2BIG;
+
+	memcpy(value, stored_value, stored_size);
+	return stored_size;
+}
+
+
+/*
+ * ->setxattr.  Requests on regular incfs files are forwarded to the
+ * backing file.  Pseudo files accept only "security.selinux"; the value is
+ * kept per mount in mount_info::pseudo_file_xattr, one slot per pseudo file.
+ *
+ * The replacement buffer is allocated *before* the old one is released so
+ * an allocation failure leaves the previously stored value intact (the old
+ * code freed first and could lose the stored xattr on -ENOMEM).
+ */
+static ssize_t incfs_setxattr(struct mnt_idmap *idmap, struct dentry *d,
+			      const char *name, void *value, size_t size,
+			      int flags)
+{
+	struct dentry_info *di = get_incfs_dentry(d);
+	struct mount_info *mi = get_mount_info(d->d_sb);
+	u8 *new_value;
+	int i;
+
+	if (di && di->backing_path.dentry)
+		return vfs_setxattr(idmap, di->backing_path.dentry, name, value,
+				    size, flags);
+
+	if (strcmp(name, "security.selinux"))
+		return -ENODATA;
+
+	if (size > INCFS_MAX_FILE_ATTR_SIZE)
+		return -E2BIG;
+
+	/* Identify which pseudo file this dentry refers to. */
+	for (i = 0; i < PSEUDO_FILE_COUNT; ++i)
+		if (!strcmp(d->d_iname, incfs_pseudo_file_names[i].data))
+			break;
+	if (i == PSEUDO_FILE_COUNT)
+		return -ENODATA;
+
+	new_value = kzalloc(size, GFP_NOFS);
+	if (!new_value)
+		return -ENOMEM;
+	memcpy(new_value, value, size);
+
+	/* Swap in the new value only once it is fully built. */
+	kfree(mi->pseudo_file_xattr[i].data);
+	mi->pseudo_file_xattr[i].data = new_value;
+	mi->pseudo_file_xattr[i].len = size;
+	return 0;
+}
+
+/*
+ * ->listxattr: forwarded to the backing file.  Pseudo files (which have no
+ * backing dentry) report no attributes.
+ */
+static ssize_t incfs_listxattr(struct dentry *d, char *list, size_t size)
+{
+	struct dentry_info *di = get_incfs_dentry(d);
+
+	if (di && di->backing_path.dentry)
+		return vfs_listxattr(di->backing_path.dentry, list, size);
+
+	return -ENODATA;
+}
+
+/*
+ * Mount entry point.  Creates an anonymous superblock, resolves the backing
+ * directory named by dev_name, allocates the mount_info, sets up the .index
+ * and .incomplete helper directories, and installs the root dentry.
+ *
+ * Fixes over the previous version: the duplicated !dev_name check is gone,
+ * and a kern_path() success that yields a NULL/negative dentry now sets a
+ * real error code (-ENOENT) and drops the path reference instead of leaking
+ * it and potentially returning ERR_PTR(0).
+ */
+struct dentry *incfs_mount_fs(struct file_system_type *type, int flags,
+			      const char *dev_name, void *data)
+{
+	struct mount_options options = {};
+	struct mount_info *mi = NULL;
+	struct path backing_dir_path = {};
+	struct dentry *index_dir = NULL;
+	struct dentry *incomplete_dir = NULL;
+	struct super_block *src_fs_sb = NULL;
+	struct inode *root_inode = NULL;
+	struct super_block *sb = sget(type, NULL, set_anon_super, flags, NULL);
+	bool dir_created = false;
+	int error = 0;
+
+	if (IS_ERR(sb))
+		return ERR_CAST(sb);
+
+	sb->s_op = &incfs_super_ops;
+	sb->s_d_op = &incfs_dentry_ops;
+	sb->s_flags |= S_NOATIME;
+	sb->s_magic = INCFS_MAGIC_NUMBER;
+	sb->s_time_gran = 1;
+	sb->s_blocksize = INCFS_DATA_FILE_BLOCK_SIZE;
+	sb->s_blocksize_bits = blksize_bits(sb->s_blocksize);
+	sb->s_xattr = incfs_xattr_ops;
+
+	BUILD_BUG_ON(PAGE_SIZE != INCFS_DATA_FILE_BLOCK_SIZE);
+
+	if (!dev_name) {
+		pr_err("incfs: Backing dir is not set, filesystem can't be mounted.\n");
+		error = -ENOENT;
+		goto err_deactivate;
+	}
+
+	error = parse_options(&options, (char *)data);
+	if (error != 0) {
+		pr_err("incfs: Options parsing error. %d\n", error);
+		goto err_deactivate;
+	}
+
+	sb->s_bdi->ra_pages = options.readahead_pages;
+
+	error = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY,
+			&backing_dir_path);
+	if (error) {
+		pr_err("incfs: Error accessing: %s.\n", dev_name);
+		goto err_free_opts;
+	}
+
+	/*
+	 * kern_path() succeeded, so from here on the path reference must be
+	 * dropped on every error path.  A NULL/negative dentry means the
+	 * backing dir disappeared underneath us.
+	 */
+	if (!backing_dir_path.dentry ||
+	    !d_really_is_positive(backing_dir_path.dentry)) {
+		pr_err("incfs: Error accessing: %s.\n", dev_name);
+		error = -ENOENT;
+		goto err_put_path;
+	}
+
+	src_fs_sb = backing_dir_path.dentry->d_sb;
+	sb->s_maxbytes = src_fs_sb->s_maxbytes;
+	sb->s_stack_depth = src_fs_sb->s_stack_depth + 1;
+
+	if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) {
+		error = -EINVAL;
+		goto err_put_path;
+	}
+
+	mi = incfs_alloc_mount_info(sb, &options, &backing_dir_path);
+	if (IS_ERR_OR_NULL(mi)) {
+		error = PTR_ERR(mi);
+		pr_err("incfs: Error allocating mount info. %d\n", error);
+		goto err_put_path;
+	}
+
+	/*
+	 * NOTE(review): once mi_backing_dir_path is set below, the
+	 * err_put_path label and incfs_free_mount_info() (reached via
+	 * deactivate_locked_super -> incfs_kill_sb) could both drop the same
+	 * path reference — confirm incfs_free_mount_info() does not
+	 * path_put() mi_backing_dir_path.
+	 */
+	sb->s_fs_info = mi;
+	mi->mi_backing_dir_path = backing_dir_path;
+	index_dir = open_or_create_special_dir(backing_dir_path.dentry,
+					       INCFS_INDEX_NAME, &dir_created);
+	if (IS_ERR_OR_NULL(index_dir)) {
+		error = PTR_ERR(index_dir);
+		pr_err("incfs: Can't find or create .index dir in %s\n",
+			dev_name);
+		/* No need to null index_dir since we don't put it */
+		goto err_put_path;
+	}
+
+	mi->mi_index_dir = index_dir;
+	mi->mi_index_free = dir_created;
+
+	incomplete_dir = open_or_create_special_dir(backing_dir_path.dentry,
+						    INCFS_INCOMPLETE_NAME,
+						    &dir_created);
+	if (IS_ERR_OR_NULL(incomplete_dir)) {
+		error = PTR_ERR(incomplete_dir);
+		pr_err("incfs: Can't find or create .incomplete dir in %s\n",
+			dev_name);
+		/* No need to null incomplete_dir since we don't put it */
+		goto err_put_path;
+	}
+	mi->mi_incomplete_dir = incomplete_dir;
+	mi->mi_incomplete_free = dir_created;
+
+	root_inode = fetch_regular_inode(sb, backing_dir_path.dentry);
+	if (IS_ERR(root_inode)) {
+		error = PTR_ERR(root_inode);
+		goto err_put_path;
+	}
+
+	sb->s_root = d_make_root(root_inode);
+	if (!sb->s_root) {
+		error = -ENOMEM;
+		goto err_put_path;
+	}
+	error = incfs_init_dentry(sb->s_root, &backing_dir_path);
+	if (error)
+		goto err_put_path;
+
+	path_put(&backing_dir_path);
+	sb->s_flags |= SB_ACTIVE;
+
+	pr_debug("incfs: mount\n");
+	return dget(sb->s_root);
+
+err_put_path:
+	path_put(&backing_dir_path);
+err_free_opts:
+	free_options(&options);
+err_deactivate:
+	deactivate_locked_super(sb);
+	pr_err("incfs: mount failed %d\n", error);
+	return ERR_PTR(error);
+}
+
+/*
+ * ->remount_fs: re-parse the option string and apply the new values via
+ * incfs_realloc_mount_info().  report_uid changes the layout of the
+ * pending-read records, so flipping it on a live mount is rejected.
+ */
+static int incfs_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+	struct mount_options options;
+	struct mount_info *mi = get_mount_info(sb);
+	int err = 0;
+
+	sync_filesystem(sb);
+	err = parse_options(&options, (char *)data);
+	if (err)
+		return err;
+
+	if (options.report_uid != mi->mi_options.report_uid) {
+		pr_err("incfs: Can't change report_uid mount option on remount\n");
+		err = -EOPNOTSUPP;
+		goto out;
+	}
+
+	err = incfs_realloc_mount_info(mi, &options);
+	if (err)
+		goto out;
+
+	pr_debug("incfs: remount\n");
+
+out:
+	free_options(&options);
+	return err;
+}
+
+/*
+ * ->kill_sb: unmount teardown.  After the generic teardown, the .index and
+ * .incomplete helper directories are removed — but only if this mount
+ * created them (mi_index_free / mi_incomplete_free).  The vfs_rmdir()
+ * results are deliberately ignored: removal is best effort.
+ */
+void incfs_kill_sb(struct super_block *sb)
+{
+	struct mount_info *mi = sb->s_fs_info;
+	struct inode *dinode = NULL;
+
+	pr_debug("incfs: unmount\n");
+
+	/*
+	 * We must kill the super before freeing mi, since killing the super
+	 * triggers inode eviction, which triggers the final update of the
+	 * backing file, which uses certain information for mi
+	 */
+	kill_anon_super(sb);
+
+	if (mi) {
+		if (mi->mi_backing_dir_path.dentry)
+			dinode = d_inode(mi->mi_backing_dir_path.dentry);
+
+		if (dinode) {
+			if (mi->mi_index_dir && mi->mi_index_free)
+				vfs_rmdir(&nop_mnt_idmap, dinode,
+					  mi->mi_index_dir);
+
+			if (mi->mi_incomplete_dir && mi->mi_incomplete_free)
+				vfs_rmdir(&nop_mnt_idmap, dinode,
+					  mi->mi_incomplete_dir);
+		}
+
+		incfs_free_mount_info(mi);
+		sb->s_fs_info = NULL;
+	}
+}
+
+/*
+ * ->show_options: emit the active mount options for /proc/mounts.
+ * Keep the option names in sync with parse_options(); this output is
+ * parsed by userspace.
+ */
+static int show_options(struct seq_file *m, struct dentry *root)
+{
+	struct mount_info *mi = get_mount_info(root->d_sb);
+
+	seq_printf(m, ",read_timeout_ms=%u", mi->mi_options.read_timeout_ms);
+	seq_printf(m, ",readahead=%u", mi->mi_options.readahead_pages);
+	/* Read-log options are only meaningful when the log is enabled. */
+	if (mi->mi_options.read_log_pages != 0) {
+		seq_printf(m, ",rlog_pages=%u", mi->mi_options.read_log_pages);
+		seq_printf(m, ",rlog_wakeup_cnt=%u",
+			   mi->mi_options.read_log_wakeup_count);
+	}
+	if (mi->mi_options.report_uid)
+		seq_puts(m, ",report_uid");
+
+	if (mi->mi_sysfs_node)
+		seq_printf(m, ",sysfs_name=%s",
+			   kobject_name(&mi->mi_sysfs_node->isn_sysfs_node));
+	return 0;
+}
diff --git a/fs/incfs/vfs.h b/fs/incfs/vfs.h
new file mode 100644
--- /dev/null
+++ b/fs/incfs/vfs.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2018 Google LLC
+ */
+
+#ifndef _INCFS_VFS_H
+#define _INCFS_VFS_H
+
+/* Operations for regular incfs files (defined in vfs.c). */
+extern const struct file_operations incfs_file_ops;
+extern const struct inode_operations incfs_file_inode_ops;
+
+/* Superblock teardown; best-effort removal of helper dirs created at mount. */
+void incfs_kill_sb(struct super_block *sb);
+/* Mount entry point; dev_name names the backing directory. */
+struct dentry *incfs_mount_fs(struct file_system_type *type, int flags,
+			      const char *dev_name, void *data);
+int incfs_link(struct dentry *what, struct dentry *where);
+int incfs_unlink(struct dentry *dentry);
+
+/*
+ * Fetch the incfs mount_info stored in sb->s_fs_info.
+ * Warns — but still returns NULL — if it has not been set up yet.
+ */
+static inline struct mount_info *get_mount_info(struct super_block *sb)
+{
+	struct mount_info *result = sb->s_fs_info;
+
+	WARN_ON(!result);
+	return result;
+}
+
+/* Superblock owning the inode behind an open file. */
+static inline struct super_block *file_superblock(struct file *f)
+{
+	return file_inode(f)->i_sb;
+}
+
+#endif
diff --git a/include/uapi/linux/incrementalfs.h b/include/uapi/linux/incrementalfs.h
new file mode 100644
--- /dev/null
+++ b/include/uapi/linux/incrementalfs.h
@@ -0,0 +1,590 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Userspace interface for Incremental FS.
+ *
+ * Incremental FS is special-purpose Linux virtual file system that allows
+ * execution of a program while its binary and resource files are still being
+ * lazily downloaded over the network, USB etc.
+ *
+ * Copyright 2019 Google LLC
+ */
+#ifndef _UAPI_LINUX_INCREMENTALFS_H
+#define _UAPI_LINUX_INCREMENTALFS_H
+
+#include <linux/limits.h>
+#include <linux/ioctl.h>
+#include <linux/types.h>
+#include <linux/xattr.h>
+
+/* ===== constants ===== */
+#define INCFS_NAME "incremental-fs"
+
+/*
+ * Magic number used in file header and in memory superblock
+ * Note that it is a 5 byte unsigned long. Thus on 32 bit kernels, it is
+ * truncated to a 4 byte number
+ * (The five bytes 0x49 0x4e 0x43 0x46 0x53 spell "INCFS" when read
+ * least-significant byte first.)
+ */
+#define INCFS_MAGIC_NUMBER (0x5346434e49ul & ULONG_MAX)
+
+#define INCFS_DATA_FILE_BLOCK_SIZE 4096
+#define INCFS_HEADER_VER 1
+
+/* TODO: This value is assumed in incfs_copy_signature_info_from_user to be the
+ * actual signature length. Set back to 64 when fixed.
+ */
+#define INCFS_MAX_HASH_SIZE 32
+#define INCFS_MAX_FILE_ATTR_SIZE 512
+
+/* Names of the special files/dirs incfs maintains in the backing dir. */
+#define INCFS_INDEX_NAME ".index"
+#define INCFS_INCOMPLETE_NAME ".incomplete"
+#define INCFS_PENDING_READS_FILENAME ".pending_reads"
+#define INCFS_LOG_FILENAME ".log"
+#define INCFS_BLOCKS_WRITTEN_FILENAME ".blocks_written"
+#define INCFS_XATTR_ID_NAME (XATTR_USER_PREFIX "incfs.id")
+#define INCFS_XATTR_SIZE_NAME (XATTR_USER_PREFIX "incfs.size")
+#define INCFS_XATTR_METADATA_NAME (XATTR_USER_PREFIX "incfs.metadata")
+#define INCFS_XATTR_VERITY_NAME (XATTR_USER_PREFIX "incfs.verity")
+
+/*
+ * NOTE(review): 8096 looks like a typo for 8192, but this value is user
+ * ABI and must not be changed.
+ */
+#define INCFS_MAX_SIGNATURE_SIZE 8096
+#define INCFS_SIGNATURE_VERSION 2
+#define INCFS_SIGNATURE_SECTIONS 2
+
+#define INCFS_IOCTL_BASE_CODE 'g'
+
+/* ===== ioctl requests on the command dir ===== */
+
+/*
+ * NOTE(review): several of these use _IOR/_IOW with the direction bit
+ * opposite to the actual data flow (e.g. FILL_BLOCKS is _IOR although data
+ * flows user->kernel).  The encoded numbers are frozen user ABI — do not
+ * "fix" the direction macros.
+ */
+
+/*
+ * Create a new file
+ * May only be called on .pending_reads file
+ */
+#define INCFS_IOC_CREATE_FILE \
+	_IOWR(INCFS_IOCTL_BASE_CODE, 30, struct incfs_new_file_args)
+
+/* Read file signature */
+#define INCFS_IOC_READ_FILE_SIGNATURE \
+	_IOR(INCFS_IOCTL_BASE_CODE, 31, struct incfs_get_file_sig_args)
+
+/*
+ * Fill in one or more data block. This may only be called on a handle
+ * passed as a parameter to INCFS_IOC_PERMIT_FILLING
+ *
+ * Returns number of blocks filled in, or error if none were
+ */
+#define INCFS_IOC_FILL_BLOCKS \
+	_IOR(INCFS_IOCTL_BASE_CODE, 32, struct incfs_fill_blocks)
+
+/*
+ * Permit INCFS_IOC_FILL_BLOCKS on the given file descriptor
+ * May only be called on .pending_reads file
+ *
+ * Returns 0 on success or error
+ */
+#define INCFS_IOC_PERMIT_FILL \
+	_IOW(INCFS_IOCTL_BASE_CODE, 33, struct incfs_permit_fill)
+
+/*
+ * Fills buffer with ranges of populated blocks
+ *
+ * Returns 0 if all ranges written
+ * error otherwise
+ *
+ * Either way, range_buffer_size_out is set to the number
+ * of bytes written. Should be set to 0 by caller. The ranges
+ * filled are valid, but if an error was returned there might
+ * be more ranges to come.
+ *
+ * Ranges are ranges of filled blocks:
+ *
+ * 1 2 7 9
+ *
+ * means blocks 1, 2, 7, 8, 9 are filled, 0, 3, 4, 5, 6 and 10 on
+ * are not
+ *
+ * If hashing is enabled for the file, the hash blocks are simply
+ * treated as though they immediately followed the data blocks.
+ */
+#define INCFS_IOC_GET_FILLED_BLOCKS \
+	_IOR(INCFS_IOCTL_BASE_CODE, 34, struct incfs_get_filled_blocks_args)
+
+/*
+ * Creates a new mapped file
+ * May only be called on .pending_reads file
+ */
+#define INCFS_IOC_CREATE_MAPPED_FILE \
+	_IOWR(INCFS_IOCTL_BASE_CODE, 35, struct incfs_create_mapped_file_args)
+
+/*
+ * Get number of blocks, total and filled
+ * May only be called on .pending_reads file
+ */
+#define INCFS_IOC_GET_BLOCK_COUNT \
+	_IOR(INCFS_IOCTL_BASE_CODE, 36, struct incfs_get_block_count_args)
+
+/*
+ * Get per UID read timeouts
+ * May only be called on .pending_reads file
+ */
+#define INCFS_IOC_GET_READ_TIMEOUTS \
+	_IOR(INCFS_IOCTL_BASE_CODE, 37, struct incfs_get_read_timeouts_args)
+
+/*
+ * Set per UID read timeouts
+ * May only be called on .pending_reads file
+ */
+#define INCFS_IOC_SET_READ_TIMEOUTS \
+	_IOW(INCFS_IOCTL_BASE_CODE, 38, struct incfs_set_read_timeouts_args)
+
+/*
+ * Get last read error
+ * May only be called on .pending_reads file
+ */
+#define INCFS_IOC_GET_LAST_READ_ERROR \
+	_IOW(INCFS_IOCTL_BASE_CODE, 39, struct incfs_get_last_read_error_args)
+
+/* ===== sysfs feature flags ===== */
+/*
+ * Each flag is represented by a file in /sys/fs/incremental-fs/features
+ * If the file exists the feature is supported
+ * Also the file contents will be the line "supported"
+ */
+
+/*
+ * Basic flag stating that the core incfs file system is available
+ */
+#define INCFS_FEATURE_FLAG_COREFS "corefs"
+
+/*
+ * zstd compression support
+ */
+#define INCFS_FEATURE_FLAG_ZSTD "zstd"
+
+/*
+ * v2 feature set support. Covers:
+ *   INCFS_IOC_CREATE_MAPPED_FILE
+ *   INCFS_IOC_GET_BLOCK_COUNT
+ *   INCFS_IOC_GET_READ_TIMEOUTS/INCFS_IOC_SET_READ_TIMEOUTS
+ *   .blocks_written status file
+ *   .incomplete folder
+ *   report_uid mount option
+ */
+#define INCFS_FEATURE_FLAG_V2 "v2"
+
+/* Compression algorithm of a block (incfs_fill_block.compression). */
+enum incfs_compression_alg {
+	COMPRESSION_NONE = 0,
+	COMPRESSION_LZ4 = 1,
+	COMPRESSION_ZSTD = 2,
+};
+
+/* Flags for incfs_fill_block.flags. */
+enum incfs_block_flags {
+	INCFS_BLOCK_FLAGS_NONE = 0,
+	INCFS_BLOCK_FLAGS_HASH = 1,
+};
+
+/* 128-bit file identifier; 8-byte aligned for embedding in ioctl structs. */
+typedef struct {
+	__u8 bytes[16];
+} incfs_uuid_t __attribute__((aligned (8)));
+
+/*
+ * Description of a pending read. A pending read - a read call by
+ * a userspace program for which the filesystem currently doesn't have data.
+ *
+ * Reads from .pending_reads and .log return an array of these structure
+ */
+struct incfs_pending_read_info {
+	/* Id of a file that is being read from. */
+	incfs_uuid_t file_id;
+
+	/* A number of microseconds since system boot to the read. */
+	__aligned_u64 timestamp_us;
+
+	/* Index of a file block that is being read. */
+	__u32 block_index;
+
+	/* A serial number of this pending read. */
+	__u32 serial_number;
+};
+
+/*
+ * Description of a pending read. A pending read - a read call by
+ * a userspace program for which the filesystem currently doesn't have data.
+ *
+ * This version of incfs_pending_read_info is used whenever the file system is
+ * mounted with the report_uid flag; it extends the base record with the
+ * reader's UID.
+ */
+struct incfs_pending_read_info2 {
+	/* Id of a file that is being read from. */
+	incfs_uuid_t file_id;
+
+	/* A number of microseconds since system boot to the read. */
+	__aligned_u64 timestamp_us;
+
+	/* Index of a file block that is being read. */
+	__u32 block_index;
+
+	/* A serial number of this pending read. */
+	__u32 serial_number;
+
+	/* The UID of the reading process */
+	__u32 uid;
+
+	/* Presumably padding to keep the struct an 8-byte multiple. */
+	__u32 reserved;
+};
+
+/*
+ * Description of a data or hash block to add to a data file.
+ */
+struct incfs_fill_block {
+	/* Index of a data block. */
+	__u32 block_index;
+
+	/* Length of data */
+	__u32 data_len;
+
+	/*
+	 * A pointer to an actual data for the block.
+	 *
+	 * Equivalent to: __u8 *data;
+	 */
+	__aligned_u64 data;
+
+	/*
+	 * Compression algorithm used to compress the data block.
+	 * Values from enum incfs_compression_alg.
+	 */
+	__u8 compression;
+
+	/* Values from enum incfs_block_flags */
+	__u8 flags;
+
+	/* Reserved fields must be zero. */
+	__u16 reserved1;
+
+	__u32 reserved2;
+
+	__aligned_u64 reserved3;
+};
+
+/*
+ * Description of a number of blocks to add to a data file
+ *
+ * Argument for INCFS_IOC_FILL_BLOCKS
+ */
+struct incfs_fill_blocks {
+	/* Number of blocks */
+	__u64 count;
+
+	/* A pointer to an array of incfs_fill_block structs */
+	__aligned_u64 fill_blocks;
+};
+
+/*
+ * Permit INCFS_IOC_FILL_BLOCKS on the given file descriptor
+ * May only be called on .pending_reads file
+ *
+ * Argument for INCFS_IOC_PERMIT_FILL
+ */
+struct incfs_permit_fill {
+	/* File to permit fills on */
+	__u32 file_descriptor;
+};
+
+/* Hash algorithm used for a file's hash tree. */
+enum incfs_hash_tree_algorithm {
+	INCFS_HASH_TREE_NONE = 0,
+	INCFS_HASH_TREE_SHA256 = 1
+};
+
+/*
+ * Create a new file or directory.
+ * Argument for INCFS_IOC_CREATE_FILE.
+ */
+struct incfs_new_file_args {
+	/* Id of a file to create. */
+	incfs_uuid_t file_id;
+
+	/*
+	 * Total size of the new file. Ignored if S_ISDIR(mode).
+	 */
+	__aligned_u64 size;
+
+	/*
+	 * File mode. Permissions and dir flag.
+	 */
+	__u16 mode;
+
+	__u16 reserved1;
+
+	__u32 reserved2;
+
+	/*
+	 * A pointer to a null-terminated relative path to the file's parent
+	 * dir.
+	 * Max length: PATH_MAX
+	 *
+	 * Equivalent to: char *directory_path;
+	 */
+	__aligned_u64 directory_path;
+
+	/*
+	 * A pointer to a null-terminated file's name.
+	 * Max length: PATH_MAX
+	 *
+	 * Equivalent to: char *file_name;
+	 */
+	__aligned_u64 file_name;
+
+	/*
+	 * A pointer to a file attribute to be set on creation.
+	 *
+	 * Equivalent to: u8 *file_attr;
+	 */
+	__aligned_u64 file_attr;
+
+	/*
+	 * Length of the data buffer specified by file_attr.
+	 * Max value: INCFS_MAX_FILE_ATTR_SIZE
+	 */
+	__u32 file_attr_len;
+
+	/* Note the gaps in reserved numbering (no 3/5): fields were removed. */
+	__u32 reserved4;
+
+	/*
+	 * Points to an APK V4 Signature data blob
+	 * Signature must have two sections
+	 * Format is:
+	 *	u32 version
+	 *	u32 size_of_hash_info_section
+	 *	u8 hash_info_section[]
+	 *	u32 size_of_signing_info_section
+	 *	u8 signing_info_section[]
+	 *
+	 * Note that incfs does not care about what is in signing_info_section
+	 *
+	 * hash_info_section has following format:
+	 *	u32 hash_algorithm; // Must be SHA256 == 1
+	 *	u8 log2_blocksize;  // Must be 12 for 4096 byte blocks
+	 *	u32 salt_size;
+	 *	u8 salt[];
+	 *	u32 hash_size;
+	 *	u8 root_hash[];
+	 */
+	__aligned_u64 signature_info;
+
+	/* Size of signature_info */
+	__aligned_u64 signature_size;
+
+	__aligned_u64 reserved6;
+};
+
+/*
+ * Request a digital signature blob for a given file.
+ * Argument for INCFS_IOC_READ_FILE_SIGNATURE ioctl
+ */
+struct incfs_get_file_sig_args {
+	/*
+	 * A pointer to the data buffer to save a signature blob to.
+	 *
+	 * Equivalent to: u8 *file_signature;
+	 */
+	__aligned_u64 file_signature;
+
+	/* Size of the buffer at file_signature. */
+	__u32 file_signature_buf_size;
+
+	/*
+	 * Number of bytes written to the file_signature buffer.
+	 * Set by the kernel when the ioctl completes.
+	 */
+	__u32 file_signature_len_out;
+};
+
+/* Half-open block range [begin, end) used by INCFS_IOC_GET_FILLED_BLOCKS. */
+struct incfs_filled_range {
+	__u32 begin;
+	__u32 end;
+};
+
+/*
+ * Request ranges of filled blocks
+ * Argument for INCFS_IOC_GET_FILLED_BLOCKS
+ */
+struct incfs_get_filled_blocks_args {
+	/*
+	 * A buffer to populate with ranges of filled blocks
+	 *
+	 * Equivalent to struct incfs_filled_ranges *range_buffer
+	 */
+	__aligned_u64 range_buffer;
+
+	/* Size of range_buffer in bytes */
+	__u32 range_buffer_size;
+
+	/* Start index to read from */
+	__u32 start_index;
+
+	/*
+	 * End index to read to. 0 means read to end. This is a range,
+	 * so incfs will read from start_index to end_index - 1
+	 */
+	__u32 end_index;
+
+	/* Actual number of blocks in file */
+	__u32 total_blocks_out;
+
+	/* The number of data blocks in file */
+	__u32 data_blocks_out;
+
+	/* Number of bytes written to range buffer */
+	__u32 range_buffer_size_out;
+
+	/* Sector scanned up to, if the call was interrupted */
+	__u32 index_out;
+};
+
+/*
+ * Create a new mapped file: a file that presents a window into an existing
+ * source file's data, starting at source_offset.
+ * Argument for INCFS_IOC_CREATE_MAPPED_FILE
+ */
+struct incfs_create_mapped_file_args {
+	/*
+	 * Total size of the new file.
+	 */
+	__aligned_u64 size;
+
+	/*
+	 * File mode. Permissions and dir flag.
+	 */
+	__u16 mode;
+
+	__u16 reserved1;
+
+	__u32 reserved2;
+
+	/*
+	 * A pointer to a null-terminated relative path to the incfs mount
+	 * point
+	 * Max length: PATH_MAX
+	 *
+	 * Equivalent to: char *directory_path;
+	 */
+	__aligned_u64 directory_path;
+
+	/*
+	 * A pointer to a null-terminated file name.
+	 * Max length: PATH_MAX
+	 *
+	 * Equivalent to: char *file_name;
+	 */
+	__aligned_u64 file_name;
+
+	/* Id of source file to map. */
+	incfs_uuid_t source_file_id;
+
+	/*
+	 * Offset in source file to start mapping. Must be a multiple of
+	 * INCFS_DATA_FILE_BLOCK_SIZE
+	 */
+	__aligned_u64 source_offset;
+};
+
+/*
+ * Get information about the blocks in this file
+ * Argument for INCFS_IOC_GET_BLOCK_COUNT
+ */
+struct incfs_get_block_count_args {
+	/* Total number of data blocks in the file */
+	__u32 total_data_blocks_out;
+
+	/* Number of filled data blocks in the file */
+	__u32 filled_data_blocks_out;
+
+	/* Total number of hash blocks in the file */
+	__u32 total_hash_blocks_out;
+
+	/* Number of filled hash blocks in the file */
+	__u32 filled_hash_blocks_out;
+};
+
+/* Description of timeouts for one UID */
+struct incfs_per_uid_read_timeouts {
+	/* UID to apply these timeouts to */
+	__u32 uid;
+
+	/*
+	 * Min time in microseconds to read any block. Note that this doesn't
+	 * apply to reads which are satisfied from the page cache.
+	 */
+	__u32 min_time_us;
+
+	/*
+	 * Min time in microseconds to satisfy a pending read. Any pending read
+	 * which is filled before this time will be delayed so that the total
+	 * read time >= this value.
+	 */
+	__u32 min_pending_time_us;
+
+	/*
+	 * Max time in microseconds to satisfy a pending read before the read
+	 * times out. If set to U32_MAX, defaults to mount options
+	 * read_timeout_ms * 1000. Must be >= min_pending_time_us
+	 */
+	__u32 max_pending_time_us;
+};
+
+/*
+ * Get the read timeouts array
+ * Argument for INCFS_IOC_GET_READ_TIMEOUTS
+ */
+struct incfs_get_read_timeouts_args {
+	/*
+	 * A pointer to a buffer to fill with the current timeouts
+	 *
+	 * Equivalent to struct incfs_per_uid_read_timeouts *
+	 */
+	__aligned_u64 timeouts_array;
+
+	/* Size of above buffer in bytes */
+	__u32 timeouts_array_size;
+
+	/* Size used in bytes, or size needed if -ENOMEM returned */
+	__u32 timeouts_array_size_out;
+};
+
+/*
+ * Set the read timeouts array
+ * Arguments for INCFS_IOC_SET_READ_TIMEOUTS
+ */
+struct incfs_set_read_timeouts_args {
+	/*
+	 * A pointer to an array containing the new timeouts
+	 * This will replace any existing timeouts
+	 *
+	 * Equivalent to struct incfs_per_uid_read_timeouts *
+	 */
+	__aligned_u64 timeouts_array;
+
+	/* Size of above array in bytes. Must be < 256 */
+	__u32 timeouts_array_size;
+};
+
+/*
+ * Get last read error struct
+ * Arguments for INCFS_IOC_GET_LAST_READ_ERROR
+ */
+struct incfs_get_last_read_error_args {
+	/* File id of last file that had a read error */
+	incfs_uuid_t file_id_out;
+
+	/* Time of last read error, in us, from CLOCK_MONOTONIC */
+	__u64 time_us_out;
+
+	/* Index of page that was being read at last read error */
+	__u32 page_out;
+
+	/* errno of last read error */
+	__u32 errno_out;
+
+	/* uid of last read error */
+	__u32 uid_out;
+
+	/* Reserved for future use; must be zero. */
+	__u32 reserved1;
+	__u64 reserved2;
+};
+
+#endif /* _UAPI_LINUX_INCREMENTALFS_H */