blob: 89712c2ebc78207e4e21c840baa84461c3147455 [file] [log] [blame]
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Eugene Zemtsov <ezemtsov@google.com>
Date: Mon, 18 Nov 2019 20:21:06 -0800
Subject: ANDROID: Initial commit of Incremental FS
Fully working incremental fs filesystem
[CPNOTE: 20/07/21] Lee: Asked Paul to open an OoT bug to follow progress
Bug: 133435829
Signed-off-by: Eugene Zemtsov <ezemtsov@google.com>
Signed-off-by: Paul Lawrence <paullawrence@google.com>
[Lee: Squashed all subsequent changes into this initial patch]
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Change-Id: I02cce0b654d0ef74de0a190d30907410b23ab160
Signed-off-by: Lee Jones <joneslee@google.com>
---
Documentation/ABI/testing/sysfs-fs-incfs | 70 +
Documentation/filesystems/incfs.rst | 85 +
MAINTAINERS | 7 +
fs/Kconfig | 1 +
fs/Makefile | 1 +
fs/incfs/Kconfig | 15 +
fs/incfs/Makefile | 13 +
fs/incfs/data_mgmt.c | 1889 ++++++++++++++++++++
fs/incfs/data_mgmt.h | 551 ++++++
fs/incfs/format.c | 752 ++++++++
fs/incfs/format.h | 408 +++++
fs/incfs/integrity.c | 235 +++
fs/incfs/integrity.h | 56 +
fs/incfs/internal.h | 23 +
fs/incfs/main.c | 48 +
fs/incfs/pseudo_files.c | 1394 +++++++++++++++
fs/incfs/pseudo_files.h | 20 +
fs/incfs/sysfs.c | 205 +++
fs/incfs/sysfs.h | 22 +
fs/incfs/verity.c | 821 +++++++++
fs/incfs/verity.h | 49 +
fs/incfs/vfs.c | 1994 ++++++++++++++++++++++
fs/incfs/vfs.h | 33 +
include/uapi/linux/incrementalfs.h | 590 +++++++
24 files changed, 9282 insertions(+)
create mode 100644 Documentation/ABI/testing/sysfs-fs-incfs
create mode 100644 Documentation/filesystems/incfs.rst
create mode 100644 fs/incfs/Kconfig
create mode 100644 fs/incfs/Makefile
create mode 100644 fs/incfs/data_mgmt.c
create mode 100644 fs/incfs/data_mgmt.h
create mode 100644 fs/incfs/format.c
create mode 100644 fs/incfs/format.h
create mode 100644 fs/incfs/integrity.c
create mode 100644 fs/incfs/integrity.h
create mode 100644 fs/incfs/internal.h
create mode 100644 fs/incfs/main.c
create mode 100644 fs/incfs/pseudo_files.c
create mode 100644 fs/incfs/pseudo_files.h
create mode 100644 fs/incfs/sysfs.c
create mode 100644 fs/incfs/sysfs.h
create mode 100644 fs/incfs/verity.c
create mode 100644 fs/incfs/verity.h
create mode 100644 fs/incfs/vfs.c
create mode 100644 fs/incfs/vfs.h
create mode 100644 include/uapi/linux/incrementalfs.h
diff --git a/Documentation/ABI/testing/sysfs-fs-incfs b/Documentation/ABI/testing/sysfs-fs-incfs
new file mode 100644
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-fs-incfs
@@ -0,0 +1,70 @@
+What: /sys/fs/incremental-fs/features/corefs
+Date: 2019
+Contact: Paul Lawrence <paullawrence@google.com>
+Description: Reads 'supported'. Always present.
+
+What: /sys/fs/incremental-fs/features/v2
+Date: April 2021
+Contact: Paul Lawrence <paullawrence@google.com>
+Description: Reads 'supported'. Present if all v2 features of incfs are
+ supported.
+
+What: /sys/fs/incremental-fs/features/zstd
+Date: April 2021
+Contact: Paul Lawrence <paullawrence@google.com>
+Description: Reads 'supported'. Present if zstd compression is supported
+ for data blocks.
+
+What: /sys/fs/incremental-fs/features/bugfix_throttling
+Date: January 2023
+Contact: Paul Lawrence <paullawrence@google.com>
+Description: Reads 'supported'. Present if the throttling lock bug is fixed
+		https://android-review.googlesource.com/c/kernel/common/+/2381827
+
+What: /sys/fs/incremental-fs/instances/[name]
+Date: April 2021
+Contact: Paul Lawrence <paullawrence@google.com>
+Description: Folder created when incfs is mounted with the sysfs_name=[name]
+ option. If this option is used, the following values are created
+ in this folder.
+
+What: /sys/fs/incremental-fs/instances/[name]/reads_delayed_min
+Date: April 2021
+Contact: Paul Lawrence <paullawrence@google.com>
+Description: Returns a count of the number of reads that were delayed as a
+ result of the per UID read timeouts min time setting.
+
+What: /sys/fs/incremental-fs/instances/[name]/reads_delayed_min_us
+Date: April 2021
+Contact: Paul Lawrence <paullawrence@google.com>
+Description: Returns total delay time for all files since first mount as a
+ result of the per UID read timeouts min time setting.
+
+What: /sys/fs/incremental-fs/instances/[name]/reads_delayed_pending
+Date: April 2021
+Contact: Paul Lawrence <paullawrence@google.com>
+Description: Returns a count of the number of reads that were delayed as a
+ result of waiting for a pending read.
+
+What: /sys/fs/incremental-fs/instances/[name]/reads_delayed_pending_us
+Date: April 2021
+Contact: Paul Lawrence <paullawrence@google.com>
+Description: Returns total delay time for all files since first mount as a
+ result of waiting for a pending read.
+
+What: /sys/fs/incremental-fs/instances/[name]/reads_failed_hash_verification
+Date: April 2021
+Contact: Paul Lawrence <paullawrence@google.com>
+Description: Returns number of reads that failed because of hash verification
+ failures.
+
+What: /sys/fs/incremental-fs/instances/[name]/reads_failed_other
+Date: April 2021
+Contact: Paul Lawrence <paullawrence@google.com>
+Description: Returns number of reads that failed for reasons other than
+ timing out or hash failures.
+
+What: /sys/fs/incremental-fs/instances/[name]/reads_failed_timed_out
+Date: April 2021
+Contact: Paul Lawrence <paullawrence@google.com>
+Description: Returns number of reads that timed out.
diff --git a/Documentation/filesystems/incfs.rst b/Documentation/filesystems/incfs.rst
new file mode 100644
--- /dev/null
+++ b/Documentation/filesystems/incfs.rst
@@ -0,0 +1,85 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=================================================
+incfs: A stacked incremental filesystem for Linux
+=================================================
+
+/sys/fs interface
+=================
+
+Please update Documentation/ABI/testing/sysfs-fs-incfs if you update this
+section.
+
+incfs creates the following files in /sys/fs.
+
+Features
+--------
+
+/sys/fs/incremental-fs/features/corefs
+ Reads 'supported'. Always present.
+
+/sys/fs/incremental-fs/features/v2
+ Reads 'supported'. Present if all v2 features of incfs are supported. These
+ are:
+ fs-verity support
+ inotify support
+    ioctls:
+ INCFS_IOC_SET_READ_TIMEOUTS
+ INCFS_IOC_GET_READ_TIMEOUTS
+ INCFS_IOC_GET_BLOCK_COUNT
+ INCFS_IOC_CREATE_MAPPED_FILE
+ .incomplete folder
+ .blocks_written pseudo file
+ report_uid mount option
+
+/sys/fs/incremental-fs/features/zstd
+ Reads 'supported'. Present if zstd compression is supported for data blocks.
+
+/sys/fs/incremental-fs/features/bugfix_throttling
+ Reads 'supported'. Present if the throttling lock bug is fixed
+
+Optional per mount
+------------------
+
+For each incfs mount, the mount option sysfs_name=[name] creates a /sys/fs
+node called:
+
+/sys/fs/incremental-fs/instances/[name]
+
+This will contain the following files:
+
+/sys/fs/incremental-fs/instances/[name]/reads_delayed_min
+ Returns a count of the number of reads that were delayed as a result of the
+ per UID read timeouts min time setting.
+
+/sys/fs/incremental-fs/instances/[name]/reads_delayed_min_us
+ Returns total delay time for all files since first mount as a result of the
+ per UID read timeouts min time setting.
+
+/sys/fs/incremental-fs/instances/[name]/reads_delayed_pending
+ Returns a count of the number of reads that were delayed as a result of
+ waiting for a pending read.
+
+/sys/fs/incremental-fs/instances/[name]/reads_delayed_pending_us
+ Returns total delay time for all files since first mount as a result of
+ waiting for a pending read.
+
+/sys/fs/incremental-fs/instances/[name]/reads_failed_hash_verification
+ Returns number of reads that failed because of hash verification failures.
+
+/sys/fs/incremental-fs/instances/[name]/reads_failed_other
+ Returns number of reads that failed for reasons other than timing out or
+ hash failures.
+
+/sys/fs/incremental-fs/instances/[name]/reads_failed_timed_out
+ Returns number of reads that timed out.
+
+For reads_delayed_*** settings, note that a file can count for both
+reads_delayed_min and reads_delayed_pending if incfs first waits for a pending
+read then has to wait further for the min time. In that case, the time spent
+waiting is split between reads_delayed_pending_us, which is increased by the
+time spent waiting for the pending read, and reads_delayed_min_us, which is
+increased by the remainder of the time spent waiting.
+
+Reads that timed out are not added to the reads_delayed_pending or the
+reads_delayed_pending_us counters.
diff --git a/MAINTAINERS b/MAINTAINERS
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10373,6 +10373,13 @@ F: Documentation/hwmon/ina2xx.rst
F: drivers/hwmon/ina2xx.c
F: include/linux/platform_data/ina2xx.h
+INCREMENTAL FILE SYSTEM
+M: Paul Lawrence <paullawrence@google.com>
+L: linux-unionfs@vger.kernel.org
+S: Supported
+F: fs/incfs/
+F: tools/testing/selftests/filesystems/incfs/
+
INDEX OF FURTHER KERNEL DOCUMENTATION
M: Carlos Bilbao <carlos.bilbao@amd.com>
S: Maintained
diff --git a/fs/Kconfig b/fs/Kconfig
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -136,6 +136,7 @@ source "fs/quota/Kconfig"
source "fs/autofs/Kconfig"
source "fs/fuse/Kconfig"
source "fs/overlayfs/Kconfig"
+source "fs/incfs/Kconfig"
menu "Caches"
diff --git a/fs/Makefile b/fs/Makefile
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -106,6 +106,7 @@ obj-$(CONFIG_ADFS_FS) += adfs/
obj-$(CONFIG_FUSE_FS) += fuse/
obj-$(CONFIG_OVERLAY_FS) += overlayfs/
obj-$(CONFIG_ORANGEFS_FS) += orangefs/
+obj-$(CONFIG_INCREMENTAL_FS) += incfs/
obj-$(CONFIG_UDF_FS) += udf/
obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/
obj-$(CONFIG_OMFS_FS) += omfs/
diff --git a/fs/incfs/Kconfig b/fs/incfs/Kconfig
new file mode 100644
--- /dev/null
+++ b/fs/incfs/Kconfig
@@ -0,0 +1,15 @@
+config INCREMENTAL_FS
+ tristate "Incremental file system support"
+ depends on BLOCK
+ # incfs does not verify fsverity builtin signatures.
+ depends on !CONFIG_FS_VERITY_BUILTIN_SIGNATURES
+ select DECOMPRESS_LZ4
+ select DECOMPRESS_ZSTD
+ select CRYPTO_SHA256
+ help
+ Incremental FS is a read-only virtual file system that facilitates execution
+ of programs while their binaries are still being lazily downloaded over the
+ network, USB or pigeon post.
+
+ To compile this file system support as a module, choose M here: the
+ module will be called incrementalfs.
diff --git a/fs/incfs/Makefile b/fs/incfs/Makefile
new file mode 100644
--- /dev/null
+++ b/fs/incfs/Makefile
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_INCREMENTAL_FS) += incrementalfs.o
+
+incrementalfs-y := \
+ data_mgmt.o \
+ format.o \
+ integrity.o \
+ main.o \
+ pseudo_files.o \
+ sysfs.o \
+ vfs.o
+
+incrementalfs-$(CONFIG_FS_VERITY) += verity.o
diff --git a/fs/incfs/data_mgmt.c b/fs/incfs/data_mgmt.c
new file mode 100644
--- /dev/null
+++ b/fs/incfs/data_mgmt.c
@@ -0,0 +1,1889 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2019 Google LLC
+ */
+#include <linux/crc32.h>
+#include <linux/file.h>
+#include <linux/fsverity.h>
+#include <linux/gfp.h>
+#include <linux/kobject.h>
+#include <linux/ktime.h>
+#include <linux/lz4.h>
+#include <linux/mm.h>
+#include <linux/namei.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+
+#include "data_mgmt.h"
+#include "format.h"
+#include "integrity.h"
+#include "sysfs.h"
+#include "verity.h"
+
+static int incfs_scan_metadata_chain(struct data_file *df);
+
+static void log_wake_up_all(struct work_struct *work)
+{
+ struct delayed_work *dw = container_of(work, struct delayed_work, work);
+ struct read_log *rl = container_of(dw, struct read_log, ml_wakeup_work);
+ wake_up_all(&rl->ml_notif_wq);
+}
+
+static void zstd_free_workspace(struct work_struct *work)
+{
+ struct delayed_work *dw = container_of(work, struct delayed_work, work);
+ struct mount_info *mi =
+ container_of(dw, struct mount_info, mi_zstd_cleanup_work);
+
+ mutex_lock(&mi->mi_zstd_workspace_mutex);
+ kvfree(mi->mi_zstd_workspace);
+ mi->mi_zstd_workspace = NULL;
+ mi->mi_zstd_stream = NULL;
+ mutex_unlock(&mi->mi_zstd_workspace_mutex);
+}
+
+struct mount_info *incfs_alloc_mount_info(struct super_block *sb,
+ struct mount_options *options,
+ struct path *backing_dir_path)
+{
+ struct mount_info *mi = NULL;
+ int error = 0;
+ struct incfs_sysfs_node *node;
+
+ mi = kzalloc(sizeof(*mi), GFP_NOFS);
+ if (!mi)
+ return ERR_PTR(-ENOMEM);
+
+ mi->mi_sb = sb;
+ mi->mi_backing_dir_path = *backing_dir_path;
+ mi->mi_owner = get_current_cred();
+ path_get(&mi->mi_backing_dir_path);
+ mutex_init(&mi->mi_dir_struct_mutex);
+ init_waitqueue_head(&mi->mi_pending_reads_notif_wq);
+ init_waitqueue_head(&mi->mi_log.ml_notif_wq);
+ init_waitqueue_head(&mi->mi_blocks_written_notif_wq);
+ atomic_set(&mi->mi_blocks_written, 0);
+ INIT_DELAYED_WORK(&mi->mi_log.ml_wakeup_work, log_wake_up_all);
+ spin_lock_init(&mi->mi_log.rl_lock);
+ spin_lock_init(&mi->pending_read_lock);
+ INIT_LIST_HEAD(&mi->mi_reads_list_head);
+ spin_lock_init(&mi->mi_per_uid_read_timeouts_lock);
+ mutex_init(&mi->mi_zstd_workspace_mutex);
+ INIT_DELAYED_WORK(&mi->mi_zstd_cleanup_work, zstd_free_workspace);
+ mutex_init(&mi->mi_le_mutex);
+
+ node = incfs_add_sysfs_node(options->sysfs_name, mi);
+ if (IS_ERR(node)) {
+ error = PTR_ERR(node);
+ goto err;
+ }
+ mi->mi_sysfs_node = node;
+
+ error = incfs_realloc_mount_info(mi, options);
+ if (error)
+ goto err;
+
+ return mi;
+
+err:
+ incfs_free_mount_info(mi);
+ return ERR_PTR(error);
+}
+
+int incfs_realloc_mount_info(struct mount_info *mi,
+ struct mount_options *options)
+{
+ void *new_buffer = NULL;
+ void *old_buffer;
+ size_t new_buffer_size = 0;
+
+ if (options->read_log_pages != mi->mi_options.read_log_pages) {
+ struct read_log_state log_state;
+ /*
+ * Even though having two buffers allocated at once isn't
+ * usually good, allocating a multipage buffer under a spinlock
+ * is even worse, so let's optimize for the shorter lock
+ * duration. It's not end of the world if we fail to increase
+ * the buffer size anyway.
+ */
+ if (options->read_log_pages > 0) {
+ new_buffer_size = PAGE_SIZE * options->read_log_pages;
+ new_buffer = kzalloc(new_buffer_size, GFP_NOFS);
+ if (!new_buffer)
+ return -ENOMEM;
+ }
+
+ spin_lock(&mi->mi_log.rl_lock);
+ old_buffer = mi->mi_log.rl_ring_buf;
+ mi->mi_log.rl_ring_buf = new_buffer;
+ mi->mi_log.rl_size = new_buffer_size;
+ log_state = (struct read_log_state){
+ .generation_id = mi->mi_log.rl_head.generation_id + 1,
+ };
+ mi->mi_log.rl_head = log_state;
+ mi->mi_log.rl_tail = log_state;
+ spin_unlock(&mi->mi_log.rl_lock);
+
+ kfree(old_buffer);
+ }
+
+ if (options->sysfs_name && !mi->mi_sysfs_node)
+ mi->mi_sysfs_node = incfs_add_sysfs_node(options->sysfs_name,
+ mi);
+ else if (!options->sysfs_name && mi->mi_sysfs_node) {
+ incfs_free_sysfs_node(mi->mi_sysfs_node);
+ mi->mi_sysfs_node = NULL;
+ } else if (options->sysfs_name &&
+ strcmp(options->sysfs_name,
+ kobject_name(&mi->mi_sysfs_node->isn_sysfs_node))) {
+ incfs_free_sysfs_node(mi->mi_sysfs_node);
+ mi->mi_sysfs_node = incfs_add_sysfs_node(options->sysfs_name,
+ mi);
+ }
+
+ if (IS_ERR(mi->mi_sysfs_node)) {
+ int err = PTR_ERR(mi->mi_sysfs_node);
+
+ mi->mi_sysfs_node = NULL;
+ return err;
+ }
+
+ mi->mi_options = *options;
+ return 0;
+}
+
+void incfs_free_mount_info(struct mount_info *mi)
+{
+ int i;
+ if (!mi)
+ return;
+
+ flush_delayed_work(&mi->mi_log.ml_wakeup_work);
+ flush_delayed_work(&mi->mi_zstd_cleanup_work);
+
+ dput(mi->mi_index_dir);
+ dput(mi->mi_incomplete_dir);
+ path_put(&mi->mi_backing_dir_path);
+ mutex_destroy(&mi->mi_dir_struct_mutex);
+ mutex_destroy(&mi->mi_zstd_workspace_mutex);
+ put_cred(mi->mi_owner);
+ kfree(mi->mi_log.rl_ring_buf);
+ for (i = 0; i < ARRAY_SIZE(mi->pseudo_file_xattr); ++i)
+ kfree(mi->pseudo_file_xattr[i].data);
+ kfree(mi->mi_per_uid_read_timeouts);
+ incfs_free_sysfs_node(mi->mi_sysfs_node);
+ kfree(mi);
+}
+
+static void data_file_segment_init(struct data_file_segment *segment)
+{
+ init_waitqueue_head(&segment->new_data_arrival_wq);
+ init_rwsem(&segment->rwsem);
+ INIT_LIST_HEAD(&segment->reads_list_head);
+}
+
+char *file_id_to_str(incfs_uuid_t id)
+{
+ char *result = kmalloc(1 + sizeof(id.bytes) * 2, GFP_NOFS);
+ char *end;
+
+ if (!result)
+ return NULL;
+
+ end = bin2hex(result, id.bytes, sizeof(id.bytes));
+ *end = 0;
+ return result;
+}
+
+struct dentry *incfs_lookup_dentry(struct dentry *parent, const char *name)
+{
+ struct inode *inode;
+ struct dentry *result = NULL;
+
+ if (!parent)
+ return ERR_PTR(-EFAULT);
+
+ inode = d_inode(parent);
+ inode_lock_nested(inode, I_MUTEX_PARENT);
+ result = lookup_one_len(name, parent, strlen(name));
+ inode_unlock(inode);
+
+ if (IS_ERR(result))
+ pr_warn("%s err:%ld\n", __func__, PTR_ERR(result));
+
+ return result;
+}
+
+static struct data_file *handle_mapped_file(struct mount_info *mi,
+ struct data_file *df)
+{
+ char *file_id_str;
+ struct dentry *index_file_dentry;
+ struct path path;
+ struct file *bf;
+ struct data_file *result = NULL;
+ const struct cred *old_cred;
+
+ file_id_str = file_id_to_str(df->df_id);
+ if (!file_id_str)
+ return ERR_PTR(-ENOENT);
+
+ index_file_dentry = incfs_lookup_dentry(mi->mi_index_dir,
+ file_id_str);
+ kfree(file_id_str);
+ if (!index_file_dentry)
+ return ERR_PTR(-ENOENT);
+ if (IS_ERR(index_file_dentry))
+ return ERR_CAST(index_file_dentry);
+ if (!d_really_is_positive(index_file_dentry)) {
+ result = ERR_PTR(-ENOENT);
+ goto out;
+ }
+
+ path = (struct path) {
+ .mnt = mi->mi_backing_dir_path.mnt,
+ .dentry = index_file_dentry
+ };
+
+ old_cred = override_creds(mi->mi_owner);
+ bf = dentry_open(&path, O_RDWR | O_NOATIME | O_LARGEFILE,
+ current_cred());
+ revert_creds(old_cred);
+
+ if (IS_ERR(bf)) {
+ result = ERR_CAST(bf);
+ goto out;
+ }
+
+ result = incfs_open_data_file(mi, bf);
+ fput(bf);
+ if (IS_ERR(result))
+ goto out;
+
+ result->df_mapped_offset = df->df_metadata_off;
+
+out:
+ dput(index_file_dentry);
+ return result;
+}
+
+struct data_file *incfs_open_data_file(struct mount_info *mi, struct file *bf)
+{
+ struct data_file *df = NULL;
+ struct backing_file_context *bfc = NULL;
+ int md_records;
+ u64 size;
+ int error = 0;
+ int i;
+
+ if (!bf || !mi)
+ return ERR_PTR(-EFAULT);
+
+ if (!S_ISREG(bf->f_inode->i_mode))
+ return ERR_PTR(-EBADF);
+
+ bfc = incfs_alloc_bfc(mi, bf);
+ if (IS_ERR(bfc))
+ return ERR_CAST(bfc);
+
+ df = kzalloc(sizeof(*df), GFP_NOFS);
+ if (!df) {
+ error = -ENOMEM;
+ goto out;
+ }
+
+ mutex_init(&df->df_enable_verity);
+
+ df->df_backing_file_context = bfc;
+ df->df_mount_info = mi;
+ for (i = 0; i < ARRAY_SIZE(df->df_segments); i++)
+ data_file_segment_init(&df->df_segments[i]);
+
+ error = incfs_read_file_header(bfc, &df->df_metadata_off, &df->df_id,
+ &size, &df->df_header_flags);
+
+ if (error)
+ goto out;
+
+ df->df_size = size;
+ if (size > 0)
+ df->df_data_block_count = get_blocks_count_for_size(size);
+
+ if (df->df_header_flags & INCFS_FILE_MAPPED) {
+ struct data_file *mapped_df = handle_mapped_file(mi, df);
+
+ incfs_free_data_file(df);
+ return mapped_df;
+ }
+
+ md_records = incfs_scan_metadata_chain(df);
+ if (md_records < 0)
+ error = md_records;
+
+out:
+ if (error) {
+ incfs_free_bfc(bfc);
+ if (df)
+ df->df_backing_file_context = NULL;
+ incfs_free_data_file(df);
+ return ERR_PTR(error);
+ }
+ return df;
+}
+
+void incfs_free_data_file(struct data_file *df)
+{
+ u32 data_blocks_written, hash_blocks_written;
+
+ if (!df)
+ return;
+
+ data_blocks_written = atomic_read(&df->df_data_blocks_written);
+ hash_blocks_written = atomic_read(&df->df_hash_blocks_written);
+
+ if (data_blocks_written != df->df_initial_data_blocks_written ||
+ hash_blocks_written != df->df_initial_hash_blocks_written) {
+ struct backing_file_context *bfc = df->df_backing_file_context;
+ int error = -1;
+
+ if (bfc && !mutex_lock_interruptible(&bfc->bc_mutex)) {
+ error = incfs_write_status_to_backing_file(
+ df->df_backing_file_context,
+ df->df_status_offset,
+ data_blocks_written,
+ hash_blocks_written);
+ mutex_unlock(&bfc->bc_mutex);
+ }
+
+ if (error)
+ /* Nothing can be done, just warn */
+ pr_warn("incfs: failed to write status to backing file\n");
+ }
+
+ incfs_free_mtree(df->df_hash_tree);
+ incfs_free_bfc(df->df_backing_file_context);
+ kfree(df->df_signature);
+ kfree(df->df_verity_file_digest.data);
+ kfree(df->df_verity_signature);
+ mutex_destroy(&df->df_enable_verity);
+ kfree(df);
+}
+
+int make_inode_ready_for_data_ops(struct mount_info *mi,
+ struct inode *inode,
+ struct file *backing_file)
+{
+ struct inode_info *node = get_incfs_node(inode);
+ struct data_file *df = NULL;
+ int err = 0;
+
+ inode_lock(inode);
+ if (S_ISREG(inode->i_mode)) {
+ if (!node->n_file) {
+ df = incfs_open_data_file(mi, backing_file);
+
+ if (IS_ERR(df))
+ err = PTR_ERR(df);
+ else
+ node->n_file = df;
+ }
+ } else
+ err = -EBADF;
+ inode_unlock(inode);
+ return err;
+}
+
+struct dir_file *incfs_open_dir_file(struct mount_info *mi, struct file *bf)
+{
+ struct dir_file *dir = NULL;
+
+ if (!S_ISDIR(bf->f_inode->i_mode))
+ return ERR_PTR(-EBADF);
+
+ dir = kzalloc(sizeof(*dir), GFP_NOFS);
+ if (!dir)
+ return ERR_PTR(-ENOMEM);
+
+ dir->backing_dir = get_file(bf);
+ dir->mount_info = mi;
+ return dir;
+}
+
+void incfs_free_dir_file(struct dir_file *dir)
+{
+ if (!dir)
+ return;
+ if (dir->backing_dir)
+ fput(dir->backing_dir);
+ kfree(dir);
+}
+
+static ssize_t zstd_decompress_safe(struct mount_info *mi,
+ struct mem_range src, struct mem_range dst)
+{
+ ssize_t result;
+ ZSTD_inBuffer inbuf = {.src = src.data, .size = src.len};
+ ZSTD_outBuffer outbuf = {.dst = dst.data, .size = dst.len};
+
+ result = mutex_lock_interruptible(&mi->mi_zstd_workspace_mutex);
+ if (result)
+ return result;
+
+ if (!mi->mi_zstd_stream) {
+ unsigned int workspace_size = zstd_dstream_workspace_bound(
+ INCFS_DATA_FILE_BLOCK_SIZE);
+ void *workspace = kvmalloc(workspace_size, GFP_NOFS);
+ ZSTD_DStream *stream;
+
+ if (!workspace) {
+ result = -ENOMEM;
+ goto out;
+ }
+
+ stream = zstd_init_dstream(INCFS_DATA_FILE_BLOCK_SIZE, workspace,
+ workspace_size);
+ if (!stream) {
+ kvfree(workspace);
+ result = -EIO;
+ goto out;
+ }
+
+ mi->mi_zstd_workspace = workspace;
+ mi->mi_zstd_stream = stream;
+ }
+
+ result = zstd_decompress_stream(mi->mi_zstd_stream, &outbuf, &inbuf) ?
+ -EBADMSG : outbuf.pos;
+
+ mod_delayed_work(system_wq, &mi->mi_zstd_cleanup_work,
+ msecs_to_jiffies(5000));
+
+out:
+ mutex_unlock(&mi->mi_zstd_workspace_mutex);
+ return result;
+}
+
+static ssize_t decompress(struct mount_info *mi,
+ struct mem_range src, struct mem_range dst, int alg)
+{
+ int result;
+
+ switch (alg) {
+ case INCFS_BLOCK_COMPRESSED_LZ4:
+ result = LZ4_decompress_safe(src.data, dst.data, src.len,
+ dst.len);
+ if (result < 0)
+ return -EBADMSG;
+ return result;
+
+ case INCFS_BLOCK_COMPRESSED_ZSTD:
+ return zstd_decompress_safe(mi, src, dst);
+
+ default:
+ WARN_ON(true);
+ return -EOPNOTSUPP;
+ }
+}
+
+static void log_read_one_record(struct read_log *rl, struct read_log_state *rs)
+{
+ union log_record *record =
+ (union log_record *)((u8 *)rl->rl_ring_buf + rs->next_offset);
+ size_t record_size;
+
+ switch (record->full_record.type) {
+ case FULL:
+ rs->base_record = record->full_record;
+ record_size = sizeof(record->full_record);
+ break;
+
+ case SAME_FILE:
+ rs->base_record.block_index =
+ record->same_file.block_index;
+ rs->base_record.absolute_ts_us +=
+ record->same_file.relative_ts_us;
+ rs->base_record.uid = record->same_file.uid;
+ record_size = sizeof(record->same_file);
+ break;
+
+ case SAME_FILE_CLOSE_BLOCK:
+ rs->base_record.block_index +=
+ record->same_file_close_block.block_index_delta;
+ rs->base_record.absolute_ts_us +=
+ record->same_file_close_block.relative_ts_us;
+ record_size = sizeof(record->same_file_close_block);
+ break;
+
+ case SAME_FILE_CLOSE_BLOCK_SHORT:
+ rs->base_record.block_index +=
+ record->same_file_close_block_short.block_index_delta;
+ rs->base_record.absolute_ts_us +=
+ record->same_file_close_block_short.relative_ts_tens_us * 10;
+ record_size = sizeof(record->same_file_close_block_short);
+ break;
+
+ case SAME_FILE_NEXT_BLOCK:
+ ++rs->base_record.block_index;
+ rs->base_record.absolute_ts_us +=
+ record->same_file_next_block.relative_ts_us;
+ record_size = sizeof(record->same_file_next_block);
+ break;
+
+ case SAME_FILE_NEXT_BLOCK_SHORT:
+ ++rs->base_record.block_index;
+ rs->base_record.absolute_ts_us +=
+ record->same_file_next_block_short.relative_ts_tens_us * 10;
+ record_size = sizeof(record->same_file_next_block_short);
+ break;
+ }
+
+ rs->next_offset += record_size;
+ if (rs->next_offset > rl->rl_size - sizeof(*record)) {
+ rs->next_offset = 0;
+ ++rs->current_pass_no;
+ }
+ ++rs->current_record_no;
+}
+
+static void log_block_read(struct mount_info *mi, incfs_uuid_t *id,
+ int block_index)
+{
+ struct read_log *log = &mi->mi_log;
+ struct read_log_state *head, *tail;
+ s64 now_us;
+ s64 relative_us;
+ union log_record record;
+ size_t record_size;
+ uid_t uid = current_uid().val;
+ int block_delta;
+ bool same_file, same_uid;
+ bool next_block, close_block, very_close_block;
+ bool close_time, very_close_time, very_very_close_time;
+
+ /*
+ * This may read the old value, but it's OK to delay the logging start
+ * right after the configuration update.
+ */
+ if (READ_ONCE(log->rl_size) == 0)
+ return;
+
+ now_us = ktime_to_us(ktime_get());
+
+ spin_lock(&log->rl_lock);
+ if (log->rl_size == 0) {
+ spin_unlock(&log->rl_lock);
+ return;
+ }
+
+ head = &log->rl_head;
+ tail = &log->rl_tail;
+ relative_us = now_us - head->base_record.absolute_ts_us;
+
+ same_file = !memcmp(id, &head->base_record.file_id,
+ sizeof(incfs_uuid_t));
+ same_uid = uid == head->base_record.uid;
+
+ block_delta = block_index - head->base_record.block_index;
+ next_block = block_delta == 1;
+ very_close_block = block_delta >= S8_MIN && block_delta <= S8_MAX;
+ close_block = block_delta >= S16_MIN && block_delta <= S16_MAX;
+
+ very_very_close_time = relative_us < (1 << 5) * 10;
+ very_close_time = relative_us < (1 << 13);
+ close_time = relative_us < (1 << 16);
+
+ if (same_file && same_uid && next_block && very_very_close_time) {
+ record.same_file_next_block_short =
+ (struct same_file_next_block_short){
+ .type = SAME_FILE_NEXT_BLOCK_SHORT,
+ .relative_ts_tens_us = div_s64(relative_us, 10),
+ };
+ record_size = sizeof(struct same_file_next_block_short);
+ } else if (same_file && same_uid && next_block && very_close_time) {
+ record.same_file_next_block = (struct same_file_next_block){
+ .type = SAME_FILE_NEXT_BLOCK,
+ .relative_ts_us = relative_us,
+ };
+ record_size = sizeof(struct same_file_next_block);
+ } else if (same_file && same_uid && very_close_block &&
+ very_very_close_time) {
+ record.same_file_close_block_short =
+ (struct same_file_close_block_short){
+ .type = SAME_FILE_CLOSE_BLOCK_SHORT,
+ .relative_ts_tens_us = div_s64(relative_us, 10),
+ .block_index_delta = block_delta,
+ };
+ record_size = sizeof(struct same_file_close_block_short);
+ } else if (same_file && same_uid && close_block && very_close_time) {
+ record.same_file_close_block = (struct same_file_close_block){
+ .type = SAME_FILE_CLOSE_BLOCK,
+ .relative_ts_us = relative_us,
+ .block_index_delta = block_delta,
+ };
+ record_size = sizeof(struct same_file_close_block);
+ } else if (same_file && close_time) {
+ record.same_file = (struct same_file){
+ .type = SAME_FILE,
+ .block_index = block_index,
+ .relative_ts_us = relative_us,
+ .uid = uid,
+ };
+ record_size = sizeof(struct same_file);
+ } else {
+ record.full_record = (struct full_record){
+ .type = FULL,
+ .block_index = block_index,
+ .file_id = *id,
+ .absolute_ts_us = now_us,
+ .uid = uid,
+ };
+ head->base_record.file_id = *id;
+ record_size = sizeof(struct full_record);
+ }
+
+ head->base_record.block_index = block_index;
+ head->base_record.absolute_ts_us = now_us;
+
+ /* Advance tail beyond area we are going to overwrite */
+ while (tail->current_pass_no < head->current_pass_no &&
+ tail->next_offset < head->next_offset + record_size)
+ log_read_one_record(log, tail);
+
+ memcpy(((u8 *)log->rl_ring_buf) + head->next_offset, &record,
+ record_size);
+ head->next_offset += record_size;
+ if (head->next_offset > log->rl_size - sizeof(record)) {
+ head->next_offset = 0;
+ ++head->current_pass_no;
+ }
+ ++head->current_record_no;
+
+ spin_unlock(&log->rl_lock);
+ schedule_delayed_work(&log->ml_wakeup_work, msecs_to_jiffies(16));
+}
+
+static int validate_hash_tree(struct backing_file_context *bfc, struct file *f,
+ int block_index, struct mem_range data, u8 *buf)
+{
+ struct data_file *df = get_incfs_data_file(f);
+ u8 stored_digest[INCFS_MAX_HASH_SIZE] = {};
+ u8 calculated_digest[INCFS_MAX_HASH_SIZE] = {};
+ struct mtree *tree = NULL;
+ struct incfs_df_signature *sig = NULL;
+ int digest_size;
+ int hash_block_index = block_index;
+ int lvl;
+ int res;
+ loff_t hash_block_offset[INCFS_MAX_MTREE_LEVELS];
+ size_t hash_offset_in_block[INCFS_MAX_MTREE_LEVELS];
+ int hash_per_block;
+ pgoff_t file_pages;
+
+ /*
+ * Memory barrier to make sure tree is fully present if added via enable
+ * verity
+ */
+ tree = smp_load_acquire(&df->df_hash_tree);
+ sig = df->df_signature;
+ if (!tree || !sig)
+ return 0;
+
+ digest_size = tree->alg->digest_size;
+ hash_per_block = INCFS_DATA_FILE_BLOCK_SIZE / digest_size;
+ for (lvl = 0; lvl < tree->depth; lvl++) {
+ loff_t lvl_off = tree->hash_level_suboffset[lvl];
+
+ hash_block_offset[lvl] =
+ lvl_off + round_down(hash_block_index * digest_size,
+ INCFS_DATA_FILE_BLOCK_SIZE);
+ hash_offset_in_block[lvl] = hash_block_index * digest_size %
+ INCFS_DATA_FILE_BLOCK_SIZE;
+ hash_block_index /= hash_per_block;
+ }
+
+ memcpy(stored_digest, tree->root_hash, digest_size);
+
+ file_pages = DIV_ROUND_UP(df->df_size, INCFS_DATA_FILE_BLOCK_SIZE);
+ for (lvl = tree->depth - 1; lvl >= 0; lvl--) {
+ pgoff_t hash_page =
+ file_pages +
+ hash_block_offset[lvl] / INCFS_DATA_FILE_BLOCK_SIZE;
+ struct page *page = find_get_page_flags(
+ f->f_inode->i_mapping, hash_page, FGP_ACCESSED);
+
+ if (page && PageChecked(page)) {
+ u8 *addr = kmap_atomic(page);
+
+ memcpy(stored_digest, addr + hash_offset_in_block[lvl],
+ digest_size);
+
+ kunmap_atomic(addr);
+ put_page(page);
+ continue;
+ }
+
+ if (page)
+ put_page(page);
+
+ res = incfs_kread(bfc, buf, INCFS_DATA_FILE_BLOCK_SIZE,
+ hash_block_offset[lvl] + sig->hash_offset);
+ if (res < 0)
+ return res;
+ if (res != INCFS_DATA_FILE_BLOCK_SIZE)
+ return -EIO;
+ res = incfs_calc_digest(tree->alg,
+ range(buf, INCFS_DATA_FILE_BLOCK_SIZE),
+ range(calculated_digest, digest_size));
+ if (res)
+ return res;
+
+ if (memcmp(stored_digest, calculated_digest, digest_size)) {
+ int i;
+ bool zero = true;
+
+ pr_warn("incfs: Hash mismatch lvl:%d blk:%d\n",
+ lvl, block_index);
+ for (i = 0; i < digest_size; i++)
+ if (stored_digest[i]) {
+ zero = false;
+ break;
+ }
+
+ if (zero)
+ pr_debug("Note saved_digest all zero - did you forget to load the hashes?\n");
+ return -EBADMSG;
+ }
+
+ memcpy(stored_digest, buf + hash_offset_in_block[lvl],
+ digest_size);
+
+ page = grab_cache_page(f->f_inode->i_mapping, hash_page);
+ if (page) {
+ u8 *addr = kmap_atomic(page);
+
+ memcpy(addr, buf, INCFS_DATA_FILE_BLOCK_SIZE);
+ kunmap_atomic(addr);
+ SetPageChecked(page);
+ SetPageUptodate(page);
+ unlock_page(page);
+ put_page(page);
+ }
+ }
+
+ res = incfs_calc_digest(tree->alg, data,
+ range(calculated_digest, digest_size));
+ if (res)
+ return res;
+
+ if (memcmp(stored_digest, calculated_digest, digest_size)) {
+ pr_debug("Leaf hash mismatch blk:%d\n", block_index);
+ return -EBADMSG;
+ }
+
+ return 0;
+}
+
+static struct data_file_segment *get_file_segment(struct data_file *df,
+ int block_index)
+{
+ int seg_idx = block_index % ARRAY_SIZE(df->df_segments);
+
+ return &df->df_segments[seg_idx];
+}
+
+static bool is_data_block_present(struct data_file_block *block)
+{
+ return (block->db_backing_file_data_offset != 0) &&
+ (block->db_stored_size != 0);
+}
+
+/* Decode an on-disk (little-endian) blockmap entry into in-memory form. */
+static void convert_data_file_block(struct incfs_blockmap_entry *bme,
+				    struct data_file_block *res_block)
+{
+	u64 offset_hi = le16_to_cpu(bme->me_data_offset_hi);
+	u64 offset_lo = le32_to_cpu(bme->me_data_offset_lo);
+
+	res_block->db_backing_file_data_offset = (offset_hi << 32) | offset_lo;
+	res_block->db_stored_size = le16_to_cpu(bme->me_data_size);
+	res_block->db_comp_alg = le16_to_cpu(bme->me_flags) &
+				 INCFS_BLOCK_COMPRESSED_MASK;
+}
+
+/*
+ * Read the blockmap entry for @index and decode it into @res_block.
+ * Returns 0 on success or a negative errno.
+ */
+static int get_data_file_block(struct data_file *df, int index,
+			       struct data_file_block *res_block)
+{
+	struct incfs_blockmap_entry bme = {};
+	int error;
+
+	if (!df || !res_block)
+		return -EFAULT;
+
+	if (index < 0 || df->df_blockmap_off == 0)
+		return -EINVAL;
+
+	error = incfs_read_blockmap_entry(df->df_backing_file_context, index,
+					  df->df_blockmap_off, &bme);
+	if (error)
+		return error;
+
+	convert_data_file_block(&bme, res_block);
+	return 0;
+}
+
+/*
+ * Check that the output buffer of @size bytes has room for one more
+ * struct incfs_filled_range beyond the @size_out bytes already used.
+ * Written overflow-safely: the naive "size_out + sizeof(...) > size"
+ * comparison could wrap around u32 for a pathological size_out.
+ */
+static int check_room_for_one_range(u32 size, u32 size_out)
+{
+	if (size_out > size ||
+	    size - size_out < sizeof(struct incfs_filled_range))
+		return -ERANGE;
+	return 0;
+}
+
+/*
+ * Append one filled range to the user buffer and advance *size_out.
+ * Returns -ERANGE when the buffer is full, -EFAULT on a faulting copy.
+ */
+static int copy_one_range(struct incfs_filled_range *range, void __user *buffer,
+			  u32 size, u32 *size_out)
+{
+	int error = check_room_for_one_range(size, *size_out);
+
+	if (error)
+		return error;
+
+	if (copy_to_user(((char __user *)buffer) + *size_out, range,
+			 sizeof(*range)))
+		return -EFAULT;
+
+	*size_out += sizeof(*range);
+	return 0;
+}
+
+#define READ_BLOCKMAP_ENTRIES 512
+/*
+ * Collect the ranges of already-present blocks (data and hash) into the
+ * user buffer described by @arg, starting at arg->start_index.
+ *
+ * Outputs:
+ *   arg->range_buffer_size_out - bytes of incfs_filled_range written
+ *   arg->index_out             - first block index not yet examined
+ * Also advances @fd's incremental scan position; once a scan covering
+ * the whole file completes, the cached filled-block counters on @df
+ * are refreshed from the gathered counts.
+ * Returns 0 on success or a negative errno (-ERANGE when the buffer
+ * fills up; the caller is expected to continue from index_out).
+ */
+int incfs_get_filled_blocks(struct data_file *df,
+			    struct incfs_file_data *fd,
+			    struct incfs_get_filled_blocks_args *arg)
+{
+	int error = 0;
+	bool in_range = false;
+	struct incfs_filled_range range;
+	void __user *buffer = u64_to_user_ptr(arg->range_buffer);
+	u32 size = arg->range_buffer_size;
+	u32 end_index =
+		arg->end_index ? arg->end_index : df->df_total_block_count;
+	u32 *size_out = &arg->range_buffer_size_out;
+	/* Start "full" so the first iteration triggers a batch read. */
+	int i = READ_BLOCKMAP_ENTRIES - 1;
+	int entries_read = 0;
+	struct incfs_blockmap_entry *bme;
+	int data_blocks_filled = 0;
+	int hash_blocks_filled = 0;
+
+	*size_out = 0;
+	if (end_index > df->df_total_block_count)
+		end_index = df->df_total_block_count;
+	arg->total_blocks_out = df->df_total_block_count;
+	arg->data_blocks_out = df->df_data_block_count;
+
+	/*
+	 * Fast path: every data block is known to be written, so the
+	 * requested region is a single filled range - no need to walk
+	 * the blockmap.
+	 */
+	if (atomic_read(&df->df_data_blocks_written) ==
+	    df->df_data_block_count) {
+		pr_debug("File marked full, fast get_filled_blocks");
+		if (arg->start_index > end_index) {
+			arg->index_out = arg->start_index;
+			return 0;
+		}
+		arg->index_out = arg->start_index;
+
+		error = check_room_for_one_range(size, *size_out);
+		if (error)
+			return error;
+
+		range = (struct incfs_filled_range){
+			.begin = arg->start_index,
+			.end = end_index,
+		};
+
+		error = copy_one_range(&range, buffer, size, size_out);
+		if (error)
+			return error;
+		arg->index_out = end_index;
+		return 0;
+	}
+
+	/* Batch blockmap-entry reads to amortize backing-file I/O. */
+	bme = kzalloc(sizeof(*bme) * READ_BLOCKMAP_ENTRIES,
+		      GFP_NOFS | __GFP_COMP);
+	if (!bme)
+		return -ENOMEM;
+
+	for (arg->index_out = arg->start_index; arg->index_out < end_index;
+	     ++arg->index_out) {
+		struct data_file_block dfb;
+
+		/* Refill the batch buffer every READ_BLOCKMAP_ENTRIES. */
+		if (++i == READ_BLOCKMAP_ENTRIES) {
+			entries_read = incfs_read_blockmap_entries(
+				df->df_backing_file_context, bme,
+				arg->index_out, READ_BLOCKMAP_ENTRIES,
+				df->df_blockmap_off);
+			if (entries_read < 0) {
+				error = entries_read;
+				break;
+			}
+
+			i = 0;
+		}
+
+		if (i >= entries_read) {
+			error = -EIO;
+			break;
+		}
+
+		convert_data_file_block(bme + i, &dfb);
+
+		/* Indexes past the data blocks belong to the hash tree. */
+		if (is_data_block_present(&dfb)) {
+			if (arg->index_out >= df->df_data_block_count)
+				++hash_blocks_filled;
+			else
+				++data_blocks_filled;
+		}
+
+		/* Still inside (or outside) the current run - keep going. */
+		if (is_data_block_present(&dfb) == in_range)
+			continue;
+
+		if (!in_range) {
+			error = check_room_for_one_range(size, *size_out);
+			if (error)
+				break;
+			in_range = true;
+			range.begin = arg->index_out;
+		} else {
+			range.end = arg->index_out;
+			error = copy_one_range(&range, buffer, size, size_out);
+			if (error) {
+				/* there will be another try out of the loop,
+				 * it will reset the index_out if it fails too
+				 */
+				break;
+			}
+			in_range = false;
+		}
+	}
+
+	/* Flush a run that was still open when the loop ended. */
+	if (in_range) {
+		range.end = arg->index_out;
+		error = copy_one_range(&range, buffer, size, size_out);
+		if (error)
+			arg->index_out = range.begin;
+	}
+
+	/* Restarting from index 0 resets the incremental scan state. */
+	if (arg->start_index == 0) {
+		fd->fd_get_block_pos = 0;
+		fd->fd_filled_data_blocks = 0;
+		fd->fd_filled_hash_blocks = 0;
+	}
+
+	/* Contiguous continuation: advance the scan position/counters. */
+	if (arg->start_index == fd->fd_get_block_pos) {
+		fd->fd_get_block_pos = arg->index_out + 1;
+		fd->fd_filled_data_blocks += data_blocks_filled;
+		fd->fd_filled_hash_blocks += hash_blocks_filled;
+	}
+
+	/*
+	 * A full pass over the file has completed - adopt the freshly
+	 * gathered counts if they exceed the cached counters.
+	 */
+	if (fd->fd_get_block_pos == df->df_total_block_count + 1) {
+		if (fd->fd_filled_data_blocks >
+		    atomic_read(&df->df_data_blocks_written))
+			atomic_set(&df->df_data_blocks_written,
+				   fd->fd_filled_data_blocks);
+
+		if (fd->fd_filled_hash_blocks >
+		    atomic_read(&df->df_hash_blocks_written))
+			atomic_set(&df->df_hash_blocks_written,
+				   fd->fd_filled_hash_blocks);
+	}
+
+	kfree(bme);
+	return error;
+}
+
+/*
+ * Acquire/release pair around pending_read->done: set_read_done() is
+ * called by the writer after the block has landed; is_read_done() is
+ * polled by waiters, with acquire ordering ensuring the block data
+ * written before the release is visible afterwards.
+ */
+static bool is_read_done(struct pending_read *read)
+{
+	return atomic_read_acquire(&read->done) != 0;
+}
+
+static void set_read_done(struct pending_read *read)
+{
+	atomic_set_release(&read->done, 1);
+}
+
+/*
+ * Notifies a given data file about pending read from a given block.
+ * Returns a new pending read entry.
+ */
+static struct pending_read *add_pending_read(struct data_file *df,
+ int block_index)
+{
+ struct pending_read *result = NULL;
+ struct data_file_segment *segment = NULL;
+ struct mount_info *mi = NULL;
+
+ segment = get_file_segment(df, block_index);
+ mi = df->df_mount_info;
+
+ result = kzalloc(sizeof(*result), GFP_NOFS);
+ if (!result)
+ return NULL;
+
+ result->file_id = df->df_id;
+ result->block_index = block_index;
+ result->timestamp_us = ktime_to_us(ktime_get());
+ result->uid = current_uid().val;
+
+ spin_lock(&mi->pending_read_lock);
+
+ result->serial_number = ++mi->mi_last_pending_read_number;
+ mi->mi_pending_reads_count++;
+
+ list_add_rcu(&result->mi_reads_list, &mi->mi_reads_list_head);
+ list_add_rcu(&result->segment_reads_list, &segment->reads_list_head);
+
+ spin_unlock(&mi->pending_read_lock);
+
+ wake_up_all(&mi->mi_pending_reads_notif_wq);
+ return result;
+}
+
+/* RCU callback: reclaim a pending_read after all readers have drained. */
+static void free_pending_read_entry(struct rcu_head *entry)
+{
+	kfree(container_of(entry, struct pending_read, rcu));
+}
+
+/*
+ * Notifies a given data file that pending read is completed.
+ * Unlinks the entry from both RCU lists under pending_read_lock and
+ * defers freeing until concurrent lockless readers are done.
+ */
+static void remove_pending_read(struct data_file *df, struct pending_read *read)
+{
+	struct mount_info *mi = NULL;
+
+	if (!df || !read) {
+		WARN_ON(!df);
+		WARN_ON(!read);
+		return;
+	}
+
+	mi = df->df_mount_info;
+
+	spin_lock(&mi->pending_read_lock);
+
+	list_del_rcu(&read->mi_reads_list);
+	list_del_rcu(&read->segment_reads_list);
+
+	mi->mi_pending_reads_count--;
+
+	spin_unlock(&mi->pending_read_lock);
+
+	/* Don't free. Wait for readers */
+	call_rcu(&read->rcu, free_pending_read_entry);
+}
+
+/*
+ * Mark as done every pending read waiting on block @index in @segment,
+ * then wake both the segment's waiters and the mount-wide
+ * blocks-written waiters.
+ */
+static void notify_pending_reads(struct mount_info *mi,
+				 struct data_file_segment *segment,
+				 int index)
+{
+	struct pending_read *entry = NULL;
+
+	/* Notify pending reads waiting for this block. */
+	rcu_read_lock();
+	list_for_each_entry_rcu(entry, &segment->reads_list_head,
+				segment_reads_list) {
+		if (entry->block_index == index)
+			set_read_done(entry);
+	}
+	rcu_read_unlock();
+	wake_up_all(&segment->new_data_arrival_wq);
+
+	/* Global progress counter used by mount-wide waiters. */
+	atomic_inc(&mi->mi_blocks_written);
+	wake_up_all(&mi->mi_blocks_written_notif_wq);
+}
+
+/*
+ * Look up @block_index in the file's blockmap; if the block is absent,
+ * register a pending read and wait (up to timeouts->max_pending_time_us)
+ * for userspace to supply it.
+ *
+ * On success fills *res_block and may set *delayed_min_us to an extra
+ * artificial delay the caller must apply before returning the data.
+ * Returns 0, -ETIME on timeout, -ENODATA, or another negative errno.
+ *
+ * NOTE(review): *delayed_min_us is stored through without a NULL check
+ * in two places here, while the tail of this function guards with
+ * "if (delayed_min_us && ...)" - presumably every caller passes a valid
+ * pointer whenever timeouts are in use; confirm at the call sites.
+ */
+static int wait_for_data_block(struct data_file *df, int block_index,
+			       struct data_file_block *res_block,
+			       struct incfs_read_data_file_timeouts *timeouts,
+			       unsigned int *delayed_min_us)
+{
+	struct data_file_block block = {};
+	struct data_file_segment *segment = NULL;
+	struct pending_read *read = NULL;
+	struct mount_info *mi = NULL;
+	int error;
+	int wait_res = 0;
+	unsigned int delayed_pending_us = 0;
+	bool delayed_pending = false;
+
+	if (!df || !res_block)
+		return -EFAULT;
+
+	if (block_index < 0 || block_index >= df->df_data_block_count)
+		return -EINVAL;
+
+	if (df->df_blockmap_off <= 0 || !df->df_mount_info)
+		return -ENODATA;
+
+	mi = df->df_mount_info;
+	segment = get_file_segment(df, block_index);
+
+	error = down_read_killable(&segment->rwsem);
+	if (error)
+		return error;
+
+	/* Look up the given block */
+	error = get_data_file_block(df, block_index, &block);
+
+	up_read(&segment->rwsem);
+
+	if (error)
+		return error;
+
+	/* If the block was found, just return it. No need to wait. */
+	if (is_data_block_present(&block)) {
+		*res_block = block;
+		if (timeouts && timeouts->min_time_us) {
+			*delayed_min_us = timeouts->min_time_us;
+			goto out;
+		}
+		return 0;
+	} else {
+		/* If it's not found, create a pending read */
+		if (timeouts && timeouts->max_pending_time_us) {
+			read = add_pending_read(df, block_index);
+			if (!read)
+				return -ENOMEM;
+		} else {
+			log_block_read(mi, &df->df_id, block_index);
+			return -ETIME;
+		}
+	}
+
+	/* Rest of function only applies if timeouts != NULL */
+	if (!timeouts) {
+		pr_warn("incfs: timeouts unexpectedly NULL\n");
+		return -EFSCORRUPTED;
+	}
+
+	/* Wait for notifications about block's arrival */
+	wait_res =
+		wait_event_interruptible_timeout(segment->new_data_arrival_wq,
+			(is_read_done(read)),
+			usecs_to_jiffies(timeouts->max_pending_time_us));
+
+	/* Woke up, the pending read is no longer needed. */
+	remove_pending_read(df, read);
+
+	if (wait_res == 0) {
+		/* Wait has timed out */
+		log_block_read(mi, &df->df_id, block_index);
+		return -ETIME;
+	}
+	if (wait_res < 0) {
+		/*
+		 * Only ERESTARTSYS is really expected here when a signal
+		 * comes while we wait.
+		 */
+		return wait_res;
+	}
+
+	delayed_pending = true;
+	delayed_pending_us = timeouts->max_pending_time_us -
+		jiffies_to_usecs(wait_res);
+	/* Top up the delay so at least min_pending_time_us elapses. */
+	if (timeouts->min_pending_time_us > delayed_pending_us)
+		*delayed_min_us = timeouts->min_pending_time_us -
+			delayed_pending_us;
+
+	error = down_read_killable(&segment->rwsem);
+	if (error)
+		return error;
+
+	/*
+	 * Re-read blocks info now, it has just arrived and
+	 * should be available.
+	 */
+	error = get_data_file_block(df, block_index, &block);
+	if (!error) {
+		if (is_data_block_present(&block))
+			*res_block = block;
+		else {
+			/*
+			 * Somehow wait finished successfully but block still
+			 * can't be found. It's not normal.
+			 */
+			pr_warn("incfs: Wait succeeded but block not found.\n");
+			error = -ENODATA;
+		}
+	}
+	up_read(&segment->rwsem);
+
+out:
+	if (error)
+		return error;
+
+	/* Account any delays in the mount-wide statistics. */
+	if (delayed_pending) {
+		mi->mi_reads_delayed_pending++;
+		mi->mi_reads_delayed_pending_us +=
+			delayed_pending_us;
+	}
+
+	if (delayed_min_us && *delayed_min_us) {
+		mi->mi_reads_delayed_min++;
+		mi->mi_reads_delayed_min_us += *delayed_min_us;
+	}
+
+	return 0;
+}
+
+/*
+ * Record a failed read (@result < 0) in the per-mount "last error"
+ * fields exported through sysfs. Successful reads are a no-op.
+ * Returns 0, or a negative errno if taking the mutex was interrupted.
+ */
+static int incfs_update_sysfs_error(struct file *file, int index, int result,
+				    struct mount_info *mi, struct data_file *df)
+{
+	int error;
+
+	if (result >= 0)
+		return 0;
+
+	error = mutex_lock_interruptible(&mi->mi_le_mutex);
+	if (error)
+		return error;
+
+	mi->mi_le_file_id = df->df_id;
+	mi->mi_le_time_us = ktime_to_us(ktime_get());
+	mi->mi_le_page = index;
+	mi->mi_le_errno = result;
+	mi->mi_le_uid = current_uid().val;
+	mutex_unlock(&mi->mi_le_mutex);
+
+	return 0;
+}
+
+/*
+ * Read data block @index of the file behind @f into @dst, waiting for
+ * the block to arrive if necessary (per @timeouts). @tmp is scratch
+ * space for compressed blocks and must hold at least two data blocks.
+ * *delayed_min_us returns any extra delay the caller must still apply.
+ * Returns the number of bytes placed in @dst or a negative errno;
+ * failures are also accounted in mount statistics and sysfs state.
+ */
+ssize_t incfs_read_data_file_block(struct mem_range dst, struct file *f,
+			int index, struct mem_range tmp,
+			struct incfs_read_data_file_timeouts *timeouts,
+			unsigned int *delayed_min_us)
+{
+	loff_t pos;
+	ssize_t result;
+	size_t bytes_to_read;
+	struct mount_info *mi = NULL;
+	struct backing_file_context *bfc = NULL;
+	struct data_file_block block = {};
+	struct data_file *df = get_incfs_data_file(f);
+
+	if (!dst.data || !df || !tmp.data)
+		return -EFAULT;
+
+	if (tmp.len < 2 * INCFS_DATA_FILE_BLOCK_SIZE)
+		return -ERANGE;
+
+	mi = df->df_mount_info;
+	bfc = df->df_backing_file_context;
+
+	result = wait_for_data_block(df, index, &block, timeouts,
+				     delayed_min_us);
+	if (result < 0)
+		goto out;
+
+	pos = block.db_backing_file_data_offset;
+	if (block.db_comp_alg == COMPRESSION_NONE) {
+		/* Uncompressed: read straight into the destination. */
+		bytes_to_read = min(dst.len, block.db_stored_size);
+		result = incfs_kread(bfc, dst.data, bytes_to_read, pos);
+
+		/* Some data was read, but not enough */
+		if (result >= 0 && result != bytes_to_read)
+			result = -EIO;
+	} else {
+		/* Compressed: read into scratch, then decompress. */
+		bytes_to_read = min(tmp.len, block.db_stored_size);
+		result = incfs_kread(bfc, tmp.data, bytes_to_read, pos);
+		if (result == bytes_to_read) {
+			result =
+				decompress(mi, range(tmp.data, bytes_to_read),
+					   dst, block.db_comp_alg);
+			if (result < 0) {
+				const char *name =
+					bfc->bc_file->f_path.dentry->d_name.name;
+
+				pr_warn_once("incfs: Decompression error. %s",
+					     name);
+			}
+		} else if (result >= 0) {
+			/* Some data was read, but not enough */
+			result = -EIO;
+		}
+	}
+
+	/* Verify against the Merkle tree before exposing the data. */
+	if (result > 0) {
+		int err = validate_hash_tree(bfc, f, index, dst, tmp.data);
+
+		if (err < 0)
+			result = err;
+	}
+
+	if (result >= 0)
+		log_block_read(mi, &df->df_id, index);
+
+out:
+	/* Classify failures for the per-mount statistics counters. */
+	if (result == -ETIME)
+		mi->mi_reads_failed_timed_out++;
+	else if (result == -EBADMSG)
+		mi->mi_reads_failed_hash_verification++;
+	else if (result < 0)
+		mi->mi_reads_failed_other++;
+
+	incfs_update_sysfs_error(f, index, result, mi, df);
+
+	return result;
+}
+
+/*
+ * Read up to @dst.len bytes of the file's Merkle tree area, starting
+ * at @offset within that area.
+ * Returns the number of bytes read or a negative errno.
+ */
+ssize_t incfs_read_merkle_tree_blocks(struct mem_range dst,
+				      struct data_file *df, size_t offset)
+{
+	struct backing_file_context *bfc = NULL;
+	struct incfs_df_signature *sig = NULL;
+	size_t to_read = dst.len;
+
+	if (!dst.data || !df)
+		return -EFAULT;
+
+	sig = df->df_signature;
+	/*
+	 * A file without signature metadata has no hash tree; bail out
+	 * instead of dereferencing a NULL sig below.
+	 */
+	if (!sig)
+		return -ENODATA;
+
+	bfc = df->df_backing_file_context;
+
+	if (offset > sig->hash_size)
+		return -ERANGE;
+
+	if (offset + to_read > sig->hash_size)
+		to_read = sig->hash_size - offset;
+
+	return incfs_kread(bfc, dst.data, to_read, sig->hash_offset + offset);
+}
+
+/*
+ * Store one loader-supplied data block into the backing file.
+ * Sets *complete when this write brings the file to fully-written.
+ * Returns 0 on success (including when the block was already present)
+ * or a negative errno.
+ */
+int incfs_process_new_data_block(struct data_file *df,
+				 struct incfs_fill_block *block, u8 *data,
+				 bool *complete)
+{
+	struct mount_info *mi = NULL;
+	struct backing_file_context *bfc = NULL;
+	struct data_file_segment *segment = NULL;
+	struct data_file_block existing_block = {};
+	u16 flags = 0;
+	int error = 0;
+
+	if (!df || !block)
+		return -EFAULT;
+
+	bfc = df->df_backing_file_context;
+	mi = df->df_mount_info;
+
+	if (block->block_index >= df->df_data_block_count)
+		return -ERANGE;
+
+	segment = get_file_segment(df, block->block_index);
+	if (!segment)
+		return -EFAULT;
+
+	/* Translate the uapi compression id into on-disk block flags. */
+	if (block->compression == COMPRESSION_LZ4)
+		flags |= INCFS_BLOCK_COMPRESSED_LZ4;
+	else if (block->compression == COMPRESSION_ZSTD)
+		flags |= INCFS_BLOCK_COMPRESSED_ZSTD;
+	else if (block->compression)
+		return -EINVAL;
+
+	/* Cheap existence check under the shared lock first. */
+	error = down_read_killable(&segment->rwsem);
+	if (error)
+		return error;
+
+	error = get_data_file_block(df, block->block_index, &existing_block);
+
+	up_read(&segment->rwsem);
+
+	if (error)
+		return error;
+	if (is_data_block_present(&existing_block))
+		/* Block is already present, nothing to do here */
+		return 0;
+
+	error = down_write_killable(&segment->rwsem);
+	if (error)
+		return error;
+
+	/* Recheck inside write lock */
+	error = get_data_file_block(df, block->block_index, &existing_block);
+	if (error)
+		goto out_up_write;
+
+	if (is_data_block_present(&existing_block))
+		goto out_up_write;
+
+	error = mutex_lock_interruptible(&bfc->bc_mutex);
+	if (error)
+		goto out_up_write;
+
+	error = incfs_write_data_block_to_backing_file(bfc,
+			range(data, block->data_len), block->block_index,
+			df->df_blockmap_off, flags);
+	if (error)
+		goto out_mutex_unlock;
+
+	if (atomic_inc_return(&df->df_data_blocks_written)
+			>= df->df_data_block_count)
+		*complete = true;
+
+out_mutex_unlock:
+	mutex_unlock(&bfc->bc_mutex);
+	/* Wake anyone blocked in wait_for_data_block() on this block. */
+	if (!error)
+		notify_pending_reads(mi, segment, block->block_index);
+
+out_up_write:
+	up_write(&segment->rwsem);
+
+	if (error)
+		pr_debug("%d error: %d\n", block->block_index, error);
+	return error;
+}
+
+/*
+ * Copy the file's stored signature blob into @dst.
+ * Returns the number of bytes copied, 0 when the file has no
+ * signature, -E2BIG when @dst is too small, or a negative read error.
+ */
+int incfs_read_file_signature(struct data_file *df, struct mem_range dst)
+{
+	struct backing_file_context *bfc = df->df_backing_file_context;
+	struct incfs_df_signature *sig;
+	int read_res;
+
+	if (!dst.data)
+		return -EFAULT;
+
+	sig = df->df_signature;
+	if (!sig)
+		return 0;
+
+	if (dst.len < sig->sig_size)
+		return -E2BIG;
+
+	read_res = incfs_kread(bfc, dst.data, sig->sig_size, sig->sig_offset);
+	if (read_res < 0)
+		return read_res;
+	if (read_res != sig->sig_size)
+		return -EIO;
+
+	return read_res;
+}
+
+/*
+ * Store one loader-supplied hash (Merkle tree) block into the backing
+ * file's dedicated hash area.
+ * Returns 0 on success or a negative errno.
+ *
+ * Cleanups versus the previous version: the dead "if (!df)" recheck
+ * (df was already validated and dereferenced above) and the unused
+ * local mount_info pointer are gone.
+ */
+int incfs_process_new_hash_block(struct data_file *df,
+				 struct incfs_fill_block *block, u8 *data)
+{
+	struct backing_file_context *bfc = NULL;
+	struct mtree *hash_tree = NULL;
+	struct incfs_df_signature *sig = NULL;
+	loff_t hash_area_base = 0;
+	loff_t hash_area_size = 0;
+	int error = 0;
+
+	if (!df || !block)
+		return -EFAULT;
+
+	if (!(block->flags & INCFS_BLOCK_FLAGS_HASH))
+		return -EINVAL;
+
+	bfc = df->df_backing_file_context;
+
+	hash_tree = df->df_hash_tree;
+	sig = df->df_signature;
+	if (!hash_tree || !sig || sig->hash_offset == 0)
+		return -ENOTSUPP;
+
+	hash_area_base = sig->hash_offset;
+	hash_area_size = sig->hash_size;
+	/*
+	 * Compute the end of the block in 64 bits: a 32-bit
+	 * block_index * block-size product could overflow before the
+	 * comparison and defeat the bounds check.
+	 */
+	if (hash_area_size < (loff_t)block->block_index *
+			INCFS_DATA_FILE_BLOCK_SIZE + block->data_len) {
+		/* Hash block goes beyond dedicated hash area of this file. */
+		return -ERANGE;
+	}
+
+	error = mutex_lock_interruptible(&bfc->bc_mutex);
+	if (!error) {
+		error = incfs_write_hash_block_to_backing_file(
+			bfc, range(data, block->data_len), block->block_index,
+			hash_area_base, df->df_blockmap_off, df->df_size);
+		mutex_unlock(&bfc->bc_mutex);
+	}
+	if (!error)
+		atomic_inc(&df->df_hash_blocks_written);
+
+	return error;
+}
+
+/*
+ * Metadata handler: record the blockmap location and total block count.
+ * Rejects a blockmap smaller than the already-known data block count.
+ * (The always-zero "error" local of the previous version is removed.)
+ */
+static int process_blockmap_md(struct incfs_blockmap *bm,
+			       struct metadata_handler *handler)
+{
+	struct data_file *df = handler->context;
+	u32 block_count = le32_to_cpu(bm->m_block_count);
+
+	if (!df)
+		return -EFAULT;
+
+	if (df->df_data_block_count > block_count)
+		return -EBADMSG;
+
+	df->df_total_block_count = block_count;
+	df->df_blockmap_off = le64_to_cpu(bm->m_base_offset);
+	return 0;
+}
+
+/*
+ * Metadata handler: load a signature record, read the signature blob
+ * from the backing file and build the in-memory Merkle tree for it.
+ * On success transfers ownership of both to the data_file.
+ */
+static int process_file_signature_md(struct incfs_file_signature *sg,
+				     struct metadata_handler *handler)
+{
+	struct data_file *df = handler->context;
+	struct mtree *hash_tree = NULL;
+	int error = 0;
+	struct incfs_df_signature *signature =
+		kzalloc(sizeof(*signature), GFP_NOFS);
+	void *buf = NULL;
+	ssize_t read;
+
+	if (!signature)
+		return -ENOMEM;
+
+	if (!df || !df->df_backing_file_context ||
+	    !df->df_backing_file_context->bc_file) {
+		error = -ENOENT;
+		goto out;
+	}
+
+	signature->hash_offset = le64_to_cpu(sg->sg_hash_tree_offset);
+	signature->hash_size = le32_to_cpu(sg->sg_hash_tree_size);
+	signature->sig_offset = le64_to_cpu(sg->sg_sig_offset);
+	signature->sig_size = le32_to_cpu(sg->sg_sig_size);
+
+	buf = kzalloc(signature->sig_size, GFP_NOFS);
+	if (!buf) {
+		error = -ENOMEM;
+		goto out;
+	}
+
+	read = incfs_kread(df->df_backing_file_context, buf,
+			   signature->sig_size, signature->sig_offset);
+	if (read < 0) {
+		error = read;
+		goto out;
+	}
+
+	if (read != signature->sig_size) {
+		error = -EINVAL;
+		goto out;
+	}
+
+	hash_tree = incfs_alloc_mtree(range(buf, signature->sig_size),
+				      df->df_data_block_count);
+	if (IS_ERR(hash_tree)) {
+		error = PTR_ERR(hash_tree);
+		hash_tree = NULL;
+		goto out;
+	}
+	/* The computed tree size must match what the record claims. */
+	if (hash_tree->hash_tree_area_size != signature->hash_size) {
+		error = -EINVAL;
+		goto out;
+	}
+	/*
+	 * Reject layouts where the hash or signature area does not lie
+	 * strictly before this metadata record in the backing file.
+	 */
+	if (signature->hash_size > 0 &&
+	    handler->md_record_offset <= signature->hash_offset) {
+		error = -EINVAL;
+		goto out;
+	}
+	if (handler->md_record_offset <= signature->sig_offset) {
+		error = -EINVAL;
+		goto out;
+	}
+	/* Success: hand ownership to df; NULL so cleanup won't free. */
+	df->df_hash_tree = hash_tree;
+	hash_tree = NULL;
+	df->df_signature = signature;
+	signature = NULL;
+out:
+	incfs_free_mtree(hash_tree);
+	kfree(signature);
+	kfree(buf);
+
+	return error;
+}
+
+/*
+ * Metadata handler: restore the persisted data/hash fill counters from
+ * an incfs_status record and remember where the record lives.
+ * A NULL-context guard is added for consistency with the sibling
+ * handlers (process_blockmap_md etc.).
+ */
+static int process_status_md(struct incfs_status *is,
+			     struct metadata_handler *handler)
+{
+	struct data_file *df = handler->context;
+
+	if (!df)
+		return -EFAULT;
+
+	df->df_initial_data_blocks_written =
+		le32_to_cpu(is->is_data_blocks_written);
+	atomic_set(&df->df_data_blocks_written,
+		   df->df_initial_data_blocks_written);
+
+	df->df_initial_hash_blocks_written =
+		le32_to_cpu(is->is_hash_blocks_written);
+	atomic_set(&df->df_hash_blocks_written,
+		   df->df_initial_hash_blocks_written);
+
+	df->df_status_offset = handler->md_record_offset;
+	return 0;
+}
+
+/*
+ * Metadata handler: record the location and size of an fs-verity
+ * signature blob stored in the backing file.
+ *
+ * NOTE(review): an oversized vs_size yields -EFAULT, which normally
+ * means "bad address"; -EBADMSG or -EINVAL would better describe
+ * corrupt on-disk metadata - confirm nothing depends on this value
+ * before changing it.
+ */
+static int process_file_verity_signature_md(
+		struct incfs_file_verity_signature *vs,
+		struct metadata_handler *handler)
+{
+	struct data_file *df = handler->context;
+	struct incfs_df_verity_signature *verity_signature;
+
+	if (!df)
+		return -EFAULT;
+
+	verity_signature = kzalloc(sizeof(*verity_signature), GFP_NOFS);
+	if (!verity_signature)
+		return -ENOMEM;
+
+	verity_signature->offset = le64_to_cpu(vs->vs_offset);
+	verity_signature->size = le32_to_cpu(vs->vs_size);
+	if (verity_signature->size > FS_VERITY_MAX_SIGNATURE_SIZE) {
+		kfree(verity_signature);
+		return -EFAULT;
+	}
+
+	df->df_verity_signature = verity_signature;
+	return 0;
+}
+
+/*
+ * Walk the backing file's chain of metadata records, dispatching each
+ * record to the handlers above, then cross-check the resulting block
+ * counts against the hash tree size.
+ * Returns the number of records processed or a negative errno.
+ */
+static int incfs_scan_metadata_chain(struct data_file *df)
+{
+	struct metadata_handler *handler = NULL;
+	int result = 0;
+	int records_count = 0;
+	int error = 0;
+	struct backing_file_context *bfc = NULL;
+	int nondata_block_count;
+
+	if (!df || !df->df_backing_file_context)
+		return -EFAULT;
+
+	bfc = df->df_backing_file_context;
+
+	handler = kzalloc(sizeof(*handler), GFP_NOFS);
+	if (!handler)
+		return -ENOMEM;
+
+	handler->md_record_offset = df->df_metadata_off;
+	handler->context = df;
+	handler->handle_blockmap = process_blockmap_md;
+	handler->handle_signature = process_file_signature_md;
+	handler->handle_status = process_status_md;
+	handler->handle_verity_signature = process_file_verity_signature_md;
+
+	/* md_record_offset is advanced by each successfully read record. */
+	while (handler->md_record_offset > 0) {
+		error = incfs_read_next_metadata_record(bfc, handler);
+		if (error) {
+			pr_warn("incfs: Error during reading incfs-metadata record. Offset: %lld Record #%d Error code: %d\n",
+				handler->md_record_offset, records_count + 1,
+				-error);
+			break;
+		}
+		records_count++;
+	}
+	if (error) {
+		pr_warn("incfs: Error %d after reading %d incfs-metadata records.\n",
+			-error, records_count);
+		result = error;
+	} else
+		result = records_count;
+
+	nondata_block_count = df->df_total_block_count -
+		df->df_data_block_count;
+	if (df->df_hash_tree) {
+		int hash_block_count = get_blocks_count_for_size(
+			df->df_hash_tree->hash_tree_area_size);
+
+		/*
+		 * Files that were created with a hash tree have the hash tree
+		 * included in the block map, i.e. nondata_block_count ==
+		 * hash_block_count. Files whose hash tree was added by
+		 * FS_IOC_ENABLE_VERITY will still have the original block
+		 * count, i.e. nondata_block_count == 0.
+		 */
+		if (nondata_block_count != hash_block_count &&
+		    nondata_block_count != 0)
+			result = -EINVAL;
+	} else if (nondata_block_count != 0) {
+		result = -EINVAL;
+	}
+
+	kfree(handler);
+	return result;
+}
+
+/*
+ * Quickly checks if there are pending reads with a serial number larger
+ * than a given one.
+ */
+/*
+ * Quickly checks if there are pending reads with a serial number larger
+ * than a given one.
+ */
+bool incfs_fresh_pending_reads_exist(struct mount_info *mi, int last_number)
+{
+	bool result;
+
+	spin_lock(&mi->pending_read_lock);
+	result = mi->mi_pending_reads_count > 0 &&
+		 mi->mi_last_pending_read_number > last_number;
+	spin_unlock(&mi->pending_read_lock);
+
+	return result;
+}
+
+/*
+ * Copy up to @reads_size pending reads with serial numbers above
+ * @sn_lowerbound into @reads and/or @reads2 (either may be NULL).
+ * *new_max_sn is raised to the largest serial number reported.
+ * Returns the number of reads copied, or a negative errno.
+ */
+int incfs_collect_pending_reads(struct mount_info *mi, int sn_lowerbound,
+				struct incfs_pending_read_info *reads,
+				struct incfs_pending_read_info2 *reads2,
+				int reads_size, int *new_max_sn)
+{
+	int reported_reads = 0;
+	struct pending_read *entry = NULL;
+
+	if (!mi)
+		return -EFAULT;
+
+	if (reads_size <= 0)
+		return 0;
+
+	/* Cheap early-out before taking the RCU read lock. */
+	if (!incfs_fresh_pending_reads_exist(mi, sn_lowerbound))
+		return 0;
+
+	rcu_read_lock();
+
+	/* Lockless walk; pairs with list_add_rcu() in add_pending_read(). */
+	list_for_each_entry_rcu(entry, &mi->mi_reads_list_head, mi_reads_list) {
+		if (entry->serial_number <= sn_lowerbound)
+			continue;
+
+		if (reads) {
+			reads[reported_reads].file_id = entry->file_id;
+			reads[reported_reads].block_index = entry->block_index;
+			reads[reported_reads].serial_number =
+				entry->serial_number;
+			reads[reported_reads].timestamp_us =
+				entry->timestamp_us;
+		}
+
+		if (reads2) {
+			reads2[reported_reads].file_id = entry->file_id;
+			reads2[reported_reads].block_index = entry->block_index;
+			reads2[reported_reads].serial_number =
+				entry->serial_number;
+			reads2[reported_reads].timestamp_us =
+				entry->timestamp_us;
+			reads2[reported_reads].uid = entry->uid;
+		}
+
+		if (entry->serial_number > *new_max_sn)
+			*new_max_sn = entry->serial_number;
+
+		reported_reads++;
+		if (reported_reads >= reads_size)
+			break;
+	}
+
+	rcu_read_unlock();
+
+	return reported_reads;
+}
+
+/* Snapshot the log head under rl_lock so the copy is self-consistent. */
+struct read_log_state incfs_get_log_state(struct mount_info *mi)
+{
+	struct read_log *log = &mi->mi_log;
+	struct read_log_state head_copy;
+
+	spin_lock(&log->rl_lock);
+	head_copy = log->rl_head;
+	spin_unlock(&log->rl_lock);
+
+	return head_copy;
+}
+
+/*
+ * Number of log records the reader at @state has not yet consumed.
+ * If the log generation changed, the reader's position is stale and
+ * everything currently buffered counts as uncollected.
+ */
+int incfs_get_uncollected_logs_count(struct mount_info *mi,
+				     const struct read_log_state *state)
+{
+	struct read_log *log = &mi->mi_log;
+	u64 head_no, tail_no;
+	u32 generation;
+
+	spin_lock(&log->rl_lock);
+	tail_no = log->rl_tail.current_record_no;
+	head_no = log->rl_head.current_record_no;
+	generation = log->rl_head.generation_id;
+	spin_unlock(&log->rl_lock);
+
+	if (generation == state->generation_id)
+		return head_no - max_t(u64, tail_no, state->current_record_no);
+
+	return head_no - tail_no;
+}
+
+/*
+ * Drain up to @reads_size records from the read log into @reads and/or
+ * @reads2 (either may be NULL), advancing the reader cursor in *state.
+ * A generation mismatch (the log was reset or resized) restarts the
+ * cursor; a reader that fell behind the tail is snapped forward to it.
+ * Returns the number of records copied.
+ */
+int incfs_collect_logged_reads(struct mount_info *mi,
+			       struct read_log_state *state,
+			       struct incfs_pending_read_info *reads,
+			       struct incfs_pending_read_info2 *reads2,
+			       int reads_size)
+{
+	int dst_idx;
+	struct read_log *log = &mi->mi_log;
+	struct read_log_state *head, *tail;
+
+	spin_lock(&log->rl_lock);
+	head = &log->rl_head;
+	tail = &log->rl_tail;
+
+	if (state->generation_id != head->generation_id) {
+		pr_debug("read ptr is wrong generation: %u/%u",
+			 state->generation_id, head->generation_id);
+
+		*state = (struct read_log_state){
+			.generation_id = head->generation_id,
+		};
+	}
+
+	if (state->current_record_no < tail->current_record_no) {
+		pr_debug("read ptr is behind, moving: %u/%u -> %u/%u\n",
+			 (u32)state->next_offset,
+			 (u32)state->current_pass_no,
+			 (u32)tail->next_offset, (u32)tail->current_pass_no);
+
+		*state = *tail;
+	}
+
+	for (dst_idx = 0; dst_idx < reads_size; dst_idx++) {
+		/* Caught up with the head: nothing left to collect. */
+		if (state->current_record_no == head->current_record_no)
+			break;
+
+		/* Decode the next record into state->base_record. */
+		log_read_one_record(log, state);
+
+		if (reads)
+			reads[dst_idx] = (struct incfs_pending_read_info) {
+				.file_id = state->base_record.file_id,
+				.block_index = state->base_record.block_index,
+				.serial_number = state->current_record_no,
+				.timestamp_us =
+					state->base_record.absolute_ts_us,
+			};
+
+		if (reads2)
+			reads2[dst_idx] = (struct incfs_pending_read_info2) {
+				.file_id = state->base_record.file_id,
+				.block_index = state->base_record.block_index,
+				.serial_number = state->current_record_no,
+				.timestamp_us =
+					state->base_record.absolute_ts_us,
+				.uid = state->base_record.uid,
+			};
+	}
+
+	spin_unlock(&log->rl_lock);
+	return dst_idx;
+}
+
diff --git a/fs/incfs/data_mgmt.h b/fs/incfs/data_mgmt.h
new file mode 100644
--- /dev/null
+++ b/fs/incfs/data_mgmt.h
@@ -0,0 +1,551 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2019 Google LLC
+ */
+#ifndef _INCFS_DATA_MGMT_H
+#define _INCFS_DATA_MGMT_H
+
+#include <linux/cred.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+#include <linux/rcupdate.h>
+#include <linux/completion.h>
+#include <linux/wait.h>
+#include <linux/zstd.h>
+#include <crypto/hash.h>
+#include <linux/rwsem.h>
+
+#include <uapi/linux/incrementalfs.h>
+
+#include "internal.h"
+#include "pseudo_files.h"
+
+#define SEGMENTS_PER_FILE 3
+
+/*
+ * Read-log record encodings. The log is delta-compressed: a FULL record
+ * carries complete information, while the SAME_FILE* variants encode only
+ * the differences from the current base record (see
+ * read_log_state.base_record), in progressively smaller formats.
+ */
+enum LOG_RECORD_TYPE {
+	FULL,
+	SAME_FILE,
+	SAME_FILE_CLOSE_BLOCK,
+	SAME_FILE_CLOSE_BLOCK_SHORT,
+	SAME_FILE_NEXT_BLOCK,
+	SAME_FILE_NEXT_BLOCK_SHORT,
+};
+
+struct full_record {
+	enum LOG_RECORD_TYPE type : 3; /* FULL */
+	u32 block_index : 29;
+	incfs_uuid_t file_id;
+	u64 absolute_ts_us;
+	uid_t uid;
+} __packed; /* 32 bytes */
+
+struct same_file {
+	enum LOG_RECORD_TYPE type : 3; /* SAME_FILE */
+	u32 block_index : 29;
+	uid_t uid;
+	u16 relative_ts_us; /* max 2^16 us ~= 64 ms */
+} __packed; /* 10 bytes */
+
+struct same_file_close_block {
+	enum LOG_RECORD_TYPE type : 3; /* SAME_FILE_CLOSE_BLOCK */
+	u16 relative_ts_us : 13; /* max 2^13 us ~= 8 ms */
+	s16 block_index_delta;
+} __packed; /* 4 bytes */
+
+struct same_file_close_block_short {
+	enum LOG_RECORD_TYPE type : 3; /* SAME_FILE_CLOSE_BLOCK_SHORT */
+	u8 relative_ts_tens_us : 5; /* max 2^5*10 us ~= 320 us */
+	s8 block_index_delta;
+} __packed; /* 2 bytes */
+
+struct same_file_next_block {
+	enum LOG_RECORD_TYPE type : 3; /* SAME_FILE_NEXT_BLOCK */
+	u16 relative_ts_us : 13; /* max 2^13 us ~= 8 ms */
+} __packed; /* 2 bytes */
+
+struct same_file_next_block_short {
+	enum LOG_RECORD_TYPE type : 3; /* SAME_FILE_NEXT_BLOCK_SHORT */
+	u8 relative_ts_tens_us : 5; /* max 2^5*10 us ~= 320 us */
+} __packed; /* 1 byte */
+
+/* Any one of the record encodings above; discriminated by the type field. */
+union log_record {
+	struct full_record full_record;
+	struct same_file same_file;
+	struct same_file_close_block same_file_close_block;
+	struct same_file_close_block_short same_file_close_block_short;
+	struct same_file_next_block same_file_next_block;
+	struct same_file_next_block_short same_file_next_block_short;
+};
+
+/* A cursor into the read log; used both for the writer and for readers. */
+struct read_log_state {
+	/* Log buffer generation id, incremented on configuration changes */
+	u32 generation_id;
+
+	/* Offset in rl_ring_buf to write into. */
+	u32 next_offset;
+
+	/* Current number of writer passes over rl_ring_buf */
+	u32 current_pass_no;
+
+	/* Current full_record to diff against */
+	struct full_record base_record;
+
+	/* Current record number counting from configuration change */
+	u64 current_record_no;
+};
+
+/* A ring buffer to save records about data blocks which were recently read. */
+struct read_log {
+	void *rl_ring_buf;
+
+	/* Size of rl_ring_buf */
+	int rl_size;
+
+	/* Writer cursor: the position of the newest record */
+	struct read_log_state rl_head;
+
+	/* Cursor of the oldest record still present in the buffer */
+	struct read_log_state rl_tail;
+
+	/* A lock to protect the above fields */
+	spinlock_t rl_lock;
+
+	/* A queue of waiters who want to be notified about reads */
+	wait_queue_head_t ml_notif_wq;
+
+	/* A work item to wake up those waiters without slowing down readers */
+	struct delayed_work ml_wakeup_work;
+};
+
+/* Options supplied at mount (or remount) time. */
+struct mount_options {
+	unsigned int read_timeout_ms;
+	unsigned int readahead_pages;
+	unsigned int read_log_pages;
+	unsigned int read_log_wakeup_count;
+	bool report_uid;
+	char *sysfs_name;
+};
+
+/* Per-mount state of an incfs instance. */
+struct mount_info {
+	/* Superblock of this incfs mount */
+	struct super_block *mi_sb;
+
+	/* Path to the backing directory this mount is layered over */
+	struct path mi_backing_dir_path;
+
+	struct dentry *mi_index_dir;
+	/* For stacking mounts, if true, this indicates if the index dir needs
+	 * to be freed for this SB otherwise it was created by lower level SB */
+	bool mi_index_free;
+
+	struct dentry *mi_incomplete_dir;
+	/* For stacking mounts, if true, this indicates if the incomplete dir
+	 * needs to be freed for this SB. Similar to mi_index_free */
+	bool mi_incomplete_free;
+
+	/* Credentials of the mount owner; used when doing backing file I/O */
+	const struct cred *mi_owner;
+
+	struct mount_options mi_options;
+
+	/* This mutex is to be taken before create, rename, delete */
+	struct mutex mi_dir_struct_mutex;
+
+	/*
+	 * A queue of waiters who want to be notified about new pending reads.
+	 */
+	wait_queue_head_t mi_pending_reads_notif_wq;
+
+	/*
+	 * Protects - RCU safe:
+	 *		- reads_list_head
+	 *		- mi_pending_reads_count
+	 *		- mi_last_pending_read_number
+	 *		- data_file_segment.reads_list_head
+	 */
+	spinlock_t pending_read_lock;
+
+	/* List of active pending_read objects */
+	struct list_head mi_reads_list_head;
+
+	/* Total number of items in reads_list_head */
+	int mi_pending_reads_count;
+
+	/*
+	 * Last serial number that was assigned to a pending read.
+	 * 0 means no pending reads have been seen yet.
+	 */
+	int mi_last_pending_read_number;
+
+	/* Temporary buffer for read logger. */
+	struct read_log mi_log;
+
+	/* SELinux needs special xattrs on our pseudo files */
+	struct mem_range pseudo_file_xattr[PSEUDO_FILE_COUNT];
+
+	/* A queue of waiters who want to be notified about blocks_written */
+	wait_queue_head_t mi_blocks_written_notif_wq;
+
+	/* Number of blocks written since mount */
+	atomic_t mi_blocks_written;
+
+	/* Per UID read timeouts */
+	spinlock_t mi_per_uid_read_timeouts_lock;
+	struct incfs_per_uid_read_timeouts *mi_per_uid_read_timeouts;
+	int mi_per_uid_read_timeouts_size;
+
+	/* zstd workspace */
+	struct mutex mi_zstd_workspace_mutex;
+	void *mi_zstd_workspace;
+	ZSTD_DStream *mi_zstd_stream;
+	struct delayed_work mi_zstd_cleanup_work;
+
+	/* sysfs node */
+	struct incfs_sysfs_node *mi_sysfs_node;
+
+	/* Last error information */
+	struct mutex mi_le_mutex;
+	incfs_uuid_t mi_le_file_id;
+	u64 mi_le_time_us;
+	u32 mi_le_page;
+	u32 mi_le_errno;
+	uid_t mi_le_uid;
+
+	/* Number of reads timed out */
+	u32 mi_reads_failed_timed_out;
+
+	/* Number of reads failed because hash verification failed */
+	u32 mi_reads_failed_hash_verification;
+
+	/* Number of reads failed for another reason */
+	u32 mi_reads_failed_other;
+
+	/* Number of reads delayed because page had to be fetched */
+	u32 mi_reads_delayed_pending;
+
+	/* Total time waiting for pages to be fetched */
+	u64 mi_reads_delayed_pending_us;
+
+	/*
+	 * Number of reads delayed because of per-uid min_time_us or
+	 * min_pending_time_us settings
+	 */
+	u32 mi_reads_delayed_min;
+
+	/* Total time waiting because of per-uid min_time_us or
+	 * min_pending_time_us settings.
+	 *
+	 * Note that if a read is initially delayed because we have to wait for
+	 * the page, then further delayed because of min_pending_time_us
+	 * setting, this counter gets incremented by only the further delay
+	 * time.
+	 */
+	u64 mi_reads_delayed_min_us;
+};
+
+/* Location and size of a single data block within the backing file. */
+struct data_file_block {
+	/* Offset of the stored block data in the backing file */
+	loff_t db_backing_file_data_offset;
+
+	/* Number of bytes actually stored for the block */
+	size_t db_stored_size;
+
+	/* Compression algorithm the block was stored with */
+	enum incfs_compression_alg db_comp_alg;
+};
+
+/* A read waiting for its data block to be supplied by the data loader. */
+struct pending_read {
+	incfs_uuid_t file_id;
+
+	/* When the read was issued */
+	s64 timestamp_us;
+
+	/* Non-zero once the read has been satisfied */
+	atomic_t done;
+
+	int block_index;
+
+	int serial_number;
+
+	uid_t uid;
+
+	/* Link in mount_info.mi_reads_list_head */
+	struct list_head mi_reads_list;
+
+	/* Link in data_file_segment.reads_list_head */
+	struct list_head segment_reads_list;
+
+	struct rcu_head rcu;
+};
+
+/* Per-segment state of a data file; see data_file.df_segments. */
+struct data_file_segment {
+	wait_queue_head_t new_data_arrival_wq;
+
+	/* Protects reads and writes from the blockmap */
+	struct rw_semaphore rwsem;
+
+	/* List of active pending_read objects belonging to this segment */
+	/* Protected by mount_info.pending_reads_mutex */
+	struct list_head reads_list_head;
+};
+
+/*
+ * Extra info associated with a file. Just a few bytes set by a user.
+ */
+struct file_attr {
+	loff_t fa_value_offset;
+
+	size_t fa_value_size;
+
+	u32 fa_crc;
+};
+
+
+/* In-memory representation of one incfs data file. */
+struct data_file {
+	/* Context of the backing file holding this file's data and metadata */
+	struct backing_file_context *df_backing_file_context;
+
+	struct mount_info *df_mount_info;
+
+	/* Unique ID of this file */
+	incfs_uuid_t df_id;
+
+	/*
+	 * Array of segments used to reduce lock contention for the file.
+	 * Segment is chosen for a block depends on the block's index.
+	 */
+	struct data_file_segment df_segments[SEGMENTS_PER_FILE];
+
+	/* Base offset of the first metadata record. */
+	loff_t df_metadata_off;
+
+	/* Base offset of the block map. */
+	loff_t df_blockmap_off;
+
+	/* File size in bytes */
+	loff_t df_size;
+
+	/* File header flags */
+	u32 df_header_flags;
+
+	/* File size in DATA_FILE_BLOCK_SIZE blocks */
+	int df_data_block_count;
+
+	/* Total number of blocks, data + hash */
+	int df_total_block_count;
+
+	/* For mapped files, the offset into the actual file */
+	loff_t df_mapped_offset;
+
+	/* Number of data blocks written to file */
+	atomic_t df_data_blocks_written;
+
+	/* Number of data blocks in the status block */
+	u32 df_initial_data_blocks_written;
+
+	/* Number of hash blocks written to file */
+	atomic_t df_hash_blocks_written;
+
+	/* Number of hash blocks in the status block */
+	u32 df_initial_hash_blocks_written;
+
+	/* Offset to status metadata header */
+	loff_t df_status_offset;
+
+	/*
+	 * Mutex acquired while enabling verity. Note that df_hash_tree is set
+	 * by enable verity.
+	 *
+	 * The backing file mutex bc_mutex may be taken while this mutex is
+	 * held.
+	 */
+	struct mutex df_enable_verity;
+
+	/*
+	 * Set either at construction time or during enabling verity. In the
+	 * latter case, set via smp_store_release, so use smp_load_acquire to
+	 * read it.
+	 */
+	struct mtree *df_hash_tree;
+
+	/* Guaranteed set if df_hash_tree is set. */
+	struct incfs_df_signature *df_signature;
+
+	/*
+	 * The verity file digest, set when verity is enabled and the file has
+	 * been opened
+	 */
+	struct mem_range df_verity_file_digest;
+
+	struct incfs_df_verity_signature *df_verity_signature;
+};
+
+/* Per-open state of a directory on the incfs mount. */
+struct dir_file {
+	struct mount_info *mount_info;
+
+	/* Opened file of the corresponding backing directory */
+	struct file *backing_dir;
+};
+
+/* incfs inode private data; embeds the VFS inode (n_vfs_inode). */
+struct inode_info {
+	struct mount_info *n_mount_info; /* A mount, this file belongs to */
+
+	struct inode *n_backing_inode;
+
+	/* Lazily initialized; see make_inode_ready_for_data_ops() */
+	struct data_file *n_file;
+
+	struct inode n_vfs_inode;
+};
+
+/* Per-dentry private data (d_fsdata): path of the backing dentry. */
+struct dentry_info {
+	struct path backing_path;
+};
+
+enum FILL_PERMISSION {
+	CANT_FILL = 0,
+	CAN_FILL = 1,
+};
+
+/* Per-file-descriptor state for an opened data file. */
+struct incfs_file_data {
+	/* Does this file handle have INCFS_IOC_FILL_BLOCKS permission */
+	enum FILL_PERMISSION fd_fill_permission;
+
+	/* If INCFS_IOC_GET_FILLED_BLOCKS has been called, where are we */
+	int fd_get_block_pos;
+
+	/* And how many filled blocks are there up to that point */
+	int fd_filled_data_blocks;
+	int fd_filled_hash_blocks;
+};
+
+struct mount_info *incfs_alloc_mount_info(struct super_block *sb,
+ struct mount_options *options,
+ struct path *backing_dir_path);
+
+int incfs_realloc_mount_info(struct mount_info *mi,
+ struct mount_options *options);
+
+void incfs_free_mount_info(struct mount_info *mi);
+
+char *file_id_to_str(incfs_uuid_t id);
+struct dentry *incfs_lookup_dentry(struct dentry *parent, const char *name);
+struct data_file *incfs_open_data_file(struct mount_info *mi, struct file *bf);
+void incfs_free_data_file(struct data_file *df);
+
+struct dir_file *incfs_open_dir_file(struct mount_info *mi, struct file *bf);
+void incfs_free_dir_file(struct dir_file *dir);
+
+struct incfs_read_data_file_timeouts {
+ u32 min_time_us;
+ u32 min_pending_time_us;
+ u32 max_pending_time_us;
+};
+
+ssize_t incfs_read_data_file_block(struct mem_range dst, struct file *f,
+ int index, struct mem_range tmp,
+ struct incfs_read_data_file_timeouts *timeouts,
+ unsigned int *delayed_min_us);
+
+ssize_t incfs_read_merkle_tree_blocks(struct mem_range dst,
+ struct data_file *df, size_t offset);
+
+int incfs_get_filled_blocks(struct data_file *df,
+ struct incfs_file_data *fd,
+ struct incfs_get_filled_blocks_args *arg);
+
+int incfs_read_file_signature(struct data_file *df, struct mem_range dst);
+
+int incfs_process_new_data_block(struct data_file *df,
+ struct incfs_fill_block *block, u8 *data,
+ bool *complete);
+
+int incfs_process_new_hash_block(struct data_file *df,
+ struct incfs_fill_block *block, u8 *data);
+
+bool incfs_fresh_pending_reads_exist(struct mount_info *mi, int last_number);
+
+/*
+ * Collects pending reads and saves them into the array (reads/reads_size).
+ * Only reads with serial_number > sn_lowerbound are reported.
+ * Returns how many reads were saved into the array.
+ */
+int incfs_collect_pending_reads(struct mount_info *mi, int sn_lowerbound,
+ struct incfs_pending_read_info *reads,
+ struct incfs_pending_read_info2 *reads2,
+ int reads_size, int *new_max_sn);
+
+int incfs_collect_logged_reads(struct mount_info *mi,
+ struct read_log_state *start_state,
+ struct incfs_pending_read_info *reads,
+ struct incfs_pending_read_info2 *reads2,
+ int reads_size);
+struct read_log_state incfs_get_log_state(struct mount_info *mi);
+int incfs_get_uncollected_logs_count(struct mount_info *mi,
+ const struct read_log_state *state);
+
+/*
+ * Map a VFS inode to its containing incfs inode_info.
+ * Returns NULL for a NULL inode or for an inode that does not belong to
+ * an incfs mount (detected via the superblock magic number).
+ */
+static inline struct inode_info *get_incfs_node(struct inode *inode)
+{
+	if (!inode)
+		return NULL;
+
+	if (inode->i_sb->s_magic != INCFS_MAGIC_NUMBER) {
+		/* This inode doesn't belong to us. */
+		pr_warn_once("incfs: %s on an alien inode.", __func__);
+		return NULL;
+	}
+
+	return container_of(inode, struct inode_info, n_vfs_inode);
+}
+
+/*
+ * Return the data_file behind an opened incfs regular file, or NULL if
+ * f is NULL, not a regular file, or not an incfs inode.
+ */
+static inline struct data_file *get_incfs_data_file(struct file *f)
+{
+	struct inode_info *info;
+
+	if (!f || !S_ISREG(f->f_inode->i_mode))
+		return NULL;
+
+	info = get_incfs_node(f->f_inode);
+	return info ? info->n_file : NULL;
+}
+
+/*
+ * Return the dir_file stored in an opened incfs directory's private_data,
+ * or NULL if f is NULL or not a directory.
+ */
+static inline struct dir_file *get_incfs_dir_file(struct file *f)
+{
+	if (f && S_ISDIR(f->f_inode->i_mode))
+		return (struct dir_file *)f->private_data;
+
+	return NULL;
+}
+
+/*
+ * Make sure that inode_info.n_file is initialized and inode can be used
+ * for reading and writing data from/to the backing file.
+ */
+int make_inode_ready_for_data_ops(struct mount_info *mi,
+ struct inode *inode,
+ struct file *backing_file);
+
+/* Return the dentry_info stored in d_fsdata, or NULL for a NULL dentry. */
+static inline struct dentry_info *get_incfs_dentry(const struct dentry *d)
+{
+	return d ? (struct dentry_info *)d->d_fsdata : NULL;
+}
+
+/*
+ * Copy the dentry's backing path into *path, taking a reference on it.
+ * If the dentry has no incfs private data, *path is zeroed and no
+ * reference is taken.
+ */
+static inline void get_incfs_backing_path(const struct dentry *d,
+					  struct path *path)
+{
+	struct dentry_info *info = get_incfs_dentry(d);
+
+	if (info) {
+		*path = info->backing_path;
+		path_get(path);
+	} else {
+		*path = (struct path) {};
+	}
+}
+
+/* Number of INCFS_DATA_FILE_BLOCK_SIZE blocks needed to hold size bytes. */
+static inline int get_blocks_count_for_size(u64 size)
+{
+	return size ? 1 + (size - 1) / INCFS_DATA_FILE_BLOCK_SIZE : 0;
+}
+
+#endif /* _INCFS_DATA_MGMT_H */
diff --git a/fs/incfs/format.c b/fs/incfs/format.c
new file mode 100644
--- /dev/null
+++ b/fs/incfs/format.c
@@ -0,0 +1,752 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2018 Google LLC
+ */
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/types.h>
+#include <linux/mutex.h>
+#include <linux/mm.h>
+#include <linux/falloc.h>
+#include <linux/slab.h>
+#include <linux/crc32.h>
+#include <linux/kernel.h>
+
+#include "format.h"
+#include "data_mgmt.h"
+
+/*
+ * Allocate a backing file context for the given backing file, pinning a
+ * reference to the file and borrowing the mount owner's credentials for
+ * later I/O. Returns ERR_PTR(-ENOMEM) on allocation failure.
+ */
+struct backing_file_context *incfs_alloc_bfc(struct mount_info *mi,
+					     struct file *backing_file)
+{
+	struct backing_file_context *bfc;
+
+	bfc = kzalloc(sizeof(*bfc), GFP_NOFS);
+	if (!bfc)
+		return ERR_PTR(-ENOMEM);
+
+	bfc->bc_file = get_file(backing_file);
+	bfc->bc_cred = mi->mi_owner;
+	mutex_init(&bfc->bc_mutex);
+	return bfc;
+}
+
+/* Release a backing file context: drop the file reference and free it. */
+void incfs_free_bfc(struct backing_file_context *bfc)
+{
+	if (!bfc)
+		return;
+
+	if (bfc->bc_file)
+		fput(bfc->bc_file);
+	mutex_destroy(&bfc->bc_mutex);
+	kfree(bfc);
+}
+
+/* Current end offset of the backing file, read from its inode size. */
+static loff_t incfs_get_end_offset(struct file *f)
+{
+	/*
+	 * This function assumes that file size and the end-offset
+	 * are the same. This is not always true.
+	 */
+	return i_size_read(file_inode(f));
+}
+
+/*
+ * Truncate the tail of the file to the given length.
+ * Used to rollback partially successful multistep writes.
+ *
+ * Must be called with bfc->bc_mutex held. Only shrinking is allowed:
+ * returns -EINVAL if new_end is past the current end, 0 if the size is
+ * already new_end.
+ */
+static int truncate_backing_file(struct backing_file_context *bfc,
+				loff_t new_end)
+{
+	struct inode *inode = NULL;
+	struct dentry *dentry = NULL;
+	loff_t old_end = 0;
+	struct iattr attr;
+	int result = 0;
+
+	if (!bfc)
+		return -EFAULT;
+
+	LOCK_REQUIRED(bfc->bc_mutex);
+
+	if (!bfc->bc_file)
+		return -EFAULT;
+
+	old_end = incfs_get_end_offset(bfc->bc_file);
+	if (old_end == new_end)
+		return 0;
+	if (old_end < new_end)
+		return -EINVAL;
+
+	inode = bfc->bc_file->f_inode;
+	dentry = bfc->bc_file->f_path.dentry;
+
+	attr.ia_size = new_end;
+	attr.ia_valid = ATTR_SIZE;
+
+	/* notify_change() requires the inode lock to be held. */
+	inode_lock(inode);
+	result = notify_change(&nop_mnt_idmap, dentry, &attr, NULL);
+	inode_unlock(inode);
+
+	return result;
+}
+
+/*
+ * Write exactly count bytes of buf at offset pos in the backing file.
+ * Returns 0 on success, a negative errno on failure, -EIO on short write.
+ */
+static int write_to_bf(struct backing_file_context *bfc, const void *buf,
+			size_t count, loff_t pos)
+{
+	ssize_t written = incfs_kwrite(bfc, buf, count, pos);
+
+	if (written < 0)
+		return written;
+	return written == count ? 0 : -EIO;
+}
+
+/*
+ * Fallback zero-fill used when the backing filesystem does not support
+ * fallocate: write len zero bytes starting at file_size, in chunks.
+ */
+static int append_zeros_no_fallocate(struct backing_file_context *bfc,
+				     size_t file_size, size_t len)
+{
+	u8 zeros[256] = {};
+	size_t done = 0;
+
+	while (done < len) {
+		size_t chunk = len - done;
+		int err;
+
+		if (chunk > sizeof(zeros))
+			chunk = sizeof(zeros);
+
+		err = write_to_bf(bfc, zeros, chunk, file_size + done);
+		if (err)
+			return err;
+
+		done += chunk;
+	}
+
+	return 0;
+}
+
+/*
+ * Append a given number of zero bytes to the end of the backing file.
+ * Must be called with bfc->bc_mutex held. Tries fallocate first and
+ * falls back to explicit zero writes if the filesystem doesn't support it.
+ */
+static int append_zeros(struct backing_file_context *bfc, size_t len)
+{
+	loff_t file_size = 0;
+	loff_t new_last_byte_offset = 0;
+	int result;
+
+	if (!bfc)
+		return -EFAULT;
+
+	if (len == 0)
+		return 0;
+
+	LOCK_REQUIRED(bfc->bc_mutex);
+
+	/*
+	 * Allocate only one byte at the new desired end of the file.
+	 * It will increase file size and create a zeroed area of
+	 * a given size.
+	 */
+	file_size = incfs_get_end_offset(bfc->bc_file);
+	new_last_byte_offset = file_size + len - 1;
+	result = vfs_fallocate(bfc->bc_file, 0, new_last_byte_offset, 1);
+	if (result != -EOPNOTSUPP)
+		return result;
+
+	/* Filesystem lacks fallocate support; write the zeros by hand. */
+	return append_zeros_no_fallocate(bfc, file_size, len);
+}
+
+/*
+ * Append a given metadata record to the backing file and update a previous
+ * record to add the new record to the metadata list.
+ *
+ * Must be called with bfc->bc_mutex held. On success,
+ * bfc->bc_last_md_record_offset points at the newly appended record.
+ */
+static int append_md_to_backing_file(struct backing_file_context *bfc,
+				     struct incfs_md_header *record)
+{
+	int result = 0;
+	loff_t record_offset;
+	loff_t file_pos;
+	__le64 new_md_offset;
+	size_t record_size;
+
+	if (!bfc || !record)
+		return -EFAULT;
+
+	if (bfc->bc_last_md_record_offset < 0)
+		return -EINVAL;
+
+	LOCK_REQUIRED(bfc->bc_mutex);
+
+	record_size = le16_to_cpu(record->h_record_size);
+	file_pos = incfs_get_end_offset(bfc->bc_file);
+	record->h_next_md_offset = 0;
+
+	/* Write the metadata record to the end of the backing file */
+	record_offset = file_pos;
+	new_md_offset = cpu_to_le64(record_offset);
+	result = write_to_bf(bfc, record, record_size, file_pos);
+	if (result)
+		return result;
+
+	/* Update next metadata offset in a previous record or a superblock. */
+	if (bfc->bc_last_md_record_offset) {
+		/*
+		 * Find a place in the previous md record where new record's
+		 * offset needs to be saved.
+		 */
+		file_pos = bfc->bc_last_md_record_offset +
+			offsetof(struct incfs_md_header, h_next_md_offset);
+	} else {
+		/*
+		 * No metadata yet, find the place to update in the
+		 * file_header.
+		 */
+		file_pos = offsetof(struct incfs_file_header,
+				    fh_first_md_offset);
+	}
+	result = write_to_bf(bfc, &new_md_offset, sizeof(new_md_offset),
+			     file_pos);
+	if (result)
+		return result;
+
+	bfc->bc_last_md_record_offset = record_offset;
+	return result;
+}
+
+/*
+ * Reserve 0-filled space for the blockmap body, and append
+ * incfs_blockmap metadata record pointing to it.
+ *
+ * Must be called with bfc->bc_mutex held. On failure the backing file
+ * is truncated back to its original length.
+ */
+int incfs_write_blockmap_to_backing_file(struct backing_file_context *bfc,
+					 u32 block_count)
+{
+	struct incfs_blockmap blockmap = {};
+	int result = 0;
+	loff_t file_end = 0;
+	size_t map_size = block_count * sizeof(struct incfs_blockmap_entry);
+
+	if (!bfc)
+		return -EFAULT;
+
+	blockmap.m_header.h_md_entry_type = INCFS_MD_BLOCK_MAP;
+	blockmap.m_header.h_record_size = cpu_to_le16(sizeof(blockmap));
+	blockmap.m_header.h_next_md_offset = cpu_to_le64(0);
+	blockmap.m_block_count = cpu_to_le32(block_count);
+
+	LOCK_REQUIRED(bfc->bc_mutex);
+
+	/* Reserve 0-filled space for the blockmap body in the backing file. */
+	file_end = incfs_get_end_offset(bfc->bc_file);
+	result = append_zeros(bfc, map_size);
+	if (result)
+		return result;
+
+	/* Write blockmap metadata record pointing to the body written above. */
+	blockmap.m_base_offset = cpu_to_le64(file_end);
+	result = append_md_to_backing_file(bfc, &blockmap.m_header);
+	if (result)
+		/* Error, rollback file changes */
+		truncate_backing_file(bfc, file_end);
+
+	return result;
+}
+
+/*
+ * Append the file signature blob and a 0-filled hash tree area to the
+ * backing file, followed by an incfs_file_signature metadata record that
+ * points at both.
+ *
+ * Must be called with bfc->bc_mutex held. On success, *tree_offset and
+ * *sig_offset (each optional) receive the on-file positions of the hash
+ * tree area and the signature blob. On failure the backing file is
+ * truncated back to its original length.
+ */
+int incfs_write_signature_to_backing_file(struct backing_file_context *bfc,
+					struct mem_range sig, u32 tree_size,
+					loff_t *tree_offset, loff_t *sig_offset)
+{
+	struct incfs_file_signature sg = {};
+	int result = 0;
+	loff_t rollback_pos = 0;
+	loff_t tree_area_pos = 0;
+	size_t alignment = 0;
+
+	if (!bfc)
+		return -EFAULT;
+
+	LOCK_REQUIRED(bfc->bc_mutex);
+
+	rollback_pos = incfs_get_end_offset(bfc->bc_file);
+
+	sg.sg_header.h_md_entry_type = INCFS_MD_SIGNATURE;
+	sg.sg_header.h_record_size = cpu_to_le16(sizeof(sg));
+	sg.sg_header.h_next_md_offset = cpu_to_le64(0);
+	if (sig.data != NULL && sig.len > 0) {
+		sg.sg_sig_size = cpu_to_le32(sig.len);
+		sg.sg_sig_offset = cpu_to_le64(rollback_pos);
+
+		result = write_to_bf(bfc, sig.data, sig.len, rollback_pos);
+		if (result)
+			goto err;
+	}
+
+	tree_area_pos = incfs_get_end_offset(bfc->bc_file);
+	if (tree_size > 0) {
+		if (tree_size > 5 * INCFS_DATA_FILE_BLOCK_SIZE) {
+			/*
+			 * If hash tree is big enough, it makes sense to
+			 * align in the backing file for faster access.
+			 */
+			loff_t offset = round_up(tree_area_pos, PAGE_SIZE);
+
+			alignment = offset - tree_area_pos;
+			tree_area_pos = offset;
+		}
+
+		/*
+		 * If root hash is not the only hash in the tree,
+		 * reserve 0-filled space for the tree.
+		 */
+		result = append_zeros(bfc, tree_size + alignment);
+		if (result)
+			goto err;
+
+		sg.sg_hash_tree_size = cpu_to_le32(tree_size);
+		sg.sg_hash_tree_offset = cpu_to_le64(tree_area_pos);
+	}
+
+	/* Write a hash tree metadata record pointing to the hash tree above. */
+	result = append_md_to_backing_file(bfc, &sg.sg_header);
+err:
+	if (result)
+		/* Error, rollback file changes */
+		truncate_backing_file(bfc, rollback_pos);
+	else {
+		if (tree_offset)
+			*tree_offset = tree_area_pos;
+		if (sig_offset)
+			*sig_offset = rollback_pos;
+	}
+
+	return result;
+}
+
+/*
+ * Append a brand-new status metadata record carrying the written-block
+ * counters. Used when the backing file has no status record yet.
+ * Must be called with bfc->bc_mutex held; rolls the file back on failure.
+ */
+static int write_new_status_to_backing_file(struct backing_file_context *bfc,
+				       u32 data_blocks_written,
+				       u32 hash_blocks_written)
+{
+	int result;
+	loff_t rollback_pos;
+	struct incfs_status is = {
+		.is_header = {
+			.h_md_entry_type = INCFS_MD_STATUS,
+			.h_record_size = cpu_to_le16(sizeof(is)),
+		},
+		.is_data_blocks_written = cpu_to_le32(data_blocks_written),
+		.is_hash_blocks_written = cpu_to_le32(hash_blocks_written),
+	};
+
+	LOCK_REQUIRED(bfc->bc_mutex);
+	rollback_pos = incfs_get_end_offset(bfc->bc_file);
+	result = append_md_to_backing_file(bfc, &is.is_header);
+	if (result)
+		truncate_backing_file(bfc, rollback_pos);
+
+	return result;
+}
+
+/*
+ * Persist the written-block counters. If status_offset is 0 (no status
+ * record exists yet), a new record is appended; otherwise the existing
+ * record at status_offset is updated in place via read-modify-write.
+ */
+int incfs_write_status_to_backing_file(struct backing_file_context *bfc,
+				       loff_t status_offset,
+				       u32 data_blocks_written,
+				       u32 hash_blocks_written)
+{
+	struct incfs_status is;
+	int result;
+
+	if (!bfc)
+		return -EFAULT;
+
+	if (status_offset == 0)
+		return write_new_status_to_backing_file(bfc,
+				data_blocks_written, hash_blocks_written);
+
+	result = incfs_kread(bfc, &is, sizeof(is), status_offset);
+	if (result != sizeof(is))
+		return -EIO;
+
+	is.is_data_blocks_written = cpu_to_le32(data_blocks_written);
+	is.is_hash_blocks_written = cpu_to_le32(hash_blocks_written);
+	result = incfs_kwrite(bfc, &is, sizeof(is), status_offset);
+	if (result != sizeof(is))
+		return -EIO;
+
+	return 0;
+}
+
+/*
+ * Append the fs-verity signature blob and an incfs_file_verity_signature
+ * metadata record pointing at it. On success, *offset receives the blob's
+ * position in the backing file; on failure the file is rolled back.
+ *
+ * Must be called with bfc->bc_mutex held, like the other metadata writers.
+ */
+int incfs_write_verity_signature_to_backing_file(
+		struct backing_file_context *bfc, struct mem_range signature,
+		loff_t *offset)
+{
+	struct incfs_file_verity_signature vs = {};
+	int result;
+	loff_t pos;
+
+	if (!bfc)
+		return -EFAULT;
+
+	/* No verity signature section is equivalent to an empty section */
+	if (signature.data == NULL || signature.len == 0)
+		return 0;
+
+	LOCK_REQUIRED(bfc->bc_mutex);
+
+	pos = incfs_get_end_offset(bfc->bc_file);
+
+	vs = (struct incfs_file_verity_signature) {
+		.vs_header = (struct incfs_md_header) {
+			.h_md_entry_type = INCFS_MD_VERITY_SIGNATURE,
+			.h_record_size = cpu_to_le16(sizeof(vs)),
+			.h_next_md_offset = cpu_to_le64(0),
+		},
+		.vs_size = cpu_to_le32(signature.len),
+		.vs_offset = cpu_to_le64(pos),
+	};
+
+	result = write_to_bf(bfc, signature.data, signature.len, pos);
+	if (result)
+		goto err;
+
+	result = append_md_to_backing_file(bfc, &vs.vs_header);
+	if (result)
+		goto err;
+
+	*offset = pos;
+err:
+	if (result)
+		/* Error, rollback file changes */
+		truncate_backing_file(bfc, pos);
+	return result;
+}
+
+/*
+ * Write a backing file header
+ * It should always be called only on empty file.
+ * fh.fh_first_md_offset is 0 for now, but will be updated
+ * once first metadata record is added.
+ *
+ * Must be called with bfc->bc_mutex held; returns -EEXIST if the backing
+ * file already has content.
+ */
+int incfs_write_fh_to_backing_file(struct backing_file_context *bfc,
+				   incfs_uuid_t *uuid, u64 file_size)
+{
+	struct incfs_file_header fh = {};
+	loff_t file_pos = 0;
+
+	if (!bfc)
+		return -EFAULT;
+
+	fh.fh_magic = cpu_to_le64(INCFS_MAGIC_NUMBER);
+	fh.fh_version = cpu_to_le64(INCFS_FORMAT_CURRENT_VER);
+	fh.fh_header_size = cpu_to_le16(sizeof(fh));
+	fh.fh_first_md_offset = cpu_to_le64(0);
+	fh.fh_data_block_size = cpu_to_le16(INCFS_DATA_FILE_BLOCK_SIZE);
+
+	fh.fh_file_size = cpu_to_le64(file_size);
+	fh.fh_uuid = *uuid;
+
+	LOCK_REQUIRED(bfc->bc_mutex);
+
+	file_pos = incfs_get_end_offset(bfc->bc_file);
+	if (file_pos != 0)
+		return -EEXIST;
+
+	return write_to_bf(bfc, &fh, sizeof(fh), file_pos);
+}
+
+/*
+ * Write a backing file header for a mapping file
+ * It should always be called only on empty file.
+ *
+ * Must be called with bfc->bc_mutex held; returns -EEXIST if the backing
+ * file already has content. The header records the original file's uuid,
+ * the mapped size, and the offset into the original file, and sets the
+ * INCFS_FILE_MAPPED flag.
+ */
+int incfs_write_mapping_fh_to_backing_file(struct backing_file_context *bfc,
+				incfs_uuid_t *uuid, u64 file_size, u64 offset)
+{
+	struct incfs_file_header fh = {};
+	loff_t file_pos = 0;
+
+	if (!bfc)
+		return -EFAULT;
+
+	fh.fh_magic = cpu_to_le64(INCFS_MAGIC_NUMBER);
+	fh.fh_version = cpu_to_le64(INCFS_FORMAT_CURRENT_VER);
+	fh.fh_header_size = cpu_to_le16(sizeof(fh));
+	fh.fh_original_offset = cpu_to_le64(offset);
+	fh.fh_data_block_size = cpu_to_le16(INCFS_DATA_FILE_BLOCK_SIZE);
+
+	fh.fh_mapped_file_size = cpu_to_le64(file_size);
+	fh.fh_original_uuid = *uuid;
+	fh.fh_flags = cpu_to_le32(INCFS_FILE_MAPPED);
+
+	LOCK_REQUIRED(bfc->bc_mutex);
+
+	file_pos = incfs_get_end_offset(bfc->bc_file);
+	if (file_pos != 0)
+		return -EEXIST;
+
+	return write_to_bf(bfc, &fh, sizeof(fh), file_pos);
+}
+
+/*
+ * Write a given data block and update file's blockmap to point it.
+ *
+ * Must be called with bfc->bc_mutex held. block.len must fit in the u16
+ * me_data_size blockmap field, hence the < 2^16 limit. The blockmap area
+ * (at bm_base_off) must already have been reserved.
+ */
+int incfs_write_data_block_to_backing_file(struct backing_file_context *bfc,
+				     struct mem_range block, int block_index,
+				     loff_t bm_base_off, u16 flags)
+{
+	struct incfs_blockmap_entry bm_entry = {};
+	int result = 0;
+	loff_t data_offset = 0;
+	loff_t bm_entry_off =
+		bm_base_off + sizeof(struct incfs_blockmap_entry) * block_index;
+
+	if (!bfc)
+		return -EFAULT;
+
+	if (block.len >= (1 << 16) || block_index < 0)
+		return -EINVAL;
+
+	LOCK_REQUIRED(bfc->bc_mutex);
+
+	data_offset = incfs_get_end_offset(bfc->bc_file);
+	if (data_offset <= bm_entry_off) {
+		/* Blockmap entry is beyond the file's end. It is not normal. */
+		return -EINVAL;
+	}
+
+	/* Write the block data at the end of the backing file. */
+	result = write_to_bf(bfc, block.data, block.len, data_offset);
+	if (result)
+		return result;
+
+	/* Update the blockmap to point to the newly written data. */
+	bm_entry.me_data_offset_lo = cpu_to_le32((u32)data_offset);
+	bm_entry.me_data_offset_hi = cpu_to_le16((u16)(data_offset >> 32));
+	bm_entry.me_data_size = cpu_to_le16((u16)block.len);
+	bm_entry.me_flags = cpu_to_le16(flags);
+
+	return write_to_bf(bfc, &bm_entry, sizeof(bm_entry),
+				bm_entry_off);
+}
+
+/*
+ * Write a hash block into the pre-reserved hash area and update the
+ * file's blockmap to point at it.
+ *
+ * Must be called with bfc->bc_mutex held. Hash blocks live at fixed
+ * block-sized offsets from hash_area_off; their blockmap entries follow
+ * the data-block entries (hence the get_blocks_count_for_size() shift).
+ */
+int incfs_write_hash_block_to_backing_file(struct backing_file_context *bfc,
+					   struct mem_range block,
+					   int block_index,
+					   loff_t hash_area_off,
+					   loff_t bm_base_off,
+					   loff_t file_size)
+{
+	struct incfs_blockmap_entry bm_entry = {};
+	int result;
+	loff_t data_offset = 0;
+	loff_t file_end = 0;
+	loff_t bm_entry_off =
+		bm_base_off +
+		sizeof(struct incfs_blockmap_entry) *
+			(block_index + get_blocks_count_for_size(file_size));
+
+	if (!bfc)
+		return -EFAULT;
+
+	LOCK_REQUIRED(bfc->bc_mutex);
+
+	data_offset = hash_area_off + block_index * INCFS_DATA_FILE_BLOCK_SIZE;
+	file_end = incfs_get_end_offset(bfc->bc_file);
+	if (data_offset + block.len > file_end) {
+		/* Block is located beyond the file's end. It is not normal. */
+		return -EINVAL;
+	}
+
+	result = write_to_bf(bfc, block.data, block.len, data_offset);
+	if (result)
+		return result;
+
+	bm_entry.me_data_offset_lo = cpu_to_le32((u32)data_offset);
+	bm_entry.me_data_offset_hi = cpu_to_le16((u16)(data_offset >> 32));
+	bm_entry.me_data_size = cpu_to_le16(INCFS_DATA_FILE_BLOCK_SIZE);
+
+	return write_to_bf(bfc, &bm_entry, sizeof(bm_entry), bm_entry_off);
+}
+
+/*
+ * Read a single blockmap entry for block_index into *bm_entry.
+ * Returns 0 on success, -EIO if the entry could not be read, or a
+ * negative errno propagated from the bulk reader.
+ */
+int incfs_read_blockmap_entry(struct backing_file_context *bfc, int block_index,
+			      loff_t bm_base_off,
+			      struct incfs_blockmap_entry *bm_entry)
+{
+	int read = incfs_read_blockmap_entries(bfc, bm_entry, block_index, 1,
+					       bm_base_off);
+
+	if (read < 0)
+		return read;
+
+	switch (read) {
+	case 1:
+		return 0;
+	case 0:
+		return -EIO;
+	default:
+		return -EFAULT;
+	}
+}
+
+/*
+ * Read blocks_number consecutive blockmap entries starting at start_index
+ * into the entries array. Returns the number of whole entries read, or a
+ * negative errno on failure.
+ */
+int incfs_read_blockmap_entries(struct backing_file_context *bfc,
+		struct incfs_blockmap_entry *entries,
+		int start_index, int blocks_number,
+		loff_t bm_base_off)
+{
+	loff_t bm_entry_off =
+		bm_base_off + sizeof(struct incfs_blockmap_entry) * start_index;
+	const size_t bytes_to_read = sizeof(struct incfs_blockmap_entry)
+					* blocks_number;
+	int result = 0;
+
+	if (!bfc || !entries)
+		return -EFAULT;
+
+	if (start_index < 0 || bm_base_off <= 0)
+		return -ENODATA;
+
+	result = incfs_kread(bfc, entries, bytes_to_read, bm_entry_off);
+	if (result < 0)
+		return result;
+	return result / sizeof(*entries);
+}
+
+/*
+ * Read and validate the backing file header.
+ *
+ * Checks magic, format version, block size and header size, returning
+ * -EBADMSG for a short read and -EILSEQ for a malformed header. On
+ * success, *first_md_off receives the first metadata record offset;
+ * uuid, file_size and flags are optional out-parameters.
+ */
+int incfs_read_file_header(struct backing_file_context *bfc,
+			   loff_t *first_md_off, incfs_uuid_t *uuid,
+			   u64 *file_size, u32 *flags)
+{
+	ssize_t bytes_read = 0;
+	struct incfs_file_header fh = {};
+
+	if (!bfc || !first_md_off)
+		return -EFAULT;
+
+	bytes_read = incfs_kread(bfc, &fh, sizeof(fh), 0);
+	if (bytes_read < 0)
+		return bytes_read;
+
+	if (bytes_read < sizeof(fh))
+		return -EBADMSG;
+
+	if (le64_to_cpu(fh.fh_magic) != INCFS_MAGIC_NUMBER)
+		return -EILSEQ;
+
+	if (le64_to_cpu(fh.fh_version) > INCFS_FORMAT_CURRENT_VER)
+		return -EILSEQ;
+
+	if (le16_to_cpu(fh.fh_data_block_size) != INCFS_DATA_FILE_BLOCK_SIZE)
+		return -EILSEQ;
+
+	if (le16_to_cpu(fh.fh_header_size) != sizeof(fh))
+		return -EILSEQ;
+
+	/* first_md_off was already validated non-NULL above. */
+	*first_md_off = le64_to_cpu(fh.fh_first_md_offset);
+	if (uuid)
+		*uuid = fh.fh_uuid;
+	if (file_size)
+		*file_size = le64_to_cpu(fh.fh_file_size);
+	if (flags)
+		*flags = le32_to_cpu(fh.fh_flags);
+	return 0;
+}
+
+/*
+ * Read through metadata records from the backing file one by one
+ * and call provided metadata handlers.
+ *
+ * Reads the record at handler->md_record_offset, validates its size and
+ * forward-only chaining, dispatches it to the matching handler callback,
+ * and on success advances the handler's offsets to the next record.
+ * A zero next-record offset marks the end of the metadata list.
+ */
+int incfs_read_next_metadata_record(struct backing_file_context *bfc,
+			      struct metadata_handler *handler)
+{
+	const ssize_t max_md_size = INCFS_MAX_METADATA_RECORD_SIZE;
+	ssize_t bytes_read = 0;
+	size_t md_record_size = 0;
+	loff_t next_record = 0;
+	int res = 0;
+	struct incfs_md_header *md_hdr = NULL;
+
+	if (!bfc || !handler)
+		return -EFAULT;
+
+	if (handler->md_record_offset == 0)
+		return -EPERM;
+
+	memset(&handler->md_buffer, 0, max_md_size);
+	bytes_read = incfs_kread(bfc, &handler->md_buffer, max_md_size,
+				 handler->md_record_offset);
+	if (bytes_read < 0)
+		return bytes_read;
+	if (bytes_read < sizeof(*md_hdr))
+		return -EBADMSG;
+
+	md_hdr = &handler->md_buffer.md_header;
+	next_record = le64_to_cpu(md_hdr->h_next_md_offset);
+	md_record_size = le16_to_cpu(md_hdr->h_record_size);
+
+	if (md_record_size > max_md_size) {
+		pr_warn("incfs: The record is too large. Size: %zu",
+				md_record_size);
+		return -EBADMSG;
+	}
+
+	if (bytes_read < md_record_size) {
+		pr_warn("incfs: The record hasn't been fully read.");
+		return -EBADMSG;
+	}
+
+	/* The list is append-only; a backwards link would mean corruption. */
+	if (next_record <= handler->md_record_offset && next_record != 0) {
+		pr_warn("incfs: Next record (%lld) points back in file.",
+			next_record);
+		return -EBADMSG;
+	}
+
+	switch (md_hdr->h_md_entry_type) {
+	case INCFS_MD_NONE:
+		break;
+	case INCFS_MD_BLOCK_MAP:
+		if (handler->handle_blockmap)
+			res = handler->handle_blockmap(
+				&handler->md_buffer.blockmap, handler);
+		break;
+	case INCFS_MD_FILE_ATTR:
+		/*
+		 * File attrs no longer supported, ignore section for
+		 * compatibility
+		 */
+		break;
+	case INCFS_MD_SIGNATURE:
+		if (handler->handle_signature)
+			res = handler->handle_signature(
+				&handler->md_buffer.signature, handler);
+		break;
+	case INCFS_MD_STATUS:
+		if (handler->handle_status)
+			res = handler->handle_status(
+				&handler->md_buffer.status, handler);
+		break;
+	case INCFS_MD_VERITY_SIGNATURE:
+		if (handler->handle_verity_signature)
+			res = handler->handle_verity_signature(
+				&handler->md_buffer.verity_signature, handler);
+		break;
+	default:
+		res = -ENOTSUPP;
+		break;
+	}
+
+	if (!res) {
+		if (next_record == 0) {
+			/*
+			 * Zero offset for the next record means that the last
+			 * metadata record has just been processed.
+			 */
+			bfc->bc_last_md_record_offset =
+				handler->md_record_offset;
+		}
+		handler->md_prev_record_offset = handler->md_record_offset;
+		handler->md_record_offset = next_record;
+	}
+	return res;
+}
+
+/*
+ * kernel_read() from the backing file under the mount owner's credentials.
+ * Returns the number of bytes read or a negative errno.
+ *
+ * Use ssize_t for the intermediate result: kernel_read() returns ssize_t
+ * and storing it in an int would truncate counts >= 2GiB before the
+ * value is returned as ssize_t.
+ */
+ssize_t incfs_kread(struct backing_file_context *bfc, void *buf, size_t size,
+		    loff_t pos)
+{
+	const struct cred *old_cred = override_creds(bfc->bc_cred);
+	ssize_t ret = kernel_read(bfc->bc_file, buf, size, &pos);
+
+	revert_creds(old_cred);
+	return ret;
+}
+
+/*
+ * kernel_write() to the backing file under the mount owner's credentials.
+ * Returns the number of bytes written or a negative errno.
+ *
+ * Use ssize_t for the intermediate result: kernel_write() returns ssize_t
+ * and storing it in an int would truncate counts >= 2GiB before the
+ * value is returned as ssize_t.
+ */
+ssize_t incfs_kwrite(struct backing_file_context *bfc, const void *buf,
+		     size_t size, loff_t pos)
+{
+	const struct cred *old_cred = override_creds(bfc->bc_cred);
+	ssize_t ret = kernel_write(bfc->bc_file, buf, size, &pos);
+
+	revert_creds(old_cred);
+	return ret;
+}
diff --git a/fs/incfs/format.h b/fs/incfs/format.h
new file mode 100644
--- /dev/null
+++ b/fs/incfs/format.h
@@ -0,0 +1,408 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2018 Google LLC
+ */
+
+/*
+ * Overview
+ * --------
+ * The backbone of the incremental-fs ondisk format is an append only linked
+ * list of metadata blocks. Each metadata block contains an offset of the next
+ * one. These blocks describe files and directories on the
+ * file system. They also represent actions of adding and removing file names
+ * (hard links).
+ *
+ * Every time incremental-fs instance is mounted, it reads through this list
+ * to recreate filesystem's state in memory. An offset of the first record in
+ * the metadata list is stored in the superblock at the beginning of the backing
+ * file.
+ *
+ * Most of the backing file is taken by data areas and blockmaps.
+ * Since data blocks can be compressed and have different sizes,
+ * single per-file data area can't be pre-allocated. That's why blockmaps are
+ * needed in order to find a location and size of each data block in
+ * the backing file. Each time a file is created, a corresponding block map is
+ * allocated to store future offsets of data blocks.
+ *
+ * Whenever a data block is given by data loader to incremental-fs:
+ * - A data area with the given block is appended to the end of
+ * the backing file.
+ * - A record in the blockmap for the given block index is updated to reflect
+ * its location, size, and compression algorithm.
+ *
+ * Metadata records
+ * ----------------
+ * incfs_blockmap - metadata record that specifies size and location
+ * of a blockmap area for a given file. This area
+ * contains an array of incfs_blockmap_entry-s.
+ * incfs_file_signature - metadata record that specifies where file signature
+ * and its hash tree can be found in the backing file.
+ *
+ * incfs_file_attr - metadata record that specifies where additional file
+ * attributes blob can be found.
+ *
+ * Metadata header
+ * ---------------
+ * incfs_md_header - header of a metadata record. It's always a part
+ * of other structures and serves the purpose of metadata
+ * bookkeeping.
+ *
+ * +-----------------------------------------------+ ^
+ * | incfs_md_header | |
+ * | 1. type of body(BLOCKMAP, FILE_ATTR..) | |
+ * | 2. size of the whole record header + body | |
+ * | 3. CRC the whole record header + body | |
+ * | 4. offset of the previous md record |]------+
+ * | 5. offset of the next md record (md link) |]---+
+ * +-----------------------------------------------+ |
+ * | Metadata record body with useful data | |
+ * +-----------------------------------------------+ |
+ * +--->
+ *
+ * Other ondisk structures
+ * -----------------------
+ * incfs_super_block - backing file header
+ * incfs_blockmap_entry - a record in a blockmap area that describes size
+ * and location of a data block.
+ * Data blocks don't have any particular structure; they are written to the
+ * backing file in a raw form as they come from a data loader.
+ *
+ * Backing file layout
+ * -------------------
+ *
+ *
+ * +-------------------------------------------+
+ * | incfs_file_header |]---+
+ * +-------------------------------------------+ |
+ * | metadata |<---+
+ * | incfs_file_signature |]---+
+ * +-------------------------------------------+ |
+ * ......................... |
+ * +-------------------------------------------+ | metadata
+ * +------->| blockmap area | | list links
+ * | | [incfs_blockmap_entry] | |
+ * | | [incfs_blockmap_entry] | |
+ * | | [incfs_blockmap_entry] | |
+ * | +--[| [incfs_blockmap_entry] | |
+ * | | | [incfs_blockmap_entry] | |
+ * | | | [incfs_blockmap_entry] | |
+ * | | +-------------------------------------------+ |
+ * | | ......................... |
+ * | | +-------------------------------------------+ |
+ * | | | metadata |<---+
+ * +----|--[| incfs_blockmap |]---+
+ * | +-------------------------------------------+ |
+ * | ......................... |
+ * | +-------------------------------------------+ |
+ * +-->| data block | |
+ * +-------------------------------------------+ |
+ * ......................... |
+ * +-------------------------------------------+ |
+ * | metadata |<---+
+ * | incfs_file_attr |
+ * +-------------------------------------------+
+ */
+#ifndef _INCFS_FORMAT_H
+#define _INCFS_FORMAT_H
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <uapi/linux/incrementalfs.h>
+
+#include "internal.h"
+
+#define INCFS_MAX_NAME_LEN 255
+#define INCFS_FORMAT_V1 1
+#define INCFS_FORMAT_CURRENT_VER INCFS_FORMAT_V1
+
+enum incfs_metadata_type {
+ INCFS_MD_NONE = 0,
+ INCFS_MD_BLOCK_MAP = 1,
+ INCFS_MD_FILE_ATTR = 2,
+ INCFS_MD_SIGNATURE = 3,
+ INCFS_MD_STATUS = 4,
+ INCFS_MD_VERITY_SIGNATURE = 5,
+};
+
+enum incfs_file_header_flags {
+ INCFS_FILE_MAPPED = 1 << 1,
+};
+
+/* Header included at the beginning of all metadata records on the disk. */
+struct incfs_md_header {
+ __u8 h_md_entry_type;
+
+ /*
+ * Size of the metadata record.
+ * (e.g. inode, dir entry etc) not just this struct.
+ */
+ __le16 h_record_size;
+
+ /*
+ * Was: CRC32 of the metadata record.
+ * (e.g. inode, dir entry etc) not just this struct.
+ */
+ __le32 h_unused1;
+
+ /* Offset of the next metadata entry if any */
+ __le64 h_next_md_offset;
+
+ /* Was: Offset of the previous metadata entry if any */
+ __le64 h_unused2;
+
+} __packed;
+
+/* Backing file header */
+struct incfs_file_header {
+ /* Magic number: INCFS_MAGIC_NUMBER */
+ __le64 fh_magic;
+
+ /* Format version: INCFS_FORMAT_CURRENT_VER */
+ __le64 fh_version;
+
+ /* sizeof(incfs_file_header) */
+ __le16 fh_header_size;
+
+ /* INCFS_DATA_FILE_BLOCK_SIZE */
+ __le16 fh_data_block_size;
+
+ /* File flags, from incfs_file_header_flags */
+ __le32 fh_flags;
+
+ union {
+ /* Standard incfs file */
+ struct {
+ /* Offset of the first metadata record */
+ __le64 fh_first_md_offset;
+
+ /* Full size of the file's content */
+ __le64 fh_file_size;
+
+ /* File uuid */
+ incfs_uuid_t fh_uuid;
+ };
+
+ /* Mapped file - INCFS_FILE_MAPPED set in fh_flags */
+ struct {
+ /* Offset in original file */
+ __le64 fh_original_offset;
+
+ /* Full size of the file's content */
+ __le64 fh_mapped_file_size;
+
+ /* Original file's uuid */
+ incfs_uuid_t fh_original_uuid;
+ };
+ };
+} __packed;
+
+enum incfs_block_map_entry_flags {
+ INCFS_BLOCK_COMPRESSED_LZ4 = 1,
+ INCFS_BLOCK_COMPRESSED_ZSTD = 2,
+
+ /* Reserve 3 bits for compression alg */
+ INCFS_BLOCK_COMPRESSED_MASK = 7,
+};
+
+/* Block map entry pointing to an actual location of the data block. */
+struct incfs_blockmap_entry {
+ /* Offset of the actual data block. Lower 32 bits */
+ __le32 me_data_offset_lo;
+
+ /* Offset of the actual data block. Higher 16 bits */
+ __le16 me_data_offset_hi;
+
+ /* How many bytes the data actually occupies in the backing file */
+ __le16 me_data_size;
+
+ /* Block flags from incfs_block_map_entry_flags */
+ __le16 me_flags;
+} __packed;
+
+/* Metadata record for locations of file blocks. Type = INCFS_MD_BLOCK_MAP */
+struct incfs_blockmap {
+ struct incfs_md_header m_header;
+
+ /* Base offset of the array of incfs_blockmap_entry */
+ __le64 m_base_offset;
+
+ /* Size of the map entry array in blocks */
+ __le32 m_block_count;
+} __packed;
+
+/*
+ * Metadata record for file signature. Type = INCFS_MD_SIGNATURE
+ *
+ * The signature stored here is the APK V4 signature data blob. See the
+ * definition of incfs_new_file_args::signature_info for an explanation of this
+ * blob. Specifically, it contains the root hash, but it does *not* contain
+ * anything that the kernel treats as a signature.
+ *
+ * When FS_IOC_ENABLE_VERITY is called on a file without this record, an APK V4
+ * signature blob and a hash tree are added to the file, and then this metadata
+ * record is created to record their locations.
+ */
+struct incfs_file_signature {
+ struct incfs_md_header sg_header;
+
+ __le32 sg_sig_size; /* The size of the signature. */
+
+ __le64 sg_sig_offset; /* Signature's offset in the backing file */
+
+ __le32 sg_hash_tree_size; /* The size of the hash tree. */
+
+ __le64 sg_hash_tree_offset; /* Hash tree offset in the backing file */
+} __packed;
+
+/* In memory version of above */
+struct incfs_df_signature {
+ u32 sig_size;
+ u64 sig_offset;
+ u32 hash_size;
+ u64 hash_offset;
+};
+
+struct incfs_status {
+ struct incfs_md_header is_header;
+
+ __le32 is_data_blocks_written; /* Number of data blocks written */
+
+ __le32 is_hash_blocks_written; /* Number of hash blocks written */
+
+ __le32 is_dummy[6]; /* Spare fields */
+} __packed;
+
+/*
+ * Metadata record for verity signature. Type = INCFS_MD_VERITY_SIGNATURE
+ *
+ * This record will only exist for verity-enabled files with signatures. Verity
+ * enabled files without signatures do not have this record.
+ *
+ * This is obsolete, as incfs no longer checks this type of signature.
+ */
+struct incfs_file_verity_signature {
+ struct incfs_md_header vs_header;
+
+ /* The size of the signature */
+ __le32 vs_size;
+
+ /* Signature's offset in the backing file */
+ __le64 vs_offset;
+} __packed;
+
+/* In memory version of above */
+struct incfs_df_verity_signature {
+ u32 size;
+ u64 offset;
+};
+
+/* State of the backing file. */
+struct backing_file_context {
+ /* Protects writes to bc_file */
+ struct mutex bc_mutex;
+
+ /* File object to read data from */
+ struct file *bc_file;
+
+ /*
+ * Offset of the last known metadata record in the backing file.
+ * 0 means there are no metadata records.
+ */
+ loff_t bc_last_md_record_offset;
+
+ /*
+ * Credentials to set before reads/writes
+ * Note that this is a pointer to the mount_info mi_owner field so
+ * there is no need to get/put the creds
+ */
+ const struct cred *bc_cred;
+};
+
+struct metadata_handler {
+ loff_t md_record_offset;
+ loff_t md_prev_record_offset;
+ void *context;
+
+ union {
+ struct incfs_md_header md_header;
+ struct incfs_blockmap blockmap;
+ struct incfs_file_signature signature;
+ struct incfs_status status;
+ struct incfs_file_verity_signature verity_signature;
+ } md_buffer;
+
+ int (*handle_blockmap)(struct incfs_blockmap *bm,
+ struct metadata_handler *handler);
+ int (*handle_signature)(struct incfs_file_signature *sig,
+ struct metadata_handler *handler);
+ int (*handle_status)(struct incfs_status *sig,
+ struct metadata_handler *handler);
+ int (*handle_verity_signature)(struct incfs_file_verity_signature *s,
+ struct metadata_handler *handler);
+};
+#define INCFS_MAX_METADATA_RECORD_SIZE \
+ sizeof_field(struct metadata_handler, md_buffer)
+
+/* Backing file context management */
+struct mount_info;
+struct backing_file_context *incfs_alloc_bfc(struct mount_info *mi,
+ struct file *backing_file);
+
+void incfs_free_bfc(struct backing_file_context *bfc);
+
+/* Writing stuff */
+int incfs_write_blockmap_to_backing_file(struct backing_file_context *bfc,
+ u32 block_count);
+
+int incfs_write_fh_to_backing_file(struct backing_file_context *bfc,
+ incfs_uuid_t *uuid, u64 file_size);
+
+int incfs_write_mapping_fh_to_backing_file(struct backing_file_context *bfc,
+ incfs_uuid_t *uuid, u64 file_size, u64 offset);
+
+int incfs_write_data_block_to_backing_file(struct backing_file_context *bfc,
+ struct mem_range block,
+ int block_index, loff_t bm_base_off,
+ u16 flags);
+
+int incfs_write_hash_block_to_backing_file(struct backing_file_context *bfc,
+ struct mem_range block,
+ int block_index,
+ loff_t hash_area_off,
+ loff_t bm_base_off,
+ loff_t file_size);
+
+int incfs_write_signature_to_backing_file(struct backing_file_context *bfc,
+ struct mem_range sig, u32 tree_size,
+ loff_t *tree_offset, loff_t *sig_offset);
+
+int incfs_write_status_to_backing_file(struct backing_file_context *bfc,
+ loff_t status_offset,
+ u32 data_blocks_written,
+ u32 hash_blocks_written);
+int incfs_write_verity_signature_to_backing_file(
+ struct backing_file_context *bfc, struct mem_range signature,
+ loff_t *offset);
+
+/* Reading stuff */
+int incfs_read_file_header(struct backing_file_context *bfc,
+ loff_t *first_md_off, incfs_uuid_t *uuid,
+ u64 *file_size, u32 *flags);
+
+int incfs_read_blockmap_entry(struct backing_file_context *bfc, int block_index,
+ loff_t bm_base_off,
+ struct incfs_blockmap_entry *bm_entry);
+
+int incfs_read_blockmap_entries(struct backing_file_context *bfc,
+ struct incfs_blockmap_entry *entries,
+ int start_index, int blocks_number,
+ loff_t bm_base_off);
+
+int incfs_read_next_metadata_record(struct backing_file_context *bfc,
+ struct metadata_handler *handler);
+
+ssize_t incfs_kread(struct backing_file_context *bfc, void *buf, size_t size,
+ loff_t pos);
+ssize_t incfs_kwrite(struct backing_file_context *bfc, const void *buf,
+ size_t size, loff_t pos);
+
+#endif /* _INCFS_FORMAT_H */
diff --git a/fs/incfs/integrity.c b/fs/incfs/integrity.c
new file mode 100644
--- /dev/null
+++ b/fs/incfs/integrity.c
@@ -0,0 +1,235 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2019 Google LLC
+ */
+#include <crypto/sha2.h>
+#include <crypto/hash.h>
+#include <linux/err.h>
+#include <linux/version.h>
+
+#include "integrity.h"
+
+/*
+ * Return the hash-algorithm descriptor for @id, lazily allocating its
+ * crypto_shash transform on first use.  The lazy init is lock-free: a
+ * racing loser frees its own transform.  Only SHA-256 is supported;
+ * anything else returns ERR_PTR(-ENOENT).
+ */
+struct incfs_hash_alg *incfs_get_hash_alg(enum incfs_hash_tree_algorithm id)
+{
+	static struct incfs_hash_alg sha256 = {
+		.name = "sha256",
+		.digest_size = SHA256_DIGEST_SIZE,
+		.id = INCFS_HASH_TREE_SHA256
+	};
+	struct incfs_hash_alg *result = NULL;
+	struct crypto_shash *shash;
+
+	if (id == INCFS_HASH_TREE_SHA256) {
+		BUILD_BUG_ON(INCFS_MAX_HASH_SIZE < SHA256_DIGEST_SIZE);
+		result = &sha256;
+	}
+
+	if (result == NULL)
+		return ERR_PTR(-ENOENT);
+
+	/* pairs with cmpxchg_release() below */
+	shash = smp_load_acquire(&result->shash);
+	if (shash)
+		return result;
+
+	shash = crypto_alloc_shash(result->name, 0, 0);
+	if (IS_ERR(shash)) {
+		int err = PTR_ERR(shash);
+
+		/* "incfs: " prefix + "\n" for consistency with other logs */
+		pr_err("incfs: Can't allocate hash alg %s, error code:%d\n",
+			result->name, err);
+		return ERR_PTR(err);
+	}
+
+	/* pairs with smp_load_acquire() above; if we lost the race, another
+	 * thread's transform is installed and ours is freed.
+	 */
+	if (cmpxchg_release(&result->shash, NULL, shash) != NULL)
+		crypto_free_shash(shash);
+
+	return result;
+}
+
+/*
+ * Parsed, in-memory view of the on-disk signature blob (filled in by
+ * incfs_parse_signature()).  salt/root_hash point into the original blob.
+ */
+struct signature_info {
+	u32 version;
+	enum incfs_hash_tree_algorithm hash_algorithm;
+	u8 log2_blocksize;
+	struct mem_range salt;
+	struct mem_range root_hash;
+};
+
+/*
+ * Consume a little-endian u32 from the parse cursor *@p, bounded by @top.
+ * Returns false (cursor unchanged) if fewer than 4 bytes remain.
+ */
+static bool read_u32(u8 **p, u8 *top, u32 *result)
+{
+	/*
+	 * Check the remaining length by subtraction: the previous form
+	 * "*p + sizeof(u32) > top" can wrap around the address space when
+	 * *p is near its end, defeating the bound check.
+	 */
+	if (top - *p < (ptrdiff_t)sizeof(u32))
+		return false;
+
+	*result = le32_to_cpu(*(__le32 *)*p);
+	*p += sizeof(u32);
+	return true;
+}
+
+/*
+ * Consume one byte from the parse cursor *@p, bounded by @top.
+ * Returns false (cursor unchanged) if no bytes remain.
+ */
+static bool read_u8(u8 **p, u8 *top, u8 *result)
+{
+	/* Equivalent to "*p + 1 > top" but cannot wrap at the address end. */
+	if (*p >= top)
+		return false;
+
+	*result = *(u8 *)*p;
+	*p += sizeof(u8);
+	return true;
+}
+
+/*
+ * Consume a length-prefixed byte range (u32 length, then payload) from the
+ * parse cursor *@p, bounded by @top.  @range ends up pointing into the
+ * source buffer; no copy is made.
+ */
+static bool read_mem_range(u8 **p, u8 *top, struct mem_range *range)
+{
+	u32 len;
+
+	/*
+	 * @len comes from untrusted input; the previous check
+	 * "*p + len > top" can wrap (notably on 32-bit), bypassing the
+	 * bound.  Compare against the bytes actually remaining instead;
+	 * after a successful read_u32(), *p <= top so top - *p >= 0.
+	 */
+	if (!read_u32(p, top, &len) || len > (size_t)(top - *p))
+		return false;
+
+	range->len = len;
+	range->data = *p;
+	*p += len;
+	return true;
+}
+
+/*
+ * Parse and validate a signature blob into @si.
+ *
+ * Wire layout (little-endian):
+ *   u32 version            -- must equal INCFS_SIGNATURE_VERSION
+ *   u32 hash_section_size  -- the rest of the blob, checked to fit
+ *   then, within the hash section:
+ *     u32 hash_algorithm   -- only INCFS_HASH_TREE_SHA256 accepted
+ *     u8  log2_blocksize   -- only 12 (4096-byte blocks) accepted
+ *     u32-length-prefixed salt
+ *     u32-length-prefixed root hash
+ *
+ * si->salt / si->root_hash point into @signature, so the caller must keep
+ * that buffer alive while @si is in use.  Returns 0 or -EINVAL.
+ */
+static int incfs_parse_signature(struct mem_range signature,
+				 struct signature_info *si)
+{
+	u8 *p = signature.data;
+	u8 *top = signature.data + signature.len;
+	u32 hash_section_size;
+
+	if (signature.len > INCFS_MAX_SIGNATURE_SIZE)
+		return -EINVAL;
+
+	if (!read_u32(&p, top, &si->version) ||
+	    si->version != INCFS_SIGNATURE_VERSION)
+		return -EINVAL;
+
+	/* NOTE(review): passes &si->hash_algorithm (an enum) as u32 *;
+	 * assumes the enum is 32-bit here -- confirm on all targets.
+	 */
+	if (!read_u32(&p, top, &hash_section_size) ||
+	    p + hash_section_size > top)
+		return -EINVAL;
+	top = p + hash_section_size;
+
+	if (!read_u32(&p, top, &si->hash_algorithm) ||
+	    si->hash_algorithm != INCFS_HASH_TREE_SHA256)
+		return -EINVAL;
+
+	if (!read_u8(&p, top, &si->log2_blocksize) || si->log2_blocksize != 12)
+		return -EINVAL;
+
+	if (!read_mem_range(&p, top, &si->salt))
+		return -EINVAL;
+
+	if (!read_mem_range(&p, top, &si->root_hash))
+		return -EINVAL;
+
+	/* No trailing bytes allowed: the hash section must be consumed. */
+	if (p != top)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Build the Merkle-tree geometry for a file of @data_block_count blocks
+ * from its signature blob: number of levels, per-level offsets within the
+ * hash area, total hash area size, and the root hash.
+ *
+ * Returns the allocated mtree (caller frees with incfs_free_mtree()) or
+ * an ERR_PTR: -EINVAL for bad input, -E2BIG if the tree would exceed
+ * INCFS_MAX_MTREE_LEVELS or INCFS_MAX_HASH_AREA_SIZE.
+ */
+struct mtree *incfs_alloc_mtree(struct mem_range signature,
+				int data_block_count)
+{
+	int error;
+	struct signature_info si;
+	struct mtree *result = NULL;
+	struct incfs_hash_alg *hash_alg = NULL;
+	int hash_per_block;
+	int lvl;
+	int total_blocks = 0;
+	int blocks_in_level[INCFS_MAX_MTREE_LEVELS];
+	int blocks = data_block_count;
+
+	if (data_block_count <= 0)
+		return ERR_PTR(-EINVAL);
+
+	error = incfs_parse_signature(signature, &si);
+	if (error)
+		return ERR_PTR(error);
+
+	hash_alg = incfs_get_hash_alg(si.hash_algorithm);
+	if (IS_ERR(hash_alg))
+		return ERR_CAST(hash_alg);
+
+	if (si.root_hash.len < hash_alg->digest_size)
+		return ERR_PTR(-EINVAL);
+
+	result = kzalloc(sizeof(*result), GFP_NOFS);
+	if (!result)
+		return ERR_PTR(-ENOMEM);
+
+	result->alg = hash_alg;
+	hash_per_block = INCFS_DATA_FILE_BLOCK_SIZE / result->alg->digest_size;
+
+	/* Calculating tree geometry. */
+	/* First pass: calculate how many blocks in each tree level. */
+	for (lvl = 0; blocks > 1; lvl++) {
+		if (lvl >= INCFS_MAX_MTREE_LEVELS) {
+			pr_err("incfs: too much data in mtree\n");
+			goto err;
+		}
+
+		blocks = (blocks + hash_per_block - 1) / hash_per_block;
+		blocks_in_level[lvl] = blocks;
+		total_blocks += blocks;
+	}
+	result->depth = lvl;
+	result->hash_tree_area_size = total_blocks * INCFS_DATA_FILE_BLOCK_SIZE;
+	if (result->hash_tree_area_size > INCFS_MAX_HASH_AREA_SIZE)
+		goto err;
+
+	blocks = 0;
+	/* Second pass: calculate offset of each level. 0th level goes last. */
+	for (lvl = 0; lvl < result->depth; lvl++) {
+		u32 suboffset;
+
+		blocks += blocks_in_level[lvl];
+		suboffset = (total_blocks - blocks)
+					* INCFS_DATA_FILE_BLOCK_SIZE;
+
+		result->hash_level_suboffset[lvl] = suboffset;
+	}
+
+	/* Root hash is stored separately from the rest of the tree. */
+	memcpy(result->root_hash, si.root_hash.data, hash_alg->digest_size);
+	return result;
+
+err:
+	kfree(result);
+	return ERR_PTR(-E2BIG);
+}
+
+/* Free a tree from incfs_alloc_mtree(); kfree(NULL) is a no-op. */
+void incfs_free_mtree(struct mtree *tree)
+{
+	kfree(tree);
+}
+
+/*
+ * Compute the digest of @data into @digest using @alg.  Blocks shorter
+ * than INCFS_DATA_FILE_BLOCK_SIZE are hashed as if zero-padded to a full
+ * block.  Returns 0 or a negative error.
+ */
+int incfs_calc_digest(struct incfs_hash_alg *alg, struct mem_range data,
+			struct mem_range digest)
+{
+	/*
+	 * Validate arguments BEFORE declaring the descriptor:
+	 * SHASH_DESC_ON_STACK(desc, alg->shash) evaluates
+	 * crypto_shash_descsize(alg->shash) at the point of declaration,
+	 * which dereferenced a possibly-NULL alg/shash ahead of these
+	 * checks in the previous version.
+	 */
+	if (!alg || !alg->shash || !data.data || !digest.data)
+		return -EFAULT;
+
+	if (alg->digest_size > digest.len)
+		return -EINVAL;
+
+	{
+		SHASH_DESC_ON_STACK(desc, alg->shash);
+
+		desc->tfm = alg->shash;
+
+		if (data.len < INCFS_DATA_FILE_BLOCK_SIZE) {
+			int err;
+			void *buf = kzalloc(INCFS_DATA_FILE_BLOCK_SIZE,
+					    GFP_NOFS);
+
+			if (!buf)
+				return -ENOMEM;
+
+			/* Zero-pad the short block, then hash a full block */
+			memcpy(buf, data.data, data.len);
+			err = crypto_shash_digest(desc, buf,
+						  INCFS_DATA_FILE_BLOCK_SIZE,
+						  digest.data);
+			kfree(buf);
+			return err;
+		}
+		return crypto_shash_digest(desc, data.data, data.len,
+					   digest.data);
+	}
+}
+
diff --git a/fs/incfs/integrity.h b/fs/incfs/integrity.h
new file mode 100644
--- /dev/null
+++ b/fs/incfs/integrity.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2019 Google LLC
+ */
+#ifndef _INCFS_INTEGRITY_H
+#define _INCFS_INTEGRITY_H
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <crypto/hash.h>
+
+#include <uapi/linux/incrementalfs.h>
+
+#include "internal.h"
+
+#define INCFS_MAX_MTREE_LEVELS 8
+#define INCFS_MAX_HASH_AREA_SIZE (1280 * 1024 * 1024)
+
+struct incfs_hash_alg {
+ const char *name;
+ int digest_size;
+ enum incfs_hash_tree_algorithm id;
+
+ struct crypto_shash *shash;
+};
+
+/* Merkle tree structure. */
+struct mtree {
+ struct incfs_hash_alg *alg;
+
+ u8 root_hash[INCFS_MAX_HASH_SIZE];
+
+ /* Offset of each hash level in the hash area. */
+ u32 hash_level_suboffset[INCFS_MAX_MTREE_LEVELS];
+
+ u32 hash_tree_area_size;
+
+ /* Number of levels in hash_level_suboffset */
+ int depth;
+};
+
+struct incfs_hash_alg *incfs_get_hash_alg(enum incfs_hash_tree_algorithm id);
+
+struct mtree *incfs_alloc_mtree(struct mem_range signature,
+ int data_block_count);
+
+void incfs_free_mtree(struct mtree *tree);
+
+size_t incfs_get_mtree_depth(enum incfs_hash_tree_algorithm alg, loff_t size);
+
+size_t incfs_get_mtree_hash_count(enum incfs_hash_tree_algorithm alg,
+ loff_t size);
+
+int incfs_calc_digest(struct incfs_hash_alg *alg, struct mem_range data,
+ struct mem_range digest);
+
+#endif /* _INCFS_INTEGRITY_H */
diff --git a/fs/incfs/internal.h b/fs/incfs/internal.h
new file mode 100644
--- /dev/null
+++ b/fs/incfs/internal.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2018 Google LLC
+ */
+#ifndef _INCFS_INTERNAL_H
+#define _INCFS_INTERNAL_H
+#include <linux/types.h>
+
+/* A (pointer, length) pair describing a span of bytes in memory. */
+struct mem_range {
+	u8 *data;
+	size_t len;
+};
+
+/* Convenience constructor for struct mem_range. */
+static inline struct mem_range range(u8 *data, size_t len)
+{
+	return (struct mem_range){ .data = data, .len = len };
+}
+
+/*
+ * Non-fatal assertion that @lock is held.  Note: mutex_is_locked() only
+ * reports that *someone* holds the mutex, not that the caller does.
+ */
+#define LOCK_REQUIRED(lock) WARN_ON_ONCE(!mutex_is_locked(&lock))
+
+/* incfs reports a corrupted backing file as EUCLEAN. */
+#define EFSCORRUPTED EUCLEAN
+
+#endif /* _INCFS_INTERNAL_H */
diff --git a/fs/incfs/main.c b/fs/incfs/main.c
new file mode 100644
--- /dev/null
+++ b/fs/incfs/main.c
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2018 Google LLC
+ */
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/module.h>
+
+#include <uapi/linux/incrementalfs.h>
+
+#include "sysfs.h"
+#include "vfs.h"
+
+static struct file_system_type incfs_fs_type = {
+ .owner = THIS_MODULE,
+ .name = INCFS_NAME,
+ .mount = incfs_mount_fs,
+ .kill_sb = incfs_kill_sb,
+ .fs_flags = 0
+};
+
+/* Module init: bring up sysfs first, then register the filesystem type. */
+static int __init init_incfs_module(void)
+{
+	int err;
+
+	err = incfs_init_sysfs();
+	if (err)
+		goto out;
+
+	err = register_filesystem(&incfs_fs_type);
+	if (err)
+		goto out_cleanup_sysfs;
+
+	return 0;
+
+out_cleanup_sysfs:
+	/* Undo sysfs setup if filesystem registration failed. */
+	incfs_cleanup_sysfs();
+out:
+	return err;
+}
+
+static void __exit cleanup_incfs_module(void)
+{
+	/* Tear down in reverse order of init_incfs_module(): drop the
+	 * filesystem type first, then remove its sysfs entries.
+	 */
+	unregister_filesystem(&incfs_fs_type);
+	incfs_cleanup_sysfs();
+}
+
+module_init(init_incfs_module);
+module_exit(cleanup_incfs_module);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Eugene Zemtsov <ezemtsov@google.com>");
+MODULE_DESCRIPTION("Incremental File System");
diff --git a/fs/incfs/pseudo_files.c b/fs/incfs/pseudo_files.c
new file mode 100644
--- /dev/null
+++ b/fs/incfs/pseudo_files.c
@@ -0,0 +1,1394 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2020 Google LLC
+ */
+
+#include <linux/fdtable.h>
+#include <linux/file.h>
+#include <linux/filelock.h>
+#include <linux/fs.h>
+#include <linux/fsnotify.h>
+#include <linux/namei.h>
+#include <linux/poll.h>
+#include <linux/string.h>
+#include <linux/syscalls.h>
+
+#include <uapi/linux/incrementalfs.h>
+
+#include "pseudo_files.h"
+
+#include "data_mgmt.h"
+#include "format.h"
+#include "integrity.h"
+#include "vfs.h"
+
+#define READ_WRITE_FILE_MODE 0666
+
+static bool is_pseudo_filename(struct mem_range name);
+
+/*******************************************************************************
+ * .pending_reads pseudo file definition
+ ******************************************************************************/
+#define INCFS_PENDING_READS_INODE 2
+static const char pending_reads_file_name[] = INCFS_PENDING_READS_FILENAME;
+
+/* State of an open .pending_reads file, unique for each file descriptor. */
+struct pending_reads_state {
+ /* A serial number of the last pending read obtained from this file. */
+ int last_pending_read_sn;
+};
+
+/*
+ * read() for .pending_reads: copy out records for reads that have a
+ * serial number newer than this descriptor's cursor
+ * (last_pending_read_sn), then advance the cursor.  Record format is
+ * incfs_pending_read_info, or incfs_pending_read_info2 when the mount
+ * uses report_uid.  Returns bytes copied, 0 if nothing fresh, or a
+ * negative error.
+ */
+static ssize_t pending_reads_read(struct file *f, char __user *buf, size_t len,
+			    loff_t *ppos)
+{
+	struct pending_reads_state *pr_state = f->private_data;
+	struct mount_info *mi = get_mount_info(file_superblock(f));
+	bool report_uid;
+	unsigned long page = 0;
+	struct incfs_pending_read_info *reads_buf = NULL;
+	struct incfs_pending_read_info2 *reads_buf2 = NULL;
+	size_t record_size;
+	size_t reads_to_collect;
+	int last_known_read_sn = READ_ONCE(pr_state->last_pending_read_sn);
+	int new_max_sn = last_known_read_sn;
+	int reads_collected = 0;
+	ssize_t result = 0;
+
+	if (!mi)
+		return -EFAULT;
+
+	/* Record size depends on whether this mount reports UIDs. */
+	report_uid = mi->mi_options.report_uid;
+	record_size = report_uid ? sizeof(*reads_buf2) : sizeof(*reads_buf);
+	reads_to_collect = len / record_size;
+
+	if (!incfs_fresh_pending_reads_exist(mi, last_known_read_sn))
+		return 0;
+
+	/* Stage records in one zeroed page, then copy to userspace. */
+	page = get_zeroed_page(GFP_NOFS);
+	if (!page)
+		return -ENOMEM;
+
+	if (report_uid)
+		reads_buf2 = (struct incfs_pending_read_info2 *) page;
+	else
+		reads_buf = (struct incfs_pending_read_info *) page;
+
+	reads_to_collect =
+		min_t(size_t, PAGE_SIZE / record_size, reads_to_collect);
+
+	reads_collected = incfs_collect_pending_reads(mi, last_known_read_sn,
+				reads_buf, reads_buf2, reads_to_collect,
+				&new_max_sn);
+
+	if (reads_collected < 0) {
+		result = reads_collected;
+		goto out;
+	}
+
+	/*
+	 * Just to make sure that we don't accidentally copy more data
+	 * to reads buffer than userspace can handle.
+	 */
+	reads_collected = min_t(size_t, reads_collected, reads_to_collect);
+	result = reads_collected * record_size;
+
+	/* Copy reads info to the userspace buffer */
+	if (copy_to_user(buf, (void *)page, result)) {
+		result = -EFAULT;
+		goto out;
+	}
+
+	/* Only advance the cursor once the copy-out has succeeded. */
+	WRITE_ONCE(pr_state->last_pending_read_sn, new_max_sn);
+	*ppos = 0;
+
+out:
+	free_page(page);
+	return result;
+}
+
+/*
+ * poll() for .pending_reads: readable whenever reads newer than this
+ * descriptor's cursor exist.
+ */
+static __poll_t pending_reads_poll(struct file *file, poll_table *wait)
+{
+	struct pending_reads_state *state = file->private_data;
+	struct mount_info *mi = get_mount_info(file_superblock(file));
+
+	poll_wait(file, &mi->mi_pending_reads_notif_wq, wait);
+
+	if (!incfs_fresh_pending_reads_exist(mi, state->last_pending_read_sn))
+		return 0;
+
+	return EPOLLIN | EPOLLRDNORM;
+}
+
+/*
+ * open() for .pending_reads: each descriptor gets its own zeroed cursor
+ * state (freed in pending_reads_release()).
+ */
+static int pending_reads_open(struct inode *inode, struct file *file)
+{
+	struct pending_reads_state *state =
+		kzalloc(sizeof(*state), GFP_NOFS);
+
+	if (!state)
+		return -ENOMEM;
+
+	file->private_data = state;
+	return 0;
+}
+
+/* Free the per-descriptor state allocated in pending_reads_open(). */
+static int pending_reads_release(struct inode *inode, struct file *file)
+{
+	kfree(file->private_data);
+	return 0;
+}
+
+/*
+ * INCFS_IOC_PERMIT_FILL: grant the file referred to by the user-supplied
+ * descriptor permission to be filled.  The target must be an incfs file
+ * (checked via f_op) on the same superblock as @f.
+ */
+static long ioctl_permit_fill(struct file *f, void __user *arg)
+{
+	struct incfs_permit_fill __user *usr_permit_fill = arg;
+	struct incfs_permit_fill permit_fill;
+	long error = 0;
+	struct file *file = NULL;
+	struct incfs_file_data *fd;
+
+	if (copy_from_user(&permit_fill, usr_permit_fill, sizeof(permit_fill)))
+		return -EFAULT;
+
+	/*
+	 * NOTE(review): fget() returns NULL on a bad fd, never an ERR_PTR,
+	 * so the PTR_ERR branch below looks unreachable -- confirm.
+	 */
+	file = fget(permit_fill.file_descriptor);
+	if (IS_ERR_OR_NULL(file)) {
+		if (!file)
+			return -ENOENT;
+
+		return PTR_ERR(file);
+	}
+
+	if (file->f_op != &incfs_file_ops) {
+		error = -EPERM;
+		goto out;
+	}
+
+	if (file->f_inode->i_sb != f->f_inode->i_sb) {
+		error = -EPERM;
+		goto out;
+	}
+
+	fd = file->private_data;
+
+	switch (fd->fd_fill_permission) {
+	case CANT_FILL:
+		fd->fd_fill_permission = CAN_FILL;
+		break;
+
+	case CAN_FILL:
+		/* Granting twice is harmless; just note it. */
+		pr_debug("CAN_FILL already set");
+		break;
+
+	default:
+		pr_warn("Invalid file private data");
+		error = -EFAULT;
+		goto out;
+	}
+
+out:
+	fput(file);
+	return error;
+}
+
+/*
+ * Change mode bits on @dentry's inode via notify_change(), retrying after
+ * breaking a file delegation if one is outstanding (the same pattern as
+ * the VFS chmod path).
+ */
+static int chmod(struct dentry *dentry, umode_t mode)
+{
+	struct inode *inode = dentry->d_inode;
+	struct inode *delegated_inode = NULL;
+	struct iattr newattrs;
+	int error;
+
+retry_deleg:
+	inode_lock(inode);
+	/* Keep non-permission bits (file type etc.) from the inode. */
+	newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
+	newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
+	error = notify_change(&nop_mnt_idmap, dentry, &newattrs, &delegated_inode);
+	inode_unlock(inode);
+	if (delegated_inode) {
+		/* Wait for the delegation to be broken, then try again. */
+		error = break_deleg_wait(&delegated_inode);
+		if (!error)
+			goto retry_deleg;
+	}
+	return error;
+}
+
+/* Byte-wise equality of two memory ranges (lengths must match). */
+static bool incfs_equal_ranges(struct mem_range lhs, struct mem_range rhs)
+{
+	return lhs.len == rhs.len &&
+	       memcmp(lhs.data, rhs.data, lhs.len) == 0;
+}
+
+/*
+ * Validate a user-supplied file name: length-limited, not one of the
+ * reserved pseudo-file names, and a single path component (no '/').
+ * Returns 0, -ENAMETOOLONG or -EINVAL.
+ */
+static int validate_name(char *file_name)
+{
+	struct mem_range name = range(file_name, strlen(file_name));
+
+	if (name.len > INCFS_MAX_NAME_LEN)
+		return -ENAMETOOLONG;
+
+	if (is_pseudo_filename(name))
+		return -EINVAL;
+
+	/* A name must be a single path component. */
+	if (strchr(file_name, '/'))
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Resolve @relative_path against @base_path (or the mount's backing dir
+ * when @base_path is NULL) into @result_path.
+ *
+ * A temporary fd is installed for the base directory purely so that
+ * user_path_at_empty() can resolve relative to it; it is closed before
+ * returning.  When @relative_path is NULL the base dir itself is
+ * returned (with an extra path reference).  Caller path_put()s
+ * @result_path on success.
+ */
+static int dir_relative_path_resolve(
+			struct mount_info *mi,
+			const char __user *relative_path,
+			struct path *result_path,
+			struct path *base_path)
+{
+	int dir_fd = get_unused_fd_flags(0);
+	struct file *dir_f = NULL;
+	int error = 0;
+
+	if (!base_path)
+		base_path = &mi->mi_backing_dir_path;
+
+	if (dir_fd < 0)
+		return dir_fd;
+
+	dir_f = dentry_open(base_path, O_RDONLY | O_NOATIME, current_cred());
+
+	if (IS_ERR(dir_f)) {
+		error = PTR_ERR(dir_f);
+		goto out;
+	}
+	fd_install(dir_fd, dir_f);
+
+	if (!relative_path) {
+		/* No relative path given, just return the base dir. */
+		*result_path = *base_path;
+		path_get(result_path);
+		goto out;
+	}
+
+	error = user_path_at_empty(dir_fd, relative_path,
+		LOOKUP_FOLLOW | LOOKUP_DIRECTORY, result_path, NULL);
+
+out:
+	/* close_fd() also drops dir_f once it has been installed. */
+	close_fd(dir_fd);
+	if (error)
+		pr_debug("Error: %d\n", error);
+	return error;
+}
+
+/*
+ * Copy a user-space signature blob of @size bytes into a fresh kernel
+ * buffer.  Returns a range owned by the caller (kfree data), a
+ * (NULL, 0) range when no blob was supplied, or an ERR_PTR-carrying
+ * range on failure (-EFAULT on oversize/copy error, -ENOMEM on OOM).
+ */
+static struct mem_range incfs_copy_signature_info_from_user(u8 __user *original,
+							    u64 size)
+{
+	u8 *result;
+
+	if (!original)
+		return range(NULL, 0);
+
+	if (size > INCFS_MAX_SIGNATURE_SIZE)
+		return range(ERR_PTR(-EFAULT), 0);
+
+	result = kzalloc(size, GFP_NOFS | __GFP_COMP);
+	if (!result)
+		return range(ERR_PTR(-ENOMEM), 0);
+
+	if (copy_from_user(result, original, size)) {
+		kfree(result);
+		return range(ERR_PTR(-EFAULT), 0);
+	}
+
+	return range(result, size);
+}
+
+/*
+ * Initialize the backing file for a newly created incfs file: write the
+ * file header, optionally the signature (building the hash tree from it
+ * to size the hash area), and the blockmap for data + hash blocks.
+ *
+ * NOTE(review): @attr is not referenced in this body -- presumably kept
+ * for interface compatibility; confirm with callers.
+ */
+static int init_new_file(struct mount_info *mi, struct dentry *dentry,
+			 incfs_uuid_t *uuid, u64 size, struct mem_range attr,
+			 u8 __user *user_signature_info, u64 signature_size)
+{
+	struct path path = {};
+	struct file *new_file;
+	int error = 0;
+	struct backing_file_context *bfc = NULL;
+	u32 block_count;
+	struct mem_range raw_signature = { NULL };
+	struct mtree *hash_tree = NULL;
+
+	if (!mi || !dentry || !uuid)
+		return -EFAULT;
+
+	/* Resize newly created file to its true size. */
+	path = (struct path) {
+		.mnt = mi->mi_backing_dir_path.mnt,
+		.dentry = dentry
+	};
+
+	new_file = dentry_open(&path, O_RDWR | O_NOATIME | O_LARGEFILE,
+			       current_cred());
+
+	if (IS_ERR(new_file)) {
+		error = PTR_ERR(new_file);
+		goto out;
+	}
+
+	/* bfc keeps its own reference to the file; drop ours right away. */
+	bfc = incfs_alloc_bfc(mi, new_file);
+	fput(new_file);
+	if (IS_ERR(bfc)) {
+		error = PTR_ERR(bfc);
+		bfc = NULL;
+		goto out;
+	}
+
+	mutex_lock(&bfc->bc_mutex);
+	error = incfs_write_fh_to_backing_file(bfc, uuid, size);
+	if (error)
+		goto out;
+
+	block_count = (u32)get_blocks_count_for_size(size);
+
+	if (user_signature_info) {
+		raw_signature = incfs_copy_signature_info_from_user(
+			user_signature_info, signature_size);
+
+		if (IS_ERR(raw_signature.data)) {
+			error = PTR_ERR(raw_signature.data);
+			raw_signature.data = NULL;
+			goto out;
+		}
+
+		/* Validates the signature and sizes the hash area. */
+		hash_tree = incfs_alloc_mtree(raw_signature, block_count);
+		if (IS_ERR(hash_tree)) {
+			error = PTR_ERR(hash_tree);
+			hash_tree = NULL;
+			goto out;
+		}
+
+		error = incfs_write_signature_to_backing_file(bfc,
+				raw_signature, hash_tree->hash_tree_area_size,
+				NULL, NULL);
+		if (error)
+			goto out;
+
+		/* The blockmap must cover hash blocks as well as data. */
+		block_count += get_blocks_count_for_size(
+			hash_tree->hash_tree_area_size);
+	}
+
+	if (block_count)
+		error = incfs_write_blockmap_to_backing_file(bfc, block_count);
+
+	if (error)
+		goto out;
+
+out:
+	if (bfc) {
+		mutex_unlock(&bfc->bc_mutex);
+		incfs_free_bfc(bfc);
+	}
+	incfs_free_mtree(hash_tree);
+	kfree(raw_signature.data);
+
+	if (error)
+		pr_debug("incfs: %s error: %d\n", __func__, error);
+	return error;
+}
+
+/*
+ * Emit fsnotify create events for a file just created via the pseudo
+ * files interface: for the named entry in its directory, for its .index
+ * entry when @file_id_str is given, and for its .incomplete entry when
+ * the file still has missing blocks.  Failures are logged, not returned.
+ */
+static void notify_create(struct file *pending_reads_file,
+			  const char __user *dir_name, const char *file_name,
+			  const char *file_id_str, bool incomplete_file)
+{
+	struct mount_info *mi =
+		get_mount_info(file_superblock(pending_reads_file));
+	struct path base_path = {
+		.mnt = pending_reads_file->f_path.mnt,
+		.dentry = pending_reads_file->f_path.dentry->d_parent,
+	};
+	struct path dir_path = {};
+	struct dentry *file = NULL;
+	struct dentry *dir = NULL;
+	int error;
+
+	error = dir_relative_path_resolve(mi, dir_name, &dir_path, &base_path);
+	if (error)
+		goto out;
+
+	file = incfs_lookup_dentry(dir_path.dentry, file_name);
+	if (IS_ERR(file)) {
+		error = PTR_ERR(file);
+		file = NULL;
+		goto out;
+	}
+
+	/* Event for the user-visible name in its parent directory. */
+	fsnotify_create(d_inode(dir_path.dentry), file);
+
+	if (file_id_str) {
+		/* Event for the per-id entry under .index. */
+		dir = incfs_lookup_dentry(base_path.dentry, INCFS_INDEX_NAME);
+		if (IS_ERR(dir)) {
+			error = PTR_ERR(dir);
+			dir = NULL;
+			goto out;
+		}
+
+		dput(file);
+		file = incfs_lookup_dentry(dir, file_id_str);
+		if (IS_ERR(file)) {
+			error = PTR_ERR(file);
+			file = NULL;
+			goto out;
+		}
+
+		fsnotify_create(d_inode(dir), file);
+
+		if (incomplete_file) {
+			/* Event for the per-id entry under .incomplete. */
+			dput(dir);
+			dir = incfs_lookup_dentry(base_path.dentry,
+						  INCFS_INCOMPLETE_NAME);
+			if (IS_ERR(dir)) {
+				error = PTR_ERR(dir);
+				dir = NULL;
+				goto out;
+			}
+
+			dput(file);
+			file = incfs_lookup_dentry(dir, file_id_str);
+			if (IS_ERR(file)) {
+				error = PTR_ERR(file);
+				file = NULL;
+				goto out;
+			}
+
+			fsnotify_create(d_inode(dir), file);
+		}
+	}
+out:
+	if (error)
+		pr_warn("%s failed with error %d\n", __func__, error);
+
+	/* dput()/path_put() are NULL-safe. */
+	dput(dir);
+	dput(file);
+	path_put(&dir_path);
+}
+
+/*
+ * Handler for INCFS_IOC_CREATE_FILE.
+ *
+ * Creates the backing file under .index, stamps it with id/size/attr
+ * xattrs, writes the initial backing-file metadata, then hard-links it to
+ * its requested name and (for non-empty files) into .incomplete.  On any
+ * failure every link made so far is undone before returning.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static long ioctl_create_file(struct file *file,
+ struct incfs_new_file_args __user *usr_args)
+{
+ struct mount_info *mi = get_mount_info(file_superblock(file));
+ struct incfs_new_file_args args;
+ char *file_id_str = NULL;
+ struct dentry *index_file_dentry = NULL;
+ struct dentry *named_file_dentry = NULL;
+ struct dentry *incomplete_file_dentry = NULL;
+ struct path parent_dir_path = {};
+ struct inode *index_dir_inode = NULL;
+ __le64 size_attr_value = 0;
+ char *file_name = NULL;
+ char *attr_value = NULL;
+ int error = 0;
+ bool locked = false;
+ /* Rollback flags: which links must be undone on failure. */
+ bool index_linked = false;
+ bool name_linked = false;
+ bool incomplete_linked = false;
+
+ if (!mi || !mi->mi_index_dir || !mi->mi_incomplete_dir) {
+ error = -EFAULT;
+ goto out;
+ }
+
+ if (copy_from_user(&args, usr_args, sizeof(args)) > 0) {
+ error = -EFAULT;
+ goto out;
+ }
+
+ file_name = strndup_user(u64_to_user_ptr(args.file_name), PATH_MAX);
+ if (IS_ERR(file_name)) {
+ error = PTR_ERR(file_name);
+ file_name = NULL;
+ goto out;
+ }
+
+ error = validate_name(file_name);
+ if (error)
+ goto out;
+
+ file_id_str = file_id_to_str(args.file_id);
+ if (!file_id_str) {
+ error = -ENOMEM;
+ goto out;
+ }
+
+ /* Serialize all directory-structure changes on this mount. */
+ error = mutex_lock_interruptible(&mi->mi_dir_struct_mutex);
+ if (error)
+ goto out;
+ locked = true;
+
+ /* Find a directory to put the file into. */
+ error = dir_relative_path_resolve(mi,
+ u64_to_user_ptr(args.directory_path),
+ &parent_dir_path, NULL);
+ if (error)
+ goto out;
+
+ if (parent_dir_path.dentry == mi->mi_index_dir) {
+ /* Can't create a file directly inside .index */
+ error = -EBUSY;
+ goto out;
+ }
+
+ if (parent_dir_path.dentry == mi->mi_incomplete_dir) {
+ /* Can't create a file directly inside .incomplete */
+ error = -EBUSY;
+ goto out;
+ }
+
+ /* Look up a dentry in the parent dir. It should be negative. */
+ named_file_dentry = incfs_lookup_dentry(parent_dir_path.dentry,
+ file_name);
+ if (!named_file_dentry) {
+ error = -EFAULT;
+ goto out;
+ }
+ if (IS_ERR(named_file_dentry)) {
+ error = PTR_ERR(named_file_dentry);
+ named_file_dentry = NULL;
+ goto out;
+ }
+ if (d_really_is_positive(named_file_dentry)) {
+ /* File with this path already exists. */
+ error = -EEXIST;
+ goto out;
+ }
+
+ /* Look up a dentry in the incomplete dir. It should be negative. */
+ incomplete_file_dentry = incfs_lookup_dentry(mi->mi_incomplete_dir,
+ file_id_str);
+ if (!incomplete_file_dentry) {
+ error = -EFAULT;
+ goto out;
+ }
+ if (IS_ERR(incomplete_file_dentry)) {
+ error = PTR_ERR(incomplete_file_dentry);
+ incomplete_file_dentry = NULL;
+ goto out;
+ }
+ if (d_really_is_positive(incomplete_file_dentry)) {
+ /* File with this path already exists. */
+ error = -EEXIST;
+ goto out;
+ }
+
+ /* Look up a dentry in the .index dir. It should be negative. */
+ index_file_dentry = incfs_lookup_dentry(mi->mi_index_dir, file_id_str);
+ if (!index_file_dentry) {
+ error = -EFAULT;
+ goto out;
+ }
+ if (IS_ERR(index_file_dentry)) {
+ error = PTR_ERR(index_file_dentry);
+ index_file_dentry = NULL;
+ goto out;
+ }
+ if (d_really_is_positive(index_file_dentry)) {
+ /* File with this ID already exists in index. */
+ error = -EEXIST;
+ goto out;
+ }
+
+ /* Creating a file in the .index dir. */
+ index_dir_inode = d_inode(mi->mi_index_dir);
+ inode_lock_nested(index_dir_inode, I_MUTEX_PARENT);
+ /* 0222 keeps the backing file writable so blocks can be filled. */
+ error = vfs_create(&nop_mnt_idmap, index_dir_inode, index_file_dentry,
+ args.mode | 0222, true);
+ inode_unlock(index_dir_inode);
+
+ if (error)
+ goto out;
+ if (!d_really_is_positive(index_file_dentry)) {
+ error = -EINVAL;
+ goto out;
+ }
+
+ /* NOTE(review): presumably re-applied to undo umask masking done by
+ * vfs_create() - confirm.
+ */
+ error = chmod(index_file_dentry, args.mode | 0222);
+ if (error) {
+ pr_debug("incfs: chmod err: %d\n", error);
+ goto out;
+ }
+
+ /* Save the file's ID as an xattr for easy fetching in future. */
+ error = vfs_setxattr(&nop_mnt_idmap, index_file_dentry, INCFS_XATTR_ID_NAME,
+ file_id_str, strlen(file_id_str), XATTR_CREATE);
+ if (error) {
+ pr_debug("incfs: vfs_setxattr err:%d\n", error);
+ goto out;
+ }
+
+ /* Save the file's size as an xattr for easy fetching in future. */
+ size_attr_value = cpu_to_le64(args.size);
+ error = vfs_setxattr(&nop_mnt_idmap, index_file_dentry, INCFS_XATTR_SIZE_NAME,
+ (char *)&size_attr_value, sizeof(size_attr_value),
+ XATTR_CREATE);
+ if (error) {
+ pr_debug("incfs: vfs_setxattr err:%d\n", error);
+ goto out;
+ }
+
+ /* Save the file's attribute as an xattr */
+ if (args.file_attr_len && args.file_attr) {
+ if (args.file_attr_len > INCFS_MAX_FILE_ATTR_SIZE) {
+ error = -E2BIG;
+ goto out;
+ }
+
+ attr_value = kmalloc(args.file_attr_len, GFP_NOFS);
+ if (!attr_value) {
+ error = -ENOMEM;
+ goto out;
+ }
+
+ if (copy_from_user(attr_value,
+ u64_to_user_ptr(args.file_attr),
+ args.file_attr_len) > 0) {
+ error = -EFAULT;
+ goto out;
+ }
+
+ error = vfs_setxattr(&nop_mnt_idmap, index_file_dentry,
+ INCFS_XATTR_METADATA_NAME,
+ attr_value, args.file_attr_len,
+ XATTR_CREATE);
+
+ if (error)
+ goto out;
+ }
+
+ /* Initializing a newly created file. */
+ error = init_new_file(mi, index_file_dentry, &args.file_id, args.size,
+ range(attr_value, args.file_attr_len),
+ u64_to_user_ptr(args.signature_info),
+ args.signature_size);
+ if (error)
+ goto out;
+ index_linked = true;
+
+ /* Linking a file with its real name from the requested dir. */
+ error = incfs_link(index_file_dentry, named_file_dentry);
+ if (error)
+ goto out;
+ name_linked = true;
+
+ if (args.size) {
+ /* Linking a file with its incomplete entry */
+ error = incfs_link(index_file_dentry, incomplete_file_dentry);
+ if (error)
+ goto out;
+ incomplete_linked = true;
+ }
+
+ notify_create(file, u64_to_user_ptr(args.directory_path), file_name,
+ file_id_str, args.size != 0);
+
+out:
+ if (error) {
+ pr_debug("incfs: %s err:%d\n", __func__, error);
+ /* Undo, in order, whatever links were established. */
+ if (index_linked)
+ incfs_unlink(index_file_dentry);
+ if (name_linked)
+ incfs_unlink(named_file_dentry);
+ if (incomplete_linked)
+ incfs_unlink(incomplete_file_dentry);
+ }
+
+ kfree(file_id_str);
+ kfree(file_name);
+ kfree(attr_value);
+ dput(named_file_dentry);
+ dput(index_file_dentry);
+ dput(incomplete_file_dentry);
+ path_put(&parent_dir_path);
+ if (locked)
+ mutex_unlock(&mi->mi_dir_struct_mutex);
+
+ return error;
+}
+
+/*
+ * Write the mapping file-header (source uuid, size, offset into the
+ * source) into the freshly created backing file at @dentry.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int init_new_mapped_file(struct mount_info *mi, struct dentry *dentry,
+ incfs_uuid_t *uuid, u64 size, u64 offset)
+{
+ struct path path = {};
+ struct file *new_file;
+ int error = 0;
+ struct backing_file_context *bfc = NULL;
+
+ if (!mi || !dentry || !uuid)
+ return -EFAULT;
+
+ /* Resize newly created file to its true size. */
+ /* NOTE(review): the comment above looks stale - the code only opens
+ * the backing file and writes the mapping header; no resize is
+ * visible here.  Confirm and fix the comment upstream.
+ */
+ path = (struct path) {
+ .mnt = mi->mi_backing_dir_path.mnt,
+ .dentry = dentry
+ };
+ new_file = dentry_open(&path, O_RDWR | O_NOATIME | O_LARGEFILE,
+ current_cred());
+
+ if (IS_ERR(new_file)) {
+ error = PTR_ERR(new_file);
+ goto out;
+ }
+
+ /* bfc takes its own reference on the file, so drop ours right away. */
+ bfc = incfs_alloc_bfc(mi, new_file);
+ fput(new_file);
+ if (IS_ERR(bfc)) {
+ error = PTR_ERR(bfc);
+ bfc = NULL;
+ goto out;
+ }
+
+ mutex_lock(&bfc->bc_mutex);
+ error = incfs_write_mapping_fh_to_backing_file(bfc, uuid, size, offset);
+ if (error)
+ goto out;
+
+out:
+ if (bfc) {
+ mutex_unlock(&bfc->bc_mutex);
+ incfs_free_bfc(bfc);
+ }
+
+ if (error)
+ pr_debug("incfs: %s error: %d\n", __func__, error);
+ return error;
+}
+
+/*
+ * Handler for INCFS_IOC_CREATE_MAPPED_FILE.
+ *
+ * Creates a file that exposes the byte range
+ * [source_offset, source_offset + size) of an existing incfs file
+ * identified by source_file_id.  The range is validated against the
+ * source file's recorded size before anything is created.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static long ioctl_create_mapped_file(struct file *file, void __user *arg)
+{
+ struct mount_info *mi = get_mount_info(file_superblock(file));
+ struct incfs_create_mapped_file_args __user *args_usr_ptr = arg;
+ struct incfs_create_mapped_file_args args = {};
+ char *file_name;
+ int error = 0;
+ struct path parent_dir_path = {};
+ char *source_file_name = NULL;
+ struct dentry *source_file_dentry = NULL;
+ u64 source_file_size;
+ struct dentry *file_dentry = NULL;
+ struct inode *parent_inode;
+ __le64 size_attr_value;
+
+ if (copy_from_user(&args, args_usr_ptr, sizeof(args)) > 0)
+ return -EINVAL;
+
+ file_name = strndup_user(u64_to_user_ptr(args.file_name), PATH_MAX);
+ if (IS_ERR(file_name)) {
+ error = PTR_ERR(file_name);
+ file_name = NULL;
+ goto out;
+ }
+
+ error = validate_name(file_name);
+ if (error)
+ goto out;
+
+ /* Mappings must start on a data-block boundary. */
+ if (args.source_offset % INCFS_DATA_FILE_BLOCK_SIZE) {
+ error = -EINVAL;
+ goto out;
+ }
+
+ /* Validate file mapping is in range */
+ source_file_name = file_id_to_str(args.source_file_id);
+ if (!source_file_name) {
+ pr_warn("Failed to alloc source_file_name\n");
+ error = -ENOMEM;
+ goto out;
+ }
+
+ source_file_dentry = incfs_lookup_dentry(mi->mi_index_dir,
+ source_file_name);
+ if (!source_file_dentry) {
+ pr_warn("Source file does not exist\n");
+ error = -EINVAL;
+ goto out;
+ }
+ if (IS_ERR(source_file_dentry)) {
+ pr_warn("Error opening source file\n");
+ error = PTR_ERR(source_file_dentry);
+ source_file_dentry = NULL;
+ goto out;
+ }
+ if (!d_really_is_positive(source_file_dentry)) {
+ pr_warn("Source file dentry negative\n");
+ error = -EINVAL;
+ goto out;
+ }
+
+ /* vfs_getxattr returns the attribute size on success. */
+ error = vfs_getxattr(&nop_mnt_idmap, source_file_dentry, INCFS_XATTR_SIZE_NAME,
+ (char *)&size_attr_value, sizeof(size_attr_value));
+ if (error < 0)
+ goto out;
+
+ if (error != sizeof(size_attr_value)) {
+ pr_warn("Mapped file has no size attr\n");
+ error = -EINVAL;
+ goto out;
+ }
+
+ source_file_size = le64_to_cpu(size_attr_value);
+
+ /*
+ * Check the mapped range fits inside the source file.  Written with
+ * a subtraction so that a huge source_offset cannot wrap the u64
+ * addition that "source_offset + size > source_file_size" would
+ * perform and thereby bypass the check.
+ */
+ if (args.source_offset > source_file_size ||
+ args.size > source_file_size - args.source_offset) {
+ pr_warn("Mapped file out of range\n");
+ error = -EINVAL;
+ goto out;
+ }
+
+ /* Find a directory to put the file into. */
+ error = dir_relative_path_resolve(mi,
+ u64_to_user_ptr(args.directory_path),
+ &parent_dir_path, NULL);
+ if (error)
+ goto out;
+
+ if (parent_dir_path.dentry == mi->mi_index_dir) {
+ /* Can't create a file directly inside .index */
+ error = -EBUSY;
+ goto out;
+ }
+
+ /* Look up a dentry in the parent dir. It should be negative. */
+ file_dentry = incfs_lookup_dentry(parent_dir_path.dentry,
+ file_name);
+ if (!file_dentry) {
+ error = -EFAULT;
+ goto out;
+ }
+ if (IS_ERR(file_dentry)) {
+ error = PTR_ERR(file_dentry);
+ file_dentry = NULL;
+ goto out;
+ }
+ if (d_really_is_positive(file_dentry)) {
+ error = -EEXIST;
+ goto out;
+ }
+
+ parent_inode = d_inode(parent_dir_path.dentry);
+ inode_lock_nested(parent_inode, I_MUTEX_PARENT);
+ /* 0222 keeps the backing file writable for block filling. */
+ error = vfs_create(&nop_mnt_idmap, parent_inode, file_dentry,
+ args.mode | 0222, true);
+ inode_unlock(parent_inode);
+ if (error)
+ goto out;
+
+ error = chmod(file_dentry, args.mode | 0222);
+ if (error) {
+ pr_debug("incfs: chmod err: %d\n", error);
+ goto delete_file;
+ }
+
+ /* Save the file's size as an xattr for easy fetching in future. */
+ size_attr_value = cpu_to_le64(args.size);
+ error = vfs_setxattr(&nop_mnt_idmap, file_dentry, INCFS_XATTR_SIZE_NAME,
+ (char *)&size_attr_value, sizeof(size_attr_value),
+ XATTR_CREATE);
+ if (error) {
+ pr_debug("incfs: vfs_setxattr err:%d\n", error);
+ goto delete_file;
+ }
+
+ error = init_new_mapped_file(mi, file_dentry, &args.source_file_id,
+ args.size, args.source_offset);
+ if (error)
+ goto delete_file;
+
+ notify_create(file, u64_to_user_ptr(args.directory_path), file_name,
+ NULL, false);
+
+ goto out;
+
+delete_file:
+ /* Partial creation - remove the half-initialized file. */
+ incfs_unlink(file_dentry);
+
+out:
+ dput(file_dentry);
+ dput(source_file_dentry);
+ path_put(&parent_dir_path);
+ kfree(file_name);
+ kfree(source_file_name);
+ return error;
+}
+
+/*
+ * Handler for INCFS_IOC_GET_READ_TIMEOUTS.
+ *
+ * Copies the current per-uid read-timeout table to userspace.  When the
+ * caller's buffer is too small, -E2BIG is returned but the required size
+ * is still reported via timeouts_array_size_out.
+ */
+static long ioctl_get_read_timeouts(struct mount_info *mi, void __user *arg)
+{
+ struct incfs_get_read_timeouts_args __user *args_usr_ptr = arg;
+ struct incfs_get_read_timeouts_args args = {};
+ int error = 0;
+ struct incfs_per_uid_read_timeouts *buffer;
+ int size;
+
+ if (copy_from_user(&args, args_usr_ptr, sizeof(args)))
+ return -EINVAL;
+
+ if (args.timeouts_array_size > INCFS_DATA_FILE_BLOCK_SIZE)
+ return -EINVAL;
+
+ buffer = kzalloc(args.timeouts_array_size, GFP_NOFS);
+ if (!buffer)
+ return -ENOMEM;
+
+ /* Snapshot the table under the lock; copy_to_user happens outside. */
+ spin_lock(&mi->mi_per_uid_read_timeouts_lock);
+ size = mi->mi_per_uid_read_timeouts_size;
+ if (args.timeouts_array_size < size)
+ error = -E2BIG;
+ else if (size)
+ memcpy(buffer, mi->mi_per_uid_read_timeouts, size);
+ spin_unlock(&mi->mi_per_uid_read_timeouts_lock);
+
+ args.timeouts_array_size_out = size;
+ if (!error && size)
+ if (copy_to_user(u64_to_user_ptr(args.timeouts_array), buffer,
+ size))
+ error = -EFAULT;
+
+ /* On -E2BIG still report the needed size back to the caller. */
+ if (!error || error == -E2BIG)
+ if (copy_to_user(args_usr_ptr, &args, sizeof(args)) > 0)
+ error = -EFAULT;
+
+ kfree(buffer);
+ return error;
+}
+
+/*
+ * Handler for INCFS_IOC_SET_READ_TIMEOUTS.
+ *
+ * Validates and installs a new per-uid read-timeout table, replacing the
+ * old one atomically under the spinlock.  An empty array clears the
+ * table.
+ */
+static long ioctl_set_read_timeouts(struct mount_info *mi, void __user *arg)
+{
+ struct incfs_set_read_timeouts_args __user *args_usr_ptr = arg;
+ struct incfs_set_read_timeouts_args args = {};
+ int error = 0;
+ /* NOTE(review): size is a signed int fed from a userspace field;
+ * values above INT_MAX turn negative here.  The %/kzalloc paths below
+ * appear to reject them, but confirm - an unsigned local would be
+ * safer.
+ */
+ int size;
+ struct incfs_per_uid_read_timeouts *buffer = NULL, *tmp;
+ int i;
+
+ if (copy_from_user(&args, args_usr_ptr, sizeof(args)))
+ return -EINVAL;
+
+ size = args.timeouts_array_size;
+ if (size) {
+ /* Must be a whole number of entries and fit one block. */
+ if (size > INCFS_DATA_FILE_BLOCK_SIZE ||
+ size % sizeof(*buffer) != 0)
+ return -EINVAL;
+
+ buffer = kzalloc(size, GFP_NOFS);
+ if (!buffer)
+ return -ENOMEM;
+
+ if (copy_from_user(buffer, u64_to_user_ptr(args.timeouts_array),
+ size)) {
+ error = -EINVAL;
+ goto out;
+ }
+
+ /* Reject entries whose min exceeds their max. */
+ for (i = 0; i < size / sizeof(*buffer); ++i) {
+ struct incfs_per_uid_read_timeouts *t = &buffer[i];
+
+ if (t->min_pending_time_us > t->max_pending_time_us) {
+ error = -EINVAL;
+ goto out;
+ }
+ }
+ }
+
+ /* Swap in the new table; the old one is freed below via 'buffer'. */
+ spin_lock(&mi->mi_per_uid_read_timeouts_lock);
+ mi->mi_per_uid_read_timeouts_size = size;
+ tmp = mi->mi_per_uid_read_timeouts;
+ mi->mi_per_uid_read_timeouts = buffer;
+ buffer = tmp;
+ spin_unlock(&mi->mi_per_uid_read_timeouts_lock);
+
+out:
+ kfree(buffer);
+ return error;
+}
+
+/*
+ * Handler for INCFS_IOC_GET_LAST_READ_ERROR.
+ *
+ * Copies the most recently recorded read error (file id, time, page,
+ * errno, uid) to userspace.  The snapshot is taken under mi_le_mutex so
+ * the fields are mutually consistent.
+ */
+static long ioctl_get_last_read_error(struct mount_info *mi, void __user *arg)
+{
+ struct incfs_get_last_read_error_args __user *args_usr_ptr = arg;
+ struct incfs_get_last_read_error_args args = {};
+ int error;
+
+ error = mutex_lock_interruptible(&mi->mi_le_mutex);
+ if (error)
+ return error;
+
+ args.file_id_out = mi->mi_le_file_id;
+ args.time_us_out = mi->mi_le_time_us;
+ args.page_out = mi->mi_le_page;
+ args.errno_out = mi->mi_le_errno;
+ args.uid_out = mi->mi_le_uid;
+
+ mutex_unlock(&mi->mi_le_mutex);
+ if (copy_to_user(args_usr_ptr, &args, sizeof(args)) > 0)
+ error = -EFAULT;
+
+ return error;
+}
+
+/*
+ * ioctl dispatcher for the .pending_reads pseudo file.  All incfs control
+ * ioctls enter here; unknown requests get -EINVAL.
+ */
+static long pending_reads_dispatch_ioctl(struct file *f, unsigned int req,
+ unsigned long arg)
+{
+ struct mount_info *mi = get_mount_info(file_superblock(f));
+
+ switch (req) {
+ case INCFS_IOC_CREATE_FILE:
+ return ioctl_create_file(f, (void __user *)arg);
+ case INCFS_IOC_PERMIT_FILL:
+ return ioctl_permit_fill(f, (void __user *)arg);
+ case INCFS_IOC_CREATE_MAPPED_FILE:
+ return ioctl_create_mapped_file(f, (void __user *)arg);
+ case INCFS_IOC_GET_READ_TIMEOUTS:
+ return ioctl_get_read_timeouts(mi, (void __user *)arg);
+ case INCFS_IOC_SET_READ_TIMEOUTS:
+ return ioctl_set_read_timeouts(mi, (void __user *)arg);
+ case INCFS_IOC_GET_LAST_READ_ERROR:
+ return ioctl_get_last_read_error(mi, (void __user *)arg);
+ default:
+ return -EINVAL;
+ }
+}
+
+static const struct file_operations incfs_pending_reads_file_ops = {
+ .read = pending_reads_read,
+ .poll = pending_reads_poll,
+ .open = pending_reads_open,
+ .release = pending_reads_release,
+ .llseek = noop_llseek,
+ .unlocked_ioctl = pending_reads_dispatch_ioctl,
+ .compat_ioctl = pending_reads_dispatch_ioctl
+};
+
+/*******************************************************************************
+ * .log pseudo file definition
+ ******************************************************************************/
+#define INCFS_LOG_INODE 3
+static const char log_file_name[] = INCFS_LOG_FILENAME;
+
+/* State of an open .log file, unique for each file descriptor. */
+struct log_file_state {
+ struct read_log_state state;
+};
+
+/*
+ * read() for the .log pseudo file.
+ *
+ * Streams logged read records to userspace through a one-page staging
+ * buffer.  The record layout (and size) depends on the mount's
+ * report_uid option.  The per-fd cursor in log_state is only advanced
+ * after a successful copy_to_user, so a failed copy does not lose
+ * records.  Returns bytes copied, 0 at end of log, or a negative errno.
+ */
+static ssize_t log_read(struct file *f, char __user *buf, size_t len,
+ loff_t *ppos)
+{
+ struct log_file_state *log_state = f->private_data;
+ struct mount_info *mi = get_mount_info(file_superblock(f));
+ int total_reads_collected = 0;
+ int rl_size;
+ ssize_t result = 0;
+ bool report_uid;
+ unsigned long page = 0;
+ struct incfs_pending_read_info *reads_buf = NULL;
+ struct incfs_pending_read_info2 *reads_buf2 = NULL;
+ size_t record_size;
+ ssize_t reads_to_collect;
+ ssize_t reads_per_page;
+
+ if (!mi)
+ return -EFAULT;
+
+ report_uid = mi->mi_options.report_uid;
+ record_size = report_uid ? sizeof(*reads_buf2) : sizeof(*reads_buf);
+ reads_to_collect = len / record_size;
+ reads_per_page = PAGE_SIZE / record_size;
+
+ rl_size = READ_ONCE(mi->mi_log.rl_size);
+ if (rl_size == 0)
+ return 0;
+
+ page = __get_free_page(GFP_NOFS);
+ if (!page)
+ return -ENOMEM;
+
+ /* Only one of the two staging pointers is used per mount option. */
+ if (report_uid)
+ reads_buf2 = (struct incfs_pending_read_info2 *)page;
+ else
+ reads_buf = (struct incfs_pending_read_info *)page;
+
+ reads_to_collect = min_t(ssize_t, rl_size, reads_to_collect);
+ while (reads_to_collect > 0) {
+ struct read_log_state next_state;
+ int reads_collected;
+
+ /* Work on a copy; commit only after the user copy succeeds. */
+ memcpy(&next_state, &log_state->state, sizeof(next_state));
+ reads_collected = incfs_collect_logged_reads(
+ mi, &next_state, reads_buf, reads_buf2,
+ min_t(ssize_t, reads_to_collect, reads_per_page));
+ if (reads_collected <= 0) {
+ /* Report what was already copied, else the error/EOF. */
+ result = total_reads_collected ?
+ total_reads_collected * record_size :
+ reads_collected;
+ goto out;
+ }
+ if (copy_to_user(buf, (void *)page,
+ reads_collected * record_size)) {
+ result = total_reads_collected ?
+ total_reads_collected * record_size :
+ -EFAULT;
+ goto out;
+ }
+
+ memcpy(&log_state->state, &next_state, sizeof(next_state));
+ total_reads_collected += reads_collected;
+ buf += reads_collected * record_size;
+ reads_to_collect -= reads_collected;
+ }
+
+ result = total_reads_collected * record_size;
+ /* Position is meaningless for this stream; keep it pinned at 0. */
+ *ppos = 0;
+out:
+ free_page(page);
+ return result;
+}
+
+/*
+ * poll() for the .log pseudo file: readable once at least
+ * read_log_wakeup_count records are waiting for this descriptor.
+ */
+static __poll_t log_poll(struct file *file, poll_table *wait)
+{
+ struct log_file_state *log_state = file->private_data;
+ struct mount_info *mi = get_mount_info(file_superblock(file));
+ int count;
+ __poll_t ret = 0;
+
+ poll_wait(file, &mi->mi_log.ml_notif_wq, wait);
+ count = incfs_get_uncollected_logs_count(mi, &log_state->state);
+ if (count >= mi->mi_options.read_log_wakeup_count)
+ ret = EPOLLIN | EPOLLRDNORM;
+
+ return ret;
+}
+
+/*
+ * open() for the .log pseudo file: allocate a per-descriptor cursor
+ * initialized to the mount's current log position.
+ */
+static int log_open(struct inode *inode, struct file *file)
+{
+ struct log_file_state *log_state = NULL;
+ struct mount_info *mi = get_mount_info(file_superblock(file));
+
+ log_state = kzalloc(sizeof(*log_state), GFP_NOFS);
+ if (!log_state)
+ return -ENOMEM;
+
+ log_state->state = incfs_get_log_state(mi);
+ file->private_data = log_state;
+ return 0;
+}
+
+/* release() for the .log pseudo file: free the per-descriptor cursor. */
+static int log_release(struct inode *inode, struct file *file)
+{
+ kfree(file->private_data);
+ return 0;
+}
+
+static const struct file_operations incfs_log_file_ops = {
+ .read = log_read,
+ .poll = log_poll,
+ .open = log_open,
+ .release = log_release,
+ .llseek = noop_llseek,
+};
+
+/*******************************************************************************
+ * .blocks_written pseudo file definition
+ ******************************************************************************/
+#define INCFS_BLOCKS_WRITTEN_INODE 4
+static const char blocks_written_file_name[] = INCFS_BLOCKS_WRITTEN_FILENAME;
+
+/* State of an open .blocks_written file, unique for each file descriptor. */
+struct blocks_written_file_state {
+ unsigned long blocks_written;
+};
+
+/*
+ * read() for the .blocks_written pseudo file.
+ *
+ * Returns the current written-block counter as a decimal string, but only
+ * when it changed since this descriptor last read it (otherwise 0/EOF),
+ * making the file usable as a change indicator together with poll().
+ *
+ * NOTE(review): on a short read (len smaller than the number) the string
+ * is truncated yet state->blocks_written is still updated, and *ppos is
+ * never advanced - confirm this is the intended contract.
+ */
+static ssize_t blocks_written_read(struct file *f, char __user *buf, size_t len,
+ loff_t *ppos)
+{
+ struct mount_info *mi = get_mount_info(file_superblock(f));
+ struct blocks_written_file_state *state = f->private_data;
+ unsigned long blocks_written;
+ char string[21];
+ int result = 0;
+
+ if (!mi)
+ return -EFAULT;
+
+ blocks_written = atomic_read(&mi->mi_blocks_written);
+ if (state->blocks_written == blocks_written)
+ return 0;
+
+ result = snprintf(string, sizeof(string), "%lu", blocks_written);
+ if (result > len)
+ result = len;
+ if (copy_to_user(buf, string, result))
+ return -EFAULT;
+
+ state->blocks_written = blocks_written;
+ return result;
+}
+
+/*
+ * poll() for the .blocks_written pseudo file: readable when the counter
+ * differs from what this descriptor last read.
+ */
+static __poll_t blocks_written_poll(struct file *f, poll_table *wait)
+{
+ struct mount_info *mi = get_mount_info(file_superblock(f));
+ struct blocks_written_file_state *state = f->private_data;
+ unsigned long blocks_written;
+
+ if (!mi)
+ return 0;
+
+ poll_wait(f, &mi->mi_blocks_written_notif_wq, wait);
+ blocks_written = atomic_read(&mi->mi_blocks_written);
+ if (state->blocks_written == blocks_written)
+ return 0;
+
+ return EPOLLIN | EPOLLRDNORM;
+}
+
+/*
+ * open() for the .blocks_written pseudo file.  The per-fd counter starts
+ * at -1 (never-matching) so the first read always reports a value.
+ */
+static int blocks_written_open(struct inode *inode, struct file *file)
+{
+ struct blocks_written_file_state *state =
+ kzalloc(sizeof(*state), GFP_NOFS);
+
+ if (!state)
+ return -ENOMEM;
+
+ state->blocks_written = -1;
+ file->private_data = state;
+ return 0;
+}
+
+/* release() for .blocks_written: free the per-descriptor state. */
+static int blocks_written_release(struct inode *inode, struct file *file)
+{
+ kfree(file->private_data);
+ return 0;
+}
+
+static const struct file_operations incfs_blocks_written_file_ops = {
+ .read = blocks_written_read,
+ .poll = blocks_written_poll,
+ .open = blocks_written_open,
+ .release = blocks_written_release,
+ .llseek = noop_llseek,
+};
+
+/*******************************************************************************
+ * Generic inode lookup functionality
+ ******************************************************************************/
+
+const struct mem_range incfs_pseudo_file_names[] = {
+ { .data = (u8 *)pending_reads_file_name,
+ .len = ARRAY_SIZE(pending_reads_file_name) - 1 },
+ { .data = (u8 *)log_file_name, .len = ARRAY_SIZE(log_file_name) - 1 },
+ { .data = (u8 *)blocks_written_file_name,
+ .len = ARRAY_SIZE(blocks_written_file_name) - 1 }
+};
+
+const unsigned long incfs_pseudo_file_inodes[] = { INCFS_PENDING_READS_INODE,
+ INCFS_LOG_INODE,
+ INCFS_BLOCKS_WRITTEN_INODE };
+
+static const struct file_operations *const pseudo_file_operations[] = {
+ &incfs_pending_reads_file_ops, &incfs_log_file_ops,
+ &incfs_blocks_written_file_ops
+};
+
+/* Return true when @name matches one of the incfs pseudo file names. */
+static bool is_pseudo_filename(struct mem_range name)
+{
+ size_t idx;
+
+ for (idx = 0; idx < ARRAY_SIZE(incfs_pseudo_file_names); idx++) {
+ if (incfs_equal_ranges(incfs_pseudo_file_names[idx], name))
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * If @ino is one of the known pseudo-file inode numbers, initialize
+ * @inode as that pseudo file (zero size/times, matching fops) and return
+ * true; otherwise return false and leave @inode untouched.
+ */
+static bool get_pseudo_inode(int ino, struct inode *inode)
+{
+ int i = 0;
+
+ for (; i < ARRAY_SIZE(incfs_pseudo_file_inodes); ++i)
+ if (ino == incfs_pseudo_file_inodes[i])
+ break;
+ if (i == ARRAY_SIZE(incfs_pseudo_file_inodes))
+ return false;
+
+ inode_set_mtime(inode, 0, 0);
+ inode_set_atime(inode, 0, 0);
+ inode_set_ctime(inode, 0, 0);
+ inode->i_size = 0;
+ inode->i_ino = ino;
+ inode->i_private = NULL;
+ inode_init_owner(&nop_mnt_idmap, inode, NULL, S_IFREG | READ_WRITE_FILE_MODE);
+ inode->i_op = &incfs_file_inode_ops;
+ /* Index i still selects the matching fops table entry here. */
+ inode->i_fop = pseudo_file_operations[i];
+ return true;
+}
+
+struct inode_search {
+ unsigned long ino;
+};
+
+/* iget5_locked() test callback: match on the searched inode number. */
+static int inode_test(struct inode *inode, void *opaque)
+{
+ const struct inode_search *key = opaque;
+
+ return inode->i_ino == key->ino;
+}
+
+/*
+ * iget5_locked() init callback: populate a fresh inode as the requested
+ * pseudo file, or fail with -EINVAL for an unknown inode number.
+ */
+static int inode_set(struct inode *inode, void *opaque)
+{
+ struct inode_search *search = opaque;
+
+ if (get_pseudo_inode(search->ino, inode))
+ return 0;
+
+ /* Unknown inode requested. */
+ return -EINVAL;
+}
+
+/*
+ * Look up (or create and initialize) the pseudo-file inode @ino on @sb.
+ * Returns the inode or an ERR_PTR.
+ */
+static struct inode *fetch_inode(struct super_block *sb, unsigned long ino)
+{
+ struct inode_search search = {
+ .ino = ino
+ };
+ struct inode *inode = iget5_locked(sb, search.ino, inode_test,
+ inode_set, &search);
+
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ /* Newly created inodes come back locked; publish them. */
+ if (inode->i_state & I_NEW)
+ unlock_new_inode(inode);
+
+ return inode;
+}
+
+/*
+ * If @dentry names one of the incfs pseudo files, instantiate it with the
+ * matching pseudo inode.  Returns 0 on success, -ENOENT when the name is
+ * not a pseudo file, or a negative errno from inode setup.
+ */
+int dir_lookup_pseudo_files(struct super_block *sb, struct dentry *dentry)
+{
+ struct mem_range wanted =
+ range((u8 *)dentry->d_name.name, dentry->d_name.len);
+ int idx;
+
+ for (idx = 0; idx < ARRAY_SIZE(incfs_pseudo_file_names); idx++) {
+ struct inode *inode;
+
+ if (!incfs_equal_ranges(incfs_pseudo_file_names[idx], wanted))
+ continue;
+
+ inode = fetch_inode(sb, incfs_pseudo_file_inodes[idx]);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ d_add(dentry, inode);
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+/*
+ * Emit directory entries for the pseudo files during readdir, resuming
+ * from ctx->pos.  Returns 0 when all remaining entries were emitted,
+ * -EINVAL when the emit callback asked to stop.
+ */
+int emit_pseudo_files(struct dir_context *ctx)
+{
+ loff_t i = ctx->pos;
+
+ for (; i < ARRAY_SIZE(incfs_pseudo_file_names); ++i) {
+ if (!dir_emit(ctx, incfs_pseudo_file_names[i].data,
+ incfs_pseudo_file_names[i].len,
+ incfs_pseudo_file_inodes[i], DT_REG))
+ return -EINVAL;
+
+ ctx->pos++;
+ }
+ return 0;
+}
diff --git a/fs/incfs/pseudo_files.h b/fs/incfs/pseudo_files.h
new file mode 100644
--- /dev/null
+++ b/fs/incfs/pseudo_files.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2020 Google LLC
+ */
+
+#ifndef _INCFS_PSEUDO_FILES_H
+#define _INCFS_PSEUDO_FILES_H
+
+#include "internal.h"
+
+#define PSEUDO_FILE_COUNT 3
+#define INCFS_START_INO_RANGE 10
+
+extern const struct mem_range incfs_pseudo_file_names[PSEUDO_FILE_COUNT];
+extern const unsigned long incfs_pseudo_file_inodes[PSEUDO_FILE_COUNT];
+
+int dir_lookup_pseudo_files(struct super_block *sb, struct dentry *dentry);
+int emit_pseudo_files(struct dir_context *ctx);
+
+#endif
diff --git a/fs/incfs/sysfs.c b/fs/incfs/sysfs.c
new file mode 100644
--- /dev/null
+++ b/fs/incfs/sysfs.c
@@ -0,0 +1,205 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2021 Google LLC
+ */
+#include <linux/fs.h>
+#include <linux/kobject.h>
+
+#include <uapi/linux/incrementalfs.h>
+
+#include "sysfs.h"
+#include "data_mgmt.h"
+#include "vfs.h"
+
+/******************************************************************************
+ * Define sys/fs/incrementalfs & sys/fs/incrementalfs/features
+ *****************************************************************************/
+#define INCFS_NODE_FEATURES "features"
+#define INCFS_NODE_INSTANCES "instances"
+
+static struct kobject *sysfs_root;
+static struct kobject *features_node;
+static struct kobject *instances_node;
+
+#define DECLARE_FEATURE_FLAG(name) \
+ static ssize_t name##_show(struct kobject *kobj, \
+ struct kobj_attribute *attr, char *buff) \
+{ \
+ return sysfs_emit(buff, "supported\n"); \
+} \
+ \
+static struct kobj_attribute name##_attr = __ATTR_RO(name)
+
+DECLARE_FEATURE_FLAG(corefs);
+DECLARE_FEATURE_FLAG(zstd);
+DECLARE_FEATURE_FLAG(v2);
+DECLARE_FEATURE_FLAG(bugfix_throttling);
+DECLARE_FEATURE_FLAG(bugfix_inode_eviction);
+
+static struct attribute *attributes[] = {
+ &corefs_attr.attr,
+ &zstd_attr.attr,
+ &v2_attr.attr,
+ &bugfix_throttling_attr.attr,
+ &bugfix_inode_eviction_attr.attr,
+ NULL,
+};
+
+static const struct attribute_group attr_group = {
+ .attrs = attributes,
+};
+
+/*
+ * Create /sys/fs/incrementalfs with its "instances" and "features"
+ * subdirectories and the feature-flag attribute group.  On any failure
+ * the kobjects created so far are released in reverse order.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int __init incfs_init_sysfs(void)
+{
+ int res = -ENOMEM;
+
+ sysfs_root = kobject_create_and_add(INCFS_NAME, fs_kobj);
+ if (!sysfs_root)
+ return -ENOMEM;
+
+ instances_node = kobject_create_and_add(INCFS_NODE_INSTANCES,
+ sysfs_root);
+ if (!instances_node)
+ goto err_put_root;
+
+ features_node = kobject_create_and_add(INCFS_NODE_FEATURES,
+ sysfs_root);
+ if (!features_node)
+ goto err_put_instances;
+
+ res = sysfs_create_group(features_node, &attr_group);
+ if (res)
+ goto err_put_features;
+
+ return 0;
+
+err_put_features:
+ kobject_put(features_node);
+err_put_instances:
+ kobject_put(instances_node);
+err_put_root:
+ kobject_put(sysfs_root);
+
+ return res;
+}
+
+/*
+ * Tear down the sysfs hierarchy built by incfs_init_sysfs().  Safe to
+ * call with partially (or never) initialized nodes - kobject_put(NULL)
+ * is a no-op.
+ */
+void incfs_cleanup_sysfs(void)
+{
+ if (features_node) {
+ sysfs_remove_group(features_node, &attr_group);
+ kobject_put(features_node);
+ }
+
+ kobject_put(instances_node);
+ kobject_put(sysfs_root);
+}
+
+/******************************************************************************
+ * Define sys/fs/incrementalfs/instances/<name>/
+ *****************************************************************************/
+#define __DECLARE_STATUS_FLAG(name) \
+static ssize_t name##_show(struct kobject *kobj, \
+ struct kobj_attribute *attr, char *buff) \
+{ \
+ struct incfs_sysfs_node *node = container_of(kobj, \
+ struct incfs_sysfs_node, isn_sysfs_node); \
+ \
+ return sysfs_emit(buff, "%d\n", node->isn_mi->mi_##name); \
+} \
+ \
+static struct kobj_attribute name##_attr = __ATTR_RO(name)
+
+#define __DECLARE_STATUS_FLAG64(name) \
+static ssize_t name##_show(struct kobject *kobj, \
+ struct kobj_attribute *attr, char *buff) \
+{ \
+ struct incfs_sysfs_node *node = container_of(kobj, \
+ struct incfs_sysfs_node, isn_sysfs_node); \
+ \
+ return sysfs_emit(buff, "%lld\n", node->isn_mi->mi_##name); \
+} \
+ \
+static struct kobj_attribute name##_attr = __ATTR_RO(name)
+
+__DECLARE_STATUS_FLAG(reads_failed_timed_out);
+__DECLARE_STATUS_FLAG(reads_failed_hash_verification);
+__DECLARE_STATUS_FLAG(reads_failed_other);
+__DECLARE_STATUS_FLAG(reads_delayed_pending);
+__DECLARE_STATUS_FLAG64(reads_delayed_pending_us);
+__DECLARE_STATUS_FLAG(reads_delayed_min);
+__DECLARE_STATUS_FLAG64(reads_delayed_min_us);
+
+static struct attribute *mount_attributes[] = {
+ &reads_failed_timed_out_attr.attr,
+ &reads_failed_hash_verification_attr.attr,
+ &reads_failed_other_attr.attr,
+ &reads_delayed_pending_attr.attr,
+ &reads_delayed_pending_us_attr.attr,
+ &reads_delayed_min_attr.attr,
+ &reads_delayed_min_us_attr.attr,
+ NULL,
+};
+
+/*
+ * kobject release callback: signal incfs_free_sysfs_node() that the
+ * last reference is gone so it may free the containing node.
+ */
+static void incfs_sysfs_release(struct kobject *kobj)
+{
+ struct incfs_sysfs_node *node = container_of(kobj,
+ struct incfs_sysfs_node, isn_sysfs_node);
+
+ complete(&node->isn_completion);
+}
+
+static const struct attribute_group mount_attr_group = {
+ .attrs = mount_attributes,
+};
+
+static struct kobj_type incfs_kobj_node_ktype = {
+ .sysfs_ops = &kobj_sysfs_ops,
+ .release = &incfs_sysfs_release,
+};
+
+/*
+ * Create /sys/fs/incrementalfs/instances/<name>/ with the per-mount
+ * statistics attribute group.
+ *
+ * NOTE(review): returns NULL (not an ERR_PTR) when name is NULL but
+ * ERR_PTR on every other failure - callers must cope with both; confirm
+ * this mixed convention is intentional.
+ */
+struct incfs_sysfs_node *incfs_add_sysfs_node(const char *name,
+ struct mount_info *mi)
+{
+ struct incfs_sysfs_node *node = NULL;
+ int error;
+
+ if (!name)
+ return NULL;
+
+ node = kzalloc(sizeof(*node), GFP_NOFS);
+ if (!node)
+ return ERR_PTR(-ENOMEM);
+
+ node->isn_mi = mi;
+
+ init_completion(&node->isn_completion);
+ kobject_init(&node->isn_sysfs_node, &incfs_kobj_node_ktype);
+ error = kobject_add(&node->isn_sysfs_node, instances_node, "%s", name);
+ if (error)
+ goto err;
+
+ error = sysfs_create_group(&node->isn_sysfs_node, &mount_attr_group);
+ if (error)
+ goto err;
+
+ return node;
+
+err:
+ /*
+ * Note kobject_put always calls release, so incfs_sysfs_release will
+ * free node
+ */
+ kobject_put(&node->isn_sysfs_node);
+ return ERR_PTR(error);
+}
+
+/*
+ * Remove the per-mount sysfs node and free it once the kobject's release
+ * callback has completed.
+ *
+ * NOTE(review): the wait is interruptible - if a signal cuts it short,
+ * node is freed while a still-pending release could later complete() on
+ * freed memory.  Confirm the completion is guaranteed to have fired by
+ * this point.
+ */
+void incfs_free_sysfs_node(struct incfs_sysfs_node *node)
+{
+ if (!node)
+ return;
+
+ sysfs_remove_group(&node->isn_sysfs_node, &mount_attr_group);
+ kobject_put(&node->isn_sysfs_node);
+ wait_for_completion_interruptible(&node->isn_completion);
+ kfree(node);
+}
diff --git a/fs/incfs/sysfs.h b/fs/incfs/sysfs.h
new file mode 100644
--- /dev/null
+++ b/fs/incfs/sysfs.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2021 Google LLC
+ */
+#ifndef _INCFS_SYSFS_H
+#define _INCFS_SYSFS_H
+
+struct incfs_sysfs_node {
+ struct kobject isn_sysfs_node;
+
+ struct completion isn_completion;
+
+ struct mount_info *isn_mi;
+};
+
+int incfs_init_sysfs(void);
+void incfs_cleanup_sysfs(void);
+struct incfs_sysfs_node *incfs_add_sysfs_node(const char *name,
+ struct mount_info *mi);
+void incfs_free_sysfs_node(struct incfs_sysfs_node *node);
+
+#endif
diff --git a/fs/incfs/verity.c b/fs/incfs/verity.c
new file mode 100644
--- /dev/null
+++ b/fs/incfs/verity.c
@@ -0,0 +1,821 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2020 Google LLC
+ */
+
+/*
+ * fs-verity integration into incfs
+ *
+ * Since incfs has its own merkle tree implementation, most of fs/verity/ is not
+ * needed. incfs also only needs to support the case where
+ * CONFIG_FS_VERITY_BUILTIN_SIGNATURES=n. Therefore, the integration consists of
+ * the following modifications:
+ *
+ * 1. Add the (optional) verity signature to the incfs file format. (Not really
+ * needed anymore, but this is kept around since this is the behavior of
+ * fs/verity/ even when CONFIG_FS_VERITY_BUILTIN_SIGNATURES=n.)
+ * 2. Add a pointer to the digest of the fs-verity descriptor struct to the
+ * data_file struct that incfs attaches to each file inode.
+ * 3. Add the following ioctls:
+ * - FS_IOC_ENABLE_VERITY
+ * - FS_IOC_GETFLAGS
+ * - FS_IOC_MEASURE_VERITY
+ * 4. When FS_IOC_ENABLE_VERITY is called on a non-verity file, the
+ * fs-verity descriptor struct is populated and digested. Then the S_VERITY
+ * flag is set and the xattr incfs.verity is set. If the signature is
+ * non-NULL, an INCFS_MD_VERITY_SIGNATURE is added to the backing file
+ * containing the signature.
+ * 5. When a file with an incfs.verity xattr's inode is initialized, the
+ * inode’s S_VERITY flag is set.
+ * 6. When a file with the S_VERITY flag set on its inode is opened, the
+ * data_file is checked for its verity digest. If the file doesn’t have a
+ * digest, the file’s digest is calculated as above, checked, and set, or the
+ * open is denied if it is not valid.
+ * 7. FS_IOC_GETFLAGS simply returns the value of the S_VERITY flag
+ * 8. FS_IOC_MEASURE_VERITY simply returns the cached digest
+ * 9. The final complication is that if FS_IOC_ENABLE_VERITY is called on a file
+ * which doesn’t have a merkle tree, the merkle tree is calculated before the
+ * rest of the process is completed.
+ */
+
+#include <crypto/hash.h>
+#include <crypto/sha2.h>
+#include <linux/fsverity.h>
+#include <linux/mount.h>
+
+#include "verity.h"
+
+#include "data_mgmt.h"
+#include "format.h"
+#include "integrity.h"
+#include "vfs.h"
+
+#define FS_VERITY_MAX_SIGNATURE_SIZE 16128
+
+/*
+ * Copy the Merkle-tree root hash of @filp into @root_hash.
+ * Returns -EINVAL if the file has no incfs data or no hash tree.
+ */
+static int incfs_get_root_hash(struct file *filp, u8 *root_hash)
+{
+	struct data_file *df = get_incfs_data_file(filp);
+
+	/*
+	 * Also guard df_hash_tree: a file without a signature record has
+	 * no tree yet, and dereferencing it here would be a NULL deref.
+	 */
+	if (!df || !df->df_hash_tree)
+		return -EINVAL;
+
+	memcpy(root_hash, df->df_hash_tree->root_hash,
+	       df->df_hash_tree->alg->digest_size);
+
+	return 0;
+}
+
+/*
+ * Finish enabling verity: persist the (optional) signature to the
+ * backing file, set the incfs.verity xattr, and raise S_VERITY on the
+ * inode. @sig may be NULL with @sig_size 0 when no signature was given.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int incfs_end_enable_verity(struct file *filp, u8 *sig, size_t sig_size)
+{
+	struct inode *inode = file_inode(filp);
+	struct mem_range signature = {
+		.data = sig,
+		.len = sig_size,
+	};
+	struct data_file *df = get_incfs_data_file(filp);
+	struct backing_file_context *bfc;
+	int error;
+	struct incfs_df_verity_signature *vs = NULL;
+	loff_t offset;
+
+	if (!df || !df->df_backing_file_context)
+		return -EFSCORRUPTED;
+
+	/* Allocate before taking bc_mutex so the locked section stays short */
+	if (sig) {
+		vs = kzalloc(sizeof(*vs), GFP_NOFS);
+		if (!vs)
+			return -ENOMEM;
+	}
+
+	bfc = df->df_backing_file_context;
+	error = mutex_lock_interruptible(&bfc->bc_mutex);
+	if (error)
+		goto out;
+
+	error = incfs_write_verity_signature_to_backing_file(bfc, signature,
+							     &offset);
+	mutex_unlock(&bfc->bc_mutex);
+	if (error)
+		goto out;
+
+	/*
+	 * Set verity xattr so we can set S_VERITY without opening backing file
+	 */
+	error = vfs_setxattr(&nop_mnt_idmap, bfc->bc_file->f_path.dentry,
+			     INCFS_XATTR_VERITY_NAME, NULL, 0, XATTR_CREATE);
+	if (error) {
+		pr_warn("incfs: error setting verity xattr: %d\n", error);
+		goto out;
+	}
+
+	if (sig) {
+		/* Record where the signature was written in the backing file */
+		*vs = (struct incfs_df_verity_signature) {
+			.size = signature.len,
+			.offset = offset,
+		};
+
+		df->df_verity_signature = vs;
+		vs = NULL;	/* ownership transferred to df */
+	}
+
+	inode_set_flags(inode, S_VERITY, S_VERITY);
+
+out:
+	kfree(vs);
+	return error;
+}
+
+/* Digest the fs-verity descriptor @desc with @alg's shash into @digest. */
+static int incfs_compute_file_digest(struct incfs_hash_alg *alg,
+				     struct fsverity_descriptor *desc,
+				     u8 *digest)
+{
+	SHASH_DESC_ON_STACK(shash_desc, alg->shash);
+
+	shash_desc->tfm = alg->shash;
+	return crypto_shash_digest(shash_desc, (u8 *)desc, sizeof(*desc),
+				   digest);
+}
+
+/* Map an fs-verity hash algorithm id to incfs's id; -EINVAL if unknown. */
+static enum incfs_hash_tree_algorithm incfs_convert_fsverity_hash_alg(
+	int hash_alg)
+{
+	/* SHA-256 is the only fs-verity algorithm incfs supports. */
+	if (hash_alg == FS_VERITY_HASH_ALG_SHA256)
+		return INCFS_HASH_TREE_SHA256;
+
+	return -EINVAL;
+}
+
+/*
+ * Fetch the cached fs-verity file digest for @inode.
+ *
+ * Returns range(NULL, 0) for non-incfs inodes; otherwise .data is NULL
+ * until a digest has been published, so callers must check .data.
+ */
+static struct mem_range incfs_get_verity_digest(struct inode *inode)
+{
+	struct inode_info *node = get_incfs_node(inode);
+	struct data_file *df;
+	struct mem_range verity_file_digest;
+
+	if (!node) {
+		pr_warn("Invalid inode\n");
+		return range(NULL, 0);
+	}
+
+	df = node->n_file;
+
+	/*
+	 * Pairs with the cmpxchg_release() in incfs_set_verity_digest().
+	 * I.e., another task may publish ->df_verity_file_digest concurrently,
+	 * executing a RELEASE barrier. We need to use smp_load_acquire() here
+	 * to safely ACQUIRE the memory the other task published.
+	 */
+	verity_file_digest.data = smp_load_acquire(
+		&df->df_verity_file_digest.data);
+	verity_file_digest.len = df->df_verity_file_digest.len;
+	return verity_file_digest;
+}
+
+/*
+ * Publish the fs-verity file digest on @inode's data_file.
+ *
+ * Takes ownership of verity_file_digest.data: it is freed here if the
+ * inode is invalid or if another task won the publication race.
+ */
+static void incfs_set_verity_digest(struct inode *inode,
+				    struct mem_range verity_file_digest)
+{
+	struct inode_info *node = get_incfs_node(inode);
+	struct data_file *df;
+
+	if (!node) {
+		pr_warn("Invalid inode\n");
+		kfree(verity_file_digest.data);
+		return;
+	}
+
+	df = node->n_file;
+	df->df_verity_file_digest.len = verity_file_digest.len;
+
+	/*
+	 * Multiple tasks may race to set ->df_verity_file_digest.data, so use
+	 * cmpxchg_release(). This pairs with the smp_load_acquire() in
+	 * incfs_get_verity_digest(). I.e., here we publish
+	 * ->df_verity_file_digest.data, with a RELEASE barrier so that other
+	 * tasks can ACQUIRE it.
+	 */
+	if (cmpxchg_release(&df->df_verity_file_digest.data, NULL,
+			    verity_file_digest.data) != NULL)
+		/* Lost the race, so free the file_digest we allocated. */
+		kfree(verity_file_digest.data);
+}
+
+/*
+ * Calculate the digest of the fsverity_descriptor.
+ *
+ * Returns a kmalloc'd mem_range the caller must free, or an ERR_PTR in
+ * .data (with .len == 0) on failure.
+ */
+static struct mem_range incfs_calc_verity_digest_from_desc(
+	const struct inode *inode,
+	struct fsverity_descriptor *desc)
+{
+	enum incfs_hash_tree_algorithm incfs_hash_alg;
+	struct mem_range verity_file_digest;
+	int err;
+	struct incfs_hash_alg *hash_alg;
+
+	incfs_hash_alg = incfs_convert_fsverity_hash_alg(desc->hash_algorithm);
+	if (incfs_hash_alg < 0)
+		return range(ERR_PTR(incfs_hash_alg), 0);
+
+	hash_alg = incfs_get_hash_alg(incfs_hash_alg);
+	if (IS_ERR(hash_alg))
+		return range((u8 *)hash_alg, 0);
+
+	verity_file_digest = range(kzalloc(hash_alg->digest_size, GFP_KERNEL),
+				   hash_alg->digest_size);
+	if (!verity_file_digest.data)
+		return range(ERR_PTR(-ENOMEM), 0);
+
+	err = incfs_compute_file_digest(hash_alg, desc,
+					verity_file_digest.data);
+	if (err) {
+		pr_err("Error %d computing file digest", err);
+		kfree(verity_file_digest.data);
+		return range(ERR_PTR(err), 0);
+	}
+	pr_debug("Computed file digest: %s:%*phN\n",
+		 hash_alg->name, (int) verity_file_digest.len,
+		 verity_file_digest.data);
+	return verity_file_digest;
+}
+
+/*
+ * Build an fs-verity descriptor (version 1) for @filp from the file
+ * size and the incfs Merkle-tree root hash.
+ *
+ * Returns a kzalloc'd descriptor the caller must kfree, or an ERR_PTR.
+ */
+static struct fsverity_descriptor *incfs_get_fsverity_descriptor(
+	struct file *filp, int hash_algorithm)
+{
+	struct inode *inode = file_inode(filp);
+	struct fsverity_descriptor *desc = kzalloc(sizeof(*desc), GFP_KERNEL);
+	int err;
+
+	if (!desc)
+		return ERR_PTR(-ENOMEM);
+
+	*desc = (struct fsverity_descriptor) {
+		.version = 1,
+		.hash_algorithm = hash_algorithm,
+		.log_blocksize = ilog2(INCFS_DATA_FILE_BLOCK_SIZE),
+		.data_size = cpu_to_le64(inode->i_size),
+	};
+
+	err = incfs_get_root_hash(filp, desc->root_hash);
+	if (err) {
+		kfree(desc);
+		return ERR_PTR(err);
+	}
+
+	return desc;
+}
+
+/* Build the fs-verity descriptor for @filp and return its digest. */
+static struct mem_range incfs_calc_verity_digest(
+	struct inode *inode, struct file *filp,
+	int hash_algorithm)
+{
+	struct mem_range digest;
+	struct fsverity_descriptor *desc;
+
+	desc = incfs_get_fsverity_descriptor(filp, hash_algorithm);
+	if (IS_ERR(desc))
+		return range((u8 *)desc, 0);
+
+	digest = incfs_calc_verity_digest_from_desc(inode, desc);
+	kfree(desc);
+	return digest;
+}
+
+/*
+ * Populate the on-disk Merkle tree for @df level by level: level 0 is
+ * hashed from the file's data blocks, each higher level from the level
+ * below it (read back from the backing file), and the final iteration
+ * leaves the root hash in @hash for the caller to store.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int incfs_build_merkle_tree(struct file *f, struct data_file *df,
+				   struct backing_file_context *bfc,
+				   struct mtree *hash_tree, loff_t hash_offset,
+				   struct incfs_hash_alg *alg, struct mem_range hash)
+{
+	int error = 0;
+	int limit, lvl, i, result;
+	struct mem_range buf = {.len = INCFS_DATA_FILE_BLOCK_SIZE};
+	struct mem_range tmp = {.len = 2 * INCFS_DATA_FILE_BLOCK_SIZE};
+
+	buf.data = (u8 *)__get_free_pages(GFP_NOFS, get_order(buf.len));
+	tmp.data = (u8 *)__get_free_pages(GFP_NOFS, get_order(tmp.len));
+	if (!buf.data || !tmp.data) {
+		error = -ENOMEM;
+		goto out;
+	}
+
+	/*
+	 * lvl - 1 is the level we are reading, lvl the level we are writing
+	 * lvl == -1 means actual blocks
+	 * lvl == hash_tree->depth means root hash
+	 */
+	limit = df->df_data_block_count;
+	for (lvl = 0; lvl <= hash_tree->depth; lvl++) {
+		for (i = 0; i < limit; ++i) {
+			loff_t hash_level_offset;
+			struct mem_range partial_buf = buf;
+
+			if (lvl == 0)
+				result = incfs_read_data_file_block(partial_buf,
+						f, i, tmp, NULL, NULL);
+			else {
+				hash_level_offset = hash_offset +
+					hash_tree->hash_level_suboffset[lvl - 1];
+
+				result = incfs_kread(bfc, partial_buf.data,
+						partial_buf.len,
+						hash_level_offset + i *
+						INCFS_DATA_FILE_BLOCK_SIZE);
+			}
+
+			if (result < 0) {
+				error = result;
+				goto out;
+			}
+
+			/* Short final block: hash only the bytes present */
+			partial_buf.len = result;
+			error = incfs_calc_digest(alg, partial_buf, hash);
+			if (error)
+				goto out;
+
+			/*
+			 * last level - only one hash to take and it is stored
+			 * in the incfs signature record
+			 */
+			if (lvl == hash_tree->depth)
+				break;
+
+			hash_level_offset = hash_offset +
+				hash_tree->hash_level_suboffset[lvl];
+
+			result = incfs_kwrite(bfc, hash.data, hash.len,
+					hash_level_offset + hash.len * i);
+
+			if (result < 0) {
+				error = result;
+				goto out;
+			}
+
+			if (result != hash.len) {
+				error = -EIO;
+				goto out;
+			}
+		}
+		/* Next level has one hash per full block of this level */
+		limit = DIV_ROUND_UP(limit,
+				     INCFS_DATA_FILE_BLOCK_SIZE / hash.len);
+	}
+
+out:
+	free_pages((unsigned long)tmp.data, get_order(tmp.len));
+	free_pages((unsigned long)buf.data, get_order(buf.len));
+	return error;
+}
+
+/*
+ * incfs files have a signature record that is separate from the
+ * verity_signature record. The signature record does not actually contain a
+ * signature, rather it contains the size/offset of the hash tree, and a binary
+ * blob which contains the root hash and potentially a signature.
+ *
+ * If the file was created with a signature record, then this function simply
+ * returns.
+ *
+ * Otherwise it will create a signature record with a minimal binary blob as
+ * defined by the structure below, create space for the hash tree and then
+ * populate it using incfs_build_merkle_tree
+ */
+static int incfs_add_signature_record(struct file *f)
+{
+	/* See incfs_parse_signature */
+	struct {
+		__le32 version;
+		__le32 size_of_hash_info_section;
+		struct {
+			__le32 hash_algorithm;
+			u8 log2_blocksize;
+			__le32 salt_size;
+			u8 salt[0];
+			__le32 hash_size;
+			u8 root_hash[32];
+		} __packed hash_section;
+		__le32 size_of_signing_info_section;
+		u8 signing_info_section[0];
+	} __packed sig = {
+		.version = cpu_to_le32(INCFS_SIGNATURE_VERSION),
+		.size_of_hash_info_section =
+			cpu_to_le32(sizeof(sig.hash_section)),
+		.hash_section = {
+			.hash_algorithm = cpu_to_le32(INCFS_HASH_TREE_SHA256),
+			.log2_blocksize = ilog2(INCFS_DATA_FILE_BLOCK_SIZE),
+			.hash_size = cpu_to_le32(SHA256_DIGEST_SIZE),
+		},
+	};
+
+	struct data_file *df = get_incfs_data_file(f);
+	struct mtree *hash_tree = NULL;
+	struct backing_file_context *bfc;
+	int error;
+	loff_t hash_offset, sig_offset;
+	/*
+	 * NOTE(review): assumes incfs_get_hash_alg(INCFS_HASH_TREE_SHA256)
+	 * never returns an error pointer - alg is dereferenced below
+	 * before any check. Confirm against incfs_get_hash_alg().
+	 */
+	struct incfs_hash_alg *alg = incfs_get_hash_alg(INCFS_HASH_TREE_SHA256);
+	u8 hash_buf[INCFS_MAX_HASH_SIZE];
+	int hash_size = alg->digest_size;
+	struct mem_range hash = range(hash_buf, hash_size);
+	int result;
+	struct incfs_df_signature *signature = NULL;
+
+	if (!df)
+		return -EINVAL;
+
+	/* Mapped (sub-range) files share a backing file; cannot be signed */
+	if (df->df_header_flags & INCFS_FILE_MAPPED)
+		return -EINVAL;
+
+	/* Already signed? */
+	if (df->df_signature && df->df_hash_tree)
+		return 0;
+
+	/* One without the other means the backing file is inconsistent */
+	if (df->df_signature || df->df_hash_tree)
+		return -EFSCORRUPTED;
+
+	/* Add signature metadata record to file */
+	hash_tree = incfs_alloc_mtree(range((u8 *)&sig, sizeof(sig)),
+				      df->df_data_block_count);
+	if (IS_ERR(hash_tree))
+		return PTR_ERR(hash_tree);
+
+	bfc = df->df_backing_file_context;
+	if (!bfc) {
+		error = -EFSCORRUPTED;
+		goto out;
+	}
+
+	error = mutex_lock_interruptible(&bfc->bc_mutex);
+	if (error)
+		goto out;
+
+	error = incfs_write_signature_to_backing_file(bfc,
+				range((u8 *)&sig, sizeof(sig)),
+				hash_tree->hash_tree_area_size,
+				&hash_offset, &sig_offset);
+	mutex_unlock(&bfc->bc_mutex);
+	if (error)
+		goto out;
+
+	/* Populate merkle tree */
+	error = incfs_build_merkle_tree(f, df, bfc, hash_tree, hash_offset, alg,
+					hash);
+	if (error)
+		goto out;
+
+	/* Update signature metadata record */
+	memcpy(sig.hash_section.root_hash, hash.data, alg->digest_size);
+	result = incfs_kwrite(bfc, &sig, sizeof(sig), sig_offset);
+	if (result < 0) {
+		error = result;
+		goto out;
+	}
+
+	if (result != sizeof(sig)) {
+		error = -EIO;
+		goto out;
+	}
+
+	/* Update in-memory records */
+	memcpy(hash_tree->root_hash, hash.data, alg->digest_size);
+	signature = kzalloc(sizeof(*signature), GFP_NOFS);
+	if (!signature) {
+		error = -ENOMEM;
+		goto out;
+	}
+	*signature = (struct incfs_df_signature) {
+		.hash_offset = hash_offset,
+		.hash_size = hash_tree->hash_tree_area_size,
+		.sig_offset = sig_offset,
+		.sig_size = sizeof(sig),
+	};
+	df->df_signature = signature;
+	signature = NULL;
+
+	/*
+	 * Use memory barrier to prevent readpage seeing the hash tree until
+	 * it's fully there
+	 */
+	smp_store_release(&df->df_hash_tree, hash_tree);
+	hash_tree = NULL;
+
+out:
+	kfree(signature);
+	kfree(hash_tree);
+	return error;
+}
+
+/*
+ * Core of FS_IOC_ENABLE_VERITY: ensure the file has a signature record
+ * and Merkle tree, copy in the user's optional signature, compute and
+ * cache the fs-verity file digest, and mark the inode S_VERITY.
+ *
+ * Serialized per-file by df_enable_verity; -EEXIST if already enabled.
+ */
+static int incfs_enable_verity(struct file *filp,
+			       const struct fsverity_enable_arg *arg)
+{
+	struct inode *inode = file_inode(filp);
+	struct data_file *df = get_incfs_data_file(filp);
+	u8 *signature = NULL;
+	struct mem_range verity_file_digest = range(NULL, 0);
+	int err;
+
+	if (!df)
+		return -EFSCORRUPTED;
+
+	err = mutex_lock_interruptible(&df->df_enable_verity);
+	if (err)
+		return err;
+
+	if (IS_VERITY(inode)) {
+		err = -EEXIST;
+		goto out;
+	}
+
+	/* Builds the Merkle tree if the file was created without one */
+	err = incfs_add_signature_record(filp);
+	if (err)
+		goto out;
+
+	/* Get the signature if the user provided one */
+	if (arg->sig_size) {
+		signature = memdup_user(u64_to_user_ptr(arg->sig_ptr),
+					arg->sig_size);
+		if (IS_ERR(signature)) {
+			err = PTR_ERR(signature);
+			signature = NULL;
+			goto out;
+		}
+	}
+
+	verity_file_digest = incfs_calc_verity_digest(inode, filp,
+						      arg->hash_algorithm);
+	if (IS_ERR(verity_file_digest.data)) {
+		err = PTR_ERR(verity_file_digest.data);
+		verity_file_digest.data = NULL;
+		goto out;
+	}
+
+	err = incfs_end_enable_verity(filp, signature, arg->sig_size);
+	if (err)
+		goto out;
+
+	/* Successfully enabled verity */
+	incfs_set_verity_digest(inode, verity_file_digest);
+	verity_file_digest.data = NULL;	/* ownership passed on */
+out:
+	mutex_unlock(&df->df_enable_verity);
+	kfree(signature);
+	kfree(verity_file_digest.data);
+	if (err)
+		pr_err("%s failed with err %d\n", __func__, err);
+	return err;
+}
+
+/*
+ * FS_IOC_ENABLE_VERITY entry point: validate the user's
+ * fsverity_enable_arg (only version 1, SHA-256, PAGE_SIZE blocks and no
+ * salt are accepted), then hand off to incfs_enable_verity().
+ */
+int incfs_ioctl_enable_verity(struct file *filp, const void __user *uarg)
+{
+	struct inode *inode = file_inode(filp);
+	struct fsverity_enable_arg arg;
+
+	if (copy_from_user(&arg, uarg, sizeof(arg)))
+		return -EFAULT;
+
+	if (arg.version != 1)
+		return -EINVAL;
+
+	/* Reserved fields must be zero for forward compatibility */
+	if (arg.__reserved1 ||
+	    memchr_inv(arg.__reserved2, 0, sizeof(arg.__reserved2)))
+		return -EINVAL;
+
+	if (arg.hash_algorithm != FS_VERITY_HASH_ALG_SHA256)
+		return -EINVAL;
+
+	if (arg.block_size != PAGE_SIZE)
+		return -EINVAL;
+
+	if (arg.salt_size)
+		return -EINVAL;
+
+	if (arg.sig_size > FS_VERITY_MAX_SIGNATURE_SIZE)
+		return -EMSGSIZE;
+
+	if (S_ISDIR(inode->i_mode))
+		return -EISDIR;
+
+	if (!S_ISREG(inode->i_mode))
+		return -EINVAL;
+
+	return incfs_enable_verity(filp, &arg);
+}
+
+/*
+ * Read the stored verity signature from the backing file.
+ *
+ * Returns a kzalloc'd buffer (caller frees) with *sig_size set, NULL
+ * with *sig_size == 0 if the file has no signature, or an ERR_PTR.
+ */
+static u8 *incfs_get_verity_signature(struct file *filp, size_t *sig_size)
+{
+	struct data_file *df = get_incfs_data_file(filp);
+	struct incfs_df_verity_signature *vs;
+	u8 *signature;
+	int res;
+
+	if (!df || !df->df_backing_file_context)
+		return ERR_PTR(-EFSCORRUPTED);
+
+	vs = df->df_verity_signature;
+	if (!vs) {
+		*sig_size = 0;
+		return NULL;
+	}
+
+	/* A signature record with zero size is corrupt metadata */
+	if (!vs->size) {
+		*sig_size = 0;
+		return ERR_PTR(-EFSCORRUPTED);
+	}
+
+	signature = kzalloc(vs->size, GFP_KERNEL);
+	if (!signature)
+		return ERR_PTR(-ENOMEM);
+
+	res = incfs_kread(df->df_backing_file_context,
+			  signature, vs->size, vs->offset);
+
+	if (res < 0)
+		goto err_out;
+
+	/* Short read means the backing file does not match the record */
+	if (res != vs->size) {
+		res = -EINVAL;
+		goto err_out;
+	}
+
+	*sig_size = vs->size;
+	return signature;
+
+err_out:
+	kfree(signature);
+	return ERR_PTR(res);
+}
+
+/* Ensure data_file->df_verity_file_digest is populated */
+static int ensure_verity_info(struct inode *inode, struct file *filp)
+{
+	struct mem_range digest = incfs_get_verity_digest(inode);
+
+	/* Already computed and published - nothing to do. */
+	if (digest.data)
+		return 0;
+
+	digest = incfs_calc_verity_digest(inode, filp,
+					  FS_VERITY_HASH_ALG_SHA256);
+	if (IS_ERR(digest.data))
+		return PTR_ERR(digest.data);
+
+	incfs_set_verity_digest(inode, digest);
+	return 0;
+}
+
+/**
+ * incfs_fsverity_file_open() - prepare to open a file that may be
+ * verity-enabled
+ * @inode: the inode being opened
+ * @filp: the struct file being set up
+ *
+ * When opening a verity file, set up data_file->df_verity_file_digest if not
+ * already done. Note that incfs does not allow opening for writing, so there is
+ * no need for that check.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int incfs_fsverity_file_open(struct inode *inode, struct file *filp)
+{
+	if (!IS_VERITY(inode))
+		return 0;
+
+	return ensure_verity_info(inode, filp);
+}
+
+/*
+ * FS_IOC_MEASURE_VERITY: copy the cached fs-verity file digest to the
+ * caller. Returns -ENODATA if the file is not a verity file and
+ * -EOVERFLOW if the user's buffer is too small.
+ */
+int incfs_ioctl_measure_verity(struct file *filp, void __user *_uarg)
+{
+	struct inode *inode = file_inode(filp);
+	struct mem_range verity_file_digest = incfs_get_verity_digest(inode);
+	struct fsverity_digest __user *uarg = _uarg;
+	struct fsverity_digest arg;
+
+	if (!verity_file_digest.data || !verity_file_digest.len)
+		return -ENODATA; /* not a verity file */
+
+	/*
+	 * The user specifies the digest_size their buffer has space for; we can
+	 * return the digest if it fits in the available space. We write back
+	 * the actual size, which may be shorter than the user-specified size.
+	 */
+
+	if (get_user(arg.digest_size, &uarg->digest_size))
+		return -EFAULT;
+	if (arg.digest_size < verity_file_digest.len)
+		return -EOVERFLOW;
+
+	/* Re-zero arg: only the fields set below are returned to the user */
+	memset(&arg, 0, sizeof(arg));
+	arg.digest_algorithm = FS_VERITY_HASH_ALG_SHA256;
+	arg.digest_size = verity_file_digest.len;
+
+	if (copy_to_user(uarg, &arg, sizeof(arg)))
+		return -EFAULT;
+
+	/* Digest bytes follow the header in the user's fsverity_digest */
+	if (copy_to_user(uarg->digest, verity_file_digest.data,
+			 verity_file_digest.len))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * Copy up to @length bytes of the Merkle tree, starting at
+ * @start_offset, into the user buffer @buf.
+ *
+ * Returns the number of bytes copied, or a negative errno if nothing
+ * could be copied.
+ */
+static int incfs_read_merkle_tree(struct file *filp, void __user *buf,
+				  u64 start_offset, int length)
+{
+	struct mem_range tmp_buf;
+	size_t offset;
+	int retval = 0;
+	int err = 0;
+	struct data_file *df = get_incfs_data_file(filp);
+
+	if (!df)
+		return -EINVAL;
+
+	tmp_buf = (struct mem_range) {
+		.data = kzalloc(INCFS_DATA_FILE_BLOCK_SIZE, GFP_NOFS),
+		.len = INCFS_DATA_FILE_BLOCK_SIZE,
+	};
+	if (!tmp_buf.data)
+		return -ENOMEM;
+
+	for (offset = start_offset; offset < start_offset + length;
+	     offset += tmp_buf.len) {
+		err = incfs_read_merkle_tree_blocks(tmp_buf, df, offset);
+
+		if (err < 0)
+			break;
+
+		/* Short read: reached the end of the hash tree area */
+		if (err != tmp_buf.len)
+			break;
+
+		if (copy_to_user(buf, tmp_buf.data, tmp_buf.len)) {
+			/*
+			 * Report -EFAULT instead of leaking the positive
+			 * byte count left in err by the read above.
+			 */
+			err = -EFAULT;
+			break;
+		}
+
+		buf += tmp_buf.len;
+		retval += tmp_buf.len;
+	}
+
+	kfree(tmp_buf.data);
+	return retval ? retval : err;
+}
+
+/*
+ * Copy the fs-verity descriptor to user space. Returns the number of
+ * bytes copied or a negative errno.
+ *
+ * NOTE(review): @offset is ignored, so reads always start at the
+ * beginning of the descriptor - confirm callers only pass offset 0.
+ */
+static int incfs_read_descriptor(struct file *filp,
+				 void __user *buf, u64 offset, int length)
+{
+	int err = 0;
+	struct fsverity_descriptor *desc = incfs_get_fsverity_descriptor(filp,
+						FS_VERITY_HASH_ALG_SHA256);
+
+	if (IS_ERR(desc))
+		return PTR_ERR(desc);
+	length = min_t(u64, length, sizeof(*desc));
+	/* copy_to_user() returns uncopied bytes, not -errno: convert it */
+	if (copy_to_user(buf, desc, length))
+		err = -EFAULT;
+	kfree(desc);
+	return err ? err : length;
+}
+
+/*
+ * Copy the stored verity signature to user space. Returns the number of
+ * bytes copied, -ENODATA if the file has no signature, or a negative
+ * errno.
+ */
+static int incfs_read_signature(struct file *filp,
+				void __user *buf, u64 offset, int length)
+{
+	size_t sig_size;
+	/*
+	 * Must be an ordinary local: declaring this pointer 'static' made
+	 * concurrent ioctls share it, racing kfree() against reassignment
+	 * (potential double-free / leak).
+	 */
+	u8 *signature;
+	int err = 0;
+
+	signature = incfs_get_verity_signature(filp, &sig_size);
+	if (IS_ERR(signature))
+		return PTR_ERR(signature);
+
+	if (!signature)
+		return -ENODATA;
+
+	length = min_t(u64, length, sig_size);
+	/* copy_to_user() returns uncopied bytes, not -errno: convert it */
+	if (copy_to_user(buf, signature, length))
+		err = -EFAULT;
+	kfree(signature);
+	return err ? err : length;
+}
+
+/*
+ * FS_IOC_READ_VERITY_METADATA: dispatch to the Merkle-tree, descriptor
+ * or signature reader based on the requested metadata_type.
+ */
+int incfs_ioctl_read_verity_metadata(struct file *filp,
+				     const void __user *uarg)
+{
+	struct fsverity_read_metadata_arg arg;
+	int length;
+	void __user *buf;
+
+	if (copy_from_user(&arg, uarg, sizeof(arg)))
+		return -EFAULT;
+
+	if (arg.__reserved)
+		return -EINVAL;
+
+	/* offset + length must not overflow. */
+	if (arg.offset + arg.length < arg.offset)
+		return -EINVAL;
+
+	/* Ensure that the return value will fit in INT_MAX. */
+	length = min_t(u64, arg.length, INT_MAX);
+
+	buf = u64_to_user_ptr(arg.buf_ptr);
+
+	switch (arg.metadata_type) {
+	case FS_VERITY_METADATA_TYPE_MERKLE_TREE:
+		return incfs_read_merkle_tree(filp, buf, arg.offset, length);
+	case FS_VERITY_METADATA_TYPE_DESCRIPTOR:
+		return incfs_read_descriptor(filp, buf, arg.offset, length);
+	case FS_VERITY_METADATA_TYPE_SIGNATURE:
+		return incfs_read_signature(filp, buf, arg.offset, length);
+	default:
+		return -EINVAL;
+	}
+}
diff --git a/fs/incfs/verity.h b/fs/incfs/verity.h
new file mode 100644
--- /dev/null
+++ b/fs/incfs/verity.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2020 Google LLC
+ */
+
+#ifndef _INCFS_VERITY_H
+#define _INCFS_VERITY_H
+
+/* Arbitrary limit to bound the kmalloc() size. Can be changed. */
+#define FS_VERITY_MAX_SIGNATURE_SIZE 16128
+
+#ifdef CONFIG_FS_VERITY
+
+int incfs_ioctl_enable_verity(struct file *filp, const void __user *uarg);
+int incfs_ioctl_measure_verity(struct file *filp, void __user *_uarg);
+
+int incfs_fsverity_file_open(struct inode *inode, struct file *filp);
+int incfs_ioctl_read_verity_metadata(struct file *filp,
+ const void __user *uarg);
+
+#else /* !CONFIG_FS_VERITY */
+
+/* Stubs: without CONFIG_FS_VERITY every verity operation is unsupported. */
+static inline int incfs_ioctl_enable_verity(struct file *filp,
+					    const void __user *uarg)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int incfs_ioctl_measure_verity(struct file *filp,
+					     void __user *_uarg)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int incfs_fsverity_file_open(struct inode *inode,
+					   struct file *filp)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int incfs_ioctl_read_verity_metadata(struct file *filp,
+						   const void __user *uarg)
+{
+	return -EOPNOTSUPP;
+}
+
+#endif /* !CONFIG_FS_VERITY */
+
+#endif
diff --git a/fs/incfs/vfs.c b/fs/incfs/vfs.c
new file mode 100644
--- /dev/null
+++ b/fs/incfs/vfs.c
@@ -0,0 +1,1994 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2018 Google LLC
+ */
+
+#include <linux/blkdev.h>
+#include <linux/compat.h>
+#include <linux/delay.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/fs_stack.h>
+#include <linux/fsnotify.h>
+#include <linux/fsverity.h>
+#include <linux/mmap_lock.h>
+#include <linux/namei.h>
+#include <linux/pagemap.h>
+#include <linux/parser.h>
+#include <linux/seq_file.h>
+#include <linux/backing-dev-defs.h>
+
+#include <uapi/linux/incrementalfs.h>
+
+#include "vfs.h"
+
+#include "data_mgmt.h"
+#include "format.h"
+#include "internal.h"
+#include "pseudo_files.h"
+#include "sysfs.h"
+#include "verity.h"
+
+static int incfs_remount_fs(struct super_block *sb, int *flags, char *data);
+
+static int dentry_revalidate(struct dentry *dentry, unsigned int flags);
+static void dentry_release(struct dentry *d);
+
+static int iterate_incfs_dir(struct file *file, struct dir_context *ctx);
+static struct dentry *dir_lookup(struct inode *dir_inode,
+ struct dentry *dentry, unsigned int flags);
+static int dir_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode);
+static int dir_unlink(struct inode *dir, struct dentry *dentry);
+static int dir_link(struct dentry *old_dentry, struct inode *dir,
+ struct dentry *new_dentry);
+static int dir_rmdir(struct inode *dir, struct dentry *dentry);
+static int dir_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags);
+
+static int file_open(struct inode *inode, struct file *file);
+static int file_release(struct inode *inode, struct file *file);
+static int read_folio(struct file *f, struct folio *folio);
+static long dispatch_ioctl(struct file *f, unsigned int req, unsigned long arg);
+
+#ifdef CONFIG_COMPAT
+static long incfs_compat_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg);
+#endif
+
+static struct inode *alloc_inode(struct super_block *sb);
+static void free_inode(struct inode *inode);
+static void evict_inode(struct inode *inode);
+
+static int incfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct iattr *ia);
+static int incfs_getattr(struct mnt_idmap *idmap, const struct path *path,
+ struct kstat *stat, u32 request_mask,
+ unsigned int query_flags);
+static ssize_t incfs_getxattr(struct dentry *d, const char *name,
+ void *value, size_t size);
+static ssize_t incfs_setxattr(struct mnt_idmap *idmap, struct dentry *d,
+ const char *name, void *value, size_t size,
+ int flags);
+static ssize_t incfs_listxattr(struct dentry *d, char *list, size_t size);
+
+static int show_options(struct seq_file *, struct dentry *);
+
+/* Superblock operations for incfs mounts. */
+static const struct super_operations incfs_super_ops = {
+	.statfs = simple_statfs,
+	.remount_fs = incfs_remount_fs,
+	.alloc_inode	= alloc_inode,
+	.destroy_inode	= free_inode,
+	.evict_inode = evict_inode,
+	.show_options = show_options
+};
+
+/* Adapter for ->rename: drops the unused idmap and calls dir_rename(). */
+static int dir_rename_wrap(struct mnt_idmap *idmap, struct inode *old_dir,
+			   struct dentry *old_dentry, struct inode *new_dir,
+			   struct dentry *new_dentry, unsigned int flags)
+{
+	return dir_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
+}
+
+/* Inode operations for incfs directories. */
+static const struct inode_operations incfs_dir_inode_ops = {
+	.lookup = dir_lookup,
+	.mkdir = dir_mkdir,
+	.rename = dir_rename_wrap,
+	.unlink = dir_unlink,
+	.link = dir_link,
+	.rmdir = dir_rmdir,
+	.setattr = incfs_setattr,
+};
+
+/* Wrap iterate_incfs_dir for the iterate_shared API. */
+WRAP_DIR_ITER(iterate_incfs_dir) // FIXME!
+/* File operations for incfs directories. */
+static const struct file_operations incfs_dir_fops = {
+	.llseek = generic_file_llseek,
+	.read = generic_read_dir,
+	.iterate_shared = shared_iterate_incfs_dir,
+	.open = file_open,
+	.release = file_release,
+};
+
+/* Dentry operations for incfs dentries. */
+static const struct dentry_operations incfs_dentry_ops = {
+	.d_revalidate = dentry_revalidate,
+	.d_release = dentry_release
+};
+
+/* Address-space operations; only read_folio is implemented. */
+static const struct address_space_operations incfs_address_space_ops = {
+	.read_folio = read_folio,
+	/* .readpages = readpages */
+};
+
+/*
+ * Page-fault handler: clears FAULT_FLAG_ALLOW_RETRY before delegating
+ * to filemap_fault() - presumably so the fault waits for incfs data
+ * instead of retrying (NOTE(review): confirm intent).
+ */
+static vm_fault_t incfs_fault(struct vm_fault *vmf)
+{
+	vmf->flags &= ~FAULT_FLAG_ALLOW_RETRY;
+	return filemap_fault(vmf);
+}
+
+/* VM operations for mmapped incfs files. */
+static const struct vm_operations_struct incfs_file_vm_ops = {
+	.fault = incfs_fault,
+	.map_pages = filemap_map_pages,
+	.page_mkwrite = filemap_page_mkwrite,
+};
+
+/* This is used for a general mmap of a disk file */
+
+static int incfs_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	const struct address_space_operations *a_ops =
+		file->f_mapping->a_ops;
+
+	/* Faulting in pages requires a read_folio implementation. */
+	if (!a_ops->read_folio)
+		return -ENOEXEC;
+
+	file_accessed(file);
+	vma->vm_ops = &incfs_file_vm_ops;
+	return 0;
+}
+
+/* File operations for regular incfs files (read-only access paths). */
+const struct file_operations incfs_file_ops = {
+	.open = file_open,
+	.release = file_release,
+	.read_iter = generic_file_read_iter,
+	.mmap = incfs_file_mmap,
+	.splice_read = filemap_splice_read,
+	.llseek = generic_file_llseek,
+	.unlocked_ioctl = dispatch_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl = incfs_compat_ioctl,
+#endif
+};
+
+/* Inode operations for regular incfs files. */
+const struct inode_operations incfs_file_inode_ops = {
+	.setattr = incfs_setattr,
+	.getattr = incfs_getattr,
+	.listxattr = incfs_listxattr
+};
+
+/* xattr-handler shim: forwards get requests to incfs_getxattr(). */
+static int incfs_handler_getxattr(const struct xattr_handler *xh,
+				  struct dentry *d, struct inode *inode,
+				  const char *name, void *buffer, size_t size)
+{
+	return incfs_getxattr(d, name, buffer, size);
+}
+
+/* xattr-handler shim: forwards set requests to incfs_setxattr(). */
+static int incfs_handler_setxattr(const struct xattr_handler *xh,
+				  struct mnt_idmap *idmap,
+				  struct dentry *d, struct inode *inode,
+				  const char *name, const void *buffer,
+				  size_t size, int flags)
+{
+	return incfs_setxattr(idmap, d, name, (void *)buffer, size, flags);
+}
+
+/* Catch-all xattr handler: empty prefix matches every attribute name. */
+static const struct xattr_handler incfs_xattr_handler = {
+	.prefix = "",	/* AKA all attributes */
+	.get = incfs_handler_getxattr,
+	.set = incfs_handler_setxattr,
+};
+
+/* NULL-terminated handler table installed on the superblock. */
+static const struct xattr_handler *incfs_xattr_ops[] = {
+	&incfs_xattr_handler,
+	NULL,
+};
+
+/*
+ * Search key passed to iget5_locked() via inode_test()/inode_set():
+ * identifies an incfs inode by backing dentry and inode number and
+ * carries attributes pre-read from the backing file's xattrs.
+ */
+struct inode_search {
+	unsigned long ino;	/* expected inode number */
+
+	struct dentry *backing_dentry;	/* dentry in the backing filesystem */
+
+	size_t size;	/* file size read from the size xattr */
+
+	bool verity;	/* true if the verity xattr is present */
+};
+
+/* Token ids for the mount options parsed by parse_options(). */
+enum parse_parameter {
+	Opt_read_timeout,
+	Opt_readahead_pages,
+	Opt_rlog_pages,
+	Opt_rlog_wakeup_cnt,
+	Opt_report_uid,
+	Opt_sysfs_name,
+	Opt_err
+};
+
+/* match_token() patterns corresponding to parse_parameter. */
+static const match_table_t option_tokens = {
+	{ Opt_read_timeout, "read_timeout_ms=%u" },
+	{ Opt_readahead_pages, "readahead=%u" },
+	{ Opt_rlog_pages, "rlog_pages=%u" },
+	{ Opt_rlog_wakeup_cnt, "rlog_wakeup_cnt=%u" },
+	{ Opt_report_uid, "report_uid" },
+	{ Opt_sysfs_name, "sysfs_name=%s" },
+	{ Opt_err, NULL }
+};
+
+/* Release heap-allocated mount options; safe to call more than once. */
+static void free_options(struct mount_options *opts)
+{
+	kfree(opts->sysfs_name);
+	opts->sysfs_name = NULL;
+}
+
+/*
+ * Parse the comma-separated mount option string @str into @opts,
+ * starting from built-in defaults. Returns 0 on success or -EINVAL on a
+ * malformed/unknown option (freeing any allocated options first).
+ */
+static int parse_options(struct mount_options *opts, char *str)
+{
+	substring_t args[MAX_OPT_ARGS];
+	int value;
+	char *position;
+
+	if (opts == NULL)
+		return -EFAULT;
+
+	/* Defaults applied before any option is parsed */
+	*opts = (struct mount_options) {
+		.read_timeout_ms = 1000, /* Default: 1s */
+		.readahead_pages = 10,
+		.read_log_pages = 2,
+		.read_log_wakeup_count = 10,
+	};
+
+	if (str == NULL || *str == 0)
+		return 0;
+
+	while ((position = strsep(&str, ",")) != NULL) {
+		int token;
+
+		if (!*position)
+			continue;
+
+		token = match_token(position, option_tokens, args);
+
+		switch (token) {
+		case Opt_read_timeout:
+			if (match_int(&args[0], &value))
+				return -EINVAL;
+			/* Cap at one hour */
+			if (value > 3600000)
+				return -EINVAL;
+			opts->read_timeout_ms = value;
+			break;
+		case Opt_readahead_pages:
+			if (match_int(&args[0], &value))
+				return -EINVAL;
+			opts->readahead_pages = value;
+			break;
+		case Opt_rlog_pages:
+			if (match_int(&args[0], &value))
+				return -EINVAL;
+			opts->read_log_pages = value;
+			break;
+		case Opt_rlog_wakeup_cnt:
+			if (match_int(&args[0], &value))
+				return -EINVAL;
+			opts->read_log_wakeup_count = value;
+			break;
+		case Opt_report_uid:
+			opts->report_uid = true;
+			break;
+		case Opt_sysfs_name:
+			/*
+			 * NOTE(review): match_strdup() may return NULL on
+			 * allocation failure - confirm consumers of
+			 * sysfs_name tolerate NULL.
+			 */
+			opts->sysfs_name = match_strdup(&args[0]);
+			break;
+		default:
+			free_options(opts);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+/* Read file size from the attribute. Quicker than reading the header */
+static u64 read_size_attr(struct dentry *backing_dentry)
+{
+	__le64 raw_size;
+	ssize_t read_res = vfs_getxattr(&nop_mnt_idmap, backing_dentry,
+					INCFS_XATTR_SIZE_NAME,
+					(char *)&raw_size, sizeof(raw_size));
+
+	/* Missing or short attribute: treat the file as zero-length. */
+	return read_res == sizeof(raw_size) ? le64_to_cpu(raw_size) : 0;
+}
+
+/* Read verity flag from the attribute. Quicker than reading the header */
+static bool read_verity_attr(struct dentry *backing_dentry)
+{
+	ssize_t res = vfs_getxattr(&nop_mnt_idmap, backing_dentry,
+				   INCFS_XATTR_VERITY_NAME, NULL, 0);
+
+	/* Mere presence of the xattr marks the file as a verity file. */
+	return res >= 0;
+}
+
+/* iget5_locked() match callback: does @inode correspond to @opaque? */
+static int inode_test(struct inode *inode, void *opaque)
+{
+	struct inode_search *search = opaque;
+	struct inode_info *info = get_incfs_node(inode);
+
+	/* Inodes without incfs private data can never match. */
+	if (!info)
+		return 0;
+
+	if (inode->i_ino != search->ino)
+		return 0;
+
+	return info->n_backing_inode == d_inode(search->backing_dentry);
+}
+
+/*
+ * iget5_locked() init callback: populate a freshly allocated incfs
+ * inode from its backing inode and the pre-read search attributes.
+ */
+static int inode_set(struct inode *inode, void *opaque)
+{
+	struct inode_search *search = opaque;
+	struct inode_info *node = get_incfs_node(inode);
+	struct dentry *backing_dentry = search->backing_dentry;
+	struct inode *backing_inode = d_inode(backing_dentry);
+
+	fsstack_copy_attr_all(inode, backing_inode);
+	if (S_ISREG(inode->i_mode)) {
+		u64 size = search->size;
+
+		inode->i_size = size;
+		inode->i_blocks = get_blocks_count_for_size(size);
+		inode->i_mapping->a_ops = &incfs_address_space_ops;
+		inode->i_op = &incfs_file_inode_ops;
+		inode->i_fop = &incfs_file_ops;
+		/* incfs files are never writable through the mount */
+		inode->i_mode &= ~0222;
+		if (search->verity)
+			inode_set_flags(inode, S_VERITY, S_VERITY);
+	} else if (S_ISDIR(inode->i_mode)) {
+		inode->i_size = 0;
+		inode->i_blocks = 1;
+		inode->i_mapping->a_ops = &incfs_address_space_ops;
+		inode->i_op = &incfs_dir_inode_ops;
+		inode->i_fop = &incfs_dir_fops;
+	} else {
+		/* Only regular files and directories are supported */
+		pr_warn_once("incfs: Unexpected inode type\n");
+		return -EBADF;
+	}
+
+	/* Hold the backing inode for the lifetime of this inode */
+	ihold(backing_inode);
+	node->n_backing_inode = backing_inode;
+	node->n_mount_info = get_mount_info(inode->i_sb);
+	inode_set_ctime_to_ts(inode, inode_get_ctime(backing_inode));
+	inode_set_mtime_to_ts(inode, inode_get_mtime(backing_inode));
+	inode_set_atime_to_ts(inode, inode_get_atime(backing_inode));
+	/* Mirror the backing inode number; low numbers are reserved */
+	inode->i_ino = backing_inode->i_ino;
+	if (backing_inode->i_ino < INCFS_START_INO_RANGE) {
+		pr_warn("incfs: ino conflict with backing FS %ld\n",
+			backing_inode->i_ino);
+	}
+
+	return 0;
+}
+
+/*
+ * Find or create the incfs inode corresponding to @backing_dentry.
+ * Size and verity state are pre-read from xattrs so inode_set() can
+ * initialize a new inode without touching the backing-file header.
+ * Returns the inode or ERR_PTR(-ENOMEM).
+ */
+static struct inode *fetch_regular_inode(struct super_block *sb,
+					 struct dentry *backing_dentry)
+{
+	struct inode *result;
+	struct inode_search search = {
+		.ino = d_inode(backing_dentry)->i_ino,
+		.backing_dentry = backing_dentry,
+		.size = read_size_attr(backing_dentry),
+		.verity = read_verity_attr(backing_dentry),
+	};
+
+	result = iget5_locked(sb, search.ino, inode_test, inode_set, &search);
+	if (!result)
+		return ERR_PTR(-ENOMEM);
+
+	if (result->i_state & I_NEW)
+		unlock_new_inode(result);
+
+	return result;
+}
+
+/*
+ * ->iterate handler for incfs directories.  In the mount root the pseudo
+ * files are emitted first; around the backing iterate_dir() call,
+ * ctx->pos is shifted down by PSEUDO_FILE_COUNT so the backing directory
+ * sees positions that do not include the pseudo entries, then shifted
+ * back so userspace offsets stay monotonic across both name spaces.
+ */
+static int iterate_incfs_dir(struct file *file, struct dir_context *ctx)
+{
+	struct dir_file *dir = get_incfs_dir_file(file);
+	int error = 0;
+	struct mount_info *mi = get_mount_info(file_superblock(file));
+	bool root;
+
+	if (!dir) {
+		error = -EBADF;
+		goto out;
+	}
+
+	/* Pseudo files only exist at the root of the mount */
+	root = dir->backing_dir->f_inode
+			== d_inode(mi->mi_backing_dir_path.dentry);
+
+	if (root) {
+		error = emit_pseudo_files(ctx);
+		if (error)
+			goto out;
+	}
+
+	ctx->pos -= PSEUDO_FILE_COUNT;
+	error = iterate_dir(dir->backing_dir, ctx);
+	ctx->pos += PSEUDO_FILE_COUNT;
+	/* Keep our f_pos in sync with the backing directory's position */
+	file->f_pos = dir->backing_dir->f_pos;
+out:
+	if (error)
+		pr_warn("incfs: %s %s %d\n", __func__,
+			file->f_path.dentry->d_name.name, error);
+	return error;
+}
+
+/*
+ * Attach a dentry_info holding a counted reference to @path as the
+ * backing path of @dentry (stored in d_fsdata, released in
+ * dentry_release()).  Returns 0, -EFAULT on NULL args, or -ENOMEM.
+ */
+static int incfs_init_dentry(struct dentry *dentry, struct path *path)
+{
+	struct dentry_info *info;
+
+	if (!dentry || !path)
+		return -EFAULT;
+
+	info = kzalloc(sizeof(*info), GFP_NOFS);
+	if (!info)
+		return -ENOMEM;
+
+	info->backing_path = *path;
+	path_get(path);
+	dentry->d_fsdata = info;
+
+	return 0;
+}
+
+/*
+ * Look up @name under @backing_dir, creating it as a directory (mode
+ * 0777) if it does not exist.  Used for the special .index/.incomplete
+ * directories.  *created reports whether the directory was newly made.
+ * Returns the (referenced) dentry or an ERR_PTR.
+ */
+static struct dentry *open_or_create_special_dir(struct dentry *backing_dir,
+						 const char *name,
+						 bool *created)
+{
+	struct dentry *index_dentry;
+	struct inode *backing_inode = d_inode(backing_dir);
+	int err = 0;
+
+	index_dentry = incfs_lookup_dentry(backing_dir, name);
+	if (!index_dentry) {
+		return ERR_PTR(-EINVAL);
+	} else if (IS_ERR(index_dentry)) {
+		return index_dentry;
+	} else if (d_really_is_positive(index_dentry)) {
+		/* Index already exists. */
+		*created = false;
+		return index_dentry;
+	}
+
+	/* Index needs to be created. */
+	inode_lock_nested(backing_inode, I_MUTEX_PARENT);
+	err = vfs_mkdir(&nop_mnt_idmap, backing_inode, index_dentry, 0777);
+	inode_unlock(backing_inode);
+
+	if (err) {
+		dput(index_dentry);
+		return ERR_PTR(err);
+	}
+
+	/* mkdir succeeded but the dentry is unusable: treat as failure */
+	if (!d_really_is_positive(index_dentry) ||
+		unlikely(d_unhashed(index_dentry))) {
+		dput(index_dentry);
+		return ERR_PTR(-EINVAL);
+	}
+
+	*created = true;
+	return index_dentry;
+}
+
+/*
+ * Read one block of @df applying the read timeouts configured for the
+ * current uid (if any entry matches in mi_per_uid_read_timeouts),
+ * otherwise falling back to the mount-wide read_timeout_ms option.
+ * *delayed_min_us is filled by the lower layer with any artificial
+ * delay still owed to the caller.
+ */
+static int read_single_page_timeouts(struct data_file *df, struct file *f,
+				     int block_index, struct mem_range range,
+				     struct mem_range tmp,
+				     unsigned int *delayed_min_us)
+{
+	struct mount_info *mi = df->df_mount_info;
+	struct incfs_read_data_file_timeouts timeouts = {
+		.max_pending_time_us = U32_MAX,
+	};
+	int uid = current_uid().val;
+	int i;
+
+	spin_lock(&mi->mi_per_uid_read_timeouts_lock);
+	for (i = 0; i < mi->mi_per_uid_read_timeouts_size /
+		sizeof(*mi->mi_per_uid_read_timeouts); ++i) {
+		struct incfs_per_uid_read_timeouts *t =
+			&mi->mi_per_uid_read_timeouts[i];
+
+		if (t->uid == uid) {
+			timeouts.min_time_us = t->min_time_us;
+			timeouts.min_pending_time_us = t->min_pending_time_us;
+			timeouts.max_pending_time_us = t->max_pending_time_us;
+			break;
+		}
+	}
+	spin_unlock(&mi->mi_per_uid_read_timeouts_lock);
+	/* No per-uid override: derive the cap from the mount option */
+	if (timeouts.max_pending_time_us == U32_MAX) {
+		u64 read_timeout_us = (u64)mi->mi_options.read_timeout_ms *
+					1000;
+
+		timeouts.max_pending_time_us = read_timeout_us <= U32_MAX ?
+					read_timeout_us : U32_MAX;
+	}
+
+	return incfs_read_data_file_block(range, f, block_index, tmp,
+					  &timeouts, delayed_min_us);
+}
+
+/*
+ * Sleep for @us microseconds choosing the mechanism appropriate to the
+ * duration (busy-wait, hrtimer range sleep, or interruptible msleep).
+ * Returns 0, or the nonzero remainder from msleep_interruptible() if
+ * the long sleep was interrupted by a signal.
+ */
+static int usleep_interruptible(u32 us)
+{
+	/* See:
+	 * https://www.kernel.org/doc/Documentation/timers/timers-howto.txt
+	 * for explanation
+	 */
+	if (us < 10) {
+		udelay(us);
+		return 0;
+	} else if (us < 20000) {
+		usleep_range(us, us + us / 10);
+		return 0;
+	} else {
+		return msleep_interruptible(us / 1000);
+	}
+}
+
+/*
+ * ->read_folio handler.  Maps the page, reads the covering data block
+ * via read_single_page_timeouts() (using a 2-block scratch buffer for
+ * the lower layer), zero-fills any tail past what was read, and sets
+ * the page Uptodate/Error accordingly.  Any artificial delay requested
+ * by the timeout machinery is slept off only after the page is
+ * unlocked, so the reader is throttled without holding the page lock.
+ */
+static int read_folio(struct file *f, struct folio *folio)
+{
+	struct page *page = &folio->page;
+	loff_t offset = 0;
+	loff_t size = 0;
+	ssize_t bytes_to_read = 0;
+	ssize_t read_result = 0;
+	struct data_file *df = get_incfs_data_file(f);
+	int result = 0;
+	void *page_start;
+	int block_index;
+	unsigned int delayed_min_us = 0;
+
+	if (!df) {
+		SetPageError(page);
+		unlock_page(page);
+		return -EBADF;
+	}
+
+	page_start = kmap(page);
+	offset = page_offset(page);
+	/* df_mapped_offset shifts file offsets into backing-block space */
+	block_index = (offset + df->df_mapped_offset) /
+		INCFS_DATA_FILE_BLOCK_SIZE;
+	size = df->df_size;
+
+	if (offset < size) {
+		struct mem_range tmp = {
+			.len = 2 * INCFS_DATA_FILE_BLOCK_SIZE
+		};
+		tmp.data = (u8 *)__get_free_pages(GFP_NOFS, get_order(tmp.len));
+		if (!tmp.data) {
+			read_result = -ENOMEM;
+			goto err;
+		}
+		bytes_to_read = min_t(loff_t, size - offset, PAGE_SIZE);
+
+		read_result = read_single_page_timeouts(df, f, block_index,
+					range(page_start, bytes_to_read), tmp,
+					&delayed_min_us);
+
+		free_pages((unsigned long)tmp.data, get_order(tmp.len));
+	} else {
+		/* Page entirely past EOF: present it as all zeroes */
+		bytes_to_read = 0;
+		read_result = 0;
+	}
+
+err:
+	if (read_result < 0)
+		result = read_result;
+	else if (read_result < PAGE_SIZE)
+		zero_user(page, read_result, PAGE_SIZE - read_result);
+
+	if (result == 0)
+		SetPageUptodate(page);
+	else
+		SetPageError(page);
+
+	flush_dcache_page(page);
+	kunmap(page);
+	unlock_page(page);
+	/* Apply any mandated read delay after releasing the page lock */
+	if (delayed_min_us)
+		usleep_interruptible(delayed_min_us);
+	return result;
+}
+
+/*
+ * Hard-link @what to @where in the backing fs, taking the parent's
+ * inode lock (I_MUTEX_PARENT) around vfs_link().  Returns 0 or the
+ * vfs_link() error.
+ */
+int incfs_link(struct dentry *what, struct dentry *where)
+{
+	struct dentry *parent = dget_parent(where);
+	struct inode *parent_inode = d_inode(parent);
+	int err;
+
+	inode_lock_nested(parent_inode, I_MUTEX_PARENT);
+	err = vfs_link(what, &nop_mnt_idmap, parent_inode, where, NULL);
+	inode_unlock(parent_inode);
+
+	dput(parent);
+	return err;
+}
+
+/*
+ * Unlink @dentry from the backing fs, taking its parent's inode lock
+ * (I_MUTEX_PARENT) around vfs_unlink().  Returns 0 or the vfs_unlink()
+ * error.
+ */
+int incfs_unlink(struct dentry *dentry)
+{
+	struct dentry *parent = dget_parent(dentry);
+	struct inode *parent_inode = d_inode(parent);
+	int err;
+
+	inode_lock_nested(parent_inode, I_MUTEX_PARENT);
+	err = vfs_unlink(&nop_mnt_idmap, parent_inode, dentry, NULL);
+	inode_unlock(parent_inode);
+
+	dput(parent);
+	return err;
+}
+
+/*
+ * Remove backing directory @dentry, taking its parent's inode lock
+ * (I_MUTEX_PARENT) around vfs_rmdir().  Returns 0 or the vfs_rmdir()
+ * error.
+ */
+static int incfs_rmdir(struct dentry *dentry)
+{
+	struct dentry *parent = dget_parent(dentry);
+	struct inode *parent_inode = d_inode(parent);
+	int err;
+
+	inode_lock_nested(parent_inode, I_MUTEX_PARENT);
+	err = vfs_rmdir(&nop_mnt_idmap, parent_inode, dentry);
+	inode_unlock(parent_inode);
+
+	dput(parent);
+	return err;
+}
+
+/*
+ * After a backing link of file @file_id_str has been removed, emit an
+ * fsnotify unlink event and drop the cached dentry for its copy under
+ * @special_directory (.index or .incomplete) at the mount root.
+ *
+ * NOTE(review): take_lock distinguishes how deep @dentry sits below the
+ * root; presumably incfs_lookup_dentry() takes the parent lock itself
+ * while lookup_one_len() must be used when that lock is already held —
+ * confirm against incfs_lookup_dentry()'s definition.
+ *
+ * Fix: on lookup failure the dentry variables must be reset to NULL
+ * before "goto out", otherwise dput() is called on an ERR_PTR (the
+ * same pattern file_delete() already follows).
+ */
+static void notify_unlink(struct dentry *dentry, const char *file_id_str,
+			  const char *special_directory)
+{
+	struct dentry *root = dentry;
+	struct dentry *file = NULL;
+	struct dentry *dir = NULL;
+	int error = 0;
+	bool take_lock = root->d_parent != root->d_parent->d_parent;
+
+	while (root != root->d_parent)
+		root = root->d_parent;
+
+	if (take_lock)
+		dir = incfs_lookup_dentry(root, special_directory);
+	else
+		dir = lookup_one_len(special_directory, root,
+				     strlen(special_directory));
+
+	if (IS_ERR(dir)) {
+		error = PTR_ERR(dir);
+		/* Never hand an ERR_PTR to dput() below */
+		dir = NULL;
+		goto out;
+	}
+	if (d_is_negative(dir)) {
+		error = -ENOENT;
+		goto out;
+	}
+
+	file = incfs_lookup_dentry(dir, file_id_str);
+	if (IS_ERR(file)) {
+		error = PTR_ERR(file);
+		/* Never hand an ERR_PTR to dput() below */
+		file = NULL;
+		goto out;
+	}
+	if (d_is_negative(file)) {
+		error = -ENOENT;
+		goto out;
+	}
+
+	fsnotify_unlink(d_inode(dir), file);
+	d_delete(file);
+
+out:
+	if (error)
+		pr_warn("%s failed with error %d\n", __func__, error);
+
+	dput(dir);
+	dput(file);
+}
+
+/*
+ * Called once the last data block of @df has been filled: truncate the
+ * backing file to drop preallocated space, then (best effort) fsync and
+ * remove the file's hardlink from the .incomplete directory.  Runs with
+ * the mount owner's credentials.
+ *
+ * Fixes vs. original: the inner backing file variable no longer shadows
+ * parameter @f, and the fsync path re-checks the backing-file context
+ * instead of dereferencing df->df_backing_file_context unconditionally
+ * (the truncate path above already treats it as possibly NULL).
+ */
+static void handle_file_completed(struct file *f, struct data_file *df)
+{
+	struct backing_file_context *bfc;
+	struct mount_info *mi = df->df_mount_info;
+	char *file_id_str = NULL;
+	struct dentry *incomplete_file_dentry = NULL;
+	const struct cred *old_cred = override_creds(mi->mi_owner);
+	int error;
+
+	/* Truncate file to remove any preallocated space */
+	bfc = df->df_backing_file_context;
+	if (bfc) {
+		struct file *backing_file = bfc->bc_file;
+
+		if (backing_file) {
+			loff_t size = i_size_read(file_inode(backing_file));
+
+			error = vfs_truncate(&backing_file->f_path, size);
+			if (error)
+				/* No useful action on failure */
+				pr_warn("incfs: Failed to truncate complete file: %d\n",
+					error);
+		}
+	}
+
+	/* This is best effort - there is no useful action to take on failure */
+	file_id_str = file_id_to_str(df->df_id);
+	if (!file_id_str)
+		goto out;
+
+	incomplete_file_dentry = incfs_lookup_dentry(
+					df->df_mount_info->mi_incomplete_dir,
+					file_id_str);
+	if (!incomplete_file_dentry || IS_ERR(incomplete_file_dentry)) {
+		incomplete_file_dentry = NULL;
+		goto out;
+	}
+
+	if (!d_really_is_positive(incomplete_file_dentry))
+		goto out;
+
+	/* Persist the filled data before dropping the .incomplete link */
+	if (bfc && bfc->bc_file)
+		vfs_fsync(bfc->bc_file, 0);
+	error = incfs_unlink(incomplete_file_dentry);
+	if (error) {
+		pr_warn("incfs: Deleting incomplete file failed: %d\n", error);
+		goto out;
+	}
+
+	notify_unlink(f->f_path.dentry, file_id_str, INCFS_INCOMPLETE_NAME);
+
+out:
+	dput(incomplete_file_dentry);
+	kfree(file_id_str);
+	revert_creds(old_cred);
+}
+
+/*
+ * INCFS_IOC_FILL_BLOCKS: copy an array of incfs_fill_block descriptors
+ * from userspace and feed each block (data or hash, per
+ * INCFS_BLOCK_FLAGS_HASH) into the data file through a bounce buffer.
+ * Requires the fd to have fill permission.  Returns the number of
+ * blocks processed, or the error if none were.  If the last data block
+ * was filled, finish the file via handle_file_completed().
+ *
+ * (Removed the dead "if (data_buf)" guard before free_pages(): the
+ * function returns -ENOMEM earlier when the allocation fails.)
+ */
+static long ioctl_fill_blocks(struct file *f, void __user *arg)
+{
+	struct incfs_fill_blocks __user *usr_fill_blocks = arg;
+	struct incfs_fill_blocks fill_blocks;
+	struct incfs_fill_block __user *usr_fill_block_array;
+	struct data_file *df = get_incfs_data_file(f);
+	struct incfs_file_data *fd = f->private_data;
+	const ssize_t data_buf_size = 2 * INCFS_DATA_FILE_BLOCK_SIZE;
+	u8 *data_buf = NULL;
+	ssize_t error = 0;
+	int i = 0;
+	bool complete = false;
+
+	if (!df)
+		return -EBADF;
+
+	if (!fd || fd->fd_fill_permission != CAN_FILL)
+		return -EPERM;
+
+	if (copy_from_user(&fill_blocks, usr_fill_blocks, sizeof(fill_blocks)))
+		return -EFAULT;
+
+	usr_fill_block_array = u64_to_user_ptr(fill_blocks.fill_blocks);
+	data_buf = (u8 *)__get_free_pages(GFP_NOFS | __GFP_COMP,
+					  get_order(data_buf_size));
+	if (!data_buf)
+		return -ENOMEM;
+
+	for (i = 0; i < fill_blocks.count; i++) {
+		struct incfs_fill_block fill_block = {};
+
+		if (copy_from_user(&fill_block, &usr_fill_block_array[i],
+				   sizeof(fill_block)) > 0) {
+			error = -EFAULT;
+			break;
+		}
+
+		if (fill_block.data_len > data_buf_size) {
+			error = -E2BIG;
+			break;
+		}
+
+		if (copy_from_user(data_buf, u64_to_user_ptr(fill_block.data),
+				   fill_block.data_len) > 0) {
+			error = -EFAULT;
+			break;
+		}
+		fill_block.data = 0; /* To make sure nobody uses it. */
+		if (fill_block.flags & INCFS_BLOCK_FLAGS_HASH) {
+			error = incfs_process_new_hash_block(df, &fill_block,
+							     data_buf);
+		} else {
+			error = incfs_process_new_data_block(df, &fill_block,
+							data_buf, &complete);
+		}
+		if (error)
+			break;
+	}
+
+	free_pages((unsigned long)data_buf, get_order(data_buf_size));
+
+	if (complete)
+		handle_file_completed(f, df);
+
+	/*
+	 * Only report the error if no records were processed, otherwise
+	 * just return how many were processed successfully.
+	 */
+	if (i == 0)
+		return error;
+
+	return i;
+}
+
+/*
+ * INCFS_IOC_READ_FILE_SIGNATURE: read the file's signature blob into a
+ * kernel buffer sized by the caller (capped at INCFS_MAX_SIGNATURE_SIZE)
+ * and copy it plus the actual length back to userspace.
+ */
+static long ioctl_read_file_signature(struct file *f, void __user *arg)
+{
+	struct incfs_get_file_sig_args __user *args_usr_ptr = arg;
+	struct incfs_get_file_sig_args args = {};
+	u8 *sig_buffer = NULL;
+	size_t sig_buf_size = 0;
+	int error = 0;
+	int read_result = 0;
+	struct data_file *df = get_incfs_data_file(f);
+
+	if (!df)
+		return -EINVAL;
+
+	if (copy_from_user(&args, args_usr_ptr, sizeof(args)) > 0)
+		return -EINVAL;
+
+	sig_buf_size = args.file_signature_buf_size;
+	if (sig_buf_size > INCFS_MAX_SIGNATURE_SIZE)
+		return -E2BIG;
+
+	sig_buffer = kzalloc(sig_buf_size, GFP_NOFS | __GFP_COMP);
+	if (!sig_buffer)
+		return -ENOMEM;
+
+	read_result = incfs_read_file_signature(df,
+			range(sig_buffer, sig_buf_size));
+
+	if (read_result < 0) {
+		error = read_result;
+		goto out;
+	}
+
+	if (copy_to_user(u64_to_user_ptr(args.file_signature), sig_buffer,
+			read_result)) {
+		error = -EFAULT;
+		goto out;
+	}
+
+	/* Report how many signature bytes were actually produced */
+	args.file_signature_len_out = read_result;
+	if (copy_to_user(args_usr_ptr, &args, sizeof(args)))
+		error = -EFAULT;
+
+out:
+	kfree(sig_buffer);
+
+	return error;
+}
+
+/*
+ * INCFS_IOC_GET_FILLED_BLOCKS: report which blocks of the file have
+ * been filled.  Only permitted on an fd opened with fill permission.
+ */
+static long ioctl_get_filled_blocks(struct file *f, void __user *arg)
+{
+	struct incfs_get_filled_blocks_args __user *args_usr_ptr = arg;
+	struct incfs_get_filled_blocks_args args = {};
+	struct data_file *df = get_incfs_data_file(f);
+	struct incfs_file_data *fd = f->private_data;
+	int err;
+
+	if (!df || !fd)
+		return -EINVAL;
+	if (fd->fd_fill_permission != CAN_FILL)
+		return -EPERM;
+
+	if (copy_from_user(&args, args_usr_ptr, sizeof(args)) > 0)
+		return -EINVAL;
+
+	err = incfs_get_filled_blocks(df, fd, &args);
+
+	/* Always write results back, even when the query itself failed */
+	if (copy_to_user(args_usr_ptr, &args, sizeof(args)))
+		return -EFAULT;
+
+	return err;
+}
+
+/*
+ * INCFS_IOC_GET_BLOCK_COUNT: return total/filled counts for data and
+ * hash blocks of the file (hash total = total blocks - data blocks).
+ */
+static long ioctl_get_block_count(struct file *f, void __user *arg)
+{
+	struct incfs_get_block_count_args __user *args_usr_ptr = arg;
+	struct incfs_get_block_count_args args = {};
+	struct data_file *df = get_incfs_data_file(f);
+
+	if (!df)
+		return -EINVAL;
+
+	args.total_data_blocks_out = df->df_data_block_count;
+	args.filled_data_blocks_out = atomic_read(&df->df_data_blocks_written);
+	args.total_hash_blocks_out =
+		df->df_total_block_count - df->df_data_block_count;
+	args.filled_hash_blocks_out = atomic_read(&df->df_hash_blocks_written);
+
+	return copy_to_user(args_usr_ptr, &args, sizeof(args)) ? -EFAULT : 0;
+}
+
+/* FS_IOC_GETFLAGS: only FS_VERITY_FL is ever reported by incfs. */
+static int incfs_ioctl_get_flags(struct file *f, void __user *arg)
+{
+	u32 flags = 0;
+
+	if (IS_VERITY(file_inode(f)))
+		flags = FS_VERITY_FL;
+
+	return put_user(flags, (int __user *) arg);
+}
+
+/*
+ * ->unlocked_ioctl entry point: route each supported request (incfs
+ * fill/query ioctls plus the fs-verity and flag ioctls) to its handler;
+ * anything else is -EINVAL.
+ */
+static long dispatch_ioctl(struct file *f, unsigned int req, unsigned long arg)
+{
+	switch (req) {
+	case INCFS_IOC_FILL_BLOCKS:
+		return ioctl_fill_blocks(f, (void __user *)arg);
+	case INCFS_IOC_READ_FILE_SIGNATURE:
+		return ioctl_read_file_signature(f, (void __user *)arg);
+	case INCFS_IOC_GET_FILLED_BLOCKS:
+		return ioctl_get_filled_blocks(f, (void __user *)arg);
+	case INCFS_IOC_GET_BLOCK_COUNT:
+		return ioctl_get_block_count(f, (void __user *)arg);
+	case FS_IOC_ENABLE_VERITY:
+		return incfs_ioctl_enable_verity(f, (const void __user *)arg);
+	case FS_IOC_GETFLAGS:
+		return incfs_ioctl_get_flags(f, (void __user *) arg);
+	case FS_IOC_MEASURE_VERITY:
+		return incfs_ioctl_measure_verity(f, (void __user *)arg);
+	case FS_IOC_READ_VERITY_METADATA:
+		return incfs_ioctl_read_verity_metadata(f, (void __user *)arg);
+	default:
+		return -EINVAL;
+	}
+}
+
+#ifdef CONFIG_COMPAT
+/*
+ * ->compat_ioctl: translate FS_IOC32_GETFLAGS to its native number,
+ * pass the remaining known commands through unchanged, and convert the
+ * 32-bit user pointer with compat_ptr() before dispatching.
+ */
+static long incfs_compat_ioctl(struct file *file, unsigned int cmd,
+				unsigned long arg)
+{
+	switch (cmd) {
+	case FS_IOC32_GETFLAGS:
+		cmd = FS_IOC_GETFLAGS;
+		break;
+	case INCFS_IOC_FILL_BLOCKS:
+	case INCFS_IOC_READ_FILE_SIGNATURE:
+	case INCFS_IOC_GET_FILLED_BLOCKS:
+	case INCFS_IOC_GET_BLOCK_COUNT:
+	case FS_IOC_ENABLE_VERITY:
+	case FS_IOC_MEASURE_VERITY:
+	case FS_IOC_READ_VERITY_METADATA:
+		break;
+	default:
+		return -ENOIOCTLCMD;
+	}
+	return dispatch_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
+}
+#endif
+
+/*
+ * ->lookup for incfs directories.  In the mount root, pseudo files are
+ * tried first.  Otherwise the name is looked up in the backing
+ * directory; the incfs dentry is bound to the backing path, and a
+ * positive backing dentry produces (or finds) the matching incfs inode
+ * via fetch_regular_inode().  A missing backing entry becomes a
+ * negative dentry.  Crossing into a different backing superblock is
+ * rejected with -EXDEV.
+ */
+static struct dentry *dir_lookup(struct inode *dir_inode, struct dentry *dentry,
+				 unsigned int flags)
+{
+	struct mount_info *mi = get_mount_info(dir_inode->i_sb);
+	struct dentry *dir_dentry = NULL;
+	struct dentry *backing_dentry = NULL;
+	struct path dir_backing_path = {};
+	struct inode_info *dir_info = get_incfs_node(dir_inode);
+	int err = 0;
+
+	if (!mi || !dir_info || !dir_info->n_backing_inode)
+		return ERR_PTR(-EBADF);
+
+	if (d_inode(mi->mi_backing_dir_path.dentry) ==
+		dir_info->n_backing_inode) {
+		/* We do lookup in the FS root. Show pseudo files. */
+		err = dir_lookup_pseudo_files(dir_inode->i_sb, dentry);
+		if (err != -ENOENT)
+			goto out;
+		err = 0;
+	}
+
+	dir_dentry = dget_parent(dentry);
+	get_incfs_backing_path(dir_dentry, &dir_backing_path);
+	backing_dentry = incfs_lookup_dentry(dir_backing_path.dentry,
+						dentry->d_name.name);
+
+	if (!backing_dentry || IS_ERR(backing_dentry)) {
+		err = IS_ERR(backing_dentry)
+			? PTR_ERR(backing_dentry)
+			: -EFAULT;
+		backing_dentry = NULL;
+		goto out;
+	} else {
+		struct inode *inode = NULL;
+		struct path backing_path = {
+			.mnt = dir_backing_path.mnt,
+			.dentry = backing_dentry
+		};
+
+		/* Bind this dentry to its backing path (refcounted) */
+		err = incfs_init_dentry(dentry, &backing_path);
+		if (err)
+			goto out;
+
+		if (!d_really_is_positive(backing_dentry)) {
+			/*
+			 * No such entry found in the backing dir.
+			 * Create a negative entry.
+			 */
+			d_add(dentry, NULL);
+			err = 0;
+			goto out;
+		}
+
+		if (d_inode(backing_dentry)->i_sb !=
+				dir_info->n_backing_inode->i_sb) {
+			/*
+			 * Somehow after the path lookup we ended up in a
+			 * different fs mount. If we keep going it's going
+			 * to end badly.
+			 */
+			err = -EXDEV;
+			goto out;
+		}
+
+		inode = fetch_regular_inode(dir_inode->i_sb, backing_dentry);
+		if (IS_ERR(inode)) {
+			err = PTR_ERR(inode);
+			goto out;
+		}
+
+		d_add(dentry, inode);
+	}
+
+out:
+	dput(dir_dentry);
+	dput(backing_dentry);
+	path_put(&dir_backing_path);
+	if (err)
+		pr_debug("incfs: %s %s %d\n", __func__,
+			dentry->d_name.name, err);
+	return ERR_PTR(err);
+}
+
+/*
+ * ->mkdir: create the directory in the backing fs (with write bits
+ * forced on, since backing objects must stay writable for incfs), then
+ * instantiate the matching incfs inode.  Creating inside the special
+ * .index/.incomplete directories is refused.  Serialized by
+ * mi_dir_struct_mutex.
+ */
+static int dir_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	struct mount_info *mi = get_mount_info(dir->i_sb);
+	struct inode_info *dir_node = get_incfs_node(dir);
+	struct dentry *backing_dentry = NULL;
+	struct path backing_path = {};
+	int err = 0;
+
+
+	if (!mi || !dir_node || !dir_node->n_backing_inode)
+		return -EBADF;
+
+	err = mutex_lock_interruptible(&mi->mi_dir_struct_mutex);
+	if (err)
+		return err;
+
+	get_incfs_backing_path(dentry, &backing_path);
+	backing_dentry = backing_path.dentry;
+
+	if (!backing_dentry) {
+		err = -EBADF;
+		goto path_err;
+	}
+
+	if (backing_dentry->d_parent == mi->mi_index_dir) {
+		/* Can't create a subdir inside .index */
+		err = -EBUSY;
+		goto out;
+	}
+
+	if (backing_dentry->d_parent == mi->mi_incomplete_dir) {
+		/* Can't create a subdir inside .incomplete */
+		err = -EBUSY;
+		goto out;
+	}
+	inode_lock_nested(dir_node->n_backing_inode, I_MUTEX_PARENT);
+	/* Backing dirs keep write permission even if the incfs view is RO */
+	err = vfs_mkdir(idmap, dir_node->n_backing_inode, backing_dentry, mode | 0222);
+	inode_unlock(dir_node->n_backing_inode);
+	if (!err) {
+		struct inode *inode = NULL;
+
+		if (d_really_is_negative(backing_dentry) ||
+			unlikely(d_unhashed(backing_dentry))) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		inode = fetch_regular_inode(dir->i_sb, backing_dentry);
+		if (IS_ERR(inode)) {
+			err = PTR_ERR(inode);
+			goto out;
+		}
+		d_instantiate(dentry, inode);
+	}
+
+out:
+	if (d_really_is_negative(dentry))
+		d_drop(dentry);
+	path_put(&backing_path);
+
+path_err:
+	mutex_unlock(&mi->mi_dir_struct_mutex);
+	if (err)
+		pr_debug("incfs: %s err:%d\n", __func__, err);
+	return err;
+}
+
+/*
+ * Delete file referenced by backing_dentry and if appropriate its hardlink
+ * from .index and .incomplete
+ *
+ * The hardlink accounting works down from @nlink: links held by .index
+ * and .incomplete are subtracted when present; the special copies are
+ * only removed when this unlink would leave no other user-visible link.
+ * Caller must hold mi_dir_struct_mutex.
+ */
+static int file_delete(struct mount_info *mi, struct dentry *dentry,
+			struct dentry *backing_dentry, int nlink)
+{
+	struct dentry *index_file_dentry = NULL;
+	struct dentry *incomplete_file_dentry = NULL;
+	/* 2 chars per byte of file ID + 1 char for \0 */
+	char file_id_str[2 * sizeof(incfs_uuid_t) + 1] = {0};
+	ssize_t uuid_size = 0;
+	int error = 0;
+
+	WARN_ON(!mutex_is_locked(&mi->mi_dir_struct_mutex));
+
+	/* More than 3 links: other names remain either way, just unlink */
+	if (nlink > 3)
+		goto just_unlink;
+
+	uuid_size = vfs_getxattr(&nop_mnt_idmap, backing_dentry, INCFS_XATTR_ID_NAME,
+			file_id_str, 2 * sizeof(incfs_uuid_t));
+	if (uuid_size < 0) {
+		error = uuid_size;
+		goto out;
+	}
+
+	if (uuid_size != 2 * sizeof(incfs_uuid_t)) {
+		error = -EBADMSG;
+		goto out;
+	}
+
+	index_file_dentry = incfs_lookup_dentry(mi->mi_index_dir, file_id_str);
+	if (IS_ERR(index_file_dentry)) {
+		error = PTR_ERR(index_file_dentry);
+		index_file_dentry = NULL;
+		goto out;
+	}
+
+	/* Discount the link held by .index, if any */
+	if (d_really_is_positive(index_file_dentry) && nlink > 0)
+		nlink--;
+
+	if (nlink > 2)
+		goto just_unlink;
+
+	incomplete_file_dentry = incfs_lookup_dentry(mi->mi_incomplete_dir,
+						     file_id_str);
+	if (IS_ERR(incomplete_file_dentry)) {
+		error = PTR_ERR(incomplete_file_dentry);
+		incomplete_file_dentry = NULL;
+		goto out;
+	}
+
+	/* Discount the link held by .incomplete, if any */
+	if (d_really_is_positive(incomplete_file_dentry) && nlink > 0)
+		nlink--;
+
+	if (nlink > 1)
+		goto just_unlink;
+
+	/* Last user-visible link: also remove the special-dir copies */
+	if (d_really_is_positive(index_file_dentry)) {
+		error = incfs_unlink(index_file_dentry);
+		if (error)
+			goto out;
+		notify_unlink(dentry, file_id_str, INCFS_INDEX_NAME);
+	}
+
+	if (d_really_is_positive(incomplete_file_dentry)) {
+		error = incfs_unlink(incomplete_file_dentry);
+		if (error)
+			goto out;
+		notify_unlink(dentry, file_id_str, INCFS_INCOMPLETE_NAME);
+	}
+
+just_unlink:
+	error = incfs_unlink(backing_dentry);
+
+out:
+	dput(index_file_dentry);
+	dput(incomplete_file_dentry);
+	if (error)
+		pr_debug("incfs: delete_file_from_index err:%d\n", error);
+	return error;
+}
+
+/*
+ * ->unlink: refuse direct unlinks inside .index/.incomplete, read the
+ * backing link count, and delegate to file_delete() which also cleans
+ * up the special-dir hardlinks when appropriate.  Serialized by
+ * mi_dir_struct_mutex.
+ */
+static int dir_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct mount_info *mi = get_mount_info(dir->i_sb);
+	struct path backing_path = {};
+	struct kstat stat;
+	int err = 0;
+
+	if (!mi)
+		return -EBADF;
+
+	err = mutex_lock_interruptible(&mi->mi_dir_struct_mutex);
+	if (err)
+		return err;
+
+	get_incfs_backing_path(dentry, &backing_path);
+	if (!backing_path.dentry) {
+		err = -EBADF;
+		goto path_err;
+	}
+
+	if (backing_path.dentry->d_parent == mi->mi_index_dir) {
+		/* Direct unlink from .index are not allowed. */
+		err = -EBUSY;
+		goto out;
+	}
+
+	if (backing_path.dentry->d_parent == mi->mi_incomplete_dir) {
+		/* Direct unlink from .incomplete are not allowed. */
+		err = -EBUSY;
+		goto out;
+	}
+
+	/* file_delete() needs the current backing hardlink count */
+	err = vfs_getattr(&backing_path, &stat, STATX_NLINK,
+			  AT_STATX_SYNC_AS_STAT);
+	if (err)
+		goto out;
+
+	err = file_delete(mi, dentry, backing_path.dentry, stat.nlink);
+
+	d_drop(dentry);
+out:
+	path_put(&backing_path);
+path_err:
+	if (err)
+		pr_debug("incfs: %s err:%d\n", __func__, err);
+	mutex_unlock(&mi->mi_dir_struct_mutex);
+	return err;
+}
+
+/*
+ * ->link: create the hardlink in the backing fs and instantiate the
+ * incfs inode for the new name.  Linking into .index/.incomplete is
+ * refused.  Serialized by mi_dir_struct_mutex.
+ *
+ * Fix: check both backing paths for NULL after get_incfs_backing_path()
+ * (as dir_unlink()/dir_rmdir() do) before dereferencing them.
+ */
+static int dir_link(struct dentry *old_dentry, struct inode *dir,
+			 struct dentry *new_dentry)
+{
+	struct mount_info *mi = get_mount_info(dir->i_sb);
+	struct path backing_old_path = {};
+	struct path backing_new_path = {};
+	int error = 0;
+
+	if (!mi)
+		return -EBADF;
+
+	error = mutex_lock_interruptible(&mi->mi_dir_struct_mutex);
+	if (error)
+		return error;
+
+	get_incfs_backing_path(old_dentry, &backing_old_path);
+	get_incfs_backing_path(new_dentry, &backing_new_path);
+
+	if (!backing_old_path.dentry || !backing_new_path.dentry) {
+		error = -EBADF;
+		goto out;
+	}
+
+	if (backing_new_path.dentry->d_parent == mi->mi_index_dir) {
+		/* Can't link to .index */
+		error = -EBUSY;
+		goto out;
+	}
+
+	if (backing_new_path.dentry->d_parent == mi->mi_incomplete_dir) {
+		/* Can't link to .incomplete */
+		error = -EBUSY;
+		goto out;
+	}
+
+	error = incfs_link(backing_old_path.dentry, backing_new_path.dentry);
+	if (!error) {
+		struct inode *inode = NULL;
+		struct dentry *bdentry = backing_new_path.dentry;
+
+		if (d_really_is_negative(bdentry)) {
+			error = -EINVAL;
+			goto out;
+		}
+
+		inode = fetch_regular_inode(dir->i_sb, bdentry);
+		if (IS_ERR(inode)) {
+			error = PTR_ERR(inode);
+			goto out;
+		}
+		d_instantiate(new_dentry, inode);
+	}
+
+out:
+	path_put(&backing_old_path);
+	path_put(&backing_new_path);
+	if (error)
+		pr_debug("incfs: %s err:%d\n", __func__, error);
+	mutex_unlock(&mi->mi_dir_struct_mutex);
+	return error;
+}
+
+/*
+ * ->rmdir: remove the backing directory unless it is one of the special
+ * .index/.incomplete directories.  Serialized by mi_dir_struct_mutex.
+ */
+static int dir_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct mount_info *mi = get_mount_info(dir->i_sb);
+	struct path backing_path = {};
+	int err = 0;
+
+	if (!mi)
+		return -EBADF;
+
+	err = mutex_lock_interruptible(&mi->mi_dir_struct_mutex);
+	if (err)
+		return err;
+
+	get_incfs_backing_path(dentry, &backing_path);
+	if (!backing_path.dentry) {
+		err = -EBADF;
+		goto path_err;
+	}
+
+	if (backing_path.dentry == mi->mi_index_dir) {
+		/* Can't delete .index */
+		err = -EBUSY;
+		goto out;
+	}
+
+	if (backing_path.dentry == mi->mi_incomplete_dir) {
+		/* Can't delete .incomplete */
+		err = -EBUSY;
+		goto out;
+	}
+
+	err = incfs_rmdir(backing_path.dentry);
+	if (!err)
+		d_drop(dentry);
+out:
+	path_put(&backing_path);
+
+path_err:
+	if (err)
+		pr_debug("incfs: %s err:%d\n", __func__, err);
+	mutex_unlock(&mi->mi_dir_struct_mutex);
+	return err;
+}
+
+/*
+ * ->rename: perform the rename on the backing dentries under
+ * lock_rename(), refusing moves of or out of .index/.incomplete, then
+ * refresh the stacked attributes of the affected inodes.  Serialized by
+ * mi_dir_struct_mutex.
+ *
+ * Fix: guard against a NULL mount_info before dereferencing it, as all
+ * the other directory operations (dir_unlink, dir_link, dir_rmdir) do.
+ */
+static int dir_rename(struct inode *old_dir, struct dentry *old_dentry,
+		struct inode *new_dir, struct dentry *new_dentry,
+		unsigned int flags)
+{
+	struct mount_info *mi = get_mount_info(old_dir->i_sb);
+	struct dentry *backing_old_dentry;
+	struct dentry *backing_new_dentry;
+	struct dentry *backing_old_dir_dentry;
+	struct dentry *backing_new_dir_dentry;
+	struct inode *target_inode;
+	struct dentry *trap;
+	struct renamedata rd = {};
+	int error = 0;
+
+	if (!mi)
+		return -EBADF;
+
+	error = mutex_lock_interruptible(&mi->mi_dir_struct_mutex);
+	if (error)
+		return error;
+
+	backing_old_dentry = get_incfs_dentry(old_dentry)->backing_path.dentry;
+
+	if (!backing_old_dentry || backing_old_dentry == mi->mi_index_dir ||
+	    backing_old_dentry == mi->mi_incomplete_dir) {
+		/* Renaming .index or .incomplete not allowed */
+		error = -EBUSY;
+		goto exit;
+	}
+
+	backing_new_dentry = get_incfs_dentry(new_dentry)->backing_path.dentry;
+	dget(backing_old_dentry);
+	dget(backing_new_dentry);
+
+	backing_old_dir_dentry = dget_parent(backing_old_dentry);
+	backing_new_dir_dentry = dget_parent(backing_new_dentry);
+	target_inode = d_inode(new_dentry);
+
+	if (backing_old_dir_dentry == mi->mi_index_dir ||
+	    backing_old_dir_dentry == mi->mi_incomplete_dir) {
+		/* Direct moves from .index or .incomplete are not allowed. */
+		error = -EBUSY;
+		goto out;
+	}
+
+	trap = lock_rename(backing_old_dir_dentry, backing_new_dir_dentry);
+
+	/* Moving a directory into its own descendant is not allowed */
+	if (trap == backing_old_dentry) {
+		error = -EINVAL;
+		goto unlock_out;
+	}
+	if (trap == backing_new_dentry) {
+		error = -ENOTEMPTY;
+		goto unlock_out;
+	}
+
+	rd.old_dir	= d_inode(backing_old_dir_dentry);
+	rd.old_dentry	= backing_old_dentry;
+	rd.new_dir	= d_inode(backing_new_dir_dentry);
+	rd.new_dentry	= backing_new_dentry;
+	rd.flags	= flags;
+	rd.old_mnt_idmap = &nop_mnt_idmap;
+	rd.new_mnt_idmap = &nop_mnt_idmap;
+	rd.delegated_inode = NULL;
+
+	error = vfs_rename(&rd);
+	if (error)
+		goto unlock_out;
+	/* Re-sync stacked attributes after the backing rename */
+	if (target_inode)
+		fsstack_copy_attr_all(target_inode,
+			get_incfs_node(target_inode)->n_backing_inode);
+	fsstack_copy_attr_all(new_dir, d_inode(backing_new_dir_dentry));
+	if (new_dir != old_dir)
+		fsstack_copy_attr_all(old_dir, d_inode(backing_old_dir_dentry));
+
+unlock_out:
+	unlock_rename(backing_old_dir_dentry, backing_new_dir_dentry);
+
+out:
+	dput(backing_new_dir_dentry);
+	dput(backing_old_dir_dentry);
+	dput(backing_new_dentry);
+	dput(backing_old_dentry);
+
+exit:
+	mutex_unlock(&mi->mi_dir_struct_mutex);
+	if (error)
+		pr_debug("incfs: %s err:%d\n", __func__, error);
+	return error;
+}
+
+
+/*
+ * ->open: open the backing file with the mount owner's credentials
+ * (read-write for regular files — fills are written through the
+ * backing file — read-only for directories), then set up per-open
+ * state: incfs_file_data for regular files (fill permission starts as
+ * CANT_FILL) or a dir_file wrapper for directories.  On error all
+ * partially created state is released.
+ */
+static int file_open(struct inode *inode, struct file *file)
+{
+	struct mount_info *mi = get_mount_info(inode->i_sb);
+	struct file *backing_file = NULL;
+	struct path backing_path = {};
+	int err = 0;
+	int flags = O_NOATIME | O_LARGEFILE |
+		(S_ISDIR(inode->i_mode) ? O_RDONLY : O_RDWR);
+	const struct cred *old_cred;
+
+	WARN_ON(file->private_data);
+
+	if (!mi)
+		return -EBADF;
+
+	get_incfs_backing_path(file->f_path.dentry, &backing_path);
+	if (!backing_path.dentry)
+		return -EBADF;
+
+	old_cred = override_creds(mi->mi_owner);
+	backing_file = dentry_open(&backing_path, flags, current_cred());
+	revert_creds(old_cred);
+	path_put(&backing_path);
+
+	if (IS_ERR(backing_file)) {
+		err = PTR_ERR(backing_file);
+		backing_file = NULL;
+		goto out;
+	}
+
+	if (S_ISREG(inode->i_mode)) {
+		struct incfs_file_data *fd = kzalloc(sizeof(*fd), GFP_NOFS);
+
+		if (!fd) {
+			err = -ENOMEM;
+			goto out;
+		}
+
+		*fd = (struct incfs_file_data) {
+			.fd_fill_permission = CANT_FILL,
+		};
+		file->private_data = fd;
+
+		err = make_inode_ready_for_data_ops(mi, inode, backing_file);
+		if (err)
+			goto out;
+
+		err = incfs_fsverity_file_open(inode, file);
+		if (err)
+			goto out;
+	} else if (S_ISDIR(inode->i_mode)) {
+		struct dir_file *dir = NULL;
+
+		dir = incfs_open_dir_file(mi, backing_file);
+		if (IS_ERR(dir))
+			err = PTR_ERR(dir);
+		else
+			file->private_data = dir;
+	} else
+		err = -EBADF;
+
+out:
+	if (err) {
+		pr_debug("name:%s err: %d\n",
+			 file->f_path.dentry->d_name.name, err);
+		/* Undo whichever private_data type was set up above */
+		if (S_ISREG(inode->i_mode))
+			kfree(file->private_data);
+		else if (S_ISDIR(inode->i_mode))
+			incfs_free_dir_file(file->private_data);
+
+		file->private_data = NULL;
+	}
+
+	/* dir_file/backing context hold their own references if needed */
+	if (backing_file)
+		fput(backing_file);
+	return err;
+}
+
+/*
+ * ->release: free the per-open state created by file_open() — the
+ * incfs_file_data of a regular file or the dir_file of a directory.
+ */
+static int file_release(struct inode *inode, struct file *file)
+{
+	if (S_ISDIR(inode->i_mode)) {
+		incfs_free_dir_file(get_incfs_dir_file(file));
+	} else if (S_ISREG(inode->i_mode)) {
+		kfree(file->private_data);
+		file->private_data = NULL;
+	}
+
+	return 0;
+}
+
+/*
+ * ->d_revalidate: a dentry is valid only if its backing dentry still
+ * resolves to the same backing inode that this incfs inode was built
+ * from; otherwise the backing dir was changed behind incfs's back and
+ * the dentry must be invalidated.  Defers to the backing fs's own
+ * d_revalidate when it has one.  Cannot run in RCU-walk mode.
+ */
+static int dentry_revalidate(struct dentry *d, unsigned int flags)
+{
+	struct path backing_path = {};
+	struct inode_info *info = get_incfs_node(d_inode(d));
+	struct inode *binode = (info == NULL) ? NULL : info->n_backing_inode;
+	struct dentry *backing_dentry = NULL;
+	int result = 0;
+
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	get_incfs_backing_path(d, &backing_path);
+	backing_dentry = backing_path.dentry;
+	if (!backing_dentry)
+		goto out;
+
+	if (d_inode(backing_dentry) != binode) {
+		/*
+		 * Backing inodes obtained via dentry and inode don't match.
+		 * It indicates that most likely backing dir has changed
+		 * directly bypassing Incremental FS interface.
+		 */
+		goto out;
+	}
+
+	if (backing_dentry->d_flags & DCACHE_OP_REVALIDATE) {
+		result = backing_dentry->d_op->d_revalidate(backing_dentry,
+				flags);
+	} else
+		result = 1;
+
+out:
+	path_put(&backing_path);
+	return result;
+}
+
+/*
+ * ->d_release: drop the backing-path reference taken in
+ * incfs_init_dentry() and free the dentry_info.
+ */
+static void dentry_release(struct dentry *d)
+{
+	struct dentry_info *info = get_incfs_dentry(d);
+
+	if (info)
+		path_put(&info->backing_path);
+
+	kfree(d->d_fsdata);
+	d->d_fsdata = NULL;
+}
+
+/*
+ * ->alloc_inode: allocate an inode_info wrapper and hand back its
+ * embedded VFS inode.
+ */
+static struct inode *alloc_inode(struct super_block *sb)
+{
+	/* TODO: add a slab-based cache here. */
+	struct inode_info *node = kzalloc(sizeof(*node), GFP_NOFS);
+
+	if (node == NULL)
+		return NULL;
+
+	inode_init_once(&node->n_vfs_inode);
+	return &node->n_vfs_inode;
+}
+
+/* ->free_inode: release the inode_info wrapper around @inode. */
+static void free_inode(struct inode *inode)
+{
+	kfree(get_incfs_node(inode));
+}
+
+/*
+ * ->evict_inode: drop the reference on the backing inode taken in
+ * inode_set() and free the attached data_file, then tear down the page
+ * cache and clear the inode.
+ */
+static void evict_inode(struct inode *inode)
+{
+	struct inode_info *node = get_incfs_node(inode);
+
+	if (node) {
+		if (node->n_backing_inode) {
+			iput(node->n_backing_inode);
+			node->n_backing_inode = NULL;
+		}
+		if (node->n_file) {
+			incfs_free_data_file(node->n_file);
+			node->n_file = NULL;
+		}
+	}
+
+	truncate_inode_pages(&inode->i_data, 0);
+	clear_inode(inode);
+}
+
+/*
+ * ->setattr: apply the change to the backing dentry first, then to the
+ * incfs inode.  Size changes are rejected.  For regular files the
+ * user-visible mode may never gain write bits, while the backing file
+ * must keep them: 0222 is forced on for the backing notify_change()
+ * and stripped again before simple_setattr() on the incfs side.
+ */
+static int incfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+			 struct iattr *ia)
+{
+	struct dentry_info *di = get_incfs_dentry(dentry);
+	struct dentry *backing_dentry;
+	struct inode *backing_inode;
+	int error;
+
+	if (ia->ia_valid & ATTR_SIZE)
+		return -EINVAL;
+
+	if ((ia->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) &&
+	    (ia->ia_valid & ATTR_MODE))
+		return -EINVAL;
+
+	if (!di)
+		return -EINVAL;
+	backing_dentry = di->backing_path.dentry;
+	if (!backing_dentry)
+		return -EINVAL;
+
+	backing_inode = d_inode(backing_dentry);
+
+	/* incfs files are readonly, but the backing files must be writeable */
+	if (S_ISREG(backing_inode->i_mode)) {
+		if ((ia->ia_valid & ATTR_MODE) && (ia->ia_mode & 0222))
+			return -EINVAL;
+
+		ia->ia_mode |= 0222;
+	}
+
+	inode_lock(d_inode(backing_dentry));
+	error = notify_change(idmap, backing_dentry, ia, NULL);
+	inode_unlock(d_inode(backing_dentry));
+
+	if (error)
+		return error;
+
+	/* Restore the read-only view before updating the incfs inode */
+	if (S_ISREG(backing_inode->i_mode))
+		ia->ia_mode &= ~0222;
+
+	return simple_setattr(idmap, dentry, ia);
+}
+
+
+/*
+ * ->getattr handler.  Fills stats from the incfs inode, then overrides:
+ * the verity flag comes from the incfs inode, and the block count is taken
+ * from the backing file, where the data actually lives.
+ */
+static int incfs_getattr(struct mnt_idmap *idmap, const struct path *path,
+			 struct kstat *stat, u32 request_mask,
+			 unsigned int query_flags)
+{
+	struct inode *inode = d_inode(path->dentry);
+
+	generic_fillattr(idmap, request_mask, inode, stat);
+
+	/*
+	 * Inodes below INCFS_START_INO_RANGE get no verity/blocks handling;
+	 * presumably these are the pseudo files (.pending_reads etc.) —
+	 * TODO confirm against the inode-number allocation scheme.
+	 */
+	if (inode->i_ino < INCFS_START_INO_RANGE)
+		return 0;
+
+	stat->attributes &= ~STATX_ATTR_VERITY;
+	if (IS_VERITY(inode))
+		stat->attributes |= STATX_ATTR_VERITY;
+	stat->attributes_mask |= STATX_ATTR_VERITY;
+
+	if (request_mask & STATX_BLOCKS) {
+		struct kstat backing_kstat;
+		struct dentry_info *di = get_incfs_dentry(path->dentry);
+		int error = 0;
+		struct path *backing_path;
+
+		if (!di)
+			return -EFSCORRUPTED;
+		backing_path = &di->backing_path;
+		error = vfs_getattr(backing_path, &backing_kstat, STATX_BLOCKS,
+				    AT_STATX_SYNC_AS_STAT);
+		if (error)
+			return error;
+
+		/* Report the backing file's block usage as our own. */
+		stat->blocks = backing_kstat.blocks;
+	}
+
+	return 0;
+}
+
+/*
+ * ->getxattr.  For regular incfs files the request is forwarded to the
+ * backing file.  Pseudo files (no backing dentry) support exactly one
+ * attribute, "security.selinux", stored per mount in
+ * mount_info::pseudo_file_xattr.
+ *
+ * NOTE(review): a too-small buffer returns -E2BIG, whereas getxattr(2)
+ * convention is -ERANGE, and a size==0 "probe for length" call is not
+ * supported either.  Userspace-visible behavior — do not change silently.
+ */
+static ssize_t incfs_getxattr(struct dentry *d, const char *name,
+			void *value, size_t size)
+{
+	struct dentry_info *di = get_incfs_dentry(d);
+	struct mount_info *mi = get_mount_info(d->d_sb);
+	char *stored_value;
+	size_t stored_size;
+	int i;
+
+	if (di && di->backing_path.dentry)
+		return vfs_getxattr(&nop_mnt_idmap, di->backing_path.dentry, name, value, size);
+
+	if (strcmp(name, "security.selinux"))
+		return -ENODATA;
+
+	/* Identify which pseudo file this dentry refers to. */
+	for (i = 0; i < PSEUDO_FILE_COUNT; ++i)
+		if (!strcmp(d->d_iname, incfs_pseudo_file_names[i].data))
+			break;
+	if (i == PSEUDO_FILE_COUNT)
+		return -ENODATA;
+
+	stored_value = mi->pseudo_file_xattr[i].data;
+	stored_size = mi->pseudo_file_xattr[i].len;
+	if (!stored_value)
+		return -ENODATA;
+
+	if (stored_size > size)
+		return -E2BIG;
+
+	memcpy(value, stored_value, stored_size);
+	return stored_size;
+}
+
+
+/*
+ * ->setxattr.  Requests on regular incfs files are forwarded to the
+ * backing file.  Pseudo files accept only "security.selinux"; the value is
+ * kept per mount in mount_info::pseudo_file_xattr, one slot per pseudo file.
+ *
+ * The replacement buffer is allocated *before* the old one is released so
+ * an allocation failure leaves the previously stored value intact (the old
+ * code freed first and could lose the stored xattr on -ENOMEM).
+ */
+static ssize_t incfs_setxattr(struct mnt_idmap *idmap, struct dentry *d,
+			      const char *name, void *value, size_t size,
+			      int flags)
+{
+	struct dentry_info *di = get_incfs_dentry(d);
+	struct mount_info *mi = get_mount_info(d->d_sb);
+	u8 *new_value;
+	int i;
+
+	if (di && di->backing_path.dentry)
+		return vfs_setxattr(idmap, di->backing_path.dentry, name, value,
+				    size, flags);
+
+	if (strcmp(name, "security.selinux"))
+		return -ENODATA;
+
+	if (size > INCFS_MAX_FILE_ATTR_SIZE)
+		return -E2BIG;
+
+	/* Identify which pseudo file this dentry refers to. */
+	for (i = 0; i < PSEUDO_FILE_COUNT; ++i)
+		if (!strcmp(d->d_iname, incfs_pseudo_file_names[i].data))
+			break;
+	if (i == PSEUDO_FILE_COUNT)
+		return -ENODATA;
+
+	new_value = kzalloc(size, GFP_NOFS);
+	if (!new_value)
+		return -ENOMEM;
+	memcpy(new_value, value, size);
+
+	/* Swap in the new value only once it is fully built. */
+	kfree(mi->pseudo_file_xattr[i].data);
+	mi->pseudo_file_xattr[i].data = new_value;
+	mi->pseudo_file_xattr[i].len = size;
+	return 0;
+}
+
+/*
+ * ->listxattr: forwarded to the backing file.  Pseudo files (which have no
+ * backing dentry) report no attributes.
+ */
+static ssize_t incfs_listxattr(struct dentry *d, char *list, size_t size)
+{
+	struct dentry_info *di = get_incfs_dentry(d);
+
+	if (di && di->backing_path.dentry)
+		return vfs_listxattr(di->backing_path.dentry, list, size);
+
+	return -ENODATA;
+}
+
+/*
+ * Mount entry point.  Creates an anonymous superblock, resolves the backing
+ * directory named by dev_name, allocates the mount_info, sets up the .index
+ * and .incomplete helper directories, and installs the root dentry.
+ *
+ * Fixes over the previous version: the duplicated !dev_name check is gone,
+ * and a kern_path() success that yields a NULL/negative dentry now sets a
+ * real error code (-ENOENT) and drops the path reference instead of leaking
+ * it and potentially returning ERR_PTR(0).
+ */
+struct dentry *incfs_mount_fs(struct file_system_type *type, int flags,
+			      const char *dev_name, void *data)
+{
+	struct mount_options options = {};
+	struct mount_info *mi = NULL;
+	struct path backing_dir_path = {};
+	struct dentry *index_dir = NULL;
+	struct dentry *incomplete_dir = NULL;
+	struct super_block *src_fs_sb = NULL;
+	struct inode *root_inode = NULL;
+	struct super_block *sb = sget(type, NULL, set_anon_super, flags, NULL);
+	bool dir_created = false;
+	int error = 0;
+
+	if (IS_ERR(sb))
+		return ERR_CAST(sb);
+
+	sb->s_op = &incfs_super_ops;
+	sb->s_d_op = &incfs_dentry_ops;
+	sb->s_flags |= S_NOATIME;
+	sb->s_magic = INCFS_MAGIC_NUMBER;
+	sb->s_time_gran = 1;
+	sb->s_blocksize = INCFS_DATA_FILE_BLOCK_SIZE;
+	sb->s_blocksize_bits = blksize_bits(sb->s_blocksize);
+	sb->s_xattr = incfs_xattr_ops;
+
+	BUILD_BUG_ON(PAGE_SIZE != INCFS_DATA_FILE_BLOCK_SIZE);
+
+	if (!dev_name) {
+		pr_err("incfs: Backing dir is not set, filesystem can't be mounted.\n");
+		error = -ENOENT;
+		goto err_deactivate;
+	}
+
+	error = parse_options(&options, (char *)data);
+	if (error != 0) {
+		pr_err("incfs: Options parsing error. %d\n", error);
+		goto err_deactivate;
+	}
+
+	sb->s_bdi->ra_pages = options.readahead_pages;
+
+	error = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY,
+			&backing_dir_path);
+	if (error) {
+		pr_err("incfs: Error accessing: %s.\n", dev_name);
+		goto err_free_opts;
+	}
+
+	/*
+	 * kern_path() succeeded, so from here on the path reference must be
+	 * dropped on every error path.  A NULL/negative dentry means the
+	 * backing dir disappeared underneath us.
+	 */
+	if (!backing_dir_path.dentry ||
+	    !d_really_is_positive(backing_dir_path.dentry)) {
+		pr_err("incfs: Error accessing: %s.\n", dev_name);
+		error = -ENOENT;
+		goto err_put_path;
+	}
+
+	src_fs_sb = backing_dir_path.dentry->d_sb;
+	sb->s_maxbytes = src_fs_sb->s_maxbytes;
+	sb->s_stack_depth = src_fs_sb->s_stack_depth + 1;
+
+	if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) {
+		error = -EINVAL;
+		goto err_put_path;
+	}
+
+	mi = incfs_alloc_mount_info(sb, &options, &backing_dir_path);
+	if (IS_ERR_OR_NULL(mi)) {
+		error = PTR_ERR(mi);
+		pr_err("incfs: Error allocating mount info. %d\n", error);
+		goto err_put_path;
+	}
+
+	/*
+	 * NOTE(review): once mi_backing_dir_path is set below, the
+	 * err_put_path label and incfs_free_mount_info() (reached via
+	 * deactivate_locked_super -> incfs_kill_sb) could both drop the same
+	 * path reference — confirm incfs_free_mount_info() does not
+	 * path_put() mi_backing_dir_path.
+	 */
+	sb->s_fs_info = mi;
+	mi->mi_backing_dir_path = backing_dir_path;
+	index_dir = open_or_create_special_dir(backing_dir_path.dentry,
+					       INCFS_INDEX_NAME, &dir_created);
+	if (IS_ERR_OR_NULL(index_dir)) {
+		error = PTR_ERR(index_dir);
+		pr_err("incfs: Can't find or create .index dir in %s\n",
+			dev_name);
+		/* No need to null index_dir since we don't put it */
+		goto err_put_path;
+	}
+
+	mi->mi_index_dir = index_dir;
+	mi->mi_index_free = dir_created;
+
+	incomplete_dir = open_or_create_special_dir(backing_dir_path.dentry,
+						    INCFS_INCOMPLETE_NAME,
+						    &dir_created);
+	if (IS_ERR_OR_NULL(incomplete_dir)) {
+		error = PTR_ERR(incomplete_dir);
+		pr_err("incfs: Can't find or create .incomplete dir in %s\n",
+			dev_name);
+		/* No need to null incomplete_dir since we don't put it */
+		goto err_put_path;
+	}
+	mi->mi_incomplete_dir = incomplete_dir;
+	mi->mi_incomplete_free = dir_created;
+
+	root_inode = fetch_regular_inode(sb, backing_dir_path.dentry);
+	if (IS_ERR(root_inode)) {
+		error = PTR_ERR(root_inode);
+		goto err_put_path;
+	}
+
+	sb->s_root = d_make_root(root_inode);
+	if (!sb->s_root) {
+		error = -ENOMEM;
+		goto err_put_path;
+	}
+	error = incfs_init_dentry(sb->s_root, &backing_dir_path);
+	if (error)
+		goto err_put_path;
+
+	path_put(&backing_dir_path);
+	sb->s_flags |= SB_ACTIVE;
+
+	pr_debug("incfs: mount\n");
+	return dget(sb->s_root);
+
+err_put_path:
+	path_put(&backing_dir_path);
+err_free_opts:
+	free_options(&options);
+err_deactivate:
+	deactivate_locked_super(sb);
+	pr_err("incfs: mount failed %d\n", error);
+	return ERR_PTR(error);
+}
+
+/*
+ * ->remount_fs: re-parse the option string and apply the new values via
+ * incfs_realloc_mount_info().  report_uid changes the layout of the
+ * pending-read records, so flipping it on a live mount is rejected.
+ */
+static int incfs_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+	struct mount_options options;
+	struct mount_info *mi = get_mount_info(sb);
+	int err = 0;
+
+	sync_filesystem(sb);
+	err = parse_options(&options, (char *)data);
+	if (err)
+		return err;
+
+	if (options.report_uid != mi->mi_options.report_uid) {
+		pr_err("incfs: Can't change report_uid mount option on remount\n");
+		err = -EOPNOTSUPP;
+		goto out;
+	}
+
+	err = incfs_realloc_mount_info(mi, &options);
+	if (err)
+		goto out;
+
+	pr_debug("incfs: remount\n");
+
+out:
+	free_options(&options);
+	return err;
+}
+
+/*
+ * ->kill_sb: unmount teardown.  After the generic teardown, the .index and
+ * .incomplete helper directories are removed — but only if this mount
+ * created them (mi_index_free / mi_incomplete_free).  The vfs_rmdir()
+ * results are deliberately ignored: removal is best effort.
+ */
+void incfs_kill_sb(struct super_block *sb)
+{
+	struct mount_info *mi = sb->s_fs_info;
+	struct inode *dinode = NULL;
+
+	pr_debug("incfs: unmount\n");
+
+	/*
+	 * We must kill the super before freeing mi, since killing the super
+	 * triggers inode eviction, which triggers the final update of the
+	 * backing file, which uses certain information for mi
+	 */
+	kill_anon_super(sb);
+
+	if (mi) {
+		if (mi->mi_backing_dir_path.dentry)
+			dinode = d_inode(mi->mi_backing_dir_path.dentry);
+
+		if (dinode) {
+			if (mi->mi_index_dir && mi->mi_index_free)
+				vfs_rmdir(&nop_mnt_idmap, dinode,
+					  mi->mi_index_dir);
+
+			if (mi->mi_incomplete_dir && mi->mi_incomplete_free)
+				vfs_rmdir(&nop_mnt_idmap, dinode,
+					  mi->mi_incomplete_dir);
+		}
+
+		incfs_free_mount_info(mi);
+		sb->s_fs_info = NULL;
+	}
+}
+
+/*
+ * ->show_options: emit the active mount options for /proc/mounts.
+ * Keep the option names in sync with parse_options(); this output is
+ * parsed by userspace.
+ */
+static int show_options(struct seq_file *m, struct dentry *root)
+{
+	struct mount_info *mi = get_mount_info(root->d_sb);
+
+	seq_printf(m, ",read_timeout_ms=%u", mi->mi_options.read_timeout_ms);
+	seq_printf(m, ",readahead=%u", mi->mi_options.readahead_pages);
+	/* Read-log options are only meaningful when the log is enabled. */
+	if (mi->mi_options.read_log_pages != 0) {
+		seq_printf(m, ",rlog_pages=%u", mi->mi_options.read_log_pages);
+		seq_printf(m, ",rlog_wakeup_cnt=%u",
+			   mi->mi_options.read_log_wakeup_count);
+	}
+	if (mi->mi_options.report_uid)
+		seq_puts(m, ",report_uid");
+
+	if (mi->mi_sysfs_node)
+		seq_printf(m, ",sysfs_name=%s",
+			   kobject_name(&mi->mi_sysfs_node->isn_sysfs_node));
+	return 0;
+}
diff --git a/fs/incfs/vfs.h b/fs/incfs/vfs.h
new file mode 100644
--- /dev/null
+++ b/fs/incfs/vfs.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2018 Google LLC
+ */
+
+#ifndef _INCFS_VFS_H
+#define _INCFS_VFS_H
+
+/* Operations for regular incfs files (defined in vfs.c). */
+extern const struct file_operations incfs_file_ops;
+extern const struct inode_operations incfs_file_inode_ops;
+
+/* Superblock teardown; best-effort removal of helper dirs created at mount. */
+void incfs_kill_sb(struct super_block *sb);
+/* Mount entry point; dev_name names the backing directory. */
+struct dentry *incfs_mount_fs(struct file_system_type *type, int flags,
+			      const char *dev_name, void *data);
+int incfs_link(struct dentry *what, struct dentry *where);
+int incfs_unlink(struct dentry *dentry);
+
+/*
+ * Fetch the incfs mount_info stored in sb->s_fs_info.
+ * Warns — but still returns NULL — if it has not been set up yet.
+ */
+static inline struct mount_info *get_mount_info(struct super_block *sb)
+{
+	struct mount_info *result = sb->s_fs_info;
+
+	WARN_ON(!result);
+	return result;
+}
+
+/* Superblock owning the inode behind an open file. */
+static inline struct super_block *file_superblock(struct file *f)
+{
+	return file_inode(f)->i_sb;
+}
+
+#endif
diff --git a/include/uapi/linux/incrementalfs.h b/include/uapi/linux/incrementalfs.h
new file mode 100644
--- /dev/null
+++ b/include/uapi/linux/incrementalfs.h
@@ -0,0 +1,590 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Userspace interface for Incremental FS.
+ *
+ * Incremental FS is special-purpose Linux virtual file system that allows
+ * execution of a program while its binary and resource files are still being
+ * lazily downloaded over the network, USB etc.
+ *
+ * Copyright 2019 Google LLC
+ */
+#ifndef _UAPI_LINUX_INCREMENTALFS_H
+#define _UAPI_LINUX_INCREMENTALFS_H
+
+#include <linux/limits.h>
+#include <linux/ioctl.h>
+#include <linux/types.h>
+#include <linux/xattr.h>
+
+/* ===== constants ===== */
+#define INCFS_NAME "incremental-fs"
+
+/*
+ * Magic number used in file header and in memory superblock
+ * Note that it is a 5 byte unsigned long. Thus on 32 bit kernels, it is
+ * truncated to a 4 byte number
+ * (The five bytes 0x49 0x4e 0x43 0x46 0x53 spell "INCFS" when read
+ * least-significant byte first.)
+ */
+#define INCFS_MAGIC_NUMBER (0x5346434e49ul & ULONG_MAX)
+
+#define INCFS_DATA_FILE_BLOCK_SIZE 4096
+#define INCFS_HEADER_VER 1
+
+/* TODO: This value is assumed in incfs_copy_signature_info_from_user to be the
+ * actual signature length. Set back to 64 when fixed.
+ */
+#define INCFS_MAX_HASH_SIZE 32
+#define INCFS_MAX_FILE_ATTR_SIZE 512
+
+/* Names of the special files/dirs incfs maintains in the backing dir. */
+#define INCFS_INDEX_NAME ".index"
+#define INCFS_INCOMPLETE_NAME ".incomplete"
+#define INCFS_PENDING_READS_FILENAME ".pending_reads"
+#define INCFS_LOG_FILENAME ".log"
+#define INCFS_BLOCKS_WRITTEN_FILENAME ".blocks_written"
+#define INCFS_XATTR_ID_NAME (XATTR_USER_PREFIX "incfs.id")
+#define INCFS_XATTR_SIZE_NAME (XATTR_USER_PREFIX "incfs.size")
+#define INCFS_XATTR_METADATA_NAME (XATTR_USER_PREFIX "incfs.metadata")
+#define INCFS_XATTR_VERITY_NAME (XATTR_USER_PREFIX "incfs.verity")
+
+/*
+ * NOTE(review): 8096 looks like a typo for 8192, but this value is user
+ * ABI and must not be changed.
+ */
+#define INCFS_MAX_SIGNATURE_SIZE 8096
+#define INCFS_SIGNATURE_VERSION 2
+#define INCFS_SIGNATURE_SECTIONS 2
+
+#define INCFS_IOCTL_BASE_CODE 'g'
+
+/* ===== ioctl requests on the command dir ===== */
+
+/*
+ * NOTE(review): several of these use _IOR/_IOW with the direction bit
+ * opposite to the actual data flow (e.g. FILL_BLOCKS is _IOR although data
+ * flows user->kernel).  The encoded numbers are frozen user ABI — do not
+ * "fix" the direction macros.
+ */
+
+/*
+ * Create a new file
+ * May only be called on .pending_reads file
+ */
+#define INCFS_IOC_CREATE_FILE \
+	_IOWR(INCFS_IOCTL_BASE_CODE, 30, struct incfs_new_file_args)
+
+/* Read file signature */
+#define INCFS_IOC_READ_FILE_SIGNATURE \
+	_IOR(INCFS_IOCTL_BASE_CODE, 31, struct incfs_get_file_sig_args)
+
+/*
+ * Fill in one or more data block. This may only be called on a handle
+ * passed as a parameter to INCFS_IOC_PERMIT_FILLING
+ *
+ * Returns number of blocks filled in, or error if none were
+ */
+#define INCFS_IOC_FILL_BLOCKS \
+	_IOR(INCFS_IOCTL_BASE_CODE, 32, struct incfs_fill_blocks)
+
+/*
+ * Permit INCFS_IOC_FILL_BLOCKS on the given file descriptor
+ * May only be called on .pending_reads file
+ *
+ * Returns 0 on success or error
+ */
+#define INCFS_IOC_PERMIT_FILL \
+	_IOW(INCFS_IOCTL_BASE_CODE, 33, struct incfs_permit_fill)
+
+/*
+ * Fills buffer with ranges of populated blocks
+ *
+ * Returns 0 if all ranges written
+ * error otherwise
+ *
+ * Either way, range_buffer_size_out is set to the number
+ * of bytes written. Should be set to 0 by caller. The ranges
+ * filled are valid, but if an error was returned there might
+ * be more ranges to come.
+ *
+ * Ranges are ranges of filled blocks:
+ *
+ * 1 2 7 9
+ *
+ * means blocks 1, 2, 7, 8, 9 are filled, 0, 3, 4, 5, 6 and 10 on
+ * are not
+ *
+ * If hashing is enabled for the file, the hash blocks are simply
+ * treated as though they immediately followed the data blocks.
+ */
+#define INCFS_IOC_GET_FILLED_BLOCKS \
+	_IOR(INCFS_IOCTL_BASE_CODE, 34, struct incfs_get_filled_blocks_args)
+
+/*
+ * Creates a new mapped file
+ * May only be called on .pending_reads file
+ */
+#define INCFS_IOC_CREATE_MAPPED_FILE \
+	_IOWR(INCFS_IOCTL_BASE_CODE, 35, struct incfs_create_mapped_file_args)
+
+/*
+ * Get number of blocks, total and filled
+ * May only be called on .pending_reads file
+ */
+#define INCFS_IOC_GET_BLOCK_COUNT \
+	_IOR(INCFS_IOCTL_BASE_CODE, 36, struct incfs_get_block_count_args)
+
+/*
+ * Get per UID read timeouts
+ * May only be called on .pending_reads file
+ */
+#define INCFS_IOC_GET_READ_TIMEOUTS \
+	_IOR(INCFS_IOCTL_BASE_CODE, 37, struct incfs_get_read_timeouts_args)
+
+/*
+ * Set per UID read timeouts
+ * May only be called on .pending_reads file
+ */
+#define INCFS_IOC_SET_READ_TIMEOUTS \
+	_IOW(INCFS_IOCTL_BASE_CODE, 38, struct incfs_set_read_timeouts_args)
+
+/*
+ * Get last read error
+ * May only be called on .pending_reads file
+ */
+#define INCFS_IOC_GET_LAST_READ_ERROR \
+	_IOW(INCFS_IOCTL_BASE_CODE, 39, struct incfs_get_last_read_error_args)
+
+/* ===== sysfs feature flags ===== */
+/*
+ * Each flag is represented by a file in /sys/fs/incremental-fs/features
+ * If the file exists the feature is supported
+ * Also the file contents will be the line "supported"
+ */
+
+/*
+ * Basic flag stating that the core incfs file system is available
+ */
+#define INCFS_FEATURE_FLAG_COREFS "corefs"
+
+/*
+ * zstd compression support
+ */
+#define INCFS_FEATURE_FLAG_ZSTD "zstd"
+
+/*
+ * v2 feature set support. Covers:
+ *   INCFS_IOC_CREATE_MAPPED_FILE
+ *   INCFS_IOC_GET_BLOCK_COUNT
+ *   INCFS_IOC_GET_READ_TIMEOUTS/INCFS_IOC_SET_READ_TIMEOUTS
+ *   .blocks_written status file
+ *   .incomplete folder
+ *   report_uid mount option
+ */
+#define INCFS_FEATURE_FLAG_V2 "v2"
+
+/* Compression algorithm of a block (incfs_fill_block.compression). */
+enum incfs_compression_alg {
+	COMPRESSION_NONE = 0,
+	COMPRESSION_LZ4 = 1,
+	COMPRESSION_ZSTD = 2,
+};
+
+/* Flags for incfs_fill_block.flags. */
+enum incfs_block_flags {
+	INCFS_BLOCK_FLAGS_NONE = 0,
+	INCFS_BLOCK_FLAGS_HASH = 1,
+};
+
+/* 128-bit file identifier; 8-byte aligned for embedding in ioctl structs. */
+typedef struct {
+	__u8 bytes[16];
+} incfs_uuid_t __attribute__((aligned (8)));
+
+/*
+ * Description of a pending read. A pending read - a read call by
+ * a userspace program for which the filesystem currently doesn't have data.
+ *
+ * Reads from .pending_reads and .log return an array of these structure
+ */
+struct incfs_pending_read_info {
+	/* Id of a file that is being read from. */
+	incfs_uuid_t file_id;
+
+	/* A number of microseconds since system boot to the read. */
+	__aligned_u64 timestamp_us;
+
+	/* Index of a file block that is being read. */
+	__u32 block_index;
+
+	/* A serial number of this pending read. */
+	__u32 serial_number;
+};
+
+/*
+ * Description of a pending read. A pending read - a read call by
+ * a userspace program for which the filesystem currently doesn't have data.
+ *
+ * This version of incfs_pending_read_info is used whenever the file system is
+ * mounted with the report_uid flag; it extends the base record with the
+ * reader's UID.
+ */
+struct incfs_pending_read_info2 {
+	/* Id of a file that is being read from. */
+	incfs_uuid_t file_id;
+
+	/* A number of microseconds since system boot to the read. */
+	__aligned_u64 timestamp_us;
+
+	/* Index of a file block that is being read. */
+	__u32 block_index;
+
+	/* A serial number of this pending read. */
+	__u32 serial_number;
+
+	/* The UID of the reading process */
+	__u32 uid;
+
+	/* Presumably padding to keep the struct an 8-byte multiple. */
+	__u32 reserved;
+};
+
+/*
+ * Description of a data or hash block to add to a data file.
+ */
+struct incfs_fill_block {
+	/* Index of a data block. */
+	__u32 block_index;
+
+	/* Length of data */
+	__u32 data_len;
+
+	/*
+	 * A pointer to an actual data for the block.
+	 *
+	 * Equivalent to: __u8 *data;
+	 */
+	__aligned_u64 data;
+
+	/*
+	 * Compression algorithm used to compress the data block.
+	 * Values from enum incfs_compression_alg.
+	 */
+	__u8 compression;
+
+	/* Values from enum incfs_block_flags */
+	__u8 flags;
+
+	/* Reserved fields must be zero. */
+	__u16 reserved1;
+
+	__u32 reserved2;
+
+	__aligned_u64 reserved3;
+};
+
+/*
+ * Description of a number of blocks to add to a data file
+ *
+ * Argument for INCFS_IOC_FILL_BLOCKS
+ */
+struct incfs_fill_blocks {
+	/* Number of blocks */
+	__u64 count;
+
+	/* A pointer to an array of incfs_fill_block structs */
+	__aligned_u64 fill_blocks;
+};
+
+/*
+ * Permit INCFS_IOC_FILL_BLOCKS on the given file descriptor
+ * May only be called on .pending_reads file
+ *
+ * Argument for INCFS_IOC_PERMIT_FILL
+ */
+struct incfs_permit_fill {
+	/* File to permit fills on */
+	__u32 file_descriptor;
+};
+
+/* Hash algorithm used for a file's hash tree. */
+enum incfs_hash_tree_algorithm {
+	INCFS_HASH_TREE_NONE = 0,
+	INCFS_HASH_TREE_SHA256 = 1
+};
+
+/*
+ * Create a new file or directory.
+ * Argument for INCFS_IOC_CREATE_FILE.
+ */
+struct incfs_new_file_args {
+	/* Id of a file to create. */
+	incfs_uuid_t file_id;
+
+	/*
+	 * Total size of the new file. Ignored if S_ISDIR(mode).
+	 */
+	__aligned_u64 size;
+
+	/*
+	 * File mode. Permissions and dir flag.
+	 */
+	__u16 mode;
+
+	__u16 reserved1;
+
+	__u32 reserved2;
+
+	/*
+	 * A pointer to a null-terminated relative path to the file's parent
+	 * dir.
+	 * Max length: PATH_MAX
+	 *
+	 * Equivalent to: char *directory_path;
+	 */
+	__aligned_u64 directory_path;
+
+	/*
+	 * A pointer to a null-terminated file's name.
+	 * Max length: PATH_MAX
+	 *
+	 * Equivalent to: char *file_name;
+	 */
+	__aligned_u64 file_name;
+
+	/*
+	 * A pointer to a file attribute to be set on creation.
+	 *
+	 * Equivalent to: u8 *file_attr;
+	 */
+	__aligned_u64 file_attr;
+
+	/*
+	 * Length of the data buffer specified by file_attr.
+	 * Max value: INCFS_MAX_FILE_ATTR_SIZE
+	 */
+	__u32 file_attr_len;
+
+	/* Note the gaps in reserved numbering (no 3/5): fields were removed. */
+	__u32 reserved4;
+
+	/*
+	 * Points to an APK V4 Signature data blob
+	 * Signature must have two sections
+	 * Format is:
+	 *	u32 version
+	 *	u32 size_of_hash_info_section
+	 *	u8 hash_info_section[]
+	 *	u32 size_of_signing_info_section
+	 *	u8 signing_info_section[]
+	 *
+	 * Note that incfs does not care about what is in signing_info_section
+	 *
+	 * hash_info_section has following format:
+	 *	u32 hash_algorithm; // Must be SHA256 == 1
+	 *	u8 log2_blocksize;  // Must be 12 for 4096 byte blocks
+	 *	u32 salt_size;
+	 *	u8 salt[];
+	 *	u32 hash_size;
+	 *	u8 root_hash[];
+	 */
+	__aligned_u64 signature_info;
+
+	/* Size of signature_info */
+	__aligned_u64 signature_size;
+
+	__aligned_u64 reserved6;
+};
+
+/*
+ * Request a digital signature blob for a given file.
+ * Argument for INCFS_IOC_READ_FILE_SIGNATURE ioctl
+ */
+struct incfs_get_file_sig_args {
+	/*
+	 * A pointer to the data buffer to save a signature blob to.
+	 *
+	 * Equivalent to: u8 *file_signature;
+	 */
+	__aligned_u64 file_signature;
+
+	/* Size of the buffer at file_signature. */
+	__u32 file_signature_buf_size;
+
+	/*
+	 * Number of bytes written to the file_signature buffer.
+	 * Set by the kernel when the ioctl completes.
+	 */
+	__u32 file_signature_len_out;
+};
+
+/* Half-open block range [begin, end) used by INCFS_IOC_GET_FILLED_BLOCKS. */
+struct incfs_filled_range {
+	__u32 begin;
+	__u32 end;
+};
+
+/*
+ * Request ranges of filled blocks
+ * Argument for INCFS_IOC_GET_FILLED_BLOCKS
+ */
+struct incfs_get_filled_blocks_args {
+	/*
+	 * A buffer to populate with ranges of filled blocks
+	 *
+	 * Equivalent to struct incfs_filled_ranges *range_buffer
+	 */
+	__aligned_u64 range_buffer;
+
+	/* Size of range_buffer in bytes */
+	__u32 range_buffer_size;
+
+	/* Start index to read from */
+	__u32 start_index;
+
+	/*
+	 * End index to read to. 0 means read to end. This is a range,
+	 * so incfs will read from start_index to end_index - 1
+	 */
+	__u32 end_index;
+
+	/* Actual number of blocks in file */
+	__u32 total_blocks_out;
+
+	/* The number of data blocks in file */
+	__u32 data_blocks_out;
+
+	/* Number of bytes written to range buffer */
+	__u32 range_buffer_size_out;
+
+	/* Sector scanned up to, if the call was interrupted */
+	__u32 index_out;
+};
+
+/*
+ * Create a new mapped file: a file that presents a window into an existing
+ * source file's data, starting at source_offset.
+ * Argument for INCFS_IOC_CREATE_MAPPED_FILE
+ */
+struct incfs_create_mapped_file_args {
+	/*
+	 * Total size of the new file.
+	 */
+	__aligned_u64 size;
+
+	/*
+	 * File mode. Permissions and dir flag.
+	 */
+	__u16 mode;
+
+	__u16 reserved1;
+
+	__u32 reserved2;
+
+	/*
+	 * A pointer to a null-terminated relative path to the incfs mount
+	 * point
+	 * Max length: PATH_MAX
+	 *
+	 * Equivalent to: char *directory_path;
+	 */
+	__aligned_u64 directory_path;
+
+	/*
+	 * A pointer to a null-terminated file name.
+	 * Max length: PATH_MAX
+	 *
+	 * Equivalent to: char *file_name;
+	 */
+	__aligned_u64 file_name;
+
+	/* Id of source file to map. */
+	incfs_uuid_t source_file_id;
+
+	/*
+	 * Offset in source file to start mapping. Must be a multiple of
+	 * INCFS_DATA_FILE_BLOCK_SIZE
+	 */
+	__aligned_u64 source_offset;
+};
+
+/*
+ * Get information about the blocks in this file
+ * Argument for INCFS_IOC_GET_BLOCK_COUNT
+ */
+struct incfs_get_block_count_args {
+	/* Total number of data blocks in the file */
+	__u32 total_data_blocks_out;
+
+	/* Number of filled data blocks in the file */
+	__u32 filled_data_blocks_out;
+
+	/* Total number of hash blocks in the file */
+	__u32 total_hash_blocks_out;
+
+	/* Number of filled hash blocks in the file */
+	__u32 filled_hash_blocks_out;
+};
+
+/* Description of timeouts for one UID */
+struct incfs_per_uid_read_timeouts {
+	/* UID to apply these timeouts to */
+	__u32 uid;
+
+	/*
+	 * Min time in microseconds to read any block. Note that this doesn't
+	 * apply to reads which are satisfied from the page cache.
+	 */
+	__u32 min_time_us;
+
+	/*
+	 * Min time in microseconds to satisfy a pending read. Any pending read
+	 * which is filled before this time will be delayed so that the total
+	 * read time >= this value.
+	 */
+	__u32 min_pending_time_us;
+
+	/*
+	 * Max time in microseconds to satisfy a pending read before the read
+	 * times out. If set to U32_MAX, defaults to mount options
+	 * read_timeout_ms * 1000. Must be >= min_pending_time_us
+	 */
+	__u32 max_pending_time_us;
+};
+
+/*
+ * Get the read timeouts array
+ * Argument for INCFS_IOC_GET_READ_TIMEOUTS
+ */
+struct incfs_get_read_timeouts_args {
+	/*
+	 * A pointer to a buffer to fill with the current timeouts
+	 *
+	 * Equivalent to struct incfs_per_uid_read_timeouts *
+	 */
+	__aligned_u64 timeouts_array;
+
+	/* Size of above buffer in bytes */
+	__u32 timeouts_array_size;
+
+	/* Size used in bytes, or size needed if -ENOMEM returned */
+	__u32 timeouts_array_size_out;
+};
+
+/*
+ * Set the read timeouts array
+ * Arguments for INCFS_IOC_SET_READ_TIMEOUTS
+ */
+struct incfs_set_read_timeouts_args {
+	/*
+	 * A pointer to an array containing the new timeouts
+	 * This will replace any existing timeouts
+	 *
+	 * Equivalent to struct incfs_per_uid_read_timeouts *
+	 */
+	__aligned_u64 timeouts_array;
+
+	/* Size of above array in bytes. Must be < 256 */
+	__u32 timeouts_array_size;
+};
+
+/*
+ * Get last read error struct
+ * Arguments for INCFS_IOC_GET_LAST_READ_ERROR
+ */
+struct incfs_get_last_read_error_args {
+	/* File id of last file that had a read error */
+	incfs_uuid_t file_id_out;
+
+	/* Time of last read error, in us, from CLOCK_MONOTONIC */
+	__u64 time_us_out;
+
+	/* Index of page that was being read at last read error */
+	__u32 page_out;
+
+	/* errno of last read error */
+	__u32 errno_out;
+
+	/* uid of last read error */
+	__u32 uid_out;
+
+	/* Reserved for future use; must be zero. */
+	__u32 reserved1;
+	__u64 reserved2;
+};
+
+#endif /* _UAPI_LINUX_INCREMENTALFS_H */