Merge 541c43310e85 ("Merge tag 'fs_for_v6.20-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs") into android-mainline
Steps on the way to v7.0-rc1
Change-Id: Ic4fc3d1532368e0faa67cb498a197c4590980cf2
Signed-off-by: Carlos Llamas <cmllamas@google.com>
diff --git a/Documentation/filesystems/ext2.rst b/Documentation/filesystems/ext2.rst
index 92aae68..95f48c1 100644
--- a/Documentation/filesystems/ext2.rst
+++ b/Documentation/filesystems/ext2.rst
@@ -388,7 +388,7 @@
======================= ===========================================================
Windows 95/98/NT/2000 http://www.chrysocome.net/explore2fs
-Windows 95 [1]_ http://www.yipton.net/content.html#FSDEXT2
+Windows 95 [1]_ http://www.yipton.net/content/fsdext2/
DOS client [1]_ ftp://metalab.unc.edu/pub/Linux/system/filesystems/ext2/
OS/2 [2]_ ftp://metalab.unc.edu/pub/Linux/system/filesystems/ext2/
RISC OS client http://www.esw-heim.tu-clausthal.de/~marco/smorbrod/IscaFS/
diff --git a/Documentation/netlink/specs/nfsd.yaml b/Documentation/netlink/specs/nfsd.yaml
index 1003630..badb2fe 100644
--- a/Documentation/netlink/specs/nfsd.yaml
+++ b/Documentation/netlink/specs/nfsd.yaml
@@ -78,6 +78,9 @@
-
name: scope
type: string
+ -
+ name: min-threads
+ type: u32
-
name: version
attributes:
@@ -159,6 +162,7 @@
- gracetime
- leasetime
- scope
+ - min-threads
-
name: threads-get
doc: get the number of running threads
@@ -170,6 +174,7 @@
- gracetime
- leasetime
- scope
+ - min-threads
-
name: version-set
doc: set nfs enabled versions
diff --git a/Documentation/sunrpc/xdr/nfs4_1.x b/Documentation/sunrpc/xdr/nfs4_1.x
index ca95150a..5b45547 100644
--- a/Documentation/sunrpc/xdr/nfs4_1.x
+++ b/Documentation/sunrpc/xdr/nfs4_1.x
@@ -53,6 +53,11 @@
*/
typedef uint32_t bitmap4<>;
+typedef opaque utf8string<>;
+typedef utf8string utf8str_cis;
+typedef utf8string utf8str_cs;
+typedef utf8string utf8str_mixed;
+
/*
* Timeval
*/
@@ -184,3 +189,59 @@
OPEN_DELEGATE_READ_ATTRS_DELEG = 4,
OPEN_DELEGATE_WRITE_ATTRS_DELEG = 5
};
+
+
+/*
+ * The following content was extracted from draft-ietf-nfsv4-posix-acls
+ */
+
+enum aclmodel4 {
+ ACL_MODEL_NFS4 = 1,
+ ACL_MODEL_POSIX_DRAFT = 2,
+ ACL_MODEL_NONE = 3
+};
+pragma public aclmodel4;
+
+enum aclscope4 {
+ ACL_SCOPE_FILE_OBJECT = 1,
+ ACL_SCOPE_FILE_SYSTEM = 2,
+ ACL_SCOPE_SERVER = 3
+};
+pragma public aclscope4;
+
+enum posixacetag4 {
+ POSIXACE4_TAG_USER_OBJ = 1,
+ POSIXACE4_TAG_USER = 2,
+ POSIXACE4_TAG_GROUP_OBJ = 3,
+ POSIXACE4_TAG_GROUP = 4,
+ POSIXACE4_TAG_MASK = 5,
+ POSIXACE4_TAG_OTHER = 6
+};
+pragma public posixacetag4;
+
+typedef uint32_t posixaceperm4;
+pragma public posixaceperm4;
+
+/* Bit definitions for posixaceperm4. */
+const POSIXACE4_PERM_EXECUTE = 0x00000001;
+const POSIXACE4_PERM_WRITE = 0x00000002;
+const POSIXACE4_PERM_READ = 0x00000004;
+
+struct posixace4 {
+ posixacetag4 tag;
+ posixaceperm4 perm;
+ utf8str_mixed who;
+};
+
+typedef aclmodel4 fattr4_acl_trueform;
+typedef aclscope4 fattr4_acl_trueform_scope;
+typedef posixace4 fattr4_posix_default_acl<>;
+typedef posixace4 fattr4_posix_access_acl<>;
+
+%/*
+% * New for POSIX ACL extension
+% */
+const FATTR4_ACL_TRUEFORM = 89;
+const FATTR4_ACL_TRUEFORM_SCOPE = 90;
+const FATTR4_POSIX_DEFAULT_ACL = 91;
+const FATTR4_POSIX_ACCESS_ACL = 92;
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index f00f1d3..0ee855a 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -793,6 +793,9 @@ static int cma_acquire_dev_by_src_ip(struct rdma_id_private *id_priv)
mutex_lock(&lock);
list_for_each_entry(cma_dev, &dev_list, list) {
+ if (id_priv->restricted_node_type != RDMA_NODE_UNSPECIFIED &&
+ id_priv->restricted_node_type != cma_dev->device->node_type)
+ continue;
rdma_for_each_port (cma_dev->device, port) {
gidp = rdma_protocol_roce(cma_dev->device, port) ?
&iboe_gid : &gid;
@@ -1015,6 +1018,7 @@ __rdma_create_id(struct net *net, rdma_cm_event_handler event_handler,
return ERR_PTR(-ENOMEM);
id_priv->state = RDMA_CM_IDLE;
+ id_priv->restricted_node_type = RDMA_NODE_UNSPECIFIED;
id_priv->id.context = context;
id_priv->id.event_handler = event_handler;
id_priv->id.ps = ps;
@@ -4177,6 +4181,32 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
}
EXPORT_SYMBOL(rdma_resolve_addr);
+int rdma_restrict_node_type(struct rdma_cm_id *id, u8 node_type)
+{
+ struct rdma_id_private *id_priv =
+ container_of(id, struct rdma_id_private, id);
+ int ret = 0;
+
+ switch (node_type) {
+ case RDMA_NODE_UNSPECIFIED:
+ case RDMA_NODE_IB_CA:
+ case RDMA_NODE_RNIC:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ mutex_lock(&lock);
+ if (id_priv->cma_dev)
+ ret = -EALREADY;
+ else
+ id_priv->restricted_node_type = node_type;
+ mutex_unlock(&lock);
+
+ return ret;
+}
+EXPORT_SYMBOL(rdma_restrict_node_type);
+
int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr)
{
struct rdma_id_private *id_priv =
diff --git a/drivers/infiniband/core/cma_priv.h b/drivers/infiniband/core/cma_priv.h
index c604b60..04332eb 100644
--- a/drivers/infiniband/core/cma_priv.h
+++ b/drivers/infiniband/core/cma_priv.h
@@ -72,6 +72,7 @@ struct rdma_id_private {
int internal_id;
enum rdma_cm_state state;
+ u8 restricted_node_type;
spinlock_t lock;
struct mutex qp_mutex;
diff --git a/fs/attr.c b/fs/attr.c
index b9ec6b4..e7d7c6d 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -169,7 +169,17 @@ int setattr_prepare(struct mnt_idmap *idmap, struct dentry *dentry,
* ATTR_FORCE.
*/
if (ia_valid & ATTR_SIZE) {
- int error = inode_newsize_ok(inode, attr->ia_size);
+ int error;
+
+ /*
+ * Verity files are immutable, so deny truncates. This isn't
+ * covered by the open-time check because sys_truncate() takes a
+ * path, not an open file.
+ */
+ if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode))
+ return -EPERM;
+
+ error = inode_newsize_ok(inode, attr->ia_size);
if (error)
return error;
}
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 73602ee..55c272f 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -339,10 +339,6 @@ struct btrfs_inode {
struct rw_semaphore i_mmap_lock;
-#ifdef CONFIG_FS_VERITY
- struct fsverity_info *i_verity_info;
-#endif
-
struct inode vfs_inode;
};
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 3df399d..744a1ff 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -475,25 +475,25 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
end, page_ops);
}
-static bool btrfs_verify_folio(struct folio *folio, u64 start, u32 len)
+static bool btrfs_verify_folio(struct fsverity_info *vi, struct folio *folio,
+ u64 start, u32 len)
{
struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
- if (!fsverity_active(folio->mapping->host) ||
- btrfs_folio_test_uptodate(fs_info, folio, start, len) ||
- start >= i_size_read(folio->mapping->host))
+ if (!vi || btrfs_folio_test_uptodate(fs_info, folio, start, len))
return true;
- return fsverity_verify_folio(folio);
+ return fsverity_verify_folio(vi, folio);
}
-static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 len)
+static void end_folio_read(struct fsverity_info *vi, struct folio *folio,
+ bool uptodate, u64 start, u32 len)
{
struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
ASSERT(folio_pos(folio) <= start &&
start + len <= folio_next_pos(folio));
- if (uptodate && btrfs_verify_folio(folio, start, len))
+ if (uptodate && btrfs_verify_folio(vi, folio, start, len))
btrfs_folio_set_uptodate(fs_info, folio, start, len);
else
btrfs_folio_clear_uptodate(fs_info, folio, start, len);
@@ -573,14 +573,19 @@ static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio)
static void end_bbio_data_read(struct btrfs_bio *bbio)
{
struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
+ struct inode *inode = &bbio->inode->vfs_inode;
struct bio *bio = &bbio->bio;
+ struct fsverity_info *vi = NULL;
struct folio_iter fi;
ASSERT(!bio_flagged(bio, BIO_CLONED));
+
+ if (bbio->file_offset < i_size_read(inode))
+ vi = fsverity_get_info(inode);
+
bio_for_each_folio_all(fi, &bbio->bio) {
bool uptodate = !bio->bi_status;
struct folio *folio = fi.folio;
- struct inode *inode = folio->mapping->host;
u64 start = folio_pos(folio) + fi.offset;
btrfs_debug(fs_info,
@@ -615,7 +620,7 @@ static void end_bbio_data_read(struct btrfs_bio *bbio)
}
/* Update page status and unlock. */
- end_folio_read(folio, uptodate, start, fi.length);
+ end_folio_read(vi, folio, uptodate, start, fi.length);
}
bio_put(bio);
}
@@ -990,7 +995,8 @@ static void btrfs_readahead_expand(struct readahead_control *ractl,
* return 0 on success, otherwise return error
*/
static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
- struct btrfs_bio_ctrl *bio_ctrl)
+ struct btrfs_bio_ctrl *bio_ctrl,
+ struct fsverity_info *vi)
{
struct inode *inode = folio->mapping->host;
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
@@ -1034,11 +1040,11 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
ASSERT(IS_ALIGNED(cur, fs_info->sectorsize));
if (cur >= last_byte) {
folio_zero_range(folio, pg_offset, end - cur + 1);
- end_folio_read(folio, true, cur, end - cur + 1);
+ end_folio_read(vi, folio, true, cur, end - cur + 1);
break;
}
if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) {
- end_folio_read(folio, true, cur, blocksize);
+ end_folio_read(vi, folio, true, cur, blocksize);
continue;
}
/*
@@ -1050,7 +1056,7 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
*/
em = get_extent_map(BTRFS_I(inode), folio, cur, locked_end - cur + 1, em_cached);
if (IS_ERR(em)) {
- end_folio_read(folio, false, cur, end + 1 - cur);
+ end_folio_read(vi, folio, false, cur, end + 1 - cur);
return PTR_ERR(em);
}
extent_offset = cur - em->start;
@@ -1127,12 +1133,12 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
/* we've found a hole, just zero and go on */
if (block_start == EXTENT_MAP_HOLE) {
folio_zero_range(folio, pg_offset, blocksize);
- end_folio_read(folio, true, cur, blocksize);
+ end_folio_read(vi, folio, true, cur, blocksize);
continue;
}
/* the get_extent function already copied into the folio */
if (block_start == EXTENT_MAP_INLINE) {
- end_folio_read(folio, true, cur, blocksize);
+ end_folio_read(vi, folio, true, cur, blocksize);
continue;
}
@@ -1329,7 +1335,8 @@ static void lock_extents_for_read(struct btrfs_inode *inode, u64 start, u64 end,
int btrfs_read_folio(struct file *file, struct folio *folio)
{
- struct btrfs_inode *inode = folio_to_inode(folio);
+ struct inode *vfs_inode = folio->mapping->host;
+ struct btrfs_inode *inode = BTRFS_I(vfs_inode);
const u64 start = folio_pos(folio);
const u64 end = start + folio_size(folio) - 1;
struct extent_state *cached_state = NULL;
@@ -1338,10 +1345,13 @@ int btrfs_read_folio(struct file *file, struct folio *folio)
.last_em_start = U64_MAX,
};
struct extent_map *em_cached = NULL;
+ struct fsverity_info *vi = NULL;
int ret;
lock_extents_for_read(inode, start, end, &cached_state);
- ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl);
+ if (folio_pos(folio) < i_size_read(vfs_inode))
+ vi = fsverity_get_info(vfs_inode);
+ ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, vi);
btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
btrfs_free_extent_map(em_cached);
@@ -2714,16 +2724,19 @@ void btrfs_readahead(struct readahead_control *rac)
.last_em_start = U64_MAX,
};
struct folio *folio;
- struct btrfs_inode *inode = BTRFS_I(rac->mapping->host);
+ struct inode *vfs_inode = rac->mapping->host;
+ struct btrfs_inode *inode = BTRFS_I(vfs_inode);
const u64 start = readahead_pos(rac);
const u64 end = start + readahead_length(rac) - 1;
struct extent_state *cached_state = NULL;
struct extent_map *em_cached = NULL;
+ struct fsverity_info *vi = NULL;
lock_extents_for_read(inode, start, end, &cached_state);
-
+ if (start < i_size_read(vfs_inode))
+ vi = fsverity_get_info(vfs_inode);
while ((folio = readahead_folio(rac)) != NULL)
- btrfs_do_readpage(folio, &em_cached, &bio_ctrl);
+ btrfs_do_readpage(folio, &em_cached, &bio_ctrl, vi);
btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 82df115..8451644 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -34,7 +34,6 @@
#include <linux/sched/mm.h>
#include <linux/iomap.h>
#include <linux/unaligned.h>
-#include <linux/fsverity.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
@@ -5616,11 +5615,8 @@ void btrfs_evict_inode(struct inode *inode)
trace_btrfs_inode_evict(inode);
- if (!root) {
- fsverity_cleanup_inode(inode);
- clear_inode(inode);
- return;
- }
+ if (!root)
+ goto clear_inode;
fs_info = inode_to_fs_info(inode);
evict_inode_truncate_pages(inode);
@@ -5720,7 +5716,7 @@ void btrfs_evict_inode(struct inode *inode)
* to retry these periodically in the future.
*/
btrfs_remove_delayed_node(BTRFS_I(inode));
- fsverity_cleanup_inode(inode);
+clear_inode:
clear_inode(inode);
}
@@ -8151,9 +8147,6 @@ static void init_once(void *foo)
struct btrfs_inode *ei = foo;
inode_init_once(&ei->vfs_inode);
-#ifdef CONFIG_FS_VERITY
- ei->i_verity_info = NULL;
-#endif
}
void __cold btrfs_destroy_cachep(void)
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
index 06cbd6f..d12537a 100644
--- a/fs/btrfs/verity.c
+++ b/fs/btrfs/verity.c
@@ -694,7 +694,6 @@ int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size)
*
* @inode: inode to read a merkle tree page for
* @index: page index relative to the start of the merkle tree
- * @num_ra_pages: number of pages to readahead. Optional, we ignore it
*
* The Merkle tree is stored in the filesystem btree, but its pages are cached
* with a logical position past EOF in the inode's mapping.
@@ -702,8 +701,7 @@ int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size)
* Returns the page we read, or an ERR_PTR on error.
*/
static struct page *btrfs_read_merkle_tree_page(struct inode *inode,
- pgoff_t index,
- unsigned long num_ra_pages)
+ pgoff_t index)
{
struct folio *folio;
u64 off = (u64)index << PAGE_SHIFT;
@@ -771,16 +769,17 @@ static struct page *btrfs_read_merkle_tree_page(struct inode *inode,
/*
* fsverity op that writes a Merkle tree block into the btree.
*
- * @inode: inode to write a Merkle tree block for
+ * @file: file to write a Merkle tree block for
* @buf: Merkle tree block to write
* @pos: the position of the block in the Merkle tree (in bytes)
* @size: the Merkle tree block size (in bytes)
*
* Returns 0 on success or negative error code on failure
*/
-static int btrfs_write_merkle_tree_block(struct inode *inode, const void *buf,
+static int btrfs_write_merkle_tree_block(struct file *file, const void *buf,
u64 pos, unsigned int size)
{
+ struct inode *inode = file_inode(file);
loff_t merkle_pos = merkle_file_pos(inode);
if (merkle_pos < 0)
@@ -793,8 +792,6 @@ static int btrfs_write_merkle_tree_block(struct inode *inode, const void *buf,
}
const struct fsverity_operations btrfs_verityops = {
- .inode_info_offs = (int)offsetof(struct btrfs_inode, i_verity_info) -
- (int)offsetof(struct btrfs_inode, vfs_inode),
.begin_enable_verity = btrfs_begin_enable_verity,
.end_enable_verity = btrfs_end_enable_verity,
.get_verity_descriptor = btrfs_get_verity_descriptor,
diff --git a/fs/buffer.c b/fs/buffer.c
index b677916..ed724a9 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -303,6 +303,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
struct postprocess_bh_ctx {
struct work_struct work;
struct buffer_head *bh;
+ struct fsverity_info *vi;
};
static void verify_bh(struct work_struct *work)
@@ -312,21 +313,12 @@ static void verify_bh(struct work_struct *work)
struct buffer_head *bh = ctx->bh;
bool valid;
- valid = fsverity_verify_blocks(bh->b_folio, bh->b_size, bh_offset(bh));
+ valid = fsverity_verify_blocks(ctx->vi, bh->b_folio, bh->b_size,
+ bh_offset(bh));
end_buffer_async_read(bh, valid);
kfree(ctx);
}
-static bool need_fsverity(struct buffer_head *bh)
-{
- struct folio *folio = bh->b_folio;
- struct inode *inode = folio->mapping->host;
-
- return fsverity_active(inode) &&
- /* needed by ext4 */
- folio->index < DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
-}
-
static void decrypt_bh(struct work_struct *work)
{
struct postprocess_bh_ctx *ctx =
@@ -336,7 +328,7 @@ static void decrypt_bh(struct work_struct *work)
err = fscrypt_decrypt_pagecache_blocks(bh->b_folio, bh->b_size,
bh_offset(bh));
- if (err == 0 && need_fsverity(bh)) {
+ if (err == 0 && ctx->vi) {
/*
* We use different work queues for decryption and for verity
* because verity may require reading metadata pages that need
@@ -358,15 +350,20 @@ static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate)
{
struct inode *inode = bh->b_folio->mapping->host;
bool decrypt = fscrypt_inode_uses_fs_layer_crypto(inode);
- bool verify = need_fsverity(bh);
+ struct fsverity_info *vi = NULL;
+
+ /* needed by ext4 */
+ if (bh->b_folio->index < DIV_ROUND_UP(inode->i_size, PAGE_SIZE))
+ vi = fsverity_get_info(inode);
/* Decrypt (with fscrypt) and/or verify (with fsverity) if needed. */
- if (uptodate && (decrypt || verify)) {
+ if (uptodate && (decrypt || vi)) {
struct postprocess_bh_ctx *ctx =
kmalloc(sizeof(*ctx), GFP_ATOMIC);
if (ctx) {
ctx->bh = bh;
+ ctx->vi = vi;
if (decrypt) {
INIT_WORK(&ctx->work, decrypt_bh);
fscrypt_enqueue_decrypt_work(&ctx->work);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 62c091b..293f698 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -707,15 +707,6 @@ enum {
* found an unwritten extent, we need to split it.
*/
#define EXT4_GET_BLOCKS_SPLIT_NOMERGE 0x0008
- /*
- * Caller is from the dio or dioread_nolock buffered IO, reqest to
- * create an unwritten extent if it does not exist or split the
- * found unwritten extent. Also do not merge the newly created
- * unwritten extent, io end will convert unwritten to written,
- * and try to merge the written extent.
- */
-#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_SPLIT_NOMERGE|\
- EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT)
/* Convert unwritten extent to initialized. */
#define EXT4_GET_BLOCKS_CONVERT 0x0010
/* Eventual metadata allocation (due to growing extent tree)
@@ -1205,10 +1196,6 @@ struct ext4_inode_info {
#ifdef CONFIG_FS_ENCRYPTION
struct fscrypt_inode_info *i_crypt_info;
#endif
-
-#ifdef CONFIG_FS_VERITY
- struct fsverity_info *i_verity_info;
-#endif
};
/*
@@ -1692,6 +1679,8 @@ struct ext4_sb_info {
/* timer for periodic error stats printing */
struct timer_list s_err_report;
+ /* timeout in seconds for s_err_report; 0 disables the timer. */
+ unsigned long s_err_report_sec;
/* Lazy inode table initialization info */
struct ext4_li_request *s_li_request;
@@ -1795,6 +1784,10 @@ struct ext4_sb_info {
* Main fast commit lock. This lock protects accesses to the
* following fields:
* ei->i_fc_list, s_fc_dentry_q, s_fc_q, s_fc_bytes, s_fc_bh.
+ *
+ * s_fc_lock can be taken from reclaim context (inode eviction) and is
+ * thus reclaim unsafe. Use ext4_fc_lock()/ext4_fc_unlock() helpers
+ * when acquiring / releasing the lock.
*/
struct mutex s_fc_lock;
struct buffer_head *s_fc_bh;
@@ -1839,6 +1832,18 @@ static inline void ext4_writepages_up_write(struct super_block *sb, int ctx)
percpu_up_write(&EXT4_SB(sb)->s_writepages_rwsem);
}
+static inline int ext4_fc_lock(struct super_block *sb)
+{
+ mutex_lock(&EXT4_SB(sb)->s_fc_lock);
+ return memalloc_nofs_save();
+}
+
+static inline void ext4_fc_unlock(struct super_block *sb, int ctx)
+{
+ memalloc_nofs_restore(ctx);
+ mutex_unlock(&EXT4_SB(sb)->s_fc_lock);
+}
+
static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
{
return ino == EXT4_ROOT_INO ||
@@ -2373,7 +2378,6 @@ static inline int ext4_emergency_state(struct super_block *sb)
#define EXT4_DEF_SB_UPDATE_INTERVAL_SEC (3600) /* seconds (1 hour) */
#define EXT4_DEF_SB_UPDATE_INTERVAL_KB (16384) /* kilobytes (16MB) */
-
/*
* Minimum number of groups in a flexgroup before we separate out
* directories into the first block group of a flexgroup
@@ -3199,6 +3203,7 @@ extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb,
unsigned int flags);
extern unsigned int ext4_num_base_meta_blocks(struct super_block *sb,
ext4_group_t block_group);
+extern void print_daily_error_info(struct timer_list *t);
extern __printf(7, 8)
void __ext4_error(struct super_block *, const char *, unsigned int, bool,
@@ -3735,8 +3740,8 @@ static inline void ext4_set_de_type(struct super_block *sb,
}
/* readpages.c */
-extern int ext4_mpage_readpages(struct inode *inode,
- struct readahead_control *rac, struct folio *folio);
+int ext4_read_folio(struct file *file, struct folio *folio);
+void ext4_readahead(struct readahead_control *rac);
extern int __init ext4_init_post_read_processing(void);
extern void ext4_exit_post_read_processing(void);
@@ -3795,6 +3800,10 @@ extern int ext4_convert_unwritten_io_end_vec(handle_t *handle,
ext4_io_end_t *io_end);
extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
+extern int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map, int flags);
+extern int ext4_map_create_blocks(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map, int flags);
extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
int num,
struct ext4_ext_path *path);
@@ -3909,7 +3918,6 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
}
extern const struct iomap_ops ext4_iomap_ops;
-extern const struct iomap_ops ext4_iomap_overwrite_ops;
extern const struct iomap_ops ext4_iomap_report_ops;
static inline int ext4_buffer_uptodate(struct buffer_head *bh)
diff --git a/fs/ext4/extents-test.c b/fs/ext4/extents-test.c
new file mode 100644
index 0000000..4879e68
--- /dev/null
+++ b/fs/ext4/extents-test.c
@@ -0,0 +1,1027 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Written by Ojaswin Mujoo <ojaswin@linux.ibm.com> (IBM)
+ *
+ * These Kunit tests are designed to test the functionality of
+ * extent split and conversion in ext4.
+ *
+ * Currently, ext4 can split extents in 2 ways:
+ * 1. By splitting the extents in the extent tree and optionally converting them
+ * to written or unwritten based on flags passed.
+ * 2. In case 1 encounters an error, ext4 instead zeroes out the unwritten
+ * areas of the extent and marks the complete extent written.
+ *
+ * The primary function that handles this is ext4_split_convert_extents().
+ *
+ * We test both of the methods of split. The behavior we try to enforce is:
+ * 1. When passing EXT4_GET_BLOCKS_CONVERT flag to ext4_split_convert_extents(),
+ * the split extent should be converted to initialized.
+ * 2. When passing EXT4_GET_BLOCKS_CONVERT_UNWRITTEN flag to
+ * ext4_split_convert_extents(), the split extent should be converted to
+ * uninitialized.
+ * 3. In case we use the zeroout method, then we should correctly write zeroes
+ * to the unwritten areas of the extent and we should not corrupt/leak any
+ * data.
+ *
+ * Enforcing 1 and 2 is straightforward: we just set up a minimal inode with
+ * extent tree, call ext4_split_convert_extents() and check the final state of
+ * the extent tree.
+ *
+ * For zeroout testing, we maintain a separate buffer which represents the disk
+ * data corresponding to the extents. We then override ext4's zeroout functions
+ * to instead write zeroes to our buffer. Then, we override
+ * ext4_ext_insert_extent() to return -ENOSPC, which triggers the zeroout.
+ * Finally, we check the state of the extent tree and zeroout buffer to confirm
+ * everything went well.
+ */
+
+#include <kunit/test.h>
+#include <kunit/static_stub.h>
+#include <linux/gfp_types.h>
+#include <linux/stddef.h>
+
+#include "ext4.h"
+#include "ext4_extents.h"
+
+#define EXT_DATA_PBLK 100
+#define EXT_DATA_LBLK 10
+#define EXT_DATA_LEN 3
+
+struct kunit_ctx {
+ /*
+ * Ext4 inode which has only 1 unwrit extent
+ */
+ struct ext4_inode_info *k_ei;
+ /*
+ * Represents the underlying data area (used for zeroout testing)
+ */
+ char *k_data;
+} k_ctx;
+
+/*
+ * describes the state of an expected extent in extent tree.
+ */
+struct kunit_ext_state {
+ ext4_lblk_t ex_lblk;
+ ext4_lblk_t ex_len;
+ bool is_unwrit;
+};
+
+/*
+ * describes the state of the data area of a writ extent. Used for testing
+ * correctness of zeroout.
+ */
+struct kunit_ext_data_state {
+ char exp_char;
+ ext4_lblk_t off_blk;
+ ext4_lblk_t len_blk;
+};
+
+enum kunit_test_types {
+ TEST_SPLIT_CONVERT,
+ TEST_CREATE_BLOCKS,
+};
+
+struct kunit_ext_test_param {
+ /* description of test */
+ char *desc;
+
+ /* determines which function will be tested */
+ int type;
+
+ /* is extent unwrit at beginning of test */
+ bool is_unwrit_at_start;
+
+ /* flags to pass while splitting */
+ int split_flags;
+
+ /* map describing range to split */
+ struct ext4_map_blocks split_map;
+
+ /* disable zeroout */
+ bool disable_zeroout;
+
+ /* no of extents expected after split */
+ int nr_exp_ext;
+
+ /*
+ * expected state of extents after split. We will never split into more
+ * than 3 extents
+ */
+ struct kunit_ext_state exp_ext_state[3];
+
+ /* Below fields used for zeroout tests */
+
+ bool is_zeroout_test;
+ /*
+ * no of expected data segments (zeroout tests). Example, if we expect
+ * data to be 4kb 0s, followed by 8kb non-zero, then nr_exp_data_segs==2
+ */
+ int nr_exp_data_segs;
+
+ /*
+ * expected state of data area after zeroout.
+ */
+ struct kunit_ext_data_state exp_data_state[3];
+};
+
+static void ext_kill_sb(struct super_block *sb)
+{
+ generic_shutdown_super(sb);
+}
+
+static int ext_set(struct super_block *sb, void *data)
+{
+ return 0;
+}
+
+static struct file_system_type ext_fs_type = {
+ .name = "extents test",
+ .kill_sb = ext_kill_sb,
+};
+
+static void extents_kunit_exit(struct kunit *test)
+{
+ struct ext4_sb_info *sbi = k_ctx.k_ei->vfs_inode.i_sb->s_fs_info;
+
+ kfree(sbi);
+ kfree(k_ctx.k_ei);
+ kfree(k_ctx.k_data);
+}
+
+static int __ext4_ext_dirty_stub(const char *where, unsigned int line,
+ handle_t *handle, struct inode *inode,
+ struct ext4_ext_path *path)
+{
+ return 0;
+}
+
+static struct ext4_ext_path *
+ext4_ext_insert_extent_stub(handle_t *handle, struct inode *inode,
+ struct ext4_ext_path *path,
+ struct ext4_extent *newext, int gb_flags)
+{
+ return ERR_PTR(-ENOSPC);
+}
+
+/*
+ * We will zeroout the equivalent range in the data area
+ */
+static int ext4_ext_zeroout_stub(struct inode *inode, struct ext4_extent *ex)
+{
+ ext4_lblk_t ee_block, off_blk;
+ loff_t ee_len;
+ loff_t off_bytes;
+ struct kunit *test = kunit_get_current_test();
+
+ ee_block = le32_to_cpu(ex->ee_block);
+ ee_len = ext4_ext_get_actual_len(ex);
+
+ KUNIT_EXPECT_EQ_MSG(test, 1, ee_block >= EXT_DATA_LBLK, "ee_block=%d",
+ ee_block);
+ KUNIT_EXPECT_EQ(test, 1,
+ ee_block + ee_len <= EXT_DATA_LBLK + EXT_DATA_LEN);
+
+ off_blk = ee_block - EXT_DATA_LBLK;
+ off_bytes = off_blk << inode->i_sb->s_blocksize_bits;
+ memset(k_ctx.k_data + off_bytes, 0,
+ ee_len << inode->i_sb->s_blocksize_bits);
+
+ return 0;
+}
+
+static int ext4_issue_zeroout_stub(struct inode *inode, ext4_lblk_t lblk,
+ ext4_fsblk_t pblk, ext4_lblk_t len)
+{
+ ext4_lblk_t off_blk;
+ loff_t off_bytes;
+ struct kunit *test = kunit_get_current_test();
+
+ kunit_log(KERN_ALERT, test,
+ "%s: lblk=%u pblk=%llu len=%u", __func__, lblk, pblk, len);
+ KUNIT_EXPECT_EQ(test, 1, lblk >= EXT_DATA_LBLK);
+ KUNIT_EXPECT_EQ(test, 1, lblk + len <= EXT_DATA_LBLK + EXT_DATA_LEN);
+ KUNIT_EXPECT_EQ(test, 1, lblk - EXT_DATA_LBLK == pblk - EXT_DATA_PBLK);
+
+ off_blk = lblk - EXT_DATA_LBLK;
+ off_bytes = off_blk << inode->i_sb->s_blocksize_bits;
+ memset(k_ctx.k_data + off_bytes, 0,
+ len << inode->i_sb->s_blocksize_bits);
+
+ return 0;
+}
+
+static int extents_kunit_init(struct kunit *test)
+{
+ struct ext4_extent_header *eh = NULL;
+ struct ext4_inode_info *ei;
+ struct inode *inode;
+ struct super_block *sb;
+ struct ext4_sb_info *sbi = NULL;
+ struct kunit_ext_test_param *param =
+ (struct kunit_ext_test_param *)(test->param_value);
+ int err;
+
+ sb = sget(&ext_fs_type, NULL, ext_set, 0, NULL);
+ if (IS_ERR(sb))
+ return PTR_ERR(sb);
+
+ sb->s_blocksize = 4096;
+ sb->s_blocksize_bits = 12;
+
+ sbi = kzalloc(sizeof(struct ext4_sb_info), GFP_KERNEL);
+ if (sbi == NULL)
+ return -ENOMEM;
+
+ sbi->s_sb = sb;
+ sb->s_fs_info = sbi;
+
+ if (!param || !param->disable_zeroout)
+ sbi->s_extent_max_zeroout_kb = 32;
+
+ /* setup the mock inode */
+ k_ctx.k_ei = kzalloc(sizeof(struct ext4_inode_info), GFP_KERNEL);
+ if (k_ctx.k_ei == NULL)
+ return -ENOMEM;
+ ei = k_ctx.k_ei;
+ inode = &ei->vfs_inode;
+
+ err = ext4_es_register_shrinker(sbi);
+ if (err)
+ return err;
+
+ ext4_es_init_tree(&ei->i_es_tree);
+ rwlock_init(&ei->i_es_lock);
+ INIT_LIST_HEAD(&ei->i_es_list);
+ ei->i_es_all_nr = 0;
+ ei->i_es_shk_nr = 0;
+ ei->i_es_shrink_lblk = 0;
+
+ ei->i_disksize = (EXT_DATA_LBLK + EXT_DATA_LEN + 10)
+ << sb->s_blocksize_bits;
+ ei->i_flags = 0;
+ ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
+ inode->i_sb = sb;
+
+ k_ctx.k_data = kzalloc(EXT_DATA_LEN * 4096, GFP_KERNEL);
+ if (k_ctx.k_data == NULL)
+ return -ENOMEM;
+
+ /*
+ * set the data area to a junk value
+ */
+ memset(k_ctx.k_data, 'X', EXT_DATA_LEN * 4096);
+
+ /* create a tree with depth 0 */
+ eh = (struct ext4_extent_header *)k_ctx.k_ei->i_data;
+
+ /* Fill extent header */
+ eh = ext_inode_hdr(&k_ctx.k_ei->vfs_inode);
+ eh->eh_depth = 0;
+ eh->eh_entries = cpu_to_le16(1);
+ eh->eh_magic = EXT4_EXT_MAGIC;
+ eh->eh_max =
+ cpu_to_le16(ext4_ext_space_root_idx(&k_ctx.k_ei->vfs_inode, 0));
+ eh->eh_generation = 0;
+
+ /*
+ * add 1 extent in leaf node covering:
+ * - lblks: [EXT_DATA_LBLK, EXT_DATA_LBLK + EXT_DATA_LEN)
+ * - pblks: [EXT_DATA_PBLK, EXT_DATA_PBLK + EXT_DATA_LEN)
+ */
+ EXT_FIRST_EXTENT(eh)->ee_block = cpu_to_le32(EXT_DATA_LBLK);
+ EXT_FIRST_EXTENT(eh)->ee_len = cpu_to_le16(EXT_DATA_LEN);
+ ext4_ext_store_pblock(EXT_FIRST_EXTENT(eh), EXT_DATA_PBLK);
+ if (!param || param->is_unwrit_at_start)
+ ext4_ext_mark_unwritten(EXT_FIRST_EXTENT(eh));
+
+ ext4_es_insert_extent(inode, EXT_DATA_LBLK, EXT_DATA_LEN, EXT_DATA_PBLK,
+ ext4_ext_is_unwritten(EXT_FIRST_EXTENT(eh)) ?
+ EXTENT_STATUS_UNWRITTEN :
+ EXTENT_STATUS_WRITTEN,
+ 0);
+
+ /* Add stubs */
+ kunit_activate_static_stub(test, __ext4_ext_dirty,
+ __ext4_ext_dirty_stub);
+ kunit_activate_static_stub(test, ext4_ext_zeroout, ext4_ext_zeroout_stub);
+ kunit_activate_static_stub(test, ext4_issue_zeroout,
+ ext4_issue_zeroout_stub);
+ return 0;
+}
+
+/*
+ * Return 0 if all bytes in buf equal c, else log the first mismatch offset and return 1
+ */
+static int check_buffer(char *buf, int c, int size)
+{
+ void *ret = NULL;
+
+ ret = memchr_inv(buf, c, size);
+ if (ret == NULL)
+ return 0;
+
+ kunit_log(KERN_ALERT, kunit_get_current_test(),
+ "# %s: wrong char found at offset %u (expected:%d got:%d)", __func__,
+ (u32)((char *)ret - buf), c, *((char *)ret));
+ return 1;
+}
+
+/*
+ * Simulate a map block call by first calling ext4_map_query_blocks() to
+ * correctly populate map flags and pblk and then call the
+ * ext4_map_create_blocks() to do actual split and conversion. This is easier
+ * than calling ext4_map_blocks() because that needs mocking a lot of unrelated
+ * functions.
+ */
+static void ext4_map_create_blocks_helper(struct kunit *test,
+ struct inode *inode,
+ struct ext4_map_blocks *map,
+ int flags)
+{
+ int retval = 0;
+
+ retval = ext4_map_query_blocks(NULL, inode, map, flags);
+ if (retval < 0) {
+ KUNIT_FAIL(test,
+ "ext4_map_query_blocks() failed. Cannot proceed\n");
+ return;
+ }
+
+ ext4_map_create_blocks(NULL, inode, map, flags);
+}
+
+static void test_split_convert(struct kunit *test)
+{
+ struct ext4_ext_path *path;
+ struct inode *inode = &k_ctx.k_ei->vfs_inode;
+ struct ext4_extent *ex;
+ struct ext4_map_blocks map;
+ const struct kunit_ext_test_param *param =
+ (const struct kunit_ext_test_param *)(test->param_value);
+ int blkbits = inode->i_sb->s_blocksize_bits;
+
+ if (param->is_zeroout_test)
+ /*
+ * Force zeroout by making ext4_ext_insert_extent return ENOSPC
+ */
+ kunit_activate_static_stub(test, ext4_ext_insert_extent,
+ ext4_ext_insert_extent_stub);
+
+ path = ext4_find_extent(inode, EXT_DATA_LBLK, NULL, EXT4_EX_NOCACHE);
+ ex = path->p_ext;
+ KUNIT_EXPECT_EQ(test, EXT_DATA_LBLK, le32_to_cpu(ex->ee_block));
+ KUNIT_EXPECT_EQ(test, EXT_DATA_LEN, ext4_ext_get_actual_len(ex));
+ KUNIT_EXPECT_EQ(test, param->is_unwrit_at_start,
+ ext4_ext_is_unwritten(ex));
+ if (param->is_zeroout_test)
+ KUNIT_EXPECT_EQ(test, 0,
+ check_buffer(k_ctx.k_data, 'X',
+ EXT_DATA_LEN << blkbits));
+
+ map.m_lblk = param->split_map.m_lblk;
+ map.m_len = param->split_map.m_len;
+
+ switch (param->type) {
+ case TEST_SPLIT_CONVERT:
+ path = ext4_split_convert_extents(NULL, inode, &map, path,
+ param->split_flags, NULL);
+ break;
+ case TEST_CREATE_BLOCKS:
+ ext4_map_create_blocks_helper(test, inode, &map, param->split_flags);
+ break;
+ default:
+ KUNIT_FAIL(test, "param->type %d not support.", param->type);
+ }
+
+ path = ext4_find_extent(inode, EXT_DATA_LBLK, NULL, EXT4_EX_NOCACHE);
+ ex = path->p_ext;
+
+ for (int i = 0; i < param->nr_exp_ext; i++) {
+ struct kunit_ext_state exp_ext = param->exp_ext_state[i];
+ bool es_check_needed = param->type != TEST_SPLIT_CONVERT;
+ struct extent_status es;
+ int contains_ex, ex_end, es_end, es_pblk;
+
+ KUNIT_EXPECT_EQ(test, exp_ext.ex_lblk,
+ le32_to_cpu(ex->ee_block));
+ KUNIT_EXPECT_EQ(test, exp_ext.ex_len,
+ ext4_ext_get_actual_len(ex));
+ KUNIT_EXPECT_EQ(test, exp_ext.is_unwrit,
+ ext4_ext_is_unwritten(ex));
+ /*
+ * Confirm extent cache is in sync. Note that es cache can be
+ * merged even when on-disk extents are not so take that into
+ * account.
+ *
+ * Also, ext4_split_convert_extents() forces EXT4_EX_NOCACHE, hence
+ * the es status is ignored for that case.
+ */
+ if (es_check_needed) {
+ ext4_es_lookup_extent(inode, le32_to_cpu(ex->ee_block),
+ NULL, &es, NULL);
+
+ ex_end = exp_ext.ex_lblk + exp_ext.ex_len;
+ es_end = es.es_lblk + es.es_len;
+ contains_ex = es.es_lblk <= exp_ext.ex_lblk &&
+ es_end >= ex_end;
+ es_pblk = ext4_es_pblock(&es) +
+ (exp_ext.ex_lblk - es.es_lblk);
+
+ KUNIT_EXPECT_EQ(test, contains_ex, 1);
+ KUNIT_EXPECT_EQ(test, ext4_ext_pblock(ex), es_pblk);
+ KUNIT_EXPECT_EQ(test, 1,
+ (exp_ext.is_unwrit &&
+ ext4_es_is_unwritten(&es)) ||
+ (!exp_ext.is_unwrit &&
+ ext4_es_is_written(&es)));
+ }
+
+ /* Only printed on failure */
+ kunit_log(KERN_INFO, test,
+ "# [extent %d] exp: lblk:%d len:%d unwrit:%d \n", i,
+ exp_ext.ex_lblk, exp_ext.ex_len, exp_ext.is_unwrit);
+ kunit_log(KERN_INFO, test,
+ "# [extent %d] got: lblk:%d len:%d unwrit:%d\n", i,
+ le32_to_cpu(ex->ee_block),
+ ext4_ext_get_actual_len(ex),
+ ext4_ext_is_unwritten(ex));
+ if (es_check_needed)
+ kunit_log(
+ KERN_INFO, test,
+ "# [extent %d] es: lblk:%d len:%d pblk:%lld type:0x%x\n",
+ i, es.es_lblk, es.es_len, ext4_es_pblock(&es),
+ ext4_es_type(&es));
+ kunit_log(KERN_INFO, test, "------------------\n");
+
+ ex = ex + 1;
+ }
+
+ if (!param->is_zeroout_test)
+ return;
+
+ /*
+ * Check that the data area has been zeroed out correctly
+ */
+ for (int i = 0; i < param->nr_exp_data_segs; i++) {
+ loff_t off, len;
+ struct kunit_ext_data_state exp_data_seg = param->exp_data_state[i];
+
+ off = exp_data_seg.off_blk << blkbits;
+ len = exp_data_seg.len_blk << blkbits;
+ KUNIT_EXPECT_EQ_MSG(test, 0,
+ check_buffer(k_ctx.k_data + off,
+ exp_data_seg.exp_char, len),
+ "# corruption in byte range [%lld, %lld)",
+ off, len);
+ }
+
+ return;
+}
+
+static const struct kunit_ext_test_param test_split_convert_params[] = {
+ /* unwrit to writ splits */
+ { .desc = "split unwrit extent to 2 extents and convert 1st half writ",
+ .type = TEST_SPLIT_CONVERT,
+ .is_unwrit_at_start = 1,
+ .split_flags = EXT4_GET_BLOCKS_CONVERT,
+ .split_map = { .m_lblk = EXT_DATA_LBLK, .m_len = 1 },
+ .nr_exp_ext = 2,
+ .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK,
+ .ex_len = 1,
+ .is_unwrit = 0 },
+ { .ex_lblk = EXT_DATA_LBLK + 1,
+ .ex_len = EXT_DATA_LEN - 1,
+ .is_unwrit = 1 } },
+ .is_zeroout_test = 0 },
+ { .desc = "split unwrit extent to 2 extents and convert 2nd half writ",
+ .type = TEST_SPLIT_CONVERT,
+ .is_unwrit_at_start = 1,
+ .split_flags = EXT4_GET_BLOCKS_CONVERT,
+ .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 1 },
+ .nr_exp_ext = 2,
+ .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK,
+ .ex_len = 1,
+ .is_unwrit = 1 },
+ { .ex_lblk = EXT_DATA_LBLK + 1,
+ .ex_len = EXT_DATA_LEN - 1,
+ .is_unwrit = 0 } },
+ .is_zeroout_test = 0 },
+ { .desc = "split unwrit extent to 3 extents and convert 2nd half to writ",
+ .type = TEST_SPLIT_CONVERT,
+ .is_unwrit_at_start = 1,
+ .split_flags = EXT4_GET_BLOCKS_CONVERT,
+ .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 2 },
+ .nr_exp_ext = 3,
+ .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK,
+ .ex_len = 1,
+ .is_unwrit = 1 },
+ { .ex_lblk = EXT_DATA_LBLK + 1,
+ .ex_len = EXT_DATA_LEN - 2,
+ .is_unwrit = 0 },
+ { .ex_lblk = EXT_DATA_LBLK + 1 + (EXT_DATA_LEN - 2),
+ .ex_len = 1,
+ .is_unwrit = 1 } },
+ .is_zeroout_test = 0 },
+
+ /* writ to unwrit splits */
+ { .desc = "split writ extent to 2 extents and convert 1st half unwrit",
+ .type = TEST_SPLIT_CONVERT,
+ .is_unwrit_at_start = 0,
+ .split_flags = EXT4_GET_BLOCKS_CONVERT_UNWRITTEN,
+ .split_map = { .m_lblk = EXT_DATA_LBLK, .m_len = 1 },
+ .nr_exp_ext = 2,
+ .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK,
+ .ex_len = 1,
+ .is_unwrit = 1 },
+ { .ex_lblk = EXT_DATA_LBLK + 1,
+ .ex_len = EXT_DATA_LEN - 1,
+ .is_unwrit = 0 } },
+ .is_zeroout_test = 0 },
+ { .desc = "split writ extent to 2 extents and convert 2nd half unwrit",
+ .type = TEST_SPLIT_CONVERT,
+ .is_unwrit_at_start = 0,
+ .split_flags = EXT4_GET_BLOCKS_CONVERT_UNWRITTEN,
+ .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 1 },
+ .nr_exp_ext = 2,
+ .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK,
+ .ex_len = 1,
+ .is_unwrit = 0 },
+ { .ex_lblk = EXT_DATA_LBLK + 1,
+ .ex_len = EXT_DATA_LEN - 1,
+ .is_unwrit = 1 } },
+ .is_zeroout_test = 0 },
+ { .desc = "split writ extent to 3 extents and convert 2nd half to unwrit",
+ .type = TEST_SPLIT_CONVERT,
+ .is_unwrit_at_start = 0,
+ .split_flags = EXT4_GET_BLOCKS_CONVERT_UNWRITTEN,
+ .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 2 },
+ .nr_exp_ext = 3,
+ .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK,
+ .ex_len = 1,
+ .is_unwrit = 0 },
+ { .ex_lblk = EXT_DATA_LBLK + 1,
+ .ex_len = EXT_DATA_LEN - 2,
+ .is_unwrit = 1 },
+ { .ex_lblk = EXT_DATA_LBLK + 1 + (EXT_DATA_LEN - 2),
+ .ex_len = 1,
+ .is_unwrit = 0 } },
+ .is_zeroout_test = 0 },
+
+ /*
+ * ***** zeroout tests *****
+ */
+ /* unwrit to writ splits */
+ { .desc = "split unwrit extent to 2 extents and convert 1st half writ (zeroout)",
+ .type = TEST_SPLIT_CONVERT,
+ .is_unwrit_at_start = 1,
+ .split_flags = EXT4_GET_BLOCKS_CONVERT,
+ .split_map = { .m_lblk = EXT_DATA_LBLK, .m_len = 1 },
+ .nr_exp_ext = 1,
+ .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK,
+ .ex_len = EXT_DATA_LEN,
+ .is_unwrit = 0 } },
+ .is_zeroout_test = 1,
+ .nr_exp_data_segs = 2,
+ .exp_data_state = { { .exp_char = 'X', .off_blk = 0, .len_blk = 1 },
+ { .exp_char = 0,
+ .off_blk = 1,
+ .len_blk = EXT_DATA_LEN - 1 } } },
+ { .desc = "split unwrit extent to 2 extents and convert 2nd half writ (zeroout)",
+ .type = TEST_SPLIT_CONVERT,
+ .is_unwrit_at_start = 1,
+ .split_flags = EXT4_GET_BLOCKS_CONVERT,
+ .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 1 },
+ .nr_exp_ext = 1,
+ .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK,
+ .ex_len = EXT_DATA_LEN,
+ .is_unwrit = 0 } },
+ .is_zeroout_test = 1,
+ .nr_exp_data_segs = 2,
+ .exp_data_state = { { .exp_char = 0, .off_blk = 0, .len_blk = 1 },
+ { .exp_char = 'X',
+ .off_blk = 1,
+ .len_blk = EXT_DATA_LEN - 1 } } },
+ { .desc = "split unwrit extent to 3 extents and convert 2nd half writ (zeroout)",
+ .type = TEST_SPLIT_CONVERT,
+ .is_unwrit_at_start = 1,
+ .split_flags = EXT4_GET_BLOCKS_CONVERT,
+ .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 2 },
+ .nr_exp_ext = 1,
+ .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK,
+ .ex_len = EXT_DATA_LEN,
+ .is_unwrit = 0 } },
+ .is_zeroout_test = 1,
+ .nr_exp_data_segs = 3,
+ .exp_data_state = { { .exp_char = 0, .off_blk = 0, .len_blk = 1 },
+ { .exp_char = 'X', .off_blk = 1, .len_blk = EXT_DATA_LEN - 2 },
+ { .exp_char = 0, .off_blk = EXT_DATA_LEN - 1, .len_blk = 1 } } },
+
+ /* writ to unwrit splits */
+ { .desc = "split writ extent to 2 extents and convert 1st half unwrit (zeroout)",
+ .type = TEST_SPLIT_CONVERT,
+ .is_unwrit_at_start = 0,
+ .split_flags = EXT4_GET_BLOCKS_CONVERT_UNWRITTEN,
+ .split_map = { .m_lblk = EXT_DATA_LBLK, .m_len = 1 },
+ .nr_exp_ext = 1,
+ .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK,
+ .ex_len = EXT_DATA_LEN,
+ .is_unwrit = 0 } },
+ .is_zeroout_test = 1,
+ .nr_exp_data_segs = 2,
+ .exp_data_state = { { .exp_char = 0, .off_blk = 0, .len_blk = 1 },
+ { .exp_char = 'X',
+ .off_blk = 1,
+ .len_blk = EXT_DATA_LEN - 1 } } },
+ { .desc = "split writ extent to 2 extents and convert 2nd half unwrit (zeroout)",
+ .type = TEST_SPLIT_CONVERT,
+ .is_unwrit_at_start = 0,
+ .split_flags = EXT4_GET_BLOCKS_CONVERT_UNWRITTEN,
+ .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 1 },
+ .nr_exp_ext = 1,
+ .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK,
+ .ex_len = EXT_DATA_LEN,
+ .is_unwrit = 0 } },
+ .is_zeroout_test = 1,
+ .nr_exp_data_segs = 2,
+ .exp_data_state = { { .exp_char = 'X', .off_blk = 0, .len_blk = 1 },
+ { .exp_char = 0,
+ .off_blk = 1,
+ .len_blk = EXT_DATA_LEN - 1 } } },
+ { .desc = "split writ extent to 3 extents and convert 2nd half unwrit (zeroout)",
+ .type = TEST_SPLIT_CONVERT,
+ .is_unwrit_at_start = 0,
+ .split_flags = EXT4_GET_BLOCKS_CONVERT_UNWRITTEN,
+ .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 2 },
+ .nr_exp_ext = 1,
+ .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK,
+ .ex_len = EXT_DATA_LEN,
+ .is_unwrit = 0 } },
+ .is_zeroout_test = 1,
+ .nr_exp_data_segs = 3,
+ .exp_data_state = { { .exp_char = 'X', .off_blk = 0, .len_blk = 1 },
+ { .exp_char = 0,
+ .off_blk = 1,
+ .len_blk = EXT_DATA_LEN - 2 },
+ { .exp_char = 'X',
+ .off_blk = EXT_DATA_LEN - 1,
+ .len_blk = 1 } } },
+};
+
+/* Tests to trigger ext4_ext_map_blocks() -> convert_initialized_extent() */
+static const struct kunit_ext_test_param test_convert_initialized_params[] = {
+ /* writ to unwrit splits */
+ { .desc = "split writ extent to 2 extents and convert 1st half unwrit",
+ .type = TEST_CREATE_BLOCKS,
+ .split_flags = EXT4_GET_BLOCKS_CONVERT_UNWRITTEN,
+ .is_unwrit_at_start = 0,
+ .split_map = { .m_lblk = EXT_DATA_LBLK, .m_len = 1 },
+ .nr_exp_ext = 2,
+ .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK,
+ .ex_len = 1,
+ .is_unwrit = 1 },
+ { .ex_lblk = EXT_DATA_LBLK + 1,
+ .ex_len = EXT_DATA_LEN - 1,
+ .is_unwrit = 0 } },
+ .is_zeroout_test = 0 },
+ { .desc = "split writ extent to 2 extents and convert 2nd half unwrit",
+ .type = TEST_CREATE_BLOCKS,
+ .split_flags = EXT4_GET_BLOCKS_CONVERT_UNWRITTEN,
+ .is_unwrit_at_start = 0,
+ .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 1 },
+ .nr_exp_ext = 2,
+ .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK,
+ .ex_len = 1,
+ .is_unwrit = 0 },
+ { .ex_lblk = EXT_DATA_LBLK + 1,
+ .ex_len = EXT_DATA_LEN - 1,
+ .is_unwrit = 1 } },
+ .is_zeroout_test = 0 },
+ { .desc = "split writ extent to 3 extents and convert 2nd half to unwrit",
+ .type = TEST_CREATE_BLOCKS,
+ .split_flags = EXT4_GET_BLOCKS_CONVERT_UNWRITTEN,
+ .is_unwrit_at_start = 0,
+ .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 2 },
+ .nr_exp_ext = 3,
+ .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK,
+ .ex_len = 1,
+ .is_unwrit = 0 },
+ { .ex_lblk = EXT_DATA_LBLK + 1,
+ .ex_len = EXT_DATA_LEN - 2,
+ .is_unwrit = 1 },
+ { .ex_lblk = EXT_DATA_LBLK + 1 + (EXT_DATA_LEN - 2),
+ .ex_len = 1,
+ .is_unwrit = 0 } },
+ .is_zeroout_test = 0 },
+
+ /* writ to unwrit splits (zeroout) */
+ { .desc = "split writ extent to 2 extents and convert 1st half unwrit (zeroout)",
+ .type = TEST_CREATE_BLOCKS,
+ .is_unwrit_at_start = 0,
+ .split_flags = EXT4_GET_BLOCKS_CONVERT_UNWRITTEN,
+ .split_map = { .m_lblk = EXT_DATA_LBLK, .m_len = 1 },
+ .nr_exp_ext = 1,
+ .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK,
+ .ex_len = EXT_DATA_LEN,
+ .is_unwrit = 0 } },
+ .is_zeroout_test = 1,
+ .nr_exp_data_segs = 2,
+ .exp_data_state = { { .exp_char = 0, .off_blk = 0, .len_blk = 1 },
+ { .exp_char = 'X',
+ .off_blk = 1,
+ .len_blk = EXT_DATA_LEN - 1 } } },
+ { .desc = "split writ extent to 2 extents and convert 2nd half unwrit (zeroout)",
+ .type = TEST_CREATE_BLOCKS,
+ .is_unwrit_at_start = 0,
+ .split_flags = EXT4_GET_BLOCKS_CONVERT_UNWRITTEN,
+ .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 1 },
+ .nr_exp_ext = 1,
+ .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK,
+ .ex_len = EXT_DATA_LEN,
+ .is_unwrit = 0 } },
+ .is_zeroout_test = 1,
+ .nr_exp_data_segs = 2,
+ .exp_data_state = { { .exp_char = 'X', .off_blk = 0, .len_blk = 1 },
+ { .exp_char = 0,
+ .off_blk = 1,
+ .len_blk = EXT_DATA_LEN - 1 } } },
+ { .desc = "split writ extent to 3 extents and convert 2nd half unwrit (zeroout)",
+ .type = TEST_CREATE_BLOCKS,
+ .is_unwrit_at_start = 0,
+ .split_flags = EXT4_GET_BLOCKS_CONVERT_UNWRITTEN,
+ .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 2 },
+ .nr_exp_ext = 1,
+ .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK,
+ .ex_len = EXT_DATA_LEN,
+ .is_unwrit = 0 } },
+ .is_zeroout_test = 1,
+ .nr_exp_data_segs = 3,
+ .exp_data_state = { { .exp_char = 'X', .off_blk = 0, .len_blk = 1 },
+ { .exp_char = 0,
+ .off_blk = 1,
+ .len_blk = EXT_DATA_LEN - 2 },
+ { .exp_char = 'X',
+ .off_blk = EXT_DATA_LEN - 1,
+ .len_blk = 1 } } },
+};
+
+/* Tests to trigger ext4_ext_map_blocks() -> ext4_ext_handle_unwritten_extents() */
+static const struct kunit_ext_test_param test_handle_unwritten_params[] = {
+ /* unwrit to writ splits via endio path */
+ { .desc = "split unwrit extent to 2 extents and convert 1st half writ (endio)",
+ .type = TEST_CREATE_BLOCKS,
+ .is_unwrit_at_start = 1,
+ .split_flags = EXT4_GET_BLOCKS_CONVERT,
+ .split_map = { .m_lblk = EXT_DATA_LBLK, .m_len = 1 },
+ .nr_exp_ext = 2,
+ .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK,
+ .ex_len = 1,
+ .is_unwrit = 0 },
+ { .ex_lblk = EXT_DATA_LBLK + 1,
+ .ex_len = EXT_DATA_LEN - 1,
+ .is_unwrit = 1 } },
+ .is_zeroout_test = 0 },
+ { .desc = "split unwrit extent to 2 extents and convert 2nd half writ (endio)",
+ .type = TEST_CREATE_BLOCKS,
+ .is_unwrit_at_start = 1,
+ .split_flags = EXT4_GET_BLOCKS_CONVERT,
+ .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 1 },
+ .nr_exp_ext = 2,
+ .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK,
+ .ex_len = 1,
+ .is_unwrit = 1 },
+ { .ex_lblk = EXT_DATA_LBLK + 1,
+ .ex_len = EXT_DATA_LEN - 1,
+ .is_unwrit = 0 } },
+ .is_zeroout_test = 0 },
+ { .desc = "split unwrit extent to 3 extents and convert 2nd half to writ (endio)",
+ .type = TEST_CREATE_BLOCKS,
+ .is_unwrit_at_start = 1,
+ .split_flags = EXT4_GET_BLOCKS_CONVERT,
+ .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 2 },
+ .nr_exp_ext = 3,
+ .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK,
+ .ex_len = 1,
+ .is_unwrit = 1 },
+ { .ex_lblk = EXT_DATA_LBLK + 1,
+ .ex_len = EXT_DATA_LEN - 2,
+ .is_unwrit = 0 },
+ { .ex_lblk = EXT_DATA_LBLK + 1 + (EXT_DATA_LEN - 2),
+ .ex_len = 1,
+ .is_unwrit = 1 } },
+ .is_zeroout_test = 0 },
+
+ /* unwrit to writ splits via non-endio path */
+ { .desc = "split unwrit extent to 2 extents and convert 1st half writ (non endio)",
+ .type = TEST_CREATE_BLOCKS,
+ .is_unwrit_at_start = 1,
+ .split_flags = EXT4_GET_BLOCKS_CREATE,
+ .split_map = { .m_lblk = EXT_DATA_LBLK, .m_len = 1 },
+ .nr_exp_ext = 2,
+ .disable_zeroout = true,
+ .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK,
+ .ex_len = 1,
+ .is_unwrit = 0 },
+ { .ex_lblk = EXT_DATA_LBLK + 1,
+ .ex_len = EXT_DATA_LEN - 1,
+ .is_unwrit = 1 } },
+ .is_zeroout_test = 0 },
+ { .desc = "split unwrit extent to 2 extents and convert 2nd half writ (non endio)",
+ .type = TEST_CREATE_BLOCKS,
+ .is_unwrit_at_start = 1,
+ .split_flags = EXT4_GET_BLOCKS_CREATE,
+ .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 1 },
+ .nr_exp_ext = 2,
+ .disable_zeroout = true,
+ .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK,
+ .ex_len = 1,
+ .is_unwrit = 1 },
+ { .ex_lblk = EXT_DATA_LBLK + 1,
+ .ex_len = EXT_DATA_LEN - 1,
+ .is_unwrit = 0 } },
+ .is_zeroout_test = 0 },
+ { .desc = "split unwrit extent to 3 extents and convert 2nd half to writ (non endio)",
+ .type = TEST_CREATE_BLOCKS,
+ .is_unwrit_at_start = 1,
+ .split_flags = EXT4_GET_BLOCKS_CREATE,
+ .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 2 },
+ .nr_exp_ext = 3,
+ .disable_zeroout = true,
+ .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK,
+ .ex_len = 1,
+ .is_unwrit = 1 },
+ { .ex_lblk = EXT_DATA_LBLK + 1,
+ .ex_len = EXT_DATA_LEN - 2,
+ .is_unwrit = 0 },
+ { .ex_lblk = EXT_DATA_LBLK + 1 + (EXT_DATA_LEN - 2),
+ .ex_len = 1,
+ .is_unwrit = 1 } },
+ .is_zeroout_test = 0 },
+
+ /*
+ * ***** zeroout tests *****
+ */
+ /* unwrit to writ splits (endio)*/
+ { .desc = "split unwrit extent to 2 extents and convert 1st half writ (endio, zeroout)",
+ .type = TEST_CREATE_BLOCKS,
+ .is_unwrit_at_start = 1,
+ .split_flags = EXT4_GET_BLOCKS_CONVERT,
+ .split_map = { .m_lblk = EXT_DATA_LBLK, .m_len = 1 },
+ .nr_exp_ext = 1,
+ .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK,
+ .ex_len = EXT_DATA_LEN,
+ .is_unwrit = 0 } },
+ .is_zeroout_test = 1,
+ .nr_exp_data_segs = 2,
+ .exp_data_state = { { .exp_char = 'X', .off_blk = 0, .len_blk = 1 },
+ { .exp_char = 0,
+ .off_blk = 1,
+ .len_blk = EXT_DATA_LEN - 1 } } },
+ { .desc = "split unwrit extent to 2 extents and convert 2nd half writ (endio, zeroout)",
+ .type = TEST_CREATE_BLOCKS,
+ .is_unwrit_at_start = 1,
+ .split_flags = EXT4_GET_BLOCKS_CONVERT,
+ .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 1 },
+ .nr_exp_ext = 1,
+ .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK,
+ .ex_len = EXT_DATA_LEN,
+ .is_unwrit = 0 } },
+ .is_zeroout_test = 1,
+ .nr_exp_data_segs = 2,
+ .exp_data_state = { { .exp_char = 0, .off_blk = 0, .len_blk = 1 },
+ { .exp_char = 'X',
+ .off_blk = 1,
+ .len_blk = EXT_DATA_LEN - 1 } } },
+ { .desc = "split unwrit extent to 3 extents and convert 2nd half writ (endio, zeroout)",
+ .type = TEST_CREATE_BLOCKS,
+ .is_unwrit_at_start = 1,
+ .split_flags = EXT4_GET_BLOCKS_CONVERT,
+ .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 2 },
+ .nr_exp_ext = 1,
+ .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK,
+ .ex_len = EXT_DATA_LEN,
+ .is_unwrit = 0 } },
+ .is_zeroout_test = 1,
+ .nr_exp_data_segs = 3,
+ .exp_data_state = { { .exp_char = 0, .off_blk = 0, .len_blk = 1 },
+ { .exp_char = 'X',
+ .off_blk = 1,
+ .len_blk = EXT_DATA_LEN - 2 },
+ { .exp_char = 0,
+ .off_blk = EXT_DATA_LEN - 1,
+ .len_blk = 1 } } },
+
+ /* unwrit to writ splits (non-endio)*/
+ { .desc = "split unwrit extent to 2 extents and convert 1st half writ (non-endio, zeroout)",
+ .type = TEST_CREATE_BLOCKS,
+ .is_unwrit_at_start = 1,
+ .split_flags = EXT4_GET_BLOCKS_CREATE,
+ .split_map = { .m_lblk = EXT_DATA_LBLK, .m_len = 1 },
+ .nr_exp_ext = 1,
+ .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK,
+ .ex_len = EXT_DATA_LEN,
+ .is_unwrit = 0 } },
+ .is_zeroout_test = 1,
+ .nr_exp_data_segs = 2,
+ .exp_data_state = { { .exp_char = 'X', .off_blk = 0, .len_blk = 1 },
+ { .exp_char = 0,
+ .off_blk = 1,
+ .len_blk = EXT_DATA_LEN - 1 } } },
+ { .desc = "split unwrit extent to 2 extents and convert 2nd half writ (non-endio, zeroout)",
+ .type = TEST_CREATE_BLOCKS,
+ .is_unwrit_at_start = 1,
+ .split_flags = EXT4_GET_BLOCKS_CREATE,
+ .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 1 },
+ .nr_exp_ext = 1,
+ .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK,
+ .ex_len = EXT_DATA_LEN,
+ .is_unwrit = 0 } },
+ .is_zeroout_test = 1,
+ .nr_exp_data_segs = 2,
+ .exp_data_state = { { .exp_char = 0, .off_blk = 0, .len_blk = 1 },
+ { .exp_char = 'X',
+ .off_blk = 1,
+ .len_blk = EXT_DATA_LEN - 1 } } },
+ { .desc = "split unwrit extent to 3 extents and convert 2nd half writ (non-endio, zeroout)",
+ .type = TEST_CREATE_BLOCKS,
+ .is_unwrit_at_start = 1,
+ .split_flags = EXT4_GET_BLOCKS_CREATE,
+ .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 2 },
+ .nr_exp_ext = 1,
+ .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK,
+ .ex_len = EXT_DATA_LEN,
+ .is_unwrit = 0 } },
+ .is_zeroout_test = 1,
+ .nr_exp_data_segs = 3,
+ .exp_data_state = { { .exp_char = 0, .off_blk = 0, .len_blk = 1 },
+ { .exp_char = 'X',
+ .off_blk = 1,
+ .len_blk = EXT_DATA_LEN - 2 },
+ { .exp_char = 0,
+ .off_blk = EXT_DATA_LEN - 1,
+ .len_blk = 1 } } },
+};
+
+static void ext_get_desc(struct kunit *test, const void *p, char *desc)
+
+{
+ struct kunit_ext_test_param *param = (struct kunit_ext_test_param *)p;
+
+ snprintf(desc, KUNIT_PARAM_DESC_SIZE, "%s %s\n", param->desc,
+ (param->type & TEST_CREATE_BLOCKS) ? "(highlevel)" : "");
+}
+
+static int test_split_convert_param_init(struct kunit *test)
+{
+ size_t arr_size = ARRAY_SIZE(test_split_convert_params);
+
+ kunit_register_params_array(test, test_split_convert_params, arr_size,
+ ext_get_desc);
+ return 0;
+}
+
+static int test_convert_initialized_param_init(struct kunit *test)
+{
+ size_t arr_size = ARRAY_SIZE(test_convert_initialized_params);
+
+ kunit_register_params_array(test, test_convert_initialized_params,
+ arr_size, ext_get_desc);
+ return 0;
+}
+
+static int test_handle_unwritten_init(struct kunit *test)
+{
+ size_t arr_size = ARRAY_SIZE(test_handle_unwritten_params);
+
+ kunit_register_params_array(test, test_handle_unwritten_params,
+ arr_size, ext_get_desc);
+ return 0;
+}
+
+/*
+ * Note that we use KUNIT_CASE_PARAM_WITH_INIT() instead of the more compact
+ * KUNIT_ARRAY_PARAM() because the latter currently has a limitation causing the
+ * output parsing to be prone to error. For more context:
+ *
+ * https://lore.kernel.org/linux-kselftest/aULJpTvJDw9ctUDe@li-dc0c254c-257c-11b2-a85c-98b6c1322444.ibm.com/
+ */
+static struct kunit_case extents_test_cases[] = {
+ KUNIT_CASE_PARAM_WITH_INIT(test_split_convert, kunit_array_gen_params,
+ test_split_convert_param_init, NULL),
+ KUNIT_CASE_PARAM_WITH_INIT(test_split_convert, kunit_array_gen_params,
+ test_convert_initialized_param_init, NULL),
+ KUNIT_CASE_PARAM_WITH_INIT(test_split_convert, kunit_array_gen_params,
+ test_handle_unwritten_init, NULL),
+ {}
+};
+
+static struct kunit_suite extents_test_suite = {
+ .name = "ext4_extents_test",
+ .init = extents_kunit_init,
+ .exit = extents_kunit_exit,
+ .test_cases = extents_test_cases,
+};
+
+kunit_test_suites(&extents_test_suite);
+
+MODULE_LICENSE("GPL");
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 2cf5759..3630b27 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -32,6 +32,7 @@
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "xattr.h"
+#include <kunit/static_stub.h>
#include <trace/events/ext4.h>
@@ -40,11 +41,9 @@
*/
#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \
due to ENOSPC */
-#define EXT4_EXT_MARK_UNWRIT1 0x2 /* mark first half unwritten */
-#define EXT4_EXT_MARK_UNWRIT2 0x4 /* mark second half unwritten */
-
-#define EXT4_EXT_DATA_VALID1 0x8 /* first half contains valid data */
-#define EXT4_EXT_DATA_VALID2 0x10 /* second half contains valid data */
+static struct ext4_ext_path *ext4_split_convert_extents(
+ handle_t *handle, struct inode *inode, struct ext4_map_blocks *map,
+ struct ext4_ext_path *path, int flags, unsigned int *allocated);
static __le32 ext4_extent_block_csum(struct inode *inode,
struct ext4_extent_header *eh)
@@ -86,8 +85,7 @@ static void ext4_extent_block_csum_set(struct inode *inode,
static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle,
struct inode *inode,
struct ext4_ext_path *path,
- ext4_lblk_t split,
- int split_flag, int flags);
+ ext4_lblk_t split, int flags);
static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
{
@@ -192,6 +190,9 @@ static int __ext4_ext_dirty(const char *where, unsigned int line,
{
int err;
+ KUNIT_STATIC_STUB_REDIRECT(__ext4_ext_dirty, where, line, handle, inode,
+ path);
+
WARN_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
if (path->p_bh) {
ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh));
@@ -332,15 +333,12 @@ ext4_force_split_extent_at(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path, ext4_lblk_t lblk,
int nofail)
{
- int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext);
int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_SPLIT_NOMERGE;
if (nofail)
flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_EX_NOFAIL;
- return ext4_split_extent_at(handle, inode, path, lblk, unwritten ?
- EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0,
- flags);
+ return ext4_split_extent_at(handle, inode, path, lblk, flags);
}
static int
@@ -530,6 +528,8 @@ static void ext4_cache_extents(struct inode *inode,
ext4_lblk_t prev = 0;
int i;
+ KUNIT_STATIC_STUB_REDIRECT(ext4_cache_extents, inode, eh);
+
for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) {
unsigned int status = EXTENT_STATUS_WRITTEN;
ext4_lblk_t lblk = le32_to_cpu(ex->ee_block);
@@ -893,6 +893,8 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block,
int ret;
gfp_t gfp_flags = GFP_NOFS;
+ KUNIT_STATIC_STUB_REDIRECT(ext4_find_extent, inode, block, path, flags);
+
if (flags & EXT4_EX_NOFAIL)
gfp_flags |= __GFP_NOFAIL;
@@ -1985,6 +1987,9 @@ ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
ext4_lblk_t next;
int mb_flags = 0, unwritten;
+ KUNIT_STATIC_STUB_REDIRECT(ext4_ext_insert_extent, handle, inode, path,
+ newext, gb_flags);
+
if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
mb_flags |= EXT4_MB_DELALLOC_RESERVED;
if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
@@ -2944,10 +2949,6 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
} else {
path = kcalloc(depth + 1, sizeof(struct ext4_ext_path),
GFP_NOFS | __GFP_NOFAIL);
- if (path == NULL) {
- ext4_journal_stop(handle);
- return -ENOMEM;
- }
path[0].p_maxdepth = path[0].p_depth = depth;
path[0].p_hdr = ext_inode_hdr(inode);
i = 0;
@@ -3133,8 +3134,8 @@ static void ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex)
ext4_fsblk_t ee_pblock;
unsigned int ee_len;
- ee_block = le32_to_cpu(ex->ee_block);
- ee_len = ext4_ext_get_actual_len(ex);
+ ee_block = le32_to_cpu(ex->ee_block);
+ ee_len = ext4_ext_get_actual_len(ex);
ee_pblock = ext4_ext_pblock(ex);
if (ee_len == 0)
@@ -3150,6 +3151,8 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
ext4_fsblk_t ee_pblock;
unsigned int ee_len;
+ KUNIT_STATIC_STUB_REDIRECT(ext4_ext_zeroout, inode, ex);
+
ee_len = ext4_ext_get_actual_len(ex);
ee_pblock = ext4_ext_pblock(ex);
return ext4_issue_zeroout(inode, le32_to_cpu(ex->ee_block), ee_pblock,
@@ -3163,35 +3166,30 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
* @inode: the file inode
* @path: the path to the extent
* @split: the logical block where the extent is splitted.
- * @split_flags: indicates if the extent could be zeroout if split fails, and
- * the states(init or unwritten) of new extents.
* @flags: flags used to insert new extent to extent tree.
*
*
* Splits extent [a, b] into two extents [a, @split) and [@split, b], states
- * of which are determined by split_flag.
+ * of which are same as the original extent. No conversion is performed.
*
- * There are two cases:
- * a> the extent are splitted into two extent.
- * b> split is not needed, and just mark the extent.
- *
- * Return an extent path pointer on success, or an error pointer on failure.
+ * Return an extent path pointer on success, or an error pointer on failure. On
+ * failure, the extent is restored to original state.
*/
static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle,
struct inode *inode,
struct ext4_ext_path *path,
ext4_lblk_t split,
- int split_flag, int flags)
+ int flags)
{
ext4_fsblk_t newblock;
ext4_lblk_t ee_block;
- struct ext4_extent *ex, newex, orig_ex, zero_ex;
+ struct ext4_extent *ex, newex, orig_ex;
struct ext4_extent *ex2 = NULL;
unsigned int ee_len, depth;
- int err = 0;
+ int err = 0, insert_err = 0, is_unwrit = 0;
- BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) ==
- (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2));
+ /* Do not cache extents that are in the process of being modified. */
+ flags |= EXT4_EX_NOCACHE;
ext_debug(inode, "logical block %llu\n", (unsigned long long)split);
@@ -3202,39 +3200,24 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle,
ee_block = le32_to_cpu(ex->ee_block);
ee_len = ext4_ext_get_actual_len(ex);
newblock = split - ee_block + ext4_ext_pblock(ex);
+ is_unwrit = ext4_ext_is_unwritten(ex);
BUG_ON(split < ee_block || split >= (ee_block + ee_len));
- BUG_ON(!ext4_ext_is_unwritten(ex) &&
- split_flag & (EXT4_EXT_MAY_ZEROOUT |
- EXT4_EXT_MARK_UNWRIT1 |
- EXT4_EXT_MARK_UNWRIT2));
+
+ /*
+ * No split needed
+ */
+ if (split == ee_block)
+ goto out;
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
goto out;
- if (split == ee_block) {
- /*
- * case b: block @split is the block that the extent begins with
- * then we just change the state of the extent, and splitting
- * is not needed.
- */
- if (split_flag & EXT4_EXT_MARK_UNWRIT2)
- ext4_ext_mark_unwritten(ex);
- else
- ext4_ext_mark_initialized(ex);
-
- if (!(flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE))
- ext4_ext_try_to_merge(handle, inode, path, ex);
-
- err = ext4_ext_dirty(handle, inode, path + path->p_depth);
- goto out;
- }
-
/* case a */
memcpy(&orig_ex, ex, sizeof(orig_ex));
ex->ee_len = cpu_to_le16(split - ee_block);
- if (split_flag & EXT4_EXT_MARK_UNWRIT1)
+ if (is_unwrit)
ext4_ext_mark_unwritten(ex);
/*
@@ -3249,17 +3232,16 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle,
ex2->ee_block = cpu_to_le32(split);
ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block));
ext4_ext_store_pblock(ex2, newblock);
- if (split_flag & EXT4_EXT_MARK_UNWRIT2)
+ if (is_unwrit)
ext4_ext_mark_unwritten(ex2);
path = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
if (!IS_ERR(path))
- goto out;
-
- err = PTR_ERR(path);
- if (err != -ENOSPC && err != -EDQUOT && err != -ENOMEM)
return path;
+ insert_err = PTR_ERR(path);
+ err = 0;
+
/*
* Get a new path to try to zeroout or fix the extent length.
* Using EXT4_EX_NOFAIL guarantees that ext4_find_extent()
@@ -3272,70 +3254,124 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle,
if (IS_ERR(path)) {
EXT4_ERROR_INODE(inode, "Failed split extent on %u, err %ld",
split, PTR_ERR(path));
- return path;
+ goto out_path;
}
+
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ goto out;
+
depth = ext_depth(inode);
ex = path[depth].p_ext;
- if (EXT4_EXT_MAY_ZEROOUT & split_flag) {
- if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) {
- if (split_flag & EXT4_EXT_DATA_VALID1) {
- err = ext4_ext_zeroout(inode, ex2);
- zero_ex.ee_block = ex2->ee_block;
- zero_ex.ee_len = cpu_to_le16(
- ext4_ext_get_actual_len(ex2));
- ext4_ext_store_pblock(&zero_ex,
- ext4_ext_pblock(ex2));
- } else {
- err = ext4_ext_zeroout(inode, ex);
- zero_ex.ee_block = ex->ee_block;
- zero_ex.ee_len = cpu_to_le16(
- ext4_ext_get_actual_len(ex));
- ext4_ext_store_pblock(&zero_ex,
- ext4_ext_pblock(ex));
- }
- } else {
- err = ext4_ext_zeroout(inode, &orig_ex);
- zero_ex.ee_block = orig_ex.ee_block;
- zero_ex.ee_len = cpu_to_le16(
- ext4_ext_get_actual_len(&orig_ex));
- ext4_ext_store_pblock(&zero_ex,
- ext4_ext_pblock(&orig_ex));
- }
-
- if (!err) {
- /* update the extent length and mark as initialized */
- ex->ee_len = cpu_to_le16(ee_len);
- ext4_ext_try_to_merge(handle, inode, path, ex);
- err = ext4_ext_dirty(handle, inode, path + path->p_depth);
- if (!err)
- /* update extent status tree */
- ext4_zeroout_es(inode, &zero_ex);
- /* If we failed at this point, we don't know in which
- * state the extent tree exactly is so don't try to fix
- * length of the original extent as it may do even more
- * damage.
- */
- goto out;
- }
- }
-
fix_extent_len:
ex->ee_len = orig_ex.ee_len;
- /*
- * Ignore ext4_ext_dirty return value since we are already in error path
- * and err is a non-zero error code.
- */
- ext4_ext_dirty(handle, inode, path + path->p_depth);
+ err = ext4_ext_dirty(handle, inode, path + path->p_depth);
out:
- if (err) {
+ if (err || insert_err) {
ext4_free_ext_path(path);
- path = ERR_PTR(err);
+ path = err ? ERR_PTR(err) : ERR_PTR(insert_err);
}
+out_path:
+ if (IS_ERR(path))
+ /* Remove all remaining potentially stale extents. */
+ ext4_es_remove_extent(inode, ee_block, ee_len);
ext4_ext_show_leaf(inode, path);
return path;
}
+/*
+ * ext4_split_extent_zeroout() - zeroout fallback for a failed extent split
+ *
+ * @handle: handle passed to ext4_ext_get_access() and ext4_ext_dirty()
+ * @inode:  inode the extent belongs to
+ * @path:   extent path; path[ext_depth(inode)].p_ext is the extent to handle
+ * @map:    logical range the caller wanted converted
+ * @flags:  must contain EXT4_GET_BLOCKS_CONVERT or
+ *          EXT4_GET_BLOCKS_CONVERT_UNWRITTEN
+ *
+ * Instead of splitting the extent around @map, zero out part of it on disk
+ * and mark the whole extent initialized:
+ *  - EXT4_GET_BLOCKS_CONVERT: the extent must be unwritten; every block of
+ *    the extent outside of @map is zeroed.
+ *  - EXT4_GET_BLOCKS_CONVERT_UNWRITTEN: the extent must be initialized; only
+ *    the @map range is zeroed.
+ *
+ * Returns 0 on success, -EINVAL if the extent state or @flags are unexpected,
+ * or the error returned by ext4_issue_zeroout(), ext4_ext_get_access() or
+ * ext4_ext_dirty().
+ */
+static int ext4_split_extent_zeroout(handle_t *handle, struct inode *inode,
+				     struct ext4_ext_path *path,
+				     struct ext4_map_blocks *map, int flags)
+{
+	struct ext4_extent *ex;
+	unsigned int ee_len, depth;
+	ext4_lblk_t ee_block;
+	uint64_t lblk, pblk, len;
+	int is_unwrit;
+	int err;
+
+	depth = ext_depth(inode);
+	ex = path[depth].p_ext;
+	ee_block = le32_to_cpu(ex->ee_block);
+	ee_len = ext4_ext_get_actual_len(ex);
+	is_unwrit = ext4_ext_is_unwritten(ex);
+
+	if (flags & EXT4_GET_BLOCKS_CONVERT) {
+		/*
+		 * EXT4_GET_BLOCKS_CONVERT: Caller wants the range specified by
+		 * map to be initialized. Zeroout everything except the map
+		 * range.
+		 */
+		loff_t map_end = (loff_t) map->m_lblk + map->m_len;
+		loff_t ex_end = (loff_t) ee_block + ee_len;
+
+		if (!is_unwrit) {
+			/* Shouldn't happen. Just exit */
+			return -EINVAL;
+		}
+
+		/* zeroout left */
+		if (map->m_lblk > ee_block) {
+			lblk = ee_block;
+			len = map->m_lblk - ee_block;
+			pblk = ext4_ext_pblock(ex);
+			err = ext4_issue_zeroout(inode, lblk, pblk, len);
+			if (err)
+				return err;
+		}
+
+		/* zeroout right */
+		if (map_end < ex_end) {
+			lblk = map_end;
+			len = ex_end - map_end;
+			pblk = ext4_ext_pblock(ex) + (map_end - ee_block);
+			err = ext4_issue_zeroout(inode, lblk, pblk, len);
+			if (err)
+				return err;
+		}
+	} else if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
+		/*
+		 * EXT4_GET_BLOCKS_CONVERT_UNWRITTEN: Caller wants the
+		 * range specified by map to be marked unwritten.
+		 * Zeroout the map range leaving rest as it is.
+		 */
+		if (is_unwrit) {
+			/* Shouldn't happen. Just exit */
+			return -EINVAL;
+		}
+
+		lblk = map->m_lblk;
+		len = map->m_len;
+		pblk = ext4_ext_pblock(ex) + (map->m_lblk - ee_block);
+		err = ext4_issue_zeroout(inode, lblk, pblk, len);
+		if (err)
+			return err;
+	} else {
+		/*
+		 * We no longer perform unwritten to unwritten splits in IO
+		 * paths. Hence this should not happen.
+		 */
+		WARN_ON_ONCE(true);
+		return -EINVAL;
+	}
+
+	err = ext4_ext_get_access(handle, inode, path + depth);
+	if (err)
+		return err;
+
+	ext4_ext_mark_initialized(ex);
+
+	/*
+	 * Report a failure to dirty the leaf to the caller; the result must
+	 * not be dropped, otherwise the conversion is reported as successful
+	 * even though the modified extent was never marked dirty.
+	 */
+	return ext4_ext_dirty(handle, inode, path + depth);
+}
+
/*
* ext4_split_extent() splits an extent and mark extent which is covered
* by @map as split_flags indicates
@@ -3352,13 +3388,13 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle,
struct ext4_ext_path *path,
struct ext4_map_blocks *map,
int split_flag, int flags,
- unsigned int *allocated)
+ unsigned int *allocated, bool *did_zeroout)
{
- ext4_lblk_t ee_block;
+ ext4_lblk_t ee_block, orig_ee_block;
struct ext4_extent *ex;
- unsigned int ee_len, depth;
- int unwritten;
- int split_flag1, flags1;
+ unsigned int ee_len, orig_ee_len, depth;
+ int unwritten, orig_unwritten;
+ int orig_err = 0;
depth = ext_depth(inode);
ex = path[depth].p_ext;
@@ -3366,25 +3402,27 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle,
ee_len = ext4_ext_get_actual_len(ex);
unwritten = ext4_ext_is_unwritten(ex);
+ orig_ee_block = ee_block;
+ orig_ee_len = ee_len;
+ orig_unwritten = unwritten;
+
+ /* Do not cache extents that are in the process of being modified. */
+ flags |= EXT4_EX_NOCACHE;
+
if (map->m_lblk + map->m_len < ee_block + ee_len) {
- split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
- flags1 = flags | EXT4_GET_BLOCKS_SPLIT_NOMERGE;
- if (unwritten)
- split_flag1 |= EXT4_EXT_MARK_UNWRIT1 |
- EXT4_EXT_MARK_UNWRIT2;
- if (split_flag & EXT4_EXT_DATA_VALID2)
- split_flag1 |= EXT4_EXT_DATA_VALID1;
path = ext4_split_extent_at(handle, inode, path,
- map->m_lblk + map->m_len, split_flag1, flags1);
+ map->m_lblk + map->m_len, flags);
if (IS_ERR(path))
- return path;
+ goto try_zeroout;
+
/*
* Update path is required because previous ext4_split_extent_at
* may result in split of original leaf or extent zeroout.
*/
path = ext4_find_extent(inode, map->m_lblk, path, flags);
if (IS_ERR(path))
- return path;
+ goto try_zeroout;
+
depth = ext_depth(inode);
ex = path[depth].p_ext;
if (!ex) {
@@ -3393,22 +3431,69 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle,
ext4_free_ext_path(path);
return ERR_PTR(-EFSCORRUPTED);
}
- unwritten = ext4_ext_is_unwritten(ex);
+
+ /* extent would have changed so update original values */
+ orig_ee_block = le32_to_cpu(ex->ee_block);
+ orig_ee_len = ext4_ext_get_actual_len(ex);
+ orig_unwritten = ext4_ext_is_unwritten(ex);
}
if (map->m_lblk >= ee_block) {
- split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
- if (unwritten) {
- split_flag1 |= EXT4_EXT_MARK_UNWRIT1;
- split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
- EXT4_EXT_MARK_UNWRIT2);
- }
- path = ext4_split_extent_at(handle, inode, path,
- map->m_lblk, split_flag1, flags);
+ path = ext4_split_extent_at(handle, inode, path, map->m_lblk,
+ flags);
if (IS_ERR(path))
- return path;
+ goto try_zeroout;
}
+ goto success;
+
+try_zeroout:
+ /*
+ * There was an error in splitting the extent. So instead, just zeroout
+ * unwritten portions and convert it to initialized as a last resort. If
+ * there is any failure here we just return the original error
+ */
+
+ orig_err = PTR_ERR(path);
+ if (orig_err != -ENOSPC && orig_err != -EDQUOT && orig_err != -ENOMEM)
+ goto out_orig_err;
+
+ /* we can't zeroout? just return the original err */
+ if (!(split_flag & EXT4_EXT_MAY_ZEROOUT))
+ goto out_orig_err;
+
+ if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
+ int max_zeroout_blks =
+ EXT4_SB(inode->i_sb)->s_extent_max_zeroout_kb >>
+ (inode->i_sb->s_blocksize_bits - 10);
+
+ if (map->m_len > max_zeroout_blks)
+ goto out_orig_err;
+ }
+
+ path = ext4_find_extent(inode, map->m_lblk, NULL, flags);
+ if (IS_ERR(path))
+ goto out_orig_err;
+
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+ ee_block = le32_to_cpu(ex->ee_block);
+ ee_len = ext4_ext_get_actual_len(ex);
+ unwritten = ext4_ext_is_unwritten(ex);
+
+ /* extent to zeroout should have been unchanged but it's not */
+ if (WARN_ON(ee_block != orig_ee_block || ee_len != orig_ee_len ||
+ unwritten != orig_unwritten))
+ goto out_free_path;
+
+ if (ext4_split_extent_zeroout(handle, inode, path, map, flags))
+ goto out_free_path;
+
+ /* zeroout succeeded */
+ if (did_zeroout)
+ *did_zeroout = true;
+
+success:
if (allocated) {
if (map->m_lblk + map->m_len > ee_block + ee_len)
*allocated = ee_len - (map->m_lblk - ee_block);
@@ -3417,6 +3502,12 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle,
}
ext4_ext_show_leaf(inode, path);
return path;
+
+out_free_path:
+ ext4_free_ext_path(path);
+out_orig_err:
+ return ERR_PTR(orig_err);
+
}
/*
@@ -3452,7 +3543,6 @@ ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode,
ext4_lblk_t ee_block, eof_block;
unsigned int ee_len, depth, map_len = map->m_len;
int err = 0;
- int split_flag = EXT4_EXT_DATA_VALID2;
unsigned int max_zeroout = 0;
ext_debug(inode, "logical block %llu, max_blocks %u\n",
@@ -3604,9 +3694,7 @@ ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode,
* It is safe to convert extent to initialized via explicit
* zeroout only if extent is fully inside i_size or new_size.
*/
- split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
-
- if (EXT4_EXT_MAY_ZEROOUT & split_flag)
+ if (ee_block + ee_len <= eof_block)
max_zeroout = sbi->s_extent_max_zeroout_kb >>
(inode->i_sb->s_blocksize_bits - 10);
@@ -3661,8 +3749,8 @@ ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode,
}
fallback:
- path = ext4_split_extent(handle, inode, path, &split_map, split_flag,
- flags, NULL);
+ path = ext4_split_convert_extents(handle, inode, &split_map, path,
+ flags | EXT4_GET_BLOCKS_CONVERT, NULL);
if (IS_ERR(path))
return path;
out:
@@ -3712,7 +3800,8 @@ static struct ext4_ext_path *ext4_split_convert_extents(handle_t *handle,
ext4_lblk_t ee_block;
struct ext4_extent *ex;
unsigned int ee_len;
- int split_flag = 0, depth;
+ int split_flag = 0, depth, err = 0;
+ bool did_zeroout = false;
ext_debug(inode, "logical block %llu, max_blocks %u\n",
(unsigned long long)map->m_lblk, map->m_len);
@@ -3726,34 +3815,87 @@ static struct ext4_ext_path *ext4_split_convert_extents(handle_t *handle,
ee_block = le32_to_cpu(ex->ee_block);
ee_len = ext4_ext_get_actual_len(ex);
- /* Convert to unwritten */
- if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
- split_flag |= EXT4_EXT_DATA_VALID1;
- /* Convert to initialized */
- } else if (flags & EXT4_GET_BLOCKS_CONVERT) {
- /*
- * It is safe to convert extent to initialized via explicit
- * zeroout only if extent is fully inside i_size or new_size.
- */
- split_flag |= ee_block + ee_len <= eof_block ?
- EXT4_EXT_MAY_ZEROOUT : 0;
- split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2);
+ /* No split needed */
+ if (ee_block == map->m_lblk && ee_len == map->m_len)
+ goto convert;
+
+ /*
+ * It is safe to convert extent to initialized via explicit
+ * zeroout only if extent is fully inside i_size or new_size.
+ */
+ split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
+
+ /*
+ * pass SPLIT_NOMERGE explicitly so we don't end up merging extents we
+ * just split.
+ */
+ path = ext4_split_extent(handle, inode, path, map, split_flag,
+ flags | EXT4_GET_BLOCKS_SPLIT_NOMERGE,
+ allocated, &did_zeroout);
+ if (IS_ERR(path))
+ return path;
+
+convert:
+ path = ext4_find_extent(inode, map->m_lblk, path, flags);
+ if (IS_ERR(path))
+ return path;
+
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+
+ /*
+ * Conversion is already handled in case of zeroout
+ */
+ if (!did_zeroout) {
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ goto err;
+
+ if (flags & EXT4_GET_BLOCKS_CONVERT)
+ ext4_ext_mark_initialized(ex);
+ else if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)
+ ext4_ext_mark_unwritten(ex);
+
+ if (!(flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE))
+ /*
+ * note: ext4_ext_correct_indexes() isn't needed here because
+ * borders are not changed
+ */
+ ext4_ext_try_to_merge(handle, inode, path, ex);
+
+ err = ext4_ext_dirty(handle, inode, path + depth);
+ if (err)
+ goto err;
}
- flags |= EXT4_GET_BLOCKS_SPLIT_NOMERGE;
- return ext4_split_extent(handle, inode, path, map, split_flag, flags,
- allocated);
+
+ /* Let's update the extent status tree after conversion */
+ if (!(flags & EXT4_EX_NOCACHE))
+ ext4_es_insert_extent(inode, le32_to_cpu(ex->ee_block),
+ ext4_ext_get_actual_len(ex),
+ ext4_ext_pblock(ex),
+ ext4_ext_is_unwritten(ex) ?
+ EXTENT_STATUS_UNWRITTEN :
+ EXTENT_STATUS_WRITTEN,
+ false);
+
+err:
+ if (err) {
+ ext4_free_ext_path(path);
+ return ERR_PTR(err);
+ }
+
+ return path;
}
static struct ext4_ext_path *
ext4_convert_unwritten_extents_endio(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map,
- struct ext4_ext_path *path)
+ struct ext4_ext_path *path, int flags)
{
struct ext4_extent *ex;
ext4_lblk_t ee_block;
unsigned int ee_len;
int depth;
- int err = 0;
depth = ext_depth(inode);
ex = path[depth].p_ext;
@@ -3763,66 +3905,21 @@ ext4_convert_unwritten_extents_endio(handle_t *handle, struct inode *inode,
ext_debug(inode, "logical block %llu, max_blocks %u\n",
(unsigned long long)ee_block, ee_len);
- /* If extent is larger than requested it is a clear sign that we still
- * have some extent state machine issues left. So extent_split is still
- * required.
- * TODO: Once all related issues will be fixed this situation should be
- * illegal.
- */
- if (ee_block != map->m_lblk || ee_len > map->m_len) {
-#ifdef CONFIG_EXT4_DEBUG
- ext4_warning(inode->i_sb, "Inode (%ld) finished: extent logical block %llu,"
- " len %u; IO logical block %llu, len %u",
- inode->i_ino, (unsigned long long)ee_block, ee_len,
- (unsigned long long)map->m_lblk, map->m_len);
-#endif
- path = ext4_split_convert_extents(handle, inode, map, path,
- EXT4_GET_BLOCKS_CONVERT, NULL);
- if (IS_ERR(path))
- return path;
-
- path = ext4_find_extent(inode, map->m_lblk, path, 0);
- if (IS_ERR(path))
- return path;
- depth = ext_depth(inode);
- ex = path[depth].p_ext;
- }
-
- err = ext4_ext_get_access(handle, inode, path + depth);
- if (err)
- goto errout;
- /* first mark the extent as initialized */
- ext4_ext_mark_initialized(ex);
-
- /* note: ext4_ext_correct_indexes() isn't needed here because
- * borders are not changed
- */
- ext4_ext_try_to_merge(handle, inode, path, ex);
-
- /* Mark modified extent as dirty */
- err = ext4_ext_dirty(handle, inode, path + path->p_depth);
- if (err)
- goto errout;
-
- ext4_ext_show_leaf(inode, path);
- return path;
-
-errout:
- ext4_free_ext_path(path);
- return ERR_PTR(err);
+ return ext4_split_convert_extents(handle, inode, map, path, flags,
+ NULL);
}
static struct ext4_ext_path *
convert_initialized_extent(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map,
struct ext4_ext_path *path,
+ int flags,
unsigned int *allocated)
{
struct ext4_extent *ex;
ext4_lblk_t ee_block;
unsigned int ee_len;
int depth;
- int err = 0;
/*
* Make sure that the extent is no bigger than we support with
@@ -3839,53 +3936,33 @@ convert_initialized_extent(handle_t *handle, struct inode *inode,
ext_debug(inode, "logical block %llu, max_blocks %u\n",
(unsigned long long)ee_block, ee_len);
- if (ee_block != map->m_lblk || ee_len > map->m_len) {
- path = ext4_split_convert_extents(handle, inode, map, path,
- EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, NULL);
- if (IS_ERR(path))
- return path;
+ path = ext4_split_convert_extents(handle, inode, map, path, flags,
+ NULL);
+ if (IS_ERR(path))
+ return path;
- path = ext4_find_extent(inode, map->m_lblk, path, 0);
- if (IS_ERR(path))
- return path;
- depth = ext_depth(inode);
- ex = path[depth].p_ext;
- if (!ex) {
- EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
- (unsigned long) map->m_lblk);
- err = -EFSCORRUPTED;
- goto errout;
- }
- }
-
- err = ext4_ext_get_access(handle, inode, path + depth);
- if (err)
- goto errout;
- /* first mark the extent as unwritten */
- ext4_ext_mark_unwritten(ex);
-
- /* note: ext4_ext_correct_indexes() isn't needed here because
- * borders are not changed
- */
- ext4_ext_try_to_merge(handle, inode, path, ex);
-
- /* Mark modified extent as dirty */
- err = ext4_ext_dirty(handle, inode, path + path->p_depth);
- if (err)
- goto errout;
ext4_ext_show_leaf(inode, path);
ext4_update_inode_fsync_trans(handle, inode, 1);
- map->m_flags |= EXT4_MAP_UNWRITTEN;
+ /*
+ * The extent might be initialized in case of zeroout.
+ */
+ path = ext4_find_extent(inode, map->m_lblk, path, flags);
+ if (IS_ERR(path))
+ return path;
+
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+
+ if (ext4_ext_is_unwritten(ex))
+ map->m_flags |= EXT4_MAP_UNWRITTEN;
+ else
+ map->m_flags |= EXT4_MAP_MAPPED;
if (*allocated > map->m_len)
*allocated = map->m_len;
map->m_len = *allocated;
return path;
-
-errout:
- ext4_free_ext_path(path);
- return ERR_PTR(err);
}
static struct ext4_ext_path *
@@ -3910,30 +3987,10 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
trace_ext4_ext_handle_unwritten_extents(inode, map, flags,
*allocated, newblock);
- /* get_block() before submitting IO, split the extent */
- if (flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE) {
- path = ext4_split_convert_extents(handle, inode, map, path,
- flags | EXT4_GET_BLOCKS_CONVERT, allocated);
- if (IS_ERR(path))
- return path;
- /*
- * shouldn't get a 0 allocated when splitting an extent unless
- * m_len is 0 (bug) or extent has been corrupted
- */
- if (unlikely(*allocated == 0)) {
- EXT4_ERROR_INODE(inode,
- "unexpected allocated == 0, m_len = %u",
- map->m_len);
- err = -EFSCORRUPTED;
- goto errout;
- }
- map->m_flags |= EXT4_MAP_UNWRITTEN;
- goto out;
- }
/* IO end_io complete, convert the filled extent to written */
if (flags & EXT4_GET_BLOCKS_CONVERT) {
path = ext4_convert_unwritten_extents_endio(handle, inode,
- map, path);
+ map, path, flags);
if (IS_ERR(path))
return path;
ext4_update_inode_fsync_trans(handle, inode, 1);
@@ -3983,7 +4040,6 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
goto errout;
}
-out:
map->m_flags |= EXT4_MAP_NEW;
map_out:
map->m_flags |= EXT4_MAP_MAPPED;
@@ -4160,8 +4216,7 @@ static ext4_lblk_t ext4_ext_determine_insert_hole(struct inode *inode,
insert_hole:
/* Put just found gap into cache to speed up subsequent requests */
ext_debug(inode, " -> %u:%u\n", hole_start, len);
- ext4_es_insert_extent(inode, hole_start, len, ~0,
- EXTENT_STATUS_HOLE, false);
+ ext4_es_cache_extent(inode, hole_start, len, ~0, EXTENT_STATUS_HOLE);
/* Update hole_len to reflect hole size after lblk */
if (hole_start != lblk)
@@ -4257,7 +4312,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
if ((!ext4_ext_is_unwritten(ex)) &&
(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
path = convert_initialized_extent(handle,
- inode, map, path, &allocated);
+ inode, map, path, flags, &allocated);
if (IS_ERR(path))
err = PTR_ERR(path);
goto out;
@@ -5375,7 +5430,8 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
if (!extent) {
EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
(unsigned long) *iterator);
- return -EFSCORRUPTED;
+ ret = -EFSCORRUPTED;
+ goto out;
}
if (SHIFT == SHIFT_LEFT && *iterator >
le32_to_cpu(extent->ee_block)) {
@@ -5541,7 +5597,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
struct ext4_extent *extent;
ext4_lblk_t start_lblk, len_lblk, ee_start_lblk = 0;
unsigned int credits, ee_len;
- int ret, depth, split_flag = 0;
+ int ret, depth;
loff_t start;
trace_ext4_insert_range(inode, offset, len);
@@ -5612,12 +5668,8 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
*/
if ((start_lblk > ee_start_lblk) &&
(start_lblk < (ee_start_lblk + ee_len))) {
- if (ext4_ext_is_unwritten(extent))
- split_flag = EXT4_EXT_MARK_UNWRIT1 |
- EXT4_EXT_MARK_UNWRIT2;
path = ext4_split_extent_at(handle, inode, path,
- start_lblk, split_flag,
- EXT4_EX_NOCACHE |
+ start_lblk, EXT4_EX_NOCACHE |
EXT4_GET_BLOCKS_SPLIT_NOMERGE |
EXT4_GET_BLOCKS_METADATA_NOFAIL);
}
@@ -6187,3 +6239,7 @@ int ext4_ext_clear_bb(struct inode *inode)
ext4_free_ext_path(path);
return 0;
}
+
+#ifdef CONFIG_EXT4_KUNIT_TESTS
+#include "extents-test.c"
+#endif
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index e04fbf1..a1538ba 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -16,6 +16,7 @@
#include "ext4.h"
#include <trace/events/ext4.h>
+#include <kunit/static_stub.h>
/*
* According to previous discussion in Ext4 Developer Workshop, we
@@ -178,7 +179,8 @@ static struct kmem_cache *ext4_pending_cachep;
static int __es_insert_extent(struct inode *inode, struct extent_status *newes,
struct extent_status *prealloc);
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t end, int *reserved,
+ ext4_lblk_t end, unsigned int status,
+ int *reserved, struct extent_status *res,
struct extent_status *prealloc);
static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
@@ -242,6 +244,21 @@ static inline void ext4_es_inc_seq(struct inode *inode)
WRITE_ONCE(ei->i_es_seq, ei->i_es_seq + 1);
}
+/*
+ * Check whether the type of @es intersects the @status mask. Returns 0 when
+ * it does. Otherwise returns -EINVAL and, if @res is not NULL, copies the
+ * es_lblk, es_len and es_pblk fields of @es into @res so the caller can
+ * inspect the mismatching extent.
+ */
+static inline int __es_check_extent_status(struct extent_status *es,
+					    unsigned int status,
+					    struct extent_status *res)
+{
+	bool mismatch = !(ext4_es_type(es) & status);
+
+	if (mismatch && res != NULL) {
+		res->es_lblk = es->es_lblk;
+		res->es_len = es->es_len;
+		res->es_pblk = es->es_pblk;
+	}
+
+	return mismatch ? -EINVAL : 0;
+}
+
/*
* search through the tree for an delayed extent with a given offset. If
* it can't be found, try to find next extent.
@@ -882,7 +899,8 @@ static int __es_insert_extent(struct inode *inode, struct extent_status *newes,
/*
* ext4_es_insert_extent() adds information to an inode's extent
- * status tree.
+ * status tree. This interface is used for modifying extents. To cache
+ * on-disk extents, use ext4_es_cache_extent() instead.
*/
void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, ext4_fsblk_t pblk,
@@ -929,7 +947,7 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
pr = __alloc_pending(true);
write_lock(&EXT4_I(inode)->i_es_lock);
- err1 = __es_remove_extent(inode, lblk, end, &resv_used, es1);
+ err1 = __es_remove_extent(inode, lblk, end, 0, &resv_used, NULL, es1);
if (err1 != 0)
goto error;
/* Free preallocated extent if it didn't get used. */
@@ -961,10 +979,6 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
}
pending = err3;
}
- /*
- * TODO: For cache on-disk extents, there is no need to increment
- * the sequence counter, this requires future optimization.
- */
ext4_es_inc_seq(inode);
error:
write_unlock(&EXT4_I(inode)->i_es_lock);
@@ -998,17 +1012,24 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
}
/*
- * ext4_es_cache_extent() inserts information into the extent status
- * tree if and only if there isn't information about the range in
- * question already.
+ * ext4_es_cache_extent() inserts information into the extent status tree
+ * only if there is no existing information about the specified range or
+ * if the existing extents have the same status.
+ *
+ * Note that this interface is only used for caching on-disk extent
+ * information and cannot be used to convert existing extents in the extent
+ * status tree. To convert existing extents, use ext4_es_insert_extent()
+ * instead.
*/
void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, ext4_fsblk_t pblk,
unsigned int status)
{
struct extent_status *es;
- struct extent_status newes;
+ struct extent_status chkes, newes;
ext4_lblk_t end = lblk + len - 1;
+ bool conflict = false;
+ int err;
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
return;
@@ -1016,7 +1037,6 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
newes.es_lblk = lblk;
newes.es_len = len;
ext4_es_store_pblock_status(&newes, pblk, status);
- trace_ext4_es_cache_extent(inode, &newes);
if (!len)
return;
@@ -1024,11 +1044,42 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
BUG_ON(end < lblk);
write_lock(&EXT4_I(inode)->i_es_lock);
-
es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk);
- if (!es || es->es_lblk > end)
- __es_insert_extent(inode, &newes, NULL);
+ if (es && es->es_lblk <= end) {
+ /* Found an extent that covers the entire range. */
+ if (es->es_lblk <= lblk && es->es_lblk + es->es_len > end) {
+ if (__es_check_extent_status(es, status, &chkes))
+ conflict = true;
+ goto unlock;
+ }
+ /* Check and remove all extents in range. */
+ err = __es_remove_extent(inode, lblk, end, status, NULL,
+ &chkes, NULL);
+ if (err) {
+ if (err == -EINVAL)
+ conflict = true;
+ goto unlock;
+ }
+ }
+ __es_insert_extent(inode, &newes, NULL);
+ trace_ext4_es_cache_extent(inode, &newes);
+ ext4_es_print_tree(inode);
+unlock:
write_unlock(&EXT4_I(inode)->i_es_lock);
+ if (!conflict)
+ return;
+ /*
+ * A hole in the on-disk extent but a delayed extent in the extent
+ * status tree, is allowed.
+ */
+ if (status == EXTENT_STATUS_HOLE &&
+ ext4_es_type(&chkes) == EXTENT_STATUS_DELAYED)
+ return;
+
+ ext4_warning_inode(inode,
+ "ES cache extent failed: add [%d,%d,%llu,0x%x] conflict with existing [%d,%d,%llu,0x%x]\n",
+ lblk, len, pblk, status, chkes.es_lblk, chkes.es_len,
+ ext4_es_pblock(&chkes), ext4_es_status(&chkes));
}
/*
@@ -1409,23 +1460,27 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
return rc->ndelayed;
}
-
/*
* __es_remove_extent - removes block range from extent status tree
*
* @inode - file containing range
* @lblk - first block in range
* @end - last block in range
+ * @status - the extent status to be checked
* @reserved - number of cluster reservations released
+ * @res - returns the offending extent if its status does not match
* @prealloc - pre-allocated es to avoid memory allocation failures
*
* If @reserved is not NULL and delayed allocation is enabled, counts
* block/cluster reservations freed by removing range and if bigalloc
- * enabled cancels pending reservations as needed. Returns 0 on success,
- * error code on failure.
+ * enabled cancels pending reservations as needed. If @status is not
+ * zero, check each extent's status type while removing it; on a mismatch
+ * return -EINVAL and pass the extent out through @res. Returns 0 on
+ * success, error code on failure.
*/
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t end, int *reserved,
+ ext4_lblk_t end, unsigned int status,
+ int *reserved, struct extent_status *res,
struct extent_status *prealloc)
{
struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
@@ -1434,18 +1489,24 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
struct extent_status orig_es;
ext4_lblk_t len1, len2;
ext4_fsblk_t block;
- int err = 0;
+ int err;
bool count_reserved = true;
struct rsvd_count rc;
if (reserved == NULL || !test_opt(inode->i_sb, DELALLOC))
count_reserved = false;
+ if (status == 0)
+ status = ES_TYPE_MASK;
es = __es_tree_search(&tree->root, lblk);
if (!es)
- goto out;
+ return 0;
if (es->es_lblk > end)
- goto out;
+ return 0;
+
+ err = __es_check_extent_status(es, status, res);
+ if (err)
+ return err;
/* Simply invalidate cache_es. */
tree->cache_es = NULL;
@@ -1480,7 +1541,7 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
es->es_lblk = orig_es.es_lblk;
es->es_len = orig_es.es_len;
- goto out;
+ return err;
}
} else {
es->es_lblk = end + 1;
@@ -1494,7 +1555,7 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
if (count_reserved)
count_rsvd(inode, orig_es.es_lblk + len1,
orig_es.es_len - len1 - len2, &orig_es, &rc);
- goto out_get_reserved;
+ goto out;
}
if (len1 > 0) {
@@ -1509,6 +1570,9 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
}
while (es && ext4_es_end(es) <= end) {
+ err = __es_check_extent_status(es, status, res);
+ if (err)
+ return err;
if (count_reserved)
count_rsvd(inode, es->es_lblk, es->es_len, es, &rc);
node = rb_next(&es->rb_node);
@@ -1524,6 +1588,10 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
if (es && es->es_lblk < end + 1) {
ext4_lblk_t orig_len = es->es_len;
+ err = __es_check_extent_status(es, status, res);
+ if (err)
+ return err;
+
len1 = ext4_es_end(es) - end;
if (count_reserved)
count_rsvd(inode, es->es_lblk, orig_len - len1,
@@ -1536,11 +1604,10 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
}
}
-out_get_reserved:
+out:
if (count_reserved)
*reserved = get_rsvd(inode, end, es, &rc);
-out:
- return err;
+ return 0;
}
/*
@@ -1582,7 +1649,7 @@ void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
* is reclaimed.
*/
write_lock(&EXT4_I(inode)->i_es_lock);
- err = __es_remove_extent(inode, lblk, end, &reserved, es);
+ err = __es_remove_extent(inode, lblk, end, 0, &reserved, NULL, es);
if (err)
goto error;
/* Free preallocated extent if it didn't get used. */
@@ -2174,7 +2241,7 @@ void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
}
write_lock(&EXT4_I(inode)->i_es_lock);
- err1 = __es_remove_extent(inode, lblk, end, NULL, es1);
+ err1 = __es_remove_extent(inode, lblk, end, 0, NULL, NULL, es1);
if (err1 != 0)
goto error;
/* Free preallocated extent if it didn't get used. */
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index fa66b08..f575751 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -231,16 +231,16 @@ static bool ext4_fc_disabled(struct super_block *sb)
void ext4_fc_del(struct inode *inode)
{
struct ext4_inode_info *ei = EXT4_I(inode);
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_fc_dentry_update *fc_dentry;
wait_queue_head_t *wq;
+ int alloc_ctx;
if (ext4_fc_disabled(inode->i_sb))
return;
- mutex_lock(&sbi->s_fc_lock);
+ alloc_ctx = ext4_fc_lock(inode->i_sb);
if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
- mutex_unlock(&sbi->s_fc_lock);
+ ext4_fc_unlock(inode->i_sb, alloc_ctx);
return;
}
@@ -275,9 +275,9 @@ void ext4_fc_del(struct inode *inode)
#endif
prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
if (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) {
- mutex_unlock(&sbi->s_fc_lock);
+ ext4_fc_unlock(inode->i_sb, alloc_ctx);
schedule();
- mutex_lock(&sbi->s_fc_lock);
+ alloc_ctx = ext4_fc_lock(inode->i_sb);
}
finish_wait(wq, &wait.wq_entry);
}
@@ -288,7 +288,7 @@ void ext4_fc_del(struct inode *inode)
* dentry create references, since it is not needed to log it anyways.
*/
if (list_empty(&ei->i_fc_dilist)) {
- mutex_unlock(&sbi->s_fc_lock);
+ ext4_fc_unlock(inode->i_sb, alloc_ctx);
return;
}
@@ -298,7 +298,7 @@ void ext4_fc_del(struct inode *inode)
list_del_init(&fc_dentry->fcd_dilist);
WARN_ON(!list_empty(&ei->i_fc_dilist));
- mutex_unlock(&sbi->s_fc_lock);
+ ext4_fc_unlock(inode->i_sb, alloc_ctx);
release_dentry_name_snapshot(&fc_dentry->fcd_name);
kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
@@ -315,6 +315,7 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handl
tid_t tid;
bool has_transaction = true;
bool is_ineligible;
+ int alloc_ctx;
if (ext4_fc_disabled(sb))
return;
@@ -329,12 +330,12 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handl
has_transaction = false;
read_unlock(&sbi->s_journal->j_state_lock);
}
- mutex_lock(&sbi->s_fc_lock);
+ alloc_ctx = ext4_fc_lock(sb);
is_ineligible = ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
if (has_transaction && (!is_ineligible || tid_gt(tid, sbi->s_fc_ineligible_tid)))
sbi->s_fc_ineligible_tid = tid;
ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
- mutex_unlock(&sbi->s_fc_lock);
+ ext4_fc_unlock(sb, alloc_ctx);
WARN_ON(reason >= EXT4_FC_REASON_MAX);
sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}
@@ -358,6 +359,7 @@ static int ext4_fc_track_template(
struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
tid_t tid = 0;
+ int alloc_ctx;
int ret;
tid = handle->h_transaction->t_tid;
@@ -373,14 +375,14 @@ static int ext4_fc_track_template(
if (!enqueue)
return ret;
- mutex_lock(&sbi->s_fc_lock);
+ alloc_ctx = ext4_fc_lock(inode->i_sb);
if (list_empty(&EXT4_I(inode)->i_fc_list))
list_add_tail(&EXT4_I(inode)->i_fc_list,
(sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ?
&sbi->s_fc_q[FC_Q_STAGING] :
&sbi->s_fc_q[FC_Q_MAIN]);
- mutex_unlock(&sbi->s_fc_lock);
+ ext4_fc_unlock(inode->i_sb, alloc_ctx);
return ret;
}
@@ -402,6 +404,7 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode,
struct inode *dir = dentry->d_parent->d_inode;
struct super_block *sb = inode->i_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int alloc_ctx;
spin_unlock(&ei->i_fc_lock);
@@ -425,7 +428,7 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode,
take_dentry_name_snapshot(&node->fcd_name, dentry);
INIT_LIST_HEAD(&node->fcd_dilist);
INIT_LIST_HEAD(&node->fcd_list);
- mutex_lock(&sbi->s_fc_lock);
+ alloc_ctx = ext4_fc_lock(sb);
if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING)
list_add_tail(&node->fcd_list,
@@ -446,7 +449,7 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode,
WARN_ON(!list_empty(&ei->i_fc_dilist));
list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist);
}
- mutex_unlock(&sbi->s_fc_lock);
+ ext4_fc_unlock(sb, alloc_ctx);
spin_lock(&ei->i_fc_lock);
return 0;
@@ -1046,18 +1049,19 @@ static int ext4_fc_perform_commit(journal_t *journal)
struct blk_plug plug;
int ret = 0;
u32 crc = 0;
+ int alloc_ctx;
/*
* Step 1: Mark all inodes on s_fc_q[MAIN] with
* EXT4_STATE_FC_FLUSHING_DATA. This prevents these inodes from being
* freed until the data flush is over.
*/
- mutex_lock(&sbi->s_fc_lock);
+ alloc_ctx = ext4_fc_lock(sb);
list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
ext4_set_inode_state(&iter->vfs_inode,
EXT4_STATE_FC_FLUSHING_DATA);
}
- mutex_unlock(&sbi->s_fc_lock);
+ ext4_fc_unlock(sb, alloc_ctx);
/* Step 2: Flush data for all the eligible inodes. */
ret = ext4_fc_flush_data(journal);
@@ -1067,7 +1071,7 @@ static int ext4_fc_perform_commit(journal_t *journal)
* any error from step 2. This ensures that waiters waiting on
* EXT4_STATE_FC_FLUSHING_DATA can resume.
*/
- mutex_lock(&sbi->s_fc_lock);
+ alloc_ctx = ext4_fc_lock(sb);
list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
ext4_clear_inode_state(&iter->vfs_inode,
EXT4_STATE_FC_FLUSHING_DATA);
@@ -1084,7 +1088,7 @@ static int ext4_fc_perform_commit(journal_t *journal)
* prepare_to_wait() in ext4_fc_del().
*/
smp_mb();
- mutex_unlock(&sbi->s_fc_lock);
+ ext4_fc_unlock(sb, alloc_ctx);
/*
* If we encountered error in Step 2, return it now after clearing
@@ -1101,12 +1105,12 @@ static int ext4_fc_perform_commit(journal_t *journal)
* previous handles are now drained. We now mark the inodes on the
* commit queue as being committed.
*/
- mutex_lock(&sbi->s_fc_lock);
+ alloc_ctx = ext4_fc_lock(sb);
list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
ext4_set_inode_state(&iter->vfs_inode,
EXT4_STATE_FC_COMMITTING);
}
- mutex_unlock(&sbi->s_fc_lock);
+ ext4_fc_unlock(sb, alloc_ctx);
jbd2_journal_unlock_updates(journal);
/*
@@ -1117,6 +1121,7 @@ static int ext4_fc_perform_commit(journal_t *journal)
blkdev_issue_flush(journal->j_fs_dev);
blk_start_plug(&plug);
+ alloc_ctx = ext4_fc_lock(sb);
/* Step 6: Write fast commit blocks to disk. */
if (sbi->s_fc_bytes == 0) {
/*
@@ -1134,7 +1139,6 @@ static int ext4_fc_perform_commit(journal_t *journal)
}
/* Step 6.2: Now write all the dentry updates. */
- mutex_lock(&sbi->s_fc_lock);
ret = ext4_fc_commit_dentry_updates(journal, &crc);
if (ret)
goto out;
@@ -1156,7 +1160,7 @@ static int ext4_fc_perform_commit(journal_t *journal)
ret = ext4_fc_write_tail(sb, crc);
out:
- mutex_unlock(&sbi->s_fc_lock);
+ ext4_fc_unlock(sb, alloc_ctx);
blk_finish_plug(&plug);
return ret;
}
@@ -1290,6 +1294,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_inode_info *ei;
struct ext4_fc_dentry_update *fc_dentry;
+ int alloc_ctx;
if (full && sbi->s_fc_bh)
sbi->s_fc_bh = NULL;
@@ -1297,7 +1302,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
trace_ext4_fc_cleanup(journal, full, tid);
jbd2_fc_release_bufs(journal);
- mutex_lock(&sbi->s_fc_lock);
+ alloc_ctx = ext4_fc_lock(sb);
while (!list_empty(&sbi->s_fc_q[FC_Q_MAIN])) {
ei = list_first_entry(&sbi->s_fc_q[FC_Q_MAIN],
struct ext4_inode_info,
@@ -1356,7 +1361,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
if (full)
sbi->s_fc_bytes = 0;
- mutex_unlock(&sbi->s_fc_lock);
+ ext4_fc_unlock(sb, alloc_ctx);
trace_ext4_fc_stats(sb);
}
@@ -2302,6 +2307,9 @@ static const char * const fc_ineligible_reasons[] = {
[EXT4_FC_REASON_FALLOC_RANGE] = "Falloc range op",
[EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling",
[EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename",
+ [EXT4_FC_REASON_MIGRATE] = "Inode format migration",
+ [EXT4_FC_REASON_VERITY] = "fs-verity enable",
+ [EXT4_FC_REASON_MOVE_EXT] = "Move extents",
};
int ext4_fc_info_show(struct seq_file *seq, void *v)
diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h
index 3bd534e..2f77a37 100644
--- a/fs/ext4/fast_commit.h
+++ b/fs/ext4/fast_commit.h
@@ -97,6 +97,9 @@ enum {
EXT4_FC_REASON_FALLOC_RANGE,
EXT4_FC_REASON_INODE_JOURNAL_DATA,
EXT4_FC_REASON_ENCRYPTED_FILENAME,
+ EXT4_FC_REASON_MIGRATE,
+ EXT4_FC_REASON_VERITY,
+ EXT4_FC_REASON_MOVE_EXT,
EXT4_FC_REASON_MAX
};
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 534cf864..4320ebf 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -419,22 +419,20 @@ static const struct iomap_dio_ops ext4_dio_write_ops = {
* updating inode i_disksize and/or orphan handling with exclusive lock.
*
* - shared locking will only be true mostly with overwrites, including
- * initialized blocks and unwritten blocks. For overwrite unwritten blocks
- * we protect splitting extents by i_data_sem in ext4_inode_info, so we can
- * also release exclusive i_rwsem lock.
+ * initialized blocks and unwritten blocks.
*
* - Otherwise we will switch to exclusive i_rwsem lock.
*/
static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
bool *ilock_shared, bool *extend,
- bool *unwritten, int *dio_flags)
+ int *dio_flags)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
loff_t offset;
size_t count;
ssize_t ret;
- bool overwrite, unaligned_io;
+ bool overwrite, unaligned_io, unwritten;
restart:
ret = ext4_generic_write_checks(iocb, from);
@@ -446,7 +444,7 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
unaligned_io = ext4_unaligned_io(inode, from, offset);
*extend = ext4_extending_io(inode, offset, count);
- overwrite = ext4_overwrite_io(inode, offset, count, unwritten);
+ overwrite = ext4_overwrite_io(inode, offset, count, &unwritten);
/*
* Determine whether we need to upgrade to an exclusive lock. This is
@@ -461,7 +459,7 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
*/
if (*ilock_shared &&
((!IS_NOSEC(inode) || *extend || !overwrite ||
- (unaligned_io && *unwritten)))) {
+ (unaligned_io && unwritten)))) {
if (iocb->ki_flags & IOCB_NOWAIT) {
ret = -EAGAIN;
goto out;
@@ -484,7 +482,7 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
ret = -EAGAIN;
goto out;
}
- if (unaligned_io && (!overwrite || *unwritten))
+ if (unaligned_io && (!overwrite || unwritten))
inode_dio_wait(inode);
*dio_flags = IOMAP_DIO_FORCE_WAIT;
}
@@ -509,8 +507,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct inode *inode = file_inode(iocb->ki_filp);
loff_t offset = iocb->ki_pos;
size_t count = iov_iter_count(from);
- const struct iomap_ops *iomap_ops = &ext4_iomap_ops;
- bool extend = false, unwritten = false;
+ bool extend = false;
bool ilock_shared = true;
int dio_flags = 0;
@@ -556,7 +553,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend,
- &unwritten, &dio_flags);
+ &dio_flags);
if (ret <= 0)
return ret;
@@ -576,9 +573,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
goto out;
}
- if (ilock_shared && !unwritten)
- iomap_ops = &ext4_iomap_overwrite_ops;
- ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
+ ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, &ext4_dio_write_ops,
dio_flags, NULL, 0);
if (ret == -ENOTBLK)
ret = 0;
@@ -859,7 +854,6 @@ static int ext4_sample_last_mounted(struct super_block *sb,
* when trying to sort through large numbers of block
* devices or filesystem images.
*/
- memset(buf, 0, sizeof(buf));
path.mnt = mnt;
path.dentry = mnt->mnt_root;
cp = d_path(&path, buf, sizeof(buf));
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0c466cc..8a544f7 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -48,6 +48,8 @@
#include "acl.h"
#include "truncate.h"
+#include <kunit/static_stub.h>
+
#include <trace/events/ext4.h>
static void ext4_journalled_zero_new_buffers(handle_t *handle,
@@ -400,6 +402,8 @@ int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk,
{
int ret;
+ KUNIT_STATIC_STUB_REDIRECT(ext4_issue_zeroout, inode, lblk, pblk, len);
+
if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
return fscrypt_zeroout_range(inode, lblk, pblk, len);
@@ -503,8 +507,8 @@ static int ext4_map_query_blocks_next_in_leaf(handle_t *handle,
retval = ext4_ext_map_blocks(handle, inode, &map2, 0);
if (retval <= 0) {
- ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
- map->m_pblk, status, false);
+ ext4_es_cache_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status);
return map->m_len;
}
@@ -525,20 +529,20 @@ static int ext4_map_query_blocks_next_in_leaf(handle_t *handle,
*/
if (map->m_pblk + map->m_len == map2.m_pblk &&
status == status2) {
- ext4_es_insert_extent(inode, map->m_lblk,
- map->m_len + map2.m_len, map->m_pblk,
- status, false);
+ ext4_es_cache_extent(inode, map->m_lblk,
+ map->m_len + map2.m_len, map->m_pblk,
+ status);
map->m_len += map2.m_len;
} else {
- ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
- map->m_pblk, status, false);
+ ext4_es_cache_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status);
}
return map->m_len;
}
-static int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
- struct ext4_map_blocks *map, int flags)
+int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map, int flags)
{
unsigned int status;
int retval;
@@ -573,8 +577,8 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
map->m_len == orig_mlen) {
status = map->m_flags & EXT4_MAP_UNWRITTEN ?
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
- ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
- map->m_pblk, status, false);
+ ext4_es_cache_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status);
} else {
retval = ext4_map_query_blocks_next_in_leaf(handle, inode, map,
orig_mlen);
@@ -584,10 +588,9 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
return retval;
}
-static int ext4_map_create_blocks(handle_t *handle, struct inode *inode,
- struct ext4_map_blocks *map, int flags)
+int ext4_map_create_blocks(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map, int flags)
{
- struct extent_status es;
unsigned int status;
int err, retval = 0;
@@ -648,16 +651,6 @@ static int ext4_map_create_blocks(handle_t *handle, struct inode *inode,
return err;
}
- /*
- * If the extent has been zeroed out, we don't need to update
- * extent status tree.
- */
- if (flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE &&
- ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, &map->m_seq)) {
- if (ext4_es_is_written(&es))
- return retval;
- }
-
status = map->m_flags & EXT4_MAP_UNWRITTEN ?
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk,
@@ -2375,7 +2368,7 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
dioread_nolock = ext4_should_dioread_nolock(inode);
if (dioread_nolock)
- get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
+ get_blocks_flags |= EXT4_GET_BLOCKS_UNWRIT_EXT;
err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
if (err < 0)
@@ -3380,33 +3373,6 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
return ret;
}
-static int ext4_read_folio(struct file *file, struct folio *folio)
-{
- int ret = -EAGAIN;
- struct inode *inode = folio->mapping->host;
-
- trace_ext4_read_folio(inode, folio);
-
- if (ext4_has_inline_data(inode))
- ret = ext4_readpage_inline(inode, folio);
-
- if (ret == -EAGAIN)
- return ext4_mpage_readpages(inode, NULL, folio);
-
- return ret;
-}
-
-static void ext4_readahead(struct readahead_control *rac)
-{
- struct inode *inode = rac->mapping->host;
-
- /* If the file has inline data, no need to do readahead. */
- if (ext4_has_inline_data(inode))
- return;
-
- ext4_mpage_readpages(inode, rac, NULL);
-}
-
static void ext4_invalidate_folio(struct folio *folio, size_t offset,
size_t length)
{
@@ -3740,7 +3706,7 @@ static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map,
else if (EXT4_LBLK_TO_B(inode, map->m_lblk) >= i_size_read(inode))
m_flags = EXT4_GET_BLOCKS_CREATE;
else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;
+ m_flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
if (flags & IOMAP_ATOMIC)
ret = ext4_map_blocks_atomic_write(handle, inode, map, m_flags,
@@ -3812,22 +3778,25 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
if (offset + length <= i_size_read(inode)) {
ret = ext4_map_blocks(NULL, inode, &map, 0);
/*
- * For atomic writes the entire requested length should
- * be mapped.
+ * For DAX we convert extents to initialized ones before
+ * copying the data, otherwise we do it after I/O so
+ * there's no need to call into ext4_iomap_alloc().
*/
- if (map.m_flags & EXT4_MAP_MAPPED) {
- if ((!(flags & IOMAP_ATOMIC) && ret > 0) ||
- (flags & IOMAP_ATOMIC && ret >= orig_mlen))
+ if ((map.m_flags & EXT4_MAP_MAPPED) ||
+ (!(flags & IOMAP_DAX) &&
+ (map.m_flags & EXT4_MAP_UNWRITTEN))) {
+ /*
+ * For atomic writes the entire requested
+ * length should be mapped.
+ */
+ if (ret == orig_mlen ||
+ (!(flags & IOMAP_ATOMIC) && ret > 0))
goto out;
}
map.m_len = orig_mlen;
}
ret = ext4_iomap_alloc(inode, &map, flags);
} else {
- /*
- * This can be called for overwrites path from
- * ext4_iomap_overwrite_begin().
- */
ret = ext4_map_blocks(NULL, inode, &map, 0);
}
@@ -3856,30 +3825,10 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
return 0;
}
-static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset,
- loff_t length, unsigned flags, struct iomap *iomap,
- struct iomap *srcmap)
-{
- int ret;
-
- /*
- * Even for writes we don't need to allocate blocks, so just pretend
- * we are reading to save overhead of starting a transaction.
- */
- flags &= ~IOMAP_WRITE;
- ret = ext4_iomap_begin(inode, offset, length, flags, iomap, srcmap);
- WARN_ON_ONCE(!ret && iomap->type != IOMAP_MAPPED);
- return ret;
-}
-
const struct iomap_ops ext4_iomap_ops = {
.iomap_begin = ext4_iomap_begin,
};
-const struct iomap_ops ext4_iomap_overwrite_ops = {
- .iomap_begin = ext4_iomap_overwrite_begin,
-};
-
static int ext4_iomap_begin_report(struct inode *inode, loff_t offset,
loff_t length, unsigned int flags,
struct iomap *iomap, struct iomap *srcmap)
@@ -4133,9 +4082,13 @@ static int __ext4_block_zero_page_range(handle_t *handle,
if (ext4_should_journal_data(inode)) {
err = ext4_dirty_journalled_data(handle, bh);
} else {
- err = 0;
mark_buffer_dirty(bh);
- if (ext4_should_order_data(inode))
+ /*
+ * Only the written block requires ordered data to prevent
+ * exposing stale data.
+ */
+ if (!buffer_unwritten(bh) && !buffer_delay(bh) &&
+ ext4_should_order_data(inode))
err = ext4_jbd2_inode_add_write(handle, inode, from,
length);
}
@@ -5835,10 +5788,6 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
if (error)
return error;
- error = fsverity_prepare_setattr(dentry, attr);
- if (error)
- return error;
-
if (is_quota_modification(idmap, inode, attr)) {
error = dquot_initialize(inode);
if (error)
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index ea26cd0..3ae9cb5 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -968,6 +968,7 @@ static long ext4_ioctl_group_add(struct file *file,
err = ext4_group_add(sb, input);
if (EXT4_SB(sb)->s_journal) {
+ ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE, NULL);
jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0);
jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
@@ -1613,6 +1614,8 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
if (EXT4_SB(sb)->s_journal) {
+ ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE,
+ NULL);
jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0);
jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c
index a9416b2..4abb40d 100644
--- a/fs/ext4/mballoc-test.c
+++ b/fs/ext4/mballoc-test.c
@@ -567,7 +567,7 @@ test_mark_diskspace_used_range(struct kunit *test,
bitmap = mbt_ctx_bitmap(sb, TEST_GOAL_GROUP);
memset(bitmap, 0, sb->s_blocksize);
- ret = ext4_mb_mark_diskspace_used(ac, NULL, 0);
+ ret = ext4_mb_mark_diskspace_used(ac, NULL);
KUNIT_ASSERT_EQ(test, ret, 0);
max = EXT4_CLUSTERS_PER_GROUP(sb);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e817a75..b99d1a7e 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -892,6 +892,21 @@ mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp)
}
}
+static ext4_group_t ext4_get_allocation_groups_count(
+ struct ext4_allocation_context *ac)
+{
+ ext4_group_t ngroups = ext4_get_groups_count(ac->ac_sb);
+
+ /* non-extent files are limited to low blocks/groups */
+ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
+ ngroups = EXT4_SB(ac->ac_sb)->s_blockfile_groups;
+
+ /* Pairs with smp_wmb() in ext4_update_super() */
+ smp_rmb();
+
+ return ngroups;
+}
+
static int ext4_mb_scan_groups_xa_range(struct ext4_allocation_context *ac,
struct xarray *xa,
ext4_group_t start, ext4_group_t end)
@@ -899,7 +914,7 @@ static int ext4_mb_scan_groups_xa_range(struct ext4_allocation_context *ac,
struct super_block *sb = ac->ac_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
enum criteria cr = ac->ac_criteria;
- ext4_group_t ngroups = ext4_get_groups_count(sb);
+ ext4_group_t ngroups = ext4_get_allocation_groups_count(ac);
unsigned long group = start;
struct ext4_group_info *grp;
@@ -951,7 +966,7 @@ static int ext4_mb_scan_groups_p2_aligned(struct ext4_allocation_context *ac,
ext4_group_t start, end;
start = group;
- end = ext4_get_groups_count(ac->ac_sb);
+ end = ext4_get_allocation_groups_count(ac);
wrap_around:
for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) {
ret = ext4_mb_scan_groups_largest_free_order_range(ac, i,
@@ -1001,7 +1016,7 @@ static int ext4_mb_scan_groups_goal_fast(struct ext4_allocation_context *ac,
ext4_group_t start, end;
start = group;
- end = ext4_get_groups_count(ac->ac_sb);
+ end = ext4_get_allocation_groups_count(ac);
wrap_around:
i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len);
for (; i < MB_NUM_ORDERS(ac->ac_sb); i++) {
@@ -1083,7 +1098,7 @@ static int ext4_mb_scan_groups_best_avail(struct ext4_allocation_context *ac,
min_order = fls(ac->ac_o_ex.fe_len);
start = group;
- end = ext4_get_groups_count(ac->ac_sb);
+ end = ext4_get_allocation_groups_count(ac);
wrap_around:
for (i = order; i >= min_order; i--) {
int frag_order;
@@ -1133,8 +1148,6 @@ static inline int should_optimize_scan(struct ext4_allocation_context *ac)
return 0;
if (ac->ac_criteria >= CR_GOAL_LEN_SLOW)
return 0;
- if (!ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))
- return 0;
return 1;
}
@@ -1182,11 +1195,7 @@ static int ext4_mb_scan_groups(struct ext4_allocation_context *ac)
int ret = 0;
ext4_group_t start;
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
- ext4_group_t ngroups = ext4_get_groups_count(ac->ac_sb);
-
- /* non-extent files are limited to low blocks/groups */
- if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
- ngroups = sbi->s_blockfile_groups;
+ ext4_group_t ngroups = ext4_get_allocation_groups_count(ac);
/* searching for the right group start from the goal value specified */
start = ac->ac_g_ex.fe_group;
@@ -1706,16 +1715,17 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
/* Avoid locking the folio in the fast path ... */
folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0);
- if (IS_ERR(folio) || !folio_test_uptodate(folio)) {
+ if (IS_ERR(folio) || !folio_test_uptodate(folio) || folio_test_locked(folio)) {
+ /*
+ * folio_test_locked is employed to detect ongoing folio
+ * migrations, since concurrent migrations can lead to
+ * bitmap inconsistency. And if we are not uptodate that
+ * implies somebody just created the folio but is yet to
+ * initialize it. We can drop the folio reference and
+ * try to get the folio with lock in both cases to avoid
+ * concurrency.
+ */
if (!IS_ERR(folio))
- /*
- * drop the folio reference and try
- * to get the folio with lock. If we
- * are not uptodate that implies
- * somebody just created the folio but
- * is yet to initialize it. So
- * wait for it to initialize.
- */
folio_put(folio);
folio = __filemap_get_folio(inode->i_mapping, pnum,
FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
@@ -1764,7 +1774,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
/* we need another folio for the buddy */
folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0);
- if (IS_ERR(folio) || !folio_test_uptodate(folio)) {
+ if (IS_ERR(folio) || !folio_test_uptodate(folio) || folio_test_locked(folio)) {
if (!IS_ERR(folio))
folio_put(folio);
folio = __filemap_get_folio(inode->i_mapping, pnum,
@@ -4185,8 +4195,7 @@ ext4_mb_mark_context(handle_t *handle, struct super_block *sb, bool state,
* Returns 0 if success or error code
*/
static noinline_for_stack int
-ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
- handle_t *handle, unsigned int reserv_clstrs)
+ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, handle_t *handle)
{
struct ext4_group_desc *gdp;
struct ext4_sb_info *sbi;
@@ -4241,13 +4250,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
BUG_ON(changed != ac->ac_b_ex.fe_len);
#endif
percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len);
- /*
- * Now reduce the dirty block count also. Should not go negative
- */
- if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
- /* release all the reserved blocks if non delalloc */
- percpu_counter_sub(&sbi->s_dirtyclusters_counter,
- reserv_clstrs);
return err;
}
@@ -6331,7 +6333,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
ext4_mb_pa_put_free(ac);
}
if (likely(ac->ac_status == AC_STATUS_FOUND)) {
- *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
+ *errp = ext4_mb_mark_diskspace_used(ac, handle);
if (*errp) {
ext4_discard_allocated_blocks(ac);
goto errout;
@@ -6362,12 +6364,9 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
out:
if (inquota && ar->len < inquota)
dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
- if (!ar->len) {
- if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0)
- /* release all the reserved blocks if non delalloc */
- percpu_counter_sub(&sbi->s_dirtyclusters_counter,
- reserv_clstrs);
- }
+ /* release any reserved blocks */
+ if (reserv_clstrs)
+ percpu_counter_sub(&sbi->s_dirtyclusters_counter, reserv_clstrs);
trace_ext4_allocate_blocks(ar, (unsigned long long)block);
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 1b0dfd9..96ab951 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -449,6 +449,12 @@ int ext4_ext_migrate(struct inode *inode)
retval = PTR_ERR(handle);
goto out_unlock;
}
+ /*
+ * This operation rewrites the inode's block mapping layout
+ * (indirect to extents) and is not tracked in the fast commit
+ * log, so disable fast commits for this transaction.
+ */
+ ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_MIGRATE, handle);
goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) *
EXT4_INODES_PER_GROUP(inode->i_sb)) + 1;
owner[0] = i_uid_read(inode);
@@ -630,6 +636,12 @@ int ext4_ind_migrate(struct inode *inode)
ret = PTR_ERR(handle);
goto out_unlock;
}
+ /*
+ * This operation rewrites the inode's block mapping layout
+ * (extents to indirect blocks) and is not tracked in the fast
+ * commit log, so disable fast commits for this transaction.
+ */
+ ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_MIGRATE, handle);
down_write(&EXT4_I(inode)->i_data_sem);
ret = ext4_ext_check_inode(inode);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 635fb8a..ce1f738 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -321,6 +321,8 @@ static int mext_move_extent(struct mext_data *mext, u64 *m_len)
ret = PTR_ERR(handle);
goto out;
}
+ ext4_fc_mark_ineligible(orig_inode->i_sb, EXT4_FC_REASON_MOVE_EXT,
+ handle);
ret = mext_move_begin(mext, folio, &move_type);
if (ret)
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index 49a6d36..830f3b8a 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -46,6 +46,7 @@
#include <linux/pagevec.h>
#include "ext4.h"
+#include <trace/events/ext4.h>
#define NUM_PREALLOC_POST_READ_CTXS 128
@@ -62,6 +63,7 @@ enum bio_post_read_step {
struct bio_post_read_ctx {
struct bio *bio;
+ struct fsverity_info *vi;
struct work_struct work;
unsigned int cur_step;
unsigned int enabled_steps;
@@ -97,6 +99,7 @@ static void verity_work(struct work_struct *work)
struct bio_post_read_ctx *ctx =
container_of(work, struct bio_post_read_ctx, work);
struct bio *bio = ctx->bio;
+ struct fsverity_info *vi = ctx->vi;
/*
* fsverity_verify_bio() may call readahead() again, and although verity
@@ -109,7 +112,7 @@ static void verity_work(struct work_struct *work)
mempool_free(ctx, bio_post_read_ctx_pool);
bio->bi_private = NULL;
- fsverity_verify_bio(bio);
+ fsverity_verify_bio(vi, bio);
__read_end_io(bio);
}
@@ -131,7 +134,8 @@ static void bio_post_read_processing(struct bio_post_read_ctx *ctx)
ctx->cur_step++;
fallthrough;
case STEP_VERITY:
- if (ctx->enabled_steps & (1 << STEP_VERITY)) {
+ if (IS_ENABLED(CONFIG_FS_VERITY) &&
+ ctx->enabled_steps & (1 << STEP_VERITY)) {
INIT_WORK(&ctx->work, verity_work);
fsverity_enqueue_verify_work(&ctx->work);
return;
@@ -172,22 +176,16 @@ static void mpage_end_io(struct bio *bio)
__read_end_io(bio);
}
-static inline bool ext4_need_verity(const struct inode *inode, pgoff_t idx)
-{
- return fsverity_active(inode) &&
- idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
-}
-
static void ext4_set_bio_post_read_ctx(struct bio *bio,
const struct inode *inode,
- pgoff_t first_idx)
+ struct fsverity_info *vi)
{
unsigned int post_read_steps = 0;
if (fscrypt_inode_uses_fs_layer_crypto(inode))
post_read_steps |= 1 << STEP_DECRYPT;
- if (ext4_need_verity(inode, first_idx))
+ if (vi)
post_read_steps |= 1 << STEP_VERITY;
if (post_read_steps) {
@@ -196,6 +194,7 @@ static void ext4_set_bio_post_read_ctx(struct bio *bio,
mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS);
ctx->bio = bio;
+ ctx->vi = vi;
ctx->enabled_steps = post_read_steps;
bio->bi_private = ctx;
}
@@ -209,7 +208,7 @@ static inline loff_t ext4_readpage_limit(struct inode *inode)
return i_size_read(inode);
}
-int ext4_mpage_readpages(struct inode *inode,
+static int ext4_mpage_readpages(struct inode *inode, struct fsverity_info *vi,
struct readahead_control *rac, struct folio *folio)
{
struct bio *bio = NULL;
@@ -329,8 +328,7 @@ int ext4_mpage_readpages(struct inode *inode,
folio_zero_segment(folio, first_hole << blkbits,
folio_size(folio));
if (first_hole == 0) {
- if (ext4_need_verity(inode, folio->index) &&
- !fsverity_verify_folio(folio))
+ if (vi && !fsverity_verify_folio(vi, folio))
goto set_error_page;
folio_end_read(folio, true);
continue;
@@ -358,7 +356,7 @@ int ext4_mpage_readpages(struct inode *inode,
REQ_OP_READ, GFP_KERNEL);
fscrypt_set_bio_crypt_ctx(bio, inode, next_block,
GFP_KERNEL);
- ext4_set_bio_post_read_ctx(bio, inode, folio->index);
+ ext4_set_bio_post_read_ctx(bio, inode, vi);
bio->bi_iter.bi_sector = first_block << (blkbits - 9);
bio->bi_end_io = mpage_end_io;
if (rac)
@@ -394,6 +392,44 @@ int ext4_mpage_readpages(struct inode *inode,
return 0;
}
+int ext4_read_folio(struct file *file, struct folio *folio)
+{
+ struct inode *inode = folio->mapping->host;
+ struct fsverity_info *vi = NULL;
+ int ret;
+
+ trace_ext4_read_folio(inode, folio);
+
+ if (ext4_has_inline_data(inode)) {
+ ret = ext4_readpage_inline(inode, folio);
+ if (ret != -EAGAIN)
+ return ret;
+ }
+
+ if (folio->index < DIV_ROUND_UP(inode->i_size, PAGE_SIZE))
+ vi = fsverity_get_info(inode);
+ if (vi)
+ fsverity_readahead(vi, folio->index, folio_nr_pages(folio));
+ return ext4_mpage_readpages(inode, vi, NULL, folio);
+}
+
+void ext4_readahead(struct readahead_control *rac)
+{
+ struct inode *inode = rac->mapping->host;
+ struct fsverity_info *vi = NULL;
+
+ /* If the file has inline data, no need to do readahead. */
+ if (ext4_has_inline_data(inode))
+ return;
+
+ if (readahead_index(rac) < DIV_ROUND_UP(inode->i_size, PAGE_SIZE))
+ vi = fsverity_get_info(inode);
+ if (vi)
+ fsverity_readahead(vi, readahead_index(rac),
+ readahead_count(rac));
+ ext4_mpage_readpages(inode, vi, rac, NULL);
+}
+
int __init ext4_init_post_read_processing(void)
{
bio_post_read_ctx_cache = KMEM_CACHE(bio_post_read_ctx, SLAB_RECLAIM_ACCOUNT);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 3c73b98..504148b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1489,9 +1489,6 @@ static void init_once(void *foo)
#ifdef CONFIG_FS_ENCRYPTION
ei->i_crypt_info = NULL;
#endif
-#ifdef CONFIG_FS_VERITY
- ei->i_verity_info = NULL;
-#endif
}
static int __init init_inodecache(void)
@@ -1539,7 +1536,6 @@ void ext4_clear_inode(struct inode *inode)
EXT4_I(inode)->jinode = NULL;
}
fscrypt_put_encryption_info(inode);
- fsverity_cleanup_inode(inode);
}
static struct inode *ext4_nfs_get_inode(struct super_block *sb,
@@ -3650,10 +3646,12 @@ int ext4_feature_set_ok(struct super_block *sb, int readonly)
}
/*
- * This function is called once a day if we have errors logged
- * on the file system
+ * This function is called once a day by default if we have errors logged
+ * on the file system.
+ * Use the err_report_sec sysfs attribute to disable or adjust its call
+ * frequency.
*/
-static void print_daily_error_info(struct timer_list *t)
+void print_daily_error_info(struct timer_list *t)
{
struct ext4_sb_info *sbi = timer_container_of(sbi, t, s_err_report);
struct super_block *sb = sbi->s_sb;
@@ -3693,7 +3691,9 @@ static void print_daily_error_info(struct timer_list *t)
le64_to_cpu(es->s_last_error_block));
printk(KERN_CONT "\n");
}
- mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */
+
+ if (sbi->s_err_report_sec)
+ mod_timer(&sbi->s_err_report, jiffies + secs_to_jiffies(sbi->s_err_report_sec));
}
/* Find next suitable group and run ext4_init_inode_table */
@@ -5616,6 +5616,10 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
clear_opt2(sb, MB_OPTIMIZE_SCAN);
}
+ err = ext4_percpu_param_init(sbi);
+ if (err)
+ goto failed_mount5;
+
err = ext4_mb_init(sb);
if (err) {
ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
@@ -5631,10 +5635,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
sbi->s_journal->j_commit_callback =
ext4_journal_commit_callback;
- err = ext4_percpu_param_init(sbi);
- if (err)
- goto failed_mount6;
-
if (ext4_has_feature_flex_bg(sb))
if (!ext4_fill_flex_info(sb)) {
ext4_msg(sb, KERN_ERR,
@@ -5690,8 +5690,12 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
clear_opt(sb, DISCARD);
}
- if (es->s_error_count)
- mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
+ if (es->s_error_count) {
+ sbi->s_err_report_sec = 5*60; /* first time 5 minutes */
+ mod_timer(&sbi->s_err_report,
+ jiffies + secs_to_jiffies(sbi->s_err_report_sec));
+ }
+ sbi->s_err_report_sec = 24*60*60; /* Once a day */
/* Enable message ratelimiting. Default is 10 messages per 5 secs. */
ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10);
@@ -5716,8 +5720,8 @@ failed_mount8: __maybe_unused
failed_mount6:
ext4_mb_release(sb);
ext4_flex_groups_free(sbi);
- ext4_percpu_param_destroy(sbi);
failed_mount5:
+ ext4_percpu_param_destroy(sbi);
ext4_ext_release(sb);
ext4_release_system_zone(sb);
failed_mount4a:
@@ -6237,10 +6241,11 @@ static void ext4_update_super(struct super_block *sb)
ext4_errno_to_code(sbi->s_last_error_code);
/*
* Start the daily error reporting function if it hasn't been
- * started already
+ * started already and sbi->s_err_report_sec is not zero
*/
- if (!es->s_error_count)
- mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ);
+ if (!es->s_error_count && sbi->s_err_report_sec)
+ mod_timer(&sbi->s_err_report,
+ jiffies + secs_to_jiffies(sbi->s_err_report_sec));
le32_add_cpu(&es->s_error_count, sbi->s_add_error_count);
sbi->s_add_error_count = 0;
}
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 0018e09..d2ecc10 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -40,6 +40,7 @@ typedef enum {
attr_pointer_string,
attr_pointer_atomic,
attr_journal_task,
+ attr_err_report_sec,
} attr_id_t;
typedef enum {
@@ -130,6 +131,36 @@ static ssize_t trigger_test_error(struct ext4_sb_info *sbi,
return count;
}
+/*
+ * Store handler for the err_report_sec sysfs attribute.
+ *
+ * Parses an interval in seconds (at most one year) from @buf.  Writing 0
+ * records 0 in sbi->s_err_report_sec and stops the s_err_report timer; any
+ * other new value is recorded and the timer is re-armed to fire after that
+ * many seconds.  Writing the current value is a no-op.
+ *
+ * Returns @count on success, the kstrtoul() error if @buf does not parse,
+ * or -EINVAL if the value exceeds one year.
+ */
+static ssize_t err_report_sec_store(struct ext4_sb_info *sbi,
+				    const char *buf, size_t count)
+{
+	unsigned long t;
+	int ret;
+
+	ret = kstrtoul(skip_spaces(buf), 0, &t);
+	if (ret)
+		return ret;
+
+	/* The maximum time interval must not exceed one year. */
+	if (t > (365*24*60*60))
+		return -EINVAL;
+
+	if (sbi->s_err_report_sec == t)		/* nothing to do */
+		goto out;
+
+	if (!t) {
+		/*
+		 * Disabling: record the 0 before stopping the timer so that
+		 * the attribute reads back as 0, print_daily_error_info()
+		 * does not re-arm itself, and a later non-zero write is
+		 * recognised as a change instead of "nothing to do".
+		 */
+		sbi->s_err_report_sec = 0;
+		timer_delete_sync(&sbi->s_err_report);
+		goto out;
+	}
+
+	/* Re-enabling after a disable: set the timer up again before arming. */
+	if (!sbi->s_err_report_sec)
+		timer_setup(&sbi->s_err_report, print_daily_error_info, 0);
+
+	sbi->s_err_report_sec = t;
+	mod_timer(&sbi->s_err_report, jiffies + secs_to_jiffies(sbi->s_err_report_sec));
+
+out:
+	return count;
+}
+
static ssize_t journal_task_show(struct ext4_sb_info *sbi, char *buf)
{
if (!sbi->s_journal)
@@ -217,6 +248,7 @@ EXT4_ATTR_OFFSET(mb_group_prealloc, 0644, clusters_in_group,
ext4_sb_info, s_mb_group_prealloc);
EXT4_ATTR_OFFSET(mb_best_avail_max_trim_order, 0644, mb_order,
ext4_sb_info, s_mb_best_avail_max_trim_order);
+EXT4_ATTR_OFFSET(err_report_sec, 0644, err_report_sec, ext4_sb_info, s_err_report_sec);
EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
@@ -309,6 +341,7 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(last_trim_minblks),
ATTR_LIST(sb_update_sec),
ATTR_LIST(sb_update_kb),
+ ATTR_LIST(err_report_sec),
NULL,
};
ATTRIBUTE_GROUPS(ext4);
@@ -402,6 +435,7 @@ static ssize_t ext4_generic_attr_show(struct ext4_attr *a,
return sysfs_emit(buf, "%u\n", le32_to_cpup(ptr));
return sysfs_emit(buf, "%u\n", *((unsigned int *) ptr));
case attr_pointer_ul:
+ case attr_err_report_sec:
return sysfs_emit(buf, "%lu\n", *((unsigned long *) ptr));
case attr_pointer_u8:
return sysfs_emit(buf, "%u\n", *((unsigned char *) ptr));
@@ -525,6 +559,8 @@ static ssize_t ext4_attr_store(struct kobject *kobj,
return inode_readahead_blks_store(sbi, buf, len);
case attr_trigger_test_error:
return trigger_test_error(sbi, buf, len);
+ case attr_err_report_sec:
+ return err_report_sec_store(sbi, buf, len);
default:
return ext4_generic_attr_store(a, sbi, buf, len);
}
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index 415d9c4..ca61da5 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -231,6 +231,8 @@ static int ext4_end_enable_verity(struct file *filp, const void *desc,
goto cleanup;
}
+ ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_VERITY, handle);
+
err = ext4_orphan_del(handle, inode);
if (err)
goto stop_and_cleanup;
@@ -358,42 +360,32 @@ static int ext4_get_verity_descriptor(struct inode *inode, void *buf,
}
static struct page *ext4_read_merkle_tree_page(struct inode *inode,
- pgoff_t index,
- unsigned long num_ra_pages)
+ pgoff_t index)
{
- struct folio *folio;
-
index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT;
-
- folio = __filemap_get_folio(inode->i_mapping, index, FGP_ACCESSED, 0);
- if (IS_ERR(folio) || !folio_test_uptodate(folio)) {
- DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index);
-
- if (!IS_ERR(folio))
- folio_put(folio);
- else if (num_ra_pages > 1)
- page_cache_ra_unbounded(&ractl, num_ra_pages, 0);
- folio = read_mapping_folio(inode->i_mapping, index, NULL);
- if (IS_ERR(folio))
- return ERR_CAST(folio);
- }
- return folio_file_page(folio, index);
+ return generic_read_merkle_tree_page(inode, index);
}
-static int ext4_write_merkle_tree_block(struct inode *inode, const void *buf,
+/*
+ * Readahead hook for the Merkle tree: offset @index by
+ * ext4_verity_metadata_pos(inode) expressed in pages, then pass the request
+ * for @nr_pages pages on to generic_readahead_merkle_tree().
+ */
+static void ext4_readahead_merkle_tree(struct inode *inode, pgoff_t index,
+				       unsigned long nr_pages)
+{
+	pgoff_t first = index + (ext4_verity_metadata_pos(inode) >> PAGE_SHIFT);
+
+	generic_readahead_merkle_tree(inode, first, nr_pages);
+}
+
+static int ext4_write_merkle_tree_block(struct file *file, const void *buf,
u64 pos, unsigned int size)
{
- pos += ext4_verity_metadata_pos(inode);
+ pos += ext4_verity_metadata_pos(file_inode(file));
- return pagecache_write(inode, buf, size, pos);
+ return pagecache_write(file_inode(file), buf, size, pos);
}
const struct fsverity_operations ext4_verityops = {
- .inode_info_offs = (int)offsetof(struct ext4_inode_info, i_verity_info) -
- (int)offsetof(struct ext4_inode_info, vfs_inode),
.begin_enable_verity = ext4_begin_enable_verity,
.end_enable_verity = ext4_end_enable_verity,
.get_verity_descriptor = ext4_get_verity_descriptor,
.read_merkle_tree_page = ext4_read_merkle_tree_page,
+ .readahead_merkle_tree = ext4_readahead_merkle_tree,
.write_merkle_tree_block = ext4_write_merkle_tree_block,
};
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 7b68bf2..ef1225a 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -1181,6 +1181,7 @@ int f2fs_prepare_compress_overwrite(struct inode *inode,
.cluster_idx = index >> F2FS_I(inode)->i_log_cluster_size,
.rpages = NULL,
.nr_rpages = 0,
+ .vi = NULL, /* can't write to fsverity files */
};
return prepare_compress_overwrite(&cc, pagep, index, fsdata);
@@ -1716,7 +1717,7 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
dic->nr_cpages = cc->nr_cpages;
refcount_set(&dic->refcnt, 1);
dic->failed = false;
- dic->need_verity = f2fs_need_verity(cc->inode, start_idx);
+ dic->vi = cc->vi;
for (i = 0; i < dic->cluster_size; i++)
dic->rpages[i] = cc->rpages[i];
@@ -1814,7 +1815,7 @@ static void f2fs_verify_cluster(struct work_struct *work)
if (!rpage)
continue;
- if (fsverity_verify_page(rpage))
+ if (fsverity_verify_page(dic->vi, rpage))
SetPageUptodate(rpage);
else
ClearPageUptodate(rpage);
@@ -1833,7 +1834,7 @@ void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed,
{
int i;
- if (!failed && dic->need_verity) {
+ if (IS_ENABLED(CONFIG_FS_VERITY) && !failed && dic->vi) {
/*
* Note that to avoid deadlocks, the verity work can't be done
* on the decompression workqueue. This is because verifying
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index fe944b3..79b70bc 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -109,6 +109,7 @@ enum bio_post_read_step {
struct bio_post_read_ctx {
struct bio *bio;
struct f2fs_sb_info *sbi;
+ struct fsverity_info *vi;
struct work_struct work;
unsigned int enabled_steps;
/*
@@ -165,6 +166,7 @@ static void f2fs_verify_bio(struct work_struct *work)
container_of(work, struct bio_post_read_ctx, work);
struct bio *bio = ctx->bio;
bool may_have_compressed_pages = (ctx->enabled_steps & STEP_DECOMPRESS);
+ struct fsverity_info *vi = ctx->vi;
/*
* fsverity_verify_bio() may call readahead() again, and while verity
@@ -187,13 +189,13 @@ static void f2fs_verify_bio(struct work_struct *work)
struct folio *folio = fi.folio;
if (!f2fs_is_compressed_page(folio) &&
- !fsverity_verify_page(&folio->page)) {
+ !fsverity_verify_page(vi, &folio->page)) {
bio->bi_status = BLK_STS_IOERR;
break;
}
}
} else {
- fsverity_verify_bio(bio);
+ fsverity_verify_bio(vi, bio);
}
f2fs_finish_read_bio(bio, true);
@@ -1040,7 +1042,8 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio)
f2fs_up_write(&io->io_rwsem);
}
-static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
+static struct bio *f2fs_grab_read_bio(struct inode *inode,
+ struct fsverity_info *vi, block_t blkaddr,
unsigned nr_pages, blk_opf_t op_flag,
pgoff_t first_idx, bool for_write)
{
@@ -1061,7 +1064,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
if (fscrypt_inode_uses_fs_layer_crypto(inode))
post_read_steps |= STEP_DECRYPT;
- if (f2fs_need_verity(inode, first_idx))
+ if (vi)
post_read_steps |= STEP_VERITY;
/*
@@ -1076,6 +1079,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS);
ctx->bio = bio;
ctx->sbi = sbi;
+ ctx->vi = vi;
ctx->enabled_steps = post_read_steps;
ctx->fs_blkaddr = blkaddr;
ctx->decompression_attempted = false;
@@ -1087,15 +1091,15 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
}
/* This can handle encryption stuffs */
-static void f2fs_submit_page_read(struct inode *inode, struct folio *folio,
- block_t blkaddr, blk_opf_t op_flags,
- bool for_write)
+static void f2fs_submit_page_read(struct inode *inode, struct fsverity_info *vi,
+ struct folio *folio, block_t blkaddr,
+ blk_opf_t op_flags, bool for_write)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct bio *bio;
- bio = f2fs_grab_read_bio(inode, blkaddr, 1, op_flags,
- folio->index, for_write);
+ bio = f2fs_grab_read_bio(inode, vi, blkaddr, 1, op_flags, folio->index,
+ for_write);
/* wait for GCed page writeback via META_MAPPING */
f2fs_wait_on_block_writeback(inode, blkaddr);
@@ -1197,6 +1201,14 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
return err;
}
+/*
+ * Return the result of fsverity_get_info() for @inode when page index @idx
+ * lies below i_size rounded up to whole pages; return NULL for any index at
+ * or beyond that point.
+ */
+static inline struct fsverity_info *f2fs_need_verity(const struct inode *inode,
+						     pgoff_t idx)
+{
+	if (idx >= DIV_ROUND_UP(inode->i_size, PAGE_SIZE))
+		return NULL;
+
+	return fsverity_get_info(inode);
+}
+
struct folio *f2fs_get_read_data_folio(struct inode *inode, pgoff_t index,
blk_opf_t op_flags, bool for_write, pgoff_t *next_pgofs)
{
@@ -1262,8 +1274,8 @@ struct folio *f2fs_get_read_data_folio(struct inode *inode, pgoff_t index,
return folio;
}
- f2fs_submit_page_read(inode, folio, dn.data_blkaddr,
- op_flags, for_write);
+ f2fs_submit_page_read(inode, f2fs_need_verity(inode, folio->index),
+ folio, dn.data_blkaddr, op_flags, for_write);
return folio;
put_err:
@@ -2067,12 +2079,12 @@ static inline blk_opf_t f2fs_ra_op_flags(struct readahead_control *rac)
return rac ? REQ_RAHEAD : 0;
}
-static int f2fs_read_single_page(struct inode *inode, struct folio *folio,
- unsigned nr_pages,
- struct f2fs_map_blocks *map,
- struct bio **bio_ret,
- sector_t *last_block_in_bio,
- struct readahead_control *rac)
+static int f2fs_read_single_page(struct inode *inode, struct fsverity_info *vi,
+ struct folio *folio, unsigned int nr_pages,
+ struct f2fs_map_blocks *map,
+ struct bio **bio_ret,
+ sector_t *last_block_in_bio,
+ struct readahead_control *rac)
{
struct bio *bio = *bio_ret;
const unsigned int blocksize = F2FS_BLKSIZE;
@@ -2124,8 +2136,7 @@ static int f2fs_read_single_page(struct inode *inode, struct folio *folio,
} else {
zero_out:
folio_zero_segment(folio, 0, folio_size(folio));
- if (f2fs_need_verity(inode, index) &&
- !fsverity_verify_folio(folio)) {
+ if (vi && !fsverity_verify_folio(vi, folio)) {
ret = -EIO;
goto out;
}
@@ -2147,9 +2158,8 @@ static int f2fs_read_single_page(struct inode *inode, struct folio *folio,
bio = NULL;
}
if (bio == NULL)
- bio = f2fs_grab_read_bio(inode, block_nr, nr_pages,
- f2fs_ra_op_flags(rac), index,
- false);
+ bio = f2fs_grab_read_bio(inode, vi, block_nr, nr_pages,
+ f2fs_ra_op_flags(rac), index, false);
/*
* If the page is under writeback, we need to wait for
@@ -2299,9 +2309,10 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
}
if (!bio)
- bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages - i,
- f2fs_ra_op_flags(rac),
- folio->index, for_write);
+ bio = f2fs_grab_read_bio(inode, cc->vi, blkaddr,
+ nr_pages - i,
+ f2fs_ra_op_flags(rac),
+ folio->index, for_write);
if (!bio_add_folio(bio, folio, blocksize, 0))
goto submit_and_realloc;
@@ -2340,7 +2351,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
* This function was originally taken from fs/mpage.c, and customized for f2fs.
* Major change was from block_size == page_size in f2fs by default.
*/
-static int f2fs_mpage_readpages(struct inode *inode,
+static int f2fs_mpage_readpages(struct inode *inode, struct fsverity_info *vi,
struct readahead_control *rac, struct folio *folio)
{
struct bio *bio = NULL;
@@ -2395,6 +2406,7 @@ static int f2fs_mpage_readpages(struct inode *inode,
/* there are remained compressed pages, submit them */
if (!f2fs_cluster_can_merge_page(&cc, index)) {
+ cc.vi = vi;
ret = f2fs_read_multi_pages(&cc, &bio,
max_nr_pages,
&last_block_in_bio,
@@ -2428,8 +2440,9 @@ static int f2fs_mpage_readpages(struct inode *inode,
read_single_page:
#endif
- ret = f2fs_read_single_page(inode, folio, max_nr_pages, &map,
- &bio, &last_block_in_bio, rac);
+ ret = f2fs_read_single_page(inode, vi, folio, max_nr_pages,
+ &map, &bio, &last_block_in_bio,
+ rac);
if (ret) {
#ifdef CONFIG_F2FS_FS_COMPRESSION
set_error_page:
@@ -2445,6 +2458,7 @@ static int f2fs_mpage_readpages(struct inode *inode,
if (f2fs_compressed_file(inode)) {
/* last page */
if (nr_pages == 1 && !f2fs_cluster_is_empty(&cc)) {
+ cc.vi = vi;
ret = f2fs_read_multi_pages(&cc, &bio,
max_nr_pages,
&last_block_in_bio,
@@ -2462,7 +2476,8 @@ static int f2fs_mpage_readpages(struct inode *inode,
static int f2fs_read_data_folio(struct file *file, struct folio *folio)
{
struct inode *inode = folio->mapping->host;
- int ret = -EAGAIN;
+ struct fsverity_info *vi = NULL;
+ int ret;
trace_f2fs_readpage(folio, DATA);
@@ -2472,16 +2487,22 @@ static int f2fs_read_data_folio(struct file *file, struct folio *folio)
}
/* If the file has inline data, try to read it directly */
- if (f2fs_has_inline_data(inode))
+ if (f2fs_has_inline_data(inode)) {
ret = f2fs_read_inline_data(inode, folio);
- if (ret == -EAGAIN)
- ret = f2fs_mpage_readpages(inode, NULL, folio);
- return ret;
+ if (ret != -EAGAIN)
+ return ret;
+ }
+
+ vi = f2fs_need_verity(inode, folio->index);
+ if (vi)
+ fsverity_readahead(vi, folio->index, folio_nr_pages(folio));
+ return f2fs_mpage_readpages(inode, vi, NULL, folio);
}
static void f2fs_readahead(struct readahead_control *rac)
{
struct inode *inode = rac->mapping->host;
+ struct fsverity_info *vi = NULL;
trace_f2fs_readpages(inode, readahead_index(rac), readahead_count(rac));
@@ -2492,7 +2513,11 @@ static void f2fs_readahead(struct readahead_control *rac)
if (f2fs_has_inline_data(inode))
return;
- f2fs_mpage_readpages(inode, rac, NULL);
+ vi = f2fs_need_verity(inode, readahead_index(rac));
+ if (vi)
+ fsverity_readahead(vi, readahead_index(rac),
+ readahead_count(rac));
+ f2fs_mpage_readpages(inode, vi, rac, NULL);
}
int f2fs_encrypt_one_page(struct f2fs_io_info *fio)
@@ -3641,9 +3666,10 @@ static int f2fs_write_begin(const struct kiocb *iocb,
err = -EFSCORRUPTED;
goto put_folio;
}
- f2fs_submit_page_read(use_cow ?
- F2FS_I(inode)->cow_inode : inode,
- folio, blkaddr, 0, true);
+ f2fs_submit_page_read(use_cow ? F2FS_I(inode)->cow_inode :
+ inode,
+ NULL, /* can't write to fsverity files */
+ folio, blkaddr, 0, true);
folio_lock(folio);
if (unlikely(folio->mapping != mapping)) {
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 9f3aa3c..a90a62c 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -974,9 +974,6 @@ struct f2fs_inode_info {
#ifdef CONFIG_FS_ENCRYPTION
struct fscrypt_inode_info *i_crypt_info; /* filesystem encryption info */
#endif
-#ifdef CONFIG_FS_VERITY
- struct fsverity_info *i_verity_info; /* filesystem verity info */
-#endif
};
static inline void get_read_extent_info(struct extent_info *ext,
@@ -1603,6 +1600,7 @@ struct compress_ctx {
size_t clen; /* valid data length in cbuf */
void *private; /* payload buffer for specified compression algorithm */
void *private2; /* extra payload buffer */
+ struct fsverity_info *vi; /* verity info if needed */
};
/* compress context for write IO path */
@@ -1658,7 +1656,7 @@ struct decompress_io_ctx {
refcount_t refcnt;
bool failed; /* IO error occurred before decompression? */
- bool need_verity; /* need fs-verity verification after decompression? */
+ struct fsverity_info *vi; /* fs-verity context if needed */
unsigned char compress_algorithm; /* backup algorithm type */
void *private; /* payload buffer for specified decompression algorithm */
void *private2; /* extra payload buffer */
@@ -4886,12 +4884,6 @@ static inline bool f2fs_allow_multi_device_dio(struct f2fs_sb_info *sbi,
return sbi->aligned_blksize;
}
-static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx)
-{
- return fsverity_active(inode) &&
- idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
-}
-
#ifdef CONFIG_F2FS_FAULT_INJECTION
extern int f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned long rate,
unsigned long type, enum fault_option fo);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 31a0c1b..1fdbe18 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1076,10 +1076,6 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
if (err)
return err;
- err = fsverity_prepare_setattr(dentry, attr);
- if (err)
- return err;
-
if (unlikely(IS_IMMUTABLE(inode)))
return -EPERM;
@@ -4424,7 +4420,9 @@ static int redirty_blocks(struct inode *inode, pgoff_t page_idx, int len)
pgoff_t redirty_idx = page_idx;
int page_len = 0, ret = 0;
+ filemap_invalidate_lock_shared(mapping);
page_cache_ra_unbounded(&ractl, len, 0);
+ filemap_invalidate_unlock_shared(mapping);
do {
folio = read_cache_folio(mapping, page_idx, NULL, NULL);
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 38b8994..ee332b9 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -1000,7 +1000,6 @@ void f2fs_evict_inode(struct inode *inode)
}
out_clear:
fscrypt_put_encryption_info(inode);
- fsverity_cleanup_inode(inode);
clear_inode(inode);
}
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index c4c225e..cd00d03 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -504,9 +504,6 @@ static void init_once(void *foo)
#ifdef CONFIG_FS_ENCRYPTION
fi->i_crypt_info = NULL;
#endif
-#ifdef CONFIG_FS_VERITY
- fi->i_verity_info = NULL;
-#endif
}
#ifdef CONFIG_QUOTA
diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c
index 05b935b..92ebcc1 100644
--- a/fs/f2fs/verity.c
+++ b/fs/f2fs/verity.c
@@ -256,42 +256,32 @@ static int f2fs_get_verity_descriptor(struct inode *inode, void *buf,
}
static struct page *f2fs_read_merkle_tree_page(struct inode *inode,
- pgoff_t index,
- unsigned long num_ra_pages)
+ pgoff_t index)
{
- struct folio *folio;
-
index += f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT;
-
- folio = f2fs_filemap_get_folio(inode->i_mapping, index, FGP_ACCESSED, 0);
- if (IS_ERR(folio) || !folio_test_uptodate(folio)) {
- DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index);
-
- if (!IS_ERR(folio))
- folio_put(folio);
- else if (num_ra_pages > 1)
- page_cache_ra_unbounded(&ractl, num_ra_pages, 0);
- folio = read_mapping_folio(inode->i_mapping, index, NULL);
- if (IS_ERR(folio))
- return ERR_CAST(folio);
- }
- return folio_file_page(folio, index);
+ return generic_read_merkle_tree_page(inode, index);
}
-static int f2fs_write_merkle_tree_block(struct inode *inode, const void *buf,
+/*
+ * Readahead hook for the Merkle tree: offset @index by
+ * f2fs_verity_metadata_pos(inode) expressed in pages, then pass the request
+ * for @nr_pages pages on to generic_readahead_merkle_tree().
+ */
+static void f2fs_readahead_merkle_tree(struct inode *inode, pgoff_t index,
+				       unsigned long nr_pages)
+{
+	pgoff_t first = index + (f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT);
+
+	generic_readahead_merkle_tree(inode, first, nr_pages);
+}
+
+static int f2fs_write_merkle_tree_block(struct file *file, const void *buf,
u64 pos, unsigned int size)
{
- pos += f2fs_verity_metadata_pos(inode);
+ pos += f2fs_verity_metadata_pos(file_inode(file));
- return pagecache_write(inode, buf, size, pos);
+ return pagecache_write(file_inode(file), buf, size, pos);
}
const struct fsverity_operations f2fs_verityops = {
- .inode_info_offs = (int)offsetof(struct f2fs_inode_info, i_verity_info) -
- (int)offsetof(struct f2fs_inode_info, vfs_inode),
.begin_enable_verity = f2fs_begin_enable_verity,
.end_enable_verity = f2fs_end_enable_verity,
.get_verity_descriptor = f2fs_get_verity_descriptor,
.read_merkle_tree_page = f2fs_read_merkle_tree_page,
+ .readahead_merkle_tree = f2fs_readahead_merkle_tree,
.write_merkle_tree_block = f2fs_write_merkle_tree_block,
};
diff --git a/fs/inode.c b/fs/inode.c
index dae43a8..cc12b68e 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -14,6 +14,7 @@
#include <linux/cdev.h>
#include <linux/memblock.h>
#include <linux/fsnotify.h>
+#include <linux/fsverity.h>
#include <linux/mount.h>
#include <linux/posix_acl.h>
#include <linux/buffer_head.h> /* for inode_has_buffers */
@@ -774,6 +775,14 @@ void dump_mapping(const struct address_space *mapping)
void clear_inode(struct inode *inode)
{
/*
+ * Only IS_VERITY() inodes can have verity info, so start by checking
+ * for IS_VERITY() (which is faster than retrieving the pointer to the
+ * verity info). This minimizes overhead for non-verity inodes.
+ */
+ if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode))
+ fsverity_cleanup_inode(inode);
+
+ /*
* We have to cycle the i_pages lock here because reclaim can be in the
* process of removing the last page (in __filemap_remove_folio())
* and we must not free the mapping under it.
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index 5764982..6fe6dbd 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -271,7 +271,7 @@ int get_rock_ridge_filename(struct iso_directory_record *de,
break;
}
len = rr->len - 5;
- if (retnamlen + len >= 254) {
+ if (retnamlen + len > NAME_MAX) {
truncate = 1;
break;
}
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 0ab83bb..9ab3f2f 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -2903,7 +2903,7 @@ int jfs_readdir(struct file *file, struct dir_context *ctx)
stbl = DT_GETSTBL(p);
for (i = index; i < p->header.nextindex; i++) {
- if (stbl[i] < 0 || stbl[i] >= DTPAGEMAXSLOT) {
+ if (stbl[i] < 0) {
jfs_err("JFS: Invalid stbl[%d] = %d for inode %ld, block = %lld",
i, stbl[i], (long)ip->i_ino, (long long)bn);
free_page(dirent_buf);
@@ -3108,7 +3108,7 @@ static int dtReadFirst(struct inode *ip, struct btstack * btstack)
/* get the leftmost entry */
stbl = DT_GETSTBL(p);
- if (stbl[0] < 0 || stbl[0] >= DTPAGEMAXSLOT) {
+ if (stbl[0] < 0) {
DT_PUTPAGE(mp);
jfs_error(ip->i_sb, "stbl[0] out of bound\n");
return -EIO;
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index b343c5e..5b1c5da 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -2311,6 +2311,7 @@ int jfsIOWait(void *arg)
{
struct lbuf *bp;
+ set_freezable();
do {
spin_lock_irq(&log_redrive_lock);
while ((bp = log_redrive_list)) {
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index f7e2ae7a..60c4a0e0 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1229,7 +1229,7 @@ static int jfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
jfs_err("jfs_rename: dtInsert returned -EIO");
goto out_tx;
}
- if (S_ISDIR(old_ip->i_mode))
+ if (S_ISDIR(old_ip->i_mode) && old_dir != new_dir)
inc_nlink(new_dir);
}
/*
@@ -1245,7 +1245,9 @@ static int jfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
goto out_tx;
}
if (S_ISDIR(old_ip->i_mode)) {
- drop_nlink(old_dir);
+ if (new_ip || old_dir != new_dir)
+ drop_nlink(old_dir);
+
if (old_dir != new_dir) {
/*
* Change inode number of parent for moved directory
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index d68afa1..dcd80c4 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -141,7 +141,7 @@ lockd(void *vrqstp)
*/
while (!svc_thread_should_stop(rqstp)) {
nlmsvc_retry_blocked(rqstp);
- svc_recv(rqstp);
+ svc_recv(rqstp, 0);
}
if (nlmsvc_ops)
nlmsvc_invalidate_all();
@@ -340,7 +340,7 @@ static int lockd_get(void)
return -ENOMEM;
}
- error = svc_set_num_threads(serv, NULL, 1);
+ error = svc_set_num_threads(serv, 0, 1);
if (error < 0) {
svc_destroy(&serv);
return error;
@@ -368,7 +368,7 @@ static void lockd_put(void)
unregister_inet6addr_notifier(&lockd_inet6addr_notifier);
#endif
- svc_set_num_threads(nlmsvc_serv, NULL, 0);
+ svc_set_num_threads(nlmsvc_serv, 0, 0);
timer_delete_sync(&nlmsvc_retry);
svc_destroy(&nlmsvc_serv);
dprintk("lockd_down: service destroyed\n");
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 6bce19f..712df1e 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -641,10 +641,6 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
conflock->fl.c.flc_owner = lock->fl.c.flc_owner;
error = vfs_test_lock(file->f_file[mode], &conflock->fl);
if (error) {
- /* We can't currently deal with deferred test requests */
- if (error == FILE_LOCK_DEFERRED)
- WARN_ON_ONCE(1);
-
ret = nlm_lck_denied_nolocks;
goto out;
}
diff --git a/fs/locks.c b/fs/locks.c
index 3ea25d3..d13ec93 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2262,12 +2262,23 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
*/
int vfs_test_lock(struct file *filp, struct file_lock *fl)
{
+ int error = 0;
+
WARN_ON_ONCE(fl->fl_ops || fl->fl_lmops);
WARN_ON_ONCE(filp != fl->c.flc_file);
if (filp->f_op->lock)
- return filp->f_op->lock(filp, F_GETLK, fl);
- posix_test_lock(filp, fl);
- return 0;
+ error = filp->f_op->lock(filp, F_GETLK, fl);
+ else
+ posix_test_lock(filp, fl);
+
+ /*
+ * We don't expect FILE_LOCK_DEFERRED and callers cannot
+ * handle it.
+ */
+ if (WARN_ON_ONCE(error == FILE_LOCK_DEFERRED))
+ error = -EIO;
+
+ return error;
}
EXPORT_SYMBOL_GPL(vfs_test_lock);
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index fabda0f6..701a9ac 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -81,7 +81,7 @@ nfs4_callback_svc(void *vrqstp)
set_freezable();
while (!svc_thread_should_stop(rqstp))
- svc_recv(rqstp);
+ svc_recv(rqstp, 0);
svc_exit_thread(rqstp);
return 0;
@@ -119,9 +119,9 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt,
if (serv->sv_nrthreads == nrservs)
return 0;
- ret = svc_set_num_threads(serv, NULL, nrservs);
+ ret = svc_set_num_threads(serv, 0, nrservs);
if (ret) {
- svc_set_num_threads(serv, NULL, 0);
+ svc_set_num_threads(serv, 0, 0);
return ret;
}
dprintk("nfs_callback_up: service started\n");
@@ -242,7 +242,7 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)
cb_info->users++;
err_net:
if (!cb_info->users) {
- svc_set_num_threads(cb_info->serv, NULL, 0);
+ svc_set_num_threads(cb_info->serv, 0, 0);
svc_destroy(&cb_info->serv);
}
err_create:
@@ -268,7 +268,7 @@ void nfs_callback_down(int minorversion, struct net *net, struct rpc_xprt *xprt)
nfs_callback_down_net(minorversion, serv, net);
cb_info->users--;
if (cb_info->users == 0) {
- svc_set_num_threads(serv, NULL, 0);
+ svc_set_num_threads(serv, 0, 0);
dprintk("nfs_callback_down: service destroyed\n");
xprt_svc_destroy_nullify_bc(xprt, &cb_info->serv);
}
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 0b5c1a0b..4fd6e81 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -186,3 +186,22 @@
draft-ietf-nfsv4-delstid-08 "Extending the Opening of Files". This
is currently an experimental feature and is therefore left disabled
by default.
+
+config NFSD_V4_POSIX_ACLS
+ bool "Support NFSv4 POSIX draft ACLs"
+ depends on NFSD_V4
+ default n
+ help
+ Include experimental support for POSIX Access Control Lists
+ (ACLs) in NFSv4 as specified in the IETF draft
+ draft-ietf-nfsv4-posix-acls. This protocol extension enables
+ NFSv4 clients to retrieve and modify POSIX ACLs on exported
+ filesystems that support them.
+
+ This feature is based on an unratified IETF draft
+ specification that may change in ways that impact
+ interoperability with existing clients. Enable only for
+ testing environments or when interoperability with specific
+ clients that implement this draft is required.
+
+ If unsure, say N.
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index 55744bb..f0da4d6 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -26,7 +26,15 @@
nfsd-$(CONFIG_NFS_LOCALIO) += localio.o
nfsd-$(CONFIG_DEBUG_FS) += debugfs.o
-
+#
+# XDR code generation (requires Python and additional packages)
+#
+# The generated *xdr_gen.{h,c} files are checked into git. Normal kernel
+# builds do not require the xdrgen tool or its Python dependencies.
+#
+# Developers modifying .x files in Documentation/sunrpc/xdr/ should run
+# "make xdrgen" to regenerate the affected files.
+#
.PHONY: xdrgen
xdrgen: ../../include/linux/sunrpc/xdrgen/nfs4_1.h nfs4xdr_gen.h nfs4xdr_gen.c
diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h
index 4b73244..2003523d 100644
--- a/fs/nfsd/acl.h
+++ b/fs/nfsd/acl.h
@@ -49,5 +49,6 @@ int nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry,
struct nfs4_acl **acl);
__be32 nfsd4_acl_to_attr(enum nfs_ftype4 type, struct nfs4_acl *acl,
struct nfsd_attrs *attr);
+void sort_pacl_range(struct posix_acl *pacl, int start, int end);
#endif /* LINUX_NFS4_ACL_H */
diff --git a/fs/nfsd/netlink.c b/fs/nfsd/netlink.c
index ac51a44..8875259 100644
--- a/fs/nfsd/netlink.c
+++ b/fs/nfsd/netlink.c
@@ -24,11 +24,12 @@ const struct nla_policy nfsd_version_nl_policy[NFSD_A_VERSION_ENABLED + 1] = {
};
/* NFSD_CMD_THREADS_SET - do */
-static const struct nla_policy nfsd_threads_set_nl_policy[NFSD_A_SERVER_SCOPE + 1] = {
+static const struct nla_policy nfsd_threads_set_nl_policy[NFSD_A_SERVER_MIN_THREADS + 1] = {
[NFSD_A_SERVER_THREADS] = { .type = NLA_U32, },
[NFSD_A_SERVER_GRACETIME] = { .type = NLA_U32, },
[NFSD_A_SERVER_LEASETIME] = { .type = NLA_U32, },
[NFSD_A_SERVER_SCOPE] = { .type = NLA_NUL_STRING, },
+ [NFSD_A_SERVER_MIN_THREADS] = { .type = NLA_U32, },
};
/* NFSD_CMD_VERSION_SET - do */
@@ -57,7 +58,7 @@ static const struct genl_split_ops nfsd_nl_ops[] = {
.cmd = NFSD_CMD_THREADS_SET,
.doit = nfsd_nl_threads_set_doit,
.policy = nfsd_threads_set_nl_policy,
- .maxattr = NFSD_A_SERVER_SCOPE,
+ .maxattr = NFSD_A_SERVER_MIN_THREADS,
.flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
},
{
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index fe83387..9fa6006 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -67,7 +67,6 @@ struct nfsd_net {
struct lock_manager nfsd4_manager;
bool grace_ended;
bool grace_end_forced;
- bool client_tracking_active;
time64_t boot_time;
struct dentry *nfsd_client_dir;
@@ -130,6 +129,12 @@ struct nfsd_net {
seqlock_t writeverf_lock;
unsigned char writeverf[8];
+ /*
+ * Minimum number of threads to run per pool. If 0 then the
+ * min == max requested number of threads.
+ */
+ unsigned int min_threads;
+
u32 clientid_base;
u32 clientid_counter;
u32 clverifier_counter;
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index 5fb202a..0ac538c7 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -45,7 +45,7 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst *rqstp)
inode = d_inode(fh->fh_dentry);
if (argp->mask & ~NFS_ACL_MASK) {
- resp->status = nfserr_inval;
+ resp->status = nfserr_io;
goto out;
}
resp->mask = argp->mask;
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index 936ea1a..2c2f2fd8 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -369,12 +369,21 @@ pace_gt(struct posix_acl_entry *pace1, struct posix_acl_entry *pace2)
return false;
}
-static void
-sort_pacl_range(struct posix_acl *pacl, int start, int end) {
+/**
+ * sort_pacl_range - sort a range of POSIX ACL entries by tag and id
+ * @pacl: POSIX ACL containing entries to sort
+ * @start: starting index of range to sort
+ * @end: ending index of range to sort (inclusive)
+ *
+ * Sorts ACL entries in place so that USER entries are ordered by UID
+ * and GROUP entries are ordered by GID. Required before calling
+ * posix_acl_valid().
+ */
+void sort_pacl_range(struct posix_acl *pacl, int start, int end)
+{
int sorted = 0, i;
- /* We just do a bubble sort; easy to do in place, and we're not
- * expecting acl's to be long enough to justify anything more. */
+ /* Bubble sort: acceptable here because ACLs are typically short. */
while (!sorted) {
sorted = 1;
for (i = start; i < end; i++) {
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 8cca1329..c319c31 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -643,34 +643,74 @@ static __be32 encode_name_from_id(struct xdr_stream *xdr,
return idmap_id_to_name(xdr, rqstp, type, id);
}
-__be32
-nfsd_map_name_to_uid(struct svc_rqst *rqstp, const char *name, size_t namelen,
- kuid_t *uid)
+/**
+ * nfsd_map_name_to_uid - Map user@domain to local UID
+ * @rqstp: RPC execution context
+ * @name: user@domain name to be mapped
+ * @namelen: length of name, in bytes
+ * @uid: OUT: mapped local UID value
+ *
+ * Returns nfs_ok on success or an NFSv4 status code on failure.
+ */
+__be32 nfsd_map_name_to_uid(struct svc_rqst *rqstp, const char *name,
+ size_t namelen, kuid_t *uid)
{
__be32 status;
u32 id = -1;
+ /*
+ * The idmap lookup below triggers an upcall that invokes
+ * cache_check(). RQ_USEDEFERRAL must be clear to prevent
+ * cache_check() from setting RQ_DROPME via svc_defer().
+ * NFSv4 servers are not permitted to drop requests. Also
+ * RQ_DROPME will force NFSv4.1 session slot processing to
+ * be skipped.
+ */
+ WARN_ON_ONCE(test_bit(RQ_USEDEFERRAL, &rqstp->rq_flags));
+
if (name == NULL || namelen == 0)
return nfserr_inval;
status = do_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, &id);
+ if (status)
+ return status;
*uid = make_kuid(nfsd_user_namespace(rqstp), id);
if (!uid_valid(*uid))
status = nfserr_badowner;
return status;
}
-__be32
-nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen,
- kgid_t *gid)
+/**
+ * nfsd_map_name_to_gid - Map group@domain to local GID
+ * @rqstp: RPC execution context
+ * @name: group@domain name to be mapped
+ * @namelen: length of name, in bytes
+ * @gid: OUT: mapped local GID value
+ *
+ * Returns nfs_ok on success or an NFSv4 status code on failure.
+ */
+__be32 nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name,
+ size_t namelen, kgid_t *gid)
{
__be32 status;
u32 id = -1;
+ /*
+ * The idmap lookup below triggers an upcall that invokes
+ * cache_check(). RQ_USEDEFERRAL must be clear to prevent
+ * cache_check() from setting RQ_DROPME via svc_defer().
+ * NFSv4 servers are not permitted to drop requests. Also
+ * RQ_DROPME will force NFSv4.1 session slot processing to
+ * be skipped.
+ */
+ WARN_ON_ONCE(test_bit(RQ_USEDEFERRAL, &rqstp->rq_flags));
+
if (name == NULL || namelen == 0)
return nfserr_inval;
status = do_name_to_id(rqstp, IDMAP_TYPE_GROUP, name, namelen, &id);
+ if (status)
+ return status;
*gid = make_kgid(nfsd_user_namespace(rqstp), id);
if (!gid_valid(*gid))
status = nfserr_badowner;
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index e400f3b..37ab3a6 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -81,8 +81,8 @@ static u32 nfsd41_ex_attrmask[] = {
};
static __be32
-check_attr_support(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
- u32 *bmval, u32 *writable)
+check_attr_support(struct nfsd4_compound_state *cstate, u32 *bmval,
+ u32 *writable)
{
struct dentry *dentry = cstate->current_fh.fh_dentry;
struct svc_export *exp = cstate->current_fh.fh_export;
@@ -91,6 +91,10 @@ check_attr_support(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
return nfserr_attrnotsupp;
if ((bmval[0] & FATTR4_WORD0_ACL) && !IS_POSIXACL(d_inode(dentry)))
return nfserr_attrnotsupp;
+ if ((bmval[2] & (FATTR4_WORD2_POSIX_DEFAULT_ACL |
+ FATTR4_WORD2_POSIX_ACCESS_ACL)) &&
+ !IS_POSIXACL(d_inode(dentry)))
+ return nfserr_attrnotsupp;
if ((bmval[2] & FATTR4_WORD2_SECURITY_LABEL) &&
!(exp->ex_flags & NFSEXP_SECURITY_LABEL))
return nfserr_attrnotsupp;
@@ -103,21 +107,25 @@ check_attr_support(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
}
static __be32
-nfsd4_check_open_attributes(struct svc_rqst *rqstp,
- struct nfsd4_compound_state *cstate, struct nfsd4_open *open)
+nfsd4_check_open_attributes(struct nfsd4_compound_state *cstate,
+ struct nfsd4_open *open)
{
__be32 status = nfs_ok;
- if (open->op_create == NFS4_OPEN_CREATE) {
- if (open->op_createmode == NFS4_CREATE_UNCHECKED
- || open->op_createmode == NFS4_CREATE_GUARDED)
- status = check_attr_support(rqstp, cstate,
- open->op_bmval, nfsd_attrmask);
- else if (open->op_createmode == NFS4_CREATE_EXCLUSIVE4_1)
- status = check_attr_support(rqstp, cstate,
- open->op_bmval, nfsd41_ex_attrmask);
- }
+ if (open->op_create != NFS4_OPEN_CREATE)
+ return status;
+ switch (open->op_createmode) {
+ case NFS4_CREATE_UNCHECKED:
+ case NFS4_CREATE_GUARDED:
+ status = check_attr_support(cstate, open->op_bmval,
+ nfsd_attrmask);
+ break;
+ case NFS4_CREATE_EXCLUSIVE4_1:
+ status = check_attr_support(cstate, open->op_bmval,
+ nfsd41_ex_attrmask);
+ break;
+ }
return status;
}
@@ -266,8 +274,20 @@ nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (host_err)
return nfserrno(host_err);
- if (is_create_with_attrs(open))
- nfsd4_acl_to_attr(NF4REG, open->op_acl, &attrs);
+ if (open->op_acl) {
+ if (open->op_dpacl || open->op_pacl) {
+ status = nfserr_inval;
+ goto out_write;
+ }
+ if (is_create_with_attrs(open))
+ nfsd4_acl_to_attr(NF4REG, open->op_acl, &attrs);
+ } else if (is_create_with_attrs(open)) {
+ /* The dpacl and pacl will get released by nfsd_attrs_free(). */
+ attrs.na_dpacl = open->op_dpacl;
+ attrs.na_pacl = open->op_pacl;
+ open->op_dpacl = NULL;
+ open->op_pacl = NULL;
+ }
child = start_creating(&nop_mnt_idmap, parent,
&QSTR_LEN(open->op_fname, open->op_fnamelen));
@@ -378,8 +398,12 @@ nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (attrs.na_labelerr)
open->op_bmval[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
- if (attrs.na_aclerr)
+ if (attrs.na_paclerr || attrs.na_dpaclerr)
open->op_bmval[0] &= ~FATTR4_WORD0_ACL;
+ if (attrs.na_dpaclerr)
+ open->op_bmval[2] &= ~FATTR4_WORD2_POSIX_DEFAULT_ACL;
+ if (attrs.na_paclerr)
+ open->op_bmval[2] &= ~FATTR4_WORD2_POSIX_ACCESS_ACL;
out:
end_creating(child);
nfsd_attrs_free(&attrs);
@@ -547,8 +571,10 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
open->op_rqstp = rqstp;
/* This check required by spec. */
- if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL)
- return nfserr_inval;
+ if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL) {
+ status = nfserr_inval;
+ goto out_err;
+ }
open->op_created = false;
/*
@@ -557,8 +583,10 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
*/
if (nfsd4_has_session(cstate) &&
!test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &cstate->clp->cl_flags) &&
- open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)
- return nfserr_grace;
+ open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) {
+ status = nfserr_grace;
+ goto out_err;
+ }
if (nfsd4_has_session(cstate))
copy_clientid(&open->op_clientid, cstate->session);
@@ -584,7 +612,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
}
- status = nfsd4_check_open_attributes(rqstp, cstate, open);
+ status = nfsd4_check_open_attributes(cstate, open);
if (status)
goto out;
@@ -645,6 +673,9 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
}
nfsd4_cleanup_open_state(cstate, open);
nfsd4_bump_seqid(cstate, status);
+out_err:
+ posix_acl_release(open->op_dpacl);
+ posix_acl_release(open->op_pacl);
return status;
}
@@ -785,23 +816,34 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd_attrs attrs = {
.na_iattr = &create->cr_iattr,
.na_seclabel = &create->cr_label,
+ .na_dpacl = create->cr_dpacl,
+ .na_pacl = create->cr_pacl,
};
struct svc_fh resfh;
__be32 status;
dev_t rdev;
+ create->cr_dpacl = NULL;
+ create->cr_pacl = NULL;
+
fh_init(&resfh, NFS4_FHSIZE);
status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, NFSD_MAY_NOP);
if (status)
- return status;
+ goto out_aftermask;
- status = check_attr_support(rqstp, cstate, create->cr_bmval,
- nfsd_attrmask);
+ status = check_attr_support(cstate, create->cr_bmval, nfsd_attrmask);
if (status)
- return status;
+ goto out_aftermask;
- status = nfsd4_acl_to_attr(create->cr_type, create->cr_acl, &attrs);
+ if (create->cr_acl) {
+ if (create->cr_dpacl || create->cr_pacl) {
+ status = nfserr_inval;
+ goto out_aftermask;
+ }
+ status = nfsd4_acl_to_attr(create->cr_type, create->cr_acl,
+ &attrs);
+ }
current->fs->umask = create->cr_umask;
switch (create->cr_type) {
case NF4LNK:
@@ -860,14 +902,19 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (attrs.na_labelerr)
create->cr_bmval[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
- if (attrs.na_aclerr)
+ if (attrs.na_paclerr || attrs.na_dpaclerr)
create->cr_bmval[0] &= ~FATTR4_WORD0_ACL;
+ if (attrs.na_dpaclerr)
+ create->cr_bmval[2] &= ~FATTR4_WORD2_POSIX_DEFAULT_ACL;
+ if (attrs.na_paclerr)
+ create->cr_bmval[2] &= ~FATTR4_WORD2_POSIX_ACCESS_ACL;
set_change_info(&create->cr_cinfo, &cstate->current_fh);
fh_dup2(&cstate->current_fh, &resfh);
out:
fh_put(&resfh);
out_umask:
current->fs->umask = 0;
+out_aftermask:
nfsd_attrs_free(&attrs);
return status;
}
@@ -1172,6 +1219,8 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd_attrs attrs = {
.na_iattr = &setattr->sa_iattr,
.na_seclabel = &setattr->sa_label,
+ .na_pacl = setattr->sa_pacl,
+ .na_dpacl = setattr->sa_dpacl,
};
bool save_no_wcc, deleg_attrs;
struct nfs4_stid *st = NULL;
@@ -1179,6 +1228,10 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
__be32 status = nfs_ok;
int err;
+ /* Transfer ownership to attrs for cleanup via nfsd_attrs_free() */
+ setattr->sa_pacl = NULL;
+ setattr->sa_dpacl = NULL;
+
deleg_attrs = setattr->sa_bmval[2] & (FATTR4_WORD2_TIME_DELEG_ACCESS |
FATTR4_WORD2_TIME_DELEG_MODIFY);
@@ -1192,7 +1245,7 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
&cstate->current_fh, &setattr->sa_stateid,
flags, NULL, &st);
if (status)
- return status;
+ goto out_err;
}
if (deleg_attrs) {
@@ -1210,18 +1263,24 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (st)
nfs4_put_stid(st);
if (status)
- return status;
+ goto out_err;
err = fh_want_write(&cstate->current_fh);
- if (err)
- return nfserrno(err);
+ if (err) {
+ status = nfserrno(err);
+ goto out_err;
+ }
status = nfs_ok;
- status = check_attr_support(rqstp, cstate, setattr->sa_bmval,
- nfsd_attrmask);
+ status = check_attr_support(cstate, setattr->sa_bmval, nfsd_attrmask);
if (status)
goto out;
+ if (setattr->sa_acl && (attrs.na_dpacl || attrs.na_pacl)) {
+ status = nfserr_inval;
+ goto out;
+ }
+
inode = cstate->current_fh.fh_dentry->d_inode;
status = nfsd4_acl_to_attr(S_ISDIR(inode->i_mode) ? NF4DIR : NF4REG,
setattr->sa_acl, &attrs);
@@ -1235,10 +1294,13 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (!status)
status = nfserrno(attrs.na_labelerr);
if (!status)
- status = nfserrno(attrs.na_aclerr);
+ status = nfserrno(attrs.na_dpaclerr);
+ if (!status)
+ status = nfserrno(attrs.na_paclerr);
out:
- nfsd_attrs_free(&attrs);
fh_drop_write(&cstate->current_fh);
+out_err:
+ nfsd_attrs_free(&attrs);
return status;
}
@@ -1430,14 +1492,26 @@ static void nfs4_put_copy(struct nfsd4_copy *copy)
kfree(copy);
}
+static void release_copy_files(struct nfsd4_copy *copy);
+
static void nfsd4_stop_copy(struct nfsd4_copy *copy)
{
trace_nfsd_copy_async_cancel(copy);
if (!test_and_set_bit(NFSD4_COPY_F_STOPPED, &copy->cp_flags)) {
kthread_stop(copy->copy_task);
- copy->nfserr = nfs_ok;
+ if (!test_bit(NFSD4_COPY_F_CB_ERROR, &copy->cp_flags))
+ copy->nfserr = nfs_ok;
set_bit(NFSD4_COPY_F_COMPLETED, &copy->cp_flags);
}
+
+ /*
+ * The copy was removed from async_copies before this function
+ * was called, so the reaper cannot clean it up. Release files
+ * here regardless of who won the STOPPED race. If the thread
+ * set STOPPED, it has finished using the files. If STOPPED
+ * was set here, kthread_stop() waited for the thread to exit.
+ */
+ release_copy_files(copy);
nfs4_put_copy(copy);
}
@@ -1465,6 +1539,72 @@ void nfsd4_shutdown_copy(struct nfs4_client *clp)
while ((copy = nfsd4_unhash_copy(clp)) != NULL)
nfsd4_stop_copy(copy);
}
+
+static bool nfsd4_copy_on_sb(const struct nfsd4_copy *copy,
+ const struct super_block *sb)
+{
+ if (copy->nf_src &&
+ file_inode(copy->nf_src->nf_file)->i_sb == sb)
+ return true;
+ if (copy->nf_dst &&
+ file_inode(copy->nf_dst->nf_file)->i_sb == sb)
+ return true;
+ return false;
+}
+
+/**
+ * nfsd4_cancel_copy_by_sb - cancel async copy operations on @sb
+ * @net: net namespace containing the copy operations
+ * @sb: targeted superblock
+ */
+void nfsd4_cancel_copy_by_sb(struct net *net, struct super_block *sb)
+{
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct nfsd4_copy *copy, *tmp;
+ struct nfs4_client *clp;
+ unsigned int idhashval;
+ LIST_HEAD(to_cancel);
+
+ spin_lock(&nn->client_lock);
+ for (idhashval = 0; idhashval < CLIENT_HASH_SIZE; idhashval++) {
+ struct list_head *head = &nn->conf_id_hashtbl[idhashval];
+
+ list_for_each_entry(clp, head, cl_idhash) {
+ spin_lock(&clp->async_lock);
+ list_for_each_entry_safe(copy, tmp,
+ &clp->async_copies, copies) {
+ if (nfsd4_copy_on_sb(copy, sb)) {
+ refcount_inc(&copy->refcount);
+ /*
+ * Hold a reference on the client while
+ * nfsd4_stop_copy() runs. Unlike
+ * nfsd4_unhash_copy(), cp_clp is not
+ * NULLed here because nfsd4_send_cb_offload()
+ * needs a valid client to send CB_OFFLOAD.
+ * That function takes its own reference to
+ * survive callback flight.
+ */
+ kref_get(&clp->cl_nfsdfs.cl_ref);
+ copy->nfserr = nfserr_admin_revoked;
+ set_bit(NFSD4_COPY_F_CB_ERROR,
+ &copy->cp_flags);
+ list_move(&copy->copies, &to_cancel);
+ }
+ }
+ spin_unlock(&clp->async_lock);
+ }
+ }
+ spin_unlock(&nn->client_lock);
+
+ list_for_each_entry_safe(copy, tmp, &to_cancel, copies) {
+ struct nfs4_client *clp = copy->cp_clp;
+
+ list_del_init(&copy->copies);
+ nfsd4_stop_copy(copy);
+ nfsd4_put_client(clp);
+ }
+}
+
#ifdef CONFIG_NFSD_V4_2_INTER_SSC
extern struct file *nfs42_ssc_open(struct vfsmount *ss_mnt,
@@ -1754,6 +1894,7 @@ static void nfsd4_cb_offload_release(struct nfsd4_callback *cb)
container_of(cbo, struct nfsd4_copy, cp_cb_offload);
set_bit(NFSD4_COPY_F_OFFLOAD_DONE, &copy->cp_flags);
+ nfsd4_put_client(cb->cb_clp);
}
static int nfsd4_cb_offload_done(struct nfsd4_callback *cb,
@@ -1873,10 +2014,14 @@ static void dup_copy_fields(struct nfsd4_copy *src, struct nfsd4_copy *dst)
static void release_copy_files(struct nfsd4_copy *copy)
{
- if (copy->nf_src)
+ if (copy->nf_src) {
nfsd_file_put(copy->nf_src);
- if (copy->nf_dst)
+ copy->nf_src = NULL;
+ }
+ if (copy->nf_dst) {
nfsd_file_put(copy->nf_dst);
+ copy->nf_dst = NULL;
+ }
}
static void cleanup_async_copy(struct nfsd4_copy *copy)
@@ -1895,18 +2040,34 @@ static void cleanup_async_copy(struct nfsd4_copy *copy)
static void nfsd4_send_cb_offload(struct nfsd4_copy *copy)
{
struct nfsd4_cb_offload *cbo = &copy->cp_cb_offload;
+ struct nfs4_client *clp = copy->cp_clp;
+
+ /*
+ * cp_clp is NULL when called via nfsd4_shutdown_copy() during
+ * client destruction. Skip the callback; the client is gone.
+ */
+ if (!clp) {
+ set_bit(NFSD4_COPY_F_OFFLOAD_DONE, &copy->cp_flags);
+ return;
+ }
memcpy(&cbo->co_res, &copy->cp_res, sizeof(copy->cp_res));
memcpy(&cbo->co_fh, &copy->fh, sizeof(copy->fh));
cbo->co_nfserr = copy->nfserr;
cbo->co_retries = 5;
- nfsd4_init_cb(&cbo->co_cb, copy->cp_clp, &nfsd4_cb_offload_ops,
+ /*
+ * Hold a reference on the client while the callback is in flight.
+ * Released in nfsd4_cb_offload_release().
+ */
+ kref_get(&clp->cl_nfsdfs.cl_ref);
+
+ nfsd4_init_cb(&cbo->co_cb, clp, &nfsd4_cb_offload_ops,
NFSPROC4_CLNT_CB_OFFLOAD);
nfsd41_cb_referring_call(&cbo->co_cb, &cbo->co_referring_sessionid,
cbo->co_referring_slotid,
cbo->co_referring_seqno);
- trace_nfsd_cb_offload(copy->cp_clp, &cbo->co_res.cb_stateid,
+ trace_nfsd_cb_offload(clp, &cbo->co_res.cb_stateid,
&cbo->co_fh, copy->cp_count, copy->nfserr);
nfsd4_try_run_cb(&cbo->co_cb);
}
@@ -1921,6 +2082,7 @@ static void nfsd4_send_cb_offload(struct nfsd4_copy *copy)
static int nfsd4_do_async_copy(void *data)
{
struct nfsd4_copy *copy = (struct nfsd4_copy *)data;
+ __be32 nfserr = nfs_ok;
trace_nfsd_copy_async(copy);
if (nfsd4_ssc_is_inter(copy)) {
@@ -1931,23 +2093,25 @@ static int nfsd4_do_async_copy(void *data)
if (IS_ERR(filp)) {
switch (PTR_ERR(filp)) {
case -EBADF:
- copy->nfserr = nfserr_wrong_type;
+ nfserr = nfserr_wrong_type;
break;
default:
- copy->nfserr = nfserr_offload_denied;
+ nfserr = nfserr_offload_denied;
}
/* ss_mnt will be unmounted by the laundromat */
goto do_callback;
}
- copy->nfserr = nfsd4_do_copy(copy, filp, copy->nf_dst->nf_file,
- false);
+ nfserr = nfsd4_do_copy(copy, filp, copy->nf_dst->nf_file,
+ false);
nfsd4_cleanup_inter_ssc(copy->ss_nsui, filp, copy->nf_dst);
} else {
- copy->nfserr = nfsd4_do_copy(copy, copy->nf_src->nf_file,
- copy->nf_dst->nf_file, false);
+ nfserr = nfsd4_do_copy(copy, copy->nf_src->nf_file,
+ copy->nf_dst->nf_file, false);
}
do_callback:
+ if (!test_bit(NFSD4_COPY_F_CB_ERROR, &copy->cp_flags))
+ copy->nfserr = nfserr;
/* The kthread exits forthwith. Ensure that a subsequent
* OFFLOAD_CANCEL won't try to kill it again. */
set_bit(NFSD4_COPY_F_STOPPED, &copy->cp_flags);
@@ -2271,7 +2435,7 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (status)
return status;
- status = check_attr_support(rqstp, cstate, verify->ve_bmval, NULL);
+ status = check_attr_support(cstate, verify->ve_bmval, NULL);
if (status)
return status;
@@ -2281,6 +2445,11 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (verify->ve_attrlen & 3)
return nfserr_inval;
+ /* The POSIX draft ACLs cannot be tested via (N)VERIFY. */
+ if (verify->ve_bmval[2] & (FATTR4_WORD2_POSIX_DEFAULT_ACL |
+ FATTR4_WORD2_POSIX_ACCESS_ACL))
+ return nfserr_inval;
+
/* count in words:
* bitmap_len(1) + bitmap(2) + attr_len(1) = 4
*/
@@ -3016,8 +3185,6 @@ nfsd4_proc_compound(struct svc_rqst *rqstp)
BUG_ON(cstate->replay_owner);
out:
cstate->status = status;
- /* Reset deferral mechanism for RPC deferrals */
- set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags);
return rpc_success;
}
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index d5e0f3a..f5cb067 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1253,7 +1253,7 @@ static void nfsd4_finalize_deleg_timestamps(struct nfs4_delegation *dp, struct f
if (ret) {
struct inode *inode = file_inode(f);
- pr_notice_ratelimited("Unable to update timestamps on inode %02x:%02x:%lu: %d\n",
+ pr_notice_ratelimited("nfsd: Unable to update timestamps on inode %02x:%02x:%lu: %d\n",
MAJOR(inode->i_sb->s_dev),
MINOR(inode->i_sb->s_dev),
inode->i_ino, ret);
@@ -2413,7 +2413,13 @@ static void __free_client(struct kref *k)
kmem_cache_free(client_slab, clp);
}
-static void drop_client(struct nfs4_client *clp)
+/**
+ * nfsd4_put_client - release a reference on an nfs4_client
+ * @clp: the client to be released
+ *
+ * When the last reference is released, the client is freed.
+ */
+void nfsd4_put_client(struct nfs4_client *clp)
{
kref_put(&clp->cl_nfsdfs.cl_ref, __free_client);
}
@@ -2435,7 +2441,7 @@ free_client(struct nfs4_client *clp)
clp->cl_nfsd_dentry = NULL;
wake_up_all(&expiry_wq);
}
- drop_client(clp);
+ nfsd4_put_client(clp);
}
/* must be called under the client_lock */
@@ -2833,7 +2839,7 @@ static int client_info_show(struct seq_file *m, void *v)
spin_unlock(&clp->cl_lock);
seq_puts(m, "\n");
- drop_client(clp);
+ nfsd4_put_client(clp);
return 0;
}
@@ -3099,7 +3105,7 @@ static int client_states_open(struct inode *inode, struct file *file)
ret = seq_open(file, &states_seq_ops);
if (ret) {
- drop_client(clp);
+ nfsd4_put_client(clp);
return ret;
}
s = file->private_data;
@@ -3113,7 +3119,7 @@ static int client_opens_release(struct inode *inode, struct file *file)
struct nfs4_client *clp = m->private;
/* XXX: alternatively, we could get/drop in seq start/stop */
- drop_client(clp);
+ nfsd4_put_client(clp);
return seq_release(inode, file);
}
@@ -3169,7 +3175,7 @@ static ssize_t client_ctl_write(struct file *file, const char __user *buf,
if (!clp)
return -ENXIO;
force_expire_client(clp);
- drop_client(clp);
+ nfsd4_put_client(clp);
return 7;
}
@@ -3204,7 +3210,7 @@ nfsd4_cb_recall_any_release(struct nfsd4_callback *cb)
{
struct nfs4_client *clp = cb->cb_clp;
- drop_client(clp);
+ nfsd4_put_client(clp);
}
static int
@@ -6353,7 +6359,8 @@ nfs4_open_delegation(struct svc_rqst *rqstp, struct nfsd4_open *open,
dp->dl_ctime = stat.ctime;
dp->dl_mtime = stat.mtime;
spin_lock(&f->f_lock);
- f->f_mode |= FMODE_NOCMTIME;
+ if (deleg_ts)
+ f->f_mode |= FMODE_NOCMTIME;
spin_unlock(&f->f_lock);
trace_nfsd_deleg_write(&dp->dl_stid.sc_stateid);
} else {
@@ -6637,14 +6644,14 @@ bool nfsd4_force_end_grace(struct nfsd_net *nn)
{
if (!nn->client_tracking_ops)
return false;
- spin_lock(&nn->client_lock);
- if (nn->grace_ended || !nn->client_tracking_active) {
- spin_unlock(&nn->client_lock);
+ if (READ_ONCE(nn->grace_ended))
return false;
- }
+ /* laundromat_work must be initialised now, though it might be disabled */
WRITE_ONCE(nn->grace_end_forced, true);
+ /* mod_delayed_work() doesn't queue work after
+ * nfs4_state_shutdown_net() has called disable_delayed_work_sync()
+ */
mod_delayed_work(laundry_wq, &nn->laundromat_work, 0);
- spin_unlock(&nn->client_lock);
return true;
}
@@ -8980,7 +8987,6 @@ static int nfs4_state_create_net(struct net *net)
nn->boot_time = ktime_get_real_seconds();
nn->grace_ended = false;
nn->grace_end_forced = false;
- nn->client_tracking_active = false;
nn->nfsd4_manager.block_opens = true;
INIT_LIST_HEAD(&nn->nfsd4_manager.list);
INIT_LIST_HEAD(&nn->client_lru);
@@ -8995,6 +9001,8 @@ static int nfs4_state_create_net(struct net *net)
INIT_LIST_HEAD(&nn->blocked_locks_lru);
INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main);
+ /* Make sure this cannot run until client tracking is initialised */
+ disable_delayed_work(&nn->laundromat_work);
INIT_WORK(&nn->nfsd_shrinker_work, nfsd4_state_shrinker_worker);
get_net(net);
@@ -9062,9 +9070,7 @@ nfs4_state_start_net(struct net *net)
locks_start_grace(net, &nn->nfsd4_manager);
nfsd4_client_tracking_init(net);
/* safe for laundromat to run now */
- spin_lock(&nn->client_lock);
- nn->client_tracking_active = true;
- spin_unlock(&nn->client_lock);
+ enable_delayed_work(&nn->laundromat_work);
if (nn->track_reclaim_completes && nn->reclaim_str_hashtbl_size == 0)
goto skip_grace;
printk(KERN_INFO "NFSD: starting %lld-second grace period (net %x)\n",
@@ -9113,10 +9119,7 @@ nfs4_state_shutdown_net(struct net *net)
shrinker_free(nn->nfsd_client_shrinker);
cancel_work_sync(&nn->nfsd_shrinker_work);
- spin_lock(&nn->client_lock);
- nn->client_tracking_active = false;
- spin_unlock(&nn->client_lock);
- cancel_delayed_work_sync(&nn->laundromat_work);
+ disable_delayed_work_sync(&nn->laundromat_work);
locks_end_grace(&nn->nfsd4_manager);
INIT_LIST_HEAD(&reaplist);
@@ -9520,8 +9523,10 @@ nfsd_get_dir_deleg(struct nfsd4_compound_state *cstate,
spin_unlock(&clp->cl_lock);
spin_unlock(&state_lock);
- if (!status)
+ if (!status) {
+ put_nfs4_file(fp);
return dp;
+ }
/* Something failed. Drop the lease and clean up the stid */
kernel_setlease(fp->fi_deleg_file->nf_file, F_UNLCK, NULL, (void **)&dp);
@@ -9529,5 +9534,6 @@ nfsd_get_dir_deleg(struct nfsd4_compound_state *cstate,
nfs4_put_stid(&dp->dl_stid);
out_delegees:
put_deleg_file(fp);
+ put_nfs4_file(fp);
return ERR_PTR(status);
}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 51ef97c..5172dbd 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -43,6 +43,7 @@
#include <linux/sunrpc/addr.h>
#include <linux/xattr.h>
#include <linux/vmalloc.h>
+#include <linux/nfsacl.h>
#include <uapi/linux/xattr.h>
@@ -377,10 +378,111 @@ nfsd4_decode_security_label(struct nfsd4_compoundargs *argp,
return nfs_ok;
}
+#ifdef CONFIG_NFSD_V4_POSIX_ACLS
+
+static short nfsd4_posixacetag4_to_tag(posixacetag4 tag)
+{
+ switch (tag) {
+ case POSIXACE4_TAG_USER_OBJ: return ACL_USER_OBJ;
+ case POSIXACE4_TAG_GROUP_OBJ: return ACL_GROUP_OBJ;
+ case POSIXACE4_TAG_USER: return ACL_USER;
+ case POSIXACE4_TAG_GROUP: return ACL_GROUP;
+ case POSIXACE4_TAG_MASK: return ACL_MASK;
+ case POSIXACE4_TAG_OTHER: return ACL_OTHER;
+ }
+ return ACL_OTHER;
+}
+
+static __be32
+nfsd4_decode_posixace4(struct nfsd4_compoundargs *argp,
+ struct posix_acl_entry *ace)
+{
+ posixaceperm4 perm;
+ __be32 *p, status;
+ posixacetag4 tag;
+ u32 len;
+
+ if (!xdrgen_decode_posixacetag4(argp->xdr, &tag))
+ return nfserr_bad_xdr;
+ ace->e_tag = nfsd4_posixacetag4_to_tag(tag);
+
+ if (!xdrgen_decode_posixaceperm4(argp->xdr, &perm))
+ return nfserr_bad_xdr;
+ if (perm & ~S_IRWXO)
+ return nfserr_bad_xdr;
+ ace->e_perm = perm;
+
+ if (xdr_stream_decode_u32(argp->xdr, &len) < 0)
+ return nfserr_bad_xdr;
+ p = xdr_inline_decode(argp->xdr, len);
+ if (!p)
+ return nfserr_bad_xdr;
+ switch (tag) {
+ case POSIXACE4_TAG_USER:
+ if (len > 0)
+ status = nfsd_map_name_to_uid(argp->rqstp,
+ (char *)p, len, &ace->e_uid);
+ else
+ status = nfserr_bad_xdr;
+ break;
+ case POSIXACE4_TAG_GROUP:
+ if (len > 0)
+ status = nfsd_map_name_to_gid(argp->rqstp,
+ (char *)p, len, &ace->e_gid);
+ else
+ status = nfserr_bad_xdr;
+ break;
+ default:
+ status = nfs_ok;
+ }
+
+ return status;
+}
+
+static noinline __be32
+nfsd4_decode_posixacl(struct nfsd4_compoundargs *argp, struct posix_acl **acl)
+{
+ struct posix_acl_entry *ace;
+ __be32 status;
+ u32 count;
+
+ if (xdr_stream_decode_u32(argp->xdr, &count) < 0)
+ return nfserr_bad_xdr;
+
+ *acl = posix_acl_alloc(count, GFP_KERNEL);
+ if (*acl == NULL)
+ return nfserr_resource;
+
+ (*acl)->a_count = count;
+ for (ace = (*acl)->a_entries; ace < (*acl)->a_entries + count; ace++) {
+ status = nfsd4_decode_posixace4(argp, ace);
+ if (status) {
+ posix_acl_release(*acl);
+ *acl = NULL;
+ return status;
+ }
+ }
+
+ /*
+ * posix_acl_valid() requires the ACEs to be sorted.
+ * If they are already sorted, sort_pacl_range() will return
+ * after one pass through the ACEs, since it implements bubble sort.
+ * Note that a count == 0 is used to delete a POSIX ACL and a count
+ * of 1 or 2 will always be found invalid by posix_acl_valid().
+ */
+ if (count >= 3)
+ sort_pacl_range(*acl, 0, count - 1);
+
+ return nfs_ok;
+}
+
+#endif /* CONFIG_NFSD_V4_POSIX_ACLS */
+
static __be32
nfsd4_decode_fattr4(struct nfsd4_compoundargs *argp, u32 *bmval, u32 bmlen,
struct iattr *iattr, struct nfs4_acl **acl,
- struct xdr_netobj *label, int *umask)
+ struct xdr_netobj *label, int *umask,
+ struct posix_acl **dpaclp, struct posix_acl **paclp)
{
unsigned int starting_pos;
u32 attrlist4_count;
@@ -543,9 +645,40 @@ nfsd4_decode_fattr4(struct nfsd4_compoundargs *argp, u32 *bmval, u32 bmlen,
ATTR_MTIME | ATTR_MTIME_SET | ATTR_DELEG;
}
+ *dpaclp = NULL;
+ *paclp = NULL;
+#ifdef CONFIG_NFSD_V4_POSIX_ACLS
+ if (bmval[2] & FATTR4_WORD2_POSIX_DEFAULT_ACL) {
+ struct posix_acl *dpacl;
+
+ status = nfsd4_decode_posixacl(argp, &dpacl);
+ if (status)
+ return status;
+ *dpaclp = dpacl;
+ }
+ if (bmval[2] & FATTR4_WORD2_POSIX_ACCESS_ACL) {
+ struct posix_acl *pacl;
+
+ status = nfsd4_decode_posixacl(argp, &pacl);
+ if (status) {
+ posix_acl_release(*dpaclp);
+ *dpaclp = NULL;
+ return status;
+ }
+ *paclp = pacl;
+ }
+#endif /* CONFIG_NFSD_V4_POSIX_ACLS */
+
/* request sanity: did attrlist4 contain the expected number of words? */
- if (attrlist4_count != xdr_stream_pos(argp->xdr) - starting_pos)
+ if (attrlist4_count != xdr_stream_pos(argp->xdr) - starting_pos) {
+#ifdef CONFIG_NFSD_V4_POSIX_ACLS
+ posix_acl_release(*dpaclp);
+ posix_acl_release(*paclp);
+ *dpaclp = NULL;
+ *paclp = NULL;
+#endif
return nfserr_bad_xdr;
+ }
return nfs_ok;
}
@@ -849,7 +982,8 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
status = nfsd4_decode_fattr4(argp, create->cr_bmval,
ARRAY_SIZE(create->cr_bmval),
&create->cr_iattr, &create->cr_acl,
- &create->cr_label, &create->cr_umask);
+ &create->cr_label, &create->cr_umask,
+ &create->cr_dpacl, &create->cr_pacl);
if (status)
return status;
@@ -1000,7 +1134,8 @@ nfsd4_decode_createhow4(struct nfsd4_compoundargs *argp, struct nfsd4_open *open
status = nfsd4_decode_fattr4(argp, open->op_bmval,
ARRAY_SIZE(open->op_bmval),
&open->op_iattr, &open->op_acl,
- &open->op_label, &open->op_umask);
+ &open->op_label, &open->op_umask,
+ &open->op_dpacl, &open->op_pacl);
if (status)
return status;
break;
@@ -1018,7 +1153,8 @@ nfsd4_decode_createhow4(struct nfsd4_compoundargs *argp, struct nfsd4_open *open
status = nfsd4_decode_fattr4(argp, open->op_bmval,
ARRAY_SIZE(open->op_bmval),
&open->op_iattr, &open->op_acl,
- &open->op_label, &open->op_umask);
+ &open->op_label, &open->op_umask,
+ &open->op_dpacl, &open->op_pacl);
if (status)
return status;
break;
@@ -1345,7 +1481,8 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
return nfsd4_decode_fattr4(argp, setattr->sa_bmval,
ARRAY_SIZE(setattr->sa_bmval),
&setattr->sa_iattr, &setattr->sa_acl,
- &setattr->sa_label, NULL);
+ &setattr->sa_label, NULL, &setattr->sa_dpacl,
+ &setattr->sa_pacl);
}
static __be32
@@ -2849,6 +2986,89 @@ nfsd4_encode_security_label(struct xdr_stream *xdr, struct svc_rqst *rqstp,
{ return 0; }
#endif
+#ifdef CONFIG_NFSD_V4_POSIX_ACLS
+
+static int nfsd4_posix_tagtotype(short tag)
+{
+ switch (tag) {
+ case ACL_USER_OBJ: return POSIXACE4_TAG_USER_OBJ;
+ case ACL_GROUP_OBJ: return POSIXACE4_TAG_GROUP_OBJ;
+ case ACL_USER: return POSIXACE4_TAG_USER;
+ case ACL_GROUP: return POSIXACE4_TAG_GROUP;
+ case ACL_MASK: return POSIXACE4_TAG_MASK;
+ case ACL_OTHER: return POSIXACE4_TAG_OTHER;
+ default: return -EINVAL;
+ }
+}
+
+static __be32
+nfsd4_encode_posixace4(struct xdr_stream *xdr, struct svc_rqst *rqstp,
+ struct posix_acl_entry *acep)
+{
+ __be32 status;
+ int type;
+
+ type = nfsd4_posix_tagtotype(acep->e_tag);
+ if (type < 0)
+ return nfserr_resource;
+ if (!xdrgen_encode_posixacetag4(xdr, type))
+ return nfserr_resource;
+ if (!xdrgen_encode_posixaceperm4(xdr, acep->e_perm))
+ return nfserr_resource;
+
+ /* who */
+ switch (acep->e_tag) {
+ case ACL_USER_OBJ:
+ case ACL_GROUP_OBJ:
+ case ACL_MASK:
+ case ACL_OTHER:
+ if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT)
+ return nfserr_resource;
+ break;
+ case ACL_USER:
+ status = nfsd4_encode_user(xdr, rqstp, acep->e_uid);
+ if (status != nfs_ok)
+ return status;
+ break;
+ case ACL_GROUP:
+ status = nfsd4_encode_group(xdr, rqstp, acep->e_gid);
+ if (status != nfs_ok)
+ return status;
+ break;
+ default:
+ return nfserr_resource;
+ }
+ return nfs_ok;
+}
+
+static __be32
+nfsd4_encode_posixacl(struct xdr_stream *xdr, struct svc_rqst *rqstp,
+ struct posix_acl *acl)
+{
+ __be32 status;
+ int i;
+
+ if (!acl) {
+ if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT)
+ return nfserr_resource;
+ return nfs_ok;
+ }
+
+ if (acl->a_count > NFS_ACL_MAX_ENTRIES)
+ return nfserr_resource;
+ if (xdr_stream_encode_u32(xdr, acl->a_count) != XDR_UNIT)
+ return nfserr_resource;
+ for (i = 0; i < acl->a_count; i++) {
+ status = nfsd4_encode_posixace4(xdr, rqstp, &acl->a_entries[i]);
+ if (status != nfs_ok)
+ return status;
+ }
+
+ return nfs_ok;
+}
+
+#endif /* CONFIG_NFSD_V4_POSIX_ACLS */
+
static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *bmval2, u32 *rdattr_err)
{
/* As per referral draft: */
@@ -2930,6 +3150,10 @@ struct nfsd4_fattr_args {
#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
struct lsm_context context;
#endif
+#ifdef CONFIG_NFSD_V4_POSIX_ACLS
+ struct posix_acl *dpacl;
+ struct posix_acl *pacl;
+#endif
u32 rdattr_err;
bool contextsupport;
bool ignore_crossmnt;
@@ -3470,6 +3694,42 @@ static __be32 nfsd4_encode_fattr4_open_arguments(struct xdr_stream *xdr,
return nfs_ok;
}
+#ifdef CONFIG_NFSD_V4_POSIX_ACLS
+
+static __be32 nfsd4_encode_fattr4_acl_trueform(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ aclmodel4 trueform = ACL_MODEL_NONE;
+
+ if (IS_POSIXACL(d_inode(args->dentry)))
+ trueform = ACL_MODEL_POSIX_DRAFT;
+ if (!xdrgen_encode_aclmodel4(xdr, trueform))
+ return nfserr_resource;
+ return nfs_ok;
+}
+
+static __be32 nfsd4_encode_fattr4_acl_trueform_scope(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ if (!xdrgen_encode_aclscope4(xdr, ACL_SCOPE_FILE_SYSTEM))
+ return nfserr_resource;
+ return nfs_ok;
+}
+
+static __be32 nfsd4_encode_fattr4_posix_default_acl(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_posixacl(xdr, args->rqstp, args->dpacl);
+}
+
+static __be32 nfsd4_encode_fattr4_posix_access_acl(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_posixacl(xdr, args->rqstp, args->pacl);
+}
+
+#endif /* CONFIG_NFSD_V4_POSIX_ACLS */
+
static const nfsd4_enc_attr nfsd4_enc_fattr4_encode_ops[] = {
[FATTR4_SUPPORTED_ATTRS] = nfsd4_encode_fattr4_supported_attrs,
[FATTR4_TYPE] = nfsd4_encode_fattr4_type,
@@ -3573,6 +3833,22 @@ static const nfsd4_enc_attr nfsd4_enc_fattr4_encode_ops[] = {
[FATTR4_TIME_DELEG_ACCESS] = nfsd4_encode_fattr4__inval,
[FATTR4_TIME_DELEG_MODIFY] = nfsd4_encode_fattr4__inval,
[FATTR4_OPEN_ARGUMENTS] = nfsd4_encode_fattr4_open_arguments,
+
+ /* Reserved */
+ [87] = nfsd4_encode_fattr4__inval,
+ [88] = nfsd4_encode_fattr4__inval,
+
+#ifdef CONFIG_NFSD_V4_POSIX_ACLS
+ [FATTR4_ACL_TRUEFORM] = nfsd4_encode_fattr4_acl_trueform,
+ [FATTR4_ACL_TRUEFORM_SCOPE] = nfsd4_encode_fattr4_acl_trueform_scope,
+ [FATTR4_POSIX_DEFAULT_ACL] = nfsd4_encode_fattr4_posix_default_acl,
+ [FATTR4_POSIX_ACCESS_ACL] = nfsd4_encode_fattr4_posix_access_acl,
+#else
+ [FATTR4_ACL_TRUEFORM] = nfsd4_encode_fattr4__noop,
+ [FATTR4_ACL_TRUEFORM_SCOPE] = nfsd4_encode_fattr4__noop,
+ [FATTR4_POSIX_DEFAULT_ACL] = nfsd4_encode_fattr4__noop,
+ [FATTR4_POSIX_ACCESS_ACL] = nfsd4_encode_fattr4__noop,
+#endif
};
/*
@@ -3613,6 +3889,10 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
args.context.context = NULL;
#endif
+#ifdef CONFIG_NFSD_V4_POSIX_ACLS
+ args.dpacl = NULL;
+ args.pacl = NULL;
+#endif
/*
* Make a local copy of the attribute bitmap that can be modified.
@@ -3719,6 +3999,55 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
}
#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */
+#ifdef CONFIG_NFSD_V4_POSIX_ACLS
+ if (attrmask[2] & FATTR4_WORD2_POSIX_DEFAULT_ACL) {
+ struct inode *inode = d_inode(dentry);
+ struct posix_acl *dpacl;
+
+ if (S_ISDIR(inode->i_mode)) {
+ dpacl = get_inode_acl(inode, ACL_TYPE_DEFAULT);
+ if (IS_ERR(dpacl)) {
+ switch (PTR_ERR(dpacl)) {
+ case -EOPNOTSUPP:
+ attrmask[2] &= ~FATTR4_WORD2_POSIX_DEFAULT_ACL;
+ break;
+ case -EINVAL:
+ status = nfserr_attrnotsupp;
+ goto out;
+ default:
+ err = PTR_ERR(dpacl);
+ goto out_nfserr;
+ }
+ } else {
+ args.dpacl = dpacl;
+ }
+ }
+ }
+ if (attrmask[2] & FATTR4_WORD2_POSIX_ACCESS_ACL) {
+ struct inode *inode = d_inode(dentry);
+ struct posix_acl *pacl;
+
+ pacl = get_inode_acl(inode, ACL_TYPE_ACCESS);
+ if (!pacl)
+ pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
+ if (IS_ERR(pacl)) {
+ switch (PTR_ERR(pacl)) {
+ case -EOPNOTSUPP:
+ attrmask[2] &= ~FATTR4_WORD2_POSIX_ACCESS_ACL;
+ break;
+ case -EINVAL:
+ status = nfserr_attrnotsupp;
+ goto out;
+ default:
+ err = PTR_ERR(pacl);
+ goto out_nfserr;
+ }
+ } else {
+ args.pacl = pacl;
+ }
+ }
+#endif /* CONFIG_NFSD_V4_POSIX_ACLS */
+
/* attrmask */
status = nfsd4_encode_bitmap4(xdr, attrmask[0], attrmask[1],
attrmask[2]);
@@ -3742,6 +4071,12 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
status = nfs_ok;
out:
+#ifdef CONFIG_NFSD_V4_POSIX_ACLS
+ if (args.dpacl)
+ posix_acl_release(args.dpacl);
+ if (args.pacl)
+ posix_acl_release(args.pacl);
+#endif /* CONFIG_NFSD_V4_POSIX_ACLS */
#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
if (args.context.context)
security_release_secctx(&args.context);
@@ -6013,6 +6348,22 @@ nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
args->ops = args->iops;
args->rqstp = rqstp;
+ /*
+ * NFSv4 operation decoders can invoke svc cache lookups
+ * that trigger svc_defer() when RQ_USEDEFERRAL is set,
+ * setting RQ_DROPME. This creates two problems:
+ *
+ * 1. Non-idempotency: Compounds make it too hard to avoid
+ * problems if a request is deferred and replayed.
+ *
+ * 2. Session slot leakage (NFSv4.1+): If RQ_DROPME is set
+ * during decode but SEQUENCE executes successfully, the
+ * session slot will be marked INUSE. The request is then
+ * dropped before encoding, so the slot is never released,
+ * rendering it permanently unusable by the client.
+ */
+ clear_bit(RQ_USEDEFERRAL, &rqstp->rq_flags);
+
return nfsd4_decode_compound(args);
}
diff --git a/fs/nfsd/nfs4xdr_gen.c b/fs/nfsd/nfs4xdr_gen.c
index a17b5d8..8244970 100644
--- a/fs/nfsd/nfs4xdr_gen.c
+++ b/fs/nfsd/nfs4xdr_gen.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
// Generated by xdrgen. Manual edits will be lost.
// XDR specification file: ../../Documentation/sunrpc/xdr/nfs4_1.x
-// XDR specification modification time: Mon Oct 14 09:10:13 2024
+// XDR specification modification time: Thu Jan 8 23:12:07 2026
#include <linux/sunrpc/svc.h>
@@ -11,13 +11,13 @@ static bool __maybe_unused
xdrgen_decode_int64_t(struct xdr_stream *xdr, int64_t *ptr)
{
return xdrgen_decode_hyper(xdr, ptr);
-};
+}
static bool __maybe_unused
xdrgen_decode_uint32_t(struct xdr_stream *xdr, uint32_t *ptr)
{
return xdrgen_decode_unsigned_int(xdr, ptr);
-};
+}
static bool __maybe_unused
xdrgen_decode_bitmap4(struct xdr_stream *xdr, bitmap4 *ptr)
@@ -28,7 +28,31 @@ xdrgen_decode_bitmap4(struct xdr_stream *xdr, bitmap4 *ptr)
if (!xdrgen_decode_uint32_t(xdr, &ptr->element[i]))
return false;
return true;
-};
+}
+
+static bool __maybe_unused
+xdrgen_decode_utf8string(struct xdr_stream *xdr, utf8string *ptr)
+{
+ return xdrgen_decode_opaque(xdr, ptr, 0);
+}
+
+static bool __maybe_unused
+xdrgen_decode_utf8str_cis(struct xdr_stream *xdr, utf8str_cis *ptr)
+{
+ return xdrgen_decode_utf8string(xdr, ptr);
+}
+
+static bool __maybe_unused
+xdrgen_decode_utf8str_cs(struct xdr_stream *xdr, utf8str_cs *ptr)
+{
+ return xdrgen_decode_utf8string(xdr, ptr);
+}
+
+static bool __maybe_unused
+xdrgen_decode_utf8str_mixed(struct xdr_stream *xdr, utf8str_mixed *ptr)
+{
+ return xdrgen_decode_utf8string(xdr, ptr);
+}
static bool __maybe_unused
xdrgen_decode_nfstime4(struct xdr_stream *xdr, struct nfstime4 *ptr)
@@ -38,13 +62,13 @@ xdrgen_decode_nfstime4(struct xdr_stream *xdr, struct nfstime4 *ptr)
if (!xdrgen_decode_uint32_t(xdr, &ptr->nseconds))
return false;
return true;
-};
+}
static bool __maybe_unused
xdrgen_decode_fattr4_offline(struct xdr_stream *xdr, fattr4_offline *ptr)
{
return xdrgen_decode_bool(xdr, ptr);
-};
+}
static bool __maybe_unused
xdrgen_decode_open_arguments4(struct xdr_stream *xdr, struct open_arguments4 *ptr)
@@ -60,7 +84,7 @@ xdrgen_decode_open_arguments4(struct xdr_stream *xdr, struct open_arguments4 *pt
if (!xdrgen_decode_bitmap4(xdr, &ptr->oa_create_mode))
return false;
return true;
-};
+}
static bool __maybe_unused
xdrgen_decode_open_args_share_access4(struct xdr_stream *xdr, open_args_share_access4 *ptr)
@@ -69,6 +93,15 @@ xdrgen_decode_open_args_share_access4(struct xdr_stream *xdr, open_args_share_ac
if (xdr_stream_decode_u32(xdr, &val) < 0)
return false;
+ /* Compiler may optimize to a range check for dense enums */
+ switch (val) {
+ case OPEN_ARGS_SHARE_ACCESS_READ:
+ case OPEN_ARGS_SHARE_ACCESS_WRITE:
+ case OPEN_ARGS_SHARE_ACCESS_BOTH:
+ break;
+ default:
+ return false;
+ }
*ptr = val;
return true;
}
@@ -80,6 +113,16 @@ xdrgen_decode_open_args_share_deny4(struct xdr_stream *xdr, open_args_share_deny
if (xdr_stream_decode_u32(xdr, &val) < 0)
return false;
+ /* Compiler may optimize to a range check for dense enums */
+ switch (val) {
+ case OPEN_ARGS_SHARE_DENY_NONE:
+ case OPEN_ARGS_SHARE_DENY_READ:
+ case OPEN_ARGS_SHARE_DENY_WRITE:
+ case OPEN_ARGS_SHARE_DENY_BOTH:
+ break;
+ default:
+ return false;
+ }
*ptr = val;
return true;
}
@@ -91,6 +134,19 @@ xdrgen_decode_open_args_share_access_want4(struct xdr_stream *xdr, open_args_sha
if (xdr_stream_decode_u32(xdr, &val) < 0)
return false;
+ /* Compiler may optimize to a range check for dense enums */
+ switch (val) {
+ case OPEN_ARGS_SHARE_ACCESS_WANT_ANY_DELEG:
+ case OPEN_ARGS_SHARE_ACCESS_WANT_NO_DELEG:
+ case OPEN_ARGS_SHARE_ACCESS_WANT_CANCEL:
+ case OPEN_ARGS_SHARE_ACCESS_WANT_SIGNAL_DELEG_WHEN_RESRC_AVAIL:
+ case OPEN_ARGS_SHARE_ACCESS_WANT_PUSH_DELEG_WHEN_UNCONTENDED:
+ case OPEN_ARGS_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS:
+ case OPEN_ARGS_SHARE_ACCESS_WANT_OPEN_XOR_DELEGATION:
+ break;
+ default:
+ return false;
+ }
*ptr = val;
return true;
}
@@ -102,6 +158,19 @@ xdrgen_decode_open_args_open_claim4(struct xdr_stream *xdr, open_args_open_claim
if (xdr_stream_decode_u32(xdr, &val) < 0)
return false;
+ /* Compiler may optimize to a range check for dense enums */
+ switch (val) {
+ case OPEN_ARGS_OPEN_CLAIM_NULL:
+ case OPEN_ARGS_OPEN_CLAIM_PREVIOUS:
+ case OPEN_ARGS_OPEN_CLAIM_DELEGATE_CUR:
+ case OPEN_ARGS_OPEN_CLAIM_DELEGATE_PREV:
+ case OPEN_ARGS_OPEN_CLAIM_FH:
+ case OPEN_ARGS_OPEN_CLAIM_DELEG_CUR_FH:
+ case OPEN_ARGS_OPEN_CLAIM_DELEG_PREV_FH:
+ break;
+ default:
+ return false;
+ }
*ptr = val;
return true;
}
@@ -113,6 +182,16 @@ xdrgen_decode_open_args_createmode4(struct xdr_stream *xdr, open_args_createmode
if (xdr_stream_decode_u32(xdr, &val) < 0)
return false;
+ /* Compiler may optimize to a range check for dense enums */
+ switch (val) {
+ case OPEN_ARGS_CREATEMODE_UNCHECKED4:
+ case OPEN_ARGS_CREATE_MODE_GUARDED:
+ case OPEN_ARGS_CREATEMODE_EXCLUSIVE4:
+ case OPEN_ARGS_CREATE_MODE_EXCLUSIVE4_1:
+ break;
+ default:
+ return false;
+ }
*ptr = val;
return true;
}
@@ -121,19 +200,28 @@ bool
xdrgen_decode_fattr4_open_arguments(struct xdr_stream *xdr, fattr4_open_arguments *ptr)
{
return xdrgen_decode_open_arguments4(xdr, ptr);
-};
+}
+
+/*
+ * Determine what OPEN supports.
+ */
bool
xdrgen_decode_fattr4_time_deleg_access(struct xdr_stream *xdr, fattr4_time_deleg_access *ptr)
{
return xdrgen_decode_nfstime4(xdr, ptr);
-};
+}
bool
xdrgen_decode_fattr4_time_deleg_modify(struct xdr_stream *xdr, fattr4_time_deleg_modify *ptr)
{
return xdrgen_decode_nfstime4(xdr, ptr);
-};
+}
+
+/*
+ * New RECOMMENDED Attribute for
+ * delegation caching of times
+ */
static bool __maybe_unused
xdrgen_decode_open_delegation_type4(struct xdr_stream *xdr, open_delegation_type4 *ptr)
@@ -142,21 +230,152 @@ xdrgen_decode_open_delegation_type4(struct xdr_stream *xdr, open_delegation_type
if (xdr_stream_decode_u32(xdr, &val) < 0)
return false;
+ /* Compiler may optimize to a range check for dense enums */
+ switch (val) {
+ case OPEN_DELEGATE_NONE:
+ case OPEN_DELEGATE_READ:
+ case OPEN_DELEGATE_WRITE:
+ case OPEN_DELEGATE_NONE_EXT:
+ case OPEN_DELEGATE_READ_ATTRS_DELEG:
+ case OPEN_DELEGATE_WRITE_ATTRS_DELEG:
+ break;
+ default:
+ return false;
+ }
*ptr = val;
return true;
}
+bool
+xdrgen_decode_aclmodel4(struct xdr_stream *xdr, aclmodel4 *ptr)
+{
+ u32 val;
+
+ if (xdr_stream_decode_u32(xdr, &val) < 0)
+ return false;
+ /* Compiler may optimize to a range check for dense enums */
+ switch (val) {
+ case ACL_MODEL_NFS4:
+ case ACL_MODEL_POSIX_DRAFT:
+ case ACL_MODEL_NONE:
+ break;
+ default:
+ return false;
+ }
+ *ptr = val;
+ return true;
+}
+
+bool
+xdrgen_decode_aclscope4(struct xdr_stream *xdr, aclscope4 *ptr)
+{
+ u32 val;
+
+ if (xdr_stream_decode_u32(xdr, &val) < 0)
+ return false;
+ /* Compiler may optimize to a range check for dense enums */
+ switch (val) {
+ case ACL_SCOPE_FILE_OBJECT:
+ case ACL_SCOPE_FILE_SYSTEM:
+ case ACL_SCOPE_SERVER:
+ break;
+ default:
+ return false;
+ }
+ *ptr = val;
+ return true;
+}
+
+bool
+xdrgen_decode_posixacetag4(struct xdr_stream *xdr, posixacetag4 *ptr)
+{
+ u32 val;
+
+ if (xdr_stream_decode_u32(xdr, &val) < 0)
+ return false;
+ /* Compiler may optimize to a range check for dense enums */
+ switch (val) {
+ case POSIXACE4_TAG_USER_OBJ:
+ case POSIXACE4_TAG_USER:
+ case POSIXACE4_TAG_GROUP_OBJ:
+ case POSIXACE4_TAG_GROUP:
+ case POSIXACE4_TAG_MASK:
+ case POSIXACE4_TAG_OTHER:
+ break;
+ default:
+ return false;
+ }
+ *ptr = val;
+ return true;
+}
+
+bool
+xdrgen_decode_posixaceperm4(struct xdr_stream *xdr, posixaceperm4 *ptr)
+{
+ return xdrgen_decode_uint32_t(xdr, ptr);
+}
+
+static bool __maybe_unused
+xdrgen_decode_posixace4(struct xdr_stream *xdr, struct posixace4 *ptr)
+{
+ if (!xdrgen_decode_posixacetag4(xdr, &ptr->tag))
+ return false;
+ if (!xdrgen_decode_posixaceperm4(xdr, &ptr->perm))
+ return false;
+ if (!xdrgen_decode_utf8str_mixed(xdr, &ptr->who))
+ return false;
+ return true;
+}
+
+static bool __maybe_unused
+xdrgen_decode_fattr4_acl_trueform(struct xdr_stream *xdr, fattr4_acl_trueform *ptr)
+{
+ return xdrgen_decode_aclmodel4(xdr, ptr);
+}
+
+static bool __maybe_unused
+xdrgen_decode_fattr4_acl_trueform_scope(struct xdr_stream *xdr, fattr4_acl_trueform_scope *ptr)
+{
+ return xdrgen_decode_aclscope4(xdr, ptr);
+}
+
+static bool __maybe_unused
+xdrgen_decode_fattr4_posix_default_acl(struct xdr_stream *xdr, fattr4_posix_default_acl *ptr)
+{
+ if (xdr_stream_decode_u32(xdr, &ptr->count) < 0)
+ return false;
+ for (u32 i = 0; i < ptr->count; i++)
+ if (!xdrgen_decode_posixace4(xdr, &ptr->element[i]))
+ return false;
+ return true;
+}
+
+static bool __maybe_unused
+xdrgen_decode_fattr4_posix_access_acl(struct xdr_stream *xdr, fattr4_posix_access_acl *ptr)
+{
+ if (xdr_stream_decode_u32(xdr, &ptr->count) < 0)
+ return false;
+ for (u32 i = 0; i < ptr->count; i++)
+ if (!xdrgen_decode_posixace4(xdr, &ptr->element[i]))
+ return false;
+ return true;
+}
+
+/*
+ * New for POSIX ACL extension
+ */
+
static bool __maybe_unused
xdrgen_encode_int64_t(struct xdr_stream *xdr, const int64_t value)
{
return xdrgen_encode_hyper(xdr, value);
-};
+}
static bool __maybe_unused
xdrgen_encode_uint32_t(struct xdr_stream *xdr, const uint32_t value)
{
return xdrgen_encode_unsigned_int(xdr, value);
-};
+}
static bool __maybe_unused
xdrgen_encode_bitmap4(struct xdr_stream *xdr, const bitmap4 value)
@@ -167,7 +386,31 @@ xdrgen_encode_bitmap4(struct xdr_stream *xdr, const bitmap4 value)
if (!xdrgen_encode_uint32_t(xdr, value.element[i]))
return false;
return true;
-};
+}
+
+static bool __maybe_unused
+xdrgen_encode_utf8string(struct xdr_stream *xdr, const utf8string value)
+{
+ return xdr_stream_encode_opaque(xdr, value.data, value.len) >= 0;
+}
+
+static bool __maybe_unused
+xdrgen_encode_utf8str_cis(struct xdr_stream *xdr, const utf8str_cis value)
+{
+ return xdrgen_encode_utf8string(xdr, value);
+}
+
+static bool __maybe_unused
+xdrgen_encode_utf8str_cs(struct xdr_stream *xdr, const utf8str_cs value)
+{
+ return xdrgen_encode_utf8string(xdr, value);
+}
+
+static bool __maybe_unused
+xdrgen_encode_utf8str_mixed(struct xdr_stream *xdr, const utf8str_mixed value)
+{
+ return xdrgen_encode_utf8string(xdr, value);
+}
static bool __maybe_unused
xdrgen_encode_nfstime4(struct xdr_stream *xdr, const struct nfstime4 *value)
@@ -177,13 +420,13 @@ xdrgen_encode_nfstime4(struct xdr_stream *xdr, const struct nfstime4 *value)
if (!xdrgen_encode_uint32_t(xdr, value->nseconds))
return false;
return true;
-};
+}
static bool __maybe_unused
xdrgen_encode_fattr4_offline(struct xdr_stream *xdr, const fattr4_offline value)
{
return xdrgen_encode_bool(xdr, value);
-};
+}
static bool __maybe_unused
xdrgen_encode_open_arguments4(struct xdr_stream *xdr, const struct open_arguments4 *value)
@@ -199,7 +442,7 @@ xdrgen_encode_open_arguments4(struct xdr_stream *xdr, const struct open_argument
if (!xdrgen_encode_bitmap4(xdr, value->oa_create_mode))
return false;
return true;
-};
+}
static bool __maybe_unused
xdrgen_encode_open_args_share_access4(struct xdr_stream *xdr, open_args_share_access4 value)
@@ -235,22 +478,92 @@ bool
xdrgen_encode_fattr4_open_arguments(struct xdr_stream *xdr, const fattr4_open_arguments *value)
{
return xdrgen_encode_open_arguments4(xdr, value);
-};
+}
bool
xdrgen_encode_fattr4_time_deleg_access(struct xdr_stream *xdr, const fattr4_time_deleg_access *value)
{
return xdrgen_encode_nfstime4(xdr, value);
-};
+}
bool
xdrgen_encode_fattr4_time_deleg_modify(struct xdr_stream *xdr, const fattr4_time_deleg_modify *value)
{
return xdrgen_encode_nfstime4(xdr, value);
-};
+}
static bool __maybe_unused
xdrgen_encode_open_delegation_type4(struct xdr_stream *xdr, open_delegation_type4 value)
{
return xdr_stream_encode_u32(xdr, value) == XDR_UNIT;
}
+
+bool
+xdrgen_encode_aclmodel4(struct xdr_stream *xdr, aclmodel4 value)
+{
+ return xdr_stream_encode_u32(xdr, value) == XDR_UNIT;
+}
+
+bool
+xdrgen_encode_aclscope4(struct xdr_stream *xdr, aclscope4 value)
+{
+ return xdr_stream_encode_u32(xdr, value) == XDR_UNIT;
+}
+
+bool
+xdrgen_encode_posixacetag4(struct xdr_stream *xdr, posixacetag4 value)
+{
+ return xdr_stream_encode_u32(xdr, value) == XDR_UNIT;
+}
+
+bool
+xdrgen_encode_posixaceperm4(struct xdr_stream *xdr, const posixaceperm4 value)
+{
+ return xdrgen_encode_uint32_t(xdr, value);
+}
+
+static bool __maybe_unused
+xdrgen_encode_posixace4(struct xdr_stream *xdr, const struct posixace4 *value)
+{
+ if (!xdrgen_encode_posixacetag4(xdr, value->tag))
+ return false;
+ if (!xdrgen_encode_posixaceperm4(xdr, value->perm))
+ return false;
+ if (!xdrgen_encode_utf8str_mixed(xdr, value->who))
+ return false;
+ return true;
+}
+
+static bool __maybe_unused
+xdrgen_encode_fattr4_acl_trueform(struct xdr_stream *xdr, const fattr4_acl_trueform value)
+{
+ return xdrgen_encode_aclmodel4(xdr, value);
+}
+
+static bool __maybe_unused
+xdrgen_encode_fattr4_acl_trueform_scope(struct xdr_stream *xdr, const fattr4_acl_trueform_scope value)
+{
+ return xdrgen_encode_aclscope4(xdr, value);
+}
+
+static bool __maybe_unused
+xdrgen_encode_fattr4_posix_default_acl(struct xdr_stream *xdr, const fattr4_posix_default_acl value)
+{
+ if (xdr_stream_encode_u32(xdr, value.count) != XDR_UNIT)
+ return false;
+ for (u32 i = 0; i < value.count; i++)
+ if (!xdrgen_encode_posixace4(xdr, &value.element[i]))
+ return false;
+ return true;
+}
+
+static bool __maybe_unused
+xdrgen_encode_fattr4_posix_access_acl(struct xdr_stream *xdr, const fattr4_posix_access_acl value)
+{
+ if (xdr_stream_encode_u32(xdr, value.count) != XDR_UNIT)
+ return false;
+ for (u32 i = 0; i < value.count; i++)
+ if (!xdrgen_encode_posixace4(xdr, &value.element[i]))
+ return false;
+ return true;
+}
diff --git a/fs/nfsd/nfs4xdr_gen.h b/fs/nfsd/nfs4xdr_gen.h
index 41a0033b..1c487f1 100644
--- a/fs/nfsd/nfs4xdr_gen.h
+++ b/fs/nfsd/nfs4xdr_gen.h
@@ -1,7 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* Generated by xdrgen. Manual edits will be lost. */
/* XDR specification file: ../../Documentation/sunrpc/xdr/nfs4_1.x */
-/* XDR specification modification time: Mon Oct 14 09:10:13 2024 */
+/* XDR specification modification time: Thu Jan 8 23:12:07 2026 */
#ifndef _LINUX_XDRGEN_NFS4_1_DECL_H
#define _LINUX_XDRGEN_NFS4_1_DECL_H
@@ -21,5 +21,15 @@ bool xdrgen_encode_fattr4_time_deleg_access(struct xdr_stream *xdr, const fattr4
bool xdrgen_decode_fattr4_time_deleg_modify(struct xdr_stream *xdr, fattr4_time_deleg_modify *ptr);
bool xdrgen_encode_fattr4_time_deleg_modify(struct xdr_stream *xdr, const fattr4_time_deleg_modify *value);
+bool xdrgen_decode_aclmodel4(struct xdr_stream *xdr, aclmodel4 *ptr);
+bool xdrgen_encode_aclmodel4(struct xdr_stream *xdr, aclmodel4 value);
+bool xdrgen_decode_aclscope4(struct xdr_stream *xdr, aclscope4 *ptr);
+bool xdrgen_encode_aclscope4(struct xdr_stream *xdr, aclscope4 value);
+bool xdrgen_decode_posixacetag4(struct xdr_stream *xdr, posixacetag4 *ptr);
+bool xdrgen_encode_posixacetag4(struct xdr_stream *xdr, posixacetag4 value);
+
+bool xdrgen_decode_posixaceperm4(struct xdr_stream *xdr, posixaceperm4 *ptr);
+bool xdrgen_encode_posixaceperm4(struct xdr_stream *xdr, const posixaceperm4 value);
+
#endif /* _LINUX_XDRGEN_NFS4_1_DECL_H */
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 084fc517..89fe2c0 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -285,6 +285,7 @@ static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size)
* 2. Is that directory a mount point, or
* 3. Is that directory the root of an exported file system?
*/
+ nfsd4_cancel_copy_by_sb(netns(file), path.dentry->d_sb);
error = nlmsvc_unlock_all_by_sb(path.dentry->d_sb);
mutex_lock(&nfsd_mutex);
nn = net_generic(netns(file), nfsd_net_id);
@@ -1642,6 +1643,10 @@ int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info)
scope = nla_data(attr);
}
+ attr = info->attrs[NFSD_A_SERVER_MIN_THREADS];
+ if (attr)
+ nn->min_threads = nla_get_u32(attr);
+
ret = nfsd_svc(nrpools, nthreads, net, get_current_cred(), scope);
if (ret > 0)
ret = 0;
@@ -1681,6 +1686,8 @@ int nfsd_nl_threads_get_doit(struct sk_buff *skb, struct genl_info *info)
nn->nfsd4_grace) ||
nla_put_u32(skb, NFSD_A_SERVER_LEASETIME,
nn->nfsd4_lease) ||
+ nla_put_u32(skb, NFSD_A_SERVER_MIN_THREADS,
+ nn->min_threads) ||
nla_put_string(skb, NFSD_A_SERVER_SCOPE,
nn->nfsd_name);
if (err)
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index b028321..a01d709 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -454,6 +454,16 @@ enum {
#define NFSD4_2_SECURITY_ATTRS 0
#endif
+#ifdef CONFIG_NFSD_V4_POSIX_ACLS
+#define NFSD4_2_POSIX_ACL_ATTRS \
+ (FATTR4_WORD2_ACL_TRUEFORM | \
+ FATTR4_WORD2_ACL_TRUEFORM_SCOPE | \
+ FATTR4_WORD2_POSIX_DEFAULT_ACL | \
+ FATTR4_WORD2_POSIX_ACCESS_ACL)
+#else
+#define NFSD4_2_POSIX_ACL_ATTRS 0
+#endif
+
#define NFSD4_2_SUPPORTED_ATTRS_WORD2 \
(NFSD4_1_SUPPORTED_ATTRS_WORD2 | \
FATTR4_WORD2_MODE_UMASK | \
@@ -462,7 +472,8 @@ enum {
FATTR4_WORD2_XATTR_SUPPORT | \
FATTR4_WORD2_TIME_DELEG_ACCESS | \
FATTR4_WORD2_TIME_DELEG_MODIFY | \
- FATTR4_WORD2_OPEN_ARGUMENTS)
+ FATTR4_WORD2_OPEN_ARGUMENTS | \
+ NFSD4_2_POSIX_ACL_ATTRS)
extern const u32 nfsd_suppattrs[3][3];
@@ -530,11 +541,18 @@ static inline bool nfsd_attrs_supported(u32 minorversion, const u32 *bmval)
#else
#define MAYBE_FATTR4_WORD2_SECURITY_LABEL 0
#endif
+#ifdef CONFIG_NFSD_V4_POSIX_ACLS
+#define MAYBE_FATTR4_WORD2_POSIX_ACL_ATTRS \
+ (FATTR4_WORD2_POSIX_DEFAULT_ACL | FATTR4_WORD2_POSIX_ACCESS_ACL)
+#else /* !CONFIG_NFSD_V4_POSIX_ACLS: contribute no bits to the writeable mask */
+#define MAYBE_FATTR4_WORD2_POSIX_ACL_ATTRS 0
+#endif /* CONFIG_NFSD_V4_POSIX_ACLS */
#define NFSD_WRITEABLE_ATTRS_WORD2 \
(FATTR4_WORD2_MODE_UMASK \
| MAYBE_FATTR4_WORD2_SECURITY_LABEL \
| FATTR4_WORD2_TIME_DELEG_ACCESS \
| FATTR4_WORD2_TIME_DELEG_MODIFY \
+ | MAYBE_FATTR4_WORD2_POSIX_ACL_ATTRS \
)
#define NFSD_SUPPATTR_EXCLCREAT_WORD0 \
@@ -550,6 +568,10 @@ static inline bool nfsd_attrs_supported(u32 minorversion, const u32 *bmval)
* The FATTR4_WORD2_TIME_DELEG attributes are not to be allowed for
* OPEN(create) with EXCLUSIVE4_1. It doesn't make sense to set a
* delegated timestamp on a new file.
+ *
+ * This mask includes NFSv4.2-only attributes (e.g., POSIX ACLs).
+ * Version filtering occurs via nfsd_suppattrs[] before this mask
+ * is applied, so pre-4.2 clients never see unsupported attributes.
*/
#define NFSD_SUPPATTR_EXCLCREAT_WORD2 \
(NFSD_WRITEABLE_ATTRS_WORD2 & \
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 481e789..8873033 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -33,7 +33,7 @@ static __be32 nfsd_map_status(__be32 status)
break;
case nfserr_symlink:
case nfserr_wrong_type:
- status = nfserr_inval;
+ status = nfserr_io;
break;
}
return status;
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index f1cc223..0887ee6 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -580,7 +580,7 @@ void nfsd_shutdown_threads(struct net *net)
}
/* Kill outstanding nfsd threads */
- svc_set_num_threads(serv, NULL, 0);
+ svc_set_num_threads(serv, 0, 0);
nfsd_destroy_serv(net);
mutex_unlock(&nfsd_mutex);
}
@@ -688,12 +688,9 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)
if (nn->nfsd_serv == NULL || n <= 0)
return 0;
- /*
- * Special case: When n == 1, pass in NULL for the pool, so that the
- * change is distributed equally among them.
- */
+ /* Special case: When n == 1, distribute threads equally among pools. */
if (n == 1)
- return svc_set_num_threads(nn->nfsd_serv, NULL, nthreads[0]);
+ return svc_set_num_threads(nn->nfsd_serv, nn->min_threads, nthreads[0]);
if (n > nn->nfsd_serv->sv_nrpools)
n = nn->nfsd_serv->sv_nrpools;
@@ -719,18 +716,18 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)
/* apply the new numbers */
for (i = 0; i < n; i++) {
- err = svc_set_num_threads(nn->nfsd_serv,
- &nn->nfsd_serv->sv_pools[i],
- nthreads[i]);
+ err = svc_set_pool_threads(nn->nfsd_serv,
+ &nn->nfsd_serv->sv_pools[i],
+ nn->min_threads, nthreads[i]);
if (err)
goto out;
}
/* Anything undefined in array is considered to be 0 */
for (i = n; i < nn->nfsd_serv->sv_nrpools; ++i) {
- err = svc_set_num_threads(nn->nfsd_serv,
- &nn->nfsd_serv->sv_pools[i],
- 0);
+ err = svc_set_pool_threads(nn->nfsd_serv,
+ &nn->nfsd_serv->sv_pools[i],
+ 0, 0);
if (err)
goto out;
}
@@ -885,9 +882,11 @@ static int
nfsd(void *vrqstp)
{
struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp;
+ struct svc_pool *pool = rqstp->rq_pool;
struct svc_xprt *perm_sock = list_entry(rqstp->rq_server->sv_permsocks.next, typeof(struct svc_xprt), xpt_list);
struct net *net = perm_sock->xpt_net;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ bool have_mutex = false;
/* At this point, the thread shares current->fs
* with the init process. We need to create files with the
@@ -905,7 +904,44 @@ nfsd(void *vrqstp)
* The main request loop
*/
while (!svc_thread_should_stop(rqstp)) {
- svc_recv(rqstp);
+ switch (svc_recv(rqstp, 5 * HZ)) {
+ case -ETIMEDOUT:
+ /* No work arrived within the timeout window */
+ if (mutex_trylock(&nfsd_mutex)) {
+ if (pool->sp_nrthreads > pool->sp_nrthrmin) {
+ trace_nfsd_dynthread_kill(net, pool);
+ set_bit(RQ_VICTIM, &rqstp->rq_flags);
+ have_mutex = true;
+ } else {
+ mutex_unlock(&nfsd_mutex);
+ }
+ } else {
+ trace_nfsd_dynthread_trylock_fail(net, pool);
+ }
+ break;
+ case -EBUSY:
+ /* No idle threads; consider spawning another */
+ if (pool->sp_nrthreads < pool->sp_nrthrmax) {
+ if (mutex_trylock(&nfsd_mutex)) {
+ if (pool->sp_nrthreads < pool->sp_nrthrmax) {
+ int ret;
+
+ trace_nfsd_dynthread_start(net, pool);
+ ret = svc_new_thread(rqstp->rq_server, pool);
+ if (ret)
+ pr_notice_ratelimited("%s: unable to spawn new thread: %d\n",
+ __func__, ret);
+ }
+ mutex_unlock(&nfsd_mutex);
+ } else {
+ trace_nfsd_dynthread_trylock_fail(net, pool);
+ }
+ }
+ clear_bit(SP_TASK_STARTING, &pool->sp_flags);
+ break;
+ default:
+ break;
+ }
nfsd_file_net_dispose(nn);
}
@@ -913,6 +949,8 @@ nfsd(void *vrqstp)
/* Release the thread */
svc_exit_thread(rqstp);
+ if (have_mutex)
+ mutex_unlock(&nfsd_mutex);
return 0;
}
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 508b7e3..6fcbf1e 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -822,6 +822,7 @@ static inline void nfsd4_try_run_cb(struct nfsd4_callback *cb)
extern void nfsd4_shutdown_callback(struct nfs4_client *);
extern void nfsd4_shutdown_copy(struct nfs4_client *clp);
+void nfsd4_put_client(struct nfs4_client *clp);
void nfsd4_async_copy_reaper(struct nfsd_net *nn);
bool nfsd4_has_active_async_copies(struct nfs4_client *clp);
extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(struct xdr_netobj name,
@@ -842,10 +843,14 @@ struct nfsd_file *find_any_file(struct nfs4_file *f);
#ifdef CONFIG_NFSD_V4
void nfsd4_revoke_states(struct nfsd_net *nn, struct super_block *sb);
+void nfsd4_cancel_copy_by_sb(struct net *net, struct super_block *sb);
#else
static inline void nfsd4_revoke_states(struct nfsd_net *nn, struct super_block *sb)
{
}
+static inline void nfsd4_cancel_copy_by_sb(struct net *net, struct super_block *sb)
+{
+}
#endif
/* grace period management */
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 5ae2a61..d1d0b0d 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -91,6 +91,41 @@ DEFINE_EVENT(nfsd_xdr_err_class, nfsd_##name##_err, \
DEFINE_NFSD_XDR_ERR_EVENT(garbage_args);
DEFINE_NFSD_XDR_ERR_EVENT(cant_encode);
+DECLARE_EVENT_CLASS(nfsd_dynthread_class,
+ TP_PROTO(
+ const struct net *net,
+ const struct svc_pool *pool
+ ),
+ TP_ARGS(net, pool),
+ TP_STRUCT__entry(
+ __field(unsigned int, netns_ino)
+ __field(unsigned int, pool_id)
+ __field(unsigned int, nrthreads)
+ __field(unsigned int, nrthrmin)
+ __field(unsigned int, nrthrmax)
+ ),
+ TP_fast_assign(
+ __entry->netns_ino = net->ns.inum;
+ __entry->pool_id = pool->sp_id;
+ __entry->nrthreads = pool->sp_nrthreads;
+ __entry->nrthrmin = pool->sp_nrthrmin;
+ __entry->nrthrmax = pool->sp_nrthrmax;
+ ),
+ TP_printk("pool=%u nrthreads=%u nrthrmin=%u nrthrmax=%u",
+ __entry->pool_id, __entry->nrthreads,
+ __entry->nrthrmin, __entry->nrthrmax
+ )
+);
+
+#define DEFINE_NFSD_DYNTHREAD_EVENT(name) \
+DEFINE_EVENT(nfsd_dynthread_class, nfsd_dynthread_##name, \
+ TP_PROTO(const struct net *net, const struct svc_pool *pool), \
+ TP_ARGS(net, pool))
+
+DEFINE_NFSD_DYNTHREAD_EVENT(start);
+DEFINE_NFSD_DYNTHREAD_EVENT(kill);
+DEFINE_NFSD_DYNTHREAD_EVENT(trylock_fail);
+
#define show_nfsd_may_flags(x) \
__print_flags(x, "|", \
{ NFSD_MAY_EXEC, "EXEC" }, \
@@ -2129,6 +2164,25 @@ TRACE_EVENT(nfsd_ctl_maxblksize,
)
);
+TRACE_EVENT(nfsd_ctl_minthreads,
+ TP_PROTO(
+ const struct net *net,
+ int minthreads
+ ),
+ TP_ARGS(net, minthreads),
+ TP_STRUCT__entry(
+ __field(unsigned int, netns_ino)
+ __field(int, minthreads)
+ ),
+ TP_fast_assign(
+ __entry->netns_ino = net->ns.inum;
+ __entry->minthreads = minthreads;
+ ),
+ TP_printk("minthreads=%d",
+ __entry->minthreads
+ )
+);
+
TRACE_EVENT(nfsd_ctl_time,
TP_PROTO(
const struct net *net,
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 168d3cc..c884c3f 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -596,15 +596,35 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (attr->na_seclabel && attr->na_seclabel->len)
attr->na_labelerr = security_inode_setsecctx(dentry,
attr->na_seclabel->data, attr->na_seclabel->len);
- if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && attr->na_pacl)
- attr->na_aclerr = set_posix_acl(&nop_mnt_idmap,
- dentry, ACL_TYPE_ACCESS,
- attr->na_pacl);
- if (IS_ENABLED(CONFIG_FS_POSIX_ACL) &&
- !attr->na_aclerr && attr->na_dpacl && S_ISDIR(inode->i_mode))
- attr->na_aclerr = set_posix_acl(&nop_mnt_idmap,
+ if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && attr->na_dpacl) {
+ if (!S_ISDIR(inode->i_mode))
+ attr->na_dpaclerr = -EINVAL;
+ else if (attr->na_dpacl->a_count > 0)
+ /* a_count == 0 means delete the ACL. */
+ attr->na_dpaclerr = set_posix_acl(&nop_mnt_idmap,
dentry, ACL_TYPE_DEFAULT,
attr->na_dpacl);
+ else
+ attr->na_dpaclerr = set_posix_acl(&nop_mnt_idmap,
+ dentry, ACL_TYPE_DEFAULT,
+ NULL);
+ }
+ if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && attr->na_pacl) {
+ /*
+ * For any file system that is not ACL_SCOPE_FILE_OBJECT,
+ * a_count == 0 MUST reply nfserr_inval.
+ * For a file system that is ACL_SCOPE_FILE_OBJECT,
+ * a_count == 0 deletes the ACL.
+ * XXX File systems that are ACL_SCOPE_FILE_OBJECT
+ * are not yet supported.
+ */
+ if (attr->na_pacl->a_count > 0)
+ attr->na_paclerr = set_posix_acl(&nop_mnt_idmap,
+ dentry, ACL_TYPE_ACCESS,
+ attr->na_pacl);
+ else
+ attr->na_paclerr = -EINVAL;
+ }
out_fill_attrs:
/*
* RFC 1813 Section 3.3.2 does not mandate that an NFS server
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index e192dca..702a844 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -53,7 +53,8 @@ struct nfsd_attrs {
struct posix_acl *na_dpacl; /* input */
int na_labelerr; /* output */
- int na_aclerr; /* output */
+ int na_dpaclerr; /* output */
+ int na_paclerr; /* output */
};
static inline void nfsd_attrs_free(struct nfsd_attrs *attrs)
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index ae75846..417e9ad 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -245,6 +245,8 @@ struct nfsd4_create {
int cr_umask; /* request */
struct nfsd4_change_info cr_cinfo; /* response */
struct nfs4_acl *cr_acl;
+ struct posix_acl *cr_dpacl;
+ struct posix_acl *cr_pacl;
struct xdr_netobj cr_label;
};
#define cr_datalen u.link.datalen
@@ -397,6 +399,8 @@ struct nfsd4_open {
struct nfs4_ol_stateid *op_stp; /* used during processing */
struct nfs4_clnt_odstate *op_odstate; /* used during processing */
struct nfs4_acl *op_acl;
+ struct posix_acl *op_dpacl;
+ struct posix_acl *op_pacl;
struct xdr_netobj op_label;
struct svc_rqst *op_rqstp;
};
@@ -483,6 +487,8 @@ struct nfsd4_setattr {
struct iattr sa_iattr; /* request */
struct nfs4_acl *sa_acl;
struct xdr_netobj sa_label;
+ struct posix_acl *sa_dpacl;
+ struct posix_acl *sa_pacl;
};
struct nfsd4_setclientid {
@@ -732,6 +738,7 @@ struct nfsd4_copy {
#define NFSD4_COPY_F_COMMITTED (3)
#define NFSD4_COPY_F_COMPLETED (4)
#define NFSD4_COPY_F_OFFLOAD_DONE (5)
+#define NFSD4_COPY_F_CB_ERROR (6)
/* response */
__be32 nfserr;
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 71bd44e..9995de1 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -33,65 +33,6 @@ void __fsnotify_mntns_delete(struct mnt_namespace *mntns)
fsnotify_clear_marks_by_mntns(mntns);
}
-/**
- * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes.
- * @sb: superblock being unmounted.
- *
- * Called during unmount with no locks held, so needs to be safe against
- * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block.
- */
-static void fsnotify_unmount_inodes(struct super_block *sb)
-{
- struct inode *inode, *iput_inode = NULL;
-
- spin_lock(&sb->s_inode_list_lock);
- list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
- /*
- * We cannot __iget() an inode in state I_FREEING,
- * I_WILL_FREE, or I_NEW which is fine because by that point
- * the inode cannot have any associated watches.
- */
- spin_lock(&inode->i_lock);
- if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) {
- spin_unlock(&inode->i_lock);
- continue;
- }
-
- /*
- * If i_count is zero, the inode cannot have any watches and
- * doing an __iget/iput with SB_ACTIVE clear would actually
- * evict all inodes with zero i_count from icache which is
- * unnecessarily violent and may in fact be illegal to do.
- * However, we should have been called /after/ evict_inodes
- * removed all zero refcount inodes, in any case. Test to
- * be sure.
- */
- if (!icount_read(inode)) {
- spin_unlock(&inode->i_lock);
- continue;
- }
-
- __iget(inode);
- spin_unlock(&inode->i_lock);
- spin_unlock(&sb->s_inode_list_lock);
-
- iput(iput_inode);
-
- /* for each watch, send FS_UNMOUNT and then remove it */
- fsnotify_inode(inode, FS_UNMOUNT);
-
- fsnotify_inode_delete(inode);
-
- iput_inode = inode;
-
- cond_resched();
- spin_lock(&sb->s_inode_list_lock);
- }
- spin_unlock(&sb->s_inode_list_lock);
-
- iput(iput_inode);
-}
-
void fsnotify_sb_delete(struct super_block *sb)
{
struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb);
@@ -100,7 +41,7 @@ void fsnotify_sb_delete(struct super_block *sb)
if (!sbinfo)
return;
- fsnotify_unmount_inodes(sb);
+ fsnotify_unmount_inodes(sbinfo);
fsnotify_clear_marks_by_sb(sb);
/* Wait for outstanding object references from connectors */
wait_var_event(fsnotify_sb_watched_objects(sb),
@@ -112,7 +53,10 @@ void fsnotify_sb_delete(struct super_block *sb)
void fsnotify_sb_free(struct super_block *sb)
{
- kfree(sb->s_fsnotify_info);
+ if (sb->s_fsnotify_info) {
+ WARN_ON_ONCE(!list_empty(&sb->s_fsnotify_info->inode_conn_list));
+ kfree(sb->s_fsnotify_info);
+ }
}
/*
@@ -777,8 +721,7 @@ static __init int fsnotify_init(void)
if (ret)
panic("initializing fsnotify_mark_srcu");
- fsnotify_mark_connector_cachep = KMEM_CACHE(fsnotify_mark_connector,
- SLAB_PANIC);
+ fsnotify_init_connector_caches();
return 0;
}
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 5950c7a..58c7bb2 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -77,6 +77,9 @@ extern struct srcu_struct fsnotify_mark_srcu;
extern int fsnotify_compare_groups(struct fsnotify_group *a,
struct fsnotify_group *b);
+/* Destroy all inode marks for given superblock */
+void fsnotify_unmount_inodes(struct fsnotify_sb_info *sbinfo);
+
/* Destroy all marks attached to an object via connector */
extern void fsnotify_destroy_marks(fsnotify_connp_t *connp);
/* run the list of all marks associated with inode and destroy them */
@@ -106,6 +109,6 @@ static inline void fsnotify_clear_marks_by_mntns(struct mnt_namespace *mntns)
*/
extern void fsnotify_set_children_dentry_flags(struct inode *inode);
-extern struct kmem_cache *fsnotify_mark_connector_cachep;
+void fsnotify_init_connector_caches(void);
#endif /* __FS_NOTIFY_FSNOTIFY_H_ */
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 55a03bb..8e6997e 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -79,7 +79,8 @@
#define FSNOTIFY_REAPER_DELAY (1) /* 1 jiffy */
struct srcu_struct fsnotify_mark_srcu;
-struct kmem_cache *fsnotify_mark_connector_cachep;
+static struct kmem_cache *fsnotify_mark_connector_cachep;
+static struct kmem_cache *fsnotify_inode_mark_connector_cachep;
static DEFINE_SPINLOCK(destroy_lock);
static LIST_HEAD(destroy_list);
@@ -323,10 +324,12 @@ static void fsnotify_connector_destroy_workfn(struct work_struct *work)
while (conn) {
free = conn;
conn = conn->destroy_next;
- kmem_cache_free(fsnotify_mark_connector_cachep, free);
+ kfree(free);
}
}
+static void fsnotify_untrack_connector(struct fsnotify_mark_connector *conn);
+
static void *fsnotify_detach_connector_from_object(
struct fsnotify_mark_connector *conn,
unsigned int *type)
@@ -342,6 +345,7 @@ static void *fsnotify_detach_connector_from_object(
if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) {
inode = fsnotify_conn_inode(conn);
inode->i_fsnotify_mask = 0;
+ fsnotify_untrack_connector(conn);
/* Unpin inode when detaching from connector */
if (!(conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF))
@@ -644,6 +648,8 @@ static int fsnotify_attach_info_to_sb(struct super_block *sb)
if (!sbinfo)
return -ENOMEM;
+ INIT_LIST_HEAD(&sbinfo->inode_conn_list);
+ spin_lock_init(&sbinfo->list_lock);
/*
* cmpxchg() provides the barrier so that callers of fsnotify_sb_info()
* will observe an initialized structure
@@ -655,20 +661,123 @@ static int fsnotify_attach_info_to_sb(struct super_block *sb)
return 0;
}
-static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
- void *obj, unsigned int obj_type)
-{
- struct fsnotify_mark_connector *conn;
+struct fsnotify_inode_mark_connector {
+ struct fsnotify_mark_connector common;
+ struct list_head conns_list;
+};
- conn = kmem_cache_alloc(fsnotify_mark_connector_cachep, GFP_KERNEL);
- if (!conn)
- return -ENOMEM;
+static struct inode *fsnotify_get_living_inode(struct fsnotify_sb_info *sbinfo)
+{
+ struct fsnotify_inode_mark_connector *iconn;
+ struct inode *inode;
+
+ spin_lock(&sbinfo->list_lock);
+ /* Find the first non-evicting inode */
+ list_for_each_entry(iconn, &sbinfo->inode_conn_list, conns_list) {
+ /* All connectors on the list are still attached to an inode */
+ inode = iconn->common.obj;
+ /*
+ * For connectors without FSNOTIFY_CONN_FLAG_HAS_IREF
+ * (evictable marks) corresponding inode may well have 0
+ * refcount and can be undergoing eviction. OTOH list_lock
+ * protects us from the connector getting detached and inode
+ * freed. So we can poke around the inode safely.
+ */
+ spin_lock(&inode->i_lock);
+ if (likely(
+ !(inode_state_read(inode) & (I_FREEING | I_WILL_FREE)))) {
+ __iget(inode);
+ spin_unlock(&inode->i_lock);
+ spin_unlock(&sbinfo->list_lock);
+ return inode;
+ }
+ spin_unlock(&inode->i_lock);
+ }
+ spin_unlock(&sbinfo->list_lock);
+
+ return NULL;
+}
+
+/**
+ * fsnotify_unmount_inodes - an sb is unmounting. Handle any watched inodes.
+ * @sbinfo: fsnotify info for superblock being unmounted.
+ *
+ * Walk all inode connectors for the superblock and free all associated marks.
+ */
+void fsnotify_unmount_inodes(struct fsnotify_sb_info *sbinfo)
+{
+ struct inode *inode;
+
+ while ((inode = fsnotify_get_living_inode(sbinfo))) {
+ fsnotify_inode(inode, FS_UNMOUNT);
+ fsnotify_clear_marks_by_inode(inode);
+ iput(inode);
+ cond_resched();
+ }
+}
+
+static void fsnotify_init_connector(struct fsnotify_mark_connector *conn,
+ void *obj, unsigned int obj_type)
+{
spin_lock_init(&conn->lock);
INIT_HLIST_HEAD(&conn->list);
conn->flags = 0;
conn->prio = 0;
conn->type = obj_type;
conn->obj = obj;
+}
+
+static struct fsnotify_mark_connector *
+fsnotify_alloc_inode_connector(struct inode *inode)
+{
+ struct fsnotify_inode_mark_connector *iconn;
+ struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(inode->i_sb);
+
+ iconn = kmem_cache_alloc(fsnotify_inode_mark_connector_cachep,
+ GFP_KERNEL);
+ if (!iconn)
+ return NULL;
+
+ fsnotify_init_connector(&iconn->common, inode, FSNOTIFY_OBJ_TYPE_INODE);
+ spin_lock(&sbinfo->list_lock);
+ list_add(&iconn->conns_list, &sbinfo->inode_conn_list);
+ spin_unlock(&sbinfo->list_lock);
+
+ return &iconn->common;
+}
+
+static void fsnotify_untrack_connector(struct fsnotify_mark_connector *conn)
+{
+ struct fsnotify_inode_mark_connector *iconn;
+ struct fsnotify_sb_info *sbinfo;
+
+ if (conn->type != FSNOTIFY_OBJ_TYPE_INODE)
+ return;
+
+ iconn = container_of(conn, struct fsnotify_inode_mark_connector, common);
+ sbinfo = fsnotify_sb_info(fsnotify_conn_inode(conn)->i_sb);
+ spin_lock(&sbinfo->list_lock);
+ list_del(&iconn->conns_list);
+ spin_unlock(&sbinfo->list_lock);
+}
+
+static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
+ void *obj, unsigned int obj_type)
+{
+ struct fsnotify_mark_connector *conn;
+
+ if (obj_type == FSNOTIFY_OBJ_TYPE_INODE) {
+ struct inode *inode = obj;
+
+ conn = fsnotify_alloc_inode_connector(inode);
+ } else {
+ conn = kmem_cache_alloc(fsnotify_mark_connector_cachep,
+ GFP_KERNEL);
+ if (conn)
+ fsnotify_init_connector(conn, obj, obj_type);
+ }
+ if (!conn)
+ return -ENOMEM;
/*
* cmpxchg() provides the barrier so that readers of *connp can see
@@ -676,7 +785,8 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
*/
if (cmpxchg(connp, NULL, conn)) {
/* Someone else created list structure for us */
- kmem_cache_free(fsnotify_mark_connector_cachep, conn);
+ fsnotify_untrack_connector(conn);
+ kfree(conn);
}
return 0;
}
@@ -1007,3 +1117,12 @@ void fsnotify_wait_marks_destroyed(void)
flush_delayed_work(&reaper_work);
}
EXPORT_SYMBOL_GPL(fsnotify_wait_marks_destroyed);
+
+__init void fsnotify_init_connector_caches(void)
+{
+ fsnotify_mark_connector_cachep = KMEM_CACHE(fsnotify_mark_connector,
+ SLAB_PANIC);
+ fsnotify_inode_mark_connector_cachep = KMEM_CACHE(
+ fsnotify_inode_mark_connector,
+ SLAB_PANIC);
+}
diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c
index 79267b341..0020929 100644
--- a/fs/orangefs/orangefs-debugfs.c
+++ b/fs/orangefs/orangefs-debugfs.c
@@ -238,14 +238,16 @@ void orangefs_debugfs_init(int debug_mask)
static void orangefs_kernel_debug_init(void)
{
static char k_buffer[ORANGEFS_MAX_DEBUG_STRING_LEN] = { };
+ size_t len = strlen(kernel_debug_string);
gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: start\n", __func__);
- if (strlen(kernel_debug_string) + 1 < ORANGEFS_MAX_DEBUG_STRING_LEN) {
- strcpy(k_buffer, kernel_debug_string);
- strcat(k_buffer, "\n");
+ if (len + 1 < ORANGEFS_MAX_DEBUG_STRING_LEN) {
+ memcpy(k_buffer, kernel_debug_string, len);
+ k_buffer[len] = '\n';
+ k_buffer[len + 1] = '\0';
} else {
- strcpy(k_buffer, "none\n");
+ strscpy(k_buffer, "none\n");
pr_info("%s: overflow 1!\n", __func__);
}
@@ -336,16 +338,17 @@ static int help_show(struct seq_file *m, void *v)
*/
static void orangefs_client_debug_init(void)
{
-
static char c_buffer[ORANGEFS_MAX_DEBUG_STRING_LEN] = { };
+ size_t len = strlen(client_debug_string);
gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: start\n", __func__);
- if (strlen(client_debug_string) + 1 < ORANGEFS_MAX_DEBUG_STRING_LEN) {
- strcpy(c_buffer, client_debug_string);
- strcat(c_buffer, "\n");
+ if (len + 1 < ORANGEFS_MAX_DEBUG_STRING_LEN) {
+ memcpy(c_buffer, client_debug_string, len);
+ c_buffer[len] = '\n';
+ c_buffer[len + 1] = '\0';
} else {
- strcpy(c_buffer, "none\n");
+ strscpy(c_buffer, "none\n");
pr_info("%s: overflow! 2\n", __func__);
}
@@ -748,15 +751,14 @@ static void debug_mask_to_string(void *mask, int type)
else if (len)
kernel_debug_string[len - 1] = '\0';
else if (type)
- strcpy(client_debug_string, "none");
+ strscpy(client_debug_string, "none");
else
- strcpy(kernel_debug_string, "none");
+ strscpy(kernel_debug_string, "none");
out:
gossip_debug(GOSSIP_UTILS_DEBUG, "%s: string:%s:\n", __func__, debug_string);
return;
-
}
static void do_k_string(void *k_mask, int index)
@@ -775,7 +777,7 @@ static void do_k_string(void *k_mask, int index)
strcat(kernel_debug_string, ",");
} else {
gossip_err("%s: overflow!\n", __func__);
- strcpy(kernel_debug_string, ORANGEFS_ALL);
+ strscpy(kernel_debug_string, ORANGEFS_ALL);
goto out;
}
}
@@ -802,7 +804,7 @@ static void do_c_string(void *c_mask, int index)
strcat(client_debug_string, ",");
} else {
gossip_err("%s: overflow!\n", __func__);
- strcpy(client_debug_string, ORANGEFS_ALL);
+ strscpy(client_debug_string, ORANGEFS_ALL);
goto out;
}
}
@@ -838,14 +840,14 @@ static int check_amalgam_keyword(void *mask, int type)
if ((c_mask->mask1 == cdm_array[client_all_index].mask1) &&
(c_mask->mask2 == cdm_array[client_all_index].mask2)) {
- strcpy(client_debug_string, ORANGEFS_ALL);
+ strscpy(client_debug_string, ORANGEFS_ALL);
rc = 1;
goto out;
}
if ((c_mask->mask1 == cdm_array[client_verbose_index].mask1) &&
(c_mask->mask2 == cdm_array[client_verbose_index].mask2)) {
- strcpy(client_debug_string, ORANGEFS_VERBOSE);
+ strscpy(client_debug_string, ORANGEFS_VERBOSE);
rc = 1;
goto out;
}
@@ -854,7 +856,7 @@ static int check_amalgam_keyword(void *mask, int type)
k_mask = (__u64 *) mask;
if (*k_mask >= s_kmod_keyword_mask_map[k_all_index].mask_val) {
- strcpy(kernel_debug_string, ORANGEFS_ALL);
+ strscpy(kernel_debug_string, ORANGEFS_ALL);
rc = 1;
goto out;
}
diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c
index eee3c5e..a431aa0 100644
--- a/fs/orangefs/xattr.c
+++ b/fs/orangefs/xattr.c
@@ -152,7 +152,7 @@ ssize_t orangefs_inode_getxattr(struct inode *inode, const char *name,
goto out_unlock;
new_op->upcall.req.getxattr.refn = orangefs_inode->refn;
- strcpy(new_op->upcall.req.getxattr.key, name);
+ strscpy(new_op->upcall.req.getxattr.key, name);
/*
* NOTE: Although keys are meant to be NULL terminated textual
@@ -173,7 +173,7 @@ ssize_t orangefs_inode_getxattr(struct inode *inode, const char *name,
(char *)new_op->upcall.req.getxattr.key);
cx = kmalloc(sizeof *cx, GFP_KERNEL);
if (cx) {
- strcpy(cx->key, name);
+ strscpy(cx->key, name);
cx->length = -1;
cx->timeout = jiffies +
orangefs_getattr_timeout_msecs*HZ/1000;
@@ -220,14 +220,14 @@ ssize_t orangefs_inode_getxattr(struct inode *inode, const char *name,
ret = length;
if (cx) {
- strcpy(cx->key, name);
+ strscpy(cx->key, name);
memcpy(cx->val, buffer, length);
cx->length = length;
cx->timeout = jiffies + HZ;
} else {
cx = kmalloc(sizeof *cx, GFP_KERNEL);
if (cx) {
- strcpy(cx->key, name);
+ strscpy(cx->key, name);
memcpy(cx->val, buffer, length);
cx->length = length;
cx->timeout = jiffies + HZ;
@@ -267,7 +267,7 @@ static int orangefs_inode_removexattr(struct inode *inode, const char *name,
* textual strings, I am going to explicitly pass the
* length just in case we change this later on...
*/
- strcpy(new_op->upcall.req.removexattr.key, name);
+ strscpy(new_op->upcall.req.removexattr.key, name);
new_op->upcall.req.removexattr.key_sz = strlen(name) + 1;
gossip_debug(GOSSIP_XATTR_DEBUG,
@@ -361,7 +361,7 @@ int orangefs_inode_setxattr(struct inode *inode, const char *name,
* strings, I am going to explicitly pass the length just in
* case we change this later on...
*/
- strcpy(new_op->upcall.req.setxattr.keyval.key, name);
+ strscpy(new_op->upcall.req.setxattr.keyval.key, name);
new_op->upcall.req.setxattr.keyval.key_sz = strlen(name) + 1;
memcpy(new_op->upcall.req.setxattr.keyval.val, value, size);
new_op->upcall.req.setxattr.keyval.val_sz = size;
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index ed90672..33bacd7 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -898,6 +898,7 @@ static struct super_block *quotactl_block(const char __user *special, int cmd)
sb_start_write(sb);
sb_end_write(sb);
put_super(sb);
+ cond_resched();
goto retry;
}
return sb;
diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c
index 788a067..d44847c 100644
--- a/fs/smb/client/smbdirect.c
+++ b/fs/smb/client/smbdirect.c
@@ -35,6 +35,10 @@ static void enqueue_reassembly(
static struct smbdirect_recv_io *_get_first_reassembly(
struct smbdirect_socket *sc);
+static int smbd_post_send(struct smbdirect_socket *sc,
+ struct smbdirect_send_batch *batch,
+ struct smbdirect_send_io *request);
+
static int smbd_post_recv(
struct smbdirect_socket *sc,
struct smbdirect_recv_io *response);
@@ -97,8 +101,23 @@ int smbd_send_credit_target = 255;
/* The maximum single message size can be sent to remote peer */
int smbd_max_send_size = 1364;
-/* The maximum fragmented upper-layer payload receive size supported */
-int smbd_max_fragmented_recv_size = 1024 * 1024;
+/*
+ * The maximum fragmented upper-layer payload receive size supported
+ *
+ * Assume max_payload_per_credit is
+ * smbd_max_receive_size - 24 = 1340
+ *
+ * The maximum number would be
+ * smbd_receive_credit_max * max_payload_per_credit
+ *
+ * 1340 * 255 = 341700 (0x536C4)
+ *
+ * The minimum value from the spec is 131072 (0x20000)
+ *
+ * For now we use the logic we used in ksmbd before:
+ * (1364 * 255) / 2 = 173910 (0x2A756)
+ */
+int smbd_max_fragmented_recv_size = (1364 * 255) / 2;
/* The maximum single-message size which can be received */
int smbd_max_receive_size = 1364;
@@ -493,27 +512,103 @@ static inline void *smbdirect_recv_io_payload(struct smbdirect_recv_io *response
return (void *)response->packet;
}
+static struct smbdirect_send_io *smbd_alloc_send_io(struct smbdirect_socket *sc)
+{
+ struct smbdirect_send_io *msg;
+
+ msg = mempool_alloc(sc->send_io.mem.pool, GFP_KERNEL);
+ if (!msg)
+ return ERR_PTR(-ENOMEM);
+ msg->socket = sc;
+ INIT_LIST_HEAD(&msg->sibling_list);
+ msg->num_sge = 0;
+
+ return msg;
+}
+
+static void smbd_free_send_io(struct smbdirect_send_io *msg)
+{
+ struct smbdirect_socket *sc = msg->socket;
+ size_t i;
+
+ /*
+ * The list needs to be empty!
+ * The caller should take care of it.
+ */
+ WARN_ON_ONCE(!list_empty(&msg->sibling_list));
+
+ /*
+ * Note we call ib_dma_unmap_page(), even if some sges are mapped using
+ * ib_dma_map_single().
+ *
+ * The difference between _single() and _page() only matters for the
+ * ib_dma_map_*() case.
+ *
+ * For the ib_dma_unmap_*() case it does not matter as both take the
+ * dma_addr_t and dma_unmap_single_attrs() is just an alias to
+ * dma_unmap_page_attrs().
+ */
+ for (i = 0; i < msg->num_sge; i++)
+ ib_dma_unmap_page(sc->ib.dev,
+ msg->sge[i].addr,
+ msg->sge[i].length,
+ DMA_TO_DEVICE);
+
+ mempool_free(msg, sc->send_io.mem.pool);
+}
+
/* Called when a RDMA send is done */
static void send_done(struct ib_cq *cq, struct ib_wc *wc)
{
- int i;
struct smbdirect_send_io *request =
container_of(wc->wr_cqe, struct smbdirect_send_io, cqe);
struct smbdirect_socket *sc = request->socket;
+ struct smbdirect_send_io *sibling, *next;
int lcredits = 0;
log_rdma_send(INFO, "smbdirect_send_io 0x%p completed wc->status=%s\n",
request, ib_wc_status_msg(wc->status));
- for (i = 0; i < request->num_sge; i++)
- ib_dma_unmap_single(sc->ib.dev,
- request->sge[i].addr,
- request->sge[i].length,
- DMA_TO_DEVICE);
- mempool_free(request, sc->send_io.mem.pool);
+ if (unlikely(!(request->wr.send_flags & IB_SEND_SIGNALED))) {
+ /*
+ * This happens when smbdirect_send_io is a sibling
+ * before the final message, it is signaled on
+ * error anyway, so we need to skip
+ * smbd_free_send_io() here,
+ * otherwise it will destroy the memory
+ * of the siblings too, which will cause
+ * use after free problems for the others
+ * triggered from ib_drain_qp().
+ */
+ if (wc->status != IB_WC_SUCCESS)
+ goto skip_free;
+
+ /*
+ * This should not happen!
+ * But we better just close the
+ * connection...
+ */
+ log_rdma_send(ERR,
+ "unexpected send completion wc->status=%s (%d) wc->opcode=%d\n",
+ ib_wc_status_msg(wc->status), wc->status, wc->opcode);
+ smbd_disconnect_rdma_connection(sc);
+ return;
+ }
+
+ /*
+ * Free possible siblings and then the main send_io
+ */
+ list_for_each_entry_safe(sibling, next, &request->sibling_list, sibling_list) {
+ list_del_init(&sibling->sibling_list);
+ smbd_free_send_io(sibling);
+ lcredits += 1;
+ }
+ /* Note this frees wc->wr_cqe, but not wc */
+ smbd_free_send_io(request);
lcredits += 1;
if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) {
+skip_free:
if (wc->status != IB_WC_WR_FLUSH_ERR)
log_rdma_send(ERR, "wc->status=%s wc->opcode=%d\n",
ib_wc_status_msg(wc->status), wc->opcode);
@@ -608,6 +703,7 @@ static bool process_negotiation_response(
sp->max_frmr_depth * PAGE_SIZE);
sp->max_frmr_depth = sp->max_read_write_size / PAGE_SIZE;
+ atomic_set(&sc->send_io.bcredits.count, 1);
sc->recv_io.expected = SMBDIRECT_EXPECT_DATA_TRANSFER;
return true;
}
@@ -618,6 +714,7 @@ static void smbd_post_send_credits(struct work_struct *work)
struct smbdirect_recv_io *response;
struct smbdirect_socket *sc =
container_of(work, struct smbdirect_socket, recv_io.posted.refill_work);
+ int posted = 0;
if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
return;
@@ -640,9 +737,21 @@ static void smbd_post_send_credits(struct work_struct *work)
}
atomic_inc(&sc->recv_io.posted.count);
+ posted += 1;
}
}
+ atomic_add(posted, &sc->recv_io.credits.available);
+
+ /*
+ * If the last send credit is waiting for credits
+ * it can grant, we need to wake it up.
+ */
+ if (posted &&
+ atomic_read(&sc->send_io.bcredits.count) == 0 &&
+ atomic_read(&sc->send_io.credits.count) == 0)
+ wake_up(&sc->send_io.credits.wait_queue);
+
/* Promptly send an immediate packet as defined in [MS-SMBD] 3.1.1.1 */
if (atomic_read(&sc->recv_io.credits.count) <
sc->recv_io.credits.target - 1) {
@@ -659,6 +768,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
container_of(wc->wr_cqe, struct smbdirect_recv_io, cqe);
struct smbdirect_socket *sc = response->socket;
struct smbdirect_socket_parameters *sp = &sc->parameters;
+ int current_recv_credits;
u16 old_recv_credit_target;
u32 data_offset = 0;
u32 data_length = 0;
@@ -743,7 +853,8 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
}
atomic_dec(&sc->recv_io.posted.count);
- atomic_dec(&sc->recv_io.credits.count);
+ current_recv_credits = atomic_dec_return(&sc->recv_io.credits.count);
+
old_recv_credit_target = sc->recv_io.credits.target;
sc->recv_io.credits.target =
le16_to_cpu(data_transfer->credits_requested);
@@ -779,7 +890,8 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
* reassembly queue and wake up the reading thread
*/
if (data_length) {
- if (sc->recv_io.credits.target > old_recv_credit_target)
+ if (current_recv_credits <= (sc->recv_io.credits.target / 4) ||
+ sc->recv_io.credits.target > old_recv_credit_target)
queue_work(sc->workqueue, &sc->recv_io.posted.refill_work);
enqueue_reassembly(sc, response, data_length);
@@ -810,6 +922,7 @@ static struct rdma_cm_id *smbd_create_id(
{
struct smbdirect_socket_parameters *sp = &sc->parameters;
struct rdma_cm_id *id;
+ u8 node_type = RDMA_NODE_UNSPECIFIED;
int rc;
__be16 *sport;
@@ -821,6 +934,31 @@ static struct rdma_cm_id *smbd_create_id(
return id;
}
+ switch (port) {
+ case SMBD_PORT:
+ /*
+ * only allow iWarp devices
+ * for port 5445.
+ */
+ node_type = RDMA_NODE_RNIC;
+ break;
+ case SMB_PORT:
+ /*
+ * only allow InfiniBand, RoCEv1 or RoCEv2
+ * devices for port 445.
+ *
+ * (Basically don't allow iWarp devices)
+ */
+ node_type = RDMA_NODE_IB_CA;
+ break;
+ }
+ rc = rdma_restrict_node_type(id, node_type);
+ if (rc) {
+ log_rdma_event(ERR, "rdma_restrict_node_type(%u) failed %i\n",
+ node_type, rc);
+ goto out;
+ }
+
if (dstaddr->sa_family == AF_INET6)
sport = &((struct sockaddr_in6 *)dstaddr)->sin6_port;
else
@@ -955,16 +1093,13 @@ static int smbd_ia_open(
static int smbd_post_send_negotiate_req(struct smbdirect_socket *sc)
{
struct smbdirect_socket_parameters *sp = &sc->parameters;
- struct ib_send_wr send_wr;
- int rc = -ENOMEM;
+ int rc;
struct smbdirect_send_io *request;
struct smbdirect_negotiate_req *packet;
- request = mempool_alloc(sc->send_io.mem.pool, GFP_KERNEL);
- if (!request)
- return rc;
-
- request->socket = sc;
+ request = smbd_alloc_send_io(sc);
+ if (IS_ERR(request))
+ return PTR_ERR(request);
packet = smbdirect_send_io_payload(request);
packet->min_version = cpu_to_le16(SMBDIRECT_V1);
@@ -976,7 +1111,6 @@ static int smbd_post_send_negotiate_req(struct smbdirect_socket *sc)
packet->max_fragmented_size =
cpu_to_le32(sp->max_fragmented_recv_size);
- request->num_sge = 1;
request->sge[0].addr = ib_dma_map_single(
sc->ib.dev, (void *)packet,
sizeof(*packet), DMA_TO_DEVICE);
@@ -984,42 +1118,20 @@ static int smbd_post_send_negotiate_req(struct smbdirect_socket *sc)
rc = -EIO;
goto dma_mapping_failed;
}
+ request->num_sge = 1;
request->sge[0].length = sizeof(*packet);
request->sge[0].lkey = sc->ib.pd->local_dma_lkey;
- ib_dma_sync_single_for_device(
- sc->ib.dev, request->sge[0].addr,
- request->sge[0].length, DMA_TO_DEVICE);
-
- request->cqe.done = send_done;
-
- send_wr.next = NULL;
- send_wr.wr_cqe = &request->cqe;
- send_wr.sg_list = request->sge;
- send_wr.num_sge = request->num_sge;
- send_wr.opcode = IB_WR_SEND;
- send_wr.send_flags = IB_SEND_SIGNALED;
-
- log_rdma_send(INFO, "sge addr=0x%llx length=%u lkey=0x%x\n",
- request->sge[0].addr,
- request->sge[0].length, request->sge[0].lkey);
-
- atomic_inc(&sc->send_io.pending.count);
- rc = ib_post_send(sc->ib.qp, &send_wr, NULL);
+ rc = smbd_post_send(sc, NULL, request);
if (!rc)
return 0;
- /* if we reach here, post send failed */
- log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc);
- atomic_dec(&sc->send_io.pending.count);
- ib_dma_unmap_single(sc->ib.dev, request->sge[0].addr,
- request->sge[0].length, DMA_TO_DEVICE);
-
- smbd_disconnect_rdma_connection(sc);
+ if (rc == -EAGAIN)
+ rc = -EIO;
dma_mapping_failed:
- mempool_free(request, sc->send_io.mem.pool);
+ smbd_free_send_io(request);
return rc;
}
@@ -1033,19 +1145,38 @@ static int smbd_post_send_negotiate_req(struct smbdirect_socket *sc)
*/
static int manage_credits_prior_sending(struct smbdirect_socket *sc)
{
+ int missing;
+ int available;
int new_credits;
if (atomic_read(&sc->recv_io.credits.count) >= sc->recv_io.credits.target)
return 0;
- new_credits = atomic_read(&sc->recv_io.posted.count);
- if (new_credits == 0)
+ missing = (int)sc->recv_io.credits.target - atomic_read(&sc->recv_io.credits.count);
+ available = atomic_xchg(&sc->recv_io.credits.available, 0);
+ new_credits = (u16)min3(U16_MAX, missing, available);
+ if (new_credits <= 0) {
+ /*
+ * If credits are available but were not granted,
+ * we need to re-add them.
+ */
+ if (available)
+ atomic_add(available, &sc->recv_io.credits.available);
return 0;
+ }
- new_credits -= atomic_read(&sc->recv_io.credits.count);
- if (new_credits <= 0)
- return 0;
+ if (new_credits < available) {
+ /*
+ * Readd the remaining available again.
+ */
+ available -= new_credits;
+ atomic_add(available, &sc->recv_io.credits.available);
+ }
+ /*
+ * Remember we granted the credits
+ */
+ atomic_add(new_credits, &sc->recv_io.credits.count);
return new_credits;
}
@@ -1075,12 +1206,27 @@ static int manage_keep_alive_before_sending(struct smbdirect_socket *sc)
return 0;
}
+static int smbd_ib_post_send(struct smbdirect_socket *sc,
+ struct ib_send_wr *wr)
+{
+ int ret;
+
+ atomic_inc(&sc->send_io.pending.count);
+ ret = ib_post_send(sc->ib.qp, wr, NULL);
+ if (ret) {
+ pr_err("failed to post send: %d\n", ret);
+ smbd_disconnect_rdma_connection(sc);
+ ret = -EAGAIN;
+ }
+ return ret;
+}
+
/* Post the send request */
static int smbd_post_send(struct smbdirect_socket *sc,
- struct smbdirect_send_io *request)
+ struct smbdirect_send_batch *batch,
+ struct smbdirect_send_io *request)
{
- struct ib_send_wr send_wr;
- int rc, i;
+ int i;
for (i = 0; i < request->num_sge; i++) {
log_rdma_send(INFO,
@@ -1094,79 +1240,245 @@ static int smbd_post_send(struct smbdirect_socket *sc,
}
request->cqe.done = send_done;
+ request->wr.next = NULL;
+ request->wr.sg_list = request->sge;
+ request->wr.num_sge = request->num_sge;
+ request->wr.opcode = IB_WR_SEND;
- send_wr.next = NULL;
- send_wr.wr_cqe = &request->cqe;
- send_wr.sg_list = request->sge;
- send_wr.num_sge = request->num_sge;
- send_wr.opcode = IB_WR_SEND;
- send_wr.send_flags = IB_SEND_SIGNALED;
+ if (batch) {
+ request->wr.wr_cqe = NULL;
+ request->wr.send_flags = 0;
+ if (!list_empty(&batch->msg_list)) {
+ struct smbdirect_send_io *last;
- rc = ib_post_send(sc->ib.qp, &send_wr, NULL);
- if (rc) {
- log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc);
- smbd_disconnect_rdma_connection(sc);
- rc = -EAGAIN;
+ last = list_last_entry(&batch->msg_list,
+ struct smbdirect_send_io,
+ sibling_list);
+ last->wr.next = &request->wr;
+ }
+ list_add_tail(&request->sibling_list, &batch->msg_list);
+ batch->wr_cnt++;
+ return 0;
}
- return rc;
+ request->wr.wr_cqe = &request->cqe;
+ request->wr.send_flags = IB_SEND_SIGNALED;
+ return smbd_ib_post_send(sc, &request->wr);
+}
+
+static void smbd_send_batch_init(struct smbdirect_send_batch *batch,
+ bool need_invalidate_rkey,
+ unsigned int remote_key)
+{
+ INIT_LIST_HEAD(&batch->msg_list);
+ batch->wr_cnt = 0;
+ batch->need_invalidate_rkey = need_invalidate_rkey;
+ batch->remote_key = remote_key;
+ batch->credit = 0;
+}
+
+static int smbd_send_batch_flush(struct smbdirect_socket *sc,
+ struct smbdirect_send_batch *batch,
+ bool is_last)
+{
+ struct smbdirect_send_io *first, *last;
+ int ret = 0;
+
+ if (list_empty(&batch->msg_list))
+ goto release_credit;
+
+ first = list_first_entry(&batch->msg_list,
+ struct smbdirect_send_io,
+ sibling_list);
+ last = list_last_entry(&batch->msg_list,
+ struct smbdirect_send_io,
+ sibling_list);
+
+ if (batch->need_invalidate_rkey) {
+ first->wr.opcode = IB_WR_SEND_WITH_INV;
+ first->wr.ex.invalidate_rkey = batch->remote_key;
+ batch->need_invalidate_rkey = false;
+ batch->remote_key = 0;
+ }
+
+ last->wr.send_flags = IB_SEND_SIGNALED;
+ last->wr.wr_cqe = &last->cqe;
+
+ /*
+ * Remove last from batch->msg_list
+ * and splice the rest of batch->msg_list
+ * to last->sibling_list.
+ *
+ * batch->msg_list is a valid empty list
+ * at the end.
+ */
+ list_del_init(&last->sibling_list);
+ list_splice_tail_init(&batch->msg_list, &last->sibling_list);
+ batch->wr_cnt = 0;
+
+ ret = smbd_ib_post_send(sc, &first->wr);
+ if (ret) {
+ struct smbdirect_send_io *sibling, *next;
+
+ list_for_each_entry_safe(sibling, next, &last->sibling_list, sibling_list) {
+ list_del_init(&sibling->sibling_list);
+ smbd_free_send_io(sibling);
+ }
+ smbd_free_send_io(last);
+ }
+
+release_credit:
+ if (is_last && !ret && batch->credit) {
+ atomic_add(batch->credit, &sc->send_io.bcredits.count);
+ batch->credit = 0;
+ wake_up(&sc->send_io.bcredits.wait_queue);
+ }
+
+ return ret;
+}
+
+static int wait_for_credits(struct smbdirect_socket *sc,
+ wait_queue_head_t *waitq, atomic_t *total_credits,
+ int needed)
+{
+ int ret;
+
+ do {
+ if (atomic_sub_return(needed, total_credits) >= 0)
+ return 0;
+
+ atomic_add(needed, total_credits);
+ ret = wait_event_interruptible(*waitq,
+ atomic_read(total_credits) >= needed ||
+ sc->status != SMBDIRECT_SOCKET_CONNECTED);
+
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
+ return -ENOTCONN;
+ else if (ret < 0)
+ return ret;
+ } while (true);
+}
+
+static int wait_for_send_bcredit(struct smbdirect_socket *sc,
+ struct smbdirect_send_batch *batch)
+{
+ int ret;
+
+ if (batch->credit)
+ return 0;
+
+ ret = wait_for_credits(sc,
+ &sc->send_io.bcredits.wait_queue,
+ &sc->send_io.bcredits.count,
+ 1);
+ if (ret)
+ return ret;
+
+ batch->credit = 1;
+ return 0;
+}
+
+static int wait_for_send_lcredit(struct smbdirect_socket *sc,
+ struct smbdirect_send_batch *batch)
+{
+ if (batch && (atomic_read(&sc->send_io.lcredits.count) <= 1)) {
+ int ret;
+
+ ret = smbd_send_batch_flush(sc, batch, false);
+ if (ret)
+ return ret;
+ }
+
+ return wait_for_credits(sc,
+ &sc->send_io.lcredits.wait_queue,
+ &sc->send_io.lcredits.count,
+ 1);
+}
+
+static int wait_for_send_credits(struct smbdirect_socket *sc,
+ struct smbdirect_send_batch *batch)
+{
+ if (batch &&
+ (batch->wr_cnt >= 16 || atomic_read(&sc->send_io.credits.count) <= 1)) {
+ int ret;
+
+ ret = smbd_send_batch_flush(sc, batch, false);
+ if (ret)
+ return ret;
+ }
+
+ return wait_for_credits(sc,
+ &sc->send_io.credits.wait_queue,
+ &sc->send_io.credits.count,
+ 1);
}
static int smbd_post_send_iter(struct smbdirect_socket *sc,
+ struct smbdirect_send_batch *batch,
struct iov_iter *iter,
int *_remaining_data_length)
{
struct smbdirect_socket_parameters *sp = &sc->parameters;
- int i, rc;
+ int rc;
int header_length;
int data_length;
struct smbdirect_send_io *request;
struct smbdirect_data_transfer *packet;
int new_credits = 0;
+ struct smbdirect_send_batch _batch;
-wait_lcredit:
- /* Wait for local send credits */
- rc = wait_event_interruptible(sc->send_io.lcredits.wait_queue,
- atomic_read(&sc->send_io.lcredits.count) > 0 ||
- sc->status != SMBDIRECT_SOCKET_CONNECTED);
- if (rc)
- goto err_wait_lcredit;
+ if (!batch) {
+ smbd_send_batch_init(&_batch, false, 0);
+ batch = &_batch;
+ }
- if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
- log_outgoing(ERR, "disconnected not sending on wait_credit\n");
+ rc = wait_for_send_bcredit(sc, batch);
+ if (rc) {
+ log_outgoing(ERR, "disconnected not sending on wait_bcredit\n");
+ rc = -EAGAIN;
+ goto err_wait_bcredit;
+ }
+
+ rc = wait_for_send_lcredit(sc, batch);
+ if (rc) {
+ log_outgoing(ERR, "disconnected not sending on wait_lcredit\n");
rc = -EAGAIN;
goto err_wait_lcredit;
}
- if (unlikely(atomic_dec_return(&sc->send_io.lcredits.count) < 0)) {
- atomic_inc(&sc->send_io.lcredits.count);
- goto wait_lcredit;
- }
-wait_credit:
- /* Wait for send credits. A SMBD packet needs one credit */
- rc = wait_event_interruptible(sc->send_io.credits.wait_queue,
- atomic_read(&sc->send_io.credits.count) > 0 ||
- sc->status != SMBDIRECT_SOCKET_CONNECTED);
- if (rc)
- goto err_wait_credit;
-
- if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
+ rc = wait_for_send_credits(sc, batch);
+ if (rc) {
log_outgoing(ERR, "disconnected not sending on wait_credit\n");
rc = -EAGAIN;
goto err_wait_credit;
}
- if (unlikely(atomic_dec_return(&sc->send_io.credits.count) < 0)) {
- atomic_inc(&sc->send_io.credits.count);
- goto wait_credit;
+
+ new_credits = manage_credits_prior_sending(sc);
+ if (new_credits == 0 &&
+ atomic_read(&sc->send_io.credits.count) == 0 &&
+ atomic_read(&sc->recv_io.credits.count) == 0) {
+ queue_work(sc->workqueue, &sc->recv_io.posted.refill_work);
+ rc = wait_event_interruptible(sc->send_io.credits.wait_queue,
+ atomic_read(&sc->send_io.credits.count) >= 1 ||
+ atomic_read(&sc->recv_io.credits.available) >= 1 ||
+ sc->status != SMBDIRECT_SOCKET_CONNECTED);
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
+ rc = -ENOTCONN;
+ if (rc < 0) {
+ log_outgoing(ERR, "disconnected not sending on last credit\n");
+ rc = -EAGAIN;
+ goto err_wait_credit;
+ }
+
+ new_credits = manage_credits_prior_sending(sc);
}
- request = mempool_alloc(sc->send_io.mem.pool, GFP_KERNEL);
- if (!request) {
- rc = -ENOMEM;
+ request = smbd_alloc_send_io(sc);
+ if (IS_ERR(request)) {
+ rc = PTR_ERR(request);
goto err_alloc;
}
- request->socket = sc;
memset(request->sge, 0, sizeof(request->sge));
/* Map the packet to DMA */
@@ -1215,9 +1527,6 @@ static int smbd_post_send_iter(struct smbdirect_socket *sc,
/* Fill in the packet header */
packet->credits_requested = cpu_to_le16(sp->send_credit_target);
-
- new_credits = manage_credits_prior_sending(sc);
- atomic_add(new_credits, &sc->recv_io.credits.count);
packet->credits_granted = cpu_to_le16(new_credits);
packet->flags = 0;
@@ -1240,32 +1549,18 @@ static int smbd_post_send_iter(struct smbdirect_socket *sc,
le32_to_cpu(packet->data_length),
le32_to_cpu(packet->remaining_data_length));
- /*
- * Now that we got a local and a remote credit
- * we add us as pending
- */
- atomic_inc(&sc->send_io.pending.count);
+ rc = smbd_post_send(sc, batch, request);
+ if (!rc) {
+ if (batch != &_batch)
+ return 0;
- rc = smbd_post_send(sc, request);
- if (!rc)
- return 0;
-
- if (atomic_dec_and_test(&sc->send_io.pending.count))
- wake_up(&sc->send_io.pending.zero_wait_queue);
-
- wake_up(&sc->send_io.pending.dec_wait_queue);
+ rc = smbd_send_batch_flush(sc, batch, true);
+ if (!rc)
+ return 0;
+ }
err_dma:
- for (i = 0; i < request->num_sge; i++)
- if (request->sge[i].addr)
- ib_dma_unmap_single(sc->ib.dev,
- request->sge[i].addr,
- request->sge[i].length,
- DMA_TO_DEVICE);
- mempool_free(request, sc->send_io.mem.pool);
-
- /* roll back the granted receive credits */
- atomic_sub(new_credits, &sc->recv_io.credits.count);
+ smbd_free_send_io(request);
err_alloc:
atomic_inc(&sc->send_io.credits.count);
@@ -1276,6 +1571,11 @@ static int smbd_post_send_iter(struct smbdirect_socket *sc,
wake_up(&sc->send_io.lcredits.wait_queue);
err_wait_lcredit:
+ atomic_add(batch->credit, &sc->send_io.bcredits.count);
+ batch->credit = 0;
+ wake_up(&sc->send_io.bcredits.wait_queue);
+
+err_wait_bcredit:
return rc;
}
@@ -1289,10 +1589,11 @@ static int smbd_post_send_empty(struct smbdirect_socket *sc)
int remaining_data_length = 0;
sc->statistics.send_empty++;
- return smbd_post_send_iter(sc, NULL, &remaining_data_length);
+ return smbd_post_send_iter(sc, NULL, NULL, &remaining_data_length);
}
static int smbd_post_send_full_iter(struct smbdirect_socket *sc,
+ struct smbdirect_send_batch *batch,
struct iov_iter *iter,
int *_remaining_data_length)
{
@@ -1305,7 +1606,7 @@ static int smbd_post_send_full_iter(struct smbdirect_socket *sc,
*/
while (iov_iter_count(iter) > 0) {
- rc = smbd_post_send_iter(sc, iter, _remaining_data_length);
+ rc = smbd_post_send_iter(sc, batch, iter, _remaining_data_length);
if (rc < 0)
break;
}
@@ -2227,8 +2528,10 @@ int smbd_send(struct TCP_Server_Info *server,
struct smbdirect_socket_parameters *sp = &sc->parameters;
struct smb_rqst *rqst;
struct iov_iter iter;
+ struct smbdirect_send_batch batch;
unsigned int remaining_data_length, klen;
int rc, i, rqst_idx;
+ int error = 0;
if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
return -EAGAIN;
@@ -2253,6 +2556,7 @@ int smbd_send(struct TCP_Server_Info *server,
num_rqst, remaining_data_length);
rqst_idx = 0;
+ smbd_send_batch_init(&batch, false, 0);
do {
rqst = &rqst_array[rqst_idx];
@@ -2271,20 +2575,28 @@ int smbd_send(struct TCP_Server_Info *server,
klen += rqst->rq_iov[i].iov_len;
iov_iter_kvec(&iter, ITER_SOURCE, rqst->rq_iov, rqst->rq_nvec, klen);
- rc = smbd_post_send_full_iter(sc, &iter, &remaining_data_length);
- if (rc < 0)
+ rc = smbd_post_send_full_iter(sc, &batch, &iter, &remaining_data_length);
+ if (rc < 0) {
+ error = rc;
break;
+ }
if (iov_iter_count(&rqst->rq_iter) > 0) {
/* And then the data pages if there are any */
- rc = smbd_post_send_full_iter(sc, &rqst->rq_iter,
+ rc = smbd_post_send_full_iter(sc, &batch, &rqst->rq_iter,
&remaining_data_length);
- if (rc < 0)
+ if (rc < 0) {
+ error = rc;
break;
+ }
}
} while (++rqst_idx < num_rqst);
+ rc = smbd_send_batch_flush(sc, &batch, true);
+ if (unlikely(!rc && error))
+ rc = error;
+
/*
* As an optimization, we don't wait for individual I/O to finish
* before sending the next one.
diff --git a/fs/smb/common/smbdirect/smbdirect_socket.h b/fs/smb/common/smbdirect/smbdirect_socket.h
index ee4c272..9526519 100644
--- a/fs/smb/common/smbdirect/smbdirect_socket.h
+++ b/fs/smb/common/smbdirect/smbdirect_socket.h
@@ -163,6 +163,17 @@ struct smbdirect_socket {
} mem;
/*
+ * This is a coordination for smbdirect_send_batch.
+ *
+ * There's only one possible credit, which means
+ * only one instance is running at a time.
+ */
+ struct {
+ atomic_t count;
+ wait_queue_head_t wait_queue;
+ } bcredits;
+
+ /*
* The local credit state for ib_post_send()
*/
struct {
@@ -239,6 +250,7 @@ struct smbdirect_socket {
*/
struct {
u16 target;
+ atomic_t available;
atomic_t count;
} credits;
@@ -370,6 +382,9 @@ static __always_inline void smbdirect_socket_init(struct smbdirect_socket *sc)
INIT_DELAYED_WORK(&sc->idle.timer_work, __smbdirect_socket_disabled_work);
disable_delayed_work_sync(&sc->idle.timer_work);
+ atomic_set(&sc->send_io.bcredits.count, 0);
+ init_waitqueue_head(&sc->send_io.bcredits.wait_queue);
+
atomic_set(&sc->send_io.lcredits.count, 0);
init_waitqueue_head(&sc->send_io.lcredits.wait_queue);
@@ -387,6 +402,7 @@ static __always_inline void smbdirect_socket_init(struct smbdirect_socket *sc)
INIT_WORK(&sc->recv_io.posted.refill_work, __smbdirect_socket_disabled_work);
disable_work_sync(&sc->recv_io.posted.refill_work);
+ atomic_set(&sc->recv_io.credits.available, 0);
atomic_set(&sc->recv_io.credits.count, 0);
INIT_LIST_HEAD(&sc->recv_io.reassembly.list);
@@ -483,6 +499,8 @@ struct smbdirect_send_batch {
*/
bool need_invalidate_rkey;
u32 remote_key;
+
+ int credit;
};
struct smbdirect_recv_io {
diff --git a/fs/smb/server/Makefile b/fs/smb/server/Makefile
index 7d6337a..6407ba6 100644
--- a/fs/smb/server/Makefile
+++ b/fs/smb/server/Makefile
@@ -18,3 +18,4 @@
$(obj)/ksmbd_spnego_negtokentarg.asn1.o: $(obj)/ksmbd_spnego_negtokentarg.asn1.c $(obj)/ksmbd_spnego_negtokentarg.asn1.h
ksmbd-$(CONFIG_SMB_SERVER_SMBDIRECT) += transport_rdma.o
+ksmbd-$(CONFIG_PROC_FS) += proc.o
diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c
index 6cac48c..e7e3e77 100644
--- a/fs/smb/server/connection.c
+++ b/fs/smb/server/connection.c
@@ -14,6 +14,7 @@
#include "connection.h"
#include "transport_tcp.h"
#include "transport_rdma.h"
+#include "misc.h"
static DEFINE_MUTEX(init_lock);
@@ -22,6 +23,62 @@ static struct ksmbd_conn_ops default_conn_ops;
DEFINE_HASHTABLE(conn_list, CONN_HASH_BITS);
DECLARE_RWSEM(conn_list_lock);
+#ifdef CONFIG_PROC_FS
+static struct proc_dir_entry *proc_clients;
+
+/*
+ * seq_file show callback for the "clients" procfs entry.
+ *
+ * Prints a header line followed by one line per connection found in
+ * conn_list: address, dialect, total credits, open file count, running
+ * request count and the time of last activity.  Always returns 0.
+ */
+static int proc_show_clients(struct seq_file *m, void *v)
+{
+ struct ksmbd_conn *conn;
+ struct timespec64 now, t;
+ int i;
+
+ seq_printf(m, "#%-20s %-10s %-10s %-10s %-10s %-10s\n",
+ "<name>", "<dialect>", "<credits>", "<open files>",
+ "<requests>", "<last active>");
+
+ /* conn_list is walked under the read side of conn_list_lock. */
+ down_read(&conn_list_lock);
+ hash_for_each(conn_list, i, conn, hlist) {
+ /*
+ * Turn the jiffies elapsed since last_active into a timespec and
+ * subtract it from the current real time to get a wall-clock stamp.
+ */
+ jiffies_to_timespec64(jiffies - conn->last_active, &t);
+ ktime_get_real_ts64(&now);
+ t = timespec64_sub(now, t);
+#if IS_ENABLED(CONFIG_IPV6)
+ /* A zero inet_addr selects the IPv6 address for printing. */
+ if (!conn->inet_addr)
+ seq_printf(m, "%-20pI6c", &conn->inet6_addr);
+ else
+#endif
+ seq_printf(m, "%-20pI4", &conn->inet_addr);
+ seq_printf(m, " 0x%-10x %-10u %-12d %-10d %ptT\n",
+ conn->dialect,
+ conn->total_credits,
+ atomic_read(&conn->stats.open_files_count),
+ atomic_read(&conn->req_running),
+ &t);
+ }
+ up_read(&conn_list_lock);
+ return 0;
+}
+
+/*
+ * Register the "clients" procfs entry, served by proc_show_clients().
+ *
+ * Returns 0 on success or -ENOMEM when the entry could not be created.
+ */
+static int create_proc_clients(void)
+{
+	proc_clients = ksmbd_proc_create("clients", proc_show_clients, NULL);
+
+	return proc_clients ? 0 : -ENOMEM;
+}
+
+/* Remove the "clients" procfs entry if one is registered. */
+static void delete_proc_clients(void)
+{
+	if (!proc_clients)
+		return;
+
+	proc_remove(proc_clients);
+	proc_clients = NULL;
+}
+#else
+static int create_proc_clients(void) { return 0; }
+static void delete_proc_clients(void) {}
+#endif
+
/**
* ksmbd_conn_free() - free resources of the connection instance
*
@@ -472,6 +529,7 @@ int ksmbd_conn_transport_init(void)
}
out:
mutex_unlock(&init_lock);
+ create_proc_clients();
return ret;
}
@@ -502,6 +560,7 @@ static void stop_sessions(void)
void ksmbd_conn_transport_destroy(void)
{
+ delete_proc_clients();
mutex_lock(&init_lock);
ksmbd_tcp_destroy();
ksmbd_rdma_stop_listening();
diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h
index 7f9bcd9..1e25870 100644
--- a/fs/smb/server/connection.h
+++ b/fs/smb/server/connection.h
@@ -7,6 +7,7 @@
#define __KSMBD_CONNECTION_H__
#include <linux/list.h>
+#include <linux/inet.h>
#include <linux/ip.h>
#include <net/sock.h>
#include <net/tcp.h>
@@ -33,7 +34,7 @@ enum {
KSMBD_SESS_RELEASING
};
-struct ksmbd_stats {
+struct ksmbd_conn_stats {
atomic_t open_files_count;
atomic64_t request_served;
};
@@ -78,7 +79,7 @@ struct ksmbd_conn {
struct list_head requests;
struct list_head async_requests;
int connection_type;
- struct ksmbd_stats stats;
+ struct ksmbd_conn_stats stats;
char ClientGUID[SMB2_CLIENT_GUID_SIZE];
struct ntlmssp_auth ntlmssp;
diff --git a/fs/smb/server/mgmt/tree_connect.c b/fs/smb/server/mgmt/tree_connect.c
index d3483d9..57dd47e 100644
--- a/fs/smb/server/mgmt/tree_connect.c
+++ b/fs/smb/server/mgmt/tree_connect.c
@@ -9,6 +9,7 @@
#include "../transport_ipc.h"
#include "../connection.h"
+#include "../stats.h"
#include "tree_connect.h"
#include "user_config.h"
@@ -79,12 +80,15 @@ ksmbd_tree_conn_connect(struct ksmbd_work *work, const char *share_name)
status.tree_conn = tree_conn;
atomic_set(&tree_conn->refcount, 1);
+ down_write(&sess->tree_conns_lock);
ret = xa_err(xa_store(&sess->tree_conns, tree_conn->id, tree_conn,
KSMBD_DEFAULT_GFP));
+ up_write(&sess->tree_conns_lock);
if (ret) {
status.ret = -ENOMEM;
goto out_error;
}
+ ksmbd_counter_inc(KSMBD_COUNTER_TREE_CONNS);
kvfree(resp);
return status;
@@ -103,29 +107,36 @@ void ksmbd_tree_connect_put(struct ksmbd_tree_connect *tcon)
kfree(tcon);
}
-int ksmbd_tree_conn_disconnect(struct ksmbd_session *sess,
- struct ksmbd_tree_connect *tree_conn)
+static int __ksmbd_tree_conn_disconnect(struct ksmbd_session *sess,
+ struct ksmbd_tree_connect *tree_conn)
{
int ret;
- write_lock(&sess->tree_conns_lock);
- xa_erase(&sess->tree_conns, tree_conn->id);
- write_unlock(&sess->tree_conns_lock);
-
ret = ksmbd_ipc_tree_disconnect_request(sess->id, tree_conn->id);
ksmbd_release_tree_conn_id(sess, tree_conn->id);
ksmbd_share_config_put(tree_conn->share_conf);
+ ksmbd_counter_dec(KSMBD_COUNTER_TREE_CONNS);
if (atomic_dec_and_test(&tree_conn->refcount))
kfree(tree_conn);
return ret;
}
+/*
+ * Remove @tree_conn from @sess and tear it down.
+ *
+ * The entry is erased from sess->tree_conns under the write side of
+ * tree_conns_lock; the rest of the teardown is done by
+ * __ksmbd_tree_conn_disconnect() after the lock has been dropped, and
+ * its return value is passed through.
+ */
+int ksmbd_tree_conn_disconnect(struct ksmbd_session *sess,
+ struct ksmbd_tree_connect *tree_conn)
+{
+ down_write(&sess->tree_conns_lock);
+ xa_erase(&sess->tree_conns, tree_conn->id);
+ up_write(&sess->tree_conns_lock);
+
+ return __ksmbd_tree_conn_disconnect(sess, tree_conn);
+}
+
struct ksmbd_tree_connect *ksmbd_tree_conn_lookup(struct ksmbd_session *sess,
unsigned int id)
{
struct ksmbd_tree_connect *tcon;
- read_lock(&sess->tree_conns_lock);
+ down_read(&sess->tree_conns_lock);
tcon = xa_load(&sess->tree_conns, id);
if (tcon) {
if (tcon->t_state != TREE_CONNECTED)
@@ -133,7 +144,7 @@ struct ksmbd_tree_connect *ksmbd_tree_conn_lookup(struct ksmbd_session *sess,
else if (!atomic_inc_not_zero(&tcon->refcount))
tcon = NULL;
}
- read_unlock(&sess->tree_conns_lock);
+ up_read(&sess->tree_conns_lock);
return tcon;
}
@@ -147,18 +158,19 @@ int ksmbd_tree_conn_session_logoff(struct ksmbd_session *sess)
if (!sess)
return -EINVAL;
+ down_write(&sess->tree_conns_lock);
xa_for_each(&sess->tree_conns, id, tc) {
- write_lock(&sess->tree_conns_lock);
if (tc->t_state == TREE_DISCONNECTED) {
- write_unlock(&sess->tree_conns_lock);
ret = -ENOENT;
continue;
}
tc->t_state = TREE_DISCONNECTED;
- write_unlock(&sess->tree_conns_lock);
- ret |= ksmbd_tree_conn_disconnect(sess, tc);
+ xa_erase(&sess->tree_conns, tc->id);
+ ret |= __ksmbd_tree_conn_disconnect(sess, tc);
}
xa_destroy(&sess->tree_conns);
+ up_write(&sess->tree_conns_lock);
+
return ret;
}
diff --git a/fs/smb/server/mgmt/user_config.c b/fs/smb/server/mgmt/user_config.c
index 56c9a38..3267b86 100644
--- a/fs/smb/server/mgmt/user_config.c
+++ b/fs/smb/server/mgmt/user_config.c
@@ -90,11 +90,9 @@ void ksmbd_free_user(struct ksmbd_user *user)
kfree(user);
}
-int ksmbd_anonymous_user(struct ksmbd_user *user)
+bool ksmbd_anonymous_user(struct ksmbd_user *user)
{
- if (user->name[0] == '\0')
- return 1;
- return 0;
+ return user->name[0] == '\0';
}
bool ksmbd_compare_user(struct ksmbd_user *u1, struct ksmbd_user *u2)
diff --git a/fs/smb/server/mgmt/user_config.h b/fs/smb/server/mgmt/user_config.h
index 8c227b8..cc460b4 100644
--- a/fs/smb/server/mgmt/user_config.h
+++ b/fs/smb/server/mgmt/user_config.h
@@ -65,6 +65,6 @@ struct ksmbd_user *ksmbd_login_user(const char *account);
struct ksmbd_user *ksmbd_alloc_user(struct ksmbd_login_response *resp,
struct ksmbd_login_response_ext *resp_ext);
void ksmbd_free_user(struct ksmbd_user *user);
-int ksmbd_anonymous_user(struct ksmbd_user *user);
+bool ksmbd_anonymous_user(struct ksmbd_user *user);
bool ksmbd_compare_user(struct ksmbd_user *u1, struct ksmbd_user *u2);
#endif /* __USER_CONFIG_MANAGEMENT_H__ */
diff --git a/fs/smb/server/mgmt/user_session.c b/fs/smb/server/mgmt/user_session.c
index 7d880ff..957a12d 100644
--- a/fs/smb/server/mgmt/user_session.c
+++ b/fs/smb/server/mgmt/user_session.c
@@ -12,9 +12,12 @@
#include "user_session.h"
#include "user_config.h"
#include "tree_connect.h"
+#include "share_config.h"
#include "../transport_ipc.h"
#include "../connection.h"
#include "../vfs_cache.h"
+#include "../misc.h"
+#include "../stats.h"
static DEFINE_IDA(session_ida);
@@ -27,17 +30,236 @@ struct ksmbd_session_rpc {
unsigned int method;
};
+#ifdef CONFIG_PROC_FS
+
+static const struct ksmbd_const_name ksmbd_sess_cap_const_names[] = {
+ {SMB2_GLOBAL_CAP_DFS, "dfs"},
+ {SMB2_GLOBAL_CAP_LEASING, "lease"},
+ {SMB2_GLOBAL_CAP_LARGE_MTU, "large-mtu"},
+ {SMB2_GLOBAL_CAP_MULTI_CHANNEL, "multi-channel"},
+ {SMB2_GLOBAL_CAP_PERSISTENT_HANDLES, "persistent-handles"},
+ {SMB2_GLOBAL_CAP_DIRECTORY_LEASING, "dir-lease"},
+ {SMB2_GLOBAL_CAP_ENCRYPTION, "encryption"}
+};
+
+static const struct ksmbd_const_name ksmbd_cipher_const_names[] = {
+ {le16_to_cpu(SMB2_ENCRYPTION_AES128_CCM), "aes128-ccm"},
+ {le16_to_cpu(SMB2_ENCRYPTION_AES128_GCM), "aes128-gcm"},
+ {le16_to_cpu(SMB2_ENCRYPTION_AES256_CCM), "aes256-ccm"},
+ {le16_to_cpu(SMB2_ENCRYPTION_AES256_GCM), "aes256-gcm"},
+};
+
+static const struct ksmbd_const_name ksmbd_signing_const_names[] = {
+ {SIGNING_ALG_HMAC_SHA256, "hmac-sha256"},
+ {SIGNING_ALG_AES_CMAC, "aes-cmac"},
+ {SIGNING_ALG_AES_GMAC, "aes-gmac"},
+};
+
+/*
+ * Map session->state to the label shown in procfs.
+ *
+ * Returns "valid", "progress" or "expired" for the three known states
+ * and an empty string for anything else.
+ */
+static const char *session_state_string(struct ksmbd_session *session)
+{
+	const char *label = "";
+
+	switch (session->state) {
+	case SMB2_SESSION_VALID:
+		label = "valid";
+		break;
+	case SMB2_SESSION_IN_PROGRESS:
+		label = "progress";
+		break;
+	case SMB2_SESSION_EXPIRED:
+		label = "expired";
+		break;
+	default:
+		break;
+	}
+
+	return label;
+}
+
+/*
+ * Return the user label shown in procfs for @session.
+ *
+ * Guest users are reported as "(Guest)", users with an empty name as
+ * "(Anonymous)"; every other user is reported by name.  The guest
+ * check is made first.
+ */
+static const char *session_user_name(struct ksmbd_session *session)
+{
+	const char *label;
+
+	if (user_guest(session->user))
+		label = "(Guest)";
+	else if (ksmbd_anonymous_user(session->user))
+		label = "(Anonymous)";
+	else
+		label = session->user->name;
+
+	return label;
+}
+
+/*
+ * seq_file show callback for a per-session procfs entry.
+ *
+ * For every channel of the session stored in m->private it prints the
+ * client address, user, session id, state, capabilities and either the
+ * signing or the encryption algorithm; then the channel count; then one
+ * line per tree connection.  Always returns 0.
+ */
+static int show_proc_session(struct seq_file *m, void *v)
+{
+ struct ksmbd_session *sess;
+ struct ksmbd_tree_connect *tree_conn;
+ struct ksmbd_share_config *share_conf;
+ struct channel *chan;
+ unsigned long id;
+ int i = 0;
+
+ sess = (struct ksmbd_session *)m->private;
+ /* Presumably pins the session for the duration of the dump - confirm. */
+ ksmbd_user_session_get(sess);
+
+ /* i counts the channels visited below. */
+ i = 0;
+ down_read(&sess->chann_lock);
+ xa_for_each(&sess->ksmbd_chann_list, id, chan) {
+#if IS_ENABLED(CONFIG_IPV6)
+ /* A non-zero inet_addr selects the IPv4 address for printing. */
+ if (chan->conn->inet_addr)
+ seq_printf(m, "%-20s\t%pI4\n", "client",
+ &chan->conn->inet_addr);
+ else
+ seq_printf(m, "%-20s\t%pI6c\n", "client",
+ &chan->conn->inet6_addr);
+#else
+ seq_printf(m, "%-20s\t%pI4\n", "client",
+ &chan->conn->inet_addr);
+#endif
+ seq_printf(m, "%-20s\t%s\n", "user", session_user_name(sess));
+ seq_printf(m, "%-20s\t%llu\n", "id", sess->id);
+ seq_printf(m, "%-20s\t%s\n", "state",
+ session_state_string(sess));
+
+ seq_printf(m, "%-20s\t", "capabilities");
+ ksmbd_proc_show_flag_names(m,
+ ksmbd_sess_cap_const_names,
+ ARRAY_SIZE(ksmbd_sess_cap_const_names),
+ chan->conn->vals->req_capabilities);
+
+ /* Signing is reported in preference to encryption; never both. */
+ if (sess->sign) {
+ seq_printf(m, "%-20s\t", "signing");
+ ksmbd_proc_show_const_name(m, "%s\t",
+ ksmbd_signing_const_names,
+ ARRAY_SIZE(ksmbd_signing_const_names),
+ le16_to_cpu(chan->conn->signing_algorithm));
+ } else if (sess->enc) {
+ seq_printf(m, "%-20s\t", "encryption");
+ ksmbd_proc_show_const_name(m, "%s\t",
+ ksmbd_cipher_const_names,
+ ARRAY_SIZE(ksmbd_cipher_const_names),
+ le16_to_cpu(chan->conn->cipher_type));
+ }
+ i++;
+ }
+ up_read(&sess->chann_lock);
+
+ seq_printf(m, "%-20s\t%d\n", "channels", i);
+
+ /* i is reset here but not used again in this function. */
+ i = 0;
+ down_read(&sess->tree_conns_lock);
+ xa_for_each(&sess->tree_conns, id, tree_conn) {
+ share_conf = tree_conn->share_conf;
+ seq_printf(m, "%-20s\t%s\t%8d", "share",
+ share_conf->name, tree_conn->id);
+ /* Shares carrying KSMBD_SHARE_FLAG_PIPE are shown as "pipe", all others as "disk". */
+ if (test_share_config_flag(share_conf, KSMBD_SHARE_FLAG_PIPE))
+ seq_printf(m, " %s ", "pipe");
+ else
+ seq_printf(m, " %s ", "disk");
+ seq_putc(m, '\n');
+ }
+ up_read(&sess->tree_conns_lock);
+
+ ksmbd_user_session_put(sess);
+ return 0;
+}
+
+/*
+ * Print the table entries whose value has a bit in common with @flags.
+ *
+ * @m:     seq_file to write to
+ * @table: value/name pairs to test
+ * @count: number of entries in @table
+ * @flags: bit mask to test the entries against
+ *
+ * Each matching entry is written as its numeric value in hex (the name
+ * field is not printed), tab separated, and the line is always ended
+ * with a newline.
+ */
+void ksmbd_proc_show_flag_names(struct seq_file *m,
+				const struct ksmbd_const_name *table,
+				int count,
+				unsigned int flags)
+{
+	int idx = 0;
+
+	while (idx < count) {
+		unsigned int value = table[idx].const_value;
+
+		if ((value & flags) != 0)
+			seq_printf(m, "0x%08x\t", value);
+		idx++;
+	}
+
+	seq_putc(m, '\n');
+}
+
+/*
+ * Print the name of the table entry whose value equals @const_value.
+ *
+ * @m:           seq_file to write to
+ * @format:      seq_printf() format taking one string argument (the name)
+ * @table:       value/name pairs to search
+ * @count:       number of entries in @table
+ * @const_value: enumerated constant to look up
+ *
+ * The tables handed to this helper hold enumerated identifiers (cipher
+ * and signing algorithm ids), not bit flags, so entries are matched by
+ * equality: a bitwise test reports several names for ids that share
+ * bits and can never match an id of zero.  A newline is always
+ * written, even when no entry matched.
+ */
+void ksmbd_proc_show_const_name(struct seq_file *m,
+				const char *format,
+				const struct ksmbd_const_name *table,
+				int count,
+				unsigned int const_value)
+{
+	int i;
+
+	for (i = 0; i < count; i++) {
+		if (table[i].const_value == const_value) {
+			seq_printf(m, format, table[i].name);
+			break;
+		}
+	}
+	seq_putc(m, '\n');
+}
+
+/*
+ * Register the procfs entry "sessions/<id>" for @sess, served by
+ * show_proc_session(), and remember it in sess->proc_entry.
+ *
+ * Always returns 0; a failed registration leaves whatever
+ * ksmbd_proc_create() returned in sess->proc_entry.
+ */
+static int create_proc_session(struct ksmbd_session *sess)
+{
+	/* "sessions/" plus a 64-bit decimal id plus the terminator fits in 30 bytes. */
+	char path[30];
+
+	snprintf(path, sizeof(path), "sessions/%llu", sess->id);
+	sess->proc_entry = ksmbd_proc_create(path, show_proc_session, sess);
+
+	return 0;
+}
+
+/* Remove the procfs entry of @sess, if it has one. */
+static void delete_proc_session(struct ksmbd_session *sess)
+{
+	if (!sess->proc_entry)
+		return;
+
+	proc_remove(sess->proc_entry);
+}
+
+/*
+ * seq_file show callback for the "sessions/sessions" procfs entry.
+ *
+ * Prints a header line followed by one line per (session, channel)
+ * pair: client address, user, session id and state.  Always returns 0.
+ */
+static int show_proc_sessions(struct seq_file *m, void *v)
+{
+ struct ksmbd_session *session;
+ struct channel *chan;
+ int i;
+ unsigned long id;
+
+ seq_printf(m, "#%-40s %-15s %-10s %-10s\n",
+ "<client>", "<user>", "<sess_id>", "<state>");
+
+ /*
+ * Lock nesting, all taken for reading: sessions_table_lock, then the
+ * session's chann_lock, then the connection's session_lock.
+ */
+ down_read(&sessions_table_lock);
+ hash_for_each(sessions_table, i, session, hlist) {
+ down_read(&session->chann_lock);
+ xa_for_each(&session->ksmbd_chann_list, id, chan) {
+ down_read(&chan->conn->session_lock);
+ /* Presumably pins the session while its line is printed - confirm. */
+ ksmbd_user_session_get(session);
+
+#if IS_ENABLED(CONFIG_IPV6)
+ /* A zero inet_addr selects the IPv6 address for printing. */
+ if (!chan->conn->inet_addr)
+ seq_printf(m, " %-40pI6c", &chan->conn->inet6_addr);
+ else
+#endif
+ seq_printf(m, " %-40pI4", &chan->conn->inet_addr);
+ seq_printf(m, " %-15s %-10llu %-10s\n",
+ session_user_name(session),
+ session->id,
+ session_state_string(session));
+
+ ksmbd_user_session_put(session);
+ up_read(&chan->conn->session_lock);
+ }
+ up_read(&session->chann_lock);
+ }
+ up_read(&sessions_table_lock);
+ return 0;
+}
+
+/*
+ * Register the "sessions/sessions" procfs entry, served by
+ * show_proc_sessions().
+ *
+ * Returns 0 on success or -ENOMEM when the entry could not be created.
+ */
+int create_proc_sessions(void)
+{
+	struct proc_dir_entry *entry;
+
+	entry = ksmbd_proc_create("sessions/sessions", show_proc_sessions, NULL);
+
+	return entry ? 0 : -ENOMEM;
+}
+#else
+int create_proc_sessions(void) { return 0; }
+static int create_proc_session(struct ksmbd_session *sess) { return 0; }
+static void delete_proc_session(struct ksmbd_session *sess) {}
+#endif
+
static void free_channel_list(struct ksmbd_session *sess)
{
struct channel *chann;
unsigned long index;
+ down_write(&sess->chann_lock);
xa_for_each(&sess->ksmbd_chann_list, index, chann) {
xa_erase(&sess->ksmbd_chann_list, index);
kfree(chann);
}
xa_destroy(&sess->ksmbd_chann_list);
+ up_write(&sess->chann_lock);
}
static void __session_rpc_close(struct ksmbd_session *sess,
@@ -159,6 +381,8 @@ void ksmbd_session_destroy(struct ksmbd_session *sess)
if (!sess)
return;
+ delete_proc_session(sess);
+
if (sess->user)
ksmbd_free_user(sess->user);
@@ -220,7 +444,9 @@ static int ksmbd_chann_del(struct ksmbd_conn *conn, struct ksmbd_session *sess)
{
struct channel *chann;
+ down_write(&sess->chann_lock);
chann = xa_erase(&sess->ksmbd_chann_list, (long)conn);
+ up_write(&sess->chann_lock);
if (!chann)
return -ENOENT;
@@ -451,9 +677,10 @@ static struct ksmbd_session *__session_create(int protocol)
xa_init(&sess->ksmbd_chann_list);
xa_init(&sess->rpc_handle_list);
sess->sequence_number = 1;
- rwlock_init(&sess->tree_conns_lock);
atomic_set(&sess->refcnt, 2);
+ init_rwsem(&sess->tree_conns_lock);
init_rwsem(&sess->rpc_lock);
+ init_rwsem(&sess->chann_lock);
ret = __init_smb2_session(sess);
if (ret)
@@ -465,6 +692,8 @@ static struct ksmbd_session *__session_create(int protocol)
hash_add(sessions_table, &sess->hlist, sess->id);
up_write(&sessions_table_lock);
+ create_proc_session(sess);
+ ksmbd_counter_inc(KSMBD_COUNTER_SESSIONS);
return sess;
error:
diff --git a/fs/smb/server/mgmt/user_session.h b/fs/smb/server/mgmt/user_session.h
index c5749d6e..6aebd38 100644
--- a/fs/smb/server/mgmt/user_session.h
+++ b/fs/smb/server/mgmt/user_session.h
@@ -41,7 +41,6 @@ struct ksmbd_session {
bool sign;
bool enc;
- bool is_anonymous;
int state;
__u8 *Preauth_HashValue;
@@ -49,6 +48,7 @@ struct ksmbd_session {
char sess_key[CIFS_KEY_SIZE];
struct hlist_node hlist;
+ struct rw_semaphore chann_lock;
struct xarray ksmbd_chann_list;
struct xarray tree_conns;
struct ida tree_conn_ida;
@@ -60,8 +60,11 @@ struct ksmbd_session {
struct ksmbd_file_table file_table;
unsigned long last_active;
- rwlock_t tree_conns_lock;
+ struct rw_semaphore tree_conns_lock;
+#ifdef CONFIG_PROC_FS
+ struct proc_dir_entry *proc_entry;
+#endif
atomic_t refcnt;
struct rw_semaphore rpc_lock;
};
@@ -111,4 +114,5 @@ void ksmbd_session_rpc_close(struct ksmbd_session *sess, int id);
int ksmbd_session_rpc_method(struct ksmbd_session *sess, int id);
void ksmbd_user_session_get(struct ksmbd_session *sess);
void ksmbd_user_session_put(struct ksmbd_session *sess);
+int create_proc_sessions(void);
#endif /* __USER_SESSION_MANAGEMENT_H__ */
diff --git a/fs/smb/server/misc.h b/fs/smb/server/misc.h
index 1facfcd..1342369 100644
--- a/fs/smb/server/misc.h
+++ b/fs/smb/server/misc.h
@@ -6,6 +6,9 @@
#ifndef __KSMBD_MISC_H__
#define __KSMBD_MISC_H__
+#ifdef CONFIG_PROC_FS
+#include <linux/proc_fs.h>
+#endif
struct ksmbd_share_config;
struct nls_table;
struct kstat;
@@ -34,4 +37,31 @@ char *ksmbd_convert_dir_info_name(struct ksmbd_dir_info *d_info,
struct timespec64 ksmbd_NTtimeToUnix(__le64 ntutc);
u64 ksmbd_UnixTimeToNT(struct timespec64 t);
long long ksmbd_systime(void);
+
+#ifdef CONFIG_PROC_FS
+struct ksmbd_const_name {
+ unsigned int const_value;
+ const char *name;
+};
+
+void ksmbd_proc_init(void);
+void ksmbd_proc_cleanup(void);
+void ksmbd_proc_reset(void);
+struct proc_dir_entry *ksmbd_proc_create(const char *name,
+ int (*show)(struct seq_file *m, void *v),
+ void *v);
+void ksmbd_proc_show_flag_names(struct seq_file *m,
+ const struct ksmbd_const_name *table,
+ int count,
+ unsigned int flags);
+void ksmbd_proc_show_const_name(struct seq_file *m,
+ const char *format,
+ const struct ksmbd_const_name *table,
+ int count,
+ unsigned int const_value);
+#else
+static inline void ksmbd_proc_init(void) {}
+static inline void ksmbd_proc_cleanup(void) {}
+static inline void ksmbd_proc_reset(void) {}
+#endif
#endif /* __KSMBD_MISC_H__ */
diff --git a/fs/smb/server/proc.c b/fs/smb/server/proc.c
new file mode 100644
index 0000000..101a2cc
--- /dev/null
+++ b/fs/smb/server/proc.c
@@ -0,0 +1,134 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2025, LG Electronics.
+ * Author(s): Hyunchul Lee <hyc.lee@gmail.com>
+ * Copyright (C) 2025, Samsung Electronics.
+ * Author(s): Vedansh Bhardwaj <v.bhardwaj@samsung.com>
+ */
+
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+#include "misc.h"
+#include "server.h"
+#include "stats.h"
+#include "smb_common.h"
+#include "smb2pdu.h"
+
+static struct proc_dir_entry *ksmbd_proc_fs;
+struct ksmbd_counters ksmbd_counters;
+
+/*
+ * Create a single-show procfs file below the ksmbd proc directory.
+ *
+ * @name: path of the entry relative to ksmbd_proc_fs
+ * @show: seq_file show callback producing the file content
+ * @v:    private data handed to proc_create_single_data()
+ *
+ * The file is created with mode 0400.  Returns whatever
+ * proc_create_single_data() returns.
+ */
+struct proc_dir_entry *ksmbd_proc_create(const char *name,
+					 int (*show)(struct seq_file *m, void *v),
+					 void *v)
+{
+	struct proc_dir_entry *entry;
+
+	entry = proc_create_single_data(name, 0400, ksmbd_proc_fs, show, v);
+
+	return entry;
+}
+
+/* Pairs an SMB2 command value with the name used when printing it. */
+struct ksmbd_const_smb2_process_req {
+ unsigned int const_value;
+ const char *name;
+};
+
+/*
+ * Display names for the per-command request counters.  The table is
+ * declared with KSMBD_COUNTER_MAX_REQS slots and proc_show_ksmbd_stats()
+ * pairs slot i with counter KSMBD_COUNTER_FIRST_REQ + i.
+ */
+static const struct ksmbd_const_smb2_process_req smb2_process_req[KSMBD_COUNTER_MAX_REQS] = {
+ {le16_to_cpu(SMB2_NEGOTIATE), "SMB2_NEGOTIATE"},
+ {le16_to_cpu(SMB2_SESSION_SETUP), "SMB2_SESSION_SETUP"},
+ {le16_to_cpu(SMB2_LOGOFF), "SMB2_LOGOFF"},
+ {le16_to_cpu(SMB2_TREE_CONNECT), "SMB2_TREE_CONNECT"},
+ {le16_to_cpu(SMB2_TREE_DISCONNECT), "SMB2_TREE_DISCONNECT"},
+ {le16_to_cpu(SMB2_CREATE), "SMB2_CREATE"},
+ {le16_to_cpu(SMB2_CLOSE), "SMB2_CLOSE"},
+ {le16_to_cpu(SMB2_FLUSH), "SMB2_FLUSH"},
+ {le16_to_cpu(SMB2_READ), "SMB2_READ"},
+ {le16_to_cpu(SMB2_WRITE), "SMB2_WRITE"},
+ {le16_to_cpu(SMB2_LOCK), "SMB2_LOCK"},
+ {le16_to_cpu(SMB2_IOCTL), "SMB2_IOCTL"},
+ {le16_to_cpu(SMB2_CANCEL), "SMB2_CANCEL"},
+ {le16_to_cpu(SMB2_ECHO), "SMB2_ECHO"},
+ {le16_to_cpu(SMB2_QUERY_DIRECTORY), "SMB2_QUERY_DIRECTORY"},
+ {le16_to_cpu(SMB2_CHANGE_NOTIFY), "SMB2_CHANGE_NOTIFY"},
+ {le16_to_cpu(SMB2_QUERY_INFO), "SMB2_QUERY_INFO"},
+ {le16_to_cpu(SMB2_SET_INFO), "SMB2_SET_INFO"},
+ {le16_to_cpu(SMB2_OPLOCK_BREAK), "SMB2_OPLOCK_BREAK"},
+};
+
+/*
+ * seq_file show callback for the "server" procfs entry.
+ *
+ * Prints a "Server" section with the configured identity, protocol
+ * range, flags and the global counters, followed by an "SMB2" section
+ * with one request counter per smb2_process_req slot.  Always returns 0.
+ */
+static int proc_show_ksmbd_stats(struct seq_file *m, void *v)
+{
+ int i;
+
+ seq_puts(m, "Server\n");
+ seq_printf(m, "name: %s\n", ksmbd_server_string());
+ seq_printf(m, "netbios: %s\n", ksmbd_netbios_name());
+ seq_printf(m, "work group: %s\n", ksmbd_work_group());
+ seq_printf(m, "min protocol: %s\n", ksmbd_get_protocol_string(server_conf.min_protocol));
+ seq_printf(m, "max protocol: %s\n", ksmbd_get_protocol_string(server_conf.max_protocol));
+ seq_printf(m, "flags: 0x%08x\n", server_conf.flags);
+ seq_printf(m, "share_fake_fscaps: 0x%08x\n",
+ server_conf.share_fake_fscaps);
+ seq_printf(m, "sessions: %lld\n",
+ ksmbd_counter_sum(KSMBD_COUNTER_SESSIONS));
+ seq_printf(m, "tree connects: %lld\n",
+ ksmbd_counter_sum(KSMBD_COUNTER_TREE_CONNS));
+ seq_printf(m, "read bytes: %lld\n",
+ ksmbd_counter_sum(KSMBD_COUNTER_READ_BYTES));
+ seq_printf(m, "written bytes: %lld\n",
+ ksmbd_counter_sum(KSMBD_COUNTER_WRITE_BYTES));
+
+ seq_puts(m, "\nSMB2\n");
+ /* Slot i of smb2_process_req names counter KSMBD_COUNTER_FIRST_REQ + i. */
+ for (i = 0; i < KSMBD_COUNTER_MAX_REQS; i++)
+ seq_printf(m, "%-20s:\t%lld\n", smb2_process_req[i].name,
+ ksmbd_counter_sum(KSMBD_COUNTER_FIRST_REQ + i));
+ return 0;
+}
+
+/*
+ * Tear down the ksmbd procfs tree and the statistics counters.
+ *
+ * Does nothing when ksmbd_proc_fs is not set.  Otherwise the proc
+ * directory is removed, every counter in ksmbd_counters is destroyed
+ * and ksmbd_proc_fs is cleared.
+ */
+void ksmbd_proc_cleanup(void)
+{
+	int idx = 0;
+
+	if (ksmbd_proc_fs == NULL)
+		return;
+
+	proc_remove(ksmbd_proc_fs);
+
+	while (idx < ARRAY_SIZE(ksmbd_counters.counters)) {
+		percpu_counter_destroy(&ksmbd_counters.counters[idx]);
+		idx++;
+	}
+
+	ksmbd_proc_fs = NULL;
+}
+
+/*
+ * Reset every ksmbd statistics counter to zero.
+ *
+ * ksmbd_proc_init() returns void and calls ksmbd_proc_cleanup() when it
+ * fails, yet this function is still called afterwards (for example
+ * from server_ctrl_handle_init()).  ksmbd_proc_fs stays NULL when the
+ * proc directory could not be created and is cleared by
+ * ksmbd_proc_cleanup() once the counters are destroyed, so it is used
+ * as the guard against touching counters that are not initialised.
+ */
+void ksmbd_proc_reset(void)
+{
+	int i;
+
+	if (!ksmbd_proc_fs)
+		return;
+
+	for (i = 0; i < ARRAY_SIZE(ksmbd_counters.counters); i++)
+		percpu_counter_set(&ksmbd_counters.counters[i], 0);
+}
+
+/*
+ * Set up the ksmbd procfs tree and the statistics counters.
+ *
+ * Creates the "fs/ksmbd" proc directory and its "sessions"
+ * sub-directory, initialises every counter in ksmbd_counters, creates
+ * the "server" entry and zeroes the counters.  Nothing is returned:
+ * on any failure after the top directory exists, ksmbd_proc_cleanup()
+ * undoes the work and the function simply returns.
+ */
+void ksmbd_proc_init(void)
+{
+ int i;
+ int retval;
+
+ ksmbd_proc_fs = proc_mkdir("fs/ksmbd", NULL);
+ if (!ksmbd_proc_fs)
+ return;
+
+ if (!proc_mkdir_mode("sessions", 0400, ksmbd_proc_fs))
+ goto err_out;
+
+ for (i = 0; i < ARRAY_SIZE(ksmbd_counters.counters); i++) {
+ retval = percpu_counter_init(&ksmbd_counters.counters[i], 0, GFP_KERNEL);
+ if (retval)
+ goto err_out;
+ }
+
+ if (!ksmbd_proc_create("server", proc_show_ksmbd_stats, NULL))
+ goto err_out;
+
+ ksmbd_proc_reset();
+ return;
+err_out:
+ /* Cleanup destroys all counters, including any not yet initialised. */
+ ksmbd_proc_cleanup();
+}
diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c
index 554ae90..c2c0743 100644
--- a/fs/smb/server/server.c
+++ b/fs/smb/server/server.c
@@ -21,6 +21,8 @@
#include "mgmt/user_session.h"
#include "crypto_ctx.h"
#include "auth.h"
+#include "misc.h"
+#include "stats.h"
int ksmbd_debug_types;
@@ -126,25 +128,27 @@ static int __process_request(struct ksmbd_work *work, struct ksmbd_conn *conn,
andx_again:
if (command >= conn->max_cmds) {
conn->ops->set_rsp_status(work, STATUS_INVALID_PARAMETER);
- return SERVER_HANDLER_CONTINUE;
+ return SERVER_HANDLER_ABORT;
}
cmds = &conn->cmds[command];
if (!cmds->proc) {
ksmbd_debug(SMB, "*** not implemented yet cmd = %x\n", command);
conn->ops->set_rsp_status(work, STATUS_NOT_IMPLEMENTED);
- return SERVER_HANDLER_CONTINUE;
+ return SERVER_HANDLER_ABORT;
}
if (work->sess && conn->ops->is_sign_req(work, command)) {
ret = conn->ops->check_sign_req(work);
if (!ret) {
conn->ops->set_rsp_status(work, STATUS_ACCESS_DENIED);
- return SERVER_HANDLER_CONTINUE;
+ return SERVER_HANDLER_ABORT;
}
}
ret = cmds->proc(work);
+ if (conn->ops->inc_reqs)
+ conn->ops->inc_reqs(command);
if (ret < 0)
ksmbd_debug(CONN, "Failed to process %u [%d]\n", command, ret);
@@ -359,6 +363,7 @@ static void server_ctrl_handle_init(struct server_ctrl_struct *ctrl)
{
int ret;
+ ksmbd_proc_reset();
ret = ksmbd_conn_transport_init();
if (ret) {
server_queue_ctrl_reset_work();
@@ -531,6 +536,7 @@ static int ksmbd_server_shutdown(void)
{
WRITE_ONCE(server_conf.state, SERVER_STATE_SHUTTING_DOWN);
+ ksmbd_proc_cleanup();
class_unregister(&ksmbd_control_class);
ksmbd_workqueue_destroy();
ksmbd_ipc_release();
@@ -554,6 +560,9 @@ static int __init ksmbd_server_init(void)
return ret;
}
+ ksmbd_proc_init();
+ create_proc_sessions();
+
ksmbd_server_tcp_callbacks_init();
ret = server_conf_init();
diff --git a/fs/smb/server/smb2ops.c b/fs/smb/server/smb2ops.c
index edd7eca..c9a32ee 100644
--- a/fs/smb/server/smb2ops.c
+++ b/fs/smb/server/smb2ops.c
@@ -11,6 +11,7 @@
#include "connection.h"
#include "smb_common.h"
#include "server.h"
+#include "stats.h"
static struct smb_version_values smb21_server_values = {
.version_string = SMB21_VERSION_STRING,
@@ -121,6 +122,7 @@ static struct smb_version_values smb311_server_values = {
static struct smb_version_ops smb2_0_server_ops = {
.get_cmd_val = get_smb2_cmd_val,
+ .inc_reqs = ksmbd_counter_inc_reqs,
.init_rsp_hdr = init_smb2_rsp_hdr,
.set_rsp_status = set_smb2_rsp_status,
.allocate_rsp_buf = smb2_allocate_rsp_buf,
@@ -134,6 +136,7 @@ static struct smb_version_ops smb2_0_server_ops = {
static struct smb_version_ops smb3_0_server_ops = {
.get_cmd_val = get_smb2_cmd_val,
+ .inc_reqs = ksmbd_counter_inc_reqs,
.init_rsp_hdr = init_smb2_rsp_hdr,
.set_rsp_status = set_smb2_rsp_status,
.allocate_rsp_buf = smb2_allocate_rsp_buf,
@@ -152,6 +155,7 @@ static struct smb_version_ops smb3_0_server_ops = {
static struct smb_version_ops smb3_11_server_ops = {
.get_cmd_val = get_smb2_cmd_val,
+ .inc_reqs = ksmbd_counter_inc_reqs,
.init_rsp_hdr = init_smb2_rsp_hdr,
.set_rsp_status = set_smb2_rsp_status,
.allocate_rsp_buf = smb2_allocate_rsp_buf,
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index 2fcd0d4d..cbb31ef 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -39,6 +39,7 @@
#include "mgmt/user_session.h"
#include "mgmt/ksmbd_ida.h"
#include "ndr.h"
+#include "stats.h"
#include "transport_tcp.h"
static void __wbuf(struct ksmbd_work *work, void **req, void **rsp)
@@ -79,7 +80,13 @@ static inline bool check_session_id(struct ksmbd_conn *conn, u64 id)
struct channel *lookup_chann_list(struct ksmbd_session *sess, struct ksmbd_conn *conn)
{
- return xa_load(&sess->ksmbd_chann_list, (long)conn);
+ struct channel *chann;
+
+ down_read(&sess->chann_lock);
+ chann = xa_load(&sess->ksmbd_chann_list, (long)conn);
+ up_read(&sess->chann_lock);
+
+ return chann;
}
/**
@@ -1558,8 +1565,10 @@ static int ntlm_authenticate(struct ksmbd_work *work,
return -ENOMEM;
chann->conn = conn;
+ down_write(&sess->chann_lock);
old = xa_store(&sess->ksmbd_chann_list, (long)conn, chann,
KSMBD_DEFAULT_GFP);
+ up_write(&sess->chann_lock);
if (xa_is_err(old)) {
kfree(chann);
return xa_err(old);
@@ -1651,8 +1660,10 @@ static int krb5_authenticate(struct ksmbd_work *work,
return -ENOMEM;
chann->conn = conn;
+ down_write(&sess->chann_lock);
old = xa_store(&sess->ksmbd_chann_list, (long)conn,
chann, KSMBD_DEFAULT_GFP);
+ up_write(&sess->chann_lock);
if (xa_is_err(old)) {
kfree(chann);
return xa_err(old);
@@ -2026,9 +2037,9 @@ int smb2_tree_connect(struct ksmbd_work *work)
if (conn->posix_ext_supported)
status.tree_conn->posix_extensions = true;
- write_lock(&sess->tree_conns_lock);
+ down_write(&sess->tree_conns_lock);
status.tree_conn->t_state = TREE_CONNECTED;
- write_unlock(&sess->tree_conns_lock);
+ up_write(&sess->tree_conns_lock);
rsp->StructureSize = cpu_to_le16(16);
out_err1:
if (server_conf.flags & KSMBD_GLOBAL_FLAG_DURABLE_HANDLE && share &&
@@ -2182,16 +2193,16 @@ int smb2_tree_disconnect(struct ksmbd_work *work)
ksmbd_close_tree_conn_fds(work);
- write_lock(&sess->tree_conns_lock);
+ down_write(&sess->tree_conns_lock);
if (tcon->t_state == TREE_DISCONNECTED) {
- write_unlock(&sess->tree_conns_lock);
+ up_write(&sess->tree_conns_lock);
rsp->hdr.Status = STATUS_NETWORK_NAME_DELETED;
err = -ENOENT;
goto err_out;
}
tcon->t_state = TREE_DISCONNECTED;
- write_unlock(&sess->tree_conns_lock);
+ up_write(&sess->tree_conns_lock);
err = ksmbd_tree_conn_disconnect(sess, tcon);
if (err) {
diff --git a/fs/smb/server/smb_common.c b/fs/smb/server/smb_common.c
index 1cd7e73..741aabd 100644
--- a/fs/smb/server/smb_common.c
+++ b/fs/smb/server/smb_common.c
@@ -98,6 +98,30 @@ inline int ksmbd_max_protocol(void)
return SMB311_PROT;
}
+/* Protocol ids paired with the version string reported for each of them. */
+static const struct {
+	int version;
+	const char *string;
+} version_strings[] = {
+#ifdef CONFIG_SMB_INSECURE_SERVER
+	{ .version = SMB1_PROT,   .string = SMB1_VERSION_STRING },
+#endif
+	{ .version = SMB2_PROT,   .string = SMB20_VERSION_STRING },
+	{ .version = SMB21_PROT,  .string = SMB21_VERSION_STRING },
+	{ .version = SMB30_PROT,  .string = SMB30_VERSION_STRING },
+	{ .version = SMB302_PROT, .string = SMB302_VERSION_STRING },
+	{ .version = SMB311_PROT, .string = SMB311_VERSION_STRING },
+};
+
+/*
+ * Look up the version string that belongs to protocol id @version.
+ * An id that is not listed in version_strings[] yields an empty string,
+ * never NULL.
+ */
+const char *ksmbd_get_protocol_string(int version)
+{
+	size_t idx = 0;
+
+	while (idx < ARRAY_SIZE(version_strings)) {
+		if (version == version_strings[idx].version)
+			return version_strings[idx].string;
+		idx++;
+	}
+
+	return "";
+}
int ksmbd_lookup_protocol_idx(char *str)
{
int offt = ARRAY_SIZE(smb1_protos) - 1;
diff --git a/fs/smb/server/smb_common.h b/fs/smb/server/smb_common.h
index ddd6867..ca7e361 100644
--- a/fs/smb/server/smb_common.h
+++ b/fs/smb/server/smb_common.h
@@ -143,6 +143,7 @@ struct file_id_both_directory_info {
struct smb_version_ops {
u16 (*get_cmd_val)(struct ksmbd_work *swork);
+ void (*inc_reqs)(unsigned int cmd);
int (*init_rsp_hdr)(struct ksmbd_work *swork);
void (*set_rsp_status)(struct ksmbd_work *swork, __le32 err);
int (*allocate_rsp_buf)(struct ksmbd_work *work);
@@ -165,6 +166,7 @@ struct smb_version_cmds {
int ksmbd_min_protocol(void);
int ksmbd_max_protocol(void);
+const char *ksmbd_get_protocol_string(int version);
int ksmbd_lookup_protocol_idx(char *str);
diff --git a/fs/smb/server/stats.h b/fs/smb/server/stats.h
new file mode 100644
index 0000000..b60c30c
--- /dev/null
+++ b/fs/smb/server/stats.h
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2025, LG Electronics.
+ * Author(s): Hyunchul Lee <hyc.lee@gmail.com>
+ * Copyright (C) 2025, Samsung Electronics.
+ * Author(s): Vedansh Bhardwaj <v.bhardwaj@samsung.com>
+ */
+
+#ifndef __KSMBD_STATS_H__
+#define __KSMBD_STATS_H__
+
+/* Number of per-command request counter slots. */
+#define KSMBD_COUNTER_MAX_REQS 19
+
+/*
+ * Slot numbers within ksmbd_counters.counters[].
+ *
+ * KSMBD_COUNTER_FIRST_REQ .. KSMBD_COUNTER_LAST_REQ is a contiguous run of
+ * KSMBD_COUNTER_MAX_REQS slots indexed by command value, and
+ * KSMBD_COUNTER_MAX is the total number of slots.
+ */
+enum {
+	KSMBD_COUNTER_SESSIONS = 0,
+	KSMBD_COUNTER_TREE_CONNS,
+	KSMBD_COUNTER_REQUESTS,
+	KSMBD_COUNTER_READ_BYTES,
+	KSMBD_COUNTER_WRITE_BYTES,
+	KSMBD_COUNTER_FIRST_REQ,
+	KSMBD_COUNTER_LAST_REQ = KSMBD_COUNTER_FIRST_REQ +
+				 KSMBD_COUNTER_MAX_REQS - 1,
+	KSMBD_COUNTER_MAX,
+};
+
+#ifdef CONFIG_PROC_FS
+
+/* One per-CPU counter for every slot of the enum above. */
+struct ksmbd_counters {
+	struct percpu_counter counters[KSMBD_COUNTER_MAX];
+};
+
+extern struct ksmbd_counters ksmbd_counters;
+
+/* Add one to the counter in slot @type. */
+static inline void ksmbd_counter_inc(int type)
+{
+	percpu_counter_inc(&ksmbd_counters.counters[type]);
+}
+
+/* Subtract one from the counter in slot @type. */
+static inline void ksmbd_counter_dec(int type)
+{
+	percpu_counter_dec(&ksmbd_counters.counters[type]);
+}
+
+/* Add @value to the counter in slot @type. */
+static inline void ksmbd_counter_add(int type, s64 value)
+{
+	percpu_counter_add(&ksmbd_counters.counters[type], value);
+}
+
+/* Subtract @value from the counter in slot @type. */
+static inline void ksmbd_counter_sub(int type, s64 value)
+{
+	percpu_counter_sub(&ksmbd_counters.counters[type], value);
+}
+
+/*
+ * Count one request for command @cmd. Command values at or beyond
+ * KSMBD_COUNTER_MAX_REQS have no slot and are silently ignored.
+ */
+static inline void ksmbd_counter_inc_reqs(unsigned int cmd)
+{
+	if (cmd >= KSMBD_COUNTER_MAX_REQS)
+		return;
+
+	percpu_counter_inc(&ksmbd_counters.counters[KSMBD_COUNTER_FIRST_REQ + cmd]);
+}
+
+/* Total of slot @type as computed by percpu_counter_sum_positive(). */
+static inline s64 ksmbd_counter_sum(int type)
+{
+	return percpu_counter_sum_positive(&ksmbd_counters.counters[type]);
+}
+
+#else /* !CONFIG_PROC_FS */
+
+/* Without procfs the helpers do nothing and every sum reads as 0. */
+static inline void ksmbd_counter_inc(int type) {}
+static inline void ksmbd_counter_dec(int type) {}
+static inline void ksmbd_counter_add(int type, s64 value) {}
+static inline void ksmbd_counter_sub(int type, s64 value) {}
+static inline void ksmbd_counter_inc_reqs(unsigned int cmd) {}
+static inline s64 ksmbd_counter_sum(int type) { return 0; }
+
+#endif /* CONFIG_PROC_FS */
+
+#endif /* __KSMBD_STATS_H__ */
diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c
index e427393..fb36fb9 100644
--- a/fs/smb/server/transport_rdma.c
+++ b/fs/smb/server/transport_rdma.c
@@ -61,9 +61,6 @@
* Those may change after a SMB_DIRECT negotiation
*/
-/* Set 445 port to SMB Direct port by default */
-static int smb_direct_port = SMB_DIRECT_PORT_INFINIBAND;
-
/* The local peer's maximum number of credits to grant to the peer */
static int smb_direct_receive_credit_max = 255;
@@ -73,8 +70,23 @@ static int smb_direct_send_credit_target = 255;
/* The maximum single message size can be sent to remote peer */
static int smb_direct_max_send_size = 1364;
-/* The maximum fragmented upper-layer payload receive size supported */
-static int smb_direct_max_fragmented_recv_size = 1024 * 1024;
+/*
+ * The maximum fragmented upper-layer payload receive size supported
+ *
+ * Assume max_payload_per_credit is
+ * smb_direct_max_receive_size - 24 = 1340
+ *
+ * The maximum number would be
+ * smb_direct_receive_credit_max * max_payload_per_credit
+ *
+ * 1340 * 255 = 341700 (0x536C4)
+ *
+ * The minimum value from the spec is 131072 (0x20000)
+ *
+ * For now we use the logic we used before:
+ * (1364 * 255) / 2 = 173910 (0x2A756)
+ */
+static int smb_direct_max_fragmented_recv_size = (1364 * 255) / 2;
/* The maximum single-message size which can be received */
static int smb_direct_max_receive_size = 1364;
@@ -90,8 +102,9 @@ struct smb_direct_device {
};
static struct smb_direct_listener {
+ int port;
struct rdma_cm_id *cm_id;
-} smb_direct_listener;
+} smb_direct_ib_listener, smb_direct_iw_listener;
static struct workqueue_struct *smb_direct_wq;
@@ -221,6 +234,7 @@ static void smb_direct_disconnect_wake_up_all(struct smbdirect_socket *sc)
* in order to notice the broken connection.
*/
wake_up_all(&sc->status_wait);
+ wake_up_all(&sc->send_io.bcredits.wait_queue);
wake_up_all(&sc->send_io.lcredits.wait_queue);
wake_up_all(&sc->send_io.credits.wait_queue);
wake_up_all(&sc->send_io.pending.zero_wait_queue);
@@ -644,6 +658,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
struct smbdirect_data_transfer *data_transfer =
(struct smbdirect_data_transfer *)recvmsg->packet;
u32 remaining_data_length, data_offset, data_length;
+ int current_recv_credits;
u16 old_recv_credit_target;
if (wc->byte_len <
@@ -682,7 +697,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
}
atomic_dec(&sc->recv_io.posted.count);
- atomic_dec(&sc->recv_io.credits.count);
+ current_recv_credits = atomic_dec_return(&sc->recv_io.credits.count);
old_recv_credit_target = sc->recv_io.credits.target;
sc->recv_io.credits.target =
@@ -702,7 +717,8 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
wake_up(&sc->send_io.credits.wait_queue);
if (data_length) {
- if (sc->recv_io.credits.target > old_recv_credit_target)
+ if (current_recv_credits <= (sc->recv_io.credits.target / 4) ||
+ sc->recv_io.credits.target > old_recv_credit_target)
queue_work(sc->workqueue, &sc->recv_io.posted.refill_work);
enqueue_reassembly(sc, recvmsg, (int)data_length);
@@ -1028,6 +1044,17 @@ static void smb_direct_post_recv_credits(struct work_struct *work)
}
}
+ atomic_add(credits, &sc->recv_io.credits.available);
+
+ /*
+ * If the last send credit is waiting for credits
+ * it can grant we need to wake it up
+ */
+ if (credits &&
+ atomic_read(&sc->send_io.bcredits.count) == 0 &&
+ atomic_read(&sc->send_io.credits.count) == 0)
+ wake_up(&sc->send_io.credits.wait_queue);
+
if (credits)
queue_work(sc->workqueue, &sc->idle.immediate_work);
}
@@ -1045,6 +1072,31 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc)
ib_wc_status_msg(wc->status), wc->status,
wc->opcode);
+ if (unlikely(!(sendmsg->wr.send_flags & IB_SEND_SIGNALED))) {
+ /*
+ * This happens when smbdirect_send_io is a sibling
+ * before the final message, it is signaled on
+ * error anyway, so we need to skip
+ * smbdirect_connection_free_send_io here,
+ * otherwise it will destroy the memory
+ * of the siblings too, which will cause
+ * use after free problems for the others
+ * triggered from ib_drain_qp().
+ */
+ if (wc->status != IB_WC_SUCCESS)
+ goto skip_free;
+
+ /*
+ * This should not happen!
+ * But we better just close the
+ * connection...
+ */
+ pr_err("unexpected send completion wc->status=%s (%d) wc->opcode=%d\n",
+ ib_wc_status_msg(wc->status), wc->status, wc->opcode);
+ smb_direct_disconnect_rdma_connection(sc);
+ return;
+ }
+
/*
* Free possible siblings and then the main send_io
*/
@@ -1058,6 +1110,7 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc)
lcredits += 1;
if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) {
+skip_free:
pr_err("Send error. status='%s (%d)', opcode=%d\n",
ib_wc_status_msg(wc->status), wc->status,
wc->opcode);
@@ -1074,19 +1127,37 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc)
static int manage_credits_prior_sending(struct smbdirect_socket *sc)
{
+ int missing;
+ int available;
int new_credits;
if (atomic_read(&sc->recv_io.credits.count) >= sc->recv_io.credits.target)
return 0;
- new_credits = atomic_read(&sc->recv_io.posted.count);
- if (new_credits == 0)
+ missing = (int)sc->recv_io.credits.target - atomic_read(&sc->recv_io.credits.count);
+ available = atomic_xchg(&sc->recv_io.credits.available, 0);
+ new_credits = (u16)min3(U16_MAX, missing, available);
+ if (new_credits <= 0) {
+ /*
+ * If credits are available, but not granted
+ * we need to re-add them again.
+ */
+ if (available)
+ atomic_add(available, &sc->recv_io.credits.available);
return 0;
+ }
- new_credits -= atomic_read(&sc->recv_io.credits.count);
- if (new_credits <= 0)
- return 0;
+ if (new_credits < available) {
+ /*
+ * Re-add the remaining available credits.
+ */
+ available -= new_credits;
+ atomic_add(available, &sc->recv_io.credits.available);
+ }
+ /*
+ * Remember we granted the credits
+ */
atomic_add(new_credits, &sc->recv_io.credits.count);
return new_credits;
}
@@ -1130,6 +1201,7 @@ static void smb_direct_send_ctx_init(struct smbdirect_send_batch *send_ctx,
send_ctx->wr_cnt = 0;
send_ctx->need_invalidate_rkey = need_invalidate_rkey;
send_ctx->remote_key = remote_key;
+ send_ctx->credit = 0;
}
static int smb_direct_flush_send_list(struct smbdirect_socket *sc,
@@ -1137,10 +1209,10 @@ static int smb_direct_flush_send_list(struct smbdirect_socket *sc,
bool is_last)
{
struct smbdirect_send_io *first, *last;
- int ret;
+ int ret = 0;
if (list_empty(&send_ctx->msg_list))
- return 0;
+ goto release_credit;
first = list_first_entry(&send_ctx->msg_list,
struct smbdirect_send_io,
@@ -1182,6 +1254,13 @@ static int smb_direct_flush_send_list(struct smbdirect_socket *sc,
smb_direct_free_sendmsg(sc, last);
}
+release_credit:
+ if (is_last && !ret && send_ctx->credit) {
+ atomic_add(send_ctx->credit, &sc->send_io.bcredits.count);
+ send_ctx->credit = 0;
+ wake_up(&sc->send_io.bcredits.wait_queue);
+ }
+
return ret;
}
@@ -1207,6 +1286,25 @@ static int wait_for_credits(struct smbdirect_socket *sc,
} while (true);
}
+/*
+ * Make sure @send_ctx holds the batch credit. If it already does, there is
+ * nothing to wait for; otherwise wait for one credit on
+ * sc->send_io.bcredits and record it in @send_ctx.
+ *
+ * Returns 0 on success or the error reported by wait_for_credits().
+ */
+static int wait_for_send_bcredit(struct smbdirect_socket *sc,
+				 struct smbdirect_send_batch *send_ctx)
+{
+	int err = 0;
+
+	if (!send_ctx->credit) {
+		err = wait_for_credits(sc,
+				       &sc->send_io.bcredits.wait_queue,
+				       &sc->send_io.bcredits.count,
+				       1);
+		if (!err)
+			send_ctx->credit = 1;
+	}
+
+	return err;
+}
+
static int wait_for_send_lcredit(struct smbdirect_socket *sc,
struct smbdirect_send_batch *send_ctx)
{
@@ -1256,6 +1354,7 @@ static int calc_rw_credits(struct smbdirect_socket *sc,
static int smb_direct_create_header(struct smbdirect_socket *sc,
int size, int remaining_data_length,
+ int new_credits,
struct smbdirect_send_io **sendmsg_out)
{
struct smbdirect_socket_parameters *sp = &sc->parameters;
@@ -1271,7 +1370,7 @@ static int smb_direct_create_header(struct smbdirect_socket *sc,
/* Fill in the packet header */
packet = (struct smbdirect_data_transfer *)sendmsg->packet;
packet->credits_requested = cpu_to_le16(sp->send_credit_target);
- packet->credits_granted = cpu_to_le16(manage_credits_prior_sending(sc));
+ packet->credits_granted = cpu_to_le16(new_credits);
packet->flags = 0;
if (manage_keep_alive_before_sending(sc))
@@ -1408,6 +1507,17 @@ static int smb_direct_post_send_data(struct smbdirect_socket *sc,
struct smbdirect_send_io *msg;
int data_length;
struct scatterlist sg[SMBDIRECT_SEND_IO_MAX_SGE - 1];
+ struct smbdirect_send_batch _send_ctx;
+ int new_credits;
+
+ if (!send_ctx) {
+ smb_direct_send_ctx_init(&_send_ctx, false, 0);
+ send_ctx = &_send_ctx;
+ }
+
+ ret = wait_for_send_bcredit(sc, send_ctx);
+ if (ret)
+ goto bcredit_failed;
ret = wait_for_send_lcredit(sc, send_ctx);
if (ret)
@@ -1417,12 +1527,29 @@ static int smb_direct_post_send_data(struct smbdirect_socket *sc,
if (ret)
goto credit_failed;
+ new_credits = manage_credits_prior_sending(sc);
+ if (new_credits == 0 &&
+ atomic_read(&sc->send_io.credits.count) == 0 &&
+ atomic_read(&sc->recv_io.credits.count) == 0) {
+ queue_work(sc->workqueue, &sc->recv_io.posted.refill_work);
+ ret = wait_event_interruptible(sc->send_io.credits.wait_queue,
+ atomic_read(&sc->send_io.credits.count) >= 1 ||
+ atomic_read(&sc->recv_io.credits.available) >= 1 ||
+ sc->status != SMBDIRECT_SOCKET_CONNECTED);
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
+ ret = -ENOTCONN;
+ if (ret < 0)
+ goto credit_failed;
+
+ new_credits = manage_credits_prior_sending(sc);
+ }
+
data_length = 0;
for (i = 0; i < niov; i++)
data_length += iov[i].iov_len;
ret = smb_direct_create_header(sc, data_length, remaining_data_length,
- &msg);
+ new_credits, &msg);
if (ret)
goto header_failed;
@@ -1460,6 +1587,13 @@ static int smb_direct_post_send_data(struct smbdirect_socket *sc,
ret = post_sendmsg(sc, send_ctx, msg);
if (ret)
goto err;
+
+ if (send_ctx == &_send_ctx) {
+ ret = smb_direct_flush_send_list(sc, send_ctx, true);
+ if (ret)
+ goto err;
+ }
+
return 0;
err:
smb_direct_free_sendmsg(sc, msg);
@@ -1468,6 +1602,9 @@ static int smb_direct_post_send_data(struct smbdirect_socket *sc,
credit_failed:
atomic_inc(&sc->send_io.lcredits.count);
lcredit_failed:
+ atomic_add(send_ctx->credit, &sc->send_io.bcredits.count);
+ send_ctx->credit = 0;
+bcredit_failed:
return ret;
}
@@ -1939,6 +2076,7 @@ static int smb_direct_send_negotiate_response(struct smbdirect_socket *sc,
resp->max_fragmented_size =
cpu_to_le32(sp->max_fragmented_recv_size);
+ atomic_set(&sc->send_io.bcredits.count, 1);
sc->recv_io.expected = SMBDIRECT_EXPECT_DATA_TRANSFER;
sc->status = SMBDIRECT_SOCKET_CONNECTED;
}
@@ -2408,6 +2546,29 @@ static int smb_direct_prepare(struct ksmbd_transport *t)
le32_to_cpu(req->max_receive_size));
sp->max_fragmented_send_size =
le32_to_cpu(req->max_fragmented_size);
+ /*
+ * The maximum fragmented upper-layer payload receive size supported
+ *
+ * Assume max_payload_per_credit is
+ * smb_direct_max_receive_size - 24 = 1340
+ *
+ * The maximum number would be
+ * smb_direct_receive_credit_max * max_payload_per_credit
+ *
+ * 1340 * 255 = 341700 (0x536C4)
+ *
+ * The minimum value from the spec is 131072 (0x20000)
+ *
+ * For now we use the logic we used before:
+ * (1364 * 255) / 2 = 173910 (0x2A756)
+ *
+ * We need to adjust this here in case the peer
+ * lowered sp->max_recv_size.
+ *
+ * TODO: instead of adjusting max_fragmented_recv_size
+ * we should adjust the number of available buffers,
+ * but for now we keep the current logic.
+ */
sp->max_fragmented_recv_size =
(sp->recv_credit_max * sp->max_recv_size) / 2;
sc->recv_io.credits.target = le16_to_cpu(req->credits_requested);
@@ -2495,6 +2656,7 @@ static bool rdma_frwr_is_supported(struct ib_device_attr *attrs)
static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id,
struct rdma_cm_event *event)
{
+ struct smb_direct_listener *listener = new_cm_id->context;
struct smb_direct_transport *t;
struct smbdirect_socket *sc;
struct smbdirect_socket_parameters *sp;
@@ -2583,7 +2745,7 @@ static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id,
handler = kthread_run(ksmbd_conn_handler_loop,
KSMBD_TRANS(t)->conn, "ksmbd:r%u",
- smb_direct_port);
+ listener->port);
if (IS_ERR(handler)) {
ret = PTR_ERR(handler);
pr_err("Can't start thread\n");
@@ -2620,39 +2782,73 @@ static int smb_direct_listen_handler(struct rdma_cm_id *cm_id,
return 0;
}
-static int smb_direct_listen(int port)
+static int smb_direct_listen(struct smb_direct_listener *listener,
+ int port)
{
int ret;
struct rdma_cm_id *cm_id;
+ u8 node_type = RDMA_NODE_UNSPECIFIED;
struct sockaddr_in sin = {
.sin_family = AF_INET,
.sin_addr.s_addr = htonl(INADDR_ANY),
.sin_port = htons(port),
};
+ switch (port) {
+ case SMB_DIRECT_PORT_IWARP:
+ /*
+ * only allow iWarp devices
+ * for port 5445.
+ */
+ node_type = RDMA_NODE_RNIC;
+ break;
+ case SMB_DIRECT_PORT_INFINIBAND:
+ /*
+ * only allow InfiniBand, RoCEv1 or RoCEv2
+ * devices for port 445.
+ *
+ * (Basically don't allow iWarp devices)
+ */
+ node_type = RDMA_NODE_IB_CA;
+ break;
+ default:
+ pr_err("unsupported smbdirect port=%d!\n", port);
+ return -ENODEV;
+ }
+
cm_id = rdma_create_id(&init_net, smb_direct_listen_handler,
- &smb_direct_listener, RDMA_PS_TCP, IB_QPT_RC);
+ listener, RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(cm_id)) {
pr_err("Can't create cm id: %ld\n", PTR_ERR(cm_id));
return PTR_ERR(cm_id);
}
+ ret = rdma_restrict_node_type(cm_id, node_type);
+ if (ret) {
+ pr_err("rdma_restrict_node_type(%u) failed %d\n",
+ node_type, ret);
+ goto err;
+ }
+
ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
if (ret) {
pr_err("Can't bind: %d\n", ret);
goto err;
}
- smb_direct_listener.cm_id = cm_id;
-
ret = rdma_listen(cm_id, 10);
if (ret) {
pr_err("Can't listen: %d\n", ret);
goto err;
}
+
+ listener->port = port;
+ listener->cm_id = cm_id;
+
return 0;
err:
- smb_direct_listener.cm_id = NULL;
+ listener->port = 0;
+ listener->cm_id = NULL;
rdma_destroy_id(cm_id);
return ret;
}
@@ -2661,10 +2857,6 @@ static int smb_direct_ib_client_add(struct ib_device *ib_dev)
{
struct smb_direct_device *smb_dev;
- /* Set 5445 port if device type is iWARP(No IB) */
- if (ib_dev->node_type != RDMA_NODE_IB_CA)
- smb_direct_port = SMB_DIRECT_PORT_IWARP;
-
if (!rdma_frwr_is_supported(&ib_dev->attrs))
return 0;
@@ -2707,8 +2899,9 @@ int ksmbd_rdma_init(void)
{
int ret;
- smb_direct_port = SMB_DIRECT_PORT_INFINIBAND;
- smb_direct_listener.cm_id = NULL;
+ smb_direct_ib_listener = smb_direct_iw_listener = (struct smb_direct_listener) {
+ .cm_id = NULL,
+ };
ret = ib_register_client(&smb_direct_ib_client);
if (ret) {
@@ -2724,31 +2917,53 @@ int ksmbd_rdma_init(void)
smb_direct_wq = alloc_workqueue("ksmbd-smb_direct-wq",
WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_PERCPU,
0);
- if (!smb_direct_wq)
- return -ENOMEM;
-
- ret = smb_direct_listen(smb_direct_port);
- if (ret) {
- destroy_workqueue(smb_direct_wq);
- smb_direct_wq = NULL;
- pr_err("Can't listen: %d\n", ret);
- return ret;
+ if (!smb_direct_wq) {
+ ret = -ENOMEM;
+ goto err;
}
- ksmbd_debug(RDMA, "init RDMA listener. cm_id=%p\n",
- smb_direct_listener.cm_id);
+ ret = smb_direct_listen(&smb_direct_ib_listener,
+ SMB_DIRECT_PORT_INFINIBAND);
+ if (ret) {
+ pr_err("Can't listen on InfiniBand/RoCEv1/RoCEv2: %d\n", ret);
+ goto err;
+ }
+
+ ksmbd_debug(RDMA, "InfiniBand/RoCEv1/RoCEv2 RDMA listener. cm_id=%p\n",
+ smb_direct_ib_listener.cm_id);
+
+ ret = smb_direct_listen(&smb_direct_iw_listener,
+ SMB_DIRECT_PORT_IWARP);
+ if (ret) {
+ pr_err("Can't listen on iWarp: %d\n", ret);
+ goto err;
+ }
+
+ ksmbd_debug(RDMA, "iWarp RDMA listener. cm_id=%p\n",
+ smb_direct_iw_listener.cm_id);
+
return 0;
+err:
+ ksmbd_rdma_stop_listening();
+ ksmbd_rdma_destroy();
+ return ret;
}
void ksmbd_rdma_stop_listening(void)
{
- if (!smb_direct_listener.cm_id)
+ if (!smb_direct_ib_listener.cm_id && !smb_direct_iw_listener.cm_id)
return;
ib_unregister_client(&smb_direct_ib_client);
- rdma_destroy_id(smb_direct_listener.cm_id);
- smb_direct_listener.cm_id = NULL;
+ if (smb_direct_ib_listener.cm_id)
+ rdma_destroy_id(smb_direct_ib_listener.cm_id);
+ if (smb_direct_iw_listener.cm_id)
+ rdma_destroy_id(smb_direct_iw_listener.cm_id);
+
+ smb_direct_ib_listener = smb_direct_iw_listener = (struct smb_direct_listener) {
+ .cm_id = NULL,
+ };
}
void ksmbd_rdma_destroy(void)
diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c
index 4bb0793..2436dab 100644
--- a/fs/smb/server/transport_tcp.c
+++ b/fs/smb/server/transport_tcp.c
@@ -40,6 +40,7 @@ static const struct ksmbd_transport_ops ksmbd_tcp_transport_ops;
static void tcp_stop_kthread(struct task_struct *kthread);
static struct interface *alloc_iface(char *ifname);
+static void ksmbd_tcp_disconnect(struct ksmbd_transport *t);
#define KSMBD_TRANS(t) (&(t)->transport)
#define TCP_TRANS(t) ((struct tcp_transport *)container_of(t, \
@@ -202,7 +203,7 @@ static int ksmbd_tcp_new_connection(struct socket *client_sk)
if (IS_ERR(handler)) {
pr_err("cannot start conn thread\n");
rc = PTR_ERR(handler);
- free_transport(t);
+ ksmbd_tcp_disconnect(KSMBD_TRANS(t));
}
return rc;
}
diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c
index fd0a5b2..fbdc854 100644
--- a/fs/smb/server/vfs.c
+++ b/fs/smb/server/vfs.c
@@ -31,6 +31,7 @@
#include "ndr.h"
#include "auth.h"
#include "misc.h"
+#include "stats.h"
#include "smb_common.h"
#include "mgmt/share_config.h"
@@ -380,6 +381,7 @@ int ksmbd_vfs_read(struct ksmbd_work *work, struct ksmbd_file *fp, size_t count,
}
filp->f_pos = *pos;
+ ksmbd_counter_add(KSMBD_COUNTER_READ_BYTES, (s64)nbytes);
return nbytes;
}
@@ -517,6 +519,7 @@ int ksmbd_vfs_write(struct ksmbd_work *work, struct ksmbd_file *fp,
pr_err("fsync failed for filename = %pD, err = %d\n",
fp->filp, err);
}
+ ksmbd_counter_add(KSMBD_COUNTER_WRITE_BYTES, (s64)*written);
out:
return err;
diff --git a/fs/smb/server/vfs_cache.c b/fs/smb/server/vfs_cache.c
index 6ef1165..e302e40 100644
--- a/fs/smb/server/vfs_cache.c
+++ b/fs/smb/server/vfs_cache.c
@@ -16,10 +16,12 @@
#include "oplock.h"
#include "vfs.h"
#include "connection.h"
+#include "misc.h"
#include "mgmt/tree_connect.h"
#include "mgmt/user_session.h"
#include "smb_common.h"
#include "server.h"
+#include "smb2pdu.h"
#define S_DEL_PENDING 1
#define S_DEL_ON_CLS 2
@@ -34,6 +36,97 @@ static struct ksmbd_file_table global_ft;
static atomic_long_t fd_limit;
static struct kmem_cache *filp_cache;
+#define OPLOCK_NONE 0
+#define OPLOCK_EXCLUSIVE 1
+#define OPLOCK_BATCH 2
+#define OPLOCK_READ 3 /* level 2 oplock */
+
+#ifdef CONFIG_PROC_FS
+
+/*
+ * Lease state values, converted to CPU byte order, and the name printed for
+ * each of them by proc_show_files() when a file's oplock_info is a lease.
+ */
+static const struct ksmbd_const_name ksmbd_lease_const_names[] = {
+ {le32_to_cpu(SMB2_LEASE_NONE_LE), "LEASE_NONE"},
+ {le32_to_cpu(SMB2_LEASE_READ_CACHING_LE), "LEASE_R"},
+ {le32_to_cpu(SMB2_LEASE_HANDLE_CACHING_LE), "LEASE_H"},
+ {le32_to_cpu(SMB2_LEASE_WRITE_CACHING_LE), "LEASE_W"},
+ {le32_to_cpu(SMB2_LEASE_READ_CACHING_LE |
+ SMB2_LEASE_HANDLE_CACHING_LE), "LEASE_RH"},
+ {le32_to_cpu(SMB2_LEASE_READ_CACHING_LE |
+ SMB2_LEASE_WRITE_CACHING_LE), "LEASE_RW"},
+ {le32_to_cpu(SMB2_LEASE_HANDLE_CACHING_LE |
+ SMB2_LEASE_WRITE_CACHING_LE), "LEASE_WH"},
+ {le32_to_cpu(SMB2_LEASE_READ_CACHING_LE |
+ SMB2_LEASE_HANDLE_CACHING_LE |
+ SMB2_LEASE_WRITE_CACHING_LE), "LEASE_RWH"},
+};
+
+/*
+ * Oplock levels and the name printed for each of them by proc_show_files()
+ * when a file's oplock_info is not a lease.
+ *
+ * NOTE(review): "OPLOCK_EXECL" looks like a misspelling of "EXCL"; it is left
+ * untouched here because the string is part of the proc output.
+ */
+static const struct ksmbd_const_name ksmbd_oplock_const_names[] = {
+ {SMB2_OPLOCK_LEVEL_NONE, "OPLOCK_NONE"},
+ {SMB2_OPLOCK_LEVEL_II, "OPLOCK_II"},
+ {SMB2_OPLOCK_LEVEL_EXCLUSIVE, "OPLOCK_EXECL"},
+ {SMB2_OPLOCK_LEVEL_BATCH, "OPLOCK_BATCH"},
+};
+
+/*
+ * seq_file show callback for the "files" proc entry.
+ *
+ * Prints a header line followed by one line per entry of global_ft.idr:
+ * tree id, persistent id, volatile id, reference count, oplock or lease
+ * state, desired/share access masks and the dentry name. The table is walked
+ * with global_ft.lock held for reading. Always returns 0.
+ */
+static int proc_show_files(struct seq_file *m, void *v)
+{
+	struct ksmbd_file *fp = NULL;
+	unsigned int id;
+	struct oplock_info *opinfo;
+
+	seq_printf(m, "#%-10s %-10s %-10s %-10s %-15s %-10s %-10s %s\n",
+		   "<tree id>", "<pid>", "<vid>", "<refcnt>",
+		   "<oplock>", "<daccess>", "<saccess>",
+		   "<name>");
+
+	read_lock(&global_ft.lock);
+	idr_for_each_entry(global_ft.idr, fp, id) {
+		/*
+		 * NOTE(review): fp->tcon is assumed to be possibly NULL for
+		 * entries that are not attached to a tree connection at the
+		 * moment (confirm against the durable handle code); print 0
+		 * instead of dereferencing a NULL pointer in that case.
+		 */
+		struct ksmbd_tree_connect *tcon = fp->tcon;
+
+		seq_printf(m, "%#-10x %#-10llx %#-10llx %#-10x",
+			   tcon ? tcon->id : 0,
+			   fp->persistent_id,
+			   fp->volatile_id,
+			   atomic_read(&fp->refcount));
+
+		/*
+		 * A pointer obtained with rcu_dereference() is only protected
+		 * while the read-side critical section is open, so the section
+		 * must cover every access to opinfo and opinfo->o_lease rather
+		 * than just the load of fp->f_opinfo.
+		 */
+		rcu_read_lock();
+		opinfo = rcu_dereference(fp->f_opinfo);
+		if (!opinfo) {
+			seq_printf(m, " %-15s", " ");
+		} else {
+			const struct ksmbd_const_name *const_names;
+			int count;
+			unsigned int level;
+
+			if (opinfo->is_lease) {
+				const_names = ksmbd_lease_const_names;
+				count = ARRAY_SIZE(ksmbd_lease_const_names);
+				level = le32_to_cpu(opinfo->o_lease->state);
+			} else {
+				const_names = ksmbd_oplock_const_names;
+				count = ARRAY_SIZE(ksmbd_oplock_const_names);
+				level = opinfo->level;
+			}
+			ksmbd_proc_show_const_name(m, " %-15s",
+						   const_names, count, level);
+		}
+		rcu_read_unlock();
+
+		seq_printf(m, " %#010x %#010x %s\n",
+			   le32_to_cpu(fp->daccess),
+			   le32_to_cpu(fp->saccess),
+			   fp->filp->f_path.dentry->d_name.name);
+	}
+	read_unlock(&global_ft.lock);
+	return 0;
+}
+
+/*
+ * Register the "files" entry through ksmbd_proc_create(), passing
+ * proc_show_files() as the callback. The result of ksmbd_proc_create() is
+ * not checked, so this always returns 0.
+ */
+static int create_proc_files(void)
+{
+ ksmbd_proc_create("files", proc_show_files, NULL);
+ return 0;
+}
+#else
+static int create_proc_files(void) { return 0; }
+#endif
+
static bool durable_scavenger_running;
static DEFINE_MUTEX(durable_scavenger_lock);
static wait_queue_head_t dh_wq;
@@ -949,6 +1042,7 @@ void ksmbd_close_session_fds(struct ksmbd_work *work)
int ksmbd_init_global_file_table(void)
{
+ create_proc_files();
return ksmbd_init_file_table(&global_ft);
}
diff --git a/fs/super.c b/fs/super.c
index b13c1fd..784b529 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -620,6 +620,7 @@ void generic_shutdown_super(struct super_block *sb)
const struct super_operations *sop = sb->s_op;
if (sb->s_root) {
+ fsnotify_sb_delete(sb);
shrink_dcache_for_umount(sb);
sync_filesystem(sb);
sb->s_flags &= ~SB_ACTIVE;
@@ -632,9 +633,8 @@ void generic_shutdown_super(struct super_block *sb)
/*
* Clean up and evict any inodes that still have references due
- * to fsnotify or the security policy.
+ * to the security policy.
*/
- fsnotify_sb_delete(sb);
security_sb_delete(sb);
if (sb->s_dio_done_wq) {
diff --git a/fs/verity/Makefile b/fs/verity/Makefile
index 435559a..ddb4a88 100644
--- a/fs/verity/Makefile
+++ b/fs/verity/Makefile
@@ -5,6 +5,7 @@
init.o \
measure.o \
open.o \
+ pagecache.o \
read_metadata.o \
verify.o
diff --git a/fs/verity/enable.c b/fs/verity/enable.c
index 95ec42b..c944807 100644
--- a/fs/verity/enable.c
+++ b/fs/verity/enable.c
@@ -41,14 +41,15 @@ static int hash_one_block(const struct merkle_tree_params *params,
return 0;
}
-static int write_merkle_tree_block(struct inode *inode, const u8 *buf,
+static int write_merkle_tree_block(struct file *file, const u8 *buf,
unsigned long index,
const struct merkle_tree_params *params)
{
+ struct inode *inode = file_inode(file);
u64 pos = (u64)index << params->log_blocksize;
int err;
- err = inode->i_sb->s_vop->write_merkle_tree_block(inode, buf, pos,
+ err = inode->i_sb->s_vop->write_merkle_tree_block(file, buf, pos,
params->block_size);
if (err)
fsverity_err(inode, "Error %d writing Merkle tree block %lu",
@@ -135,7 +136,7 @@ static int build_merkle_tree(struct file *filp,
err = hash_one_block(params, &buffers[level]);
if (err)
goto out;
- err = write_merkle_tree_block(inode,
+ err = write_merkle_tree_block(filp,
buffers[level].data,
level_offset[level],
params);
@@ -155,7 +156,7 @@ static int build_merkle_tree(struct file *filp,
err = hash_one_block(params, &buffers[level]);
if (err)
goto out;
- err = write_merkle_tree_block(inode,
+ err = write_merkle_tree_block(filp,
buffers[level].data,
level_offset[level],
params);
@@ -265,8 +266,25 @@ static int enable_verity(struct file *filp,
}
/*
+ * Add the fsverity_info into the hash table before finishing the
+ * initialization so that we don't have to undo the enabling when memory
+ * allocation for the hash table fails. This is safe because looking up
+ * the fsverity_info always first checks the S_VERITY flag on the inode,
+ * which will only be set at the very end of the ->end_enable_verity
+ * method.
+ */
+ err = fsverity_set_info(vi);
+ if (err) {
+ fsverity_free_info(vi);
+ goto rollback;
+ }
+
+ /*
* Tell the filesystem to finish enabling verity on the file.
- * Serialized with ->begin_enable_verity() by the inode lock.
+ * Serialized with ->begin_enable_verity() by the inode lock. The file
+ * system needs to set the S_VERITY flag on the inode at the very end of
+ * the method, at which point the fsverity information can be accessed
+ * by other threads.
*/
inode_lock(inode);
err = vops->end_enable_verity(filp, desc, desc_size, params.tree_size);
@@ -274,19 +292,10 @@ static int enable_verity(struct file *filp,
if (err) {
fsverity_err(inode, "%ps() failed with err %d",
vops->end_enable_verity, err);
- fsverity_free_info(vi);
+ fsverity_remove_info(vi);
} else if (WARN_ON_ONCE(!IS_VERITY(inode))) {
+ fsverity_remove_info(vi);
err = -EINVAL;
- fsverity_free_info(vi);
- } else {
- /* Successfully enabled verity */
-
- /*
- * Readers can start using the inode's verity info immediately,
- * so it can't be rolled back once set. So don't set it until
- * just after the filesystem has successfully enabled verity.
- */
- fsverity_set_info(inode, vi);
}
out:
kfree(params.hashstate);
diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h
index dd20b13..2887cb84 100644
--- a/fs/verity/fsverity_private.h
+++ b/fs/verity/fsverity_private.h
@@ -11,6 +11,7 @@
#define pr_fmt(fmt) "fs-verity: " fmt
#include <linux/fsverity.h>
+#include <linux/rhashtable.h>
/*
* Implementation limit: maximum depth of the Merkle tree. For now 8 is plenty;
@@ -63,17 +64,18 @@ struct merkle_tree_params {
* fsverity_info - cached verity metadata for an inode
*
* When a verity file is first opened, an instance of this struct is allocated
- * and a pointer to it is stored in the file's in-memory inode. It remains
- * until the inode is evicted. It caches information about the Merkle tree
- * that's needed to efficiently verify data read from the file. It also caches
- * the file digest. The Merkle tree pages themselves are not cached here, but
- * the filesystem may cache them.
+ * and a pointer to it is stored in the global hash table, indexed by the inode
+ * pointer value. It remains alive until the inode is evicted. It caches
+ * information about the Merkle tree that's needed to efficiently verify data
+ * read from the file. It also caches the file digest. The Merkle tree pages
+ * themselves are not cached here, but the filesystem may cache them.
*/
struct fsverity_info {
+ struct rhash_head rhash_head;
struct merkle_tree_params tree_params;
u8 root_hash[FS_VERITY_MAX_DIGEST_SIZE];
u8 file_digest[FS_VERITY_MAX_DIGEST_SIZE];
- const struct inode *inode;
+ struct inode *inode;
unsigned long *hash_block_verified;
};
@@ -124,12 +126,12 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params,
unsigned int log_blocksize,
const u8 *salt, size_t salt_size);
-struct fsverity_info *fsverity_create_info(const struct inode *inode,
+struct fsverity_info *fsverity_create_info(struct inode *inode,
struct fsverity_descriptor *desc);
-void fsverity_set_info(struct inode *inode, struct fsverity_info *vi);
-
+int fsverity_set_info(struct fsverity_info *vi);
void fsverity_free_info(struct fsverity_info *vi);
+void fsverity_remove_info(struct fsverity_info *vi);
int fsverity_get_descriptor(struct inode *inode,
struct fsverity_descriptor **desc_ret);
diff --git a/fs/verity/open.c b/fs/verity/open.c
index 77b1c97..dfa0d1af 100644
--- a/fs/verity/open.c
+++ b/fs/verity/open.c
@@ -12,6 +12,14 @@
#include <linux/slab.h>
static struct kmem_cache *fsverity_info_cachep;
+static struct rhashtable fsverity_info_hash;
+
+static const struct rhashtable_params fsverity_info_hash_params = {
+ .key_len = sizeof_field(struct fsverity_info, inode),
+ .key_offset = offsetof(struct fsverity_info, inode),
+ .head_offset = offsetof(struct fsverity_info, rhash_head),
+ .automatic_shrinking = true,
+};
/**
* fsverity_init_merkle_tree_params() - initialize Merkle tree parameters
@@ -175,7 +183,7 @@ static void compute_file_digest(const struct fsverity_hash_alg *hash_alg,
* appended builtin signature), and check the signature if present. The
* fsverity_descriptor must have already undergone basic validation.
*/
-struct fsverity_info *fsverity_create_info(const struct inode *inode,
+struct fsverity_info *fsverity_create_info(struct inode *inode,
struct fsverity_descriptor *desc)
{
struct fsverity_info *vi;
@@ -241,33 +249,19 @@ struct fsverity_info *fsverity_create_info(const struct inode *inode,
return ERR_PTR(err);
}
-void fsverity_set_info(struct inode *inode, struct fsverity_info *vi)
+int fsverity_set_info(struct fsverity_info *vi)
{
- /*
- * Multiple tasks may race to set the inode's verity info pointer, so
- * use cmpxchg_release(). This pairs with the smp_load_acquire() in
- * fsverity_get_info(). I.e., publish the pointer with a RELEASE
- * barrier so that other tasks can ACQUIRE it.
- */
- if (cmpxchg_release(fsverity_info_addr(inode), NULL, vi) != NULL) {
- /* Lost the race, so free the verity info we allocated. */
- fsverity_free_info(vi);
- /*
- * Afterwards, the caller may access the inode's verity info
- * directly, so make sure to ACQUIRE the winning verity info.
- */
- (void)fsverity_get_info(inode);
- }
+ return rhashtable_lookup_insert_fast(&fsverity_info_hash,
+ &vi->rhash_head,
+ fsverity_info_hash_params);
}
-void fsverity_free_info(struct fsverity_info *vi)
+struct fsverity_info *__fsverity_get_info(const struct inode *inode)
{
- if (!vi)
- return;
- kfree(vi->tree_params.hashstate);
- kvfree(vi->hash_block_verified);
- kmem_cache_free(fsverity_info_cachep, vi);
+ return rhashtable_lookup_fast(&fsverity_info_hash, &inode,
+ fsverity_info_hash_params);
}
+EXPORT_SYMBOL_GPL(__fsverity_get_info);
static bool validate_fsverity_descriptor(struct inode *inode,
const struct fsverity_descriptor *desc,
@@ -352,7 +346,7 @@ int fsverity_get_descriptor(struct inode *inode,
static int ensure_verity_info(struct inode *inode)
{
- struct fsverity_info *vi = fsverity_get_info(inode);
+ struct fsverity_info *vi = fsverity_get_info(inode), *found;
struct fsverity_descriptor *desc;
int err;
@@ -369,8 +363,19 @@ static int ensure_verity_info(struct inode *inode)
goto out_free_desc;
}
- fsverity_set_info(inode, vi);
- err = 0;
+ /*
+ * Multiple tasks may race to set the inode's verity info, in which case
+ * we might find an existing fsverity_info in the hash table.
+ */
+ found = rhashtable_lookup_get_insert_fast(&fsverity_info_hash,
+ &vi->rhash_head,
+ fsverity_info_hash_params);
+ if (found) {
+ fsverity_free_info(vi);
+ if (IS_ERR(found))
+ err = PTR_ERR(found);
+ }
+
out_free_desc:
kfree(desc);
return err;
@@ -384,25 +389,32 @@ int __fsverity_file_open(struct inode *inode, struct file *filp)
}
EXPORT_SYMBOL_GPL(__fsverity_file_open);
-int __fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr)
+void fsverity_free_info(struct fsverity_info *vi)
{
- if (attr->ia_valid & ATTR_SIZE)
- return -EPERM;
- return 0;
+ kfree(vi->tree_params.hashstate);
+ kvfree(vi->hash_block_verified);
+ kmem_cache_free(fsverity_info_cachep, vi);
}
-EXPORT_SYMBOL_GPL(__fsverity_prepare_setattr);
-void __fsverity_cleanup_inode(struct inode *inode)
+void fsverity_remove_info(struct fsverity_info *vi)
{
- struct fsverity_info **vi_addr = fsverity_info_addr(inode);
-
- fsverity_free_info(*vi_addr);
- *vi_addr = NULL;
+ rhashtable_remove_fast(&fsverity_info_hash, &vi->rhash_head,
+ fsverity_info_hash_params);
+ fsverity_free_info(vi);
}
-EXPORT_SYMBOL_GPL(__fsverity_cleanup_inode);
+
+void fsverity_cleanup_inode(struct inode *inode)
+{
+ struct fsverity_info *vi = fsverity_get_info(inode);
+
+ if (vi)
+ fsverity_remove_info(vi);
+}
void __init fsverity_init_info_cache(void)
{
+ if (rhashtable_init(&fsverity_info_hash, &fsverity_info_hash_params))
+ panic("failed to initialize fsverity hash\n");
fsverity_info_cachep = KMEM_CACHE_USERCOPY(
fsverity_info,
SLAB_RECLAIM_ACCOUNT | SLAB_PANIC,
diff --git a/fs/verity/pagecache.c b/fs/verity/pagecache.c
new file mode 100644
index 0000000..1819314
--- /dev/null
+++ b/fs/verity/pagecache.c
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2019 Google LLC
+ */
+
+#include <linux/export.h>
+#include <linux/fsverity.h>
+#include <linux/pagemap.h>
+
+/**
+ * generic_read_merkle_tree_page() - generic ->read_merkle_tree_page helper
+ * @inode: inode containing the Merkle tree
+ * @index: 0-based index of the Merkle tree page in the inode
+ *
+ * The caller needs to adjust @index from the Merkle-tree relative index passed
+ * to ->read_merkle_tree_page to the actual index where the Merkle tree is
+ * stored in the page cache for @inode.
+ */
+struct page *generic_read_merkle_tree_page(struct inode *inode, pgoff_t index)
+{
+ struct folio *folio;
+
+ folio = read_mapping_folio(inode->i_mapping, index, NULL);
+ if (IS_ERR(folio))
+ return ERR_CAST(folio);
+ return folio_file_page(folio, index);
+}
+EXPORT_SYMBOL_GPL(generic_read_merkle_tree_page);
+
+/**
+ * generic_readahead_merkle_tree() - generic ->readahead_merkle_tree helper
+ * @inode: inode containing the Merkle tree
+ * @index: 0-based index of the first Merkle tree page to read ahead in the
+ * inode
+ * @nr_pages: the number of Merkle tree pages that should be read ahead
+ *
+ * The caller needs to adjust @index from the Merkle-tree relative index passed
+ * to ->readahead_merkle_tree to the actual index where the Merkle tree is
+ * stored in the page cache for @inode.
+ */
+void generic_readahead_merkle_tree(struct inode *inode, pgoff_t index,
+ unsigned long nr_pages)
+{
+ struct folio *folio;
+
+ lockdep_assert_held(&inode->i_mapping->invalidate_lock);
+
+ folio = __filemap_get_folio(inode->i_mapping, index, FGP_ACCESSED, 0);
+ if (folio == ERR_PTR(-ENOENT) ||
+ (!IS_ERR(folio) && !folio_test_uptodate(folio))) {
+ DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index);
+
+ page_cache_ra_unbounded(&ractl, nr_pages, 0);
+ }
+ if (!IS_ERR(folio))
+ folio_put(folio);
+}
+EXPORT_SYMBOL_GPL(generic_readahead_merkle_tree);
diff --git a/fs/verity/read_metadata.c b/fs/verity/read_metadata.c
index cba5d6a..b4c0892 100644
--- a/fs/verity/read_metadata.c
+++ b/fs/verity/read_metadata.c
@@ -28,24 +28,33 @@ static int fsverity_read_merkle_tree(struct inode *inode,
if (offset >= end_offset)
return 0;
offs_in_page = offset_in_page(offset);
+ index = offset >> PAGE_SHIFT;
last_index = (end_offset - 1) >> PAGE_SHIFT;
/*
+ * Kick off readahead for the range we are going to read to ensure a
+ * single large sequential read instead of lots of small ones.
+ */
+ if (inode->i_sb->s_vop->readahead_merkle_tree) {
+ filemap_invalidate_lock_shared(inode->i_mapping);
+ inode->i_sb->s_vop->readahead_merkle_tree(
+ inode, index, last_index - index + 1);
+ filemap_invalidate_unlock_shared(inode->i_mapping);
+ }
+
+ /*
* Iterate through each Merkle tree page in the requested range and copy
* the requested portion to userspace. Note that the Merkle tree block
* size isn't important here, as we are returning a byte stream; i.e.,
* we can just work with pages even if the tree block size != PAGE_SIZE.
*/
- for (index = offset >> PAGE_SHIFT; index <= last_index; index++) {
- unsigned long num_ra_pages =
- min_t(unsigned long, last_index - index + 1,
- inode->i_sb->s_bdi->io_pages);
+ for (; index <= last_index; index++) {
unsigned int bytes_to_copy = min_t(u64, end_offset - offset,
PAGE_SIZE - offs_in_page);
struct page *page;
const void *virt;
- page = vops->read_merkle_tree_page(inode, index, num_ra_pages);
+ page = vops->read_merkle_tree_page(inode, index);
if (IS_ERR(page)) {
err = PTR_ERR(page);
fsverity_err(inode,
diff --git a/fs/verity/verify.c b/fs/verity/verify.c
index 86067c8..31797f9 100644
--- a/fs/verity/verify.c
+++ b/fs/verity/verify.c
@@ -19,9 +19,7 @@ struct fsverity_pending_block {
};
struct fsverity_verification_context {
- struct inode *inode;
struct fsverity_info *vi;
- unsigned long max_ra_pages;
/*
* This is the queue of data blocks that are pending verification. When
@@ -37,6 +35,50 @@ struct fsverity_verification_context {
static struct workqueue_struct *fsverity_read_workqueue;
+/**
+ * fsverity_readahead() - kick off readahead on fsverity hashes
+ * @vi: fsverity_info for the inode to be read
+ * @index: first file data page index that is being read
+ * @nr_pages: number of file data pages to be read
+ *
+ * Start readahead on the fsverity hashes that are needed to verify the file
+ * data in the range from @index to @index + @nr_pages (exclusive upper bound).
+ *
+ * To be called from the file systems' ->read_folio and ->readahead methods to
+ * ensure that the hashes are already cached on completion of the file data
+ * read if possible.
+ */
+void fsverity_readahead(struct fsverity_info *vi, pgoff_t index,
+ unsigned long nr_pages)
+{
+ struct inode *inode = vi->inode;
+ const struct merkle_tree_params *params = &vi->tree_params;
+ u64 start_hidx = (u64)index << params->log_blocks_per_page;
+ u64 end_hidx =
+ (((u64)index + nr_pages) << params->log_blocks_per_page) - 1;
+ int level;
+
+ if (!inode->i_sb->s_vop->readahead_merkle_tree)
+ return;
+
+ for (level = 0; level < params->num_levels; level++) {
+ unsigned long level_start = params->level_start[level];
+ unsigned long next_start_hidx = start_hidx >> params->log_arity;
+ unsigned long next_end_hidx = end_hidx >> params->log_arity;
+ pgoff_t start_idx = (level_start + next_start_hidx) >>
+ params->log_blocks_per_page;
+ pgoff_t end_idx = (level_start + next_end_hidx) >>
+ params->log_blocks_per_page;
+
+ inode->i_sb->s_vop->readahead_merkle_tree(
+ inode, start_idx, end_idx - start_idx + 1);
+
+ start_hidx = next_start_hidx;
+ end_hidx = next_end_hidx;
+ }
+}
+EXPORT_SYMBOL_GPL(fsverity_readahead);
+
/*
* Returns true if the hash block with index @hblock_idx in the tree, located in
* @hpage, has already been verified.
@@ -113,10 +155,10 @@ static bool is_hash_block_verified(struct fsverity_info *vi, struct page *hpage,
*
* Return: %true if the data block is valid, else %false.
*/
-static bool verify_data_block(struct inode *inode, struct fsverity_info *vi,
- const struct fsverity_pending_block *dblock,
- unsigned long max_ra_pages)
+static bool verify_data_block(struct fsverity_info *vi,
+ const struct fsverity_pending_block *dblock)
{
+ struct inode *inode = vi->inode;
const u64 data_pos = dblock->pos;
const struct merkle_tree_params *params = &vi->tree_params;
const unsigned int hsize = params->digest_size;
@@ -200,8 +242,7 @@ static bool verify_data_block(struct inode *inode, struct fsverity_info *vi,
(params->block_size - 1);
hpage = inode->i_sb->s_vop->read_merkle_tree_page(inode,
- hpage_idx, level == 0 ? min(max_ra_pages,
- params->tree_pages - hpage_idx) : 0);
+ hpage_idx);
if (IS_ERR(hpage)) {
fsverity_err(inode,
"Error %ld reading Merkle tree page %lu",
@@ -272,14 +313,9 @@ static bool verify_data_block(struct inode *inode, struct fsverity_info *vi,
static void
fsverity_init_verification_context(struct fsverity_verification_context *ctx,
- struct inode *inode,
- unsigned long max_ra_pages)
+ struct fsverity_info *vi)
{
- struct fsverity_info *vi = *fsverity_info_addr(inode);
-
- ctx->inode = inode;
ctx->vi = vi;
- ctx->max_ra_pages = max_ra_pages;
ctx->num_pending = 0;
if (vi->tree_params.hash_alg->algo_id == HASH_ALGO_SHA256 &&
sha256_finup_2x_is_optimized())
@@ -322,8 +358,7 @@ fsverity_verify_pending_blocks(struct fsverity_verification_context *ctx)
}
for (i = 0; i < ctx->num_pending; i++) {
- if (!verify_data_block(ctx->inode, vi, &ctx->pending_blocks[i],
- ctx->max_ra_pages))
+ if (!verify_data_block(vi, &ctx->pending_blocks[i]))
return false;
}
fsverity_clear_pending_blocks(ctx);
@@ -359,6 +394,7 @@ static bool fsverity_add_data_blocks(struct fsverity_verification_context *ctx,
/**
* fsverity_verify_blocks() - verify data in a folio
+ * @vi: fsverity_info for the inode to be read
* @folio: the folio containing the data to verify
* @len: the length of the data to verify in the folio
* @offset: the offset of the data to verify in the folio
@@ -369,11 +405,12 @@ static bool fsverity_add_data_blocks(struct fsverity_verification_context *ctx,
*
* Return: %true if the data is valid, else %false.
*/
-bool fsverity_verify_blocks(struct folio *folio, size_t len, size_t offset)
+bool fsverity_verify_blocks(struct fsverity_info *vi, struct folio *folio,
+ size_t len, size_t offset)
{
struct fsverity_verification_context ctx;
- fsverity_init_verification_context(&ctx, folio->mapping->host, 0);
+ fsverity_init_verification_context(&ctx, vi);
if (fsverity_add_data_blocks(&ctx, folio, len, offset) &&
fsverity_verify_pending_blocks(&ctx))
@@ -386,6 +423,7 @@ EXPORT_SYMBOL_GPL(fsverity_verify_blocks);
#ifdef CONFIG_BLOCK
/**
* fsverity_verify_bio() - verify a 'read' bio that has just completed
+ * @vi: fsverity_info for the inode to be read
* @bio: the bio to verify
*
* Verify the bio's data against the file's Merkle tree. All bio data segments
@@ -398,27 +436,12 @@ EXPORT_SYMBOL_GPL(fsverity_verify_blocks);
* filesystems) must instead call fsverity_verify_page() directly on each page.
* All filesystems must also call fsverity_verify_page() on holes.
*/
-void fsverity_verify_bio(struct bio *bio)
+void fsverity_verify_bio(struct fsverity_info *vi, struct bio *bio)
{
- struct inode *inode = bio_first_folio_all(bio)->mapping->host;
struct fsverity_verification_context ctx;
struct folio_iter fi;
- unsigned long max_ra_pages = 0;
- if (bio->bi_opf & REQ_RAHEAD) {
- /*
- * If this bio is for data readahead, then we also do readahead
- * of the first (largest) level of the Merkle tree. Namely,
- * when a Merkle tree page is read, we also try to piggy-back on
- * some additional pages -- up to 1/4 the number of data pages.
- *
- * This improves sequential read performance, as it greatly
- * reduces the number of I/O requests made to the Merkle tree.
- */
- max_ra_pages = bio->bi_iter.bi_size >> (PAGE_SHIFT + 2);
- }
-
- fsverity_init_verification_context(&ctx, inode, max_ra_pages);
+ fsverity_init_verification_context(&ctx, vi);
bio_for_each_folio_all(fi, bio) {
if (!fsverity_add_data_blocks(&ctx, fi.folio, fi.length,
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 0d954ea..9598540 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -553,7 +553,7 @@ struct fsnotify_mark_connector {
/* Used listing heads to free after srcu period expires */
struct fsnotify_mark_connector *destroy_next;
};
- struct hlist_head list;
+ struct hlist_head list; /* List of marks */
};
/*
@@ -562,6 +562,9 @@ struct fsnotify_mark_connector {
*/
struct fsnotify_sb_info {
struct fsnotify_mark_connector __rcu *sb_marks;
+ /* List of connectors for inode marks */
+ struct list_head inode_conn_list;
+ spinlock_t list_lock; /* Lock protecting inode_conn_list */
/*
* Number of inode/mount/sb objects that are being watched in this sb.
* Note that inodes objects are currently double-accounted.
diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h
index 5bc7280..fed9102 100644
--- a/include/linux/fsverity.h
+++ b/include/linux/fsverity.h
@@ -31,13 +31,6 @@ struct fsverity_info;
/* Verity operations for filesystems */
struct fsverity_operations {
/**
- * The offset of the pointer to struct fsverity_info in the
- * filesystem-specific part of the inode, relative to the beginning of
- * the common part of the inode (the 'struct inode').
- */
- ptrdiff_t inode_info_offs;
-
- /**
* Begin enabling verity on the given file.
*
* @filp: a readonly file descriptor for the file
@@ -97,10 +90,6 @@ struct fsverity_operations {
*
* @inode: the inode
* @index: 0-based index of the page within the Merkle tree
- * @num_ra_pages: The number of Merkle tree pages that should be
- * prefetched starting at @index if the page at @index
- * isn't already cached. Implementations may ignore this
- * argument; it's only a performance optimization.
*
* This can be called at any time on an open verity file. It may be
* called by multiple processes concurrently, even with the same page.
@@ -110,13 +99,28 @@ struct fsverity_operations {
* Return: the page on success, ERR_PTR() on failure
*/
struct page *(*read_merkle_tree_page)(struct inode *inode,
- pgoff_t index,
- unsigned long num_ra_pages);
+ pgoff_t index);
/**
- * Write a Merkle tree block to the given inode.
+ * Perform readahead of a Merkle tree for the given inode.
*
- * @inode: the inode for which the Merkle tree is being built
+ * @inode: the inode
+ * @index: 0-based index of the first page within the Merkle tree
+ * @nr_pages: number of pages to be read ahead.
+ *
+ * This can be called at any time on an open verity file. It may be
+ * called by multiple processes concurrently, even with the same range.
+ *
+ * Optional method so that ->read_merkle_tree_page preferably finds
+ * cached data instead of issuing dependent I/O.
+ */
+ void (*readahead_merkle_tree)(struct inode *inode, pgoff_t index,
+ unsigned long nr_pages);
+
+ /**
+ * Write a Merkle tree block to the given file.
+ *
+ * @file: the file for which the Merkle tree is being built
* @buf: the Merkle tree block to write
* @pos: the position of the block in the Merkle tree (in bytes)
* @size: the Merkle tree block size (in bytes)
@@ -126,43 +130,48 @@ struct fsverity_operations {
*
* Return: 0 on success, -errno on failure
*/
- int (*write_merkle_tree_block)(struct inode *inode, const void *buf,
+ int (*write_merkle_tree_block)(struct file *file, const void *buf,
u64 pos, unsigned int size);
};
#ifdef CONFIG_FS_VERITY
-
-/*
- * Returns the address of the verity info pointer within the filesystem-specific
- * part of the inode. (To save memory on filesystems that don't support
- * fsverity, a field in 'struct inode' itself is no longer used.)
+/**
+ * fsverity_active() - do reads from the inode need to go through fs-verity?
+ * @inode: inode to check
+ *
+ * This checks whether the inode's verity info has been set, and reads need
+ * to verify the file data.
+ *
+ * Return: true if reads need to go through fs-verity, otherwise false
*/
-static inline struct fsverity_info **
-fsverity_info_addr(const struct inode *inode)
+static inline bool fsverity_active(const struct inode *inode)
{
- VFS_WARN_ON_ONCE(inode->i_sb->s_vop->inode_info_offs == 0);
- return (void *)inode + inode->i_sb->s_vop->inode_info_offs;
+ if (IS_VERITY(inode)) {
+ /*
+ * This pairs with the try_cmpxchg in set_mask_bits()
+ * used to set the S_VERITY bit in i_flags.
+ */
+ smp_mb();
+ return true;
+ }
+
+ return false;
}
+struct fsverity_info *__fsverity_get_info(const struct inode *inode);
+/**
+ * fsverity_get_info() - get fsverity information for an inode
+ * @inode: inode to operate on.
+ *
+ * This gets the fsverity_info for @inode if it exists. Safe to call without
+ * knowing that a fsverity_info exists for @inode, including on file systems
+ * that do not support fsverity.
+ */
static inline struct fsverity_info *fsverity_get_info(const struct inode *inode)
{
- /*
- * Since this function can be called on inodes belonging to filesystems
- * that don't support fsverity at all, and fsverity_info_addr() doesn't
- * work on such filesystems, we have to start with an IS_VERITY() check.
- * Checking IS_VERITY() here is also useful to minimize the overhead of
- * fsverity_active() on non-verity files.
- */
- if (!IS_VERITY(inode))
+ if (!fsverity_active(inode))
return NULL;
-
- /*
- * Pairs with the cmpxchg_release() in fsverity_set_info(). I.e.,
- * another task may publish the inode's verity info concurrently,
- * executing a RELEASE barrier. Use smp_load_acquire() here to safely
- * ACQUIRE the memory the other task published.
- */
- return smp_load_acquire(fsverity_info_addr(inode));
+ return __fsverity_get_info(inode);
}
/* enable.c */
@@ -179,27 +188,6 @@ int fsverity_get_digest(struct inode *inode,
/* open.c */
int __fsverity_file_open(struct inode *inode, struct file *filp);
-int __fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr);
-void __fsverity_cleanup_inode(struct inode *inode);
-
-/**
- * fsverity_cleanup_inode() - free the inode's verity info, if present
- * @inode: an inode being evicted
- *
- * Filesystems must call this on inode eviction to free the inode's verity info.
- */
-static inline void fsverity_cleanup_inode(struct inode *inode)
-{
- /*
- * Only IS_VERITY() inodes can have verity info, so start by checking
- * for IS_VERITY() (which is faster than retrieving the pointer to the
- * verity info). This minimizes overhead for non-verity inodes.
- */
- if (IS_VERITY(inode))
- __fsverity_cleanup_inode(inode);
- else
- VFS_WARN_ON_ONCE(*fsverity_info_addr(inode) != NULL);
-}
/* read_metadata.c */
@@ -207,12 +195,18 @@ int fsverity_ioctl_read_metadata(struct file *filp, const void __user *uarg);
/* verify.c */
-bool fsverity_verify_blocks(struct folio *folio, size_t len, size_t offset);
-void fsverity_verify_bio(struct bio *bio);
+bool fsverity_verify_blocks(struct fsverity_info *vi, struct folio *folio,
+ size_t len, size_t offset);
+void fsverity_verify_bio(struct fsverity_info *vi, struct bio *bio);
void fsverity_enqueue_verify_work(struct work_struct *work);
#else /* !CONFIG_FS_VERITY */
+static inline bool fsverity_active(const struct inode *inode)
+{
+ return false;
+}
+
static inline struct fsverity_info *fsverity_get_info(const struct inode *inode)
{
return NULL;
@@ -251,16 +245,6 @@ static inline int __fsverity_file_open(struct inode *inode, struct file *filp)
return -EOPNOTSUPP;
}
-static inline int __fsverity_prepare_setattr(struct dentry *dentry,
- struct iattr *attr)
-{
- return -EOPNOTSUPP;
-}
-
-static inline void fsverity_cleanup_inode(struct inode *inode)
-{
-}
-
/* read_metadata.c */
static inline int fsverity_ioctl_read_metadata(struct file *filp,
@@ -271,14 +255,16 @@ static inline int fsverity_ioctl_read_metadata(struct file *filp,
/* verify.c */
-static inline bool fsverity_verify_blocks(struct folio *folio, size_t len,
+static inline bool fsverity_verify_blocks(struct fsverity_info *vi,
+ struct folio *folio, size_t len,
size_t offset)
{
WARN_ON_ONCE(1);
return false;
}
-static inline void fsverity_verify_bio(struct bio *bio)
+static inline void fsverity_verify_bio(struct fsverity_info *vi,
+ struct bio *bio)
{
WARN_ON_ONCE(1);
}
@@ -290,32 +276,16 @@ static inline void fsverity_enqueue_verify_work(struct work_struct *work)
#endif /* !CONFIG_FS_VERITY */
-static inline bool fsverity_verify_folio(struct folio *folio)
+static inline bool fsverity_verify_folio(struct fsverity_info *vi,
+ struct folio *folio)
{
- return fsverity_verify_blocks(folio, folio_size(folio), 0);
+ return fsverity_verify_blocks(vi, folio, folio_size(folio), 0);
}
-static inline bool fsverity_verify_page(struct page *page)
+static inline bool fsverity_verify_page(struct fsverity_info *vi,
+ struct page *page)
{
- return fsverity_verify_blocks(page_folio(page), PAGE_SIZE, 0);
-}
-
-/**
- * fsverity_active() - do reads from the inode need to go through fs-verity?
- * @inode: inode to check
- *
- * This checks whether the inode's verity info has been set.
- *
- * Filesystems call this from ->readahead() to check whether the pages need to
- * be verified or not. Don't use IS_VERITY() for this purpose; it's subject to
- * a race condition where the file is being read concurrently with
- * FS_IOC_ENABLE_VERITY completing. (S_VERITY is set before the verity info.)
- *
- * Return: true if reads need to go through fs-verity, otherwise false
- */
-static inline bool fsverity_active(const struct inode *inode)
-{
- return fsverity_get_info(inode) != NULL;
+ return fsverity_verify_blocks(vi, page_folio(page), PAGE_SIZE, 0);
}
/**
@@ -338,22 +308,12 @@ static inline int fsverity_file_open(struct inode *inode, struct file *filp)
return 0;
}
-/**
- * fsverity_prepare_setattr() - prepare to change a verity inode's attributes
- * @dentry: dentry through which the inode is being changed
- * @attr: attributes to change
- *
- * Verity files are immutable, so deny truncates. This isn't covered by the
- * open-time check because sys_truncate() takes a path, not a file descriptor.
- *
- * Return: 0 on success, -errno on failure
- */
-static inline int fsverity_prepare_setattr(struct dentry *dentry,
- struct iattr *attr)
-{
- if (IS_VERITY(d_inode(dentry)))
- return __fsverity_prepare_setattr(dentry, attr);
- return 0;
-}
+void fsverity_cleanup_inode(struct inode *inode);
+void fsverity_readahead(struct fsverity_info *vi, pgoff_t index,
+ unsigned long nr_pages);
+
+struct page *generic_read_merkle_tree_page(struct inode *inode, pgoff_t index);
+void generic_readahead_merkle_tree(struct inode *inode, pgoff_t index,
+ unsigned long nr_pages);
#endif /* _LINUX_FSVERITY_H */
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index e947af6..d87be1f2 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -598,6 +598,10 @@ enum {
#define FATTR4_WORD2_TIME_DELEG_ACCESS BIT(FATTR4_TIME_DELEG_ACCESS - 64)
#define FATTR4_WORD2_TIME_DELEG_MODIFY BIT(FATTR4_TIME_DELEG_MODIFY - 64)
#define FATTR4_WORD2_OPEN_ARGUMENTS BIT(FATTR4_OPEN_ARGUMENTS - 64)
+#define FATTR4_WORD2_ACL_TRUEFORM BIT(FATTR4_ACL_TRUEFORM - 64)
+#define FATTR4_WORD2_ACL_TRUEFORM_SCOPE BIT(FATTR4_ACL_TRUEFORM_SCOPE - 64)
+#define FATTR4_WORD2_POSIX_DEFAULT_ACL BIT(FATTR4_POSIX_DEFAULT_ACL - 64)
+#define FATTR4_WORD2_POSIX_ACCESS_ACL BIT(FATTR4_POSIX_ACCESS_ACL - 64)
/* MDS threshold bitmap bits */
#define THRESHOLD_RD (1UL << 0)
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 5506d20..4dc14c7 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -35,8 +35,10 @@
*/
struct svc_pool {
unsigned int sp_id; /* pool id; also node id on NUMA */
+ unsigned int sp_nrthreads; /* # of threads currently running in pool */
+ unsigned int sp_nrthrmin; /* Min number of threads to run per pool */
+ unsigned int sp_nrthrmax; /* Max requested number of threads in pool */
struct lwq sp_xprts; /* pending transports */
- unsigned int sp_nrthreads; /* # of threads in pool */
struct list_head sp_all_threads; /* all server threads */
struct llist_head sp_idle_threads; /* idle server threads */
@@ -53,6 +55,7 @@ enum {
SP_TASK_PENDING, /* still work to do even if no xprt is queued */
SP_NEED_VICTIM, /* One thread needs to agree to exit */
SP_VICTIM_REMAINS, /* One thread needs to actually exit */
+ SP_TASK_STARTING, /* Task has started but not added to idle yet */
};
@@ -71,7 +74,7 @@ struct svc_serv {
struct svc_stat * sv_stats; /* RPC statistics */
spinlock_t sv_lock;
unsigned int sv_nprogs; /* Number of sv_programs */
- unsigned int sv_nrthreads; /* # of server threads */
+ unsigned int sv_nrthreads; /* # of running server threads */
unsigned int sv_max_payload; /* datagram payload size */
unsigned int sv_max_mesg; /* max_payload + 1 page for overheads */
unsigned int sv_xdrsize; /* XDR buffer size */
@@ -440,13 +443,17 @@ struct svc_serv *svc_create(struct svc_program *, unsigned int,
bool svc_rqst_replace_page(struct svc_rqst *rqstp,
struct page *page);
void svc_rqst_release_pages(struct svc_rqst *rqstp);
+int svc_new_thread(struct svc_serv *serv, struct svc_pool *pool);
void svc_exit_thread(struct svc_rqst *);
struct svc_serv * svc_create_pooled(struct svc_program *prog,
unsigned int nprog,
struct svc_stat *stats,
unsigned int bufsize,
int (*threadfn)(void *data));
-int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int);
+int svc_set_pool_threads(struct svc_serv *serv, struct svc_pool *pool,
+ unsigned int min_threads, unsigned int max_threads);
+int svc_set_num_threads(struct svc_serv *serv, unsigned int min_threads,
+ unsigned int nrservs);
int svc_pool_stats_open(struct svc_info *si, struct file *file);
void svc_process(struct svc_rqst *rqstp);
void svc_process_bc(struct rpc_rqst *req, struct svc_rqst *rqstp);
diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h
index de37069..372a008 100644
--- a/include/linux/sunrpc/svcsock.h
+++ b/include/linux/sunrpc/svcsock.h
@@ -61,7 +61,7 @@ static inline u32 svc_sock_final_rec(struct svc_sock *svsk)
/*
* Function prototypes.
*/
-void svc_recv(struct svc_rqst *rqstp);
+int svc_recv(struct svc_rqst *rqstp, long timeo);
void svc_send(struct svc_rqst *rqstp);
int svc_addsock(struct svc_serv *serv, struct net *net,
const int fd, char *name_return, const size_t len,
diff --git a/include/linux/sunrpc/xdrgen/_builtins.h b/include/linux/sunrpc/xdrgen/_builtins.h
index 66ca3ec..a723fb1d 100644
--- a/include/linux/sunrpc/xdrgen/_builtins.h
+++ b/include/linux/sunrpc/xdrgen/_builtins.h
@@ -46,6 +46,66 @@ xdrgen_encode_bool(struct xdr_stream *xdr, bool val)
return true;
}
+/*
+ * De facto (non-standard but commonly implemented) signed short type:
+ * - Wire sends sign-extended 32-bit value (e.g., 0xFFFFFFFF)
+ * - be32_to_cpup() returns u32 (0xFFFFFFFF)
+ * - Explicit (s16) cast truncates to 16 bits (0xFFFF = -1)
+ */
+static inline bool
+xdrgen_decode_short(struct xdr_stream *xdr, s16 *ptr)
+{
+ __be32 *p = xdr_inline_decode(xdr, XDR_UNIT);
+
+ if (unlikely(!p))
+ return false;
+ *ptr = (s16)be32_to_cpup(p);
+ return true;
+}
+
+/*
+ * De facto (non-standard but commonly implemented) signed short type:
+ * - C integer promotion sign-extends s16 val to int before passing to
+ * cpu_to_be32()
+ * - This is well-defined: -1 as s16 -> -1 as int -> 0xFFFFFFFF on wire
+ */
+static inline bool
+xdrgen_encode_short(struct xdr_stream *xdr, s16 val)
+{
+ __be32 *p = xdr_reserve_space(xdr, XDR_UNIT);
+
+ if (unlikely(!p))
+ return false;
+ *p = cpu_to_be32(val);
+ return true;
+}
+
+/*
+ * De facto (non-standard but commonly implemented) unsigned short type:
+ * 16-bit integer zero-extended to fill one XDR_UNIT.
+ */
+static inline bool
+xdrgen_decode_unsigned_short(struct xdr_stream *xdr, u16 *ptr)
+{
+ __be32 *p = xdr_inline_decode(xdr, XDR_UNIT);
+
+ if (unlikely(!p))
+ return false;
+ *ptr = (u16)be32_to_cpup(p);
+ return true;
+}
+
+static inline bool
+xdrgen_encode_unsigned_short(struct xdr_stream *xdr, u16 val)
+{
+ __be32 *p = xdr_reserve_space(xdr, XDR_UNIT);
+
+ if (unlikely(!p))
+ return false;
+ *p = cpu_to_be32(val);
+ return true;
+}
+
static inline bool
xdrgen_decode_int(struct xdr_stream *xdr, s32 *ptr)
{
@@ -188,12 +248,10 @@ xdrgen_decode_string(struct xdr_stream *xdr, string *ptr, u32 maxlen)
return false;
if (unlikely(maxlen && len > maxlen))
return false;
- if (len != 0) {
- p = xdr_inline_decode(xdr, len);
- if (unlikely(!p))
- return false;
- ptr->data = (unsigned char *)p;
- }
+ p = xdr_inline_decode(xdr, len);
+ if (unlikely(!p))
+ return false;
+ ptr->data = (unsigned char *)p;
ptr->len = len;
return true;
}
@@ -219,12 +277,10 @@ xdrgen_decode_opaque(struct xdr_stream *xdr, opaque *ptr, u32 maxlen)
return false;
if (unlikely(maxlen && len > maxlen))
return false;
- if (len != 0) {
- p = xdr_inline_decode(xdr, len);
- if (unlikely(!p))
- return false;
- ptr->data = (u8 *)p;
- }
+ p = xdr_inline_decode(xdr, len);
+ if (unlikely(!p))
+ return false;
+ ptr->data = (u8 *)p;
ptr->len = len;
return true;
}
diff --git a/include/linux/sunrpc/xdrgen/nfs4_1.h b/include/linux/sunrpc/xdrgen/nfs4_1.h
index cf21a14..4ac54bd 100644
--- a/include/linux/sunrpc/xdrgen/nfs4_1.h
+++ b/include/linux/sunrpc/xdrgen/nfs4_1.h
@@ -1,7 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* Generated by xdrgen. Manual edits will be lost. */
/* XDR specification file: ../../Documentation/sunrpc/xdr/nfs4_1.x */
-/* XDR specification modification time: Mon Oct 14 09:10:13 2024 */
+/* XDR specification modification time: Thu Jan 8 23:12:07 2026 */
#ifndef _LINUX_XDRGEN_NFS4_1_DEF_H
#define _LINUX_XDRGEN_NFS4_1_DEF_H
@@ -18,6 +18,14 @@ typedef struct {
uint32_t *element;
} bitmap4;
+typedef opaque utf8string;
+
+typedef utf8string utf8str_cis;
+
+typedef utf8string utf8str_cs;
+
+typedef utf8string utf8str_mixed;
+
struct nfstime4 {
int64_t seconds;
uint32_t nseconds;
@@ -40,6 +48,7 @@ enum open_args_share_access4 {
OPEN_ARGS_SHARE_ACCESS_WRITE = 2,
OPEN_ARGS_SHARE_ACCESS_BOTH = 3,
};
+
typedef enum open_args_share_access4 open_args_share_access4;
enum open_args_share_deny4 {
@@ -48,6 +57,7 @@ enum open_args_share_deny4 {
OPEN_ARGS_SHARE_DENY_WRITE = 2,
OPEN_ARGS_SHARE_DENY_BOTH = 3,
};
+
typedef enum open_args_share_deny4 open_args_share_deny4;
enum open_args_share_access_want4 {
@@ -59,6 +69,7 @@ enum open_args_share_access_want4 {
OPEN_ARGS_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS = 20,
OPEN_ARGS_SHARE_ACCESS_WANT_OPEN_XOR_DELEGATION = 21,
};
+
typedef enum open_args_share_access_want4 open_args_share_access_want4;
enum open_args_open_claim4 {
@@ -70,6 +81,7 @@ enum open_args_open_claim4 {
OPEN_ARGS_OPEN_CLAIM_DELEG_CUR_FH = 5,
OPEN_ARGS_OPEN_CLAIM_DELEG_PREV_FH = 6,
};
+
typedef enum open_args_open_claim4 open_args_open_claim4;
enum open_args_createmode4 {
@@ -78,10 +90,15 @@ enum open_args_createmode4 {
OPEN_ARGS_CREATEMODE_EXCLUSIVE4 = 2,
OPEN_ARGS_CREATE_MODE_EXCLUSIVE4_1 = 3,
};
+
typedef enum open_args_createmode4 open_args_createmode4;
typedef struct open_arguments4 fattr4_open_arguments;
+/*
+ * Determine what OPEN supports.
+ */
+
enum { FATTR4_OPEN_ARGUMENTS = 86 };
enum { OPEN4_RESULT_NO_OPEN_STATEID = 0x00000010 };
@@ -90,6 +107,11 @@ typedef struct nfstime4 fattr4_time_deleg_access;
typedef struct nfstime4 fattr4_time_deleg_modify;
+/*
+ * New RECOMMENDED Attribute for
+ * delegation caching of times
+ */
+
enum { FATTR4_TIME_DELEG_ACCESS = 84 };
enum { FATTR4_TIME_DELEG_MODIFY = 85 };
@@ -124,13 +146,88 @@ enum open_delegation_type4 {
OPEN_DELEGATE_READ_ATTRS_DELEG = 4,
OPEN_DELEGATE_WRITE_ATTRS_DELEG = 5,
};
+
typedef enum open_delegation_type4 open_delegation_type4;
+enum aclmodel4 {
+ ACL_MODEL_NFS4 = 1,
+ ACL_MODEL_POSIX_DRAFT = 2,
+ ACL_MODEL_NONE = 3,
+};
+
+typedef enum aclmodel4 aclmodel4;
+
+enum aclscope4 {
+ ACL_SCOPE_FILE_OBJECT = 1,
+ ACL_SCOPE_FILE_SYSTEM = 2,
+ ACL_SCOPE_SERVER = 3,
+};
+
+typedef enum aclscope4 aclscope4;
+
+enum posixacetag4 {
+ POSIXACE4_TAG_USER_OBJ = 1,
+ POSIXACE4_TAG_USER = 2,
+ POSIXACE4_TAG_GROUP_OBJ = 3,
+ POSIXACE4_TAG_GROUP = 4,
+ POSIXACE4_TAG_MASK = 5,
+ POSIXACE4_TAG_OTHER = 6,
+};
+
+typedef enum posixacetag4 posixacetag4;
+
+typedef uint32_t posixaceperm4;
+
+enum { POSIXACE4_PERM_EXECUTE = 0x00000001 };
+
+enum { POSIXACE4_PERM_WRITE = 0x00000002 };
+
+enum { POSIXACE4_PERM_READ = 0x00000004 };
+
+struct posixace4 {
+ posixacetag4 tag;
+ posixaceperm4 perm;
+ utf8str_mixed who;
+};
+
+typedef aclmodel4 fattr4_acl_trueform;
+
+typedef aclscope4 fattr4_acl_trueform_scope;
+
+typedef struct {
+ u32 count;
+ struct posixace4 *element;
+} fattr4_posix_default_acl;
+
+typedef struct {
+ u32 count;
+ struct posixace4 *element;
+} fattr4_posix_access_acl;
+
+/*
+ * New for POSIX ACL extension
+ */
+
+enum { FATTR4_ACL_TRUEFORM = 89 };
+
+enum { FATTR4_ACL_TRUEFORM_SCOPE = 90 };
+
+enum { FATTR4_POSIX_DEFAULT_ACL = 91 };
+
+enum { FATTR4_POSIX_ACCESS_ACL = 92 };
+
#define NFS4_int64_t_sz \
(XDR_hyper)
#define NFS4_uint32_t_sz \
(XDR_unsigned_int)
#define NFS4_bitmap4_sz (XDR_unsigned_int)
+#define NFS4_utf8string_sz (XDR_unsigned_int)
+#define NFS4_utf8str_cis_sz \
+ (NFS4_utf8string_sz)
+#define NFS4_utf8str_cs_sz \
+ (NFS4_utf8string_sz)
+#define NFS4_utf8str_mixed_sz \
+ (NFS4_utf8string_sz)
#define NFS4_nfstime4_sz \
(NFS4_int64_t_sz + NFS4_uint32_t_sz)
#define NFS4_fattr4_offline_sz \
@@ -149,5 +246,18 @@ typedef enum open_delegation_type4 open_delegation_type4;
#define NFS4_fattr4_time_deleg_modify_sz \
(NFS4_nfstime4_sz)
#define NFS4_open_delegation_type4_sz (XDR_int)
+#define NFS4_aclmodel4_sz (XDR_int)
+#define NFS4_aclscope4_sz (XDR_int)
+#define NFS4_posixacetag4_sz (XDR_int)
+#define NFS4_posixaceperm4_sz \
+ (NFS4_uint32_t_sz)
+#define NFS4_posixace4_sz \
+ (NFS4_posixacetag4_sz + NFS4_posixaceperm4_sz + NFS4_utf8str_mixed_sz)
+#define NFS4_fattr4_acl_trueform_sz \
+ (NFS4_aclmodel4_sz)
+#define NFS4_fattr4_acl_trueform_scope_sz \
+ (NFS4_aclscope4_sz)
+#define NFS4_fattr4_posix_default_acl_sz (XDR_unsigned_int)
+#define NFS4_fattr4_posix_access_acl_sz (XDR_unsigned_int)
#endif /* _LINUX_XDRGEN_NFS4_1_DEF_H */
diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h
index 9bd930a..6de6fd8 100644
--- a/include/rdma/rdma_cm.h
+++ b/include/rdma/rdma_cm.h
@@ -169,6 +169,23 @@ struct rdma_cm_id *rdma_create_user_id(rdma_cm_event_handler event_handler,
void rdma_destroy_id(struct rdma_cm_id *id);
/**
+ * rdma_restrict_node_type - Restrict an RDMA identifier to a specific
+ * RDMA device node type.
+ *
+ * @id: RDMA identifier.
+ * @node_type: The device node type. Only RDMA_NODE_UNSPECIFIED (default),
+ * RDMA_NODE_RNIC and RDMA_NODE_IB_CA are allowed
+ *
+ * This allows the caller to restrict the possible devices
+ * used to iWarp (RDMA_NODE_RNIC) or InfiniBand/RoCEv1/RoCEv2 (RDMA_NODE_IB_CA).
+ *
+ * It needs to be called before the RDMA identifier is bound
+ * to a device, which means it should be called before
+ * rdma_bind_addr(), rdma_resolve_addr() and rdma_listen().
+ */
+int rdma_restrict_node_type(struct rdma_cm_id *id, u8 node_type);
+
+/**
* rdma_bind_addr - Bind an RDMA identifier to a source address and
* associated RDMA device, if needed.
*
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index fd76d14..a3e8fe4 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -102,6 +102,9 @@ TRACE_DEFINE_ENUM(EXT4_FC_REASON_RENAME_DIR);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_FALLOC_RANGE);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_INODE_JOURNAL_DATA);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_ENCRYPTED_FILENAME);
+TRACE_DEFINE_ENUM(EXT4_FC_REASON_MIGRATE);
+TRACE_DEFINE_ENUM(EXT4_FC_REASON_VERITY);
+TRACE_DEFINE_ENUM(EXT4_FC_REASON_MOVE_EXT);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_MAX);
#define show_fc_reason(reason) \
@@ -115,7 +118,10 @@ TRACE_DEFINE_ENUM(EXT4_FC_REASON_MAX);
{ EXT4_FC_REASON_RENAME_DIR, "RENAME_DIR"}, \
{ EXT4_FC_REASON_FALLOC_RANGE, "FALLOC_RANGE"}, \
{ EXT4_FC_REASON_INODE_JOURNAL_DATA, "INODE_JOURNAL_DATA"}, \
- { EXT4_FC_REASON_ENCRYPTED_FILENAME, "ENCRYPTED_FILENAME"})
+ { EXT4_FC_REASON_ENCRYPTED_FILENAME, "ENCRYPTED_FILENAME"}, \
+ { EXT4_FC_REASON_MIGRATE, "MIGRATE"}, \
+ { EXT4_FC_REASON_VERITY, "VERITY"}, \
+ { EXT4_FC_REASON_MOVE_EXT, "MOVE_EXT"})
TRACE_DEFINE_ENUM(CR_POWER2_ALIGNED);
TRACE_DEFINE_ENUM(CR_GOAL_LEN_FAST);
diff --git a/include/uapi/linux/nfs.h b/include/uapi/linux/nfs.h
index 71c7196..e629c49 100644
--- a/include/uapi/linux/nfs.h
+++ b/include/uapi/linux/nfs.h
@@ -55,7 +55,7 @@
NFSERR_NODEV = 19, /* v2 v3 v4 */
NFSERR_NOTDIR = 20, /* v2 v3 v4 */
NFSERR_ISDIR = 21, /* v2 v3 v4 */
- NFSERR_INVAL = 22, /* v2 v3 v4 */
+ NFSERR_INVAL = 22, /* v3 v4 */
NFSERR_FBIG = 27, /* v2 v3 v4 */
NFSERR_NOSPC = 28, /* v2 v3 v4 */
NFSERR_ROFS = 30, /* v2 v3 v4 */
diff --git a/include/uapi/linux/nfsd_netlink.h b/include/uapi/linux/nfsd_netlink.h
index e157e200..e9efbc9 100644
--- a/include/uapi/linux/nfsd_netlink.h
+++ b/include/uapi/linux/nfsd_netlink.h
@@ -35,6 +35,7 @@ enum {
NFSD_A_SERVER_GRACETIME,
NFSD_A_SERVER_LEASETIME,
NFSD_A_SERVER_SCOPE,
+ NFSD_A_SERVER_MIN_THREADS,
__NFSD_A_SERVER_MAX,
NFSD_A_SERVER_MAX = (__NFSD_A_SERVER_MAX - 1)
diff --git a/mm/readahead.c b/mm/readahead.c
index b415c99..f43d035 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -204,8 +204,9 @@ static struct folio *ractl_alloc_folio(struct readahead_control *ractl,
* not the function you want to call. Use page_cache_async_readahead()
* or page_cache_sync_readahead() instead.
*
- * Context: File is referenced by caller. Mutexes may be held by caller.
- * May sleep, but will not reenter filesystem to reclaim memory.
+ * Context: File is referenced by caller, and ractl->mapping->invalidate_lock
+ * must be held by the caller at least in shared mode. Mutexes may be held by
+ * caller. May sleep, but will not reenter filesystem to reclaim memory.
*/
void page_cache_ra_unbounded(struct readahead_control *ractl,
unsigned long nr_to_read, unsigned long lookahead_size)
@@ -228,9 +229,10 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
*/
unsigned int nofs = memalloc_nofs_save();
+ lockdep_assert_held(&mapping->invalidate_lock);
+
trace_page_cache_ra_unbounded(mapping->host, index, nr_to_read,
lookahead_size);
- filemap_invalidate_lock_shared(mapping);
index = mapping_align_index(mapping, index);
/*
@@ -300,7 +302,6 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
* will then handle the error.
*/
read_pages(ractl);
- filemap_invalidate_unlock_shared(mapping);
memalloc_nofs_restore(nofs);
}
EXPORT_SYMBOL_GPL(page_cache_ra_unbounded);
@@ -314,9 +315,9 @@ EXPORT_SYMBOL_GPL(page_cache_ra_unbounded);
static void do_page_cache_ra(struct readahead_control *ractl,
unsigned long nr_to_read, unsigned long lookahead_size)
{
- struct inode *inode = ractl->mapping->host;
+ struct address_space *mapping = ractl->mapping;
unsigned long index = readahead_index(ractl);
- loff_t isize = i_size_read(inode);
+ loff_t isize = i_size_read(mapping->host);
pgoff_t end_index; /* The last page we want to read */
if (isize == 0)
@@ -329,7 +330,9 @@ static void do_page_cache_ra(struct readahead_control *ractl,
if (nr_to_read > end_index - index)
nr_to_read = end_index - index + 1;
+ filemap_invalidate_lock_shared(mapping);
page_cache_ra_unbounded(ractl, nr_to_read, lookahead_size);
+ filemap_invalidate_unlock_shared(mapping);
}
/*
diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c
index 7d2cdc2..f320c0a 100644
--- a/net/sunrpc/auth_gss/gss_rpc_xdr.c
+++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c
@@ -320,29 +320,47 @@ static int gssx_dec_status(struct xdr_stream *xdr,
/* status->minor_status */
p = xdr_inline_decode(xdr, 8);
- if (unlikely(p == NULL))
- return -ENOSPC;
+ if (unlikely(p == NULL)) {
+ err = -ENOSPC;
+ goto out_free_mech;
+ }
p = xdr_decode_hyper(p, &status->minor_status);
/* status->major_status_string */
err = gssx_dec_buffer(xdr, &status->major_status_string);
if (err)
- return err;
+ goto out_free_mech;
/* status->minor_status_string */
err = gssx_dec_buffer(xdr, &status->minor_status_string);
if (err)
- return err;
+ goto out_free_major_status_string;
/* status->server_ctx */
err = gssx_dec_buffer(xdr, &status->server_ctx);
if (err)
- return err;
+ goto out_free_minor_status_string;
/* we assume we have no options for now, so simply consume them */
/* status->options */
err = dummy_dec_opt_array(xdr, &status->options);
+ if (err)
+ goto out_free_server_ctx;
+ return 0;
+
+out_free_server_ctx:
+ kfree(status->server_ctx.data);
+ status->server_ctx.data = NULL;
+out_free_minor_status_string:
+ kfree(status->minor_status_string.data);
+ status->minor_status_string.data = NULL;
+out_free_major_status_string:
+ kfree(status->major_status_string.data);
+ status->major_status_string.data = NULL;
+out_free_mech:
+ kfree(status->mech.data);
+ status->mech.data = NULL;
return err;
}
@@ -505,28 +523,35 @@ static int gssx_dec_name(struct xdr_stream *xdr,
/* name->name_type */
err = gssx_dec_buffer(xdr, &dummy_netobj);
if (err)
- return err;
+ goto out_free_display_name;
/* name->exported_name */
err = gssx_dec_buffer(xdr, &dummy_netobj);
if (err)
- return err;
+ goto out_free_display_name;
/* name->exported_composite_name */
err = gssx_dec_buffer(xdr, &dummy_netobj);
if (err)
- return err;
+ goto out_free_display_name;
/* we assume we have no attributes for now, so simply consume them */
/* name->name_attributes */
err = dummy_dec_nameattr_array(xdr, &dummy_name_attr_array);
if (err)
- return err;
+ goto out_free_display_name;
/* we assume we have no options for now, so simply consume them */
/* name->extensions */
err = dummy_dec_opt_array(xdr, &dummy_option_array);
+ if (err)
+ goto out_free_display_name;
+ return 0;
+
+out_free_display_name:
+ kfree(name->display_name.data);
+ name->display_name.data = NULL;
return err;
}
@@ -649,32 +674,34 @@ static int gssx_dec_ctx(struct xdr_stream *xdr,
/* ctx->state */
err = gssx_dec_buffer(xdr, &ctx->state);
if (err)
- return err;
+ goto out_free_exported_context_token;
/* ctx->need_release */
err = gssx_dec_bool(xdr, &ctx->need_release);
if (err)
- return err;
+ goto out_free_state;
/* ctx->mech */
err = gssx_dec_buffer(xdr, &ctx->mech);
if (err)
- return err;
+ goto out_free_state;
/* ctx->src_name */
err = gssx_dec_name(xdr, &ctx->src_name);
if (err)
- return err;
+ goto out_free_mech;
/* ctx->targ_name */
err = gssx_dec_name(xdr, &ctx->targ_name);
if (err)
- return err;
+ goto out_free_src_name;
/* ctx->lifetime */
p = xdr_inline_decode(xdr, 8+8);
- if (unlikely(p == NULL))
- return -ENOSPC;
+ if (unlikely(p == NULL)) {
+ err = -ENOSPC;
+ goto out_free_targ_name;
+ }
p = xdr_decode_hyper(p, &ctx->lifetime);
/* ctx->ctx_flags */
@@ -683,17 +710,36 @@ static int gssx_dec_ctx(struct xdr_stream *xdr,
/* ctx->locally_initiated */
err = gssx_dec_bool(xdr, &ctx->locally_initiated);
if (err)
- return err;
+ goto out_free_targ_name;
/* ctx->open */
err = gssx_dec_bool(xdr, &ctx->open);
if (err)
- return err;
+ goto out_free_targ_name;
/* we assume we have no options for now, so simply consume them */
/* ctx->options */
err = dummy_dec_opt_array(xdr, &ctx->options);
+ if (err)
+ goto out_free_targ_name;
+ return 0;
+
+out_free_targ_name:
+ kfree(ctx->targ_name.display_name.data);
+ ctx->targ_name.display_name.data = NULL;
+out_free_src_name:
+ kfree(ctx->src_name.display_name.data);
+ ctx->src_name.display_name.data = NULL;
+out_free_mech:
+ kfree(ctx->mech.data);
+ ctx->mech.data = NULL;
+out_free_state:
+ kfree(ctx->state.data);
+ ctx->state.data = NULL;
+out_free_exported_context_token:
+ kfree(ctx->exported_context_token.data);
+ ctx->exported_context_token.data = NULL;
return err;
}
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 4704dce..346ac56 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -763,108 +763,88 @@ void svc_pool_wake_idle_thread(struct svc_pool *pool)
}
EXPORT_SYMBOL_GPL(svc_pool_wake_idle_thread);
-static struct svc_pool *
-svc_pool_next(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state)
+/**
+ * svc_new_thread - spawn a new thread in the given pool
+ * @serv: the serv to which the pool belongs
+ * @pool: pool in which thread should be spawned
+ *
+ * Create a new thread inside @pool, which is a part of @serv.
+ * Caller must hold the service mutex.
+ *
+ * Returns 0 on success, or -errno on failure.
+ */
+int svc_new_thread(struct svc_serv *serv, struct svc_pool *pool)
{
- return pool ? pool : &serv->sv_pools[(*state)++ % serv->sv_nrpools];
-}
+ struct svc_rqst *rqstp;
+ struct task_struct *task;
+ int node;
+ int err = 0;
-static struct svc_pool *
-svc_pool_victim(struct svc_serv *serv, struct svc_pool *target_pool,
- unsigned int *state)
-{
- struct svc_pool *pool;
- unsigned int i;
+ node = svc_pool_map_get_node(pool->sp_id);
- pool = target_pool;
-
- if (!pool) {
- for (i = 0; i < serv->sv_nrpools; i++) {
- pool = &serv->sv_pools[--(*state) % serv->sv_nrpools];
- if (pool->sp_nrthreads)
- break;
- }
+ rqstp = svc_prepare_thread(serv, pool, node);
+ if (!rqstp)
+ return -ENOMEM;
+ task = kthread_create_on_node(serv->sv_threadfn, rqstp,
+ node, "%s", serv->sv_name);
+ if (IS_ERR(task)) {
+ err = PTR_ERR(task);
+ goto out;
}
- if (pool && pool->sp_nrthreads) {
- set_bit(SP_VICTIM_REMAINS, &pool->sp_flags);
- set_bit(SP_NEED_VICTIM, &pool->sp_flags);
- return pool;
- }
- return NULL;
+ rqstp->rq_task = task;
+ if (serv->sv_nrpools > 1)
+ svc_pool_map_set_cpumask(task, pool->sp_id);
+
+ svc_sock_update_bufs(serv);
+ wake_up_process(task);
+
+ /* Wait for the thread to signal initialization status */
+ wait_var_event(&rqstp->rq_err, rqstp->rq_err != -EAGAIN);
+ err = rqstp->rq_err;
+out:
+ if (err)
+ svc_exit_thread(rqstp);
+ return err;
}
+EXPORT_SYMBOL_GPL(svc_new_thread);
static int
svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
{
- struct svc_rqst *rqstp;
- struct task_struct *task;
- struct svc_pool *chosen_pool;
- unsigned int state = serv->sv_nrthreads-1;
- int node;
- int err;
+ int err = 0;
- do {
- nrservs--;
- chosen_pool = svc_pool_next(serv, pool, &state);
- node = svc_pool_map_get_node(chosen_pool->sp_id);
+ while (!err && nrservs--)
+ err = svc_new_thread(serv, pool);
- rqstp = svc_prepare_thread(serv, chosen_pool, node);
- if (!rqstp)
- return -ENOMEM;
- task = kthread_create_on_node(serv->sv_threadfn, rqstp,
- node, "%s", serv->sv_name);
- if (IS_ERR(task)) {
- svc_exit_thread(rqstp);
- return PTR_ERR(task);
- }
-
- rqstp->rq_task = task;
- if (serv->sv_nrpools > 1)
- svc_pool_map_set_cpumask(task, chosen_pool->sp_id);
-
- svc_sock_update_bufs(serv);
- wake_up_process(task);
-
- wait_var_event(&rqstp->rq_err, rqstp->rq_err != -EAGAIN);
- err = rqstp->rq_err;
- if (err) {
- svc_exit_thread(rqstp);
- return err;
- }
- } while (nrservs > 0);
-
- return 0;
+ return err;
}
static int
svc_stop_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
{
- unsigned int state = serv->sv_nrthreads-1;
- struct svc_pool *victim;
-
do {
- victim = svc_pool_victim(serv, pool, &state);
- if (!victim)
- break;
- svc_pool_wake_idle_thread(victim);
- wait_on_bit(&victim->sp_flags, SP_VICTIM_REMAINS,
- TASK_IDLE);
+ set_bit(SP_VICTIM_REMAINS, &pool->sp_flags);
+ set_bit(SP_NEED_VICTIM, &pool->sp_flags);
+ svc_pool_wake_idle_thread(pool);
+ wait_on_bit(&pool->sp_flags, SP_VICTIM_REMAINS, TASK_IDLE);
nrservs++;
} while (nrservs < 0);
return 0;
}
/**
- * svc_set_num_threads - adjust number of threads per RPC service
+ * svc_set_pool_threads - adjust number of threads per pool
* @serv: RPC service to adjust
- * @pool: Specific pool from which to choose threads, or NULL
- * @nrservs: New number of threads for @serv (0 or less means kill all threads)
+ * @pool: Specific pool from which to choose threads
+ * @min_threads: min number of threads to run in @pool
+ * @max_threads: max number of threads in @pool (0 means kill all threads)
*
- * Create or destroy threads to make the number of threads for @serv the
- * given number. If @pool is non-NULL, change only threads in that pool;
- * otherwise, round-robin between all pools for @serv. @serv's
- * sv_nrthreads is adjusted for each thread created or destroyed.
+ * Create or destroy threads in @pool to bring it into an acceptable range
+ * between @min_threads and @max_threads.
+ *
+ * If @min_threads is 0, the pool is set to run a static @max_threads number
+ * of threads. A @min_threads larger than @max_threads is clamped to it.
*
* Caller must ensure mutual exclusion between this and server startup or
* shutdown.
@@ -873,19 +853,85 @@ svc_stop_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
* starting a thread.
*/
int
-svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
+svc_set_pool_threads(struct svc_serv *serv, struct svc_pool *pool,
+ unsigned int min_threads, unsigned int max_threads)
{
- if (!pool)
- nrservs -= serv->sv_nrthreads;
- else
- nrservs -= pool->sp_nrthreads;
+ int delta;
- if (nrservs > 0)
- return svc_start_kthreads(serv, pool, nrservs);
- if (nrservs < 0)
- return svc_stop_kthreads(serv, pool, nrservs);
+ if (!pool)
+ return -EINVAL;
+
+ /* clamp min threads to the max */
+ if (min_threads > max_threads)
+ min_threads = max_threads;
+
+ pool->sp_nrthrmin = min_threads;
+ pool->sp_nrthrmax = max_threads;
+
+ /*
+ * When min_threads is set, then only change the number of
+ * threads to bring it within an acceptable range.
+ */
+ if (min_threads) {
+ if (pool->sp_nrthreads > max_threads)
+ delta = max_threads;
+ else if (pool->sp_nrthreads < min_threads)
+ delta = min_threads;
+ else
+ return 0;
+ } else {
+ delta = max_threads;
+ }
+
+ delta -= pool->sp_nrthreads;
+ if (delta > 0)
+ return svc_start_kthreads(serv, pool, delta);
+ if (delta < 0)
+ return svc_stop_kthreads(serv, pool, delta);
return 0;
}
+EXPORT_SYMBOL_GPL(svc_set_pool_threads);
+
+/**
+ * svc_set_num_threads - adjust number of threads in serv
+ * @serv: RPC service to adjust
+ * @min_threads: min number of threads to run per pool
+ * @nrservs: New number of threads for @serv (0 means kill all threads)
+ *
+ * Create or destroy threads in @serv to bring it to @nrservs. If there
+ * are multiple pools then the new threads or victims will be distributed
+ * evenly among them.
+ *
+ * Caller must ensure mutual exclusion between this and server startup or
+ * shutdown.
+ *
+ * Returns zero on success or a negative errno if an error occurred while
+ * starting a thread. On failure, some pools may have already been
+ * adjusted; the caller is responsible for recovery.
+ */
+int
+svc_set_num_threads(struct svc_serv *serv, unsigned int min_threads,
+ unsigned int nrservs)
+{
+ unsigned int base = nrservs / serv->sv_nrpools;
+ unsigned int remain = nrservs % serv->sv_nrpools;
+ int i, err = 0;
+
+ for (i = 0; i < serv->sv_nrpools; ++i) {
+ struct svc_pool *pool = &serv->sv_pools[i];
+ int threads = base;
+
+ if (remain) {
+ ++threads;
+ --remain;
+ }
+
+ err = svc_set_pool_threads(serv, pool, min_threads, threads);
+ if (err)
+ break;
+ }
+ return err;
+}
EXPORT_SYMBOL_GPL(svc_set_num_threads);
/**
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 6973184..56a663b 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -714,15 +714,21 @@ svc_thread_should_sleep(struct svc_rqst *rqstp)
return true;
}
-static void svc_thread_wait_for_work(struct svc_rqst *rqstp)
+static bool svc_schedule_timeout(long timeo)
+{
+ return schedule_timeout(timeo ? timeo : MAX_SCHEDULE_TIMEOUT) == 0;
+}
+
+static bool svc_thread_wait_for_work(struct svc_rqst *rqstp, long timeo)
{
struct svc_pool *pool = rqstp->rq_pool;
+ bool did_timeout = false;
if (svc_thread_should_sleep(rqstp)) {
set_current_state(TASK_IDLE | TASK_FREEZABLE);
llist_add(&rqstp->rq_idle, &pool->sp_idle_threads);
if (likely(svc_thread_should_sleep(rqstp)))
- schedule();
+ did_timeout = svc_schedule_timeout(timeo);
while (!llist_del_first_this(&pool->sp_idle_threads,
&rqstp->rq_idle)) {
@@ -734,7 +740,7 @@ static void svc_thread_wait_for_work(struct svc_rqst *rqstp)
* for this new work. This thread can safely sleep
* until woken again.
*/
- schedule();
+ did_timeout = svc_schedule_timeout(timeo);
set_current_state(TASK_IDLE | TASK_FREEZABLE);
}
__set_current_state(TASK_RUNNING);
@@ -742,6 +748,7 @@ static void svc_thread_wait_for_work(struct svc_rqst *rqstp)
cond_resched();
}
try_to_freeze();
+ return did_timeout;
}
static void svc_add_new_temp_xprt(struct svc_serv *serv, struct svc_xprt *newxpt)
@@ -835,25 +842,38 @@ static void svc_thread_wake_next(struct svc_rqst *rqstp)
/**
* svc_recv - Receive and process the next request on any transport
* @rqstp: an idle RPC service thread
+ * @timeo: timeout (in jiffies) (0 means infinite timeout)
*
* This code is carefully organised not to touch any cachelines in
* the shared svc_serv structure, only cachelines in the local
* svc_pool.
+ *
+ * If the timeout is 0, then the sleep will never time out.
+ *
+ * Returns -ETIMEDOUT if idle for an extended period
+ * -EBUSY if there is more work to do than available threads
+ * 0 otherwise.
*/
-void svc_recv(struct svc_rqst *rqstp)
+int svc_recv(struct svc_rqst *rqstp, long timeo)
{
struct svc_pool *pool = rqstp->rq_pool;
+ bool did_timeout;
+ int ret = 0;
if (!svc_alloc_arg(rqstp))
- return;
+ return ret;
- svc_thread_wait_for_work(rqstp);
+ did_timeout = svc_thread_wait_for_work(rqstp, timeo);
+
+ if (did_timeout && svc_thread_should_sleep(rqstp) &&
+ pool->sp_nrthrmin && pool->sp_nrthreads > pool->sp_nrthrmin)
+ ret = -ETIMEDOUT;
clear_bit(SP_TASK_PENDING, &pool->sp_flags);
if (svc_thread_should_stop(rqstp)) {
svc_thread_wake_next(rqstp);
- return;
+ return ret;
}
rqstp->rq_xprt = svc_xprt_dequeue(pool);
@@ -865,10 +885,22 @@ void svc_recv(struct svc_rqst *rqstp)
* cache information to be provided. When there are no
* idle threads, we reduce the wait time.
*/
- if (pool->sp_idle_threads.first)
+ if (pool->sp_idle_threads.first) {
rqstp->rq_chandle.thread_wait = 5 * HZ;
- else
+ } else {
rqstp->rq_chandle.thread_wait = 1 * HZ;
+ /*
+ * No idle threads: signal -EBUSY so the caller
+ * can consider spawning another thread. Use
+ * SP_TASK_STARTING to limit this signal to one
+ * thread at a time; the caller clears this flag
+ * after starting a new thread.
+ */
+ if (!did_timeout && timeo &&
+ !test_and_set_bit(SP_TASK_STARTING,
+ &pool->sp_flags))
+ ret = -EBUSY;
+ }
trace_svc_xprt_dequeue(rqstp);
svc_handle_xprt(rqstp, xprt);
@@ -887,6 +919,7 @@ void svc_recv(struct svc_rqst *rqstp)
}
}
#endif
+ return ret;
}
EXPORT_SYMBOL_GPL(svc_recv);
diff --git a/tools/net/sunrpc/xdrgen/README b/tools/net/sunrpc/xdrgen/README
index 27218a7..2cf05d1e 100644
--- a/tools/net/sunrpc/xdrgen/README
+++ b/tools/net/sunrpc/xdrgen/README
@@ -250,8 +250,6 @@
Enable something like a #include to dynamically insert the content
of other specification files
-Properly support line-by-line pass-through via the "%" decorator
-
Build a unit test suite for verifying translation of XDR language
into compilable code
diff --git a/tools/net/sunrpc/xdrgen/generators/__init__.py b/tools/net/sunrpc/xdrgen/generators/__init__.py
index e22632c..5c3a4a4 100644
--- a/tools/net/sunrpc/xdrgen/generators/__init__.py
+++ b/tools/net/sunrpc/xdrgen/generators/__init__.py
@@ -6,7 +6,7 @@
from jinja2 import Environment, FileSystemLoader, Template
from xdr_ast import _XdrAst, Specification, _RpcProgram, _XdrTypeSpecifier
-from xdr_ast import public_apis, pass_by_reference, get_header_name
+from xdr_ast import public_apis, pass_by_reference, structs, get_header_name
from xdr_parse import get_xdr_annotate
@@ -25,6 +25,7 @@
environment.globals["annotate"] = get_xdr_annotate()
environment.globals["public_apis"] = public_apis
environment.globals["pass_by_reference"] = pass_by_reference
+ environment.globals["structs"] = structs
return environment
case _:
raise NotImplementedError("Language not supported")
@@ -58,6 +59,8 @@
"""Return name of C type"""
builtin_native_c_type = {
"bool": "bool",
+ "short": "s16",
+ "unsigned_short": "u16",
"int": "s32",
"unsigned_int": "u32",
"long": "s32",
diff --git a/tools/net/sunrpc/xdrgen/generators/enum.py b/tools/net/sunrpc/xdrgen/generators/enum.py
index e62f715..b4ed3ed 100644
--- a/tools/net/sunrpc/xdrgen/generators/enum.py
+++ b/tools/net/sunrpc/xdrgen/generators/enum.py
@@ -5,6 +5,7 @@
from generators import SourceGenerator, create_jinja2_environment
from xdr_ast import _XdrEnum, public_apis, big_endian, get_header_name
+from xdr_parse import get_xdr_enum_validation
class XdrEnumGenerator(SourceGenerator):
@@ -42,7 +43,13 @@
template = self.environment.get_template("decoder/enum_be.j2")
else:
template = self.environment.get_template("decoder/enum.j2")
- print(template.render(name=node.name))
+ print(
+ template.render(
+ name=node.name,
+ enumerators=node.enumerators,
+ validate=get_xdr_enum_validation(),
+ )
+ )
def emit_encoder(self, node: _XdrEnum) -> None:
"""Emit one encoder function for an XDR enum type"""
diff --git a/tools/net/sunrpc/xdrgen/generators/passthru.py b/tools/net/sunrpc/xdrgen/generators/passthru.py
new file mode 100644
index 0000000..cb17bd9
--- /dev/null
+++ b/tools/net/sunrpc/xdrgen/generators/passthru.py
@@ -0,0 +1,26 @@
+#!/usr/bin/env python3
+# ex: set filetype=python:
+
+"""Generate code for XDR pass-through lines"""
+
+from generators import SourceGenerator, create_jinja2_environment
+from xdr_ast import _XdrPassthru
+
+
+class XdrPassthruGenerator(SourceGenerator):
+ """Generate source code for XDR pass-through content"""
+
+ def __init__(self, language: str, peer: str):
+ """Initialize an instance of this class"""
+ self.environment = create_jinja2_environment(language, "passthru")
+ self.peer = peer
+
+ def emit_definition(self, node: _XdrPassthru) -> None:
+ """Emit one pass-through line"""
+ template = self.environment.get_template("definition.j2")
+ print(template.render(content=node.content))
+
+ def emit_decoder(self, node: _XdrPassthru) -> None:
+ """Emit one pass-through line"""
+ template = self.environment.get_template("source.j2")
+ print(template.render(content=node.content))
diff --git a/tools/net/sunrpc/xdrgen/generators/program.py b/tools/net/sunrpc/xdrgen/generators/program.py
index ac3cf16..c0cb3f6 100644
--- a/tools/net/sunrpc/xdrgen/generators/program.py
+++ b/tools/net/sunrpc/xdrgen/generators/program.py
@@ -5,8 +5,9 @@
from jinja2 import Environment
-from generators import SourceGenerator, create_jinja2_environment
+from generators import SourceGenerator, create_jinja2_environment, get_jinja2_template
from xdr_ast import _RpcProgram, _RpcVersion, excluded_apis
+from xdr_ast import max_widths, get_header_name
def emit_version_definitions(
@@ -127,6 +128,9 @@
for version in node.versions:
emit_version_definitions(self.environment, program, version)
+ template = self.environment.get_template("definition/program.j2")
+ print(template.render(name=raw_name, value=node.number))
+
def emit_declaration(self, node: _RpcProgram) -> None:
"""Emit a declaration pair for each of an RPC programs's procedures"""
raw_name = node.name
@@ -166,3 +170,35 @@
emit_version_argument_encoders(
self.environment, program, version,
)
+
+ def emit_maxsize(self, node: _RpcProgram) -> None:
+ """Emit maxsize macro for maximum RPC argument size"""
+ header = get_header_name().upper()
+
+ # Find the largest argument across all versions
+ max_arg_width = 0
+ max_arg_name = None
+ for version in node.versions:
+ for procedure in version.procedures:
+ if procedure.name in excluded_apis:
+ continue
+ arg_name = procedure.argument.type_name
+ if arg_name == "void":
+ continue
+ if arg_name not in max_widths:
+ continue
+ if max_widths[arg_name] > max_arg_width:
+ max_arg_width = max_widths[arg_name]
+ max_arg_name = arg_name
+
+ if max_arg_name is None:
+ return
+
+ macro_name = header + "_MAX_ARGS_SZ"
+ template = get_jinja2_template(self.environment, "maxsize", "max_args")
+ print(
+ template.render(
+ macro=macro_name,
+ width=header + "_" + max_arg_name + "_sz",
+ )
+ )
diff --git a/tools/net/sunrpc/xdrgen/generators/typedef.py b/tools/net/sunrpc/xdrgen/generators/typedef.py
index fab72e9..75e3a40 100644
--- a/tools/net/sunrpc/xdrgen/generators/typedef.py
+++ b/tools/net/sunrpc/xdrgen/generators/typedef.py
@@ -58,7 +58,7 @@
elif isinstance(node, _XdrOptionalData):
raise NotImplementedError("<optional_data> typedef not yet implemented")
elif isinstance(node, _XdrVoid):
- raise NotImplementedError("<void> typedef not yet implemented")
+ raise ValueError("invalid void usage in RPC Specification")
else:
raise NotImplementedError("typedef: type not recognized")
@@ -104,7 +104,7 @@
elif isinstance(node, _XdrOptionalData):
raise NotImplementedError("<optional_data> typedef not yet implemented")
elif isinstance(node, _XdrVoid):
- raise NotImplementedError("<void> typedef not yet implemented")
+ raise ValueError("invalid void usage in RPC Specification")
else:
raise NotImplementedError("typedef: type not recognized")
@@ -165,7 +165,7 @@
elif isinstance(node, _XdrOptionalData):
raise NotImplementedError("<optional_data> typedef not yet implemented")
elif isinstance(node, _XdrVoid):
- raise NotImplementedError("<void> typedef not yet implemented")
+ raise ValueError("invalid void usage in RPC Specification")
else:
raise NotImplementedError("typedef: type not recognized")
@@ -225,7 +225,7 @@
elif isinstance(node, _XdrOptionalData):
raise NotImplementedError("<optional_data> typedef not yet implemented")
elif isinstance(node, _XdrVoid):
- raise NotImplementedError("<void> typedef not yet implemented")
+ raise ValueError("invalid void usage in RPC Specification")
else:
raise NotImplementedError("typedef: type not recognized")
diff --git a/tools/net/sunrpc/xdrgen/generators/union.py b/tools/net/sunrpc/xdrgen/generators/union.py
index ad1f214..d15837d 100644
--- a/tools/net/sunrpc/xdrgen/generators/union.py
+++ b/tools/net/sunrpc/xdrgen/generators/union.py
@@ -84,6 +84,31 @@
print(template.render(name=node.name, type=node.spec.type_name))
+def emit_union_arm_decoder(
+ environment: Environment, node: _XdrCaseSpec
+) -> None:
+ """Emit decoder for an XDR union's arm (data only, no case/break)"""
+
+ if isinstance(node.arm, _XdrVoid):
+ return
+ if isinstance(node.arm, _XdrString):
+ type_name = "char *"
+ classifier = ""
+ else:
+ type_name = node.arm.spec.type_name
+ classifier = node.arm.spec.c_classifier
+
+ assert isinstance(node.arm, (_XdrBasic, _XdrString))
+ template = get_jinja2_template(environment, "decoder", node.arm.template)
+ print(
+ template.render(
+ name=node.arm.name,
+ type=type_name,
+ classifier=classifier,
+ )
+ )
+
+
def emit_union_case_spec_decoder(
environment: Environment, node: _XdrCaseSpec, big_endian_discriminant: bool
) -> None:
@@ -151,19 +176,33 @@
template = get_jinja2_template(environment, "decoder", "open")
print(template.render(name=node.name))
- emit_union_switch_spec_decoder(environment, node.discriminant)
+ # For boolean discriminants, use if statement instead of switch
+ if node.discriminant.spec.type_name == "bool":
+ template = get_jinja2_template(environment, "decoder", "bool_spec")
+ print(template.render(name=node.discriminant.name, type=node.discriminant.spec.type_name))
- for case in node.cases:
- emit_union_case_spec_decoder(
- environment,
- case,
- node.discriminant.spec.type_name in big_endian,
- )
+ # Find and emit the TRUE case
+ for case in node.cases:
+ if case.values and case.values[0] == "TRUE":
+ emit_union_arm_decoder(environment, case)
+ break
- emit_union_default_spec_decoder(environment, node)
+ template = get_jinja2_template(environment, "decoder", "close")
+ print(template.render())
+ else:
+ emit_union_switch_spec_decoder(environment, node.discriminant)
- template = get_jinja2_template(environment, "decoder", "close")
- print(template.render())
+ for case in node.cases:
+ emit_union_case_spec_decoder(
+ environment,
+ case,
+ node.discriminant.spec.type_name in big_endian,
+ )
+
+ emit_union_default_spec_decoder(environment, node)
+
+ template = get_jinja2_template(environment, "decoder", "close")
+ print(template.render())
def emit_union_switch_spec_encoder(
@@ -175,6 +214,28 @@
print(template.render(name=node.name, type=node.spec.type_name))
+def emit_union_arm_encoder(
+ environment: Environment, node: _XdrCaseSpec
+) -> None:
+ """Emit encoder for an XDR union's arm (data only, no case/break)"""
+
+ if isinstance(node.arm, _XdrVoid):
+ return
+ if isinstance(node.arm, _XdrString):
+ type_name = "char *"
+ else:
+ type_name = node.arm.spec.type_name
+
+ assert isinstance(node.arm, (_XdrBasic, _XdrString))
+ template = get_jinja2_template(environment, "encoder", node.arm.template)
+ print(
+ template.render(
+ name=node.arm.name,
+ type=type_name,
+ )
+ )
+
+
def emit_union_case_spec_encoder(
environment: Environment, node: _XdrCaseSpec, big_endian_discriminant: bool
) -> None:
@@ -235,19 +296,33 @@
template = get_jinja2_template(environment, "encoder", "open")
print(template.render(name=node.name))
- emit_union_switch_spec_encoder(environment, node.discriminant)
+ # For boolean discriminants, use if statement instead of switch
+ if node.discriminant.spec.type_name == "bool":
+ template = get_jinja2_template(environment, "encoder", "bool_spec")
+ print(template.render(name=node.discriminant.name, type=node.discriminant.spec.type_name))
- for case in node.cases:
- emit_union_case_spec_encoder(
- environment,
- case,
- node.discriminant.spec.type_name in big_endian,
- )
+ # Find and emit the TRUE case
+ for case in node.cases:
+ if case.values and case.values[0] == "TRUE":
+ emit_union_arm_encoder(environment, case)
+ break
- emit_union_default_spec_encoder(environment, node)
+ template = get_jinja2_template(environment, "encoder", "close")
+ print(template.render())
+ else:
+ emit_union_switch_spec_encoder(environment, node.discriminant)
- template = get_jinja2_template(environment, "encoder", "close")
- print(template.render())
+ for case in node.cases:
+ emit_union_case_spec_encoder(
+ environment,
+ case,
+ node.discriminant.spec.type_name in big_endian,
+ )
+
+ emit_union_default_spec_encoder(environment, node)
+
+ template = get_jinja2_template(environment, "encoder", "close")
+ print(template.render())
def emit_union_maxsize(environment: Environment, node: _XdrUnion) -> None:
diff --git a/tools/net/sunrpc/xdrgen/grammars/xdr.lark b/tools/net/sunrpc/xdrgen/grammars/xdr.lark
index 7c2c1b8..1d2afff 100644
--- a/tools/net/sunrpc/xdrgen/grammars/xdr.lark
+++ b/tools/net/sunrpc/xdrgen/grammars/xdr.lark
@@ -20,9 +20,11 @@
type_specifier : unsigned_hyper
| unsigned_long
| unsigned_int
+ | unsigned_short
| hyper
| long
| int
+ | short
| float
| double
| quadruple
@@ -35,9 +37,11 @@
unsigned_hyper : "unsigned" "hyper"
unsigned_long : "unsigned" "long"
unsigned_int : "unsigned" "int"
+unsigned_short : "unsigned" "short"
hyper : "hyper"
long : "long"
int : "int"
+short : "short"
float : "float"
double : "double"
quadruple : "quadruple"
@@ -74,6 +78,9 @@
| type_def
| program_def
| pragma_def
+ | passthru_def
+
+passthru_def : PASSTHRU
//
// RPC program definitions not specified in RFC 4506
@@ -111,8 +118,7 @@
hexadecimal_constant : /0x([a-f]|[A-F]|[0-9])+/
octal_constant : /0[0-7]+/
-PASSTHRU : "%" | "%" /.+/
-%ignore PASSTHRU
+PASSTHRU : /%.*/
%import common.C_COMMENT
%ignore C_COMMENT
diff --git a/tools/net/sunrpc/xdrgen/subcmds/declarations.py b/tools/net/sunrpc/xdrgen/subcmds/declarations.py
index c5e8d79..ed83d48 100644
--- a/tools/net/sunrpc/xdrgen/subcmds/declarations.py
+++ b/tools/net/sunrpc/xdrgen/subcmds/declarations.py
@@ -8,9 +8,8 @@
from argparse import Namespace
from lark import logger
-from lark.exceptions import UnexpectedInput
+from lark.exceptions import VisitError
-from generators.constant import XdrConstantGenerator
from generators.enum import XdrEnumGenerator
from generators.header_bottom import XdrHeaderBottomGenerator
from generators.header_top import XdrHeaderTopGenerator
@@ -21,9 +20,10 @@
from generators.union import XdrUnionGenerator
from xdr_ast import transform_parse_tree, _RpcProgram, Specification
-from xdr_ast import _XdrConstant, _XdrEnum, _XdrPointer
-from xdr_ast import _XdrTypedef, _XdrStruct, _XdrUnion
+from xdr_ast import _XdrEnum, _XdrPointer, _XdrTypedef, _XdrStruct, _XdrUnion
from xdr_parse import xdr_parser, set_xdr_annotate
+from xdr_parse import make_error_handler, XdrParseError
+from xdr_parse import handle_transform_error
logger.setLevel(logging.INFO)
@@ -50,20 +50,24 @@
gen.emit_declaration(definition.value)
-def handle_parse_error(e: UnexpectedInput) -> bool:
- """Simple parse error reporting, no recovery attempted"""
- print(e)
- return True
-
-
def subcmd(args: Namespace) -> int:
"""Generate definitions and declarations"""
set_xdr_annotate(args.annotate)
parser = xdr_parser()
with open(args.filename, encoding="utf-8") as f:
- parse_tree = parser.parse(f.read(), on_error=handle_parse_error)
- ast = transform_parse_tree(parse_tree)
+ source = f.read()
+ try:
+ parse_tree = parser.parse(
+ source, on_error=make_error_handler(source, args.filename)
+ )
+ except XdrParseError:
+ return 1
+ try:
+ ast = transform_parse_tree(parse_tree)
+ except VisitError as e:
+ handle_transform_error(e, source, args.filename)
+ return 1
gen = XdrHeaderTopGenerator(args.language, args.peer)
gen.emit_declaration(args.filename, ast)
diff --git a/tools/net/sunrpc/xdrgen/subcmds/definitions.py b/tools/net/sunrpc/xdrgen/subcmds/definitions.py
index c956e27..a48ca05 100644
--- a/tools/net/sunrpc/xdrgen/subcmds/definitions.py
+++ b/tools/net/sunrpc/xdrgen/subcmds/definitions.py
@@ -8,12 +8,13 @@
from argparse import Namespace
from lark import logger
-from lark.exceptions import UnexpectedInput
+from lark.exceptions import VisitError
from generators.constant import XdrConstantGenerator
from generators.enum import XdrEnumGenerator
from generators.header_bottom import XdrHeaderBottomGenerator
from generators.header_top import XdrHeaderTopGenerator
+from generators.passthru import XdrPassthruGenerator
from generators.pointer import XdrPointerGenerator
from generators.program import XdrProgramGenerator
from generators.typedef import XdrTypedefGenerator
@@ -21,9 +22,11 @@
from generators.union import XdrUnionGenerator
from xdr_ast import transform_parse_tree, Specification
-from xdr_ast import _RpcProgram, _XdrConstant, _XdrEnum, _XdrPointer
+from xdr_ast import _RpcProgram, _XdrConstant, _XdrEnum, _XdrPassthru, _XdrPointer
from xdr_ast import _XdrTypedef, _XdrStruct, _XdrUnion
from xdr_parse import xdr_parser, set_xdr_annotate
+from xdr_parse import make_error_handler, XdrParseError
+from xdr_parse import handle_transform_error
logger.setLevel(logging.INFO)
@@ -45,6 +48,8 @@
gen = XdrStructGenerator(language, peer)
elif isinstance(definition.value, _XdrUnion):
gen = XdrUnionGenerator(language, peer)
+ elif isinstance(definition.value, _XdrPassthru):
+ gen = XdrPassthruGenerator(language, peer)
else:
continue
gen.emit_definition(definition.value)
@@ -64,25 +69,31 @@
gen = XdrStructGenerator(language, peer)
elif isinstance(definition.value, _XdrUnion):
gen = XdrUnionGenerator(language, peer)
+ elif isinstance(definition.value, _RpcProgram):
+ gen = XdrProgramGenerator(language, peer)
else:
continue
gen.emit_maxsize(definition.value)
-def handle_parse_error(e: UnexpectedInput) -> bool:
- """Simple parse error reporting, no recovery attempted"""
- print(e)
- return True
-
-
def subcmd(args: Namespace) -> int:
"""Generate definitions"""
set_xdr_annotate(args.annotate)
parser = xdr_parser()
with open(args.filename, encoding="utf-8") as f:
- parse_tree = parser.parse(f.read(), on_error=handle_parse_error)
- ast = transform_parse_tree(parse_tree)
+ source = f.read()
+ try:
+ parse_tree = parser.parse(
+ source, on_error=make_error_handler(source, args.filename)
+ )
+ except XdrParseError:
+ return 1
+ try:
+ ast = transform_parse_tree(parse_tree)
+ except VisitError as e:
+ handle_transform_error(e, source, args.filename)
+ return 1
gen = XdrHeaderTopGenerator(args.language, args.peer)
gen.emit_definition(args.filename, ast)
diff --git a/tools/net/sunrpc/xdrgen/subcmds/lint.py b/tools/net/sunrpc/xdrgen/subcmds/lint.py
index 36cc437..e1da4963 100644
--- a/tools/net/sunrpc/xdrgen/subcmds/lint.py
+++ b/tools/net/sunrpc/xdrgen/subcmds/lint.py
@@ -8,26 +8,31 @@
from argparse import Namespace
from lark import logger
-from lark.exceptions import UnexpectedInput
+from lark.exceptions import VisitError
-from xdr_parse import xdr_parser
+from xdr_parse import xdr_parser, make_error_handler, XdrParseError
+from xdr_parse import handle_transform_error
from xdr_ast import transform_parse_tree
logger.setLevel(logging.DEBUG)
-def handle_parse_error(e: UnexpectedInput) -> bool:
- """Simple parse error reporting, no recovery attempted"""
- print(e)
- return True
-
-
def subcmd(args: Namespace) -> int:
"""Lexical and syntax check of an XDR specification"""
parser = xdr_parser()
with open(args.filename, encoding="utf-8") as f:
- parse_tree = parser.parse(f.read(), on_error=handle_parse_error)
- transform_parse_tree(parse_tree)
+ source = f.read()
+ try:
+ parse_tree = parser.parse(
+ source, on_error=make_error_handler(source, args.filename)
+ )
+ except XdrParseError:
+ return 1
+ try:
+ transform_parse_tree(parse_tree)
+ except VisitError as e:
+ handle_transform_error(e, source, args.filename)
+ return 1
return 0
diff --git a/tools/net/sunrpc/xdrgen/subcmds/source.py b/tools/net/sunrpc/xdrgen/subcmds/source.py
index 2024954..27e8767 100644
--- a/tools/net/sunrpc/xdrgen/subcmds/source.py
+++ b/tools/net/sunrpc/xdrgen/subcmds/source.py
@@ -8,10 +8,11 @@
from argparse import Namespace
from lark import logger
-from lark.exceptions import UnexpectedInput
+from lark.exceptions import VisitError
from generators.source_top import XdrSourceTopGenerator
from generators.enum import XdrEnumGenerator
+from generators.passthru import XdrPassthruGenerator
from generators.pointer import XdrPointerGenerator
from generators.program import XdrProgramGenerator
from generators.typedef import XdrTypedefGenerator
@@ -19,10 +20,12 @@
from generators.union import XdrUnionGenerator
from xdr_ast import transform_parse_tree, _RpcProgram, Specification
-from xdr_ast import _XdrAst, _XdrEnum, _XdrPointer
+from xdr_ast import _XdrAst, _XdrEnum, _XdrPassthru, _XdrPointer
from xdr_ast import _XdrStruct, _XdrTypedef, _XdrUnion
-from xdr_parse import xdr_parser, set_xdr_annotate
+from xdr_parse import xdr_parser, set_xdr_annotate, set_xdr_enum_validation
+from xdr_parse import make_error_handler, XdrParseError
+from xdr_parse import handle_transform_error
logger.setLevel(logging.INFO)
@@ -72,40 +75,54 @@
gen.emit_source(filename, root)
for definition in root.definitions:
- emit_source_decoder(definition.value, language, "server")
+ if isinstance(definition.value, _XdrPassthru):
+ passthru_gen = XdrPassthruGenerator(language, "server")
+ passthru_gen.emit_decoder(definition.value)
+ else:
+ emit_source_decoder(definition.value, language, "server")
for definition in root.definitions:
- emit_source_encoder(definition.value, language, "server")
+ if not isinstance(definition.value, _XdrPassthru):
+ emit_source_encoder(definition.value, language, "server")
def generate_client_source(filename: str, root: Specification, language: str) -> None:
- """Generate server-side source code"""
+ """Generate client-side source code"""
gen = XdrSourceTopGenerator(language, "client")
gen.emit_source(filename, root)
- print("")
for definition in root.definitions:
- emit_source_encoder(definition.value, language, "client")
+ if isinstance(definition.value, _XdrPassthru):
+ passthru_gen = XdrPassthruGenerator(language, "client")
+ passthru_gen.emit_decoder(definition.value)
+ else:
+ emit_source_encoder(definition.value, language, "client")
for definition in root.definitions:
- emit_source_decoder(definition.value, language, "client")
+ if not isinstance(definition.value, _XdrPassthru):
+ emit_source_decoder(definition.value, language, "client")
# cel: todo: client needs PROC macros
-def handle_parse_error(e: UnexpectedInput) -> bool:
- """Simple parse error reporting, no recovery attempted"""
- print(e)
- return True
-
-
def subcmd(args: Namespace) -> int:
"""Generate encoder and decoder functions"""
set_xdr_annotate(args.annotate)
+ set_xdr_enum_validation(not args.no_enum_validation)
parser = xdr_parser()
with open(args.filename, encoding="utf-8") as f:
- parse_tree = parser.parse(f.read(), on_error=handle_parse_error)
- ast = transform_parse_tree(parse_tree)
+ source = f.read()
+ try:
+ parse_tree = parser.parse(
+ source, on_error=make_error_handler(source, args.filename)
+ )
+ except XdrParseError:
+ return 1
+ try:
+ ast = transform_parse_tree(parse_tree)
+ except VisitError as e:
+ handle_transform_error(e, source, args.filename)
+ return 1
match args.peer:
case "server":
generate_server_source(args.filename, ast, args.language)
diff --git a/tools/net/sunrpc/xdrgen/templates/C/enum/declaration/enum.j2 b/tools/net/sunrpc/xdrgen/templates/C/enum/declaration/enum.j2
index d1405c7..c7ae506 100644
--- a/tools/net/sunrpc/xdrgen/templates/C/enum/declaration/enum.j2
+++ b/tools/net/sunrpc/xdrgen/templates/C/enum/declaration/enum.j2
@@ -1,4 +1,3 @@
{# SPDX-License-Identifier: GPL-2.0 #}
-
bool xdrgen_decode_{{ name }}(struct xdr_stream *xdr, {{ name }} *ptr);
bool xdrgen_encode_{{ name }}(struct xdr_stream *xdr, {{ name }} value);
diff --git a/tools/net/sunrpc/xdrgen/templates/C/enum/decoder/enum.j2 b/tools/net/sunrpc/xdrgen/templates/C/enum/decoder/enum.j2
index 6482984f..735a341 100644
--- a/tools/net/sunrpc/xdrgen/templates/C/enum/decoder/enum.j2
+++ b/tools/net/sunrpc/xdrgen/templates/C/enum/decoder/enum.j2
@@ -14,6 +14,17 @@
if (xdr_stream_decode_u32(xdr, &val) < 0)
return false;
+{% if validate and enumerators %}
+ /* Compiler may optimize to a range check for dense enums */
+ switch (val) {
+{% for e in enumerators %}
+ case {{ e.name }}:
+{% endfor %}
+ break;
+ default:
+ return false;
+ }
+{% endif %}
*ptr = val;
return true;
}
diff --git a/tools/net/sunrpc/xdrgen/templates/C/enum/decoder/enum_be.j2 b/tools/net/sunrpc/xdrgen/templates/C/enum/decoder/enum_be.j2
index 44c391c..82782a5 100644
--- a/tools/net/sunrpc/xdrgen/templates/C/enum/decoder/enum_be.j2
+++ b/tools/net/sunrpc/xdrgen/templates/C/enum/decoder/enum_be.j2
@@ -10,5 +10,25 @@
{% endif %}
xdrgen_decode_{{ name }}(struct xdr_stream *xdr, {{ name }} *ptr)
{
+{% if validate and enumerators %}
+ __be32 raw;
+ u32 val;
+
+ if (xdr_stream_decode_be32(xdr, &raw) < 0)
+ return false;
+ val = be32_to_cpu(raw);
+ /* Compiler may optimize to a range check for dense enums */
+ switch (val) {
+{% for e in enumerators %}
+ case {{ e.name }}:
+{% endfor %}
+ break;
+ default:
+ return false;
+ }
+ *ptr = raw;
+ return true;
+{% else %}
return xdr_stream_decode_be32(xdr, ptr) == 0;
+{% endif %}
}
diff --git a/tools/net/sunrpc/xdrgen/templates/C/enum/definition/close.j2 b/tools/net/sunrpc/xdrgen/templates/C/enum/definition/close.j2
index a07586c..446266a 100644
--- a/tools/net/sunrpc/xdrgen/templates/C/enum/definition/close.j2
+++ b/tools/net/sunrpc/xdrgen/templates/C/enum/definition/close.j2
@@ -1,3 +1,4 @@
{# SPDX-License-Identifier: GPL-2.0 #}
};
+
typedef enum {{ name }} {{ name }};
diff --git a/tools/net/sunrpc/xdrgen/templates/C/enum/definition/close_be.j2 b/tools/net/sunrpc/xdrgen/templates/C/enum/definition/close_be.j2
index 2c18948..cfeee22 100644
--- a/tools/net/sunrpc/xdrgen/templates/C/enum/definition/close_be.j2
+++ b/tools/net/sunrpc/xdrgen/templates/C/enum/definition/close_be.j2
@@ -1,3 +1,4 @@
{# SPDX-License-Identifier: GPL-2.0 #}
};
+
typedef __be32 {{ name }};
diff --git a/tools/net/sunrpc/xdrgen/templates/C/passthru/definition.j2 b/tools/net/sunrpc/xdrgen/templates/C/passthru/definition.j2
new file mode 100644
index 0000000..900c751
--- /dev/null
+++ b/tools/net/sunrpc/xdrgen/templates/C/passthru/definition.j2
@@ -0,0 +1,3 @@
+{# SPDX-License-Identifier: GPL-2.0 #}
+
+{{ content }}
diff --git a/tools/net/sunrpc/xdrgen/templates/C/passthru/source.j2 b/tools/net/sunrpc/xdrgen/templates/C/passthru/source.j2
new file mode 100644
index 0000000..900c751
--- /dev/null
+++ b/tools/net/sunrpc/xdrgen/templates/C/passthru/source.j2
@@ -0,0 +1,3 @@
+{# SPDX-License-Identifier: GPL-2.0 #}
+
+{{ content }}
diff --git a/tools/net/sunrpc/xdrgen/templates/C/program/decoder/argument.j2 b/tools/net/sunrpc/xdrgen/templates/C/program/decoder/argument.j2
index 0b1709c..19b219d 100644
--- a/tools/net/sunrpc/xdrgen/templates/C/program/decoder/argument.j2
+++ b/tools/net/sunrpc/xdrgen/templates/C/program/decoder/argument.j2
@@ -14,7 +14,11 @@
{% if argument == 'void' %}
return xdrgen_decode_void(xdr);
{% else %}
+{% if argument in structs %}
struct {{ argument }} *argp = rqstp->rq_argp;
+{% else %}
+ {{ argument }} *argp = rqstp->rq_argp;
+{% endif %}
return xdrgen_decode_{{ argument }}(xdr, argp);
{% endif %}
diff --git a/tools/net/sunrpc/xdrgen/templates/C/program/definition/program.j2 b/tools/net/sunrpc/xdrgen/templates/C/program/definition/program.j2
new file mode 100644
index 0000000..320663f
--- /dev/null
+++ b/tools/net/sunrpc/xdrgen/templates/C/program/definition/program.j2
@@ -0,0 +1,5 @@
+{# SPDX-License-Identifier: GPL-2.0 #}
+
+#ifndef {{ name }}
+#define {{ name }} ({{ value }})
+#endif
diff --git a/tools/net/sunrpc/xdrgen/templates/C/program/encoder/result.j2 b/tools/net/sunrpc/xdrgen/templates/C/program/encoder/result.j2
index 6fc61a5..746592c 100644
--- a/tools/net/sunrpc/xdrgen/templates/C/program/encoder/result.j2
+++ b/tools/net/sunrpc/xdrgen/templates/C/program/encoder/result.j2
@@ -14,8 +14,14 @@
{% if result == 'void' %}
return xdrgen_encode_void(xdr);
{% else %}
+{% if result in structs %}
struct {{ result }} *resp = rqstp->rq_resp;
return xdrgen_encode_{{ result }}(xdr, resp);
+{% else %}
+ {{ result }} *resp = rqstp->rq_resp;
+
+ return xdrgen_encode_{{ result }}(xdr, *resp);
+{% endif %}
{% endif %}
}
diff --git a/tools/net/sunrpc/xdrgen/templates/C/program/maxsize/max_args.j2 b/tools/net/sunrpc/xdrgen/templates/C/program/maxsize/max_args.j2
new file mode 100644
index 0000000..9f3bfb4
--- /dev/null
+++ b/tools/net/sunrpc/xdrgen/templates/C/program/maxsize/max_args.j2
@@ -0,0 +1,3 @@
+{# SPDX-License-Identifier: GPL-2.0 #}
+#define {{ '{:<31}'.format(macro) }} \
+ ({{ width }})
diff --git a/tools/net/sunrpc/xdrgen/templates/C/source_top/client.j2 b/tools/net/sunrpc/xdrgen/templates/C/source_top/client.j2
index c5518c5..df3598c3 100644
--- a/tools/net/sunrpc/xdrgen/templates/C/source_top/client.j2
+++ b/tools/net/sunrpc/xdrgen/templates/C/source_top/client.j2
@@ -8,6 +8,5 @@
#include <linux/sunrpc/xdr.h>
#include <linux/sunrpc/xdrgen/_defs.h>
#include <linux/sunrpc/xdrgen/_builtins.h>
-#include <linux/sunrpc/xdrgen/nlm4.h>
#include <linux/sunrpc/clnt.h>
diff --git a/tools/net/sunrpc/xdrgen/templates/C/union/decoder/bool_spec.j2 b/tools/net/sunrpc/xdrgen/templates/C/union/decoder/bool_spec.j2
new file mode 100644
index 0000000..05ad491
--- /dev/null
+++ b/tools/net/sunrpc/xdrgen/templates/C/union/decoder/bool_spec.j2
@@ -0,0 +1,7 @@
+{# SPDX-License-Identifier: GPL-2.0 #}
+{% if annotate %}
+ /* discriminant {{ name }} */
+{% endif %}
+ if (!xdrgen_decode_{{ type }}(xdr, &ptr->{{ name }}))
+ return false;
+ if (ptr->{{ name }}) {
diff --git a/tools/net/sunrpc/xdrgen/templates/C/union/definition/close.j2 b/tools/net/sunrpc/xdrgen/templates/C/union/definition/close.j2
index 01d716d..5fc1937 100644
--- a/tools/net/sunrpc/xdrgen/templates/C/union/definition/close.j2
+++ b/tools/net/sunrpc/xdrgen/templates/C/union/definition/close.j2
@@ -3,6 +3,7 @@
};
{%- if name in public_apis %}
+
bool xdrgen_decode_{{ name }}(struct xdr_stream *xdr, struct {{ name }} *ptr);
bool xdrgen_encode_{{ name }}(struct xdr_stream *xdr, const struct {{ name }} *ptr);
{%- endif -%}
diff --git a/tools/net/sunrpc/xdrgen/templates/C/union/encoder/bool_spec.j2 b/tools/net/sunrpc/xdrgen/templates/C/union/encoder/bool_spec.j2
new file mode 100644
index 0000000..e5135ed
--- /dev/null
+++ b/tools/net/sunrpc/xdrgen/templates/C/union/encoder/bool_spec.j2
@@ -0,0 +1,7 @@
+{# SPDX-License-Identifier: GPL-2.0 #}
+{% if annotate %}
+ /* discriminant {{ name }} */
+{% endif %}
+ if (!xdrgen_encode_{{ type }}(xdr, ptr->{{ name }}))
+ return false;
+ if (ptr->{{ name }}) {
diff --git a/tools/net/sunrpc/xdrgen/xdr_ast.py b/tools/net/sunrpc/xdrgen/xdr_ast.py
index 5233e73..14bff94 100644
--- a/tools/net/sunrpc/xdrgen/xdr_ast.py
+++ b/tools/net/sunrpc/xdrgen/xdr_ast.py
@@ -34,6 +34,8 @@
symbolic_widths = {
"void": ["XDR_void"],
"bool": ["XDR_bool"],
+ "short": ["XDR_short"],
+ "unsigned_short": ["XDR_unsigned_short"],
"int": ["XDR_int"],
"unsigned_int": ["XDR_unsigned_int"],
"long": ["XDR_long"],
@@ -48,6 +50,8 @@
max_widths = {
"void": 0,
"bool": 1,
+ "short": 1,
+ "unsigned_short": 1,
"int": 1,
"unsigned_int": 1,
"long": 1,
@@ -326,8 +330,6 @@
"""An XDR enum definition"""
name: str
- minimum: int
- maximum: int
enumerators: List[_XdrEnumerator]
def max_width(self) -> int:
@@ -515,6 +517,13 @@
@dataclass
+class _XdrPassthru(_XdrAst):
+ """Passthrough line to emit verbatim in output"""
+
+ content: str
+
+
+@dataclass
class Definition(_XdrAst, ast_utils.WithMeta):
"""Corresponds to 'definition' in the grammar"""
@@ -568,8 +577,6 @@
value = children[1].value
return _XdrConstant(name, value)
- # cel: Python can compute a min() and max() for the enumerator values
- # so that the generated code can perform proper range checking.
def enum(self, children):
"""Instantiate one _XdrEnum object"""
enum_name = children[0].symbol
@@ -583,7 +590,7 @@
enumerators.append(_XdrEnumerator(name, value))
i = i + 2
- return _XdrEnum(enum_name, 0, 0, enumerators)
+ return _XdrEnum(enum_name, enumerators)
def fixed_length_opaque(self, children):
"""Instantiate one _XdrFixedLengthOpaque declaration object"""
@@ -738,14 +745,42 @@
raise NotImplementedError("Directive not supported")
return _Pragma()
+ def passthru_def(self, children):
+ """Instantiate one _XdrPassthru object"""
+ token = children[0]
+ content = token.value[1:]
+ return _XdrPassthru(content)
+
transformer = ast_utils.create_transformer(this_module, ParseToAst())
+def _merge_consecutive_passthru(definitions: List[Definition]) -> List[Definition]:
+ """Merge consecutive passthru definitions into single nodes"""
+ result = []
+ i = 0
+ while i < len(definitions):
+ if isinstance(definitions[i].value, _XdrPassthru):
+ lines = [definitions[i].value.content]
+ meta = definitions[i].meta
+ j = i + 1
+ while j < len(definitions) and isinstance(definitions[j].value, _XdrPassthru):
+ lines.append(definitions[j].value.content)
+ j += 1
+ merged = _XdrPassthru("\n".join(lines))
+ result.append(Definition(meta, merged))
+ i = j
+ else:
+ result.append(definitions[i])
+ i += 1
+ return result
+
+
def transform_parse_tree(parse_tree):
"""Transform productions into an abstract syntax tree"""
-
- return transformer.transform(parse_tree)
+ ast = transformer.transform(parse_tree)
+ ast.definitions = _merge_consecutive_passthru(ast.definitions)
+ return ast
def get_header_name() -> str:
diff --git a/tools/net/sunrpc/xdrgen/xdr_parse.py b/tools/net/sunrpc/xdrgen/xdr_parse.py
index 964b44e..241e96c 100644
--- a/tools/net/sunrpc/xdrgen/xdr_parse.py
+++ b/tools/net/sunrpc/xdrgen/xdr_parse.py
@@ -3,12 +3,43 @@
"""Common parsing code for xdrgen"""
+import sys
+from typing import Callable
+
from lark import Lark
+from lark.exceptions import UnexpectedInput, UnexpectedToken, VisitError
# Set to True to emit annotation comments in generated source
annotate = False
+# Set to True to emit enum value validation in decoders
+enum_validation = True
+
+# Map internal Lark token names to human-readable names
+TOKEN_NAMES = {
+ "__ANON_0": "identifier",
+ "__ANON_1": "number",
+ "SEMICOLON": "';'",
+ "LBRACE": "'{'",
+ "RBRACE": "'}'",
+ "LPAR": "'('",
+ "RPAR": "')'",
+ "LSQB": "'['",
+ "RSQB": "']'",
+ "LESSTHAN": "'<'",
+ "MORETHAN": "'>'",
+ "EQUAL": "'='",
+ "COLON": "':'",
+ "COMMA": "','",
+ "STAR": "'*'",
+ "$END": "end of file",
+}
+
+
+class XdrParseError(Exception):
+ """Raised when XDR parsing fails"""
+
def set_xdr_annotate(set_it: bool) -> None:
"""Set 'annotate' if --annotate was specified on the command line"""
@@ -21,6 +52,113 @@
return annotate
+def set_xdr_enum_validation(set_it: bool) -> None:
+ """Set 'enum_validation' based on command line options"""
+ global enum_validation
+ enum_validation = set_it
+
+
+def get_xdr_enum_validation() -> bool:
+ """Return True when enum validation is enabled for decoder generation"""
+ return enum_validation
+
+
+def make_error_handler(source: str, filename: str) -> Callable[[UnexpectedInput], bool]:
+ """Create an error handler that reports the first parse error and aborts.
+
+ Args:
+ source: The XDR source text being parsed
+ filename: The name of the file being parsed
+
+ Returns:
+ An error handler function for use with Lark's on_error parameter
+ """
+ lines = source.splitlines()
+
+ def handle_parse_error(e: UnexpectedInput) -> bool:
+ """Report a parse error with context and abort parsing"""
+ line_num = e.line
+ column = e.column
+ line_text = lines[line_num - 1] if 0 < line_num <= len(lines) else ""
+
+ # Build the error message
+ msg_parts = [f"{filename}:{line_num}:{column}: parse error"]
+
+ # Show what was found vs what was expected
+ if isinstance(e, UnexpectedToken):
+ token = e.token
+ if token.type == "__ANON_0":
+ found = f"identifier '{token.value}'"
+ elif token.type == "__ANON_1":
+ found = f"number '{token.value}'"
+ else:
+ found = f"'{token.value}'"
+ msg_parts.append(f"Unexpected {found}")
+
+ # Provide helpful expected tokens list
+ expected = e.expected
+ if expected:
+ readable = [
+ TOKEN_NAMES.get(exp, exp.lower().replace("_", " "))
+ for exp in sorted(expected)
+ ]
+ if len(readable) == 1:
+ msg_parts.append(f"Expected {readable[0]}")
+ elif len(readable) <= 4:
+ msg_parts.append(f"Expected one of: {', '.join(readable)}")
+ else:
+ msg_parts.append(str(e).split("\n")[0])
+
+ # Show the offending line with a caret pointing to the error
+ msg_parts.append("")
+ msg_parts.append(f" {line_text}")
+ prefix = line_text[: column - 1].expandtabs()
+ msg_parts.append(f" {' ' * len(prefix)}^")
+
+ sys.stderr.write("\n".join(msg_parts) + "\n")
+ raise XdrParseError()
+
+ return handle_parse_error
+
+
+def handle_transform_error(e: VisitError, source: str, filename: str) -> None:
+ """Report a transform error with context.
+
+ Args:
+ e: The VisitError from Lark's transformer
+ source: The XDR source text being parsed
+ filename: The name of the file being parsed
+ """
+ lines = source.splitlines()
+
+ # Extract position from the tree node if available
+ line_num = 0
+ column = 0
+ if hasattr(e.obj, "meta") and e.obj.meta:
+ line_num = e.obj.meta.line
+ column = e.obj.meta.column
+
+ line_text = lines[line_num - 1] if 0 < line_num <= len(lines) else ""
+
+ # Build the error message
+ msg_parts = [f"{filename}:{line_num}:{column}: semantic error"]
+
+ # The original exception is typically a KeyError for undefined types
+ if isinstance(e.orig_exc, KeyError):
+ msg_parts.append(f"Undefined type '{e.orig_exc.args[0]}'")
+ else:
+ msg_parts.append(str(e.orig_exc))
+
+ # Show the offending line with a caret pointing to the error
+ if line_text:
+ msg_parts.append("")
+ msg_parts.append(f" {line_text}")
+ prefix = line_text[: column - 1].expandtabs()
+ msg_parts.append(f" {' ' * len(prefix)}^")
+
+ sys.stderr.write("\n".join(msg_parts) + "\n")
+
+
def xdr_parser() -> Lark:
"""Return a Lark parser instance configured with the XDR language grammar"""
diff --git a/tools/net/sunrpc/xdrgen/xdrgen b/tools/net/sunrpc/xdrgen/xdrgen
index 3afd054..b2fb43f 100755
--- a/tools/net/sunrpc/xdrgen/xdrgen
+++ b/tools/net/sunrpc/xdrgen/xdrgen
@@ -123,6 +123,12 @@
help="Generate code for client or server side",
type=str,
)
+ source_parser.add_argument(
+ "--no-enum-validation",
+ action="store_true",
+ default=False,
+ help="Disable enum value validation in decoders",
+ )
source_parser.add_argument("filename", help="File containing an XDR specification")
source_parser.set_defaults(func=source.subcmd)
@@ -133,7 +139,5 @@
try:
if __name__ == "__main__":
sys.exit(main())
-except SystemExit:
- sys.exit(0)
except (KeyboardInterrupt, BrokenPipeError):
sys.exit(1)