From 4f4317c13a40194940acf4a71670179c4faca2b5 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 1 Dec 2020 09:53:23 -0500 Subject: btrfs: fix error handling in commit_fs_roots While doing error injection I would sometimes get a corrupt file system. This is because I was injecting errors at btrfs_search_slot, but would only do it one time per stack. This uncovered a problem in commit_fs_roots, where if we get an error we would just break. However we're in a nested loop, the first loop being a loop to find all the dirty fs roots, and then subsequent root updates would succeed clearing the error value. This isn't likely to happen in real scenarios, however we could potentially get a random ENOMEM once and then not again, and we'd end up with a corrupted file system. Fix this by moving the error checking around a bit to the main loop, as this is the only place where something will fail, and return the error as soon as it occurs. With this patch my reproducer no longer corrupts the file system. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 6af7f2bf92de..fbf93067642a 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1319,7 +1319,6 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) struct btrfs_root *gang[8]; int i; int ret; - int err = 0; spin_lock(&fs_info->fs_roots_radix_lock); while (1) { @@ -1331,6 +1330,8 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) break; for (i = 0; i < ret; i++) { struct btrfs_root *root = gang[i]; + int ret2; + radix_tree_tag_clear(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, BTRFS_ROOT_TRANS_TAG); @@ -1350,17 +1351,17 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) root->node); } - err = btrfs_update_root(trans, fs_info->tree_root, + ret2 = btrfs_update_root(trans, fs_info->tree_root, &root->root_key, &root->root_item); + if (ret2) + return ret2; spin_lock(&fs_info->fs_roots_radix_lock); - if (err) - break; btrfs_qgroup_free_meta_all_pertrans(root); } } spin_unlock(&fs_info->fs_roots_radix_lock); - return err; + return 0; } /* -- cgit v1.2.3-70-g09d2 From 3cc64e7ebfb0d7faaba2438334c43466955a96e8 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Fri, 20 Nov 2020 09:08:04 +0800 Subject: btrfs: clarify error returns values in __load_free_space_cache Return value in __load_free_space_cache is not properly set after (unlikely) memory allocation failures and 0 is returned instead. This is not a problem for the caller load_free_space_cache because only value 1 is considered as 'cache loaded' but for clarity it's better to set the errors accordingly. Fixes: a67509c30079 ("Btrfs: add a io_ctl struct and helpers for dealing with the space cache") Reported-by: Hulk Robot Signed-off-by: Zhihao Cheng Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 4d8897879c9c..71d0d14bc18b 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -775,8 +775,10 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, while (num_entries) { e = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); - if (!e) + if (!e) { + ret = -ENOMEM; goto free_cache; + } ret = io_ctl_read_entry(&io_ctl, e, &type); if (ret) { @@ -785,6 +787,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, } if (!e->bytes) { + ret = -1; kmem_cache_free(btrfs_free_space_cachep, e); goto free_cache; } @@ -805,6 +808,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, e->bitmap = kmem_cache_zalloc( btrfs_free_space_bitmap_cachep, GFP_NOFS); if (!e->bitmap) { + ret = -ENOMEM; kmem_cache_free( btrfs_free_space_cachep, e); goto free_cache; -- cgit v1.2.3-70-g09d2 From 149716570be98185150860fe922bf89ed080bd3c Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 10 Dec 2020 10:38:32 +0200 Subject: btrfs: cleanup local variables in btrfs_file_write_iter First replace all inode instances with a pointer to btrfs_inode. This removes multiple invocations of the BTRFS_I macro, subsequently remove 2 local variables as they are called only once and simply refer to them directly. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 0e41459b8de6..e65223e3510d 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1997,9 +1997,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; - struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_inode *inode = BTRFS_I(file_inode(file)); ssize_t num_written = 0; const bool sync = iocb->ki_flags & IOCB_DSYNC; @@ -2008,7 +2006,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, * have opened a file as writable, we have to stop this write operation * to ensure consistency. */ - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + if (test_bit(BTRFS_FS_STATE_ERROR, &inode->root->fs_info->fs_state)) return -EROFS; if (!(iocb->ki_flags & IOCB_DIRECT) && @@ -2016,7 +2014,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, return -EOPNOTSUPP; if (sync) - atomic_inc(&BTRFS_I(inode)->sync_writers); + atomic_inc(&inode->sync_writers); if (iocb->ki_flags & IOCB_DIRECT) num_written = btrfs_direct_write(iocb, from); @@ -2028,14 +2026,14 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, * otherwise subsequent syncs to a file that's been synced in this * transaction will appear to have already occurred. */ - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->last_sub_trans = root->log_transid; - spin_unlock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); + inode->last_sub_trans = inode->root->log_transid; + spin_unlock(&inode->lock); if (num_written > 0) num_written = generic_write_sync(iocb, num_written); if (sync) - atomic_dec(&BTRFS_I(inode)->sync_writers); + atomic_dec(&inode->sync_writers); current->backing_dev_info = NULL; return num_written; -- cgit v1.2.3-70-g09d2 From 453e4873869f5e967188d8b018efc34a57eed44f Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 7 Dec 2020 17:32:32 +0200 Subject: btrfs: rename btrfs_find_highest_objectid to btrfs_init_root_free_objectid This function is used to initialize the in-memory btrfs_root::highest_objectid member, which is used to get an available objectid. Rename it to better reflect its semantics. Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 12 +++++------- fs/btrfs/disk-io.h | 2 +- fs/btrfs/tree-log.c | 3 +-- 3 files changed, 7 insertions(+), 10 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6b35b7e88136..47646a79d3fc 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1367,8 +1367,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) } mutex_lock(&root->objectid_mutex); - ret = btrfs_find_highest_objectid(root, - &root->highest_objectid); + ret = btrfs_init_root_free_objectid(root); if (ret) { mutex_unlock(&root->objectid_mutex); goto fail; @@ -2646,8 +2645,7 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) * No need to hold btrfs_root::objectid_mutex since the fs * hasn't been fully initialised and we are the only user */ - ret = btrfs_find_highest_objectid(tree_root, - &tree_root->highest_objectid); + ret = btrfs_init_root_free_objectid(tree_root); if (ret < 0) { handle_error = true; continue; @@ -4745,7 +4743,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) return 0; } -int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid) +int btrfs_init_root_free_objectid(struct btrfs_root *root) { struct btrfs_path *path; int ret; @@ -4769,10 +4767,10 @@ int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid) slot = path->slots[0] - 1; l = path->nodes[0]; btrfs_item_key_to_cpu(l, &found_key, slot); - *objectid = max_t(u64, found_key.objectid, + root->highest_objectid = max_t(u64, found_key.objectid, BTRFS_FIRST_FREE_OBJECTID - 1); } else { - *objectid = BTRFS_FIRST_FREE_OBJECTID - 1; + root->highest_objectid = BTRFS_FIRST_FREE_OBJECTID - 1; } ret = 0; error: diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index e45057c0c016..5e5bc603fbdf 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -134,7 +134,7 @@ int btree_lock_page_hook(struct page *page, void *data, void (*flush_fn)(void *)); int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid); -int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid); +int btrfs_init_root_free_objectid(struct btrfs_root *root); int __init btrfs_end_io_wq_init(void); void __cold btrfs_end_io_wq_exit(void); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 254c2ee43aae..8ee0700a980f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -6307,8 +6307,7 @@ again: * root->objectid_mutex is not acquired as log replay * could only happen during mount. */ - ret = btrfs_find_highest_objectid(root, - &root->highest_objectid); + ret = btrfs_init_root_free_objectid(root); } wc.replay_dest->log_root = NULL; -- cgit v1.2.3-70-g09d2 From 543068a217a877bb6fa831fc448c9cc131db4feb Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 7 Dec 2020 17:32:33 +0200 Subject: btrfs: rename btrfs_find_free_objectid to btrfs_get_free_objectid This better reflects the semantics of the function i.e no search is performed whatsoever. Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 +- fs/btrfs/disk-io.h | 2 +- fs/btrfs/free-space-cache.c | 2 +- fs/btrfs/inode.c | 12 ++++++------ fs/btrfs/ioctl.c | 2 +- fs/btrfs/relocation.c | 2 +- fs/btrfs/transaction.c | 2 +- 7 files changed, 12 insertions(+), 12 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 47646a79d3fc..98384d3e41ac 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4778,7 +4778,7 @@ error: return ret; } -int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid) +int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid) { int ret; mutex_lock(&root->objectid_mutex); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 5e5bc603fbdf..9f4a2a1e3d36 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -133,7 +133,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, int btree_lock_page_hook(struct page *page, void *data, void (*flush_fn)(void *)); int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); -int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid); +int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid); int btrfs_init_root_free_objectid(struct btrfs_root *root); int __init btrfs_end_io_wq_init(void); void __cold btrfs_end_io_wq_exit(void); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 71d0d14bc18b..fd6ddd6b8165 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -198,7 +198,7 @@ int create_free_space_inode(struct btrfs_trans_handle *trans, int ret; u64 ino; - ret = btrfs_find_free_objectid(trans->fs_info->tree_root, &ino); + ret = btrfs_get_free_objectid(trans->fs_info->tree_root, &ino); if (ret < 0) return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a8e0a6b038d3..0fe4df05006d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6371,7 +6371,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, if (IS_ERR(trans)) return PTR_ERR(trans); - err = btrfs_find_free_objectid(root, &objectid); + err = btrfs_get_free_objectid(root, &objectid); if (err) goto out_unlock; @@ -6435,7 +6435,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, if (IS_ERR(trans)) return PTR_ERR(trans); - err = btrfs_find_free_objectid(root, &objectid); + err = btrfs_get_free_objectid(root, &objectid); if (err) goto out_unlock; @@ -6579,7 +6579,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (IS_ERR(trans)) return PTR_ERR(trans); - err = btrfs_find_free_objectid(root, &objectid); + err = btrfs_get_free_objectid(root, &objectid); if (err) goto out_fail; @@ -9079,7 +9079,7 @@ static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans, u64 objectid; u64 index; - ret = btrfs_find_free_objectid(root, &objectid); + ret = btrfs_get_free_objectid(root, &objectid); if (ret) return ret; @@ -9575,7 +9575,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, if (IS_ERR(trans)) return PTR_ERR(trans); - err = btrfs_find_free_objectid(root, &objectid); + err = btrfs_get_free_objectid(root, &objectid); if (err) goto out_unlock; @@ -9909,7 +9909,7 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_find_free_objectid(root, &objectid); + ret = btrfs_get_free_objectid(root, &objectid); if (ret) goto out; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index dde49a791f3e..ec83803800c5 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -613,7 +613,7 @@ static noinline int create_subvol(struct inode *dir, if (!root_item) return -ENOMEM; - ret = btrfs_find_free_objectid(fs_info->tree_root, &objectid); + ret = btrfs_get_free_objectid(fs_info->tree_root, &objectid); if (ret) goto fail_free; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index df63ef64c5c0..2698805ebd26 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3434,7 +3434,7 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, return ERR_CAST(trans); } - err = btrfs_find_free_objectid(root, &objectid); + err = btrfs_get_free_objectid(root, &objectid); if (err) goto out; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index fbf93067642a..3bcb5444536e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1526,7 +1526,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ASSERT(pending->root_item); new_root_item = pending->root_item; - pending->error = btrfs_find_free_objectid(tree_root, &objectid); + pending->error = btrfs_get_free_objectid(tree_root, &objectid); if (pending->error) goto no_free_objectid; -- cgit v1.2.3-70-g09d2 From 6b8fad576a3c8f822a888873c5acdfb31de53c4c Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 7 Dec 2020 17:32:35 +0200 Subject: btrfs: rename btrfs_root::highest_objectid to free_objectid This reflects the true purpose of the member as it's being used solely in context where a new objectid is being allocated. Future changes will also change the way it's being used to closely follow this semantics. Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/disk-io.c | 14 +++++++------- fs/btrfs/ioctl.c | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4debdbdde2ab..dc77aac2476c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1104,7 +1104,7 @@ struct btrfs_root { u32 type; - u64 highest_objectid; + u64 free_objectid; struct btrfs_key defrag_progress; struct btrfs_key defrag_max; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 98384d3e41ac..a5ae0bbd983e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1016,7 +1016,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, root->orphan_cleanup_state = 0; root->last_trans = 0; - root->highest_objectid = 0; + root->free_objectid = 0; root->nr_delalloc_inodes = 0; root->nr_ordered_extents = 0; root->inode_tree = RB_ROOT; @@ -1373,7 +1373,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) goto fail; } - ASSERT(root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID); + ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID); mutex_unlock(&root->objectid_mutex); @@ -2651,7 +2651,7 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) continue; } - ASSERT(tree_root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID); + ASSERT(tree_root->free_objectid <= BTRFS_LAST_FREE_OBJECTID); ret = btrfs_read_roots(fs_info); if (ret < 0) { @@ -4767,10 +4767,10 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root) slot = path->slots[0] - 1; l = path->nodes[0]; btrfs_item_key_to_cpu(l, &found_key, slot); - root->highest_objectid = max_t(u64, found_key.objectid, + root->free_objectid = max_t(u64, found_key.objectid, BTRFS_FIRST_FREE_OBJECTID - 1); } else { - root->highest_objectid = BTRFS_FIRST_FREE_OBJECTID - 1; + root->free_objectid = BTRFS_FIRST_FREE_OBJECTID - 1; } ret = 0; error: @@ -4783,7 +4783,7 @@ int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid) int ret; mutex_lock(&root->objectid_mutex); - if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) { + if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) { btrfs_warn(root->fs_info, "the objectid of root %llu reaches its highest value", root->root_key.objectid); @@ -4791,7 +4791,7 @@ int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid) goto out; } - *objectid = ++root->highest_objectid; + *objectid = ++root->free_objectid; ret = 0; out: mutex_unlock(&root->objectid_mutex); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ec83803800c5..2041c4b6fd0e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -725,7 +725,7 @@ static noinline int create_subvol(struct inode *dir, } mutex_lock(&new_root->objectid_mutex); - new_root->highest_objectid = new_dirid; + new_root->free_objectid = new_dirid; mutex_unlock(&new_root->objectid_mutex); /* -- cgit v1.2.3-70-g09d2 From 23125104d8485505cd19581025a3d6fc14e9945a Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 7 Dec 2020 17:32:36 +0200 Subject: btrfs: make btrfs_root::free_objectid hold the next available objectid Adjust the way free_objectid is being initialized, it now stores BTRFS_FIRST_FREE_OBJECTID rather than the, somewhat arbitrary, BTRFS_FIRST_FREE_OBJECTID - 1. This change also has the added benefit that now it becomes unnecessary to explicitly initialize free_objectid for a newly create fs root. Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 8 ++++---- fs/btrfs/inode.c | 8 ++++++-- fs/btrfs/ioctl.c | 4 ---- 3 files changed, 10 insertions(+), 10 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a5ae0bbd983e..5473bed6a7e8 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4767,10 +4767,10 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root) slot = path->slots[0] - 1; l = path->nodes[0]; btrfs_item_key_to_cpu(l, &found_key, slot); - root->free_objectid = max_t(u64, found_key.objectid, - BTRFS_FIRST_FREE_OBJECTID - 1); + root->free_objectid = max_t(u64, found_key.objectid + 1, + BTRFS_FIRST_FREE_OBJECTID); } else { - root->free_objectid = BTRFS_FIRST_FREE_OBJECTID - 1; + root->free_objectid = BTRFS_FIRST_FREE_OBJECTID; } ret = 0; error: @@ -4791,7 +4791,7 @@ int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid) goto out; } - *objectid = ++root->free_objectid; + *objectid = root->free_objectid++; ret = 0; out: mutex_unlock(&root->objectid_mutex); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0fe4df05006d..356905d97656 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8598,9 +8598,13 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct inode *inode; int err; u64 index = 0; + u64 ino; + + err = btrfs_get_free_objectid(new_root, &ino); + if (err < 0) + return err; - inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, - new_dirid, new_dirid, + inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, ino, ino, S_IFDIR | (~current_umask() & S_IRWXUGO), &index); if (IS_ERR(inode)) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2041c4b6fd0e..d8422074da02 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -724,10 +724,6 @@ static noinline int create_subvol(struct inode *dir, goto fail; } - mutex_lock(&new_root->objectid_mutex); - new_root->free_objectid = new_dirid; - mutex_unlock(&new_root->objectid_mutex); - /* * insert the directory item */ -- cgit v1.2.3-70-g09d2 From 69948022c9261a87c3c256bfa21c132f5099c690 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 7 Dec 2020 17:32:37 +0200 Subject: btrfs: remove new_dirid argument from btrfs_create_subvol_root It's no longer used. While at it also remove new_dirid in create_subvol as it's used in a single place and open code it. No functional changes. Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 3 +-- fs/btrfs/inode.c | 3 +-- fs/btrfs/ioctl.c | 5 ++--- 3 files changed, 4 insertions(+), 7 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index dc77aac2476c..f5c636b29451 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3107,8 +3107,7 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state); int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct btrfs_root *new_root, - struct btrfs_root *parent_root, - u64 new_dirid); + struct btrfs_root *parent_root); void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, unsigned *bits); void btrfs_clear_delalloc_extent(struct inode *inode, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 356905d97656..af5558f87243 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8592,8 +8592,7 @@ out: */ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct btrfs_root *new_root, - struct btrfs_root *parent_root, - u64 new_dirid) + struct btrfs_root *parent_root) { struct inode *inode; int err; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d8422074da02..5b9b0a390f0e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -606,7 +606,6 @@ static noinline int create_subvol(struct inode *dir, int err; dev_t anon_dev = 0; u64 objectid; - u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; u64 index = 0; root_item = kzalloc(sizeof(*root_item), GFP_KERNEL); @@ -693,7 +692,7 @@ static noinline int create_subvol(struct inode *dir, free_extent_buffer(leaf); leaf = NULL; - btrfs_set_root_dirid(root_item, new_dirid); + btrfs_set_root_dirid(root_item, BTRFS_FIRST_FREE_OBJECTID); key.objectid = objectid; key.offset = 0; @@ -716,7 +715,7 @@ static noinline int create_subvol(struct inode *dir, btrfs_record_root_in_trans(trans, new_root); - ret = btrfs_create_subvol_root(trans, new_root, root, new_dirid); + ret = btrfs_create_subvol_root(trans, new_root, root); btrfs_put_root(new_root); if (ret) { /* We potentially lose an unused inode item here */ -- cgit v1.2.3-70-g09d2 From f75e2b79b5ba9dd3e0899840a329c3da02dc8937 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 16 Dec 2020 11:18:43 -0500 Subject: btrfs: allow error injection for btrfs_search_slot and btrfs_cow_block The following patches are going to address error handling in relocation, in order to test those patches I need to be able to inject errors in btrfs_search_slot and btrfs_cow_block, as we call both of these pretty often in different cases during relocation. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index cc89b63d65a4..56e132d825a2 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1494,6 +1494,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, return ret; } +ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO); /* * helper function for defrag to decide if two blocks pointed to by a @@ -2821,6 +2822,7 @@ done: btrfs_release_path(p); return ret; } +ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO); /* * Like btrfs_search_slot, this looks for a key in the given tree. It uses the -- cgit v1.2.3-70-g09d2 From 1fec12a560033ebe8fa6857dd3cbf9677371fbee Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 16 Dec 2020 11:18:45 -0500 Subject: btrfs: noinline btrfs_should_cancel_balance I was attempting to reproduce a problem that Zygo hit, but my error injection wasn't firing for a few of the common calls to btrfs_should_cancel_balance. This is because the compiler decided to inline it at these spots. Keep this from happening by explicitly marking the function as noinline so that error injection will always work. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 2698805ebd26..8e51b39cbfbb 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2615,7 +2615,7 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end, /* * Allow error injection to test balance cancellation */ -int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info) +noinline int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info) { return atomic_read(&fs_info->balance_cancel_req) || fatal_signal_pending(current); -- cgit v1.2.3-70-g09d2 From 0d73a11c62642a25b688d09ae04b3b1f1b58ebb9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 16 Dec 2020 11:18:46 -0500 Subject: btrfs: ref-verify: pass down tree block level when building refs I noticed that sometimes I would have the wrong level printed out with ref-verify while testing some error injection related problems. This is because we only get the level from the main extent item, but our references could go off the current leaf into another, and at that point we lose our level. Fix this by keeping track of the last tree block level that we found, the same way we keep track of our bytenr and num_bytes, in case we happen to wander into another leaf while still processing the references for a bytenr. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ref-verify.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 4b9b6c52a83b..409b02566b25 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -495,14 +495,15 @@ static int process_extent_item(struct btrfs_fs_info *fs_info, } static int process_leaf(struct btrfs_root *root, - struct btrfs_path *path, u64 *bytenr, u64 *num_bytes) + struct btrfs_path *path, u64 *bytenr, u64 *num_bytes, + int *tree_block_level) { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf = path->nodes[0]; struct btrfs_extent_data_ref *dref; struct btrfs_shared_data_ref *sref; u32 count; - int i = 0, tree_block_level = 0, ret = 0; + int i = 0, ret = 0; struct btrfs_key key; int nritems = btrfs_header_nritems(leaf); @@ -515,15 +516,15 @@ static int process_leaf(struct btrfs_root *root, case BTRFS_METADATA_ITEM_KEY: *bytenr = key.objectid; ret = process_extent_item(fs_info, path, &key, i, - &tree_block_level); + tree_block_level); break; case BTRFS_TREE_BLOCK_REF_KEY: ret = add_tree_block(fs_info, key.offset, 0, - key.objectid, tree_block_level); + key.objectid, *tree_block_level); break; case BTRFS_SHARED_BLOCK_REF_KEY: ret = add_tree_block(fs_info, 0, key.offset, - key.objectid, tree_block_level); + key.objectid, *tree_block_level); break; case BTRFS_EXTENT_DATA_REF_KEY: dref = btrfs_item_ptr(leaf, i, @@ -549,7 +550,8 @@ static int process_leaf(struct btrfs_root *root, /* Walk down to the leaf from the given level */ static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path, - int level, u64 *bytenr, u64 *num_bytes) + int level, u64 *bytenr, u64 *num_bytes, + int *tree_block_level) { struct extent_buffer *eb; int ret = 0; @@ -565,7 +567,8 @@ static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path, path->slots[level-1] = 0; path->locks[level-1] = BTRFS_READ_LOCK; } else { - ret = process_leaf(root, path, bytenr, num_bytes); + ret = process_leaf(root, path, bytenr, num_bytes, + tree_block_level); if (ret) break; } @@ -974,6 +977,7 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) { struct btrfs_path *path; struct extent_buffer *eb; + int tree_block_level = 0; u64 bytenr = 0, num_bytes = 0; int ret, level; @@ -998,7 +1002,7 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) * different leaf from the original extent item. */ ret = walk_down_tree(fs_info->extent_root, path, level, - &bytenr, &num_bytes); + &bytenr, &num_bytes, &tree_block_level); if (ret) break; ret = walk_up_tree(path, &level); -- cgit v1.2.3-70-g09d2 From 1478143ac81acc4094f8501a88e9e6ef9ff0e4a5 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 16 Dec 2020 11:18:47 -0500 Subject: btrfs: ref-verify: make sure owner is set for all refs I noticed that shared ref entries in ref-verify didn't have the proper owner set, which caused me to think there was something seriously wrong. However the problem is if we have a parent we simply weren't filling out the owner part of the reference, even though we have it. Fix this by making sure we set all the proper fields when we modify a reference, this way we'll have the proper owner if a problem happens and we don't waste time thinking we're updating the wrong level. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ref-verify.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 409b02566b25..2b490becbe67 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -669,18 +669,18 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, u64 bytenr = generic_ref->bytenr; u64 num_bytes = generic_ref->len; u64 parent = generic_ref->parent; - u64 ref_root; - u64 owner; - u64 offset; + u64 ref_root = 0; + u64 owner = 0; + u64 offset = 0; if (!btrfs_test_opt(fs_info, REF_VERIFY)) return 0; if (generic_ref->type == BTRFS_REF_METADATA) { - ref_root = generic_ref->tree_ref.root; + if (!parent) + ref_root = generic_ref->tree_ref.root; owner = generic_ref->tree_ref.level; - offset = 0; - } else { + } else if (!parent) { ref_root = generic_ref->data_ref.ref_root; owner = generic_ref->data_ref.ino; offset = generic_ref->data_ref.offset; @@ -696,13 +696,10 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, goto out; } - if (parent) { - ref->parent = parent; - } else { - ref->root_objectid = ref_root; - ref->owner = owner; - ref->offset = offset; - } + ref->parent = parent; + ref->owner = owner; + ref->root_objectid = ref_root; + ref->offset = offset; ref->num_refs = (action == BTRFS_DROP_DELAYED_REF) ? -1 : 1; memcpy(&ra->ref, ref, sizeof(struct ref_entry)); -- cgit v1.2.3-70-g09d2 From 7056bf69e5a338811738a7932b8e707aaca9fdd0 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 17 Dec 2020 15:21:16 +0200 Subject: btrfs: consolidate btrfs_previous_item ret val handling in btrfs_shrink_device Instead of having three 'if' to handle non-NULL return value consolidate this in one 'if (ret)'. That way the code is more obvious: - Always drop delete_unused_bgs_mutex if ret is not NULL - If ret is negative -> goto done - If it's 1 -> reset ret to 0, release the path and finish the loop. Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d6c24c8ad749..a8ec8539cd8d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4668,11 +4668,10 @@ again: } ret = btrfs_previous_item(root, path, 0, key.type); - if (ret) - mutex_unlock(&fs_info->delete_unused_bgs_mutex); - if (ret < 0) - goto done; if (ret) { + mutex_unlock(&fs_info->delete_unused_bgs_mutex); + if (ret < 0) + goto done; ret = 0; btrfs_release_path(path); break; -- cgit v1.2.3-70-g09d2 From 9c4a062a94752dabd3954ef39c4dfed581c664b9 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 11 Jan 2021 11:42:32 +0000 Subject: btrfs: send: remove stale code when checking for shared extents After commit 040ee6120cb670 ("Btrfs: send, improve clone range") we do not use anymore the data_offset field of struct backref_ctx, as after that we do all the necessary checks for the data offset of file extent items at clone_range(). Since there are no more users of data_offset from that structure, remove it. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/send.c | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 78a35374d492..3bcbf2bcb869 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1191,9 +1191,6 @@ struct backref_ctx { /* may be truncated in case it's the last extent in a file */ u64 extent_len; - /* data offset in the file extent item */ - u64 data_offset; - /* Just to check for bugs in backref resolving */ int found_itself; }; @@ -1401,19 +1398,6 @@ static int find_extent_clone(struct send_ctx *sctx, backref_ctx->cur_offset = data_offset; backref_ctx->found_itself = 0; backref_ctx->extent_len = num_bytes; - /* - * For non-compressed extents iterate_extent_inodes() gives us extent - * offsets that already take into account the data offset, but not for - * compressed extents, since the offset is logical and not relative to - * the physical extent locations. We must take this into account to - * avoid sending clone offsets that go beyond the source file's size, - * which would result in the clone ioctl failing with -EINVAL on the - * receiving end. - */ - if (compressed == BTRFS_COMPRESS_NONE) - backref_ctx->data_offset = 0; - else - backref_ctx->data_offset = btrfs_file_extent_offset(eb, fi); /* * The last extent of a file may be too large due to page alignment. -- cgit v1.2.3-70-g09d2 From 9db4dc241e87fccd8301357d5ef908f40b50f2e3 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 11 Jan 2021 12:58:11 +0200 Subject: btrfs: make btrfs_start_delalloc_root's nr argument a long It's currently u64 which gets instantly translated either to LONG_MAX (if U64_MAX is passed) or cast to an unsigned long (which is in fact, wrong because writeback_control::nr_to_write is a signed, long type). Just convert the function's argument to be long time which obviates the need to manually convert u64 value to a long. Adjust all call sites which pass U64_MAX to pass LONG_MAX. Finally ensure that in shrink_delalloc the u64 is converted to a long without overflowing, resulting in a negative number. Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/dev-replace.c | 2 +- fs/btrfs/inode.c | 6 +++--- fs/btrfs/ioctl.c | 2 +- fs/btrfs/space-info.c | 3 ++- 5 files changed, 8 insertions(+), 7 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f5c636b29451..ed6bb46a2572 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3100,7 +3100,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u32 min_type); int btrfs_start_delalloc_snapshot(struct btrfs_root *root); -int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr, +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, bool in_reclaim_context); int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, unsigned int extra_bits, diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 324f646d6e5e..bc73f798ce3a 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -715,7 +715,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, * flush all outstanding I/O and inode extent mappings before the * copy operation is declared as being finished */ - ret = btrfs_start_delalloc_roots(fs_info, U64_MAX, false); + ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false); if (ret) { mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index af5558f87243..17418a75e3c8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9489,11 +9489,11 @@ int btrfs_start_delalloc_snapshot(struct btrfs_root *root) return start_delalloc_inodes(root, &wbc, true, false); } -int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr, +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, bool in_reclaim_context) { struct writeback_control wbc = { - .nr_to_write = (nr == U64_MAX) ? LONG_MAX : (unsigned long)nr, + .nr_to_write = nr, .sync_mode = WB_SYNC_NONE, .range_start = 0, .range_end = LLONG_MAX, @@ -9515,7 +9515,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr, * Reset nr_to_write here so we know that we're doing a full * flush. */ - if (nr == U64_MAX) + if (nr == LONG_MAX) wbc.nr_to_write = LONG_MAX; root = list_first_entry(&splice, struct btrfs_root, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 5b9b0a390f0e..7f2935ea8d3a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4946,7 +4946,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_SYNC: { int ret; - ret = btrfs_start_delalloc_roots(fs_info, U64_MAX, false); + ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false); if (ret) return ret; ret = btrfs_sync_fs(inode->i_sb, 1); diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index e8347461c8dd..84fb94e78a8f 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -532,7 +532,8 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, loops = 0; while ((delalloc_bytes || dio_bytes) && loops < 3) { - u64 nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; + u64 temp = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; + long nr_pages = min_t(u64, temp, LONG_MAX); btrfs_start_delalloc_roots(fs_info, nr_pages, true); -- cgit v1.2.3-70-g09d2 From d7830b7155ab43952ec8f2b95f326f63936ecd03 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 11 Jan 2021 12:58:12 +0200 Subject: btrfs: remove always true condition in btrfs_start_delalloc_roots Following the rework in e076ab2a2ca7 ("btrfs: shrink delalloc pages instead of full inodes") the nr variable is no longer passed by reference to start_delalloc_inodes hence it cannot change. Additionally we are always guaranteed for it to be positive number hence it's redundant to have it as a condition in the loop. Simply remove that usage. Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 17418a75e3c8..4056fc3b39cf 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9510,7 +9510,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, mutex_lock(&fs_info->delalloc_root_mutex); spin_lock(&fs_info->delalloc_root_lock); list_splice_init(&fs_info->delalloc_roots, &splice); - while (!list_empty(&splice) && nr) { + while (!list_empty(&splice)) { /* * Reset nr_to_write here so we know that we're doing a full * flush. -- cgit v1.2.3-70-g09d2 From 523929f1cac3e869492ea376c9d86af11ec0e5c5 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 22 Dec 2020 13:59:23 +0800 Subject: btrfs: make btrfs_dio_private::bytes u32 btrfs_dio_private::bytes is only assigned from bio::bi_iter::bi_size, which is never larger than U32. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index d9bf53d9ff90..28e202e89660 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -325,7 +325,8 @@ struct btrfs_dio_private { struct inode *inode; u64 logical_offset; u64 disk_bytenr; - u64 bytes; + /* Used for bio::bi_size */ + u32 bytes; /* * References to this structure. There is one reference per in-flight -- cgit v1.2.3-70-g09d2 From 58f74b2203d786da37128cbf786873996145bfdc Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 22 Dec 2020 13:59:24 +0800 Subject: btrfs: refactor btrfs_dec_test_* functions for ordered extents The refactoring involves the following modifications: - Return bool instead of int - Parameter update for @cached of btrfs_dec_test_first_ordered_pending() For btrfs_dec_test_first_ordered_pending(), @cached is only used to return the finished ordered extent. Rename it to @finished_ret. - Comment updates * Change one stale comment Which still refers to btrfs_dec_test_ordered_pending(), but the context is calling btrfs_dec_test_first_ordered_pending(). * Follow the common comment style for both functions Add more detailed descriptions for parameters and the return value * Move the reason why test_and_set_bit() is used into the call sites - Change how the return value is calculated The most anti-human part of the return value is: if (...) ret = 1; ... return ret == 0; This means, when we set ret to 1, the function returns 0. Change the local variable name to @finished, and directly return the value of it. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 6 +-- fs/btrfs/ordered-data.c | 104 +++++++++++++++++++++++++++--------------------- fs/btrfs/ordered-data.h | 10 ++--- 3 files changed, 65 insertions(+), 55 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4056fc3b39cf..ef6cb7b620d0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7797,10 +7797,8 @@ static void __endio_write_update_ordered(struct btrfs_inode *inode, NULL); btrfs_queue_work(wq, &ordered->work); } - /* - * If btrfs_dec_test_ordered_pending does not find any ordered - * extent in the range, we can exit. - */ + + /* No ordered extent found in the range, exit */ if (ordered_offset == last_offset) return; /* diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 79d366a36223..d5d326c674b1 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -297,26 +297,33 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, } /* - * this is used to account for finished IO across a given range - * of the file. The IO may span ordered extents. If - * a given ordered_extent is completely done, 1 is returned, otherwise - * 0. + * Finish IO for one ordered extent across a given range. The range can + * contain several ordered extents. * - * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used - * to make sure this function only returns 1 once for a given ordered extent. + * @found_ret: Return the finished ordered extent + * @file_offset: File offset for the finished IO + * Will also be updated to one byte past the range that is + * recordered as finished. This allows caller to walk forward. + * @io_size: Length of the finish IO range + * @uptodate: If the IO finished without problem * - * file_offset is updated to one byte past the range that is recorded as - * complete. This allows you to walk forward in the file. + * Return true if any ordered extent is finished in the range, and update + * @found_ret and @file_offset. + * Return false otherwise. + * + * NOTE: Although The range can cross multiple ordered extents, only one + * ordered extent will be updated during one call. The caller is responsible to + * iterate all ordered extents in the range. */ -int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, - struct btrfs_ordered_extent **cached, +bool btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, + struct btrfs_ordered_extent **finished_ret, u64 *file_offset, u64 io_size, int uptodate) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; - int ret; + bool finished = false; unsigned long flags; u64 dec_end; u64 dec_start; @@ -324,16 +331,12 @@ int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, spin_lock_irqsave(&tree->lock, flags); node = tree_search(tree, *file_offset); - if (!node) { - ret = 1; + if (!node) goto out; - } entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); - if (!offset_in_entry(entry, *file_offset)) { - ret = 1; + if (!offset_in_entry(entry, *file_offset)) goto out; - } dec_start = max(*file_offset, entry->file_offset); dec_end = min(*file_offset + io_size, @@ -354,39 +357,50 @@ int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, set_bit(BTRFS_ORDERED_IOERR, &entry->flags); if (entry->bytes_left == 0) { - ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); + /* + * Ensure only one caller can set the flag and finished_ret + * accordingly + */ + finished = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); /* test_and_set_bit implies a barrier */ cond_wake_up_nomb(&entry->wait); - } else { - ret = 1; } out: - if (!ret && cached && entry) { - *cached = entry; + if (finished && finished_ret && entry) { + *finished_ret = entry; refcount_inc(&entry->refs); } spin_unlock_irqrestore(&tree->lock, flags); - return ret == 0; + return finished; } /* - * this is used to account for finished IO across a given range - * of the file. The IO should not span ordered extents. If - * a given ordered_extent is completely done, 1 is returned, otherwise - * 0. + * Finish IO for one ordered extent across a given range. The range can only + * contain one ordered extent. + * + * @cached: The cached ordered extent. If not NULL, we can skip the tree + * search and use the ordered extent directly. + * Will be also used to store the finished ordered extent. + * @file_offset: File offset for the finished IO + * @io_size: Length of the finish IO range + * @uptodate: If the IO finishes without problem + * + * Return true if the ordered extent is finished in the range, and update + * @cached. + * Return false otherwise. * - * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used - * to make sure this function only returns 1 once for a given ordered extent. + * NOTE: The range can NOT cross multiple ordered extents. + * Thus caller should ensure the range doesn't cross ordered extents. */ -int btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, - struct btrfs_ordered_extent **cached, - u64 file_offset, u64 io_size, int uptodate) +bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, + struct btrfs_ordered_extent **cached, + u64 file_offset, u64 io_size, int uptodate) { struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; unsigned long flags; - int ret; + bool finished = false; spin_lock_irqsave(&tree->lock, flags); if (cached && *cached) { @@ -395,41 +409,39 @@ int btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, } node = tree_search(tree, file_offset); - if (!node) { - ret = 1; + if (!node) goto out; - } entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); have_entry: - if (!offset_in_entry(entry, file_offset)) { - ret = 1; + if (!offset_in_entry(entry, file_offset)) goto out; - } - if (io_size > entry->bytes_left) { + if (io_size > entry->bytes_left) btrfs_crit(inode->root->fs_info, "bad ordered accounting left %llu size %llu", entry->bytes_left, io_size); - } + entry->bytes_left -= io_size; if (!uptodate) set_bit(BTRFS_ORDERED_IOERR, &entry->flags); if (entry->bytes_left == 0) { - ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); + /* + * Ensure only one caller can set the flag and finished_ret + * accordingly + */ + finished = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); /* test_and_set_bit implies a barrier */ cond_wake_up_nomb(&entry->wait); - } else { - ret = 1; } out: - if (!ret && cached && entry) { + if (finished && cached && entry) { *cached = entry; refcount_inc(&entry->refs); } spin_unlock_irqrestore(&tree->lock, flags); - return ret == 0; + return finished; } /* diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 0bfa82b58e23..46194c2c05d4 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -152,11 +152,11 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, struct btrfs_ordered_extent *entry); -int btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, - struct btrfs_ordered_extent **cached, - u64 file_offset, u64 io_size, int uptodate); -int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, - struct btrfs_ordered_extent **cached, +bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, + struct btrfs_ordered_extent **cached, + u64 file_offset, u64 io_size, int uptodate); +bool btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, + struct btrfs_ordered_extent **finished_ret, u64 *file_offset, u64 io_size, int uptodate); int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, -- cgit v1.2.3-70-g09d2 From 0c64c33c603f692ceb91d9fe17cc10028cff7da8 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 6 Jan 2021 09:01:40 +0800 Subject: btrfs: rename parameter offset to disk_bytenr in submit_extent_page The parameter offset is confusing, it's supposed to be the disk bytenr of metadata/data. Rename it to disk_bytenr and update the comment. Also rename each offset passed to submit_extent_page() as @disk_bytenr so they're consistent. Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c9cee458e001..60990616b895 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3062,10 +3062,10 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size) * @opf: bio REQ_OP_* and REQ_* flags as one value * @wbc: optional writeback control for io accounting * @page: page to add to the bio + * @disk_bytenr: logical bytenr where the write will be + * @size: portion of page that we want to write to * @pg_offset: offset of the new bio or to check whether we are adding * a contiguous page to the previous one - * @size: portion of page that we want to write - * @offset: starting offset in the page * @bio_ret: must be valid pointer, newly allocated bio will be stored there * @end_io_func: end_io callback for new bio * @mirror_num: desired mirror to read/write @@ -3074,7 +3074,7 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size) */ static int submit_extent_page(unsigned int opf, struct writeback_control *wbc, - struct page *page, u64 offset, + struct page *page, u64 disk_bytenr, size_t size, unsigned long pg_offset, struct bio **bio_ret, bio_end_io_t end_io_func, @@ -3086,7 +3086,7 @@ static int submit_extent_page(unsigned int opf, int ret = 0; struct bio *bio; size_t io_size = min_t(size_t, size, PAGE_SIZE); - sector_t sector = offset >> 9; + sector_t sector = disk_bytenr >> 9; struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree; ASSERT(bio_ret); @@ -3120,7 +3120,7 @@ static int submit_extent_page(unsigned int opf, } } - bio = btrfs_bio_alloc(offset); + bio = btrfs_bio_alloc(disk_bytenr); bio_add_page(bio, page, io_size, pg_offset); bio->bi_end_io = end_io_func; bio->bi_private = tree; @@ -3242,7 +3242,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, } while (cur <= end) { bool force_bio_submit = false; - u64 offset; + u64 disk_bytenr; if (cur >= last_byte) { char *userpage; @@ -3280,9 +3280,9 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, cur_end = min(extent_map_end(em) - 1, end); iosize = ALIGN(iosize, blocksize); if (this_bio_flag & EXTENT_BIO_COMPRESSED) - offset = em->block_start; + disk_bytenr = em->block_start; else - offset = em->block_start + extent_offset; + disk_bytenr = em->block_start + extent_offset; block_start = em->block_start; if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) block_start = EXTENT_MAP_HOLE; @@ -3371,7 +3371,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, } ret = submit_extent_page(REQ_OP_READ | read_flags, NULL, - page, offset, iosize, + page, disk_bytenr, iosize, pg_offset, bio, end_bio_extent_readpage, 0, *bio_flags, @@ -3548,8 +3548,8 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, blocksize = inode->vfs_inode.i_sb->s_blocksize; while (cur <= end) { + u64 disk_bytenr; u64 em_end; - u64 offset; if (cur >= i_size) { btrfs_writepage_endio_finish_ordered(page, cur, @@ -3569,7 +3569,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, BUG_ON(end < cur); iosize = min(em_end - cur, end - cur + 1); iosize = ALIGN(iosize, blocksize); - offset = em->block_start + extent_offset; + disk_bytenr = em->block_start + extent_offset; block_start = em->block_start; compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); free_extent_map(em); @@ -3599,7 +3599,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, } ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, - page, offset, iosize, pg_offset, + page, disk_bytenr, iosize, pg_offset, &epd->bio, end_bio_extent_writepage, 0, 0, 0, false); @@ -3923,7 +3923,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, struct writeback_control *wbc, struct extent_page_data *epd) { - u64 offset = eb->start; + u64 disk_bytenr = eb->start; u32 nritems; int i, num_pages; unsigned long start, end; @@ -3956,7 +3956,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, clear_page_dirty_for_io(p); set_page_writeback(p); ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, - p, offset, PAGE_SIZE, 0, + p, disk_bytenr, PAGE_SIZE, 0, &epd->bio, end_bio_extent_buffer_writepage, 0, 0, 0, false); @@ -3969,7 +3969,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, ret = -EIO; break; } - offset += PAGE_SIZE; + disk_bytenr += PAGE_SIZE; update_nr_written(wbc, 1); unlock_page(p); } -- cgit v1.2.3-70-g09d2 From 6bc5636a67bf489d95ebc06c0449396fd487d309 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 6 Jan 2021 09:01:41 +0800 Subject: btrfs: refactor __extent_writepage_io() to improve readability The refactoring involves the following modifications: - iosize alignment In fact we don't really need to manually do alignment at all. All extent maps should already be aligned, thus basic ASSERT() check would be enough. - redundant variables We have extra variable like blocksize/pg_offset/end. They are all unnecessary. @blocksize can be replaced by sectorsize size directly, and it's only used to verify the em start/size is aligned. @pg_offset can be easily calculated using @cur and page_offset(page). @end is just assigned from @page_end and never modified, use "start + PAGE_SIZE - 1" directly and remove @page_end. - remove some BUG_ON()s The BUG_ON()s are for extent map, which we have tree-checker to check on-disk extent data item and runtime check. ASSERT() should be enough. Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 60990616b895..74c0a32d04fc 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3513,23 +3513,20 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, unsigned long nr_written, int *nr_ret) { + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_io_tree *tree = &inode->io_tree; u64 start = page_offset(page); - u64 page_end = start + PAGE_SIZE - 1; - u64 end; + u64 end = start + PAGE_SIZE - 1; u64 cur = start; u64 extent_offset; u64 block_start; - u64 iosize; struct extent_map *em; - size_t pg_offset = 0; - size_t blocksize; int ret = 0; int nr = 0; const unsigned int write_flags = wbc_to_write_flags(wbc); bool compressed; - ret = btrfs_writepage_cow_fixup(page, start, page_end); + ret = btrfs_writepage_cow_fixup(page, start, end); if (ret) { /* Fixup worker will requeue */ redirty_page_for_writepage(wbc, page); @@ -3544,16 +3541,13 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, */ update_nr_written(wbc, nr_written + 1); - end = page_end; - blocksize = inode->vfs_inode.i_sb->s_blocksize; - while (cur <= end) { u64 disk_bytenr; u64 em_end; + u32 iosize; if (cur >= i_size) { - btrfs_writepage_endio_finish_ordered(page, cur, - page_end, 1); + btrfs_writepage_endio_finish_ordered(page, cur, end, 1); break; } em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1); @@ -3565,13 +3559,16 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, extent_offset = cur - em->start; em_end = extent_map_end(em); - BUG_ON(em_end <= cur); - BUG_ON(end < cur); - iosize = min(em_end - cur, end - cur + 1); - iosize = ALIGN(iosize, blocksize); - disk_bytenr = em->block_start + extent_offset; + ASSERT(cur <= em_end); + ASSERT(cur < end); + ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize)); + ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize)); block_start = em->block_start; compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + disk_bytenr = em->block_start + extent_offset; + + /* Note that em_end from extent_map_end() is exclusive */ + iosize = min(em_end, end + 1) - cur; free_extent_map(em); em = NULL; @@ -3587,7 +3584,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, btrfs_writepage_endio_finish_ordered(page, cur, cur + iosize - 1, 1); cur += iosize; - pg_offset += iosize; continue; } @@ -3599,8 +3595,8 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, } ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, - page, disk_bytenr, iosize, pg_offset, - &epd->bio, + page, disk_bytenr, iosize, + cur - page_offset(page), &epd->bio, end_bio_extent_writepage, 0, 0, 0, false); if (ret) { @@ -3609,8 +3605,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, end_page_writeback(page); } - cur = cur + iosize; - pg_offset += iosize; + cur += iosize; nr++; } *nr_ret = nr; -- cgit v1.2.3-70-g09d2 From c0fab480955c4a943cc77be58269d97128ac3ef9 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 6 Jan 2021 09:01:42 +0800 Subject: btrfs: update comment for btrfs_dirty_pages The original comment is from the initial merge, which has several problems: - No holes check any more - No inline decision is made Update the out-of-date comment with more correct one. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e65223e3510d..d81ae1f518f2 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -453,12 +453,11 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages) } /* - * after copy_from_user, pages need to be dirtied and we need to make - * sure holes are created between the current EOF and the start of - * any next extents (if required). - * - * this also makes the decision about creating an inline extent vs - * doing real data extents, marking pages dirty and delalloc as required. + * After btrfs_copy_from_user(), update the following things for delalloc: + * - Mark newly dirtied pages as DELALLOC in the io tree. + * Used to advise which range is to be written back. + * - Mark modified pages as Uptodate/Dirty and not needing COW fixup + * - Update inode size for past EOF write */ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, -- cgit v1.2.3-70-g09d2 From c0f0a9e71653b33c003433f2248cec88f6942f35 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 6 Jan 2021 09:01:45 +0800 Subject: btrfs: introduce helper to grab an existing extent buffer from a page This patch will extract the code to grab an extent buffer from a page into a helper, grab_extent_buffer_from_page(). This reduces one indent level, and provides the work place for later expansion for subapge support. Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 50 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 20 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 74c0a32d04fc..7f689ad7709c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5247,6 +5247,29 @@ free_eb: } #endif +static struct extent_buffer *grab_extent_buffer(struct page *page) +{ + struct extent_buffer *exists; + + /* Page not yet attached to an extent buffer */ + if (!PagePrivate(page)) + return NULL; + + /* + * We could have already allocated an eb for this page and attached one + * so lets see if we can get a ref on the existing eb, and if we can we + * know it's good and we can just return that one, else we know we can + * just overwrite page->private. + */ + exists = (struct extent_buffer *)page->private; + if (atomic_inc_not_zero(&exists->refs)) + return exists; + + WARN_ON(PageDirty(page)); + detach_page_private(page); + return NULL; +} + struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, u64 owner_root, int level) { @@ -5292,26 +5315,13 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, } spin_lock(&mapping->private_lock); - if (PagePrivate(p)) { - /* - * We could have already allocated an eb for this page - * and attached one so lets see if we can get a ref on - * the existing eb, and if we can we know it's good and - * we can just return that one, else we know we can just - * overwrite page->private. - */ - exists = (struct extent_buffer *)p->private; - if (atomic_inc_not_zero(&exists->refs)) { - spin_unlock(&mapping->private_lock); - unlock_page(p); - put_page(p); - mark_extent_buffer_accessed(exists, p); - goto free_eb; - } - exists = NULL; - - WARN_ON(PageDirty(p)); - detach_page_private(p); + exists = grab_extent_buffer(p); + if (exists) { + spin_unlock(&mapping->private_lock); + unlock_page(p); + put_page(p); + mark_extent_buffer_accessed(exists, p); + goto free_eb; } attach_extent_buffer_page(eb, p); spin_unlock(&mapping->private_lock); -- cgit v1.2.3-70-g09d2 From f7ba2d37519dd6e15af9f00e9b4bbc7d1aba267a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 16 Dec 2020 11:22:15 -0500 Subject: btrfs: keep track of the root owner for relocation reads While testing the error paths in relocation, I hit the following lockdep splat: ====================================================== WARNING: possible circular locking dependency detected 5.10.0-rc3+ #206 Not tainted ------------------------------------------------------ btrfs-balance/1571 is trying to acquire lock: ffff8cdbcc8f77d0 (&head_ref->mutex){+.+.}-{3:3}, at: btrfs_lookup_extent_info+0x156/0x3b0 but task is already holding lock: ffff8cdbc54adbf8 (btrfs-tree-00){++++}-{3:3}, at: __btrfs_tree_lock+0x27/0x100 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (btrfs-tree-00){++++}-{3:3}: down_write_nested+0x43/0x80 __btrfs_tree_lock+0x27/0x100 btrfs_search_slot+0x248/0x890 relocate_tree_blocks+0x490/0x650 relocate_block_group+0x1ba/0x5d0 kretprobe_trampoline+0x0/0x50 -> #1 (btrfs-csum-01){++++}-{3:3}: down_read_nested+0x43/0x130 __btrfs_tree_read_lock+0x27/0x100 btrfs_read_lock_root_node+0x31/0x40 btrfs_search_slot+0x5ab/0x890 btrfs_del_csums+0x10b/0x3c0 __btrfs_free_extent+0x49d/0x8e0 __btrfs_run_delayed_refs+0x283/0x11f0 btrfs_run_delayed_refs+0x86/0x220 btrfs_start_dirty_block_groups+0x2ba/0x520 kretprobe_trampoline+0x0/0x50 -> #0 (&head_ref->mutex){+.+.}-{3:3}: __lock_acquire+0x1167/0x2150 lock_acquire+0x116/0x3e0 __mutex_lock+0x7e/0x7b0 btrfs_lookup_extent_info+0x156/0x3b0 walk_down_proc+0x1c3/0x280 walk_down_tree+0x64/0xe0 btrfs_drop_subtree+0x182/0x260 do_relocation+0x52e/0x660 relocate_tree_blocks+0x2ae/0x650 relocate_block_group+0x1ba/0x5d0 kretprobe_trampoline+0x0/0x50 other info that might help us debug this: Chain exists of: &head_ref->mutex --> btrfs-csum-01 --> btrfs-tree-00 Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(btrfs-tree-00); lock(btrfs-csum-01); lock(btrfs-tree-00); lock(&head_ref->mutex); *** DEADLOCK *** 5 locks held by btrfs-balance/1571: #0: ffff8cdb89749ff8 (&fs_info->delete_unused_bgs_mutex){+.+.}-{3:3}, at: btrfs_balance+0x563/0xf40 #1: ffff8cdb89748838 (&fs_info->cleaner_mutex){+.+.}-{3:3}, at: btrfs_relocate_block_group+0x156/0x300 #2: ffff8cdbc2c16650 (sb_internal#2){.+.+}-{0:0}, at: start_transaction+0x413/0x5c0 #3: ffff8cdbc135f538 (btrfs-treloc-01){+.+.}-{3:3}, at: __btrfs_tree_lock+0x27/0x100 #4: ffff8cdbc54adbf8 (btrfs-tree-00){++++}-{3:3}, at: __btrfs_tree_lock+0x27/0x100 stack backtrace: CPU: 1 PID: 1571 Comm: btrfs-balance Not tainted 5.10.0-rc3+ #206 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014 Call Trace: dump_stack+0x8b/0xb0 check_noncircular+0xcf/0xf0 ? trace_call_bpf+0x139/0x260 __lock_acquire+0x1167/0x2150 lock_acquire+0x116/0x3e0 ? btrfs_lookup_extent_info+0x156/0x3b0 __mutex_lock+0x7e/0x7b0 ? btrfs_lookup_extent_info+0x156/0x3b0 ? btrfs_lookup_extent_info+0x156/0x3b0 ? release_extent_buffer+0x124/0x170 ? _raw_spin_unlock+0x1f/0x30 ? release_extent_buffer+0x124/0x170 btrfs_lookup_extent_info+0x156/0x3b0 walk_down_proc+0x1c3/0x280 walk_down_tree+0x64/0xe0 btrfs_drop_subtree+0x182/0x260 do_relocation+0x52e/0x660 relocate_tree_blocks+0x2ae/0x650 ? add_tree_block+0x149/0x1b0 relocate_block_group+0x1ba/0x5d0 elfcorehdr_read+0x40/0x40 ? elfcorehdr_read+0x40/0x40 ? btrfs_balance+0x796/0xf40 ? __kthread_parkme+0x66/0x90 ? btrfs_balance+0xf40/0xf40 ? balance_kthread+0x37/0x50 ? kthread+0x137/0x150 ? __kthread_bind_mask+0x60/0x60 ? ret_from_fork+0x1f/0x30 As you can see this is bogus, we never take another tree's lock under the csum lock. This happens because sometimes we have to read tree blocks from disk without knowing which root they belong to during relocation. We defaulted to an owner of 0, which translates to an fs tree. This is fine as all fs trees have the same class, but obviously isn't fine if the block belongs to a COW only tree. Thankfully COW only trees only have their owners root as a reference to them, and since we already look up the extent information during relocation, go ahead and check and see if this block might belong to a COW only tree, and if so save the owner in the tree_block struct. This allows us to read_tree_block with the proper owner, which gets rid of this lockdep splat. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 46 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 8e51b39cbfbb..9f2289bcdde6 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -97,6 +97,7 @@ struct tree_block { struct rb_node rb_node; u64 bytenr; }; /* Use rb_simple_node for search/insert */ + u64 owner; struct btrfs_key key; unsigned int level:8; unsigned int key_ready:1; @@ -2393,8 +2394,8 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info, { struct extent_buffer *eb; - eb = read_tree_block(fs_info, block->bytenr, 0, block->key.offset, - block->level, NULL); + eb = read_tree_block(fs_info, block->bytenr, block->owner, + block->key.offset, block->level, NULL); if (IS_ERR(eb)) { return PTR_ERR(eb); } else if (!extent_buffer_uptodate(eb)) { @@ -2493,7 +2494,8 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, /* Kick in readahead for tree blocks with missing keys */ rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { if (!block->key_ready) - btrfs_readahead_tree_block(fs_info, block->bytenr, 0, 0, + btrfs_readahead_tree_block(fs_info, block->bytenr, + block->owner, 0, block->level); } @@ -2801,21 +2803,58 @@ static int add_tree_block(struct reloc_control *rc, u32 item_size; int level = -1; u64 generation; + u64 owner = 0; eb = path->nodes[0]; item_size = btrfs_item_size_nr(eb, path->slots[0]); if (extent_key->type == BTRFS_METADATA_ITEM_KEY || item_size >= sizeof(*ei) + sizeof(*bi)) { + unsigned long ptr = 0, end; + ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); + end = (unsigned long)ei + item_size; if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) { bi = (struct btrfs_tree_block_info *)(ei + 1); level = btrfs_tree_block_level(eb, bi); + ptr = (unsigned long)(bi + 1); } else { level = (int)extent_key->offset; + ptr = (unsigned long)(ei + 1); } generation = btrfs_extent_generation(eb, ei); + + /* + * We're reading random blocks without knowing their owner ahead + * of time. This is ok most of the time, as all reloc roots and + * fs roots have the same lock type. However normal trees do + * not, and the only way to know ahead of time is to read the + * inline ref offset. We know it's an fs root if + * + * 1. There's more than one ref. + * 2. There's a SHARED_DATA_REF_KEY set. + * 3. FULL_BACKREF is set on the flags. + * + * Otherwise it's safe to assume that the ref offset == the + * owner of this block, so we can use that when calling + * read_tree_block. + */ + if (btrfs_extent_refs(eb, ei) == 1 && + !(btrfs_extent_flags(eb, ei) & + BTRFS_BLOCK_FLAG_FULL_BACKREF) && + ptr < end) { + struct btrfs_extent_inline_ref *iref; + int type; + + iref = (struct btrfs_extent_inline_ref *)ptr; + type = btrfs_get_extent_inline_ref_type(eb, iref, + BTRFS_REF_TYPE_BLOCK); + if (type == BTRFS_REF_TYPE_INVALID) + return -EINVAL; + if (type == BTRFS_TREE_BLOCK_REF_KEY) + owner = btrfs_extent_inline_ref_offset(eb, iref); + } } else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) { btrfs_print_v0_err(eb->fs_info); btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL); @@ -2837,6 +2876,7 @@ static int add_tree_block(struct reloc_control *rc, block->key.offset = generation; block->level = level; block->key_ready = 0; + block->owner = owner; rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node); if (rb_node) -- cgit v1.2.3-70-g09d2 From 7e2a870a599d4699a626ec26430c7a1ab14a2a49 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 16 Dec 2020 11:22:16 -0500 Subject: btrfs: do not cleanup upper nodes in btrfs_backref_cleanup_node Zygo reported the following panic when testing my error handling patches for relocation: kernel BUG at fs/btrfs/backref.c:2545! invalid opcode: 0000 [#1] SMP KASAN PTI CPU: 3 PID: 8472 Comm: btrfs Tainted: G W 14 Hardware name: QEMU Standard PC (i440FX + PIIX, Call Trace: btrfs_backref_error_cleanup+0x4df/0x530 build_backref_tree+0x1a5/0x700 ? _raw_spin_unlock+0x22/0x30 ? release_extent_buffer+0x225/0x280 ? free_extent_buffer.part.52+0xd7/0x140 relocate_tree_blocks+0x2a6/0xb60 ? kasan_unpoison_shadow+0x35/0x50 ? do_relocation+0xc10/0xc10 ? kasan_kmalloc+0x9/0x10 ? kmem_cache_alloc_trace+0x6a3/0xcb0 ? free_extent_buffer.part.52+0xd7/0x140 ? rb_insert_color+0x342/0x360 ? add_tree_block.isra.36+0x236/0x2b0 relocate_block_group+0x2eb/0x780 ? merge_reloc_roots+0x470/0x470 btrfs_relocate_block_group+0x26e/0x4c0 btrfs_relocate_chunk+0x52/0x120 btrfs_balance+0xe2e/0x18f0 ? pvclock_clocksource_read+0xeb/0x190 ? btrfs_relocate_chunk+0x120/0x120 ? lock_contended+0x620/0x6e0 ? do_raw_spin_lock+0x1e0/0x1e0 ? do_raw_spin_unlock+0xa8/0x140 btrfs_ioctl_balance+0x1f9/0x460 btrfs_ioctl+0x24c8/0x4380 ? __kasan_check_read+0x11/0x20 ? check_chain_key+0x1f4/0x2f0 ? __asan_loadN+0xf/0x20 ? btrfs_ioctl_get_supported_features+0x30/0x30 ? kvm_sched_clock_read+0x18/0x30 ? check_chain_key+0x1f4/0x2f0 ? lock_downgrade+0x3f0/0x3f0 ? handle_mm_fault+0xad6/0x2150 ? do_vfs_ioctl+0xfc/0x9d0 ? ioctl_file_clone+0xe0/0xe0 ? check_flags.part.50+0x6c/0x1e0 ? check_flags.part.50+0x6c/0x1e0 ? check_flags+0x26/0x30 ? lock_is_held_type+0xc3/0xf0 ? syscall_enter_from_user_mode+0x1b/0x60 ? do_syscall_64+0x13/0x80 ? rcu_read_lock_sched_held+0xa1/0xd0 ? __kasan_check_read+0x11/0x20 ? __fget_light+0xae/0x110 __x64_sys_ioctl+0xc3/0x100 do_syscall_64+0x37/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 This occurs because of this check if (RB_EMPTY_NODE(&upper->rb_node)) BUG_ON(!list_empty(&node->upper)); As we are dropping the backref node, if we discover that our upper node in the edge we just cleaned up isn't linked into the cache that we are now done with this node, thus the BUG_ON(). However this is an erroneous assumption, as we will look up all the references for a node first, and then process the pending edges. All of the 'upper' nodes in our pending edges won't be in the cache's rb_tree yet, because they haven't been processed. We could very well have many edges still left to cleanup on this node. The fact is we simply do not need this check, we can just process all of the edges only for this node, because below this check we do the following if (list_empty(&upper->lower)) { list_add_tail(&upper->lower, &cache->leaves); upper->lowest = 1; } If the upper node truly isn't used yet, then we add it to the cache->leaves list to be cleaned up later. If it is still used then the last child node that has it linked into its node will add it to the leaves list and then it will be cleaned up. Fix this problem by dropping this logic altogether. With this fix I no longer see the panic when testing with error injection in the backref code. CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/backref.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 9cadacf3ec27..ef71aba5bc15 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -2541,13 +2541,6 @@ void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache, list_del(&edge->list[UPPER]); btrfs_backref_free_edge(cache, edge); - if (RB_EMPTY_NODE(&upper->rb_node)) { - BUG_ON(!list_empty(&node->upper)); - btrfs_backref_drop_node(cache, node); - node = upper; - node->lowest = 1; - continue; - } /* * Add the node to leaf node list if no other child block * cached. -- cgit v1.2.3-70-g09d2 From fe3b7bb085a0b1fb26d622a5eccc7dbb5c4f82fb Mon Sep 17 00:00:00 2001 From: Yang Li Date: Thu, 21 Jan 2021 16:19:47 +0800 Subject: btrfs: remove redundant NULL check before kvfree Fix below warnings reported by coccicheck: ./fs/btrfs/raid56.c:237:2-8: WARNING: NULL check before some freeing functions is not needed. Reported-by: Abaci Robot Reviewed-by: Anand Jain Signed-off-by: Yang Li Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 93fbf87bdc8d..5394641541f7 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -233,8 +233,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) } x = cmpxchg(&info->stripe_hash_table, NULL, table); - if (x) - kvfree(x); + kvfree(x); return 0; } -- cgit v1.2.3-70-g09d2 From 3c198fe064491dcceaed9e15c6c997e92e71293e Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 21 Jan 2021 14:13:54 +0800 Subject: btrfs: rework the order of btrfs_ordered_extent::flags [BUG] There is a long existing bug in the last parameter of btrfs_add_ordered_extent(), in commit 771ed689d2cd ("Btrfs: Optimize compressed writeback and reads") back to 2008. In that ancient commit btrfs_add_ordered_extent() expects the @type parameter to be one of the following: - BTRFS_ORDERED_REGULAR - BTRFS_ORDERED_NOCOW - BTRFS_ORDERED_PREALLOC - BTRFS_ORDERED_COMPRESSED But we pass 0 in cow_file_range(), which means BTRFS_ORDERED_IO_DONE. Ironically extra check in __btrfs_add_ordered_extent() won't set the bit if we see (type == IO_DONE || type == IO_COMPLETE), and avoid any obvious bug. But this still leads to regular COW ordered extent having no bit to indicate its type in various trace events, rendering REGULAR bit useless. [FIX] Change the following aspects to avoid such problem: - Reorder btrfs_ordered_extent::flags Now the type bits go first (REGULAR/NOCOW/PREALLCO/COMPRESSED), then DIRECT bit, finally extra status bits like IO_DONE/COMPLETE/IOERR. - Add extra ASSERT() for btrfs_add_ordered_extent_*() - Remove @type parameter for btrfs_add_ordered_extent_compress() As the only valid @type here is BTRFS_ORDERED_COMPRESSED. - Remove the unnecessary special check for IO_DONE/COMPLETE in __btrfs_add_ordered_extent() This is just to make the code work, with extra ASSERT(), there are limited values can be passed in. Reviewed-by: Nikolay Borisov Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 4 ++-- fs/btrfs/ordered-data.c | 21 ++++++++++++++++----- fs/btrfs/ordered-data.h | 37 ++++++++++++++++++++++++------------- include/trace/events/btrfs.h | 7 ++++--- 4 files changed, 46 insertions(+), 23 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ef6cb7b620d0..ea9056cc5559 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -917,7 +917,6 @@ retry: ins.objectid, async_extent->ram_size, ins.offset, - BTRFS_ORDERED_COMPRESSED, async_extent->compress_type); if (ret) { btrfs_drop_extent_cache(inode, async_extent->start, @@ -1127,7 +1126,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode, free_extent_map(em); ret = btrfs_add_ordered_extent(inode, start, ins.objectid, - ram_size, cur_alloc_size, 0); + ram_size, cur_alloc_size, + BTRFS_ORDERED_REGULAR); if (ret) goto out_drop_extent_cache; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index d5d326c674b1..b4e6500548a2 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -199,8 +199,12 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset entry->compress_type = compress_type; entry->truncated_len = (u64)-1; entry->qgroup_rsv = ret; - if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) - set_bit(type, &entry->flags); + + ASSERT(type == BTRFS_ORDERED_REGULAR || + type == BTRFS_ORDERED_NOCOW || + type == BTRFS_ORDERED_PREALLOC || + type == BTRFS_ORDERED_COMPRESSED); + set_bit(type, &entry->flags); if (dio) { percpu_counter_add_batch(&fs_info->dio_bytes, num_bytes, @@ -256,6 +260,9 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, int type) { + ASSERT(type == BTRFS_ORDERED_REGULAR || + type == BTRFS_ORDERED_NOCOW || + type == BTRFS_ORDERED_PREALLOC); return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr, num_bytes, disk_num_bytes, type, 0, BTRFS_COMPRESS_NONE); @@ -265,6 +272,9 @@ int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, int type) { + ASSERT(type == BTRFS_ORDERED_REGULAR || + type == BTRFS_ORDERED_NOCOW || + type == BTRFS_ORDERED_PREALLOC); return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr, num_bytes, disk_num_bytes, type, 1, BTRFS_COMPRESS_NONE); @@ -272,11 +282,12 @@ int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset, int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 num_bytes, - u64 disk_num_bytes, int type, - int compress_type) + u64 disk_num_bytes, int compress_type) { + ASSERT(compress_type != BTRFS_COMPRESS_NONE); return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr, - num_bytes, disk_num_bytes, type, 0, + num_bytes, disk_num_bytes, + BTRFS_ORDERED_COMPRESSED, 0, compress_type); } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 46194c2c05d4..cca3307807e8 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -27,7 +27,7 @@ struct btrfs_ordered_sum { }; /* - * bits for the flags field: + * Bits for btrfs_ordered_extent::flags. * * BTRFS_ORDERED_IO_DONE is set when all of the blocks are written. * It is used to make sure metadata is inserted into the tree only once @@ -38,24 +38,36 @@ struct btrfs_ordered_sum { * IO is done and any metadata is inserted into the tree. */ enum { + /* + * Different types for direct io, one and only one of the 4 type can + * be set when creating ordered extent. + * + * REGULAR: For regular non-compressed COW write + * NOCOW: For NOCOW write into existing non-hole extent + * PREALLOC: For NOCOW write into preallocated extent + * COMPRESSED: For compressed COW write + */ + BTRFS_ORDERED_REGULAR, + BTRFS_ORDERED_NOCOW, + BTRFS_ORDERED_PREALLOC, + BTRFS_ORDERED_COMPRESSED, + + /* + * Extra bit for direct io, can only be set for + * REGULAR/NOCOW/PREALLOC. No direct io for compressed extent. + */ + BTRFS_ORDERED_DIRECT, + + /* Extra status bits for ordered extents */ + /* set when all the pages are written */ BTRFS_ORDERED_IO_DONE, /* set when removed from the tree */ BTRFS_ORDERED_COMPLETE, - /* set when we want to write in place */ - BTRFS_ORDERED_NOCOW, - /* writing a zlib compressed extent */ - BTRFS_ORDERED_COMPRESSED, - /* set when writing to preallocated extent */ - BTRFS_ORDERED_PREALLOC, - /* set when we're doing DIO with this extent */ - BTRFS_ORDERED_DIRECT, /* We had an io error when writing this out */ BTRFS_ORDERED_IOERR, /* Set when we have to truncate an extent */ BTRFS_ORDERED_TRUNCATED, - /* Regular IO for COW */ - BTRFS_ORDERED_REGULAR, /* Used during fsync to track already logged extents */ BTRFS_ORDERED_LOGGED, /* We have already logged all the csums of the ordered extent */ @@ -167,8 +179,7 @@ int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset, u64 disk_num_bytes, int type); int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 num_bytes, - u64 disk_num_bytes, int type, - int compress_type); + u64 disk_num_bytes, int compress_type); void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index ecd24c719de4..b9896fc06160 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -499,12 +499,13 @@ DEFINE_EVENT( #define show_ordered_flags(flags) \ __print_flags(flags, "|", \ - { (1 << BTRFS_ORDERED_IO_DONE), "IO_DONE" }, \ - { (1 << BTRFS_ORDERED_COMPLETE), "COMPLETE" }, \ + { (1 << BTRFS_ORDERED_REGULAR), "REGULAR" }, \ { (1 << BTRFS_ORDERED_NOCOW), "NOCOW" }, \ - { (1 << BTRFS_ORDERED_COMPRESSED), "COMPRESSED" }, \ { (1 << BTRFS_ORDERED_PREALLOC), "PREALLOC" }, \ + { (1 << BTRFS_ORDERED_COMPRESSED), "COMPRESSED" }, \ { (1 << BTRFS_ORDERED_DIRECT), "DIRECT" }, \ + { (1 << BTRFS_ORDERED_IO_DONE), "IO_DONE" }, \ + { (1 << BTRFS_ORDERED_COMPLETE), "COMPLETE" }, \ { (1 << BTRFS_ORDERED_IOERR), "IOERR" }, \ { (1 << BTRFS_ORDERED_TRUNCATED), "TRUNCATED" }) -- cgit v1.2.3-70-g09d2 From 401bd2dd1299dd384849707c6577b2089ab9f615 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 22 Jan 2021 11:57:52 +0200 Subject: btrfs: document modified parameter of add_extent_mapping Fixes fs/btrfs/extent_map.c:399: warning: Function parameter or member 'modified' not described in 'add_extent_mapping' Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index bd6229fb2b6f..8bda6c89e23e 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -385,9 +385,12 @@ static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits) } /** - * add_extent_mapping - add new extent map to the extent tree + * Add new extent map to the extent tree + * * @tree: tree to insert new map in * @em: map to insert + * @modified: indicate whether the given @em should be added to the + * modified list, which indicates the extent needs to be logged * * Insert @em into @tree or perform a simple forward/backward merge with * existing mappings. The extent_map struct passed in will be inserted -- cgit v1.2.3-70-g09d2 From 9ad37bb3ffc51fbd9c48ba4d85414b4aa3e21c6d Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 22 Jan 2021 11:57:53 +0200 Subject: btrfs: fix parameter description of btrfs_add_extent_mapping This fixes the following compiler warnings: fs/btrfs/extent_map.c:601: warning: Function parameter or member 'fs_info' not described in 'btrfs_add_extent_mapping' fs/btrfs/extent_map.c:601: warning: Function parameter or member 'em_tree' not described in 'btrfs_add_extent_mapping' fs/btrfs/extent_map.c:601: warning: Function parameter or member 'em_in' not described in 'btrfs_add_extent_mapping' fs/btrfs/extent_map.c:601: warning: Function parameter or member 'start' not described in 'btrfs_add_extent_mapping' fs/btrfs/extent_map.c:601: warning: Function parameter or member 'len' not described in 'btrfs_add_extent_mapping' Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 8bda6c89e23e..4a8e02f7b6c7 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -577,12 +577,13 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree, } /** - * btrfs_add_extent_mapping - add extent mapping into em_tree - * @fs_info - used for tracepoint - * @em_tree - the extent tree into which we want to insert the extent mapping - * @em_in - extent we are inserting - * @start - start of the logical range btrfs_get_extent() is requesting - * @len - length of the logical range btrfs_get_extent() is requesting + * Add extent mapping into em_tree + * + * @fs_info: the filesystem + * @em_tree: extent tree into which we want to insert the extent mapping + * @em_in: extent we are inserting + * @start: start of the logical range btrfs_get_extent() is requesting + * @len: length of the logical range btrfs_get_extent() is requesting * * Note that @em_in's range may be different from [start, start+len), * but they must be overlapped. -- cgit v1.2.3-70-g09d2 From ca4207ae1385190f7d62926f107ede1edced4c1f Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 22 Jan 2021 11:57:54 +0200 Subject: btrfs: fix function description formats in file-item.c This fixes following W=1 warnings: fs/btrfs/file-item.c:27: warning: Cannot understand * @inode: the inode we want to update the disk_i_size for on line 27 - I thought it was a doc line fs/btrfs/file-item.c:65: warning: Cannot understand * @inode - the inode we're modifying on line 65 - I thought it was a doc line fs/btrfs/file-item.c:91: warning: Cannot understand * @inode - the inode we're modifying on line 91 - I thought it was a doc line Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file-item.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 6ccfc019ad90..47cd3a6dc635 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -24,8 +24,10 @@ PAGE_SIZE)) /** - * @inode - the inode we want to update the disk_i_size for - * @new_i_size - the i_size we want to set to, 0 if we use i_size + * Set inode's size according to filesystem options + * + * @inode: inode we want to update the disk_i_size for + * @new_i_size: i_size we want to set to, 0 if we use i_size * * With NO_HOLES set this simply sets the disk_is_size to whatever i_size_read() * returns as it is perfectly fine with a file that has holes without hole file @@ -62,9 +64,11 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz } /** - * @inode - the inode we're modifying - * @start - the start file offset of the file extent we've inserted - * @len - the logical length of the file extent item + * Mark range within a file as having a new extent inserted + * + * @inode: inode being modified + * @start: start file offset of the file extent we've inserted + * @len: logical length of the file extent item * * Call when we are inserting a new file extent where there was none before. * Does not need to call this in the case where we're replacing an existing file @@ -88,9 +92,11 @@ int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, } /** - * @inode - the inode we're modifying - * @start - the start file offset of the file extent we've inserted - * @len - the logical length of the file extent item + * Marks an inode range as not having a backing extent + * + * @inode: inode being modified + * @start: start file offset of the file extent we've inserted + * @len: logical length of the file extent item * * Called when we drop a file extent, for example when we truncate. Doesn't * need to be called for cases where we're replacing a file extent, like when -- cgit v1.2.3-70-g09d2 From 696eb22b67add04e13f26cebe9f63eeb9477becd Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 22 Jan 2021 11:57:55 +0200 Subject: btrfs: fix parameter description in delayed-ref.c functions This fixes the following warnings: fs/btrfs/delayed-ref.c:80: warning: Function parameter or member 'fs_info' not described in 'btrfs_delayed_refs_rsv_release' fs/btrfs/delayed-ref.c:80: warning: Function parameter or member 'nr' not described in 'btrfs_delayed_refs_rsv_release' fs/btrfs/delayed-ref.c:128: warning: Function parameter or member 'fs_info' not described in 'btrfs_migrate_to_delayed_refs_rsv' fs/btrfs/delayed-ref.c:128: warning: Function parameter or member 'src' not described in 'btrfs_migrate_to_delayed_refs_rsv' fs/btrfs/delayed-ref.c:128: warning: Function parameter or member 'num_bytes' not described in 'btrfs_migrate_to_delayed_refs_rsv' fs/btrfs/delayed-ref.c:174: warning: Function parameter or member 'fs_info' not described in 'btrfs_delayed_refs_rsv_refill' fs/btrfs/delayed-ref.c:174: warning: Function parameter or member 'flush' not described in 'btrfs_delayed_refs_rsv_refill' Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 353cc2994d10..88a1e27d2fc2 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -69,9 +69,10 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) } /** - * btrfs_delayed_refs_rsv_release - release a ref head's reservation. - * @fs_info - the fs_info for our fs. - * @nr - the number of items to drop. + * Release a ref head's reservation + * + * @fs_info: the filesystem + * @nr: number of items to drop * * This drops the delayed ref head's count from the delayed refs rsv and frees * any excess reservation we had. @@ -114,10 +115,11 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) } /** - * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv. - * @fs_info - the fs info for our fs. - * @src - the source block rsv to transfer from. - * @num_bytes - the number of bytes to transfer. + * Transfer bytes to our delayed refs rsv + * + * @fs_info: the filesystem + * @src: source block rsv to transfer from + * @num_bytes: number of bytes to transfer * * This transfers up to the num_bytes amount from the src rsv to the * delayed_refs_rsv. Any extra bytes are returned to the space info. @@ -162,9 +164,10 @@ void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, } /** - * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage. - * @fs_info - the fs_info for our fs. - * @flush - control how we can flush for this reservation. + * Refill based on our delayed refs usage + * + * @fs_info: the filesystem + * @flush: control how we can flush for this reservation. * * This will refill the delayed block_rsv up to 1 items size worth of space and * will return -ENOSPC if we can't make the reservation. -- cgit v1.2.3-70-g09d2 From f092cf3cfd0144bdaf6110176ea9d2cef1f3b4a8 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 22 Jan 2021 11:57:56 +0200 Subject: btrfs: improve parameter description for __btrfs_write_out_cache Fixes following W=1 warnings: fs/btrfs/free-space-cache.c:1317: warning: Function parameter or member 'root' not described in '__btrfs_write_out_cache' fs/btrfs/free-space-cache.c:1317: warning: Function parameter or member 'inode' not described in '__btrfs_write_out_cache' fs/btrfs/free-space-cache.c:1317: warning: Function parameter or member 'ctl' not described in '__btrfs_write_out_cache' fs/btrfs/free-space-cache.c:1317: warning: Function parameter or member 'block_group' not described in '__btrfs_write_out_cache' fs/btrfs/free-space-cache.c:1317: warning: Function parameter or member 'io_ctl' not described in '__btrfs_write_out_cache' fs/btrfs/free-space-cache.c:1317: warning: Function parameter or member 'trans' not described in '__btrfs_write_out_cache' Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index fd6ddd6b8165..0d6dcb5ff963 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1299,11 +1299,14 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, } /** - * __btrfs_write_out_cache - write out cached info to an inode - * @root - the root the inode belongs to - * @ctl - the free space cache we are going to write out - * @block_group - the block_group for this cache if it belongs to a block_group - * @trans - the trans handle + * Write out cached info to an inode + * + * @root: root the inode belongs to + * @inode: freespace inode we are writing out + * @ctl: free space cache we are going to write out + * @block_group: block_group for this cache if it belongs to a block_group + * @io_ctl: holds context for the io + * @trans: the trans handle * * This function writes out a free space cache struct to disk for quick recovery * on mount. This will return 0 if it was successful in writing the cache out, -- cgit v1.2.3-70-g09d2 From 92419695478b6a75ca85e9f8e06b08a4a35bfb20 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 22 Jan 2021 11:57:57 +0200 Subject: btrfs: document now parameter of peek_discard_list Fixes fs/btrfs/discard.c:203: warning: Function parameter or member 'now' not described in 'peek_discard_list' Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/discard.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index 2b8383d41144..306ff20af70f 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -185,10 +185,12 @@ static struct btrfs_block_group *find_next_block_group( } /** - * peek_discard_list - wrap find_next_block_group() - * @discard_ctl: discard control + * Wrap find_next_block_group() + * + * @discard_ctl: discard control * @discard_state: the discard_state of the block_group after state management * @discard_index: the discard_index of the block_group after state management + * @now: time when discard was invoked, in ns * * This wraps find_next_block_group() and sets the block_group to be in use. * discard_state's control flow is managed here. Variables related to -- cgit v1.2.3-70-g09d2 From 9ee9b97990d6eff9cea64303c640dfb4b3a40253 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 22 Jan 2021 11:57:58 +0200 Subject: btrfs: document fs_info in btrfs_rmap_block Fixes fs/btrfs/block-group.c:1570: warning: Function parameter or member 'fs_info' not described in 'btrfs_rmap_block' Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 48ebc106a606..2d7294d81616 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1561,7 +1561,9 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) } /** - * btrfs_rmap_block - Map a physical disk address to a list of logical addresses + * Map a physical disk address to a list of logical addresses + * + * @fs_info: the filesystem * @chunk_start: logical address of block group * @physical: physical address to map to logical addresses * @logical: return array of logical addresses which map to @physical -- cgit v1.2.3-70-g09d2 From 2639631d34941db1ebbc74fb879855e0cd286cec Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 22 Jan 2021 11:57:59 +0200 Subject: btrfs: fix description format of fs_info of btrfs_wait_on_delayed_iputs Fixes fs/btrfs/inode.c:3101: warning: Function parameter or member 'fs_info' not described in 'btrfs_wait_on_delayed_iputs' Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ea9056cc5559..0dbe1aaa0b71 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3103,14 +3103,16 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) } /** - * btrfs_wait_on_delayed_iputs - wait on the delayed iputs to be done running - * @fs_info - the fs_info for this fs - * @return - EINTR if we were killed, 0 if nothing's pending + * Wait for flushing all delayed iputs + * + * @fs_info: the filesystem * * This will wait on any delayed iputs that are currently running with KILLABLE * set. Once they are all done running we will return, unless we are killed in * which case we return EINTR. This helps in user operations like fallocate etc * that might get blocked on the iputs. + * + * Return EINTR if we were killed, 0 if nothing's pending */ int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info) { -- cgit v1.2.3-70-g09d2 From 6e353e3b3c5545524d718d528548f7c8c95536c5 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 22 Jan 2021 11:58:00 +0200 Subject: btrfs: document btrfs_check_shared parameters Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/backref.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index ef71aba5bc15..701124c3e0b1 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1501,7 +1501,13 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, } /** - * btrfs_check_shared - tell us whether an extent is shared + * Check if an extent is shared or not + * + * @root: root inode belongs to + * @inum: inode number of the inode whose extent we are checking + * @bytenr: logical bytenr of the extent we are checking + * @roots: list of roots this extent is shared among + * @tmp: temporary list used for iteration * * btrfs_check_shared uses the backref walking code but will short * circuit as soon as it finds a root or inode that doesn't match the -- cgit v1.2.3-70-g09d2 From b762d1d08dacdc444ffd6417fc17805408da7af4 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 22 Jan 2021 11:58:01 +0200 Subject: btrfs: fix parameter description of btrfs_inode_rsv_release/btrfs_delalloc_release_space Fixes following warnings: fs/btrfs/delalloc-space.c:205: warning: Function parameter or member 'inode' not described in 'btrfs_inode_rsv_release' fs/btrfs/delalloc-space.c:205: warning: Function parameter or member 'qgroup_free' not described in 'btrfs_inode_rsv_release' fs/btrfs/delalloc-space.c:472: warning: Function parameter or member 'reserved' not described in 'btrfs_delalloc_release_space' fs/btrfs/delalloc-space.c:472: warning: Function parameter or member 'qgroup_free' not described in 'btrfs_delalloc_release_space' fs/btrfs/delalloc-space.c:472: warning: Excess function parameter 'release_bytes' description in 'btrfs_delalloc_release_space' Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delalloc-space.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index bacee09b7bfd..56642ca7af10 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -191,12 +191,14 @@ void btrfs_free_reserved_data_space(struct btrfs_inode *inode, } /** - * btrfs_inode_rsv_release - release any excessive reservation. - * @inode - the inode we need to release from. - * @qgroup_free - free or convert qgroup meta. - * Unlike normal operation, qgroup meta reservation needs to know if we are - * freeing qgroup reservation or just converting it into per-trans. Normally - * @qgroup_free is true for error handling, and false for normal release. + * Release any excessive reservation + * + * @inode: the inode we need to release from + * @qgroup_free: free or convert qgroup meta. Unlike normal operation, qgroup + * meta reservation needs to know if we are freeing qgroup + * reservation or just converting it into per-trans. Normally + * @qgroup_free is true for error handling, and false for normal + * release. * * This is the same as btrfs_block_rsv_release, except that it handles the * tracepoint for the reservation. @@ -361,7 +363,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) } /** - * btrfs_delalloc_release_metadata - release a metadata reservation for an inode + * Release a metadata reservation for an inode + * * @inode: the inode to release the reservation for. * @num_bytes: the number of bytes we are releasing. * @qgroup_free: free qgroup reservation or convert it to per-trans reservation @@ -455,11 +458,13 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, } /** - * btrfs_delalloc_release_space - release data and metadata space for delalloc - * @inode: inode we're releasing space for - * @start: start position of the space already reserved - * @len: the len of the space already reserved - * @release_bytes: the len of the space we consumed or didn't use + * Release data and metadata space for delalloc + * + * @inode: inode we're releasing space for + * @reserved: list of changed/reserved ranges + * @start: start position of the space already reserved + * @len: length of the space already reserved + * @qgroup_free: should qgroup reserved-space also be freed * * This function will release the metadata space that was not used and will * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes -- cgit v1.2.3-70-g09d2 From d98b188ea463281ee89663c36d8ac0a030e93b0c Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 22 Jan 2021 11:58:02 +0200 Subject: btrfs: fix parameter description in space-info.c With these fixes space-info.c is clear for W=1 warnings, namely the following ones are fixed: fs/btrfs/space-info.c:575: warning: Function parameter or member 'fs_info' not described in 'may_commit_transaction' fs/btrfs/space-info.c:575: warning: Function parameter or member 'space_info' not described in 'may_commit_transaction' fs/btrfs/space-info.c:1231: warning: Function parameter or member 'fs_info' not described in 'handle_reserve_ticket' fs/btrfs/space-info.c:1231: warning: Function parameter or member 'space_info' not described in 'handle_reserve_ticket' fs/btrfs/space-info.c:1231: warning: Function parameter or member 'ticket' not described in 'handle_reserve_ticket' fs/btrfs/space-info.c:1231: warning: Function parameter or member 'flush' not described in 'handle_reserve_ticket' fs/btrfs/space-info.c:1315: warning: Function parameter or member 'fs_info' not described in '__reserve_bytes' fs/btrfs/space-info.c:1315: warning: Function parameter or member 'space_info' not described in '__reserve_bytes' fs/btrfs/space-info.c:1315: warning: Function parameter or member 'orig_bytes' not described in '__reserve_bytes' fs/btrfs/space-info.c:1315: warning: Function parameter or member 'flush' not described in '__reserve_bytes' fs/btrfs/space-info.c:1427: warning: Function parameter or member 'root' not described in 'btrfs_reserve_metadata_bytes' fs/btrfs/space-info.c:1427: warning: Function parameter or member 'block_rsv' not described in 'btrfs_reserve_metadata_bytes' fs/btrfs/space-info.c:1427: warning: Function parameter or member 'orig_bytes' not described in 'btrfs_reserve_metadata_bytes' fs/btrfs/space-info.c:1427: warning: Function parameter or member 'flush' not described in 'btrfs_reserve_metadata_bytes' fs/btrfs/space-info.c:1462: warning: Function parameter or member 'fs_info' not described in 'btrfs_reserve_data_bytes' fs/btrfs/space-info.c:1462: warning: Function parameter or member 'bytes' not described in 'btrfs_reserve_data_bytes' fs/btrfs/space-info.c:1462: warning: Function parameter or member 'flush' not described in 'btrfs_reserve_data_bytes' Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 50 +++++++++++++++++++++++++++----------------------- 1 file changed, 27 insertions(+), 23 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 84fb94e78a8f..fd8e79e3c10e 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -561,10 +561,10 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, } /** - * maybe_commit_transaction - possibly commit the transaction if its ok to - * @root - the root we're allocating for - * @bytes - the number of bytes we want to reserve - * @force - force the commit + * Possibly commit the transaction if its ok to + * + * @fs_info: the filesystem + * @space_info: space_info we are checking for commit, either data or metadata * * This will check to make sure that committing the transaction will actually * get us somewhere and then commit the transaction if it does. Otherwise it @@ -1215,11 +1215,12 @@ static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, } /** - * handle_reserve_ticket - do the appropriate flushing and waiting for a ticket - * @fs_info - the fs - * @space_info - the space_info for the reservation - * @ticket - the ticket for the reservation - * @flush - how much we can flush + * Do the appropriate flushing and waiting for a ticket + * + * @fs_info: the filesystem + * @space_info: space info for the reservation + * @ticket: ticket for the reservation + * @flush: how much we can flush * * This does the work of figuring out how to flush for the ticket, waiting for * the reservation, and returning the appropriate error if there is one. @@ -1296,11 +1297,12 @@ static inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush) } /** - * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space - * @root - the root we're allocating for - * @space_info - the space info we want to allocate from - * @orig_bytes - the number of bytes we want - * @flush - whether or not we can flush to make our reservation + * Try to reserve bytes from the block_rsv's space + * + * @fs_info: the filesystem + * @space_info: space info we want to allocate from + * @orig_bytes: number of bytes we want + * @flush: whether or not we can flush to make our reservation * * This will reserve orig_bytes number of bytes from the space info associated * with the block_rsv. If there is not enough space it will make an attempt to @@ -1407,11 +1409,12 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, } /** - * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space - * @root - the root we're allocating for - * @block_rsv - the block_rsv we're allocating for - * @orig_bytes - the number of bytes we want - * @flush - whether or not we can flush to make our reservation + * Trye to reserve metadata bytes from the block_rsv's space + * + * @root: the root we're allocating for + * @block_rsv: block_rsv we're allocating for + * @orig_bytes: number of bytes we want + * @flush: whether or not we can flush to make our reservation * * This will reserve orig_bytes number of bytes from the space info associated * with the block_rsv. If there is not enough space it will make an attempt to @@ -1449,10 +1452,11 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root, } /** - * btrfs_reserve_data_bytes - try to reserve data bytes for an allocation - * @fs_info - the filesystem - * @bytes - the number of bytes we need - * @flush - how we are allowed to flush + * Try to reserve data bytes for an allocation + * + * @fs_info: the filesystem + * @bytes: number of bytes we need + * @flush: how we are allowed to flush * * This will reserve bytes from the data space info. If there is not enough * space then we will attempt to flush space as specified by flush. -- cgit v1.2.3-70-g09d2 From 3bed2da1b00f554e70d16f44db9357a7670d776c Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 22 Jan 2021 11:58:03 +0200 Subject: btrfs: fix parameter description for functions in extent_io.c This makes the file W=1 clean and fixes the following warnings: fs/btrfs/extent_io.c:414: warning: Function parameter or member 'tree' not described in '__etree_search' fs/btrfs/extent_io.c:414: warning: Function parameter or member 'offset' not described in '__etree_search' fs/btrfs/extent_io.c:414: warning: Function parameter or member 'next_ret' not described in '__etree_search' fs/btrfs/extent_io.c:414: warning: Function parameter or member 'prev_ret' not described in '__etree_search' fs/btrfs/extent_io.c:414: warning: Function parameter or member 'p_ret' not described in '__etree_search' fs/btrfs/extent_io.c:414: warning: Function parameter or member 'parent_ret' not described in '__etree_search' fs/btrfs/extent_io.c:1607: warning: Function parameter or member 'tree' not described in 'find_contiguous_extent_bit' fs/btrfs/extent_io.c:1607: warning: Function parameter or member 'start' not described in 'find_contiguous_extent_bit' fs/btrfs/extent_io.c:1607: warning: Function parameter or member 'start_ret' not described in 'find_contiguous_extent_bit' fs/btrfs/extent_io.c:1607: warning: Function parameter or member 'end_ret' not described in 'find_contiguous_extent_bit' fs/btrfs/extent_io.c:1607: warning: Function parameter or member 'bits' not described in 'find_contiguous_extent_bit' fs/btrfs/extent_io.c:1644: warning: Function parameter or member 'tree' not described in 'find_first_clear_extent_bit' fs/btrfs/extent_io.c:1644: warning: Function parameter or member 'start' not described in 'find_first_clear_extent_bit' fs/btrfs/extent_io.c:1644: warning: Function parameter or member 'start_ret' not described in 'find_first_clear_extent_bit' fs/btrfs/extent_io.c:1644: warning: Function parameter or member 'end_ret' not described in 'find_first_clear_extent_bit' fs/btrfs/extent_io.c:1644: warning: Function parameter or member 'bits' not described in 'find_first_clear_extent_bit' fs/btrfs/extent_io.c:4187: warning: Function parameter or member 'epd' not described in 'extent_write_cache_pages' fs/btrfs/extent_io.c:4187: warning: Excess function parameter 'data' description in 'extent_write_cache_pages' Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 52 +++++++++++++++++++++++++++------------------------- 1 file changed, 27 insertions(+), 25 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7f689ad7709c..2fa563da65bd 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -389,16 +389,16 @@ do_insert: } /** - * __etree_search - searche @tree for an entry that contains @offset. Such - * entry would have entry->start <= offset && entry->end >= offset. + * Search @tree for an entry that contains @offset. Such entry would have + * entry->start <= offset && entry->end >= offset. * - * @tree - the tree to search - * @offset - offset that should fall within an entry in @tree - * @next_ret - pointer to the first entry whose range ends after @offset - * @prev - pointer to the first entry whose range begins before @offset - * @p_ret - pointer where new node should be anchored (used when inserting an - * entry in the tree) - * @parent_ret - points to entry which would have been the parent of the entry, + * @tree: the tree to search + * @offset: offset that should fall within an entry in @tree + * @next_ret: pointer to the first entry whose range ends after @offset + * @prev_ret: pointer to the first entry whose range begins before @offset + * @p_ret: pointer where new node should be anchored (used when inserting an + * entry in the tree) + * @parent_ret: points to entry which would have been the parent of the entry, * containing @offset * * This function returns a pointer to the entry that contains @offset byte @@ -1588,12 +1588,13 @@ out: } /** - * find_contiguous_extent_bit: find a contiguous area of bits - * @tree - io tree to check - * @start - offset to start the search from - * @start_ret - the first offset we found with the bits set - * @end_ret - the final contiguous range of the bits that were set - * @bits - bits to look for + * Find a contiguous area of bits + * + * @tree: io tree to check + * @start: offset to start the search from + * @start_ret: the first offset we found with the bits set + * @end_ret: the final contiguous range of the bits that were set + * @bits: bits to look for * * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges * to set bits appropriately, and then merge them again. During this time it @@ -1625,14 +1626,14 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, } /** - * find_first_clear_extent_bit - find the first range that has @bits not set. - * This range could start before @start. + * Find the first range that has @bits not set. This range could start before + * @start. * - * @tree - the tree to search - * @start - the offset at/after which the found extent should start - * @start_ret - records the beginning of the range - * @end_ret - records the end of the range (inclusive) - * @bits - the set of bits which must be unset + * @tree: the tree to search + * @start: offset at/after which the found extent should start + * @start_ret: records the beginning of the range + * @end_ret: records the end of the range (inclusive) + * @bits: the set of bits which must be unset * * Since unallocated range is also considered one which doesn't have the bits * set it's possible that @end_ret contains -1, this happens in case the range @@ -4168,10 +4169,11 @@ retry: } /** - * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. + * Walk the list of dirty pages of the given address space and write all of them. + * * @mapping: address space structure to write - * @wbc: subtract the number of written pages from *@wbc->nr_to_write - * @data: data passed to __extent_writepage function + * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * @epd: holds context for the write, namely the bio * * If a page is already under I/O, write_cache_pages() skips it, even * if it's dirty. This is desirable behaviour for memory-cleaning writeback, -- cgit v1.2.3-70-g09d2 From 8c31a3dbaa356b1fce97bf55026410649e4dd0f1 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Sun, 24 Jan 2021 18:03:21 +0200 Subject: btrfs: zoned: remove unused variable in btrfs_sb_log_location_bdev MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes warning: fs/btrfs/zoned.c:491:6: warning: variable ‘zone_size’ set but not used [-Wunused-but-set-variable] 491 | u64 zone_size; which got introduced in 12659251ca5d ("btrfs: implement log-structured superblock for ZONED mode"). We'll enable the warning by default and want clean build until the relevant zoned patches land. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index c38846659019..41d27fefd306 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -488,7 +488,6 @@ int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw, unsigned int zone_sectors; u32 sb_zone; int ret; - u64 zone_size; u8 zone_sectors_shift; sector_t nr_sectors; u32 nr_zones; @@ -503,7 +502,6 @@ int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw, zone_sectors = bdev_zone_sectors(bdev); if (!is_power_of_2(zone_sectors)) return -EINVAL; - zone_size = zone_sectors << SECTOR_SHIFT; zone_sectors_shift = ilog2(zone_sectors); nr_sectors = bdev_nr_sectors(bdev); nr_zones = nr_sectors >> zone_sectors_shift; -- cgit v1.2.3-70-g09d2 From e9aa7c285d20a69ce1fb940ec846686780af9e56 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 22 Jan 2021 11:58:05 +0200 Subject: btrfs: enable W=1 checks for btrfs Now that the btrfs' codebase is clean of almost all W=1 warnings let's enable those checks unconditionally for the entire fs/btrfs/ and its subdirectories to catch potential errors during development. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba [ add some comments ] Signed-off-by: David Sterba --- fs/btrfs/Makefile | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 9f1b1a88e317..e45957319424 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -1,5 +1,21 @@ # SPDX-License-Identifier: GPL-2.0 +# Subset of W=1 warnings +subdir-ccflags-y += -Wextra -Wunused -Wno-unused-parameter +subdir-ccflags-y += -Wmissing-declarations +subdir-ccflags-y += -Wmissing-format-attribute +subdir-ccflags-y += -Wmissing-prototypes +subdir-ccflags-y += -Wold-style-definition +subdir-ccflags-y += -Wmissing-include-dirs +subdir-ccflags-y += $(call cc-option, -Wunused-but-set-variable) +subdir-ccflags-y += $(call cc-option, -Wunused-const-variable) +subdir-ccflags-y += $(call cc-option, -Wpacked-not-aligned) +subdir-ccflags-y += $(call cc-option, -Wstringop-truncation) +# The following turn off the warnings enabled by -Wextra +subdir-ccflags-y += -Wno-missing-field-initializers +subdir-ccflags-y += -Wno-sign-compare +subdir-ccflags-y += -Wno-type-limits + obj-$(CONFIG_BTRFS_FS) := btrfs.o btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ -- cgit v1.2.3-70-g09d2 From 2187374f35fe9cadbddaa9fcf0c4121365d914e8 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 15 Jan 2021 16:48:55 -0500 Subject: btrfs: handle space_info::total_bytes_pinned inside the delayed ref itself Currently we pass things around to figure out if we maybe freeing data based on the state of the delayed refs head. This makes the accounting sort of confusing and hard to follow, as it's distinctly separate from the delayed ref heads stuff, but also depends on it entirely. Fix this by explicitly adjusting the space_info->total_bytes_pinned in the delayed refs code. We now have two places where we modify this counter, once where we create the delayed and destroy the delayed refs, and once when we pin and unpin the extents. This means there is a slight overlap between delayed refs and the pin/unpin mechanisms, but this is simply used by the ENOSPC infrastructure to determine if we need to commit the transaction, so there's no adverse affect from this, we might simply commit thinking it will give us enough space when it might not. CC: stable@vger.kernel.org # 5.10 Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 10 ++---- fs/btrfs/delayed-ref.c | 51 +++++++++++++++----------- fs/btrfs/delayed-ref.h | 16 ++++++--- fs/btrfs/extent-tree.c | 97 +++++++------------------------------------------- fs/btrfs/space-info.h | 17 +++++++++ 5 files changed, 74 insertions(+), 117 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 2d7294d81616..763a3671b7af 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1371,9 +1371,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) btrfs_space_info_update_bytes_pinned(fs_info, space_info, -block_group->pinned); space_info->bytes_readonly += block_group->pinned; - percpu_counter_add_batch(&space_info->total_bytes_pinned, - -block_group->pinned, - BTRFS_TOTAL_BYTES_PINNED_BATCH); + __btrfs_mod_total_bytes_pinned(space_info, -block_group->pinned); block_group->pinned = 0; spin_unlock(&block_group->lock); @@ -2898,10 +2896,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - percpu_counter_add_batch( - &cache->space_info->total_bytes_pinned, - num_bytes, - BTRFS_TOTAL_BYTES_PINNED_BATCH); + __btrfs_mod_total_bytes_pinned(cache->space_info, + num_bytes); set_extent_dirty(&trans->transaction->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 88a1e27d2fc2..a540ace3e03a 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -651,12 +651,12 @@ inserted: */ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *existing, - struct btrfs_delayed_ref_head *update, - int *old_ref_mod_ret) + struct btrfs_delayed_ref_head *update) { struct btrfs_delayed_ref_root *delayed_refs = &trans->transaction->delayed_refs; struct btrfs_fs_info *fs_info = trans->fs_info; + u64 flags = btrfs_ref_head_to_space_flags(existing); int old_ref_mod; BUG_ON(existing->is_data != update->is_data); @@ -704,8 +704,6 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, * currently, for refs we just added we know we're a-ok. */ old_ref_mod = existing->total_ref_mod; - if (old_ref_mod_ret) - *old_ref_mod_ret = old_ref_mod; existing->ref_mod += update->ref_mod; existing->total_ref_mod += update->ref_mod; @@ -727,6 +725,22 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, trans->delayed_ref_updates += csum_leaves; } } + + /* + * This handles the following conditions: + * + * 1. We had a ref mod of 0 or more and went negative, indicating that + * we may be freeing space, so add our space to the + * total_bytes_pinned counter. + * 2. We were negative and went to 0 or positive, so no longer can say + * that the space would be pinned, decrement our counter from the + * total_bytes_pinned counter. + */ + if (existing->total_ref_mod < 0 && old_ref_mod >= 0) + btrfs_mod_total_bytes_pinned(fs_info, flags, existing->num_bytes); + else if (existing->total_ref_mod >= 0 && old_ref_mod < 0) + btrfs_mod_total_bytes_pinned(fs_info, flags, -existing->num_bytes); + spin_unlock(&existing->lock); } @@ -801,8 +815,7 @@ static noinline struct btrfs_delayed_ref_head * add_delayed_ref_head(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head_ref, struct btrfs_qgroup_extent_record *qrecord, - int action, int *qrecord_inserted_ret, - int *old_ref_mod, int *new_ref_mod) + int action, int *qrecord_inserted_ret) { struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_root *delayed_refs; @@ -824,8 +837,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, existing = htree_insert(&delayed_refs->href_root, &head_ref->href_node); if (existing) { - update_existing_head_ref(trans, existing, head_ref, - old_ref_mod); + update_existing_head_ref(trans, existing, head_ref); /* * we've updated the existing ref, free the newly * allocated ref @@ -833,14 +845,17 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); head_ref = existing; } else { - if (old_ref_mod) - *old_ref_mod = 0; + u64 flags = btrfs_ref_head_to_space_flags(head_ref); + if (head_ref->is_data && head_ref->ref_mod < 0) { delayed_refs->pending_csums += head_ref->num_bytes; trans->delayed_ref_updates += btrfs_csum_bytes_to_leaves(trans->fs_info, head_ref->num_bytes); } + if (head_ref->ref_mod < 0) + btrfs_mod_total_bytes_pinned(trans->fs_info, flags, + head_ref->num_bytes); delayed_refs->num_heads++; delayed_refs->num_heads_ready++; atomic_inc(&delayed_refs->num_entries); @@ -848,8 +863,6 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, } if (qrecord_inserted_ret) *qrecord_inserted_ret = qrecord_inserted; - if (new_ref_mod) - *new_ref_mod = head_ref->total_ref_mod; return head_ref; } @@ -912,8 +925,7 @@ static void init_delayed_ref_common(struct btrfs_fs_info *fs_info, */ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref, - struct btrfs_delayed_extent_op *extent_op, - int *old_ref_mod, int *new_ref_mod) + struct btrfs_delayed_extent_op *extent_op) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_tree_ref *ref; @@ -980,8 +992,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, * the spin lock */ head_ref = add_delayed_ref_head(trans, head_ref, record, - action, &qrecord_inserted, - old_ref_mod, new_ref_mod); + action, &qrecord_inserted); ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); @@ -1009,8 +1020,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, */ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref, - u64 reserved, int *old_ref_mod, - int *new_ref_mod) + u64 reserved) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_data_ref *ref; @@ -1076,8 +1086,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, * the spin lock */ head_ref = add_delayed_ref_head(trans, head_ref, record, - action, &qrecord_inserted, - old_ref_mod, new_ref_mod); + action, &qrecord_inserted); ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); @@ -1120,7 +1129,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, spin_lock(&delayed_refs->lock); add_delayed_ref_head(trans, head_ref, NULL, BTRFS_UPDATE_DELAYED_HEAD, - NULL, NULL, NULL); + NULL); spin_unlock(&delayed_refs->lock); diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 1c977e6d45dc..3ba140468f12 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -326,6 +326,16 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) } } +static inline u64 btrfs_ref_head_to_space_flags( + struct btrfs_delayed_ref_head *head_ref) +{ + if (head_ref->is_data) + return BTRFS_BLOCK_GROUP_DATA; + else if (head_ref->is_system) + return BTRFS_BLOCK_GROUP_SYSTEM; + return BTRFS_BLOCK_GROUP_METADATA; +} + static inline void btrfs_put_delayed_ref_head(struct btrfs_delayed_ref_head *head) { if (refcount_dec_and_test(&head->refs)) @@ -334,12 +344,10 @@ static inline void btrfs_put_delayed_ref_head(struct btrfs_delayed_ref_head *hea int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref, - struct btrfs_delayed_extent_op *extent_op, - int *old_ref_mod, int *new_ref_mod); + struct btrfs_delayed_extent_op *extent_op); int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref, - u64 reserved, int *old_ref_mod, - int *new_ref_mod); + u64 reserved); int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, struct btrfs_delayed_extent_op *extent_op); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0c335dae5af7..2f591036ffc1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -82,41 +82,6 @@ void btrfs_free_excluded_extents(struct btrfs_block_group *cache) EXTENT_UPTODATE); } -static u64 generic_ref_to_space_flags(struct btrfs_ref *ref) -{ - if (ref->type == BTRFS_REF_METADATA) { - if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID) - return BTRFS_BLOCK_GROUP_SYSTEM; - else - return BTRFS_BLOCK_GROUP_METADATA; - } - return BTRFS_BLOCK_GROUP_DATA; -} - -static void add_pinned_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_ref *ref) -{ - struct btrfs_space_info *space_info; - u64 flags = generic_ref_to_space_flags(ref); - - space_info = btrfs_find_space_info(fs_info, flags); - ASSERT(space_info); - percpu_counter_add_batch(&space_info->total_bytes_pinned, ref->len, - BTRFS_TOTAL_BYTES_PINNED_BATCH); -} - -static void sub_pinned_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_ref *ref) -{ - struct btrfs_space_info *space_info; - u64 flags = generic_ref_to_space_flags(ref); - - space_info = btrfs_find_space_info(fs_info, flags); - ASSERT(space_info); - percpu_counter_add_batch(&space_info->total_bytes_pinned, -ref->len, - BTRFS_TOTAL_BYTES_PINNED_BATCH); -} - /* simple helper to search for an existing data extent at a given offset */ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) { @@ -1388,7 +1353,6 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref) { struct btrfs_fs_info *fs_info = trans->fs_info; - int old_ref_mod, new_ref_mod; int ret; ASSERT(generic_ref->type != BTRFS_REF_NOT_SET && @@ -1397,17 +1361,12 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID); if (generic_ref->type == BTRFS_REF_METADATA) - ret = btrfs_add_delayed_tree_ref(trans, generic_ref, - NULL, &old_ref_mod, &new_ref_mod); + ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL); else - ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0, - &old_ref_mod, &new_ref_mod); + ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0); btrfs_ref_tree_mod(fs_info, generic_ref); - if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) - sub_pinned_bytes(fs_info, generic_ref); - return ret; } @@ -1796,20 +1755,9 @@ void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, int nr_items = 1; /* Dropping this ref head update. */ if (head->total_ref_mod < 0) { - struct btrfs_space_info *space_info; - u64 flags; + u64 flags = btrfs_ref_head_to_space_flags(head); - if (head->is_data) - flags = BTRFS_BLOCK_GROUP_DATA; - else if (head->is_system) - flags = BTRFS_BLOCK_GROUP_SYSTEM; - else - flags = BTRFS_BLOCK_GROUP_METADATA; - space_info = btrfs_find_space_info(fs_info, flags); - ASSERT(space_info); - percpu_counter_add_batch(&space_info->total_bytes_pinned, - -head->num_bytes, - BTRFS_TOTAL_BYTES_PINNED_BATCH); + btrfs_mod_total_bytes_pinned(fs_info, flags, -head->num_bytes); /* * We had csum deletions accounted for in our delayed refs rsv, @@ -2572,8 +2520,7 @@ static int pin_down_extent(struct btrfs_trans_handle *trans, spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - percpu_counter_add_batch(&cache->space_info->total_bytes_pinned, - num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH); + __btrfs_mod_total_bytes_pinned(cache->space_info, num_bytes); set_extent_dirty(&trans->transaction->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); return 0; @@ -2784,8 +2731,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, cache->pinned -= len; btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len); space_info->max_extent_size = 0; - percpu_counter_add_batch(&space_info->total_bytes_pinned, - -len, BTRFS_TOTAL_BYTES_PINNED_BATCH); + __btrfs_mod_total_bytes_pinned(space_info, -len); if (cache->ro) { space_info->bytes_readonly += len; readonly = true; @@ -3318,7 +3264,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_ref generic_ref = { 0 }; - int pin = 1; int ret; btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF, @@ -3327,13 +3272,9 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, root->root_key.objectid); if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { - int old_ref_mod, new_ref_mod; - btrfs_ref_tree_mod(fs_info, &generic_ref); - ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL, - &old_ref_mod, &new_ref_mod); + ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL); BUG_ON(ret); /* -ENOMEM */ - pin = old_ref_mod >= 0 && new_ref_mod < 0; } if (last_ref && btrfs_header_generation(buf) == trans->transid) { @@ -3345,7 +3286,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, goto out; } - pin = 0; cache = btrfs_lookup_block_group(fs_info, buf->start); if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { @@ -3362,9 +3302,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); } out: - if (pin) - add_pinned_bytes(fs_info, &generic_ref); - if (last_ref) { /* * Deleting the buffer, clear the corrupt flag since it doesn't @@ -3378,7 +3315,6 @@ out: int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) { struct btrfs_fs_info *fs_info = trans->fs_info; - int old_ref_mod, new_ref_mod; int ret; if (btrfs_is_testing(fs_info)) @@ -3394,14 +3330,11 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) { /* unlocks the pinned mutex */ btrfs_pin_extent(trans, ref->bytenr, ref->len, 1); - old_ref_mod = new_ref_mod = 0; ret = 0; } else if (ref->type == BTRFS_REF_METADATA) { - ret = btrfs_add_delayed_tree_ref(trans, ref, NULL, - &old_ref_mod, &new_ref_mod); + ret = btrfs_add_delayed_tree_ref(trans, ref, NULL); } else { - ret = btrfs_add_delayed_data_ref(trans, ref, 0, - &old_ref_mod, &new_ref_mod); + ret = btrfs_add_delayed_data_ref(trans, ref, 0); } if (!((ref->type == BTRFS_REF_METADATA && @@ -3410,9 +3343,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID))) btrfs_ref_tree_mod(fs_info, ref); - if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) - add_pinned_bytes(fs_info, ref); - return ret; } @@ -4528,7 +4458,6 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_key *ins) { struct btrfs_ref generic_ref = { 0 }; - int ret; BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); @@ -4536,9 +4465,8 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, ins->objectid, ins->offset, 0); btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset); btrfs_ref_tree_mod(root->fs_info, &generic_ref); - ret = btrfs_add_delayed_data_ref(trans, &generic_ref, - ram_bytes, NULL, NULL); - return ret; + + return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes); } /* @@ -4730,8 +4658,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, generic_ref.real_root = root->root_key.objectid; btrfs_init_tree_ref(&generic_ref, level, root_objectid); btrfs_ref_tree_mod(fs_info, &generic_ref); - ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, - extent_op, NULL, NULL); + ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op); if (ret) goto out_free_delayed; } diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 5646393b928c..74706f604bce 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -152,4 +152,21 @@ static inline void btrfs_space_info_free_bytes_may_use( int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, enum btrfs_reserve_flush_enum flush); +static inline void __btrfs_mod_total_bytes_pinned( + struct btrfs_space_info *space_info, + s64 mod) +{ + percpu_counter_add_batch(&space_info->total_bytes_pinned, mod, + BTRFS_TOTAL_BYTES_PINNED_BATCH); +} + +static inline void btrfs_mod_total_bytes_pinned(struct btrfs_fs_info *fs_info, + u64 flags, s64 mod) +{ + struct btrfs_space_info *space_info = btrfs_find_space_info(fs_info, flags); + + ASSERT(space_info); + __btrfs_mod_total_bytes_pinned(space_info, mod); +} + #endif /* BTRFS_SPACE_INFO_H */ -- cgit v1.2.3-70-g09d2 From 81e75ac74ecba929d1e922bf93f9fc467232e39f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 15 Jan 2021 16:48:56 -0500 Subject: btrfs: account for new extents being deleted in total_bytes_pinned My recent patch set "A variety of lock contention fixes", found here https://lore.kernel.org/linux-btrfs/cover.1608319304.git.josef@toxicpanda.com/ (Tracked in https://github.com/btrfs/linux/issues/86) that reduce lock contention on the extent root by running delayed refs less often resulted in a regression in generic/371. This test fallocate()'s the fs until it's full, deletes all the files, and then tries to fallocate() until full again. Before these patches we would run all of the delayed refs during flushing, and then would commit the transaction because we had plenty of pinned space to recover in order to allocate. However my patches made it so we weren't running the delayed refs as aggressively, which meant that we appeared to have less pinned space when we were deciding to commit the transaction. We use the space_info->total_bytes_pinned to approximate how much space we have pinned. It's approximate because if we remove a reference to an extent we may free it, but there may be more references to it than we know of at that point, but we account it as pinned at the creation time, and then it's properly accounted when the delayed ref runs. The way we account for pinned space is if the delayed_ref_head->total_ref_mod is < 0, because that is clearly a freeing option. However there is another case, and that is where ->total_ref_mod == 0 && ->must_insert_reserved == 1. When we allocate a new extent, we have ->total_ref_mod == 1 and we have ->must_insert_reserved == 1. This is used to indicate that it is a brand new extent and will need to have its extent entry added before we modify any references on the delayed ref head. But if we subsequently remove that extent reference, our ->total_ref_mod will be 0, and that space will be pinned and freed. Accounting for this case properly allows for generic/371 to pass with my delayed refs patches applied. It's important to note that this problem exists without the referenced patches, it just was uncovered by them. CC: stable@vger.kernel.org # 5.10 Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 5 +++++ fs/btrfs/extent-tree.c | 33 +++++++++++++++++++-------------- 2 files changed, 24 insertions(+), 14 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index a540ace3e03a..63be7d01a9a3 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -735,11 +735,16 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, * 2. We were negative and went to 0 or positive, so no longer can say * that the space would be pinned, decrement our counter from the * total_bytes_pinned counter. + * 3. We are now at 0 and have ->must_insert_reserved set, which means + * this was a new allocation and then we dropped it, and thus must + * add our space to the total_bytes_pinned counter. */ if (existing->total_ref_mod < 0 && old_ref_mod >= 0) btrfs_mod_total_bytes_pinned(fs_info, flags, existing->num_bytes); else if (existing->total_ref_mod >= 0 && old_ref_mod < 0) btrfs_mod_total_bytes_pinned(fs_info, flags, -existing->num_bytes); + else if (existing->total_ref_mod == 0 && existing->must_insert_reserved) + btrfs_mod_total_bytes_pinned(fs_info, flags, existing->num_bytes); spin_unlock(&existing->lock); } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2f591036ffc1..6f0c59debc2b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1754,23 +1754,28 @@ void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, { int nr_items = 1; /* Dropping this ref head update. */ - if (head->total_ref_mod < 0) { + /* + * We had csum deletions accounted for in our delayed refs rsv, we need + * to drop the csum leaves for this update from our delayed_refs_rsv. + */ + if (head->total_ref_mod < 0 && head->is_data) { + spin_lock(&delayed_refs->lock); + delayed_refs->pending_csums -= head->num_bytes; + spin_unlock(&delayed_refs->lock); + nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes); + } + + /* + * We were dropping refs, or had a new ref and dropped it, and thus must + * adjust down our total_bytes_pinned, the space may or may not have + * been pinned and so is accounted for properly in the pinned space by + * now. + */ + if (head->total_ref_mod < 0 || + (head->total_ref_mod == 0 && head->must_insert_reserved)) { u64 flags = btrfs_ref_head_to_space_flags(head); btrfs_mod_total_bytes_pinned(fs_info, flags, -head->num_bytes); - - /* - * We had csum deletions accounted for in our delayed refs rsv, - * we need to drop the csum leaves for this update from our - * delayed_refs_rsv. - */ - if (head->is_data) { - spin_lock(&delayed_refs->lock); - delayed_refs->pending_csums -= head->num_bytes; - spin_unlock(&delayed_refs->lock); - nr_items += btrfs_csum_bytes_to_leaves(fs_info, - head->num_bytes); - } } btrfs_delayed_refs_rsv_release(fs_info, nr_items); -- cgit v1.2.3-70-g09d2 From 2e626e5673c2a3b4ce8200b961e28edd613ab6a9 Mon Sep 17 00:00:00 2001 From: Nigel Christian Date: Sun, 24 Jan 2021 20:41:41 -0500 Subject: btrfs: remove repeated word in struct member comment Comment for processed extent end of range has an unnecessary "in", remove it. Signed-off-by: Nigel Christian Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 2fa563da65bd..edcdbd739a1e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2776,7 +2776,7 @@ struct processed_extent { struct btrfs_inode *inode; /* Start of the range in @inode */ u64 start; - /* End of the range in in @inode */ + /* End of the range in @inode */ u64 end; bool uptodate; }; -- cgit v1.2.3-70-g09d2 From c78a10aebb275c38d0cfccae129a803fe622e305 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 14 Jan 2021 14:02:42 -0500 Subject: btrfs: fix reloc root leak with 0 ref reloc roots on recovery When recovering a relocation, if we run into a reloc root that has 0 refs we simply add it to the reloc_control->reloc_roots list, and then clean it up later. The problem with this is __del_reloc_root() doesn't do anything if the root isn't in the radix tree, which in this case it won't be because we never call __add_reloc_root() on the reloc_root. This exit condition simply isn't correct really. During normal operation we can remove ourselves from the rb tree and then we're meant to clean up later at merge_reloc_roots() time, and this happens correctly. During recovery we're depending on free_reloc_roots() to drop our references, but we're short-circuiting. Fix this by continuing to check if we're on the list and dropping ourselves from the reloc_control root list and dropping our reference appropriately. Change the corresponding BUG_ON() to an ASSERT() that does the correct thing if we aren't in the rb tree. CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 9f2289bcdde6..d29baf3822a7 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -669,9 +669,7 @@ static void __del_reloc_root(struct btrfs_root *root) RB_CLEAR_NODE(&node->rb_node); } spin_unlock(&rc->reloc_root_tree.lock); - if (!node) - return; - BUG_ON((struct btrfs_root *)node->data != root); + ASSERT(!node || (struct btrfs_root *)node->data == root); } /* -- cgit v1.2.3-70-g09d2 From 938fcbfb0cbcf532a1869efab58e6009446b1ced Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 14 Jan 2021 14:02:43 -0500 Subject: btrfs: splice remaining dirty_bg's onto the transaction dirty bg list While doing error injection testing with my relocation patches I hit the following assert: assertion failed: list_empty(&block_group->dirty_list), in fs/btrfs/block-group.c:3356 ------------[ cut here ]------------ kernel BUG at fs/btrfs/ctree.h:3357! invalid opcode: 0000 [#1] SMP NOPTI CPU: 0 PID: 24351 Comm: umount Tainted: G W 5.10.0-rc3+ #193 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014 RIP: 0010:assertfail.constprop.0+0x18/0x1a RSP: 0018:ffffa09b019c7e00 EFLAGS: 00010282 RAX: 0000000000000056 RBX: ffff8f6492c18000 RCX: 0000000000000000 RDX: ffff8f64fbc27c60 RSI: ffff8f64fbc19050 RDI: ffff8f64fbc19050 RBP: ffff8f6483bbdc00 R08: 0000000000000000 R09: 0000000000000000 R10: ffffa09b019c7c38 R11: ffffffff85d70928 R12: ffff8f6492c18100 R13: ffff8f6492c18148 R14: ffff8f6483bbdd70 R15: dead000000000100 FS: 00007fbfda4cdc40(0000) GS:ffff8f64fbc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fbfda666fd0 CR3: 000000013cf66002 CR4: 0000000000370ef0 Call Trace: btrfs_free_block_groups.cold+0x55/0x55 close_ctree+0x2c5/0x306 ? fsnotify_destroy_marks+0x14/0x100 generic_shutdown_super+0x6c/0x100 kill_anon_super+0x14/0x30 btrfs_kill_super+0x12/0x20 deactivate_locked_super+0x36/0xa0 cleanup_mnt+0x12d/0x190 task_work_run+0x5c/0xa0 exit_to_user_mode_prepare+0x1b1/0x1d0 syscall_exit_to_user_mode+0x54/0x280 entry_SYSCALL_64_after_hwframe+0x44/0xa9 This happened because I injected an error in btrfs_cow_block() while running the dirty block groups. When we run the dirty block groups, we splice the list onto a local list to process. However if an error occurs, we only cleanup the transactions dirty block group list, not any pending block groups we have on our locally spliced list. In fact if we fail to allocate a path in this function we'll also fail to clean up the splice list. Fix this by splicing the list back onto the transaction dirty block group list so that the block groups are cleaned up. Then add a 'out' label and have the error conditions jump to out so that the errors are handled properly. This also has the side-effect of fixing a problem where we would clear 'ret' on error because we unconditionally ran btrfs_run_delayed_refs(). CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 763a3671b7af..dda495b2a862 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2564,8 +2564,10 @@ again: if (!path) { path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + if (!path) { + ret = -ENOMEM; + goto out; + } } /* @@ -2659,16 +2661,14 @@ again: btrfs_put_block_group(cache); if (drop_reserve) btrfs_delayed_refs_rsv_release(fs_info, 1); - - if (ret) - break; - /* * Avoid blocking other tasks for too long. It might even save * us from writing caches for block groups that are going to be * removed. */ mutex_unlock(&trans->transaction->cache_write_mutex); + if (ret) + goto out; mutex_lock(&trans->transaction->cache_write_mutex); } mutex_unlock(&trans->transaction->cache_write_mutex); @@ -2692,7 +2692,12 @@ again: goto again; } spin_unlock(&cur_trans->dirty_bgs_lock); - } else if (ret < 0) { + } +out: + if (ret < 0) { + spin_lock(&cur_trans->dirty_bgs_lock); + list_splice_init(&dirty, &cur_trans->dirty_bgs); + spin_unlock(&cur_trans->dirty_bgs_lock); btrfs_cleanup_dirty_bgs(cur_trans, fs_info); } -- cgit v1.2.3-70-g09d2 From f78743fbdae1bb31bc9c9233c3590a5048782381 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 14 Jan 2021 14:02:44 -0500 Subject: btrfs: do not warn if we can't find the reloc root when looking up backref The backref code is looking for a reloc_root that corresponds to the given fs root. However any number of things could have gone wrong while initializing that reloc_root, like ENOMEM while trying to allocate the root itself, or EIO while trying to write the root item. This would result in no corresponding reloc_root being in the reloc root cache, and thus would return NULL when we do the find_reloc_root() call. Because of this we do not want to WARN_ON(). This presumably was meant to catch developer errors, cases where we messed up adding the reloc root. However we can easily hit this case with error injection, and thus should not do a WARN_ON(). CC: stable@vger.kernel.org # 5.10+ Reported-by: Zygo Blaxell Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/backref.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 701124c3e0b1..f47c1528eb9a 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -2623,7 +2623,7 @@ static int handle_direct_tree_backref(struct btrfs_backref_cache *cache, /* Only reloc backref cache cares about a specific root */ if (cache->is_reloc) { root = find_reloc_root(cache->fs_info, cur->bytenr); - if (WARN_ON(!root)) + if (!root) return -ENOENT; cur->root = root; } else { -- cgit v1.2.3-70-g09d2 From eddda68d97732ce05ca145f8e85e8a447f65cdad Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 14 Jan 2021 14:02:45 -0500 Subject: btrfs: add asserts for deleting backref cache nodes A weird KASAN problem that Zygo reported could have been easily caught if we checked for basic things in our backref freeing code. We have two methods of freeing a backref node - btrfs_backref_free_node: this just is kfree() essentially. - btrfs_backref_drop_node: this actually unlinks the node and cleans up everything and then calls btrfs_backref_free_node(). We should mostly be using btrfs_backref_drop_node(), to make sure the node is properly unlinked from the backref cache, and only use btrfs_backref_free_node() when we know the node isn't actually linked to the backref cache. We made a mistake here and thus got the KASAN splat. Make this style of issue easier to find by adding some ASSERT()'s to btrfs_backref_free_node() and adjusting our deletion stuff to properly init the list so we can rely on list_empty() checks working properly. BUG: KASAN: use-after-free in btrfs_backref_cleanup_node+0x18a/0x420 Read of size 8 at addr ffff888112402950 by task btrfs/28836 CPU: 0 PID: 28836 Comm: btrfs Tainted: G W 5.10.0-e35f27394290-for-next+ #23 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014 Call Trace: dump_stack+0xbc/0xf9 ? btrfs_backref_cleanup_node+0x18a/0x420 print_address_description.constprop.8+0x21/0x210 ? record_print_text.cold.34+0x11/0x11 ? btrfs_backref_cleanup_node+0x18a/0x420 ? btrfs_backref_cleanup_node+0x18a/0x420 kasan_report.cold.10+0x20/0x37 ? btrfs_backref_cleanup_node+0x18a/0x420 __asan_load8+0x69/0x90 btrfs_backref_cleanup_node+0x18a/0x420 btrfs_backref_release_cache+0x83/0x1b0 relocate_block_group+0x394/0x780 ? merge_reloc_roots+0x4a0/0x4a0 btrfs_relocate_block_group+0x26e/0x4c0 btrfs_relocate_chunk+0x52/0x120 btrfs_balance+0xe2e/0x1900 ? check_flags.part.50+0x6c/0x1e0 ? btrfs_relocate_chunk+0x120/0x120 ? kmem_cache_alloc_trace+0xa06/0xcb0 ? _copy_from_user+0x83/0xc0 btrfs_ioctl_balance+0x3a7/0x460 btrfs_ioctl+0x24c8/0x4360 ? __kasan_check_read+0x11/0x20 ? check_chain_key+0x1f4/0x2f0 ? __asan_loadN+0xf/0x20 ? btrfs_ioctl_get_supported_features+0x30/0x30 ? kvm_sched_clock_read+0x18/0x30 ? check_chain_key+0x1f4/0x2f0 ? lock_downgrade+0x3f0/0x3f0 ? handle_mm_fault+0xad6/0x2150 ? do_vfs_ioctl+0xfc/0x9d0 ? ioctl_file_clone+0xe0/0xe0 ? check_flags.part.50+0x6c/0x1e0 ? check_flags.part.50+0x6c/0x1e0 ? check_flags+0x26/0x30 ? lock_is_held_type+0xc3/0xf0 ? syscall_enter_from_user_mode+0x1b/0x60 ? do_syscall_64+0x13/0x80 ? rcu_read_lock_sched_held+0xa1/0xd0 ? __kasan_check_read+0x11/0x20 ? __fget_light+0xae/0x110 __x64_sys_ioctl+0xc3/0x100 do_syscall_64+0x37/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f4c4bdfe427 RSP: 002b:00007fff33ee6df8 EFLAGS: 00000202 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00007fff33ee6e98 RCX: 00007f4c4bdfe427 RDX: 00007fff33ee6e98 RSI: 00000000c4009420 RDI: 0000000000000003 RBP: 0000000000000003 R08: 0000000000000003 R09: 0000000000000078 R10: fffffffffffff59d R11: 0000000000000202 R12: 0000000000000001 R13: 0000000000000000 R14: 00007fff33ee8a34 R15: 0000000000000001 Allocated by task 28836: kasan_save_stack+0x21/0x50 __kasan_kmalloc.constprop.18+0xbe/0xd0 kasan_kmalloc+0x9/0x10 kmem_cache_alloc_trace+0x410/0xcb0 btrfs_backref_alloc_node+0x46/0xf0 btrfs_backref_add_tree_node+0x60d/0x11d0 build_backref_tree+0xc5/0x700 relocate_tree_blocks+0x2be/0xb90 relocate_block_group+0x2eb/0x780 btrfs_relocate_block_group+0x26e/0x4c0 btrfs_relocate_chunk+0x52/0x120 btrfs_balance+0xe2e/0x1900 btrfs_ioctl_balance+0x3a7/0x460 btrfs_ioctl+0x24c8/0x4360 __x64_sys_ioctl+0xc3/0x100 do_syscall_64+0x37/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Freed by task 28836: kasan_save_stack+0x21/0x50 kasan_set_track+0x20/0x30 kasan_set_free_info+0x1f/0x30 __kasan_slab_free+0xf3/0x140 kasan_slab_free+0xe/0x10 kfree+0xde/0x200 btrfs_backref_error_cleanup+0x452/0x530 build_backref_tree+0x1a5/0x700 relocate_tree_blocks+0x2be/0xb90 relocate_block_group+0x2eb/0x780 btrfs_relocate_block_group+0x26e/0x4c0 btrfs_relocate_chunk+0x52/0x120 btrfs_balance+0xe2e/0x1900 btrfs_ioctl_balance+0x3a7/0x460 btrfs_ioctl+0x24c8/0x4360 __x64_sys_ioctl+0xc3/0x100 do_syscall_64+0x37/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 The buggy address belongs to the object at ffff888112402900 which belongs to the cache kmalloc-128 of size 128 The buggy address is located 80 bytes inside of 128-byte region [ffff888112402900, ffff888112402980) The buggy address belongs to the page: page:0000000028b1cd08 refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff888131c810c0 pfn:0x112402 flags: 0x17ffe0000000200(slab) raw: 017ffe0000000200 ffffea000424f308 ffffea0007d572c8 ffff888100040440 raw: ffff888131c810c0 ffff888112402000 0000000100000009 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff888112402800: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff888112402880: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc >ffff888112402900: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff888112402980: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff888112402a00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb Link: https://lore.kernel.org/linux-btrfs/20201208194607.GI31381@hungrycats.org/ CC: stable@vger.kernel.org # 5.10+ Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/backref.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index ff705cc564a9..17abde7f794c 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -296,6 +296,9 @@ static inline void btrfs_backref_free_node(struct btrfs_backref_cache *cache, struct btrfs_backref_node *node) { if (node) { + ASSERT(list_empty(&node->list)); + ASSERT(list_empty(&node->lower)); + ASSERT(node->eb == NULL); cache->nr_nodes--; btrfs_put_root(node->root); kfree(node); @@ -340,11 +343,11 @@ static inline void btrfs_backref_drop_node_buffer( static inline void btrfs_backref_drop_node(struct btrfs_backref_cache *tree, struct btrfs_backref_node *node) { - BUG_ON(!list_empty(&node->upper)); + ASSERT(list_empty(&node->upper)); btrfs_backref_drop_node_buffer(node); - list_del(&node->list); - list_del(&node->lower); + list_del_init(&node->list); + list_del_init(&node->lower); if (!RB_EMPTY_NODE(&node->rb_node)) rb_erase(&node->rb_node, &tree->rb_root); btrfs_backref_free_node(tree, node); -- cgit v1.2.3-70-g09d2 From 867ed321f90d06aaba84e2c91de51cd3038825ef Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 14 Jan 2021 14:02:46 -0500 Subject: btrfs: abort the transaction if we fail to inc ref in btrfs_copy_root While testing my error handling patches, I added a error injection site at btrfs_inc_extent_ref, to validate the error handling I added was doing the correct thing. However I hit a pretty ugly corruption while doing this check, with the following error injection stack trace: btrfs_inc_extent_ref btrfs_copy_root create_reloc_root btrfs_init_reloc_root btrfs_record_root_in_trans btrfs_start_transaction btrfs_update_inode btrfs_update_time touch_atime file_accessed btrfs_file_mmap This is because we do not catch the error from btrfs_inc_extent_ref, which in practice would be ENOMEM, which means we lose the extent references for a root that has already been allocated and inserted, which is the problem. Fix this by aborting the transaction if we fail to do the reference modification. CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 56e132d825a2..95d9bae764ab 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -221,9 +221,10 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, ret = btrfs_inc_ref(trans, root, cow, 1); else ret = btrfs_inc_ref(trans, root, cow, 0); - - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); return ret; + } btrfs_mark_buffer_dirty(cow); *cow_ret = cow; -- cgit v1.2.3-70-g09d2 From ddfd08cb0484e491cae47a76ead051a168a0e644 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 18 Dec 2020 14:24:19 -0500 Subject: btrfs: do not block on deleted bgs mutex in the cleaner While running some stress tests I started getting hung task messages. This is because the delete unused block groups code has to take the delete_unused_bgs_mutex to do it's work, which is taken by balance to make sure we don't delete block groups while we're balancing. The problem is that balance can take a while, and so we were getting hung task warnings. We don't need to block and run these things, and the cleaner is needed to do other work, so trylock on this mutex and just bail if we can't acquire it right away. Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index dda495b2a862..5fa6b3d540f4 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1262,6 +1262,13 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) return; + /* + * Long running balances can keep us blocked here for eternity, so + * simply skip deletion if we're unable to get the mutex. + */ + if (!mutex_trylock(&fs_info->delete_unused_bgs_mutex)) + return; + spin_lock(&fs_info->unused_bgs_lock); while (!list_empty(&fs_info->unused_bgs)) { int trimming; @@ -1281,8 +1288,6 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group); - mutex_lock(&fs_info->delete_unused_bgs_mutex); - /* Don't want to race with allocators so take the groups_sem */ down_write(&space_info->groups_sem); @@ -1426,11 +1431,11 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) end_trans: btrfs_end_transaction(trans); next: - mutex_unlock(&fs_info->delete_unused_bgs_mutex); btrfs_put_block_group(block_group); spin_lock(&fs_info->unused_bgs_lock); } spin_unlock(&fs_info->unused_bgs_lock); + mutex_unlock(&fs_info->delete_unused_bgs_mutex); return; flip_async: -- cgit v1.2.3-70-g09d2 From e19eb11f4f3d3b0463cd897016064a79cb6d8c6d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 18 Dec 2020 14:24:20 -0500 Subject: btrfs: only let one thread pre-flush delayed refs in commit I've been running a stress test that runs 20 workers in their own subvolume, which are running an fsstress instance with 4 threads per worker, which is 80 total fsstress threads. In addition to this I'm running balance in the background as well as creating and deleting snapshots. This test takes around 12 hours to run normally, going slower and slower as the test goes on. The reason for this is because fsstress is running fsync sometimes, and because we're messing with block groups we often fall through to btrfs_commit_transaction, so will often have 20-30 threads all calling btrfs_commit_transaction at the same time. These all get stuck contending on the extent tree while they try to run delayed refs during the initial part of the commit. This is suboptimal, really because the extent tree is a single point of failure we only want one thread acting on that tree at once to reduce lock contention. Fix this by making the flushing mechanism a bit operation, to make it easy to use test_and_set_bit() in order to make sure only one task does this initial flush. Once we're into the transaction commit we only have one thread doing delayed ref running, it's just this initial pre-flush that is problematic. With this patch my stress test takes around 90 minutes to run, instead of 12 hours. The memory barrier is not necessary for the flushing bit as it's ordered, unlike plain int. The transaction state accessed in btrfs_should_end_transaction could be affected by that too as it's not always used under transaction lock. Upon Nikolay's analysis in [1] it's not necessary: In should_end_transaction it's read without holding any locks. (U) It's modified in btrfs_cleanup_transaction without holding the fs_info->trans_lock (U), but the STATE_ERROR flag is going to be set. set in cleanup_transaction under fs_info->trans_lock (L) set in btrfs_commit_trans to COMMIT_START under fs_info->trans_lock.(L) set in btrfs_commit_trans to COMMIT_DOING under fs_info->trans_lock.(L) set in btrfs_commit_trans to COMMIT_UNBLOCK under fs_info->trans_lock.(L) set in btrfs_commit_trans to COMMIT_COMPLETED without locks but at this point the transaction is finished and fs_info->running_trans is NULL (U but irrelevant). So by the looks of it we can have a concurrent READ race with a WRITE, due to reads not taking a lock. In this case what we want to ensure is we either see new or old state. I consulted with Will Deacon and he said that in such a case we'd want to annotate the accesses to ->state with (READ|WRITE)_ONCE so as to avoid a theoretical tear, in this case I don't think this could happen but I imagine at some point KCSAN would flag such an access as racy (which it is). [1] https://lore.kernel.org/linux-btrfs/e1fd5cc1-0f28-f670-69f4-e9958b4964e6@suse.com Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik [ add comments regarding memory barrier ] Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.h | 12 ++++++------ fs/btrfs/transaction.c | 32 +++++++++++++++----------------- 2 files changed, 21 insertions(+), 23 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 3ba140468f12..e22fba272e4f 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -135,6 +135,11 @@ struct btrfs_delayed_data_ref { u64 offset; }; +enum btrfs_delayed_ref_flags { + /* Indicate that we are flushing delayed refs for the commit */ + BTRFS_DELAYED_REFS_FLUSHING, +}; + struct btrfs_delayed_ref_root { /* head ref rbtree */ struct rb_root_cached href_root; @@ -158,12 +163,7 @@ struct btrfs_delayed_ref_root { u64 pending_csums; - /* - * set when the tree is flushing before a transaction commit, - * used by the throttling code to decide if new updates need - * to be run right away - */ - int flushing; + unsigned long flags; u64 run_delayed_start; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 3bcb5444536e..1485f7722f47 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -909,9 +909,8 @@ bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans) { struct btrfs_transaction *cur_trans = trans->transaction; - smp_mb(); if (cur_trans->state >= TRANS_STATE_COMMIT_START || - cur_trans->delayed_refs.flushing) + test_bit(BTRFS_DELAYED_REFS_FLUSHING, &cur_trans->delayed_refs.flags)) return true; return should_end_transaction(trans); @@ -2043,23 +2042,22 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) btrfs_trans_release_metadata(trans); trans->block_rsv = NULL; - /* make a pass through all the delayed refs we have so far - * any runnings procs may add more while we are here - */ - ret = btrfs_run_delayed_refs(trans, 0); - if (ret) { - btrfs_end_transaction(trans); - return ret; - } - - cur_trans = trans->transaction; - /* - * set the flushing flag so procs in this transaction have to - * start sending their work down. + * We only want one transaction commit doing the flushing so we do not + * waste a bunch of time on lock contention on the extent root node. */ - cur_trans->delayed_refs.flushing = 1; - smp_wmb(); + if (!test_and_set_bit(BTRFS_DELAYED_REFS_FLUSHING, + &cur_trans->delayed_refs.flags)) { + /* + * Make a pass through all the delayed refs we have so far. + * Any running threads may add more while we are here. + */ + ret = btrfs_run_delayed_refs(trans, 0); + if (ret) { + btrfs_end_transaction(trans); + return ret; + } + } btrfs_create_pending_block_groups(trans); -- cgit v1.2.3-70-g09d2 From 61a56a992fcfc694a54de77d896350b9d0588e86 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 18 Dec 2020 14:24:21 -0500 Subject: btrfs: delayed refs pre-flushing should only run the heads we have Previously our delayed ref running used the total number of items as the items to run. However we changed that to number of heads to run with the delayed_refs_rsv, as generally we want to run all of the operations for one bytenr. But with btrfs_run_delayed_refs(trans, 0) we set our count to 2x the number of items that we have. This is generally fine, but if we have some operation generation loads of delayed refs while we're doing this pre-flushing in the transaction commit, we'll just spin forever doing delayed refs. Fix this to simply pick the number of delayed refs we currently have, that way we do not end up doing a lot of extra work that's being generated in other threads. Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6f0c59debc2b..0943731f7edd 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2113,7 +2113,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, delayed_refs = &trans->transaction->delayed_refs; if (count == 0) - count = atomic_read(&delayed_refs->num_entries) * 2; + count = delayed_refs->num_heads_ready; again: #ifdef SCRAMBLE_DELAYED_REFS -- cgit v1.2.3-70-g09d2 From ad368f3394b796fd7faa46da8d326c98718f21d7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 18 Dec 2020 14:24:22 -0500 Subject: btrfs: only run delayed refs once before committing We try to pre-flush the delayed refs when committing, because we want to do as little work as possible in the critical section of the transaction commit. However doing this twice can lead to very long transaction commit delays as other threads are allowed to continue to generate more delayed refs, which potentially delays the commit by multiple minutes in very extreme cases. So simply stick to one pre-flush, and then continue the rest of the transaction commit. Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 1485f7722f47..7bb58c3ddcd1 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2061,12 +2061,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) btrfs_create_pending_block_groups(trans); - ret = btrfs_run_delayed_refs(trans, 0); - if (ret) { - btrfs_end_transaction(trans); - return ret; - } - if (!test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) { int run_it = 0; -- cgit v1.2.3-70-g09d2 From 2a4d84c11a872551a335cfe3ee8b60af67ded109 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 18 Dec 2020 14:24:23 -0500 Subject: btrfs: move delayed ref flushing for qgroup into qgroup helper The commit d67263354541 ("btrfs: qgroup: Make snapshot accounting work with new extent-oriented qgroup.") added a flush of the delayed refs during snapshot creation in order to get the qgroup accounting properly. However this code has changed and been moved to it's own helper that is skipped if qgroups are turned off. Move the flushing to the helper, as we do not need it when qgroups are turned off. Also add a comment explaining why it exists, and why it doesn't actually save us. This will be helpful later when we try to fix qgroup accounting properly. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 7bb58c3ddcd1..dbbd46417534 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1432,6 +1432,23 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, */ record_root_in_trans(trans, src, 1); + /* + * btrfs_qgroup_inherit relies on a consistent view of the usage for the + * src root, so we must run the delayed refs here. + * + * However this isn't particularly fool proof, because there's no + * synchronization keeping us from changing the tree after this point + * before we do the qgroup_inherit, or even from making changes while + * we're doing the qgroup_inherit. But that's a problem for the future, + * for now flush the delayed refs to narrow the race window where the + * qgroup counters could end up wrong. + */ + ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + /* * We are going to commit transaction, see btrfs_commit_transaction() * comment for reason locking tree_log_mutex @@ -1685,12 +1702,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; } - ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto fail; - } - /* * Do special qgroup accounting for snapshot, as we do some qgroup * snapshot hack to do fast snapshot. -- cgit v1.2.3-70-g09d2 From b7774425e0c08d8558be3a072b0c3e0b806b95f6 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 18 Dec 2020 14:24:24 -0500 Subject: btrfs: remove bogus BUG_ON in alloc_reserved_tree_block The fix 361048f586f5 ("Btrfs: fix full backref problem when inserting shared block reference") added a delayed ref flushing at subvolume creation time in order to avoid hitting this particular BUG_ON(). Before this fix, we were tripping the BUG_ON() by 1. Modify snapshot A, which creates blocks with a normal reference for snapshot A, as A is the owner of these blocks. We now have delayed refs for these blocks. 2. Create a snapshot of A named B, which pushes references for the children blocks of the root node for the new root B, thus creating more delayed refs for newly allocated blocks. 3. A is modified, and because the metadata blocks can now be shared, it must push FULL_BACKREF references to the children of any block that A COWs down it's path to its target key. 4. Delayed refs are run. Because these are newly allocated blocks, we have ->must_insert_reserved reserved set on the delayed ref head, we call into alloc_reserved_tree_block() to add the extent item, and then add our ref. At the time of this fix, we were ordering FULL_BACKREF delayed ref operations first, so we'd go to add this reference and then BUG_ON() because we didn't have the FULL_BACKREF flag set. The patch fixed this problem by making sure we ran the delayed refs before we had the chance to modify A. This meant that any *new* blocks would have had their extent items created _before_ we would ever actually COW down and generate FULL_BACKREF entries. Thus the problem went away. However this BUG_ON() is actually completely bogus. The existence of a full backref doesn't necessarily mean that FULL_BACKREF must be set on that block, it must only be set on the actual parent itself. Consider the example provided above. If we COW down one path from A, any nodes are going to have a FULL_BACKREF ref pushed down to _all_ of their children, but not all of the children are going to have FULL_BACKREF set. It is completely valid to have an extent item with normal and full backrefs without FULL_BACKREF actually set on the block itself. As a final note, I have been testing with the patch (applied after this one) btrfs: stop running all delayed refs during snapshot which removed this flushing. My test was a torture test which did a lot of operations while snapshotting and deleting snapshots as well as relocation, and I never tripped this BUG_ON(). This is actually because at the time of 361048f586f5, we ordered SHARED keys _before_ normal references, and thus they would get run first. However currently they are ordered _after_ normal references, so we'd do the initial creation without having a shared reference, and thus not hit this BUG_ON(), which explains why I didn't start hitting this problem during my testing with my other patch applied. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0943731f7edd..5476ab84e544 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4426,7 +4426,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, } if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) { - BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_SHARED_BLOCK_REF_KEY); btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent); -- cgit v1.2.3-70-g09d2 From dac348e9257051e7a39224747695b53e3fc737d7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 18 Dec 2020 14:24:25 -0500 Subject: btrfs: stop running all delayed refs during snapshot This was added in commit 361048f586f5 ("Btrfs: fix full backref problem when inserting shared block reference") to address a problem where we hit the following BUG_ON() in alloc_reserved_tree_block if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) { BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); However this BUG_ON() is bogus, and was removed by previous commit: btrfs: remove bogus BUG_ON in alloc_reserved_tree_block We no longer need to run delayed refs because of this, and can remove this flushing here. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index dbbd46417534..02592c6ce755 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1749,12 +1749,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } } - ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto fail; - } - fail: pending->error = ret; dir_item_existed: -- cgit v1.2.3-70-g09d2 From 488bc2a2d21e5faf14f9f695bb592ae9dd0e7465 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 18 Dec 2020 14:24:26 -0500 Subject: btrfs: run delayed refs less often in commit_cowonly_roots We love running delayed refs in commit_cowonly_roots, but it is a bit excessive. I was seeing cases of running 3 or 4 refs a few times in a row during this time. Instead simply: - update all of the roots first - then run delayed refs - then handle the empty block groups case - and then if we have any more dirty roots do the whole thing again This allows us to be much more efficient with our delayed ref running, as we can batch a few more operations at once. Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 02592c6ce755..b83e8ae38cfc 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1226,10 +1226,6 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans) btrfs_tree_unlock(eb); free_extent_buffer(eb); - if (ret) - return ret; - - ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); if (ret) return ret; @@ -1247,10 +1243,6 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans) if (ret) return ret; - /* run_qgroups might have added some more refs */ - ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); - if (ret) - return ret; again: while (!list_empty(&fs_info->dirty_cowonly_roots)) { struct btrfs_root *root; @@ -1265,15 +1257,24 @@ again: ret = update_cowonly_root(trans, root); if (ret) return ret; - ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); - if (ret) - return ret; } + /* Now flush any delayed refs generated by updating all of the roots */ + ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); + if (ret) + return ret; + while (!list_empty(dirty_bgs) || !list_empty(io_bgs)) { ret = btrfs_write_dirty_block_groups(trans); if (ret) return ret; + + /* + * We're writing the dirty block groups, which could generate + * delayed refs, which could generate more dirty block groups, + * so we want to keep this flushing in this loop to make sure + * everything gets run. + */ ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); if (ret) return ret; -- cgit v1.2.3-70-g09d2 From 8898038309876e5b8e535eac9d4b9fe4e3d6f5b3 Mon Sep 17 00:00:00 2001 From: Roman Anasal Date: Mon, 25 Jan 2021 20:43:25 +0100 Subject: btrfs: send: use struct send_ctx *sctx for btrfs_compare_trees and changed_cb btrfs_compare_trees and changed_cb use a void *ctx parameter instead of struct send_ctx *sctx but when used in changed_cb it is immediately cast to `struct send_ctx *sctx = ctx;`. changed_cb is only ever called from btrfs_compare_trees and full_send_tree: - full_send_tree already passes a struct send_ctx *sctx - btrfs_compare_trees is only called by send_subvol with a struct send_ctx *sctx - void *ctx in btrfs_compare_trees is only used to be passed to changed_cb So casting to/from void *ctx seems unnecessary and directly using struct send_ctx *sctx instead provides better type-safety. The original reason for using void *ctx in the first place seems to have been dropped with 1b51d6fce45e ("btrfs: send: remove indirect callback parameter for changed_cb"). Signed-off-by: Roman Anasal Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/send.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 3bcbf2bcb869..f87878274e9f 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -6591,10 +6591,9 @@ static int changed_cb(struct btrfs_path *left_path, struct btrfs_path *right_path, struct btrfs_key *key, enum btrfs_compare_tree_result result, - void *ctx) + struct send_ctx *sctx) { int ret = 0; - struct send_ctx *sctx = ctx; if (result == BTRFS_COMPARE_TREE_SAME) { if (key->type == BTRFS_INODE_REF_KEY || @@ -6799,7 +6798,7 @@ static int tree_compare_item(struct btrfs_path *left_path, * If it detects a change, it aborts immediately. */ static int btrfs_compare_trees(struct btrfs_root *left_root, - struct btrfs_root *right_root, void *ctx) + struct btrfs_root *right_root, struct send_ctx *sctx) { struct btrfs_fs_info *fs_info = left_root->fs_info; int ret; @@ -6951,7 +6950,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, ret = changed_cb(left_path, right_path, &right_key, BTRFS_COMPARE_TREE_DELETED, - ctx); + sctx); if (ret < 0) goto out; } @@ -6962,7 +6961,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, ret = changed_cb(left_path, right_path, &left_key, BTRFS_COMPARE_TREE_NEW, - ctx); + sctx); if (ret < 0) goto out; } @@ -6976,7 +6975,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, ret = changed_cb(left_path, right_path, &left_key, BTRFS_COMPARE_TREE_NEW, - ctx); + sctx); if (ret < 0) goto out; advance_left = ADVANCE; @@ -6984,7 +6983,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, ret = changed_cb(left_path, right_path, &right_key, BTRFS_COMPARE_TREE_DELETED, - ctx); + sctx); if (ret < 0) goto out; advance_right = ADVANCE; @@ -6999,7 +6998,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, else result = BTRFS_COMPARE_TREE_SAME; ret = changed_cb(left_path, right_path, - &left_key, result, ctx); + &left_key, result, sctx); if (ret < 0) goto out; advance_left = ADVANCE; -- cgit v1.2.3-70-g09d2 From 91e79a83fff663283341c8c29293faec8255099a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 9 Oct 2020 09:28:18 -0400 Subject: btrfs: make flush_space take a enum btrfs_flush_state instead of int I got a automated message from somebody who runs clang against our kernels and it's because I used the wrong enum type for what I passed into flush_space, caught by -Wenum-conversion. Change the argument to be explicitly the enum we're expecting to make everything consistent. Maybe eventually gcc will catch errors like this. Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index fd8e79e3c10e..4eab581b1b9c 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -670,7 +670,7 @@ enospc: */ static void flush_space(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 num_bytes, - int state) + enum btrfs_flush_state state) { struct btrfs_root *root = fs_info->extent_root; struct btrfs_trans_handle *trans; @@ -923,7 +923,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) struct btrfs_fs_info *fs_info; struct btrfs_space_info *space_info; u64 to_reclaim; - int flush_state; + enum btrfs_flush_state flush_state; int commit_cycles = 0; u64 last_tickets_id; @@ -1055,7 +1055,7 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work) struct btrfs_fs_info *fs_info; struct btrfs_space_info *space_info; u64 last_tickets_id; - int flush_state = 0; + enum btrfs_flush_state flush_state = 0; fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work); space_info = fs_info->data_sinfo; -- cgit v1.2.3-70-g09d2 From ac1ea10e757a57fb61512ae9beb2ef67e5340e31 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 9 Oct 2020 09:28:19 -0400 Subject: btrfs: add a trace point for reserve tickets While debugging a ENOSPC related performance problem I needed to see the time difference between start and end of a reserve ticket, so add a trace point to report when we handle a reserve ticket. I opted to spit out start_ns itself without calculating the difference because there could be a gap between enabling the tracepoint and setting start_ns. Doing it this way allows us to filter on 0 start_ns so we don't get bogus entries, and we can easily calculate the time difference with bpftrace or something else. Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 12 +++++++++++- include/trace/events/btrfs.h | 29 +++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 4eab581b1b9c..d879e3fea0b6 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1220,6 +1220,8 @@ static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, * @fs_info: the filesystem * @space_info: space info for the reservation * @ticket: ticket for the reservation + * @start_ns: timestamp when the reservation started + * @orig_bytes: amount of bytes originally reserved * @flush: how much we can flush * * This does the work of figuring out how to flush for the ticket, waiting for @@ -1228,6 +1230,7 @@ static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, struct reserve_ticket *ticket, + u64 start_ns, u64 orig_bytes, enum btrfs_reserve_flush_enum flush) { int ret; @@ -1283,6 +1286,8 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, * space wasn't reserved at all). */ ASSERT(!(ticket->bytes == 0 && ticket->error)); + trace_btrfs_reserve_ticket(fs_info, space_info->flags, orig_bytes, + start_ns, flush, ticket->error); return ret; } @@ -1317,6 +1322,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, { struct work_struct *async_work; struct reserve_ticket ticket; + u64 start_ns = 0; u64 used; int ret = 0; bool pending_tickets; @@ -1369,6 +1375,9 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, space_info->reclaim_size += ticket.bytes; init_waitqueue_head(&ticket.wait); ticket.steal = (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL); + if (trace_btrfs_reserve_ticket_enabled()) + start_ns = ktime_get_ns(); + if (flush == BTRFS_RESERVE_FLUSH_ALL || flush == BTRFS_RESERVE_FLUSH_ALL_STEAL || flush == BTRFS_RESERVE_FLUSH_DATA) { @@ -1405,7 +1414,8 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) return ret; - return handle_reserve_ticket(fs_info, space_info, &ticket, flush); + return handle_reserve_ticket(fs_info, space_info, &ticket, start_ns, + orig_bytes, flush); } /** diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index b9896fc06160..b0ea2a108be3 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -2026,6 +2026,35 @@ TRACE_EVENT(btrfs_convert_extent_bit, __print_flags(__entry->clear_bits, "|", EXTENT_FLAGS)) ); +TRACE_EVENT(btrfs_reserve_ticket, + TP_PROTO(const struct btrfs_fs_info *fs_info, u64 flags, u64 bytes, + u64 start_ns, int flush, int error), + + TP_ARGS(fs_info, flags, bytes, start_ns, flush, error), + + TP_STRUCT__entry_btrfs( + __field( u64, flags ) + __field( u64, bytes ) + __field( u64, start_ns ) + __field( int, flush ) + __field( int, error ) + ), + + TP_fast_assign_btrfs(fs_info, + __entry->flags = flags; + __entry->bytes = bytes; + __entry->start_ns = start_ns; + __entry->flush = flush; + __entry->error = error; + ), + + TP_printk_btrfs("flags=%s bytes=%llu start_ns=%llu flush=%s error=%d", + __print_flags(__entry->flags, "|", BTRFS_GROUP_FLAGS), + __entry->bytes, __entry->start_ns, + __print_symbolic(__entry->flush, FLUSH_ACTIONS), + __entry->error) +); + DECLARE_EVENT_CLASS(btrfs_sleep_tree_lock, TP_PROTO(const struct extent_buffer *eb, u64 start_ns), -- cgit v1.2.3-70-g09d2 From 5deb17e18e27a3502f21581ba4d086e762b86b31 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 9 Oct 2020 09:28:20 -0400 Subject: btrfs: track ordered bytes instead of just dio ordered bytes We track dio_bytes because the shrink delalloc code needs to know if we have more DIO in flight than we have normal buffered IO. The reason for this is because we can't "flush" DIO, we have to just wait on the ordered extents to finish. However this is true of all ordered extents. If we have more ordered space outstanding than dirty pages we should be waiting on ordered extents. We already are ok on this front technically, because we always do a FLUSH_DELALLOC_WAIT loop, but I want to use the ordered counter in the preemptive flushing code as well, so change this to count all ordered bytes instead of just DIO ordered bytes. Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/disk-io.c | 8 ++++---- fs/btrfs/ordered-data.c | 13 ++++++------- fs/btrfs/space-info.c | 18 +++++++----------- 4 files changed, 18 insertions(+), 23 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ed6bb46a2572..7d8660227520 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -797,7 +797,7 @@ struct btrfs_fs_info { /* used to keep from writing metadata until there is a nice batch */ struct percpu_counter dirty_metadata_bytes; struct percpu_counter delalloc_bytes; - struct percpu_counter dio_bytes; + struct percpu_counter ordered_bytes; s32 dirty_metadata_batch; s32 delalloc_batch; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 5473bed6a7e8..e0d56b3d1223 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1469,7 +1469,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) { percpu_counter_destroy(&fs_info->dirty_metadata_bytes); percpu_counter_destroy(&fs_info->delalloc_bytes); - percpu_counter_destroy(&fs_info->dio_bytes); + percpu_counter_destroy(&fs_info->ordered_bytes); percpu_counter_destroy(&fs_info->dev_replace.bio_counter); btrfs_free_csum_hash(fs_info); btrfs_free_stripe_hash_table(fs_info); @@ -2802,7 +2802,7 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE; sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE); - ret = percpu_counter_init(&fs_info->dio_bytes, 0, GFP_KERNEL); + ret = percpu_counter_init(&fs_info->ordered_bytes, 0, GFP_KERNEL); if (ret) return ret; @@ -4163,9 +4163,9 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) percpu_counter_sum(&fs_info->delalloc_bytes)); } - if (percpu_counter_sum(&fs_info->dio_bytes)) + if (percpu_counter_sum(&fs_info->ordered_bytes)) btrfs_info(fs_info, "at unmount dio bytes count %lld", - percpu_counter_sum(&fs_info->dio_bytes)); + percpu_counter_sum(&fs_info->ordered_bytes)); btrfs_sysfs_remove_mounted(fs_info); btrfs_sysfs_remove_fsid(fs_info->fs_devices); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index b4e6500548a2..e8dee1578d4a 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -206,11 +206,11 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset type == BTRFS_ORDERED_COMPRESSED); set_bit(type, &entry->flags); - if (dio) { - percpu_counter_add_batch(&fs_info->dio_bytes, num_bytes, - fs_info->delalloc_batch); + percpu_counter_add_batch(&fs_info->ordered_bytes, num_bytes, + fs_info->delalloc_batch); + + if (dio) set_bit(BTRFS_ORDERED_DIRECT, &entry->flags); - } /* one ref for the tree */ refcount_set(&entry->refs, 1); @@ -503,9 +503,8 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, btrfs_delalloc_release_metadata(btrfs_inode, entry->num_bytes, false); - if (test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) - percpu_counter_add_batch(&fs_info->dio_bytes, -entry->num_bytes, - fs_info->delalloc_batch); + percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes, + fs_info->delalloc_batch); tree = &btrfs_inode->ordered_tree; spin_lock_irq(&tree->lock); diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index d879e3fea0b6..711beacd75d6 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -489,7 +489,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, { struct btrfs_trans_handle *trans; u64 delalloc_bytes; - u64 dio_bytes; + u64 ordered_bytes; u64 items; long time_left; int loops; @@ -513,25 +513,20 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, delalloc_bytes = percpu_counter_sum_positive( &fs_info->delalloc_bytes); - dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); - if (delalloc_bytes == 0 && dio_bytes == 0) { - if (trans) - return; - if (wait_ordered) - btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); + ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes); + if (delalloc_bytes == 0 && ordered_bytes == 0) return; - } /* * If we are doing more ordered than delalloc we need to just wait on * ordered extents, otherwise we'll waste time trying to flush delalloc * that likely won't give us the space back we need. */ - if (dio_bytes > delalloc_bytes) + if (ordered_bytes > delalloc_bytes) wait_ordered = true; loops = 0; - while ((delalloc_bytes || dio_bytes) && loops < 3) { + while ((delalloc_bytes || ordered_bytes) && loops < 3) { u64 temp = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; long nr_pages = min_t(u64, temp, LONG_MAX); @@ -556,7 +551,8 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, delalloc_bytes = percpu_counter_sum_positive( &fs_info->delalloc_bytes); - dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); + ordered_bytes = percpu_counter_sum_positive( + &fs_info->ordered_bytes); } } -- cgit v1.2.3-70-g09d2 From f00c42dd4cc8b856e68638e6a88b51f88b8e849e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 9 Oct 2020 09:28:21 -0400 Subject: btrfs: introduce a FORCE_COMMIT_TRANS flush operation Solely for preemptive flushing, we want to be able to force the transaction commit without any of the ambiguity of may_commit_transaction(). This is because may_commit_transaction() checks tickets and such, and in preemptive flushing we already know it'll be helpful, so use this to keep the code nice and clean and straightforward. Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik [ add comment ] Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 + fs/btrfs/space-info.c | 14 ++++++++++++++ include/trace/events/btrfs.h | 3 ++- 3 files changed, 17 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 7d8660227520..90726954b883 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2740,6 +2740,7 @@ enum btrfs_flush_state { ALLOC_CHUNK_FORCE = 8, RUN_DELAYED_IPUTS = 9, COMMIT_TRANS = 10, + FORCE_COMMIT_TRANS = 11, }; int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 711beacd75d6..e677b5451f82 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -140,6 +140,12 @@ * be freed, plus any delayed work we may not have gotten rid of in the case * of metadata. * + * FORCE_COMMIT_TRANS + * For use by the preemptive flusher. We use this to bypass the ticketing + * checks in may_commit_transaction, as we have more information about the + * overall state of the system and may want to commit the transaction ahead + * of actual ENOSPC conditions. + * * OVERCOMMIT * * Because we hold so many reservations for metadata we will allow you to @@ -735,6 +741,14 @@ static void flush_space(struct btrfs_fs_info *fs_info, case COMMIT_TRANS: ret = may_commit_transaction(fs_info, space_info); break; + case FORCE_COMMIT_TRANS: + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + ret = btrfs_commit_transaction(trans); + break; default: ret = -ENOSPC; break; diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index b0ea2a108be3..8a7c163907a2 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -99,7 +99,8 @@ struct btrfs_space_info; EM( ALLOC_CHUNK, "ALLOC_CHUNK") \ EM( ALLOC_CHUNK_FORCE, "ALLOC_CHUNK_FORCE") \ EM( RUN_DELAYED_IPUTS, "RUN_DELAYED_IPUTS") \ - EMe(COMMIT_TRANS, "COMMIT_TRANS") + EM( COMMIT_TRANS, "COMMIT_TRANS") \ + EMe(FORCE_COMMIT_TRANS, "FORCE_COMMIT_TRANS") /* * First define the enums in the above macros to be exported to userspace via -- cgit v1.2.3-70-g09d2 From 576fa34830afac6a40cd19c777f1ab49c914e87c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 9 Oct 2020 09:28:22 -0400 Subject: btrfs: improve preemptive background space flushing Currently if we ever have to flush space because we do not have enough we allocate a ticket and attach it to the space_info, and then systematically flush things in the filesystem that hold space reservations until our space is reclaimed. However this has a latency cost, we must go to sleep and wait for the flushing to make progress before we are woken up and allowed to continue doing our work. In order to address that we used to kick off the async worker to flush space preemptively, so that we could be reclaiming space hopefully before any tasks needed to stop and wait for space to reclaim. When I introduced the ticketed ENOSPC stuff this broke slightly in the fact that we were using tickets to indicate if we were done flushing. No tickets, no more flushing. However this meant that we essentially never preemptively flushed. This caused a write performance regression that Nikolay noticed in an unrelated patch that removed the committing of the transaction during btrfs_end_transaction. The behavior that happened pre that patch was btrfs_end_transaction() would see that we were low on space, and it would commit the transaction. This was bad because in this particular case you could end up with thousands and thousands of transactions being committed during the 5 minute reproducer. With the patch to remove this behavior we got much more sane transaction commits, but we ended up slower because we would write for a while, flush, write for a while, flush again. To address this we need to reinstate a preemptive flushing mechanism. However it is distinctly different from our ticketing flushing in that it doesn't have tickets to base it's decisions on. Instead of bolting this logic into our existing flushing work, add another worker to handle this preemptive flushing. Here we will attempt to be slightly intelligent about the things that we flushing, attempting to balance between whichever pool is taking up the most space. Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 1 + fs/btrfs/space-info.c | 100 +++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 100 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 90726954b883..a9b0521d9e89 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -933,6 +933,7 @@ struct btrfs_fs_info { /* Used to reclaim the metadata space in the background. */ struct work_struct async_reclaim_work; struct work_struct async_data_reclaim_work; + struct work_struct preempt_reclaim_work; spinlock_t unused_bgs_lock; struct list_head unused_bgs; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e0d56b3d1223..e0d1b328397e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4111,6 +4111,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) cancel_work_sync(&fs_info->async_reclaim_work); cancel_work_sync(&fs_info->async_data_reclaim_work); + cancel_work_sync(&fs_info->preempt_reclaim_work); /* Cancel or finish ongoing discard work */ btrfs_discard_cleanup(fs_info); diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index e677b5451f82..8a27c193f8a8 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1000,6 +1000,100 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) } while (flush_state <= COMMIT_TRANS); } +/* + * This handles pre-flushing of metadata space before we get to the point that + * we need to start blocking threads on tickets. The logic here is different + * from the other flush paths because it doesn't rely on tickets to tell us how + * much we need to flush, instead it attempts to keep us below the 80% full + * watermark of space by flushing whichever reservation pool is currently the + * largest. + */ +static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_space_info *space_info; + struct btrfs_block_rsv *delayed_block_rsv; + struct btrfs_block_rsv *delayed_refs_rsv; + struct btrfs_block_rsv *global_rsv; + struct btrfs_block_rsv *trans_rsv; + u64 used; + + fs_info = container_of(work, struct btrfs_fs_info, + preempt_reclaim_work); + space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + delayed_block_rsv = &fs_info->delayed_block_rsv; + delayed_refs_rsv = &fs_info->delayed_refs_rsv; + global_rsv = &fs_info->global_block_rsv; + trans_rsv = &fs_info->trans_block_rsv; + + spin_lock(&space_info->lock); + used = btrfs_space_info_used(space_info, true); + while (need_do_async_reclaim(fs_info, space_info, used)) { + enum btrfs_flush_state flush; + u64 delalloc_size = 0; + u64 to_reclaim, block_rsv_size; + u64 global_rsv_size = global_rsv->reserved; + + /* + * We don't have a precise counter for the metadata being + * reserved for delalloc, so we'll approximate it by subtracting + * out the block rsv's space from the bytes_may_use. If that + * amount is higher than the individual reserves, then we can + * assume it's tied up in delalloc reservations. + */ + block_rsv_size = global_rsv_size + + delayed_block_rsv->reserved + + delayed_refs_rsv->reserved + + trans_rsv->reserved; + if (block_rsv_size < space_info->bytes_may_use) + delalloc_size = space_info->bytes_may_use - block_rsv_size; + spin_unlock(&space_info->lock); + + /* + * We don't want to include the global_rsv in our calculation, + * because that's space we can't touch. Subtract it from the + * block_rsv_size for the next checks. + */ + block_rsv_size -= global_rsv_size; + + /* + * We really want to avoid flushing delalloc too much, as it + * could result in poor allocation patterns, so only flush it if + * it's larger than the rest of the pools combined. + */ + if (delalloc_size > block_rsv_size) { + to_reclaim = delalloc_size; + flush = FLUSH_DELALLOC; + } else if (space_info->bytes_pinned > + (delayed_block_rsv->reserved + + delayed_refs_rsv->reserved)) { + to_reclaim = space_info->bytes_pinned; + flush = FORCE_COMMIT_TRANS; + } else if (delayed_block_rsv->reserved > + delayed_refs_rsv->reserved) { + to_reclaim = delayed_block_rsv->reserved; + flush = FLUSH_DELAYED_ITEMS_NR; + } else { + to_reclaim = delayed_refs_rsv->reserved; + flush = FLUSH_DELAYED_REFS_NR; + } + + /* + * We don't want to reclaim everything, just a portion, so scale + * down the to_reclaim by 1/4. If it takes us down to 0, + * reclaim 1 items worth. + */ + to_reclaim >>= 2; + if (!to_reclaim) + to_reclaim = btrfs_calc_insert_metadata_size(fs_info, 1); + flush_space(fs_info, space_info, to_reclaim, flush); + cond_resched(); + spin_lock(&space_info->lock); + used = btrfs_space_info_used(space_info, true); + } + spin_unlock(&space_info->lock); +} + /* * FLUSH_DELALLOC_WAIT: * Space is freed from flushing delalloc in one of two ways. @@ -1126,6 +1220,8 @@ void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info) { INIT_WORK(&fs_info->async_reclaim_work, btrfs_async_reclaim_metadata_space); INIT_WORK(&fs_info->async_data_reclaim_work, btrfs_async_reclaim_data_space); + INIT_WORK(&fs_info->preempt_reclaim_work, + btrfs_preempt_reclaim_metadata_space); } static const enum btrfs_flush_state priority_flush_states[] = { @@ -1413,11 +1509,11 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, */ if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && need_do_async_reclaim(fs_info, space_info, used) && - !work_busy(&fs_info->async_reclaim_work)) { + !work_busy(&fs_info->preempt_reclaim_work)) { trace_btrfs_trigger_flush(fs_info, space_info->flags, orig_bytes, flush, "preempt"); queue_work(system_unbound_wq, - &fs_info->async_reclaim_work); + &fs_info->preempt_reclaim_work); } } spin_unlock(&space_info->lock); -- cgit v1.2.3-70-g09d2 From ae7913ba52ec4a2883eb073c6bc99f1a8d9d636b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 9 Oct 2020 09:28:23 -0400 Subject: btrfs: rename need_do_async_reclaim All of our normal flushing is asynchronous reclaim, so this helper is poorly named. This is more checking if we need to preemptively flush space, so rename it to need_preemptive_reclaim. Also switch it to bool and make it plain static as followup patches will move more code here. Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 8a27c193f8a8..effb9b73a418 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -808,18 +808,18 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, return to_reclaim; } -static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 used) +static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 used) { u64 thresh = div_factor_fine(space_info->total_bytes, 98); /* If we're just plain full then async reclaim just slows us down. */ if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) - return 0; + return false; if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info)) - return 0; + return false; return (used >= thresh && !btrfs_fs_closing(fs_info) && !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); @@ -1028,7 +1028,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) spin_lock(&space_info->lock); used = btrfs_space_info_used(space_info, true); - while (need_do_async_reclaim(fs_info, space_info, used)) { + while (need_preemptive_reclaim(fs_info, space_info, used)) { enum btrfs_flush_state flush; u64 delalloc_size = 0; u64 to_reclaim, block_rsv_size; @@ -1508,7 +1508,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, * the async reclaim as we will panic. */ if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && - need_do_async_reclaim(fs_info, space_info, used) && + need_preemptive_reclaim(fs_info, space_info, used) && !work_busy(&fs_info->preempt_reclaim_work)) { trace_btrfs_trigger_flush(fs_info, space_info->flags, orig_bytes, flush, "preempt"); -- cgit v1.2.3-70-g09d2 From f205edf77315a33eee82a7615fb57e9297957fe9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 9 Oct 2020 09:28:24 -0400 Subject: btrfs: check reclaim_size in need_preemptive_reclaim If we're flushing space for tickets then we have space_info->reclaim_size set and we do not need to do background reclaim. Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index effb9b73a418..9f30d6837eb5 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -818,6 +818,13 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) return false; + /* + * We have tickets queued, bail so we don't compete with the async + * flushers. + */ + if (space_info->reclaim_size) + return false; + if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info)) return false; -- cgit v1.2.3-70-g09d2 From 9f42d37748264d65ca611b60c22b9c003030b0b3 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 9 Oct 2020 09:28:25 -0400 Subject: btrfs: rework btrfs_calc_reclaim_metadata_size Currently btrfs_calc_reclaim_metadata_size does two things, it returns the space currently required for flushing by the tickets, and if there are no tickets it calculates a value for the preemptive flushing. However for the normal ticketed flushing we really only care about the space required for tickets. We will accidentally come in and flush one time, but as soon as we see there are no tickets we bail out of our flushing. Fix this by making btrfs_calc_reclaim_metadata_size really only tell us what is required for flushing if we have people waiting on space. Then move the preemptive flushing logic into need_preemptive_reclaim(). We ignore btrfs_calc_reclaim_metadata_size() in need_preemptive_reclaim() because if we are in this path then we made our reservation and there are not pending tickets currently, so we do not need to check it, simply do the fuzzy logic to check if we're getting low on space. Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 44 ++++++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 24 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 9f30d6837eb5..636a42620b11 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -765,7 +765,6 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, { u64 used; u64 avail; - u64 expected; u64 to_reclaim = space_info->reclaim_size; lockdep_assert_held(&space_info->lock); @@ -783,28 +782,6 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, if (space_info->total_bytes + avail < used) to_reclaim += used - (space_info->total_bytes + avail); - if (to_reclaim) - return to_reclaim; - - to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); - if (btrfs_can_overcommit(fs_info, space_info, to_reclaim, - BTRFS_RESERVE_FLUSH_ALL)) - return 0; - - used = btrfs_space_info_used(space_info, true); - - if (btrfs_can_overcommit(fs_info, space_info, SZ_1M, - BTRFS_RESERVE_FLUSH_ALL)) - expected = div_factor_fine(space_info->total_bytes, 95); - else - expected = div_factor_fine(space_info->total_bytes, 90); - - if (used > expected) - to_reclaim = used - expected; - else - to_reclaim = 0; - to_reclaim = min(to_reclaim, space_info->bytes_may_use + - space_info->bytes_reserved); return to_reclaim; } @@ -813,6 +790,7 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, u64 used) { u64 thresh = div_factor_fine(space_info->total_bytes, 98); + u64 to_reclaim, expected; /* If we're just plain full then async reclaim just slows us down. */ if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) @@ -825,7 +803,25 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, if (space_info->reclaim_size) return false; - if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info)) + to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); + if (btrfs_can_overcommit(fs_info, space_info, to_reclaim, + BTRFS_RESERVE_FLUSH_ALL)) + return false; + + used = btrfs_space_info_used(space_info, true); + if (btrfs_can_overcommit(fs_info, space_info, SZ_1M, + BTRFS_RESERVE_FLUSH_ALL)) + expected = div_factor_fine(space_info->total_bytes, 95); + else + expected = div_factor_fine(space_info->total_bytes, 90); + + if (used > expected) + to_reclaim = used - expected; + else + to_reclaim = 0; + to_reclaim = min(to_reclaim, space_info->bytes_may_use + + space_info->bytes_reserved); + if (!to_reclaim) return false; return (used >= thresh && !btrfs_fs_closing(fs_info) && -- cgit v1.2.3-70-g09d2 From 2e294c60497f29ab8791f4b99f348b22d70dd3c3 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 9 Oct 2020 09:28:26 -0400 Subject: btrfs: simplify the logic in need_preemptive_flushing A lot of this was added all in one go with no explanation, and is a bit unwieldy and confusing. Simplify the logic to start preemptive flushing if we've reserved more than half of our available free space. Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 73 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 48 insertions(+), 25 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 636a42620b11..9befd22a2316 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -786,11 +786,11 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, } static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 used) + struct btrfs_space_info *space_info) { + u64 ordered, delalloc; u64 thresh = div_factor_fine(space_info->total_bytes, 98); - u64 to_reclaim, expected; + u64 used; /* If we're just plain full then async reclaim just slows us down. */ if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) @@ -803,26 +803,52 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, if (space_info->reclaim_size) return false; - to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); - if (btrfs_can_overcommit(fs_info, space_info, to_reclaim, - BTRFS_RESERVE_FLUSH_ALL)) - return false; + /* + * If we have over half of the free space occupied by reservations or + * pinned then we want to start flushing. + * + * We do not do the traditional thing here, which is to say + * + * if (used >= ((total_bytes + avail) / 2)) + * return 1; + * + * because this doesn't quite work how we want. If we had more than 50% + * of the space_info used by bytes_used and we had 0 available we'd just + * constantly run the background flusher. Instead we want it to kick in + * if our reclaimable space exceeds 50% of our available free space. + */ + thresh = calc_available_free_space(fs_info, space_info, + BTRFS_RESERVE_FLUSH_ALL); + thresh += (space_info->total_bytes - space_info->bytes_used - + space_info->bytes_reserved - space_info->bytes_readonly); + thresh >>= 1; - used = btrfs_space_info_used(space_info, true); - if (btrfs_can_overcommit(fs_info, space_info, SZ_1M, - BTRFS_RESERVE_FLUSH_ALL)) - expected = div_factor_fine(space_info->total_bytes, 95); - else - expected = div_factor_fine(space_info->total_bytes, 90); + used = space_info->bytes_pinned; - if (used > expected) - to_reclaim = used - expected; + /* + * If we have more ordered bytes than delalloc bytes then we're either + * doing a lot of DIO, or we simply don't have a lot of delalloc waiting + * around. Preemptive flushing is only useful in that it can free up + * space before tickets need to wait for things to finish. In the case + * of ordered extents, preemptively waiting on ordered extents gets us + * nothing, if our reservations are tied up in ordered extents we'll + * simply have to slow down writers by forcing them to wait on ordered + * extents. + * + * In the case that ordered is larger than delalloc, only include the + * block reserves that we would actually be able to directly reclaim + * from. In this case if we're heavy on metadata operations this will + * clearly be heavy enough to warrant preemptive flushing. In the case + * of heavy DIO or ordered reservations, preemptive flushing will just + * waste time and cause us to slow down. + */ + ordered = percpu_counter_sum_positive(&fs_info->ordered_bytes); + delalloc = percpu_counter_sum_positive(&fs_info->delalloc_bytes); + if (ordered >= delalloc) + used += fs_info->delayed_refs_rsv.reserved + + fs_info->delayed_block_rsv.reserved; else - to_reclaim = 0; - to_reclaim = min(to_reclaim, space_info->bytes_may_use + - space_info->bytes_reserved); - if (!to_reclaim) - return false; + used += space_info->bytes_may_use; return (used >= thresh && !btrfs_fs_closing(fs_info) && !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); @@ -1019,7 +1045,6 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) struct btrfs_block_rsv *delayed_refs_rsv; struct btrfs_block_rsv *global_rsv; struct btrfs_block_rsv *trans_rsv; - u64 used; fs_info = container_of(work, struct btrfs_fs_info, preempt_reclaim_work); @@ -1030,8 +1055,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) trans_rsv = &fs_info->trans_block_rsv; spin_lock(&space_info->lock); - used = btrfs_space_info_used(space_info, true); - while (need_preemptive_reclaim(fs_info, space_info, used)) { + while (need_preemptive_reclaim(fs_info, space_info)) { enum btrfs_flush_state flush; u64 delalloc_size = 0; u64 to_reclaim, block_rsv_size; @@ -1092,7 +1116,6 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) flush_space(fs_info, space_info, to_reclaim, flush); cond_resched(); spin_lock(&space_info->lock); - used = btrfs_space_info_used(space_info, true); } spin_unlock(&space_info->lock); } @@ -1511,7 +1534,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, * the async reclaim as we will panic. */ if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && - need_preemptive_reclaim(fs_info, space_info, used) && + need_preemptive_reclaim(fs_info, space_info) && !work_busy(&fs_info->preempt_reclaim_work)) { trace_btrfs_trigger_flush(fs_info, space_info->flags, orig_bytes, flush, "preempt"); -- cgit v1.2.3-70-g09d2 From 88a777a6e5272106bdc96b1032d89b0ddc0e526f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 9 Oct 2020 09:28:27 -0400 Subject: btrfs: implement space clamping for preemptive flushing Starting preemptive flushing at 50% of available free space is a good start, but some workloads are particularly abusive and can quickly overwhelm the preemptive flushing code and drive us into using tickets. Handle this by clamping down on our threshold for starting and continuing to run preemptive flushing. This is particularly important for our overcommit case, as we can really drive the file system into overages and then it's more difficult to pull it back as we start to actually fill up the file system. The clamping is essentially 2^CLAMP, but we start at 1 so whatever we calculate for overcommit is the baseline. Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++-- fs/btrfs/space-info.h | 4 ++++ 2 files changed, 55 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 9befd22a2316..e8acf087dcee 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -212,6 +212,7 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags) INIT_LIST_HEAD(&space_info->ro_bgs); INIT_LIST_HEAD(&space_info->tickets); INIT_LIST_HEAD(&space_info->priority_tickets); + space_info->clamp = 1; ret = btrfs_sysfs_add_space_info_type(info, space_info); if (ret) @@ -815,13 +816,28 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, * because this doesn't quite work how we want. If we had more than 50% * of the space_info used by bytes_used and we had 0 available we'd just * constantly run the background flusher. Instead we want it to kick in - * if our reclaimable space exceeds 50% of our available free space. + * if our reclaimable space exceeds our clamped free space. + * + * Our clamping range is 2^1 -> 2^8. Practically speaking that means + * the following: + * + * Amount of RAM Minimum threshold Maximum threshold + * + * 256GiB 1GiB 128GiB + * 128GiB 512MiB 64GiB + * 64GiB 256MiB 32GiB + * 32GiB 128MiB 16GiB + * 16GiB 64MiB 8GiB + * + * These are the range our thresholds will fall in, corresponding to how + * much delalloc we need for the background flusher to kick in. */ + thresh = calc_available_free_space(fs_info, space_info, BTRFS_RESERVE_FLUSH_ALL); thresh += (space_info->total_bytes - space_info->bytes_used - space_info->bytes_reserved - space_info->bytes_readonly); - thresh >>= 1; + thresh >>= space_info->clamp; used = space_info->bytes_pinned; @@ -1045,6 +1061,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) struct btrfs_block_rsv *delayed_refs_rsv; struct btrfs_block_rsv *global_rsv; struct btrfs_block_rsv *trans_rsv; + int loops = 0; fs_info = container_of(work, struct btrfs_fs_info, preempt_reclaim_work); @@ -1061,6 +1078,8 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) u64 to_reclaim, block_rsv_size; u64 global_rsv_size = global_rsv->reserved; + loops++; + /* * We don't have a precise counter for the metadata being * reserved for delalloc, so we'll approximate it by subtracting @@ -1117,6 +1136,10 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) cond_resched(); spin_lock(&space_info->lock); } + + /* We only went through once, back off our clamping. */ + if (loops == 1 && !space_info->reclaim_size) + space_info->clamp = max(1, space_info->clamp - 1); spin_unlock(&space_info->lock); } @@ -1433,6 +1456,24 @@ static inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush) (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL); } +static inline void maybe_clamp_preempt(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info) +{ + u64 ordered = percpu_counter_sum_positive(&fs_info->ordered_bytes); + u64 delalloc = percpu_counter_sum_positive(&fs_info->delalloc_bytes); + + /* + * If we're heavy on ordered operations then clamping won't help us. We + * need to clamp specifically to keep up with dirty'ing buffered + * writers, because there's not a 1:1 correlation of writing delalloc + * and freeing space, like there is with flushing delayed refs or + * delayed nodes. If we're already more ordered than delalloc then + * we're keeping up, otherwise we aren't and should probably clamp. + */ + if (ordered < delalloc) + space_info->clamp = min(space_info->clamp + 1, 8); +} + /** * Try to reserve bytes from the block_rsv's space * @@ -1526,6 +1567,14 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, list_add_tail(&ticket.list, &space_info->priority_tickets); } + + /* + * We were forced to add a reserve ticket, so our preemptive + * flushing is unable to keep up. Clamp down on the threshold + * for the preemptive flushing in order to keep up with the + * workload. + */ + maybe_clamp_preempt(fs_info, space_info); } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { used += orig_bytes; /* diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 74706f604bce..e237156ce888 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -22,6 +22,10 @@ struct btrfs_space_info { the space info if we had an ENOSPC in the allocator. */ + int clamp; /* Used to scale our threshold for preemptive + flushing. The value is >> clamp, so turns + out to be a 2^clamp divisor. */ + unsigned int full:1; /* indicates that we cannot allocate any more chunks for this space */ unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ -- cgit v1.2.3-70-g09d2 From 4b02b00fe5f1377f3dbfb168dfcfebf3d7a9632f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 9 Oct 2020 09:28:28 -0400 Subject: btrfs: adjust the flush trace point to include the source Since we have normal ticketed flushing and preemptive flushing, adjust the tracepoint so that we know the source of the flushing action to make it easier to debug problems. Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 17 +++++++++-------- include/trace/events/btrfs.h | 10 ++++++---- 2 files changed, 15 insertions(+), 12 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index e8acf087dcee..bb4f85c07738 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -673,7 +673,7 @@ enospc: */ static void flush_space(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 num_bytes, - enum btrfs_flush_state state) + enum btrfs_flush_state state, bool for_preempt) { struct btrfs_root *root = fs_info->extent_root; struct btrfs_trans_handle *trans; @@ -756,7 +756,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, } trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state, - ret); + ret, for_preempt); return; } @@ -997,7 +997,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) flush_state = FLUSH_DELAYED_ITEMS_NR; do { - flush_space(fs_info, space_info, to_reclaim, flush_state); + flush_space(fs_info, space_info, to_reclaim, flush_state, false); spin_lock(&space_info->lock); if (list_empty(&space_info->tickets)) { space_info->flush = 0; @@ -1132,7 +1132,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) to_reclaim >>= 2; if (!to_reclaim) to_reclaim = btrfs_calc_insert_metadata_size(fs_info, 1); - flush_space(fs_info, space_info, to_reclaim, flush); + flush_space(fs_info, space_info, to_reclaim, flush, true); cond_resched(); spin_lock(&space_info->lock); } @@ -1223,7 +1223,7 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work) spin_unlock(&space_info->lock); while (!space_info->full) { - flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE); + flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false); spin_lock(&space_info->lock); if (list_empty(&space_info->tickets)) { space_info->flush = 0; @@ -1236,7 +1236,7 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work) while (flush_state < ARRAY_SIZE(data_flush_states)) { flush_space(fs_info, space_info, U64_MAX, - data_flush_states[flush_state]); + data_flush_states[flush_state], false); spin_lock(&space_info->lock); if (list_empty(&space_info->tickets)) { space_info->flush = 0; @@ -1309,7 +1309,8 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, flush_state = 0; do { - flush_space(fs_info, space_info, to_reclaim, states[flush_state]); + flush_space(fs_info, space_info, to_reclaim, states[flush_state], + false); flush_state++; spin_lock(&space_info->lock); if (ticket->bytes == 0) { @@ -1325,7 +1326,7 @@ static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info, struct reserve_ticket *ticket) { while (!space_info->full) { - flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE); + flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false); spin_lock(&space_info->lock); if (ticket->bytes == 0) { spin_unlock(&space_info->lock); diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 8a7c163907a2..807921de6b32 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -1113,15 +1113,16 @@ TRACE_EVENT(btrfs_trigger_flush, TRACE_EVENT(btrfs_flush_space, TP_PROTO(const struct btrfs_fs_info *fs_info, u64 flags, u64 num_bytes, - int state, int ret), + int state, int ret, bool for_preempt), - TP_ARGS(fs_info, flags, num_bytes, state, ret), + TP_ARGS(fs_info, flags, num_bytes, state, ret, for_preempt), TP_STRUCT__entry_btrfs( __field( u64, flags ) __field( u64, num_bytes ) __field( int, state ) __field( int, ret ) + __field( bool, for_preempt ) ), TP_fast_assign_btrfs(fs_info, @@ -1129,15 +1130,16 @@ TRACE_EVENT(btrfs_flush_space, __entry->num_bytes = num_bytes; __entry->state = state; __entry->ret = ret; + __entry->for_preempt = for_preempt; ), - TP_printk_btrfs("state=%d(%s) flags=%llu(%s) num_bytes=%llu ret=%d", + TP_printk_btrfs("state=%d(%s) flags=%llu(%s) num_bytes=%llu ret=%d for_preempt=%d", __entry->state, __print_symbolic(__entry->state, FLUSH_STATES), __entry->flags, __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS), - __entry->num_bytes, __entry->ret) + __entry->num_bytes, __entry->ret, __entry->for_preempt) ); DECLARE_EVENT_CLASS(btrfs__reserved_extent, -- cgit v1.2.3-70-g09d2 From e5ad49e215a07562f0a765c68161d13d7c23d8d1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 9 Oct 2020 09:28:29 -0400 Subject: btrfs: add a trace class for dumping the current ENOSPC state Often when I'm debugging ENOSPC related issues I have to resort to printing the entire ENOSPC state with trace_printk() in different spots. This gets pretty annoying, so add a trace state that does this for us. Then add a trace point at the end of preemptive flushing so you can see the state of the space_info when we decide to exit preemptive flushing. This helped me figure out we weren't kicking in the preemptive flushing soon enough. Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 1 + include/trace/events/btrfs.h | 62 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index bb4f85c07738..bccd98141a6e 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1140,6 +1140,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) /* We only went through once, back off our clamping. */ if (loops == 1 && !space_info->reclaim_size) space_info->clamp = max(1, space_info->clamp - 1); + trace_btrfs_done_preemptive_reclaim(fs_info, space_info); spin_unlock(&space_info->lock); } diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 807921de6b32..0551ea65374f 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -2029,6 +2029,68 @@ TRACE_EVENT(btrfs_convert_extent_bit, __print_flags(__entry->clear_bits, "|", EXTENT_FLAGS)) ); +DECLARE_EVENT_CLASS(btrfs_dump_space_info, + TP_PROTO(const struct btrfs_fs_info *fs_info, + const struct btrfs_space_info *sinfo), + + TP_ARGS(fs_info, sinfo), + + TP_STRUCT__entry_btrfs( + __field( u64, flags ) + __field( u64, total_bytes ) + __field( u64, bytes_used ) + __field( u64, bytes_pinned ) + __field( u64, bytes_reserved ) + __field( u64, bytes_may_use ) + __field( u64, bytes_readonly ) + __field( u64, reclaim_size ) + __field( int, clamp ) + __field( u64, global_reserved ) + __field( u64, trans_reserved ) + __field( u64, delayed_refs_reserved ) + __field( u64, delayed_reserved ) + __field( u64, free_chunk_space ) + ), + + TP_fast_assign_btrfs(fs_info, + __entry->flags = sinfo->flags; + __entry->total_bytes = sinfo->total_bytes; + __entry->bytes_used = sinfo->bytes_used; + __entry->bytes_pinned = sinfo->bytes_pinned; + __entry->bytes_reserved = sinfo->bytes_reserved; + __entry->bytes_may_use = sinfo->bytes_may_use; + __entry->bytes_readonly = sinfo->bytes_readonly; + __entry->reclaim_size = sinfo->reclaim_size; + __entry->clamp = sinfo->clamp; + __entry->global_reserved = fs_info->global_block_rsv.reserved; + __entry->trans_reserved = fs_info->trans_block_rsv.reserved; + __entry->delayed_refs_reserved = fs_info->delayed_refs_rsv.reserved; + __entry->delayed_reserved = fs_info->delayed_block_rsv.reserved; + __entry->free_chunk_space = atomic64_read(&fs_info->free_chunk_space); + ), + + TP_printk_btrfs("flags=%s total_bytes=%llu bytes_used=%llu " + "bytes_pinned=%llu bytes_reserved=%llu " + "bytes_may_use=%llu bytes_readonly=%llu " + "reclaim_size=%llu clamp=%d global_reserved=%llu " + "trans_reserved=%llu delayed_refs_reserved=%llu " + "delayed_reserved=%llu chunk_free_space=%llu", + __print_flags(__entry->flags, "|", BTRFS_GROUP_FLAGS), + __entry->total_bytes, __entry->bytes_used, + __entry->bytes_pinned, __entry->bytes_reserved, + __entry->bytes_may_use, __entry->bytes_readonly, + __entry->reclaim_size, __entry->clamp, + __entry->global_reserved, __entry->trans_reserved, + __entry->delayed_refs_reserved, + __entry->delayed_reserved, __entry->free_chunk_space) +); + +DEFINE_EVENT(btrfs_dump_space_info, btrfs_done_preemptive_reclaim, + TP_PROTO(const struct btrfs_fs_info *fs_info, + const struct btrfs_space_info *sinfo), + TP_ARGS(fs_info, sinfo) +); + TRACE_EVENT(btrfs_reserve_ticket, TP_PROTO(const struct btrfs_fs_info *fs_info, u64 flags, u64 bytes, u64 start_ns, int flush, int error), -- cgit v1.2.3-70-g09d2 From 2965194b7700f9405860557826520fd6e8e8b9ad Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 27 Jan 2021 15:05:41 +0000 Subject: btrfs: remove wrong comment for can_nocow_extent() The comment for can_nocow_extent() says that the function will flush ordered extents, however that never happens and was never true before the comment was added in commit e4ecaf90bc13 ("btrfs: add comments for btrfs_check_can_nocow() and can_nocow_extent()"). This is true only for the function btrfs_check_can_nocow(), which after that commit was renamed to check_can_nocow(). So just remove that part of the comment. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0dbe1aaa0b71..589030cefd90 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7105,9 +7105,6 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, * @strict: if true, omit optimizations that might force us into unnecessary * cow. e.g., don't trust generation number. * - * This function will flush ordered extents in the range to ensure proper - * nocow checks for (nowait == false) case. - * * Return: * >0 and update @len if we can do nocow write * 0 if we can't do nocow write -- cgit v1.2.3-70-g09d2 From a4559e6f6f3a4e84cb788ac158fb419ece473527 Mon Sep 17 00:00:00 2001 From: Abaci Team Date: Wed, 27 Jan 2021 16:11:37 +0800 Subject: btrfs: simplify condition in __btrfs_run_delayed_items Fix the following coccicheck warnings: ./fs/btrfs/delayed-inode.c:1157:39-41: WARNING !A || A && B is equivalent to !A || B. Reported-by: Abaci Robot Suggested-by: Jiapeng Zhong Reviewed-by: Josef Bacik Signed-off-by: Abaci Team Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 70c0340d839c..ec0b50b8c5d6 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1154,7 +1154,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) delayed_root = fs_info->delayed_root; curr_node = btrfs_first_delayed_node(delayed_root); - while (curr_node && (!count || (count && nr--))) { + while (curr_node && (!count || nr--)) { ret = __btrfs_commit_inode_delayed_items(trans, path, curr_node); if (ret) { -- cgit v1.2.3-70-g09d2 From 951c80f83d61bd4b21794c8aba829c3c1a45c2d0 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 27 Jan 2021 14:38:48 +0800 Subject: btrfs: fix double accounting of ordered extent for subpage case in btrfs_invalidapge Commit dbfdb6d1b369 ("Btrfs: Search for all ordered extents that could span across a page") make btrfs_invalidapage() to search all ordered extents. The offending code looks like this: again: start = page_start; ordered = btrfs_lookup_ordered_range(inode, start, page_end - start + 1); if (ordred) { end = min(page_end, ordered->file_offset + ordered->num_bytes - 1); /* Do the cleanup */ start = end + 1; if (start < page_end) goto again; } The behavior is indeed necessary for the incoming subpage support, but when it iterates through all the ordered extents, it also resets the search range @start. This means, for the following cases, we can double account the ordered extents, causing its bytes_left underflow: Page offset 0 16K 32K |<--- OE 1 --->|<--- OE 2 ---->| As the first iteration will find ordered extent (OE) 1, which doesn't cover the full page, thus after cleanup code, we need to retry again. But again label will reset start to page_start, and we got OE 1 again, which causes double accounting on OE 1, and cause OE 1's byte_left to underflow. This problem can only happen for subpage case, as for regular sectorsize == PAGE_SIZE case, we will always find a OE ends at or after page end, thus no way to trigger the problem. Move the again label after start = page_start. There will be more comprehensive rework to convert the open coded loop to a proper while loop for subpage support. Fixes: dbfdb6d1b369 ("Btrfs: Search for all ordered extents that could span across a page") Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 589030cefd90..680cd0eea6d1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8183,8 +8183,9 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, if (!inode_evicting) lock_extent_bits(tree, page_start, page_end, &cached_state); -again: + start = page_start; +again: ordered = btrfs_lookup_ordered_range(inode, start, page_end - start + 1); if (ordered) { found_ordered = true; -- cgit v1.2.3-70-g09d2 From 420343131970fd29db129b308612f9364b06df0b Mon Sep 17 00:00:00 2001 From: Michal Rostecki Date: Wed, 27 Jan 2021 14:57:27 +0100 Subject: btrfs: let callers of btrfs_get_io_geometry pass the em Before this change, the btrfs_get_io_geometry() function was calling btrfs_get_chunk_map() to get the extent mapping, necessary for calculating the I/O geometry. It was using that extent mapping only internally and freeing the pointer after its execution. That resulted in calling btrfs_get_chunk_map() de facto twice by the __btrfs_map_block() function. It was calling btrfs_get_io_geometry() first and then calling btrfs_get_chunk_map() directly to get the extent mapping, used by the rest of the function. Change that to passing the extent mapping to the btrfs_get_io_geometry() function as an argument. This could improve performance in some cases. For very large filesystems, i.e. several thousands of allocated chunks, not only this avoids searching two times the rbtree, saving time, it may also help reducing contention on the lock that protects the tree - thinking of writeback starting for multiple inodes, other tasks allocating or removing chunks, and anything else that requires access to the rbtree. Reviewed-by: Filipe Manana Signed-off-by: Michal Rostecki Reviewed-by: David Sterba [ add Filipe's analysis ] Signed-off-by: David Sterba --- fs/btrfs/inode.c | 41 ++++++++++++++++++++++++++++++----------- fs/btrfs/volumes.c | 43 ++++++++++++++++++------------------------- fs/btrfs/volumes.h | 5 +++-- 3 files changed, 51 insertions(+), 38 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 680cd0eea6d1..04cd95899ac8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2183,9 +2183,10 @@ int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 logical = bio->bi_iter.bi_sector << 9; + struct extent_map *em; u64 length = 0; u64 map_length; - int ret; + int ret = 0; struct btrfs_io_geometry geom; if (bio_flags & EXTENT_BIO_COMPRESSED) @@ -2193,14 +2194,19 @@ int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, length = bio->bi_iter.bi_size; map_length = length; - ret = btrfs_get_io_geometry(fs_info, btrfs_op(bio), logical, map_length, - &geom); + em = btrfs_get_chunk_map(fs_info, logical, map_length); + if (IS_ERR(em)) + return PTR_ERR(em); + ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), logical, + map_length, &geom); if (ret < 0) - return ret; + goto out; if (geom.len < length + size) - return 1; - return 0; + ret = 1; +out: + free_extent_map(em); + return ret; } /* @@ -7938,10 +7944,12 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, u64 submit_len; int clone_offset = 0; int clone_len; + u64 logical; int ret; blk_status_t status; struct btrfs_io_geometry geom; struct btrfs_dio_data *dio_data = iomap->private; + struct extent_map *em = NULL; dip = btrfs_create_dio_private(dio_bio, inode, file_offset); if (!dip) { @@ -7970,12 +7978,18 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, submit_len = dio_bio->bi_iter.bi_size; do { - ret = btrfs_get_io_geometry(fs_info, btrfs_op(dio_bio), - start_sector << 9, submit_len, - &geom); + logical = start_sector << 9; + em = btrfs_get_chunk_map(fs_info, logical, submit_len); + if (IS_ERR(em)) { + status = errno_to_blk_status(PTR_ERR(em)); + em = NULL; + goto out_err_em; + } + ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(dio_bio), + logical, submit_len, &geom); if (ret) { status = errno_to_blk_status(ret); - goto out_err; + goto out_err_em; } ASSERT(geom.len <= INT_MAX); @@ -8020,19 +8034,24 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, bio_put(bio); if (submit_len > 0) refcount_dec(&dip->refs); - goto out_err; + goto out_err_em; } dio_data->submitted += clone_len; clone_offset += clone_len; start_sector += clone_len >> 9; file_offset += clone_len; + + free_extent_map(em); } while (submit_len > 0); return BLK_QC_T_NONE; +out_err_em: + free_extent_map(em); out_err: dip->dio_bio->bi_status = status; btrfs_dio_private_put(dip); + return BLK_QC_T_NONE; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a8ec8539cd8d..3948f5b50d11 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5940,23 +5940,24 @@ static bool need_full_stripe(enum btrfs_map_op op) } /* - * btrfs_get_io_geometry - calculates the geomery of a particular (address, len) - * tuple. This information is used to calculate how big a - * particular bio can get before it straddles a stripe. + * Calculate the geometry of a particular (address, len) tuple. This + * information is used to calculate how big a particular bio can get before it + * straddles a stripe. * - * @fs_info - the filesystem - * @logical - address that we want to figure out the geometry of - * @len - the length of IO we are going to perform, starting at @logical - * @op - type of operation - write or read - * @io_geom - pointer used to return values + * @fs_info: the filesystem + * @em: mapping containing the logical extent + * @op: type of operation - write or read + * @logical: address that we want to figure out the geometry of + * @len: the length of IO we are going to perform, starting at @logical + * @io_geom: pointer used to return values * * Returns < 0 in case a chunk for the given logical address cannot be found, * usually shouldn't happen unless @logical is corrupted, 0 otherwise. */ -int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, - u64 logical, u64 len, struct btrfs_io_geometry *io_geom) +int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, + enum btrfs_map_op op, u64 logical, u64 len, + struct btrfs_io_geometry *io_geom) { - struct extent_map *em; struct map_lookup *map; u64 offset; u64 stripe_offset; @@ -5964,14 +5965,9 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 stripe_len; u64 raid56_full_stripe_start = (u64)-1; int data_stripes; - int ret = 0; ASSERT(op != BTRFS_MAP_DISCARD); - em = btrfs_get_chunk_map(fs_info, logical, len); - if (IS_ERR(em)) - return PTR_ERR(em); - map = em->map_lookup; /* Offset of this logical address in the chunk */ offset = logical - em->start; @@ -5985,8 +5981,7 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, btrfs_crit(fs_info, "stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu", stripe_offset, offset, em->start, logical, stripe_len); - ret = -EINVAL; - goto out; + return -EINVAL; } /* stripe_offset is the offset of this block in its stripe */ @@ -6033,10 +6028,7 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, io_geom->stripe_offset = stripe_offset; io_geom->raid56_stripe_offset = raid56_full_stripe_start; -out: - /* once for us */ - free_extent_map(em); - return ret; + return 0; } static int __btrfs_map_block(struct btrfs_fs_info *fs_info, @@ -6069,12 +6061,13 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, ASSERT(bbio_ret); ASSERT(op != BTRFS_MAP_DISCARD); - ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom); + em = btrfs_get_chunk_map(fs_info, logical, *length); + ASSERT(!IS_ERR(em)); + + ret = btrfs_get_io_geometry(fs_info, em, op, logical, *length, &geom); if (ret < 0) return ret; - em = btrfs_get_chunk_map(fs_info, logical, *length); - ASSERT(!IS_ERR(em)); map = em->map_lookup; *length = geom.len; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index c43663d9c22e..04e2b26823c2 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -440,8 +440,9 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_bio **bbio_ret); -int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, - u64 logical, u64 len, struct btrfs_io_geometry *io_geom); +int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *map, + enum btrfs_map_op op, u64 logical, u64 len, + struct btrfs_io_geometry *io_geom); int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type); -- cgit v1.2.3-70-g09d2 From ddffcf6fb5ac54ffcd7e90b10554d89dbd10b47b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 27 Jan 2021 10:34:54 +0000 Subject: btrfs: remove unnecessary directory inode item update when deleting dir entry When we remove a directory entry, as part of an unlink operation, if the directory was logged before we must remove the directory index items from the log. We are also updating the inode item of the directory to update its i_size, but that is not necessary because during log replay we do not need it and we correctly adjust the i_size in the inode item of the subvolume as we process directory index items and replay deletes. This is not needed since commit d555438b6e1dad ("Btrfs: drop dir i_size when adding new names on replay"), where we explicitly ignore the i_size of directory inode items on log replay. Before that we used it but it was buggy as mentioned in that commit's change log (i_size got a larger value then it should have). So stop updating the i_size of the directory inode item in the log, as that is a waste of time, adds more log contention to the log tree and often results in COWing more extent buffers for the log tree. This code path is triggered often during dbench workloads for example. This patch is part of a patchset comprised of the following patches: btrfs: remove unnecessary directory inode item update when deleting dir entry btrfs: stop setting nbytes when filling inode item for logging btrfs: avoid logging new ancestor inodes when logging new inode btrfs: skip logging directories already logged when logging all parents btrfs: skip logging inodes already logged when logging new entries btrfs: remove unnecessary check_parent_dirs_for_sync() btrfs: make concurrent fsyncs wait less when waiting for a transaction commit Performance results, after applying all patches, are mentioned in the change log of the last patch. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 39 ++++----------------------------------- 1 file changed, 4 insertions(+), 35 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 8ee0700a980f..5d87afc6058a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3379,7 +3379,6 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_path *path; int ret; int err = 0; - int bytes_del = 0; u64 dir_ino = btrfs_ino(dir); if (!inode_logged(trans, dir)) @@ -3406,7 +3405,6 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, } if (di) { ret = btrfs_delete_one_dir_name(trans, log, path, di); - bytes_del += name_len; if (ret) { err = ret; goto fail; @@ -3421,46 +3419,17 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, } if (di) { ret = btrfs_delete_one_dir_name(trans, log, path, di); - bytes_del += name_len; if (ret) { err = ret; goto fail; } } - /* update the directory size in the log to reflect the names - * we have removed + /* + * We do not need to update the size field of the directory's inode item + * because on log replay we update the field to reflect all existing + * entries in the directory (see overwrite_item()). */ - if (bytes_del) { - struct btrfs_key key; - - key.objectid = dir_ino; - key.offset = 0; - key.type = BTRFS_INODE_ITEM_KEY; - btrfs_release_path(path); - - ret = btrfs_search_slot(trans, log, &key, path, 0, 1); - if (ret < 0) { - err = ret; - goto fail; - } - if (ret == 0) { - struct btrfs_inode_item *item; - u64 i_size; - - item = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_inode_item); - i_size = btrfs_inode_size(path->nodes[0], item); - if (i_size > bytes_del) - i_size -= bytes_del; - else - i_size = 0; - btrfs_set_inode_size(path->nodes[0], item, i_size); - btrfs_mark_buffer_dirty(path->nodes[0]); - } else - ret = 0; - btrfs_release_path(path); - } fail: btrfs_free_path(path); out_unlock: -- cgit v1.2.3-70-g09d2 From e593e54ed1f643f5007ab4656188b7c3c9a9cb11 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 27 Jan 2021 10:34:55 +0000 Subject: btrfs: stop setting nbytes when filling inode item for logging When we fill an inode item for logging we are setting its nbytes field with the value returned by inode_get_bytes() (a VFS API), however we do not need it because it is not used during log replay. In fact, for fast fsyncs, when we call inode_get_bytes() we may even get an outdated value for nbytes because the nbytes field of the inode is only updated when ordered extents complete, and a fast fsync only waits for writeback to complete, it does not wait for ordered extent completion. So just remove the setup of nbytes and add an explicit comment mentioning why we do not set it. This also avoids adding contention on the inode's i_lock (VFS) with concurrent stat() calls, since that spinlock is used by inode_get_bytes() which is also called by our stat callback (btrfs_getattr()). This patch is part of a patchset comprised of the following patches: btrfs: remove unnecessary directory inode item update when deleting dir entry btrfs: stop setting nbytes when filling inode item for logging btrfs: avoid logging new ancestor inodes when logging new inode btrfs: skip logging directories already logged when logging all parents btrfs: skip logging inodes already logged when logging new entries btrfs: remove unnecessary check_parent_dirs_for_sync() btrfs: make concurrent fsyncs wait less when waiting for a transaction commit Performance results, after applying all patches, are mentioned in the change log of the last patch. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 5d87afc6058a..be62759f0aac 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3858,7 +3858,14 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_set_token_timespec_nsec(&token, &item->ctime, inode->i_ctime.tv_nsec); - btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode)); + /* + * We do not need to set the nbytes field, in fact during a fast fsync + * its value may not even be correct, since a fast fsync does not wait + * for ordered extent completion, which is where we update nbytes, it + * only waits for writeback to complete. During log replay as we find + * file extent items and replay them, we adjust the nbytes field of the + * inode item in subvolume tree as needed (see overwrite_item()). + */ btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); btrfs_set_token_inode_transid(&token, item, trans->transid); -- cgit v1.2.3-70-g09d2 From ab12313a9f56b939529abc80ac26bedefb3d5b62 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 27 Jan 2021 10:34:56 +0000 Subject: btrfs: avoid logging new ancestor inodes when logging new inode When we fsync a new file, created in the current transaction, we check all its ancestor inodes and always log them if they were created in the current transaction - even if we have already logged them before, which is a waste of time. So avoid logging new ancestor inodes if they were already logged before and have no xattrs added/updated/removed since they were last logged. This patch is part of a patchset comprised of the following patches: btrfs: remove unnecessary directory inode item update when deleting dir entry btrfs: stop setting nbytes when filling inode item for logging btrfs: avoid logging new ancestor inodes when logging new inode btrfs: skip logging directories already logged when logging all parents btrfs: skip logging inodes already logged when logging new entries btrfs: remove unnecessary check_parent_dirs_for_sync() btrfs: make concurrent fsyncs wait less when waiting for a transaction commit Performance results, after applying all patches, are mentioned in the change log of the last patch. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index be62759f0aac..105cf316ee27 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5272,6 +5272,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, if (S_ISDIR(inode->vfs_inode.i_mode)) { int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; + clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags); if (inode_only == LOG_INODE_EXISTS) max_key_type = BTRFS_XATTR_ITEM_KEY; ret = drop_objectid_items(trans, log, path, ino, max_key_type); @@ -5520,6 +5521,34 @@ out: return ret; } +/* + * Check if we need to log an inode. This is used in contexts where while + * logging an inode we need to log another inode (either that it exists or in + * full mode). This is used instead of btrfs_inode_in_log() because the later + * requires the inode to be in the log and have the log transaction committed, + * while here we do not care if the log transaction was already committed - our + * caller will commit the log later - and we want to avoid logging an inode + * multiple times when multiple tasks have joined the same log transaction. + */ +static bool need_log_inode(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode) +{ + /* + * If this inode does not have new/updated/deleted xattrs since the last + * time it was logged and is flagged as logged in the current transaction, + * we can skip logging it. As for new/deleted names, those are updated in + * the log by link/unlink/rename operations. + * In case the inode was logged and then evicted and reloaded, its + * logged_trans will be 0, in which case we have to fully log it since + * logged_trans is a transient field, not persisted. + */ + if (inode->logged_trans == trans->transid && + !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags)) + return false; + + return true; +} + struct btrfs_dir_list { u64 ino; struct list_head list; @@ -5848,7 +5877,8 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, if (IS_ERR(inode)) return PTR_ERR(inode); - if (BTRFS_I(inode)->generation >= trans->transid) + if (BTRFS_I(inode)->generation >= trans->transid && + need_log_inode(trans, BTRFS_I(inode))) ret = btrfs_log_inode(trans, root, BTRFS_I(inode), LOG_INODE_EXISTS, ctx); btrfs_add_delayed_iput(inode); @@ -5902,7 +5932,8 @@ static int log_new_ancestors_fast(struct btrfs_trans_handle *trans, if (root != inode->root) break; - if (inode->generation >= trans->transid) { + if (inode->generation >= trans->transid && + need_log_inode(trans, inode)) { ret = btrfs_log_inode(trans, root, inode, LOG_INODE_EXISTS, ctx); if (ret) -- cgit v1.2.3-70-g09d2 From 3e6a86a193b08039a382807c56421622c3ff4368 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 27 Jan 2021 10:34:57 +0000 Subject: btrfs: skip logging directories already logged when logging all parents Some times when we fsync an inode we need to do a full log of all its ancestors (due to unlink, link or rename operations), which can be an expensive operation, specially if the directories are large. However if we find an ancestor directory inode that is already logged in the current transaction, and has no inserted/updated/deleted xattrs since it was last logged, we can skip logging the directory again. We are safe to skip that since we know that for logged directories, any link, unlink or rename operations that implicate the directory will update the log as necessary. So use the helper need_log_dir(), introduced in a previous commit, to detect already logged directories that can be skipped. This patch is part of a patchset comprised of the following patches: btrfs: remove unnecessary directory inode item update when deleting dir entry btrfs: stop setting nbytes when filling inode item for logging btrfs: avoid logging new ancestor inodes when logging new inode btrfs: skip logging directories already logged when logging all parents btrfs: skip logging inodes already logged when logging new entries btrfs: remove unnecessary check_parent_dirs_for_sync() btrfs: make concurrent fsyncs wait less when waiting for a transaction commit Performance results, after applying all patches, are mentioned in the change log of the last patch. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 105cf316ee27..c0dce99c2c14 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5826,6 +5826,11 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, goto out; } + if (!need_log_inode(trans, BTRFS_I(dir_inode))) { + btrfs_add_delayed_iput(dir_inode); + continue; + } + if (ctx) ctx->log_new_dentries = false; ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode), -- cgit v1.2.3-70-g09d2 From 0e44cb3f94284d33067fc74e30990a0ed5b3540d Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 27 Jan 2021 10:34:58 +0000 Subject: btrfs: skip logging inodes already logged when logging new entries When logging new directory entries of a directory, we log the inodes of new dentries and the inodes of dentries pointing to directories that may have been created in past transactions. For the case of directories we log in full mode, which can be particularly expensive for large directories. We do use btrfs_inode_in_log() to skip already logged inodes, however for that helper to return true, it requires that the log transaction used to log the inode to be already committed. This means that when we have more than one task using the same log transaction we can end up logging an inode multiple times, which is a waste of time and not necessary since the log will be committed by one of the tasks and the others will wait for the log transaction to be committed before returning to user space. So simply replace the use of btrfs_inode_in_log() with the new helper function need_log_inode(), introduced in a previous commit. This patch is part of a patchset comprised of the following patches: btrfs: remove unnecessary directory inode item update when deleting dir entry btrfs: stop setting nbytes when filling inode item for logging btrfs: avoid logging new ancestor inodes when logging new inode btrfs: skip logging directories already logged when logging all parents btrfs: skip logging inodes already logged when logging new entries btrfs: remove unnecessary check_parent_dirs_for_sync() btrfs: make concurrent fsyncs wait less when waiting for a transaction commit Performance results, after applying all patches, are mentioned in the change log of the last patch. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c0dce99c2c14..6dc376a16cf2 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5676,7 +5676,7 @@ process_leaf: goto next_dir_inode; } - if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) { + if (!need_log_inode(trans, BTRFS_I(di_inode))) { btrfs_add_delayed_iput(di_inode); break; } -- cgit v1.2.3-70-g09d2 From 64d6b281ba4db044c946158387c74e1149b9487e Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 27 Jan 2021 10:34:59 +0000 Subject: btrfs: remove unnecessary check_parent_dirs_for_sync() Whenever we fsync an inode, if it is a directory, a regular file that was created in the current transaction or has last_unlink_trans set to the generation of the current transaction, we check if any of its ancestor inodes (and the inode itself if it is a directory) can not be logged and need a fallback to a full transaction commit - if so, we return with a value of 1 in order to fallback to a transaction commit. However we often do not need to fallback to a transaction commit because: 1) The ancestor inode is not an immediate parent, and therefore there is not an explicit request to log it and it is not needed neither to guarantee the consistency of the inode originally asked to be logged (fsynced) nor its immediate parent; 2) The ancestor inode was already logged before, in which case any link, unlink or rename operation updates the log as needed. So for these two cases we can avoid an unnecessary transaction commit. Therefore remove check_parent_dirs_for_sync() and add a check at the top of btrfs_log_inode() to make us fallback immediately to a transaction commit when we are logging a directory inode that can not be logged and needs a full transaction commit. All we need to protect is the case where after renaming a file someone fsyncs only the old directory, which would result is losing the renamed file after a log replay. This patch is part of a patchset comprised of the following patches: btrfs: remove unnecessary directory inode item update when deleting dir entry btrfs: stop setting nbytes when filling inode item for logging btrfs: avoid logging new ancestor inodes when logging new inode btrfs: skip logging directories already logged when logging all parents btrfs: skip logging inodes already logged when logging new entries btrfs: remove unnecessary check_parent_dirs_for_sync() btrfs: make concurrent fsyncs wait less when waiting for a transaction commit Performance results, after applying all patches, are mentioned in the change log of the last patch. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 121 +++++++--------------------------------------------- 1 file changed, 15 insertions(+), 106 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 6dc376a16cf2..4c7b283ed2b2 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5265,6 +5265,21 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, mutex_lock(&inode->log_mutex); } + /* + * This is for cases where logging a directory could result in losing a + * a file after replaying the log. For example, if we move a file from a + * directory A to a directory B, then fsync directory A, we have no way + * to known the file was moved from A to B, so logging just A would + * result in losing the file after a log replay. + */ + if (S_ISDIR(inode->vfs_inode.i_mode) && + inode_only == LOG_INODE_ALL && + inode->last_unlink_trans >= trans->transid) { + btrfs_set_log_full_commit(trans); + err = 1; + goto out_unlock; + } + /* * a brute force approach to making sure we get the most uptodate * copies of everything. @@ -5428,99 +5443,6 @@ out_unlock: return err; } -/* - * Check if we must fallback to a transaction commit when logging an inode. - * This must be called after logging the inode and is used only in the context - * when fsyncing an inode requires the need to log some other inode - in which - * case we can't lock the i_mutex of each other inode we need to log as that - * can lead to deadlocks with concurrent fsync against other inodes (as we can - * log inodes up or down in the hierarchy) or rename operations for example. So - * we take the log_mutex of the inode after we have logged it and then check for - * its last_unlink_trans value - this is safe because any task setting - * last_unlink_trans must take the log_mutex and it must do this before it does - * the actual unlink operation, so if we do this check before a concurrent task - * sets last_unlink_trans it means we've logged a consistent version/state of - * all the inode items, otherwise we are not sure and must do a transaction - * commit (the concurrent task might have only updated last_unlink_trans before - * we logged the inode or it might have also done the unlink). - */ -static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode) -{ - bool ret = false; - - mutex_lock(&inode->log_mutex); - if (inode->last_unlink_trans >= trans->transid) { - /* - * Make sure any commits to the log are forced to be full - * commits. - */ - btrfs_set_log_full_commit(trans); - ret = true; - } - mutex_unlock(&inode->log_mutex); - - return ret; -} - -/* - * follow the dentry parent pointers up the chain and see if any - * of the directories in it require a full commit before they can - * be logged. Returns zero if nothing special needs to be done or 1 if - * a full commit is required. - */ -static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, - struct dentry *parent, - struct super_block *sb) -{ - int ret = 0; - struct dentry *old_parent = NULL; - - /* - * for regular files, if its inode is already on disk, we don't - * have to worry about the parents at all. This is because - * we can use the last_unlink_trans field to record renames - * and other fun in this file. - */ - if (S_ISREG(inode->vfs_inode.i_mode) && - inode->generation < trans->transid && - inode->last_unlink_trans < trans->transid) - goto out; - - if (!S_ISDIR(inode->vfs_inode.i_mode)) { - if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) - goto out; - inode = BTRFS_I(d_inode(parent)); - } - - while (1) { - if (btrfs_must_commit_transaction(trans, inode)) { - ret = 1; - break; - } - - if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) - break; - - if (IS_ROOT(parent)) { - inode = BTRFS_I(d_inode(parent)); - if (btrfs_must_commit_transaction(trans, inode)) - ret = 1; - break; - } - - parent = dget_parent(parent); - dput(old_parent); - old_parent = parent; - inode = BTRFS_I(d_inode(parent)); - - } - dput(old_parent); -out: - return ret; -} - /* * Check if we need to log an inode. This is used in contexts where while * logging an inode we need to log another inode (either that it exists or in @@ -5686,9 +5608,6 @@ process_leaf: log_mode = LOG_INODE_ALL; ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode), log_mode, ctx); - if (!ret && - btrfs_must_commit_transaction(trans, BTRFS_I(di_inode))) - ret = 1; btrfs_add_delayed_iput(di_inode); if (ret) goto next_dir_inode; @@ -5835,9 +5754,6 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, ctx->log_new_dentries = false; ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode), LOG_INODE_ALL, ctx); - if (!ret && - btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode))) - ret = 1; if (!ret && ctx && ctx->log_new_dentries) ret = log_new_dir_dentries(trans, root, BTRFS_I(dir_inode), ctx); @@ -6053,12 +5969,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; - struct super_block *sb; int ret = 0; bool log_dentries = false; - sb = inode->vfs_inode.i_sb; - if (btrfs_test_opt(fs_info, NOTREELOG)) { ret = 1; goto end_no_trans; @@ -6069,10 +5982,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_no_trans; } - ret = check_parent_dirs_for_sync(trans, inode, parent, sb); - if (ret) - goto end_no_trans; - /* * Skip already logged inodes or inodes corresponding to tmpfiles * (since logging them is pointless, a link count of 0 means they -- cgit v1.2.3-70-g09d2 From d0c2f4fa555e70324ec2a129b822ab58f172cc62 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 27 Jan 2021 10:35:00 +0000 Subject: btrfs: make concurrent fsyncs wait less when waiting for a transaction commit Often an fsync needs to fallback to a transaction commit for several reasons (to ensure consistency after a power failure, a new block group was allocated or a temporary error such as ENOMEM or ENOSPC happened). In that case the log is marked as needing a full commit and any concurrent tasks attempting to log inodes or commit the log will also fallback to the transaction commit. When this happens they all wait for the task that first started the transaction commit to finish the transaction commit - however they wait until the full transaction commit happens, which is not needed, as they only need to wait for the superblocks to be persisted and not for unpinning all the extents pinned during the transaction's lifetime, which even for short lived transactions can be a few thousand and take some significant amount of time to complete - for dbench workloads I have observed up to 4~5 milliseconds of time spent unpinning extents in the worst cases, and the number of pinned extents was between 2 to 3 thousand. So allow fsync tasks to skip waiting for the unpinning of extents when they call btrfs_commit_transaction() and they were not the task that started the transaction commit (that one has to do it, the alternative would be to offload the transaction commit to another task so that it could avoid waiting for the extent unpinning or offload the extent unpinning to another task). This patch is part of a patchset comprised of the following patches: btrfs: remove unnecessary directory inode item update when deleting dir entry btrfs: stop setting nbytes when filling inode item for logging btrfs: avoid logging new ancestor inodes when logging new inode btrfs: skip logging directories already logged when logging all parents btrfs: skip logging inodes already logged when logging new entries btrfs: remove unnecessary check_parent_dirs_for_sync() btrfs: make concurrent fsyncs wait less when waiting for a transaction commit After applying the entire patchset, dbench shows improvements in respect to throughput and latency. The script used to measure it is the following: $ cat dbench-test.sh #!/bin/bash DEV=/dev/sdk MNT=/mnt/sdk MOUNT_OPTIONS="-o ssd" MKFS_OPTIONS="-m single -d single" echo "performance" | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor umount $DEV &> /dev/null mkfs.btrfs -f $MKFS_OPTIONS $DEV mount $MOUNT_OPTIONS $DEV $MNT dbench -D $MNT -t 300 64 umount $MNT The test was run on a physical machine with 12 cores (Intel corei7), 64G of ram, using a NVMe device and a non-debug kernel configuration (Debian's default configuration). Before applying patchset, 32 clients: Operation Count AvgLat MaxLat ---------------------------------------- NTCreateX 9627107 0.153 61.938 Close 7072076 0.001 3.175 Rename 407633 1.222 44.439 Unlink 1943895 0.658 44.440 Deltree 256 17.339 110.891 Mkdir 128 0.003 0.009 Qpathinfo 8725406 0.064 17.850 Qfileinfo 1529516 0.001 2.188 Qfsinfo 1599884 0.002 1.457 Sfileinfo 784200 0.005 3.562 Find 3373513 0.411 30.312 WriteX 4802132 0.053 29.054 ReadX 15089959 0.002 5.801 LockX 31344 0.002 0.425 UnlockX 31344 0.001 0.173 Flush 674724 5.952 341.830 Throughput 1008.02 MB/sec 32 clients 32 procs max_latency=341.833 ms After applying patchset, 32 clients: After patchset, with 32 clients: Operation Count AvgLat MaxLat ---------------------------------------- NTCreateX 9931568 0.111 25.597 Close 7295730 0.001 2.171 Rename 420549 0.982 49.714 Unlink 2005366 0.497 39.015 Deltree 256 11.149 89.242 Mkdir 128 0.002 0.014 Qpathinfo 9001863 0.049 20.761 Qfileinfo 1577730 0.001 2.546 Qfsinfo 1650508 0.002 3.531 Sfileinfo 809031 0.005 5.846 Find 3480259 0.309 23.977 WriteX 4952505 0.043 41.283 ReadX 15568127 0.002 5.476 LockX 32338 0.002 0.978 UnlockX 32338 0.001 2.032 Flush 696017 7.485 228.835 Throughput 1049.91 MB/sec 32 clients 32 procs max_latency=228.847 ms --> +4.1% throughput, -39.6% max latency Before applying patchset, 64 clients: Operation Count AvgLat MaxLat ---------------------------------------- NTCreateX 8956748 0.342 108.312 Close 6579660 0.001 3.823 Rename 379209 2.396 81.897 Unlink 1808625 1.108 131.148 Deltree 256 25.632 172.176 Mkdir 128 0.003 0.018 Qpathinfo 8117615 0.131 55.916 Qfileinfo 1423495 0.001 2.635 Qfsinfo 1488496 0.002 5.412 Sfileinfo 729472 0.007 8.643 Find 3138598 0.855 78.321 WriteX 4470783 0.102 79.442 ReadX 14038139 0.002 7.578 LockX 29158 0.002 0.844 UnlockX 29158 0.001 0.567 Flush 627746 14.168 506.151 Throughput 924.738 MB/sec 64 clients 64 procs max_latency=506.154 ms After applying patchset, 64 clients: Operation Count AvgLat MaxLat ---------------------------------------- NTCreateX 9069003 0.303 43.193 Close 6662328 0.001 3.888 Rename 383976 2.194 46.418 Unlink 1831080 1.022 43.873 Deltree 256 24.037 155.763 Mkdir 128 0.002 0.005 Qpathinfo 8219173 0.137 30.233 Qfileinfo 1441203 0.001 3.204 Qfsinfo 1507092 0.002 4.055 Sfileinfo 738775 0.006 5.431 Find 3177874 0.936 38.170 WriteX 4526152 0.084 39.518 ReadX 14213562 0.002 24.760 LockX 29522 0.002 1.221 UnlockX 29522 0.001 0.694 Flush 635652 14.358 422.039 Throughput 990.13 MB/sec 64 clients 64 procs max_latency=422.043 ms --> +6.8% throughput, -18.1% max latency Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/file.c | 1 + fs/btrfs/transaction.c | 39 +++++++++++++++++++++++++++++++-------- fs/btrfs/transaction.h | 2 ++ 3 files changed, 34 insertions(+), 8 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index d81ae1f518f2..be5350f5bedf 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2238,6 +2238,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = PTR_ERR(trans); goto out_release_extents; } + trans->in_fsync = true; ret = btrfs_log_dentry_safe(trans, dentry, &ctx); btrfs_release_log_ctx_extents(&ctx); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index b83e8ae38cfc..00c0680dac3a 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -107,6 +107,11 @@ static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = { __TRANS_JOIN | __TRANS_JOIN_NOLOCK | __TRANS_JOIN_NOSTART), + [TRANS_STATE_SUPER_COMMITTED] = (__TRANS_START | + __TRANS_ATTACH | + __TRANS_JOIN | + __TRANS_JOIN_NOLOCK | + __TRANS_JOIN_NOSTART), [TRANS_STATE_COMPLETED] = (__TRANS_START | __TRANS_ATTACH | __TRANS_JOIN | @@ -826,10 +831,11 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root) return trans; } -/* wait for a transaction commit to be fully complete */ -static noinline void wait_for_commit(struct btrfs_transaction *commit) +/* Wait for a transaction commit to reach at least the given state. */ +static noinline void wait_for_commit(struct btrfs_transaction *commit, + const enum btrfs_trans_state min_state) { - wait_event(commit->commit_wait, commit->state == TRANS_STATE_COMPLETED); + wait_event(commit->commit_wait, commit->state >= min_state); } int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) @@ -884,7 +890,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) goto out; /* nothing committing|committed */ } - wait_for_commit(cur_trans); + wait_for_commit(cur_trans, TRANS_STATE_COMPLETED); btrfs_put_transaction(cur_trans); out: return ret; @@ -2100,11 +2106,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) spin_lock(&fs_info->trans_lock); if (cur_trans->state >= TRANS_STATE_COMMIT_START) { + enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED; + spin_unlock(&fs_info->trans_lock); refcount_inc(&cur_trans->use_count); - ret = btrfs_end_transaction(trans); - wait_for_commit(cur_trans); + if (trans->in_fsync) + want_state = TRANS_STATE_SUPER_COMMITTED; + ret = btrfs_end_transaction(trans); + wait_for_commit(cur_trans, want_state); if (TRANS_ABORTED(cur_trans)) ret = cur_trans->aborted; @@ -2118,13 +2128,19 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) wake_up(&fs_info->transaction_blocked_wait); if (cur_trans->list.prev != &fs_info->trans_list) { + enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED; + + if (trans->in_fsync) + want_state = TRANS_STATE_SUPER_COMMITTED; + prev_trans = list_entry(cur_trans->list.prev, struct btrfs_transaction, list); - if (prev_trans->state != TRANS_STATE_COMPLETED) { + if (prev_trans->state < want_state) { refcount_inc(&prev_trans->use_count); spin_unlock(&fs_info->trans_lock); - wait_for_commit(prev_trans); + wait_for_commit(prev_trans, want_state); + ret = READ_ONCE(prev_trans->aborted); btrfs_put_transaction(prev_trans); @@ -2343,6 +2359,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (ret) goto scrub_continue; + /* + * We needn't acquire the lock here because there is no other task + * which can change it. + */ + cur_trans->state = TRANS_STATE_SUPER_COMMITTED; + wake_up(&cur_trans->commit_wait); + btrfs_finish_extent_commit(trans); if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags)) diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 31ca81bad822..935bd6958a8a 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -16,6 +16,7 @@ enum btrfs_trans_state { TRANS_STATE_COMMIT_START, TRANS_STATE_COMMIT_DOING, TRANS_STATE_UNBLOCKED, + TRANS_STATE_SUPER_COMMITTED, TRANS_STATE_COMPLETED, TRANS_STATE_MAX, }; @@ -133,6 +134,7 @@ struct btrfs_trans_handle { bool can_flush_pending_bgs; bool reloc_reserved; bool dirty; + bool in_fsync; struct btrfs_root *root; struct btrfs_fs_info *fs_info; struct list_head new_bgs; -- cgit v1.2.3-70-g09d2 From 6869b0a8be775e920be54ee9b69a743ca20d8332 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 26 Jan 2021 16:33:45 +0800 Subject: btrfs: merge PAGE_CLEAR_DIRTY and PAGE_SET_WRITEBACK to PAGE_START_WRITEBACK PAGE_CLEAR_DIRTY and PAGE_SET_WRITEBACK are two defines used in __process_pages_contig(), to let the function know to clear page dirty bit and then set page writeback. However page writeback and dirty bits are conflicting (at least for sector size == PAGE_SIZE case), this means these two have to be always updated together. This means we can merge PAGE_CLEAR_DIRTY and PAGE_SET_WRITEBACK to PAGE_START_WRITEBACK. Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 4 ++-- fs/btrfs/extent_io.h | 12 ++++++------ fs/btrfs/inode.c | 28 ++++++++++------------------ 3 files changed, 18 insertions(+), 26 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index edcdbd739a1e..7c14ccf76838 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1976,10 +1976,10 @@ static int __process_pages_contig(struct address_space *mapping, pages_processed++; continue; } - if (page_ops & PAGE_CLEAR_DIRTY) + if (page_ops & PAGE_START_WRITEBACK) { clear_page_dirty_for_io(pages[i]); - if (page_ops & PAGE_SET_WRITEBACK) set_page_writeback(pages[i]); + } if (page_ops & PAGE_SET_ERROR) SetPageError(pages[i]); if (page_ops & PAGE_END_WRITEBACK) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 19221095c635..2d8187c84812 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -35,12 +35,12 @@ enum { /* these are flags for __process_pages_contig */ #define PAGE_UNLOCK (1 << 0) -#define PAGE_CLEAR_DIRTY (1 << 1) -#define PAGE_SET_WRITEBACK (1 << 2) -#define PAGE_END_WRITEBACK (1 << 3) -#define PAGE_SET_PRIVATE2 (1 << 4) -#define PAGE_SET_ERROR (1 << 5) -#define PAGE_LOCK (1 << 6) +/* Page starts writeback, clear dirty bit and set writeback bit */ +#define PAGE_START_WRITEBACK (1 << 1) +#define PAGE_END_WRITEBACK (1 << 2) +#define PAGE_SET_PRIVATE2 (1 << 3) +#define PAGE_SET_ERROR (1 << 4) +#define PAGE_LOCK (1 << 5) /* * page->private values. Every page that is controlled by the extent diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 04cd95899ac8..3337c8ee7928 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -692,8 +692,7 @@ cont: NULL, clear_flags, PAGE_UNLOCK | - PAGE_CLEAR_DIRTY | - PAGE_SET_WRITEBACK | + PAGE_START_WRITEBACK | page_error_op | PAGE_END_WRITEBACK); @@ -933,8 +932,7 @@ retry: async_extent->start + async_extent->ram_size - 1, NULL, EXTENT_LOCKED | EXTENT_DELALLOC, - PAGE_UNLOCK | PAGE_CLEAR_DIRTY | - PAGE_SET_WRITEBACK); + PAGE_UNLOCK | PAGE_START_WRITEBACK); if (btrfs_submit_compressed_write(inode, async_extent->start, async_extent->ram_size, ins.objectid, @@ -970,9 +968,8 @@ out_free: NULL, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, - PAGE_UNLOCK | PAGE_CLEAR_DIRTY | - PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK | - PAGE_SET_ERROR); + PAGE_UNLOCK | PAGE_START_WRITEBACK | + PAGE_END_WRITEBACK | PAGE_SET_ERROR); free_async_extent_pages(async_extent); kfree(async_extent); goto again; @@ -1070,8 +1067,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | - PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | - PAGE_END_WRITEBACK); + PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); *nr_written = *nr_written + (end - start + PAGE_SIZE) / PAGE_SIZE; *page_started = 1; @@ -1194,8 +1190,7 @@ out_reserve: out_unlock: clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; - page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | - PAGE_END_WRITEBACK; + page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; /* * If we reserved an extent for our delalloc range (or a subrange) and * failed to create the respective ordered extent, then it means that @@ -1320,9 +1315,8 @@ static int cow_file_range_async(struct btrfs_inode *inode, unsigned clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING; - unsigned long page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | - PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK | - PAGE_SET_ERROR; + unsigned long page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | + PAGE_END_WRITEBACK | PAGE_SET_ERROR; extent_clear_unlock_delalloc(inode, start, end, locked_page, clear_bits, page_ops); @@ -1519,8 +1513,7 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, PAGE_UNLOCK | - PAGE_CLEAR_DIRTY | - PAGE_SET_WRITEBACK | + PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); return -ENOMEM; } @@ -1842,8 +1835,7 @@ error: locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | - PAGE_CLEAR_DIRTY | - PAGE_SET_WRITEBACK | + PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); btrfs_free_path(path); return ret; -- cgit v1.2.3-70-g09d2 From 62c053fbb2d1816def1d353d9abed4c2f1f0abe9 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 26 Jan 2021 16:33:46 +0800 Subject: btrfs: set UNMAPPED bit early in btrfs_clone_extent_buffer() for subpage support For the incoming subpage support, UNMAPPED extent buffer will have different behavior in btrfs_release_extent_buffer(). This means we need to set UNMAPPED bit early before calling btrfs_release_extent_buffer(). Currently there is only one caller which relies on btrfs_release_extent_buffer() in its error path while set UNMAPPED bit late: - btrfs_clone_extent_buffer() Make it subpage compatible by setting the UNMAPPED bit early, since we're here, also move the UPTODATE bit early. There is another caller, __alloc_dummy_extent_buffer(), setting UNMAPPED bit late, but that function clean up the allocated page manually, thus no need for any modification. Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7c14ccf76838..d3819dde8952 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5064,6 +5064,13 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) if (new == NULL) return NULL; + /* + * Set UNMAPPED before calling btrfs_release_extent_buffer(), as + * btrfs_release_extent_buffer() have different behavior for + * UNMAPPED subpage extent buffer. + */ + set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); + for (i = 0; i < num_pages; i++) { p = alloc_page(GFP_NOFS); if (!p) { @@ -5076,9 +5083,7 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) new->pages[i] = p; copy_page(page_address(p), page_address(src->pages[i])); } - set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags); - set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); return new; } -- cgit v1.2.3-70-g09d2 From cac06d843f259ebc4d03e4bc8af7304c17f76ee5 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 26 Jan 2021 16:33:47 +0800 Subject: btrfs: introduce the skeleton of btrfs_subpage structure For sectorsize < page size support, we need a structure to record extra status info for each sector of a page. Introduce the skeleton structure, all subpage related code would go to subpage.[ch]. Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/Makefile | 3 ++- fs/btrfs/subpage.c | 43 +++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/subpage.h | 33 +++++++++++++++++++++++++++++++++ fs/btrfs/super.c | 1 - 4 files changed, 78 insertions(+), 2 deletions(-) create mode 100644 fs/btrfs/subpage.c create mode 100644 fs/btrfs/subpage.h (limited to 'fs/btrfs') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index e45957319424..b634c42115ea 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -27,7 +27,8 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ - block-rsv.o delalloc-space.o block-group.o discard.o reflink.o + block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \ + subpage.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c new file mode 100644 index 000000000000..a3e5b6a13d54 --- /dev/null +++ b/fs/btrfs/subpage.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include "ctree.h" +#include "subpage.h" + +int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, + struct page *page, enum btrfs_subpage_type type) +{ + struct btrfs_subpage *subpage; + + /* + * We have cases like a dummy extent buffer page, which is not mappped + * and doesn't need to be locked. + */ + if (page->mapping) + ASSERT(PageLocked(page)); + /* Either not subpage, or the page already has private attached */ + if (fs_info->sectorsize == PAGE_SIZE || PagePrivate(page)) + return 0; + + subpage = kzalloc(sizeof(struct btrfs_subpage), GFP_NOFS); + if (!subpage) + return -ENOMEM; + + spin_lock_init(&subpage->lock); + attach_page_private(page, subpage); + return 0; +} + +void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, + struct page *page) +{ + struct btrfs_subpage *subpage; + + /* Either not subpage, or already detached */ + if (fs_info->sectorsize == PAGE_SIZE || !PagePrivate(page)) + return; + + subpage = (struct btrfs_subpage *)detach_page_private(page); + ASSERT(subpage); + kfree(subpage); +} diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h new file mode 100644 index 000000000000..676280bc7562 --- /dev/null +++ b/fs/btrfs/subpage.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_SUBPAGE_H +#define BTRFS_SUBPAGE_H + +#include + +/* + * Maximum page size we support is 64K, minimum sector size is 4K, u16 bitmap + * is sufficient. Regular bitmap_* is not used due to size reasons. + */ +#define BTRFS_SUBPAGE_BITMAP_SIZE 16 + +/* + * Structure to trace status of each sector inside a page, attached to + * page::private for both data and metadata inodes. + */ +struct btrfs_subpage { + /* Common members for both data and metadata pages */ + spinlock_t lock; +}; + +enum btrfs_subpage_type { + BTRFS_SUBPAGE_METADATA, + BTRFS_SUBPAGE_DATA, +}; + +int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, + struct page *page, enum btrfs_subpage_type type); +void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, + struct page *page); + +#endif diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 12d7d3be7cd4..919ed5c357e9 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -48,7 +48,6 @@ #include "tests/btrfs-tests.h" #include "block-group.h" #include "discard.h" - #include "qgroup.h" #define CREATE_TRACE_POINTS #include -- cgit v1.2.3-70-g09d2 From 760f991f1428f25fd18b8638004c95f0a2a43b2f Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 26 Jan 2021 16:33:48 +0800 Subject: btrfs: make attach_extent_buffer_page() handle subpage case For subpage case, we need to allocate additional memory for each metadata page. So we need to: - Allow attach_extent_buffer_page() to return int to indicate allocation failure - Allow manually pre-allocate subpage memory for alloc_extent_buffer() As we don't want to use GFP_ATOMIC under spinlock, we introduce btrfs_alloc_subpage() and btrfs_free_subpage() functions for this purpose. (The simple wrap for btrfs_free_subpage() is for later convert to kmem_cache. Already internally tested without problem) - Preallocate btrfs_subpage structure for alloc_extent_buffer() We don't want to call memory allocation with spinlock held, so do preallocation before we acquire mapping->private_lock. - Handle subpage and regular case differently in attach_extent_buffer_page() For regular case, no change, just do the usual thing. For subpage case, allocate new memory or use the preallocated memory. For future subpage metadata, we will make use of radix tree to grab extent buffer. Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++------ fs/btrfs/subpage.c | 30 ++++++++++++++++++----- fs/btrfs/subpage.h | 10 ++++++++ 3 files changed, 96 insertions(+), 13 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d3819dde8952..e498d496560b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -24,6 +24,7 @@ #include "rcu-string.h" #include "backref.h" #include "disk-io.h" +#include "subpage.h" static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -3141,9 +3142,13 @@ static int submit_extent_page(unsigned int opf, return ret; } -static void attach_extent_buffer_page(struct extent_buffer *eb, - struct page *page) +static int attach_extent_buffer_page(struct extent_buffer *eb, + struct page *page, + struct btrfs_subpage *prealloc) { + struct btrfs_fs_info *fs_info = eb->fs_info; + int ret = 0; + /* * If the page is mapped to btree inode, we should hold the private * lock to prevent race. @@ -3153,10 +3158,28 @@ static void attach_extent_buffer_page(struct extent_buffer *eb, if (page->mapping) lockdep_assert_held(&page->mapping->private_lock); - if (!PagePrivate(page)) - attach_page_private(page, eb); + if (fs_info->sectorsize == PAGE_SIZE) { + if (!PagePrivate(page)) + attach_page_private(page, eb); + else + WARN_ON(page->private != (unsigned long)eb); + return 0; + } + + /* Already mapped, just free prealloc */ + if (PagePrivate(page)) { + btrfs_free_subpage(prealloc); + return 0; + } + + if (prealloc) + /* Has preallocated memory for subpage */ + attach_page_private(page, prealloc); else - WARN_ON(page->private != (unsigned long)eb); + /* Do new allocation to attach subpage */ + ret = btrfs_attach_subpage(fs_info, page, + BTRFS_SUBPAGE_METADATA); + return ret; } void set_page_extent_mapped(struct page *page) @@ -5072,12 +5095,19 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); for (i = 0; i < num_pages; i++) { + int ret; + p = alloc_page(GFP_NOFS); if (!p) { btrfs_release_extent_buffer(new); return NULL; } - attach_extent_buffer_page(new, p); + ret = attach_extent_buffer_page(new, p, NULL); + if (ret < 0) { + put_page(p); + btrfs_release_extent_buffer(new); + return NULL; + } WARN_ON(PageDirty(p)); SetPageUptodate(p); new->pages[i] = p; @@ -5315,12 +5345,33 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++, index++) { + struct btrfs_subpage *prealloc = NULL; + p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL); if (!p) { exists = ERR_PTR(-ENOMEM); goto free_eb; } + /* + * Preallocate page->private for subpage case, so that we won't + * allocate memory with private_lock hold. The memory will be + * freed by attach_extent_buffer_page() or freed manually if + * we exit earlier. + * + * Although we have ensured one subpage eb can only have one + * page, but it may change in the future for 16K page size + * support, so we still preallocate the memory in the loop. + */ + ret = btrfs_alloc_subpage(fs_info, &prealloc, + BTRFS_SUBPAGE_METADATA); + if (ret < 0) { + unlock_page(p); + put_page(p); + exists = ERR_PTR(ret); + goto free_eb; + } + spin_lock(&mapping->private_lock); exists = grab_extent_buffer(p); if (exists) { @@ -5328,10 +5379,14 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, unlock_page(p); put_page(p); mark_extent_buffer_accessed(exists, p); + btrfs_free_subpage(prealloc); goto free_eb; } - attach_extent_buffer_page(eb, p); + /* Should not fail, as we have preallocated the memory */ + ret = attach_extent_buffer_page(eb, p, prealloc); + ASSERT(!ret); spin_unlock(&mapping->private_lock); + WARN_ON(PageDirty(p)); eb->pages[i] = p; if (!PageUptodate(p)) diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index a3e5b6a13d54..61b28dfca20c 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -7,7 +7,8 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, struct page *page, enum btrfs_subpage_type type) { - struct btrfs_subpage *subpage; + struct btrfs_subpage *subpage = NULL; + int ret; /* * We have cases like a dummy extent buffer page, which is not mappped @@ -19,11 +20,9 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, if (fs_info->sectorsize == PAGE_SIZE || PagePrivate(page)) return 0; - subpage = kzalloc(sizeof(struct btrfs_subpage), GFP_NOFS); - if (!subpage) - return -ENOMEM; - - spin_lock_init(&subpage->lock); + ret = btrfs_alloc_subpage(fs_info, &subpage, type); + if (ret < 0) + return ret; attach_page_private(page, subpage); return 0; } @@ -39,5 +38,24 @@ void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, subpage = (struct btrfs_subpage *)detach_page_private(page); ASSERT(subpage); + btrfs_free_subpage(subpage); +} + +int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, + struct btrfs_subpage **ret, + enum btrfs_subpage_type type) +{ + if (fs_info->sectorsize == PAGE_SIZE) + return 0; + + *ret = kzalloc(sizeof(struct btrfs_subpage), GFP_NOFS); + if (!*ret) + return -ENOMEM; + spin_lock_init(&(*ret)->lock); + return 0; +} + +void btrfs_free_subpage(struct btrfs_subpage *subpage) +{ kfree(subpage); } diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 676280bc7562..7ba544bcc9c6 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -18,6 +18,10 @@ struct btrfs_subpage { /* Common members for both data and metadata pages */ spinlock_t lock; + union { + /* Structures only used by metadata */ + /* Structures only used by data */ + }; }; enum btrfs_subpage_type { @@ -30,4 +34,10 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct page *page); +/* Allocate additional data where page represents more than one sector */ +int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, + struct btrfs_subpage **ret, + enum btrfs_subpage_type type); +void btrfs_free_subpage(struct btrfs_subpage *subpage); + #endif -- cgit v1.2.3-70-g09d2 From 819822107d8837fc3363ceaeb172b981c8600a2b Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 26 Jan 2021 16:33:49 +0800 Subject: btrfs: make grab_extent_buffer_from_page() handle subpage case For subpage case, grab_extent_buffer() can't really get an extent buffer just from btrfs_subpage. We have radix tree lock protecting us from inserting the same eb into the tree. Thus we don't really need to do the extra hassle, just let alloc_extent_buffer() handle the existing eb in radix tree. Now if two ebs are being allocated as the same time, one will fail with -EEIXST when inserting into the radix tree. So for grab_extent_buffer(), just always return NULL for subpage case. Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index e498d496560b..133ff4531472 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5284,10 +5284,19 @@ free_eb: } #endif -static struct extent_buffer *grab_extent_buffer(struct page *page) +static struct extent_buffer *grab_extent_buffer( + struct btrfs_fs_info *fs_info, struct page *page) { struct extent_buffer *exists; + /* + * For subpage case, we completely rely on radix tree to ensure we + * don't try to insert two ebs for the same bytenr. So here we always + * return NULL and just continue. + */ + if (fs_info->sectorsize < PAGE_SIZE) + return NULL; + /* Page not yet attached to an extent buffer */ if (!PagePrivate(page)) return NULL; @@ -5373,7 +5382,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, } spin_lock(&mapping->private_lock); - exists = grab_extent_buffer(p); + exists = grab_extent_buffer(fs_info, p); if (exists) { spin_unlock(&mapping->private_lock); unlock_page(p); -- cgit v1.2.3-70-g09d2 From 8ff8466d29efc226648c3c5e57590428d798a6ea Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 26 Jan 2021 16:33:50 +0800 Subject: btrfs: support subpage for extent buffer page release In btrfs_release_extent_buffer_pages(), we need to add extra handling for subpage. Introduce a helper, detach_extent_buffer_page(), to do different handling for regular and subpage cases. For subpage case, handle detaching page private. For unmapped (dummy or cloned) ebs, we can detach the page private immediately as the page can only be attached to one unmapped eb. For mapped ebs, we have to ensure there are no eb in the page range before we delete it, as page->private is shared between all ebs in the same page. But there is a subpage specific race, where we can race with extent buffer allocation, and clear the page private while new eb is still being utilized, like this: Extent buffer A is the new extent buffer which will be allocated, while extent buffer B is the last existing extent buffer of the page. T1 (eb A) | T2 (eb B) -------------------------------+------------------------------ alloc_extent_buffer() | btrfs_release_extent_buffer_pages() |- p = find_or_create_page() | | |- attach_extent_buffer_page() | | | | |- detach_extent_buffer_page() | | |- if (!page_range_has_eb()) | | | No new eb in the page range yet | | | As new eb A hasn't yet been | | | inserted into radix tree. | | |- btrfs_detach_subpage() | | |- detach_page_private(); |- radix_tree_insert() | Then we have a metadata eb whose page has no private bit. To avoid such race, we introduce a subpage metadata-specific member, btrfs_subpage::eb_refs. In alloc_extent_buffer() we increase eb_refs in the critical section of private_lock. Then page_range_has_eb() will return true for detach_extent_buffer_page(), and will not detach page private. The section is marked by: - btrfs_page_inc_eb_refs() - btrfs_page_dec_eb_refs() Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 94 +++++++++++++++++++++++++++++++++++++++++++--------- fs/btrfs/subpage.c | 42 +++++++++++++++++++++++ fs/btrfs/subpage.h | 13 +++++++- 3 files changed, 133 insertions(+), 16 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 133ff4531472..1812813bdf63 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4995,25 +4995,39 @@ int extent_buffer_under_io(const struct extent_buffer *eb) test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); } -/* - * Release all pages attached to the extent buffer. - */ -static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb) +static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page) { - int i; - int num_pages; - int mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); + struct btrfs_subpage *subpage; - BUG_ON(extent_buffer_under_io(eb)); + lockdep_assert_held(&page->mapping->private_lock); - num_pages = num_extent_pages(eb); - for (i = 0; i < num_pages; i++) { - struct page *page = eb->pages[i]; + if (PagePrivate(page)) { + subpage = (struct btrfs_subpage *)page->private; + if (atomic_read(&subpage->eb_refs)) + return true; + } + return false; +} - if (!page) - continue; +static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *page) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); + + /* + * For mapped eb, we're going to change the page private, which should + * be done under the private_lock. + */ + if (mapped) + spin_lock(&page->mapping->private_lock); + + if (!PagePrivate(page)) { if (mapped) - spin_lock(&page->mapping->private_lock); + spin_unlock(&page->mapping->private_lock); + return; + } + + if (fs_info->sectorsize == PAGE_SIZE) { /* * We do this since we'll remove the pages after we've * removed the eb from the radix tree, so we could race @@ -5032,9 +5046,49 @@ static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb) */ detach_page_private(page); } - if (mapped) spin_unlock(&page->mapping->private_lock); + return; + } + + /* + * For subpage, we can have dummy eb with page private. In this case, + * we can directly detach the private as such page is only attached to + * one dummy eb, no sharing. + */ + if (!mapped) { + btrfs_detach_subpage(fs_info, page); + return; + } + + btrfs_page_dec_eb_refs(fs_info, page); + + /* + * We can only detach the page private if there are no other ebs in the + * page range. + */ + if (!page_range_has_eb(fs_info, page)) + btrfs_detach_subpage(fs_info, page); + + spin_unlock(&page->mapping->private_lock); +} + +/* Release all pages attached to the extent buffer */ +static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb) +{ + int i; + int num_pages; + + ASSERT(!extent_buffer_under_io(eb)); + + num_pages = num_extent_pages(eb); + for (i = 0; i < num_pages; i++) { + struct page *page = eb->pages[i]; + + if (!page) + continue; + + detach_extent_buffer_page(eb, page); /* One for when we allocated the page */ put_page(page); @@ -5394,6 +5448,16 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, /* Should not fail, as we have preallocated the memory */ ret = attach_extent_buffer_page(eb, p, prealloc); ASSERT(!ret); + /* + * To inform we have extra eb under allocation, so that + * detach_extent_buffer_page() won't release the page private + * when the eb hasn't yet been inserted into radix tree. + * + * The ref will be decreased when the eb released the page, in + * detach_extent_buffer_page(). + * Thus needs no special handling in error path. + */ + btrfs_page_inc_eb_refs(fs_info, p); spin_unlock(&mapping->private_lock); WARN_ON(PageDirty(p)); diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 61b28dfca20c..a2a21fa0ea35 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -52,6 +52,8 @@ int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, if (!*ret) return -ENOMEM; spin_lock_init(&(*ret)->lock); + if (type == BTRFS_SUBPAGE_METADATA) + atomic_set(&(*ret)->eb_refs, 0); return 0; } @@ -59,3 +61,43 @@ void btrfs_free_subpage(struct btrfs_subpage *subpage) { kfree(subpage); } + +/* + * Increase the eb_refs of current subpage. + * + * This is important for eb allocation, to prevent race with last eb freeing + * of the same page. + * With the eb_refs increased before the eb inserted into radix tree, + * detach_extent_buffer_page() won't detach the page private while we're still + * allocating the extent buffer. + */ +void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info, + struct page *page) +{ + struct btrfs_subpage *subpage; + + if (fs_info->sectorsize == PAGE_SIZE) + return; + + ASSERT(PagePrivate(page) && page->mapping); + lockdep_assert_held(&page->mapping->private_lock); + + subpage = (struct btrfs_subpage *)page->private; + atomic_inc(&subpage->eb_refs); +} + +void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info, + struct page *page) +{ + struct btrfs_subpage *subpage; + + if (fs_info->sectorsize == PAGE_SIZE) + return; + + ASSERT(PagePrivate(page) && page->mapping); + lockdep_assert_held(&page->mapping->private_lock); + + subpage = (struct btrfs_subpage *)page->private; + ASSERT(atomic_read(&subpage->eb_refs)); + atomic_dec(&subpage->eb_refs); +} diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 7ba544bcc9c6..fe51cc237a66 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -19,7 +19,13 @@ struct btrfs_subpage { /* Common members for both data and metadata pages */ spinlock_t lock; union { - /* Structures only used by metadata */ + /* + * Structures only used by metadata + * + * @eb_refs should only be operated under private_lock, as it + * manages whether the subpage can be detached. + */ + atomic_t eb_refs; /* Structures only used by data */ }; }; @@ -40,4 +46,9 @@ int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, enum btrfs_subpage_type type); void btrfs_free_subpage(struct btrfs_subpage *subpage); +void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info, + struct page *page); +void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info, + struct page *page); + #endif -- cgit v1.2.3-70-g09d2 From 09bc1f0fb845a6435e2c6c5d3c937f7a674e816a Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 26 Jan 2021 16:33:51 +0800 Subject: btrfs: attach private to dummy extent buffer pages There are locations where we allocate dummy extent buffers for temporary usage, like in tree_mod_log_rewind() or get_old_root(). These dummy extent buffers will be handled by the same eb accessors, and if they don't have page::private subpage eb accessors could fail. To address such problems, make __alloc_dummy_extent_buffer() attach page private for dummy extent buffers too. Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1812813bdf63..b8ff05916b8f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5185,9 +5185,14 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { + int ret; + eb->pages[i] = alloc_page(GFP_NOFS); if (!eb->pages[i]) goto err; + ret = attach_extent_buffer_page(eb, eb->pages[i], NULL); + if (ret < 0) + goto err; } set_extent_buffer_uptodate(eb); btrfs_set_header_nritems(eb, 0); @@ -5195,8 +5200,10 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, return eb; err: - for (; i > 0; i--) + for (; i > 0; i--) { + detach_extent_buffer_page(eb, eb->pages[i - 1]); __free_page(eb->pages[i - 1]); + } __free_extent_buffer(eb); return NULL; } -- cgit v1.2.3-70-g09d2 From a1d767c11cca0f9b6ddc56ea9561d441340d91a9 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 26 Jan 2021 16:33:52 +0800 Subject: btrfs: introduce helpers for subpage uptodate status Introduce the following functions to handle subpage uptodate status: - btrfs_subpage_set_uptodate() - btrfs_subpage_clear_uptodate() - btrfs_subpage_test_uptodate() These helpers can only be called when the page has subpage attached and the range is ensured to be inside the page. - btrfs_page_set_uptodate() - btrfs_page_clear_uptodate() - btrfs_page_test_uptodate() These helpers can handle both regular sector size and subpage. Although caller should still ensure that the range is inside the page. Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/subpage.c | 113 +++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/subpage.h | 27 +++++++++++++ 2 files changed, 140 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index a2a21fa0ea35..4e1b187b9607 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -101,3 +101,116 @@ void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info, ASSERT(atomic_read(&subpage->eb_refs)); atomic_dec(&subpage->eb_refs); } + +/* + * Convert the [start, start + len) range into a u16 bitmap + * + * For example: if start == page_offset() + 16K, len = 16K, we get 0x00f0. + */ +static u16 btrfs_subpage_calc_bitmap(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + const int bit_start = offset_in_page(start) >> fs_info->sectorsize_bits; + const int nbits = len >> fs_info->sectorsize_bits; + + /* Basic checks */ + ASSERT(PagePrivate(page) && page->private); + ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && + IS_ALIGNED(len, fs_info->sectorsize)); + + /* + * The range check only works for mapped page, we can still have + * unmapped page like dummy extent buffer pages. + */ + if (page->mapping) + ASSERT(page_offset(page) <= start && + start + len <= page_offset(page) + PAGE_SIZE); + /* + * Here nbits can be 16, thus can go beyond u16 range. We make the + * first left shift to be calculate in unsigned long (at least u32), + * then truncate the result to u16. + */ + return (u16)(((1UL << nbits) - 1) << bit_start); +} + +void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + subpage->uptodate_bitmap |= tmp; + if (subpage->uptodate_bitmap == U16_MAX) + SetPageUptodate(page); + spin_unlock_irqrestore(&subpage->lock, flags); +} + +void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + subpage->uptodate_bitmap &= ~tmp; + ClearPageUptodate(page); + spin_unlock_irqrestore(&subpage->lock, flags); +} + +/* + * Unlike set/clear which is dependent on each page status, for test all bits + * are tested in the same way. + */ +#define IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(name) \ +bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len) \ +{ \ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; \ + const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); \ + unsigned long flags; \ + bool ret; \ + \ + spin_lock_irqsave(&subpage->lock, flags); \ + ret = ((subpage->name##_bitmap & tmp) == tmp); \ + spin_unlock_irqrestore(&subpage->lock, flags); \ + return ret; \ +} +IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate); + +/* + * Note that, in selftests (extent-io-tests), we can have empty fs_info passed + * in. We only test sectorsize == PAGE_SIZE cases so far, thus we can fall + * back to regular sectorsize branch. + */ +#define IMPLEMENT_BTRFS_PAGE_OPS(name, set_page_func, clear_page_func, \ + test_page_func) \ +void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len) \ +{ \ + if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \ + set_page_func(page); \ + return; \ + } \ + btrfs_subpage_set_##name(fs_info, page, start, len); \ +} \ +void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len) \ +{ \ + if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \ + clear_page_func(page); \ + return; \ + } \ + btrfs_subpage_clear_##name(fs_info, page, start, len); \ +} \ +bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len) \ +{ \ + if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) \ + return test_page_func(page); \ + return btrfs_subpage_test_##name(fs_info, page, start, len); \ +} +IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate, + PageUptodate); diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index fe51cc237a66..98d511cd75bf 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -18,6 +18,7 @@ struct btrfs_subpage { /* Common members for both data and metadata pages */ spinlock_t lock; + u16 uptodate_bitmap; union { /* * Structures only used by metadata @@ -51,4 +52,30 @@ void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info, void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct page *page); +/* + * Template for subpage related operations. + * + * btrfs_subpage_*() are for call sites where the page has subpage attached and + * the range is ensured to be inside the page. + * + * btrfs_page_*() are for call sites where the page can either be subpage + * specific or regular page. The function will handle both cases. + * But the range still needs to be inside the page. + */ +#define DECLARE_BTRFS_SUBPAGE_OPS(name) \ +void btrfs_subpage_set_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len); \ +void btrfs_subpage_clear_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len); \ +bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len); \ +void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len); \ +void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len); \ +bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len); + +DECLARE_BTRFS_SUBPAGE_OPS(uptodate); + #endif -- cgit v1.2.3-70-g09d2 From 03a816b32be577fdeed2e17d95c2636b68f6860c Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 26 Jan 2021 16:33:53 +0800 Subject: btrfs: introduce helpers for subpage error status Introduce the following functions to handle subpage error status: - btrfs_subpage_set_error() - btrfs_subpage_clear_error() - btrfs_subpage_test_error() These helpers can only be called when the page has subpage attached and the range is ensured to be inside the page. - btrfs_page_set_error() - btrfs_page_clear_error() - btrfs_page_test_error() These helpers can handle both regular sector size and subpage without problem. Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/subpage.c | 29 +++++++++++++++++++++++++++++ fs/btrfs/subpage.h | 2 ++ 2 files changed, 31 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 4e1b187b9607..2c51ab71e000 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -160,6 +160,33 @@ void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info, spin_unlock_irqrestore(&subpage->lock, flags); } +void btrfs_subpage_set_error(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + subpage->error_bitmap |= tmp; + SetPageError(page); + spin_unlock_irqrestore(&subpage->lock, flags); +} + +void btrfs_subpage_clear_error(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + subpage->error_bitmap &= ~tmp; + if (subpage->error_bitmap == 0) + ClearPageError(page); + spin_unlock_irqrestore(&subpage->lock, flags); +} + /* * Unlike set/clear which is dependent on each page status, for test all bits * are tested in the same way. @@ -179,6 +206,7 @@ bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \ return ret; \ } IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate); +IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error); /* * Note that, in selftests (extent-io-tests), we can have empty fs_info passed @@ -214,3 +242,4 @@ bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \ } IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate, PageUptodate); +IMPLEMENT_BTRFS_PAGE_OPS(error, SetPageError, ClearPageError, PageError); diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 98d511cd75bf..f3c5def313a1 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -19,6 +19,7 @@ struct btrfs_subpage { /* Common members for both data and metadata pages */ spinlock_t lock; u16 uptodate_bitmap; + u16 error_bitmap; union { /* * Structures only used by metadata @@ -77,5 +78,6 @@ bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, u32 len); DECLARE_BTRFS_SUBPAGE_OPS(uptodate); +DECLARE_BTRFS_SUBPAGE_OPS(error); #endif -- cgit v1.2.3-70-g09d2 From 251f2acc719e99f00827814ea77cfd38080e1d62 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 26 Jan 2021 16:33:54 +0800 Subject: btrfs: support subpage in set/clear_extent_buffer_uptodate() To support subpage in set_extent_buffer_uptodate and clear_extent_buffer_uptodate we only need to use the subpage-aware helpers to update the page bits. Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b8ff05916b8f..969de300a95b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5672,30 +5672,33 @@ bool set_extent_buffer_dirty(struct extent_buffer *eb) void clear_extent_buffer_uptodate(struct extent_buffer *eb) { - int i; + struct btrfs_fs_info *fs_info = eb->fs_info; struct page *page; int num_pages; + int i; clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { page = eb->pages[i]; if (page) - ClearPageUptodate(page); + btrfs_page_clear_uptodate(fs_info, page, + eb->start, eb->len); } } void set_extent_buffer_uptodate(struct extent_buffer *eb) { - int i; + struct btrfs_fs_info *fs_info = eb->fs_info; struct page *page; int num_pages; + int i; set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { page = eb->pages[i]; - SetPageUptodate(page); + btrfs_page_set_uptodate(fs_info, page, eb->start, eb->len); } } -- cgit v1.2.3-70-g09d2 From 92d83e94365706fa3250b0e43bdab5995ac03046 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 26 Jan 2021 16:33:55 +0800 Subject: btrfs: support subpage in btrfs_clone_extent_buffer For btrfs_clone_extent_buffer(), it's mostly the same code of __alloc_dummy_extent_buffer(), except it has extra page copy. So to make it subpage compatible, we only need to: - Call set_extent_buffer_uptodate() instead of SetPageUptodate() This will set correct uptodate bit for subpage and regular sector size cases. Since we're calling set_extent_buffer_uptodate() which will also set EXTENT_BUFFER_UPTODATE bit, we don't need to manually set that bit either. Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 969de300a95b..6b27daf62d94 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5163,11 +5163,10 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) return NULL; } WARN_ON(PageDirty(p)); - SetPageUptodate(p); new->pages[i] = p; copy_page(page_address(p), page_address(src->pages[i])); } - set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags); + set_extent_buffer_uptodate(new); return new; } -- cgit v1.2.3-70-g09d2 From d1e86e3fc34f24b090d86949ad7f3db7a4c1861f Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 26 Jan 2021 16:33:56 +0800 Subject: btrfs: support subpage in try_release_extent_buffer() Unlike the original try_release_extent_buffer(), try_release_subpage_extent_buffer() will iterate through all the ebs in the page, and try to release each. We can release the full page only after there's no private attached, which means all ebs of that page have been released as well. Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 106 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 104 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 6b27daf62d94..a6102e795af5 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -6318,13 +6318,115 @@ void memmove_extent_buffer(const struct extent_buffer *dst, } } +static struct extent_buffer *get_next_extent_buffer( + struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr) +{ + struct extent_buffer *gang[BTRFS_SUBPAGE_BITMAP_SIZE]; + struct extent_buffer *found = NULL; + u64 page_start = page_offset(page); + int ret; + int i; + + ASSERT(in_range(bytenr, page_start, PAGE_SIZE)); + ASSERT(PAGE_SIZE / fs_info->nodesize <= BTRFS_SUBPAGE_BITMAP_SIZE); + lockdep_assert_held(&fs_info->buffer_lock); + + ret = radix_tree_gang_lookup(&fs_info->buffer_radix, (void **)gang, + bytenr >> fs_info->sectorsize_bits, + PAGE_SIZE / fs_info->nodesize); + for (i = 0; i < ret; i++) { + /* Already beyond page end */ + if (gang[i]->start >= page_start + PAGE_SIZE) + break; + /* Found one */ + if (gang[i]->start >= bytenr) { + found = gang[i]; + break; + } + } + return found; +} + +static int try_release_subpage_extent_buffer(struct page *page) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + u64 cur = page_offset(page); + const u64 end = page_offset(page) + PAGE_SIZE; + int ret; + + while (cur < end) { + struct extent_buffer *eb = NULL; + + /* + * Unlike try_release_extent_buffer() which uses page->private + * to grab buffer, for subpage case we rely on radix tree, thus + * we need to ensure radix tree consistency. + * + * We also want an atomic snapshot of the radix tree, thus go + * with spinlock rather than RCU. + */ + spin_lock(&fs_info->buffer_lock); + eb = get_next_extent_buffer(fs_info, page, cur); + if (!eb) { + /* No more eb in the page range after or at cur */ + spin_unlock(&fs_info->buffer_lock); + break; + } + cur = eb->start + eb->len; + + /* + * The same as try_release_extent_buffer(), to ensure the eb + * won't disappear out from under us. + */ + spin_lock(&eb->refs_lock); + if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { + spin_unlock(&eb->refs_lock); + spin_unlock(&fs_info->buffer_lock); + break; + } + spin_unlock(&fs_info->buffer_lock); + + /* + * If tree ref isn't set then we know the ref on this eb is a + * real ref, so just return, this eb will likely be freed soon + * anyway. + */ + if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { + spin_unlock(&eb->refs_lock); + break; + } + + /* + * Here we don't care about the return value, we will always + * check the page private at the end. And + * release_extent_buffer() will release the refs_lock. + */ + release_extent_buffer(eb); + } + /* + * Finally to check if we have cleared page private, as if we have + * released all ebs in the page, the page private should be cleared now. + */ + spin_lock(&page->mapping->private_lock); + if (!PagePrivate(page)) + ret = 1; + else + ret = 0; + spin_unlock(&page->mapping->private_lock); + return ret; + +} + int try_release_extent_buffer(struct page *page) { struct extent_buffer *eb; + if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE) + return try_release_subpage_extent_buffer(page); + /* - * We need to make sure nobody is attaching this page to an eb right - * now. + * We need to make sure nobody is changing page->private, as we rely on + * page->private as the pointer to extent buffer. */ spin_lock(&page->mapping->private_lock); if (!PagePrivate(page)) { -- cgit v1.2.3-70-g09d2 From 4012daf769cb77dbf3bc36c3adecf480ad097682 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 26 Jan 2021 16:33:57 +0800 Subject: btrfs: introduce read_extent_buffer_subpage() Introduce a helper, read_extent_buffer_subpage(), to do the subpage extent buffer read. The difference between regular and subpage routines are: - No page locking Here we completely rely on extent locking. Page locking can reduce the concurrency greatly, as if we lock one page to read one extent buffer, all the other extent buffers in the same page will have to wait. - Extent uptodate condition Despite the existing PageUptodate() and EXTENT_BUFFER_UPTODATE check, We also need to check btrfs_subpage::uptodate_bitmap. - No page iteration Just one page, no need to loop, this greatly simplified the subpage routine. This patch only implements the bio submit part, no endio support yet. Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a6102e795af5..2ffaa983c9dd 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5701,6 +5701,73 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb) } } +static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, + int mirror_num) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + struct extent_io_tree *io_tree; + struct page *page = eb->pages[0]; + struct bio *bio = NULL; + int ret = 0; + + ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)); + ASSERT(PagePrivate(page)); + io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree; + + if (wait == WAIT_NONE) { + ret = try_lock_extent(io_tree, eb->start, + eb->start + eb->len - 1); + if (ret <= 0) + return ret; + } else { + ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1); + if (ret < 0) + return ret; + } + + ret = 0; + if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags) || + PageUptodate(page) || + btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) { + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); + unlock_extent(io_tree, eb->start, eb->start + eb->len - 1); + return ret; + } + + clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); + eb->read_mirror = 0; + atomic_set(&eb->io_pages, 1); + check_buffer_tree_ref(eb); + btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len); + + ret = submit_extent_page(REQ_OP_READ | REQ_META, NULL, page, eb->start, + eb->len, eb->start - page_offset(page), &bio, + end_bio_extent_readpage, mirror_num, 0, 0, + true); + if (ret) { + /* + * In the endio function, if we hit something wrong we will + * increase the io_pages, so here we need to decrease it for + * error path. + */ + atomic_dec(&eb->io_pages); + } + if (bio) { + int tmp; + + tmp = submit_one_bio(bio, mirror_num, 0); + if (tmp < 0) + return tmp; + } + if (ret || wait != WAIT_COMPLETE) + return ret; + + wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, EXTENT_LOCKED); + if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) + ret = -EIO; + return ret; +} + int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) { int i; @@ -5717,6 +5784,9 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; + if (eb->fs_info->sectorsize < PAGE_SIZE) + return read_extent_buffer_subpage(eb, wait, mirror_num); + num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { page = eb->pages[i]; -- cgit v1.2.3-70-g09d2 From 4325cb2293817cef3611c43d7a27d0937d1e6962 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 26 Jan 2021 16:33:58 +0800 Subject: btrfs: support subpage in endio_readpage_update_page_status() To handle subpage status update, add the following: - Use btrfs_page_*() subpage-aware helpers to update page status Now we can handle both cases well. - No page unlock for subpage metadata Since subpage metadata doesn't utilize page locking at all, skip it. For subpage data locking, it's handled in later commits. Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 2ffaa983c9dd..491fc0114672 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2840,15 +2840,24 @@ update: processed->uptodate = uptodate; } -static void endio_readpage_update_page_status(struct page *page, bool uptodate) +static void endio_readpage_update_page_status(struct page *page, bool uptodate, + u64 start, u32 len) { + struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + + ASSERT(page_offset(page) <= start && + start + len <= page_offset(page) + PAGE_SIZE); + if (uptodate) { - SetPageUptodate(page); + btrfs_page_set_uptodate(fs_info, page, start, len); } else { - ClearPageUptodate(page); - SetPageError(page); + btrfs_page_clear_uptodate(fs_info, page, start, len); + btrfs_page_set_error(fs_info, page, start, len); } - unlock_page(page); + + if (fs_info->sectorsize == PAGE_SIZE) + unlock_page(page); + /* Subpage locking will be handled in later patches */ } /* @@ -2985,7 +2994,7 @@ readpage_ok: bio_offset += len; /* Update page status and unlock */ - endio_readpage_update_page_status(page, uptodate); + endio_readpage_update_page_status(page, uptodate, start, len); endio_readpage_release_extent(&processed, BTRFS_I(inode), start, end, uptodate); } -- cgit v1.2.3-70-g09d2 From 371cdc0700c778b94ae8fa2c7d99401f13070d8f Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 26 Jan 2021 16:33:59 +0800 Subject: btrfs: introduce subpage metadata validation check For subpage metadata validation check, there are some differences: - Read must finish in one bvec Since we're just reading one subpage range in one page, it should never be split into two bios nor two bvecs. - How to grab the existing eb Instead of grabbing eb using page->private, we have to go search radix tree as we don't have any direct pointer at hand. Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e0d1b328397e..d34c6d61928f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -591,6 +591,59 @@ out: return ret; } +static int validate_subpage_buffer(struct page *page, u64 start, u64 end, + int mirror) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + struct extent_buffer *eb; + bool reads_done; + int ret = 0; + + /* + * We don't allow bio merge for subpage metadata read, so we should + * only get one eb for each endio hook. + */ + ASSERT(end == start + fs_info->nodesize - 1); + ASSERT(PagePrivate(page)); + + eb = find_extent_buffer(fs_info, start); + /* + * When we are reading one tree block, eb must have been inserted into + * the radix tree. If not, something is wrong. + */ + ASSERT(eb); + + reads_done = atomic_dec_and_test(&eb->io_pages); + /* Subpage read must finish in page read */ + ASSERT(reads_done); + + eb->read_mirror = mirror; + if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) { + ret = -EIO; + goto err; + } + ret = validate_extent_buffer(eb); + if (ret < 0) + goto err; + + if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) + btree_readahead_hook(eb, ret); + + set_extent_buffer_uptodate(eb); + + free_extent_buffer(eb); + return ret; +err: + /* + * end_bio_extent_readpage decrements io_pages in case of error, + * make sure it has something to decrement. + */ + atomic_inc(&eb->io_pages); + clear_extent_buffer_uptodate(eb); + free_extent_buffer(eb); + return ret; +} + int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, struct page *page, u64 start, u64 end, int mirror) @@ -600,6 +653,10 @@ int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, int reads_done; ASSERT(page->private); + + if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE) + return validate_subpage_buffer(page, start, end, mirror); + eb = (struct extent_buffer *)page->private; /* -- cgit v1.2.3-70-g09d2 From 32443de3382be98c0a8b8f6f50d23da2e10c4117 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 26 Jan 2021 16:34:00 +0800 Subject: btrfs: introduce btrfs_subpage for data inodes To support subpage sector size, data also need extra info to make sure which sectors in a page are uptodate/dirty/... This patch will make pages for data inodes get btrfs_subpage structure attached, and detached when the page is freed. This patch also slightly changes the timing when set_page_extent_mapped() is called to make sure: - We have page->mapping set page->mapping->host is used to grab btrfs_fs_info, thus we can only call this function after page is mapped to an inode. One call site attaches pages to inode manually, thus we have to modify the timing of set_page_extent_mapped() a bit. - As soon as possible, before other operations Since memory allocation can fail, we have to do extra error handling. Calling set_page_extent_mapped() as soon as possible can simply the error handling for several call sites. The idea is pretty much the same as iomap_page, but with more bitmaps for btrfs specific cases. Currently the plan is to switch iomap if iomap can provide sector aligned write back (only write back dirty sectors, but not the full page, data balance require this feature). So we will stick to btrfs specific bitmap for now. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 10 ++++++++-- fs/btrfs/extent_io.c | 45 +++++++++++++++++++++++++++++++++++++++++---- fs/btrfs/extent_io.h | 3 ++- fs/btrfs/file.c | 24 +++++++++--------------- fs/btrfs/free-space-cache.c | 15 ++++++++++++--- fs/btrfs/inode.c | 15 +++++++++++---- fs/btrfs/ioctl.c | 8 +++++++- fs/btrfs/reflink.c | 5 ++++- fs/btrfs/relocation.c | 11 +++++++++-- 9 files changed, 103 insertions(+), 33 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 5ae3fa0386b7..6d203acfdeb3 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -542,13 +542,19 @@ static noinline int add_ra_bio_pages(struct inode *inode, goto next; } - end = last_offset + PAGE_SIZE - 1; /* * at this point, we have a locked page in the page cache * for these bytes in the file. But, we have to make * sure they map to this compressed extent on disk. */ - set_page_extent_mapped(page); + ret = set_page_extent_mapped(page); + if (ret < 0) { + unlock_page(page); + put_page(page); + break; + } + + end = last_offset + PAGE_SIZE - 1; lock_extent(tree, last_offset, end); read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, last_offset, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 491fc0114672..5ff85ecc6c3f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3191,10 +3191,38 @@ static int attach_extent_buffer_page(struct extent_buffer *eb, return ret; } -void set_page_extent_mapped(struct page *page) +int set_page_extent_mapped(struct page *page) { + struct btrfs_fs_info *fs_info; + + ASSERT(page->mapping); + + if (PagePrivate(page)) + return 0; + + fs_info = btrfs_sb(page->mapping->host->i_sb); + + if (fs_info->sectorsize < PAGE_SIZE) + return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA); + + attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE); + return 0; +} + +void clear_page_extent_mapped(struct page *page) +{ + struct btrfs_fs_info *fs_info; + + ASSERT(page->mapping); + if (!PagePrivate(page)) - attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE); + return; + + fs_info = btrfs_sb(page->mapping->host->i_sb); + if (fs_info->sectorsize < PAGE_SIZE) + return btrfs_detach_subpage(fs_info, page); + + detach_page_private(page); } static struct extent_map * @@ -3251,7 +3279,12 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, unsigned long this_bio_flag = 0; struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; - set_page_extent_mapped(page); + ret = set_page_extent_mapped(page); + if (ret < 0) { + unlock_extent(tree, start, end); + SetPageError(page); + goto out; + } if (!PageUptodate(page)) { if (cleancache_get_page(page) == 0) { @@ -3691,7 +3724,11 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, flush_dcache_page(page); } - set_page_extent_mapped(page); + ret = set_page_extent_mapped(page); + if (ret < 0) { + SetPageError(page); + goto done; + } if (!epd->extent_locked) { ret = writepage_delalloc(BTRFS_I(inode), page, wbc, start, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 2d8187c84812..047b3e66897f 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -178,7 +178,8 @@ int btree_write_cache_pages(struct address_space *mapping, void extent_readahead(struct readahead_control *rac); int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); -void set_page_extent_mapped(struct page *page); +int set_page_extent_mapped(struct page *page); +void clear_page_extent_mapped(struct page *page); struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, u64 owner_root, int level); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index be5350f5bedf..bf52d7e85914 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1369,6 +1369,12 @@ again: goto fail; } + err = set_page_extent_mapped(pages[i]); + if (err < 0) { + faili = i; + goto fail; + } + if (i == 0) err = prepare_uptodate_page(inode, pages[i], pos, force_uptodate); @@ -1453,23 +1459,11 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, } /* - * It's possible the pages are dirty right now, but we don't want - * to clean them yet because copy_from_user may catch a page fault - * and we might have to fall back to one page at a time. If that - * happens, we'll unlock these pages and we'd have a window where - * reclaim could sneak in and drop the once-dirty page on the floor - * without writing it. - * - * We have the pages locked and the extent range locked, so there's - * no way someone can start IO on any dirty pages in this range. - * - * We'll call btrfs_dirty_pages() later on, and that will flip around - * delalloc bits and dirty the pages as required. + * We should be called after prepare_pages() which should have locked + * all pages in the range. */ - for (i = 0; i < num_pages; i++) { - set_page_extent_mapped(pages[i]); + for (i = 0; i < num_pages; i++) WARN_ON(!PageLocked(pages[i])); - } return ret; } diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 0d6dcb5ff963..6134e10a6e7f 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -431,11 +431,22 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) int i; for (i = 0; i < io_ctl->num_pages; i++) { + int ret; + page = find_or_create_page(inode->i_mapping, i, mask); if (!page) { io_ctl_drop_pages(io_ctl); return -ENOMEM; } + + ret = set_page_extent_mapped(page); + if (ret < 0) { + unlock_page(page); + put_page(page); + io_ctl_drop_pages(io_ctl); + return ret; + } + io_ctl->pages[i] = page; if (uptodate && !PageUptodate(page)) { btrfs_readpage(NULL, page); @@ -455,10 +466,8 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) } } - for (i = 0; i < io_ctl->num_pages; i++) { + for (i = 0; i < io_ctl->num_pages; i++) clear_page_dirty_for_io(io_ctl->pages[i]); - set_page_extent_mapped(io_ctl->pages[i]); - } return 0; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3337c8ee7928..5522e9d09c8a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4720,6 +4720,9 @@ again: ret = -ENOMEM; goto out; } + ret = set_page_extent_mapped(page); + if (ret < 0) + goto out_unlock; if (!PageUptodate(page)) { ret = btrfs_readpage(NULL, page); @@ -4737,7 +4740,6 @@ again: wait_on_page_writeback(page); lock_extent_bits(io_tree, block_start, block_end, &cached_state); - set_page_extent_mapped(page); ordered = btrfs_lookup_ordered_extent(inode, block_start); if (ordered) { @@ -8125,7 +8127,7 @@ static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) { int ret = try_release_extent_mapping(page, gfp_flags); if (ret == 1) - detach_page_private(page); + clear_page_extent_mapped(page); return ret; } @@ -8285,7 +8287,7 @@ again: } ClearPageChecked(page); - detach_page_private(page); + clear_page_extent_mapped(page); } /* @@ -8364,7 +8366,12 @@ again: wait_on_page_writeback(page); lock_extent_bits(io_tree, page_start, page_end, &cached_state); - set_page_extent_mapped(page); + ret2 = set_page_extent_mapped(page); + if (ret2 < 0) { + ret = vmf_error(ret2); + unlock_extent_cached(io_tree, page_start, page_end, &cached_state); + goto out_unlock; + } /* * we can't set the delalloc bits if there are pending ordered diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7f2935ea8d3a..e6a63f652235 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1314,6 +1314,13 @@ again: if (!page) break; + ret = set_page_extent_mapped(page); + if (ret < 0) { + unlock_page(page); + put_page(page); + break; + } + page_start = page_offset(page); page_end = page_start + PAGE_SIZE - 1; while (1) { @@ -1435,7 +1442,6 @@ again: for (i = 0; i < i_done; i++) { clear_page_dirty_for_io(pages[i]); ClearPageChecked(pages[i]); - set_page_extent_mapped(pages[i]); set_page_dirty(pages[i]); unlock_page(pages[i]); put_page(pages[i]); diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index b03e7891394e..b24396cf2f99 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -81,7 +81,10 @@ static int copy_inline_to_page(struct btrfs_inode *inode, goto out_unlock; } - set_page_extent_mapped(page); + ret = set_page_extent_mapped(page); + if (ret < 0) + goto out_unlock; + clear_extent_bit(&inode->io_tree, file_offset, range_end, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, NULL); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index d29baf3822a7..473b78874844 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2679,6 +2679,15 @@ static int relocate_file_extent_cluster(struct inode *inode, goto out; } } + ret = set_page_extent_mapped(page); + if (ret < 0) { + btrfs_delalloc_release_metadata(BTRFS_I(inode), + PAGE_SIZE, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); + unlock_page(page); + put_page(page); + goto out; + } if (PageReadahead(page)) { page_cache_async_readahead(inode->i_mapping, @@ -2706,8 +2715,6 @@ static int relocate_file_extent_cluster(struct inode *inode, lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end); - set_page_extent_mapped(page); - if (nr < cluster->nr && page_start + offset == cluster->boundary[nr]) { set_extent_bits(&BTRFS_I(inode)->io_tree, -- cgit v1.2.3-70-g09d2 From 92082d40976ed0a421305e2264bde53944805627 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 2 Feb 2021 10:28:36 +0800 Subject: btrfs: integrate page status update for data read path into begin/end_page_read In btrfs data page read path, the page status update are handled in two different locations: btrfs_do_read_page() { while (cur <= end) { /* No need to read from disk */ if (HOLE/PREALLOC/INLINE){ memset(); set_extent_uptodate(); continue; } /* Read from disk */ ret = submit_extent_page(end_bio_extent_readpage); } end_bio_extent_readpage() { endio_readpage_uptodate_page_status(); } This is fine for sectorsize == PAGE_SIZE case, as for above loop we should only hit one branch and then exit. But for subpage, there is more work to be done in page status update: - Page Unlock condition Unlike regular page size == sectorsize case, we can no longer just unlock a page. Only the last reader of the page can unlock the page. This means, we can unlock the page either in the while() loop, or in the endio function. - Page uptodate condition Since we have multiple sectors to read for a page, we can only mark the full page uptodate if all sectors are uptodate. To handle both subpage and regular cases, introduce a pair of functions to help handling page status update: - begin_page_read() For regular case, it does nothing. For subpage case, it updates the reader counters so that later end_page_read() can know who is the last one to unlock the page. - end_page_read() This is just endio_readpage_uptodate_page_status() renamed. The original name is a little too long and too specific for endio. The new thing added is the condition for page unlock. Now for subpage data, we unlock the page if we're the last reader. This does not only provide the basis for subpage data read, but also hide the special handling of page read from the main read loop. Also, since we're changing how the page lock is handled, there are two existing error paths where we need to manually unlock the page before calling begin_page_read(). Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 42 ++++++++++++++++++++++++++++------------- fs/btrfs/subpage.c | 53 ++++++++++++++++++++++++++++++++++++++++++---------- fs/btrfs/subpage.h | 8 ++++++++ 3 files changed, 80 insertions(+), 23 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 5ff85ecc6c3f..40d3bca6aaa4 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2840,8 +2840,17 @@ update: processed->uptodate = uptodate; } -static void endio_readpage_update_page_status(struct page *page, bool uptodate, - u64 start, u32 len) +static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) +{ + ASSERT(PageLocked(page)); + if (fs_info->sectorsize == PAGE_SIZE) + return; + + ASSERT(PagePrivate(page)); + btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE); +} + +static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) { struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); @@ -2857,7 +2866,12 @@ static void endio_readpage_update_page_status(struct page *page, bool uptodate, if (fs_info->sectorsize == PAGE_SIZE) unlock_page(page); - /* Subpage locking will be handled in later patches */ + else if (is_data_inode(page->mapping->host)) + /* + * For subpage data, unlock the page if we're the last reader. + * For subpage metadata, page lock is not utilized for read. + */ + btrfs_subpage_end_reader(fs_info, page, start, len); } /* @@ -2994,7 +3008,7 @@ readpage_ok: bio_offset += len; /* Update page status and unlock */ - endio_readpage_update_page_status(page, uptodate, start, len); + end_page_read(page, uptodate, start, len); endio_readpage_release_extent(&processed, BTRFS_I(inode), start, end, uptodate); } @@ -3263,6 +3277,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, unsigned int read_flags, u64 *prev_em_start) { struct inode *inode = page->mapping->host; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 start = page_offset(page); const u64 end = start + PAGE_SIZE - 1; u64 cur = start; @@ -3282,7 +3297,8 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, ret = set_page_extent_mapped(page); if (ret < 0) { unlock_extent(tree, start, end); - SetPageError(page); + btrfs_page_set_error(fs_info, page, start, PAGE_SIZE); + unlock_page(page); goto out; } @@ -3290,6 +3306,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, if (cleancache_get_page(page) == 0) { BUG_ON(blocksize != PAGE_SIZE); unlock_extent(tree, start, end); + unlock_page(page); goto out; } } @@ -3306,6 +3323,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, kunmap_atomic(userpage); } } + begin_page_read(fs_info, page); while (cur <= end) { bool force_bio_submit = false; u64 disk_bytenr; @@ -3323,13 +3341,14 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, &cached, GFP_NOFS); unlock_extent_cached(tree, cur, cur + iosize - 1, &cached); + end_page_read(page, true, cur, iosize); break; } em = __get_extent_map(inode, page, pg_offset, cur, end - cur + 1, em_cached); if (IS_ERR_OR_NULL(em)) { - SetPageError(page); unlock_extent(tree, cur, end); + end_page_read(page, false, cur, end + 1 - cur); break; } extent_offset = cur - em->start; @@ -3412,6 +3431,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, &cached, GFP_NOFS); unlock_extent_cached(tree, cur, cur + iosize - 1, &cached); + end_page_read(page, true, cur, iosize); cur = cur + iosize; pg_offset += iosize; continue; @@ -3421,6 +3441,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, EXTENT_UPTODATE, 1, NULL)) { check_page_uptodate(tree, page); unlock_extent(tree, cur, cur + iosize - 1); + end_page_read(page, true, cur, iosize); cur = cur + iosize; pg_offset += iosize; continue; @@ -3429,8 +3450,8 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, * to date. Error out */ if (block_start == EXTENT_MAP_INLINE) { - SetPageError(page); unlock_extent(tree, cur, cur + iosize - 1); + end_page_read(page, false, cur, iosize); cur = cur + iosize; pg_offset += iosize; continue; @@ -3447,19 +3468,14 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, nr++; *bio_flags = this_bio_flag; } else { - SetPageError(page); unlock_extent(tree, cur, cur + iosize - 1); + end_page_read(page, false, cur, iosize); goto out; } cur = cur + iosize; pg_offset += iosize; } out: - if (!nr) { - if (!PageError(page)) - SetPageUptodate(page); - unlock_page(page); - } return ret; } diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 2c51ab71e000..c69049e7daa9 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -54,6 +54,8 @@ int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, spin_lock_init(&(*ret)->lock); if (type == BTRFS_SUBPAGE_METADATA) atomic_set(&(*ret)->eb_refs, 0); + else + atomic_set(&(*ret)->readers, 0); return 0; } @@ -102,22 +104,13 @@ void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info, atomic_dec(&subpage->eb_refs); } -/* - * Convert the [start, start + len) range into a u16 bitmap - * - * For example: if start == page_offset() + 16K, len = 16K, we get 0x00f0. - */ -static u16 btrfs_subpage_calc_bitmap(const struct btrfs_fs_info *fs_info, +static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { - const int bit_start = offset_in_page(start) >> fs_info->sectorsize_bits; - const int nbits = len >> fs_info->sectorsize_bits; - /* Basic checks */ ASSERT(PagePrivate(page) && page->private); ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && IS_ALIGNED(len, fs_info->sectorsize)); - /* * The range check only works for mapped page, we can still have * unmapped page like dummy extent buffer pages. @@ -125,6 +118,46 @@ static u16 btrfs_subpage_calc_bitmap(const struct btrfs_fs_info *fs_info, if (page->mapping) ASSERT(page_offset(page) <= start && start + len <= page_offset(page) + PAGE_SIZE); +} + +void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + const int nbits = len >> fs_info->sectorsize_bits; + int ret; + + btrfs_subpage_assert(fs_info, page, start, len); + + ret = atomic_add_return(nbits, &subpage->readers); + ASSERT(ret == nbits); +} + +void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + const int nbits = len >> fs_info->sectorsize_bits; + + btrfs_subpage_assert(fs_info, page, start, len); + ASSERT(atomic_read(&subpage->readers) >= nbits); + if (atomic_sub_and_test(nbits, &subpage->readers)) + unlock_page(page); +} + +/* + * Convert the [start, start + len) range into a u16 bitmap + * + * For example: if start == page_offset() + 16K, len = 16K, we get 0x00f0. + */ +static u16 btrfs_subpage_calc_bitmap(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + const int bit_start = offset_in_page(start) >> fs_info->sectorsize_bits; + const int nbits = len >> fs_info->sectorsize_bits; + + btrfs_subpage_assert(fs_info, page, start, len); + /* * Here nbits can be 16, thus can go beyond u16 range. We make the * first left shift to be calculate in unsigned long (at least u32), diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index f3c5def313a1..b86a4881475d 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -29,6 +29,9 @@ struct btrfs_subpage { */ atomic_t eb_refs; /* Structures only used by data */ + struct { + atomic_t readers; + }; }; }; @@ -53,6 +56,11 @@ void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info, void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct page *page); +void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len); +void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len); + /* * Template for subpage related operations. * -- cgit v1.2.3-70-g09d2 From 0bb3eb3ee8674d5d20ad3c0c0767e18787bbd761 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 26 Jan 2021 16:34:02 +0800 Subject: btrfs: allow read-only mount of 4K sector size fs on 64K page system This adds the basic RO mount ability for 4K sector size on 64K page system. Currently we only plan to support 4K and 64K page system. Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 25 ++++++++++++++++++++++--- fs/btrfs/super.c | 7 +++++++ 2 files changed, 29 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d34c6d61928f..71fab77873a5 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2483,13 +2483,21 @@ static int validate_super(struct btrfs_fs_info *fs_info, btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize); ret = -EINVAL; } - /* Only PAGE SIZE is supported yet */ - if (sectorsize != PAGE_SIZE) { + + /* + * For 4K page size, we only support 4K sector size. + * For 64K page size, we support read-write for 64K sector size, and + * read-only for 4K sector size. + */ + if ((PAGE_SIZE == SZ_4K && sectorsize != PAGE_SIZE) || + (PAGE_SIZE == SZ_64K && (sectorsize != SZ_4K && + sectorsize != SZ_64K))) { btrfs_err(fs_info, - "sectorsize %llu not supported yet, only support %lu", + "sectorsize %llu not yet supported for page size %lu", sectorsize, PAGE_SIZE); ret = -EINVAL; } + if (!is_power_of_2(nodesize) || nodesize < sectorsize || nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) { btrfs_err(fs_info, "invalid nodesize %llu", nodesize); @@ -3248,6 +3256,17 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_alloc; } + /* For 4K sector size support, it's only read-only */ + if (PAGE_SIZE == SZ_64K && sectorsize == SZ_4K) { + if (!sb_rdonly(sb) || btrfs_super_log_root(disk_super)) { + btrfs_err(fs_info, + "subpage sectorsize %u only supported read-only for page size %lu", + sectorsize, PAGE_SIZE); + err = -EINVAL; + goto fail_alloc; + } + } + ret = btrfs_init_workqueues(fs_info, fs_devices); if (ret) { err = ret; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 919ed5c357e9..f8435641b912 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2027,6 +2027,13 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) ret = -EINVAL; goto restore; } + if (fs_info->sectorsize < PAGE_SIZE) { + btrfs_warn(fs_info, + "read-write mount is not yet allowed for sectorsize %u page size %lu", + fs_info->sectorsize, PAGE_SIZE); + ret = -EINVAL; + goto restore; + } /* * NOTE: when remounting with a change that does writes, don't -- cgit v1.2.3-70-g09d2 From 2c4d8cb737b805ca8d890e50c23f2b5eca270733 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 28 Jan 2021 19:25:08 +0800 Subject: btrfs: explain page locking and readahead in read_extent_buffer_pages() In read_extent_buffer_pages(), if we failed to lock the page atomically, we just exit with return value 0. This is counter-intuitive, as normally if we can't lock what we need, we would return something like EAGAIN. But that return hides under (wait == WAIT_NONE) branch, which only gets triggered for readahead. And for readahead, if we failed to lock the page, it means the extent buffer is either being read by other thread, or has been read and is under modification. Either way the eb will or has been cached, thus readahead has no need to wait for it. Add comment on this counter-intuitive behavior. Reported-by: Dan Carpenter Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 40d3bca6aaa4..4be117adda33 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5853,6 +5853,13 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) for (i = 0; i < num_pages; i++) { page = eb->pages[i]; if (wait == WAIT_NONE) { + /* + * WAIT_NONE is only utilized by readahead. If we can't + * acquire the lock atomically it means either the eb + * is being read out or under modification. + * Either way the eb will be or has been cached, + * readahead can exit safely. + */ if (!trylock_page(page)) goto unlock_exit; } else { -- cgit v1.2.3-70-g09d2 From 72c9925f87c8b74f36f8e75a4cd93d964538d3ca Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 4 Feb 2021 14:35:44 +0000 Subject: btrfs: fix extent buffer leak on failure to copy root At btrfs_copy_root(), if the call to btrfs_inc_ref() fails we end up returning without unlocking and releasing our reference on the extent buffer named "cow" we previously allocated with btrfs_alloc_tree_block(). So fix that by unlocking the extent buffer and dropping our reference on it before returning. Fixes: be20aa9dbadc8c ("Btrfs: Add mount option to turn off data cow") CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 95d9bae764ab..d56730a67885 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -222,6 +222,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, else ret = btrfs_inc_ref(trans, root, cow, 0); if (ret) { + btrfs_tree_unlock(cow); + free_extent_buffer(cow); btrfs_abort_transaction(trans, ret); return ret; } -- cgit v1.2.3-70-g09d2 From 7365104236ade0bf22edd7724c8fd438b0342ee4 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:21:42 +0900 Subject: btrfs: zoned: defer loading zone info after opening trees This is a preparation patch to implement zone emulation on a regular device. To emulate a zoned filesystem on a regular (non-zoned) device, we need to decide an emulated zone size. Instead of making it a compile-time static value, we'll make it configurable at mkfs time. Since we have one zone == one device extent restriction, we can determine the emulated zone size from the size of a device extent. We can extend btrfs_get_dev_zone_info() to show a regular device filled with conventional zones once the zone size is decided. The current call site of btrfs_get_dev_zone_info() during the mount process is earlier than loading the file system trees so that we don't know the size of a device extent at this point. Thus we can't slice a regular device to conventional zones. This patch introduces btrfs_get_dev_zone_info_all_devices to load the zone info for all the devices. And, it places this function in open_ctree() after loading the trees. Reviewed-by: Anand Jain Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 13 +++++++++++++ fs/btrfs/volumes.c | 4 ---- fs/btrfs/zoned.c | 25 +++++++++++++++++++++++++ fs/btrfs/zoned.h | 6 ++++++ 4 files changed, 44 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 71fab77873a5..2b6a3df765cd 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3333,6 +3333,19 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device if (ret) goto fail_tree_roots; + /* + * Get zone type information of zoned block devices. This will also + * handle emulation of a zoned filesystem if a regular device has the + * zoned incompat feature flag set. + */ + ret = btrfs_get_dev_zone_info_all_devices(fs_info); + if (ret) { + btrfs_err(fs_info, + "zoned: failed to read device zone info: %d", + ret); + goto fail_block_groups; + } + /* * If we have a uuid root and we're not being told to rescan we need to * check the generation here so we can set the diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 3948f5b50d11..07cd4742c123 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -669,10 +669,6 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); device->mode = flags; - ret = btrfs_get_dev_zone_info(device); - if (ret != 0) - goto error_free_page; - fs_devices->open_devices++; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && device->devid != BTRFS_DEV_REPLACE_DEVID) { diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 41d27fefd306..0b1b1f38a196 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -143,6 +143,31 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, return 0; } +int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + int ret = 0; + + /* fs_info->zone_size might not set yet. Use the incomapt flag here. */ + if (!btrfs_fs_incompat(fs_info, ZONED)) + return 0; + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + /* We can skip reading of zone info for missing devices */ + if (!device->bdev) + continue; + + ret = btrfs_get_dev_zone_info(device); + if (ret) + break; + } + mutex_unlock(&fs_devices->device_list_mutex); + + return ret; +} + int btrfs_get_dev_zone_info(struct btrfs_device *device) { struct btrfs_zoned_device_info *zone_info = NULL; diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 8abe2f83272b..eb47b7ad9ab1 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -25,6 +25,7 @@ struct btrfs_zoned_device_info { #ifdef CONFIG_BLK_DEV_ZONED int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone); +int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info); int btrfs_get_dev_zone_info(struct btrfs_device *device); void btrfs_destroy_dev_zone_info(struct btrfs_device *device); int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info); @@ -42,6 +43,11 @@ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, return 0; } +static inline int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) +{ + return 0; +} + static inline int btrfs_get_dev_zone_info(struct btrfs_device *device) { return 0; -- cgit v1.2.3-70-g09d2 From d6639b35da2d742f9cbcdf8f49f87f2bde9fd479 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:21:43 +0900 Subject: btrfs: zoned: use regular super block location on zone emulation A zoned filesystem currently has a superblock at the beginning of the superblock logging zones if the zones are conventional. This difference in superblock position causes a chicken-and-egg problem for filesystems with emulated zones. Since the device is a regular (non-zoned) device, we cannot know if the filesystem is regular or zoned while reading the superblock. But, to load the superblock, we need to see if it is emulated zoned or not. Place the superblocks at the same location as they are on regular filesystem on regular devices to solve the problem. It is possible because it's ensured that all the superblock locations are at an (emulated) conventional zone on regular devices. Reviewed-by: Anand Jain Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 0b1b1f38a196..8b3868088c5e 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -552,7 +552,13 @@ int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw, struct btrfs_zoned_device_info *zinfo = device->zone_info; u32 zone_num; - if (!zinfo) { + /* + * For a zoned filesystem on a non-zoned block device, use the same + * super block locations as regular filesystem. Doing so, the super + * block can always be retrieved and the zoned flag of the volume + * detected from the super block information. + */ + if (!bdev_is_zoned(device->bdev)) { *bytenr_ret = btrfs_sb_offset(mirror); return 0; } -- cgit v1.2.3-70-g09d2 From 4afd2fe835a0ff87fb88cba7a7daa881d8e14233 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 4 Feb 2021 19:21:44 +0900 Subject: btrfs: release path before calling to btrfs_load_block_group_zone_info Since we have no write pointer in conventional zones, we cannot determine the allocation offset from it. Instead, we set the allocation offset after the highest addressed extent. This is done by reading the extent tree in btrfs_load_block_group_zone_info(). However, this function is called from btrfs_read_block_groups(), so the read lock for the tree node could be recursively taken. To avoid this unsafe locking scenario, release the path before reading the extent tree to get the allocation offset. Reviewed-by: Anand Jain Reviewed-by: Josef Bacik Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 38 +++++++++++++++++--------------------- 1 file changed, 17 insertions(+), 21 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 5fa6b3d540f4..b8fbee70a897 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1810,24 +1810,8 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) return ret; } -static void read_block_group_item(struct btrfs_block_group *cache, - struct btrfs_path *path, - const struct btrfs_key *key) -{ - struct extent_buffer *leaf = path->nodes[0]; - struct btrfs_block_group_item bgi; - int slot = path->slots[0]; - - cache->length = key->offset; - - read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot), - sizeof(bgi)); - cache->used = btrfs_stack_block_group_used(&bgi); - cache->flags = btrfs_stack_block_group_flags(&bgi); -} - static int read_one_block_group(struct btrfs_fs_info *info, - struct btrfs_path *path, + struct btrfs_block_group_item *bgi, const struct btrfs_key *key, int need_clear) { @@ -1842,7 +1826,9 @@ static int read_one_block_group(struct btrfs_fs_info *info, if (!cache) return -ENOMEM; - read_block_group_item(cache, path, key); + cache->length = key->offset; + cache->used = btrfs_stack_block_group_used(bgi); + cache->flags = btrfs_stack_block_group_flags(bgi); set_free_space_tree_thresholds(cache); @@ -2001,19 +1987,29 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) need_clear = 1; while (1) { + struct btrfs_block_group_item bgi; + struct extent_buffer *leaf; + int slot; + ret = find_first_block_group(info, path, &key); if (ret > 0) break; if (ret != 0) goto error; - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - ret = read_one_block_group(info, path, &key, need_clear); + leaf = path->nodes[0]; + slot = path->slots[0]; + + read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot), + sizeof(bgi)); + + btrfs_item_key_to_cpu(leaf, &key, slot); + btrfs_release_path(path); + ret = read_one_block_group(info, &bgi, &key, need_clear); if (ret < 0) goto error; key.objectid += key.offset; key.offset = 0; - btrfs_release_path(path); } btrfs_release_path(path); -- cgit v1.2.3-70-g09d2 From b53429bad3a3555fdbda190192c6e9dfef8e7787 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 4 Feb 2021 19:21:45 +0900 Subject: btrfs: zoned: do not load fs_info::zoned from incompat flag Don't set the zoned flag in fs_info as soon as we're encountering the incompat filesystem flag for a zoned filesystem on mount. The zoned flag in fs_info is in a union together with the zone_size, so setting it too early will result in setting an incorrect zone_size as well. Once the correct zone_size is read from the device, we can rely on the zoned flag in fs_info as well to determine if the filesystem is zoned. Reviewed-by: Anand Jain Reviewed-by: Josef Bacik Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 -- fs/btrfs/zoned.c | 8 ++++++++ 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2b6a3df765cd..8551b0fc1b22 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3201,8 +3201,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) btrfs_info(fs_info, "has skinny extents"); - fs_info->zoned = (features & BTRFS_FEATURE_INCOMPAT_ZONED); - /* * flag our filesystem as having big metadata blocks if * they are bigger than the page size diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 8b3868088c5e..c0840412ccb6 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -432,6 +432,14 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) fs_info->zone_size = zone_size; fs_info->max_zone_append_size = max_zone_append_size; + /* + * Check mount options here, because we might change fs_info->zoned + * from fs_info->zone_size. + */ + ret = btrfs_check_mountopts_zoned(fs_info); + if (ret) + goto out; + btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size); out: return ret; -- cgit v1.2.3-70-g09d2 From 1cb3dc3f79153c2d7f9a4438381e1385dff09656 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:21:46 +0900 Subject: btrfs: zoned: disallow fitrim on zoned filesystems The implementation of fitrim depends on space cache, which is not used and disabled for zoned extent allocator. So the current code does not work with zoned filesystem. In the future, we can implement fitrim for zoned filesystems by enabling space cache (but, only for fitrim) or scanning the extent tree at fitrim time. For now, disallow fitrim on zoned filesystems. Reviewed-by: Anand Jain Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e6a63f652235..a8c60d46d19c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -527,6 +527,14 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info, if (!capable(CAP_SYS_ADMIN)) return -EPERM; + /* + * btrfs_trim_block_group() depends on space cache, which is not + * available in zoned filesystem. So, disallow fitrim on a zoned + * filesystem for now. + */ + if (btrfs_is_zoned(fs_info)) + return -EOPNOTSUPP; + /* * If the fs is mounted with nologreplay, which requires it to be * mounted in RO mode as well, we can not allow discard on free space -- cgit v1.2.3-70-g09d2 From 3c9daa09ccd43f68104634020b364d834c01738c Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 4 Feb 2021 19:21:47 +0900 Subject: btrfs: zoned: allow zoned filesystems on non-zoned block devices Run a zoned filesystem on non-zoned devices. This is done by "slicing up" the block device into static sized chunks and fake a conventional zone on each of them. The emulated zone size is determined from the size of device extent. This is mainly aimed at testing of zoned filesystems, i.e. the zoned chunk allocator, on regular block devices. Reviewed-by: Anand Jain Reviewed-by: Josef Bacik Signed-off-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 150 +++++++++++++++++++++++++++++++++++++++++++++++++++---- fs/btrfs/zoned.h | 14 ++++-- 2 files changed, 148 insertions(+), 16 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index c0840412ccb6..6699f626a86e 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -119,6 +119,36 @@ static inline u32 sb_zone_number(int shift, int mirror) return 0; } +/* + * Emulate blkdev_report_zones() for a non-zoned device. It slices up the block + * device into static sized chunks and fake a conventional zone on each of + * them. + */ +static int emulate_report_zones(struct btrfs_device *device, u64 pos, + struct blk_zone *zones, unsigned int nr_zones) +{ + const sector_t zone_sectors = device->fs_info->zone_size >> SECTOR_SHIFT; + sector_t bdev_size = bdev_nr_sectors(device->bdev); + unsigned int i; + + pos >>= SECTOR_SHIFT; + for (i = 0; i < nr_zones; i++) { + zones[i].start = i * zone_sectors + pos; + zones[i].len = zone_sectors; + zones[i].capacity = zone_sectors; + zones[i].wp = zones[i].start + zone_sectors; + zones[i].type = BLK_ZONE_TYPE_CONVENTIONAL; + zones[i].cond = BLK_ZONE_COND_NOT_WP; + + if (zones[i].wp >= bdev_size) { + i++; + break; + } + } + + return i; +} + static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, struct blk_zone *zones, unsigned int *nr_zones) { @@ -127,6 +157,12 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, if (!*nr_zones) return 0; + if (!bdev_is_zoned(device->bdev)) { + ret = emulate_report_zones(device, pos, zones, *nr_zones); + *nr_zones = ret; + return 0; + } + ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones, copy_zone_info_cb, zones); if (ret < 0) { @@ -143,6 +179,50 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, return 0; } +/* The emulated zone size is determined from the size of device extent */ +static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info) +{ + struct btrfs_path *path; + struct btrfs_root *root = fs_info->dev_root; + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_dev_extent *dext; + int ret = 0; + + key.objectid = 1; + key.type = BTRFS_DEV_EXTENT_KEY; + key.offset = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_item(root, path); + if (ret < 0) + goto out; + /* No dev extents at all? Not good */ + if (ret > 0) { + ret = -EUCLEAN; + goto out; + } + } + + leaf = path->nodes[0]; + dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); + fs_info->zone_size = btrfs_dev_extent_length(leaf, dext); + ret = 0; + +out: + btrfs_free_path(path); + + return ret; +} + int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; @@ -170,6 +250,7 @@ int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) int btrfs_get_dev_zone_info(struct btrfs_device *device) { + struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_zoned_device_info *zone_info = NULL; struct block_device *bdev = device->bdev; struct request_queue *queue = bdev_get_queue(bdev); @@ -178,9 +259,14 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) struct blk_zone *zones = NULL; unsigned int i, nreported = 0, nr_zones; unsigned int zone_sectors; + char *model, *emulated; int ret; - if (!bdev_is_zoned(bdev)) + /* + * Cannot use btrfs_is_zoned here, since fs_info::zone_size might not + * yet be set. + */ + if (!btrfs_fs_incompat(fs_info, ZONED)) return 0; if (device->zone_info) @@ -190,8 +276,20 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) if (!zone_info) return -ENOMEM; + if (!bdev_is_zoned(bdev)) { + if (!fs_info->zone_size) { + ret = calculate_emulated_zone_size(fs_info); + if (ret) + goto out; + } + + ASSERT(fs_info->zone_size); + zone_sectors = fs_info->zone_size >> SECTOR_SHIFT; + } else { + zone_sectors = bdev_zone_sectors(bdev); + } + nr_sectors = bdev_nr_sectors(bdev); - zone_sectors = bdev_zone_sectors(bdev); /* Check if it's power of 2 (see is_power_of_2) */ ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0); zone_info->zone_size = zone_sectors << SECTOR_SHIFT; @@ -297,20 +395,42 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) device->zone_info = zone_info; - /* device->fs_info is not safe to use for printing messages */ - btrfs_info_in_rcu(NULL, - "host-%s zoned block device %s, %u zones of %llu bytes", - bdev_zoned_model(bdev) == BLK_ZONED_HM ? "managed" : "aware", - rcu_str_deref(device->name), zone_info->nr_zones, - zone_info->zone_size); + switch (bdev_zoned_model(bdev)) { + case BLK_ZONED_HM: + model = "host-managed zoned"; + emulated = ""; + break; + case BLK_ZONED_HA: + model = "host-aware zoned"; + emulated = ""; + break; + case BLK_ZONED_NONE: + model = "regular"; + emulated = "emulated "; + break; + default: + /* Just in case */ + btrfs_err_in_rcu(fs_info, "zoned: unsupported model %d on %s", + bdev_zoned_model(bdev), + rcu_str_deref(device->name)); + ret = -EOPNOTSUPP; + goto out_free_zone_info; + } + + btrfs_info_in_rcu(fs_info, + "%s block device %s, %u %szones of %llu bytes", + model, rcu_str_deref(device->name), zone_info->nr_zones, + emulated, zone_info->zone_size); return 0; out: kfree(zones); +out_free_zone_info: bitmap_free(zone_info->empty_zones); bitmap_free(zone_info->seq_zones); kfree(zone_info); + device->zone_info = NULL; return ret; } @@ -349,7 +469,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) u64 nr_devices = 0; u64 zone_size = 0; u64 max_zone_append_size = 0; - const bool incompat_zoned = btrfs_is_zoned(fs_info); + const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED); int ret = 0; /* Count zoned devices */ @@ -360,9 +480,17 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) continue; model = bdev_zoned_model(device->bdev); + /* + * A Host-Managed zoned device must be used as a zoned device. + * A Host-Aware zoned device and a non-zoned devices can be + * treated as a zoned device, if ZONED flag is enabled in the + * superblock. + */ if (model == BLK_ZONED_HM || - (model == BLK_ZONED_HA && incompat_zoned)) { - struct btrfs_zoned_device_info *zone_info; + (model == BLK_ZONED_HA && incompat_zoned) || + (model == BLK_ZONED_NONE && incompat_zoned)) { + struct btrfs_zoned_device_info *zone_info = + device->zone_info; zone_info = device->zone_info; zoned_devices++; diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index eb47b7ad9ab1..5e78786bb723 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -142,12 +142,16 @@ static inline void btrfs_dev_clear_zone_empty(struct btrfs_device *device, u64 p static inline bool btrfs_check_device_zone_type(const struct btrfs_fs_info *fs_info, struct block_device *bdev) { - u64 zone_size; - if (btrfs_is_zoned(fs_info)) { - zone_size = bdev_zone_sectors(bdev) << SECTOR_SHIFT; - /* Do not allow non-zoned device */ - return bdev_is_zoned(bdev) && fs_info->zone_size == zone_size; + /* + * We can allow a regular device on a zoned filesystem, because + * we will emulate the zoned capabilities. + */ + if (!bdev_is_zoned(bdev)) + return true; + + return fs_info->zone_size == + (bdev_zone_sectors(bdev) << SECTOR_SHIFT); } /* Do not allow Host Manged zoned device */ -- cgit v1.2.3-70-g09d2 From 1cd6121f2a382a840f01f506694b54bf403fddc9 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:21:48 +0900 Subject: btrfs: zoned: implement zoned chunk allocator Implement a zoned chunk and device extent allocator. One device zone becomes a device extent so that a zone reset affects only this device extent and does not change the state of blocks in the neighbor device extents. To implement the allocator, we need to extend the following functions for a zoned filesystem. - init_alloc_chunk_ctl - dev_extent_search_start - dev_extent_hole_check - decide_stripe_size init_alloc_chunk_ctl_zoned() is mostly the same as regular one. It always set the stripe_size to the zone size and aligns the parameters to the zone size. dev_extent_search_start() only aligns the start offset to zone boundaries. We don't care about the first 1MB like in regular filesystem because we anyway reserve the first two zones for superblock logging. dev_extent_hole_check_zoned() checks if zones in given hole are either conventional or empty sequential zones. Also, it skips zones reserved for superblock logging. With the change to the hole, the new hole may now contain pending extents. So, in this case, loop again to check that. Finally, decide_stripe_size_zoned() should shrink the number of devices instead of stripe size because we need to honor stripe_size == zone_size. Reviewed-by: Anand Jain Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 171 +++++++++++++++++++++++++++++++++++++++++++++++------ fs/btrfs/volumes.h | 1 + fs/btrfs/zoned.c | 141 +++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/zoned.h | 25 ++++++++ 4 files changed, 321 insertions(+), 17 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 07cd4742c123..ae2aeadad5a0 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1414,11 +1414,62 @@ static u64 dev_extent_search_start(struct btrfs_device *device, u64 start) * make sure to start at an offset of at least 1MB. */ return max_t(u64, start, SZ_1M); + case BTRFS_CHUNK_ALLOC_ZONED: + /* + * We don't care about the starting region like regular + * allocator, because we anyway use/reserve the first two zones + * for superblock logging. + */ + return ALIGN(start, device->zone_info->zone_size); default: BUG(); } } +static bool dev_extent_hole_check_zoned(struct btrfs_device *device, + u64 *hole_start, u64 *hole_size, + u64 num_bytes) +{ + u64 zone_size = device->zone_info->zone_size; + u64 pos; + int ret; + bool changed = false; + + ASSERT(IS_ALIGNED(*hole_start, zone_size)); + + while (*hole_size > 0) { + pos = btrfs_find_allocatable_zones(device, *hole_start, + *hole_start + *hole_size, + num_bytes); + if (pos != *hole_start) { + *hole_size = *hole_start + *hole_size - pos; + *hole_start = pos; + changed = true; + if (*hole_size < num_bytes) + break; + } + + ret = btrfs_ensure_empty_zones(device, pos, num_bytes); + + /* Range is ensured to be empty */ + if (!ret) + return changed; + + /* Given hole range was invalid (outside of device) */ + if (ret == -ERANGE) { + *hole_start += *hole_size; + *hole_size = 0; + return 1; + } + + *hole_start += zone_size; + *hole_size -= zone_size; + changed = true; + } + + return changed; +} + /** * dev_extent_hole_check - check if specified hole is suitable for allocation * @device: the device which we have the hole @@ -1426,7 +1477,7 @@ static u64 dev_extent_search_start(struct btrfs_device *device, u64 start) * @hole_size: the size of the hole * @num_bytes: the size of the free space that we need * - * This function may modify @hole_start and @hole_end to reflect the suitable + * This function may modify @hole_start and @hole_size to reflect the suitable * position for allocation. Returns 1 if hole position is updated, 0 otherwise. */ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, @@ -1435,24 +1486,39 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, bool changed = false; u64 hole_end = *hole_start + *hole_size; - /* - * Check before we set max_hole_start, otherwise we could end up - * sending back this offset anyway. - */ - if (contains_pending_extent(device, hole_start, *hole_size)) { - if (hole_end >= *hole_start) - *hole_size = hole_end - *hole_start; - else - *hole_size = 0; - changed = true; - } + for (;;) { + /* + * Check before we set max_hole_start, otherwise we could end up + * sending back this offset anyway. + */ + if (contains_pending_extent(device, hole_start, *hole_size)) { + if (hole_end >= *hole_start) + *hole_size = hole_end - *hole_start; + else + *hole_size = 0; + changed = true; + } + + switch (device->fs_devices->chunk_alloc_policy) { + case BTRFS_CHUNK_ALLOC_REGULAR: + /* No extra check */ + break; + case BTRFS_CHUNK_ALLOC_ZONED: + if (dev_extent_hole_check_zoned(device, hole_start, + hole_size, num_bytes)) { + changed = true; + /* + * The changed hole can contain pending extent. + * Loop again to check that. + */ + continue; + } + break; + default: + BUG(); + } - switch (device->fs_devices->chunk_alloc_policy) { - case BTRFS_CHUNK_ALLOC_REGULAR: - /* No extra check */ break; - default: - BUG(); } return changed; @@ -1505,6 +1571,9 @@ static int find_free_dev_extent_start(struct btrfs_device *device, search_start = dev_extent_search_start(device, search_start); + WARN_ON(device->zone_info && + !IS_ALIGNED(num_bytes, device->zone_info->zone_size)); + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -4899,6 +4968,37 @@ static void init_alloc_chunk_ctl_policy_regular( ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes; } +static void init_alloc_chunk_ctl_policy_zoned( + struct btrfs_fs_devices *fs_devices, + struct alloc_chunk_ctl *ctl) +{ + u64 zone_size = fs_devices->fs_info->zone_size; + u64 limit; + int min_num_stripes = ctl->devs_min * ctl->dev_stripes; + int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies; + u64 min_chunk_size = min_data_stripes * zone_size; + u64 type = ctl->type; + + ctl->max_stripe_size = zone_size; + if (type & BTRFS_BLOCK_GROUP_DATA) { + ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE, + zone_size); + } else if (type & BTRFS_BLOCK_GROUP_METADATA) { + ctl->max_chunk_size = ctl->max_stripe_size; + } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { + ctl->max_chunk_size = 2 * ctl->max_stripe_size; + ctl->devs_max = min_t(int, ctl->devs_max, + BTRFS_MAX_DEVS_SYS_CHUNK); + } + + /* We don't want a chunk larger than 10% of writable space */ + limit = max(round_down(div_factor(fs_devices->total_rw_bytes, 1), + zone_size), + min_chunk_size); + ctl->max_chunk_size = min(limit, ctl->max_chunk_size); + ctl->dev_extent_min = zone_size * ctl->dev_stripes; +} + static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices, struct alloc_chunk_ctl *ctl) { @@ -4919,6 +5019,9 @@ static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices, case BTRFS_CHUNK_ALLOC_REGULAR: init_alloc_chunk_ctl_policy_regular(fs_devices, ctl); break; + case BTRFS_CHUNK_ALLOC_ZONED: + init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl); + break; default: BUG(); } @@ -5045,6 +5148,38 @@ static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl, return 0; } +static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl, + struct btrfs_device_info *devices_info) +{ + u64 zone_size = devices_info[0].dev->zone_info->zone_size; + /* Number of stripes that count for block group size */ + int data_stripes; + + /* + * It should hold because: + * dev_extent_min == dev_extent_want == zone_size * dev_stripes + */ + ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min); + + ctl->stripe_size = zone_size; + ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; + data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; + + /* stripe_size is fixed in zoned filesysmte. Reduce ndevs instead. */ + if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { + ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies, + ctl->stripe_size) + ctl->nparity, + ctl->dev_stripes); + ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; + data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; + ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size); + } + + ctl->chunk_size = ctl->stripe_size * data_stripes; + + return 0; +} + static int decide_stripe_size(struct btrfs_fs_devices *fs_devices, struct alloc_chunk_ctl *ctl, struct btrfs_device_info *devices_info) @@ -5072,6 +5207,8 @@ static int decide_stripe_size(struct btrfs_fs_devices *fs_devices, switch (fs_devices->chunk_alloc_policy) { case BTRFS_CHUNK_ALLOC_REGULAR: return decide_stripe_size_regular(ctl, devices_info); + case BTRFS_CHUNK_ALLOC_ZONED: + return decide_stripe_size_zoned(ctl, devices_info); default: BUG(); } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 04e2b26823c2..598ac225176d 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -214,6 +214,7 @@ BTRFS_DEVICE_GETSET_FUNCS(bytes_used); enum btrfs_chunk_allocation_policy { BTRFS_CHUNK_ALLOC_REGULAR, + BTRFS_CHUNK_ALLOC_ZONED, }; /* diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 6699f626a86e..69fd0d078b9b 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1,11 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include #include #include "ctree.h" #include "volumes.h" #include "zoned.h" #include "rcu-string.h" +#include "disk-io.h" /* Maximum number of zones to report per blkdev_report_zones() call */ #define BTRFS_REPORT_NR_ZONES 4096 @@ -559,6 +561,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) fs_info->zone_size = zone_size; fs_info->max_zone_append_size = max_zone_append_size; + fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED; /* * Check mount options here, because we might change fs_info->zoned @@ -779,3 +782,141 @@ int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror) sb_zone << zone_sectors_shift, zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS); } + +/** + * btrfs_find_allocatable_zones - find allocatable zones within a given region + * + * @device: the device to allocate a region on + * @hole_start: the position of the hole to allocate the region + * @num_bytes: size of wanted region + * @hole_end: the end of the hole + * @return: position of allocatable zones + * + * Allocatable region should not contain any superblock locations. + */ +u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start, + u64 hole_end, u64 num_bytes) +{ + struct btrfs_zoned_device_info *zinfo = device->zone_info; + const u8 shift = zinfo->zone_size_shift; + u64 nzones = num_bytes >> shift; + u64 pos = hole_start; + u64 begin, end; + bool have_sb; + int i; + + ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size)); + ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size)); + + while (pos < hole_end) { + begin = pos >> shift; + end = begin + nzones; + + if (end > zinfo->nr_zones) + return hole_end; + + /* Check if zones in the region are all empty */ + if (btrfs_dev_is_sequential(device, pos) && + find_next_zero_bit(zinfo->empty_zones, end, begin) != end) { + pos += zinfo->zone_size; + continue; + } + + have_sb = false; + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { + u32 sb_zone; + u64 sb_pos; + + sb_zone = sb_zone_number(shift, i); + if (!(end <= sb_zone || + sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) { + have_sb = true; + pos = ((u64)sb_zone + BTRFS_NR_SB_LOG_ZONES) << shift; + break; + } + + /* We also need to exclude regular superblock positions */ + sb_pos = btrfs_sb_offset(i); + if (!(pos + num_bytes <= sb_pos || + sb_pos + BTRFS_SUPER_INFO_SIZE <= pos)) { + have_sb = true; + pos = ALIGN(sb_pos + BTRFS_SUPER_INFO_SIZE, + zinfo->zone_size); + break; + } + } + if (!have_sb) + break; + } + + return pos; +} + +int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical, + u64 length, u64 *bytes) +{ + int ret; + + *bytes = 0; + ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET, + physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT, + GFP_NOFS); + if (ret) + return ret; + + *bytes = length; + while (length) { + btrfs_dev_set_zone_empty(device, physical); + physical += device->zone_info->zone_size; + length -= device->zone_info->zone_size; + } + + return 0; +} + +int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size) +{ + struct btrfs_zoned_device_info *zinfo = device->zone_info; + const u8 shift = zinfo->zone_size_shift; + unsigned long begin = start >> shift; + unsigned long end = (start + size) >> shift; + u64 pos; + int ret; + + ASSERT(IS_ALIGNED(start, zinfo->zone_size)); + ASSERT(IS_ALIGNED(size, zinfo->zone_size)); + + if (end > zinfo->nr_zones) + return -ERANGE; + + /* All the zones are conventional */ + if (find_next_bit(zinfo->seq_zones, begin, end) == end) + return 0; + + /* All the zones are sequential and empty */ + if (find_next_zero_bit(zinfo->seq_zones, begin, end) == end && + find_next_zero_bit(zinfo->empty_zones, begin, end) == end) + return 0; + + for (pos = start; pos < start + size; pos += zinfo->zone_size) { + u64 reset_bytes; + + if (!btrfs_dev_is_sequential(device, pos) || + btrfs_dev_is_empty_zone(device, pos)) + continue; + + /* Free regions should be empty */ + btrfs_warn_in_rcu( + device->fs_info, + "zoned: resetting device %s (devid %llu) zone %llu for allocation", + rcu_str_deref(device->name), device->devid, pos >> shift); + WARN_ON_ONCE(1); + + ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size, + &reset_bytes); + if (ret) + return ret; + } + + return 0; +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 5e78786bb723..6c8f83c48c2e 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -36,6 +36,11 @@ int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw, u64 *bytenr_ret); void btrfs_advance_sb_log(struct btrfs_device *device, int mirror); int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror); +u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start, + u64 hole_end, u64 num_bytes); +int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical, + u64 length, u64 *bytes); +int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -91,6 +96,26 @@ static inline int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror return 0; } +static inline u64 btrfs_find_allocatable_zones(struct btrfs_device *device, + u64 hole_start, u64 hole_end, + u64 num_bytes) +{ + return hole_start; +} + +static inline int btrfs_reset_device_zone(struct btrfs_device *device, + u64 physical, u64 length, u64 *bytes) +{ + *bytes = 0; + return 0; +} + +static inline int btrfs_ensure_empty_zones(struct btrfs_device *device, + u64 start, u64 size) +{ + return 0; +} + #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) -- cgit v1.2.3-70-g09d2 From 381a696eb5f99189a2c8d0d99aae766767f9cb1e Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:21:49 +0900 Subject: btrfs: zoned: verify device extent is aligned to zone Add a check in verify_one_dev_extent() to ensure that a device extent on a zoned block device is aligned to the respective zone boundary. If it isn't, mark the filesystem as unclean. Reviewed-by: Anand Jain Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index ae2aeadad5a0..10401def16ef 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -7769,6 +7769,20 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, ret = -EUCLEAN; goto out; } + + if (dev->zone_info) { + u64 zone_size = dev->zone_info->zone_size; + + if (!IS_ALIGNED(physical_offset, zone_size) || + !IS_ALIGNED(physical_len, zone_size)) { + btrfs_err(fs_info, +"zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone", + devid, physical_offset, physical_len); + ret = -EUCLEAN; + goto out; + } + } + out: free_extent_map(em); return ret; -- cgit v1.2.3-70-g09d2 From 08e11a3db098f4ba0cfee46d7ab449cba43dea1b Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:21:50 +0900 Subject: btrfs: zoned: load zone's allocation offset A zoned filesystem must allocate blocks at the zones' write pointer. The device's write pointer position can be mapped to a logical address within a block group. To facilitate this, add an "alloc_offset" to the block-group to track the logical addresses of the write pointer. This logical address is populated in btrfs_load_block_group_zone_info() from the write pointers of corresponding zones. For now, zoned filesystems the single profile. Supporting non-single profile with zone append writing is not trivial. For example, in the DUP profile, we send a zone append writing IO to two zones on a device. The device reply with written LBAs for the IOs. If the offsets of the returned addresses from the beginning of the zone are different, then it results in different logical addresses. We need fine-grained logical to physical mapping to support such separated physical address issue. Since it should require additional metadata type, disable non-single profiles for now. This commit supports the case all the zones in a block group are sequential. The next patch will handle the case having a conventional zone. Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 15 +++++ fs/btrfs/block-group.h | 6 ++ fs/btrfs/zoned.c | 151 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/zoned.h | 7 +++ 4 files changed, 179 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index b8fbee70a897..e6bf728496eb 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -15,6 +15,7 @@ #include "delalloc-space.h" #include "discard.h" #include "raid56.h" +#include "zoned.h" /* * Return target flags in extended format or 0 if restripe for this chunk_type @@ -1855,6 +1856,13 @@ static int read_one_block_group(struct btrfs_fs_info *info, goto error; } + ret = btrfs_load_block_group_zone_info(cache); + if (ret) { + btrfs_err(info, "zoned: failed to load zone info of bg %llu", + cache->start); + goto error; + } + /* * We need to exclude the super stripes now so that the space info has * super bytes accounted for, otherwise we'll think we have more space @@ -2141,6 +2149,13 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, cache->cached = BTRFS_CACHE_FINISHED; if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) cache->needs_free_space = 1; + + ret = btrfs_load_block_group_zone_info(cache); + if (ret) { + btrfs_put_block_group(cache); + return ret; + } + ret = exclude_super_stripes(cache); if (ret) { /* We may have excluded something, so call this just in case */ diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 8f74a96074f7..224946fa9bed 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -183,6 +183,12 @@ struct btrfs_block_group { /* Record locked full stripes for RAID5/6 block group */ struct btrfs_full_stripe_locks_tree full_stripe_locks_root; + + /* + * Allocation offset for the block group to implement sequential + * allocation. This is used only on a zoned filesystem. + */ + u64 alloc_offset; }; static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 69fd0d078b9b..0a7cd00f405f 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -3,14 +3,20 @@ #include #include #include +#include #include "ctree.h" #include "volumes.h" #include "zoned.h" #include "rcu-string.h" #include "disk-io.h" +#include "block-group.h" /* Maximum number of zones to report per blkdev_report_zones() call */ #define BTRFS_REPORT_NR_ZONES 4096 +/* Invalid allocation pointer value for missing devices */ +#define WP_MISSING_DEV ((u64)-1) +/* Pseudo write pointer value for conventional zone */ +#define WP_CONVENTIONAL ((u64)-2) /* Number of superblock log zones */ #define BTRFS_NR_SB_LOG_ZONES 2 @@ -920,3 +926,148 @@ int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size) return 0; } + +int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + struct extent_map_tree *em_tree = &fs_info->mapping_tree; + struct extent_map *em; + struct map_lookup *map; + struct btrfs_device *device; + u64 logical = cache->start; + u64 length = cache->length; + u64 physical = 0; + int ret; + int i; + unsigned int nofs_flag; + u64 *alloc_offsets = NULL; + u32 num_sequential = 0, num_conventional = 0; + + if (!btrfs_is_zoned(fs_info)) + return 0; + + /* Sanity check */ + if (!IS_ALIGNED(length, fs_info->zone_size)) { + btrfs_err(fs_info, + "zoned: block group %llu len %llu unaligned to zone size %llu", + logical, length, fs_info->zone_size); + return -EIO; + } + + /* Get the chunk mapping */ + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, length); + read_unlock(&em_tree->lock); + + if (!em) + return -EINVAL; + + map = em->map_lookup; + + alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS); + if (!alloc_offsets) { + free_extent_map(em); + return -ENOMEM; + } + + for (i = 0; i < map->num_stripes; i++) { + bool is_sequential; + struct blk_zone zone; + + device = map->stripes[i].dev; + physical = map->stripes[i].physical; + + if (device->bdev == NULL) { + alloc_offsets[i] = WP_MISSING_DEV; + continue; + } + + is_sequential = btrfs_dev_is_sequential(device, physical); + if (is_sequential) + num_sequential++; + else + num_conventional++; + + if (!is_sequential) { + alloc_offsets[i] = WP_CONVENTIONAL; + continue; + } + + /* + * This zone will be used for allocation, so mark this zone + * non-empty. + */ + btrfs_dev_clear_zone_empty(device, physical); + + /* + * The group is mapped to a sequential zone. Get the zone write + * pointer to determine the allocation offset within the zone. + */ + WARN_ON(!IS_ALIGNED(physical, fs_info->zone_size)); + nofs_flag = memalloc_nofs_save(); + ret = btrfs_get_dev_zone(device, physical, &zone); + memalloc_nofs_restore(nofs_flag); + if (ret == -EIO || ret == -EOPNOTSUPP) { + ret = 0; + alloc_offsets[i] = WP_MISSING_DEV; + continue; + } else if (ret) { + goto out; + } + + switch (zone.cond) { + case BLK_ZONE_COND_OFFLINE: + case BLK_ZONE_COND_READONLY: + btrfs_err(fs_info, + "zoned: offline/readonly zone %llu on device %s (devid %llu)", + physical >> device->zone_info->zone_size_shift, + rcu_str_deref(device->name), device->devid); + alloc_offsets[i] = WP_MISSING_DEV; + break; + case BLK_ZONE_COND_EMPTY: + alloc_offsets[i] = 0; + break; + case BLK_ZONE_COND_FULL: + alloc_offsets[i] = fs_info->zone_size; + break; + default: + /* Partially used zone */ + alloc_offsets[i] = + ((zone.wp - zone.start) << SECTOR_SHIFT); + break; + } + } + + if (num_conventional > 0) { + /* + * Since conventional zones do not have a write pointer, we + * cannot determine alloc_offset from the pointer + */ + ret = -EINVAL; + goto out; + } + + switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + case 0: /* single */ + cache->alloc_offset = alloc_offsets[0]; + break; + case BTRFS_BLOCK_GROUP_DUP: + case BTRFS_BLOCK_GROUP_RAID1: + case BTRFS_BLOCK_GROUP_RAID0: + case BTRFS_BLOCK_GROUP_RAID10: + case BTRFS_BLOCK_GROUP_RAID5: + case BTRFS_BLOCK_GROUP_RAID6: + /* non-single profiles are not supported yet */ + default: + btrfs_err(fs_info, "zoned: profile %s not yet supported", + btrfs_bg_type_to_raid_name(map->type)); + ret = -EINVAL; + goto out; + } + +out: + kfree(alloc_offsets); + free_extent_map(em); + + return ret; +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 6c8f83c48c2e..4f3152d7b98f 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -41,6 +41,7 @@ u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start, int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical, u64 length, u64 *bytes); int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size); +int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -116,6 +117,12 @@ static inline int btrfs_ensure_empty_zones(struct btrfs_device *device, return 0; } +static inline int btrfs_load_block_group_zone_info( + struct btrfs_block_group *cache) +{ + return 0; +} + #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) -- cgit v1.2.3-70-g09d2 From a94794d50d788d4735fd8f656ac8c0510117457d Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:21:51 +0900 Subject: btrfs: zoned: calculate allocation offset for conventional zones Conventional zones do not have a write pointer, so we cannot use it to determine the allocation offset for sequential allocation if a block group contains a conventional zone. But instead, we can consider the end of the highest addressed extent in the block group for the allocation offset. For new block group, we cannot calculate the allocation offset by consulting the extent tree, because it can cause deadlock by taking extent buffer lock after chunk mutex, which is already taken in btrfs_make_block_group(). Since it is a new block group anyways, we can simply set the allocation offset to 0. Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 4 +- fs/btrfs/zoned.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++--- fs/btrfs/zoned.h | 4 +- 3 files changed, 98 insertions(+), 9 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index e6bf728496eb..6d10874189df 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1856,7 +1856,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, goto error; } - ret = btrfs_load_block_group_zone_info(cache); + ret = btrfs_load_block_group_zone_info(cache, false); if (ret) { btrfs_err(info, "zoned: failed to load zone info of bg %llu", cache->start); @@ -2150,7 +2150,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) cache->needs_free_space = 1; - ret = btrfs_load_block_group_zone_info(cache); + ret = btrfs_load_block_group_zone_info(cache, true); if (ret) { btrfs_put_block_group(cache); return ret; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 0a7cd00f405f..b892566a1c93 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -927,7 +927,68 @@ int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size) return 0; } -int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache) +/* + * Calculate an allocation pointer from the extent allocation information + * for a block group consist of conventional zones. It is pointed to the + * end of the highest addressed extent in the block group as an allocation + * offset. + */ +static int calculate_alloc_pointer(struct btrfs_block_group *cache, + u64 *offset_ret) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + struct btrfs_root *root = fs_info->extent_root; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key found_key; + int ret; + u64 length; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = cache->start + cache->length; + key.type = 0; + key.offset = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + /* We should not find the exact match */ + if (!ret) + ret = -EUCLEAN; + if (ret < 0) + goto out; + + ret = btrfs_previous_extent_item(root, path, cache->start); + if (ret) { + if (ret == 1) { + ret = 0; + *offset_ret = 0; + } + goto out; + } + + btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); + + if (found_key.type == BTRFS_EXTENT_ITEM_KEY) + length = found_key.offset; + else + length = fs_info->nodesize; + + if (!(found_key.objectid >= cache->start && + found_key.objectid + length <= cache->start + cache->length)) { + ret = -EUCLEAN; + goto out; + } + *offset_ret = found_key.objectid + length - cache->start; + ret = 0; + +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) { struct btrfs_fs_info *fs_info = cache->fs_info; struct extent_map_tree *em_tree = &fs_info->mapping_tree; @@ -941,6 +1002,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache) int i; unsigned int nofs_flag; u64 *alloc_offsets = NULL; + u64 last_alloc = 0; u32 num_sequential = 0, num_conventional = 0; if (!btrfs_is_zoned(fs_info)) @@ -1040,11 +1102,30 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache) if (num_conventional > 0) { /* - * Since conventional zones do not have a write pointer, we - * cannot determine alloc_offset from the pointer + * Avoid calling calculate_alloc_pointer() for new BG. It + * is no use for new BG. It must be always 0. + * + * Also, we have a lock chain of extent buffer lock -> + * chunk mutex. For new BG, this function is called from + * btrfs_make_block_group() which is already taking the + * chunk mutex. Thus, we cannot call + * calculate_alloc_pointer() which takes extent buffer + * locks to avoid deadlock. */ - ret = -EINVAL; - goto out; + if (new) { + cache->alloc_offset = 0; + goto out; + } + ret = calculate_alloc_pointer(cache, &last_alloc); + if (ret || map->num_stripes == num_conventional) { + if (!ret) + cache->alloc_offset = last_alloc; + else + btrfs_err(fs_info, + "zoned: failed to determine allocation offset of bg %llu", + cache->start); + goto out; + } } switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { @@ -1066,6 +1147,14 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache) } out: + /* An extent is allocated after the write pointer */ + if (!ret && num_conventional && last_alloc > cache->alloc_offset) { + btrfs_err(fs_info, + "zoned: got wrong write pointer in BG %llu: %llu > %llu", + logical, last_alloc, cache->alloc_offset); + ret = -EIO; + } + kfree(alloc_offsets); free_extent_map(em); diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 4f3152d7b98f..d27db3993e51 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -41,7 +41,7 @@ u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start, int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical, u64 length, u64 *bytes); int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size); -int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache); +int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -118,7 +118,7 @@ static inline int btrfs_ensure_empty_zones(struct btrfs_device *device, } static inline int btrfs_load_block_group_zone_info( - struct btrfs_block_group *cache) + struct btrfs_block_group *cache, bool new) { return 0; } -- cgit v1.2.3-70-g09d2 From 169e0da91a21a571093feb8ff84c7e9229e64c08 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:21:52 +0900 Subject: btrfs: zoned: track unusable bytes for zones In a zoned filesystem a once written then freed region is not usable until the underlying zone has been reset. So we need to distinguish such unusable space from usable free space. Therefore we need to introduce the "zone_unusable" field to the block group structure, and "bytes_zone_unusable" to the space_info structure to track the unusable space. Pinned bytes are always reclaimed to the unusable space. But, when an allocated region is returned before using e.g., the block group becomes read-only between allocation time and reservation time, we can safely return the region to the block group. For the situation, this commit introduces "btrfs_add_free_space_unused". This behaves the same as btrfs_add_free_space() on regular filesystem. On zoned filesystems, it rewinds the allocation offset. Because the read-only bytes tracks free but unusable bytes when the block group is read-only, we need to migrate the zone_unusable bytes to read-only bytes when a block group is marked read-only. Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 51 ++++++++++++++++++++++++++-------- fs/btrfs/block-group.h | 1 + fs/btrfs/extent-tree.c | 5 ++++ fs/btrfs/free-space-cache.c | 67 +++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/free-space-cache.h | 2 ++ fs/btrfs/space-info.c | 13 +++++---- fs/btrfs/space-info.h | 4 ++- fs/btrfs/sysfs.c | 2 ++ fs/btrfs/zoned.c | 21 ++++++++++++++ fs/btrfs/zoned.h | 3 ++ 10 files changed, 151 insertions(+), 18 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 6d10874189df..e4444d4dd4b5 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1009,12 +1009,17 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, WARN_ON(block_group->space_info->total_bytes < block_group->length); WARN_ON(block_group->space_info->bytes_readonly - < block_group->length); + < block_group->length - block_group->zone_unusable); + WARN_ON(block_group->space_info->bytes_zone_unusable + < block_group->zone_unusable); WARN_ON(block_group->space_info->disk_total < block_group->length * factor); } block_group->space_info->total_bytes -= block_group->length; - block_group->space_info->bytes_readonly -= block_group->length; + block_group->space_info->bytes_readonly -= + (block_group->length - block_group->zone_unusable); + block_group->space_info->bytes_zone_unusable -= + block_group->zone_unusable; block_group->space_info->disk_total -= block_group->length * factor; spin_unlock(&block_group->space_info->lock); @@ -1158,7 +1163,7 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force) } num_bytes = cache->length - cache->reserved - cache->pinned - - cache->bytes_super - cache->used; + cache->bytes_super - cache->zone_unusable - cache->used; /* * Data never overcommits, even in mixed mode, so do just the straight @@ -1189,6 +1194,12 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force) if (!ret) { sinfo->bytes_readonly += num_bytes; + if (btrfs_is_zoned(cache->fs_info)) { + /* Migrate zone_unusable bytes to readonly */ + sinfo->bytes_readonly += cache->zone_unusable; + sinfo->bytes_zone_unusable -= cache->zone_unusable; + cache->zone_unusable = 0; + } cache->ro++; list_add_tail(&cache->ro_list, &sinfo->ro_bgs); } @@ -1876,12 +1887,20 @@ static int read_one_block_group(struct btrfs_fs_info *info, } /* - * Check for two cases, either we are full, and therefore don't need - * to bother with the caching work since we won't find any space, or we - * are empty, and we can just add all the space in and be done with it. - * This saves us _a_lot_ of time, particularly in the full case. + * For zoned filesystem, space after the allocation offset is the only + * free space for a block group. So, we don't need any caching work. + * btrfs_calc_zone_unusable() will set the amount of free space and + * zone_unusable space. + * + * For regular filesystem, check for two cases, either we are full, and + * therefore don't need to bother with the caching work since we won't + * find any space, or we are empty, and we can just add all the space + * in and be done with it. This saves us _a_lot_ of time, particularly + * in the full case. */ - if (cache->length == cache->used) { + if (btrfs_is_zoned(info)) { + btrfs_calc_zone_unusable(cache); + } else if (cache->length == cache->used) { cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; btrfs_free_excluded_extents(cache); @@ -1900,7 +1919,8 @@ static int read_one_block_group(struct btrfs_fs_info *info, } trace_btrfs_add_block_group(info, cache, 0); btrfs_update_space_info(info, cache->flags, cache->length, - cache->used, cache->bytes_super, &space_info); + cache->used, cache->bytes_super, + cache->zone_unusable, &space_info); cache->space_info = space_info; @@ -1956,7 +1976,7 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) break; } btrfs_update_space_info(fs_info, bg->flags, em->len, em->len, - 0, &space_info); + 0, 0, &space_info); bg->space_info = space_info; link_block_group(bg); @@ -2197,7 +2217,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, */ trace_btrfs_add_block_group(fs_info, cache, 1); btrfs_update_space_info(fs_info, cache->flags, size, bytes_used, - cache->bytes_super, &cache->space_info); + cache->bytes_super, 0, &cache->space_info); btrfs_update_global_block_rsv(fs_info); link_block_group(cache); @@ -2305,8 +2325,15 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group *cache) spin_lock(&cache->lock); if (!--cache->ro) { num_bytes = cache->length - cache->reserved - - cache->pinned - cache->bytes_super - cache->used; + cache->pinned - cache->bytes_super - + cache->zone_unusable - cache->used; sinfo->bytes_readonly -= num_bytes; + if (btrfs_is_zoned(cache->fs_info)) { + /* Migrate zone_unusable bytes back */ + cache->zone_unusable = cache->alloc_offset - cache->used; + sinfo->bytes_zone_unusable += cache->zone_unusable; + sinfo->bytes_readonly -= cache->zone_unusable; + } list_del_init(&cache->ro_list); } spin_unlock(&cache->lock); diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 224946fa9bed..0fd66febe115 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -189,6 +189,7 @@ struct btrfs_block_group { * allocation. This is used only on a zoned filesystem. */ u64 alloc_offset; + u64 zone_unusable; }; static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5476ab84e544..5c61c3f136f7 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -34,6 +34,7 @@ #include "block-group.h" #include "discard.h" #include "rcu-string.h" +#include "zoned.h" #undef SCRAMBLE_DELAYED_REFS @@ -2740,6 +2741,10 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, if (cache->ro) { space_info->bytes_readonly += len; readonly = true; + } else if (btrfs_is_zoned(fs_info)) { + /* Need reset before reusing in a zoned block group */ + space_info->bytes_zone_unusable += len; + readonly = true; } spin_unlock(&cache->lock); if (!readonly && return_free_space && diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 6134e10a6e7f..b93ac31eca69 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2477,6 +2477,8 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info, int ret = 0; u64 filter_bytes = bytes; + ASSERT(!btrfs_is_zoned(fs_info)); + info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); if (!info) return -ENOMEM; @@ -2534,11 +2536,49 @@ out: return ret; } +static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, + u64 bytenr, u64 size, bool used) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + u64 offset = bytenr - block_group->start; + u64 to_free, to_unusable; + + spin_lock(&ctl->tree_lock); + if (!used) + to_free = size; + else if (offset >= block_group->alloc_offset) + to_free = size; + else if (offset + size <= block_group->alloc_offset) + to_free = 0; + else + to_free = offset + size - block_group->alloc_offset; + to_unusable = size - to_free; + + ctl->free_space += to_free; + block_group->zone_unusable += to_unusable; + spin_unlock(&ctl->tree_lock); + if (!used) { + spin_lock(&block_group->lock); + block_group->alloc_offset -= size; + spin_unlock(&block_group->lock); + } + + /* All the region is now unusable. Mark it as unused and reclaim */ + if (block_group->zone_unusable == block_group->length) + btrfs_mark_bg_unused(block_group); + + return 0; +} + int btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr, u64 size) { enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + if (btrfs_is_zoned(block_group->fs_info)) + return __btrfs_add_free_space_zoned(block_group, bytenr, size, + true); + if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC)) trim_state = BTRFS_TRIM_STATE_TRIMMED; @@ -2547,6 +2587,16 @@ int btrfs_add_free_space(struct btrfs_block_group *block_group, bytenr, size, trim_state); } +int btrfs_add_free_space_unused(struct btrfs_block_group *block_group, + u64 bytenr, u64 size) +{ + if (btrfs_is_zoned(block_group->fs_info)) + return __btrfs_add_free_space_zoned(block_group, bytenr, size, + false); + + return btrfs_add_free_space(block_group, bytenr, size); +} + /* * This is a subtle distinction because when adding free space back in general, * we want it to be added as untrimmed for async. But in the case where we add @@ -2557,6 +2607,10 @@ int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group, { enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + if (btrfs_is_zoned(block_group->fs_info)) + return __btrfs_add_free_space_zoned(block_group, bytenr, size, + true); + if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) || btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) trim_state = BTRFS_TRIM_STATE_TRIMMED; @@ -2574,6 +2628,9 @@ int btrfs_remove_free_space(struct btrfs_block_group *block_group, int ret; bool re_search = false; + if (btrfs_is_zoned(block_group->fs_info)) + return 0; + spin_lock(&ctl->tree_lock); again: @@ -2668,6 +2725,16 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group, struct rb_node *n; int count = 0; + /* + * Zoned btrfs does not use free space tree and cluster. Just print + * out the free space after the allocation offset. + */ + if (btrfs_is_zoned(fs_info)) { + btrfs_info(fs_info, "free space %llu", + block_group->length - block_group->alloc_offset); + return; + } + spin_lock(&ctl->tree_lock); for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { info = rb_entry(n, struct btrfs_free_space, offset_index); diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index ecb09a02d544..1f23088d43f9 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -107,6 +107,8 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info, enum btrfs_trim_state trim_state); int btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr, u64 size); +int btrfs_add_free_space_unused(struct btrfs_block_group *block_group, + u64 bytenr, u64 size); int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group, u64 bytenr, u64 size); int btrfs_remove_free_space(struct btrfs_block_group *block_group, diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index bccd98141a6e..2da6177f4b0b 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -169,6 +169,7 @@ u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info, ASSERT(s_info); return s_info->bytes_used + s_info->bytes_reserved + s_info->bytes_pinned + s_info->bytes_readonly + + s_info->bytes_zone_unusable + (may_use_included ? s_info->bytes_may_use : 0); } @@ -264,7 +265,7 @@ out: void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, u64 total_bytes, u64 bytes_used, - u64 bytes_readonly, + u64 bytes_readonly, u64 bytes_zone_unusable, struct btrfs_space_info **space_info) { struct btrfs_space_info *found; @@ -280,6 +281,7 @@ void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, found->bytes_used += bytes_used; found->disk_used += bytes_used * factor; found->bytes_readonly += bytes_readonly; + found->bytes_zone_unusable += bytes_zone_unusable; if (total_bytes > 0) found->full = 0; btrfs_try_granting_tickets(info, found); @@ -429,10 +431,10 @@ static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info, info->total_bytes - btrfs_space_info_used(info, true), info->full ? "" : "not "); btrfs_info(fs_info, - "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu", + "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu zone_unusable=%llu", info->total_bytes, info->bytes_used, info->bytes_pinned, info->bytes_reserved, info->bytes_may_use, - info->bytes_readonly); + info->bytes_readonly, info->bytes_zone_unusable); DUMP_BLOCK_RSV(fs_info, global_block_rsv); DUMP_BLOCK_RSV(fs_info, trans_block_rsv); @@ -461,9 +463,10 @@ again: list_for_each_entry(cache, &info->block_groups[index], list) { spin_lock(&cache->lock); btrfs_info(fs_info, - "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s", + "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu zone_unusable %s", cache->start, cache->length, cache->used, cache->pinned, - cache->reserved, cache->ro ? "[readonly]" : ""); + cache->reserved, cache->zone_unusable, + cache->ro ? "[readonly]" : ""); spin_unlock(&cache->lock); btrfs_dump_free_space(cache, bytes); } diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index e237156ce888..b1a8ffb03b3e 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -17,6 +17,8 @@ struct btrfs_space_info { u64 bytes_may_use; /* number of bytes that may be used for delalloc/allocations */ u64 bytes_readonly; /* total bytes that are read only */ + u64 bytes_zone_unusable; /* total bytes that are unusable until + resetting the device zone */ u64 max_extent_size; /* This will hold the maximum extent size of the space info if we had an ENOSPC in the @@ -123,7 +125,7 @@ DECLARE_SPACE_INFO_UPDATE(bytes_pinned, "pinned"); int btrfs_init_space_info(struct btrfs_fs_info *fs_info); void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, u64 total_bytes, u64 bytes_used, - u64 bytes_readonly, + u64 bytes_readonly, u64 bytes_zone_unusable, struct btrfs_space_info **space_info); struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, u64 flags); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 19b9fffa2c9c..6eb1c50fa98c 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -666,6 +666,7 @@ SPACE_INFO_ATTR(bytes_pinned); SPACE_INFO_ATTR(bytes_reserved); SPACE_INFO_ATTR(bytes_may_use); SPACE_INFO_ATTR(bytes_readonly); +SPACE_INFO_ATTR(bytes_zone_unusable); SPACE_INFO_ATTR(disk_used); SPACE_INFO_ATTR(disk_total); BTRFS_ATTR(space_info, total_bytes_pinned, @@ -679,6 +680,7 @@ static struct attribute *space_info_attrs[] = { BTRFS_ATTR_PTR(space_info, bytes_reserved), BTRFS_ATTR_PTR(space_info, bytes_may_use), BTRFS_ATTR_PTR(space_info, bytes_readonly), + BTRFS_ATTR_PTR(space_info, bytes_zone_unusable), BTRFS_ATTR_PTR(space_info, disk_used), BTRFS_ATTR_PTR(space_info, disk_total), BTRFS_ATTR_PTR(space_info, total_bytes_pinned), diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index b892566a1c93..c5f9f4c6f20b 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1160,3 +1160,24 @@ out: return ret; } + +void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) +{ + u64 unusable, free; + + if (!btrfs_is_zoned(cache->fs_info)) + return; + + WARN_ON(cache->bytes_super != 0); + unusable = cache->alloc_offset - cache->used; + free = cache->length - cache->alloc_offset; + + /* We only need ->free_space in ALLOC_SEQ block groups */ + cache->last_byte_to_unpin = (u64)-1; + cache->cached = BTRFS_CACHE_FINISHED; + cache->free_space_ctl->free_space = free; + cache->zone_unusable = unusable; + + /* Should not have any excluded extents. Just in case, though */ + btrfs_free_excluded_extents(cache); +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index d27db3993e51..37304d1675e6 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -42,6 +42,7 @@ int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical, u64 length, u64 *bytes); int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size); int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new); +void btrfs_calc_zone_unusable(struct btrfs_block_group *cache); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -123,6 +124,8 @@ static inline int btrfs_load_block_group_zone_info( return 0; } +static inline void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) { } + #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) -- cgit v1.2.3-70-g09d2 From 2eda57089ea31942e067d6ac37923c3154ef8a25 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:21:53 +0900 Subject: btrfs: zoned: implement sequential extent allocation Implement a sequential extent allocator for zoned filesystems. This allocator only needs to check if there is enough space in the block group after the allocation pointer to satisfy the extent allocation request. Therefore the allocator never manages bitmaps or clusters. Also, add assertions to the corresponding functions. As zone append writing is used, it would be unnecessary to track the allocation offset, as the allocator only needs to check available space. But by tracking and returning the offset as an allocated region, we can skip modification of ordered extents and checksum information when there is no IO reordering. Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 4 ++ fs/btrfs/extent-tree.c | 90 ++++++++++++++++++++++++++++++++++++++++++--- fs/btrfs/free-space-cache.c | 6 +++ 3 files changed, 94 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index e4444d4dd4b5..63093cfb807e 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -725,6 +725,10 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only struct btrfs_caching_control *caching_ctl = NULL; int ret = 0; + /* Allocator for zoned filesystems does not use the cache at all */ + if (btrfs_is_zoned(fs_info)) + return 0; + caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); if (!caching_ctl) return -ENOMEM; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5c61c3f136f7..3f83ca503051 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3429,6 +3429,7 @@ btrfs_release_block_group(struct btrfs_block_group *cache, enum btrfs_extent_allocation_policy { BTRFS_EXTENT_ALLOC_CLUSTERED, + BTRFS_EXTENT_ALLOC_ZONED, }; /* @@ -3681,6 +3682,65 @@ static int do_allocation_clustered(struct btrfs_block_group *block_group, return find_free_extent_unclustered(block_group, ffe_ctl); } +/* + * Simple allocator for sequential-only block group. It only allows sequential + * allocation. No need to play with trees. This function also reserves the + * bytes as in btrfs_add_reserved_bytes. + */ +static int do_allocation_zoned(struct btrfs_block_group *block_group, + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_block_group **bg_ret) +{ + struct btrfs_space_info *space_info = block_group->space_info; + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + u64 start = block_group->start; + u64 num_bytes = ffe_ctl->num_bytes; + u64 avail; + int ret = 0; + + ASSERT(btrfs_is_zoned(block_group->fs_info)); + + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + + if (block_group->ro) { + ret = 1; + goto out; + } + + avail = block_group->length - block_group->alloc_offset; + if (avail < num_bytes) { + if (ffe_ctl->max_extent_size < avail) { + /* + * With sequential allocator, free space is always + * contiguous + */ + ffe_ctl->max_extent_size = avail; + ffe_ctl->total_free_space = avail; + } + ret = 1; + goto out; + } + + ffe_ctl->found_offset = start + block_group->alloc_offset; + block_group->alloc_offset += num_bytes; + spin_lock(&ctl->tree_lock); + ctl->free_space -= num_bytes; + spin_unlock(&ctl->tree_lock); + + /* + * We do not check if found_offset is aligned to stripesize. The + * address is anyway rewritten when using zone append writing. + */ + + ffe_ctl->search_start = ffe_ctl->found_offset; + +out: + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); + return ret; +} + static int do_allocation(struct btrfs_block_group *block_group, struct find_free_extent_ctl *ffe_ctl, struct btrfs_block_group **bg_ret) @@ -3688,6 +3748,8 @@ static int do_allocation(struct btrfs_block_group *block_group, switch (ffe_ctl->policy) { case BTRFS_EXTENT_ALLOC_CLUSTERED: return do_allocation_clustered(block_group, ffe_ctl, bg_ret); + case BTRFS_EXTENT_ALLOC_ZONED: + return do_allocation_zoned(block_group, ffe_ctl, bg_ret); default: BUG(); } @@ -3702,6 +3764,9 @@ static void release_block_group(struct btrfs_block_group *block_group, ffe_ctl->retry_clustered = false; ffe_ctl->retry_unclustered = false; break; + case BTRFS_EXTENT_ALLOC_ZONED: + /* Nothing to do */ + break; default: BUG(); } @@ -3730,6 +3795,9 @@ static void found_extent(struct find_free_extent_ctl *ffe_ctl, case BTRFS_EXTENT_ALLOC_CLUSTERED: found_extent_clustered(ffe_ctl, ins); break; + case BTRFS_EXTENT_ALLOC_ZONED: + /* Nothing to do */ + break; default: BUG(); } @@ -3745,6 +3813,9 @@ static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl) */ ffe_ctl->loop = LOOP_NO_EMPTY_SIZE; return 0; + case BTRFS_EXTENT_ALLOC_ZONED: + /* Give up here */ + return -ENOSPC; default: BUG(); } @@ -3913,6 +3984,9 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info, case BTRFS_EXTENT_ALLOC_CLUSTERED: return prepare_allocation_clustered(fs_info, ffe_ctl, space_info, ins); + case BTRFS_EXTENT_ALLOC_ZONED: + /* Nothing to do */ + return 0; default: BUG(); } @@ -3976,6 +4050,9 @@ static noinline int find_free_extent(struct btrfs_root *root, ffe_ctl.last_ptr = NULL; ffe_ctl.use_cluster = true; + if (btrfs_is_zoned(fs_info)) + ffe_ctl.policy = BTRFS_EXTENT_ALLOC_ZONED; + ins->type = BTRFS_EXTENT_ITEM_KEY; ins->objectid = 0; ins->offset = 0; @@ -4118,20 +4195,21 @@ have_block_group: /* move on to the next group */ if (ffe_ctl.search_start + num_bytes > block_group->start + block_group->length) { - btrfs_add_free_space(block_group, ffe_ctl.found_offset, - num_bytes); + btrfs_add_free_space_unused(block_group, + ffe_ctl.found_offset, num_bytes); goto loop; } if (ffe_ctl.found_offset < ffe_ctl.search_start) - btrfs_add_free_space(block_group, ffe_ctl.found_offset, - ffe_ctl.search_start - ffe_ctl.found_offset); + btrfs_add_free_space_unused(block_group, + ffe_ctl.found_offset, + ffe_ctl.search_start - ffe_ctl.found_offset); ret = btrfs_add_reserved_bytes(block_group, ram_bytes, num_bytes, delalloc); if (ret == -EAGAIN) { - btrfs_add_free_space(block_group, ffe_ctl.found_offset, - num_bytes); + btrfs_add_free_space_unused(block_group, + ffe_ctl.found_offset, num_bytes); goto loop; } btrfs_inc_block_group_reservations(block_group); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index b93ac31eca69..d2a43186cc7f 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2928,6 +2928,8 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, u64 align_gap_len = 0; enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + ASSERT(!btrfs_is_zoned(block_group->fs_info)); + spin_lock(&ctl->tree_lock); entry = find_free_space(ctl, &offset, &bytes_search, block_group->full_stripe_len, max_extent_size); @@ -3059,6 +3061,8 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group, struct rb_node *node; u64 ret = 0; + ASSERT(!btrfs_is_zoned(block_group->fs_info)); + spin_lock(&cluster->lock); if (bytes > cluster->max_size) goto out; @@ -3835,6 +3839,8 @@ int btrfs_trim_block_group(struct btrfs_block_group *block_group, int ret; u64 rem = 0; + ASSERT(!btrfs_is_zoned(block_group->fs_info)); + *trimmed = 0; spin_lock(&block_group->lock); -- cgit v1.2.3-70-g09d2 From d3575156f6623eecf086a20bcf99a63f1598109c Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:21:54 +0900 Subject: btrfs: zoned: redirty released extent buffers Tree manipulating operations like merging nodes often release once-allocated tree nodes. Such nodes are cleaned so that pages in the node are not uselessly written out. On zoned volumes, however, such optimization blocks the following IOs as the cancellation of the write out of the freed blocks breaks the sequential write sequence expected by the device. Introduce a list of clean and unwritten extent buffers that have been released in a transaction. Redirty the buffers so that btree_write_cache_pages() can send proper bios to the devices. Besides it clears the entire content of the extent buffer not to confuse raw block scanners e.g. 'btrfs check'. By clearing the content, csum_dirty_buffer() complains about bytenr mismatch, so avoid the checking and checksum using newly introduced buffer flag EXTENT_BUFFER_NO_CHECK. Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 8 ++++++++ fs/btrfs/extent-tree.c | 12 +++++++++++- fs/btrfs/extent_io.c | 4 ++++ fs/btrfs/extent_io.h | 2 ++ fs/btrfs/transaction.c | 10 ++++++++++ fs/btrfs/transaction.h | 3 +++ fs/btrfs/tree-log.c | 6 ++++++ fs/btrfs/zoned.c | 37 +++++++++++++++++++++++++++++++++++++ fs/btrfs/zoned.h | 7 +++++++ 9 files changed, 88 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8551b0fc1b22..eb1afd7d89f7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -459,6 +459,12 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec return 0; found_start = btrfs_header_bytenr(eb); + + if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) { + WARN_ON(found_start != 0); + return 0; + } + /* * Please do not consolidate these warnings into a single if. * It is useful to know what went wrong. @@ -4774,6 +4780,8 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, EXTENT_DIRTY); btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents); + btrfs_free_redirty_list(cur_trans); + cur_trans->state =TRANS_STATE_COMPLETED; wake_up(&cur_trans->commit_wait); } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3f83ca503051..dddcb8513c77 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3292,8 +3292,10 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { ret = check_ref_cleanup(trans, buf->start); - if (!ret) + if (!ret) { + btrfs_redirty_list_add(trans->transaction, buf); goto out; + } } cache = btrfs_lookup_block_group(fs_info, buf->start); @@ -3304,6 +3306,13 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, goto out; } + if (btrfs_is_zoned(fs_info)) { + btrfs_redirty_list_add(trans->transaction, buf); + pin_down_extent(trans, cache, buf->start, buf->len, 1); + btrfs_put_block_group(cache); + goto out; + } + WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); btrfs_add_free_space(cache, buf->start, buf->len); @@ -4635,6 +4644,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, __btrfs_tree_lock(buf, nest); btrfs_clean_tree_block(buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); + clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags); set_extent_buffer_uptodate(buf); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4be117adda33..eedcfb40c356 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -25,6 +25,7 @@ #include "backref.h" #include "disk-io.h" #include "subpage.h" +#include "zoned.h" static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -5182,6 +5183,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list, &fs_info->allocated_ebs); + INIT_LIST_HEAD(&eb->release_list); spin_lock_init(&eb->refs_lock); atomic_set(&eb->refs, 1); @@ -6111,6 +6113,8 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, char *src = (char *)srcv; unsigned long i = get_eb_page_index(start); + WARN_ON(test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)); + if (check_eb_range(eb, start, len)) return; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 047b3e66897f..824640cb0ace 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -31,6 +31,7 @@ enum { EXTENT_BUFFER_IN_TREE, /* write IO error */ EXTENT_BUFFER_WRITE_ERR, + EXTENT_BUFFER_NO_CHECK, }; /* these are flags for __process_pages_contig */ @@ -93,6 +94,7 @@ struct extent_buffer { struct rw_semaphore lock; struct page *pages[INLINE_EXTENT_BUFFER_PAGES]; + struct list_head release_list; #ifdef CONFIG_BTRFS_DEBUG struct list_head leak_list; #endif diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 00c0680dac3a..acff6bb49a97 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -21,6 +21,7 @@ #include "qgroup.h" #include "block-group.h" #include "space-info.h" +#include "zoned.h" #define BTRFS_ROOT_TRANS_TAG 0 @@ -380,6 +381,8 @@ loop: spin_lock_init(&cur_trans->dirty_bgs_lock); INIT_LIST_HEAD(&cur_trans->deleted_bgs); spin_lock_init(&cur_trans->dropped_roots_lock); + INIT_LIST_HEAD(&cur_trans->releasing_ebs); + spin_lock_init(&cur_trans->releasing_ebs_lock); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(fs_info, &cur_trans->dirty_pages, IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode); @@ -2350,6 +2353,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) goto scrub_continue; } + /* + * At this point, we should have written all the tree blocks allocated + * in this transaction. So it's now safe to free the redirtyied extent + * buffers. + */ + btrfs_free_redirty_list(cur_trans); + ret = write_all_supers(fs_info, 0); /* * the super is written, we can safely allow the tree-loggers diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 935bd6958a8a..6335716e513f 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -93,6 +93,9 @@ struct btrfs_transaction { */ atomic_t pending_ordered; wait_queue_head_t pending_wait; + + spinlock_t releasing_ebs_lock; + struct list_head releasing_ebs; }; #define __TRANS_FREEZABLE (1U << 0) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 4c7b283ed2b2..c02eeeac439c 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -19,6 +19,7 @@ #include "qgroup.h" #include "block-group.h" #include "space-info.h" +#include "zoned.h" /* magic values for the inode_only field in btrfs_log_inode: * @@ -2752,6 +2753,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, free_extent_buffer(next); return ret; } + btrfs_redirty_list_add( + trans->transaction, next); } else { if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) clear_extent_buffer_dirty(next); @@ -3296,6 +3299,9 @@ static void free_log_tree(struct btrfs_trans_handle *trans, clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1, EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); extent_io_tree_release(&log->log_csum_range); + + if (trans && log->node) + btrfs_redirty_list_add(trans->transaction, log->node); btrfs_put_root(log); } diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index c5f9f4c6f20b..1de67d789b83 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -10,6 +10,7 @@ #include "rcu-string.h" #include "disk-io.h" #include "block-group.h" +#include "transaction.h" /* Maximum number of zones to report per blkdev_report_zones() call */ #define BTRFS_REPORT_NR_ZONES 4096 @@ -1181,3 +1182,39 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) /* Should not have any excluded extents. Just in case, though */ btrfs_free_excluded_extents(cache); } + +void btrfs_redirty_list_add(struct btrfs_transaction *trans, + struct extent_buffer *eb) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + + if (!btrfs_is_zoned(fs_info) || + btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN) || + !list_empty(&eb->release_list)) + return; + + set_extent_buffer_dirty(eb); + set_extent_bits_nowait(&trans->dirty_pages, eb->start, + eb->start + eb->len - 1, EXTENT_DIRTY); + memzero_extent_buffer(eb, 0, eb->len); + set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags); + + spin_lock(&trans->releasing_ebs_lock); + list_add_tail(&eb->release_list, &trans->releasing_ebs); + spin_unlock(&trans->releasing_ebs_lock); + atomic_inc(&eb->refs); +} + +void btrfs_free_redirty_list(struct btrfs_transaction *trans) +{ + spin_lock(&trans->releasing_ebs_lock); + while (!list_empty(&trans->releasing_ebs)) { + struct extent_buffer *eb; + + eb = list_first_entry(&trans->releasing_ebs, + struct extent_buffer, release_list); + list_del_init(&eb->release_list); + free_extent_buffer(eb); + } + spin_unlock(&trans->releasing_ebs_lock); +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 37304d1675e6..b250a578e38c 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -43,6 +43,9 @@ int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical, int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size); int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new); void btrfs_calc_zone_unusable(struct btrfs_block_group *cache); +void btrfs_redirty_list_add(struct btrfs_transaction *trans, + struct extent_buffer *eb); +void btrfs_free_redirty_list(struct btrfs_transaction *trans); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -126,6 +129,10 @@ static inline int btrfs_load_block_group_zone_info( static inline void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) { } +static inline void btrfs_redirty_list_add(struct btrfs_transaction *trans, + struct extent_buffer *eb) { } +static inline void btrfs_free_redirty_list(struct btrfs_transaction *trans) { } + #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) -- cgit v1.2.3-70-g09d2 From 011b41bffa3dd086de3f2c393b35cde6133a7140 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:21:55 +0900 Subject: btrfs: zoned: advance allocation pointer after tree log node Since the allocation info of a tree log node is not recorded in the extent tree, calculate_alloc_pointer() cannot detect this node, so the pointer can be over a tree node. Replaying the log calls btrfs_remove_free_space() for each node in the log tree. So, advance the pointer after the node to not allocate over it. Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index d2a43186cc7f..5400294bd271 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2628,8 +2628,22 @@ int btrfs_remove_free_space(struct btrfs_block_group *block_group, int ret; bool re_search = false; - if (btrfs_is_zoned(block_group->fs_info)) + if (btrfs_is_zoned(block_group->fs_info)) { + /* + * This can happen with conventional zones when replaying log. + * Since the allocation info of tree-log nodes are not recorded + * to the extent-tree, calculate_alloc_pointer() failed to + * advance the allocation pointer after last allocated tree log + * node blocks. + * + * This function is called from + * btrfs_pin_extent_for_log_replay() when replaying the log. + * Advance the pointer not to overwrite the tree-log nodes. + */ + if (block_group->alloc_offset < offset + bytes) + block_group->alloc_offset = offset + bytes; return 0; + } spin_lock(&ctl->tree_lock); -- cgit v1.2.3-70-g09d2 From dcba6e48b518e5e48522e9ea2b73b60827c93146 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:21:56 +0900 Subject: btrfs: zoned: reset zones of unused block groups We must reset the zones of a deleted unused block group to rewind the zones' write pointers to the zones' start. To do this, we can use the DISCARD_SYNC code to do the reset when the filesystem is running on zoned devices. Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 8 ++++++-- fs/btrfs/extent-tree.c | 17 ++++++++++++----- fs/btrfs/zoned.h | 15 +++++++++++++++ 3 files changed, 33 insertions(+), 7 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 63093cfb807e..70a0c0f8f99f 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1408,8 +1408,12 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC)) goto flip_async; - /* DISCARD can flip during remount */ - trimming = btrfs_test_opt(fs_info, DISCARD_SYNC); + /* + * DISCARD can flip during remount. On zoned filesystems, we + * need to reset sequential-required zones. + */ + trimming = btrfs_test_opt(fs_info, DISCARD_SYNC) || + btrfs_is_zoned(fs_info); /* Implicit trim during transaction commit. */ if (trimming) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index dddcb8513c77..d976c56b3a56 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1298,6 +1298,9 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, stripe = bbio->stripes; for (i = 0; i < bbio->num_stripes; i++, stripe++) { + struct btrfs_device *dev = stripe->dev; + u64 physical = stripe->physical; + u64 length = stripe->length; u64 bytes; struct request_queue *req_q; @@ -1305,14 +1308,18 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, ASSERT(btrfs_test_opt(fs_info, DEGRADED)); continue; } + req_q = bdev_get_queue(stripe->dev->bdev); - if (!blk_queue_discard(req_q)) + /* Zone reset on zoned filesystems */ + if (btrfs_can_zone_reset(dev, physical, length)) + ret = btrfs_reset_device_zone(dev, physical, + length, &bytes); + else if (blk_queue_discard(req_q)) + ret = btrfs_issue_discard(dev->bdev, physical, + length, &bytes); + else continue; - ret = btrfs_issue_discard(stripe->dev->bdev, - stripe->physical, - stripe->length, - &bytes); if (!ret) { discarded_bytes += bytes; } else if (ret != -EOPNOTSUPP) { diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index b250a578e38c..c105641a6ad3 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -209,4 +209,19 @@ static inline bool btrfs_check_super_location(struct btrfs_device *device, u64 p return device->zone_info == NULL || !btrfs_dev_is_sequential(device, pos); } +static inline bool btrfs_can_zone_reset(struct btrfs_device *device, + u64 physical, u64 length) +{ + u64 zone_size; + + if (!btrfs_dev_is_sequential(device, physical)) + return false; + + zone_size = device->zone_info->zone_size; + if (!IS_ALIGNED(physical, zone_size) || !IS_ALIGNED(length, zone_size)) + return false; + + return true; +} + #endif -- cgit v1.2.3-70-g09d2 From 953651eb308fb56cd1a2d916e3d3c8b242240651 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:21:57 +0900 Subject: btrfs: factor out helper adding a page to bio Factor out adding a page to a bio from submit_extent_page(). The page is added only when bio_flags are the same, contiguous and the added page fits in the same stripe as pages in the bio. Condition checks are reordered to allow early return to avoid possibly heavy btrfs_bio_fits_in_stripe() calling. Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 60 +++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 15 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index eedcfb40c356..3b33e7afd8e4 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3084,6 +3084,48 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size) return bio; } +/** + * Attempt to add a page to bio + * + * @bio: destination bio + * @page: page to add to the bio + * @disk_bytenr: offset of the new bio or to check whether we are adding + * a contiguous page to the previous one + * @pg_offset: starting offset in the page + * @size: portion of page that we want to write + * @prev_bio_flags: flags of previous bio to see if we can merge the current one + * @bio_flags: flags of the current bio to see if we can merge them + * @return: true if page was added, false otherwise + * + * Attempt to add a page to bio considering stripe alignment etc. + * + * Return true if successfully page added. Otherwise, return false. + */ +static bool btrfs_bio_add_page(struct bio *bio, struct page *page, + u64 disk_bytenr, unsigned int size, + unsigned int pg_offset, + unsigned long prev_bio_flags, + unsigned long bio_flags) +{ + const sector_t sector = disk_bytenr >> SECTOR_SHIFT; + bool contig; + + if (prev_bio_flags != bio_flags) + return false; + + if (prev_bio_flags & EXTENT_BIO_COMPRESSED) + contig = bio->bi_iter.bi_sector == sector; + else + contig = bio_end_sector(bio) == sector; + if (!contig) + return false; + + if (btrfs_bio_fits_in_stripe(page, size, bio, bio_flags)) + return false; + + return bio_add_page(bio, page, size, pg_offset) == size; +} + /* * @opf: bio REQ_OP_* and REQ_* flags as one value * @wbc: optional writeback control for io accounting @@ -3112,27 +3154,15 @@ static int submit_extent_page(unsigned int opf, int ret = 0; struct bio *bio; size_t io_size = min_t(size_t, size, PAGE_SIZE); - sector_t sector = disk_bytenr >> 9; struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree; ASSERT(bio_ret); if (*bio_ret) { - bool contig; - bool can_merge = true; - bio = *bio_ret; - if (prev_bio_flags & EXTENT_BIO_COMPRESSED) - contig = bio->bi_iter.bi_sector == sector; - else - contig = bio_end_sector(bio) == sector; - - if (btrfs_bio_fits_in_stripe(page, io_size, bio, bio_flags)) - can_merge = false; - - if (prev_bio_flags != bio_flags || !contig || !can_merge || - force_bio_submit || - bio_add_page(bio, page, io_size, pg_offset) < io_size) { + if (force_bio_submit || + !btrfs_bio_add_page(bio, page, disk_bytenr, io_size, + pg_offset, prev_bio_flags, bio_flags)) { ret = submit_one_bio(bio, mirror_num, prev_bio_flags); if (ret < 0) { *bio_ret = NULL; -- cgit v1.2.3-70-g09d2 From e1326f0339fe0a3beecb0da4d1b8793443798e09 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:21:58 +0900 Subject: btrfs: zoned: use bio_add_zone_append_page A zoned device has its own hardware restrictions e.g. max_zone_append_size when using REQ_OP_ZONE_APPEND. To follow these restrictions, use bio_add_zone_append_page() instead of bio_add_page(). We need target device to use bio_add_zone_append_page(), so this commit reads the chunk information to cache the target device to btrfs_io_bio(bio)->device. Caching only the target device is sufficient here as zoned filesystems only supports the single profile at the moment. Once more profiles will be supported btrfs_io_bio can hold an extent_map to be able to check for the restrictions of all devices the btrfs_bio will be mapped to. Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3b33e7afd8e4..6b26777efb92 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3109,6 +3109,7 @@ static bool btrfs_bio_add_page(struct bio *bio, struct page *page, { const sector_t sector = disk_bytenr >> SECTOR_SHIFT; bool contig; + int ret; if (prev_bio_flags != bio_flags) return false; @@ -3123,7 +3124,12 @@ static bool btrfs_bio_add_page(struct bio *bio, struct page *page, if (btrfs_bio_fits_in_stripe(page, size, bio, bio_flags)) return false; - return bio_add_page(bio, page, size, pg_offset) == size; + if (bio_op(bio) == REQ_OP_ZONE_APPEND) + ret = bio_add_zone_append_page(bio, page, size, pg_offset); + else + ret = bio_add_page(bio, page, size, pg_offset); + + return ret == size; } /* @@ -3154,7 +3160,9 @@ static int submit_extent_page(unsigned int opf, int ret = 0; struct bio *bio; size_t io_size = min_t(size_t, size, PAGE_SIZE); - struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree; + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct extent_io_tree *tree = &inode->io_tree; + struct btrfs_fs_info *fs_info = inode->root->fs_info; ASSERT(bio_ret); @@ -3185,11 +3193,26 @@ static int submit_extent_page(unsigned int opf, if (wbc) { struct block_device *bdev; - bdev = BTRFS_I(page->mapping->host)->root->fs_info->fs_devices->latest_bdev; + bdev = fs_info->fs_devices->latest_bdev; bio_set_dev(bio, bdev); wbc_init_bio(wbc, bio); wbc_account_cgroup_owner(wbc, page, io_size); } + if (btrfs_is_zoned(fs_info) && bio_op(bio) == REQ_OP_ZONE_APPEND) { + struct extent_map *em; + struct map_lookup *map; + + em = btrfs_get_chunk_map(fs_info, disk_bytenr, io_size); + if (IS_ERR(em)) + return PTR_ERR(em); + + map = em->map_lookup; + /* We only support single profile for now */ + ASSERT(map->num_stripes == 1); + btrfs_io_bio(bio)->device = map->stripes[0].dev; + + free_extent_map(em); + } *bio_ret = bio; -- cgit v1.2.3-70-g09d2 From cfe94440d17404478771179150e6e4554f092dd5 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:21:59 +0900 Subject: btrfs: zoned: handle REQ_OP_ZONE_APPEND as writing Zoned filesystems use REQ_OP_ZONE_APPEND bios for writing to actual devices. Let btrfs_end_bio() and btrfs_op be aware of it, by mapping REQ_OP_ZONE_APPEND to BTRFS_MAP_WRITE and using btrfs_op() instead of bio_op(). Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 4 ++-- fs/btrfs/inode.c | 10 +++++----- fs/btrfs/volumes.c | 8 ++++---- fs/btrfs/volumes.h | 1 + 4 files changed, 12 insertions(+), 11 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index eb1afd7d89f7..70621184a731 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -709,7 +709,7 @@ static void end_workqueue_bio(struct bio *bio) fs_info = end_io_wq->info; end_io_wq->status = bio->bi_status; - if (bio_op(bio) == REQ_OP_WRITE) { + if (btrfs_op(bio) == BTRFS_MAP_WRITE) { if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) wq = fs_info->endio_meta_write_workers; else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) @@ -885,7 +885,7 @@ blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int async = check_async_write(fs_info, BTRFS_I(inode)); blk_status_t ret; - if (bio_op(bio) != REQ_OP_WRITE) { + if (btrfs_op(bio) != BTRFS_MAP_WRITE) { /* * called for a read, do the setup so that checksum validation * can happen in the async kernel threads diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5522e9d09c8a..d7a9c770dc3b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2250,7 +2250,7 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, if (btrfs_is_free_space_inode(BTRFS_I(inode))) metadata = BTRFS_WQ_ENDIO_FREE_SPACE; - if (bio_op(bio) != REQ_OP_WRITE) { + if (btrfs_op(bio) != BTRFS_MAP_WRITE) { ret = btrfs_bio_wq_end_io(fs_info, bio, metadata); if (ret) goto out; @@ -7681,7 +7681,7 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip) if (!refcount_dec_and_test(&dip->refs)) return; - if (bio_op(dip->dio_bio) == REQ_OP_WRITE) { + if (btrfs_op(dip->dio_bio) == BTRFS_MAP_WRITE) { __endio_write_update_ordered(BTRFS_I(dip->inode), dip->logical_offset, dip->bytes, @@ -7847,7 +7847,7 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_dio_private *dip = bio->bi_private; - bool write = bio_op(bio) == REQ_OP_WRITE; + bool write = btrfs_op(bio) == BTRFS_MAP_WRITE; blk_status_t ret; /* Check btrfs_submit_bio_hook() for rules about async submit. */ @@ -7897,7 +7897,7 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, struct inode *inode, loff_t file_offset) { - const bool write = (bio_op(dio_bio) == REQ_OP_WRITE); + const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE); const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); size_t dip_size; struct btrfs_dio_private *dip; @@ -7927,7 +7927,7 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, struct bio *dio_bio, loff_t file_offset) { - const bool write = (bio_op(dio_bio) == REQ_OP_WRITE); + const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const bool raid56 = (btrfs_data_alloc_profile(fs_info) & BTRFS_BLOCK_GROUP_RAID56_MASK); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 10401def16ef..400375aaa197 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6448,7 +6448,7 @@ static void btrfs_end_bio(struct bio *bio) struct btrfs_device *dev = btrfs_io_bio(bio)->device; ASSERT(dev->bdev); - if (bio_op(bio) == REQ_OP_WRITE) + if (btrfs_op(bio) == BTRFS_MAP_WRITE) btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); else if (!(bio->bi_opf & REQ_RAHEAD)) @@ -6561,10 +6561,10 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, atomic_set(&bbio->stripes_pending, bbio->num_stripes); if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && - ((bio_op(bio) == REQ_OP_WRITE) || (mirror_num > 1))) { + ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) { /* In this case, map_length has been set to the length of a single stripe; not the whole write */ - if (bio_op(bio) == REQ_OP_WRITE) { + if (btrfs_op(bio) == BTRFS_MAP_WRITE) { ret = raid56_parity_write(fs_info, bio, bbio, map_length); } else { @@ -6587,7 +6587,7 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, dev = bbio->stripes[dev_nr].dev; if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || - (bio_op(first_bio) == REQ_OP_WRITE && + (btrfs_op(first_bio) == BTRFS_MAP_WRITE && !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { bbio_error(bbio, first_bio, logical); continue; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 598ac225176d..d3bbdb4175df 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -424,6 +424,7 @@ static inline enum btrfs_map_op btrfs_op(struct bio *bio) case REQ_OP_DISCARD: return BTRFS_MAP_DISCARD; case REQ_OP_WRITE: + case REQ_OP_ZONE_APPEND: return BTRFS_MAP_WRITE; default: WARN_ON_ONCE(1); -- cgit v1.2.3-70-g09d2 From d22002fd37bd970480c59754dfa448866a1f38bd Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:22:00 +0900 Subject: btrfs: zoned: split ordered extent when bio is sent For a zone append write, the device decides the location the data is being written to. Therefore we cannot ensure that two bios are written consecutively on the device. In order to ensure that an ordered extent maps to a contiguous region on disk, we need to maintain a "one bio == one ordered extent" rule. Implement splitting of an ordered extent and extent map on bio submission to adhere to the rule. extract_ordered_extent() hooks into btrfs_submit_data_bio() and splits the corresponding ordered extent so that the ordered extent's region fits into one bio and the corresponding device limits. Several sanity checks need to be done in extract_ordered_extent() e.g. - We cannot split once end_bio'd ordered extent because we cannot divide ordered->bytes_left for the split ones - We do not expect a compressed ordered extent - We should not have checksum list because we omit the list splitting. Since the function is called before btrfs_wq_submit_bio() or btrfs_csum_one_bio(), this should be always ensured. We also need to split an extent map by creating a new one. If not, unpin_extent_cache() complains about the difference between the start of the extent map and the file's logical offset. Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/inode.c | 95 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/ordered-data.c | 78 ++++++++++++++++++++++++++++++++++++++++ fs/btrfs/ordered-data.h | 2 ++ 3 files changed, 175 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d7a9c770dc3b..750482a06d67 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2215,6 +2215,92 @@ static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio, return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); } +static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, + struct bio *bio, loff_t file_offset) +{ + struct btrfs_ordered_extent *ordered; + struct extent_map *em = NULL, *em_new = NULL; + struct extent_map_tree *em_tree = &inode->extent_tree; + u64 start = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT; + u64 len = bio->bi_iter.bi_size; + u64 end = start + len; + u64 ordered_end; + u64 pre, post; + int ret = 0; + + ordered = btrfs_lookup_ordered_extent(inode, file_offset); + if (WARN_ON_ONCE(!ordered)) + return BLK_STS_IOERR; + + /* No need to split */ + if (ordered->disk_num_bytes == len) + goto out; + + /* We cannot split once end_bio'd ordered extent */ + if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes)) { + ret = -EINVAL; + goto out; + } + + /* We cannot split a compressed ordered extent */ + if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes)) { + ret = -EINVAL; + goto out; + } + + ordered_end = ordered->disk_bytenr + ordered->disk_num_bytes; + /* bio must be in one ordered extent */ + if (WARN_ON_ONCE(start < ordered->disk_bytenr || end > ordered_end)) { + ret = -EINVAL; + goto out; + } + + /* Checksum list should be empty */ + if (WARN_ON_ONCE(!list_empty(&ordered->list))) { + ret = -EINVAL; + goto out; + } + + pre = start - ordered->disk_bytenr; + post = ordered_end - end; + + ret = btrfs_split_ordered_extent(ordered, pre, post); + if (ret) + goto out; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, ordered->file_offset, len); + if (!em) { + read_unlock(&em_tree->lock); + ret = -EIO; + goto out; + } + read_unlock(&em_tree->lock); + + ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)); + /* + * We cannot reuse em_new here but have to create a new one, as + * unpin_extent_cache() expects the start of the extent map to be the + * logical offset of the file, which does not hold true anymore after + * splitting. + */ + em_new = create_io_em(inode, em->start + pre, len, + em->start + pre, em->block_start + pre, len, + len, len, BTRFS_COMPRESS_NONE, + BTRFS_ORDERED_REGULAR); + if (IS_ERR(em_new)) { + ret = PTR_ERR(em_new); + goto out; + } + free_extent_map(em_new); + +out: + free_extent_map(em); + btrfs_put_ordered_extent(ordered); + + return errno_to_blk_status(ret); +} + /* * extent_io.c submission hook. This does the right thing for csum calculation * on write, or reading the csums from the tree before a read. @@ -2250,6 +2336,15 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, if (btrfs_is_free_space_inode(BTRFS_I(inode))) metadata = BTRFS_WQ_ENDIO_FREE_SPACE; + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + struct page *page = bio_first_bvec_all(bio)->bv_page; + loff_t file_offset = page_offset(page); + + ret = extract_ordered_extent(BTRFS_I(inode), bio, file_offset); + if (ret) + goto out; + } + if (btrfs_op(bio) != BTRFS_MAP_WRITE) { ret = btrfs_bio_wq_end_io(fs_info, bio, metadata); if (ret) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index e8dee1578d4a..2dc707f02f00 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -920,6 +920,84 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, } } +static int clone_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pos, + u64 len) +{ + struct inode *inode = ordered->inode; + u64 file_offset = ordered->file_offset + pos; + u64 disk_bytenr = ordered->disk_bytenr + pos; + u64 num_bytes = len; + u64 disk_num_bytes = len; + int type; + unsigned long flags_masked = ordered->flags & ~(1 << BTRFS_ORDERED_DIRECT); + int compress_type = ordered->compress_type; + unsigned long weight; + int ret; + + weight = hweight_long(flags_masked); + WARN_ON_ONCE(weight > 1); + if (!weight) + type = 0; + else + type = __ffs(flags_masked); + + if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered->flags)) { + WARN_ON_ONCE(1); + ret = btrfs_add_ordered_extent_compress(BTRFS_I(inode), + file_offset, disk_bytenr, num_bytes, + disk_num_bytes, compress_type); + } else if (test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { + ret = btrfs_add_ordered_extent_dio(BTRFS_I(inode), file_offset, + disk_bytenr, num_bytes, disk_num_bytes, type); + } else { + ret = btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, + disk_bytenr, num_bytes, disk_num_bytes, type); + } + + return ret; +} + +int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre, + u64 post) +{ + struct inode *inode = ordered->inode; + struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; + struct rb_node *node; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + int ret = 0; + + spin_lock_irq(&tree->lock); + /* Remove from tree once */ + node = &ordered->rb_node; + rb_erase(node, &tree->tree); + RB_CLEAR_NODE(node); + if (tree->last == node) + tree->last = NULL; + + ordered->file_offset += pre; + ordered->disk_bytenr += pre; + ordered->num_bytes -= (pre + post); + ordered->disk_num_bytes -= (pre + post); + ordered->bytes_left -= (pre + post); + + /* Re-insert the node */ + node = tree_insert(&tree->tree, ordered->file_offset, &ordered->rb_node); + if (node) + btrfs_panic(fs_info, -EEXIST, + "zoned: inconsistency in ordered tree at offset %llu", + ordered->file_offset); + + spin_unlock_irq(&tree->lock); + + if (pre) + ret = clone_ordered_extent(ordered, 0, pre); + if (post) + ret = clone_ordered_extent(ordered, pre + ordered->disk_num_bytes, + post); + + return ret; +} + int __init ordered_data_init(void) { btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent", diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index cca3307807e8..c400be75a3f1 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -201,6 +201,8 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state); +int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre, + u64 post); int __init ordered_data_init(void); void __cold ordered_data_exit(void); -- cgit v1.2.3-70-g09d2 From cacb2cea46382aacf0365dbe231bd1ac3349478e Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 4 Feb 2021 19:22:01 +0900 Subject: btrfs: zoned: check if bio spans across an ordered extent To ensure that an ordered extent maps to a contiguous region on disk, we need to maintain a "one bio == one ordered extent" rule. Ensure that constructing bio does not span more than an ordered extent. Reviewed-by: Josef Bacik Signed-off-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 ++ fs/btrfs/extent_io.c | 9 +++++++-- fs/btrfs/inode.c | 27 +++++++++++++++++++++++++++ 3 files changed, 36 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a9b0521d9e89..10da47ab093a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3120,6 +3120,8 @@ void btrfs_split_delalloc_extent(struct inode *inode, struct extent_state *orig, u64 split); int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, unsigned long bio_flags); +bool btrfs_bio_fits_in_ordered_extent(struct page *page, struct bio *bio, + unsigned int size); void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end); vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 6b26777efb92..f64e2be5749e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3124,10 +3124,15 @@ static bool btrfs_bio_add_page(struct bio *bio, struct page *page, if (btrfs_bio_fits_in_stripe(page, size, bio, bio_flags)) return false; - if (bio_op(bio) == REQ_OP_ZONE_APPEND) + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + struct page *first_page = bio_first_bvec_all(bio)->bv_page; + + if (!btrfs_bio_fits_in_ordered_extent(first_page, bio, size)) + return false; ret = bio_add_zone_append_page(bio, page, size, pg_offset); - else + } else { ret = bio_add_page(bio, page, size, pg_offset); + } return ret == size; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 750482a06d67..31545e503b9e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2215,6 +2215,33 @@ static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio, return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); } +bool btrfs_bio_fits_in_ordered_extent(struct page *page, struct bio *bio, + unsigned int size) +{ + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_ordered_extent *ordered; + u64 len = bio->bi_iter.bi_size + size; + bool ret = true; + + ASSERT(btrfs_is_zoned(fs_info)); + ASSERT(fs_info->max_zone_append_size > 0); + ASSERT(bio_op(bio) == REQ_OP_ZONE_APPEND); + + /* Ordered extent not yet created, so we're good */ + ordered = btrfs_lookup_ordered_extent(inode, page_offset(page)); + if (!ordered) + return ret; + + if ((bio->bi_iter.bi_sector << SECTOR_SHIFT) + len > + ordered->disk_bytenr + ordered->disk_num_bytes) + ret = false; + + btrfs_put_ordered_extent(ordered); + + return ret; +} + static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, struct bio *bio, loff_t file_offset) { -- cgit v1.2.3-70-g09d2 From 138082f36610698e3fd00318f275d7f2159b8d26 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:22:02 +0900 Subject: btrfs: extend btrfs_rmap_block for specifying a device btrfs_rmap_block currently reverse-maps the physical addresses on all devices to the corresponding logical addresses. Extend the function to match to a specified device. The old functionality of querying all devices is left intact by specifying NULL as target device. A block_device instead of a btrfs_device is passed into btrfs_rmap_block, as this function is intended to reverse-map the result of a bio, which only has a block_device. Also export the function for later use. Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 16 +++++++++++----- fs/btrfs/block-group.h | 8 +++----- fs/btrfs/tests/extent-map-tests.c | 2 +- 3 files changed, 15 insertions(+), 11 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 70a0c0f8f99f..f5e9f560ce6d 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1588,6 +1588,7 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) * * @fs_info: the filesystem * @chunk_start: logical address of block group + * @bdev: physical device to resolve, can be NULL to indicate any device * @physical: physical address to map to logical addresses * @logical: return array of logical addresses which map to @physical * @naddrs: length of @logical @@ -1597,9 +1598,9 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) * Used primarily to exclude those portions of a block group that contain super * block copies. */ -EXPORT_FOR_TESTS int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, - u64 physical, u64 **logical, int *naddrs, int *stripe_len) + struct block_device *bdev, u64 physical, u64 **logical, + int *naddrs, int *stripe_len) { struct extent_map *em; struct map_lookup *map; @@ -1617,6 +1618,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, map = em->map_lookup; data_stripe_length = em->orig_block_len; io_stripe_size = map->stripe_len; + chunk_start = em->start; /* For RAID5/6 adjust to a full IO stripe length */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) @@ -1631,14 +1633,18 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, for (i = 0; i < map->num_stripes; i++) { bool already_inserted = false; u64 stripe_nr; + u64 offset; int j; if (!in_range(physical, map->stripes[i].physical, data_stripe_length)) continue; + if (bdev && map->stripes[i].dev->bdev != bdev) + continue; + stripe_nr = physical - map->stripes[i].physical; - stripe_nr = div64_u64(stripe_nr, map->stripe_len); + stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset); if (map->type & BTRFS_BLOCK_GROUP_RAID10) { stripe_nr = stripe_nr * map->num_stripes + i; @@ -1652,7 +1658,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, * instead of map->stripe_len */ - bytenr = chunk_start + stripe_nr * io_stripe_size; + bytenr = chunk_start + stripe_nr * io_stripe_size + offset; /* Ensure we don't add duplicate addresses */ for (j = 0; j < nr; j++) { @@ -1694,7 +1700,7 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { bytenr = btrfs_sb_offset(i); - ret = btrfs_rmap_block(fs_info, cache->start, + ret = btrfs_rmap_block(fs_info, cache->start, NULL, bytenr, &logical, &nr, &stripe_len); if (ret) return ret; diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 0fd66febe115..d14ac03bb93d 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -277,6 +277,9 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info); int btrfs_free_block_groups(struct btrfs_fs_info *info); void btrfs_wait_space_cache_v1_finished(struct btrfs_block_group *cache, struct btrfs_caching_control *caching_ctl); +int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, + struct block_device *bdev, u64 physical, u64 **logical, + int *naddrs, int *stripe_len); static inline u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info) { @@ -303,9 +306,4 @@ static inline int btrfs_block_group_done(struct btrfs_block_group *cache) void btrfs_freeze_block_group(struct btrfs_block_group *cache); void btrfs_unfreeze_block_group(struct btrfs_block_group *cache); -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, - u64 physical, u64 **logical, int *naddrs, int *stripe_len); -#endif - #endif /* BTRFS_BLOCK_GROUP_H */ diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 57379e96ccc9..c0aefe6dee0b 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -507,7 +507,7 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info, goto out_free; } - ret = btrfs_rmap_block(fs_info, em->start, btrfs_sb_offset(1), + ret = btrfs_rmap_block(fs_info, em->start, NULL, btrfs_sb_offset(1), &logical, &out_ndaddrs, &out_stripe_len); if (ret || (out_ndaddrs == 0 && test->expected_mapped_addr)) { test_err("didn't rmap anything but expected %d", -- cgit v1.2.3-70-g09d2 From 08f455593fff701e103876d4db5d3f4f6d0ff871 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 4 Feb 2021 19:22:03 +0900 Subject: btrfs: zoned: cache if block group is on a sequential zone On a zoned filesystem, cache if a block group is on a sequential write only zone. On sequential write only zones, we can use REQ_OP_ZONE_APPEND for writing data, therefore provide btrfs_use_zone_append() to figure out if IO is targeting a sequential write only zone and we can use REQ_OP_ZONE_APPEND for data writing. Reviewed-by: Josef Bacik Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.h | 3 +++ fs/btrfs/zoned.c | 29 +++++++++++++++++++++++++++++ fs/btrfs/zoned.h | 6 ++++++ 3 files changed, 38 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index d14ac03bb93d..31c7c5872b92 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -181,6 +181,9 @@ struct btrfs_block_group { */ int needs_free_space; + /* Flag indicating this block group is placed on a sequential zone */ + bool seq_zone; + /* Record locked full stripes for RAID5/6 block group */ struct btrfs_full_stripe_locks_tree full_stripe_locks_root; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 1de67d789b83..f6c68704c840 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1101,6 +1101,9 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) } } + if (num_sequential > 0) + cache->seq_zone = true; + if (num_conventional > 0) { /* * Avoid calling calculate_alloc_pointer() for new BG. It @@ -1218,3 +1221,29 @@ void btrfs_free_redirty_list(struct btrfs_transaction *trans) } spin_unlock(&trans->releasing_ebs_lock); } + +bool btrfs_use_zone_append(struct btrfs_inode *inode, struct extent_map *em) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_block_group *cache; + bool ret = false; + + if (!btrfs_is_zoned(fs_info)) + return false; + + if (!fs_info->max_zone_append_size) + return false; + + if (!is_data_inode(&inode->vfs_inode)) + return false; + + cache = btrfs_lookup_block_group(fs_info, em->block_start); + ASSERT(cache); + if (!cache) + return false; + + ret = cache->seq_zone; + btrfs_put_block_group(cache); + + return ret; +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index c105641a6ad3..14d578328cbe 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -46,6 +46,7 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache); void btrfs_redirty_list_add(struct btrfs_transaction *trans, struct extent_buffer *eb); void btrfs_free_redirty_list(struct btrfs_transaction *trans); +bool btrfs_use_zone_append(struct btrfs_inode *inode, struct extent_map *em); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -133,6 +134,11 @@ static inline void btrfs_redirty_list_add(struct btrfs_transaction *trans, struct extent_buffer *eb) { } static inline void btrfs_free_redirty_list(struct btrfs_transaction *trans) { } +static inline bool btrfs_use_zone_append(struct btrfs_inode *inode, + struct extent_map *em) +{ + return false; +} #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) -- cgit v1.2.3-70-g09d2 From 24533f6a9ad633d6ff0332844fadafb9ecf4a917 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 4 Feb 2021 19:22:04 +0900 Subject: btrfs: save irq flags when looking up an ordered extent A following patch will add another caller of btrfs_lookup_ordered_extent(), but from a bio's endio context. btrfs_lookup_ordered_extent() uses spin_lock_irq() which unconditionally disables interrupts. Change this to spin_lock_irqsave() so interrupts aren't disabled and re-enabled unconditionally. Reviewed-by: Josef Bacik Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ordered-data.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 2dc707f02f00..fe235ab935d3 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -767,9 +767,10 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; + unsigned long flags; tree = &inode->ordered_tree; - spin_lock_irq(&tree->lock); + spin_lock_irqsave(&tree->lock, flags); node = tree_search(tree, file_offset); if (!node) goto out; @@ -780,7 +781,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino if (entry) refcount_inc(&entry->refs); out: - spin_unlock_irq(&tree->lock); + spin_unlock_irqrestore(&tree->lock, flags); return entry; } -- cgit v1.2.3-70-g09d2 From d8e3fb106f393858b90b3befc4f6092a76c86d1c Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:22:05 +0900 Subject: btrfs: zoned: use ZONE_APPEND write for zoned mode Enable zone append writing for zoned mode. When using zone append, a bio is issued to the start of a target zone and the device decides to place it inside the zone. Upon completion the device reports the actual written position back to the host. Three parts are necessary to enable zone append mode. First, modify the bio to use REQ_OP_ZONE_APPEND in btrfs_submit_bio_hook() and adjust the bi_sector to point the beginning of the zone. Second, record the returned physical address (and disk/partno) to the ordered extent in end_bio_extent_writepage() after the bio has been completed. We cannot resolve the physical address to the logical address because we can neither take locks nor allocate a buffer in this end_bio context. So, we need to record the physical address to resolve it later in btrfs_finish_ordered_io(). And finally, rewrite the logical addresses of the extent mapping and checksum data according to the physical address using btrfs_rmap_block. If the returned address matches the originally allocated address, we can skip this rewriting process. Reviewed-by: Josef Bacik Signed-off-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 15 +++++++++-- fs/btrfs/file.c | 6 ++++- fs/btrfs/inode.c | 4 +++ fs/btrfs/ordered-data.c | 3 +++ fs/btrfs/ordered-data.h | 8 ++++++ fs/btrfs/volumes.c | 14 ++++++++++ fs/btrfs/zoned.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/zoned.h | 12 +++++++++ 8 files changed, 129 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f64e2be5749e..14a68f3589cc 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2735,6 +2735,7 @@ static void end_bio_extent_writepage(struct bio *bio) u64 start; u64 end; struct bvec_iter_all iter_all; + bool first_bvec = true; ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_segment_all(bvec, bio, iter_all) { @@ -2761,6 +2762,11 @@ static void end_bio_extent_writepage(struct bio *bio) start = page_offset(page); end = start + bvec->bv_offset + bvec->bv_len - 1; + if (first_bvec) { + btrfs_record_physical_zoned(inode, start, bio); + first_bvec = false; + } + end_extent_writepage(page, error, start, end); end_page_writeback(page); } @@ -3664,6 +3670,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, struct extent_map *em; int ret = 0; int nr = 0; + u32 opf = REQ_OP_WRITE; const unsigned int write_flags = wbc_to_write_flags(wbc); bool compressed; @@ -3710,6 +3717,10 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, /* Note that em_end from extent_map_end() is exclusive */ iosize = min(em_end, end + 1) - cur; + + if (btrfs_use_zone_append(inode, em)) + opf = REQ_OP_ZONE_APPEND; + free_extent_map(em); em = NULL; @@ -3735,8 +3746,8 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, page->index, cur, end); } - ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, - page, disk_bytenr, iosize, + ret = submit_extent_page(opf | write_flags, wbc, page, + disk_bytenr, iosize, cur - page_offset(page), &epd->bio, end_bio_extent_writepage, 0, 0, 0, false); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index bf52d7e85914..01a72f53fb5d 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2168,8 +2168,12 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * commit waits for their completion, to avoid data loss if we fsync, * the current transaction commits before the ordered extents complete * and a power failure happens right after that. + * + * For zoned filesystem, if a write IO uses a ZONE_APPEND command, the + * logical address recorded in the ordered extent may change. We need + * to wait for the IO to stabilize the logical address. */ - if (full_sync) { + if (full_sync || btrfs_is_zoned(fs_info)) { ret = btrfs_wait_ordered_range(inode, start, len); } else { /* diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 31545e503b9e..6dbab9293425 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -50,6 +50,7 @@ #include "delalloc-space.h" #include "block-group.h" #include "space-info.h" +#include "zoned.h" struct btrfs_iget_args { u64 ino; @@ -2874,6 +2875,9 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } + if (ordered_extent->disk) + btrfs_rewrite_logical_zoned(ordered_extent); + btrfs_free_io_failure_record(inode, start, end); if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index fe235ab935d3..985a21558437 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -199,6 +199,9 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset entry->compress_type = compress_type; entry->truncated_len = (u64)-1; entry->qgroup_rsv = ret; + entry->physical = (u64)-1; + entry->disk = NULL; + entry->partno = (u8)-1; ASSERT(type == BTRFS_ORDERED_REGULAR || type == BTRFS_ORDERED_NOCOW || diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index c400be75a3f1..99e0853e4d3b 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -139,6 +139,14 @@ struct btrfs_ordered_extent { struct completion completion; struct btrfs_work flush_work; struct list_head work_list; + + /* + * Used to reverse-map physical address returned from ZONE_APPEND write + * command in a workqueue context + */ + u64 physical; + struct gendisk *disk; + u8 partno; }; /* diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 400375aaa197..a4d47c6050f7 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6500,6 +6500,20 @@ static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio, btrfs_io_bio(bio)->device = dev; bio->bi_end_io = btrfs_end_bio; bio->bi_iter.bi_sector = physical >> 9; + /* + * For zone append writing, bi_sector must point the beginning of the + * zone + */ + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + if (btrfs_dev_is_sequential(dev, physical)) { + u64 zone_start = round_down(physical, fs_info->zone_size); + + bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; + } else { + bio->bi_opf &= ~REQ_OP_ZONE_APPEND; + bio->bi_opf |= REQ_OP_WRITE; + } + } btrfs_debug_in_rcu(fs_info, "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index f6c68704c840..f4e226bda9b0 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1247,3 +1247,73 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, struct extent_map *em) return ret; } + +void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset, + struct bio *bio) +{ + struct btrfs_ordered_extent *ordered; + const u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; + + if (bio_op(bio) != REQ_OP_ZONE_APPEND) + return; + + ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), file_offset); + if (WARN_ON(!ordered)) + return; + + ordered->physical = physical; + ordered->disk = bio->bi_disk; + ordered->partno = bio->bi_partno; + + btrfs_put_ordered_extent(ordered); +} + +void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered) +{ + struct btrfs_inode *inode = BTRFS_I(ordered->inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_map_tree *em_tree; + struct extent_map *em; + struct btrfs_ordered_sum *sum; + struct block_device *bdev; + u64 orig_logical = ordered->disk_bytenr; + u64 *logical = NULL; + int nr, stripe_len; + + /* Zoned devices should not have partitions. So, we can assume it is 0 */ + ASSERT(ordered->partno == 0); + bdev = bdgrab(ordered->disk->part0); + if (WARN_ON(!bdev)) + return; + + if (WARN_ON(btrfs_rmap_block(fs_info, orig_logical, bdev, + ordered->physical, &logical, &nr, + &stripe_len))) + goto out; + + WARN_ON(nr != 1); + + if (orig_logical == *logical) + goto out; + + ordered->disk_bytenr = *logical; + + em_tree = &inode->extent_tree; + write_lock(&em_tree->lock); + em = search_extent_mapping(em_tree, ordered->file_offset, + ordered->num_bytes); + em->block_start = *logical; + free_extent_map(em); + write_unlock(&em_tree->lock); + + list_for_each_entry(sum, &ordered->list, list) { + if (*logical < orig_logical) + sum->bytenr -= orig_logical - *logical; + else + sum->bytenr += *logical - orig_logical; + } + +out: + kfree(logical); + bdput(bdev); +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 14d578328cbe..04f7b21652b6 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -47,6 +47,9 @@ void btrfs_redirty_list_add(struct btrfs_transaction *trans, struct extent_buffer *eb); void btrfs_free_redirty_list(struct btrfs_transaction *trans); bool btrfs_use_zone_append(struct btrfs_inode *inode, struct extent_map *em); +void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset, + struct bio *bio); +void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -139,6 +142,15 @@ static inline bool btrfs_use_zone_append(struct btrfs_inode *inode, { return false; } + +static inline void btrfs_record_physical_zoned(struct inode *inode, + u64 file_offset, struct bio *bio) +{ +} + +static inline void btrfs_rewrite_logical_zoned( + struct btrfs_ordered_extent *ordered) { } + #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) -- cgit v1.2.3-70-g09d2 From 544d24f9de73642a65d50389b789a957b14ae3f6 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:22:06 +0900 Subject: btrfs: zoned: enable zone append writing for direct IO Likewise to buffered IO, enable zone append writing for direct IO when its used on a zoned block device. Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6dbab9293425..dd6fe8afd0e0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7738,6 +7738,9 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, iomap->bdev = fs_info->fs_devices->latest_bdev; iomap->length = len; + if (write && btrfs_use_zone_append(BTRFS_I(inode), em)) + iomap->flags |= IOMAP_F_ZONE_APPEND; + free_extent_map(em); return 0; @@ -7964,6 +7967,8 @@ static void btrfs_end_dio_bio(struct bio *bio) if (err) dip->dio_bio->bi_status = err; + btrfs_record_physical_zoned(dip->inode, dip->logical_offset, bio); + bio_put(bio); btrfs_dio_private_put(dip); } @@ -8124,6 +8129,19 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; + WARN_ON_ONCE(write && btrfs_is_zoned(fs_info) && + fs_info->max_zone_append_size && + bio_op(bio) != REQ_OP_ZONE_APPEND); + + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + status = extract_ordered_extent(BTRFS_I(inode), bio, + file_offset); + if (status) { + bio_put(bio); + goto out_err; + } + } + ASSERT(submit_len >= clone_len); submit_len -= clone_len; -- cgit v1.2.3-70-g09d2 From 42c011000963442ce533d92a492c4a057b2f5a46 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:22:07 +0900 Subject: btrfs: zoned: introduce dedicated data write path for zoned filesystems If more than one IO is issued for one file extent, these IO can be written to separate regions on a device. Since we cannot map one file extent to such a separate area on a zoned filesystem, we need to follow the "one IO == one ordered extent" rule. The normal buffered, uncompressed and not pre-allocated write path (used by cow_file_range()) sometimes does not follow this rule. It can write a part of an ordered extent when specified a region to write e.g., when its called from fdatasync(). Introduce a dedicated (uncompressed buffered) data write path for zoned filesystems, that will COW the region and write it at once. Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index dd6fe8afd0e0..c4779cde83c6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1394,6 +1394,29 @@ static int cow_file_range_async(struct btrfs_inode *inode, return 0; } +static noinline int run_delalloc_zoned(struct btrfs_inode *inode, + struct page *locked_page, u64 start, + u64 end, int *page_started, + unsigned long *nr_written) +{ + int ret; + + ret = cow_file_range(inode, locked_page, start, end, page_started, + nr_written, 0); + if (ret) + return ret; + + if (*page_started) + return 0; + + __set_page_dirty_nobuffers(locked_page); + account_page_redirty(locked_page); + extent_write_locked_range(&inode->vfs_inode, start, end, WB_SYNC_ALL); + *page_started = 1; + + return 0; +} + static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes) { @@ -1871,17 +1894,24 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page { int ret; int force_cow = need_force_cow(inode, start, end); + const bool zoned = btrfs_is_zoned(inode->root->fs_info); if (inode->flags & BTRFS_INODE_NODATACOW && !force_cow) { + ASSERT(!zoned); ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 1, nr_written); } else if (inode->flags & BTRFS_INODE_PREALLOC && !force_cow) { + ASSERT(!zoned); ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 0, nr_written); } else if (!inode_can_compress(inode) || !inode_need_compress(inode, start, end)) { - ret = cow_file_range(inode, locked_page, start, end, - page_started, nr_written, 1); + if (zoned) + ret = run_delalloc_zoned(inode, locked_page, start, end, + page_started, nr_written); + else + ret = cow_file_range(inode, locked_page, start, end, + page_started, nr_written, 1); } else { set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags); ret = cow_file_range_async(inode, wbc, locked_page, start, end, -- cgit v1.2.3-70-g09d2 From 0bc09ca12980db3ef1e55bfad25b1803d57628c9 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:22:08 +0900 Subject: btrfs: zoned: serialize metadata IO We cannot use zone append for writing metadata, because the B-tree nodes have references to each other using logical address. Without knowing the address in advance, we cannot construct the tree in the first place. So we need to serialize write IOs for metadata. We cannot add a mutex around allocation and submission because metadata blocks are allocated in an earlier stage to build up B-trees. Add a zoned_meta_io_lock and hold it during metadata IO submission in btree_write_cache_pages() to serialize IOs. Furthermore, this adds a per-block group metadata IO submission pointer "meta_write_pointer" to ensure sequential writing, which can break when attempting to write back blocks in an unfinished transaction. If the writing out failed because of a hole and the write out is for data integrity (WB_SYNC_ALL), it returns EAGAIN. A caller like fsync() code should handle this properly e.g. by falling back to a full transaction commit. Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/block-group.h | 1 + fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 1 + fs/btrfs/extent_io.c | 25 ++++++++++++++++++++++++- fs/btrfs/zoned.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/zoned.h | 32 ++++++++++++++++++++++++++++++++ 6 files changed, 109 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 31c7c5872b92..a07108d65c44 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -193,6 +193,7 @@ struct btrfs_block_group { */ u64 alloc_offset; u64 zone_unusable; + u64 meta_write_pointer; }; static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 10da47ab093a..1bb4f767966a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -975,6 +975,7 @@ struct btrfs_fs_info { /* Max size to emit ZONE_APPEND write command */ u64 max_zone_append_size; + struct mutex zoned_meta_io_lock; #ifdef CONFIG_BTRFS_FS_REF_VERIFY spinlock_t ref_verify_lock; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 70621184a731..458bb27e0327 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2769,6 +2769,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) mutex_init(&fs_info->delete_unused_bgs_mutex); mutex_init(&fs_info->reloc_mutex); mutex_init(&fs_info->delalloc_root_mutex); + mutex_init(&fs_info->zoned_meta_io_lock); seqlock_init(&fs_info->profiles_lock); INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 14a68f3589cc..dfa6c6106b94 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -26,6 +26,7 @@ #include "disk-io.h" #include "subpage.h" #include "zoned.h" +#include "block-group.h" static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -4161,6 +4162,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, struct extent_buffer **eb_context) { struct address_space *mapping = page->mapping; + struct btrfs_block_group *cache = NULL; struct extent_buffer *eb; int ret; @@ -4193,13 +4195,31 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, if (!ret) return 0; + if (!btrfs_check_meta_write_pointer(eb->fs_info, eb, &cache)) { + /* + * If for_sync, this hole will be filled with + * trasnsaction commit. + */ + if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) + ret = -EAGAIN; + else + ret = 0; + free_extent_buffer(eb); + return ret; + } + *eb_context = eb; ret = lock_extent_buffer_for_io(eb, epd); if (ret <= 0) { + btrfs_revert_meta_write_pointer(cache, eb); + if (cache) + btrfs_put_block_group(cache); free_extent_buffer(eb); return ret; } + if (cache) + btrfs_put_block_group(cache); ret = write_one_eb(eb, wbc, epd); free_extent_buffer(eb); if (ret < 0) @@ -4245,6 +4265,7 @@ int btree_write_cache_pages(struct address_space *mapping, tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; + btrfs_zoned_meta_io_lock(fs_info); retry: if (wbc->sync_mode == WB_SYNC_ALL) tag_pages_for_writeback(mapping, index, end); @@ -4285,7 +4306,7 @@ retry: } if (ret < 0) { end_write_bio(&epd, ret); - return ret; + goto out; } /* * If something went wrong, don't allow any metadata write bio to be @@ -4320,6 +4341,8 @@ retry: ret = -EROFS; end_write_bio(&epd, ret); } +out: + btrfs_zoned_meta_io_unlock(fs_info); return ret; } diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index f4e226bda9b0..b2a6553b2db0 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1159,6 +1159,9 @@ out: ret = -EIO; } + if (!ret) + cache->meta_write_pointer = cache->alloc_offset + cache->start; + kfree(alloc_offsets); free_extent_map(em); @@ -1317,3 +1320,50 @@ out: kfree(logical); bdput(bdev); } + +bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, + struct btrfs_block_group **cache_ret) +{ + struct btrfs_block_group *cache; + bool ret = true; + + if (!btrfs_is_zoned(fs_info)) + return true; + + cache = *cache_ret; + + if (cache && (eb->start < cache->start || + cache->start + cache->length <= eb->start)) { + btrfs_put_block_group(cache); + cache = NULL; + *cache_ret = NULL; + } + + if (!cache) + cache = btrfs_lookup_block_group(fs_info, eb->start); + + if (cache) { + if (cache->meta_write_pointer != eb->start) { + btrfs_put_block_group(cache); + cache = NULL; + ret = false; + } else { + cache->meta_write_pointer = eb->start + eb->len; + } + + *cache_ret = cache; + } + + return ret; +} + +void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache, + struct extent_buffer *eb) +{ + if (!btrfs_is_zoned(eb->fs_info) || !cache) + return; + + ASSERT(cache->meta_write_pointer == eb->start + eb->len); + cache->meta_write_pointer = eb->start; +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 04f7b21652b6..0755a25d0f4c 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -50,6 +50,11 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, struct extent_map *em); void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset, struct bio *bio); void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered); +bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, + struct btrfs_block_group **cache_ret); +void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache, + struct extent_buffer *eb); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -151,6 +156,19 @@ static inline void btrfs_record_physical_zoned(struct inode *inode, static inline void btrfs_rewrite_logical_zoned( struct btrfs_ordered_extent *ordered) { } +static inline bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, + struct btrfs_block_group **cache_ret) +{ + return true; +} + +static inline void btrfs_revert_meta_write_pointer( + struct btrfs_block_group *cache, + struct extent_buffer *eb) +{ +} + #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) @@ -242,4 +260,18 @@ static inline bool btrfs_can_zone_reset(struct btrfs_device *device, return true; } +static inline void btrfs_zoned_meta_io_lock(struct btrfs_fs_info *fs_info) +{ + if (!btrfs_is_zoned(fs_info)) + return; + mutex_lock(&fs_info->zoned_meta_io_lock); +} + +static inline void btrfs_zoned_meta_io_unlock(struct btrfs_fs_info *fs_info) +{ + if (!btrfs_is_zoned(fs_info)) + return; + mutex_unlock(&fs_info->zoned_meta_io_lock); +} + #endif -- cgit v1.2.3-70-g09d2 From 24c0a7227fdfa598badcfc0f735d16745d39e0c4 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:22:09 +0900 Subject: btrfs: zoned: wait for existing extents before truncating When truncating a file, file buffers which have already been allocated but not yet written may be truncated. Truncating these buffers could cause breakage of a sequential write pattern in a block group if the truncated blocks are for example followed by blocks allocated to another file. To avoid this problem, always wait for write out of all unwritten buffers before proceeding with the truncate execution. Signed-off-by: Naohiro Aota Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/inode.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c4779cde83c6..535abf898225 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5169,6 +5169,15 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) btrfs_drew_write_unlock(&root->snapshot_lock); btrfs_end_transaction(trans); } else { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + + if (btrfs_is_zoned(fs_info)) { + ret = btrfs_wait_ordered_range(inode, + ALIGN(newsize, fs_info->sectorsize), + (u64)-1); + if (ret) + return ret; + } /* * We're truncating a file that used to have good data down to -- cgit v1.2.3-70-g09d2 From 4eef29ef6360d9c3e4be111392e20b70e19171cc Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:22:10 +0900 Subject: btrfs: zoned: do not use async metadata checksum on zoned filesystems On zoned filesystems, btrfs uses per-fs zoned_meta_io_lock to serialize the metadata write IOs. Even with this serialization, write bios sent from btree_write_cache_pages can be reordered by async checksum workers as these workers are per CPU and not per zone. To preserve write bio ordering, we disable async metadata checksum on a zoned filesystem. This does not result in lower performance with HDDs as a single CPU core is fast enough to do checksum for a single zone write stream with the maximum possible bandwidth of the device. If multiple zones are being written simultaneously, HDD seek overhead lowers the achievable maximum bandwidth, resulting again in a per zone checksum serialization not affecting the performance. Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 458bb27e0327..6e16f556ed75 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -871,6 +871,8 @@ static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio, static int check_async_write(struct btrfs_fs_info *fs_info, struct btrfs_inode *bi) { + if (btrfs_is_zoned(fs_info)) + return 0; if (atomic_read(&bi->sync_writers)) return 0; if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags)) -- cgit v1.2.3-70-g09d2 From 78ce9fc269af6e69c1399ab910ba6bc81c934f67 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:22:11 +0900 Subject: btrfs: zoned: mark block groups to copy for device-replace This is the 1/4 patch to support device-replace on zoned filesystems. We have two types of IOs during the device replace process. One is an IO to "copy" (by the scrub functions) all the device extents from the source device to the destination device. The other one is an IO to "clone" (by handle_ops_on_dev_replace()) new incoming write IOs from users to the source device into the target device. Cloning incoming IOs can break the sequential write rule in on target device. When a write is mapped in the middle of a block group, the IO is directed to the middle of a target device zone, which breaks the sequential write requirement. However, the cloning function cannot be disabled since incoming IOs targeting already copied device extents must be cloned so that the IO is executed on the target device. We cannot use dev_replace->cursor_{left,right} to determine whether a bio is going to a not yet copied region. Since we have a time gap between finishing btrfs_scrub_dev() and rewriting the mapping tree in btrfs_dev_replace_finishing(), we can have a newly allocated device extent which is never cloned nor copied. So the point is to copy only already existing device extents. This patch introduces mark_block_group_to_copy() to mark existing block groups as a target of copying. Then, handle_ops_on_dev_replace() and dev-replace can check the flag to do their job. Also, btrfs_finish_block_group_to_copy() will check if the copied stripe is the last stripe in the block group. With the last stripe copied, the to_copy flag is finally disabled. Afterwards we can safely clone incoming IOs on this block group. Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/block-group.h | 1 + fs/btrfs/dev-replace.c | 184 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/dev-replace.h | 3 + fs/btrfs/scrub.c | 16 +++++ 4 files changed, 204 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index a07108d65c44..d37ee576ac6e 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -95,6 +95,7 @@ struct btrfs_block_group { unsigned int iref:1; unsigned int has_caching_ctl:1; unsigned int removed:1; + unsigned int to_copy:1; int disk_cache_state; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index bc73f798ce3a..3a9c1e046ebe 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -22,6 +22,7 @@ #include "dev-replace.h" #include "sysfs.h" #include "zoned.h" +#include "block-group.h" /* * Device replace overview @@ -459,6 +460,185 @@ static char* btrfs_dev_name(struct btrfs_device *device) return rcu_str_deref(device->name); } +static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info, + struct btrfs_device *src_dev) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_root *root = fs_info->dev_root; + struct btrfs_dev_extent *dev_extent = NULL; + struct btrfs_block_group *cache; + struct btrfs_trans_handle *trans; + int ret = 0; + u64 chunk_offset; + + /* Do not use "to_copy" on non zoned filesystem for now */ + if (!btrfs_is_zoned(fs_info)) + return 0; + + mutex_lock(&fs_info->chunk_mutex); + + /* Ensure we don't have pending new block group */ + spin_lock(&fs_info->trans_lock); + while (fs_info->running_transaction && + !list_empty(&fs_info->running_transaction->dev_update_list)) { + spin_unlock(&fs_info->trans_lock); + mutex_unlock(&fs_info->chunk_mutex); + trans = btrfs_attach_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + mutex_lock(&fs_info->chunk_mutex); + if (ret == -ENOENT) { + spin_lock(&fs_info->trans_lock); + continue; + } else { + goto unlock; + } + } + + ret = btrfs_commit_transaction(trans); + mutex_lock(&fs_info->chunk_mutex); + if (ret) + goto unlock; + + spin_lock(&fs_info->trans_lock); + } + spin_unlock(&fs_info->trans_lock); + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto unlock; + } + + path->reada = READA_FORWARD; + path->search_commit_root = 1; + path->skip_locking = 1; + + key.objectid = src_dev->devid; + key.type = BTRFS_DEV_EXTENT_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto free_path; + if (ret > 0) { + if (path->slots[0] >= + btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto free_path; + if (ret > 0) { + ret = 0; + goto free_path; + } + } else { + ret = 0; + } + } + + while (1) { + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + if (found_key.objectid != src_dev->devid) + break; + + if (found_key.type != BTRFS_DEV_EXTENT_KEY) + break; + + if (found_key.offset < key.offset) + break; + + dev_extent = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent); + + chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dev_extent); + + cache = btrfs_lookup_block_group(fs_info, chunk_offset); + if (!cache) + goto skip; + + spin_lock(&cache->lock); + cache->to_copy = 1; + spin_unlock(&cache->lock); + + btrfs_put_block_group(cache); + +skip: + ret = btrfs_next_item(root, path); + if (ret != 0) { + if (ret > 0) + ret = 0; + break; + } + } + +free_path: + btrfs_free_path(path); +unlock: + mutex_unlock(&fs_info->chunk_mutex); + + return ret; +} + +bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev, + struct btrfs_block_group *cache, + u64 physical) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + struct extent_map *em; + struct map_lookup *map; + u64 chunk_offset = cache->start; + int num_extents, cur_extent; + int i; + + /* Do not use "to_copy" on non zoned filesystem for now */ + if (!btrfs_is_zoned(fs_info)) + return true; + + spin_lock(&cache->lock); + if (cache->removed) { + spin_unlock(&cache->lock); + return true; + } + spin_unlock(&cache->lock); + + em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); + ASSERT(!IS_ERR(em)); + map = em->map_lookup; + + num_extents = cur_extent = 0; + for (i = 0; i < map->num_stripes; i++) { + /* We have more device extent to copy */ + if (srcdev != map->stripes[i].dev) + continue; + + num_extents++; + if (physical == map->stripes[i].physical) + cur_extent = i; + } + + free_extent_map(em); + + if (num_extents > 1 && cur_extent < num_extents - 1) { + /* + * Has more stripes on this device. Keep this block group + * readonly until we finish all the stripes. + */ + return false; + } + + /* Last stripe on this device */ + spin_lock(&cache->lock); + cache->to_copy = 0; + spin_unlock(&cache->lock); + + return true; +} + static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, const char *tgtdev_name, u64 srcdevid, const char *srcdev_name, int read_src) @@ -500,6 +680,10 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, if (ret) return ret; + ret = mark_block_group_to_copy(fs_info, src_device); + if (ret) + return ret; + down_write(&dev_replace->rwsem); switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 60b70dacc299..3911049a5f23 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -18,5 +18,8 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info); void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info); int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info); int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); +bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev, + struct btrfs_block_group *cache, + u64 physical); #endif diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 5f4f88a4d2c8..da4f9c24e42d 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3561,6 +3561,16 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, if (!cache) goto skip; + if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) { + spin_lock(&cache->lock); + if (!cache->to_copy) { + spin_unlock(&cache->lock); + ro_set = 0; + goto done; + } + spin_unlock(&cache->lock); + } + /* * Make sure that while we are scrubbing the corresponding block * group doesn't get its logical address and its device extents @@ -3692,6 +3702,12 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, scrub_pause_off(fs_info); + if (sctx->is_dev_replace && + !btrfs_finish_block_group_to_copy(dev_replace->srcdev, + cache, found_key.offset)) + ro_set = 0; + +done: down_write(&dev_replace->rwsem); dev_replace->cursor_left = dev_replace->cursor_right; dev_replace->item_needs_writeback = 1; -- cgit v1.2.3-70-g09d2 From 6143c23ccced762d21a87ef5fa421ba876231131 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:22:12 +0900 Subject: btrfs: zoned: implement cloning for zoned device-replace This is 2/4 patch to implement device replace for zoned filesystems. In zoned mode, a block group must be either copied (from the source device to the target device) or cloned (to both devices). Implement the cloning part. If a block group targeted by an IO is marked to copy, we should not clone the IO to the destination device, because the block group is eventually copied by the replace process. This commit also handles cloning of device reset. Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 57 +++++++++++++++++++++++++++++++++++++------------- fs/btrfs/volumes.c | 31 +++++++++++++++++++++++++-- fs/btrfs/zoned.c | 9 ++++++++ 3 files changed, 80 insertions(+), 17 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d976c56b3a56..cd308bf3a220 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -35,6 +35,7 @@ #include "discard.h" #include "rcu-string.h" #include "zoned.h" +#include "dev-replace.h" #undef SCRAMBLE_DELAYED_REFS @@ -1265,6 +1266,46 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, return ret; } +static int do_discard_extent(struct btrfs_bio_stripe *stripe, u64 *bytes) +{ + struct btrfs_device *dev = stripe->dev; + struct btrfs_fs_info *fs_info = dev->fs_info; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + u64 phys = stripe->physical; + u64 len = stripe->length; + u64 discarded = 0; + int ret = 0; + + /* Zone reset on a zoned filesystem */ + if (btrfs_can_zone_reset(dev, phys, len)) { + u64 src_disc; + + ret = btrfs_reset_device_zone(dev, phys, len, &discarded); + if (ret) + goto out; + + if (!btrfs_dev_replace_is_ongoing(dev_replace) || + dev != dev_replace->srcdev) + goto out; + + src_disc = discarded; + + /* Send to replace target as well */ + ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len, + &discarded); + discarded += src_disc; + } else if (blk_queue_discard(bdev_get_queue(stripe->dev->bdev))) { + ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded); + } else { + ret = 0; + *bytes = 0; + } + +out: + *bytes = discarded; + return ret; +} + int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 *actual_bytes) { @@ -1298,28 +1339,14 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, stripe = bbio->stripes; for (i = 0; i < bbio->num_stripes; i++, stripe++) { - struct btrfs_device *dev = stripe->dev; - u64 physical = stripe->physical; - u64 length = stripe->length; u64 bytes; - struct request_queue *req_q; if (!stripe->dev->bdev) { ASSERT(btrfs_test_opt(fs_info, DEGRADED)); continue; } - req_q = bdev_get_queue(stripe->dev->bdev); - /* Zone reset on zoned filesystems */ - if (btrfs_can_zone_reset(dev, physical, length)) - ret = btrfs_reset_device_zone(dev, physical, - length, &bytes); - else if (blk_queue_discard(req_q)) - ret = btrfs_issue_discard(dev->bdev, physical, - length, &bytes); - else - continue; - + ret = do_discard_extent(stripe, &bytes); if (!ret) { discarded_bytes += bytes; } else if (ret != -EOPNOTSUPP) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a4d47c6050f7..52ec6721ada2 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5973,9 +5973,29 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, return ret; } +static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical) +{ + struct btrfs_block_group *cache; + bool ret; + + /* Non-ZONED mode does not use "to_copy" flag */ + if (!btrfs_is_zoned(fs_info)) + return false; + + cache = btrfs_lookup_block_group(fs_info, logical); + + spin_lock(&cache->lock); + ret = cache->to_copy; + spin_unlock(&cache->lock); + + btrfs_put_block_group(cache); + return ret; +} + static void handle_ops_on_dev_replace(enum btrfs_map_op op, struct btrfs_bio **bbio_ret, struct btrfs_dev_replace *dev_replace, + u64 logical, int *num_stripes_ret, int *max_errors_ret) { struct btrfs_bio *bbio = *bbio_ret; @@ -5988,6 +6008,13 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op, if (op == BTRFS_MAP_WRITE) { int index_where_to_add; + /* + * A block group which have "to_copy" set will eventually + * copied by dev-replace process. We can avoid cloning IO here. + */ + if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical)) + return; + /* * duplicate the write operations while the dev replace * procedure is running. Since the copying of the old disk to @@ -6376,8 +6403,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && need_full_stripe(op)) { - handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes, - &max_errors); + handle_ops_on_dev_replace(op, &bbio, dev_replace, logical, + &num_stripes, &max_errors); } *bbio_ret = bbio; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index b2a6553b2db0..15288f766842 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -11,6 +11,7 @@ #include "disk-io.h" #include "block-group.h" #include "transaction.h" +#include "dev-replace.h" /* Maximum number of zones to report per blkdev_report_zones() call */ #define BTRFS_REPORT_NR_ZONES 4096 @@ -1036,6 +1037,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) for (i = 0; i < map->num_stripes; i++) { bool is_sequential; struct blk_zone zone; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + int dev_replace_is_ongoing = 0; device = map->stripes[i].dev; physical = map->stripes[i].physical; @@ -1062,6 +1065,12 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) */ btrfs_dev_clear_zone_empty(device, physical); + down_read(&dev_replace->rwsem); + dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); + if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) + btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical); + up_read(&dev_replace->rwsem); + /* * The group is mapped to a sequential zone. Get the zone write * pointer to determine the allocation offset within the zone. -- cgit v1.2.3-70-g09d2 From de17addce7a20db311c020fa91497a7341782d2d Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:22:13 +0900 Subject: btrfs: zoned: implement copying for zoned device-replace This is 3/4 patch to implement device-replace on zoned filesystems. This commit implements copying. To do this, it tracks the write pointer during the device replace process. As device-replace's copy process is smart enough to only copy used extents on the source device, we have to fill the gap to honor the sequential write requirement in the target device. The device-replace process on zoned filesystems must copy or clone all the extents in the source device exactly once. So, we need to ensure allocations started just before the dev-replace process to have their corresponding extent information in the B-trees. finish_extent_writes_for_zoned() implements that functionality, which basically is the removed code in the commit 042528f8d840 ("Btrfs: fix block group remaining RO forever after error during device replace"). Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/volumes.c | 2 +- fs/btrfs/zoned.c | 9 ++++++ fs/btrfs/zoned.h | 7 +++++ 4 files changed, 101 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index da4f9c24e42d..92904902d160 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -166,6 +166,7 @@ struct scrub_ctx { int pages_per_rd_bio; int is_dev_replace; + u64 write_pointer; struct scrub_bio *wr_curr_bio; struct mutex wr_lock; @@ -1619,6 +1620,25 @@ static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, return scrub_add_page_to_wr_bio(sblock->sctx, spage); } +static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) +{ + int ret = 0; + u64 length; + + if (!btrfs_is_zoned(sctx->fs_info)) + return 0; + + if (sctx->write_pointer < physical) { + length = physical - sctx->write_pointer; + + ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev, + sctx->write_pointer, length); + if (!ret) + sctx->write_pointer = physical; + } + return ret; +} + static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, struct scrub_page *spage) { @@ -1641,6 +1661,13 @@ again: if (sbio->page_count == 0) { struct bio *bio; + ret = fill_writer_pointer_gap(sctx, + spage->physical_for_dev_replace); + if (ret) { + mutex_unlock(&sctx->wr_lock); + return ret; + } + sbio->physical = spage->physical_for_dev_replace; sbio->logical = spage->logical; sbio->dev = sctx->wr_tgtdev; @@ -1702,6 +1729,9 @@ static void scrub_wr_submit(struct scrub_ctx *sctx) * doubled the write performance on spinning disks when measured * with Linux 3.5 */ btrfsic_submit_bio(sbio->bio); + + if (btrfs_is_zoned(sctx->fs_info)) + sctx->write_pointer = sbio->physical + sbio->page_count * PAGE_SIZE; } static void scrub_wr_bio_end_io(struct bio *bio) @@ -3025,6 +3055,20 @@ out: return ret < 0 ? ret : 0; } +static void sync_replace_for_zoned(struct scrub_ctx *sctx) +{ + if (!btrfs_is_zoned(sctx->fs_info)) + return; + + sctx->flush_all_writes = true; + scrub_submit(sctx); + mutex_lock(&sctx->wr_lock); + scrub_wr_submit(sctx); + mutex_unlock(&sctx->wr_lock); + + wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); +} + static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct map_lookup *map, struct btrfs_device *scrub_dev, @@ -3165,6 +3209,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, */ blk_start_plug(&plug); + if (sctx->is_dev_replace && + btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) { + mutex_lock(&sctx->wr_lock); + sctx->write_pointer = physical; + mutex_unlock(&sctx->wr_lock); + sctx->flush_all_writes = true; + } + /* * now find all extents for each stripe and scrub them */ @@ -3353,6 +3405,9 @@ again: if (ret) goto out; + if (sctx->is_dev_replace) + sync_replace_for_zoned(sctx); + if (extent_logical + extent_len < key.objectid + bytes) { if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { @@ -3475,6 +3530,25 @@ out: return ret; } +static int finish_extent_writes_for_zoned(struct btrfs_root *root, + struct btrfs_block_group *cache) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + struct btrfs_trans_handle *trans; + + if (!btrfs_is_zoned(fs_info)) + return 0; + + btrfs_wait_block_group_reservations(cache); + btrfs_wait_nocow_writers(cache); + btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, cache->length); + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + return btrfs_commit_transaction(trans); +} + static noinline_for_stack int scrub_enumerate_chunks(struct scrub_ctx *sctx, struct btrfs_device *scrub_dev, u64 start, u64 end) @@ -3629,6 +3703,16 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, * group is not RO. */ ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace); + if (!ret && sctx->is_dev_replace) { + ret = finish_extent_writes_for_zoned(root, cache); + if (ret) { + btrfs_dec_block_group_ro(cache); + scrub_pause_off(fs_info); + btrfs_put_block_group(cache); + break; + } + } + if (ret == 0) { ro_set = 1; } else if (ret == -ENOSPC && !sctx->is_dev_replace) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 52ec6721ada2..1312b17a6b49 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5978,7 +5978,7 @@ static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical) struct btrfs_block_group *cache; bool ret; - /* Non-ZONED mode does not use "to_copy" flag */ + /* Non zoned filesystem does not use "to_copy" flag */ if (!btrfs_is_zoned(fs_info)) return false; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 15288f766842..a4707bab6073 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1376,3 +1376,12 @@ void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache, ASSERT(cache->meta_write_pointer == eb->start + eb->len); cache->meta_write_pointer = eb->start; } + +int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length) +{ + if (!btrfs_dev_is_sequential(device, physical)) + return -EOPNOTSUPP; + + return blkdev_issue_zeroout(device->bdev, physical >> SECTOR_SHIFT, + length >> SECTOR_SHIFT, GFP_NOFS, 0); +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 0755a25d0f4c..5ed1ea2009ea 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -55,6 +55,7 @@ bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, struct btrfs_block_group **cache_ret); void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache, struct extent_buffer *eb); +int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -169,6 +170,12 @@ static inline void btrfs_revert_meta_write_pointer( { } +static inline int btrfs_zoned_issue_zeroout(struct btrfs_device *device, + u64 physical, u64 length) +{ + return -EOPNOTSUPP; +} + #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) -- cgit v1.2.3-70-g09d2 From 7db1c5d14dcd521bef1780b79dcc68b3968447a9 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:22:14 +0900 Subject: btrfs: zoned: support dev-replace in zoned filesystems This is 4/4 patch to implement device-replace on zoned filesystems. Even after the copying is done, the write pointers of the source device and the destination device may not be synchronized. For example, when the last allocated extent is freed before device-replace process, the extent is not copied, leaving a hole there. Synchronize the write pointers by writing zeroes to the destination device. Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 40 ++++++++++++++++++++++++++++++ fs/btrfs/zoned.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/zoned.h | 9 +++++++ 3 files changed, 123 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 92904902d160..e0c3ec01e324 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1628,6 +1628,9 @@ static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) if (!btrfs_is_zoned(sctx->fs_info)) return 0; + if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) + return 0; + if (sctx->write_pointer < physical) { length = physical - sctx->write_pointer; @@ -3069,6 +3072,32 @@ static void sync_replace_for_zoned(struct scrub_ctx *sctx) wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); } +static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical, + u64 physical, u64 physical_end) +{ + struct btrfs_fs_info *fs_info = sctx->fs_info; + int ret = 0; + + if (!btrfs_is_zoned(fs_info)) + return 0; + + wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); + + mutex_lock(&sctx->wr_lock); + if (sctx->write_pointer < physical_end) { + ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical, + physical, + sctx->write_pointer); + if (ret) + btrfs_err(fs_info, + "zoned: failed to recover write pointer"); + } + mutex_unlock(&sctx->wr_lock); + btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical); + + return ret; +} + static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct map_lookup *map, struct btrfs_device *scrub_dev, @@ -3475,6 +3504,17 @@ out: blk_finish_plug(&plug); btrfs_free_path(path); btrfs_free_path(ppath); + + if (sctx->is_dev_replace && ret >= 0) { + int ret2; + + ret2 = sync_write_pointer_for_zoned(sctx, base + offset, + map->stripes[num].physical, + physical_end); + if (ret2) + ret = ret2; + } + return ret < 0 ? ret : 0; } diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index a4707bab6073..9a5cf153da89 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -12,6 +12,7 @@ #include "block-group.h" #include "transaction.h" #include "dev-replace.h" +#include "space-info.h" /* Maximum number of zones to report per blkdev_report_zones() call */ #define BTRFS_REPORT_NR_ZONES 4096 @@ -1385,3 +1386,76 @@ int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 len return blkdev_issue_zeroout(device->bdev, physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT, GFP_NOFS, 0); } + +static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical, + struct blk_zone *zone) +{ + struct btrfs_bio *bbio = NULL; + u64 mapped_length = PAGE_SIZE; + unsigned int nofs_flag; + int nmirrors; + int i, ret; + + ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, + &mapped_length, &bbio); + if (ret || !bbio || mapped_length < PAGE_SIZE) { + btrfs_put_bbio(bbio); + return -EIO; + } + + if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) + return -EINVAL; + + nofs_flag = memalloc_nofs_save(); + nmirrors = (int)bbio->num_stripes; + for (i = 0; i < nmirrors; i++) { + u64 physical = bbio->stripes[i].physical; + struct btrfs_device *dev = bbio->stripes[i].dev; + + /* Missing device */ + if (!dev->bdev) + continue; + + ret = btrfs_get_dev_zone(dev, physical, zone); + /* Failing device */ + if (ret == -EIO || ret == -EOPNOTSUPP) + continue; + break; + } + memalloc_nofs_restore(nofs_flag); + + return ret; +} + +/* + * Synchronize write pointer in a zone at @physical_start on @tgt_dev, by + * filling zeros between @physical_pos to a write pointer of dev-replace + * source device. + */ +int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, + u64 physical_start, u64 physical_pos) +{ + struct btrfs_fs_info *fs_info = tgt_dev->fs_info; + struct blk_zone zone; + u64 length; + u64 wp; + int ret; + + if (!btrfs_dev_is_sequential(tgt_dev, physical_pos)) + return 0; + + ret = read_zone_info(fs_info, logical, &zone); + if (ret) + return ret; + + wp = physical_start + ((zone.wp - zone.start) << SECTOR_SHIFT); + + if (physical_pos == wp) + return 0; + + if (physical_pos > wp) + return -EUCLEAN; + + length = wp - physical_pos; + return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length); +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 5ed1ea2009ea..932ad9bc0de6 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -56,6 +56,8 @@ bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache, struct extent_buffer *eb); int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length); +int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, + u64 physical_start, u64 physical_pos); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -176,6 +178,13 @@ static inline int btrfs_zoned_issue_zeroout(struct btrfs_device *device, return -EOPNOTSUPP; } +static inline int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, + u64 logical, u64 physical_start, + u64 physical_pos) +{ + return -EOPNOTSUPP; +} + #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) -- cgit v1.2.3-70-g09d2 From 32430c614844169a5e5554dcbb307735ddd1f780 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:22:15 +0900 Subject: btrfs: zoned: enable relocation on a zoned filesystem Currently fallocate() is disabled on a zoned filesystem. Since current relocation process relies on preallocation to move file data extents, it must be handled differently. On a zoned filesystem, we just truncate the inode to the size that we wanted to pre-allocate. Then, we flush dirty pages on the file before finishing the relocation process. run_delalloc_zoned() will handle all the allocations and submit IOs to the underlying layers. Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 473b78874844..232d5da7b7be 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2553,6 +2553,31 @@ static noinline_for_stack int prealloc_file_extent_cluster( if (ret) return ret; + /* + * On a zoned filesystem, we cannot preallocate the file region. + * Instead, we dirty and fiemap_write the region. + */ + if (btrfs_is_zoned(inode->root->fs_info)) { + struct btrfs_root *root = inode->root; + struct btrfs_trans_handle *trans; + + end = cluster->end - offset + 1; + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode); + i_size_write(&inode->vfs_inode, end); + ret = btrfs_update_inode(trans, root, inode); + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } + + return btrfs_end_transaction(trans); + } + inode_lock(&inode->vfs_inode); for (nr = 0; nr < cluster->nr; nr++) { start = cluster->boundary[nr] - offset; @@ -2756,6 +2781,8 @@ static int relocate_file_extent_cluster(struct inode *inode, } } WARN_ON(nr != cluster->nr); + if (btrfs_is_zoned(fs_info) && !ret) + ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); out: kfree(ra); return ret; @@ -3434,8 +3461,12 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct btrfs_inode_item *item; struct extent_buffer *leaf; + u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC; int ret; + if (btrfs_is_zoned(trans->fs_info)) + flags &= ~BTRFS_INODE_PREALLOC; + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -3450,8 +3481,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, btrfs_set_inode_generation(leaf, item, 1); btrfs_set_inode_size(leaf, item, 0); btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); - btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | - BTRFS_INODE_PREALLOC); + btrfs_set_inode_flags(leaf, item, flags); btrfs_mark_buffer_dirty(leaf); out: btrfs_free_path(path); -- cgit v1.2.3-70-g09d2 From f7ef5287a63d644e62a52893af8c6cfcb5043213 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:22:16 +0900 Subject: btrfs: zoned: relocate block group to repair IO failure in zoned filesystems When a bad checksum is found and if the filesystem has a mirror of the damaged data, we read the correct data from the mirror and writes it to damaged blocks. This however, violates the sequential write constraints of a zoned block device. We can consider three methods to repair an IO failure in zoned filesystems: (1) Reset and rewrite the damaged zone (2) Allocate new device extent and replace the damaged device extent to the new extent (3) Relocate the corresponding block group Method (1) is most similar to a behavior done with regular devices. However, it also wipes non-damaged data in the same device extent, and so it unnecessary degrades non-damaged data. Method (2) is much like device replacing but done in the same device. It is safe because it keeps the device extent until the replacing finish. However, extending device replacing is non-trivial. It assumes "src_dev->physical == dst_dev->physical". Also, the extent mapping replacing function should be extended to support replacing device extent position in one device. Method (3) invokes relocation of the damaged block group and is straightforward to implement. It relocates all the mirrored device extents, so it potentially is a more costly operation than method (1) or (2). But it relocates only used extents which reduce the total IO size. Let's apply method (3) for now. In the future, we can extend device-replace and apply method (2). For protecting a block group gets relocated multiple time with multiple IO errors, this commit introduces "relocating_repair" bit to show it's now relocating to repair IO failures. Also it uses a new kthread "btrfs-relocating-repair", not to block IO path with relocating process. This commit also supports repairing in the scrub process. Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/block-group.h | 1 + fs/btrfs/extent_io.c | 3 +++ fs/btrfs/scrub.c | 3 +++ fs/btrfs/volumes.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/volumes.h | 1 + 5 files changed, 80 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index d37ee576ac6e..29678426247d 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -96,6 +96,7 @@ struct btrfs_block_group { unsigned int has_caching_ctl:1; unsigned int removed:1; unsigned int to_copy:1; + unsigned int relocating_repair:1; int disk_cache_state; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index dfa6c6106b94..4dfb3ead1175 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2260,6 +2260,9 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); BUG_ON(!mirror_num); + if (btrfs_is_zoned(fs_info)) + return btrfs_repair_one_zone(fs_info, logical); + bio = btrfs_io_bio_alloc(1); bio->bi_iter.bi_size = 0; map_length = length; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index e0c3ec01e324..310fce00fcda 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -857,6 +857,9 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) have_csum = sblock_to_check->pagev[0]->have_csum; dev = sblock_to_check->pagev[0]->dev; + if (btrfs_is_zoned(fs_info) && !sctx->is_dev_replace) + return btrfs_repair_one_zone(fs_info, logical); + /* * We must use GFP_NOFS because the scrub task might be waiting for a * worker task executing this function and in turn a transaction commit diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1312b17a6b49..b8fab44394f5 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -7980,3 +7980,75 @@ bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr) spin_unlock(&fs_info->swapfile_pins_lock); return node != NULL; } + +static int relocating_repair_kthread(void *data) +{ + struct btrfs_block_group *cache = (struct btrfs_block_group *)data; + struct btrfs_fs_info *fs_info = cache->fs_info; + u64 target; + int ret = 0; + + target = cache->start; + btrfs_put_block_group(cache); + + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { + btrfs_info(fs_info, + "zoned: skip relocating block group %llu to repair: EBUSY", + target); + return -EBUSY; + } + + mutex_lock(&fs_info->delete_unused_bgs_mutex); + + /* Ensure block group still exists */ + cache = btrfs_lookup_block_group(fs_info, target); + if (!cache) + goto out; + + if (!cache->relocating_repair) + goto out; + + ret = btrfs_may_alloc_data_chunk(fs_info, target); + if (ret < 0) + goto out; + + btrfs_info(fs_info, + "zoned: relocating block group %llu to repair IO failure", + target); + ret = btrfs_relocate_chunk(fs_info, target); + +out: + if (cache) + btrfs_put_block_group(cache); + mutex_unlock(&fs_info->delete_unused_bgs_mutex); + btrfs_exclop_finish(fs_info); + + return ret; +} + +int btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical) +{ + struct btrfs_block_group *cache; + + /* Do not attempt to repair in degraded state */ + if (btrfs_test_opt(fs_info, DEGRADED)) + return 0; + + cache = btrfs_lookup_block_group(fs_info, logical); + if (!cache) + return 0; + + spin_lock(&cache->lock); + if (cache->relocating_repair) { + spin_unlock(&cache->lock); + btrfs_put_block_group(cache); + return 0; + } + cache->relocating_repair = 1; + spin_unlock(&cache->lock); + + kthread_run(relocating_repair_kthread, cache, + "btrfs-relocating-repair"); + + return 0; +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index d3bbdb4175df..d4c3e0dd32b8 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -599,5 +599,6 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, int btrfs_bg_type_to_factor(u64 flags); const char *btrfs_bg_type_to_raid_name(u64 flags); int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info); +int btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical); #endif -- cgit v1.2.3-70-g09d2 From 6ab6ebb76042d3d94a7c6c447f770a28a412c68c Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:22:17 +0900 Subject: btrfs: split alloc_log_tree() This is a preparation patch for the next patch. Split alloc_log_tree() into two parts. The first one allocating the tree structure, remains in alloc_log_tree() and the second part allocating the tree node, which is moved into btrfs_alloc_log_tree_node(). Also export the latter part is to be used in the next patch. Reviewed-by: Josef Bacik Signed-off-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 33 +++++++++++++++++++++++++++------ fs/btrfs/disk-io.h | 2 ++ 2 files changed, 29 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6e16f556ed75..d2fa92526b3b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1254,7 +1254,6 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { struct btrfs_root *root; - struct extent_buffer *leaf; root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_NOFS); if (!root) @@ -1264,6 +1263,14 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, root->root_key.type = BTRFS_ROOT_ITEM_KEY; root->root_key.offset = BTRFS_TREE_LOG_OBJECTID; + return root; +} + +int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct extent_buffer *leaf; + /* * DON'T set SHAREABLE bit for log trees. * @@ -1276,26 +1283,33 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0, BTRFS_NESTING_NORMAL); - if (IS_ERR(leaf)) { - btrfs_put_root(root); - return ERR_CAST(leaf); - } + if (IS_ERR(leaf)) + return PTR_ERR(leaf); root->node = leaf; btrfs_mark_buffer_dirty(root->node); btrfs_tree_unlock(root->node); - return root; + + return 0; } int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { struct btrfs_root *log_root; + int ret; log_root = alloc_log_tree(trans, fs_info); if (IS_ERR(log_root)) return PTR_ERR(log_root); + + ret = btrfs_alloc_log_tree_node(trans, log_root); + if (ret) { + btrfs_put_root(log_root); + return ret; + } + WARN_ON(fs_info->log_root_tree); fs_info->log_root_tree = log_root; return 0; @@ -1307,11 +1321,18 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *log_root; struct btrfs_inode_item *inode_item; + int ret; log_root = alloc_log_tree(trans, fs_info); if (IS_ERR(log_root)) return PTR_ERR(log_root); + ret = btrfs_alloc_log_tree_node(trans, log_root); + if (ret) { + btrfs_put_root(log_root); + return ret; + } + log_root->last_trans = trans->transid; log_root->root_key.offset = root->root_key.objectid; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 9f4a2a1e3d36..0e7e9526b6a8 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -120,6 +120,8 @@ blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, extent_submit_bio_start_t *submit_bio_start); blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio, int mirror_num); +int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_add_log_tree(struct btrfs_trans_handle *trans, -- cgit v1.2.3-70-g09d2 From 40ab3be102f0a61dbb93093f330b432324a793f1 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:22:18 +0900 Subject: btrfs: zoned: extend zoned allocator to use dedicated tree-log block group This is the 1/3 patch to enable tree log on zoned filesystems. The tree-log feature does not work on a zoned filesystem as is. Blocks for a tree-log tree are allocated mixed with other metadata blocks and btrfs writes and syncs the tree-log blocks to devices at the time of fsync(), which has a different timing than a global transaction commit. As a result, both writing tree-log blocks and writing other metadata blocks become non-sequential writes that zoned filesystems must avoid. Introduce a dedicated block group for tree-log blocks, so that tree-log blocks and other metadata blocks can be separate write streams. As a result, each write stream can now be written to devices separately. "fs_info->treelog_bg" tracks the dedicated block group and assigns "treelog_bg" on-demand on tree-log block allocation time. This commit extends the zoned block allocator to use the block group. Reviewed-by: Josef Bacik Signed-off-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 2 ++ fs/btrfs/ctree.h | 2 ++ fs/btrfs/disk-io.c | 1 + fs/btrfs/extent-tree.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++--- fs/btrfs/zoned.h | 14 ++++++++++ 5 files changed, 90 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index f5e9f560ce6d..5064be59dac5 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -901,6 +901,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, btrfs_return_cluster_to_free_space(block_group, cluster); spin_unlock(&cluster->refill_lock); + btrfs_clear_treelog_bg(block_group); + path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1bb4f767966a..6f4b493625ef 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -976,6 +976,8 @@ struct btrfs_fs_info { /* Max size to emit ZONE_APPEND write command */ u64 max_zone_append_size; struct mutex zoned_meta_io_lock; + spinlock_t treelog_bg_lock; + u64 treelog_bg; #ifdef CONFIG_BTRFS_FS_REF_VERIFY spinlock_t ref_verify_lock; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d2fa92526b3b..84c6650d5ef7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2787,6 +2787,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) spin_lock_init(&fs_info->super_lock); spin_lock_init(&fs_info->buffer_lock); spin_lock_init(&fs_info->unused_bgs_lock); + spin_lock_init(&fs_info->treelog_bg_lock); rwlock_init(&fs_info->tree_mod_log_lock); mutex_init(&fs_info->unused_bg_unpin_mutex); mutex_init(&fs_info->delete_unused_bgs_mutex); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index cd308bf3a220..78ad31a59e59 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3497,6 +3497,9 @@ struct find_free_extent_ctl { bool have_caching_bg; bool orig_have_caching_bg; + /* Allocation is called for tree-log */ + bool for_treelog; + /* RAID index, converted from flags */ int index; @@ -3725,6 +3728,22 @@ static int do_allocation_clustered(struct btrfs_block_group *block_group, return find_free_extent_unclustered(block_group, ffe_ctl); } +/* + * Tree-log block group locking + * ============================ + * + * fs_info::treelog_bg_lock protects the fs_info::treelog_bg which + * indicates the starting address of a block group, which is reserved only + * for tree-log metadata. + * + * Lock nesting + * ============ + * + * space_info::lock + * block_group::lock + * fs_info::treelog_bg_lock + */ + /* * Simple allocator for sequential-only block group. It only allows sequential * allocation. No need to play with trees. This function also reserves the @@ -3734,23 +3753,54 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, struct find_free_extent_ctl *ffe_ctl, struct btrfs_block_group **bg_ret) { + struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_space_info *space_info = block_group->space_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; u64 start = block_group->start; u64 num_bytes = ffe_ctl->num_bytes; u64 avail; + u64 bytenr = block_group->start; + u64 log_bytenr; int ret = 0; + bool skip; ASSERT(btrfs_is_zoned(block_group->fs_info)); + /* + * Do not allow non-tree-log blocks in the dedicated tree-log block + * group, and vice versa. + */ + spin_lock(&fs_info->treelog_bg_lock); + log_bytenr = fs_info->treelog_bg; + skip = log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) || + (!ffe_ctl->for_treelog && bytenr == log_bytenr)); + spin_unlock(&fs_info->treelog_bg_lock); + if (skip) + return 1; + spin_lock(&space_info->lock); spin_lock(&block_group->lock); + spin_lock(&fs_info->treelog_bg_lock); + + ASSERT(!ffe_ctl->for_treelog || + block_group->start == fs_info->treelog_bg || + fs_info->treelog_bg == 0); if (block_group->ro) { ret = 1; goto out; } + /* + * Do not allow currently using block group to be tree-log dedicated + * block group. + */ + if (ffe_ctl->for_treelog && !fs_info->treelog_bg && + (block_group->used || block_group->reserved)) { + ret = 1; + goto out; + } + avail = block_group->length - block_group->alloc_offset; if (avail < num_bytes) { if (ffe_ctl->max_extent_size < avail) { @@ -3765,6 +3815,9 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, goto out; } + if (ffe_ctl->for_treelog && !fs_info->treelog_bg) + fs_info->treelog_bg = block_group->start; + ffe_ctl->found_offset = start + block_group->alloc_offset; block_group->alloc_offset += num_bytes; spin_lock(&ctl->tree_lock); @@ -3779,6 +3832,9 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, ffe_ctl->search_start = ffe_ctl->found_offset; out: + if (ret && ffe_ctl->for_treelog) + fs_info->treelog_bg = 0; + spin_unlock(&fs_info->treelog_bg_lock); spin_unlock(&block_group->lock); spin_unlock(&space_info->lock); return ret; @@ -4028,7 +4084,12 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info, return prepare_allocation_clustered(fs_info, ffe_ctl, space_info, ins); case BTRFS_EXTENT_ALLOC_ZONED: - /* Nothing to do */ + if (ffe_ctl->for_treelog) { + spin_lock(&fs_info->treelog_bg_lock); + if (fs_info->treelog_bg) + ffe_ctl->hint_byte = fs_info->treelog_bg; + spin_unlock(&fs_info->treelog_bg_lock); + } return 0; default: BUG(); @@ -4072,6 +4133,7 @@ static noinline int find_free_extent(struct btrfs_root *root, struct find_free_extent_ctl ffe_ctl = {0}; struct btrfs_space_info *space_info; bool full_search = false; + bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); WARN_ON(num_bytes < fs_info->sectorsize); @@ -4085,6 +4147,7 @@ static noinline int find_free_extent(struct btrfs_root *root, ffe_ctl.orig_have_caching_bg = false; ffe_ctl.found_offset = 0; ffe_ctl.hint_byte = hint_byte_orig; + ffe_ctl.for_treelog = for_treelog; ffe_ctl.policy = BTRFS_EXTENT_ALLOC_CLUSTERED; /* For clustered allocation */ @@ -4159,8 +4222,11 @@ search: struct btrfs_block_group *bg_ret; /* If the block group is read-only, we can skip it entirely. */ - if (unlikely(block_group->ro)) + if (unlikely(block_group->ro)) { + if (for_treelog) + btrfs_clear_treelog_bg(block_group); continue; + } btrfs_grab_block_group(block_group, delalloc); ffe_ctl.search_start = block_group->start; @@ -4346,6 +4412,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, bool final_tried = num_bytes == min_alloc_size; u64 flags; int ret; + bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); flags = get_alloc_profile_by_root(root, is_data); again: @@ -4369,8 +4436,8 @@ again: sinfo = btrfs_find_space_info(fs_info, flags); btrfs_err(fs_info, - "allocation failed flags %llu, wanted %llu", - flags, num_bytes); + "allocation failed flags %llu, wanted %llu tree-log %d", + flags, num_bytes, for_treelog); if (sinfo) btrfs_dump_space_info(fs_info, sinfo, num_bytes, 1); diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 932ad9bc0de6..61e969652fe1 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -7,6 +7,7 @@ #include #include "volumes.h" #include "disk-io.h" +#include "block-group.h" struct btrfs_zoned_device_info { /* @@ -290,4 +291,17 @@ static inline void btrfs_zoned_meta_io_unlock(struct btrfs_fs_info *fs_info) mutex_unlock(&fs_info->zoned_meta_io_lock); } +static inline void btrfs_clear_treelog_bg(struct btrfs_block_group *bg) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + + if (!btrfs_is_zoned(fs_info)) + return; + + spin_lock(&fs_info->treelog_bg_lock); + if (fs_info->treelog_bg == bg->start) + fs_info->treelog_bg = 0; + spin_unlock(&fs_info->treelog_bg_lock); +} + #endif -- cgit v1.2.3-70-g09d2 From fa1a0f42a0356846fb1acd1d53061d53413a4c45 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:22:19 +0900 Subject: btrfs: zoned: serialize log transaction on zoned filesystems This is the 2/3 patch to enable tree-log on zoned filesystems. Since we can start more than one log transactions per subvolume simultaneously, nodes from multiple transactions can be allocated interleaved. Such mixed allocation results in non-sequential writes at the time of a log transaction commit. The nodes of the global log root tree (fs_info->log_root_tree), also have the same problem with mixed allocation. Serializes log transactions by waiting for a committing transaction when someone tries to start a new transaction, to avoid the mixed allocation problem. We must also wait for running log transactions from another subvolume, but there is no easy way to detect which subvolume root is running a log transaction. So, this patch forbids starting a new log transaction when other subvolumes already allocated the global log root tree. Reviewed-by: Josef Bacik Reviewed-by: Filipe Manana Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c02eeeac439c..4e72794342c0 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -105,6 +105,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_path *path, u64 dirid, int del_all); +static void wait_log_commit(struct btrfs_root *root, int transid); /* * tree logging is a special write ahead log used to make sure that @@ -140,7 +141,9 @@ static int start_log_trans(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *tree_root = fs_info->tree_root; + const bool zoned = btrfs_is_zoned(fs_info); int ret = 0; + bool created = false; /* * First check if the log root tree was already created. If not, create @@ -150,8 +153,10 @@ static int start_log_trans(struct btrfs_trans_handle *trans, mutex_lock(&tree_root->log_mutex); if (!fs_info->log_root_tree) { ret = btrfs_init_log_root_tree(trans, fs_info); - if (!ret) + if (!ret) { set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state); + created = true; + } } mutex_unlock(&tree_root->log_mutex); if (ret) @@ -160,12 +165,20 @@ static int start_log_trans(struct btrfs_trans_handle *trans, mutex_lock(&root->log_mutex); +again: if (root->log_root) { + int index = (root->log_transid + 1) % 2; + if (btrfs_need_log_full_commit(trans)) { ret = -EAGAIN; goto out; } + if (zoned && atomic_read(&root->log_commit[index])) { + wait_log_commit(root, root->log_transid - 1); + goto again; + } + if (!root->log_start_pid) { clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); root->log_start_pid = current->pid; @@ -173,6 +186,17 @@ static int start_log_trans(struct btrfs_trans_handle *trans, set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); } } else { + /* + * This means fs_info->log_root_tree was already created + * for some other FS trees. Do the full commit not to mix + * nodes from multiple log transactions to do sequential + * writing. + */ + if (zoned && !created) { + ret = -EAGAIN; + goto out; + } + ret = btrfs_add_log_tree(trans, root); if (ret) goto out; @@ -201,14 +225,22 @@ out: */ static int join_running_log_trans(struct btrfs_root *root) { + const bool zoned = btrfs_is_zoned(root->fs_info); int ret = -ENOENT; if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state)) return ret; mutex_lock(&root->log_mutex); +again: if (root->log_root) { + int index = (root->log_transid + 1) % 2; + ret = 0; + if (zoned && atomic_read(&root->log_commit[index])) { + wait_log_commit(root, root->log_transid - 1); + goto again; + } atomic_inc(&root->log_writers); } mutex_unlock(&root->log_mutex); -- cgit v1.2.3-70-g09d2 From 3ddebf27fcd3a910989c85a3bfc9085225038c5b Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:22:20 +0900 Subject: btrfs: zoned: reorder log node allocation on zoned filesystem This is the 3/3 patch to enable tree-log on zoned filesystems. The allocation order of nodes of "fs_info->log_root_tree" and nodes of "root->log_root" is not the same as the writing order of them. So, the writing causes unaligned write errors. Reorder the allocation of them by delaying allocation of the root node of "fs_info->log_root_tree," so that the node buffers can go out sequentially to devices. Cc: Filipe Manana Reviewed-by: Josef Bacik Signed-off-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 12 +++++++----- fs/btrfs/tree-log.c | 27 +++++++++++++++++++++------ 2 files changed, 28 insertions(+), 11 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 84c6650d5ef7..c2576c5fe62e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1298,16 +1298,18 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { struct btrfs_root *log_root; - int ret; log_root = alloc_log_tree(trans, fs_info); if (IS_ERR(log_root)) return PTR_ERR(log_root); - ret = btrfs_alloc_log_tree_node(trans, log_root); - if (ret) { - btrfs_put_root(log_root); - return ret; + if (!btrfs_is_zoned(fs_info)) { + int ret = btrfs_alloc_log_tree_node(trans, log_root); + + if (ret) { + btrfs_put_root(log_root); + return ret; + } } WARN_ON(fs_info->log_root_tree); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 4e72794342c0..fc04625cbbd1 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3162,6 +3162,19 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); root_log_ctx.log_transid = log_root_tree->log_transid; + if (btrfs_is_zoned(fs_info)) { + mutex_lock(&fs_info->tree_root->log_mutex); + if (!log_root_tree->node) { + ret = btrfs_alloc_log_tree_node(trans, log_root_tree); + if (ret) { + mutex_unlock(&fs_info->tree_log_mutex); + mutex_unlock(&log_root_tree->log_mutex); + goto out; + } + } + mutex_unlock(&fs_info->tree_root->log_mutex); + } + /* * Now we are safe to update the log_root_tree because we're under the * log_mutex, and we're a current writer so we're holding the commit @@ -3320,12 +3333,14 @@ static void free_log_tree(struct btrfs_trans_handle *trans, .process_func = process_one_buffer }; - ret = walk_log_tree(trans, log, &wc); - if (ret) { - if (trans) - btrfs_abort_transaction(trans, ret); - else - btrfs_handle_fs_error(log->fs_info, ret, NULL); + if (log->node) { + ret = walk_log_tree(trans, log, &wc); + if (ret) { + if (trans) + btrfs_abort_transaction(trans, ret); + else + btrfs_handle_fs_error(log->fs_info, ret, NULL); + } } clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1, -- cgit v1.2.3-70-g09d2 From b528f467132713a03984b0f9592073d75677c501 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Fri, 5 Feb 2021 23:58:36 +0900 Subject: btrfs: zoned: deal with holes writing out tree-log pages Since the zoned filesystem requires sequential write out of metadata, we cannot proceed with a hole in tree-log pages. When such a hole exists, btree_write_cache_pages() will return -EAGAIN. This happens when someone, e.g., a concurrent transaction commit, writes a dirty extent in this tree-log commit. If we are not going to wait for the extents, we can hope the concurrent writing fills the hole for us. So, we can ignore the error in this case and hope the next write will succeed. If we want to wait for them and got the error, we cannot wait for them because it will cause a deadlock. So, let's bail out to a full commit in this case. Reviewed-by: Filipe Manana Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index fc04625cbbd1..d90695c1ab6c 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3120,6 +3120,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, */ blk_start_plug(&plug); ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark); + /* + * -EAGAIN happens when someone, e.g., a concurrent transaction + * commit, writes a dirty extent in this tree-log commit. This + * concurrent write will create a hole writing out the extents, + * and we cannot proceed on a zoned filesystem, requiring + * sequential writing. While we can bail out to a full commit + * here, but we can continue hoping the concurrent writing fills + * the hole. + */ + if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) + ret = 0; if (ret) { blk_finish_plug(&plug); btrfs_abort_transaction(trans, ret); @@ -3242,7 +3253,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, &log_root_tree->dirty_log_pages, EXTENT_DIRTY | EXTENT_NEW); blk_finish_plug(&plug); - if (ret) { + /* + * As described above, -EAGAIN indicates a hole in the extents. We + * cannot wait for these write outs since the waiting cause a + * deadlock. Bail out to the full commit instead. + */ + if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) { + btrfs_set_log_full_commit(trans); + btrfs_wait_tree_log_extents(log, mark); + mutex_unlock(&log_root_tree->log_mutex); + goto out_wake_log_root; + } else if (ret) { btrfs_set_log_full_commit(trans); btrfs_abort_transaction(trans, ret); mutex_unlock(&log_root_tree->log_mutex); -- cgit v1.2.3-70-g09d2 From 9d294a685fbcb256ce8c5f7fd88a7596d0f52a8a Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:22:21 +0900 Subject: btrfs: zoned: enable to mount ZONED incompat flag This final patch adds the ZONED incompat flag to the supported flags and enables to mount ZONED flagged file system. Reviewed-by: Anand Jain Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6f4b493625ef..3bc00aed13b2 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -298,7 +298,8 @@ struct btrfs_super_block { BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ - BTRFS_FEATURE_INCOMPAT_RAID1C34) + BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ + BTRFS_FEATURE_INCOMPAT_ZONED) #define BTRFS_FEATURE_INCOMPAT_SAFE_SET \ (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) -- cgit v1.2.3-70-g09d2