From 9b378f6ad48cfa195ed868db9123c09ee7ec5ea2 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sun, 13 Aug 2023 12:34:08 +0100 Subject: btrfs: fix infinite directory reads The readdir implementation currently processes always up to the last index it finds. This however can result in an infinite loop if the directory has a large number of entries such that they won't all fit in the given buffer passed to the readdir callback, that is, dir_emit() returns a non-zero value. Because in that case readdir() will be called again and if in the meanwhile new directory entries were added and we still can't put all the remaining entries in the buffer, we keep repeating this over and over. The following C program and test script reproduce the problem: $ cat /mnt/readdir_prog.c #include <sys/types.h> #include <dirent.h> #include <stdio.h> int main(int argc, char *argv[]) { DIR *dir = opendir("."); struct dirent *dd; while ((dd = readdir(dir))) { printf("%s\n", dd->d_name); rename(dd->d_name, "TEMPFILE"); rename("TEMPFILE", dd->d_name); } closedir(dir); } $ gcc -o /mnt/readdir_prog /mnt/readdir_prog.c $ cat test.sh #!/bin/bash DEV=/dev/sdi MNT=/mnt/sdi mkfs.btrfs -f $DEV &> /dev/null #mkfs.xfs -f $DEV &> /dev/null #mkfs.ext4 -F $DEV &> /dev/null mount $DEV $MNT mkdir $MNT/testdir for ((i = 1; i <= 2000; i++)); do echo -n > $MNT/testdir/file_$i done cd $MNT/testdir /mnt/readdir_prog cd /mnt umount $MNT This behaviour is surprising to applications and it's unlike ext4, xfs, tmpfs, vfat and other filesystems, which always finish. In this case where new entries were added due to renames, some file names may be reported more than once, but this varies according to each filesystem - for example ext4 never reported the same file more than once while xfs reports the first 13 file names twice. So change our readdir implementation to track the last index number when opendir() is called and then make readdir() never process beyond that index number. This gives the same behaviour as ext4. 
Reported-by: Rob Landley Link: https://lore.kernel.org/linux-btrfs/2c8c55ec-04c6-e0dc-9c5c-8c7924778c35@landley.net/ Link: https://bugzilla.kernel.org/show_bug.cgi?id=217681 CC: stable@vger.kernel.org # 6.4+ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 + fs/btrfs/delayed-inode.c | 5 +- fs/btrfs/delayed-inode.h | 1 + fs/btrfs/inode.c | 131 ++++++++++++++++++++++++++++------------------- 4 files changed, 84 insertions(+), 54 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f2d2b313bde5..9419f4e37a58 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -443,6 +443,7 @@ struct btrfs_drop_extents_args { struct btrfs_file_private { void *filldir_buf; + u64 last_index; struct extent_state *llseek_cached_state; }; diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 6b457b010cbc..6d51db066503 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1632,6 +1632,7 @@ int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode) } bool btrfs_readdir_get_delayed_items(struct inode *inode, + u64 last_index, struct list_head *ins_list, struct list_head *del_list) { @@ -1651,14 +1652,14 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode, mutex_lock(&delayed_node->mutex); item = __btrfs_first_delayed_insertion_item(delayed_node); - while (item) { + while (item && item->index <= last_index) { refcount_inc(&item->refs); list_add_tail(&item->readdir_list, ins_list); item = __btrfs_next_delayed_item(item); } item = __btrfs_first_delayed_deletion_item(delayed_node); - while (item) { + while (item && item->index <= last_index) { refcount_inc(&item->refs); list_add_tail(&item->readdir_list, del_list); item = __btrfs_next_delayed_item(item); diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 4f21daa3dbc7..dc1085b2a397 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -148,6 +148,7 @@ void 
btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info); /* Used for readdir() */ bool btrfs_readdir_get_delayed_items(struct inode *inode, + u64 last_index, struct list_head *ins_list, struct list_head *del_list); void btrfs_readdir_put_delayed_items(struct inode *inode, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9055e19b01ef..aa090b0b5d29 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5872,6 +5872,74 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, return d_splice_alias(inode, dentry); } +/* + * Find the highest existing sequence number in a directory and then set the + * in-memory index_cnt variable to the first free sequence number. + */ +static int btrfs_set_inode_index_count(struct btrfs_inode *inode) +{ + struct btrfs_root *root = inode->root; + struct btrfs_key key, found_key; + struct btrfs_path *path; + struct extent_buffer *leaf; + int ret; + + key.objectid = btrfs_ino(inode); + key.type = BTRFS_DIR_INDEX_KEY; + key.offset = (u64)-1; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + /* FIXME: we should be able to handle this */ + if (ret == 0) + goto out; + ret = 0; + + if (path->slots[0] == 0) { + inode->index_cnt = BTRFS_DIR_START_INDEX; + goto out; + } + + path->slots[0]--; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.objectid != btrfs_ino(inode) || + found_key.type != BTRFS_DIR_INDEX_KEY) { + inode->index_cnt = BTRFS_DIR_START_INDEX; + goto out; + } + + inode->index_cnt = found_key.offset + 1; +out: + btrfs_free_path(path); + return ret; +} + +static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index) +{ + if (dir->index_cnt == (u64)-1) { + int ret; + + ret = btrfs_inode_delayed_dir_index_count(dir); + if (ret) { + ret = btrfs_set_inode_index_count(dir); + if (ret) + return ret; + } + } + + *index = dir->index_cnt; + + 
return 0; +} + /* * All this infrastructure exists because dir_emit can fault, and we are holding * the tree lock when doing readdir. For now just allocate a buffer and copy @@ -5884,10 +5952,17 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, static int btrfs_opendir(struct inode *inode, struct file *file) { struct btrfs_file_private *private; + u64 last_index; + int ret; + + ret = btrfs_get_dir_last_index(BTRFS_I(inode), &last_index); + if (ret) + return ret; private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL); if (!private) return -ENOMEM; + private->last_index = last_index; private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL); if (!private->filldir_buf) { kfree(private); @@ -5954,7 +6029,8 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) INIT_LIST_HEAD(&ins_list); INIT_LIST_HEAD(&del_list); - put = btrfs_readdir_get_delayed_items(inode, &ins_list, &del_list); + put = btrfs_readdir_get_delayed_items(inode, private->last_index, + &ins_list, &del_list); again: key.type = BTRFS_DIR_INDEX_KEY; @@ -5972,6 +6048,8 @@ again: break; if (found_key.offset < ctx->pos) continue; + if (found_key.offset > private->last_index) + break; if (btrfs_should_delete_dir_index(&del_list, found_key.offset)) continue; di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); @@ -6107,57 +6185,6 @@ static int btrfs_update_time(struct inode *inode, struct timespec64 *now, return dirty ? 
btrfs_dirty_inode(BTRFS_I(inode)) : 0; } -/* - * find the highest existing sequence number in a directory - * and then set the in-memory index_cnt variable to reflect - * free sequence numbers - */ -static int btrfs_set_inode_index_count(struct btrfs_inode *inode) -{ - struct btrfs_root *root = inode->root; - struct btrfs_key key, found_key; - struct btrfs_path *path; - struct extent_buffer *leaf; - int ret; - - key.objectid = btrfs_ino(inode); - key.type = BTRFS_DIR_INDEX_KEY; - key.offset = (u64)-1; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; - /* FIXME: we should be able to handle this */ - if (ret == 0) - goto out; - ret = 0; - - if (path->slots[0] == 0) { - inode->index_cnt = BTRFS_DIR_START_INDEX; - goto out; - } - - path->slots[0]--; - - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - - if (found_key.objectid != btrfs_ino(inode) || - found_key.type != BTRFS_DIR_INDEX_KEY) { - inode->index_cnt = BTRFS_DIR_START_INDEX; - goto out; - } - - inode->index_cnt = found_key.offset + 1; -out: - btrfs_free_path(path); - return ret; -} - /* * helper to find a free sequence number in a given directory. This current * code is very simple, later versions will do smarter things in the btree -- cgit v1.2.3-70-g09d2 From b471965fdb2daa225850e5972d86600992fa398e Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Fri, 28 Jul 2023 14:48:13 +0800 Subject: btrfs: fix replace/scrub failure with metadata_uuid Fstests with POST_MKFS_CMD="btrfstune -m" (as in the mailing list) reported a few of the test cases failing. 
The failure scenario can be summarized and simplified as follows: $ mkfs.btrfs -fq -draid1 -mraid1 /dev/sdb1 /dev/sdb2 :0 $ btrfstune -m /dev/sdb1 :0 $ wipefs -a /dev/sdb1 :0 $ mount -o degraded /dev/sdb2 /btrfs :0 $ btrfs replace start -B -f -r 1 /dev/sdb1 /btrfs :1 STDERR: ERROR: ioctl(DEV_REPLACE_START) failed on "/btrfs": Input/output error [11290.583502] BTRFS warning (device sdb2): tree block 22036480 mirror 2 has bad fsid, has 99835c32-49f0-4668-9e66-dc277a96b4a6 want da40350c-33ac-4872-92a8-4948ed8c04d0 [11290.586580] BTRFS error (device sdb2): unable to fix up (regular) error at logical 22020096 on dev /dev/sdb8 physical 1048576 As above, the replace is failing because we are verifying the header with fs_devices::fsid instead of fs_devices::metadata_uuid, despite the metadata_uuid actually being present. To fix this, use fs_devices::metadata_uuid. We copy fsid into fs_devices::metadata_uuid if there is no metadata_uuid, so it's fine. Fixes: a3ddbaebc7c9 ("btrfs: scrub: introduce a helper to verify one metadata block") CC: stable@vger.kernel.org # 6.4+ Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 2c7fdbb60314..2aa43d2094de 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -605,7 +605,8 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr btrfs_stack_header_bytenr(header), logical); return; } - if (memcmp(header->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE) != 0) { + if (memcmp(header->fsid, fs_info->fs_devices->metadata_uuid, + BTRFS_FSID_SIZE) != 0) { bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, -- cgit v1.2.3-70-g09d2 From 09c3717c3a60e3ef599bc17c70cd3ae2b979ad41 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 1 Aug 2023 09:28:28 -0700 
Subject: btrfs: only subtract from len_to_oe_boundary when it is tracking an extent bio_ctrl->len_to_oe_boundary is used to make sure we stay inside a zone as we submit bios for writes. Every time we add a page to the bio, we decrement those bytes from len_to_oe_boundary, and then we submit the bio if we happen to hit zero. Most of the time, len_to_oe_boundary gets set to U32_MAX. submit_extent_page() adds pages into our bio, and the size of the bio ends up limited by: - Are we contiguous on disk? - Does bio_add_page() allow us to stuff more in? - is len_to_oe_boundary > 0? The len_to_oe_boundary math starts with U32_MAX, which isn't page or sector aligned, and subtracts from it until it hits zero. In the non-zoned case, the last IO we submit before we hit zero is going to be unaligned, triggering BUGs. This is hard to trigger because bio_add_page() isn't going to make a bio of U32_MAX size unless you give it a perfect set of pages and fully contiguous extents on disk. We can hit it pretty reliably while making large swapfiles during provisioning because the machine is freshly booted, mostly idle, and the disk is freshly formatted. It's also possible to trigger with reads when read_ahead_kb is set to 4GB. The code has been cleaned up and shifted around a few times, but this flaw has been lurking since the counter was added. I think the commit 24e6c8082208 ("btrfs: simplify main loop in submit_extent_page") ended up exposing the bug. The fix used here is to skip doing math on len_to_oe_boundary unless we've changed it from the default U32_MAX value. bio_add_page() is the real limit we want, and there's no reason to do extra math when block layer is doing it for us. 
Sample reproducer, note you'll need to change the path to the bdi and device: SUBVOL=/btrfs/swapvol SWAPFILE=$SUBVOL/swapfile SZMB=8192 mkfs.btrfs -f /dev/vdb mount /dev/vdb /btrfs btrfs subvol create $SUBVOL chattr +C $SUBVOL dd if=/dev/zero of=$SWAPFILE bs=1M count=$SZMB sync echo 4 > /proc/sys/vm/drop_caches echo 4194304 > /sys/class/bdi/btrfs-2/read_ahead_kb while true; do echo 1 > /proc/sys/vm/drop_caches echo 1 > /proc/sys/vm/drop_caches dd of=/dev/zero if=$SWAPFILE bs=4096M count=2 iflag=fullblock done Fixes: 24e6c8082208 ("btrfs: simplify main loop in submit_extent_page") CC: stable@vger.kernel.org # 6.4+ Reviewed-by: Sweet Tea Dorminy Reviewed-by: Christoph Hellwig Reviewed-by: Qu Wenruo Signed-off-by: Chris Mason Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ca765d62324f..90ad3006ef3a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -902,7 +902,30 @@ static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl, size -= len; pg_offset += len; disk_bytenr += len; - bio_ctrl->len_to_oe_boundary -= len; + + /* + * len_to_oe_boundary defaults to U32_MAX, which isn't page or + * sector aligned. alloc_new_bio() then sets it to the end of + * our ordered extent for writes into zoned devices. + * + * When len_to_oe_boundary is tracking an ordered extent, we + * trust the ordered extent code to align things properly, and + * the check above to cap our write to the ordered extent + * boundary is correct. + * + * When len_to_oe_boundary is U32_MAX, the cap above would + * result in a 4095 byte IO for the last page right before + * we hit the bio limit of UINT_MAX. bio_add_page() has all + * the checks required to make sure we don't overflow the bio, + * and we should just ignore len_to_oe_boundary completely + * unless we're using it to track an ordered extent. 
+ * + * It's pretty hard to make a bio sized U32_MAX, but it can + * happen when the page cache is able to feed us contiguous + * pages for large extents. + */ + if (bio_ctrl->len_to_oe_boundary != U32_MAX) + bio_ctrl->len_to_oe_boundary -= len; /* Ordered extent boundary: move on to a new bio. */ if (bio_ctrl->len_to_oe_boundary == 0) -- cgit v1.2.3-70-g09d2 From 29eefa6d0d07e185f7bfe9576f91e6dba98189c2 Mon Sep 17 00:00:00 2001 From: xiaoshoukui Date: Tue, 15 Aug 2023 02:55:59 -0400 Subject: btrfs: fix BUG_ON condition in btrfs_cancel_balance Pausing and canceling balance can race to interrupt balance, leading to a BUG_ON panic in btrfs_cancel_balance. The BUG_ON condition in btrfs_cancel_balance does not take this race scenario into account. However, the race condition has no other side effects. We can fix that. It can be reproduced, with a panic trace like this: kernel BUG at fs/btrfs/volumes.c:4618! RIP: 0010:btrfs_cancel_balance+0x5cf/0x6a0 Call Trace: ? do_nanosleep+0x60/0x120 ? hrtimer_nanosleep+0xb7/0x1a0 ? 
sched_core_clone_cookie+0x70/0x70 btrfs_ioctl_balance_ctl+0x55/0x70 btrfs_ioctl+0xa46/0xd20 __x64_sys_ioctl+0x7d/0xa0 do_syscall_64+0x38/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd Race scenario as follows: > mutex_unlock(&fs_info->balance_mutex); > -------------------- > .......issue pause and cancel req in another thread > -------------------- > ret = __btrfs_balance(fs_info); > > mutex_lock(&fs_info->balance_mutex); > if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) { > btrfs_info(fs_info, "balance: paused"); > btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED); > } CC: stable@vger.kernel.org # 4.19+ Signed-off-by: xiaoshoukui Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index efa19a528c33..a3085c7daaf7 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4641,8 +4641,7 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) } } - BUG_ON(fs_info->balance_ctl || - test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); + ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); atomic_dec(&fs_info->balance_cancel_req); mutex_unlock(&fs_info->balance_mutex); return 0; -- cgit v1.2.3-70-g09d2 From c962098ca4af146f2625ed64399926a098752c9c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 17 Aug 2023 16:57:30 -0400 Subject: btrfs: fix incorrect splitting in btrfs_drop_extent_map_range In production we were seeing a variety of WARN_ON()'s in the extent_map code, specifically in btrfs_drop_extent_map_range() when we have to call add_extent_mapping() for our second split. Consider the following extent map layout PINNED [0 16K) [32K, 48K) and then we call btrfs_drop_extent_map_range for [0, 36K), with skip_pinned == true. 
The initial loop will have start = 0 end = 36K len = 36K we will find the [0, 16K) extent, but since it is pinned we will skip it, which has this code start = em_end; if (end != (u64)-1) len = start + len - em_end; em_end here is 16K, so now the values are start = 16K len = 16K + 36K - 16K = 36K len should instead be 20K. This is a problem when we find the next extent at [32K, 48K), we need to split this extent to leave [36K, 48K), however the code for the split looks like this split->start = start + len; split->len = em_end - (start + len); In this case we have em_end = 48K split->start = 16K + 36K // this should be 16K + 20K split->len = 48K - (16K + 36K) // this overflows as 16K + 36K is 52K and now we have an invalid extent_map in the tree that potentially overlaps other entries in the extent map. Even in the non-overlapping case we will have split->start set improperly, which will cause problems with any block related calculations. We don't actually need len in this loop, we can simply use end as our end point, and only adjust start up when we find a pinned extent we need to skip. Adjust the logic to do this, which keeps us from inserting an invalid extent map. We only skip_pinned in the relocation case, so this is relatively rare, except in the case where you are running relocation a lot, which can happen with auto relocation on. 
Fixes: 55ef68990029 ("Btrfs: Fix btrfs_drop_extent_cache for skip pinned case") CC: stable@vger.kernel.org # 4.14+ Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 0cdb3e86f29b..a6d8368ed0ed 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -760,8 +760,6 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { start = em_end; - if (end != (u64)-1) - len = start + len - em_end; goto next; } @@ -829,8 +827,8 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, if (!split) goto remove_em; } - split->start = start + len; - split->len = em_end - (start + len); + split->start = end; + split->len = em_end - end; split->block_start = em->block_start; split->flags = flags; split->compress_type = em->compress_type; -- cgit v1.2.3-70-g09d2