From c346c629765ab982967017e2ae859156d0e235cf Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Wed, 28 Aug 2024 19:14:11 +0300 Subject: btrfs: qgroup: don't use extent changeset when not needed The local extent changeset is passed to clear_record_extent_bits() where it may have some additional memory dynamically allocated for ulist. When qgroup is disabled, the memory is leaked because in this case the changeset is not released upon __btrfs_qgroup_release_data() return. Since the recorded contents of the changeset are not used thereafter, just don't pass it. Found by Linux Verification Center (linuxtesting.org) with Syzkaller. Reported-by: syzbot+81670362c283f3dd889c@syzkaller.appspotmail.com Closes: https://lore.kernel.org/lkml/000000000000aa8c0c060ade165e@google.com Fixes: af0e2aab3b70 ("btrfs: qgroup: flush reservations during quota disable") CC: stable@vger.kernel.org # 6.10+ Reviewed-by: Boris Burkov Reviewed-by: Qu Wenruo Signed-off-by: Fedor Pchelkin Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 7d6f5d9420ec..feb8f9f2f358 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -4346,10 +4346,9 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, int ret; if (btrfs_qgroup_mode(inode->root->fs_info) == BTRFS_QGROUP_MODE_DISABLED) { - extent_changeset_init(&changeset); return clear_record_extent_bits(&inode->io_tree, start, start + len - 1, - EXTENT_QGROUP_RESERVED, &changeset); + EXTENT_QGROUP_RESERVED, NULL); } /* In release case, we shouldn't have @reserved */ -- cgit v1.2.3-70-g09d2 From b1934cd6069538db2255dc94ba573771ecf3b560 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Sat, 31 Aug 2024 01:32:49 +0900 Subject: btrfs: zoned: handle broken write pointer on zones Btrfs rejects to mount a FS if it finds a block group with a broken write pointer (e.g, unequal write pointers on two zones of RAID1 block group). Since such case can happen easily with a power-loss or crash of a system, we need to handle the case more gently. Handle such block group by making it unallocatable, so that there will be no writes into it. That can be done by setting the allocation pointer at the end of allocating region (= block_group->zone_capacity). Then, existing code handle zone_unusable properly. Having proper zone_capacity is necessary for the change. So, set it as fast as possible. We cannot handle RAID0 and RAID10 case like this. But, they are anyway unable to read because of a missing stripe. Fixes: 265f7237dd25 ("btrfs: zoned: allow DUP on meta-data block groups") Fixes: 568220fa9657 ("btrfs: zoned: support RAID0/1/10 on top of raid stripe tree") CC: stable@vger.kernel.org # 6.1+ Reported-by: HAN Yuwei Cc: Xuefer Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 66f63e82af79..047e3337852e 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1406,6 +1406,8 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, return -EINVAL; } + bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity); + if (zone_info[0].alloc_offset == WP_MISSING_DEV) { btrfs_err(bg->fs_info, "zoned: cannot recover write pointer for zone %llu", @@ -1432,7 +1434,6 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, } bg->alloc_offset = zone_info[0].alloc_offset; - bg->zone_capacity = min(zone_info[0].capacity, zone_info[1].capacity); return 0; } @@ -1450,6 +1451,9 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg, return -EINVAL; } + /* In case a device is missing we have a cap of 0, so don't use it. */ + bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity); + for (i = 0; i < map->num_stripes; i++) { if (zone_info[i].alloc_offset == WP_MISSING_DEV || zone_info[i].alloc_offset == WP_CONVENTIONAL) @@ -1471,9 +1475,6 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg, if (test_bit(0, active)) set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags); } - /* In case a device is missing we have a cap of 0, so don't use it. */ - bg->zone_capacity = min_not_zero(zone_info[0].capacity, - zone_info[1].capacity); } if (zone_info[0].alloc_offset != WP_MISSING_DEV) @@ -1563,6 +1564,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) unsigned long *active = NULL; u64 last_alloc = 0; u32 num_sequential = 0, num_conventional = 0; + u64 profile; if (!btrfs_is_zoned(fs_info)) return 0; @@ -1623,7 +1625,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) } } - switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK; + switch (profile) { case 0: /* single */ ret = btrfs_load_block_group_single(cache, &zone_info[0], active); break; @@ -1650,6 +1653,23 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) goto out; } + if (ret == -EIO && profile != 0 && profile != BTRFS_BLOCK_GROUP_RAID0 && + profile != BTRFS_BLOCK_GROUP_RAID10) { + /* + * Detected broken write pointer. Make this block group + * unallocatable by setting the allocation pointer at the end of + * allocatable region. Relocating this block group will fix the + * mismatch. + * + * Currently, we cannot handle RAID0 or RAID10 case like this + * because we don't have a proper zone_capacity value. But, + * reading from this block group won't work anyway by a missing + * stripe. + */ + cache->alloc_offset = cache->zone_capacity; + ret = 0; + } + out: /* Reject non SINGLE data profiles without RST */ if ((map->type & BTRFS_BLOCK_GROUP_DATA) && -- cgit v1.2.3-70-g09d2 From cd9253c23aedd61eb5ff11f37a36247cd46faf86 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 29 Aug 2024 18:25:49 +0100 Subject: btrfs: fix race between direct IO write and fsync when using same fd If we have 2 threads that are using the same file descriptor and one of them is doing direct IO writes while the other is doing fsync, we have a race where we can end up either: 1) Attempt a fsync without holding the inode's lock, triggering an assertion failures when assertions are enabled; 2) Do an invalid memory access from the fsync task because the file private points to memory allocated on stack by the direct IO task and it may be used by the fsync task after the stack was destroyed. The race happens like this: 1) A user space program opens a file descriptor with O_DIRECT; 2) The program spawns 2 threads using libpthread for example; 3) One of the threads uses the file descriptor to do direct IO writes, while the other calls fsync using the same file descriptor. 4) Call task A the thread doing direct IO writes and task B the thread doing fsyncs; 5) Task A does a direct IO write, and at btrfs_direct_write() sets the file's private to an on stack allocated private with the member 'fsync_skip_inode_lock' set to true; 6) Task B enters btrfs_sync_file() and sees that there's a private structure associated to the file which has 'fsync_skip_inode_lock' set to true, so it skips locking the inode's VFS lock; 7) Task A completes the direct IO write, and resets the file's private to NULL since it had no prior private and our private was stack allocated. Then it unlocks the inode's VFS lock; 8) Task B enters btrfs_get_ordered_extents_for_logging(), then the assertion that checks the inode's VFS lock is held fails, since task B never locked it and task A has already unlocked it. The stack trace produced is the following: assertion failed: inode_is_locked(&inode->vfs_inode), in fs/btrfs/ordered-data.c:983 ------------[ cut here ]------------ kernel BUG at fs/btrfs/ordered-data.c:983! Oops: invalid opcode: 0000 [#1] PREEMPT SMP PTI CPU: 9 PID: 5072 Comm: worker Tainted: G U OE 6.10.5-1-default #1 openSUSE Tumbleweed 69f48d427608e1c09e60ea24c6c55e2ca1b049e8 Hardware name: Acer Predator PH315-52/Covini_CFS, BIOS V1.12 07/28/2020 RIP: 0010:btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs] Code: 50 d6 86 c0 e8 (...) RSP: 0018:ffff9e4a03dcfc78 EFLAGS: 00010246 RAX: 0000000000000054 RBX: ffff9078a9868e98 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffff907dce4a7800 RDI: ffff907dce4a7800 RBP: ffff907805518800 R08: 0000000000000000 R09: ffff9e4a03dcfb38 R10: ffff9e4a03dcfb30 R11: 0000000000000003 R12: ffff907684ae7800 R13: 0000000000000001 R14: ffff90774646b600 R15: 0000000000000000 FS: 00007f04b96006c0(0000) GS:ffff907dce480000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f32acbfc000 CR3: 00000001fd4fa005 CR4: 00000000003726f0 Call Trace: ? __die_body.cold+0x14/0x24 ? die+0x2e/0x50 ? do_trap+0xca/0x110 ? do_error_trap+0x6a/0x90 ? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a] ? exc_invalid_op+0x50/0x70 ? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a] ? asm_exc_invalid_op+0x1a/0x20 ? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a] ? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a] btrfs_sync_file+0x21a/0x4d0 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a] ? __seccomp_filter+0x31d/0x4f0 __x64_sys_fdatasync+0x4f/0x90 do_syscall_64+0x82/0x160 ? do_futex+0xcb/0x190 ? __x64_sys_futex+0x10e/0x1d0 ? switch_fpu_return+0x4f/0xd0 ? syscall_exit_to_user_mode+0x72/0x220 ? do_syscall_64+0x8e/0x160 ? syscall_exit_to_user_mode+0x72/0x220 ? do_syscall_64+0x8e/0x160 ? syscall_exit_to_user_mode+0x72/0x220 ? do_syscall_64+0x8e/0x160 ? syscall_exit_to_user_mode+0x72/0x220 ? do_syscall_64+0x8e/0x160 entry_SYSCALL_64_after_hwframe+0x76/0x7e Another problem here is if task B grabs the private pointer and then uses it after task A has finished, since the private was allocated in the stack of task A, it results in some invalid memory access with a hard to predict result. This issue, triggering the assertion, was observed with QEMU workloads by two users in the Link tags below. Fix this by not relying on a file's private to pass information to fsync that it should skip locking the inode and instead pass this information through a special value stored in current->journal_info. This is safe because in the relevant section of the direct IO write path we are not holding a transaction handle, so current->journal_info is NULL. The following C program triggers the issue: $ cat repro.c /* Get the O_DIRECT definition. */ #ifndef _GNU_SOURCE #define _GNU_SOURCE #endif #include #include #include #include #include #include #include #include static int fd; static ssize_t do_write(int fd, const void *buf, size_t count, off_t offset) { while (count > 0) { ssize_t ret; ret = pwrite(fd, buf, count, offset); if (ret < 0) { if (errno == EINTR) continue; return ret; } count -= ret; buf += ret; } return 0; } static void *fsync_loop(void *arg) { while (1) { int ret; ret = fsync(fd); if (ret != 0) { perror("Fsync failed"); exit(6); } } } int main(int argc, char *argv[]) { long pagesize; void *write_buf; pthread_t fsyncer; int ret; if (argc != 2) { fprintf(stderr, "Use: %s \n", argv[0]); return 1; } fd = open(argv[1], O_WRONLY | O_CREAT | O_TRUNC | O_DIRECT, 0666); if (fd == -1) { perror("Failed to open/create file"); return 1; } pagesize = sysconf(_SC_PAGE_SIZE); if (pagesize == -1) { perror("Failed to get page size"); return 2; } ret = posix_memalign(&write_buf, pagesize, pagesize); if (ret) { perror("Failed to allocate buffer"); return 3; } ret = pthread_create(&fsyncer, NULL, fsync_loop, NULL); if (ret != 0) { fprintf(stderr, "Failed to create writer thread: %d\n", ret); return 4; } while (1) { ret = do_write(fd, write_buf, pagesize, 0); if (ret != 0) { perror("Write failed"); exit(5); } } return 0; } $ mkfs.btrfs -f /dev/sdi $ mount /dev/sdi /mnt/sdi $ timeout 10 ./repro /mnt/sdi/foo Usually the race is triggered within less than 1 second. A test case for fstests will follow soon. Reported-by: Paulo Dias Link: https://bugzilla.kernel.org/show_bug.cgi?id=219187 Reported-by: Andreas Jahn Link: https://bugzilla.kernel.org/show_bug.cgi?id=219199 Reported-by: syzbot+4704b3cc972bd76024f1@syzkaller.appspotmail.com Link: https://lore.kernel.org/linux-btrfs/00000000000044ff540620d7dee2@google.com/ Fixes: 939b656bc8ab ("btrfs: fix corruption after buffer fault in during direct IO append write") CC: stable@vger.kernel.org # 5.15+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 - fs/btrfs/direct-io.c | 16 +++------------- fs/btrfs/file.c | 9 +++++++-- fs/btrfs/transaction.h | 6 ++++++ 4 files changed, 16 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 75fa563e4cac..c8568b1a61c4 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -459,7 +459,6 @@ struct btrfs_file_private { void *filldir_buf; u64 last_index; struct extent_state *llseek_cached_state; - bool fsync_skip_inode_lock; }; static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info) diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c index 67adbe9d294a..364bce34f034 100644 --- a/fs/btrfs/direct-io.c +++ b/fs/btrfs/direct-io.c @@ -864,13 +864,6 @@ again: if (IS_ERR_OR_NULL(dio)) { ret = PTR_ERR_OR_ZERO(dio); } else { - struct btrfs_file_private stack_private = { 0 }; - struct btrfs_file_private *private; - const bool have_private = (file->private_data != NULL); - - if (!have_private) - file->private_data = &stack_private; - /* * If we have a synchronous write, we must make sure the fsync * triggered by the iomap_dio_complete() call below doesn't @@ -879,13 +872,10 @@ again: * partial writes due to the input buffer (or parts of it) not * being already faulted in. */ - private = file->private_data; - private->fsync_skip_inode_lock = true; + ASSERT(current->journal_info == NULL); + current->journal_info = BTRFS_TRANS_DIO_WRITE_STUB; ret = iomap_dio_complete(dio); - private->fsync_skip_inode_lock = false; - - if (!have_private) - file->private_data = NULL; + current->journal_info = NULL; } /* No increment (+=) because iomap returns a cumulative value. */ diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9914419f3b7d..2aeb8116549c 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1603,7 +1603,6 @@ static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx) */ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { - struct btrfs_file_private *private = file->private_data; struct dentry *dentry = file_dentry(file); struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); struct btrfs_root *root = inode->root; @@ -1613,7 +1612,13 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) int ret = 0, err; u64 len; bool full_sync; - const bool skip_ilock = (private ? private->fsync_skip_inode_lock : false); + bool skip_ilock = false; + + if (current->journal_info == BTRFS_TRANS_DIO_WRITE_STUB) { + skip_ilock = true; + current->journal_info = NULL; + lockdep_assert_held(&inode->vfs_inode.i_rwsem); + } trace_btrfs_sync_file(file, datasync); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 98c03ddc760b..dd9ce9b9f69e 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -27,6 +27,12 @@ struct btrfs_root_item; struct btrfs_root; struct btrfs_path; +/* + * Signal that a direct IO write is in progress, to avoid deadlock for sync + * direct IO writes when fsync is called during the direct IO write path. + */ +#define BTRFS_TRANS_DIO_WRITE_STUB ((void *) 1) + /* Radix-tree tag for roots that are part of the trasaction. */ #define BTRFS_ROOT_TRANS_TAG 0 -- cgit v1.2.3-70-g09d2