From 96c6ca71572a3556ed0c37237305657ff47174b7 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 19 Sep 2024 22:20:34 +0100
Subject: btrfs: send: fix buffer overflow detection when copying path to cache
 entry

Starting with commit c0247d289e73 ("btrfs: send: annotate struct
name_cache_entry with __counted_by()") we annotated the variable length
array "name" from the name_cache_entry structure with __counted_by() to
improve overflow detection. However that alone was not correct, because
the length of that array does not match the "name_len" field - it matches
that plus 1 to include the NUL string terminator, so that makes a
fortified kernel think there's an overflow and report a splat like this:

  strcpy: detected buffer overflow: 20 byte write of buffer size 19
  WARNING: CPU: 3 PID: 3310 at __fortify_report+0x45/0x50
  CPU: 3 UID: 0 PID: 3310 Comm: btrfs Not tainted 6.11.0-prnet #1
  Hardware name: CompuLab Ltd.  sbc-ihsw/Intense-PC2 (IPC2), BIOS IPC2_3.330.7 X64 03/15/2018
  RIP: 0010:__fortify_report+0x45/0x50
  Code: 48 8b 34 (...)
  RSP: 0018:ffff97ebc0d6f650 EFLAGS: 00010246
  RAX: 7749924ef60fa600 RBX: ffff8bf5446a521a RCX: 0000000000000027
  RDX: 00000000ffffdfff RSI: ffff97ebc0d6f548 RDI: ffff8bf84e7a1cc8
  RBP: ffff8bf548574080 R08: ffffffffa8c40e10 R09: 0000000000005ffd
  R10: 0000000000000004 R11: ffffffffa8c70e10 R12: ffff8bf551eef400
  R13: 0000000000000000 R14: 0000000000000013 R15: 00000000000003a8
  FS:  00007fae144de8c0(0000) GS:ffff8bf84e780000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 00007fae14691690 CR3: 00000001027a2003 CR4: 00000000001706f0
  Call Trace:
   <TASK>
   ? __warn+0x12a/0x1d0
   ? __fortify_report+0x45/0x50
   ? report_bug+0x154/0x1c0
   ? handle_bug+0x42/0x70
   ? exc_invalid_op+0x1a/0x50
   ? asm_exc_invalid_op+0x1a/0x20
   ? __fortify_report+0x45/0x50
   __fortify_panic+0x9/0x10
  __get_cur_name_and_parent+0x3bc/0x3c0
   get_cur_path+0x207/0x3b0
   send_extent_data+0x709/0x10d0
   ? find_parent_nodes+0x22df/0x25d0
   ? mas_nomem+0x13/0x90
   ? mtree_insert_range+0xa5/0x110
   ? btrfs_lru_cache_store+0x5f/0x1e0
   ? iterate_extent_inodes+0x52d/0x5a0
   process_extent+0xa96/0x11a0
   ? __pfx_lookup_backref_cache+0x10/0x10
   ? __pfx_store_backref_cache+0x10/0x10
   ? __pfx_iterate_backrefs+0x10/0x10
   ? __pfx_check_extent_item+0x10/0x10
   changed_cb+0x6fa/0x930
   ? tree_advance+0x362/0x390
   ? memcmp_extent_buffer+0xd7/0x160
   send_subvol+0xf0a/0x1520
   btrfs_ioctl_send+0x106b/0x11d0
   ? __pfx___clone_root_cmp_sort+0x10/0x10
   _btrfs_ioctl_send+0x1ac/0x240
   btrfs_ioctl+0x75b/0x850
   __se_sys_ioctl+0xca/0x150
   do_syscall_64+0x85/0x160
   ? __count_memcg_events+0x69/0x100
   ? handle_mm_fault+0x1327/0x15c0
   ? __se_sys_rt_sigprocmask+0xf1/0x180
   ? syscall_exit_to_user_mode+0x75/0xa0
   ? do_syscall_64+0x91/0x160
   ? do_user_addr_fault+0x21d/0x630
  entry_SYSCALL_64_after_hwframe+0x76/0x7e
  RIP: 0033:0x7fae145eeb4f
  Code: 00 48 89 (...)
  RSP: 002b:00007ffdf1cb09b0 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
  RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00007fae145eeb4f
  RDX: 00007ffdf1cb0ad0 RSI: 0000000040489426 RDI: 0000000000000004
  RBP: 00000000000078fe R08: 00007fae144006c0 R09: 00007ffdf1cb0927
  R10: 0000000000000008 R11: 0000000000000246 R12: 00007ffdf1cb1ce8
  R13: 0000000000000003 R14: 000055c499fab2e0 R15: 0000000000000004
   </TASK>

Fix this by not storing the NUL string terminator since we don't actually
need it for name cache entries, this way "name_len" corresponds to the
actual size of the "name" array. This requires marking the "name" array
field with __nonstring and using memcpy() instead of strcpy() as
recommended by the guidelines at:

   https://github.com/KSPP/linux/issues/90

Reported-by: David Arendt <admin@prnet.org>
Link: https://lore.kernel.org/linux-btrfs/cee4591a-3088-49ba-99b8-d86b4242b8bd@prnet.org/
Fixes: c0247d289e73 ("btrfs: send: annotate struct name_cache_entry with __counted_by()")
CC: stable@vger.kernel.org # 6.11
Tested-by: David Arendt <admin@prnet.org>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/send.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 7f48ba6c1c77..5871ca845b0e 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -346,8 +346,10 @@ struct name_cache_entry {
 	u64 parent_gen;
 	int ret;
 	int need_later_update;
+	/* Name length without NUL terminator. */
 	int name_len;
-	char name[] __counted_by(name_len);
+	/* Not NUL terminated. */
+	char name[] __counted_by(name_len) __nonstring;
 };
 
 /* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */
@@ -2388,7 +2390,7 @@ out_cache:
 	/*
 	 * Store the result of the lookup in the name cache.
 	 */
-	nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_KERNEL);
+	nce = kmalloc(sizeof(*nce) + fs_path_len(dest), GFP_KERNEL);
 	if (!nce) {
 		ret = -ENOMEM;
 		goto out;
@@ -2400,7 +2402,7 @@ out_cache:
 	nce->parent_gen = *parent_gen;
 	nce->name_len = fs_path_len(dest);
 	nce->ret = ret;
-	strcpy(nce->name, dest->start);
+	memcpy(nce->name, dest->start, nce->name_len);
 
 	if (ino < sctx->send_progress)
 		nce->need_later_update = 0;
-- 
cgit v1.2.3-70-g09d2


From 97f9782276fc9cb0de37a5eecb82204e48a5a612 Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Date: Thu, 19 Sep 2024 12:16:38 +0200
Subject: btrfs: also add stripe entries for NOCOW writes

NOCOW writes do not generate stripe_extent entries in the RAID stripe
tree, as the RAID stripe-tree feature initially was designed with a
zoned filesystem in mind and on a zoned filesystem, we do not allow NOCOW
writes. But the RAID stripe-tree feature is independent from the zoned
feature, so we must also do NOCOW writes for RAID stripe-tree filesystems.

Reviewed-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index edac499fd83d..c6e4b58c334c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3111,6 +3111,11 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
 		ret = btrfs_update_inode_fallback(trans, inode);
 		if (ret) /* -ENOMEM or corruption */
 			btrfs_abort_transaction(trans, ret);
+
+		ret = btrfs_insert_raid_extent(trans, ordered_extent);
+		if (ret)
+			btrfs_abort_transaction(trans, ret);
+
 		goto out;
 	}
 
-- 
cgit v1.2.3-70-g09d2


From db7e68b522c01eb666cfe1f31637775f18997811 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Tue, 24 Sep 2024 16:50:22 -0400
Subject: btrfs: drop the backref cache during relocation if we commit

Since the inception of relocation we have maintained the backref cache
across transaction commits, updating the backref cache with the new
bytenr whenever we COWed blocks that were in the cache, and then
updating their bytenr once we detected a transaction id change.

This works as long as we're only ever modifying blocks, not changing the
structure of the tree.

However relocation does in fact change the structure of the tree.  For
example, if we are relocating a data extent, we will look up all the
leaves that point to this data extent.  We will then call
do_relocation() on each of these leaves, which will COW down to the leaf
and then update the file extent location.

But, a key feature of do_relocation() is the pending list.  This is all
the pending nodes that we modified when we updated the file extent item.
We will then process all of these blocks via finish_pending_nodes, which
calls do_relocation() on all of the nodes that led up to that leaf.

The purpose of this is to make sure we don't break sharing unless we
absolutely have to.  Consider the case that we have 3 snapshots that all
point to this leaf through the same nodes, the initial COW would have
created a whole new path.  If we did this for all 3 snapshots we would
end up with 3x the number of nodes we had originally.  To avoid this we
will cycle through each of the snapshots that point to each of these
nodes and update their pointers to point at the new nodes.

Once we update the pointer to the new node we will drop the node we
removed the link for and all of its children via btrfs_drop_subtree().
This is essentially just btrfs_drop_snapshot(), but for an arbitrary
point in the snapshot.

The problem with this is that we will never reflect this in the backref
cache.  If we do this btrfs_drop_snapshot() for a node that is in the
backref tree, we will leave the node in the backref tree.  This becomes
a problem when we change the transid, as now the backref cache has
entire subtrees that no longer exist, but exist as if they still are
pointed to by the same roots.

In the best case scenario you end up with "adding refs to an existing
tree ref" errors from insert_inline_extent_backref(), where we attempt
to link in nodes on roots that are no longer valid.

Worst case you will double free some random block and re-use it when
there's still references to the block.

This is extremely subtle, and the consequences are quite bad.  There
isn't a way to make sure our backref cache is consistent between
transid's.

In order to fix this we need to simply evict the entire backref cache
anytime we cross transid's.  This reduces performance in that we have to
rebuild this backref cache every time we change transid's, but fixes the
bug.

This has existed since relocation was added, and is a pretty critical
bug.  There's a lot more cleanup that can be done now that this
functionality is going away, but this patch is as small as possible in
order to fix the problem and make it easy for us to backport it to all
the kernels it needs to be backported to.

Followup series will dismantle more of this code and simplify relocation
drastically to remove this functionality.

We have a reproducer that reproduced the corruption within a few minutes
of running.  With this patch it survives several iterations/hours of
running the reproducer.

Fixes: 3fd0a5585eb9 ("Btrfs: Metadata ENOSPC handling for balance")
CC: stable@vger.kernel.org
Reviewed-by: Boris Burkov <boris@bur.io>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/backref.c    | 12 ++++++---
 fs/btrfs/relocation.c | 75 +++------------------------------------------------
 2 files changed, 11 insertions(+), 76 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index e2f478ecd7fd..f8e1d5b2c512 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -3179,10 +3179,14 @@ void btrfs_backref_release_cache(struct btrfs_backref_cache *cache)
 		btrfs_backref_cleanup_node(cache, node);
 	}
 
-	cache->last_trans = 0;
-
-	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
-		ASSERT(list_empty(&cache->pending[i]));
+	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+		while (!list_empty(&cache->pending[i])) {
+			node = list_first_entry(&cache->pending[i],
+						struct btrfs_backref_node,
+						list);
+			btrfs_backref_cleanup_node(cache, node);
+		}
+	}
 	ASSERT(list_empty(&cache->pending_edge));
 	ASSERT(list_empty(&cache->useless_node));
 	ASSERT(list_empty(&cache->changed));
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index ea4ed85919ec..cf1dfeaaf2d8 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -232,70 +232,6 @@ static struct btrfs_backref_node *walk_down_backref(
 	return NULL;
 }
 
-static void update_backref_node(struct btrfs_backref_cache *cache,
-				struct btrfs_backref_node *node, u64 bytenr)
-{
-	struct rb_node *rb_node;
-	rb_erase(&node->rb_node, &cache->rb_root);
-	node->bytenr = bytenr;
-	rb_node = rb_simple_insert(&cache->rb_root, node->bytenr, &node->rb_node);
-	if (rb_node)
-		btrfs_backref_panic(cache->fs_info, bytenr, -EEXIST);
-}
-
-/*
- * update backref cache after a transaction commit
- */
-static int update_backref_cache(struct btrfs_trans_handle *trans,
-				struct btrfs_backref_cache *cache)
-{
-	struct btrfs_backref_node *node;
-	int level = 0;
-
-	if (cache->last_trans == 0) {
-		cache->last_trans = trans->transid;
-		return 0;
-	}
-
-	if (cache->last_trans == trans->transid)
-		return 0;
-
-	/*
-	 * detached nodes are used to avoid unnecessary backref
-	 * lookup. transaction commit changes the extent tree.
-	 * so the detached nodes are no longer useful.
-	 */
-	while (!list_empty(&cache->detached)) {
-		node = list_entry(cache->detached.next,
-				  struct btrfs_backref_node, list);
-		btrfs_backref_cleanup_node(cache, node);
-	}
-
-	while (!list_empty(&cache->changed)) {
-		node = list_entry(cache->changed.next,
-				  struct btrfs_backref_node, list);
-		list_del_init(&node->list);
-		BUG_ON(node->pending);
-		update_backref_node(cache, node, node->new_bytenr);
-	}
-
-	/*
-	 * some nodes can be left in the pending list if there were
-	 * errors during processing the pending nodes.
-	 */
-	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
-		list_for_each_entry(node, &cache->pending[level], list) {
-			BUG_ON(!node->pending);
-			if (node->bytenr == node->new_bytenr)
-				continue;
-			update_backref_node(cache, node, node->new_bytenr);
-		}
-	}
-
-	cache->last_trans = 0;
-	return 1;
-}
-
 static bool reloc_root_is_dead(const struct btrfs_root *root)
 {
 	/*
@@ -551,9 +487,6 @@ static int clone_backref_node(struct btrfs_trans_handle *trans,
 	struct btrfs_backref_edge *new_edge;
 	struct rb_node *rb_node;
 
-	if (cache->last_trans > 0)
-		update_backref_cache(trans, cache);
-
 	rb_node = rb_simple_search(&cache->rb_root, src->commit_root->start);
 	if (rb_node) {
 		node = rb_entry(rb_node, struct btrfs_backref_node, rb_node);
@@ -3698,11 +3631,9 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 			break;
 		}
 restart:
-		if (update_backref_cache(trans, &rc->backref_cache)) {
-			btrfs_end_transaction(trans);
-			trans = NULL;
-			continue;
-		}
+		if (rc->backref_cache.last_trans != trans->transid)
+			btrfs_backref_release_cache(&rc->backref_cache);
+		rc->backref_cache.last_trans = trans->transid;
 
 		ret = find_next_extent(rc, path, &key);
 		if (ret < 0)
-- 
cgit v1.2.3-70-g09d2


From 50c6f6e6806c65e41a039f0edef0816974403253 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Wed, 25 Sep 2024 11:38:55 +0100
Subject: btrfs: tracepoints: end assignment with semicolon at
 btrfs_qgroup_extent event class

While running checkpatch.pl against a patch that modifies the
btrfs_qgroup_extent event class, it complained about using a comma instead
of a semicolon:

  $ ./scripts/checkpatch.pl qgroups/0003-btrfs-qgroups-remove-bytenr-field-from-struct-btrfs_.patch
  WARNING: Possible comma where semicolon could be used
  #215: FILE: include/trace/events/btrfs.h:1720:
  +		__entry->bytenr		= bytenr,
		__entry->num_bytes	= rec->num_bytes;

  total: 0 errors, 1 warnings, 184 lines checked

So replace the comma with a semicolon to silence checkpatch and possibly
other tools. It also makes the code consistent with the rest.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 include/trace/events/btrfs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index bf60ad50011e..af6b3827fb1d 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -1716,7 +1716,7 @@ DECLARE_EVENT_CLASS(btrfs_qgroup_extent,
 	),
 
 	TP_fast_assign_btrfs(fs_info,
-		__entry->bytenr		= rec->bytenr,
+		__entry->bytenr		= rec->bytenr;
 		__entry->num_bytes	= rec->num_bytes;
 	),
 
-- 
cgit v1.2.3-70-g09d2


From fa630df665aa9ddce3a96ce7b54e10a38e4d2a2b Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Fri, 27 Sep 2024 10:50:12 +0100
Subject: btrfs: send: fix invalid clone operation for file that got its size
 decreased

During an incremental send we may end up sending an invalid clone
operation, for the last extent of a file which ends at an unaligned offset
that matches the final i_size of the file in the send snapshot, in case
the file had its initial size (the size in the parent snapshot) decreased
in the send snapshot. In this case the destination will fail to apply the
clone operation because its end offset is not sector size aligned and it
ends before the current size of the file.

Sending the truncate operation always happens when we finish processing an
inode, after we process all its extents (and xattrs, names, etc). So fix
this by ensuring the file has a valid size before we send a clone
operation for an unaligned extent that ends at the final i_size of the
file. The size we truncate to matches the start offset of the clone range
but it could be any value between that start offset and the final size of
the file since the clone operation will expand the i_size if the current
size is smaller than the end offset. The start offset of the range was
chosen because it's always sector size aligned and avoids a truncation
into the middle of a page, which results in dirtying the page due to
filling part of it with zeroes and then making the clone operation at the
receiver trigger IO.

The following test reproduces the issue:

  $ cat test.sh
  #!/bin/bash

  DEV=/dev/sdi
  MNT=/mnt/sdi

  mkfs.btrfs -f $DEV
  mount $DEV $MNT

  # Create a file with a size of 256K + 5 bytes, having two extents, one
  # with a size of 128K and another one with a size of 128K + 5 bytes.
  last_ext_size=$((128 * 1024 + 5))
  xfs_io -f -d -c "pwrite -S 0xab -b 128K 0 128K" \
         -c "pwrite -S 0xcd -b $last_ext_size 128K $last_ext_size" \
         $MNT/foo

  # Another file which we will later clone foo into, but initially with
  # a larger size than foo.
  xfs_io -f -c "pwrite -S 0xef 0 1M" $MNT/bar

  btrfs subvolume snapshot -r $MNT/ $MNT/snap1

  # Now resize bar and clone foo into it.
  xfs_io -c "truncate 0" \
         -c "reflink $MNT/foo" $MNT/bar

  btrfs subvolume snapshot -r $MNT/ $MNT/snap2

  rm -f /tmp/send-full /tmp/send-inc
  btrfs send -f /tmp/send-full $MNT/snap1
  btrfs send -p $MNT/snap1 -f /tmp/send-inc $MNT/snap2

  umount $MNT
  mkfs.btrfs -f $DEV
  mount $DEV $MNT

  btrfs receive -f /tmp/send-full $MNT
  btrfs receive -f /tmp/send-inc $MNT

  umount $MNT

Running it before this patch:

  $ ./test.sh
  (...)
  At subvol snap1
  At snapshot snap2
  ERROR: failed to clone extents to bar: Invalid argument

A test case for fstests will be sent soon.

Reported-by: Ben Millwood <thebenmachine@gmail.com>
Link: https://lore.kernel.org/linux-btrfs/CAJhrHS2z+WViO2h=ojYvBPDLsATwLbg+7JaNCyYomv0fUxEpQQ@mail.gmail.com/
Fixes: 46a6e10a1ab1 ("btrfs: send: allow cloning non-aligned extent if it ends at i_size")
CC: stable@vger.kernel.org # 6.11
Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/send.c | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 5871ca845b0e..27306d98ec43 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -6189,8 +6189,29 @@ static int send_write_or_clone(struct send_ctx *sctx,
 	if (ret < 0)
 		return ret;
 
-	if (clone_root->offset + num_bytes == info.size)
+	if (clone_root->offset + num_bytes == info.size) {
+		/*
+		 * The final size of our file matches the end offset, but it may
+		 * be that its current size is larger, so we have to truncate it
+		 * to any value between the start offset of the range and the
+		 * final i_size, otherwise the clone operation is invalid
+		 * because it's unaligned and it ends before the current EOF.
+		 * We do this truncate to the final i_size when we finish
+		 * processing the inode, but it's too late by then. And here we
+		 * truncate to the start offset of the range because it's always
+		 * sector size aligned while if it were the final i_size it
+		 * would result in dirtying part of a page, filling part of a
+		 * page with zeroes and then having the clone operation at the
+		 * receiver trigger IO and wait for it due to the dirty page.
+		 */
+		if (sctx->parent_root != NULL) {
+			ret = send_truncate(sctx, sctx->cur_ino,
+					    sctx->cur_inode_gen, offset);
+			if (ret < 0)
+				return ret;
+		}
 		goto clone_data;
+	}
 
 write_data:
 	ret = send_extent_data(sctx, path, offset, num_bytes);
-- 
cgit v1.2.3-70-g09d2


From c3b47f49e83197e8dffd023ec568403bcdbb774b Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Sat, 28 Sep 2024 08:05:58 +0930
Subject: btrfs: fix a NULL pointer dereference when failed to start a new
 trasacntion

[BUG]
Syzbot reported a NULL pointer dereference with the following crash:

  FAULT_INJECTION: forcing a failure.
   start_transaction+0x830/0x1670 fs/btrfs/transaction.c:676
   prepare_to_relocate+0x31f/0x4c0 fs/btrfs/relocation.c:3642
   relocate_block_group+0x169/0xd20 fs/btrfs/relocation.c:3678
  ...
  BTRFS info (device loop0): balance: ended with status: -12
  Oops: general protection fault, probably for non-canonical address 0xdffffc00000000cc: 0000 [#1] PREEMPT SMP KASAN NOPTI
  KASAN: null-ptr-deref in range [0x0000000000000660-0x0000000000000667]
  RIP: 0010:btrfs_update_reloc_root+0x362/0xa80 fs/btrfs/relocation.c:926
  Call Trace:
   <TASK>
   commit_fs_roots+0x2ee/0x720 fs/btrfs/transaction.c:1496
   btrfs_commit_transaction+0xfaf/0x3740 fs/btrfs/transaction.c:2430
   del_balance_item fs/btrfs/volumes.c:3678 [inline]
   reset_balance_state+0x25e/0x3c0 fs/btrfs/volumes.c:3742
   btrfs_balance+0xead/0x10c0 fs/btrfs/volumes.c:4574
   btrfs_ioctl_balance+0x493/0x7c0 fs/btrfs/ioctl.c:3673
   vfs_ioctl fs/ioctl.c:51 [inline]
   __do_sys_ioctl fs/ioctl.c:907 [inline]
   __se_sys_ioctl+0xf9/0x170 fs/ioctl.c:893
   do_syscall_x64 arch/x86/entry/common.c:52 [inline]
   do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83
   entry_SYSCALL_64_after_hwframe+0x77/0x7f

[CAUSE]
The allocation failure happens at the start_transaction() inside
prepare_to_relocate(), and during the error handling we call
unset_reloc_control(), which makes fs_info->balance_ctl to be NULL.

Then we continue the error path cleanup in btrfs_balance() by calling
reset_balance_state() which will call del_balance_item() to fully delete
the balance item in the root tree.

However during the small window between set_reloc_contrl() and
unset_reloc_control(), we can have a subvolume tree update and created a
reloc_root for that subvolume.

Then we go into the final btrfs_commit_transaction() of
del_balance_item(), and into btrfs_update_reloc_root() inside
commit_fs_roots().

That function checks if fs_info->reloc_ctl is in the merge_reloc_tree
stage, but since fs_info->reloc_ctl is NULL, it results a NULL pointer
dereference.

[FIX]
Just add extra check on fs_info->reloc_ctl inside
btrfs_update_reloc_root(), before checking
fs_info->reloc_ctl->merge_reloc_tree.

That DEAD_RELOC_TREE handling is to prevent further modification to the
reloc tree during merge stage, but since there is no reloc_ctl at all,
we do not need to bother that.

Reported-by: syzbot+283673dbc38527ef9f3d@syzkaller.appspotmail.com
Link: https://lore.kernel.org/linux-btrfs/66f6bfa7.050a0220.38ace9.0019.GAE@google.com/
CC: stable@vger.kernel.org # 4.19+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/relocation.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index cf1dfeaaf2d8..f3834f8d26b4 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -856,7 +856,7 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
 	btrfs_grab_root(reloc_root);
 
 	/* root->reloc_root will stay until current relocation finished */
-	if (fs_info->reloc_ctl->merge_reloc_tree &&
+	if (fs_info->reloc_ctl && fs_info->reloc_ctl->merge_reloc_tree &&
 	    btrfs_root_refs(root_item) == 0) {
 		set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
 		/*
-- 
cgit v1.2.3-70-g09d2


From 41fd1e94066a815a7ab0a7025359e9b40e4b3576 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Tue, 1 Oct 2024 11:06:52 +0100
Subject: btrfs: wait for fixup workers before stopping cleaner kthread during
 umount

During unmount, at close_ctree(), we have the following steps in this order:

1) Park the cleaner kthread - this doesn't destroy the kthread, it basically
   halts its execution (wake ups against it work but do nothing);

2) We stop the cleaner kthread - this results in freeing the respective
   struct task_struct;

3) We call btrfs_stop_all_workers() which waits for any jobs running in all
   the work queues and then free the work queues.

Syzbot reported a case where a fixup worker resulted in a crash when doing
a delayed iput on its inode while attempting to wake up the cleaner at
btrfs_add_delayed_iput(), because the task_struct of the cleaner kthread
was already freed. This can happen during unmount because we don't wait
for any fixup workers still running before we call kthread_stop() against
the cleaner kthread, which stops and free all its resources.

Fix this by waiting for any fixup workers at close_ctree() before we call
kthread_stop() against the cleaner and run pending delayed iputs.

The stack traces reported by syzbot were the following:

  BUG: KASAN: slab-use-after-free in __lock_acquire+0x77/0x2050 kernel/locking/lockdep.c:5065
  Read of size 8 at addr ffff8880272a8a18 by task kworker/u8:3/52

  CPU: 1 UID: 0 PID: 52 Comm: kworker/u8:3 Not tainted 6.12.0-rc1-syzkaller #0
  Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024
  Workqueue: btrfs-fixup btrfs_work_helper
  Call Trace:
   <TASK>
   __dump_stack lib/dump_stack.c:94 [inline]
   dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120
   print_address_description mm/kasan/report.c:377 [inline]
   print_report+0x169/0x550 mm/kasan/report.c:488
   kasan_report+0x143/0x180 mm/kasan/report.c:601
   __lock_acquire+0x77/0x2050 kernel/locking/lockdep.c:5065
   lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5825
   __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline]
   _raw_spin_lock_irqsave+0xd5/0x120 kernel/locking/spinlock.c:162
   class_raw_spinlock_irqsave_constructor include/linux/spinlock.h:551 [inline]
   try_to_wake_up+0xb0/0x1480 kernel/sched/core.c:4154
   btrfs_writepage_fixup_worker+0xc16/0xdf0 fs/btrfs/inode.c:2842
   btrfs_work_helper+0x390/0xc50 fs/btrfs/async-thread.c:314
   process_one_work kernel/workqueue.c:3229 [inline]
   process_scheduled_works+0xa63/0x1850 kernel/workqueue.c:3310
   worker_thread+0x870/0xd30 kernel/workqueue.c:3391
   kthread+0x2f0/0x390 kernel/kthread.c:389
   ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147
   ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244
   </TASK>

  Allocated by task 2:
   kasan_save_stack mm/kasan/common.c:47 [inline]
   kasan_save_track+0x3f/0x80 mm/kasan/common.c:68
   unpoison_slab_object mm/kasan/common.c:319 [inline]
   __kasan_slab_alloc+0x66/0x80 mm/kasan/common.c:345
   kasan_slab_alloc include/linux/kasan.h:247 [inline]
   slab_post_alloc_hook mm/slub.c:4086 [inline]
   slab_alloc_node mm/slub.c:4135 [inline]
   kmem_cache_alloc_node_noprof+0x16b/0x320 mm/slub.c:4187
   alloc_task_struct_node kernel/fork.c:180 [inline]
   dup_task_struct+0x57/0x8c0 kernel/fork.c:1107
   copy_process+0x5d1/0x3d50 kernel/fork.c:2206
   kernel_clone+0x223/0x880 kernel/fork.c:2787
   kernel_thread+0x1bc/0x240 kernel/fork.c:2849
   create_kthread kernel/kthread.c:412 [inline]
   kthreadd+0x60d/0x810 kernel/kthread.c:765
   ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147
   ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244

  Freed by task 61:
   kasan_save_stack mm/kasan/common.c:47 [inline]
   kasan_save_track+0x3f/0x80 mm/kasan/common.c:68
   kasan_save_free_info+0x40/0x50 mm/kasan/generic.c:579
   poison_slab_object mm/kasan/common.c:247 [inline]
   __kasan_slab_free+0x59/0x70 mm/kasan/common.c:264
   kasan_slab_free include/linux/kasan.h:230 [inline]
   slab_free_hook mm/slub.c:2343 [inline]
   slab_free mm/slub.c:4580 [inline]
   kmem_cache_free+0x1a2/0x420 mm/slub.c:4682
   put_task_struct include/linux/sched/task.h:144 [inline]
   delayed_put_task_struct+0x125/0x300 kernel/exit.c:228
   rcu_do_batch kernel/rcu/tree.c:2567 [inline]
   rcu_core+0xaaa/0x17a0 kernel/rcu/tree.c:2823
   handle_softirqs+0x2c5/0x980 kernel/softirq.c:554
   __do_softirq kernel/softirq.c:588 [inline]
   invoke_softirq kernel/softirq.c:428 [inline]
   __irq_exit_rcu+0xf4/0x1c0 kernel/softirq.c:637
   irq_exit_rcu+0x9/0x30 kernel/softirq.c:649
   instr_sysvec_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1037 [inline]
   sysvec_apic_timer_interrupt+0xa6/0xc0 arch/x86/kernel/apic/apic.c:1037
   asm_sysvec_apic_timer_interrupt+0x1a/0x20 arch/x86/include/asm/idtentry.h:702

  Last potentially related work creation:
   kasan_save_stack+0x3f/0x60 mm/kasan/common.c:47
   __kasan_record_aux_stack+0xac/0xc0 mm/kasan/generic.c:541
   __call_rcu_common kernel/rcu/tree.c:3086 [inline]
   call_rcu+0x167/0xa70 kernel/rcu/tree.c:3190
   context_switch kernel/sched/core.c:5318 [inline]
   __schedule+0x184b/0x4ae0 kernel/sched/core.c:6675
   schedule_idle+0x56/0x90 kernel/sched/core.c:6793
   do_idle+0x56a/0x5d0 kernel/sched/idle.c:354
   cpu_startup_entry+0x42/0x60 kernel/sched/idle.c:424
   start_secondary+0x102/0x110 arch/x86/kernel/smpboot.c:314
   common_startup_64+0x13e/0x147

  The buggy address belongs to the object at ffff8880272a8000
   which belongs to the cache task_struct of size 7424
  The buggy address is located 2584 bytes inside of
   freed 7424-byte region [ffff8880272a8000, ffff8880272a9d00)

  The buggy address belongs to the physical page:
  page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x272a8
  head: order:3 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0
  flags: 0xfff00000000040(head|node=0|zone=1|lastcpupid=0x7ff)
  page_type: f5(slab)
  raw: 00fff00000000040 ffff88801bafa500 dead000000000122 0000000000000000
  raw: 0000000000000000 0000000080040004 00000001f5000000 0000000000000000
  head: 00fff00000000040 ffff88801bafa500 dead000000000122 0000000000000000
  head: 0000000000000000 0000000080040004 00000001f5000000 0000000000000000
  head: 00fff00000000003 ffffea00009caa01 ffffffffffffffff 0000000000000000
  head: 0000000000000008 0000000000000000 00000000ffffffff 0000000000000000
  page dumped because: kasan: bad access detected
  page_owner tracks the page as allocated
  page last allocated via order 3, migratetype Unmovable, gfp_mask 0xd20c0(__GFP_IO|__GFP_FS|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP|__GFP_NOMEMALLOC), pid 2, tgid 2 (kthreadd), ts 71247381401, free_ts 71214998153
   set_page_owner include/linux/page_owner.h:32 [inline]
   post_alloc_hook+0x1f3/0x230 mm/page_alloc.c:1537
   prep_new_page mm/page_alloc.c:1545 [inline]
   get_page_from_freelist+0x3039/0x3180 mm/page_alloc.c:3457
   __alloc_pages_noprof+0x256/0x6c0 mm/page_alloc.c:4733
   alloc_pages_mpol_noprof+0x3e8/0x680 mm/mempolicy.c:2265
   alloc_slab_page+0x6a/0x120 mm/slub.c:2413
   allocate_slab+0x5a/0x2f0 mm/slub.c:2579
   new_slab mm/slub.c:2632 [inline]
   ___slab_alloc+0xcd1/0x14b0 mm/slub.c:3819
   __slab_alloc+0x58/0xa0 mm/slub.c:3909
   __slab_alloc_node mm/slub.c:3962 [inline]
   slab_alloc_node mm/slub.c:4123 [inline]
   kmem_cache_alloc_node_noprof+0x1fe/0x320 mm/slub.c:4187
   alloc_task_struct_node kernel/fork.c:180 [inline]
   dup_task_struct+0x57/0x8c0 kernel/fork.c:1107
   copy_process+0x5d1/0x3d50 kernel/fork.c:2206
   kernel_clone+0x223/0x880 kernel/fork.c:2787
   kernel_thread+0x1bc/0x240 kernel/fork.c:2849
   create_kthread kernel/kthread.c:412 [inline]
   kthreadd+0x60d/0x810 kernel/kthread.c:765
   ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147
   ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244
  page last free pid 5230 tgid 5230 stack trace:
   reset_page_owner include/linux/page_owner.h:25 [inline]
   free_pages_prepare mm/page_alloc.c:1108 [inline]
   free_unref_page+0xcd0/0xf00 mm/page_alloc.c:2638
   discard_slab mm/slub.c:2678 [inline]
   __put_partials+0xeb/0x130 mm/slub.c:3146
   put_cpu_partial+0x17c/0x250 mm/slub.c:3221
   __slab_free+0x2ea/0x3d0 mm/slub.c:4450
   qlink_free mm/kasan/quarantine.c:163 [inline]
   qlist_free_all+0x9a/0x140 mm/kasan/quarantine.c:179
   kasan_quarantine_reduce+0x14f/0x170 mm/kasan/quarantine.c:286
   __kasan_slab_alloc+0x23/0x80 mm/kasan/common.c:329
   kasan_slab_alloc include/linux/kasan.h:247 [inline]
   slab_post_alloc_hook mm/slub.c:4086 [inline]
   slab_alloc_node mm/slub.c:4135 [inline]
   kmem_cache_alloc_noprof+0x135/0x2a0 mm/slub.c:4142
   getname_flags+0xb7/0x540 fs/namei.c:139
   do_sys_openat2+0xd2/0x1d0 fs/open.c:1409
   do_sys_open fs/open.c:1430 [inline]
   __do_sys_openat fs/open.c:1446 [inline]
   __se_sys_openat fs/open.c:1441 [inline]
   __x64_sys_openat+0x247/0x2a0 fs/open.c:1441
   do_syscall_x64 arch/x86/entry/common.c:52 [inline]
   do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83
   entry_SYSCALL_64_after_hwframe+0x77/0x7f

  Memory state around the buggy address:
   ffff8880272a8900: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
   ffff8880272a8980: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
  >ffff8880272a8a00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
                              ^
   ffff8880272a8a80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
   ffff8880272a8b00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
  ==================================================================

Reported-by: syzbot+8aaf2df2ef0164ffe1fb@syzkaller.appspotmail.com
Link: https://lore.kernel.org/linux-btrfs/66fb36b1.050a0220.aab67.003b.GAE@google.com/
CC: stable@vger.kernel.org # 4.19+
Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 25d768e67e37..1238a38c59b2 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -4255,6 +4255,17 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
 	/* clear out the rbtree of defraggable inodes */
 	btrfs_cleanup_defrag_inodes(fs_info);
 
+	/*
+	 * Wait for any fixup workers to complete.
+	 * If we don't wait for them here and they are still running by the time
+	 * we call kthread_stop() against the cleaner kthread further below, we
+	 * get an use-after-free on the cleaner because the fixup worker adds an
+	 * inode to the list of delayed iputs and then attempts to wakeup the
+	 * cleaner kthread, which was already stopped and destroyed. We parked
+	 * already the cleaner, but below we run all pending delayed iputs.
+	 */
+	btrfs_flush_workqueue(fs_info->fixup_workers);
+
 	/*
 	 * After we parked the cleaner kthread, ordered extents may have
 	 * completed and created new delayed iputs. If one of the async reclaim
-- 
cgit v1.2.3-70-g09d2


From d6e7ac65d4c106149d08a0ffba39fc516ae3d21b Mon Sep 17 00:00:00 2001
From: Leo Martins <loemra.dev@gmail.com>
Date: Tue, 24 Sep 2024 16:42:29 -0700
Subject: btrfs: disable rate limiting when debug enabled

Disable ratelimiting for btrfs_printk when CONFIG_BTRFS_DEBUG is
enabled. This allows for more verbose output which is often needed by
functions like btrfs_dump_space_info().

Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Leo Martins <loemra.dev@gmail.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/messages.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c
index 77752eec125d..363fd28c0268 100644
--- a/fs/btrfs/messages.c
+++ b/fs/btrfs/messages.c
@@ -239,7 +239,8 @@ void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt,
 	vaf.fmt = fmt;
 	vaf.va = &args;
 
-	if (__ratelimit(ratelimit)) {
+	/* Do not ratelimit if CONFIG_BTRFS_DEBUG is enabled. */
+	if (IS_ENABLED(CONFIG_BTRFS_DEBUG) || __ratelimit(ratelimit)) {
 		if (fs_info) {
 			char statestr[STATE_STRING_BUF_LEN];
 
-- 
cgit v1.2.3-70-g09d2