From b742d7b4f0e03df25c2a772adcded35044b625ca Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 28 Jun 2023 11:04:32 -0700
Subject: xfs: use deferred frees for btree block freeing

Btrees that aren't freespace management trees use the normal extent
allocation and freeing routines for their blocks. Hence when a btree
block is freed, a direct call to xfs_free_extent() is made and the
extent is immediately freed. This puts the entire free space
management btrees under this path, so we are stacking btrees on
btrees in the call stack. The inobt, finobt and refcount btrees
all do this.

However, the bmap btree does not do this - it calls
xfs_free_extent_later() to defer the extent free operation via an
XEFI and hence it gets processed in deferred operation processing
during the commit of the primary transaction (i.e. via intent
chaining).

We need to change xfs_free_extent() to behave in a non-blocking
manner so that we can avoid deadlocks with busy extents near ENOSPC
in transactions that free multiple extents. Inserting or removing a
record from a btree can cause a multi-level tree merge operation and
that will free multiple blocks from the btree in a single
transaction. i.e. we can call xfs_free_extent() multiple times, and
hence the btree manipulation transaction is vulnerable to this busy
extent deadlock vector.

To fix this, convert all the remaining callers of xfs_free_extent()
to use xfs_free_extent_later() to queue XEFIs and hence defer
processing of the extent frees to a context that can be safely
restarted if a deadlock condition is detected.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Chandan Babu R <chandan.babu@oracle.com>
---
 fs/xfs/libxfs/xfs_ialloc.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs/xfs/libxfs/xfs_ialloc.c')

diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 34600f94c2f4..1e5fafbc0cdb 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -1853,8 +1853,8 @@ xfs_difree_inode_chunk(
 		/* not sparse, calculate extent info directly */
 		return xfs_free_extent_later(tp,
 				XFS_AGB_TO_FSB(mp, agno, sagbno),
-				M_IGEO(mp)->ialloc_blks,
-				&XFS_RMAP_OINFO_INODES);
+				M_IGEO(mp)->ialloc_blks, &XFS_RMAP_OINFO_INODES,
+				XFS_AG_RESV_NONE);
 	}
 
 	/* holemask is only 16-bits (fits in an unsigned long) */
@@ -1899,8 +1899,8 @@ xfs_difree_inode_chunk(
 		ASSERT(agbno % mp->m_sb.sb_spino_align == 0);
 		ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
 		error = xfs_free_extent_later(tp,
-				XFS_AGB_TO_FSB(mp, agno, agbno),
-				contigblk, &XFS_RMAP_OINFO_INODES);
+				XFS_AGB_TO_FSB(mp, agno, agbno), contigblk,
+				&XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE);
 		if (error)
 			return error;
 
-- 
cgit v1.2.3-70-g09d2


From 2d7d1e7ea321b0b2810eb00183e21332ee9c4b6f Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong@kernel.org>
Date: Thu, 29 Jun 2023 10:15:45 -0700
Subject: xfs: AGI length should be bounds checked

Similar to the recent patch strengthening the AGF agf_length
verification, the AGI verifier does not check that the AGI length field
is within known good bounds.  This isn't currently checked by runtime
kernel code, yet we assume in many places that it is correct and verify
other metadata against it.

Add length verification to the AGI verifier.  Just like the AGF length
checking, the length of the AGI must be equal to the size of the AG
specified in the superblock, unless it is the last AG in the filesystem.
In that case, it must be less than or equal to sb->sb_agblocks and
greater than XFS_MIN_AG_BLOCKS, which is the smallest AG a growfs
operation will allow to exist.

There's only one place in the filesystem that actually uses agi_length,
but let's not leave it vulnerable to the same weird nonsense that
generates syzbot bugs, eh?

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/libxfs/xfs_alloc.c  | 72 +++++++++++++++++++++++++++++-----------------
 fs/xfs/libxfs/xfs_alloc.h  |  3 ++
 fs/xfs/libxfs/xfs_ialloc.c | 24 +++++++---------
 3 files changed, 60 insertions(+), 39 deletions(-)

(limited to 'fs/xfs/libxfs/xfs_ialloc.c')

diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 200c460a3c7a..3069194527dd 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2956,6 +2956,47 @@ xfs_alloc_put_freelist(
 	return 0;
 }
 
+/*
+ * Check that this AGF/AGI header's sequence number and length matches the AG
+ * number and size in fsblocks.
+ */
+xfs_failaddr_t
+xfs_validate_ag_length(
+	struct xfs_buf		*bp,
+	uint32_t		seqno,
+	uint32_t		length)
+{
+	struct xfs_mount	*mp = bp->b_mount;
+	/*
+	 * During growfs operations, the perag is not fully initialised,
+	 * so we can't use it for any useful checking. growfs ensures we can't
+	 * use it by using uncached buffers that don't have the perag attached
+	 * so we can detect and avoid this problem.
+	 */
+	if (bp->b_pag && seqno != bp->b_pag->pag_agno)
+		return __this_address;
+
+	/*
+	 * Only the last AG in the filesystem is allowed to be shorter
+	 * than the AG size recorded in the superblock.
+	 */
+	if (length != mp->m_sb.sb_agblocks) {
+		/*
+		 * During growfs, the new last AG can get here before we
+		 * have updated the superblock. Give it a pass on the seqno
+		 * check.
+		 */
+		if (bp->b_pag && seqno != mp->m_sb.sb_agcount - 1)
+			return __this_address;
+		if (length < XFS_MIN_AG_BLOCKS)
+			return __this_address;
+		if (length > mp->m_sb.sb_agblocks)
+			return __this_address;
+	}
+
+	return NULL;
+}
+
 /*
  * Verify the AGF is consistent.
  *
@@ -2975,6 +3016,8 @@ xfs_agf_verify(
 {
 	struct xfs_mount	*mp = bp->b_mount;
 	struct xfs_agf		*agf = bp->b_addr;
+	xfs_failaddr_t		fa;
+	uint32_t		agf_seqno = be32_to_cpu(agf->agf_seqno);
 	uint32_t		agf_length = be32_to_cpu(agf->agf_length);
 
 	if (xfs_has_crc(mp)) {
@@ -2993,33 +3036,10 @@ xfs_agf_verify(
 	/*
 	 * Both agf_seqno and agf_length need to validated before anything else
 	 * block number related in the AGF or AGFL can be checked.
-	 *
-	 * During growfs operations, the perag is not fully initialised,
-	 * so we can't use it for any useful checking. growfs ensures we can't
-	 * use it by using uncached buffers that don't have the perag attached
-	 * so we can detect and avoid this problem.
-	 */
-	if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno)
-		return __this_address;
-
-	/*
-	 * Only the last AGF in the filesytsem is allowed to be shorter
-	 * than the AG size recorded in the superblock.
 	 */
-	if (agf_length != mp->m_sb.sb_agblocks) {
-		/*
-		 * During growfs, the new last AGF can get here before we
-		 * have updated the superblock. Give it a pass on the seqno
-		 * check.
-		 */
-		if (bp->b_pag &&
-		    be32_to_cpu(agf->agf_seqno) != mp->m_sb.sb_agcount - 1)
-			return __this_address;
-		if (agf_length < XFS_MIN_AG_BLOCKS)
-			return __this_address;
-		if (agf_length > mp->m_sb.sb_agblocks)
-			return __this_address;
-	}
+	fa = xfs_validate_ag_length(bp, agf_seqno, agf_length);
+	if (fa)
+		return fa;
 
 	if (be32_to_cpu(agf->agf_flfirst) >= xfs_agfl_size(mp))
 		return __this_address;
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index e544ee43fba6..6bb8d295c321 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -273,4 +273,7 @@ extern struct kmem_cache	*xfs_extfree_item_cache;
 int __init xfs_extfree_intent_init_cache(void);
 void xfs_extfree_intent_destroy_cache(void);
 
+xfs_failaddr_t xfs_validate_ag_length(struct xfs_buf *bp, uint32_t seqno,
+		uint32_t length);
+
 #endif	/* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 1e5fafbc0cdb..b83e54c70906 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -2486,11 +2486,14 @@ xfs_ialloc_log_agi(
 
 static xfs_failaddr_t
 xfs_agi_verify(
-	struct xfs_buf	*bp)
+	struct xfs_buf		*bp)
 {
-	struct xfs_mount *mp = bp->b_mount;
-	struct xfs_agi	*agi = bp->b_addr;
-	int		i;
+	struct xfs_mount	*mp = bp->b_mount;
+	struct xfs_agi		*agi = bp->b_addr;
+	xfs_failaddr_t		fa;
+	uint32_t		agi_seqno = be32_to_cpu(agi->agi_seqno);
+	uint32_t		agi_length = be32_to_cpu(agi->agi_length);
+	int			i;
 
 	if (xfs_has_crc(mp)) {
 		if (!uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid))
@@ -2507,6 +2510,10 @@ xfs_agi_verify(
 	if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)))
 		return __this_address;
 
+	fa = xfs_validate_ag_length(bp, agi_seqno, agi_length);
+	if (fa)
+		return fa;
+
 	if (be32_to_cpu(agi->agi_level) < 1 ||
 	    be32_to_cpu(agi->agi_level) > M_IGEO(mp)->inobt_maxlevels)
 		return __this_address;
@@ -2516,15 +2523,6 @@ xfs_agi_verify(
 	     be32_to_cpu(agi->agi_free_level) > M_IGEO(mp)->inobt_maxlevels))
 		return __this_address;
 
-	/*
-	 * during growfs operations, the perag is not fully initialised,
-	 * so we can't use it for any useful checking. growfs ensures we can't
-	 * use it by using uncached buffers that don't have the perag attached
-	 * so we can detect and avoid this problem.
-	 */
-	if (bp->b_pag && be32_to_cpu(agi->agi_seqno) != bp->b_pag->pag_agno)
-		return __this_address;
-
 	for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
 		if (agi->agi_unlinked[i] == cpu_to_be32(NULLAGINO))
 			continue;
-- 
cgit v1.2.3-70-g09d2