From d40c2865bdbbbba6418436b0a877daebe1d7c63e Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 28 May 2024 12:12:39 +0800 Subject: xfs: avoid redundant AGFL buffer invalidation Currently AGFL blocks can be filled from the following three sources: - allocbt free blocks, as in xfs_allocbt_free_block(); - rmapbt free blocks, as in xfs_rmapbt_free_block(); - refilled from freespace btrees, as in xfs_alloc_fix_freelist(). Originally, allocbt free blocks would be marked as stale only when they put back in the general free space pool as Dave mentioned on IRC, "we don't stale AGF metadata btree blocks when they are returned to the AGFL .. but once they get put back in the general free space pool, we have to make sure the buffers are marked stale as the next user of those blocks might be user data...." However, after commit ca250b1b3d71 ("xfs: invalidate allocbt blocks moved to the free list") and commit edfd9dd54921 ("xfs: move buffer invalidation to xfs_btree_free_block"), even allocbt / bmapbt free blocks will be invalidated immediately since they may fail to pass V5 format validation on writeback even writeback to free space would be safe. IOWs, IMHO currently there is actually no difference of free blocks between AGFL freespace pool and the general free space pool. So let's avoid extra redundant AGFL buffer invalidation, since otherwise we're currently facing unnecessary xfs_log_force() due to xfs_trans_binval() again on buffers already marked as stale before as below: [ 333.507469] Call Trace: [ 333.507862] xfs_buf_find+0x371/0x6a0 <- xfs_buf_lock [ 333.508451] xfs_buf_get_map+0x3f/0x230 [ 333.509062] xfs_trans_get_buf_map+0x11a/0x280 [ 333.509751] xfs_free_agfl_block+0xa1/0xd0 [ 333.510403] xfs_agfl_free_finish_item+0x16e/0x1d0 [ 333.511157] xfs_defer_finish_noroll+0x1ef/0x5c0 [ 333.511871] xfs_defer_finish+0xc/0xa0 [ 333.512471] xfs_itruncate_extents_flags+0x18a/0x5e0 [ 333.513253] xfs_inactive_truncate+0xb8/0x130 [ 333.513930] xfs_inactive+0x223/0x270 xfs_log_force() will take tens of milliseconds with AGF buffer locked. It becomes an unnecessary long latency especially on our PMEM devices with FSDAX enabled and fsops like xfs_reflink_find_shared() at the same time are stuck due to the same AGF lock. Removing the double invalidation on the AGFL blocks does not make this issue go away, but this patch fixes for our workloads in reality and it should also work by the code analysis. Note that I'm not sure I need to remove another redundant one in xfs_alloc_ag_vextent_small() since it's unrelated to our workloads. Also fstests are passed with this patch. Signed-off-by: Gao Xiang Reviewed-by: Dave Chinner Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_alloc.c | 28 +--------------------------- fs/xfs/libxfs/xfs_alloc.h | 6 ++++-- fs/xfs/xfs_extfree_item.c | 4 ++-- 3 files changed, 7 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 6c55a6e88eba..63315ddc46c6 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -1932,7 +1932,7 @@ out_nominleft: /* * Free the extent starting at agno/bno for length. */ -STATIC int +int xfs_free_ag_extent( struct xfs_trans *tp, struct xfs_buf *agbp, @@ -2422,32 +2422,6 @@ xfs_alloc_space_available( return true; } -int -xfs_free_agfl_block( - struct xfs_trans *tp, - xfs_agnumber_t agno, - xfs_agblock_t agbno, - struct xfs_buf *agbp, - struct xfs_owner_info *oinfo) -{ - int error; - struct xfs_buf *bp; - - error = xfs_free_ag_extent(tp, agbp, agno, agbno, 1, oinfo, - XFS_AG_RESV_AGFL); - if (error) - return error; - - error = xfs_trans_get_buf(tp, tp->t_mountp->m_ddev_targp, - XFS_AGB_TO_DADDR(tp->t_mountp, agno, agbno), - tp->t_mountp->m_bsize, 0, &bp); - if (error) - return error; - xfs_trans_binval(tp, bp); - - return 0; -} - /* * Check the agfl fields of the agf for inconsistency or corruption. * diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 0b956f8b9d5a..3dc8e44fea76 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -80,6 +80,10 @@ int xfs_alloc_get_freelist(struct xfs_perag *pag, struct xfs_trans *tp, int xfs_alloc_put_freelist(struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agfbp, struct xfs_buf *agflbp, xfs_agblock_t bno, int btreeblk); +int xfs_free_ag_extent(struct xfs_trans *tp, struct xfs_buf *agbp, + xfs_agnumber_t agno, xfs_agblock_t bno, + xfs_extlen_t len, const struct xfs_owner_info *oinfo, + enum xfs_ag_resv_type type); /* * Compute and fill in value of m_alloc_maxlevels. @@ -194,8 +198,6 @@ int xfs_alloc_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags, struct xfs_buf **agfbpp); int xfs_alloc_read_agfl(struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf **bpp); -int xfs_free_agfl_block(struct xfs_trans *, xfs_agnumber_t, xfs_agblock_t, - struct xfs_buf *, struct xfs_owner_info *); int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, uint32_t alloc_flags); int xfs_free_extent_fix_freelist(struct xfs_trans *tp, struct xfs_perag *pag, struct xfs_buf **agbp); diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 8c382f092332..01ebbd7691a5 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -547,8 +547,8 @@ xfs_agfl_free_finish_item( error = xfs_alloc_read_agf(xefi->xefi_pag, tp, 0, &agbp); if (!error) - error = xfs_free_agfl_block(tp, xefi->xefi_pag->pag_agno, - agbno, agbp, &oinfo); + error = xfs_free_ag_extent(tp, agbp, xefi->xefi_pag->pag_agno, + agbno, 1, &oinfo, XFS_AG_RESV_AGFL); next_extent = efdp->efd_next_extent; ASSERT(next_extent < efdp->efd_format.efd_nextents); -- cgit v1.2.3-70-g09d2 From d3b689d7c711a9f36d3e48db9eaa75784a892f4c Mon Sep 17 00:00:00 2001 From: John Garry Date: Tue, 28 May 2024 17:15:09 +0000 Subject: xfs: Fix xfs_flush_unmap_range() range for RT Currently xfs_flush_unmap_range() does unmap for a full RT extent range, which we also want to ensure is clean and idle. This code change is originally from Dave Chinner. Reviewed-by: Christoph Hellwig 4 Reviewed-by: Darrick J. Wong Signed-off-by: John Garry Signed-off-by: Chandan Babu R --- fs/xfs/xfs_bmap_util.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index a4d9fbc21b83..501068eab502 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -808,14 +808,18 @@ xfs_flush_unmap_range( xfs_off_t offset, xfs_off_t len) { - struct xfs_mount *mp = ip->i_mount; struct inode *inode = VFS_I(ip); xfs_off_t rounding, start, end; int error; - rounding = max_t(xfs_off_t, mp->m_sb.sb_blocksize, PAGE_SIZE); - start = round_down(offset, rounding); - end = round_up(offset + len, rounding) - 1; + /* + * Make sure we extend the flush out to extent alignment + * boundaries so any extent range overlapping the start/end + * of the modification we are about to do is clean and idle. + */ + rounding = max_t(xfs_off_t, xfs_inode_alloc_unitsize(ip), PAGE_SIZE); + start = rounddown_64(offset, rounding); + end = roundup_64(offset + len, rounding) - 1; error = filemap_write_and_wait_range(inode->i_mapping, start, end); if (error) -- cgit v1.2.3-70-g09d2 From f23660f059470ec7043748da7641e84183c23bc8 Mon Sep 17 00:00:00 2001 From: John Garry Date: Tue, 28 May 2024 17:15:10 +0000 Subject: xfs: Fix xfs_prepare_shift() range for RT The RT extent range must be considered in the xfs_flush_unmap_range() call to stabilize the boundary. This code change is originally from Dave Chinner. Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: John Garry Signed-off-by: Chandan Babu R --- fs/xfs/xfs_bmap_util.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 501068eab502..fe2e2c930975 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -902,7 +902,7 @@ xfs_prepare_shift( struct xfs_inode *ip, loff_t offset) { - struct xfs_mount *mp = ip->i_mount; + unsigned int rounding; int error; /* @@ -920,11 +920,13 @@ xfs_prepare_shift( * with the full range of the operation. If we don't, a COW writeback * completion could race with an insert, front merge with the start * extent (after split) during the shift and corrupt the file. Start - * with the block just prior to the start to stabilize the boundary. + * with the allocation unit just prior to the start to stabilize the + * boundary. */ - offset = round_down(offset, mp->m_sb.sb_blocksize); + rounding = xfs_inode_alloc_unitsize(ip); + offset = rounddown_64(offset, rounding); if (offset) - offset -= mp->m_sb.sb_blocksize; + offset -= rounding; /* * Writeback and invalidate cache for the remainder of the file as we're -- cgit v1.2.3-70-g09d2 From fb63435b7c7dc112b1ae1baea5486e0a6e27b196 Mon Sep 17 00:00:00 2001 From: lei lu Date: Mon, 3 Jun 2024 17:46:08 +0800 Subject: xfs: add bounds checking to xlog_recover_process_data There is a lack of verification of the space occupied by fixed members of xlog_op_header in the xlog_recover_process_data. We can create a crafted image to trigger an out of bounds read by following these steps: 1) Mount an image of xfs, and do some file operations to leave records 2) Before umounting, copy the image for subsequent steps to simulate abnormal exit. Because umount will ensure that tail_blk and head_blk are the same, which will result in the inability to enter xlog_recover_process_data 3) Write a tool to parse and modify the copied image in step 2 4) Make the end of the xlog_op_header entries only 1 byte away from xlog_rec_header->h_size 5) xlog_rec_header->h_num_logops++ 6) Modify xlog_rec_header->h_crc Fix: Add a check to make sure there is sufficient space to access fixed members of xlog_op_header. Signed-off-by: lei lu Reviewed-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/xfs_log_recover.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 4fe627991e86..409b645ce799 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2489,7 +2489,10 @@ xlog_recover_process_data( ohead = (struct xlog_op_header *)dp; dp += sizeof(*ohead); - ASSERT(dp <= end); + if (dp > end) { + xfs_warn(log->l_mp, "%s: op header overrun", __func__); + return -EFSCORRUPTED; + } /* errors will abort recovery */ error = xlog_recover_process_ophdr(log, rhash, rhead, ohead, -- cgit v1.2.3-70-g09d2 From 0c7fcdb6d06cdf8b19b57c17605215b06afa864a Mon Sep 17 00:00:00 2001 From: lei lu Date: Fri, 14 Jun 2024 10:22:53 +0800 Subject: xfs: don't walk off the end of a directory data block This adds sanity checks for xfs_dir2_data_unused and xfs_dir2_data_entry to make sure don't stray beyond valid memory region. Before patching, the loop simply checks that the start offset of the dup and dep is within the range. So in a crafted image, if last entry is xfs_dir2_data_unused, we can change dup->length to dup->length-1 and leave 1 byte of space. In the next traversal, this space will be considered as dup or dep. We may encounter an out of bound read when accessing the fixed members. In the patch, we make sure that the remaining bytes large enough to hold an unused entry before accessing xfs_dir2_data_unused and xfs_dir2_data_unused is XFS_DIR2_DATA_ALIGN byte aligned. We also make sure that the remaining bytes large enough to hold a dirent with a single-byte name before accessing xfs_dir2_data_entry. Signed-off-by: lei lu Reviewed-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_dir2_data.c | 31 ++++++++++++++++++++++++++----- fs/xfs/libxfs/xfs_dir2_priv.h | 7 +++++++ 2 files changed, 33 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index ea0b9628df18..a16b05c43e2e 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -178,6 +178,14 @@ __xfs_dir3_data_check( while (offset < end) { struct xfs_dir2_data_unused *dup = bp->b_addr + offset; struct xfs_dir2_data_entry *dep = bp->b_addr + offset; + unsigned int reclen; + + /* + * Are the remaining bytes large enough to hold an + * unused entry? + */ + if (offset > end - xfs_dir2_data_unusedsize(1)) + return __this_address; /* * If it's unused, look for the space in the bestfree table. @@ -187,9 +195,13 @@ __xfs_dir3_data_check( if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { xfs_failaddr_t fa; + reclen = xfs_dir2_data_unusedsize( + be16_to_cpu(dup->length)); if (lastfree != 0) return __this_address; - if (offset + be16_to_cpu(dup->length) > end) + if (be16_to_cpu(dup->length) != reclen) + return __this_address; + if (offset + reclen > end) return __this_address; if (be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) != offset) @@ -207,10 +219,18 @@ __xfs_dir3_data_check( be16_to_cpu(bf[2].length)) return __this_address; } - offset += be16_to_cpu(dup->length); + offset += reclen; lastfree = 1; continue; } + + /* + * This is not an unused entry. Are the remaining bytes + * large enough for a dirent with a single-byte name? + */ + if (offset > end - xfs_dir2_data_entsize(mp, 1)) + return __this_address; + /* * It's a real entry. Validate the fields. * If this is a block directory then make sure it's @@ -219,9 +239,10 @@ __xfs_dir3_data_check( */ if (dep->namelen == 0) return __this_address; - if (!xfs_verify_dir_ino(mp, be64_to_cpu(dep->inumber))) + reclen = xfs_dir2_data_entsize(mp, dep->namelen); + if (offset + reclen > end) return __this_address; - if (offset + xfs_dir2_data_entsize(mp, dep->namelen) > end) + if (!xfs_verify_dir_ino(mp, be64_to_cpu(dep->inumber))) return __this_address; if (be16_to_cpu(*xfs_dir2_data_entry_tag_p(mp, dep)) != offset) return __this_address; @@ -245,7 +266,7 @@ __xfs_dir3_data_check( if (i >= be32_to_cpu(btp->count)) return __this_address; } - offset += xfs_dir2_data_entsize(mp, dep->namelen); + offset += reclen; } /* * Need to have seen all the entries and all the bestfree slots. diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index 3befb32509fa..10041350274a 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -189,6 +189,13 @@ void xfs_dir2_sf_put_ftype(struct xfs_mount *mp, extern int xfs_readdir(struct xfs_trans *tp, struct xfs_inode *dp, struct dir_context *ctx, size_t bufsize); +static inline unsigned int +xfs_dir2_data_unusedsize( + unsigned int len) +{ + return round_up(len, XFS_DIR2_DATA_ALIGN); +} + static inline unsigned int xfs_dir2_data_entsize( struct xfs_mount *mp, -- cgit v1.2.3-70-g09d2 From 8626b67acfa424834ad2f321cecc1f768e7f0106 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 23 Jun 2024 07:44:26 +0200 Subject: xfs: move the dio write relocking out of xfs_ilock_for_iomap About half of xfs_ilock_for_iomap deals with a special case for direct I/O writes to COW files that need to take the ilock exclusively. Move this code into the one callers that cares and simplify xfs_ilock_for_iomap. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/xfs_iomap.c | 71 ++++++++++++++++++++++++++---------------------------- 1 file changed, 34 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 414903885ab9..72c981e3dc92 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -717,53 +717,30 @@ imap_needs_cow( return true; } +/* + * Extents not yet cached requires exclusive access, don't block for + * IOMAP_NOWAIT. + * + * This is basically an opencoded xfs_ilock_data_map_shared() call, but with + * support for IOMAP_NOWAIT. + */ static int xfs_ilock_for_iomap( struct xfs_inode *ip, unsigned flags, unsigned *lockmode) { - unsigned int mode = *lockmode; - bool is_write = flags & (IOMAP_WRITE | IOMAP_ZERO); - - /* - * COW writes may allocate delalloc space or convert unwritten COW - * extents, so we need to make sure to take the lock exclusively here. - */ - if (xfs_is_cow_inode(ip) && is_write) - mode = XFS_ILOCK_EXCL; - - /* - * Extents not yet cached requires exclusive access, don't block. This - * is an opencoded xfs_ilock_data_map_shared() call but with - * non-blocking behaviour. - */ - if (xfs_need_iread_extents(&ip->i_df)) { - if (flags & IOMAP_NOWAIT) - return -EAGAIN; - mode = XFS_ILOCK_EXCL; - } - -relock: if (flags & IOMAP_NOWAIT) { - if (!xfs_ilock_nowait(ip, mode)) + if (xfs_need_iread_extents(&ip->i_df)) + return -EAGAIN; + if (!xfs_ilock_nowait(ip, *lockmode)) return -EAGAIN; } else { - xfs_ilock(ip, mode); + if (xfs_need_iread_extents(&ip->i_df)) + *lockmode = XFS_ILOCK_EXCL; + xfs_ilock(ip, *lockmode); } - /* - * The reflink iflag could have changed since the earlier unlocked - * check, so if we got ILOCK_SHARED for a write and but we're now a - * reflink inode we have to switch to ILOCK_EXCL and relock. - */ - if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_cow_inode(ip)) { - xfs_iunlock(ip, mode); - mode = XFS_ILOCK_EXCL; - goto relock; - } - - *lockmode = mode; return 0; } @@ -801,7 +778,7 @@ xfs_direct_write_iomap_begin( int nimaps = 1, error = 0; bool shared = false; u16 iomap_flags = 0; - unsigned int lockmode = XFS_ILOCK_SHARED; + unsigned int lockmode; u64 seq; ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO)); @@ -817,10 +794,30 @@ xfs_direct_write_iomap_begin( if (offset + length > i_size_read(inode)) iomap_flags |= IOMAP_F_DIRTY; + /* + * COW writes may allocate delalloc space or convert unwritten COW + * extents, so we need to make sure to take the lock exclusively here. + */ + if (xfs_is_cow_inode(ip)) + lockmode = XFS_ILOCK_EXCL; + else + lockmode = XFS_ILOCK_SHARED; + +relock: error = xfs_ilock_for_iomap(ip, flags, &lockmode); if (error) return error; + /* + * The reflink iflag could have changed since the earlier unlocked + * check, check if it again and relock if needed. + */ + if (xfs_is_cow_inode(ip) && lockmode == XFS_ILOCK_SHARED) { + xfs_iunlock(ip, lockmode); + lockmode = XFS_ILOCK_EXCL; + goto relock; + } + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, 0); if (error) -- cgit v1.2.3-70-g09d2 From 29bc0dd0a2f6d738fd339826af57cd17f7a39bd9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 23 Jun 2024 07:44:27 +0200 Subject: xfs: cleanup xfs_ilock_iocb_for_write Move the relock path out of the straight line and add a comment explaining why it exists. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/xfs_file.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index b240ea5241dc..74c2c8d253e6 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -213,14 +213,18 @@ xfs_ilock_iocb_for_write( if (ret) return ret; - if (*lock_mode == XFS_IOLOCK_EXCL) - return 0; - if (!xfs_iflags_test(ip, XFS_IREMAPPING)) - return 0; + /* + * If a reflink remap is in progress we always need to take the iolock + * exclusively to wait for it to finish. + */ + if (*lock_mode == XFS_IOLOCK_SHARED && + xfs_iflags_test(ip, XFS_IREMAPPING)) { + xfs_iunlock(ip, *lock_mode); + *lock_mode = XFS_IOLOCK_EXCL; + return xfs_ilock_iocb(iocb, *lock_mode); + } - xfs_iunlock(ip, *lock_mode); - *lock_mode = XFS_IOLOCK_EXCL; - return xfs_ilock_iocb(iocb, *lock_mode); + return 0; } static unsigned int -- cgit v1.2.3-70-g09d2 From 9092b1de35a45ec7291156382db7a7ee13bdbb27 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 23 Jun 2024 07:44:28 +0200 Subject: xfs: simplify xfs_dax_fault Replace the separate stub with an IS_ENABLED check, and take the call to dax_finish_sync_fault into xfs_dax_fault instead of leaving it in the caller. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/xfs_file.c | 34 +++++++++++++--------------------- 1 file changed, 13 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 74c2c8d253e6..8aab2f66fe01 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1251,31 +1251,27 @@ xfs_file_llseek( return vfs_setpos(file, offset, inode->i_sb->s_maxbytes); } -#ifdef CONFIG_FS_DAX static inline vm_fault_t xfs_dax_fault( struct vm_fault *vmf, unsigned int order, - bool write_fault, - pfn_t *pfn) + bool write_fault) { - return dax_iomap_fault(vmf, order, pfn, NULL, + vm_fault_t ret; + pfn_t pfn; + + if (!IS_ENABLED(CONFIG_FS_DAX)) { + ASSERT(0); + return VM_FAULT_SIGBUS; + } + ret = dax_iomap_fault(vmf, order, &pfn, NULL, (write_fault && !vmf->cow_page) ? &xfs_dax_write_iomap_ops : &xfs_read_iomap_ops); + if (ret & VM_FAULT_NEEDDSYNC) + ret = dax_finish_sync_fault(vmf, order, pfn); + return ret; } -#else -static inline vm_fault_t -xfs_dax_fault( - struct vm_fault *vmf, - unsigned int order, - bool write_fault, - pfn_t *pfn) -{ - ASSERT(0); - return VM_FAULT_SIGBUS; -} -#endif /* * Locking for serialisation of IO during page faults. This results in a lock @@ -1309,11 +1305,7 @@ __xfs_filemap_fault( lock_mode = xfs_ilock_for_write_fault(XFS_I(inode)); if (IS_DAX(inode)) { - pfn_t pfn; - - ret = xfs_dax_fault(vmf, order, write_fault, &pfn); - if (ret & VM_FAULT_NEEDDSYNC) - ret = dax_finish_sync_fault(vmf, order, pfn); + ret = xfs_dax_fault(vmf, order, write_fault); } else if (write_fault) { ret = iomap_page_mkwrite(vmf, &xfs_page_mkwrite_iomap_ops); } else { -- cgit v1.2.3-70-g09d2 From 6a39ec1d394458e59f411edecf7b08ce34bdc7c8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 23 Jun 2024 07:44:29 +0200 Subject: xfs: refactor __xfs_filemap_fault Split the write fault and DAX fault handling into separate helpers so that the main fault handler is easier to follow. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/xfs_file.c | 71 +++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 45 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 8aab2f66fe01..32a2cd6ec82e 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1252,7 +1252,7 @@ xfs_file_llseek( } static inline vm_fault_t -xfs_dax_fault( +xfs_dax_fault_locked( struct vm_fault *vmf, unsigned int order, bool write_fault) @@ -1273,6 +1273,45 @@ xfs_dax_fault( return ret; } +static vm_fault_t +xfs_dax_read_fault( + struct vm_fault *vmf, + unsigned int order) +{ + struct xfs_inode *ip = XFS_I(file_inode(vmf->vma->vm_file)); + unsigned int lock_mode; + vm_fault_t ret; + + lock_mode = xfs_ilock_for_write_fault(ip); + ret = xfs_dax_fault_locked(vmf, order, false); + xfs_iunlock(ip, lock_mode); + + return ret; +} + +static vm_fault_t +xfs_write_fault( + struct vm_fault *vmf, + unsigned int order) +{ + struct inode *inode = file_inode(vmf->vma->vm_file); + unsigned int lock_mode; + vm_fault_t ret; + + sb_start_pagefault(inode->i_sb); + file_update_time(vmf->vma->vm_file); + + lock_mode = xfs_ilock_for_write_fault(XFS_I(inode)); + if (IS_DAX(inode)) + ret = xfs_dax_fault_locked(vmf, order, true); + else + ret = iomap_page_mkwrite(vmf, &xfs_page_mkwrite_iomap_ops); + xfs_iunlock(XFS_I(inode), lock_mode); + + sb_end_pagefault(inode->i_sb); + return ret; +} + /* * Locking for serialisation of IO during page faults. This results in a lock * ordering of: @@ -1290,34 +1329,14 @@ __xfs_filemap_fault( bool write_fault) { struct inode *inode = file_inode(vmf->vma->vm_file); - struct xfs_inode *ip = XFS_I(inode); - vm_fault_t ret; - unsigned int lock_mode = 0; - - trace_xfs_filemap_fault(ip, order, write_fault); - if (write_fault) { - sb_start_pagefault(inode->i_sb); - file_update_time(vmf->vma->vm_file); - } - - if (IS_DAX(inode) || write_fault) - lock_mode = xfs_ilock_for_write_fault(XFS_I(inode)); - - if (IS_DAX(inode)) { - ret = xfs_dax_fault(vmf, order, write_fault); - } else if (write_fault) { - ret = iomap_page_mkwrite(vmf, &xfs_page_mkwrite_iomap_ops); - } else { - ret = filemap_fault(vmf); - } - - if (lock_mode) - xfs_iunlock(XFS_I(inode), lock_mode); + trace_xfs_filemap_fault(XFS_I(inode), order, write_fault); if (write_fault) - sb_end_pagefault(inode->i_sb); - return ret; + return xfs_write_fault(vmf, order); + if (IS_DAX(inode)) + return xfs_dax_read_fault(vmf, order); + return filemap_fault(vmf); } static inline bool -- cgit v1.2.3-70-g09d2 From 4e82fa11fbbcc5426366dc2ddc839fd56b9d53de Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 23 Jun 2024 07:44:30 +0200 Subject: xfs: always take XFS_MMAPLOCK shared in xfs_dax_read_fault After the previous refactoring, xfs_dax_fault is now never used for write faults, so don't bother with the xfs_ilock_for_write_fault logic to protect against writes when remapping is in progress. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/xfs_file.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 32a2cd6ec82e..904be41f3e5e 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1279,12 +1279,11 @@ xfs_dax_read_fault( unsigned int order) { struct xfs_inode *ip = XFS_I(file_inode(vmf->vma->vm_file)); - unsigned int lock_mode; vm_fault_t ret; - lock_mode = xfs_ilock_for_write_fault(ip); + xfs_ilock(ip, XFS_MMAPLOCK_SHARED); ret = xfs_dax_fault_locked(vmf, order, false); - xfs_iunlock(ip, lock_mode); + xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); return ret; } -- cgit v1.2.3-70-g09d2 From 4818fd60db5feeeecb84d36d0162c3fb3eccb522 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 23 Jun 2024 07:44:31 +0200 Subject: xfs: fold xfs_ilock_for_write_fault into xfs_write_fault Now that the page fault handler has been refactored, the only caller of xfs_ilock_for_write_fault is simple enough and calls it unconditionally. Fold the logic and expand the comments explaining it. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/xfs_file.c | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 904be41f3e5e..4cdc54dc9686 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -227,21 +227,6 @@ xfs_ilock_iocb_for_write( return 0; } -static unsigned int -xfs_ilock_for_write_fault( - struct xfs_inode *ip) -{ - /* get a shared lock if no remapping in progress */ - xfs_ilock(ip, XFS_MMAPLOCK_SHARED); - if (!xfs_iflags_test(ip, XFS_IREMAPPING)) - return XFS_MMAPLOCK_SHARED; - - /* wait for remapping to complete */ - xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); - xfs_ilock(ip, XFS_MMAPLOCK_EXCL); - return XFS_MMAPLOCK_EXCL; -} - STATIC ssize_t xfs_file_dio_read( struct kiocb *iocb, @@ -1294,18 +1279,30 @@ xfs_write_fault( unsigned int order) { struct inode *inode = file_inode(vmf->vma->vm_file); - unsigned int lock_mode; + struct xfs_inode *ip = XFS_I(inode); + unsigned int lock_mode = XFS_MMAPLOCK_SHARED; vm_fault_t ret; sb_start_pagefault(inode->i_sb); file_update_time(vmf->vma->vm_file); - lock_mode = xfs_ilock_for_write_fault(XFS_I(inode)); + /* + * Normally we only need the shared mmaplock, but if a reflink remap is + * in progress we take the exclusive lock to wait for the remap to + * finish before taking a write fault. + */ + xfs_ilock(ip, XFS_MMAPLOCK_SHARED); + if (xfs_iflags_test(ip, XFS_IREMAPPING)) { + xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + lock_mode = XFS_MMAPLOCK_EXCL; + } + if (IS_DAX(inode)) ret = xfs_dax_fault_locked(vmf, order, true); else ret = iomap_page_mkwrite(vmf, &xfs_page_mkwrite_iomap_ops); - xfs_iunlock(XFS_I(inode), lock_mode); + xfs_iunlock(ip, lock_mode); sb_end_pagefault(inode->i_sb); return ret; -- cgit v1.2.3-70-g09d2 From a330cae8a7147890262b06e1aa13db048e3b130f Mon Sep 17 00:00:00 2001 From: Wenchao Hao Date: Thu, 6 Jun 2024 17:17:54 +0800 Subject: xfs: Remove header files which are included more than once Following warning is reported, so remove these duplicated header including: ./fs/xfs/libxfs/xfs_trans_resv.c: xfs_da_format.h is included more than once. ./fs/xfs/scrub/quota_repair.c: xfs_format.h is included more than once. ./fs/xfs/xfs_handle.c: xfs_da_btree.h is included more than once. ./fs/xfs/xfs_qm_bhv.c: xfs_mount.h is included more than once. ./fs/xfs/xfs_trace.c: xfs_bmap.h is included more than once. This is just a clean code, no logic changed. Signed-off-by: Wenchao Hao Reviewed-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_trans_resv.c | 1 - fs/xfs/scrub/quota_repair.c | 1 - fs/xfs/xfs_handle.c | 1 - fs/xfs/xfs_qm_bhv.c | 1 - fs/xfs/xfs_trace.c | 1 - 5 files changed, 5 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 6dbe6e7251e7..3dc8f785bf29 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -22,7 +22,6 @@ #include "xfs_rtbitmap.h" #include "xfs_attr_item.h" #include "xfs_log.h" -#include "xfs_da_format.h" #define _ALLOC true #define _FREE false diff --git a/fs/xfs/scrub/quota_repair.c b/fs/xfs/scrub/quota_repair.c index 90cd1512bba9..cd51f10f2920 100644 --- a/fs/xfs/scrub/quota_repair.c +++ b/fs/xfs/scrub/quota_repair.c @@ -12,7 +12,6 @@ #include "xfs_defer.h" #include "xfs_btree.h" #include "xfs_bit.h" -#include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans.h" #include "xfs_sb.h" diff --git a/fs/xfs/xfs_handle.c b/fs/xfs/xfs_handle.c index a3f16e9b6fe5..cf5acbd3c7ca 100644 --- a/fs/xfs/xfs_handle.c +++ b/fs/xfs/xfs_handle.c @@ -21,7 +21,6 @@ #include "xfs_attr.h" #include "xfs_ioctl.h" #include "xfs_parent.h" -#include "xfs_da_btree.h" #include "xfs_handle.h" #include "xfs_health.h" #include "xfs_icache.h" diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index 271c1021c733..a11436579877 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -11,7 +11,6 @@ #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_quota.h" -#include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_qm.h" diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 9c7fbaae2717..e1ec56d95791 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -38,7 +38,6 @@ #include "xfs_iomap.h" #include "xfs_buf_mem.h" #include "xfs_btree_mem.h" -#include "xfs_bmap.h" #include "xfs_exchmaps.h" #include "xfs_exchrange.h" #include "xfs_parent.h" -- cgit v1.2.3-70-g09d2 From 3ba3ab1f6719287674cf77a1208944cf38ef71c7 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 24 Jun 2024 08:04:21 -0700 Subject: xfs: enable FITRIM on the realtime device Implement FITRIM for the realtime device by pretending that it's "space" immediately after the data device. We have to hold the rtbitmap ILOCK while the discard operations are ongoing because there's no busy extent tracking for the rt volume to prevent reallocations. Cc: Konst Mayer Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/xfs_discard.c | 303 +++++++++++++++++++++++++++++++++++++++++++++++---- fs/xfs/xfs_trace.h | 29 +++++ 2 files changed, 308 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 25fe3b932b5a..6f0fc7fe1f2b 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -20,6 +20,7 @@ #include "xfs_log.h" #include "xfs_ag.h" #include "xfs_health.h" +#include "xfs_rtbitmap.h" /* * Notes on an efficient, low latency fstrim algorithm @@ -322,7 +323,7 @@ xfs_trim_should_stop(void) * we found in the last batch as the key to start the next. */ static int -xfs_trim_extents( +xfs_trim_perag_extents( struct xfs_perag *pag, xfs_agblock_t start, xfs_agblock_t end, @@ -383,6 +384,259 @@ xfs_trim_extents( } +static int +xfs_trim_datadev_extents( + struct xfs_mount *mp, + xfs_daddr_t start, + xfs_daddr_t end, + xfs_extlen_t minlen, + uint64_t *blocks_trimmed) +{ + xfs_agnumber_t start_agno, end_agno; + xfs_agblock_t start_agbno, end_agbno; + xfs_daddr_t ddev_end; + struct xfs_perag *pag; + int last_error = 0, error; + + ddev_end = min_t(xfs_daddr_t, end, + XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1); + + start_agno = xfs_daddr_to_agno(mp, start); + start_agbno = xfs_daddr_to_agbno(mp, start); + end_agno = xfs_daddr_to_agno(mp, ddev_end); + end_agbno = xfs_daddr_to_agbno(mp, ddev_end); + + for_each_perag_range(mp, start_agno, end_agno, pag) { + xfs_agblock_t agend = pag->block_count; + + if (start_agno == end_agno) + agend = end_agbno; + error = xfs_trim_perag_extents(pag, start_agbno, agend, minlen, + blocks_trimmed); + if (error) + last_error = error; + + if (xfs_trim_should_stop()) { + xfs_perag_rele(pag); + break; + } + start_agbno = 0; + } + + return last_error; +} + +#ifdef CONFIG_XFS_RT +struct xfs_trim_rtdev { + /* list of rt extents to free */ + struct list_head extent_list; + + /* pointer to count of blocks trimmed */ + uint64_t *blocks_trimmed; + + /* minimum length that caller allows us to trim */ + xfs_rtblock_t minlen_fsb; + + /* restart point for the rtbitmap walk */ + xfs_rtxnum_t restart_rtx; + + /* stopping point for the current rtbitmap walk */ + xfs_rtxnum_t stop_rtx; +}; + +struct xfs_rtx_busy { + struct list_head list; + xfs_rtblock_t bno; + xfs_rtblock_t length; +}; + +static void +xfs_discard_free_rtdev_extents( + struct xfs_trim_rtdev *tr) +{ + struct xfs_rtx_busy *busyp, *n; + + list_for_each_entry_safe(busyp, n, &tr->extent_list, list) { + list_del_init(&busyp->list); + kfree(busyp); + } +} + +/* + * Walk the discard list and issue discards on all the busy extents in the + * list. We plug and chain the bios so that we only need a single completion + * call to clear all the busy extents once the discards are complete. + */ +static int +xfs_discard_rtdev_extents( + struct xfs_mount *mp, + struct xfs_trim_rtdev *tr) +{ + struct block_device *bdev = mp->m_rtdev_targp->bt_bdev; + struct xfs_rtx_busy *busyp; + struct bio *bio = NULL; + struct blk_plug plug; + xfs_rtblock_t start = NULLRTBLOCK, length = 0; + int error = 0; + + blk_start_plug(&plug); + list_for_each_entry(busyp, &tr->extent_list, list) { + if (start == NULLRTBLOCK) + start = busyp->bno; + length += busyp->length; + + trace_xfs_discard_rtextent(mp, busyp->bno, busyp->length); + + error = __blkdev_issue_discard(bdev, + XFS_FSB_TO_BB(mp, busyp->bno), + XFS_FSB_TO_BB(mp, busyp->length), + GFP_NOFS, &bio); + if (error) + break; + } + xfs_discard_free_rtdev_extents(tr); + + if (bio) { + error = submit_bio_wait(bio); + if (error == -EOPNOTSUPP) + error = 0; + if (error) + xfs_info(mp, + "discard failed for rtextent [0x%llx,%llu], error %d", + (unsigned long long)start, + (unsigned long long)length, + error); + bio_put(bio); + } + blk_finish_plug(&plug); + + return error; +} + +static int +xfs_trim_gather_rtextent( + struct xfs_mount *mp, + struct xfs_trans *tp, + const struct xfs_rtalloc_rec *rec, + void *priv) +{ + struct xfs_trim_rtdev *tr = priv; + struct xfs_rtx_busy *busyp; + xfs_rtblock_t rbno, rlen; + + if (rec->ar_startext > tr->stop_rtx) { + /* + * If we've scanned a large number of rtbitmap blocks, update + * the cursor to point at this extent so we restart the next + * batch from this extent. + */ + tr->restart_rtx = rec->ar_startext; + return -ECANCELED; + } + + rbno = xfs_rtx_to_rtb(mp, rec->ar_startext); + rlen = xfs_rtx_to_rtb(mp, rec->ar_extcount); + + /* Ignore too small. */ + if (rlen < tr->minlen_fsb) { + trace_xfs_discard_rttoosmall(mp, rbno, rlen); + return 0; + } + + busyp = kzalloc(sizeof(struct xfs_rtx_busy), GFP_KERNEL); + if (!busyp) + return -ENOMEM; + + busyp->bno = rbno; + busyp->length = rlen; + INIT_LIST_HEAD(&busyp->list); + list_add_tail(&busyp->list, &tr->extent_list); + *tr->blocks_trimmed += rlen; + + tr->restart_rtx = rec->ar_startext + rec->ar_extcount; + return 0; +} + +static int +xfs_trim_rtdev_extents( + struct xfs_mount *mp, + xfs_daddr_t start, + xfs_daddr_t end, + xfs_daddr_t minlen, + uint64_t *blocks_trimmed) +{ + struct xfs_rtalloc_rec low = { }; + struct xfs_rtalloc_rec high = { }; + struct xfs_trim_rtdev tr = { + .blocks_trimmed = blocks_trimmed, + .minlen_fsb = XFS_BB_TO_FSB(mp, minlen), + }; + struct xfs_trans *tp; + xfs_daddr_t rtdev_daddr; + int error; + + INIT_LIST_HEAD(&tr.extent_list); + + /* Shift the start and end downwards to match the rt device. */ + rtdev_daddr = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); + if (start > rtdev_daddr) + start -= rtdev_daddr; + else + start = 0; + + if (end <= rtdev_daddr) + return 0; + end -= rtdev_daddr; + + error = xfs_trans_alloc_empty(mp, &tp); + if (error) + return error; + + end = min_t(xfs_daddr_t, end, + XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks) - 1); + + /* Convert the rt blocks to rt extents */ + low.ar_startext = xfs_rtb_to_rtxup(mp, XFS_BB_TO_FSB(mp, start)); + high.ar_startext = xfs_rtb_to_rtx(mp, XFS_BB_TO_FSBT(mp, end)); + + /* + * Walk the free ranges between low and high. The query_range function + * trims the extents returned. + */ + do { + tr.stop_rtx = low.ar_startext + (mp->m_sb.sb_blocksize * NBBY); + xfs_rtbitmap_lock_shared(mp, XFS_RBMLOCK_BITMAP); + error = xfs_rtalloc_query_range(mp, tp, &low, &high, + xfs_trim_gather_rtextent, &tr); + + if (error == -ECANCELED) + error = 0; + if (error) { + xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP); + xfs_discard_free_rtdev_extents(&tr); + break; + } + + if (list_empty(&tr.extent_list)) { + xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP); + break; + } + + error = xfs_discard_rtdev_extents(mp, &tr); + xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP); + if (error) + break; + + low.ar_startext = tr.restart_rtx; + } while (!xfs_trim_should_stop() && low.ar_startext <= high.ar_startext); + + xfs_trans_cancel(tp); + return error; +} +#else +# define xfs_trim_rtdev_extents(m,s,e,n,b) (-EOPNOTSUPP) +#endif /* CONFIG_XFS_RT */ + /* * trim a range of the filesystem. * @@ -391,28 +645,37 @@ xfs_trim_extents( * addressing. FSB addressing is sparse (AGNO|AGBNO), while the incoming format * is a linear address range. Hence we need to use DADDR based conversions and * comparisons for determining the correct offset and regions to trim. + * + * The realtime device is mapped into the FITRIM "address space" immediately + * after the data device. */ int xfs_ioc_trim( struct xfs_mount *mp, struct fstrim_range __user *urange) { - struct xfs_perag *pag; unsigned int granularity = bdev_discard_granularity(mp->m_ddev_targp->bt_bdev); + struct block_device *rt_bdev = NULL; struct fstrim_range range; xfs_daddr_t start, end; xfs_extlen_t minlen; - xfs_agnumber_t start_agno, end_agno; - xfs_agblock_t start_agbno, end_agbno; + xfs_rfsblock_t max_blocks; uint64_t blocks_trimmed = 0; int error, last_error = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev)) + if (mp->m_rtdev_targp && + bdev_max_discard_sectors(mp->m_rtdev_targp->bt_bdev)) + rt_bdev = mp->m_rtdev_targp->bt_bdev; + if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev) && !rt_bdev) return -EOPNOTSUPP; + if (rt_bdev) + granularity = max(granularity, + bdev_discard_granularity(rt_bdev)); + /* * We haven't recovered the log, so we cannot use our bnobt-guided * storage zapping commands. @@ -433,35 +696,27 @@ xfs_ioc_trim( * used by the fstrim application. In the end it really doesn't * matter as trimming blocks is an advisory interface. */ - if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) || + max_blocks = mp->m_sb.sb_dblocks + mp->m_sb.sb_rblocks; + if (range.start >= XFS_FSB_TO_B(mp, max_blocks) || range.minlen > XFS_FSB_TO_B(mp, mp->m_ag_max_usable) || range.len < mp->m_sb.sb_blocksize) return -EINVAL; start = BTOBB(range.start); - end = min_t(xfs_daddr_t, start + BTOBBT(range.len), - XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) - 1; + end = start + BTOBBT(range.len) - 1; - start_agno = xfs_daddr_to_agno(mp, start); - start_agbno = xfs_daddr_to_agbno(mp, start); - end_agno = xfs_daddr_to_agno(mp, end); - end_agbno = xfs_daddr_to_agbno(mp, end); - - for_each_perag_range(mp, start_agno, end_agno, pag) { - xfs_agblock_t agend = pag->block_count; - - if (start_agno == end_agno) - agend = end_agbno; - error = xfs_trim_extents(pag, start_agbno, agend, minlen, + if (bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev)) { + error = xfs_trim_datadev_extents(mp, start, end, minlen, &blocks_trimmed); if (error) last_error = error; + } - if (xfs_trim_should_stop()) { - xfs_perag_rele(pag); - break; - } - start_agbno = 0; + if (rt_bdev && !xfs_trim_should_stop()) { + error = xfs_trim_rtdev_extents(mp, start, end, minlen, + &blocks_trimmed); + if (error) + last_error = error; } if (last_error) diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 25ff6fe1eb6c..ba839ce6a9cf 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -2460,6 +2460,35 @@ DEFINE_DISCARD_EVENT(xfs_discard_toosmall); DEFINE_DISCARD_EVENT(xfs_discard_exclude); DEFINE_DISCARD_EVENT(xfs_discard_busy); +DECLARE_EVENT_CLASS(xfs_rtdiscard_class, + TP_PROTO(struct xfs_mount *mp, + xfs_rtblock_t rtbno, xfs_rtblock_t len), + TP_ARGS(mp, rtbno, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_rtblock_t, rtbno) + __field(xfs_rtblock_t, len) + ), + TP_fast_assign( + __entry->dev = mp->m_rtdev_targp->bt_dev; + __entry->rtbno = rtbno; + __entry->len = len; + ), + TP_printk("dev %d:%d rtbno 0x%llx rtbcount 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rtbno, + __entry->len) +) + +#define DEFINE_RTDISCARD_EVENT(name) \ +DEFINE_EVENT(xfs_rtdiscard_class, name, \ + TP_PROTO(struct xfs_mount *mp, \ + xfs_rtblock_t rtbno, xfs_rtblock_t len), \ + TP_ARGS(mp, rtbno, len)) +DEFINE_RTDISCARD_EVENT(xfs_discard_rtextent); +DEFINE_RTDISCARD_EVENT(xfs_discard_rttoosmall); +DEFINE_RTDISCARD_EVENT(xfs_discard_rtrelax); + DECLARE_EVENT_CLASS(xfs_btree_cur_class, TP_PROTO(struct xfs_btree_cur *cur, int level, struct xfs_buf *bp), TP_ARGS(cur, level, bp), -- cgit v1.2.3-70-g09d2 From 150bb10a28b9c8709ae227fc898d9cf6136faa1e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Jul 2024 11:23:23 -0700 Subject: xfs: verify buffer, inode, and dquot items every tx commit generic/388 has an annoying tendency to fail like this during log recovery: XFS (sda4): Unmounting Filesystem 435fe39b-82b6-46ef-be56-819499585130 XFS (sda4): Mounting V5 Filesystem 435fe39b-82b6-46ef-be56-819499585130 XFS (sda4): Starting recovery (logdev: internal) 00000000: 49 4e 81 b6 03 02 00 00 00 00 00 07 00 00 00 07 IN.............. 00000010: 00 00 00 01 00 00 00 00 00 00 00 00 00 00 00 10 ................ 00000020: 35 9a 8b c1 3e 6e 81 00 35 9a 8b c1 3f dc b7 00 5...>n..5...?... 00000030: 35 9a 8b c1 3f dc b7 00 00 00 00 00 00 3c 86 4f 5...?........<.O 00000040: 00 00 00 00 00 00 02 f3 00 00 00 00 00 00 00 00 ................ 00000050: 00 00 1f 01 00 00 00 00 00 00 00 02 b2 74 c9 0b .............t.. 00000060: ff ff ff ff d7 45 73 10 00 00 00 00 00 00 00 2d .....Es........- 00000070: 00 00 07 92 00 01 fe 30 00 00 00 00 00 00 00 1a .......0........ 00000080: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00000090: 35 9a 8b c1 3b 55 0c 00 00 00 00 00 04 27 b2 d1 5...;U.......'.. 000000a0: 43 5f e3 9b 82 b6 46 ef be 56 81 94 99 58 51 30 C_....F..V...XQ0 XFS (sda4): Internal error Bad dinode after recovery at line 539 of file fs/xfs/xfs_inode_item_recover.c. Caller xlog_recover_items_pass2+0x4e/0xc0 [xfs] CPU: 0 PID: 2189311 Comm: mount Not tainted 6.9.0-rc4-djwx #rc4 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20171121_152543-x86-ol7-builder-01.us.oracle.com-4.el7.1 04/01/2014 Call Trace: dump_stack_lvl+0x4f/0x60 xfs_corruption_error+0x90/0xa0 xlog_recover_inode_commit_pass2+0x5f1/0xb00 xlog_recover_items_pass2+0x4e/0xc0 xlog_recover_commit_trans+0x2db/0x350 xlog_recovery_process_trans+0xab/0xe0 xlog_recover_process_data+0xa7/0x130 xlog_do_recovery_pass+0x398/0x840 xlog_do_log_recovery+0x62/0xc0 xlog_do_recover+0x34/0x1d0 xlog_recover+0xe9/0x1a0 xfs_log_mount+0xff/0x260 xfs_mountfs+0x5d9/0xb60 xfs_fs_fill_super+0x76b/0xa30 get_tree_bdev+0x124/0x1d0 vfs_get_tree+0x17/0xa0 path_mount+0x72b/0xa90 __x64_sys_mount+0x112/0x150 do_syscall_64+0x49/0x100 entry_SYSCALL_64_after_hwframe+0x4b/0x53 XFS (sda4): Corruption detected. Unmount and run xfs_repair XFS (sda4): Metadata corruption detected at xfs_dinode_verify.part.0+0x739/0x920 [xfs], inode 0x427b2d1 XFS (sda4): Filesystem has been shut down due to log error (0x2). XFS (sda4): Please unmount the filesystem and rectify the problem(s). XFS (sda4): log mount/recovery failed: error -117 XFS (sda4): log mount failed This inode log item recovery failing the dinode verifier after replaying the contents of the inode log item into the ondisk inode. Looking back into what the kernel was doing at the time of the fs shutdown, a thread was in the middle of running a series of transactions, each of which committed changes to the inode. At some point in the middle of that chain, an invalid (at least according to the verifier) change was committed. Had the filesystem not shut down in the middle of the chain, a subsequent transaction would have corrected the invalid state and nobody would have noticed. But that's not what happened here. Instead, the invalid inode state was committed to the ondisk log, so log recovery tripped over it. The actual defect here was an overzealous inode verifier, which was fixed in a separate patch. This patch adds some transaction precommit functions for CONFIG_XFS_DEBUG=y mode so that we can detect these kinds of transient errors at transaction commit time, where it's much easier to find the root cause. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/Kconfig | 12 ++++++++++++ fs/xfs/xfs.h | 4 ++++ fs/xfs/xfs_buf_item.c | 32 ++++++++++++++++++++++++++++++++ fs/xfs/xfs_dquot_item.c | 31 +++++++++++++++++++++++++++++++ fs/xfs/xfs_inode_item.c | 32 ++++++++++++++++++++++++++++++++ 5 files changed, 111 insertions(+) (limited to 'fs') diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index d41edd30388b..fffd6fffdce0 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -217,6 +217,18 @@ config XFS_DEBUG Say N unless you are an XFS developer, or you play one on TV. +config XFS_DEBUG_EXPENSIVE + bool "XFS expensive debugging checks" + depends on XFS_FS && XFS_DEBUG + help + Say Y here to get an XFS build with expensive debugging checks + enabled. These checks may affect performance significantly. + + Note that the resulting code will be HUGER and SLOWER, and probably + not useful unless you are debugging a particular problem. + + Say N unless you are an XFS developer, or you play one on TV. + config XFS_ASSERT_FATAL bool "XFS fatal asserts" default y diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h index f6ffb4f248f7..9355ccad9503 100644 --- a/fs/xfs/xfs.h +++ b/fs/xfs/xfs.h @@ -10,6 +10,10 @@ #define DEBUG 1 #endif +#ifdef CONFIG_XFS_DEBUG_EXPENSIVE +#define DEBUG_EXPENSIVE 1 +#endif + #ifdef CONFIG_XFS_ASSERT_FATAL #define XFS_ASSERT_FATAL 1 #endif diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 43031842341a..47549cfa61cd 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -22,6 +22,7 @@ #include "xfs_trace.h" #include "xfs_log.h" #include "xfs_log_priv.h" +#include "xfs_error.h" struct kmem_cache *xfs_buf_item_cache; @@ -781,8 +782,39 @@ xfs_buf_item_committed( return lsn; } +#ifdef DEBUG_EXPENSIVE +static int +xfs_buf_item_precommit( + struct xfs_trans *tp, + struct xfs_log_item *lip) +{ + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + struct xfs_buf *bp = bip->bli_buf; + struct xfs_mount *mp = bp->b_mount; + xfs_failaddr_t fa; + + if (!bp->b_ops || !bp->b_ops->verify_struct) + return 0; + if (bip->bli_flags & XFS_BLI_STALE) + return 0; + + fa = bp->b_ops->verify_struct(bp); + if (fa) { + xfs_buf_verifier_error(bp, -EFSCORRUPTED, bp->b_ops->name, + bp->b_addr, BBTOB(bp->b_length), fa); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + ASSERT(fa == NULL); + } + + return 0; +} +#else +# define xfs_buf_item_precommit NULL +#endif + static const struct xfs_item_ops xfs_buf_item_ops = { .iop_size = xfs_buf_item_size, + .iop_precommit = xfs_buf_item_precommit, .iop_format = xfs_buf_item_format, .iop_pin = xfs_buf_item_pin, .iop_unpin = xfs_buf_item_unpin, diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 6a1aae799cf1..7d19091215b0 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -17,6 +17,7 @@ #include "xfs_trans_priv.h" #include "xfs_qm.h" #include "xfs_log.h" +#include "xfs_error.h" static inline struct xfs_dq_logitem *DQUOT_ITEM(struct xfs_log_item *lip) { @@ -193,8 +194,38 @@ xfs_qm_dquot_logitem_committing( return xfs_qm_dquot_logitem_release(lip); } +#ifdef DEBUG_EXPENSIVE +static int +xfs_qm_dquot_logitem_precommit( + struct xfs_trans *tp, + struct xfs_log_item *lip) +{ + struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; + struct xfs_mount *mp = dqp->q_mount; + struct xfs_disk_dquot ddq = { }; + xfs_failaddr_t fa; + + xfs_dquot_to_disk(&ddq, dqp); + fa = xfs_dquot_verify(mp, &ddq, dqp->q_id); + if (fa) { + XFS_CORRUPTION_ERROR("Bad dquot during logging", + XFS_ERRLEVEL_LOW, mp, &ddq, sizeof(ddq)); + xfs_alert(mp, + "Metadata corruption detected at %pS, dquot 0x%x", + fa, dqp->q_id); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + ASSERT(fa == NULL); + } + + return 0; +} +#else +# define xfs_qm_dquot_logitem_precommit NULL +#endif + static const struct xfs_item_ops xfs_dquot_item_ops = { .iop_size = xfs_qm_dquot_logitem_size, + .iop_precommit = xfs_qm_dquot_logitem_precommit, .iop_format = xfs_qm_dquot_logitem_format, .iop_pin = xfs_qm_dquot_logitem_pin, .iop_unpin = xfs_qm_dquot_logitem_unpin, diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index f28d653300d1..ef05cbbe116c 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -37,6 +37,36 @@ xfs_inode_item_sort( return INODE_ITEM(lip)->ili_inode->i_ino; } +#ifdef DEBUG_EXPENSIVE +static void +xfs_inode_item_precommit_check( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_dinode *dip; + xfs_failaddr_t fa; + + dip = kzalloc(mp->m_sb.sb_inodesize, GFP_KERNEL | GFP_NOFS); + if (!dip) { + ASSERT(dip != NULL); + return; + } + + xfs_inode_to_disk(ip, dip, 0); + xfs_dinode_calc_crc(mp, dip); + fa = xfs_dinode_verify(mp, ip->i_ino, dip); + if (fa) { + xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip, + sizeof(*dip), fa); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + ASSERT(fa == NULL); + } + kfree(dip); +} +#else +# define xfs_inode_item_precommit_check(ip) ((void)0) +#endif + /* * Prior to finally logging the inode, we have to ensure that all the * per-modification inode state changes are applied. This includes VFS inode @@ -169,6 +199,8 @@ xfs_inode_item_precommit( iip->ili_fields |= (flags | iip->ili_last_fields); spin_unlock(&iip->ili_lock); + xfs_inode_item_precommit_check(ip); + /* * We are done with the log item transaction dirty state, so clear it so * that it doesn't pollute future transactions. -- cgit v1.2.3-70-g09d2 From 24a4e1cb322e2bf0f3a1afd1978b610a23aa8f36 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Jul 2024 11:22:30 -0700 Subject: xfs: use consistent uid/gid when grabbing dquots for inodes I noticed that callers of xfs_qm_vop_dqalloc use the following code to compute the anticipated uid of the new file: mapped_fsuid(idmap, &init_user_ns); whereas the VFS uses a slightly different computation for actually assigning i_uid: mapped_fsuid(idmap, i_user_ns(inode)); Technically, these are not the same things. According to Christian Brauner, the only time that inode->i_sb->s_user_ns != &init_user_ns is when the filesystem was mounted in a new mount namespace by an unpriviledged user. XFS does not allow this, which is why we've never seen bug reports about quotas being incorrect or the uid checks in xfs_qm_vop_create_dqattach tripping debug assertions. However, this /is/ a logic bomb, so let's make the code consistent. Link: https://lore.kernel.org/linux-fsdevel/20240617-weitblick-gefertigt-4a41f37119fa@brauner/ Fixes: c14329d39f2d ("fs: port fs{g,u}id helpers to mnt_idmap") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_inode.c | 16 ++++++++++------ fs/xfs/xfs_symlink.c | 8 +++++--- 2 files changed, 15 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index a4e3cd8971fc..fd1a59af6cbb 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1059,10 +1059,12 @@ xfs_create( prid = xfs_get_initial_prid(dp); /* - * Make sure that we have allocated dquot(s) on disk. + * Make sure that we have allocated dquot(s) on disk. The uid/gid + * computation code must match what the VFS uses to assign i_[ug]id. + * INHERIT adjusts the gid computation for setgid/grpid systems. */ - error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, &init_user_ns), - mapped_fsgid(idmap, &init_user_ns), prid, + error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, i_user_ns(VFS_I(dp))), + mapped_fsgid(idmap, i_user_ns(VFS_I(dp))), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) @@ -1234,10 +1236,12 @@ xfs_create_tmpfile( prid = xfs_get_initial_prid(dp); /* - * Make sure that we have allocated dquot(s) on disk. + * Make sure that we have allocated dquot(s) on disk. The uid/gid + * computation code must match what the VFS uses to assign i_[ug]id. + * INHERIT adjusts the gid computation for setgid/grpid systems. */ - error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, &init_user_ns), - mapped_fsgid(idmap, &init_user_ns), prid, + error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, i_user_ns(VFS_I(dp))), + mapped_fsgid(idmap, i_user_ns(VFS_I(dp))), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 17aee806ec2e..53ed512c6f21 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -122,10 +122,12 @@ xfs_symlink( prid = xfs_get_initial_prid(dp); /* - * Make sure that we have allocated dquot(s) on disk. + * Make sure that we have allocated dquot(s) on disk. The uid/gid + * computation code must match what the VFS uses to assign i_[ug]id. + * INHERIT adjusts the gid computation for setgid/grpid systems. */ - error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, &init_user_ns), - mapped_fsgid(idmap, &init_user_ns), prid, + error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, i_user_ns(VFS_I(dp))), + mapped_fsgid(idmap, i_user_ns(VFS_I(dp))), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) -- cgit v1.2.3-70-g09d2 From d76e137057ae84e0ca1aac54a1f1ae7c0596c1cd Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Jul 2024 11:22:31 -0700 Subject: xfs: move inode copy-on-write predicates to xfs_inode.[ch] Move these inode predicate functions to xfs_inode.[ch] since they're not reflink functions. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_inode.c | 8 ++++++++ fs/xfs/xfs_inode.h | 7 +++++++ fs/xfs/xfs_reflink.h | 10 ---------- 3 files changed, 15 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index fd1a59af6cbb..dd2b80d5d344 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -4297,3 +4297,11 @@ xfs_inode_alloc_unitsize( return XFS_FSB_TO_B(ip->i_mount, blocks); } + +/* Should we always be using copy on write for file writes? */ +bool +xfs_is_always_cow_inode( + struct xfs_inode *ip) +{ + return ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount); +} diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 292b90b5f2ac..e97b2b838c69 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -292,6 +292,13 @@ static inline bool xfs_is_metadata_inode(struct xfs_inode *ip) xfs_is_quota_inode(&mp->m_sb, ip->i_ino); } +bool xfs_is_always_cow_inode(struct xfs_inode *ip); + +static inline bool xfs_is_cow_inode(struct xfs_inode *ip) +{ + return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip); +} + /* * Check if an inode has any data in the COW fork. This might be often false * even for inodes with the reflink flag when there is no pending COW operation. diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index 65c5dfe17ecf..fb55e4ce49fa 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -6,16 +6,6 @@ #ifndef __XFS_REFLINK_H #define __XFS_REFLINK_H 1 -static inline bool xfs_is_always_cow_inode(struct xfs_inode *ip) -{ - return ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount); -} - -static inline bool xfs_is_cow_inode(struct xfs_inode *ip) -{ - return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip); -} - extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip, struct xfs_bmbt_irec *irec, bool *shared); int xfs_bmap_trim_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, -- cgit v1.2.3-70-g09d2 From acdddbe168040372a8b6b9b5876b92b715322910 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Jul 2024 11:22:31 -0700 Subject: xfs: hoist extent size helpers to libxfs Move the extent size helpers to xfs_bmap.c in libxfs since they're used there already. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_bmap.c | 42 ++++++++++++++++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_bmap.h | 3 +++ fs/xfs/xfs_inode.c | 44 -------------------------------------------- fs/xfs/xfs_inode.h | 3 --- fs/xfs/xfs_iops.c | 1 + 5 files changed, 46 insertions(+), 47 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 6af6f744fdd6..f889123126d2 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -6454,3 +6454,45 @@ xfs_bmap_query_all( return xfs_btree_query_all(cur, xfs_bmap_query_range_helper, &query); } + +/* Helper function to extract extent size hint from inode */ +xfs_extlen_t +xfs_get_extsz_hint( + struct xfs_inode *ip) +{ + /* + * No point in aligning allocations if we need to COW to actually + * write to them. + */ + if (xfs_is_always_cow_inode(ip)) + return 0; + if ((ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize) + return ip->i_extsize; + if (XFS_IS_REALTIME_INODE(ip) && + ip->i_mount->m_sb.sb_rextsize > 1) + return ip->i_mount->m_sb.sb_rextsize; + return 0; +} + +/* + * Helper function to extract CoW extent size hint from inode. + * Between the extent size hint and the CoW extent size hint, we + * return the greater of the two. If the value is zero (automatic), + * use the default size. + */ +xfs_extlen_t +xfs_get_cowextsz_hint( + struct xfs_inode *ip) +{ + xfs_extlen_t a, b; + + a = 0; + if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) + a = ip->i_cowextsize; + b = xfs_get_extsz_hint(ip); + + a = max(a, b); + if (a == 0) + return XFS_DEFAULT_COWEXTSZ_HINT; + return a; +} diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 667b0c2b33d1..7592d46e97c6 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -296,4 +296,7 @@ typedef int (*xfs_bmap_query_range_fn)( int xfs_bmap_query_all(struct xfs_btree_cur *cur, xfs_bmap_query_range_fn fn, void *priv); +xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); +xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip); + #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index dd2b80d5d344..d16e727aa62a 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -46,50 +46,6 @@ struct kmem_cache *xfs_inode_cache; -/* - * helper function to extract extent size hint from inode - */ -xfs_extlen_t -xfs_get_extsz_hint( - struct xfs_inode *ip) -{ - /* - * No point in aligning allocations if we need to COW to actually - * write to them. - */ - if (xfs_is_always_cow_inode(ip)) - return 0; - if ((ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize) - return ip->i_extsize; - if (XFS_IS_REALTIME_INODE(ip) && - ip->i_mount->m_sb.sb_rextsize > 1) - return ip->i_mount->m_sb.sb_rextsize; - return 0; -} - -/* - * Helper function to extract CoW extent size hint from inode. - * Between the extent size hint and the CoW extent size hint, we - * return the greater of the two. If the value is zero (automatic), - * use the default size. - */ -xfs_extlen_t -xfs_get_cowextsz_hint( - struct xfs_inode *ip) -{ - xfs_extlen_t a, b; - - a = 0; - if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) - a = ip->i_cowextsize; - b = xfs_get_extsz_hint(ip); - - a = max(a, b); - if (a == 0) - return XFS_DEFAULT_COWEXTSZ_HINT; - return a; -} - /* * These two are wrapper routines around the xfs_ilock() routine used to * centralize some grungy code. They are used in places that wish to lock the diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index e97b2b838c69..0e642afa77a7 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -563,9 +563,6 @@ int xfs_iflush_cluster(struct xfs_buf *); void xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode, struct xfs_inode *ip1, uint ip1_mode); -xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); -xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip); - int xfs_init_new_inode(struct mnt_idmap *idmap, struct xfs_trans *tp, struct xfs_inode *pip, xfs_ino_t ino, umode_t mode, xfs_nlink_t nlink, dev_t rdev, prid_t prid, bool init_xattrs, diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index ff222827e550..35a84790d26e 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -26,6 +26,7 @@ #include "xfs_ioctl.h" #include "xfs_xattr.h" #include "xfs_file.h" +#include "xfs_bmap.h" #include #include -- cgit v1.2.3-70-g09d2 From b7c477be396948ce88ea591b91070fa68ac12437 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Jul 2024 11:22:32 -0700 Subject: xfs: hoist inode flag conversion functions to libxfs Hoist the inode flag conversion functions into libxfs so that we can keep them in sync. Do this by creating a new xfs_inode_util.c file in libxfs. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/Makefile | 1 + fs/xfs/libxfs/xfs_bmap.c | 1 + fs/xfs/libxfs/xfs_inode_util.c | 124 +++++++++++++++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_inode_util.h | 14 +++++ fs/xfs/xfs_inode.c | 49 ---------------- fs/xfs/xfs_inode.h | 2 +- fs/xfs/xfs_ioctl.c | 60 -------------------- 7 files changed, 141 insertions(+), 110 deletions(-) create mode 100644 fs/xfs/libxfs/xfs_inode_util.c create mode 100644 fs/xfs/libxfs/xfs_inode_util.h (limited to 'fs') diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index c50447548d65..dd692619bed5 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -40,6 +40,7 @@ xfs-y += $(addprefix libxfs/, \ xfs_iext_tree.o \ xfs_inode_fork.o \ xfs_inode_buf.o \ + xfs_inode_util.o \ xfs_log_rlimit.o \ xfs_ag_resv.o \ xfs_parent.o \ diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index f889123126d2..09e3302a4b72 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -39,6 +39,7 @@ #include "xfs_health.h" #include "xfs_bmap_item.h" #include "xfs_symlink_remote.h" +#include "xfs_inode_util.h" struct kmem_cache *xfs_bmap_intent_cache; diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c new file mode 100644 index 000000000000..ed5e1a9b4b8c --- /dev/null +++ b/fs/xfs/libxfs/xfs_inode_util.c @@ -0,0 +1,124 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_inode_util.h" + +uint16_t +xfs_flags2diflags( + struct xfs_inode *ip, + unsigned int xflags) +{ + /* can't set PREALLOC this way, just preserve it */ + uint16_t di_flags = + (ip->i_diflags & XFS_DIFLAG_PREALLOC); + + if (xflags & FS_XFLAG_IMMUTABLE) + di_flags |= XFS_DIFLAG_IMMUTABLE; + if (xflags & FS_XFLAG_APPEND) + di_flags |= XFS_DIFLAG_APPEND; + if (xflags & FS_XFLAG_SYNC) + di_flags |= XFS_DIFLAG_SYNC; + if (xflags & FS_XFLAG_NOATIME) + di_flags |= XFS_DIFLAG_NOATIME; + if (xflags & FS_XFLAG_NODUMP) + di_flags |= XFS_DIFLAG_NODUMP; + if (xflags & FS_XFLAG_NODEFRAG) + di_flags |= XFS_DIFLAG_NODEFRAG; + if (xflags & FS_XFLAG_FILESTREAM) + di_flags |= XFS_DIFLAG_FILESTREAM; + if (S_ISDIR(VFS_I(ip)->i_mode)) { + if (xflags & FS_XFLAG_RTINHERIT) + di_flags |= XFS_DIFLAG_RTINHERIT; + if (xflags & FS_XFLAG_NOSYMLINKS) + di_flags |= XFS_DIFLAG_NOSYMLINKS; + if (xflags & FS_XFLAG_EXTSZINHERIT) + di_flags |= XFS_DIFLAG_EXTSZINHERIT; + if (xflags & FS_XFLAG_PROJINHERIT) + di_flags |= XFS_DIFLAG_PROJINHERIT; + } else if (S_ISREG(VFS_I(ip)->i_mode)) { + if (xflags & FS_XFLAG_REALTIME) + di_flags |= XFS_DIFLAG_REALTIME; + if (xflags & FS_XFLAG_EXTSIZE) + di_flags |= XFS_DIFLAG_EXTSIZE; + } + + return di_flags; +} + +uint64_t +xfs_flags2diflags2( + struct xfs_inode *ip, + unsigned int xflags) +{ + uint64_t di_flags2 = + (ip->i_diflags2 & (XFS_DIFLAG2_REFLINK | + XFS_DIFLAG2_BIGTIME | + XFS_DIFLAG2_NREXT64)); + + if (xflags & FS_XFLAG_DAX) + di_flags2 |= XFS_DIFLAG2_DAX; + if (xflags & FS_XFLAG_COWEXTSIZE) + di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; + + return di_flags2; +} + +uint32_t +xfs_ip2xflags( + struct xfs_inode *ip) +{ + uint32_t flags = 0; + + if (ip->i_diflags & XFS_DIFLAG_ANY) { + if (ip->i_diflags & XFS_DIFLAG_REALTIME) + flags |= FS_XFLAG_REALTIME; + if (ip->i_diflags & XFS_DIFLAG_PREALLOC) + flags |= FS_XFLAG_PREALLOC; + if (ip->i_diflags & XFS_DIFLAG_IMMUTABLE) + flags |= FS_XFLAG_IMMUTABLE; + if (ip->i_diflags & XFS_DIFLAG_APPEND) + flags |= FS_XFLAG_APPEND; + if (ip->i_diflags & XFS_DIFLAG_SYNC) + flags |= FS_XFLAG_SYNC; + if (ip->i_diflags & XFS_DIFLAG_NOATIME) + flags |= FS_XFLAG_NOATIME; + if (ip->i_diflags & XFS_DIFLAG_NODUMP) + flags |= FS_XFLAG_NODUMP; + if (ip->i_diflags & XFS_DIFLAG_RTINHERIT) + flags |= FS_XFLAG_RTINHERIT; + if (ip->i_diflags & XFS_DIFLAG_PROJINHERIT) + flags |= FS_XFLAG_PROJINHERIT; + if (ip->i_diflags & XFS_DIFLAG_NOSYMLINKS) + flags |= FS_XFLAG_NOSYMLINKS; + if (ip->i_diflags & XFS_DIFLAG_EXTSIZE) + flags |= FS_XFLAG_EXTSIZE; + if (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) + flags |= FS_XFLAG_EXTSZINHERIT; + if (ip->i_diflags & XFS_DIFLAG_NODEFRAG) + flags |= FS_XFLAG_NODEFRAG; + if (ip->i_diflags & XFS_DIFLAG_FILESTREAM) + flags |= FS_XFLAG_FILESTREAM; + } + + if (ip->i_diflags2 & XFS_DIFLAG2_ANY) { + if (ip->i_diflags2 & XFS_DIFLAG2_DAX) + flags |= FS_XFLAG_DAX; + if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) + flags |= FS_XFLAG_COWEXTSIZE; + } + + if (xfs_inode_has_attr_fork(ip)) + flags |= FS_XFLAG_HASATTR; + return flags; +} diff --git a/fs/xfs/libxfs/xfs_inode_util.h b/fs/xfs/libxfs/xfs_inode_util.h new file mode 100644 index 000000000000..6ad1898a0f73 --- /dev/null +++ b/fs/xfs/libxfs/xfs_inode_util.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + */ +#ifndef __XFS_INODE_UTIL_H__ +#define __XFS_INODE_UTIL_H__ + +uint16_t xfs_flags2diflags(struct xfs_inode *ip, unsigned int xflags); +uint64_t xfs_flags2diflags2(struct xfs_inode *ip, unsigned int xflags); +uint32_t xfs_dic2xflags(struct xfs_inode *ip); +uint32_t xfs_ip2xflags(struct xfs_inode *ip); + +#endif /* __XFS_INODE_UTIL_H__ */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index d16e727aa62a..afc798ffa164 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -523,55 +523,6 @@ xfs_lock_two_inodes( } } -uint -xfs_ip2xflags( - struct xfs_inode *ip) -{ - uint flags = 0; - - if (ip->i_diflags & XFS_DIFLAG_ANY) { - if (ip->i_diflags & XFS_DIFLAG_REALTIME) - flags |= FS_XFLAG_REALTIME; - if (ip->i_diflags & XFS_DIFLAG_PREALLOC) - flags |= FS_XFLAG_PREALLOC; - if (ip->i_diflags & XFS_DIFLAG_IMMUTABLE) - flags |= FS_XFLAG_IMMUTABLE; - if (ip->i_diflags & XFS_DIFLAG_APPEND) - flags |= FS_XFLAG_APPEND; - if (ip->i_diflags & XFS_DIFLAG_SYNC) - flags |= FS_XFLAG_SYNC; - if (ip->i_diflags & XFS_DIFLAG_NOATIME) - flags |= FS_XFLAG_NOATIME; - if (ip->i_diflags & XFS_DIFLAG_NODUMP) - flags |= FS_XFLAG_NODUMP; - if (ip->i_diflags & XFS_DIFLAG_RTINHERIT) - flags |= FS_XFLAG_RTINHERIT; - if (ip->i_diflags & XFS_DIFLAG_PROJINHERIT) - flags |= FS_XFLAG_PROJINHERIT; - if (ip->i_diflags & XFS_DIFLAG_NOSYMLINKS) - flags |= FS_XFLAG_NOSYMLINKS; - if (ip->i_diflags & XFS_DIFLAG_EXTSIZE) - flags |= FS_XFLAG_EXTSIZE; - if (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) - flags |= FS_XFLAG_EXTSZINHERIT; - if (ip->i_diflags & XFS_DIFLAG_NODEFRAG) - flags |= FS_XFLAG_NODEFRAG; - if (ip->i_diflags & XFS_DIFLAG_FILESTREAM) - flags |= FS_XFLAG_FILESTREAM; - } - - if (ip->i_diflags2 & XFS_DIFLAG2_ANY) { - if (ip->i_diflags2 & XFS_DIFLAG2_DAX) - flags |= FS_XFLAG_DAX; - if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) - flags |= FS_XFLAG_COWEXTSIZE; - } - - if (xfs_inode_has_attr_fork(ip)) - flags |= FS_XFLAG_HASATTR; - return flags; -} - /* * Lookups up an inode from "name". If ci_name is not NULL, then a CI match * is allowed, otherwise it has to be an exact match. If a CI match is found, diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 0e642afa77a7..b20768962e8d 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -8,6 +8,7 @@ #include "xfs_inode_buf.h" #include "xfs_inode_fork.h" +#include "xfs_inode_util.h" /* * Kernel only inode definitions @@ -549,7 +550,6 @@ void xfs_assert_ilocked(struct xfs_inode *, uint); uint xfs_ilock_data_map_shared(struct xfs_inode *); uint xfs_ilock_attr_map_shared(struct xfs_inode *); -uint xfs_ip2xflags(struct xfs_inode *); int xfs_ifree(struct xfs_trans *, struct xfs_inode *); int xfs_itruncate_extents_flags(struct xfs_trans **, struct xfs_inode *, int, xfs_fsize_t, int); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index f0117188f302..4e933db75b12 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -469,66 +469,6 @@ xfs_fileattr_get( return 0; } -STATIC uint16_t -xfs_flags2diflags( - struct xfs_inode *ip, - unsigned int xflags) -{ - /* can't set PREALLOC this way, just preserve it */ - uint16_t di_flags = - (ip->i_diflags & XFS_DIFLAG_PREALLOC); - - if (xflags & FS_XFLAG_IMMUTABLE) - di_flags |= XFS_DIFLAG_IMMUTABLE; - if (xflags & FS_XFLAG_APPEND) - di_flags |= XFS_DIFLAG_APPEND; - if (xflags & FS_XFLAG_SYNC) - di_flags |= XFS_DIFLAG_SYNC; - if (xflags & FS_XFLAG_NOATIME) - di_flags |= XFS_DIFLAG_NOATIME; - if (xflags & FS_XFLAG_NODUMP) - di_flags |= XFS_DIFLAG_NODUMP; - if (xflags & FS_XFLAG_NODEFRAG) - di_flags |= XFS_DIFLAG_NODEFRAG; - if (xflags & FS_XFLAG_FILESTREAM) - di_flags |= XFS_DIFLAG_FILESTREAM; - if (S_ISDIR(VFS_I(ip)->i_mode)) { - if (xflags & FS_XFLAG_RTINHERIT) - di_flags |= XFS_DIFLAG_RTINHERIT; - if (xflags & FS_XFLAG_NOSYMLINKS) - di_flags |= XFS_DIFLAG_NOSYMLINKS; - if (xflags & FS_XFLAG_EXTSZINHERIT) - di_flags |= XFS_DIFLAG_EXTSZINHERIT; - if (xflags & FS_XFLAG_PROJINHERIT) - di_flags |= XFS_DIFLAG_PROJINHERIT; - } else if (S_ISREG(VFS_I(ip)->i_mode)) { - if (xflags & FS_XFLAG_REALTIME) - di_flags |= XFS_DIFLAG_REALTIME; - if (xflags & FS_XFLAG_EXTSIZE) - di_flags |= XFS_DIFLAG_EXTSIZE; - } - - return di_flags; -} - -STATIC uint64_t -xfs_flags2diflags2( - struct xfs_inode *ip, - unsigned int xflags) -{ - uint64_t di_flags2 = - (ip->i_diflags2 & (XFS_DIFLAG2_REFLINK | - XFS_DIFLAG2_BIGTIME | - XFS_DIFLAG2_NREXT64)); - - if (xflags & FS_XFLAG_DAX) - di_flags2 |= XFS_DIFLAG2_DAX; - if (xflags & FS_XFLAG_COWEXTSIZE) - di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; - - return di_flags2; -} - static int xfs_ioctl_setattr_xflags( struct xfs_trans *tp, -- cgit v1.2.3-70-g09d2 From fcea5b35f36233c04003ab8b3eb081b5e20e1aa4 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Jul 2024 11:22:33 -0700 Subject: xfs: hoist project id get/set functions to libxfs Move the project id get and set functions into libxfs. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_inode_util.c | 10 ++++++++++ fs/xfs/libxfs/xfs_inode_util.h | 2 ++ fs/xfs/xfs_inode.h | 9 --------- fs/xfs/xfs_linux.h | 2 -- 4 files changed, 12 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c index ed5e1a9b4b8c..3b5397a3f34f 100644 --- a/fs/xfs/libxfs/xfs_inode_util.c +++ b/fs/xfs/libxfs/xfs_inode_util.c @@ -122,3 +122,13 @@ xfs_ip2xflags( flags |= FS_XFLAG_HASATTR; return flags; } + +prid_t +xfs_get_initial_prid(struct xfs_inode *dp) +{ + if (dp->i_diflags & XFS_DIFLAG_PROJINHERIT) + return dp->i_projid; + + /* Assign to the root project by default. */ + return 0; +} diff --git a/fs/xfs/libxfs/xfs_inode_util.h b/fs/xfs/libxfs/xfs_inode_util.h index 6ad1898a0f73..f7e4d5a8235d 100644 --- a/fs/xfs/libxfs/xfs_inode_util.h +++ b/fs/xfs/libxfs/xfs_inode_util.h @@ -11,4 +11,6 @@ uint64_t xfs_flags2diflags2(struct xfs_inode *ip, unsigned int xflags); uint32_t xfs_dic2xflags(struct xfs_inode *ip); uint32_t xfs_ip2xflags(struct xfs_inode *ip); +prid_t xfs_get_initial_prid(struct xfs_inode *dp); + #endif /* __XFS_INODE_UTIL_H__ */ diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index b20768962e8d..15ab7a1c79a6 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -271,15 +271,6 @@ xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned long flags) return ret; } -static inline prid_t -xfs_get_initial_prid(struct xfs_inode *dp) -{ - if (dp->i_diflags & XFS_DIFLAG_PROJINHERIT) - return dp->i_projid; - - return XFS_PROJID_DEFAULT; -} - static inline bool xfs_is_reflink_inode(struct xfs_inode *ip) { return ip->i_diflags2 & XFS_DIFLAG2_REFLINK; diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index ac355328121a..54a098fe7285 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -135,8 +135,6 @@ typedef __u32 xfs_nlink_t; */ #define __this_address ({ __label__ __here; __here: barrier(); &&__here; }) -#define XFS_PROJID_DEFAULT 0 - #define howmany(x, y) (((x)+((y)-1))/(y)) static inline void delay(long ticks) -- cgit v1.2.3-70-g09d2 From ba4b39fe4c011078469dcd28f51447d75852d21c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Jul 2024 11:22:34 -0700 Subject: xfs: pack icreate initialization parameters into a separate structure Callers that want to create an inode currently pass all possible file attribute values for the new inode into xfs_init_new_inode as ten separate parameters. This causes two code maintenance issues: first, we have large multi-line call sites which programmers must read carefully to make sure they did not accidentally invert a value. Second, all three file id parameters must be passed separately to the quota functions; any discrepancy results in quota count errors. Clean this up by creating a new icreate_args structure to hold all this information, some helpers to initialize them properly, and make the callers pass this structure through to the creation function, whose name we shorten to xfs_icreate. This eliminates the issues, enables us to keep the inode init code in sync with userspace via libxfs, and is needed for future metadata directory tree management. (A subsequent cleanup will also fix the quota alloc calls.) Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_inode_util.h | 22 ++++++++++ fs/xfs/scrub/tempfile.c | 12 +++--- fs/xfs/xfs_inode.c | 91 +++++++++++++++++++++++++++--------------- fs/xfs/xfs_inode.h | 6 +-- fs/xfs/xfs_qm.c | 6 ++- fs/xfs/xfs_symlink.c | 12 ++++-- 6 files changed, 102 insertions(+), 47 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_inode_util.h b/fs/xfs/libxfs/xfs_inode_util.h index f7e4d5a8235d..9226482fdee8 100644 --- a/fs/xfs/libxfs/xfs_inode_util.h +++ b/fs/xfs/libxfs/xfs_inode_util.h @@ -13,4 +13,26 @@ uint32_t xfs_ip2xflags(struct xfs_inode *ip); prid_t xfs_get_initial_prid(struct xfs_inode *dp); +/* + * File creation context. + * + * Due to our only partial reliance on the VFS to propagate uid and gid values + * according to accepted Unix behaviors, callers must initialize idmap to the + * correct idmapping structure to get the correct inheritance behaviors when + * XFS_MOUNT_GRPID is set. + * + * To create files detached from the directory tree (e.g. quota inodes), set + * idmap to NULL. To create a tree root, set pip to NULL. + */ +struct xfs_icreate_args { + struct mnt_idmap *idmap; + struct xfs_inode *pip; /* parent inode or null */ + dev_t rdev; + umode_t mode; + +#define XFS_ICREATE_TMPFILE (1U << 0) /* create an unlinked file */ +#define XFS_ICREATE_INIT_XATTRS (1U << 1) /* will set xattrs immediately */ + uint16_t flags; +}; + #endif /* __XFS_INODE_UTIL_H__ */ diff --git a/fs/xfs/scrub/tempfile.c b/fs/xfs/scrub/tempfile.c index b747b625c5ee..ee6f93e9f7cb 100644 --- a/fs/xfs/scrub/tempfile.c +++ b/fs/xfs/scrub/tempfile.c @@ -40,6 +40,11 @@ xrep_tempfile_create( struct xfs_scrub *sc, uint16_t mode) { + struct xfs_icreate_args args = { + .pip = sc->mp->m_rootip, + .mode = mode, + .flags = XFS_ICREATE_TMPFILE, + }; struct xfs_mount *mp = sc->mp; struct xfs_trans *tp = NULL; struct xfs_dquot *udqp = NULL; @@ -87,14 +92,11 @@ xrep_tempfile_create( error = xfs_dialloc(&tp, dp->i_ino, mode, &ino); if (error) goto out_trans_cancel; - error = xfs_init_new_inode(&nop_mnt_idmap, tp, dp, ino, mode, 0, 0, - 0, false, &sc->tempip); + error = xfs_icreate(tp, ino, &args, &sc->tempip); if (error) goto out_trans_cancel; - /* Change the ownership of the inode to root. */ - VFS_I(sc->tempip)->i_uid = GLOBAL_ROOT_UID; - VFS_I(sc->tempip)->i_gid = GLOBAL_ROOT_GID; + /* We don't touch file data, so drop the realtime flags. */ sc->tempip->i_diflags &= ~(XFS_DIFLAG_REALTIME | XFS_DIFLAG_RTINHERIT); xfs_trans_log_inode(tp, sc->tempip, XFS_ILOG_CORE); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index afc798ffa164..4f13ad0fa14e 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -662,18 +662,13 @@ xfs_inode_inherit_flags2( * Caller is responsible for unlocking the inode manually upon return */ int -xfs_init_new_inode( - struct mnt_idmap *idmap, +xfs_icreate( struct xfs_trans *tp, - struct xfs_inode *pip, xfs_ino_t ino, - umode_t mode, - xfs_nlink_t nlink, - dev_t rdev, - prid_t prid, - bool init_xattrs, + const struct xfs_icreate_args *args, struct xfs_inode **ipp) { + struct xfs_inode *pip = args->pip; struct inode *dir = pip ? VFS_I(pip) : NULL; struct xfs_mount *mp = tp->t_mountp; struct xfs_inode *ip; @@ -706,26 +701,43 @@ xfs_init_new_inode( ASSERT(ip != NULL); inode = VFS_I(ip); - set_nlink(inode, nlink); - inode->i_rdev = rdev; - ip->i_projid = prid; - - if (dir && !(dir->i_mode & S_ISGID) && xfs_has_grpid(mp)) { - inode_fsuid_set(inode, idmap); - inode->i_gid = dir->i_gid; - inode->i_mode = mode; + + if (args->flags & XFS_ICREATE_TMPFILE) + set_nlink(inode, 0); + else if (S_ISDIR(args->mode)) + set_nlink(inode, 2); + else + set_nlink(inode, 1); + inode->i_rdev = args->rdev; + + if (!args->idmap || pip == NULL) { + /* creating a tree root, sb rooted, or detached file */ + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + ip->i_projid = 0; + inode->i_mode = args->mode; } else { - inode_init_owner(idmap, inode, dir, mode); - } + /* creating a child in the directory tree */ + if (dir && !(dir->i_mode & S_ISGID) && xfs_has_grpid(mp)) { + inode_fsuid_set(inode, args->idmap); + inode->i_gid = dir->i_gid; + inode->i_mode = args->mode; + } else { + inode_init_owner(args->idmap, inode, dir, args->mode); + } - /* - * If the group ID of the new file does not match the effective group - * ID or one of the supplementary group IDs, the S_ISGID bit is cleared - * (and only if the irix_sgid_inherit compatibility variable is set). - */ - if (irix_sgid_inherit && (inode->i_mode & S_ISGID) && - !vfsgid_in_group_p(i_gid_into_vfsgid(idmap, inode))) - inode->i_mode &= ~S_ISGID; + /* + * If the group ID of the new file does not match the effective + * group ID or one of the supplementary group IDs, the S_ISGID + * bit is cleared (and only if the irix_sgid_inherit + * compatibility variable is set). + */ + if (irix_sgid_inherit && (inode->i_mode & S_ISGID) && + !vfsgid_in_group_p(i_gid_into_vfsgid(args->idmap, inode))) + inode->i_mode &= ~S_ISGID; + + ip->i_projid = pip ? xfs_get_initial_prid(pip) : 0; + } ip->i_disk_size = 0; ip->i_df.if_nextents = 0; @@ -745,7 +757,7 @@ xfs_init_new_inode( } flags = XFS_ILOG_CORE; - switch (mode & S_IFMT) { + switch (args->mode & S_IFMT) { case S_IFIFO: case S_IFCHR: case S_IFBLK: @@ -778,7 +790,7 @@ xfs_init_new_inode( * this saves us from needing to run a separate transaction to set the * fork offset in the immediate future. */ - if (init_xattrs) { + if (args->flags & XFS_ICREATE_INIT_XATTRS) { ip->i_forkoff = xfs_default_attroffset(ip) >> 3; xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0); @@ -941,6 +953,13 @@ xfs_create( bool init_xattrs, xfs_inode_t **ipp) { + struct xfs_icreate_args args = { + .idmap = idmap, + .pip = dp, + .rdev = rdev, + .mode = mode, + .flags = init_xattrs ? XFS_ICREATE_INIT_XATTRS : 0, + }; int is_dir = S_ISDIR(mode); struct xfs_mount *mp = dp->i_mount; struct xfs_inode *ip = NULL; @@ -1016,8 +1035,7 @@ xfs_create( */ error = xfs_dialloc(&tp, dp->i_ino, mode, &ino); if (!error) - error = xfs_init_new_inode(idmap, tp, dp, ino, mode, - is_dir ? 2 : 1, rdev, prid, init_xattrs, &ip); + error = xfs_icreate(tp, ino, &args, &ip); if (error) goto out_trans_cancel; @@ -1125,11 +1143,17 @@ xfs_create_tmpfile( bool init_xattrs, struct xfs_inode **ipp) { + struct xfs_icreate_args args = { + .idmap = idmap, + .pip = dp, + .mode = mode, + .flags = XFS_ICREATE_TMPFILE, + }; struct xfs_mount *mp = dp->i_mount; struct xfs_inode *ip = NULL; struct xfs_trans *tp = NULL; int error; - prid_t prid; + prid_t prid; struct xfs_dquot *udqp = NULL; struct xfs_dquot *gdqp = NULL; struct xfs_dquot *pdqp = NULL; @@ -1141,6 +1165,8 @@ xfs_create_tmpfile( return -EIO; prid = xfs_get_initial_prid(dp); + if (init_xattrs) + args.flags |= XFS_ICREATE_INIT_XATTRS; /* * Make sure that we have allocated dquot(s) on disk. The uid/gid @@ -1164,8 +1190,7 @@ xfs_create_tmpfile( error = xfs_dialloc(&tp, dp->i_ino, mode, &ino); if (!error) - error = xfs_init_new_inode(idmap, tp, dp, ino, mode, - 0, 0, prid, init_xattrs, &ip); + error = xfs_icreate(tp, ino, &args, &ip); if (error) goto out_trans_cancel; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 15ab7a1c79a6..7d3fea66e069 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -554,10 +554,8 @@ int xfs_iflush_cluster(struct xfs_buf *); void xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode, struct xfs_inode *ip1, uint ip1_mode); -int xfs_init_new_inode(struct mnt_idmap *idmap, struct xfs_trans *tp, - struct xfs_inode *pip, xfs_ino_t ino, umode_t mode, - xfs_nlink_t nlink, dev_t rdev, prid_t prid, bool init_xattrs, - struct xfs_inode **ipp); +int xfs_icreate(struct xfs_trans *tp, xfs_ino_t ino, + const struct xfs_icreate_args *args, struct xfs_inode **ipp); static inline int xfs_itruncate_extents( diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 47120b745c47..78f839630c62 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -793,12 +793,14 @@ xfs_qm_qino_alloc( return error; if (need_alloc) { + struct xfs_icreate_args args = { + .mode = S_IFREG, + }; xfs_ino_t ino; error = xfs_dialloc(&tp, 0, S_IFREG, &ino); if (!error) - error = xfs_init_new_inode(&nop_mnt_idmap, tp, NULL, ino, - S_IFREG, 1, 0, 0, false, ipp); + error = xfs_icreate(tp, ino, &args, ipp); if (error) { xfs_trans_cancel(tp); return error; diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 53ed512c6f21..3b797a39950d 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -90,6 +90,11 @@ xfs_symlink( struct xfs_inode **ipp) { struct xfs_mount *mp = dp->i_mount; + struct xfs_icreate_args args = { + .idmap = idmap, + .pip = dp, + .mode = S_IFLNK | (mode & ~S_IFMT), + }; struct xfs_trans *tp = NULL; struct xfs_inode *ip = NULL; int error = 0; @@ -111,6 +116,9 @@ xfs_symlink( if (xfs_is_shutdown(mp)) return -EIO; + if (xfs_has_parent(mp)) + args.flags |= XFS_ICREATE_INIT_XATTRS; + /* * Check component lengths of the target path name. */ @@ -170,9 +178,7 @@ xfs_symlink( */ error = xfs_dialloc(&tp, dp->i_ino, S_IFLNK, &ino); if (!error) - error = xfs_init_new_inode(idmap, tp, dp, ino, - S_IFLNK | (mode & ~S_IFMT), 1, 0, prid, - xfs_has_parent(mp), &ip); + error = xfs_icreate(tp, ino, &args, &ip); if (error) goto out_trans_cancel; -- cgit v1.2.3-70-g09d2 From 3d1dfb6df9b7b9ffc95499b9ddd92d949e5a60d2 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Jul 2024 11:22:35 -0700 Subject: xfs: implement atime updates in xfs_trans_ichgtime Enable xfs_trans_ichgtime to change the inode access time so that we can use this function to set inode times when allocating inodes instead of open-coding it. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_shared.h | 1 + fs/xfs/libxfs/xfs_trans_inode.c | 2 ++ 2 files changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 34f104ed372c..9a705381f9e4 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -183,6 +183,7 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp, #define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */ #define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */ #define XFS_ICHGTIME_CREATE 0x4 /* inode create timestamp */ +#define XFS_ICHGTIME_ACCESS 0x8 /* last access timestamp */ /* Computed inode geometry for the filesystem. */ struct xfs_ino_geometry { diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index 69fc5b981352..3c40f37e82c7 100644 --- a/fs/xfs/libxfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -68,6 +68,8 @@ xfs_trans_ichgtime( inode_set_mtime_to_ts(inode, tv); if (flags & XFS_ICHGTIME_CHG) inode_set_ctime_to_ts(inode, tv); + if (flags & XFS_ICHGTIME_ACCESS) + inode_set_atime_to_ts(inode, tv); if (flags & XFS_ICHGTIME_CREATE) ip->i_crtime = tv; } -- cgit v1.2.3-70-g09d2 From a7b12718cb90188bc1a062d6cbb9d9a3f790e20a Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Jul 2024 11:22:36 -0700 Subject: xfs: use xfs_trans_ichgtime to set times when allocating inode Use xfs_trans_ichgtime to set the inode times when allocating an inode, instead of open-coding them here. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_inode.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 4f13ad0fa14e..59e76759e41f 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -672,10 +672,11 @@ xfs_icreate( struct inode *dir = pip ? VFS_I(pip) : NULL; struct xfs_mount *mp = tp->t_mountp; struct xfs_inode *ip; + struct inode *inode; unsigned int flags; + int times = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG | + XFS_ICHGTIME_ACCESS; int error; - struct timespec64 tv; - struct inode *inode; /* * Protect against obviously corrupt allocation btree records. Later @@ -743,19 +744,17 @@ xfs_icreate( ip->i_df.if_nextents = 0; ASSERT(ip->i_nblocks == 0); - tv = inode_set_ctime_current(inode); - inode_set_mtime_to_ts(inode, tv); - inode_set_atime_to_ts(inode, tv); - ip->i_extsize = 0; ip->i_diflags = 0; if (xfs_has_v3inodes(mp)) { inode_set_iversion(inode, 1); ip->i_cowextsize = 0; - ip->i_crtime = tv; + times |= XFS_ICHGTIME_CREATE; } + xfs_trans_ichgtime(tp, ip, times); + flags = XFS_ILOG_CORE; switch (args->mode & S_IFMT) { case S_IFIFO: -- cgit v1.2.3-70-g09d2 From 38fd3d6a956f1b104f11cd6eee116c54bfe458c4 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Jul 2024 11:22:36 -0700 Subject: xfs: split new inode creation into two pieces There are two parts to initializing a newly allocated inode: setting up the incore structures, and initializing the new inode core based on the parent inode and the current user's environment. The initialization code is not specific to the kernel, so we would like to share that with userspace by hoisting it to libxfs. Therefore, split xfs_icreate into separate functions to prepare for the next few patches. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_ialloc.c | 15 ++++++++++ fs/xfs/xfs_inode.c | 73 +++++++++++++++++++++------------------------- 2 files changed, 48 insertions(+), 40 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 14c81f227c5b..f8d5ed7aedde 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -1946,6 +1946,21 @@ retry: } return -ENOSPC; } + + /* + * Protect against obviously corrupt allocation btree records. Later + * xfs_iget checks will catch re-allocation of other active in-memory + * and on-disk inodes. If we don't catch reallocating the parent inode + * here we will deadlock in xfs_iget() so we have to do these checks + * first. + */ + if (ino == parent || !xfs_verify_dir_ino(mp, ino)) { + xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino); + xfs_agno_mark_sick(mp, XFS_INO_TO_AGNO(mp, ino), + XFS_SICK_AG_INOBT); + return -EFSCORRUPTED; + } + *new_ino = ino; return 0; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 59e76759e41f..88c641400f51 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -655,53 +655,20 @@ xfs_inode_inherit_flags2( } } -/* - * Initialise a newly allocated inode and return the in-core inode to the - * caller locked exclusively. - * - * Caller is responsible for unlocking the inode manually upon return - */ -int -xfs_icreate( +/* Initialise an inode's attributes. */ +static void +xfs_inode_init( struct xfs_trans *tp, - xfs_ino_t ino, const struct xfs_icreate_args *args, - struct xfs_inode **ipp) + struct xfs_inode *ip) { struct xfs_inode *pip = args->pip; struct inode *dir = pip ? VFS_I(pip) : NULL; struct xfs_mount *mp = tp->t_mountp; - struct xfs_inode *ip; - struct inode *inode; + struct inode *inode = VFS_I(ip); unsigned int flags; int times = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG | XFS_ICHGTIME_ACCESS; - int error; - - /* - * Protect against obviously corrupt allocation btree records. Later - * xfs_iget checks will catch re-allocation of other active in-memory - * and on-disk inodes. If we don't catch reallocating the parent inode - * here we will deadlock in xfs_iget() so we have to do these checks - * first. - */ - if ((pip && ino == pip->i_ino) || !xfs_verify_dir_ino(mp, ino)) { - xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino); - xfs_agno_mark_sick(mp, XFS_INO_TO_AGNO(mp, ino), - XFS_SICK_AG_INOBT); - return -EFSCORRUPTED; - } - - /* - * Get the in-core inode with the lock held exclusively to prevent - * others from looking at until we're done. - */ - error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip); - if (error) - return error; - - ASSERT(ip != NULL); - inode = VFS_I(ip); if (args->flags & XFS_ICREATE_TMPFILE) set_nlink(inode, 0); @@ -801,11 +768,37 @@ xfs_icreate( } } + xfs_trans_log_inode(tp, ip, flags); +} + +/* + * Initialise a newly allocated inode and return the in-core inode to the + * caller locked exclusively. + * + * Caller is responsible for unlocking the inode manually upon return + */ +int +xfs_icreate( + struct xfs_trans *tp, + xfs_ino_t ino, + const struct xfs_icreate_args *args, + struct xfs_inode **ipp) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_inode *ip = NULL; + int error; + /* - * Log the new values stuffed into the inode. + * Get the in-core inode with the lock held exclusively to prevent + * others from looking at until we're done. */ + error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip); + if (error) + return error; + + ASSERT(ip != NULL); xfs_trans_ijoin(tp, ip, 0); - xfs_trans_log_inode(tp, ip, flags); + xfs_inode_init(tp, args, ip); /* now that we have an i_mode we can setup the inode structure */ xfs_setup_inode(ip); -- cgit v1.2.3-70-g09d2 From e9d2b35bb9d3ff372fad27998fc3969ced3f563d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Jul 2024 11:22:37 -0700 Subject: xfs: hoist new inode initialization functions to libxfs Move all the code that initializes a new inode's attributes from the icreate_args structure and the parent directory into libxfs. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_inode_util.c | 212 +++++++++++++++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_inode_util.h | 12 +++ fs/xfs/libxfs/xfs_shared.h | 8 -- fs/xfs/xfs_inode.c | 209 +--------------------------------------- fs/xfs/xfs_trans.h | 1 - 5 files changed, 225 insertions(+), 217 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c index 3b5397a3f34f..8654cb3d79ef 100644 --- a/fs/xfs/libxfs/xfs_inode_util.c +++ b/fs/xfs/libxfs/xfs_inode_util.c @@ -3,6 +3,7 @@ * Copyright (c) 2000-2006 Silicon Graphics, Inc. * All Rights Reserved. */ +#include #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" @@ -13,6 +14,10 @@ #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_inode_util.h" +#include "xfs_trans.h" +#include "xfs_ialloc.h" +#include "xfs_health.h" +#include "xfs_bmap.h" uint16_t xfs_flags2diflags( @@ -132,3 +137,210 @@ xfs_get_initial_prid(struct xfs_inode *dp) /* Assign to the root project by default. */ return 0; } + +/* Propagate di_flags from a parent inode to a child inode. */ +static inline void +xfs_inode_inherit_flags( + struct xfs_inode *ip, + const struct xfs_inode *pip) +{ + unsigned int di_flags = 0; + xfs_failaddr_t failaddr; + umode_t mode = VFS_I(ip)->i_mode; + + if (S_ISDIR(mode)) { + if (pip->i_diflags & XFS_DIFLAG_RTINHERIT) + di_flags |= XFS_DIFLAG_RTINHERIT; + if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) { + di_flags |= XFS_DIFLAG_EXTSZINHERIT; + ip->i_extsize = pip->i_extsize; + } + if (pip->i_diflags & XFS_DIFLAG_PROJINHERIT) + di_flags |= XFS_DIFLAG_PROJINHERIT; + } else if (S_ISREG(mode)) { + if ((pip->i_diflags & XFS_DIFLAG_RTINHERIT) && + xfs_has_realtime(ip->i_mount)) + di_flags |= XFS_DIFLAG_REALTIME; + if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) { + di_flags |= XFS_DIFLAG_EXTSIZE; + ip->i_extsize = pip->i_extsize; + } + } + if ((pip->i_diflags & XFS_DIFLAG_NOATIME) && + xfs_inherit_noatime) + di_flags |= XFS_DIFLAG_NOATIME; + if ((pip->i_diflags & XFS_DIFLAG_NODUMP) && + xfs_inherit_nodump) + di_flags |= XFS_DIFLAG_NODUMP; + if ((pip->i_diflags & XFS_DIFLAG_SYNC) && + xfs_inherit_sync) + di_flags |= XFS_DIFLAG_SYNC; + if ((pip->i_diflags & XFS_DIFLAG_NOSYMLINKS) && + xfs_inherit_nosymlinks) + di_flags |= XFS_DIFLAG_NOSYMLINKS; + if ((pip->i_diflags & XFS_DIFLAG_NODEFRAG) && + xfs_inherit_nodefrag) + di_flags |= XFS_DIFLAG_NODEFRAG; + if (pip->i_diflags & XFS_DIFLAG_FILESTREAM) + di_flags |= XFS_DIFLAG_FILESTREAM; + + ip->i_diflags |= di_flags; + + /* + * Inode verifiers on older kernels only check that the extent size + * hint is an integer multiple of the rt extent size on realtime files. + * They did not check the hint alignment on a directory with both + * rtinherit and extszinherit flags set. If the misaligned hint is + * propagated from a directory into a new realtime file, new file + * allocations will fail due to math errors in the rt allocator and/or + * trip the verifiers. Validate the hint settings in the new file so + * that we don't let broken hints propagate. + */ + failaddr = xfs_inode_validate_extsize(ip->i_mount, ip->i_extsize, + VFS_I(ip)->i_mode, ip->i_diflags); + if (failaddr) { + ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE | + XFS_DIFLAG_EXTSZINHERIT); + ip->i_extsize = 0; + } +} + +/* Propagate di_flags2 from a parent inode to a child inode. */ +static inline void +xfs_inode_inherit_flags2( + struct xfs_inode *ip, + const struct xfs_inode *pip) +{ + xfs_failaddr_t failaddr; + + if (pip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) { + ip->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE; + ip->i_cowextsize = pip->i_cowextsize; + } + if (pip->i_diflags2 & XFS_DIFLAG2_DAX) + ip->i_diflags2 |= XFS_DIFLAG2_DAX; + + /* Don't let invalid cowextsize hints propagate. */ + failaddr = xfs_inode_validate_cowextsize(ip->i_mount, ip->i_cowextsize, + VFS_I(ip)->i_mode, ip->i_diflags, ip->i_diflags2); + if (failaddr) { + ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE; + ip->i_cowextsize = 0; + } +} + +/* Initialise an inode's attributes. */ +void +xfs_inode_init( + struct xfs_trans *tp, + const struct xfs_icreate_args *args, + struct xfs_inode *ip) +{ + struct xfs_inode *pip = args->pip; + struct inode *dir = pip ? VFS_I(pip) : NULL; + struct xfs_mount *mp = tp->t_mountp; + struct inode *inode = VFS_I(ip); + unsigned int flags; + int times = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG | + XFS_ICHGTIME_ACCESS; + + if (args->flags & XFS_ICREATE_TMPFILE) + set_nlink(inode, 0); + else if (S_ISDIR(args->mode)) + set_nlink(inode, 2); + else + set_nlink(inode, 1); + inode->i_rdev = args->rdev; + + if (!args->idmap || pip == NULL) { + /* creating a tree root, sb rooted, or detached file */ + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + ip->i_projid = 0; + inode->i_mode = args->mode; + } else { + /* creating a child in the directory tree */ + if (dir && !(dir->i_mode & S_ISGID) && xfs_has_grpid(mp)) { + inode_fsuid_set(inode, args->idmap); + inode->i_gid = dir->i_gid; + inode->i_mode = args->mode; + } else { + inode_init_owner(args->idmap, inode, dir, args->mode); + } + + /* + * If the group ID of the new file does not match the effective + * group ID or one of the supplementary group IDs, the S_ISGID + * bit is cleared (and only if the irix_sgid_inherit + * compatibility variable is set). + */ + if (irix_sgid_inherit && (inode->i_mode & S_ISGID) && + !vfsgid_in_group_p(i_gid_into_vfsgid(args->idmap, inode))) + inode->i_mode &= ~S_ISGID; + + ip->i_projid = pip ? xfs_get_initial_prid(pip) : 0; + } + + ip->i_disk_size = 0; + ip->i_df.if_nextents = 0; + ASSERT(ip->i_nblocks == 0); + + ip->i_extsize = 0; + ip->i_diflags = 0; + + if (xfs_has_v3inodes(mp)) { + inode_set_iversion(inode, 1); + ip->i_cowextsize = 0; + times |= XFS_ICHGTIME_CREATE; + } + + xfs_trans_ichgtime(tp, ip, times); + + flags = XFS_ILOG_CORE; + switch (args->mode & S_IFMT) { + case S_IFIFO: + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: + ip->i_df.if_format = XFS_DINODE_FMT_DEV; + flags |= XFS_ILOG_DEV; + break; + case S_IFREG: + case S_IFDIR: + if (pip && (pip->i_diflags & XFS_DIFLAG_ANY)) + xfs_inode_inherit_flags(ip, pip); + if (pip && (pip->i_diflags2 & XFS_DIFLAG2_ANY)) + xfs_inode_inherit_flags2(ip, pip); + fallthrough; + case S_IFLNK: + ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; + ip->i_df.if_bytes = 0; + ip->i_df.if_data = NULL; + break; + default: + ASSERT(0); + } + + /* + * If we need to create attributes immediately after allocating the + * inode, initialise an empty attribute fork right now. We use the + * default fork offset for attributes here as we don't know exactly what + * size or how many attributes we might be adding. We can do this + * safely here because we know the data fork is completely empty and + * this saves us from needing to run a separate transaction to set the + * fork offset in the immediate future. + */ + if (args->flags & XFS_ICREATE_INIT_XATTRS) { + ip->i_forkoff = xfs_default_attroffset(ip) >> 3; + xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0); + + if (!xfs_has_attr(mp)) { + spin_lock(&mp->m_sb_lock); + xfs_add_attr(mp); + spin_unlock(&mp->m_sb_lock); + xfs_log_sb(tp); + } + } + + xfs_trans_log_inode(tp, ip, flags); +} diff --git a/fs/xfs/libxfs/xfs_inode_util.h b/fs/xfs/libxfs/xfs_inode_util.h index 9226482fdee8..bf5393db4fde 100644 --- a/fs/xfs/libxfs/xfs_inode_util.h +++ b/fs/xfs/libxfs/xfs_inode_util.h @@ -35,4 +35,16 @@ struct xfs_icreate_args { uint16_t flags; }; +/* + * Flags for xfs_trans_ichgtime(). + */ +#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */ +#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */ +#define XFS_ICHGTIME_CREATE 0x4 /* inode create timestamp */ +#define XFS_ICHGTIME_ACCESS 0x8 /* last access timestamp */ +void xfs_trans_ichgtime(struct xfs_trans *tp, struct xfs_inode *ip, int flags); + +void xfs_inode_init(struct xfs_trans *tp, const struct xfs_icreate_args *args, + struct xfs_inode *ip); + #endif /* __XFS_INODE_UTIL_H__ */ diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 9a705381f9e4..2f7413afbf46 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -177,14 +177,6 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp, #define XFS_REFC_BTREE_REF 1 #define XFS_SSB_REF 0 -/* - * Flags for xfs_trans_ichgtime(). - */ -#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */ -#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */ -#define XFS_ICHGTIME_CREATE 0x4 /* inode create timestamp */ -#define XFS_ICHGTIME_ACCESS 0x8 /* last access timestamp */ - /* Computed inode geometry for the filesystem. */ struct xfs_ino_geometry { /* Maximum inode count in this filesystem. */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 88c641400f51..152def4ca5db 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -42,7 +42,7 @@ #include "xfs_pnfs.h" #include "xfs_parent.h" #include "xfs_xattr.h" -#include "xfs_sb.h" +#include "xfs_inode_util.h" struct kmem_cache *xfs_inode_cache; @@ -564,213 +564,6 @@ out_unlock: return error; } -/* Propagate di_flags from a parent inode to a child inode. */ -static void -xfs_inode_inherit_flags( - struct xfs_inode *ip, - const struct xfs_inode *pip) -{ - unsigned int di_flags = 0; - xfs_failaddr_t failaddr; - umode_t mode = VFS_I(ip)->i_mode; - - if (S_ISDIR(mode)) { - if (pip->i_diflags & XFS_DIFLAG_RTINHERIT) - di_flags |= XFS_DIFLAG_RTINHERIT; - if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) { - di_flags |= XFS_DIFLAG_EXTSZINHERIT; - ip->i_extsize = pip->i_extsize; - } - if (pip->i_diflags & XFS_DIFLAG_PROJINHERIT) - di_flags |= XFS_DIFLAG_PROJINHERIT; - } else if (S_ISREG(mode)) { - if ((pip->i_diflags & XFS_DIFLAG_RTINHERIT) && - xfs_has_realtime(ip->i_mount)) - di_flags |= XFS_DIFLAG_REALTIME; - if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) { - di_flags |= XFS_DIFLAG_EXTSIZE; - ip->i_extsize = pip->i_extsize; - } - } - if ((pip->i_diflags & XFS_DIFLAG_NOATIME) && - xfs_inherit_noatime) - di_flags |= XFS_DIFLAG_NOATIME; - if ((pip->i_diflags & XFS_DIFLAG_NODUMP) && - xfs_inherit_nodump) - di_flags |= XFS_DIFLAG_NODUMP; - if ((pip->i_diflags & XFS_DIFLAG_SYNC) && - xfs_inherit_sync) - di_flags |= XFS_DIFLAG_SYNC; - if ((pip->i_diflags & XFS_DIFLAG_NOSYMLINKS) && - xfs_inherit_nosymlinks) - di_flags |= XFS_DIFLAG_NOSYMLINKS; - if ((pip->i_diflags & XFS_DIFLAG_NODEFRAG) && - xfs_inherit_nodefrag) - di_flags |= XFS_DIFLAG_NODEFRAG; - if (pip->i_diflags & XFS_DIFLAG_FILESTREAM) - di_flags |= XFS_DIFLAG_FILESTREAM; - - ip->i_diflags |= di_flags; - - /* - * Inode verifiers on older kernels only check that the extent size - * hint is an integer multiple of the rt extent size on realtime files. - * They did not check the hint alignment on a directory with both - * rtinherit and extszinherit flags set. If the misaligned hint is - * propagated from a directory into a new realtime file, new file - * allocations will fail due to math errors in the rt allocator and/or - * trip the verifiers. Validate the hint settings in the new file so - * that we don't let broken hints propagate. - */ - failaddr = xfs_inode_validate_extsize(ip->i_mount, ip->i_extsize, - VFS_I(ip)->i_mode, ip->i_diflags); - if (failaddr) { - ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE | - XFS_DIFLAG_EXTSZINHERIT); - ip->i_extsize = 0; - } -} - -/* Propagate di_flags2 from a parent inode to a child inode. */ -static void -xfs_inode_inherit_flags2( - struct xfs_inode *ip, - const struct xfs_inode *pip) -{ - xfs_failaddr_t failaddr; - - if (pip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) { - ip->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE; - ip->i_cowextsize = pip->i_cowextsize; - } - if (pip->i_diflags2 & XFS_DIFLAG2_DAX) - ip->i_diflags2 |= XFS_DIFLAG2_DAX; - - /* Don't let invalid cowextsize hints propagate. */ - failaddr = xfs_inode_validate_cowextsize(ip->i_mount, ip->i_cowextsize, - VFS_I(ip)->i_mode, ip->i_diflags, ip->i_diflags2); - if (failaddr) { - ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE; - ip->i_cowextsize = 0; - } -} - -/* Initialise an inode's attributes. */ -static void -xfs_inode_init( - struct xfs_trans *tp, - const struct xfs_icreate_args *args, - struct xfs_inode *ip) -{ - struct xfs_inode *pip = args->pip; - struct inode *dir = pip ? VFS_I(pip) : NULL; - struct xfs_mount *mp = tp->t_mountp; - struct inode *inode = VFS_I(ip); - unsigned int flags; - int times = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG | - XFS_ICHGTIME_ACCESS; - - if (args->flags & XFS_ICREATE_TMPFILE) - set_nlink(inode, 0); - else if (S_ISDIR(args->mode)) - set_nlink(inode, 2); - else - set_nlink(inode, 1); - inode->i_rdev = args->rdev; - - if (!args->idmap || pip == NULL) { - /* creating a tree root, sb rooted, or detached file */ - inode->i_uid = GLOBAL_ROOT_UID; - inode->i_gid = GLOBAL_ROOT_GID; - ip->i_projid = 0; - inode->i_mode = args->mode; - } else { - /* creating a child in the directory tree */ - if (dir && !(dir->i_mode & S_ISGID) && xfs_has_grpid(mp)) { - inode_fsuid_set(inode, args->idmap); - inode->i_gid = dir->i_gid; - inode->i_mode = args->mode; - } else { - inode_init_owner(args->idmap, inode, dir, args->mode); - } - - /* - * If the group ID of the new file does not match the effective - * group ID or one of the supplementary group IDs, the S_ISGID - * bit is cleared (and only if the irix_sgid_inherit - * compatibility variable is set). - */ - if (irix_sgid_inherit && (inode->i_mode & S_ISGID) && - !vfsgid_in_group_p(i_gid_into_vfsgid(args->idmap, inode))) - inode->i_mode &= ~S_ISGID; - - ip->i_projid = pip ? xfs_get_initial_prid(pip) : 0; - } - - ip->i_disk_size = 0; - ip->i_df.if_nextents = 0; - ASSERT(ip->i_nblocks == 0); - - ip->i_extsize = 0; - ip->i_diflags = 0; - - if (xfs_has_v3inodes(mp)) { - inode_set_iversion(inode, 1); - ip->i_cowextsize = 0; - times |= XFS_ICHGTIME_CREATE; - } - - xfs_trans_ichgtime(tp, ip, times); - - flags = XFS_ILOG_CORE; - switch (args->mode & S_IFMT) { - case S_IFIFO: - case S_IFCHR: - case S_IFBLK: - case S_IFSOCK: - ip->i_df.if_format = XFS_DINODE_FMT_DEV; - flags |= XFS_ILOG_DEV; - break; - case S_IFREG: - case S_IFDIR: - if (pip && (pip->i_diflags & XFS_DIFLAG_ANY)) - xfs_inode_inherit_flags(ip, pip); - if (pip && (pip->i_diflags2 & XFS_DIFLAG2_ANY)) - xfs_inode_inherit_flags2(ip, pip); - fallthrough; - case S_IFLNK: - ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; - ip->i_df.if_bytes = 0; - ip->i_df.if_data = NULL; - break; - default: - ASSERT(0); - } - - /* - * If we need to create attributes immediately after allocating the - * inode, initialise an empty attribute fork right now. We use the - * default fork offset for attributes here as we don't know exactly what - * size or how many attributes we might be adding. We can do this - * safely here because we know the data fork is completely empty and - * this saves us from needing to run a separate transaction to set the - * fork offset in the immediate future. - */ - if (args->flags & XFS_ICREATE_INIT_XATTRS) { - ip->i_forkoff = xfs_default_attroffset(ip) >> 3; - xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0); - - if (!xfs_has_attr(mp)) { - spin_lock(&mp->m_sb_lock); - xfs_add_attr(mp); - spin_unlock(&mp->m_sb_lock); - xfs_log_sb(tp); - } - } - - xfs_trans_log_inode(tp, ip, flags); -} - /* * Initialise a newly allocated inode and return the in-core inode to the * caller locked exclusively. diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 1636663707dc..f97e8c68641f 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -224,7 +224,6 @@ void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *); bool xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); -void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int); void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint); void xfs_trans_log_buf(struct xfs_trans *, struct xfs_buf *, uint, uint); -- cgit v1.2.3-70-g09d2 From dfaf884233ba726bf389cbf6f629b3a3a7a93923 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Jul 2024 11:22:38 -0700 Subject: xfs: push xfs_icreate_args creation out of xfs_create* Move the initialization of the xfs_icreate_args structure out of xfs_create and xfs_create_tempfile into their callers so that we can set the new inode's attributes in one place and pass that through instead of open coding the collection of attributes all over the code. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_inode.c | 80 ++++++++++++++++++++++++------------------------------ fs/xfs/xfs_inode.h | 9 ++---- fs/xfs/xfs_iops.c | 41 ++++++++++++++++------------ 3 files changed, 63 insertions(+), 67 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 152def4ca5db..0b4f6cf72bae 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -730,35 +730,25 @@ xfs_dir_hook_setup( int xfs_create( - struct mnt_idmap *idmap, - struct xfs_inode *dp, + const struct xfs_icreate_args *args, struct xfs_name *name, - umode_t mode, - dev_t rdev, - bool init_xattrs, - xfs_inode_t **ipp) + struct xfs_inode **ipp) { - struct xfs_icreate_args args = { - .idmap = idmap, - .pip = dp, - .rdev = rdev, - .mode = mode, - .flags = init_xattrs ? XFS_ICREATE_INIT_XATTRS : 0, - }; - int is_dir = S_ISDIR(mode); + struct xfs_inode *dp = args->pip; struct xfs_mount *mp = dp->i_mount; struct xfs_inode *ip = NULL; struct xfs_trans *tp = NULL; - int error; - bool unlock_dp_on_error = false; - prid_t prid; struct xfs_dquot *udqp = NULL; struct xfs_dquot *gdqp = NULL; struct xfs_dquot *pdqp = NULL; struct xfs_trans_res *tres; - uint resblks; - xfs_ino_t ino; struct xfs_parent_args *ppargs; + xfs_ino_t ino; + prid_t prid; + bool unlock_dp_on_error = false; + bool is_dir = S_ISDIR(args->mode); + uint resblks; + int error; trace_xfs_create(dp, name); @@ -774,8 +764,9 @@ xfs_create( * computation code must match what the VFS uses to assign i_[ug]id. * INHERIT adjusts the gid computation for setgid/grpid systems. */ - error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, i_user_ns(VFS_I(dp))), - mapped_fsgid(idmap, i_user_ns(VFS_I(dp))), prid, + error = xfs_qm_vop_dqalloc(dp, + mapped_fsuid(args->idmap, i_user_ns(VFS_I(dp))), + mapped_fsgid(args->idmap, i_user_ns(VFS_I(dp))), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) @@ -818,9 +809,9 @@ xfs_create( * entry pointing to them, but a directory also the "." entry * pointing to itself. */ - error = xfs_dialloc(&tp, dp->i_ino, mode, &ino); + error = xfs_dialloc(&tp, dp->i_ino, args->mode, &ino); if (!error) - error = xfs_icreate(tp, ino, &args, &ip); + error = xfs_icreate(tp, ino, args, &ip); if (error) goto out_trans_cancel; @@ -922,44 +913,37 @@ xfs_create( int xfs_create_tmpfile( - struct mnt_idmap *idmap, - struct xfs_inode *dp, - umode_t mode, - bool init_xattrs, + const struct xfs_icreate_args *args, struct xfs_inode **ipp) { - struct xfs_icreate_args args = { - .idmap = idmap, - .pip = dp, - .mode = mode, - .flags = XFS_ICREATE_TMPFILE, - }; + struct xfs_inode *dp = args->pip; struct xfs_mount *mp = dp->i_mount; struct xfs_inode *ip = NULL; struct xfs_trans *tp = NULL; - int error; - prid_t prid; struct xfs_dquot *udqp = NULL; struct xfs_dquot *gdqp = NULL; struct xfs_dquot *pdqp = NULL; struct xfs_trans_res *tres; - uint resblks; xfs_ino_t ino; + prid_t prid; + uint resblks; + int error; + + ASSERT(args->flags & XFS_ICREATE_TMPFILE); if (xfs_is_shutdown(mp)) return -EIO; prid = xfs_get_initial_prid(dp); - if (init_xattrs) - args.flags |= XFS_ICREATE_INIT_XATTRS; /* * Make sure that we have allocated dquot(s) on disk. The uid/gid * computation code must match what the VFS uses to assign i_[ug]id. * INHERIT adjusts the gid computation for setgid/grpid systems. */ - error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, i_user_ns(VFS_I(dp))), - mapped_fsgid(idmap, i_user_ns(VFS_I(dp))), prid, + error = xfs_qm_vop_dqalloc(dp, + mapped_fsuid(args->idmap, i_user_ns(VFS_I(dp))), + mapped_fsgid(args->idmap, i_user_ns(VFS_I(dp))), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) @@ -973,9 +957,9 @@ xfs_create_tmpfile( if (error) goto out_release_dquots; - error = xfs_dialloc(&tp, dp->i_ino, mode, &ino); + error = xfs_dialloc(&tp, dp->i_ino, args->mode, &ino); if (!error) - error = xfs_icreate(tp, ino, &args, &ip); + error = xfs_icreate(tp, ino, args, &ip); if (error) goto out_trans_cancel; @@ -2854,12 +2838,20 @@ xfs_rename_alloc_whiteout( struct xfs_inode *dp, struct xfs_inode **wip) { + struct xfs_icreate_args args = { + .idmap = idmap, + .pip = dp, + .mode = S_IFCHR | WHITEOUT_MODE, + .flags = XFS_ICREATE_TMPFILE, + }; struct xfs_inode *tmpfile; struct qstr name; int error; - error = xfs_create_tmpfile(idmap, dp, S_IFCHR | WHITEOUT_MODE, - xfs_has_parent(dp->i_mount), &tmpfile); + if (xfs_has_parent(dp->i_mount)) + args.flags |= XFS_ICREATE_INIT_XATTRS; + + error = xfs_create_tmpfile(&args, &tmpfile); if (error) return error; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 7d3fea66e069..bc48e81829b5 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -516,12 +516,9 @@ int xfs_release(struct xfs_inode *ip); int xfs_inactive(struct xfs_inode *ip); int xfs_lookup(struct xfs_inode *dp, const struct xfs_name *name, struct xfs_inode **ipp, struct xfs_name *ci_name); -int xfs_create(struct mnt_idmap *idmap, - struct xfs_inode *dp, struct xfs_name *name, - umode_t mode, dev_t rdev, bool need_xattr, - struct xfs_inode **ipp); -int xfs_create_tmpfile(struct mnt_idmap *idmap, - struct xfs_inode *dp, umode_t mode, bool init_xattrs, +int xfs_create(const struct xfs_icreate_args *iargs, + struct xfs_name *name, struct xfs_inode **ipp); +int xfs_create_tmpfile(const struct xfs_icreate_args *iargs, struct xfs_inode **ipp); int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode *ip); diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 35a84790d26e..4563ba440570 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -173,38 +173,46 @@ xfs_generic_create( dev_t rdev, struct file *tmpfile) /* unnamed file */ { - struct inode *inode; - struct xfs_inode *ip = NULL; - struct posix_acl *default_acl, *acl; - struct xfs_name name; - int error; + struct xfs_icreate_args args = { + .idmap = idmap, + .pip = XFS_I(dir), + .rdev = rdev, + .mode = mode, + }; + struct inode *inode; + struct xfs_inode *ip = NULL; + struct posix_acl *default_acl, *acl; + struct xfs_name name; + int error; /* * Irix uses Missed'em'V split, but doesn't want to see * the upper 5 bits of (14bit) major. */ - if (S_ISCHR(mode) || S_ISBLK(mode)) { - if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff)) + if (S_ISCHR(args.mode) || S_ISBLK(args.mode)) { + if (unlikely(!sysv_valid_dev(args.rdev) || + MAJOR(args.rdev) & ~0x1ff)) return -EINVAL; } else { - rdev = 0; + args.rdev = 0; } - error = posix_acl_create(dir, &mode, &default_acl, &acl); + error = posix_acl_create(dir, &args.mode, &default_acl, &acl); if (error) return error; /* Verify mode is valid also for tmpfile case */ - error = xfs_dentry_mode_to_name(&name, dentry, mode); + error = xfs_dentry_mode_to_name(&name, dentry, args.mode); if (unlikely(error)) goto out_free_acl; if (!tmpfile) { - error = xfs_create(idmap, XFS_I(dir), &name, mode, rdev, - xfs_create_need_xattr(dir, default_acl, acl), - &ip); + if (xfs_create_need_xattr(dir, default_acl, acl)) + args.flags |= XFS_ICREATE_INIT_XATTRS; + + error = xfs_create(&args, &name, &ip); } else { - bool init_xattrs = false; + args.flags |= XFS_ICREATE_TMPFILE; /* * If this temporary file will be linkable, set up the file @@ -212,10 +220,9 @@ xfs_generic_create( */ if (!(tmpfile->f_flags & O_EXCL) && xfs_has_parent(XFS_I(dir)->i_mount)) - init_xattrs = true; + args.flags |= XFS_ICREATE_INIT_XATTRS; - error = xfs_create_tmpfile(idmap, XFS_I(dir), mode, - init_xattrs, &ip); + error = xfs_create_tmpfile(&args, &ip); } if (unlikely(error)) goto out_free_acl; -- cgit v1.2.3-70-g09d2 From c0223b8d66d2b3e8fed86fd80699ef2fef3e53af Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Jul 2024 11:22:39 -0700 Subject: xfs: wrap inode creation dqalloc calls Create a helper that calls dqalloc to allocate and grab a reference to dquots for the user, group, and project ids listed in an icreate structure. This simplifies the creat-related dqalloc callsites scattered around the code base. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/tempfile.c | 9 +++--- fs/xfs/xfs_inode.c | 74 ++++++++++++++++++++++++++++--------------------- fs/xfs/xfs_inode.h | 4 +++ fs/xfs/xfs_symlink.c | 20 ++++--------- 4 files changed, 55 insertions(+), 52 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/tempfile.c b/fs/xfs/scrub/tempfile.c index ee6f93e9f7cb..523971a15a72 100644 --- a/fs/xfs/scrub/tempfile.c +++ b/fs/xfs/scrub/tempfile.c @@ -47,9 +47,9 @@ xrep_tempfile_create( }; struct xfs_mount *mp = sc->mp; struct xfs_trans *tp = NULL; - struct xfs_dquot *udqp = NULL; - struct xfs_dquot *gdqp = NULL; - struct xfs_dquot *pdqp = NULL; + struct xfs_dquot *udqp; + struct xfs_dquot *gdqp; + struct xfs_dquot *pdqp; struct xfs_trans_res *tres; struct xfs_inode *dp = mp->m_rootip; xfs_ino_t ino; @@ -70,8 +70,7 @@ xrep_tempfile_create( * inode should be completely root owned so that we don't fail due to * quota limits. */ - error = xfs_qm_vop_dqalloc(dp, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0, - XFS_QMOPT_QUOTALL, &udqp, &gdqp, &pdqp); + error = xfs_icreate_dqalloc(&args, &udqp, &gdqp, &pdqp); if (error) return error; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 0b4f6cf72bae..5848f7b36cc5 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -728,6 +728,38 @@ xfs_dir_hook_setup( } #endif /* CONFIG_XFS_LIVE_HOOKS */ +/* Return dquots for the ids that will be assigned to a new file. */ +int +xfs_icreate_dqalloc( + const struct xfs_icreate_args *args, + struct xfs_dquot **udqpp, + struct xfs_dquot **gdqpp, + struct xfs_dquot **pdqpp) +{ + struct inode *dir = VFS_I(args->pip); + kuid_t uid = GLOBAL_ROOT_UID; + kgid_t gid = GLOBAL_ROOT_GID; + prid_t prid = 0; + unsigned int flags = XFS_QMOPT_QUOTALL; + + if (args->idmap) { + /* + * The uid/gid computation code must match what the VFS uses to + * assign i_[ug]id. INHERIT adjusts the gid computation for + * setgid/grpid systems. + */ + uid = mapped_fsuid(args->idmap, i_user_ns(dir)); + gid = mapped_fsgid(args->idmap, i_user_ns(dir)); + prid = xfs_get_initial_prid(args->pip); + flags |= XFS_QMOPT_INHERIT; + } + + *udqpp = *gdqpp = *pdqpp = NULL; + + return xfs_qm_vop_dqalloc(args->pip, uid, gid, prid, flags, udqpp, + gdqpp, pdqpp); +} + int xfs_create( const struct xfs_icreate_args *args, @@ -738,13 +770,12 @@ xfs_create( struct xfs_mount *mp = dp->i_mount; struct xfs_inode *ip = NULL; struct xfs_trans *tp = NULL; - struct xfs_dquot *udqp = NULL; - struct xfs_dquot *gdqp = NULL; - struct xfs_dquot *pdqp = NULL; + struct xfs_dquot *udqp; + struct xfs_dquot *gdqp; + struct xfs_dquot *pdqp; struct xfs_trans_res *tres; struct xfs_parent_args *ppargs; xfs_ino_t ino; - prid_t prid; bool unlock_dp_on_error = false; bool is_dir = S_ISDIR(args->mode); uint resblks; @@ -757,18 +788,8 @@ xfs_create( if (xfs_ifork_zapped(dp, XFS_DATA_FORK)) return -EIO; - prid = xfs_get_initial_prid(dp); - - /* - * Make sure that we have allocated dquot(s) on disk. The uid/gid - * computation code must match what the VFS uses to assign i_[ug]id. - * INHERIT adjusts the gid computation for setgid/grpid systems. - */ - error = xfs_qm_vop_dqalloc(dp, - mapped_fsuid(args->idmap, i_user_ns(VFS_I(dp))), - mapped_fsgid(args->idmap, i_user_ns(VFS_I(dp))), prid, - XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, - &udqp, &gdqp, &pdqp); + /* Make sure that we have allocated dquot(s) on disk. */ + error = xfs_icreate_dqalloc(args, &udqp, &gdqp, &pdqp); if (error) return error; @@ -920,12 +941,11 @@ xfs_create_tmpfile( struct xfs_mount *mp = dp->i_mount; struct xfs_inode *ip = NULL; struct xfs_trans *tp = NULL; - struct xfs_dquot *udqp = NULL; - struct xfs_dquot *gdqp = NULL; - struct xfs_dquot *pdqp = NULL; + struct xfs_dquot *udqp; + struct xfs_dquot *gdqp; + struct xfs_dquot *pdqp; struct xfs_trans_res *tres; xfs_ino_t ino; - prid_t prid; uint resblks; int error; @@ -934,18 +954,8 @@ xfs_create_tmpfile( if (xfs_is_shutdown(mp)) return -EIO; - prid = xfs_get_initial_prid(dp); - - /* - * Make sure that we have allocated dquot(s) on disk. The uid/gid - * computation code must match what the VFS uses to assign i_[ug]id. - * INHERIT adjusts the gid computation for setgid/grpid systems. - */ - error = xfs_qm_vop_dqalloc(dp, - mapped_fsuid(args->idmap, i_user_ns(VFS_I(dp))), - mapped_fsgid(args->idmap, i_user_ns(VFS_I(dp))), prid, - XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, - &udqp, &gdqp, &pdqp); + /* Make sure that we have allocated dquot(s) on disk. */ + error = xfs_icreate_dqalloc(args, &udqp, &gdqp, &pdqp); if (error) return error; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index bc48e81829b5..a905929494bd 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -660,4 +660,8 @@ void xfs_dir_hook_setup(struct xfs_dir_hook *hook, notifier_fn_t mod_fn); # define xfs_dir_update_hook(dp, ip, delta, name) ((void)0) #endif /* CONFIG_XFS_LIVE_HOOKS */ +int xfs_icreate_dqalloc(const struct xfs_icreate_args *args, + struct xfs_dquot **udqpp, struct xfs_dquot **gdqpp, + struct xfs_dquot **pdqpp); + #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 3b797a39950d..6ff736e5c4e7 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -101,10 +101,9 @@ xfs_symlink( int pathlen; bool unlock_dp_on_error = false; xfs_filblks_t fs_blocks; - prid_t prid; - struct xfs_dquot *udqp = NULL; - struct xfs_dquot *gdqp = NULL; - struct xfs_dquot *pdqp = NULL; + struct xfs_dquot *udqp; + struct xfs_dquot *gdqp; + struct xfs_dquot *pdqp; uint resblks; xfs_ino_t ino; struct xfs_parent_args *ppargs; @@ -127,17 +126,8 @@ xfs_symlink( return -ENAMETOOLONG; ASSERT(pathlen > 0); - prid = xfs_get_initial_prid(dp); - - /* - * Make sure that we have allocated dquot(s) on disk. The uid/gid - * computation code must match what the VFS uses to assign i_[ug]id. - * INHERIT adjusts the gid computation for setgid/grpid systems. - */ - error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, i_user_ns(VFS_I(dp))), - mapped_fsgid(idmap, i_user_ns(VFS_I(dp))), prid, - XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, - &udqp, &gdqp, &pdqp); + /* Make sure that we have allocated dquot(s) on disk. */ + error = xfs_icreate_dqalloc(&args, &udqp, &gdqp, &pdqp); if (error) return error; -- cgit v1.2.3-70-g09d2 From b8a6107921ca799330ff3efdd154b7fa0ff54582 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Jul 2024 11:22:40 -0700 Subject: xfs: hoist xfs_iunlink to libxfs Move xfs_iunlink and xfs_iunlink_remove to libxfs. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_inode_util.c | 282 +++++++++++++++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_inode_util.h | 4 + fs/xfs/xfs_inode.c | 280 +--------------------------------------- fs/xfs/xfs_inode.h | 5 +- 4 files changed, 289 insertions(+), 282 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c index 8654cb3d79ef..5739871ac370 100644 --- a/fs/xfs/libxfs/xfs_inode_util.c +++ b/fs/xfs/libxfs/xfs_inode_util.c @@ -18,6 +18,10 @@ #include "xfs_ialloc.h" #include "xfs_health.h" #include "xfs_bmap.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_ag.h" +#include "xfs_iunlink_item.h" uint16_t xfs_flags2diflags( @@ -344,3 +348,281 @@ xfs_inode_init( xfs_trans_log_inode(tp, ip, flags); } + +/* + * In-Core Unlinked List Lookups + * ============================= + * + * Every inode is supposed to be reachable from some other piece of metadata + * with the exception of the root directory. Inodes with a connection to a + * file descriptor but not linked from anywhere in the on-disk directory tree + * are collectively known as unlinked inodes, though the filesystem itself + * maintains links to these inodes so that on-disk metadata are consistent. + * + * XFS implements a per-AG on-disk hash table of unlinked inodes. The AGI + * header contains a number of buckets that point to an inode, and each inode + * record has a pointer to the next inode in the hash chain. This + * singly-linked list causes scaling problems in the iunlink remove function + * because we must walk that list to find the inode that points to the inode + * being removed from the unlinked hash bucket list. + * + * Hence we keep an in-memory double linked list to link each inode on an + * unlinked list. Because there are 64 unlinked lists per AGI, keeping pointer + * based lists would require having 64 list heads in the perag, one for each + * list. This is expensive in terms of memory (think millions of AGs) and cache + * misses on lookups. Instead, use the fact that inodes on the unlinked list + * must be referenced at the VFS level to keep them on the list and hence we + * have an existence guarantee for inodes on the unlinked list. + * + * Given we have an existence guarantee, we can use lockless inode cache lookups + * to resolve aginos to xfs inodes. This means we only need 8 bytes per inode + * for the double linked unlinked list, and we don't need any extra locking to + * keep the list safe as all manipulations are done under the AGI buffer lock. + * Keeping the list up to date does not require memory allocation, just finding + * the XFS inode and updating the next/prev unlinked list aginos. + */ + +/* + * Update the prev pointer of the next agino. Returns -ENOLINK if the inode + * is not in cache. + */ +static int +xfs_iunlink_update_backref( + struct xfs_perag *pag, + xfs_agino_t prev_agino, + xfs_agino_t next_agino) +{ + struct xfs_inode *ip; + + /* No update necessary if we are at the end of the list. */ + if (next_agino == NULLAGINO) + return 0; + + ip = xfs_iunlink_lookup(pag, next_agino); + if (!ip) + return -ENOLINK; + + ip->i_prev_unlinked = prev_agino; + return 0; +} + +/* + * Point the AGI unlinked bucket at an inode and log the results. The caller + * is responsible for validating the old value. + */ +STATIC int +xfs_iunlink_update_bucket( + struct xfs_trans *tp, + struct xfs_perag *pag, + struct xfs_buf *agibp, + unsigned int bucket_index, + xfs_agino_t new_agino) +{ + struct xfs_agi *agi = agibp->b_addr; + xfs_agino_t old_value; + int offset; + + ASSERT(xfs_verify_agino_or_null(pag, new_agino)); + + old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]); + trace_xfs_iunlink_update_bucket(tp->t_mountp, pag->pag_agno, bucket_index, + old_value, new_agino); + + /* + * We should never find the head of the list already set to the value + * passed in because either we're adding or removing ourselves from the + * head of the list. + */ + if (old_value == new_agino) { + xfs_buf_mark_corrupt(agibp); + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); + return -EFSCORRUPTED; + } + + agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino); + offset = offsetof(struct xfs_agi, agi_unlinked) + + (sizeof(xfs_agino_t) * bucket_index); + xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1); + return 0; +} + +static int +xfs_iunlink_insert_inode( + struct xfs_trans *tp, + struct xfs_perag *pag, + struct xfs_buf *agibp, + struct xfs_inode *ip) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_agi *agi = agibp->b_addr; + xfs_agino_t next_agino; + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; + int error; + + /* + * Get the index into the agi hash table for the list this inode will + * go on. Make sure the pointer isn't garbage and that this inode + * isn't already on the list. + */ + next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); + if (next_agino == agino || + !xfs_verify_agino_or_null(pag, next_agino)) { + xfs_buf_mark_corrupt(agibp); + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); + return -EFSCORRUPTED; + } + + /* + * Update the prev pointer in the next inode to point back to this + * inode. + */ + error = xfs_iunlink_update_backref(pag, agino, next_agino); + if (error == -ENOLINK) + error = xfs_iunlink_reload_next(tp, agibp, agino, next_agino); + if (error) + return error; + + if (next_agino != NULLAGINO) { + /* + * There is already another inode in the bucket, so point this + * inode to the current head of the list. + */ + error = xfs_iunlink_log_inode(tp, ip, pag, next_agino); + if (error) + return error; + ip->i_next_unlinked = next_agino; + } + + /* Point the head of the list to point to this inode. */ + ip->i_prev_unlinked = NULLAGINO; + return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino); +} + +/* + * This is called when the inode's link count has gone to 0 or we are creating + * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0. + * + * We place the on-disk inode on a list in the AGI. It will be pulled from this + * list when the inode is freed. + */ +int +xfs_iunlink( + struct xfs_trans *tp, + struct xfs_inode *ip) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_perag *pag; + struct xfs_buf *agibp; + int error; + + ASSERT(VFS_I(ip)->i_nlink == 0); + ASSERT(VFS_I(ip)->i_mode != 0); + trace_xfs_iunlink(ip); + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + + /* Get the agi buffer first. It ensures lock ordering on the list. */ + error = xfs_read_agi(pag, tp, 0, &agibp); + if (error) + goto out; + + error = xfs_iunlink_insert_inode(tp, pag, agibp, ip); +out: + xfs_perag_put(pag); + return error; +} + +static int +xfs_iunlink_remove_inode( + struct xfs_trans *tp, + struct xfs_perag *pag, + struct xfs_buf *agibp, + struct xfs_inode *ip) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_agi *agi = agibp->b_addr; + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + xfs_agino_t head_agino; + short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; + int error; + + trace_xfs_iunlink_remove(ip); + + /* + * Get the index into the agi hash table for the list this inode will + * go on. Make sure the head pointer isn't garbage. + */ + head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); + if (!xfs_verify_agino(pag, head_agino)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + agi, sizeof(*agi)); + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); + return -EFSCORRUPTED; + } + + /* + * Set our inode's next_unlinked pointer to NULL and then return + * the old pointer value so that we can update whatever was previous + * to us in the list to point to whatever was next in the list. + */ + error = xfs_iunlink_log_inode(tp, ip, pag, NULLAGINO); + if (error) + return error; + + /* + * Update the prev pointer in the next inode to point back to previous + * inode in the chain. + */ + error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked, + ip->i_next_unlinked); + if (error == -ENOLINK) + error = xfs_iunlink_reload_next(tp, agibp, ip->i_prev_unlinked, + ip->i_next_unlinked); + if (error) + return error; + + if (head_agino != agino) { + struct xfs_inode *prev_ip; + + prev_ip = xfs_iunlink_lookup(pag, ip->i_prev_unlinked); + if (!prev_ip) { + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); + return -EFSCORRUPTED; + } + + error = xfs_iunlink_log_inode(tp, prev_ip, pag, + ip->i_next_unlinked); + prev_ip->i_next_unlinked = ip->i_next_unlinked; + } else { + /* Point the head of the list to the next unlinked inode. */ + error = xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, + ip->i_next_unlinked); + } + + ip->i_next_unlinked = NULLAGINO; + ip->i_prev_unlinked = 0; + return error; +} + +/* + * Pull the on-disk inode from the AGI unlinked list. + */ +int +xfs_iunlink_remove( + struct xfs_trans *tp, + struct xfs_perag *pag, + struct xfs_inode *ip) +{ + struct xfs_buf *agibp; + int error; + + trace_xfs_iunlink_remove(ip); + + /* Get the agi buffer first. It ensures lock ordering on the list. */ + error = xfs_read_agi(pag, tp, 0, &agibp); + if (error) + return error; + + return xfs_iunlink_remove_inode(tp, pag, agibp, ip); +} diff --git a/fs/xfs/libxfs/xfs_inode_util.h b/fs/xfs/libxfs/xfs_inode_util.h index bf5393db4fde..42a032afe3ca 100644 --- a/fs/xfs/libxfs/xfs_inode_util.h +++ b/fs/xfs/libxfs/xfs_inode_util.h @@ -47,4 +47,8 @@ void xfs_trans_ichgtime(struct xfs_trans *tp, struct xfs_inode *ip, int flags); void xfs_inode_init(struct xfs_trans *tp, const struct xfs_icreate_args *args, struct xfs_inode *ip); +int xfs_iunlink(struct xfs_trans *tp, struct xfs_inode *ip); +int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag, + struct xfs_inode *ip); + #endif /* __XFS_INODE_UTIL_H__ */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 5848f7b36cc5..ac5826ce5fec 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1738,39 +1738,6 @@ out: return error; } -/* - * In-Core Unlinked List Lookups - * ============================= - * - * Every inode is supposed to be reachable from some other piece of metadata - * with the exception of the root directory. Inodes with a connection to a - * file descriptor but not linked from anywhere in the on-disk directory tree - * are collectively known as unlinked inodes, though the filesystem itself - * maintains links to these inodes so that on-disk metadata are consistent. - * - * XFS implements a per-AG on-disk hash table of unlinked inodes. The AGI - * header contains a number of buckets that point to an inode, and each inode - * record has a pointer to the next inode in the hash chain. This - * singly-linked list causes scaling problems in the iunlink remove function - * because we must walk that list to find the inode that points to the inode - * being removed from the unlinked hash bucket list. - * - * Hence we keep an in-memory double linked list to link each inode on an - * unlinked list. Because there are 64 unlinked lists per AGI, keeping pointer - * based lists would require having 64 list heads in the perag, one for each - * list. This is expensive in terms of memory (think millions of AGs) and cache - * misses on lookups. Instead, use the fact that inodes on the unlinked list - * must be referenced at the VFS level to keep them on the list and hence we - * have an existence guarantee for inodes on the unlinked list. - * - * Given we have an existence guarantee, we can use lockless inode cache lookups - * to resolve aginos to xfs inodes. This means we only need 8 bytes per inode - * for the double linked unlinked list, and we don't need any extra locking to - * keep the list safe as all manipulations are done under the AGI buffer lock. - * Keeping the list up to date does not require memory allocation, just finding - * the XFS inode and updating the next/prev unlinked list aginos. - */ - /* * Find an inode on the unlinked list. This does not take references to the * inode as we have existence guarantees by holding the AGI buffer lock and that @@ -1805,76 +1772,12 @@ xfs_iunlink_lookup( return ip; } -/* - * Update the prev pointer of the next agino. Returns -ENOLINK if the inode - * is not in cache. - */ -static int -xfs_iunlink_update_backref( - struct xfs_perag *pag, - xfs_agino_t prev_agino, - xfs_agino_t next_agino) -{ - struct xfs_inode *ip; - - /* No update necessary if we are at the end of the list. */ - if (next_agino == NULLAGINO) - return 0; - - ip = xfs_iunlink_lookup(pag, next_agino); - if (!ip) - return -ENOLINK; - - ip->i_prev_unlinked = prev_agino; - return 0; -} - -/* - * Point the AGI unlinked bucket at an inode and log the results. The caller - * is responsible for validating the old value. - */ -STATIC int -xfs_iunlink_update_bucket( - struct xfs_trans *tp, - struct xfs_perag *pag, - struct xfs_buf *agibp, - unsigned int bucket_index, - xfs_agino_t new_agino) -{ - struct xfs_agi *agi = agibp->b_addr; - xfs_agino_t old_value; - int offset; - - ASSERT(xfs_verify_agino_or_null(pag, new_agino)); - - old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]); - trace_xfs_iunlink_update_bucket(tp->t_mountp, pag->pag_agno, bucket_index, - old_value, new_agino); - - /* - * We should never find the head of the list already set to the value - * passed in because either we're adding or removing ourselves from the - * head of the list. - */ - if (old_value == new_agino) { - xfs_buf_mark_corrupt(agibp); - xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); - return -EFSCORRUPTED; - } - - agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino); - offset = offsetof(struct xfs_agi, agi_unlinked) + - (sizeof(xfs_agino_t) * bucket_index); - xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1); - return 0; -} - /* * Load the inode @next_agino into the cache and set its prev_unlinked pointer * to @prev_agino. Caller must hold the AGI to synchronize with other changes * to the unlinked list. */ -STATIC int +int xfs_iunlink_reload_next( struct xfs_trans *tp, struct xfs_buf *agibp, @@ -1930,187 +1833,6 @@ rele: return error; } -static int -xfs_iunlink_insert_inode( - struct xfs_trans *tp, - struct xfs_perag *pag, - struct xfs_buf *agibp, - struct xfs_inode *ip) -{ - struct xfs_mount *mp = tp->t_mountp; - struct xfs_agi *agi = agibp->b_addr; - xfs_agino_t next_agino; - xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); - short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; - int error; - - /* - * Get the index into the agi hash table for the list this inode will - * go on. Make sure the pointer isn't garbage and that this inode - * isn't already on the list. - */ - next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); - if (next_agino == agino || - !xfs_verify_agino_or_null(pag, next_agino)) { - xfs_buf_mark_corrupt(agibp); - xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); - return -EFSCORRUPTED; - } - - /* - * Update the prev pointer in the next inode to point back to this - * inode. - */ - error = xfs_iunlink_update_backref(pag, agino, next_agino); - if (error == -ENOLINK) - error = xfs_iunlink_reload_next(tp, agibp, agino, next_agino); - if (error) - return error; - - if (next_agino != NULLAGINO) { - /* - * There is already another inode in the bucket, so point this - * inode to the current head of the list. - */ - error = xfs_iunlink_log_inode(tp, ip, pag, next_agino); - if (error) - return error; - ip->i_next_unlinked = next_agino; - } - - /* Point the head of the list to point to this inode. */ - ip->i_prev_unlinked = NULLAGINO; - return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino); -} - -/* - * This is called when the inode's link count has gone to 0 or we are creating - * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0. - * - * We place the on-disk inode on a list in the AGI. It will be pulled from this - * list when the inode is freed. - */ -int -xfs_iunlink( - struct xfs_trans *tp, - struct xfs_inode *ip) -{ - struct xfs_mount *mp = tp->t_mountp; - struct xfs_perag *pag; - struct xfs_buf *agibp; - int error; - - ASSERT(VFS_I(ip)->i_nlink == 0); - ASSERT(VFS_I(ip)->i_mode != 0); - trace_xfs_iunlink(ip); - - pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); - - /* Get the agi buffer first. It ensures lock ordering on the list. */ - error = xfs_read_agi(pag, tp, 0, &agibp); - if (error) - goto out; - - error = xfs_iunlink_insert_inode(tp, pag, agibp, ip); -out: - xfs_perag_put(pag); - return error; -} - -static int -xfs_iunlink_remove_inode( - struct xfs_trans *tp, - struct xfs_perag *pag, - struct xfs_buf *agibp, - struct xfs_inode *ip) -{ - struct xfs_mount *mp = tp->t_mountp; - struct xfs_agi *agi = agibp->b_addr; - xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); - xfs_agino_t head_agino; - short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; - int error; - - trace_xfs_iunlink_remove(ip); - - /* - * Get the index into the agi hash table for the list this inode will - * go on. Make sure the head pointer isn't garbage. - */ - head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); - if (!xfs_verify_agino(pag, head_agino)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - agi, sizeof(*agi)); - xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); - return -EFSCORRUPTED; - } - - /* - * Set our inode's next_unlinked pointer to NULL and then return - * the old pointer value so that we can update whatever was previous - * to us in the list to point to whatever was next in the list. - */ - error = xfs_iunlink_log_inode(tp, ip, pag, NULLAGINO); - if (error) - return error; - - /* - * Update the prev pointer in the next inode to point back to previous - * inode in the chain. - */ - error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked, - ip->i_next_unlinked); - if (error == -ENOLINK) - error = xfs_iunlink_reload_next(tp, agibp, ip->i_prev_unlinked, - ip->i_next_unlinked); - if (error) - return error; - - if (head_agino != agino) { - struct xfs_inode *prev_ip; - - prev_ip = xfs_iunlink_lookup(pag, ip->i_prev_unlinked); - if (!prev_ip) { - xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); - return -EFSCORRUPTED; - } - - error = xfs_iunlink_log_inode(tp, prev_ip, pag, - ip->i_next_unlinked); - prev_ip->i_next_unlinked = ip->i_next_unlinked; - } else { - /* Point the head of the list to the next unlinked inode. */ - error = xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, - ip->i_next_unlinked); - } - - ip->i_next_unlinked = NULLAGINO; - ip->i_prev_unlinked = 0; - return error; -} - -/* - * Pull the on-disk inode from the AGI unlinked list. - */ -int -xfs_iunlink_remove( - struct xfs_trans *tp, - struct xfs_perag *pag, - struct xfs_inode *ip) -{ - struct xfs_buf *agibp; - int error; - - trace_xfs_iunlink_remove(ip); - - /* Get the agi buffer first. It ensures lock ordering on the list. */ - error = xfs_read_agi(pag, tp, 0, &agibp); - if (error) - return error; - - return xfs_iunlink_remove_inode(tp, pag, agibp, ip); -} - /* * Look up the inode number specified and if it is not already marked XFS_ISTALE * mark it stale. We should only find clean inodes in this lookup that aren't diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index a905929494bd..47d3a11a0e7e 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -606,10 +606,9 @@ extern struct kmem_cache *xfs_inode_cache; bool xfs_inode_needs_inactive(struct xfs_inode *ip); -int xfs_iunlink(struct xfs_trans *tp, struct xfs_inode *ip); -int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag, - struct xfs_inode *ip); struct xfs_inode *xfs_iunlink_lookup(struct xfs_perag *pag, xfs_agino_t agino); +int xfs_iunlink_reload_next(struct xfs_trans *tp, struct xfs_buf *agibp, + xfs_agino_t prev_agino, xfs_agino_t next_agino); void xfs_end_io(struct work_struct *work); -- cgit v1.2.3-70-g09d2 From a9e583d34facc64b6edf3c9afb2ff4891038176d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Jul 2024 11:22:41 -0700 Subject: xfs: hoist xfs_{bump,drop}link to libxfs Move xfs_bumplink and xfs_droplink to libxfs. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_inode_util.c | 53 ++++++++++++++++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_inode_util.h | 2 ++ fs/xfs/xfs_inode.c | 53 ------------------------------------------ fs/xfs/xfs_inode.h | 2 -- 4 files changed, 55 insertions(+), 55 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c index 5739871ac370..214976ecefd7 100644 --- a/fs/xfs/libxfs/xfs_inode_util.c +++ b/fs/xfs/libxfs/xfs_inode_util.c @@ -626,3 +626,56 @@ xfs_iunlink_remove( return xfs_iunlink_remove_inode(tp, pag, agibp, ip); } + +/* + * Decrement the link count on an inode & log the change. If this causes the + * link count to go to zero, move the inode to AGI unlinked list so that it can + * be freed when the last active reference goes away via xfs_inactive(). + */ +int +xfs_droplink( + struct xfs_trans *tp, + struct xfs_inode *ip) +{ + struct inode *inode = VFS_I(ip); + + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); + + if (inode->i_nlink == 0) { + xfs_info_ratelimited(tp->t_mountp, + "Inode 0x%llx link count dropped below zero. Pinning link count.", + ip->i_ino); + set_nlink(inode, XFS_NLINK_PINNED); + } + if (inode->i_nlink != XFS_NLINK_PINNED) + drop_nlink(inode); + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + if (inode->i_nlink) + return 0; + + return xfs_iunlink(tp, ip); +} + +/* + * Increment the link count on an inode & log the change. + */ +void +xfs_bumplink( + struct xfs_trans *tp, + struct xfs_inode *ip) +{ + struct inode *inode = VFS_I(ip); + + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); + + if (inode->i_nlink == XFS_NLINK_PINNED - 1) + xfs_info_ratelimited(tp->t_mountp, + "Inode 0x%llx link count exceeded maximum. Pinning link count.", + ip->i_ino); + if (inode->i_nlink != XFS_NLINK_PINNED) + inc_nlink(inode); + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); +} diff --git a/fs/xfs/libxfs/xfs_inode_util.h b/fs/xfs/libxfs/xfs_inode_util.h index 42a032afe3ca..50c14ba6ca5a 100644 --- a/fs/xfs/libxfs/xfs_inode_util.h +++ b/fs/xfs/libxfs/xfs_inode_util.h @@ -50,5 +50,7 @@ void xfs_inode_init(struct xfs_trans *tp, const struct xfs_icreate_args *args, int xfs_iunlink(struct xfs_trans *tp, struct xfs_inode *ip); int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag, struct xfs_inode *ip); +int xfs_droplink(struct xfs_trans *tp, struct xfs_inode *ip); +void xfs_bumplink(struct xfs_trans *tp, struct xfs_inode *ip); #endif /* __XFS_INODE_UTIL_H__ */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ac5826ce5fec..004f277e5891 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -600,59 +600,6 @@ xfs_icreate( return 0; } -/* - * Decrement the link count on an inode & log the change. If this causes the - * link count to go to zero, move the inode to AGI unlinked list so that it can - * be freed when the last active reference goes away via xfs_inactive(). - */ -int -xfs_droplink( - struct xfs_trans *tp, - struct xfs_inode *ip) -{ - struct inode *inode = VFS_I(ip); - - xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); - - if (inode->i_nlink == 0) { - xfs_info_ratelimited(tp->t_mountp, - "Inode 0x%llx link count dropped below zero. Pinning link count.", - ip->i_ino); - set_nlink(inode, XFS_NLINK_PINNED); - } - if (inode->i_nlink != XFS_NLINK_PINNED) - drop_nlink(inode); - - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - - if (inode->i_nlink) - return 0; - - return xfs_iunlink(tp, ip); -} - -/* - * Increment the link count on an inode & log the change. - */ -void -xfs_bumplink( - struct xfs_trans *tp, - struct xfs_inode *ip) -{ - struct inode *inode = VFS_I(ip); - - xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); - - if (inode->i_nlink == XFS_NLINK_PINNED - 1) - xfs_info_ratelimited(tp->t_mountp, - "Inode 0x%llx link count exceeded maximum. Pinning link count.", - ip->i_ino); - if (inode->i_nlink != XFS_NLINK_PINNED) - inc_nlink(inode); - - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); -} - #ifdef CONFIG_XFS_LIVE_HOOKS /* * Use a static key here to reduce the overhead of directory live update hooks. diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 47d3a11a0e7e..5ee044674c3a 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -615,8 +615,6 @@ void xfs_end_io(struct work_struct *work); int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); void xfs_iunlock2_remapping(struct xfs_inode *ip1, struct xfs_inode *ip2); -int xfs_droplink(struct xfs_trans *tp, struct xfs_inode *ip); -void xfs_bumplink(struct xfs_trans *tp, struct xfs_inode *ip); void xfs_lock_inodes(struct xfs_inode **ips, int inodes, uint lock_mode); void xfs_sort_inodes(struct xfs_inode **i_tab, unsigned int num_inodes); -- cgit v1.2.3-70-g09d2 From b11b11e3b7a72606cfef527255a9467537bcaaa5 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Jul 2024 11:22:41 -0700 Subject: xfs: separate the icreate logic around INIT_XATTRS INIT_XATTRS is overloaded here -- it's set during the creat process when we think that we're immediately going to set some ACL xattrs to save time. However, it's also used by the parent pointers code to enable the attr fork in preparation to receive ppptr xattrs. This results in xfs_has_parent() branches scattered around the codebase to turn on INIT_XATTRS. Linkable files are created far more commonly than unlinkable temporary files or directory tree roots, so we should centralize this logic in xfs_inode_init. For the three callers that don't want parent pointers (online repiar tempfiles, unlinkable tempfiles, rootdir creation) we provide an UNLINKABLE flag to skip attr fork initialization. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_inode_util.c | 36 ++++++++++++++++++++++++++---------- fs/xfs/libxfs/xfs_inode_util.h | 1 + fs/xfs/scrub/tempfile.c | 2 +- fs/xfs/xfs_inode.c | 3 --- fs/xfs/xfs_iops.c | 11 ++++------- fs/xfs/xfs_qm.c | 1 + fs/xfs/xfs_symlink.c | 3 --- 7 files changed, 33 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c index 214976ecefd7..5795445ef4bd 100644 --- a/fs/xfs/libxfs/xfs_inode_util.c +++ b/fs/xfs/libxfs/xfs_inode_util.c @@ -233,6 +233,31 @@ xfs_inode_inherit_flags2( } } +/* + * If we need to create attributes immediately after allocating the inode, + * initialise an empty attribute fork right now. We use the default fork offset + * for attributes here as we don't know exactly what size or how many + * attributes we might be adding. We can do this safely here because we know + * the data fork is completely empty and this saves us from needing to run a + * separate transaction to set the fork offset in the immediate future. + * + * If we have parent pointers and the caller hasn't told us that the file will + * never be linked into a directory tree, we /must/ create the attr fork. + */ +static inline bool +xfs_icreate_want_attrfork( + struct xfs_mount *mp, + const struct xfs_icreate_args *args) +{ + if (args->flags & XFS_ICREATE_INIT_XATTRS) + return true; + + if (!(args->flags & XFS_ICREATE_UNLINKABLE) && xfs_has_parent(mp)) + return true; + + return false; +} + /* Initialise an inode's attributes. */ void xfs_inode_init( @@ -325,16 +350,7 @@ xfs_inode_init( ASSERT(0); } - /* - * If we need to create attributes immediately after allocating the - * inode, initialise an empty attribute fork right now. We use the - * default fork offset for attributes here as we don't know exactly what - * size or how many attributes we might be adding. We can do this - * safely here because we know the data fork is completely empty and - * this saves us from needing to run a separate transaction to set the - * fork offset in the immediate future. - */ - if (args->flags & XFS_ICREATE_INIT_XATTRS) { + if (xfs_icreate_want_attrfork(mp, args)) { ip->i_forkoff = xfs_default_attroffset(ip) >> 3; xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0); diff --git a/fs/xfs/libxfs/xfs_inode_util.h b/fs/xfs/libxfs/xfs_inode_util.h index 50c14ba6ca5a..1c54c3b0cf26 100644 --- a/fs/xfs/libxfs/xfs_inode_util.h +++ b/fs/xfs/libxfs/xfs_inode_util.h @@ -32,6 +32,7 @@ struct xfs_icreate_args { #define XFS_ICREATE_TMPFILE (1U << 0) /* create an unlinked file */ #define XFS_ICREATE_INIT_XATTRS (1U << 1) /* will set xattrs immediately */ +#define XFS_ICREATE_UNLINKABLE (1U << 2) /* cannot link into dir tree */ uint16_t flags; }; diff --git a/fs/xfs/scrub/tempfile.c b/fs/xfs/scrub/tempfile.c index 523971a15a72..d390d56cd875 100644 --- a/fs/xfs/scrub/tempfile.c +++ b/fs/xfs/scrub/tempfile.c @@ -43,7 +43,7 @@ xrep_tempfile_create( struct xfs_icreate_args args = { .pip = sc->mp->m_rootip, .mode = mode, - .flags = XFS_ICREATE_TMPFILE, + .flags = XFS_ICREATE_TMPFILE | XFS_ICREATE_UNLINKABLE, }; struct xfs_mount *mp = sc->mp; struct xfs_trans *tp = NULL; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 004f277e5891..0062ba92bcc8 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2527,9 +2527,6 @@ xfs_rename_alloc_whiteout( struct qstr name; int error; - if (xfs_has_parent(dp->i_mount)) - args.flags |= XFS_ICREATE_INIT_XATTRS; - error = xfs_create_tmpfile(&args, &tmpfile); if (error) return error; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 4563ba440570..07f736c42460 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -158,8 +158,6 @@ xfs_create_need_xattr( if (dir->i_sb->s_security) return true; #endif - if (xfs_has_parent(XFS_I(dir)->i_mount)) - return true; return false; } @@ -215,12 +213,11 @@ xfs_generic_create( args.flags |= XFS_ICREATE_TMPFILE; /* - * If this temporary file will be linkable, set up the file - * with an attr fork to receive a parent pointer. + * If this temporary file will not be linkable, don't bother + * creating an attr fork to receive a parent pointer. */ - if (!(tmpfile->f_flags & O_EXCL) && - xfs_has_parent(XFS_I(dir)->i_mount)) - args.flags |= XFS_ICREATE_INIT_XATTRS; + if (tmpfile->f_flags & O_EXCL) + args.flags |= XFS_ICREATE_UNLINKABLE; error = xfs_create_tmpfile(&args, &ip); } diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 78f839630c62..9490b913a4ab 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -795,6 +795,7 @@ xfs_qm_qino_alloc( if (need_alloc) { struct xfs_icreate_args args = { .mode = S_IFREG, + .flags = XFS_ICREATE_UNLINKABLE, }; xfs_ino_t ino; diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 6ff736e5c4e7..e471369f6b63 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -115,9 +115,6 @@ xfs_symlink( if (xfs_is_shutdown(mp)) return -EIO; - if (xfs_has_parent(mp)) - args.flags |= XFS_ICREATE_INIT_XATTRS; - /* * Check component lengths of the target path name. */ -- cgit v1.2.3-70-g09d2 From 1fa2e81957cf11620867729fb613b121692ee0d3 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Jul 2024 11:22:42 -0700 Subject: xfs: create libxfs helper to link a new inode into a directory Create a new libxfs function to link a newly created inode into a directory. The upcoming metadata directory feature will need this to create a metadata directory tree. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_dir2.c | 53 ++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_dir2.h | 12 ++++++++++ fs/xfs/xfs_inode.c | 57 +++++++++++++++--------------------------------- fs/xfs/xfs_symlink.c | 45 ++++++++++++++++---------------------- 4 files changed, 102 insertions(+), 65 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 457f9a38f850..bbed03441f5c 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -19,6 +19,9 @@ #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_health.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" +#include "xfs_parent.h" const struct xfs_name xfs_name_dotdot = { .name = (const unsigned char *)"..", @@ -756,3 +759,53 @@ xfs_dir2_compname( return xfs_ascii_ci_compname(args, name, len); return xfs_da_compname(args, name, len); } + +/* + * Given a directory @dp, a newly allocated inode @ip, and a @name, link @ip + * into @dp under the given @name. If @ip is a directory, it will be + * initialized. Both inodes must have the ILOCK held and the transaction must + * have sufficient blocks reserved. + */ +int +xfs_dir_create_child( + struct xfs_trans *tp, + unsigned int resblks, + struct xfs_dir_update *du) +{ + struct xfs_inode *dp = du->dp; + const struct xfs_name *name = du->name; + struct xfs_inode *ip = du->ip; + int error; + + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); + xfs_assert_ilocked(dp, XFS_ILOCK_EXCL); + + error = xfs_dir_createname(tp, dp, name, ip->i_ino, resblks); + if (error) { + ASSERT(error != -ENOSPC); + return error; + } + + xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + + if (S_ISDIR(VFS_I(ip)->i_mode)) { + error = xfs_dir_init(tp, ip, dp); + if (error) + return error; + + xfs_bumplink(tp, dp); + } + + /* + * If we have parent pointers, we need to add the attribute containing + * the parent information now. + */ + if (du->ppargs) { + error = xfs_parent_addname(tp, du->ppargs, dp, name, ip); + if (error) + return error; + } + + return 0; +} diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index 6dbe6e9ecb49..a1ba6fd0a725 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -309,4 +309,16 @@ static inline unsigned char xfs_ascii_ci_xfrm(unsigned char c) return c; } +struct xfs_parent_args; + +struct xfs_dir_update { + struct xfs_inode *dp; + const struct xfs_name *name; + struct xfs_inode *ip; + struct xfs_parent_args *ppargs; +}; + +int xfs_dir_create_child(struct xfs_trans *tp, unsigned int resblks, + struct xfs_dir_update *du); + #endif /* __XFS_DIR2_H__ */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 0062ba92bcc8..e80548ac2b27 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -714,14 +714,16 @@ xfs_create( struct xfs_inode **ipp) { struct xfs_inode *dp = args->pip; + struct xfs_dir_update du = { + .dp = dp, + .name = name, + }; struct xfs_mount *mp = dp->i_mount; - struct xfs_inode *ip = NULL; struct xfs_trans *tp = NULL; struct xfs_dquot *udqp; struct xfs_dquot *gdqp; struct xfs_dquot *pdqp; struct xfs_trans_res *tres; - struct xfs_parent_args *ppargs; xfs_ino_t ino; bool unlock_dp_on_error = false; bool is_dir = S_ISDIR(args->mode); @@ -748,7 +750,7 @@ xfs_create( tres = &M_RES(mp)->tr_create; } - error = xfs_parent_start(mp, &ppargs); + error = xfs_parent_start(mp, &du.ppargs); if (error) goto out_release_dquots; @@ -779,7 +781,7 @@ xfs_create( */ error = xfs_dialloc(&tp, dp->i_ino, args->mode, &ino); if (!error) - error = xfs_icreate(tp, ino, args, &ip); + error = xfs_icreate(tp, ino, args, &du.ip); if (error) goto out_trans_cancel; @@ -792,38 +794,15 @@ xfs_create( */ xfs_trans_ijoin(tp, dp, 0); - error = xfs_dir_createname(tp, dp, name, ip->i_ino, - resblks - XFS_IALLOC_SPACE_RES(mp)); - if (error) { - ASSERT(error != -ENOSPC); + error = xfs_dir_create_child(tp, resblks, &du); + if (error) goto out_trans_cancel; - } - xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); - - if (is_dir) { - error = xfs_dir_init(tp, ip, dp); - if (error) - goto out_trans_cancel; - - xfs_bumplink(tp, dp); - } - - /* - * If we have parent pointers, we need to add the attribute containing - * the parent information now. - */ - if (ppargs) { - error = xfs_parent_addname(tp, ppargs, dp, name, ip); - if (error) - goto out_trans_cancel; - } /* * Create ip with a reference from dp, and add '.' and '..' references * if it's a directory. */ - xfs_dir_update_hook(dp, ip, 1, name); + xfs_dir_update_hook(dp, du.ip, 1, name); /* * If this is a synchronous mount, make sure that the @@ -838,7 +817,7 @@ xfs_create( * These ids of the inode couldn't have changed since the new * inode has been locked ever since it was created. */ - xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); + xfs_qm_vop_create_dqattach(tp, du.ip, udqp, gdqp, pdqp); error = xfs_trans_commit(tp); if (error) @@ -848,10 +827,10 @@ xfs_create( xfs_qm_dqrele(gdqp); xfs_qm_dqrele(pdqp); - *ipp = ip; - xfs_iunlock(ip, XFS_ILOCK_EXCL); + *ipp = du.ip; + xfs_iunlock(du.ip, XFS_ILOCK_EXCL); xfs_iunlock(dp, XFS_ILOCK_EXCL); - xfs_parent_finish(mp, ppargs); + xfs_parent_finish(mp, du.ppargs); return 0; out_trans_cancel: @@ -862,13 +841,13 @@ xfs_create( * setup of the inode and release the inode. This prevents recursive * transactions and deadlocks from xfs_inactive. */ - if (ip) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_finish_inode_setup(ip); - xfs_irele(ip); + if (du.ip) { + xfs_iunlock(du.ip, XFS_ILOCK_EXCL); + xfs_finish_inode_setup(du.ip); + xfs_irele(du.ip); } out_parent: - xfs_parent_finish(mp, ppargs); + xfs_parent_finish(mp, du.ppargs); out_release_dquots: xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index e471369f6b63..c0f5c2e1f215 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -95,8 +95,11 @@ xfs_symlink( .pip = dp, .mode = S_IFLNK | (mode & ~S_IFMT), }; + struct xfs_dir_update du = { + .dp = dp, + .name = link_name, + }; struct xfs_trans *tp = NULL; - struct xfs_inode *ip = NULL; int error = 0; int pathlen; bool unlock_dp_on_error = false; @@ -106,7 +109,6 @@ xfs_symlink( struct xfs_dquot *pdqp; uint resblks; xfs_ino_t ino; - struct xfs_parent_args *ppargs; *ipp = NULL; @@ -140,7 +142,7 @@ xfs_symlink( fs_blocks = xfs_symlink_blocks(mp, pathlen); resblks = xfs_symlink_space_res(mp, link_name->len, fs_blocks); - error = xfs_parent_start(mp, &ppargs); + error = xfs_parent_start(mp, &du.ppargs); if (error) goto out_release_dquots; @@ -165,7 +167,7 @@ xfs_symlink( */ error = xfs_dialloc(&tp, dp->i_ino, S_IFLNK, &ino); if (!error) - error = xfs_icreate(tp, ino, &args, &ip); + error = xfs_icreate(tp, ino, &args, &du.ip); if (error) goto out_trans_cancel; @@ -181,33 +183,24 @@ xfs_symlink( /* * Also attach the dquot(s) to it, if applicable. */ - xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); + xfs_qm_vop_create_dqattach(tp, du.ip, udqp, gdqp, pdqp); resblks -= XFS_IALLOC_SPACE_RES(mp); - error = xfs_symlink_write_target(tp, ip, ip->i_ino, target_path, + error = xfs_symlink_write_target(tp, du.ip, du.ip->i_ino, target_path, pathlen, fs_blocks, resblks); if (error) goto out_trans_cancel; resblks -= fs_blocks; - i_size_write(VFS_I(ip), ip->i_disk_size); + i_size_write(VFS_I(du.ip), du.ip->i_disk_size); /* * Create the directory entry for the symlink. */ - error = xfs_dir_createname(tp, dp, link_name, ip->i_ino, resblks); + error = xfs_dir_create_child(tp, resblks, &du); if (error) goto out_trans_cancel; - xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); - - /* Add parent pointer for the new symlink. */ - if (ppargs) { - error = xfs_parent_addname(tp, ppargs, dp, link_name, ip); - if (error) - goto out_trans_cancel; - } - xfs_dir_update_hook(dp, ip, 1, link_name); + xfs_dir_update_hook(dp, du.ip, 1, link_name); /* * If this is a synchronous mount, make sure that the @@ -225,10 +218,10 @@ xfs_symlink( xfs_qm_dqrele(gdqp); xfs_qm_dqrele(pdqp); - *ipp = ip; - xfs_iunlock(ip, XFS_ILOCK_EXCL); + *ipp = du.ip; + xfs_iunlock(du.ip, XFS_ILOCK_EXCL); xfs_iunlock(dp, XFS_ILOCK_EXCL); - xfs_parent_finish(mp, ppargs); + xfs_parent_finish(mp, du.ppargs); return 0; out_trans_cancel: @@ -239,13 +232,13 @@ out_release_inode: * setup of the inode and release the inode. This prevents recursive * transactions and deadlocks from xfs_inactive. */ - if (ip) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_finish_inode_setup(ip); - xfs_irele(ip); + if (du.ip) { + xfs_iunlock(du.ip, XFS_ILOCK_EXCL); + xfs_finish_inode_setup(du.ip); + xfs_irele(du.ip); } out_parent: - xfs_parent_finish(mp, ppargs); + xfs_parent_finish(mp, du.ppargs); out_release_dquots: xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); -- cgit v1.2.3-70-g09d2 From c1f0bad4232fd309b2fe849153fcf473e775b1f7 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Jul 2024 11:22:43 -0700 Subject: xfs: create libxfs helper to link an existing inode into a directory Create a new libxfs function to link an existing inode into a directory. The upcoming metadata directory feature will need this to create a metadata directory tree. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_dir2.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++-- fs/xfs/libxfs/xfs_dir2.h | 4 ++- fs/xfs/xfs_inode.c | 52 +++++++---------------------------- 3 files changed, 81 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index bbed03441f5c..5a75f60e8518 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -22,6 +22,7 @@ #include "xfs_bmap_btree.h" #include "xfs_trans_space.h" #include "xfs_parent.h" +#include "xfs_ag.h" const struct xfs_name xfs_name_dotdot = { .name = (const unsigned char *)"..", @@ -587,9 +588,9 @@ xfs_dir_replace( */ int xfs_dir_canenter( - xfs_trans_t *tp, - xfs_inode_t *dp, - struct xfs_name *name) /* name of entry to add */ + struct xfs_trans *tp, + struct xfs_inode *dp, + const struct xfs_name *name) /* name of entry to add */ { return xfs_dir_createname(tp, dp, name, 0, 0); } @@ -809,3 +810,67 @@ xfs_dir_create_child( return 0; } + +/* + * Given a directory @dp, an existing non-directory inode @ip, and a @name, + * link @ip into @dp under the given @name. Both inodes must have the ILOCK + * held. + */ +int +xfs_dir_add_child( + struct xfs_trans *tp, + unsigned int resblks, + struct xfs_dir_update *du) +{ + struct xfs_inode *dp = du->dp; + const struct xfs_name *name = du->name; + struct xfs_inode *ip = du->ip; + struct xfs_mount *mp = tp->t_mountp; + int error; + + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); + xfs_assert_ilocked(dp, XFS_ILOCK_EXCL); + ASSERT(!S_ISDIR(VFS_I(ip)->i_mode)); + + if (!resblks) { + error = xfs_dir_canenter(tp, dp, name); + if (error) + return error; + } + + /* + * Handle initial link state of O_TMPFILE inode + */ + if (VFS_I(ip)->i_nlink == 0) { + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + error = xfs_iunlink_remove(tp, pag, ip); + xfs_perag_put(pag); + if (error) + return error; + } + + error = xfs_dir_createname(tp, dp, name, ip->i_ino, resblks); + if (error) + return error; + + xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + + xfs_bumplink(tp, ip); + + /* + * If we have parent pointers, we now need to add the parent record to + * the attribute fork of the inode. If this is the initial parent + * attribute, we need to create it correctly, otherwise we can just add + * the parent to the inode. + */ + if (du->ppargs) { + error = xfs_parent_addname(tp, du->ppargs, dp, name, ip); + if (error) + return error; + } + + return 0; +} diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index a1ba6fd0a725..4f9711509571 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -74,7 +74,7 @@ extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp, const struct xfs_name *name, xfs_ino_t inum, xfs_extlen_t tot); extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp, - struct xfs_name *name); + const struct xfs_name *name); int xfs_dir_lookup_args(struct xfs_da_args *args); int xfs_dir_createname_args(struct xfs_da_args *args); @@ -320,5 +320,7 @@ struct xfs_dir_update { int xfs_dir_create_child(struct xfs_trans *tp, unsigned int resblks, struct xfs_dir_update *du); +int xfs_dir_add_child(struct xfs_trans *tp, unsigned int resblks, + struct xfs_dir_update *du); #endif /* __XFS_DIR2_H__ */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index e80548ac2b27..959fdaef8409 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -952,11 +952,15 @@ xfs_link( struct xfs_inode *sip, struct xfs_name *target_name) { + struct xfs_dir_update du = { + .dp = tdp, + .name = target_name, + .ip = sip, + }; struct xfs_mount *mp = tdp->i_mount; struct xfs_trans *tp; int error, nospace_error = 0; int resblks; - struct xfs_parent_args *ppargs; trace_xfs_link(tdp, target_name); @@ -975,7 +979,7 @@ xfs_link( if (error) goto std_return; - error = xfs_parent_start(mp, &ppargs); + error = xfs_parent_start(mp, &du.ppargs); if (error) goto std_return; @@ -990,7 +994,7 @@ xfs_link( * pointers are enabled because we can't back out if the xattrs must * grow. */ - if (ppargs && nospace_error) { + if (du.ppargs && nospace_error) { error = nospace_error; goto error_return; } @@ -1017,45 +1021,9 @@ xfs_link( } } - if (!resblks) { - error = xfs_dir_canenter(tp, tdp, target_name); - if (error) - goto error_return; - } - - /* - * Handle initial link state of O_TMPFILE inode - */ - if (VFS_I(sip)->i_nlink == 0) { - struct xfs_perag *pag; - - pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sip->i_ino)); - error = xfs_iunlink_remove(tp, pag, sip); - xfs_perag_put(pag); - if (error) - goto error_return; - } - - error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino, - resblks); + error = xfs_dir_add_child(tp, resblks, &du); if (error) goto error_return; - xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE); - - xfs_bumplink(tp, sip); - - /* - * If we have parent pointers, we now need to add the parent record to - * the attribute fork of the inode. If this is the initial parent - * attribute, we need to create it correctly, otherwise we can just add - * the parent to the inode. - */ - if (ppargs) { - error = xfs_parent_addname(tp, ppargs, tdp, target_name, sip); - if (error) - goto error_return; - } xfs_dir_update_hook(tdp, sip, 1, target_name); @@ -1070,7 +1038,7 @@ xfs_link( error = xfs_trans_commit(tp); xfs_iunlock(tdp, XFS_ILOCK_EXCL); xfs_iunlock(sip, XFS_ILOCK_EXCL); - xfs_parent_finish(mp, ppargs); + xfs_parent_finish(mp, du.ppargs); return error; error_return: @@ -1078,7 +1046,7 @@ xfs_link( xfs_iunlock(tdp, XFS_ILOCK_EXCL); xfs_iunlock(sip, XFS_ILOCK_EXCL); out_parent: - xfs_parent_finish(mp, ppargs); + xfs_parent_finish(mp, du.ppargs); std_return: if (error == -ENOSPC && nospace_error) error = nospace_error; -- cgit v1.2.3-70-g09d2 From 1964435d19d947b8626379d09db3e33b9669f333 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Jul 2024 11:22:44 -0700 Subject: xfs: hoist inode free function to libxfs Create a libxfs helper function that marks an inode free on disk. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_inode_util.c | 52 ++++++++++++++++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_inode_util.h | 5 ++++ fs/xfs/xfs_inode.c | 35 +--------------------------- 3 files changed, 58 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c index 5795445ef4bd..032333289113 100644 --- a/fs/xfs/libxfs/xfs_inode_util.c +++ b/fs/xfs/libxfs/xfs_inode_util.c @@ -22,6 +22,7 @@ #include "xfs_trace.h" #include "xfs_ag.h" #include "xfs_iunlink_item.h" +#include "xfs_inode_item.h" uint16_t xfs_flags2diflags( @@ -695,3 +696,54 @@ xfs_bumplink( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } + +/* Free an inode in the ondisk index and zero it out. */ +int +xfs_inode_uninit( + struct xfs_trans *tp, + struct xfs_perag *pag, + struct xfs_inode *ip, + struct xfs_icluster *xic) +{ + struct xfs_mount *mp = ip->i_mount; + int error; + + /* + * Free the inode first so that we guarantee that the AGI lock is going + * to be taken before we remove the inode from the unlinked list. This + * makes the AGI lock -> unlinked list modification order the same as + * used in O_TMPFILE creation. + */ + error = xfs_difree(tp, pag, ip->i_ino, xic); + if (error) + return error; + + error = xfs_iunlink_remove(tp, pag, ip); + if (error) + return error; + + /* + * Free any local-format data sitting around before we reset the + * data fork to extents format. Note that the attr fork data has + * already been freed by xfs_attr_inactive. + */ + if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) { + kfree(ip->i_df.if_data); + ip->i_df.if_data = NULL; + ip->i_df.if_bytes = 0; + } + + VFS_I(ip)->i_mode = 0; /* mark incore inode as free */ + ip->i_diflags = 0; + ip->i_diflags2 = mp->m_ino_geo.new_diflags2; + ip->i_forkoff = 0; /* mark the attr fork not in use */ + ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; + + /* + * Bump the generation count so no one will be confused + * by reincarnations of this inode. + */ + VFS_I(ip)->i_generation++; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + return 0; +} diff --git a/fs/xfs/libxfs/xfs_inode_util.h b/fs/xfs/libxfs/xfs_inode_util.h index 1c54c3b0cf26..060242998a23 100644 --- a/fs/xfs/libxfs/xfs_inode_util.h +++ b/fs/xfs/libxfs/xfs_inode_util.h @@ -6,6 +6,8 @@ #ifndef __XFS_INODE_UTIL_H__ #define __XFS_INODE_UTIL_H__ +struct xfs_icluster; + uint16_t xfs_flags2diflags(struct xfs_inode *ip, unsigned int xflags); uint64_t xfs_flags2diflags2(struct xfs_inode *ip, unsigned int xflags); uint32_t xfs_dic2xflags(struct xfs_inode *ip); @@ -48,6 +50,9 @@ void xfs_trans_ichgtime(struct xfs_trans *tp, struct xfs_inode *ip, int flags); void xfs_inode_init(struct xfs_trans *tp, const struct xfs_icreate_args *args, struct xfs_inode *ip); +int xfs_inode_uninit(struct xfs_trans *tp, struct xfs_perag *pag, + struct xfs_inode *ip, struct xfs_icluster *xic); + int xfs_iunlink(struct xfs_trans *tp, struct xfs_inode *ip); int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag, struct xfs_inode *ip); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 959fdaef8409..caccb6296a48 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1945,36 +1945,10 @@ xfs_ifree( pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); - /* - * Free the inode first so that we guarantee that the AGI lock is going - * to be taken before we remove the inode from the unlinked list. This - * makes the AGI lock -> unlinked list modification order the same as - * used in O_TMPFILE creation. - */ - error = xfs_difree(tp, pag, ip->i_ino, &xic); + error = xfs_inode_uninit(tp, pag, ip, &xic); if (error) goto out; - error = xfs_iunlink_remove(tp, pag, ip); - if (error) - goto out; - - /* - * Free any local-format data sitting around before we reset the - * data fork to extents format. Note that the attr fork data has - * already been freed by xfs_attr_inactive. - */ - if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) { - kfree(ip->i_df.if_data); - ip->i_df.if_data = NULL; - ip->i_df.if_bytes = 0; - } - - VFS_I(ip)->i_mode = 0; /* mark incore inode as free */ - ip->i_diflags = 0; - ip->i_diflags2 = mp->m_ino_geo.new_diflags2; - ip->i_forkoff = 0; /* mark the attr fork not in use */ - ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; if (xfs_iflags_test(ip, XFS_IPRESERVE_DM_FIELDS)) xfs_iflags_clear(ip, XFS_IPRESERVE_DM_FIELDS); @@ -1983,13 +1957,6 @@ xfs_ifree( iip->ili_fields &= ~(XFS_ILOG_AOWNER | XFS_ILOG_DOWNER); spin_unlock(&iip->ili_lock); - /* - * Bump the generation count so no one will be confused - * by reincarnations of this inode. - */ - VFS_I(ip)->i_generation++; - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - if (xic.deleted) error = xfs_ifree_cluster(tp, pag, ip, &xic); out: -- cgit v1.2.3-70-g09d2 From 90636e4531a8bfb5ef37d38a76eb97e5f5793deb Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Jul 2024 11:22:45 -0700 Subject: xfs: create libxfs helper to remove an existing inode/name from a directory Create a new libxfs function to remove a (name, inode) entry from a directory. The upcoming metadata directory feature will need this to create a metadata directory tree. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_dir2.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_dir2.h | 2 ++ fs/xfs/xfs_inode.c | 74 ++++++------------------------------------- 3 files changed, 92 insertions(+), 65 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 5a75f60e8518..f3ac3d55bc38 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -874,3 +874,84 @@ xfs_dir_add_child( return 0; } + +/* + * Given a directory @dp, a child @ip, and a @name, remove the (@name, @ip) + * entry from the directory. Both inodes must have the ILOCK held. + */ +int +xfs_dir_remove_child( + struct xfs_trans *tp, + unsigned int resblks, + struct xfs_dir_update *du) +{ + struct xfs_inode *dp = du->dp; + const struct xfs_name *name = du->name; + struct xfs_inode *ip = du->ip; + int error; + + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); + xfs_assert_ilocked(dp, XFS_ILOCK_EXCL); + + /* + * If we're removing a directory perform some additional validation. + */ + if (S_ISDIR(VFS_I(ip)->i_mode)) { + ASSERT(VFS_I(ip)->i_nlink >= 2); + if (VFS_I(ip)->i_nlink != 2) + return -ENOTEMPTY; + if (!xfs_dir_isempty(ip)) + return -ENOTEMPTY; + + /* Drop the link from ip's "..". */ + error = xfs_droplink(tp, dp); + if (error) + return error; + + /* Drop the "." link from ip to self. */ + error = xfs_droplink(tp, ip); + if (error) + return error; + + /* + * Point the unlinked child directory's ".." entry to the root + * directory to eliminate back-references to inodes that may + * get freed before the child directory is closed. If the fs + * gets shrunk, this can lead to dirent inode validation errors. + */ + if (dp->i_ino != tp->t_mountp->m_sb.sb_rootino) { + error = xfs_dir_replace(tp, ip, &xfs_name_dotdot, + tp->t_mountp->m_sb.sb_rootino, 0); + if (error) + return error; + } + } else { + /* + * When removing a non-directory we need to log the parent + * inode here. For a directory this is done implicitly + * by the xfs_droplink call for the ".." entry. + */ + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + } + xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + + /* Drop the link from dp to ip. */ + error = xfs_droplink(tp, ip); + if (error) + return error; + + error = xfs_dir_removename(tp, dp, name, ip->i_ino, resblks); + if (error) { + ASSERT(error != -ENOENT); + return error; + } + + /* Remove parent pointer. */ + if (du->ppargs) { + error = xfs_parent_removename(tp, du->ppargs, dp, name, ip); + if (error) + return error; + } + + return 0; +} diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index 4f9711509571..c89916d1c040 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -322,5 +322,7 @@ int xfs_dir_create_child(struct xfs_trans *tp, unsigned int resblks, struct xfs_dir_update *du); int xfs_dir_add_child(struct xfs_trans *tp, unsigned int resblks, struct xfs_dir_update *du); +int xfs_dir_remove_child(struct xfs_trans *tp, unsigned int resblks, + struct xfs_dir_update *du); #endif /* __XFS_DIR2_H__ */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index caccb6296a48..8da67322791f 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2040,13 +2040,17 @@ xfs_remove( struct xfs_name *name, struct xfs_inode *ip) { + struct xfs_dir_update du = { + .dp = dp, + .name = name, + .ip = ip, + }; struct xfs_mount *mp = dp->i_mount; struct xfs_trans *tp = NULL; int is_dir = S_ISDIR(VFS_I(ip)->i_mode); int dontcare; int error = 0; uint resblks; - struct xfs_parent_args *ppargs; trace_xfs_remove(dp, name); @@ -2063,7 +2067,7 @@ xfs_remove( if (error) goto std_return; - error = xfs_parent_start(mp, &ppargs); + error = xfs_parent_start(mp, &du.ppargs); if (error) goto std_return; @@ -2086,70 +2090,10 @@ xfs_remove( goto out_parent; } - /* - * If we're removing a directory perform some additional validation. - */ - if (is_dir) { - ASSERT(VFS_I(ip)->i_nlink >= 2); - if (VFS_I(ip)->i_nlink != 2) { - error = -ENOTEMPTY; - goto out_trans_cancel; - } - if (!xfs_dir_isempty(ip)) { - error = -ENOTEMPTY; - goto out_trans_cancel; - } - - /* Drop the link from ip's "..". */ - error = xfs_droplink(tp, dp); - if (error) - goto out_trans_cancel; - - /* Drop the "." link from ip to self. */ - error = xfs_droplink(tp, ip); - if (error) - goto out_trans_cancel; - - /* - * Point the unlinked child directory's ".." entry to the root - * directory to eliminate back-references to inodes that may - * get freed before the child directory is closed. If the fs - * gets shrunk, this can lead to dirent inode validation errors. - */ - if (dp->i_ino != tp->t_mountp->m_sb.sb_rootino) { - error = xfs_dir_replace(tp, ip, &xfs_name_dotdot, - tp->t_mountp->m_sb.sb_rootino, 0); - if (error) - goto out_trans_cancel; - } - } else { - /* - * When removing a non-directory we need to log the parent - * inode here. For a directory this is done implicitly - * by the xfs_droplink call for the ".." entry. - */ - xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); - } - xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - - /* Drop the link from dp to ip. */ - error = xfs_droplink(tp, ip); + error = xfs_dir_remove_child(tp, resblks, &du); if (error) goto out_trans_cancel; - error = xfs_dir_removename(tp, dp, name, ip->i_ino, resblks); - if (error) { - ASSERT(error != -ENOENT); - goto out_trans_cancel; - } - - /* Remove parent pointer. */ - if (ppargs) { - error = xfs_parent_removename(tp, ppargs, dp, name, ip); - if (error) - goto out_trans_cancel; - } - /* * Drop the link from dp to ip, and if ip was a directory, remove the * '.' and '..' references since we freed the directory. @@ -2173,7 +2117,7 @@ xfs_remove( xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_iunlock(dp, XFS_ILOCK_EXCL); - xfs_parent_finish(mp, ppargs); + xfs_parent_finish(mp, du.ppargs); return 0; out_trans_cancel: @@ -2182,7 +2126,7 @@ xfs_remove( xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_iunlock(dp, XFS_ILOCK_EXCL); out_parent: - xfs_parent_finish(mp, ppargs); + xfs_parent_finish(mp, du.ppargs); std_return: return error; } -- cgit v1.2.3-70-g09d2 From a55712b35c065eee4ab1195233a5478fb7c93efa Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Jul 2024 11:22:46 -0700 Subject: xfs: create libxfs helper to exchange two directory entries Create a new libxfs function to exchange two directory entries. The upcoming metadata directory feature will need this to replace a metadata inode directory entry. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_dir2.c | 125 +++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_dir2.h | 3 ++ fs/xfs/xfs_inode.c | 112 ++++++------------------------------------ 3 files changed, 142 insertions(+), 98 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index f3ac3d55bc38..d650cfa023fd 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -955,3 +955,128 @@ xfs_dir_remove_child( return 0; } + +/* + * Exchange the entry (@name1, @ip1) in directory @dp1 with the entry (@name2, + * @ip2) in directory @dp2, and update '..' @ip1 and @ip2's entries as needed. + * @ip1 and @ip2 need not be of the same type. + * + * All inodes must have the ILOCK held, and both entries must already exist. + */ +int +xfs_dir_exchange_children( + struct xfs_trans *tp, + struct xfs_dir_update *du1, + struct xfs_dir_update *du2, + unsigned int spaceres) +{ + struct xfs_inode *dp1 = du1->dp; + const struct xfs_name *name1 = du1->name; + struct xfs_inode *ip1 = du1->ip; + struct xfs_inode *dp2 = du2->dp; + const struct xfs_name *name2 = du2->name; + struct xfs_inode *ip2 = du2->ip; + int ip1_flags = 0; + int ip2_flags = 0; + int dp2_flags = 0; + int error; + + /* Swap inode number for dirent in first parent */ + error = xfs_dir_replace(tp, dp1, name1, ip2->i_ino, spaceres); + if (error) + return error; + + /* Swap inode number for dirent in second parent */ + error = xfs_dir_replace(tp, dp2, name2, ip1->i_ino, spaceres); + if (error) + return error; + + /* + * If we're renaming one or more directories across different parents, + * update the respective ".." entries (and link counts) to match the new + * parents. + */ + if (dp1 != dp2) { + dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; + + if (S_ISDIR(VFS_I(ip2)->i_mode)) { + error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot, + dp1->i_ino, spaceres); + if (error) + return error; + + /* transfer ip2 ".." reference to dp1 */ + if (!S_ISDIR(VFS_I(ip1)->i_mode)) { + error = xfs_droplink(tp, dp2); + if (error) + return error; + xfs_bumplink(tp, dp1); + } + + /* + * Although ip1 isn't changed here, userspace needs + * to be warned about the change, so that applications + * relying on it (like backup ones), will properly + * notify the change + */ + ip1_flags |= XFS_ICHGTIME_CHG; + ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; + } + + if (S_ISDIR(VFS_I(ip1)->i_mode)) { + error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot, + dp2->i_ino, spaceres); + if (error) + return error; + + /* transfer ip1 ".." reference to dp2 */ + if (!S_ISDIR(VFS_I(ip2)->i_mode)) { + error = xfs_droplink(tp, dp1); + if (error) + return error; + xfs_bumplink(tp, dp2); + } + + /* + * Although ip2 isn't changed here, userspace needs + * to be warned about the change, so that applications + * relying on it (like backup ones), will properly + * notify the change + */ + ip1_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; + ip2_flags |= XFS_ICHGTIME_CHG; + } + } + + if (ip1_flags) { + xfs_trans_ichgtime(tp, ip1, ip1_flags); + xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE); + } + if (ip2_flags) { + xfs_trans_ichgtime(tp, ip2, ip2_flags); + xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE); + } + if (dp2_flags) { + xfs_trans_ichgtime(tp, dp2, dp2_flags); + xfs_trans_log_inode(tp, dp2, XFS_ILOG_CORE); + } + xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE); + + /* Schedule parent pointer replacements */ + if (du1->ppargs) { + error = xfs_parent_replacename(tp, du1->ppargs, dp1, name1, + dp2, name2, ip1); + if (error) + return error; + } + + if (du2->ppargs) { + error = xfs_parent_replacename(tp, du2->ppargs, dp2, name2, + dp1, name1, ip2); + if (error) + return error; + } + + return 0; +} diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index c89916d1c040..8b1e192bd7a8 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -325,4 +325,7 @@ int xfs_dir_add_child(struct xfs_trans *tp, unsigned int resblks, int xfs_dir_remove_child(struct xfs_trans *tp, unsigned int resblks, struct xfs_dir_update *du); +int xfs_dir_exchange_children(struct xfs_trans *tp, struct xfs_dir_update *du1, + struct xfs_dir_update *du2, unsigned int spaceres); + #endif /* __XFS_DIR2_H__ */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 8da67322791f..363e98ee974c 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2238,108 +2238,24 @@ xfs_cross_rename( struct xfs_parent_args *ip2_ppargs, int spaceres) { - int error = 0; - int ip1_flags = 0; - int ip2_flags = 0; - int dp2_flags = 0; - - /* Swap inode number for dirent in first parent */ - error = xfs_dir_replace(tp, dp1, name1, ip2->i_ino, spaceres); - if (error) - goto out_trans_abort; + struct xfs_dir_update du1 = { + .dp = dp1, + .name = name1, + .ip = ip1, + .ppargs = ip1_ppargs, + }; + struct xfs_dir_update du2 = { + .dp = dp2, + .name = name2, + .ip = ip2, + .ppargs = ip2_ppargs, + }; + int error; - /* Swap inode number for dirent in second parent */ - error = xfs_dir_replace(tp, dp2, name2, ip1->i_ino, spaceres); + error = xfs_dir_exchange_children(tp, &du1, &du2, spaceres); if (error) goto out_trans_abort; - /* - * If we're renaming one or more directories across different parents, - * update the respective ".." entries (and link counts) to match the new - * parents. - */ - if (dp1 != dp2) { - dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; - - if (S_ISDIR(VFS_I(ip2)->i_mode)) { - error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot, - dp1->i_ino, spaceres); - if (error) - goto out_trans_abort; - - /* transfer ip2 ".." reference to dp1 */ - if (!S_ISDIR(VFS_I(ip1)->i_mode)) { - error = xfs_droplink(tp, dp2); - if (error) - goto out_trans_abort; - xfs_bumplink(tp, dp1); - } - - /* - * Although ip1 isn't changed here, userspace needs - * to be warned about the change, so that applications - * relying on it (like backup ones), will properly - * notify the change - */ - ip1_flags |= XFS_ICHGTIME_CHG; - ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; - } - - if (S_ISDIR(VFS_I(ip1)->i_mode)) { - error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot, - dp2->i_ino, spaceres); - if (error) - goto out_trans_abort; - - /* transfer ip1 ".." reference to dp2 */ - if (!S_ISDIR(VFS_I(ip2)->i_mode)) { - error = xfs_droplink(tp, dp1); - if (error) - goto out_trans_abort; - xfs_bumplink(tp, dp2); - } - - /* - * Although ip2 isn't changed here, userspace needs - * to be warned about the change, so that applications - * relying on it (like backup ones), will properly - * notify the change - */ - ip1_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; - ip2_flags |= XFS_ICHGTIME_CHG; - } - } - - /* Schedule parent pointer replacements */ - if (ip1_ppargs) { - error = xfs_parent_replacename(tp, ip1_ppargs, dp1, name1, dp2, - name2, ip1); - if (error) - goto out_trans_abort; - } - - if (ip2_ppargs) { - error = xfs_parent_replacename(tp, ip2_ppargs, dp2, name2, dp1, - name1, ip2); - if (error) - goto out_trans_abort; - } - - if (ip1_flags) { - xfs_trans_ichgtime(tp, ip1, ip1_flags); - xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE); - } - if (ip2_flags) { - xfs_trans_ichgtime(tp, ip2, ip2_flags); - xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE); - } - if (dp2_flags) { - xfs_trans_ichgtime(tp, dp2, dp2_flags); - xfs_trans_log_inode(tp, dp2, XFS_ILOG_CORE); - } - xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE); - /* * Inform our hook clients that we've finished an exchange operation as * follows: removed the source and target files from their directories; -- cgit v1.2.3-70-g09d2 From 28d0d813444645689fefa232bcf88e86a5a3a746 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Jul 2024 11:22:46 -0700 Subject: xfs: create libxfs helper to rename two directory entries Create a new libxfs function to rename two directory entries. The upcoming metadata directory feature will need this to replace a metadata inode directory entry. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_dir2.c | 227 +++++++++++++++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_dir2.h | 3 + fs/xfs/xfs_inode.c | 258 ++++++++--------------------------------------- 3 files changed, 273 insertions(+), 215 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index d650cfa023fd..34b63ba2e4f7 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -23,6 +23,7 @@ #include "xfs_trans_space.h" #include "xfs_parent.h" #include "xfs_ag.h" +#include "xfs_ialloc.h" const struct xfs_name xfs_name_dotdot = { .name = (const unsigned char *)"..", @@ -1080,3 +1081,229 @@ xfs_dir_exchange_children( return 0; } + +/* + * Given an entry (@src_name, @src_ip) in directory @src_dp, make the entry + * @target_name in directory @target_dp point to @src_ip and remove the + * original entry, cleaning up everything left behind. + * + * Cleanup involves dropping a link count on @target_ip, and either removing + * the (@src_name, @src_ip) entry from @src_dp or simply replacing the entry + * with (@src_name, @wip) if a whiteout inode @wip is supplied. + * + * All inodes must have the ILOCK held. We assume that if @src_ip is a + * directory then its '..' doesn't already point to @target_dp, and that @wip + * is a freshly allocated whiteout. + */ +int +xfs_dir_rename_children( + struct xfs_trans *tp, + struct xfs_dir_update *du_src, + struct xfs_dir_update *du_tgt, + unsigned int spaceres, + struct xfs_dir_update *du_wip) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_inode *src_dp = du_src->dp; + const struct xfs_name *src_name = du_src->name; + struct xfs_inode *src_ip = du_src->ip; + struct xfs_inode *target_dp = du_tgt->dp; + const struct xfs_name *target_name = du_tgt->name; + struct xfs_inode *target_ip = du_tgt->ip; + bool new_parent = (src_dp != target_dp); + bool src_is_directory; + int error; + + src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode); + + /* + * Check for expected errors before we dirty the transaction + * so we can return an error without a transaction abort. + */ + if (target_ip == NULL) { + /* + * If there's no space reservation, check the entry will + * fit before actually inserting it. + */ + if (!spaceres) { + error = xfs_dir_canenter(tp, target_dp, target_name); + if (error) + return error; + } + } else { + /* + * If target exists and it's a directory, check that whether + * it can be destroyed. + */ + if (S_ISDIR(VFS_I(target_ip)->i_mode) && + (!xfs_dir_isempty(target_ip) || + (VFS_I(target_ip)->i_nlink > 2))) + return -EEXIST; + } + + /* + * Directory entry creation below may acquire the AGF. Remove + * the whiteout from the unlinked list first to preserve correct + * AGI/AGF locking order. This dirties the transaction so failures + * after this point will abort and log recovery will clean up the + * mess. + * + * For whiteouts, we need to bump the link count on the whiteout + * inode. After this point, we have a real link, clear the tmpfile + * state flag from the inode so it doesn't accidentally get misused + * in future. + */ + if (du_wip->ip) { + struct xfs_perag *pag; + + ASSERT(VFS_I(du_wip->ip)->i_nlink == 0); + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, du_wip->ip->i_ino)); + error = xfs_iunlink_remove(tp, pag, du_wip->ip); + xfs_perag_put(pag); + if (error) + return error; + + xfs_bumplink(tp, du_wip->ip); + } + + /* + * Set up the target. + */ + if (target_ip == NULL) { + /* + * If target does not exist and the rename crosses + * directories, adjust the target directory link count + * to account for the ".." reference from the new entry. + */ + error = xfs_dir_createname(tp, target_dp, target_name, + src_ip->i_ino, spaceres); + if (error) + return error; + + xfs_trans_ichgtime(tp, target_dp, + XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + + if (new_parent && src_is_directory) { + xfs_bumplink(tp, target_dp); + } + } else { /* target_ip != NULL */ + /* + * Link the source inode under the target name. + * If the source inode is a directory and we are moving + * it across directories, its ".." entry will be + * inconsistent until we replace that down below. + * + * In case there is already an entry with the same + * name at the destination directory, remove it first. + */ + error = xfs_dir_replace(tp, target_dp, target_name, + src_ip->i_ino, spaceres); + if (error) + return error; + + xfs_trans_ichgtime(tp, target_dp, + XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + + /* + * Decrement the link count on the target since the target + * dir no longer points to it. + */ + error = xfs_droplink(tp, target_ip); + if (error) + return error; + + if (src_is_directory) { + /* + * Drop the link from the old "." entry. + */ + error = xfs_droplink(tp, target_ip); + if (error) + return error; + } + } /* target_ip != NULL */ + + /* + * Remove the source. + */ + if (new_parent && src_is_directory) { + /* + * Rewrite the ".." entry to point to the new + * directory. + */ + error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot, + target_dp->i_ino, spaceres); + ASSERT(error != -EEXIST); + if (error) + return error; + } + + /* + * We always want to hit the ctime on the source inode. + * + * This isn't strictly required by the standards since the source + * inode isn't really being changed, but old unix file systems did + * it and some incremental backup programs won't work without it. + */ + xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE); + + /* + * Adjust the link count on src_dp. This is necessary when + * renaming a directory, either within one parent when + * the target existed, or across two parent directories. + */ + if (src_is_directory && (new_parent || target_ip != NULL)) { + + /* + * Decrement link count on src_directory since the + * entry that's moved no longer points to it. + */ + error = xfs_droplink(tp, src_dp); + if (error) + return error; + } + + /* + * For whiteouts, we only need to update the source dirent with the + * inode number of the whiteout inode rather than removing it + * altogether. + */ + if (du_wip->ip) + error = xfs_dir_replace(tp, src_dp, src_name, du_wip->ip->i_ino, + spaceres); + else + error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, + spaceres); + if (error) + return error; + + xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); + if (new_parent) + xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); + + /* Schedule parent pointer updates. */ + if (du_wip->ppargs) { + error = xfs_parent_addname(tp, du_wip->ppargs, src_dp, + src_name, du_wip->ip); + if (error) + return error; + } + + if (du_src->ppargs) { + error = xfs_parent_replacename(tp, du_src->ppargs, src_dp, + src_name, target_dp, target_name, src_ip); + if (error) + return error; + } + + if (du_tgt->ppargs) { + error = xfs_parent_removename(tp, du_tgt->ppargs, target_dp, + target_name, target_ip); + if (error) + return error; + } + + return 0; +} diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index 8b1e192bd7a8..df6d4bbe3d6f 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -327,5 +327,8 @@ int xfs_dir_remove_child(struct xfs_trans *tp, unsigned int resblks, int xfs_dir_exchange_children(struct xfs_trans *tp, struct xfs_dir_update *du1, struct xfs_dir_update *du2, unsigned int spaceres); +int xfs_dir_rename_children(struct xfs_trans *tp, struct xfs_dir_update *du_src, + struct xfs_dir_update *du_tgt, unsigned int spaceres, + struct xfs_dir_update *du_wip); #endif /* __XFS_DIR2_H__ */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 363e98ee974c..3e4ec5337a3b 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2341,13 +2341,20 @@ xfs_rename( struct xfs_inode *target_ip, unsigned int flags) { + struct xfs_dir_update du_src = { + .dp = src_dp, + .name = src_name, + .ip = src_ip, + }; + struct xfs_dir_update du_tgt = { + .dp = target_dp, + .name = target_name, + .ip = target_ip, + }; + struct xfs_dir_update du_wip = { }; struct xfs_mount *mp = src_dp->i_mount; struct xfs_trans *tp; - struct xfs_inode *wip = NULL; /* whiteout inode */ struct xfs_inode *inodes[__XFS_SORT_INODES]; - struct xfs_parent_args *src_ppargs = NULL; - struct xfs_parent_args *tgt_ppargs = NULL; - struct xfs_parent_args *wip_ppargs = NULL; int i; int num_inodes = __XFS_SORT_INODES; bool new_parent = (src_dp != target_dp); @@ -2367,8 +2374,8 @@ xfs_rename( * appropriately. */ if (flags & RENAME_WHITEOUT) { - error = xfs_rename_alloc_whiteout(idmap, src_name, - target_dp, &wip); + error = xfs_rename_alloc_whiteout(idmap, src_name, target_dp, + &du_wip.ip); if (error) return error; @@ -2376,21 +2383,21 @@ xfs_rename( src_name->type = XFS_DIR3_FT_CHRDEV; } - xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip, - inodes, &num_inodes); + xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, du_wip.ip, + inodes, &num_inodes); - error = xfs_parent_start(mp, &src_ppargs); + error = xfs_parent_start(mp, &du_src.ppargs); if (error) goto out_release_wip; - if (wip) { - error = xfs_parent_start(mp, &wip_ppargs); + if (du_wip.ip) { + error = xfs_parent_start(mp, &du_wip.ppargs); if (error) goto out_src_ppargs; } if (target_ip) { - error = xfs_parent_start(mp, &tgt_ppargs); + error = xfs_parent_start(mp, &du_tgt.ppargs); if (error) goto out_wip_ppargs; } @@ -2398,7 +2405,7 @@ xfs_rename( retry: nospace_error = 0; spaceres = xfs_rename_space_res(mp, src_name->len, target_ip != NULL, - target_name->len, wip != NULL); + target_name->len, du_wip.ip != NULL); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp); if (error == -ENOSPC) { nospace_error = error; @@ -2413,7 +2420,7 @@ retry: * We don't allow reservationless renaming when parent pointers are * enabled because we can't back out if the xattrs must grow. */ - if (src_ppargs && nospace_error) { + if (du_src.ppargs && nospace_error) { error = nospace_error; xfs_trans_cancel(tp); goto out_tgt_ppargs; @@ -2445,8 +2452,8 @@ retry: xfs_trans_ijoin(tp, src_ip, 0); if (target_ip) xfs_trans_ijoin(tp, target_ip, 0); - if (wip) - xfs_trans_ijoin(tp, wip, 0); + if (du_wip.ip) + xfs_trans_ijoin(tp, du_wip.ip, 0); /* * If we are using project inheritance, we only allow renames @@ -2462,8 +2469,8 @@ retry: /* RENAME_EXCHANGE is unique from here on. */ if (flags & RENAME_EXCHANGE) { error = xfs_cross_rename(tp, src_dp, src_name, src_ip, - src_ppargs, target_dp, target_name, target_ip, - tgt_ppargs, spaceres); + du_src.ppargs, target_dp, target_name, + target_ip, du_tgt.ppargs, spaceres); nospace_error = 0; goto out_unlock; } @@ -2498,38 +2505,11 @@ retry: * We don't allow quotaless renaming when parent pointers are enabled * because we can't back out if the xattrs must grow. */ - if (src_ppargs && nospace_error) { + if (du_src.ppargs && nospace_error) { error = nospace_error; goto out_trans_cancel; } - /* - * Check for expected errors before we dirty the transaction - * so we can return an error without a transaction abort. - */ - if (target_ip == NULL) { - /* - * If there's no space reservation, check the entry will - * fit before actually inserting it. - */ - if (!spaceres) { - error = xfs_dir_canenter(tp, target_dp, target_name); - if (error) - goto out_trans_cancel; - } - } else { - /* - * If target exists and it's a directory, check that whether - * it can be destroyed. - */ - if (S_ISDIR(VFS_I(target_ip)->i_mode) && - (!xfs_dir_isempty(target_ip) || - (VFS_I(target_ip)->i_nlink > 2))) { - error = -EEXIST; - goto out_trans_cancel; - } - } - /* * Lock the AGI buffers we need to handle bumping the nlink of the * whiteout inode off the unlinked list and to handle dropping the @@ -2541,7 +2521,7 @@ retry: * target_ip is either null or an empty directory. */ for (i = 0; i < num_inodes && inodes[i] != NULL; i++) { - if (inodes[i] == wip || + if (inodes[i] == du_wip.ip || (inodes[i] == target_ip && (VFS_I(target_ip)->i_nlink == 1 || src_is_directory))) { struct xfs_perag *pag; @@ -2556,172 +2536,20 @@ retry: } } - /* - * Directory entry creation below may acquire the AGF. Remove - * the whiteout from the unlinked list first to preserve correct - * AGI/AGF locking order. This dirties the transaction so failures - * after this point will abort and log recovery will clean up the - * mess. - * - * For whiteouts, we need to bump the link count on the whiteout - * inode. After this point, we have a real link, clear the tmpfile - * state flag from the inode so it doesn't accidentally get misused - * in future. - */ - if (wip) { - struct xfs_perag *pag; - - ASSERT(VFS_I(wip)->i_nlink == 0); - - pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, wip->i_ino)); - error = xfs_iunlink_remove(tp, pag, wip); - xfs_perag_put(pag); - if (error) - goto out_trans_cancel; - - xfs_bumplink(tp, wip); - VFS_I(wip)->i_state &= ~I_LINKABLE; - } - - /* - * Set up the target. - */ - if (target_ip == NULL) { - /* - * If target does not exist and the rename crosses - * directories, adjust the target directory link count - * to account for the ".." reference from the new entry. - */ - error = xfs_dir_createname(tp, target_dp, target_name, - src_ip->i_ino, spaceres); - if (error) - goto out_trans_cancel; - - xfs_trans_ichgtime(tp, target_dp, - XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - - if (new_parent && src_is_directory) { - xfs_bumplink(tp, target_dp); - } - } else { /* target_ip != NULL */ - /* - * Link the source inode under the target name. - * If the source inode is a directory and we are moving - * it across directories, its ".." entry will be - * inconsistent until we replace that down below. - * - * In case there is already an entry with the same - * name at the destination directory, remove it first. - */ - error = xfs_dir_replace(tp, target_dp, target_name, - src_ip->i_ino, spaceres); - if (error) - goto out_trans_cancel; - - xfs_trans_ichgtime(tp, target_dp, - XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - - /* - * Decrement the link count on the target since the target - * dir no longer points to it. - */ - error = xfs_droplink(tp, target_ip); - if (error) - goto out_trans_cancel; - - if (src_is_directory) { - /* - * Drop the link from the old "." entry. - */ - error = xfs_droplink(tp, target_ip); - if (error) - goto out_trans_cancel; - } - } /* target_ip != NULL */ - - /* - * Remove the source. - */ - if (new_parent && src_is_directory) { - /* - * Rewrite the ".." entry to point to the new - * directory. - */ - error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot, - target_dp->i_ino, spaceres); - ASSERT(error != -EEXIST); - if (error) - goto out_trans_cancel; - } - - /* - * We always want to hit the ctime on the source inode. - * - * This isn't strictly required by the standards since the source - * inode isn't really being changed, but old unix file systems did - * it and some incremental backup programs won't work without it. - */ - xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG); - xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE); - - /* - * Adjust the link count on src_dp. This is necessary when - * renaming a directory, either within one parent when - * the target existed, or across two parent directories. - */ - if (src_is_directory && (new_parent || target_ip != NULL)) { - - /* - * Decrement link count on src_directory since the - * entry that's moved no longer points to it. - */ - error = xfs_droplink(tp, src_dp); - if (error) - goto out_trans_cancel; - } - - /* - * For whiteouts, we only need to update the source dirent with the - * inode number of the whiteout inode rather than removing it - * altogether. - */ - if (wip) - error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino, - spaceres); - else - error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, - spaceres); - + error = xfs_dir_rename_children(tp, &du_src, &du_tgt, spaceres, + &du_wip); if (error) goto out_trans_cancel; - /* Schedule parent pointer updates. */ - if (wip_ppargs) { - error = xfs_parent_addname(tp, wip_ppargs, src_dp, src_name, - wip); - if (error) - goto out_trans_cancel; - } - - if (src_ppargs) { - error = xfs_parent_replacename(tp, src_ppargs, src_dp, - src_name, target_dp, target_name, src_ip); - if (error) - goto out_trans_cancel; - } - - if (tgt_ppargs) { - error = xfs_parent_removename(tp, tgt_ppargs, target_dp, - target_name, target_ip); - if (error) - goto out_trans_cancel; + if (du_wip.ip) { + /* + * Now we have a real link, clear the "I'm a tmpfile" state + * flag from the inode so it doesn't accidentally get misused in + * future. + */ + VFS_I(du_wip.ip)->i_state &= ~I_LINKABLE; } - xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); - if (new_parent) - xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); - /* * Inform our hook clients that we've finished a rename operation as * follows: removed the source and target files from their directories; @@ -2734,8 +2562,8 @@ retry: xfs_dir_update_hook(target_dp, target_ip, -1, target_name); xfs_dir_update_hook(src_dp, src_ip, -1, src_name); xfs_dir_update_hook(target_dp, src_ip, 1, target_name); - if (wip) - xfs_dir_update_hook(src_dp, wip, 1, src_name); + if (du_wip.ip) + xfs_dir_update_hook(src_dp, du_wip.ip, 1, src_name); error = xfs_finish_rename(tp); nospace_error = 0; @@ -2746,14 +2574,14 @@ out_trans_cancel: out_unlock: xfs_iunlock_rename(inodes, num_inodes); out_tgt_ppargs: - xfs_parent_finish(mp, tgt_ppargs); + xfs_parent_finish(mp, du_tgt.ppargs); out_wip_ppargs: - xfs_parent_finish(mp, wip_ppargs); + xfs_parent_finish(mp, du_wip.ppargs); out_src_ppargs: - xfs_parent_finish(mp, src_ppargs); + xfs_parent_finish(mp, du_src.ppargs); out_release_wip: - if (wip) - xfs_irele(wip); + if (du_wip.ip) + xfs_irele(du_wip.ip); if (error == -ENOSPC && nospace_error) error = nospace_error; return error; -- cgit v1.2.3-70-g09d2 From 62bbf50bea21b1c76990fd1bae58a65660a11c27 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Jul 2024 11:22:47 -0700 Subject: xfs: move dirent update hooks to xfs_dir2.c Move the directory entry update hook code to xfs_dir2 so that it is mostly consolidated with the higher level directory functions. Retain the exports so that online fsck can still send notifications through the hooks. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_dir2.c | 104 +++++++++++++++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_dir2.h | 25 ++++++++++ fs/xfs/scrub/common.c | 1 + fs/xfs/xfs_inode.c | 117 ----------------------------------------------- fs/xfs/xfs_inode.h | 25 ---------- fs/xfs/xfs_symlink.c | 2 - 6 files changed, 130 insertions(+), 144 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 34b63ba2e4f7..202468223bf9 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -762,6 +762,81 @@ xfs_dir2_compname( return xfs_da_compname(args, name, len); } +#ifdef CONFIG_XFS_LIVE_HOOKS +/* + * Use a static key here to reduce the overhead of directory live update hooks. + * If the compiler supports jump labels, the static branch will be replaced by + * a nop sled when there are no hook users. Online fsck is currently the only + * caller, so this is a reasonable tradeoff. + * + * Note: Patching the kernel code requires taking the cpu hotplug lock. Other + * parts of the kernel allocate memory with that lock held, which means that + * XFS callers cannot hold any locks that might be used by memory reclaim or + * writeback when calling the static_branch_{inc,dec} functions. + */ +DEFINE_STATIC_XFS_HOOK_SWITCH(xfs_dir_hooks_switch); + +void +xfs_dir_hook_disable(void) +{ + xfs_hooks_switch_off(&xfs_dir_hooks_switch); +} + +void +xfs_dir_hook_enable(void) +{ + xfs_hooks_switch_on(&xfs_dir_hooks_switch); +} + +/* Call hooks for a directory update relating to a child dirent update. */ +inline void +xfs_dir_update_hook( + struct xfs_inode *dp, + struct xfs_inode *ip, + int delta, + const struct xfs_name *name) +{ + if (xfs_hooks_switched_on(&xfs_dir_hooks_switch)) { + struct xfs_dir_update_params p = { + .dp = dp, + .ip = ip, + .delta = delta, + .name = name, + }; + struct xfs_mount *mp = ip->i_mount; + + xfs_hooks_call(&mp->m_dir_update_hooks, 0, &p); + } +} + +/* Call the specified function during a directory update. */ +int +xfs_dir_hook_add( + struct xfs_mount *mp, + struct xfs_dir_hook *hook) +{ + return xfs_hooks_add(&mp->m_dir_update_hooks, &hook->dirent_hook); +} + +/* Stop calling the specified function during a directory update. */ +void +xfs_dir_hook_del( + struct xfs_mount *mp, + struct xfs_dir_hook *hook) +{ + xfs_hooks_del(&mp->m_dir_update_hooks, &hook->dirent_hook); +} + +/* Configure directory update hook functions. */ +void +xfs_dir_hook_setup( + struct xfs_dir_hook *hook, + notifier_fn_t mod_fn) +{ + xfs_hook_setup(&hook->dirent_hook, mod_fn); +} +#endif /* CONFIG_XFS_LIVE_HOOKS */ + /* * Given a directory @dp, a newly allocated inode @ip, and a @name, link @ip * into @dp under the given @name. If @ip is a directory, it will be @@ -809,6 +884,7 @@ xfs_dir_create_child( return error; } + xfs_dir_update_hook(dp, ip, 1, name); return 0; } @@ -873,6 +949,7 @@ xfs_dir_add_child( return error; } + xfs_dir_update_hook(dp, ip, 1, name); return 0; } @@ -954,6 +1031,7 @@ xfs_dir_remove_child( return error; } + xfs_dir_update_hook(dp, ip, -1, name); return 0; } @@ -1079,6 +1157,18 @@ xfs_dir_exchange_children( return error; } + /* + * Inform our hook clients that we've finished an exchange operation as + * follows: removed the source and target files from their directories; + * added the target to the source directory; and added the source to + * the target directory. All inodes are locked, so it's ok to model a + * rename this way so long as we say we deleted entries before we add + * new ones. + */ + xfs_dir_update_hook(dp1, ip1, -1, name1); + xfs_dir_update_hook(dp2, ip2, -1, name2); + xfs_dir_update_hook(dp1, ip2, 1, name1); + xfs_dir_update_hook(dp2, ip1, 1, name2); return 0; } @@ -1305,5 +1395,19 @@ xfs_dir_rename_children( return error; } + /* + * Inform our hook clients that we've finished a rename operation as + * follows: removed the source and target files from their directories; + * that we've added the source to the target directory; and finally + * that we've added the whiteout, if there was one. All inodes are + * locked, so it's ok to model a rename this way so long as we say we + * deleted entries before we add new ones. + */ + if (target_ip) + xfs_dir_update_hook(target_dp, target_ip, -1, target_name); + xfs_dir_update_hook(src_dp, src_ip, -1, src_name); + xfs_dir_update_hook(target_dp, src_ip, 1, target_name); + if (du_wip->ip) + xfs_dir_update_hook(src_dp, du_wip->ip, 1, src_name); return 0; } diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index df6d4bbe3d6f..576068ed81fa 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -309,6 +309,31 @@ static inline unsigned char xfs_ascii_ci_xfrm(unsigned char c) return c; } +struct xfs_dir_update_params { + const struct xfs_inode *dp; + const struct xfs_inode *ip; + const struct xfs_name *name; + int delta; +}; + +#ifdef CONFIG_XFS_LIVE_HOOKS +void xfs_dir_update_hook(struct xfs_inode *dp, struct xfs_inode *ip, + int delta, const struct xfs_name *name); + +struct xfs_dir_hook { + struct xfs_hook dirent_hook; +}; + +void xfs_dir_hook_disable(void); +void xfs_dir_hook_enable(void); + +int xfs_dir_hook_add(struct xfs_mount *mp, struct xfs_dir_hook *hook); +void xfs_dir_hook_del(struct xfs_mount *mp, struct xfs_dir_hook *hook); +void xfs_dir_hook_setup(struct xfs_dir_hook *hook, notifier_fn_t mod_fn); +#else +# define xfs_dir_update_hook(dp, ip, delta, name) ((void)0) +#endif /* CONFIG_XFS_LIVE_HOOKS */ + struct xfs_parent_args; struct xfs_dir_update { diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 1ad8ec63a7f4..22f5f1a9d3f0 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -26,6 +26,7 @@ #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_dir2_priv.h" +#include "xfs_dir2.h" #include "xfs_attr.h" #include "xfs_reflink.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 3e4ec5337a3b..438651cf33f1 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -600,81 +600,6 @@ xfs_icreate( return 0; } -#ifdef CONFIG_XFS_LIVE_HOOKS -/* - * Use a static key here to reduce the overhead of directory live update hooks. - * If the compiler supports jump labels, the static branch will be replaced by - * a nop sled when there are no hook users. Online fsck is currently the only - * caller, so this is a reasonable tradeoff. - * - * Note: Patching the kernel code requires taking the cpu hotplug lock. Other - * parts of the kernel allocate memory with that lock held, which means that - * XFS callers cannot hold any locks that might be used by memory reclaim or - * writeback when calling the static_branch_{inc,dec} functions. - */ -DEFINE_STATIC_XFS_HOOK_SWITCH(xfs_dir_hooks_switch); - -void -xfs_dir_hook_disable(void) -{ - xfs_hooks_switch_off(&xfs_dir_hooks_switch); -} - -void -xfs_dir_hook_enable(void) -{ - xfs_hooks_switch_on(&xfs_dir_hooks_switch); -} - -/* Call hooks for a directory update relating to a child dirent update. */ -inline void -xfs_dir_update_hook( - struct xfs_inode *dp, - struct xfs_inode *ip, - int delta, - const struct xfs_name *name) -{ - if (xfs_hooks_switched_on(&xfs_dir_hooks_switch)) { - struct xfs_dir_update_params p = { - .dp = dp, - .ip = ip, - .delta = delta, - .name = name, - }; - struct xfs_mount *mp = ip->i_mount; - - xfs_hooks_call(&mp->m_dir_update_hooks, 0, &p); - } -} - -/* Call the specified function during a directory update. */ -int -xfs_dir_hook_add( - struct xfs_mount *mp, - struct xfs_dir_hook *hook) -{ - return xfs_hooks_add(&mp->m_dir_update_hooks, &hook->dirent_hook); -} - -/* Stop calling the specified function during a directory update. */ -void -xfs_dir_hook_del( - struct xfs_mount *mp, - struct xfs_dir_hook *hook) -{ - xfs_hooks_del(&mp->m_dir_update_hooks, &hook->dirent_hook); -} - -/* Configure directory update hook functions. */ -void -xfs_dir_hook_setup( - struct xfs_dir_hook *hook, - notifier_fn_t mod_fn) -{ - xfs_hook_setup(&hook->dirent_hook, mod_fn); -} -#endif /* CONFIG_XFS_LIVE_HOOKS */ - /* Return dquots for the ids that will be assigned to a new file. */ int xfs_icreate_dqalloc( @@ -798,12 +723,6 @@ xfs_create( if (error) goto out_trans_cancel; - /* - * Create ip with a reference from dp, and add '.' and '..' references - * if it's a directory. - */ - xfs_dir_update_hook(dp, du.ip, 1, name); - /* * If this is a synchronous mount, make sure that the * create transaction goes to disk before returning to @@ -1025,8 +944,6 @@ xfs_link( if (error) goto error_return; - xfs_dir_update_hook(tdp, sip, 1, target_name); - /* * If this is a synchronous mount, make sure that the * link transaction goes to disk before returning to @@ -2094,12 +2011,6 @@ xfs_remove( if (error) goto out_trans_cancel; - /* - * Drop the link from dp to ip, and if ip was a directory, remove the - * '.' and '..' references since we freed the directory. - */ - xfs_dir_update_hook(dp, ip, -1, name); - /* * If this is a synchronous mount, make sure that the * remove transaction goes to disk before returning to @@ -2256,19 +2167,6 @@ xfs_cross_rename( if (error) goto out_trans_abort; - /* - * Inform our hook clients that we've finished an exchange operation as - * follows: removed the source and target files from their directories; - * added the target to the source directory; and added the source to - * the target directory. All inodes are locked, so it's ok to model a - * rename this way so long as we say we deleted entries before we add - * new ones. - */ - xfs_dir_update_hook(dp1, ip1, -1, name1); - xfs_dir_update_hook(dp2, ip2, -1, name2); - xfs_dir_update_hook(dp1, ip2, 1, name1); - xfs_dir_update_hook(dp2, ip1, 1, name2); - return xfs_finish_rename(tp); out_trans_abort: @@ -2550,21 +2448,6 @@ retry: VFS_I(du_wip.ip)->i_state &= ~I_LINKABLE; } - /* - * Inform our hook clients that we've finished a rename operation as - * follows: removed the source and target files from their directories; - * that we've added the source to the target directory; and finally - * that we've added the whiteout, if there was one. All inodes are - * locked, so it's ok to model a rename this way so long as we say we - * deleted entries before we add new ones. - */ - if (target_ip) - xfs_dir_update_hook(target_dp, target_ip, -1, target_name); - xfs_dir_update_hook(src_dp, src_ip, -1, src_name); - xfs_dir_update_hook(target_dp, src_ip, 1, target_name); - if (du_wip.ip) - xfs_dir_update_hook(src_dp, du_wip.ip, 1, src_name); - error = xfs_finish_rename(tp); nospace_error = 0; goto out_unlock; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 5ee044674c3a..51defdebef30 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -632,31 +632,6 @@ void xfs_inode_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, xfs_filblks_t *dblocks, xfs_filblks_t *rblocks); unsigned int xfs_inode_alloc_unitsize(struct xfs_inode *ip); -struct xfs_dir_update_params { - const struct xfs_inode *dp; - const struct xfs_inode *ip; - const struct xfs_name *name; - int delta; -}; - -#ifdef CONFIG_XFS_LIVE_HOOKS -void xfs_dir_update_hook(struct xfs_inode *dp, struct xfs_inode *ip, - int delta, const struct xfs_name *name); - -struct xfs_dir_hook { - struct xfs_hook dirent_hook; -}; - -void xfs_dir_hook_disable(void); -void xfs_dir_hook_enable(void); - -int xfs_dir_hook_add(struct xfs_mount *mp, struct xfs_dir_hook *hook); -void xfs_dir_hook_del(struct xfs_mount *mp, struct xfs_dir_hook *hook); -void xfs_dir_hook_setup(struct xfs_dir_hook *hook, notifier_fn_t mod_fn); -#else -# define xfs_dir_update_hook(dp, ip, delta, name) ((void)0) -#endif /* CONFIG_XFS_LIVE_HOOKS */ - int xfs_icreate_dqalloc(const struct xfs_icreate_args *args, struct xfs_dquot **udqpp, struct xfs_dquot **gdqpp, struct xfs_dquot **pdqpp); diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index c0f5c2e1f215..77f19e2f66e0 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -200,8 +200,6 @@ xfs_symlink( if (error) goto out_trans_cancel; - xfs_dir_update_hook(dp, du.ip, 1, link_name); - /* * If this is a synchronous mount, make sure that the * symlink transaction goes to disk before returning to -- cgit v1.2.3-70-g09d2 From 47d4d5961fb9069803576ed3adb85b57a575a1b9 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Jul 2024 11:22:48 -0700 Subject: xfs: get rid of trivial rename helpers Get rid of the largely pointless xfs_cross_rename and xfs_finish_rename now that we've refactored its parent. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_inode.c | 77 ++++++++++-------------------------------------------- 1 file changed, 14 insertions(+), 63 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 438651cf33f1..62ca6c75117c 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2117,63 +2117,6 @@ xfs_sort_inodes( } } -static int -xfs_finish_rename( - struct xfs_trans *tp) -{ - /* - * If this is a synchronous mount, make sure that the rename transaction - * goes to disk before returning to the user. - */ - if (xfs_has_wsync(tp->t_mountp) || xfs_has_dirsync(tp->t_mountp)) - xfs_trans_set_sync(tp); - - return xfs_trans_commit(tp); -} - -/* - * xfs_cross_rename() - * - * responsible for handling RENAME_EXCHANGE flag in renameat2() syscall - */ -STATIC int -xfs_cross_rename( - struct xfs_trans *tp, - struct xfs_inode *dp1, - struct xfs_name *name1, - struct xfs_inode *ip1, - struct xfs_parent_args *ip1_ppargs, - struct xfs_inode *dp2, - struct xfs_name *name2, - struct xfs_inode *ip2, - struct xfs_parent_args *ip2_ppargs, - int spaceres) -{ - struct xfs_dir_update du1 = { - .dp = dp1, - .name = name1, - .ip = ip1, - .ppargs = ip1_ppargs, - }; - struct xfs_dir_update du2 = { - .dp = dp2, - .name = name2, - .ip = ip2, - .ppargs = ip2_ppargs, - }; - int error; - - error = xfs_dir_exchange_children(tp, &du1, &du2, spaceres); - if (error) - goto out_trans_abort; - - return xfs_finish_rename(tp); - -out_trans_abort: - xfs_trans_cancel(tp); - return error; -} - /* * xfs_rename_alloc_whiteout() * @@ -2366,11 +2309,11 @@ retry: /* RENAME_EXCHANGE is unique from here on. */ if (flags & RENAME_EXCHANGE) { - error = xfs_cross_rename(tp, src_dp, src_name, src_ip, - du_src.ppargs, target_dp, target_name, - target_ip, du_tgt.ppargs, spaceres); - nospace_error = 0; - goto out_unlock; + error = xfs_dir_exchange_children(tp, &du_src, &du_tgt, + spaceres); + if (error) + goto out_trans_cancel; + goto out_commit; } /* @@ -2448,7 +2391,15 @@ retry: VFS_I(du_wip.ip)->i_state &= ~I_LINKABLE; } - error = xfs_finish_rename(tp); +out_commit: + /* + * If this is a synchronous mount, make sure that the rename + * transaction goes to disk before returning to the user. + */ + if (xfs_has_wsync(tp->t_mountp) || xfs_has_dirsync(tp->t_mountp)) + xfs_trans_set_sync(tp); + + error = xfs_trans_commit(tp); nospace_error = 0; goto out_unlock; -- cgit v1.2.3-70-g09d2 From ac3a0275165b4f80d9b7b516d6a8f8b308644fff Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Jul 2024 11:22:49 -0700 Subject: xfs: don't use the incore struct xfs_sb for offsets into struct xfs_dsb Currently, the XFS_SB_CRC_OFF macro uses the incore superblock struct (xfs_sb) to compute the address of sb_crc within the ondisk superblock struct (xfs_dsb). This is a landmine if we ever change the layout of the incore superblock (as we're about to do), so redefine the macro to use xfs_dsb to compute the layout of xfs_dsb. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_format.h | 9 ++++----- fs/xfs/libxfs/xfs_ondisk.h | 1 + 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 61f51becff4f..e1bfee0c3b1a 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -90,8 +90,7 @@ struct xfs_ifork; #define XFSLABEL_MAX 12 /* - * Superblock - in core version. Must match the ondisk version below. - * Must be padded to 64 bit alignment. + * Superblock - in core version. Must be padded to 64 bit alignment. */ typedef struct xfs_sb { uint32_t sb_magicnum; /* magic number == XFS_SB_MAGIC */ @@ -178,10 +177,8 @@ typedef struct xfs_sb { /* must be padded to 64 bit alignment */ } xfs_sb_t; -#define XFS_SB_CRC_OFF offsetof(struct xfs_sb, sb_crc) - /* - * Superblock - on disk version. Must match the in core version above. + * Superblock - on disk version. * Must be padded to 64 bit alignment. */ struct xfs_dsb { @@ -265,6 +262,8 @@ struct xfs_dsb { /* must be padded to 64 bit alignment */ }; +#define XFS_SB_CRC_OFF offsetof(struct xfs_dsb, sb_crc) + /* * Misc. Flags - warning - these will be cleared by xfs_repair unless * a feature bit is set when the flag is used. diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h index e8cdd77d03fa..23c133fd36f5 100644 --- a/fs/xfs/libxfs/xfs_ondisk.h +++ b/fs/xfs/libxfs/xfs_ondisk.h @@ -85,6 +85,7 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_remote_t, 12); */ + XFS_CHECK_OFFSET(struct xfs_dsb, sb_crc, 224); XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, valuelen, 0); XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, namelen, 2); XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, nameval, 3); -- cgit v1.2.3-70-g09d2 From 4e0e2c0fe35b44cd4db6a138ed4316178ed60b5c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Jul 2024 11:22:50 -0700 Subject: xfs: clean up extent free log intent item tracepoint callsites Pass the incore EFI structure to the tracepoints instead of open-coding the argument passing. This cleans up the call sites a bit. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_alloc.c | 7 +++---- fs/xfs/xfs_extfree_item.c | 6 ++---- fs/xfs/xfs_trace.h | 33 +++++++++++++++------------------ 3 files changed, 20 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 63315ddc46c6..4d4fc37d738c 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2544,7 +2544,7 @@ xfs_defer_agfl_block( xefi->xefi_owner = oinfo->oi_owner; xefi->xefi_agresv = XFS_AG_RESV_AGFL; - trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1); + trace_xfs_agfl_free_defer(mp, xefi); xfs_extent_free_get_group(mp, xefi); xfs_defer_add(tp, &xefi->xefi_list, &xfs_agfl_free_defer_type); @@ -2606,9 +2606,8 @@ xfs_defer_extent_free( } else { xefi->xefi_owner = XFS_RMAP_OWN_NULL; } - trace_xfs_bmap_free_defer(mp, - XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0, - XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len); + + trace_xfs_extent_free_defer(mp, xefi); xfs_extent_free_get_group(mp, xefi); *dfpp = xfs_defer_add(tp, &xefi->xefi_list, &xfs_extent_free_defer_type); diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 01ebbd7691a5..5a76af9d8560 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -464,8 +464,7 @@ xfs_extent_free_finish_item( if (xefi->xefi_flags & XFS_EFI_BMBT_BLOCK) oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK; - trace_xfs_bmap_free_deferred(tp->t_mountp, xefi->xefi_pag->pag_agno, 0, - agbno, xefi->xefi_blockcount); + trace_xfs_extent_free_deferred(mp, xefi); /* * If we need a new transaction to make progress, the caller will log a @@ -542,8 +541,7 @@ xfs_agfl_free_finish_item( agbno = XFS_FSB_TO_AGBNO(mp, xefi->xefi_startblock); oinfo.oi_owner = xefi->xefi_owner; - trace_xfs_agfl_free_deferred(mp, xefi->xefi_pag->pag_agno, 0, agbno, - xefi->xefi_blockcount); + trace_xfs_agfl_free_deferred(mp, xefi); error = xfs_alloc_read_agf(xefi->xefi_pag, tp, 0, &agbp); if (!error) diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index ba839ce6a9cf..b2ea9d5141a7 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -90,6 +90,7 @@ struct xfs_exchrange; struct xfs_getparents; struct xfs_parent_irec; struct xfs_attrlist_cursor_kern; +struct xfs_extent_free_item; #define XFS_ATTR_FILTER_FLAGS \ { XFS_ATTR_ROOT, "ROOT" }, \ @@ -2710,41 +2711,37 @@ DEFINE_DEFER_PENDING_EVENT(xfs_defer_item_pause); DEFINE_DEFER_PENDING_EVENT(xfs_defer_item_unpause); DECLARE_EVENT_CLASS(xfs_free_extent_deferred_class, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - int type, xfs_agblock_t agbno, xfs_extlen_t len), - TP_ARGS(mp, agno, type, agbno, len), + TP_PROTO(struct xfs_mount *mp, struct xfs_extent_free_item *free), + TP_ARGS(mp, free), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) - __field(int, type) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) + __field(unsigned int, flags) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; - __entry->type = type; - __entry->agbno = agbno; - __entry->len = len; + __entry->agno = XFS_FSB_TO_AGNO(mp, free->xefi_startblock); + __entry->agbno = XFS_FSB_TO_AGBNO(mp, free->xefi_startblock); + __entry->len = free->xefi_blockcount; + __entry->flags = free->xefi_flags; ), - TP_printk("dev %d:%d op %d agno 0x%x agbno 0x%x fsbcount 0x%x", + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x flags 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->type, __entry->agno, __entry->agbno, - __entry->len) + __entry->len, + __entry->flags) ); #define DEFINE_FREE_EXTENT_DEFERRED_EVENT(name) \ DEFINE_EVENT(xfs_free_extent_deferred_class, name, \ - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ - int type, \ - xfs_agblock_t bno, \ - xfs_extlen_t len), \ - TP_ARGS(mp, agno, type, bno, len)) -DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_bmap_free_defer); -DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_bmap_free_deferred); + TP_PROTO(struct xfs_mount *mp, struct xfs_extent_free_item *free), \ + TP_ARGS(mp, free)) DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_agfl_free_defer); DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_agfl_free_deferred); +DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_extent_free_defer); +DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_extent_free_deferred); DECLARE_EVENT_CLASS(xfs_defer_pending_item_class, TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp, -- cgit v1.2.3-70-g09d2 From 980faece91a60c279e7c24cb1d1a378bbbb74bb9 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Jul 2024 11:22:51 -0700 Subject: xfs: convert "skip_discard" to a proper flags bitset Convert the boolean to skip discard on free into a proper flags field so that we can add more flags in the next patch. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_ag.c | 2 +- fs/xfs/libxfs/xfs_alloc.c | 13 +++++++------ fs/xfs/libxfs/xfs_alloc.h | 9 +++++++-- fs/xfs/libxfs/xfs_bmap.c | 12 ++++++++---- fs/xfs/libxfs/xfs_bmap_btree.c | 2 +- fs/xfs/libxfs/xfs_ialloc.c | 5 ++--- fs/xfs/libxfs/xfs_ialloc_btree.c | 2 +- fs/xfs/libxfs/xfs_refcount.c | 6 +++--- fs/xfs/libxfs/xfs_refcount_btree.c | 2 +- fs/xfs/scrub/newbt.c | 5 +++-- fs/xfs/scrub/reap.c | 7 ++++--- fs/xfs/xfs_reflink.c | 2 +- 12 files changed, 39 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 240e079cb3fb..7e80732cb547 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -1008,7 +1008,7 @@ xfs_ag_shrink_space( goto resv_err; err2 = xfs_free_extent_later(*tpp, args.fsbno, delta, NULL, - XFS_AG_RESV_NONE, true); + XFS_AG_RESV_NONE, XFS_FREE_EXTENT_SKIP_DISCARD); if (err2) goto resv_err; diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 4d4fc37d738c..089031151eed 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2562,7 +2562,7 @@ xfs_defer_extent_free( xfs_filblks_t len, const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type, - bool skip_discard, + unsigned int free_flags, struct xfs_defer_pending **dfpp) { struct xfs_extent_free_item *xefi; @@ -2582,6 +2582,7 @@ xfs_defer_extent_free( ASSERT(len < mp->m_sb.sb_agblocks); ASSERT(agbno + len <= mp->m_sb.sb_agblocks); #endif + ASSERT(!(free_flags & ~XFS_FREE_EXTENT_ALL_FLAGS)); ASSERT(xfs_extfree_item_cache != NULL); ASSERT(type != XFS_AG_RESV_AGFL); @@ -2593,7 +2594,7 @@ xfs_defer_extent_free( xefi->xefi_startblock = bno; xefi->xefi_blockcount = (xfs_extlen_t)len; xefi->xefi_agresv = type; - if (skip_discard) + if (free_flags & XFS_FREE_EXTENT_SKIP_DISCARD) xefi->xefi_flags |= XFS_EFI_SKIP_DISCARD; if (oinfo) { ASSERT(oinfo->oi_offset == 0); @@ -2621,11 +2622,11 @@ xfs_free_extent_later( xfs_filblks_t len, const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type, - bool skip_discard) + unsigned int free_flags) { struct xfs_defer_pending *dontcare = NULL; - return xfs_defer_extent_free(tp, bno, len, oinfo, type, skip_discard, + return xfs_defer_extent_free(tp, bno, len, oinfo, type, free_flags, &dontcare); } @@ -2650,13 +2651,13 @@ xfs_free_extent_later( int xfs_alloc_schedule_autoreap( const struct xfs_alloc_arg *args, - bool skip_discard, + unsigned int free_flags, struct xfs_alloc_autoreap *aarp) { int error; error = xfs_defer_extent_free(args->tp, args->fsbno, args->len, - &args->oinfo, args->resv, skip_discard, &aarp->dfp); + &args->oinfo, args->resv, free_flags, &aarp->dfp); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 3dc8e44fea76..7f51b3cb0349 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -235,7 +235,12 @@ xfs_buf_to_agfl_bno( int xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno, xfs_filblks_t len, const struct xfs_owner_info *oinfo, - enum xfs_ag_resv_type type, bool skip_discard); + enum xfs_ag_resv_type type, unsigned int free_flags); + +/* Don't issue a discard for the blocks freed. */ +#define XFS_FREE_EXTENT_SKIP_DISCARD (1U << 0) + +#define XFS_FREE_EXTENT_ALL_FLAGS (XFS_FREE_EXTENT_SKIP_DISCARD) /* * List of extents to be free "later". @@ -264,7 +269,7 @@ struct xfs_alloc_autoreap { }; int xfs_alloc_schedule_autoreap(const struct xfs_alloc_arg *args, - bool skip_discard, struct xfs_alloc_autoreap *aarp); + unsigned int free_flags, struct xfs_alloc_autoreap *aarp); void xfs_alloc_cancel_autoreap(struct xfs_trans *tp, struct xfs_alloc_autoreap *aarp); void xfs_alloc_commit_autoreap(struct xfs_trans *tp, diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 09e3302a4b72..7df74c35d9f9 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -605,7 +605,7 @@ xfs_bmap_btree_to_extents( xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork); error = xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo, - XFS_AG_RESV_NONE, false); + XFS_AG_RESV_NONE, 0); if (error) return error; @@ -5381,11 +5381,15 @@ xfs_bmap_del_extent_real( error = xfs_rtfree_blocks(tp, del->br_startblock, del->br_blockcount); } else { + unsigned int efi_flags = 0; + + if ((bflags & XFS_BMAPI_NODISCARD) || + del->br_state == XFS_EXT_UNWRITTEN) + efi_flags |= XFS_FREE_EXTENT_SKIP_DISCARD; + error = xfs_free_extent_later(tp, del->br_startblock, del->br_blockcount, NULL, - XFS_AG_RESV_NONE, - ((bflags & XFS_BMAPI_NODISCARD) || - del->br_state == XFS_EXT_UNWRITTEN)); + XFS_AG_RESV_NONE, efi_flags); } if (error) return error; diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index f5d84dcb58da..d1b06ccde19e 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -282,7 +282,7 @@ xfs_bmbt_free_block( xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork); error = xfs_free_extent_later(cur->bc_tp, fsbno, 1, &oinfo, - XFS_AG_RESV_NONE, false); + XFS_AG_RESV_NONE, 0); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index f8d5ed7aedde..0af5b7a33d05 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -1990,7 +1990,7 @@ xfs_difree_inode_chunk( return xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, sagbno), M_IGEO(mp)->ialloc_blks, &XFS_RMAP_OINFO_INODES, - XFS_AG_RESV_NONE, false); + XFS_AG_RESV_NONE, 0); } /* holemask is only 16-bits (fits in an unsigned long) */ @@ -2036,8 +2036,7 @@ xfs_difree_inode_chunk( ASSERT(contigblk % mp->m_sb.sb_spino_align == 0); error = xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, agbno), contigblk, - &XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE, - false); + &XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE, 0); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 42e9fd47f6c7..496e2f72a85b 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -170,7 +170,7 @@ __xfs_inobt_free_block( xfs_inobt_mod_blockcount(cur, -1); fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); return xfs_free_extent_later(cur->bc_tp, fsbno, 1, - &XFS_RMAP_OINFO_INOBT, resv, false); + &XFS_RMAP_OINFO_INOBT, resv, 0); } STATIC int diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 511c912d515c..4d8bb760c723 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1173,7 +1173,7 @@ xfs_refcount_adjust_extents( tmp.rc_startblock); error = xfs_free_extent_later(cur->bc_tp, fsbno, tmp.rc_blockcount, NULL, - XFS_AG_RESV_NONE, false); + XFS_AG_RESV_NONE, 0); if (error) goto out_error; } @@ -1237,7 +1237,7 @@ xfs_refcount_adjust_extents( ext.rc_startblock); error = xfs_free_extent_later(cur->bc_tp, fsbno, ext.rc_blockcount, NULL, - XFS_AG_RESV_NONE, false); + XFS_AG_RESV_NONE, 0); if (error) goto out_error; } @@ -2022,7 +2022,7 @@ xfs_refcount_recover_cow_leftovers( /* Free the block. */ error = xfs_free_extent_later(tp, fsb, rr->rr_rrec.rc_blockcount, NULL, - XFS_AG_RESV_NONE, false); + XFS_AG_RESV_NONE, 0); if (error) goto out_trans; diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index ca59f6c89f3e..cb3b1d42ae9a 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -109,7 +109,7 @@ xfs_refcountbt_free_block( be32_add_cpu(&agf->agf_refcount_blocks, -1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS); return xfs_free_extent_later(cur->bc_tp, fsbno, 1, - &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA, false); + &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA, 0); } STATIC int diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c index 4a0271123d94..2aa14b7ab630 100644 --- a/fs/xfs/scrub/newbt.c +++ b/fs/xfs/scrub/newbt.c @@ -160,7 +160,8 @@ xrep_newbt_add_blocks( if (args->tp) { ASSERT(xnr->oinfo.oi_offset == 0); - error = xfs_alloc_schedule_autoreap(args, true, &resv->autoreap); + error = xfs_alloc_schedule_autoreap(args, + XFS_FREE_EXTENT_SKIP_DISCARD, &resv->autoreap); if (error) goto out_pag; } @@ -414,7 +415,7 @@ xrep_newbt_free_extent( */ fsbno = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno, free_agbno); error = xfs_free_extent_later(sc->tp, fsbno, free_aglen, &xnr->oinfo, - xnr->resv, true); + xnr->resv, XFS_FREE_EXTENT_SKIP_DISCARD); if (error) return error; diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index be283153c254..53697f3c5e1b 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -451,7 +451,7 @@ xreap_agextent_iter( xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp); error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, NULL, - rs->resv, true); + rs->resv, XFS_FREE_EXTENT_SKIP_DISCARD); if (error) return error; @@ -477,7 +477,7 @@ xreap_agextent_iter( * system with large EFIs. */ error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo, - rs->resv, true); + rs->resv, XFS_FREE_EXTENT_SKIP_DISCARD); if (error) return error; @@ -943,7 +943,8 @@ xrep_reap_bmapi_iter( xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT, -(int64_t)imap->br_blockcount); return xfs_free_extent_later(sc->tp, imap->br_startblock, - imap->br_blockcount, NULL, XFS_AG_RESV_NONE, true); + imap->br_blockcount, NULL, XFS_AG_RESV_NONE, + XFS_FREE_EXTENT_SKIP_DISCARD); } /* diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 265a2a418bc7..6fde6ec8092f 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -603,7 +603,7 @@ xfs_reflink_cancel_cow_blocks( error = xfs_free_extent_later(*tpp, del.br_startblock, del.br_blockcount, NULL, - XFS_AG_RESV_NONE, false); + XFS_AG_RESV_NONE, 0); if (error) break; -- cgit v1.2.3-70-g09d2 From 62d597a197e390a89eadff60b98231e91b32ab83 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Jul 2024 11:22:51 -0700 Subject: xfs: pass the fsbno to xfs_perag_intent_get All callers of xfs_perag_intent_get have a fsbno and need boilerplate code to turn that into an agno. Just pass the fsbno to xfs_perag_intent_get and look up the agno there. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_bmap_item.c | 6 +----- fs/xfs/xfs_drain.c | 8 ++++---- fs/xfs/xfs_drain.h | 5 +++-- fs/xfs/xfs_extfree_item.c | 5 +---- fs/xfs/xfs_refcount_item.c | 5 +---- fs/xfs/xfs_rmap_item.c | 5 +---- 6 files changed, 11 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index a19d62e78aa1..e224b49b7cff 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -324,13 +324,9 @@ xfs_bmap_update_get_group( struct xfs_mount *mp, struct xfs_bmap_intent *bi) { - xfs_agnumber_t agno; - if (xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork)) return; - agno = XFS_FSB_TO_AGNO(mp, bi->bi_bmap.br_startblock); - /* * Bump the intent count on behalf of the deferred rmap and refcount * intent items that that we can queue when we finish this bmap work. @@ -338,7 +334,7 @@ xfs_bmap_update_get_group( * intent drops the intent count, ensuring that the intent count * remains nonzero across the transaction roll. */ - bi->bi_pag = xfs_perag_intent_get(mp, agno); + bi->bi_pag = xfs_perag_intent_get(mp, bi->bi_bmap.br_startblock); } /* Add this deferred BUI to the transaction. */ diff --git a/fs/xfs/xfs_drain.c b/fs/xfs/xfs_drain.c index 005a66be44a2..7bdb9688c0f5 100644 --- a/fs/xfs/xfs_drain.c +++ b/fs/xfs/xfs_drain.c @@ -94,17 +94,17 @@ static inline int xfs_defer_drain_wait(struct xfs_defer_drain *dr) } /* - * Get a passive reference to an AG and declare an intent to update its - * metadata. + * Get a passive reference to the AG that contains a fsbno and declare an intent + * to update its metadata. */ struct xfs_perag * xfs_perag_intent_get( struct xfs_mount *mp, - xfs_agnumber_t agno) + xfs_fsblock_t fsbno) { struct xfs_perag *pag; - pag = xfs_perag_get(mp, agno); + pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, fsbno)); if (!pag) return NULL; diff --git a/fs/xfs/xfs_drain.h b/fs/xfs/xfs_drain.h index 50a5772a8296..775164f54ea6 100644 --- a/fs/xfs/xfs_drain.h +++ b/fs/xfs/xfs_drain.h @@ -62,7 +62,7 @@ void xfs_drain_wait_enable(void); * until the item is finished or cancelled. */ struct xfs_perag *xfs_perag_intent_get(struct xfs_mount *mp, - xfs_agnumber_t agno); + xfs_fsblock_t fsbno); void xfs_perag_intent_put(struct xfs_perag *pag); void xfs_perag_intent_hold(struct xfs_perag *pag); @@ -76,7 +76,8 @@ struct xfs_defer_drain { /* empty */ }; #define xfs_defer_drain_free(dr) ((void)0) #define xfs_defer_drain_init(dr) ((void)0) -#define xfs_perag_intent_get(mp, agno) xfs_perag_get((mp), (agno)) +#define xfs_perag_intent_get(mp, fsbno) \ + xfs_perag_get((mp), XFS_FSB_TO_AGNO(mp, fsbno)) #define xfs_perag_intent_put(pag) xfs_perag_put(pag) static inline void xfs_perag_intent_hold(struct xfs_perag *pag) { } diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 5a76af9d8560..98d0fe0175f1 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -424,10 +424,7 @@ xfs_extent_free_get_group( struct xfs_mount *mp, struct xfs_extent_free_item *xefi) { - xfs_agnumber_t agno; - - agno = XFS_FSB_TO_AGNO(mp, xefi->xefi_startblock); - xefi->xefi_pag = xfs_perag_intent_get(mp, agno); + xefi->xefi_pag = xfs_perag_intent_get(mp, xefi->xefi_startblock); } /* Release a passive AG ref after some freeing work. */ diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 14919b33e4fe..78e106d05aa2 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -330,10 +330,7 @@ xfs_refcount_update_get_group( struct xfs_mount *mp, struct xfs_refcount_intent *ri) { - xfs_agnumber_t agno; - - agno = XFS_FSB_TO_AGNO(mp, ri->ri_startblock); - ri->ri_pag = xfs_perag_intent_get(mp, agno); + ri->ri_pag = xfs_perag_intent_get(mp, ri->ri_startblock); } /* Release a passive AG ref after finishing refcounting work. */ diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index e473124e29cc..2e732aded58e 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -356,10 +356,7 @@ xfs_rmap_update_get_group( struct xfs_mount *mp, struct xfs_rmap_intent *ri) { - xfs_agnumber_t agno; - - agno = XFS_FSB_TO_AGNO(mp, ri->ri_bmap.br_startblock); - ri->ri_pag = xfs_perag_intent_get(mp, agno); + ri->ri_pag = xfs_perag_intent_get(mp, ri->ri_bmap.br_startblock); } /* Release a passive AG ref after finishing rmapping work. */ -- cgit v1.2.3-70-g09d2 From 649c0c2b86ee944a1a9962b310b1b97ead12e97a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Jul 2024 11:22:52 -0700 Subject: xfs: add a xefi_entry helper Add a helper to translate from the item list head to the xfs_extent_free_item structure and use it so shorten assignments and avoid the need for extra local variables. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_extfree_item.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 98d0fe0175f1..27b0a47cf650 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -303,6 +303,11 @@ static const struct xfs_item_ops xfs_efd_item_ops = { .iop_intent = xfs_efd_item_intent, }; +static inline struct xfs_extent_free_item *xefi_entry(const struct list_head *e) +{ + return list_entry(e, struct xfs_extent_free_item, xefi_list); +} + /* * Fill the EFD with all extents from the EFI when we need to roll the * transaction and continue with a new EFI. @@ -338,11 +343,8 @@ xfs_extent_free_diff_items( const struct list_head *a, const struct list_head *b) { - struct xfs_extent_free_item *ra; - struct xfs_extent_free_item *rb; - - ra = container_of(a, struct xfs_extent_free_item, xefi_list); - rb = container_of(b, struct xfs_extent_free_item, xefi_list); + struct xfs_extent_free_item *ra = xefi_entry(a); + struct xfs_extent_free_item *rb = xefi_entry(b); return ra->xefi_pag->pag_agno - rb->xefi_pag->pag_agno; } @@ -444,7 +446,7 @@ xfs_extent_free_finish_item( struct xfs_btree_cur **state) { struct xfs_owner_info oinfo = { }; - struct xfs_extent_free_item *xefi; + struct xfs_extent_free_item *xefi = xefi_entry(item); struct xfs_efd_log_item *efdp = EFD_ITEM(done); struct xfs_mount *mp = tp->t_mountp; struct xfs_extent *extp; @@ -452,7 +454,6 @@ xfs_extent_free_finish_item( xfs_agblock_t agbno; int error = 0; - xefi = container_of(item, struct xfs_extent_free_item, xefi_list); agbno = XFS_FSB_TO_AGBNO(mp, xefi->xefi_startblock); oinfo.oi_owner = xefi->xefi_owner; @@ -504,9 +505,7 @@ STATIC void xfs_extent_free_cancel_item( struct list_head *item) { - struct xfs_extent_free_item *xefi; - - xefi = container_of(item, struct xfs_extent_free_item, xefi_list); + struct xfs_extent_free_item *xefi = xefi_entry(item); xfs_extent_free_put_group(xefi); kmem_cache_free(xfs_extfree_item_cache, xefi); @@ -526,14 +525,13 @@ xfs_agfl_free_finish_item( struct xfs_owner_info oinfo = { }; struct xfs_mount *mp = tp->t_mountp; struct xfs_efd_log_item *efdp = EFD_ITEM(done); - struct xfs_extent_free_item *xefi; + struct xfs_extent_free_item *xefi = xefi_entry(item); struct xfs_extent *extp; struct xfs_buf *agbp; int error; xfs_agblock_t agbno; uint next_extent; - xefi = container_of(item, struct xfs_extent_free_item, xefi_list); ASSERT(xefi->xefi_blockcount == 1); agbno = XFS_FSB_TO_AGBNO(mp, xefi->xefi_startblock); oinfo.oi_owner = xefi->xefi_owner; -- cgit v1.2.3-70-g09d2 From 61665fae4e4302f2a48de56749640a9f1a4c2ec5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Jul 2024 11:22:53 -0700 Subject: xfs: reuse xfs_extent_free_cancel_item Reuse xfs_extent_free_cancel_item to put the AG/RTG and free the item in a few places that currently open code the logic. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_extfree_item.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 27b0a47cf650..dec655a8c1d6 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -437,6 +437,17 @@ xfs_extent_free_put_group( xfs_perag_intent_put(xefi->xefi_pag); } +/* Cancel a free extent. */ +STATIC void +xfs_extent_free_cancel_item( + struct list_head *item) +{ + struct xfs_extent_free_item *xefi = xefi_entry(item); + + xfs_extent_free_put_group(xefi); + kmem_cache_free(xfs_extfree_item_cache, xefi); +} + /* Process a free extent. */ STATIC int xfs_extent_free_finish_item( @@ -487,8 +498,7 @@ xfs_extent_free_finish_item( extp->ext_len = xefi->xefi_blockcount; efdp->efd_next_extent++; - xfs_extent_free_put_group(xefi); - kmem_cache_free(xfs_extfree_item_cache, xefi); + xfs_extent_free_cancel_item(item); return error; } @@ -500,17 +510,6 @@ xfs_extent_free_abort_intent( xfs_efi_release(EFI_ITEM(intent)); } -/* Cancel a free extent. */ -STATIC void -xfs_extent_free_cancel_item( - struct list_head *item) -{ - struct xfs_extent_free_item *xefi = xefi_entry(item); - - xfs_extent_free_put_group(xefi); - kmem_cache_free(xfs_extfree_item_cache, xefi); -} - /* * AGFL blocks are accounted differently in the reserve pools and are not * inserted into the busy extent list. @@ -550,8 +549,7 @@ xfs_agfl_free_finish_item( extp->ext_len = xefi->xefi_blockcount; efdp->efd_next_extent++; - xfs_extent_free_put_group(xefi); - kmem_cache_free(xfs_extfree_item_cache, xefi); + xfs_extent_free_cancel_item(&xefi->xefi_list); return error; } -- cgit v1.2.3-70-g09d2 From 81927e6ec621e0607e2c061c7bc768f135cb5dc2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Jul 2024 11:22:54 -0700 Subject: xfs: factor out a xfs_efd_add_extent helper Factor out a helper to add an extent to and EFD instead of duplicating the logic in two places. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_extfree_item.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index dec655a8c1d6..c755037a64d2 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -336,6 +336,22 @@ xfs_efd_from_efi( efdp->efd_next_extent = efip->efi_format.efi_nextents; } +static void +xfs_efd_add_extent( + struct xfs_efd_log_item *efdp, + struct xfs_extent_free_item *xefi) +{ + struct xfs_extent *extp; + + ASSERT(efdp->efd_next_extent < efdp->efd_format.efd_nextents); + + extp = &efdp->efd_format.efd_extents[efdp->efd_next_extent]; + extp->ext_start = xefi->xefi_startblock; + extp->ext_len = xefi->xefi_blockcount; + + efdp->efd_next_extent++; +} + /* Sort bmap items by AG. */ static int xfs_extent_free_diff_items( @@ -460,8 +476,6 @@ xfs_extent_free_finish_item( struct xfs_extent_free_item *xefi = xefi_entry(item); struct xfs_efd_log_item *efdp = EFD_ITEM(done); struct xfs_mount *mp = tp->t_mountp; - struct xfs_extent *extp; - uint next_extent; xfs_agblock_t agbno; int error = 0; @@ -490,14 +504,7 @@ xfs_extent_free_finish_item( return error; } - /* Add the work we finished to the EFD, even though nobody uses that */ - next_extent = efdp->efd_next_extent; - ASSERT(next_extent < efdp->efd_format.efd_nextents); - extp = &(efdp->efd_format.efd_extents[next_extent]); - extp->ext_start = xefi->xefi_startblock; - extp->ext_len = xefi->xefi_blockcount; - efdp->efd_next_extent++; - + xfs_efd_add_extent(efdp, xefi); xfs_extent_free_cancel_item(item); return error; } @@ -525,11 +532,9 @@ xfs_agfl_free_finish_item( struct xfs_mount *mp = tp->t_mountp; struct xfs_efd_log_item *efdp = EFD_ITEM(done); struct xfs_extent_free_item *xefi = xefi_entry(item); - struct xfs_extent *extp; struct xfs_buf *agbp; int error; xfs_agblock_t agbno; - uint next_extent; ASSERT(xefi->xefi_blockcount == 1); agbno = XFS_FSB_TO_AGBNO(mp, xefi->xefi_startblock); @@ -542,13 +547,7 @@ xfs_agfl_free_finish_item( error = xfs_free_ag_extent(tp, agbp, xefi->xefi_pag->pag_agno, agbno, 1, &oinfo, XFS_AG_RESV_AGFL); - next_extent = efdp->efd_next_extent; - ASSERT(next_extent < efdp->efd_format.efd_nextents); - extp = &(efdp->efd_format.efd_extents[next_extent]); - extp->ext_start = xefi->xefi_startblock; - extp->ext_len = xefi->xefi_blockcount; - efdp->efd_next_extent++; - + xfs_efd_add_extent(efdp, xefi); xfs_extent_free_cancel_item(&xefi->xefi_list); return error; } -- cgit v1.2.3-70-g09d2 From 851a6781895a0f6e0ba75168dc7aecc132d13e6a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Jul 2024 11:22:55 -0700 Subject: xfs: remove duplicate asserts in xfs_defer_extent_free The bno/len verification is already done by the calls to xfs_verify_rtbext / xfs_verify_fsbext, and reporting a corruption error seem like the better handling than tripping an assert anyway. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_alloc.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 089031151eed..adae37eb3d88 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2567,23 +2567,10 @@ xfs_defer_extent_free( { struct xfs_extent_free_item *xefi; struct xfs_mount *mp = tp->t_mountp; -#ifdef DEBUG - xfs_agnumber_t agno; - xfs_agblock_t agbno; - ASSERT(bno != NULLFSBLOCK); - ASSERT(len > 0); ASSERT(len <= XFS_MAX_BMBT_EXTLEN); ASSERT(!isnullstartblock(bno)); - agno = XFS_FSB_TO_AGNO(mp, bno); - agbno = XFS_FSB_TO_AGBNO(mp, bno); - ASSERT(agno < mp->m_sb.sb_agcount); - ASSERT(agbno < mp->m_sb.sb_agblocks); - ASSERT(len < mp->m_sb.sb_agblocks); - ASSERT(agbno + len <= mp->m_sb.sb_agblocks); -#endif ASSERT(!(free_flags & ~XFS_FREE_EXTENT_ALL_FLAGS)); - ASSERT(xfs_extfree_item_cache != NULL); ASSERT(type != XFS_AG_RESV_AGFL); if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len))) -- cgit v1.2.3-70-g09d2 From 7272f77c67c0710918e5678266f8dad6e3bfc8d2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Jul 2024 11:22:56 -0700 Subject: xfs: remove xfs_defer_agfl_block xfs_free_extent_later can handle the extra AGFL special casing with very little extra logic. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_alloc.c | 68 +++++++++++++++-------------------------------- 1 file changed, 22 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index adae37eb3d88..fecfd61f5de8 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2509,48 +2509,6 @@ xfs_agfl_reset( clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate); } -/* - * Defer an AGFL block free. This is effectively equivalent to - * xfs_free_extent_later() with some special handling particular to AGFL blocks. - * - * Deferring AGFL frees helps prevent log reservation overruns due to too many - * allocation operations in a transaction. AGFL frees are prone to this problem - * because for one they are always freed one at a time. Further, an immediate - * AGFL block free can cause a btree join and require another block free before - * the real allocation can proceed. Deferring the free disconnects freeing up - * the AGFL slot from freeing the block. - */ -static int -xfs_defer_agfl_block( - struct xfs_trans *tp, - xfs_agnumber_t agno, - xfs_agblock_t agbno, - struct xfs_owner_info *oinfo) -{ - struct xfs_mount *mp = tp->t_mountp; - struct xfs_extent_free_item *xefi; - xfs_fsblock_t fsbno = XFS_AGB_TO_FSB(mp, agno, agbno); - - ASSERT(xfs_extfree_item_cache != NULL); - ASSERT(oinfo != NULL); - - if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, fsbno))) - return -EFSCORRUPTED; - - xefi = kmem_cache_zalloc(xfs_extfree_item_cache, - GFP_KERNEL | __GFP_NOFAIL); - xefi->xefi_startblock = fsbno; - xefi->xefi_blockcount = 1; - xefi->xefi_owner = oinfo->oi_owner; - xefi->xefi_agresv = XFS_AG_RESV_AGFL; - - trace_xfs_agfl_free_defer(mp, xefi); - - xfs_extent_free_get_group(mp, xefi); - xfs_defer_add(tp, &xefi->xefi_list, &xfs_agfl_free_defer_type); - return 0; -} - /* * Add the extent to the list of extents to be free at transaction end. * The list is maintained sorted (by block number). @@ -2571,7 +2529,6 @@ xfs_defer_extent_free( ASSERT(len <= XFS_MAX_BMBT_EXTLEN); ASSERT(!isnullstartblock(bno)); ASSERT(!(free_flags & ~XFS_FREE_EXTENT_ALL_FLAGS)); - ASSERT(type != XFS_AG_RESV_AGFL); if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len))) return -EFSCORRUPTED; @@ -2598,7 +2555,13 @@ xfs_defer_extent_free( trace_xfs_extent_free_defer(mp, xefi); xfs_extent_free_get_group(mp, xefi); - *dfpp = xfs_defer_add(tp, &xefi->xefi_list, &xfs_extent_free_defer_type); + + if (xefi->xefi_agresv == XFS_AG_RESV_AGFL) + *dfpp = xfs_defer_add(tp, &xefi->xefi_list, + &xfs_agfl_free_defer_type); + else + *dfpp = xfs_defer_add(tp, &xefi->xefi_list, + &xfs_extent_free_defer_type); return 0; } @@ -2856,8 +2819,21 @@ xfs_alloc_fix_freelist( if (error) goto out_agbp_relse; - /* defer agfl frees */ - error = xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo); + /* + * Defer the AGFL block free. + * + * This helps to prevent log reservation overruns due to too + * many allocation operations in a transaction. AGFL frees are + * prone to this problem because for one they are always freed + * one at a time. Further, an immediate AGFL block free can + * cause a btree join and require another block free before the + * real allocation can proceed. + * Deferring the free disconnects freeing up the AGFL slot from + * freeing the block. + */ + error = xfs_free_extent_later(tp, + XFS_AGB_TO_FSB(mp, args->agno, bno), 1, + &targs.oinfo, XFS_AG_RESV_AGFL, 0); if (error) goto out_agbp_relse; } -- cgit v1.2.3-70-g09d2 From 84a3c1576c5aade32170fae6c61d51bd2d16010f Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Jul 2024 11:22:56 -0700 Subject: xfs: move xfs_extent_free_defer_add to xfs_extfree_item.c Move the code that adds the incore xfs_extent_free_item deferred work data to a transaction to live with the EFI log item code. This means that the allocator code no longer has to know about the inner workings of the EFI log items. As a consequence, we can get rid of the _{get,put}_group helpers. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_alloc.c | 12 ++---------- fs/xfs/libxfs/xfs_alloc.h | 3 --- fs/xfs/xfs_extfree_item.c | 31 +++++++++++++++++-------------- fs/xfs/xfs_extfree_item.h | 6 ++++++ 4 files changed, 25 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index fecfd61f5de8..ef4f5972da5d 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -27,6 +27,7 @@ #include "xfs_ag_resv.h" #include "xfs_bmap.h" #include "xfs_health.h" +#include "xfs_extfree_item.h" struct kmem_cache *xfs_extfree_item_cache; @@ -2552,16 +2553,7 @@ xfs_defer_extent_free( xefi->xefi_owner = XFS_RMAP_OWN_NULL; } - trace_xfs_extent_free_defer(mp, xefi); - - xfs_extent_free_get_group(mp, xefi); - - if (xefi->xefi_agresv == XFS_AG_RESV_AGFL) - *dfpp = xfs_defer_add(tp, &xefi->xefi_list, - &xfs_agfl_free_defer_type); - else - *dfpp = xfs_defer_add(tp, &xefi->xefi_list, - &xfs_extent_free_defer_type); + xfs_extent_free_defer_add(tp, xefi, dfpp); return 0; } diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 7f51b3cb0349..fae170825be0 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -256,9 +256,6 @@ struct xfs_extent_free_item { enum xfs_ag_resv_type xefi_agresv; }; -void xfs_extent_free_get_group(struct xfs_mount *mp, - struct xfs_extent_free_item *xefi); - #define XFS_EFI_SKIP_DISCARD (1U << 0) /* don't issue discard */ #define XFS_EFI_ATTR_FORK (1U << 1) /* freeing attr fork block */ #define XFS_EFI_BMBT_BLOCK (1U << 2) /* freeing bmap btree block */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index c755037a64d2..abffc74a924f 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -436,21 +436,24 @@ xfs_extent_free_create_done( return &efdp->efd_item; } -/* Take a passive ref to the AG containing the space we're freeing. */ +/* Add this deferred EFI to the transaction. */ void -xfs_extent_free_get_group( - struct xfs_mount *mp, - struct xfs_extent_free_item *xefi) +xfs_extent_free_defer_add( + struct xfs_trans *tp, + struct xfs_extent_free_item *xefi, + struct xfs_defer_pending **dfpp) { - xefi->xefi_pag = xfs_perag_intent_get(mp, xefi->xefi_startblock); -} + struct xfs_mount *mp = tp->t_mountp; -/* Release a passive AG ref after some freeing work. */ -static inline void -xfs_extent_free_put_group( - struct xfs_extent_free_item *xefi) -{ - xfs_perag_intent_put(xefi->xefi_pag); + trace_xfs_extent_free_defer(mp, xefi); + + xefi->xefi_pag = xfs_perag_intent_get(mp, xefi->xefi_startblock); + if (xefi->xefi_agresv == XFS_AG_RESV_AGFL) + *dfpp = xfs_defer_add(tp, &xefi->xefi_list, + &xfs_agfl_free_defer_type); + else + *dfpp = xfs_defer_add(tp, &xefi->xefi_list, + &xfs_extent_free_defer_type); } /* Cancel a free extent. */ @@ -460,7 +463,7 @@ xfs_extent_free_cancel_item( { struct xfs_extent_free_item *xefi = xefi_entry(item); - xfs_extent_free_put_group(xefi); + xfs_perag_intent_put(xefi->xefi_pag); kmem_cache_free(xfs_extfree_item_cache, xefi); } @@ -575,7 +578,7 @@ xfs_efi_recover_work( xefi->xefi_blockcount = extp->ext_len; xefi->xefi_agresv = XFS_AG_RESV_NONE; xefi->xefi_owner = XFS_RMAP_OWN_UNKNOWN; - xfs_extent_free_get_group(mp, xefi); + xefi->xefi_pag = xfs_perag_intent_get(mp, extp->ext_start); xfs_defer_add_item(dfp, &xefi->xefi_list); } diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index da6a5afa607c..41b7c4306079 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -88,4 +88,10 @@ xfs_efd_log_item_sizeof( extern struct kmem_cache *xfs_efi_cache; extern struct kmem_cache *xfs_efd_cache; +struct xfs_extent_free_item; + +void xfs_extent_free_defer_add(struct xfs_trans *tp, + struct xfs_extent_free_item *xefi, + struct xfs_defer_pending **dfpp); + #endif /* __XFS_EXTFREE_ITEM_H__ */ -- cgit v1.2.3-70-g09d2 From 71f5a17e526775f001f643c9d54e5b59fa29d7ac Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Jul 2024 11:22:57 -0700 Subject: xfs: give rmap btree cursor error tracepoints their own class Create a new tracepoint class for btree-related errors, then convert all the rmap tracepoints to use it. Also fix the one tracepoint that was abusing the old class by making it a separate tracepoint. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_rmap.c | 33 +++++-------- fs/xfs/xfs_trace.h | 117 +++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 110 insertions(+), 40 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index ef16f6f9cef6..bf047cdb95a4 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -111,8 +111,7 @@ xfs_rmap_update( xfs_rmap_irec_offset_pack(irec)); error = xfs_btree_update(cur, &rec); if (error) - trace_xfs_rmap_update_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_rmap_update_error(cur, error, _RET_IP_); return error; } @@ -155,8 +154,7 @@ xfs_rmap_insert( } done: if (error) - trace_xfs_rmap_insert_error(rcur->bc_mp, - rcur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_rmap_insert_error(rcur, error, _RET_IP_); return error; } @@ -194,8 +192,7 @@ xfs_rmap_delete( } done: if (error) - trace_xfs_rmap_delete_error(rcur->bc_mp, - rcur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_rmap_delete_error(rcur, error, _RET_IP_); return error; } @@ -816,8 +813,7 @@ out_done: unwritten, oinfo); out_error: if (error) - trace_xfs_rmap_unmap_error(mp, cur->bc_ag.pag->pag_agno, - error, _RET_IP_); + trace_xfs_rmap_unmap_error(cur, error, _RET_IP_); return error; } @@ -1148,8 +1144,7 @@ xfs_rmap_map( unwritten, oinfo); out_error: if (error) - trace_xfs_rmap_map_error(mp, cur->bc_ag.pag->pag_agno, - error, _RET_IP_); + trace_xfs_rmap_map_error(cur, error, _RET_IP_); return error; } @@ -1344,8 +1339,7 @@ xfs_rmap_convert( RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX) state &= ~RMAP_RIGHT_CONTIG; - trace_xfs_rmap_convert_state(mp, cur->bc_ag.pag->pag_agno, state, - _RET_IP_); + trace_xfs_rmap_convert_state(cur, state, _RET_IP_); /* reset the cursor back to PREV */ error = xfs_rmap_lookup_le(cur, bno, owner, offset, oldext, NULL, &i); @@ -1698,8 +1692,7 @@ xfs_rmap_convert( unwritten, oinfo); done: if (error) - trace_xfs_rmap_convert_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_rmap_convert_error(cur, error, _RET_IP_); return error; } @@ -1822,8 +1815,7 @@ xfs_rmap_convert_shared( RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX) state &= ~RMAP_RIGHT_CONTIG; - trace_xfs_rmap_convert_state(mp, cur->bc_ag.pag->pag_agno, state, - _RET_IP_); + trace_xfs_rmap_convert_state(cur, state, _RET_IP_); /* * Switch out based on the FILLING and CONTIG state bits. */ @@ -2125,8 +2117,7 @@ xfs_rmap_convert_shared( unwritten, oinfo); done: if (error) - trace_xfs_rmap_convert_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_rmap_convert_error(cur, error, _RET_IP_); return error; } @@ -2325,8 +2316,7 @@ xfs_rmap_unmap_shared( unwritten, oinfo); out_error: if (error) - trace_xfs_rmap_unmap_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_rmap_unmap_error(cur, error, _RET_IP_); return error; } @@ -2486,8 +2476,7 @@ xfs_rmap_map_shared( unwritten, oinfo); out_error: if (error) - trace_xfs_rmap_map_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_rmap_map_error(cur, error, _RET_IP_); return error; } diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index b2ea9d5141a7..fb956ffb9d06 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -2826,46 +2826,98 @@ DEFINE_EVENT(xfs_rmap_class, name, \ const struct xfs_owner_info *oinfo), \ TP_ARGS(mp, agno, agbno, len, unwritten, oinfo)) -/* simple AG-based error/%ip tracepoint class */ -DECLARE_EVENT_CLASS(xfs_ag_error_class, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int error, +/* btree cursor error/%ip tracepoint class */ +DECLARE_EVENT_CLASS(xfs_btree_error_class, + TP_PROTO(struct xfs_btree_cur *cur, int error, unsigned long caller_ip), - TP_ARGS(mp, agno, error, caller_ip), + TP_ARGS(cur, error, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) + __field(xfs_ino_t, ino) __field(int, error) __field(unsigned long, caller_ip) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = cur->bc_mp->m_super->s_dev; + switch (cur->bc_ops->type) { + case XFS_BTREE_TYPE_INODE: + __entry->agno = 0; + __entry->ino = cur->bc_ino.ip->i_ino; + break; + case XFS_BTREE_TYPE_AG: + __entry->agno = cur->bc_ag.pag->pag_agno; + __entry->ino = 0; + break; + case XFS_BTREE_TYPE_MEM: + __entry->agno = 0; + __entry->ino = 0; + break; + } __entry->error = error; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d agno 0x%x error %d caller %pS", + TP_printk("dev %d:%d agno 0x%x ino 0x%llx error %d caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, + __entry->ino, __entry->error, (char *)__entry->caller_ip) ); -#define DEFINE_AG_ERROR_EVENT(name) \ -DEFINE_EVENT(xfs_ag_error_class, name, \ - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int error, \ +#define DEFINE_BTREE_ERROR_EVENT(name) \ +DEFINE_EVENT(xfs_btree_error_class, name, \ + TP_PROTO(struct xfs_btree_cur *cur, int error, \ unsigned long caller_ip), \ - TP_ARGS(mp, agno, error, caller_ip)) + TP_ARGS(cur, error, caller_ip)) DEFINE_RMAP_EVENT(xfs_rmap_unmap); DEFINE_RMAP_EVENT(xfs_rmap_unmap_done); -DEFINE_AG_ERROR_EVENT(xfs_rmap_unmap_error); +DEFINE_BTREE_ERROR_EVENT(xfs_rmap_unmap_error); DEFINE_RMAP_EVENT(xfs_rmap_map); DEFINE_RMAP_EVENT(xfs_rmap_map_done); -DEFINE_AG_ERROR_EVENT(xfs_rmap_map_error); +DEFINE_BTREE_ERROR_EVENT(xfs_rmap_map_error); DEFINE_RMAP_EVENT(xfs_rmap_convert); DEFINE_RMAP_EVENT(xfs_rmap_convert_done); -DEFINE_AG_ERROR_EVENT(xfs_rmap_convert_error); -DEFINE_AG_ERROR_EVENT(xfs_rmap_convert_state); +DEFINE_BTREE_ERROR_EVENT(xfs_rmap_convert_error); + +TRACE_EVENT(xfs_rmap_convert_state, + TP_PROTO(struct xfs_btree_cur *cur, int state, + unsigned long caller_ip), + TP_ARGS(cur, state, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_ino_t, ino) + __field(int, state) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = cur->bc_mp->m_super->s_dev; + switch (cur->bc_ops->type) { + case XFS_BTREE_TYPE_INODE: + __entry->agno = 0; + __entry->ino = cur->bc_ino.ip->i_ino; + break; + case XFS_BTREE_TYPE_AG: + __entry->agno = cur->bc_ag.pag->pag_agno; + __entry->ino = 0; + break; + case XFS_BTREE_TYPE_MEM: + __entry->agno = 0; + __entry->ino = 0; + break; + } + __entry->state = state; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d agno 0x%x ino 0x%llx state %d caller %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->ino, + __entry->state, + (char *)__entry->caller_ip) +); DECLARE_EVENT_CLASS(xfs_rmapbt_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, @@ -2966,9 +3018,9 @@ DEFINE_RMAP_DEFERRED_EVENT(xfs_rmap_deferred); DEFINE_RMAPBT_EVENT(xfs_rmap_update); DEFINE_RMAPBT_EVENT(xfs_rmap_insert); DEFINE_RMAPBT_EVENT(xfs_rmap_delete); -DEFINE_AG_ERROR_EVENT(xfs_rmap_insert_error); -DEFINE_AG_ERROR_EVENT(xfs_rmap_delete_error); -DEFINE_AG_ERROR_EVENT(xfs_rmap_update_error); +DEFINE_BTREE_ERROR_EVENT(xfs_rmap_insert_error); +DEFINE_BTREE_ERROR_EVENT(xfs_rmap_delete_error); +DEFINE_BTREE_ERROR_EVENT(xfs_rmap_update_error); DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_candidate); DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_query); @@ -3094,6 +3146,35 @@ DEFINE_AG_RESV_EVENT(xfs_ag_resv_free_extent); DEFINE_AG_RESV_EVENT(xfs_ag_resv_critical); DEFINE_AG_RESV_EVENT(xfs_ag_resv_needed); +/* simple AG-based error/%ip tracepoint class */ +DECLARE_EVENT_CLASS(xfs_ag_error_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int error, + unsigned long caller_ip), + TP_ARGS(mp, agno, error, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(int, error) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->error = error; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d agno 0x%x error %d caller %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->error, + (char *)__entry->caller_ip) +); + +#define DEFINE_AG_ERROR_EVENT(name) \ +DEFINE_EVENT(xfs_ag_error_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int error, \ + unsigned long caller_ip), \ + TP_ARGS(mp, agno, error, caller_ip)) DEFINE_AG_ERROR_EVENT(xfs_ag_resv_init_error); /* refcount tracepoint classes */ -- cgit v1.2.3-70-g09d2 From 47492ed124219b37acf65cd931c1e45d5bc0c274 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Jul 2024 11:22:58 -0700 Subject: xfs: pass btree cursors to rmap btree tracepoints Prepare the rmap btree tracepoints for use with realtime rmap btrees by making them take the btree cursor object as a parameter. This will save us a lot of trouble later on. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_rmap.c | 184 +++++++++++++++++++---------------------------- fs/xfs/xfs_trace.h | 24 +++---- 2 files changed, 85 insertions(+), 123 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index bf047cdb95a4..ce8ea3c84283 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -100,8 +100,7 @@ xfs_rmap_update( union xfs_btree_rec rec; int error; - trace_xfs_rmap_update(cur->bc_mp, cur->bc_ag.pag->pag_agno, - irec->rm_startblock, irec->rm_blockcount, + trace_xfs_rmap_update(cur, irec->rm_startblock, irec->rm_blockcount, irec->rm_owner, irec->rm_offset, irec->rm_flags); rec.rmap.rm_startblock = cpu_to_be32(irec->rm_startblock); @@ -127,8 +126,7 @@ xfs_rmap_insert( int i; int error; - trace_xfs_rmap_insert(rcur->bc_mp, rcur->bc_ag.pag->pag_agno, agbno, - len, owner, offset, flags); + trace_xfs_rmap_insert(rcur, agbno, len, owner, offset, flags); error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i); if (error) @@ -170,8 +168,7 @@ xfs_rmap_delete( int i; int error; - trace_xfs_rmap_delete(rcur->bc_mp, rcur->bc_ag.pag->pag_agno, agbno, - len, owner, offset, flags); + trace_xfs_rmap_delete(rcur, agbno, len, owner, offset, flags); error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i); if (error) @@ -339,8 +336,7 @@ xfs_rmap_find_left_neighbor_helper( { struct xfs_find_left_neighbor_info *info = priv; - trace_xfs_rmap_find_left_neighbor_candidate(cur->bc_mp, - cur->bc_ag.pag->pag_agno, rec->rm_startblock, + trace_xfs_rmap_find_left_neighbor_candidate(cur, rec->rm_startblock, rec->rm_blockcount, rec->rm_owner, rec->rm_offset, rec->rm_flags); @@ -390,8 +386,8 @@ xfs_rmap_find_left_neighbor( info.high.rm_blockcount = 0; info.irec = irec; - trace_xfs_rmap_find_left_neighbor_query(cur->bc_mp, - cur->bc_ag.pag->pag_agno, bno, 0, owner, offset, flags); + trace_xfs_rmap_find_left_neighbor_query(cur, bno, 0, owner, offset, + flags); /* * Historically, we always used the range query to walk every reverse @@ -422,8 +418,7 @@ xfs_rmap_find_left_neighbor( return error; *stat = 1; - trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, - cur->bc_ag.pag->pag_agno, irec->rm_startblock, + trace_xfs_rmap_find_left_neighbor_result(cur, irec->rm_startblock, irec->rm_blockcount, irec->rm_owner, irec->rm_offset, irec->rm_flags); return 0; @@ -438,8 +433,7 @@ xfs_rmap_lookup_le_range_helper( { struct xfs_find_left_neighbor_info *info = priv; - trace_xfs_rmap_lookup_le_range_candidate(cur->bc_mp, - cur->bc_ag.pag->pag_agno, rec->rm_startblock, + trace_xfs_rmap_lookup_le_range_candidate(cur, rec->rm_startblock, rec->rm_blockcount, rec->rm_owner, rec->rm_offset, rec->rm_flags); @@ -486,8 +480,7 @@ xfs_rmap_lookup_le_range( *stat = 0; info.irec = irec; - trace_xfs_rmap_lookup_le_range(cur->bc_mp, cur->bc_ag.pag->pag_agno, - bno, 0, owner, offset, flags); + trace_xfs_rmap_lookup_le_range(cur, bno, 0, owner, offset, flags); /* * Historically, we always used the range query to walk every reverse @@ -518,8 +511,7 @@ xfs_rmap_lookup_le_range( return error; *stat = 1; - trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_ag.pag->pag_agno, irec->rm_startblock, + trace_xfs_rmap_lookup_le_range_result(cur, irec->rm_startblock, irec->rm_blockcount, irec->rm_owner, irec->rm_offset, irec->rm_flags); return 0; @@ -631,8 +623,7 @@ xfs_rmap_unmap( (flags & XFS_RMAP_BMBT_BLOCK); if (unwritten) flags |= XFS_RMAP_UNWRITTEN; - trace_xfs_rmap_unmap(mp, cur->bc_ag.pag->pag_agno, bno, len, - unwritten, oinfo); + trace_xfs_rmap_unmap(cur, bno, len, unwritten, oinfo); /* * We should always have a left record because there's a static record @@ -648,10 +639,9 @@ xfs_rmap_unmap( goto out_error; } - trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_ag.pag->pag_agno, ltrec.rm_startblock, - ltrec.rm_blockcount, ltrec.rm_owner, - ltrec.rm_offset, ltrec.rm_flags); + trace_xfs_rmap_lookup_le_range_result(cur, ltrec.rm_startblock, + ltrec.rm_blockcount, ltrec.rm_owner, ltrec.rm_offset, + ltrec.rm_flags); ltoff = ltrec.rm_offset; /* @@ -718,10 +708,9 @@ xfs_rmap_unmap( if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) { /* exact match, simply remove the record from rmap tree */ - trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno, - ltrec.rm_startblock, ltrec.rm_blockcount, - ltrec.rm_owner, ltrec.rm_offset, - ltrec.rm_flags); + trace_xfs_rmap_delete(cur, ltrec.rm_startblock, + ltrec.rm_blockcount, ltrec.rm_owner, + ltrec.rm_offset, ltrec.rm_flags); error = xfs_btree_delete(cur, &i); if (error) goto out_error; @@ -797,8 +786,7 @@ xfs_rmap_unmap( else cur->bc_rec.r.rm_offset = offset + len; cur->bc_rec.r.rm_flags = flags; - trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, - cur->bc_rec.r.rm_startblock, + trace_xfs_rmap_insert(cur, cur->bc_rec.r.rm_startblock, cur->bc_rec.r.rm_blockcount, cur->bc_rec.r.rm_owner, cur->bc_rec.r.rm_offset, @@ -809,8 +797,7 @@ xfs_rmap_unmap( } out_done: - trace_xfs_rmap_unmap_done(mp, cur->bc_ag.pag->pag_agno, bno, len, - unwritten, oinfo); + trace_xfs_rmap_unmap_done(cur, bno, len, unwritten, oinfo); out_error: if (error) trace_xfs_rmap_unmap_error(cur, error, _RET_IP_); @@ -983,8 +970,7 @@ xfs_rmap_map( (flags & XFS_RMAP_BMBT_BLOCK); if (unwritten) flags |= XFS_RMAP_UNWRITTEN; - trace_xfs_rmap_map(mp, cur->bc_ag.pag->pag_agno, bno, len, - unwritten, oinfo); + trace_xfs_rmap_map(cur, bno, len, unwritten, oinfo); ASSERT(!xfs_rmap_should_skip_owner_update(oinfo)); /* @@ -997,8 +983,7 @@ xfs_rmap_map( if (error) goto out_error; if (have_lt) { - trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_ag.pag->pag_agno, ltrec.rm_startblock, + trace_xfs_rmap_lookup_le_range_result(cur, ltrec.rm_startblock, ltrec.rm_blockcount, ltrec.rm_owner, ltrec.rm_offset, ltrec.rm_flags); @@ -1036,10 +1021,10 @@ xfs_rmap_map( error = -EFSCORRUPTED; goto out_error; } - trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, - cur->bc_ag.pag->pag_agno, gtrec.rm_startblock, - gtrec.rm_blockcount, gtrec.rm_owner, - gtrec.rm_offset, gtrec.rm_flags); + trace_xfs_rmap_find_right_neighbor_result(cur, + gtrec.rm_startblock, gtrec.rm_blockcount, + gtrec.rm_owner, gtrec.rm_offset, + gtrec.rm_flags); if (!xfs_rmap_is_mergeable(>rec, owner, flags)) have_gt = 0; } @@ -1076,12 +1061,9 @@ xfs_rmap_map( * result: |rrrrrrrrrrrrrrrrrrrrrrrrrrrrr| */ ltrec.rm_blockcount += gtrec.rm_blockcount; - trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno, - gtrec.rm_startblock, - gtrec.rm_blockcount, - gtrec.rm_owner, - gtrec.rm_offset, - gtrec.rm_flags); + trace_xfs_rmap_delete(cur, gtrec.rm_startblock, + gtrec.rm_blockcount, gtrec.rm_owner, + gtrec.rm_offset, gtrec.rm_flags); error = xfs_btree_delete(cur, &i); if (error) goto out_error; @@ -1128,8 +1110,7 @@ xfs_rmap_map( cur->bc_rec.r.rm_owner = owner; cur->bc_rec.r.rm_offset = offset; cur->bc_rec.r.rm_flags = flags; - trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno, len, - owner, offset, flags); + trace_xfs_rmap_insert(cur, bno, len, owner, offset, flags); error = xfs_btree_insert(cur, &i); if (error) goto out_error; @@ -1140,8 +1121,7 @@ xfs_rmap_map( } } - trace_xfs_rmap_map_done(mp, cur->bc_ag.pag->pag_agno, bno, len, - unwritten, oinfo); + trace_xfs_rmap_map_done(cur, bno, len, unwritten, oinfo); out_error: if (error) trace_xfs_rmap_map_error(cur, error, _RET_IP_); @@ -1218,8 +1198,7 @@ xfs_rmap_convert( (flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)))); oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0; new_endoff = offset + len; - trace_xfs_rmap_convert(mp, cur->bc_ag.pag->pag_agno, bno, len, - unwritten, oinfo); + trace_xfs_rmap_convert(cur, bno, len, unwritten, oinfo); /* * For the initial lookup, look for an exact match or the left-adjacent @@ -1235,10 +1214,9 @@ xfs_rmap_convert( goto done; } - trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_ag.pag->pag_agno, PREV.rm_startblock, - PREV.rm_blockcount, PREV.rm_owner, - PREV.rm_offset, PREV.rm_flags); + trace_xfs_rmap_lookup_le_range_result(cur, PREV.rm_startblock, + PREV.rm_blockcount, PREV.rm_owner, PREV.rm_offset, + PREV.rm_flags); ASSERT(PREV.rm_offset <= offset); ASSERT(PREV.rm_offset + PREV.rm_blockcount >= new_endoff); @@ -1279,10 +1257,9 @@ xfs_rmap_convert( error = -EFSCORRUPTED; goto done; } - trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, - cur->bc_ag.pag->pag_agno, LEFT.rm_startblock, - LEFT.rm_blockcount, LEFT.rm_owner, - LEFT.rm_offset, LEFT.rm_flags); + trace_xfs_rmap_find_left_neighbor_result(cur, + LEFT.rm_startblock, LEFT.rm_blockcount, + LEFT.rm_owner, LEFT.rm_offset, LEFT.rm_flags); if (LEFT.rm_startblock + LEFT.rm_blockcount == bno && LEFT.rm_offset + LEFT.rm_blockcount == offset && xfs_rmap_is_mergeable(&LEFT, owner, newext)) @@ -1320,10 +1297,10 @@ xfs_rmap_convert( error = -EFSCORRUPTED; goto done; } - trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, - cur->bc_ag.pag->pag_agno, RIGHT.rm_startblock, - RIGHT.rm_blockcount, RIGHT.rm_owner, - RIGHT.rm_offset, RIGHT.rm_flags); + trace_xfs_rmap_find_right_neighbor_result(cur, + RIGHT.rm_startblock, RIGHT.rm_blockcount, + RIGHT.rm_owner, RIGHT.rm_offset, + RIGHT.rm_flags); if (bno + len == RIGHT.rm_startblock && offset + len == RIGHT.rm_offset && xfs_rmap_is_mergeable(&RIGHT, owner, newext)) @@ -1370,10 +1347,9 @@ xfs_rmap_convert( error = -EFSCORRUPTED; goto done; } - trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno, - RIGHT.rm_startblock, RIGHT.rm_blockcount, - RIGHT.rm_owner, RIGHT.rm_offset, - RIGHT.rm_flags); + trace_xfs_rmap_delete(cur, RIGHT.rm_startblock, + RIGHT.rm_blockcount, RIGHT.rm_owner, + RIGHT.rm_offset, RIGHT.rm_flags); error = xfs_btree_delete(cur, &i); if (error) goto done; @@ -1390,10 +1366,9 @@ xfs_rmap_convert( error = -EFSCORRUPTED; goto done; } - trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno, - PREV.rm_startblock, PREV.rm_blockcount, - PREV.rm_owner, PREV.rm_offset, - PREV.rm_flags); + trace_xfs_rmap_delete(cur, PREV.rm_startblock, + PREV.rm_blockcount, PREV.rm_owner, + PREV.rm_offset, PREV.rm_flags); error = xfs_btree_delete(cur, &i); if (error) goto done; @@ -1422,10 +1397,9 @@ xfs_rmap_convert( * Setting all of a previous oldext extent to newext. * The left neighbor is contiguous, the right is not. */ - trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno, - PREV.rm_startblock, PREV.rm_blockcount, - PREV.rm_owner, PREV.rm_offset, - PREV.rm_flags); + trace_xfs_rmap_delete(cur, PREV.rm_startblock, + PREV.rm_blockcount, PREV.rm_owner, + PREV.rm_offset, PREV.rm_flags); error = xfs_btree_delete(cur, &i); if (error) goto done; @@ -1462,10 +1436,9 @@ xfs_rmap_convert( error = -EFSCORRUPTED; goto done; } - trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno, - RIGHT.rm_startblock, RIGHT.rm_blockcount, - RIGHT.rm_owner, RIGHT.rm_offset, - RIGHT.rm_flags); + trace_xfs_rmap_delete(cur, RIGHT.rm_startblock, + RIGHT.rm_blockcount, RIGHT.rm_owner, + RIGHT.rm_offset, RIGHT.rm_flags); error = xfs_btree_delete(cur, &i); if (error) goto done; @@ -1543,8 +1516,7 @@ xfs_rmap_convert( NEW.rm_blockcount = len; NEW.rm_flags = newext; cur->bc_rec.r = NEW; - trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno, - len, owner, offset, newext); + trace_xfs_rmap_insert(cur, bno, len, owner, offset, newext); error = xfs_btree_insert(cur, &i); if (error) goto done; @@ -1602,8 +1574,7 @@ xfs_rmap_convert( NEW.rm_blockcount = len; NEW.rm_flags = newext; cur->bc_rec.r = NEW; - trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno, - len, owner, offset, newext); + trace_xfs_rmap_insert(cur, bno, len, owner, offset, newext); error = xfs_btree_insert(cur, &i); if (error) goto done; @@ -1634,9 +1605,8 @@ xfs_rmap_convert( NEW = PREV; NEW.rm_blockcount = offset - PREV.rm_offset; cur->bc_rec.r = NEW; - trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, - NEW.rm_startblock, NEW.rm_blockcount, - NEW.rm_owner, NEW.rm_offset, + trace_xfs_rmap_insert(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, NEW.rm_offset, NEW.rm_flags); error = xfs_btree_insert(cur, &i); if (error) @@ -1663,8 +1633,7 @@ xfs_rmap_convert( /* new middle extent - newext */ cur->bc_rec.r.rm_flags &= ~XFS_RMAP_UNWRITTEN; cur->bc_rec.r.rm_flags |= newext; - trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno, len, - owner, offset, newext); + trace_xfs_rmap_insert(cur, bno, len, owner, offset, newext); error = xfs_btree_insert(cur, &i); if (error) goto done; @@ -1688,8 +1657,7 @@ xfs_rmap_convert( ASSERT(0); } - trace_xfs_rmap_convert_done(mp, cur->bc_ag.pag->pag_agno, bno, len, - unwritten, oinfo); + trace_xfs_rmap_convert_done(cur, bno, len, unwritten, oinfo); done: if (error) trace_xfs_rmap_convert_error(cur, error, _RET_IP_); @@ -1728,8 +1696,7 @@ xfs_rmap_convert_shared( (flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)))); oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0; new_endoff = offset + len; - trace_xfs_rmap_convert(mp, cur->bc_ag.pag->pag_agno, bno, len, - unwritten, oinfo); + trace_xfs_rmap_convert(cur, bno, len, unwritten, oinfo); /* * For the initial lookup, look for and exact match or the left-adjacent @@ -1798,10 +1765,10 @@ xfs_rmap_convert_shared( error = -EFSCORRUPTED; goto done; } - trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, - cur->bc_ag.pag->pag_agno, RIGHT.rm_startblock, - RIGHT.rm_blockcount, RIGHT.rm_owner, - RIGHT.rm_offset, RIGHT.rm_flags); + trace_xfs_rmap_find_right_neighbor_result(cur, + RIGHT.rm_startblock, RIGHT.rm_blockcount, + RIGHT.rm_owner, RIGHT.rm_offset, + RIGHT.rm_flags); if (xfs_rmap_is_mergeable(&RIGHT, owner, newext)) state |= RMAP_RIGHT_CONTIG; } @@ -2113,8 +2080,7 @@ xfs_rmap_convert_shared( ASSERT(0); } - trace_xfs_rmap_convert_done(mp, cur->bc_ag.pag->pag_agno, bno, len, - unwritten, oinfo); + trace_xfs_rmap_convert_done(cur, bno, len, unwritten, oinfo); done: if (error) trace_xfs_rmap_convert_error(cur, error, _RET_IP_); @@ -2155,8 +2121,7 @@ xfs_rmap_unmap_shared( xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); if (unwritten) flags |= XFS_RMAP_UNWRITTEN; - trace_xfs_rmap_unmap(mp, cur->bc_ag.pag->pag_agno, bno, len, - unwritten, oinfo); + trace_xfs_rmap_unmap(cur, bno, len, unwritten, oinfo); /* * We should always have a left record because there's a static record @@ -2312,8 +2277,7 @@ xfs_rmap_unmap_shared( goto out_error; } - trace_xfs_rmap_unmap_done(mp, cur->bc_ag.pag->pag_agno, bno, len, - unwritten, oinfo); + trace_xfs_rmap_unmap_done(cur, bno, len, unwritten, oinfo); out_error: if (error) trace_xfs_rmap_unmap_error(cur, error, _RET_IP_); @@ -2351,8 +2315,7 @@ xfs_rmap_map_shared( xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); if (unwritten) flags |= XFS_RMAP_UNWRITTEN; - trace_xfs_rmap_map(mp, cur->bc_ag.pag->pag_agno, bno, len, - unwritten, oinfo); + trace_xfs_rmap_map(cur, bno, len, unwritten, oinfo); /* Is there a left record that abuts our range? */ error = xfs_rmap_find_left_neighbor(cur, bno, owner, offset, flags, @@ -2377,10 +2340,10 @@ xfs_rmap_map_shared( error = -EFSCORRUPTED; goto out_error; } - trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, - cur->bc_ag.pag->pag_agno, gtrec.rm_startblock, - gtrec.rm_blockcount, gtrec.rm_owner, - gtrec.rm_offset, gtrec.rm_flags); + trace_xfs_rmap_find_right_neighbor_result(cur, + gtrec.rm_startblock, gtrec.rm_blockcount, + gtrec.rm_owner, gtrec.rm_offset, + gtrec.rm_flags); if (!xfs_rmap_is_mergeable(>rec, owner, flags)) have_gt = 0; @@ -2472,8 +2435,7 @@ xfs_rmap_map_shared( goto out_error; } - trace_xfs_rmap_map_done(mp, cur->bc_ag.pag->pag_agno, bno, len, - unwritten, oinfo); + trace_xfs_rmap_map_done(cur, bno, len, unwritten, oinfo); out_error: if (error) trace_xfs_rmap_map_error(cur, error, _RET_IP_); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index fb956ffb9d06..462eb84f9a85 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -2786,10 +2786,10 @@ DEFINE_DEFER_PENDING_ITEM_EVENT(xfs_defer_finish_item); /* rmap tracepoints */ DECLARE_EVENT_CLASS(xfs_rmap_class, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t agbno, xfs_extlen_t len, bool unwritten, const struct xfs_owner_info *oinfo), - TP_ARGS(mp, agno, agbno, len, unwritten, oinfo), + TP_ARGS(cur, agbno, len, unwritten, oinfo), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -2800,8 +2800,8 @@ DECLARE_EVENT_CLASS(xfs_rmap_class, __field(unsigned long, flags) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = cur->bc_mp->m_super->s_dev; + __entry->agno = cur->bc_ag.pag->pag_agno; __entry->agbno = agbno; __entry->len = len; __entry->owner = oinfo->oi_owner; @@ -2821,10 +2821,10 @@ DECLARE_EVENT_CLASS(xfs_rmap_class, ); #define DEFINE_RMAP_EVENT(name) \ DEFINE_EVENT(xfs_rmap_class, name, \ - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + TP_PROTO(struct xfs_btree_cur *cur, \ xfs_agblock_t agbno, xfs_extlen_t len, bool unwritten, \ const struct xfs_owner_info *oinfo), \ - TP_ARGS(mp, agno, agbno, len, unwritten, oinfo)) + TP_ARGS(cur, agbno, len, unwritten, oinfo)) /* btree cursor error/%ip tracepoint class */ DECLARE_EVENT_CLASS(xfs_btree_error_class, @@ -2920,10 +2920,10 @@ TRACE_EVENT(xfs_rmap_convert_state, ); DECLARE_EVENT_CLASS(xfs_rmapbt_class, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t agbno, xfs_extlen_t len, uint64_t owner, uint64_t offset, unsigned int flags), - TP_ARGS(mp, agno, agbno, len, owner, offset, flags), + TP_ARGS(cur, agbno, len, owner, offset, flags), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -2934,8 +2934,8 @@ DECLARE_EVENT_CLASS(xfs_rmapbt_class, __field(unsigned int, flags) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = cur->bc_mp->m_super->s_dev; + __entry->agno = cur->bc_ag.pag->pag_agno; __entry->agbno = agbno; __entry->len = len; __entry->owner = owner; @@ -2953,10 +2953,10 @@ DECLARE_EVENT_CLASS(xfs_rmapbt_class, ); #define DEFINE_RMAPBT_EVENT(name) \ DEFINE_EVENT(xfs_rmapbt_class, name, \ - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + TP_PROTO(struct xfs_btree_cur *cur, \ xfs_agblock_t agbno, xfs_extlen_t len, \ uint64_t owner, uint64_t offset, unsigned int flags), \ - TP_ARGS(mp, agno, agbno, len, owner, offset, flags)) + TP_ARGS(cur, agbno, len, owner, offset, flags)) DECLARE_EVENT_CLASS(xfs_rmap_deferred_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, -- cgit v1.2.3-70-g09d2 From fbe8c7e167a6b226ae0234c26ebb65d8401473a5 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Jul 2024 11:22:59 -0700 Subject: xfs: clean up rmap log intent item tracepoint callsites Pass the incore rmap structure to the tracepoints instead of open-coding the argument passing. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_rmap.c | 22 ++++-------------- fs/xfs/libxfs/xfs_rmap.h | 10 ++++++++ fs/xfs/xfs_trace.c | 1 + fs/xfs/xfs_trace.h | 59 +++++++++++++++++++++++------------------------- 4 files changed, 44 insertions(+), 48 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index ce8ea3c84283..637a4b1db9b9 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -2585,20 +2585,15 @@ xfs_rmap_finish_one( struct xfs_rmap_intent *ri, struct xfs_btree_cur **pcur) { + struct xfs_owner_info oinfo; struct xfs_mount *mp = tp->t_mountp; struct xfs_btree_cur *rcur; struct xfs_buf *agbp = NULL; - int error = 0; - struct xfs_owner_info oinfo; xfs_agblock_t bno; bool unwritten; + int error = 0; - bno = XFS_FSB_TO_AGBNO(mp, ri->ri_bmap.br_startblock); - - trace_xfs_rmap_deferred(mp, ri->ri_pag->pag_agno, ri->ri_type, bno, - ri->ri_owner, ri->ri_whichfork, - ri->ri_bmap.br_startoff, ri->ri_bmap.br_blockcount, - ri->ri_bmap.br_state); + trace_xfs_rmap_deferred(mp, ri); if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_RMAP_FINISH_ONE)) return -EIO; @@ -2673,15 +2668,6 @@ __xfs_rmap_add( { struct xfs_rmap_intent *ri; - trace_xfs_rmap_defer(tp->t_mountp, - XFS_FSB_TO_AGNO(tp->t_mountp, bmap->br_startblock), - type, - XFS_FSB_TO_AGBNO(tp->t_mountp, bmap->br_startblock), - owner, whichfork, - bmap->br_startoff, - bmap->br_blockcount, - bmap->br_state); - ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_KERNEL | __GFP_NOFAIL); INIT_LIST_HEAD(&ri->ri_list); ri->ri_type = type; @@ -2689,6 +2675,8 @@ __xfs_rmap_add( ri->ri_whichfork = whichfork; ri->ri_bmap = *bmap; + trace_xfs_rmap_defer(tp->t_mountp, ri); + xfs_rmap_update_get_group(tp->t_mountp, ri); xfs_defer_add(tp, &ri->ri_list, &xfs_rmap_update_defer_type); } diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index 9d01fe689497..731c97137b5a 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -157,6 +157,16 @@ enum xfs_rmap_intent_type { XFS_RMAP_FREE, }; +#define XFS_RMAP_INTENT_STRINGS \ + { XFS_RMAP_MAP, "map" }, \ + { XFS_RMAP_MAP_SHARED, "map_shared" }, \ + { XFS_RMAP_UNMAP, "unmap" }, \ + { XFS_RMAP_UNMAP_SHARED, "unmap_shared" }, \ + { XFS_RMAP_CONVERT, "cvt" }, \ + { XFS_RMAP_CONVERT_SHARED, "cvt_shared" }, \ + { XFS_RMAP_ALLOC, "alloc" }, \ + { XFS_RMAP_FREE, "free" } + struct xfs_rmap_intent { struct list_head ri_list; enum xfs_rmap_intent_type ri_type; diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index e1ec56d95791..ae3017812089 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -41,6 +41,7 @@ #include "xfs_exchmaps.h" #include "xfs_exchrange.h" #include "xfs_parent.h" +#include "xfs_rmap.h" /* * We include this last to have the helpers above available for the trace diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 462eb84f9a85..0c235bdea738 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -91,6 +91,7 @@ struct xfs_getparents; struct xfs_parent_irec; struct xfs_attrlist_cursor_kern; struct xfs_extent_free_item; +struct xfs_rmap_intent; #define XFS_ATTR_FILTER_FLAGS \ { XFS_ATTR_ROOT, "ROOT" }, \ @@ -2958,20 +2959,22 @@ DEFINE_EVENT(xfs_rmapbt_class, name, \ uint64_t owner, uint64_t offset, unsigned int flags), \ TP_ARGS(cur, agbno, len, owner, offset, flags)) +TRACE_DEFINE_ENUM(XFS_RMAP_MAP); +TRACE_DEFINE_ENUM(XFS_RMAP_MAP_SHARED); +TRACE_DEFINE_ENUM(XFS_RMAP_UNMAP); +TRACE_DEFINE_ENUM(XFS_RMAP_UNMAP_SHARED); +TRACE_DEFINE_ENUM(XFS_RMAP_CONVERT); +TRACE_DEFINE_ENUM(XFS_RMAP_CONVERT_SHARED); +TRACE_DEFINE_ENUM(XFS_RMAP_ALLOC); +TRACE_DEFINE_ENUM(XFS_RMAP_FREE); + DECLARE_EVENT_CLASS(xfs_rmap_deferred_class, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - int op, - xfs_agblock_t agbno, - xfs_ino_t ino, - int whichfork, - xfs_fileoff_t offset, - xfs_filblks_t len, - xfs_exntst_t state), - TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state), + TP_PROTO(struct xfs_mount *mp, struct xfs_rmap_intent *ri), + TP_ARGS(mp, ri), TP_STRUCT__entry( __field(dev_t, dev) + __field(unsigned long long, owner) __field(xfs_agnumber_t, agno) - __field(xfs_ino_t, ino) __field(xfs_agblock_t, agbno) __field(int, whichfork) __field(xfs_fileoff_t, l_loff) @@ -2981,21 +2984,22 @@ DECLARE_EVENT_CLASS(xfs_rmap_deferred_class, ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; - __entry->ino = ino; - __entry->agbno = agbno; - __entry->whichfork = whichfork; - __entry->l_loff = offset; - __entry->l_len = len; - __entry->l_state = state; - __entry->op = op; - ), - TP_printk("dev %d:%d op %d agno 0x%x agbno 0x%x owner 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d", + __entry->agno = XFS_FSB_TO_AGNO(mp, ri->ri_bmap.br_startblock); + __entry->agbno = XFS_FSB_TO_AGBNO(mp, + ri->ri_bmap.br_startblock); + __entry->owner = ri->ri_owner; + __entry->whichfork = ri->ri_whichfork; + __entry->l_loff = ri->ri_bmap.br_startoff; + __entry->l_len = ri->ri_bmap.br_blockcount; + __entry->l_state = ri->ri_bmap.br_state; + __entry->op = ri->ri_type; + ), + TP_printk("dev %d:%d op %s agno 0x%x agbno 0x%x owner 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->op, + __print_symbolic(__entry->op, XFS_RMAP_INTENT_STRINGS), __entry->agno, __entry->agbno, - __entry->ino, + __entry->owner, __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), __entry->l_loff, __entry->l_len, @@ -3003,15 +3007,8 @@ DECLARE_EVENT_CLASS(xfs_rmap_deferred_class, ); #define DEFINE_RMAP_DEFERRED_EVENT(name) \ DEFINE_EVENT(xfs_rmap_deferred_class, name, \ - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ - int op, \ - xfs_agblock_t agbno, \ - xfs_ino_t ino, \ - int whichfork, \ - xfs_fileoff_t offset, \ - xfs_filblks_t len, \ - xfs_exntst_t state), \ - TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state)) + TP_PROTO(struct xfs_mount *mp, struct xfs_rmap_intent *ri), \ + TP_ARGS(mp, ri)) DEFINE_RMAP_DEFERRED_EVENT(xfs_rmap_defer); DEFINE_RMAP_DEFERRED_EVENT(xfs_rmap_deferred); -- cgit v1.2.3-70-g09d2 From c9099a28c264a9284171a3d56932e44f0e8b4cfa Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Jul 2024 11:23:00 -0700 Subject: xfs: remove xfs_trans_set_rmap_flags Remove this single-use helper. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_rmap_item.c | 79 ++++++++++++++++++++++---------------------------- 1 file changed, 34 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 2e732aded58e..7e998a7eb042 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -226,49 +226,6 @@ static const struct xfs_item_ops xfs_rud_item_ops = { .iop_intent = xfs_rud_item_intent, }; -/* Set the map extent flags for this reverse mapping. */ -static void -xfs_trans_set_rmap_flags( - struct xfs_map_extent *map, - enum xfs_rmap_intent_type type, - int whichfork, - xfs_exntst_t state) -{ - map->me_flags = 0; - if (state == XFS_EXT_UNWRITTEN) - map->me_flags |= XFS_RMAP_EXTENT_UNWRITTEN; - if (whichfork == XFS_ATTR_FORK) - map->me_flags |= XFS_RMAP_EXTENT_ATTR_FORK; - switch (type) { - case XFS_RMAP_MAP: - map->me_flags |= XFS_RMAP_EXTENT_MAP; - break; - case XFS_RMAP_MAP_SHARED: - map->me_flags |= XFS_RMAP_EXTENT_MAP_SHARED; - break; - case XFS_RMAP_UNMAP: - map->me_flags |= XFS_RMAP_EXTENT_UNMAP; - break; - case XFS_RMAP_UNMAP_SHARED: - map->me_flags |= XFS_RMAP_EXTENT_UNMAP_SHARED; - break; - case XFS_RMAP_CONVERT: - map->me_flags |= XFS_RMAP_EXTENT_CONVERT; - break; - case XFS_RMAP_CONVERT_SHARED: - map->me_flags |= XFS_RMAP_EXTENT_CONVERT_SHARED; - break; - case XFS_RMAP_ALLOC: - map->me_flags |= XFS_RMAP_EXTENT_ALLOC; - break; - case XFS_RMAP_FREE: - map->me_flags |= XFS_RMAP_EXTENT_FREE; - break; - default: - ASSERT(0); - } -} - /* Sort rmap intents by AG. */ static int xfs_rmap_update_diff_items( @@ -307,8 +264,40 @@ xfs_rmap_update_log_item( map->me_startblock = ri->ri_bmap.br_startblock; map->me_startoff = ri->ri_bmap.br_startoff; map->me_len = ri->ri_bmap.br_blockcount; - xfs_trans_set_rmap_flags(map, ri->ri_type, ri->ri_whichfork, - ri->ri_bmap.br_state); + + map->me_flags = 0; + if (ri->ri_bmap.br_state == XFS_EXT_UNWRITTEN) + map->me_flags |= XFS_RMAP_EXTENT_UNWRITTEN; + if (ri->ri_whichfork == XFS_ATTR_FORK) + map->me_flags |= XFS_RMAP_EXTENT_ATTR_FORK; + switch (ri->ri_type) { + case XFS_RMAP_MAP: + map->me_flags |= XFS_RMAP_EXTENT_MAP; + break; + case XFS_RMAP_MAP_SHARED: + map->me_flags |= XFS_RMAP_EXTENT_MAP_SHARED; + break; + case XFS_RMAP_UNMAP: + map->me_flags |= XFS_RMAP_EXTENT_UNMAP; + break; + case XFS_RMAP_UNMAP_SHARED: + map->me_flags |= XFS_RMAP_EXTENT_UNMAP_SHARED; + break; + case XFS_RMAP_CONVERT: + map->me_flags |= XFS_RMAP_EXTENT_CONVERT; + break; + case XFS_RMAP_CONVERT_SHARED: + map->me_flags |= XFS_RMAP_EXTENT_CONVERT_SHARED; + break; + case XFS_RMAP_ALLOC: + map->me_flags |= XFS_RMAP_EXTENT_ALLOC; + break; + case XFS_RMAP_FREE: + map->me_flags |= XFS_RMAP_EXTENT_FREE; + break; + default: + ASSERT(0); + } } static struct xfs_log_item * -- cgit v1.2.3-70-g09d2 From f93963779b438a33ca4b13384c070a6864ce2b2b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Jul 2024 11:23:01 -0700 Subject: xfs: add a ri_entry helper Add a helper to translate from the item list head to the rmap_intent_item structure and use it so shorten assignments and avoid the need for extra local variables. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_rmap_item.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 7e998a7eb042..1cc1ec597a0b 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -226,6 +226,11 @@ static const struct xfs_item_ops xfs_rud_item_ops = { .iop_intent = xfs_rud_item_intent, }; +static inline struct xfs_rmap_intent *ri_entry(const struct list_head *e) +{ + return list_entry(e, struct xfs_rmap_intent, ri_list); +} + /* Sort rmap intents by AG. */ static int xfs_rmap_update_diff_items( @@ -233,11 +238,8 @@ xfs_rmap_update_diff_items( const struct list_head *a, const struct list_head *b) { - struct xfs_rmap_intent *ra; - struct xfs_rmap_intent *rb; - - ra = container_of(a, struct xfs_rmap_intent, ri_list); - rb = container_of(b, struct xfs_rmap_intent, ri_list); + struct xfs_rmap_intent *ra = ri_entry(a); + struct xfs_rmap_intent *rb = ri_entry(b); return ra->ri_pag->pag_agno - rb->ri_pag->pag_agno; } @@ -364,11 +366,9 @@ xfs_rmap_update_finish_item( struct list_head *item, struct xfs_btree_cur **state) { - struct xfs_rmap_intent *ri; + struct xfs_rmap_intent *ri = ri_entry(item); int error; - ri = container_of(item, struct xfs_rmap_intent, ri_list); - error = xfs_rmap_finish_one(tp, ri, state); xfs_rmap_update_put_group(ri); @@ -389,9 +389,7 @@ STATIC void xfs_rmap_update_cancel_item( struct list_head *item) { - struct xfs_rmap_intent *ri; - - ri = container_of(item, struct xfs_rmap_intent, ri_list); + struct xfs_rmap_intent *ri = ri_entry(item); xfs_rmap_update_put_group(ri); kmem_cache_free(xfs_rmap_intent_cache, ri); -- cgit v1.2.3-70-g09d2 From 37f9d1db03ba0511403c5d25ba0baaddf5208ba7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Jul 2024 11:23:01 -0700 Subject: xfs: reuse xfs_rmap_update_cancel_item Reuse xfs_rmap_update_cancel_item to put the AG/RTG and free the item in a few places that currently open code the logic. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_rmap_item.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 1cc1ec597a0b..68e4ce0dbd72 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -358,6 +358,17 @@ xfs_rmap_update_put_group( xfs_perag_intent_put(ri->ri_pag); } +/* Cancel a deferred rmap update. */ +STATIC void +xfs_rmap_update_cancel_item( + struct list_head *item) +{ + struct xfs_rmap_intent *ri = ri_entry(item); + + xfs_rmap_update_put_group(ri); + kmem_cache_free(xfs_rmap_intent_cache, ri); +} + /* Process a deferred rmap update. */ STATIC int xfs_rmap_update_finish_item( @@ -371,8 +382,7 @@ xfs_rmap_update_finish_item( error = xfs_rmap_finish_one(tp, ri, state); - xfs_rmap_update_put_group(ri); - kmem_cache_free(xfs_rmap_intent_cache, ri); + xfs_rmap_update_cancel_item(item); return error; } @@ -384,17 +394,6 @@ xfs_rmap_update_abort_intent( xfs_rui_release(RUI_ITEM(intent)); } -/* Cancel a deferred rmap update. */ -STATIC void -xfs_rmap_update_cancel_item( - struct list_head *item) -{ - struct xfs_rmap_intent *ri = ri_entry(item); - - xfs_rmap_update_put_group(ri); - kmem_cache_free(xfs_rmap_intent_cache, ri); -} - /* Is this recovered RUI ok? */ static inline bool xfs_rui_validate_map( -- cgit v1.2.3-70-g09d2 From 8363b4361997044ecb99880a1a9bfdebf9145eed Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Jul 2024 11:23:02 -0700 Subject: xfs: don't bother calling xfs_rmap_finish_one_cleanup in xfs_rmap_finish_one In xfs_rmap_finish_one we known the cursor is non-zero when calling xfs_rmap_finish_one_cleanup and we pass a 0 error variable. This means xfs_rmap_finish_one_cleanup is just doing a xfs_btree_del_cursor. Open code that and move xfs_rmap_finish_one_cleanup to fs/xfs/xfs_rmap_item.c. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong [djwong: minor porting changes] Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_rmap.c | 19 +------------------ fs/xfs/libxfs/xfs_rmap.h | 2 -- fs/xfs/xfs_rmap_item.c | 18 ++++++++++++++++++ 3 files changed, 19 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 637a4b1db9b9..0ee97f1698e9 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -2523,23 +2523,6 @@ xfs_rmap_query_all( return xfs_btree_query_all(cur, xfs_rmap_query_range_helper, &query); } -/* Clean up after calling xfs_rmap_finish_one. */ -void -xfs_rmap_finish_one_cleanup( - struct xfs_trans *tp, - struct xfs_btree_cur *rcur, - int error) -{ - struct xfs_buf *agbp; - - if (rcur == NULL) - return; - agbp = rcur->bc_ag.agbp; - xfs_btree_del_cursor(rcur, error); - if (error) - xfs_trans_brelse(tp, agbp); -} - /* Commit an rmap operation into the ondisk tree. */ int __xfs_rmap_finish_intent( @@ -2604,7 +2587,7 @@ xfs_rmap_finish_one( */ rcur = *pcur; if (rcur != NULL && rcur->bc_ag.pag != ri->ri_pag) { - xfs_rmap_finish_one_cleanup(tp, rcur, 0); + xfs_btree_del_cursor(rcur, 0); rcur = NULL; *pcur = NULL; } diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index 731c97137b5a..9d85dd2a6553 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -192,8 +192,6 @@ void xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_agnumber_t agno, void xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner); -void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp, - struct xfs_btree_cur *rcur, int error); int xfs_rmap_finish_one(struct xfs_trans *tp, struct xfs_rmap_intent *ri, struct xfs_btree_cur **pcur); int __xfs_rmap_finish_intent(struct xfs_btree_cur *rcur, diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 68e4ce0dbd72..44a9b77c1763 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -21,6 +21,7 @@ #include "xfs_log_priv.h" #include "xfs_log_recover.h" #include "xfs_ag.h" +#include "xfs_btree.h" struct kmem_cache *xfs_rui_cache; struct kmem_cache *xfs_rud_cache; @@ -386,6 +387,23 @@ xfs_rmap_update_finish_item( return error; } +/* Clean up after calling xfs_rmap_finish_one. */ +STATIC void +xfs_rmap_finish_one_cleanup( + struct xfs_trans *tp, + struct xfs_btree_cur *rcur, + int error) +{ + struct xfs_buf *agbp = NULL; + + if (rcur == NULL) + return; + agbp = rcur->bc_ag.agbp; + xfs_btree_del_cursor(rcur, error); + if (error && agbp) + xfs_trans_brelse(tp, agbp); +} + /* Abort all pending RUIs. */ STATIC void xfs_rmap_update_abort_intent( -- cgit v1.2.3-70-g09d2 From 905af72610d90f58f994feff4ead1fc258f5d2b1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Jul 2024 11:23:03 -0700 Subject: xfs: simplify usage of the rcur local variable in xfs_rmap_finish_one Only update rcur when we know the final *pcur value. Signed-off-by: Christoph Hellwig [djwong: don't leave the caller with a dangling ref] Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_rmap.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 0ee97f1698e9..a5a0fa6a5b5d 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -2570,7 +2570,7 @@ xfs_rmap_finish_one( { struct xfs_owner_info oinfo; struct xfs_mount *mp = tp->t_mountp; - struct xfs_btree_cur *rcur; + struct xfs_btree_cur *rcur = *pcur; struct xfs_buf *agbp = NULL; xfs_agblock_t bno; bool unwritten; @@ -2585,7 +2585,6 @@ xfs_rmap_finish_one( * If we haven't gotten a cursor or the cursor AG doesn't match * the startblock, get one now. */ - rcur = *pcur; if (rcur != NULL && rcur->bc_ag.pag != ri->ri_pag) { xfs_btree_del_cursor(rcur, 0); rcur = NULL; @@ -2607,9 +2606,8 @@ xfs_rmap_finish_one( return -EFSCORRUPTED; } - rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, ri->ri_pag); + *pcur = rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, ri->ri_pag); } - *pcur = rcur; xfs_rmap_ino_owner(&oinfo, ri->ri_owner, ri->ri_whichfork, ri->ri_bmap.br_startoff); -- cgit v1.2.3-70-g09d2 From ea7b0820d960d5a3ee72bc67cbd8b5d47c67aa4c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Jul 2024 11:23:04 -0700 Subject: xfs: move xfs_rmap_update_defer_add to xfs_rmap_item.c Move the code that adds the incore xfs_rmap_update_item deferred work data to a transaction to live with the RUI log item code. This means that the rmap code no longer has to know about the inner workings of the RUI log items. As a consequence, we can get rid of the _{get,put}_group helpers. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_rmap.c | 6 ++---- fs/xfs/libxfs/xfs_rmap.h | 3 --- fs/xfs/xfs_rmap_item.c | 24 +++++++++++------------- fs/xfs/xfs_rmap_item.h | 4 ++++ 4 files changed, 17 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index a5a0fa6a5b5d..6ef4687b3aba 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -24,6 +24,7 @@ #include "xfs_inode.h" #include "xfs_ag.h" #include "xfs_health.h" +#include "xfs_rmap_item.h" struct kmem_cache *xfs_rmap_intent_cache; @@ -2656,10 +2657,7 @@ __xfs_rmap_add( ri->ri_whichfork = whichfork; ri->ri_bmap = *bmap; - trace_xfs_rmap_defer(tp->t_mountp, ri); - - xfs_rmap_update_get_group(tp->t_mountp, ri); - xfs_defer_add(tp, &ri->ri_list, &xfs_rmap_update_defer_type); + xfs_rmap_defer_add(tp, ri); } /* Map an extent into a file. */ diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index 9d85dd2a6553..b783dd4dd95d 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -176,9 +176,6 @@ struct xfs_rmap_intent { struct xfs_perag *ri_pag; }; -void xfs_rmap_update_get_group(struct xfs_mount *mp, - struct xfs_rmap_intent *ri); - /* functions for updating the rmapbt based on bmbt map/unmap operations */ void xfs_rmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *imap); diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 44a9b77c1763..88b5580e1e19 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -22,6 +22,7 @@ #include "xfs_log_recover.h" #include "xfs_ag.h" #include "xfs_btree.h" +#include "xfs_trace.h" struct kmem_cache *xfs_rui_cache; struct kmem_cache *xfs_rud_cache; @@ -342,21 +343,18 @@ xfs_rmap_update_create_done( return &rudp->rud_item; } -/* Take a passive ref to the AG containing the space we're rmapping. */ +/* Add this deferred RUI to the transaction. */ void -xfs_rmap_update_get_group( - struct xfs_mount *mp, +xfs_rmap_defer_add( + struct xfs_trans *tp, struct xfs_rmap_intent *ri) { - ri->ri_pag = xfs_perag_intent_get(mp, ri->ri_bmap.br_startblock); -} + struct xfs_mount *mp = tp->t_mountp; -/* Release a passive AG ref after finishing rmapping work. */ -static inline void -xfs_rmap_update_put_group( - struct xfs_rmap_intent *ri) -{ - xfs_perag_intent_put(ri->ri_pag); + trace_xfs_rmap_defer(mp, ri); + + ri->ri_pag = xfs_perag_intent_get(mp, ri->ri_bmap.br_startblock); + xfs_defer_add(tp, &ri->ri_list, &xfs_rmap_update_defer_type); } /* Cancel a deferred rmap update. */ @@ -366,7 +364,7 @@ xfs_rmap_update_cancel_item( { struct xfs_rmap_intent *ri = ri_entry(item); - xfs_rmap_update_put_group(ri); + xfs_perag_intent_put(ri->ri_pag); kmem_cache_free(xfs_rmap_intent_cache, ri); } @@ -496,7 +494,7 @@ xfs_rui_recover_work( ri->ri_bmap.br_blockcount = map->me_len; ri->ri_bmap.br_state = (map->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ? XFS_EXT_UNWRITTEN : XFS_EXT_NORM; - xfs_rmap_update_get_group(mp, ri); + ri->ri_pag = xfs_perag_intent_get(mp, map->me_startblock); xfs_defer_add_item(dfp, &ri->ri_list); } diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h index 802e5119eaca..40d331555675 100644 --- a/fs/xfs/xfs_rmap_item.h +++ b/fs/xfs/xfs_rmap_item.h @@ -71,4 +71,8 @@ struct xfs_rud_log_item { extern struct kmem_cache *xfs_rui_cache; extern struct kmem_cache *xfs_rud_cache; +struct xfs_rmap_intent; + +void xfs_rmap_defer_add(struct xfs_trans *tp, struct xfs_rmap_intent *ri); + #endif /* __XFS_RMAP_ITEM_H__ */ -- cgit v1.2.3-70-g09d2 From 7cf2663ff1cfb20f5fe025122016b68920b28041 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Jul 2024 11:23:05 -0700 Subject: xfs: give refcount btree cursor error tracepoints their own class Convert all the refcount tracepoints to use the btree error tracepoint class. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_refcount.c | 42 ++++++++++++++---------------------------- fs/xfs/xfs_trace.h | 26 +++++++++++++------------- 2 files changed, 27 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 4d8bb760c723..77acd311aa55 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -211,8 +211,7 @@ xfs_refcount_update( error = xfs_btree_update(cur, &rec); if (error) - trace_xfs_refcount_update_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_refcount_update_error(cur, error, _RET_IP_); return error; } @@ -247,8 +246,7 @@ xfs_refcount_insert( out_error: if (error) - trace_xfs_refcount_insert_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_refcount_insert_error(cur, error, _RET_IP_); return error; } @@ -288,8 +286,7 @@ xfs_refcount_delete( &found_rec); out_error: if (error) - trace_xfs_refcount_delete_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_refcount_delete_error(cur, error, _RET_IP_); return error; } @@ -438,8 +435,7 @@ xfs_refcount_split_extent( return error; out_error: - trace_xfs_refcount_split_extent_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_refcount_split_extent_error(cur, error, _RET_IP_); return error; } @@ -522,8 +518,7 @@ xfs_refcount_merge_center_extents( return error; out_error: - trace_xfs_refcount_merge_center_extents_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_refcount_merge_center_extents_error(cur, error, _RET_IP_); return error; } @@ -589,8 +584,7 @@ xfs_refcount_merge_left_extent( return error; out_error: - trace_xfs_refcount_merge_left_extent_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_refcount_merge_left_extent_error(cur, error, _RET_IP_); return error; } @@ -658,8 +652,7 @@ xfs_refcount_merge_right_extent( return error; out_error: - trace_xfs_refcount_merge_right_extent_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_refcount_merge_right_extent_error(cur, error, _RET_IP_); return error; } @@ -753,8 +746,7 @@ not_found: return error; out_error: - trace_xfs_refcount_find_left_extent_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_refcount_find_left_extent_error(cur, error, _RET_IP_); return error; } @@ -848,8 +840,7 @@ not_found: return error; out_error: - trace_xfs_refcount_find_right_extent_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_refcount_find_right_extent_error(cur, error, _RET_IP_); return error; } @@ -1254,8 +1245,7 @@ advloop: return error; out_error: - trace_xfs_refcount_modify_extent_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_refcount_modify_extent_error(cur, error, _RET_IP_); return error; } @@ -1315,8 +1305,7 @@ xfs_refcount_adjust( return 0; out_error: - trace_xfs_refcount_adjust_error(cur->bc_mp, cur->bc_ag.pag->pag_agno, - error, _RET_IP_); + trace_xfs_refcount_adjust_error(cur, error, _RET_IP_); return error; } @@ -1630,8 +1619,7 @@ done: out_error: if (error) - trace_xfs_refcount_find_shared_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_refcount_find_shared_error(cur, error, _RET_IP_); return error; } @@ -1786,8 +1774,7 @@ xfs_refcount_adjust_cow_extents( return error; out_error: - trace_xfs_refcount_modify_extent_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_refcount_modify_extent_error(cur, error, _RET_IP_); return error; } @@ -1833,8 +1820,7 @@ xfs_refcount_adjust_cow( return 0; out_error: - trace_xfs_refcount_adjust_cow_error(cur->bc_mp, cur->bc_ag.pag->pag_agno, - error, _RET_IP_); + trace_xfs_refcount_adjust_cow_error(cur, error, _RET_IP_); return error; } diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 0c235bdea738..42a8f89c8d42 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3461,9 +3461,9 @@ DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_get); DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_update); DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_insert); DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_delete); -DEFINE_AG_ERROR_EVENT(xfs_refcount_insert_error); -DEFINE_AG_ERROR_EVENT(xfs_refcount_delete_error); -DEFINE_AG_ERROR_EVENT(xfs_refcount_update_error); +DEFINE_BTREE_ERROR_EVENT(xfs_refcount_insert_error); +DEFINE_BTREE_ERROR_EVENT(xfs_refcount_delete_error); +DEFINE_BTREE_ERROR_EVENT(xfs_refcount_update_error); /* refcount adjustment tracepoints */ DEFINE_AG_EXTENT_EVENT(xfs_refcount_increase); @@ -3478,20 +3478,20 @@ DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(xfs_refcount_merge_left_extent); DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(xfs_refcount_merge_right_extent); DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(xfs_refcount_find_left_extent); DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(xfs_refcount_find_right_extent); -DEFINE_AG_ERROR_EVENT(xfs_refcount_adjust_error); -DEFINE_AG_ERROR_EVENT(xfs_refcount_adjust_cow_error); -DEFINE_AG_ERROR_EVENT(xfs_refcount_merge_center_extents_error); -DEFINE_AG_ERROR_EVENT(xfs_refcount_modify_extent_error); -DEFINE_AG_ERROR_EVENT(xfs_refcount_split_extent_error); -DEFINE_AG_ERROR_EVENT(xfs_refcount_merge_left_extent_error); -DEFINE_AG_ERROR_EVENT(xfs_refcount_merge_right_extent_error); -DEFINE_AG_ERROR_EVENT(xfs_refcount_find_left_extent_error); -DEFINE_AG_ERROR_EVENT(xfs_refcount_find_right_extent_error); +DEFINE_BTREE_ERROR_EVENT(xfs_refcount_adjust_error); +DEFINE_BTREE_ERROR_EVENT(xfs_refcount_adjust_cow_error); +DEFINE_BTREE_ERROR_EVENT(xfs_refcount_merge_center_extents_error); +DEFINE_BTREE_ERROR_EVENT(xfs_refcount_modify_extent_error); +DEFINE_BTREE_ERROR_EVENT(xfs_refcount_split_extent_error); +DEFINE_BTREE_ERROR_EVENT(xfs_refcount_merge_left_extent_error); +DEFINE_BTREE_ERROR_EVENT(xfs_refcount_merge_right_extent_error); +DEFINE_BTREE_ERROR_EVENT(xfs_refcount_find_left_extent_error); +DEFINE_BTREE_ERROR_EVENT(xfs_refcount_find_right_extent_error); /* reflink helpers */ DEFINE_AG_EXTENT_EVENT(xfs_refcount_find_shared); DEFINE_AG_EXTENT_EVENT(xfs_refcount_find_shared_result); -DEFINE_AG_ERROR_EVENT(xfs_refcount_find_shared_error); +DEFINE_BTREE_ERROR_EVENT(xfs_refcount_find_shared_error); DECLARE_EVENT_CLASS(xfs_refcount_deferred_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, -- cgit v1.2.3-70-g09d2 From bb0efb0d0a2885b4c65ca31e2815da2281b99153 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Jul 2024 11:23:05 -0700 Subject: xfs: create specialized classes for refcount tracepoints The only user of the "ag" tracepoint event classes is the refcount btree, so rename them to make that obvious and make them take the btree cursor to simplify the arguments. This will save us a lot of trouble later on. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_refcount.c | 24 +++++++---------- fs/xfs/xfs_trace.h | 61 ++++++++++++++++++++++++++++---------------- 2 files changed, 48 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 77acd311aa55..1916f8281450 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -51,7 +51,7 @@ xfs_refcount_lookup_le( xfs_agblock_t bno, int *stat) { - trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, + trace_xfs_refcount_lookup(cur, xfs_refcount_encode_startblock(bno, domain), XFS_LOOKUP_LE); cur->bc_rec.rc.rc_startblock = bno; @@ -71,7 +71,7 @@ xfs_refcount_lookup_ge( xfs_agblock_t bno, int *stat) { - trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, + trace_xfs_refcount_lookup(cur, xfs_refcount_encode_startblock(bno, domain), XFS_LOOKUP_GE); cur->bc_rec.rc.rc_startblock = bno; @@ -91,7 +91,7 @@ xfs_refcount_lookup_eq( xfs_agblock_t bno, int *stat) { - trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, + trace_xfs_refcount_lookup(cur, xfs_refcount_encode_startblock(bno, domain), XFS_LOOKUP_LE); cur->bc_rec.rc.rc_startblock = bno; @@ -1262,11 +1262,9 @@ xfs_refcount_adjust( int error; if (adj == XFS_REFCOUNT_ADJUST_INCREASE) - trace_xfs_refcount_increase(cur->bc_mp, - cur->bc_ag.pag->pag_agno, *agbno, *aglen); + trace_xfs_refcount_increase(cur, *agbno, *aglen); else - trace_xfs_refcount_decrease(cur->bc_mp, - cur->bc_ag.pag->pag_agno, *agbno, *aglen); + trace_xfs_refcount_decrease(cur, *agbno, *aglen); /* * Ensure that no rcextents cross the boundary of the adjustment range. @@ -1526,8 +1524,7 @@ xfs_refcount_find_shared( int have; int error; - trace_xfs_refcount_find_shared(cur->bc_mp, cur->bc_ag.pag->pag_agno, - agbno, aglen); + trace_xfs_refcount_find_shared(cur, agbno, aglen); /* By default, skip the whole range */ *fbno = NULLAGBLOCK; @@ -1614,8 +1611,7 @@ xfs_refcount_find_shared( } done: - trace_xfs_refcount_find_shared_result(cur->bc_mp, - cur->bc_ag.pag->pag_agno, *fbno, *flen); + trace_xfs_refcount_find_shared_result(cur, *fbno, *flen); out_error: if (error) @@ -1833,8 +1829,7 @@ __xfs_refcount_cow_alloc( xfs_agblock_t agbno, xfs_extlen_t aglen) { - trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_ag.pag->pag_agno, - agbno, aglen); + trace_xfs_refcount_cow_increase(rcur, agbno, aglen); /* Add refcount btree reservation */ return xfs_refcount_adjust_cow(rcur, agbno, aglen, @@ -1850,8 +1845,7 @@ __xfs_refcount_cow_free( xfs_agblock_t agbno, xfs_extlen_t aglen) { - trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_ag.pag->pag_agno, - agbno, aglen); + trace_xfs_refcount_cow_decrease(rcur, agbno, aglen); /* Remove refcount btree reservation */ return xfs_refcount_adjust_cow(rcur, agbno, aglen, diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 42a8f89c8d42..c945b2d9b6bc 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3176,17 +3176,41 @@ DEFINE_AG_ERROR_EVENT(xfs_ag_resv_init_error); /* refcount tracepoint classes */ -/* reuse the discard trace class for agbno/aglen-based traces */ -#define DEFINE_AG_EXTENT_EVENT(name) DEFINE_DISCARD_EVENT(name) +DECLARE_EVENT_CLASS(xfs_refcount_class, + TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t agbno, + xfs_extlen_t len), + TP_ARGS(cur, agbno, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + __entry->dev = cur->bc_mp->m_super->s_dev; + __entry->agno = cur->bc_ag.pag->pag_agno; + __entry->agbno = agbno; + __entry->len = len; + ), + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len) +); +#define DEFINE_REFCOUNT_EVENT(name) \ +DEFINE_EVENT(xfs_refcount_class, name, \ + TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t agbno, \ + xfs_extlen_t len), \ + TP_ARGS(cur, agbno, len)) -/* ag btree lookup tracepoint class */ TRACE_DEFINE_ENUM(XFS_LOOKUP_EQi); TRACE_DEFINE_ENUM(XFS_LOOKUP_LEi); TRACE_DEFINE_ENUM(XFS_LOOKUP_GEi); -DECLARE_EVENT_CLASS(xfs_ag_btree_lookup_class, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agblock_t agbno, xfs_lookup_t dir), - TP_ARGS(mp, agno, agbno, dir), +TRACE_EVENT(xfs_refcount_lookup, + TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t agbno, + xfs_lookup_t dir), + TP_ARGS(cur, agbno, dir), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -3194,8 +3218,8 @@ DECLARE_EVENT_CLASS(xfs_ag_btree_lookup_class, __field(xfs_lookup_t, dir) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = cur->bc_mp->m_super->s_dev; + __entry->agno = cur->bc_ag.pag->pag_agno; __entry->agbno = agbno; __entry->dir = dir; ), @@ -3207,12 +3231,6 @@ DECLARE_EVENT_CLASS(xfs_ag_btree_lookup_class, __entry->dir) ) -#define DEFINE_AG_BTREE_LOOKUP_EVENT(name) \ -DEFINE_EVENT(xfs_ag_btree_lookup_class, name, \ - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ - xfs_agblock_t agbno, xfs_lookup_t dir), \ - TP_ARGS(mp, agno, agbno, dir)) - /* single-rcext tracepoint class */ DECLARE_EVENT_CLASS(xfs_refcount_extent_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, @@ -3456,7 +3474,6 @@ DEFINE_EVENT(xfs_refcount_triple_extent_class, name, \ TP_ARGS(mp, agno, i1, i2, i3)) /* refcount btree tracepoints */ -DEFINE_AG_BTREE_LOOKUP_EVENT(xfs_refcount_lookup); DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_get); DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_update); DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_insert); @@ -3466,10 +3483,10 @@ DEFINE_BTREE_ERROR_EVENT(xfs_refcount_delete_error); DEFINE_BTREE_ERROR_EVENT(xfs_refcount_update_error); /* refcount adjustment tracepoints */ -DEFINE_AG_EXTENT_EVENT(xfs_refcount_increase); -DEFINE_AG_EXTENT_EVENT(xfs_refcount_decrease); -DEFINE_AG_EXTENT_EVENT(xfs_refcount_cow_increase); -DEFINE_AG_EXTENT_EVENT(xfs_refcount_cow_decrease); +DEFINE_REFCOUNT_EVENT(xfs_refcount_increase); +DEFINE_REFCOUNT_EVENT(xfs_refcount_decrease); +DEFINE_REFCOUNT_EVENT(xfs_refcount_cow_increase); +DEFINE_REFCOUNT_EVENT(xfs_refcount_cow_decrease); DEFINE_REFCOUNT_TRIPLE_EXTENT_EVENT(xfs_refcount_merge_center_extents); DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_modify_extent); DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_recover_extent); @@ -3489,8 +3506,8 @@ DEFINE_BTREE_ERROR_EVENT(xfs_refcount_find_left_extent_error); DEFINE_BTREE_ERROR_EVENT(xfs_refcount_find_right_extent_error); /* reflink helpers */ -DEFINE_AG_EXTENT_EVENT(xfs_refcount_find_shared); -DEFINE_AG_EXTENT_EVENT(xfs_refcount_find_shared_result); +DEFINE_REFCOUNT_EVENT(xfs_refcount_find_shared); +DEFINE_REFCOUNT_EVENT(xfs_refcount_find_shared_result); DEFINE_BTREE_ERROR_EVENT(xfs_refcount_find_shared_error); DECLARE_EVENT_CLASS(xfs_refcount_deferred_class, -- cgit v1.2.3-70-g09d2 From 8fbac2f1a0947dc45ecf13e9b5aa17b5942b4a2d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Jul 2024 11:23:06 -0700 Subject: xfs: pass btree cursors to refcount btree tracepoints Prepare the rest of refcount btree tracepoints for use with realtime reflink by making them take the btree cursor object as a parameter. This will save us a lot of trouble later on. Remove the xfs_refcount_recover_extent tracepoint since it's already covered by other refcount tracepoints. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_refcount.c | 42 ++++++++-------------- fs/xfs/xfs_trace.h | 83 ++++++++++++++++++++------------------------ 2 files changed, 53 insertions(+), 72 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 1916f8281450..b777762494e7 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -183,7 +183,7 @@ xfs_refcount_get_rec( if (fa) return xfs_refcount_complain_bad_rec(cur, fa, irec); - trace_xfs_refcount_get(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec); + trace_xfs_refcount_get(cur, irec); return 0; } @@ -201,7 +201,7 @@ xfs_refcount_update( uint32_t start; int error; - trace_xfs_refcount_update(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec); + trace_xfs_refcount_update(cur, irec); start = xfs_refcount_encode_startblock(irec->rc_startblock, irec->rc_domain); @@ -228,7 +228,7 @@ xfs_refcount_insert( { int error; - trace_xfs_refcount_insert(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec); + trace_xfs_refcount_insert(cur, irec); cur->bc_rec.rc.rc_startblock = irec->rc_startblock; cur->bc_rec.rc.rc_blockcount = irec->rc_blockcount; @@ -273,7 +273,7 @@ xfs_refcount_delete( error = -EFSCORRUPTED; goto out_error; } - trace_xfs_refcount_delete(cur->bc_mp, cur->bc_ag.pag->pag_agno, &irec); + trace_xfs_refcount_delete(cur, &irec); error = xfs_btree_delete(cur, i); if (XFS_IS_CORRUPT(cur->bc_mp, *i != 1)) { xfs_btree_mark_sick(cur); @@ -410,8 +410,7 @@ xfs_refcount_split_extent( return 0; *shape_changed = true; - trace_xfs_refcount_split_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, - &rcext, agbno); + trace_xfs_refcount_split_extent(cur, &rcext, agbno); /* Establish the right extent. */ tmp = rcext; @@ -454,8 +453,7 @@ xfs_refcount_merge_center_extents( int error; int found_rec; - trace_xfs_refcount_merge_center_extents(cur->bc_mp, - cur->bc_ag.pag->pag_agno, left, center, right); + trace_xfs_refcount_merge_center_extents(cur, left, center, right); ASSERT(left->rc_domain == center->rc_domain); ASSERT(right->rc_domain == center->rc_domain); @@ -536,8 +534,7 @@ xfs_refcount_merge_left_extent( int error; int found_rec; - trace_xfs_refcount_merge_left_extent(cur->bc_mp, - cur->bc_ag.pag->pag_agno, left, cleft); + trace_xfs_refcount_merge_left_extent(cur, left, cleft); ASSERT(left->rc_domain == cleft->rc_domain); @@ -601,8 +598,7 @@ xfs_refcount_merge_right_extent( int error; int found_rec; - trace_xfs_refcount_merge_right_extent(cur->bc_mp, - cur->bc_ag.pag->pag_agno, cright, right); + trace_xfs_refcount_merge_right_extent(cur, cright, right); ASSERT(right->rc_domain == cright->rc_domain); @@ -741,8 +737,7 @@ not_found: cleft->rc_refcount = 1; cleft->rc_domain = domain; } - trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, - left, cleft, agbno); + trace_xfs_refcount_find_left_extent(cur, left, cleft, agbno); return error; out_error: @@ -835,8 +830,8 @@ not_found: cright->rc_refcount = 1; cright->rc_domain = domain; } - trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, - cright, right, agbno + aglen); + trace_xfs_refcount_find_right_extent(cur, cright, right, + agbno + aglen); return error; out_error: @@ -1139,8 +1134,7 @@ xfs_refcount_adjust_extents( tmp.rc_refcount = 1 + adj; tmp.rc_domain = XFS_REFC_DOMAIN_SHARED; - trace_xfs_refcount_modify_extent(cur->bc_mp, - cur->bc_ag.pag->pag_agno, &tmp); + trace_xfs_refcount_modify_extent(cur, &tmp); /* * Either cover the hole (increment) or @@ -1205,8 +1199,7 @@ xfs_refcount_adjust_extents( if (ext.rc_refcount == MAXREFCOUNT) goto skip; ext.rc_refcount += adj; - trace_xfs_refcount_modify_extent(cur->bc_mp, - cur->bc_ag.pag->pag_agno, &ext); + trace_xfs_refcount_modify_extent(cur, &ext); cur->bc_refc.nr_ops++; if (ext.rc_refcount > 1) { error = xfs_refcount_update(cur, &ext); @@ -1721,8 +1714,7 @@ xfs_refcount_adjust_cow_extents( tmp.rc_refcount = 1; tmp.rc_domain = XFS_REFC_DOMAIN_COW; - trace_xfs_refcount_modify_extent(cur->bc_mp, - cur->bc_ag.pag->pag_agno, &tmp); + trace_xfs_refcount_modify_extent(cur, &tmp); error = xfs_refcount_insert(cur, &tmp, &found_tmp); @@ -1753,8 +1745,7 @@ xfs_refcount_adjust_cow_extents( } ext.rc_refcount = 0; - trace_xfs_refcount_modify_extent(cur->bc_mp, - cur->bc_ag.pag->pag_agno, &ext); + trace_xfs_refcount_modify_extent(cur, &ext); error = xfs_refcount_delete(cur, &found_rec); if (error) goto out_error; @@ -1990,9 +1981,6 @@ xfs_refcount_recover_cow_leftovers( if (error) goto out_free; - trace_xfs_refcount_recover_extent(mp, pag->pag_agno, - &rr->rr_rrec); - /* Free the orphan record */ fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, rr->rr_rrec.rc_startblock); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index c945b2d9b6bc..d4725d66a9b2 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3233,9 +3233,8 @@ TRACE_EVENT(xfs_refcount_lookup, /* single-rcext tracepoint class */ DECLARE_EVENT_CLASS(xfs_refcount_extent_class, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - struct xfs_refcount_irec *irec), - TP_ARGS(mp, agno, irec), + TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec), + TP_ARGS(cur, irec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -3245,8 +3244,8 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_class, __field(xfs_nlink_t, refcount) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = cur->bc_mp->m_super->s_dev; + __entry->agno = cur->bc_ag.pag->pag_agno; __entry->domain = irec->rc_domain; __entry->startblock = irec->rc_startblock; __entry->blockcount = irec->rc_blockcount; @@ -3263,15 +3262,14 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_class, #define DEFINE_REFCOUNT_EXTENT_EVENT(name) \ DEFINE_EVENT(xfs_refcount_extent_class, name, \ - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ - struct xfs_refcount_irec *irec), \ - TP_ARGS(mp, agno, irec)) + TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec), \ + TP_ARGS(cur, irec)) /* single-rcext and an agbno tracepoint class */ DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - struct xfs_refcount_irec *irec, xfs_agblock_t agbno), - TP_ARGS(mp, agno, irec, agbno), + TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, + xfs_agblock_t agbno), + TP_ARGS(cur, irec, agbno), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -3282,8 +3280,8 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class, __field(xfs_agblock_t, agbno) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = cur->bc_mp->m_super->s_dev; + __entry->agno = cur->bc_ag.pag->pag_agno; __entry->domain = irec->rc_domain; __entry->startblock = irec->rc_startblock; __entry->blockcount = irec->rc_blockcount; @@ -3302,15 +3300,15 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class, #define DEFINE_REFCOUNT_EXTENT_AT_EVENT(name) \ DEFINE_EVENT(xfs_refcount_extent_at_class, name, \ - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ - struct xfs_refcount_irec *irec, xfs_agblock_t agbno), \ - TP_ARGS(mp, agno, irec, agbno)) + TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, \ + xfs_agblock_t agbno), \ + TP_ARGS(cur, irec, agbno)) /* double-rcext tracepoint class */ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2), - TP_ARGS(mp, agno, i1, i2), + TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1, + struct xfs_refcount_irec *i2), + TP_ARGS(cur, i1, i2), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -3324,8 +3322,8 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class, __field(xfs_nlink_t, i2_refcount) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = cur->bc_mp->m_super->s_dev; + __entry->agno = cur->bc_ag.pag->pag_agno; __entry->i1_domain = i1->rc_domain; __entry->i1_startblock = i1->rc_startblock; __entry->i1_blockcount = i1->rc_blockcount; @@ -3351,16 +3349,15 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class, #define DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(name) \ DEFINE_EVENT(xfs_refcount_double_extent_class, name, \ - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ - struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2), \ - TP_ARGS(mp, agno, i1, i2)) + TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1, \ + struct xfs_refcount_irec *i2), \ + TP_ARGS(cur, i1, i2)) /* double-rcext and an agbno tracepoint class */ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2, - xfs_agblock_t agbno), - TP_ARGS(mp, agno, i1, i2, agbno), + TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1, + struct xfs_refcount_irec *i2, xfs_agblock_t agbno), + TP_ARGS(cur, i1, i2, agbno), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -3375,8 +3372,8 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class, __field(xfs_agblock_t, agbno) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = cur->bc_mp->m_super->s_dev; + __entry->agno = cur->bc_ag.pag->pag_agno; __entry->i1_domain = i1->rc_domain; __entry->i1_startblock = i1->rc_startblock; __entry->i1_blockcount = i1->rc_blockcount; @@ -3404,17 +3401,15 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class, #define DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(name) \ DEFINE_EVENT(xfs_refcount_double_extent_at_class, name, \ - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ - struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2, \ - xfs_agblock_t agbno), \ - TP_ARGS(mp, agno, i1, i2, agbno)) + TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1, \ + struct xfs_refcount_irec *i2, xfs_agblock_t agbno), \ + TP_ARGS(cur, i1, i2, agbno)) /* triple-rcext tracepoint class */ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2, - struct xfs_refcount_irec *i3), - TP_ARGS(mp, agno, i1, i2, i3), + TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1, + struct xfs_refcount_irec *i2, struct xfs_refcount_irec *i3), + TP_ARGS(cur, i1, i2, i3), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -3432,8 +3427,8 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class, __field(xfs_nlink_t, i3_refcount) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = cur->bc_mp->m_super->s_dev; + __entry->agno = cur->bc_ag.pag->pag_agno; __entry->i1_domain = i1->rc_domain; __entry->i1_startblock = i1->rc_startblock; __entry->i1_blockcount = i1->rc_blockcount; @@ -3468,10 +3463,9 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class, #define DEFINE_REFCOUNT_TRIPLE_EXTENT_EVENT(name) \ DEFINE_EVENT(xfs_refcount_triple_extent_class, name, \ - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ - struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2, \ - struct xfs_refcount_irec *i3), \ - TP_ARGS(mp, agno, i1, i2, i3)) + TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1, \ + struct xfs_refcount_irec *i2, struct xfs_refcount_irec *i3), \ + TP_ARGS(cur, i1, i2, i3)) /* refcount btree tracepoints */ DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_get); @@ -3489,7 +3483,6 @@ DEFINE_REFCOUNT_EVENT(xfs_refcount_cow_increase); DEFINE_REFCOUNT_EVENT(xfs_refcount_cow_decrease); DEFINE_REFCOUNT_TRIPLE_EXTENT_EVENT(xfs_refcount_merge_center_extents); DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_modify_extent); -DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_recover_extent); DEFINE_REFCOUNT_EXTENT_AT_EVENT(xfs_refcount_split_extent); DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(xfs_refcount_merge_left_extent); DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(xfs_refcount_merge_right_extent); -- cgit v1.2.3-70-g09d2 From 886f11c797722650d98c554b28e66f12317a33e4 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Jul 2024 11:23:07 -0700 Subject: xfs: clean up refcount log intent item tracepoint callsites Pass the incore refcount intent structure to the tracepoints instead of open-coding the argument passing. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_refcount.c | 14 +++-------- fs/xfs/libxfs/xfs_refcount.h | 6 +++++ fs/xfs/xfs_trace.c | 1 + fs/xfs/xfs_trace.h | 59 ++++++++++++++------------------------------ 4 files changed, 29 insertions(+), 51 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index b777762494e7..c0572bb86cdb 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1367,9 +1367,7 @@ xfs_refcount_finish_one( bno = XFS_FSB_TO_AGBNO(mp, ri->ri_startblock); - trace_xfs_refcount_deferred(mp, XFS_FSB_TO_AGNO(mp, ri->ri_startblock), - ri->ri_type, XFS_FSB_TO_AGBNO(mp, ri->ri_startblock), - ri->ri_blockcount); + trace_xfs_refcount_deferred(mp, ri); if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) return -EIO; @@ -1432,8 +1430,7 @@ xfs_refcount_finish_one( return -EFSCORRUPTED; } if (!error && ri->ri_blockcount > 0) - trace_xfs_refcount_finish_one_leftover(mp, ri->ri_pag->pag_agno, - ri->ri_type, bno, ri->ri_blockcount); + trace_xfs_refcount_finish_one_leftover(mp, ri); return error; } @@ -1449,11 +1446,6 @@ __xfs_refcount_add( { struct xfs_refcount_intent *ri; - trace_xfs_refcount_defer(tp->t_mountp, - XFS_FSB_TO_AGNO(tp->t_mountp, startblock), - type, XFS_FSB_TO_AGBNO(tp->t_mountp, startblock), - blockcount); - ri = kmem_cache_alloc(xfs_refcount_intent_cache, GFP_KERNEL | __GFP_NOFAIL); INIT_LIST_HEAD(&ri->ri_list); @@ -1461,6 +1453,8 @@ __xfs_refcount_add( ri->ri_startblock = startblock; ri->ri_blockcount = blockcount; + trace_xfs_refcount_defer(tp->t_mountp, ri); + xfs_refcount_update_get_group(tp->t_mountp, ri); xfs_defer_add(tp, &ri->ri_list, &xfs_refcount_update_defer_type); } diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index 9b56768a590c..01a20621192e 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -48,6 +48,12 @@ enum xfs_refcount_intent_type { XFS_REFCOUNT_FREE_COW, }; +#define XFS_REFCOUNT_INTENT_STRINGS \ + { XFS_REFCOUNT_INCREASE, "incr" }, \ + { XFS_REFCOUNT_DECREASE, "decr" }, \ + { XFS_REFCOUNT_ALLOC_COW, "alloc_cow" }, \ + { XFS_REFCOUNT_FREE_COW, "free_cow" } + struct xfs_refcount_intent { struct list_head ri_list; struct xfs_perag *ri_pag; diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index ae3017812089..f98fb86ff8d7 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -42,6 +42,7 @@ #include "xfs_exchrange.h" #include "xfs_parent.h" #include "xfs_rmap.h" +#include "xfs_refcount.h" /* * We include this last to have the helpers above available for the trace diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index d4725d66a9b2..56c8333a470b 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -92,6 +92,7 @@ struct xfs_parent_irec; struct xfs_attrlist_cursor_kern; struct xfs_extent_free_item; struct xfs_rmap_intent; +struct xfs_refcount_intent; #define XFS_ATTR_FILTER_FLAGS \ { XFS_ATTR_ROOT, "ROOT" }, \ @@ -3503,66 +3504,42 @@ DEFINE_REFCOUNT_EVENT(xfs_refcount_find_shared); DEFINE_REFCOUNT_EVENT(xfs_refcount_find_shared_result); DEFINE_BTREE_ERROR_EVENT(xfs_refcount_find_shared_error); +TRACE_DEFINE_ENUM(XFS_REFCOUNT_INCREASE); +TRACE_DEFINE_ENUM(XFS_REFCOUNT_DECREASE); +TRACE_DEFINE_ENUM(XFS_REFCOUNT_ALLOC_COW); +TRACE_DEFINE_ENUM(XFS_REFCOUNT_FREE_COW); + DECLARE_EVENT_CLASS(xfs_refcount_deferred_class, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - int type, xfs_agblock_t agbno, xfs_extlen_t len), - TP_ARGS(mp, agno, type, agbno, len), + TP_PROTO(struct xfs_mount *mp, struct xfs_refcount_intent *refc), + TP_ARGS(mp, refc), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) - __field(int, type) + __field(int, op) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; - __entry->type = type; - __entry->agbno = agbno; - __entry->len = len; + __entry->agno = XFS_FSB_TO_AGNO(mp, refc->ri_startblock); + __entry->op = refc->ri_type; + __entry->agbno = XFS_FSB_TO_AGBNO(mp, refc->ri_startblock); + __entry->len = refc->ri_blockcount; ), - TP_printk("dev %d:%d op %d agno 0x%x agbno 0x%x fsbcount 0x%x", + TP_printk("dev %d:%d op %s agno 0x%x agbno 0x%x fsbcount 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->type, + __print_symbolic(__entry->op, XFS_REFCOUNT_INTENT_STRINGS), __entry->agno, __entry->agbno, __entry->len) ); #define DEFINE_REFCOUNT_DEFERRED_EVENT(name) \ DEFINE_EVENT(xfs_refcount_deferred_class, name, \ - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ - int type, \ - xfs_agblock_t bno, \ - xfs_extlen_t len), \ - TP_ARGS(mp, agno, type, bno, len)) + TP_PROTO(struct xfs_mount *mp, struct xfs_refcount_intent *refc), \ + TP_ARGS(mp, refc)) DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_defer); DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_deferred); - -TRACE_EVENT(xfs_refcount_finish_one_leftover, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - int type, xfs_agblock_t agbno, xfs_extlen_t len), - TP_ARGS(mp, agno, type, agbno, len), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(xfs_agnumber_t, agno) - __field(int, type) - __field(xfs_agblock_t, agbno) - __field(xfs_extlen_t, len) - ), - TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; - __entry->type = type; - __entry->agbno = agbno; - __entry->len = len; - ), - TP_printk("dev %d:%d type %d agno 0x%x agbno 0x%x fsbcount 0x%x", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->type, - __entry->agno, - __entry->agbno, - __entry->len) -); +DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_finish_one_leftover); /* simple inode-based error/%ip tracepoint class */ DECLARE_EVENT_CLASS(xfs_inode_error_class, -- cgit v1.2.3-70-g09d2 From e69682e5a12d1ea7fd3f3b8243a506228665ee79 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Jul 2024 11:23:08 -0700 Subject: xfs: remove xfs_trans_set_refcount_flags Remove this single-use helper. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_refcount_item.c | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 78e106d05aa2..deb8b4aaa954 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -243,25 +243,6 @@ xfs_refcount_update_diff_items( return ra->ri_pag->pag_agno - rb->ri_pag->pag_agno; } -/* Set the phys extent flags for this reverse mapping. */ -static void -xfs_trans_set_refcount_flags( - struct xfs_phys_extent *pmap, - enum xfs_refcount_intent_type type) -{ - pmap->pe_flags = 0; - switch (type) { - case XFS_REFCOUNT_INCREASE: - case XFS_REFCOUNT_DECREASE: - case XFS_REFCOUNT_ALLOC_COW: - case XFS_REFCOUNT_FREE_COW: - pmap->pe_flags |= type; - break; - default: - ASSERT(0); - } -} - /* Log refcount updates in the intent item. */ STATIC void xfs_refcount_update_log_item( @@ -282,7 +263,18 @@ xfs_refcount_update_log_item( pmap = &cuip->cui_format.cui_extents[next_extent]; pmap->pe_startblock = ri->ri_startblock; pmap->pe_len = ri->ri_blockcount; - xfs_trans_set_refcount_flags(pmap, ri->ri_type); + + pmap->pe_flags = 0; + switch (ri->ri_type) { + case XFS_REFCOUNT_INCREASE: + case XFS_REFCOUNT_DECREASE: + case XFS_REFCOUNT_ALLOC_COW: + case XFS_REFCOUNT_FREE_COW: + pmap->pe_flags |= ri->ri_type; + break; + default: + ASSERT(0); + } } static struct xfs_log_item * -- cgit v1.2.3-70-g09d2 From 0e9254861f980bd60a58b7c2b57ba0414c038409 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Jul 2024 11:23:09 -0700 Subject: xfs: add a ci_entry helper Add a helper to translate from the item list head to the refcount_intent_item structure and use it so shorten assignments and avoid the need for extra local variables. Inspired-by: Christoph Hellwig Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_refcount_item.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index deb8b4aaa954..cc53c733bef1 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -227,6 +227,11 @@ static const struct xfs_item_ops xfs_cud_item_ops = { .iop_intent = xfs_cud_item_intent, }; +static inline struct xfs_refcount_intent *ci_entry(const struct list_head *e) +{ + return list_entry(e, struct xfs_refcount_intent, ri_list); +} + /* Sort refcount intents by AG. */ static int xfs_refcount_update_diff_items( @@ -234,11 +239,8 @@ xfs_refcount_update_diff_items( const struct list_head *a, const struct list_head *b) { - struct xfs_refcount_intent *ra; - struct xfs_refcount_intent *rb; - - ra = container_of(a, struct xfs_refcount_intent, ri_list); - rb = container_of(b, struct xfs_refcount_intent, ri_list); + struct xfs_refcount_intent *ra = ci_entry(a); + struct xfs_refcount_intent *rb = ci_entry(b); return ra->ri_pag->pag_agno - rb->ri_pag->pag_agno; } @@ -341,11 +343,9 @@ xfs_refcount_update_finish_item( struct list_head *item, struct xfs_btree_cur **state) { - struct xfs_refcount_intent *ri; + struct xfs_refcount_intent *ri = ci_entry(item); int error; - ri = container_of(item, struct xfs_refcount_intent, ri_list); - /* Did we run out of reservation? Requeue what we didn't finish. */ error = xfs_refcount_finish_one(tp, ri, state); if (!error && ri->ri_blockcount > 0) { @@ -372,9 +372,7 @@ STATIC void xfs_refcount_update_cancel_item( struct list_head *item) { - struct xfs_refcount_intent *ri; - - ri = container_of(item, struct xfs_refcount_intent, ri_list); + struct xfs_refcount_intent *ri = ci_entry(item); xfs_refcount_update_put_group(ri); kmem_cache_free(xfs_refcount_intent_cache, ri); -- cgit v1.2.3-70-g09d2 From 8aef79928b3ddd8c10a3235f982933addc15a977 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Jul 2024 11:23:10 -0700 Subject: xfs: reuse xfs_refcount_update_cancel_item Reuse xfs_refcount_update_cancel_item to put the AG/RTG and free the item in a few places that currently open code the logic. Inspired-by: Christoph Hellwig Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_refcount_item.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index cc53c733bef1..90a019ddcc1f 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -335,6 +335,17 @@ xfs_refcount_update_put_group( xfs_perag_intent_put(ri->ri_pag); } +/* Cancel a deferred refcount update. */ +STATIC void +xfs_refcount_update_cancel_item( + struct list_head *item) +{ + struct xfs_refcount_intent *ri = ci_entry(item); + + xfs_refcount_update_put_group(ri); + kmem_cache_free(xfs_refcount_intent_cache, ri); +} + /* Process a deferred refcount update. */ STATIC int xfs_refcount_update_finish_item( @@ -354,8 +365,7 @@ xfs_refcount_update_finish_item( return -EAGAIN; } - xfs_refcount_update_put_group(ri); - kmem_cache_free(xfs_refcount_intent_cache, ri); + xfs_refcount_update_cancel_item(item); return error; } @@ -367,17 +377,6 @@ xfs_refcount_update_abort_intent( xfs_cui_release(CUI_ITEM(intent)); } -/* Cancel a deferred refcount update. */ -STATIC void -xfs_refcount_update_cancel_item( - struct list_head *item) -{ - struct xfs_refcount_intent *ri = ci_entry(item); - - xfs_refcount_update_put_group(ri); - kmem_cache_free(xfs_refcount_intent_cache, ri); -} - /* Is this recovered CUI ok? */ static inline bool xfs_cui_validate_phys( -- cgit v1.2.3-70-g09d2 From bac3f784925299b5e69a857e7e03e59c88aa14be Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Jul 2024 11:23:10 -0700 Subject: xfs: don't bother calling xfs_refcount_finish_one_cleanup in xfs_refcount_finish_one In xfs_refcount_finish_one we know the cursor is non-zero when calling xfs_refcount_finish_one_cleanup and we pass a 0 error variable. This means xfs_refcount_finish_one_cleanup is just doing a xfs_btree_del_cursor. Open code that and move xfs_refcount_finish_one_cleanup to fs/xfs/xfs_refcount_item.c. Inspired-by: Christoph Hellwig Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_refcount.c | 19 +------------------ fs/xfs/libxfs/xfs_refcount.h | 2 -- fs/xfs/xfs_refcount_item.c | 18 ++++++++++++++++++ 3 files changed, 19 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index c0572bb86cdb..10a16635d93f 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1300,23 +1300,6 @@ out_error: return error; } -/* Clean up after calling xfs_refcount_finish_one. */ -void -xfs_refcount_finish_one_cleanup( - struct xfs_trans *tp, - struct xfs_btree_cur *rcur, - int error) -{ - struct xfs_buf *agbp; - - if (rcur == NULL) - return; - agbp = rcur->bc_ag.agbp; - xfs_btree_del_cursor(rcur, error); - if (error) - xfs_trans_brelse(tp, agbp); -} - /* * Set up a continuation a deferred refcount operation by updating the intent. * Checks to make sure we're not going to run off the end of the AG. @@ -1380,7 +1363,7 @@ xfs_refcount_finish_one( if (rcur != NULL && rcur->bc_ag.pag != ri->ri_pag) { nr_ops = rcur->bc_refc.nr_ops; shape_changes = rcur->bc_refc.shape_changes; - xfs_refcount_finish_one_cleanup(tp, rcur, 0); + xfs_btree_del_cursor(rcur, 0); rcur = NULL; *pcur = NULL; } diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index 01a20621192e..c94b8f71d407 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -82,8 +82,6 @@ void xfs_refcount_increase_extent(struct xfs_trans *tp, void xfs_refcount_decrease_extent(struct xfs_trans *tp, struct xfs_bmbt_irec *irec); -extern void xfs_refcount_finish_one_cleanup(struct xfs_trans *tp, - struct xfs_btree_cur *rcur, int error); extern int xfs_refcount_finish_one(struct xfs_trans *tp, struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur); diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 90a019ddcc1f..4e06cadb924d 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -21,6 +21,7 @@ #include "xfs_log_priv.h" #include "xfs_log_recover.h" #include "xfs_ag.h" +#include "xfs_btree.h" struct kmem_cache *xfs_cui_cache; struct kmem_cache *xfs_cud_cache; @@ -369,6 +370,23 @@ xfs_refcount_update_finish_item( return error; } +/* Clean up after calling xfs_refcount_finish_one. */ +STATIC void +xfs_refcount_finish_one_cleanup( + struct xfs_trans *tp, + struct xfs_btree_cur *rcur, + int error) +{ + struct xfs_buf *agbp; + + if (rcur == NULL) + return; + agbp = rcur->bc_ag.agbp; + xfs_btree_del_cursor(rcur, error); + if (error) + xfs_trans_brelse(tp, agbp); +} + /* Abort all pending CUIs. */ STATIC void xfs_refcount_update_abort_intent( -- cgit v1.2.3-70-g09d2 From e51987a12cb57ca3702bff5df8a615037b2c8f8a Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Jul 2024 11:23:11 -0700 Subject: xfs: simplify usage of the rcur local variable in xfs_refcount_finish_one Only update rcur when we know the final *pcur value. Inspired-by: Christoph Hellwig [djwong: don't leave the caller with a dangling ref] Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_refcount.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 10a16635d93f..4137a8d1ac13 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1341,7 +1341,7 @@ xfs_refcount_finish_one( struct xfs_btree_cur **pcur) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_btree_cur *rcur; + struct xfs_btree_cur *rcur = *pcur; struct xfs_buf *agbp = NULL; int error = 0; xfs_agblock_t bno; @@ -1359,7 +1359,6 @@ xfs_refcount_finish_one( * If we haven't gotten a cursor or the cursor AG doesn't match * the startblock, get one now. */ - rcur = *pcur; if (rcur != NULL && rcur->bc_ag.pag != ri->ri_pag) { nr_ops = rcur->bc_refc.nr_ops; shape_changes = rcur->bc_refc.shape_changes; @@ -1373,11 +1372,11 @@ xfs_refcount_finish_one( if (error) return error; - rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, ri->ri_pag); + *pcur = rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, + ri->ri_pag); rcur->bc_refc.nr_ops = nr_ops; rcur->bc_refc.shape_changes = shape_changes; } - *pcur = rcur; switch (ri->ri_type) { case XFS_REFCOUNT_INCREASE: -- cgit v1.2.3-70-g09d2 From 783e8a7c9cab6744ebc5dfe75081248ac39181b2 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Jul 2024 11:23:12 -0700 Subject: xfs: move xfs_refcount_update_defer_add to xfs_refcount_item.c Move the code that adds the incore xfs_refcount_update_item deferred work data to a transaction live with the CUI log item code. This means that the refcount code no longer has to know about the inner workings of the CUI log items. As a consequence, we can get rid of the _{get,put}_group helpers. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_refcount.c | 6 ++---- fs/xfs/libxfs/xfs_refcount.h | 3 --- fs/xfs/xfs_refcount_item.c | 24 +++++++++++------------- fs/xfs/xfs_refcount_item.h | 5 +++++ 4 files changed, 18 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 4137a8d1ac13..198b84117df1 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -24,6 +24,7 @@ #include "xfs_rmap.h" #include "xfs_ag.h" #include "xfs_health.h" +#include "xfs_refcount_item.h" struct kmem_cache *xfs_refcount_intent_cache; @@ -1435,10 +1436,7 @@ __xfs_refcount_add( ri->ri_startblock = startblock; ri->ri_blockcount = blockcount; - trace_xfs_refcount_defer(tp->t_mountp, ri); - - xfs_refcount_update_get_group(tp->t_mountp, ri); - xfs_defer_add(tp, &ri->ri_list, &xfs_refcount_update_defer_type); + xfs_refcount_defer_add(tp, ri); } /* diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index c94b8f71d407..68acb0b1b4a8 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -74,9 +74,6 @@ xfs_refcount_check_domain( return true; } -void xfs_refcount_update_get_group(struct xfs_mount *mp, - struct xfs_refcount_intent *ri); - void xfs_refcount_increase_extent(struct xfs_trans *tp, struct xfs_bmbt_irec *irec); void xfs_refcount_decrease_extent(struct xfs_trans *tp, diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 4e06cadb924d..27398512b179 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -22,6 +22,7 @@ #include "xfs_log_recover.h" #include "xfs_ag.h" #include "xfs_btree.h" +#include "xfs_trace.h" struct kmem_cache *xfs_cui_cache; struct kmem_cache *xfs_cud_cache; @@ -319,21 +320,18 @@ xfs_refcount_update_create_done( return &cudp->cud_item; } -/* Take a passive ref to the AG containing the space we're refcounting. */ +/* Add this deferred CUI to the transaction. */ void -xfs_refcount_update_get_group( - struct xfs_mount *mp, +xfs_refcount_defer_add( + struct xfs_trans *tp, struct xfs_refcount_intent *ri) { - ri->ri_pag = xfs_perag_intent_get(mp, ri->ri_startblock); -} + struct xfs_mount *mp = tp->t_mountp; -/* Release a passive AG ref after finishing refcounting work. */ -static inline void -xfs_refcount_update_put_group( - struct xfs_refcount_intent *ri) -{ - xfs_perag_intent_put(ri->ri_pag); + trace_xfs_refcount_defer(mp, ri); + + ri->ri_pag = xfs_perag_intent_get(mp, ri->ri_startblock); + xfs_defer_add(tp, &ri->ri_list, &xfs_refcount_update_defer_type); } /* Cancel a deferred refcount update. */ @@ -343,7 +341,7 @@ xfs_refcount_update_cancel_item( { struct xfs_refcount_intent *ri = ci_entry(item); - xfs_refcount_update_put_group(ri); + xfs_perag_intent_put(ri->ri_pag); kmem_cache_free(xfs_refcount_intent_cache, ri); } @@ -433,7 +431,7 @@ xfs_cui_recover_work( ri->ri_type = pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK; ri->ri_startblock = pmap->pe_startblock; ri->ri_blockcount = pmap->pe_len; - xfs_refcount_update_get_group(mp, ri); + ri->ri_pag = xfs_perag_intent_get(mp, pmap->pe_startblock); xfs_defer_add_item(dfp, &ri->ri_list); } diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h index eb0ab13682d0..bfee8f30c63c 100644 --- a/fs/xfs/xfs_refcount_item.h +++ b/fs/xfs/xfs_refcount_item.h @@ -71,4 +71,9 @@ struct xfs_cud_log_item { extern struct kmem_cache *xfs_cui_cache; extern struct kmem_cache *xfs_cud_cache; +struct xfs_refcount_intent; + +void xfs_refcount_defer_add(struct xfs_trans *tp, + struct xfs_refcount_intent *ri); + #endif /* __XFS_REFCOUNT_ITEM_H__ */ -- cgit v1.2.3-70-g09d2 From 94a0333b9212a114d19096a77903f76d0d5bca26 Mon Sep 17 00:00:00 2001 From: Zizhi Wo Date: Mon, 1 Jul 2024 14:02:36 +0800 Subject: xfs: Avoid races with cnt_btree lastrec updates A concurrent file creation and little writing could unexpectedly return -ENOSPC error since there is a race window that the allocator could get the wrong agf->agf_longest. Write file process steps: 1) Find the entry that best meets the conditions, then calculate the start address and length of the remaining part of the entry after allocation. 2) Delete this entry and update the -current- agf->agf_longest. 3) Insert the remaining unused parts of this entry based on the calculations in 1), and update the agf->agf_longest again if necessary. Create file process steps: 1) Check whether there are free inodes in the inode chunk. 2) If there is no free inode, check whether there has space for creating inode chunks, perform the no-lock judgment first. 3) If the judgment succeeds, the judgment is performed again with agf lock held. Otherwire, an error is returned directly. If the write process is in step 2) but not go to 3) yet, the create file process goes to 2) at this time, it may be mistaken for no space, resulting in the file system still has space but the file creation fails. We have sent two different commits to the community in order to fix this problem[1][2]. Unfortunately, both solutions have flaws. In [2], I discussed with Dave and Darrick, realized that a better solution to this problem requires the "last cnt record tracking" to be ripped out of the generic btree code. And surprisingly, Dave directly provided his fix code. This patch includes appropriate modifications based on his tmp-code to address this issue. The entire fix can be roughly divided into two parts: 1) Delete the code related to lastrec-update in the generic btree code. 2) Place the process of updating longest freespace with cntbt separately to the end of the cntbt modifications. Move the cursor to the rightmost firstly, and update the longest free extent based on the record. Note that we can not update the longest with xfs_alloc_get_rec() after find the longest record, as xfs_verify_agbno() may not pass because pag->block_count is updated on the outside. Therefore, use xfs_btree_get_rec() as a replacement. [1] https://lore.kernel.org/all/20240419061848.1032366-2-yebin10@huawei.com [2] https://lore.kernel.org/all/20240604071121.3981686-1-wozizhi@huawei.com Reported by: Ye Bin Signed-off-by: Zizhi Wo Reviewed-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_alloc.c | 114 ++++++++++++++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_alloc_btree.c | 64 ---------------------- fs/xfs/libxfs/xfs_btree.c | 51 ------------------ fs/xfs/libxfs/xfs_btree.h | 16 +----- 4 files changed, 115 insertions(+), 130 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index ef4f5972da5d..59326f84f6a5 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -466,6 +466,97 @@ xfs_alloc_fix_len( args->len = rlen; } +/* + * Determine if the cursor points to the block that contains the right-most + * block of records in the by-count btree. This block contains the largest + * contiguous free extent in the AG, so if we modify a record in this block we + * need to call xfs_alloc_fixup_longest() once the modifications are done to + * ensure the agf->agf_longest field is kept up to date with the longest free + * extent tracked by the by-count btree. + */ +static bool +xfs_alloc_cursor_at_lastrec( + struct xfs_btree_cur *cnt_cur) +{ + struct xfs_btree_block *block; + union xfs_btree_ptr ptr; + struct xfs_buf *bp; + + block = xfs_btree_get_block(cnt_cur, 0, &bp); + + xfs_btree_get_sibling(cnt_cur, block, &ptr, XFS_BB_RIGHTSIB); + return xfs_btree_ptr_is_null(cnt_cur, &ptr); +} + +/* + * Find the rightmost record of the cntbt, and return the longest free space + * recorded in it. Simply set both the block number and the length to their + * maximum values before searching. + */ +static int +xfs_cntbt_longest( + struct xfs_btree_cur *cnt_cur, + xfs_extlen_t *longest) +{ + struct xfs_alloc_rec_incore irec; + union xfs_btree_rec *rec; + int stat = 0; + int error; + + memset(&cnt_cur->bc_rec, 0xFF, sizeof(cnt_cur->bc_rec)); + error = xfs_btree_lookup(cnt_cur, XFS_LOOKUP_LE, &stat); + if (error) + return error; + if (!stat) { + /* totally empty tree */ + *longest = 0; + return 0; + } + + error = xfs_btree_get_rec(cnt_cur, &rec, &stat); + if (error) + return error; + if (XFS_IS_CORRUPT(cnt_cur->bc_mp, !stat)) { + xfs_btree_mark_sick(cnt_cur); + return -EFSCORRUPTED; + } + + xfs_alloc_btrec_to_irec(rec, &irec); + *longest = irec.ar_blockcount; + return 0; +} + +/* + * Update the longest contiguous free extent in the AG from the by-count cursor + * that is passed to us. This should be done at the end of any allocation or + * freeing operation that touches the longest extent in the btree. + * + * Needing to update the longest extent can be determined by calling + * xfs_alloc_cursor_at_lastrec() after the cursor is positioned for record + * modification but before the modification begins. + */ +static int +xfs_alloc_fixup_longest( + struct xfs_btree_cur *cnt_cur) +{ + struct xfs_perag *pag = cnt_cur->bc_ag.pag; + struct xfs_buf *bp = cnt_cur->bc_ag.agbp; + struct xfs_agf *agf = bp->b_addr; + xfs_extlen_t longest = 0; + int error; + + /* Lookup last rec in order to update AGF. */ + error = xfs_cntbt_longest(cnt_cur, &longest); + if (error) + return error; + + pag->pagf_longest = longest; + agf->agf_longest = cpu_to_be32(pag->pagf_longest); + xfs_alloc_log_agf(cnt_cur->bc_tp, bp, XFS_AGF_LONGEST); + + return 0; +} + /* * Update the two btrees, logically removing from freespace the extent * starting at rbno, rlen blocks. The extent is contained within the @@ -490,6 +581,7 @@ xfs_alloc_fixup_trees( xfs_extlen_t nflen1=0; /* first new free length */ xfs_extlen_t nflen2=0; /* second new free length */ struct xfs_mount *mp; + bool fixup_longest = false; mp = cnt_cur->bc_mp; @@ -578,6 +670,10 @@ xfs_alloc_fixup_trees( nfbno2 = rbno + rlen; nflen2 = (fbno + flen) - nfbno2; } + + if (xfs_alloc_cursor_at_lastrec(cnt_cur)) + fixup_longest = true; + /* * Delete the entry from the by-size btree. */ @@ -655,6 +751,10 @@ xfs_alloc_fixup_trees( return -EFSCORRUPTED; } } + + if (fixup_longest) + return xfs_alloc_fixup_longest(cnt_cur); + return 0; } @@ -1957,6 +2057,7 @@ xfs_free_ag_extent( int i; int error; struct xfs_perag *pag = agbp->b_pag; + bool fixup_longest = false; bno_cur = cnt_cur = NULL; mp = tp->t_mountp; @@ -2220,8 +2321,13 @@ xfs_free_ag_extent( } xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); bno_cur = NULL; + /* * In all cases we need to insert the new freespace in the by-size tree. + * + * If this new freespace is being inserted in the block that contains + * the largest free space in the btree, make sure we also fix up the + * agf->agf-longest tracker field. */ if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i))) goto error0; @@ -2230,6 +2336,8 @@ xfs_free_ag_extent( error = -EFSCORRUPTED; goto error0; } + if (xfs_alloc_cursor_at_lastrec(cnt_cur)) + fixup_longest = true; if ((error = xfs_btree_insert(cnt_cur, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { @@ -2237,6 +2345,12 @@ xfs_free_ag_extent( error = -EFSCORRUPTED; goto error0; } + if (fixup_longest) { + error = xfs_alloc_fixup_longest(cnt_cur); + if (error) + goto error0; + } + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); cnt_cur = NULL; diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 6ef5ddd89600..585e98e87ef9 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -115,67 +115,6 @@ xfs_allocbt_free_block( return 0; } -/* - * Update the longest extent in the AGF - */ -STATIC void -xfs_allocbt_update_lastrec( - struct xfs_btree_cur *cur, - const struct xfs_btree_block *block, - const union xfs_btree_rec *rec, - int ptr, - int reason) -{ - struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; - struct xfs_perag *pag; - __be32 len; - int numrecs; - - ASSERT(!xfs_btree_is_bno(cur->bc_ops)); - - switch (reason) { - case LASTREC_UPDATE: - /* - * If this is the last leaf block and it's the last record, - * then update the size of the longest extent in the AG. - */ - if (ptr != xfs_btree_get_numrecs(block)) - return; - len = rec->alloc.ar_blockcount; - break; - case LASTREC_INSREC: - if (be32_to_cpu(rec->alloc.ar_blockcount) <= - be32_to_cpu(agf->agf_longest)) - return; - len = rec->alloc.ar_blockcount; - break; - case LASTREC_DELREC: - numrecs = xfs_btree_get_numrecs(block); - if (ptr <= numrecs) - return; - ASSERT(ptr == numrecs + 1); - - if (numrecs) { - xfs_alloc_rec_t *rrp; - - rrp = XFS_ALLOC_REC_ADDR(cur->bc_mp, block, numrecs); - len = rrp->ar_blockcount; - } else { - len = 0; - } - - break; - default: - ASSERT(0); - return; - } - - agf->agf_longest = len; - pag = cur->bc_ag.agbp->b_pag; - pag->pagf_longest = be32_to_cpu(len); - xfs_alloc_log_agf(cur->bc_tp, cur->bc_ag.agbp, XFS_AGF_LONGEST); -} - STATIC int xfs_allocbt_get_minrecs( struct xfs_btree_cur *cur, @@ -493,7 +432,6 @@ const struct xfs_btree_ops xfs_bnobt_ops = { .set_root = xfs_allocbt_set_root, .alloc_block = xfs_allocbt_alloc_block, .free_block = xfs_allocbt_free_block, - .update_lastrec = xfs_allocbt_update_lastrec, .get_minrecs = xfs_allocbt_get_minrecs, .get_maxrecs = xfs_allocbt_get_maxrecs, .init_key_from_rec = xfs_allocbt_init_key_from_rec, @@ -511,7 +449,6 @@ const struct xfs_btree_ops xfs_bnobt_ops = { const struct xfs_btree_ops xfs_cntbt_ops = { .name = "cnt", .type = XFS_BTREE_TYPE_AG, - .geom_flags = XFS_BTGEO_LASTREC_UPDATE, .rec_len = sizeof(xfs_alloc_rec_t), .key_len = sizeof(xfs_alloc_key_t), @@ -525,7 +462,6 @@ const struct xfs_btree_ops xfs_cntbt_ops = { .set_root = xfs_allocbt_set_root, .alloc_block = xfs_allocbt_alloc_block, .free_block = xfs_allocbt_free_block, - .update_lastrec = xfs_allocbt_update_lastrec, .get_minrecs = xfs_allocbt_get_minrecs, .get_maxrecs = xfs_allocbt_get_maxrecs, .init_key_from_rec = xfs_allocbt_init_key_from_rec, diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index d29547572a68..a5c4af148853 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -1331,30 +1331,6 @@ xfs_btree_init_block_cur( xfs_btree_owner(cur)); } -/* - * Return true if ptr is the last record in the btree and - * we need to track updates to this record. The decision - * will be further refined in the update_lastrec method. - */ -STATIC int -xfs_btree_is_lastrec( - struct xfs_btree_cur *cur, - struct xfs_btree_block *block, - int level) -{ - union xfs_btree_ptr ptr; - - if (level > 0) - return 0; - if (!(cur->bc_ops->geom_flags & XFS_BTGEO_LASTREC_UPDATE)) - return 0; - - xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); - if (!xfs_btree_ptr_is_null(cur, &ptr)) - return 0; - return 1; -} - STATIC void xfs_btree_buf_to_ptr( struct xfs_btree_cur *cur, @@ -2420,15 +2396,6 @@ xfs_btree_update( xfs_btree_copy_recs(cur, rp, rec, 1); xfs_btree_log_recs(cur, bp, ptr, ptr); - /* - * If we are tracking the last record in the tree and - * we are at the far right edge of the tree, update it. - */ - if (xfs_btree_is_lastrec(cur, block, 0)) { - cur->bc_ops->update_lastrec(cur, block, rec, - ptr, LASTREC_UPDATE); - } - /* Pass new key value up to our parent. */ if (xfs_btree_needs_key_update(cur, ptr)) { error = xfs_btree_update_keys(cur, 0); @@ -3617,15 +3584,6 @@ xfs_btree_insrec( goto error0; } - /* - * If we are tracking the last record in the tree and - * we are at the far right edge of the tree, update it. - */ - if (xfs_btree_is_lastrec(cur, block, level)) { - cur->bc_ops->update_lastrec(cur, block, rec, - ptr, LASTREC_INSREC); - } - /* * Return the new block number, if any. * If there is one, give back a record value and a cursor too. @@ -3983,15 +3941,6 @@ xfs_btree_delrec( xfs_btree_set_numrecs(block, --numrecs); xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS); - /* - * If we are tracking the last record in the tree and - * we are at the far right edge of the tree, update it. - */ - if (xfs_btree_is_lastrec(cur, block, level)) { - cur->bc_ops->update_lastrec(cur, block, NULL, - ptr, LASTREC_DELREC); - } - /* * We're at the root level. First, shrink the root block in-memory. * Try to get rid of the next level down. If we can't then there's diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index f93374278aa1..10b7ddc3b2b3 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -154,12 +154,6 @@ struct xfs_btree_ops { int *stat); int (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp); - /* update last record information */ - void (*update_lastrec)(struct xfs_btree_cur *cur, - const struct xfs_btree_block *block, - const union xfs_btree_rec *rec, - int ptr, int reason); - /* records in block/level */ int (*get_minrecs)(struct xfs_btree_cur *cur, int level); int (*get_maxrecs)(struct xfs_btree_cur *cur, int level); @@ -222,15 +216,7 @@ struct xfs_btree_ops { }; /* btree geometry flags */ -#define XFS_BTGEO_LASTREC_UPDATE (1U << 0) /* track last rec externally */ -#define XFS_BTGEO_OVERLAPPING (1U << 1) /* overlapping intervals */ - -/* - * Reasons for the update_lastrec method to be called. - */ -#define LASTREC_UPDATE 0 -#define LASTREC_INSREC 1 -#define LASTREC_DELREC 2 +#define XFS_BTGEO_OVERLAPPING (1U << 0) /* overlapping intervals */ union xfs_btree_irec { -- cgit v1.2.3-70-g09d2 From 613e2fdbbc7b4bd0cd324e3a025b3061eb8c947d Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 20 Jun 2024 09:21:19 +0200 Subject: xfs: move and rename xfs_trans_committed_bulk Ever since the CIL and delayed logging was introduced, xfs_trans_committed_bulk() has been a purely CIL checkpoint completion function and not a transaction commit completion function. Now that we are adding log specific updates to this function, it really does not have anything to do with the transaction subsystem - it is really log and log item level functionality. This should be part of the CIL code as it is the callback that moves log items from the CIL checkpoint to the AIL. Move it and rename it to xlog_cil_ail_insert(). Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/xfs_log_cil.c | 132 +++++++++++++++++++++++++++++++++++++++++++++++- fs/xfs/xfs_trans.c | 129 ---------------------------------------------- fs/xfs/xfs_trans_priv.h | 3 -- 3 files changed, 131 insertions(+), 133 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index f51cbc6405c1..141bde08bd6e 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -694,6 +694,136 @@ xlog_cil_insert_items( } } +static inline void +xlog_cil_ail_insert_batch( + struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + struct xfs_log_item **log_items, + int nr_items, + xfs_lsn_t commit_lsn) +{ + int i; + + spin_lock(&ailp->ail_lock); + /* xfs_trans_ail_update_bulk drops ailp->ail_lock */ + xfs_trans_ail_update_bulk(ailp, cur, log_items, nr_items, commit_lsn); + + for (i = 0; i < nr_items; i++) { + struct xfs_log_item *lip = log_items[i]; + + if (lip->li_ops->iop_unpin) + lip->li_ops->iop_unpin(lip, 0); + } +} + +/* + * Take the checkpoint's log vector chain of items and insert the attached log + * items into the AIL. This uses bulk insertion techniques to minimise AIL lock + * traffic. + * + * If we are called with the aborted flag set, it is because a log write during + * a CIL checkpoint commit has failed. In this case, all the items in the + * checkpoint have already gone through iop_committed and iop_committing, which + * means that checkpoint commit abort handling is treated exactly the same as an + * iclog write error even though we haven't started any IO yet. Hence in this + * case all we need to do is iop_committed processing, followed by an + * iop_unpin(aborted) call. + * + * The AIL cursor is used to optimise the insert process. If commit_lsn is not + * at the end of the AIL, the insert cursor avoids the need to walk the AIL to + * find the insertion point on every xfs_log_item_batch_insert() call. This + * saves a lot of needless list walking and is a net win, even though it + * slightly increases that amount of AIL lock traffic to set it up and tear it + * down. + */ +static void +xlog_cil_ail_insert( + struct xlog *log, + struct list_head *lv_chain, + xfs_lsn_t commit_lsn, + bool aborted) +{ +#define LOG_ITEM_BATCH_SIZE 32 + struct xfs_ail *ailp = log->l_ailp; + struct xfs_log_item *log_items[LOG_ITEM_BATCH_SIZE]; + struct xfs_log_vec *lv; + struct xfs_ail_cursor cur; + int i = 0; + + spin_lock(&ailp->ail_lock); + xfs_trans_ail_cursor_last(ailp, &cur, commit_lsn); + spin_unlock(&ailp->ail_lock); + + /* unpin all the log items */ + list_for_each_entry(lv, lv_chain, lv_list) { + struct xfs_log_item *lip = lv->lv_item; + xfs_lsn_t item_lsn; + + if (aborted) + set_bit(XFS_LI_ABORTED, &lip->li_flags); + + if (lip->li_ops->flags & XFS_ITEM_RELEASE_WHEN_COMMITTED) { + lip->li_ops->iop_release(lip); + continue; + } + + if (lip->li_ops->iop_committed) + item_lsn = lip->li_ops->iop_committed(lip, commit_lsn); + else + item_lsn = commit_lsn; + + /* item_lsn of -1 means the item needs no further processing */ + if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) + continue; + + /* + * if we are aborting the operation, no point in inserting the + * object into the AIL as we are in a shutdown situation. + */ + if (aborted) { + ASSERT(xlog_is_shutdown(ailp->ail_log)); + if (lip->li_ops->iop_unpin) + lip->li_ops->iop_unpin(lip, 1); + continue; + } + + if (item_lsn != commit_lsn) { + + /* + * Not a bulk update option due to unusual item_lsn. + * Push into AIL immediately, rechecking the lsn once + * we have the ail lock. Then unpin the item. This does + * not affect the AIL cursor the bulk insert path is + * using. + */ + spin_lock(&ailp->ail_lock); + if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) + xfs_trans_ail_update(ailp, lip, item_lsn); + else + spin_unlock(&ailp->ail_lock); + if (lip->li_ops->iop_unpin) + lip->li_ops->iop_unpin(lip, 0); + continue; + } + + /* Item is a candidate for bulk AIL insert. */ + log_items[i++] = lv->lv_item; + if (i >= LOG_ITEM_BATCH_SIZE) { + xlog_cil_ail_insert_batch(ailp, &cur, log_items, + LOG_ITEM_BATCH_SIZE, commit_lsn); + i = 0; + } + } + + /* make sure we insert the remainder! */ + if (i) + xlog_cil_ail_insert_batch(ailp, &cur, log_items, i, commit_lsn); + + spin_lock(&ailp->ail_lock); + xfs_trans_ail_cursor_done(&cur); + spin_unlock(&ailp->ail_lock); +} + static void xlog_cil_free_logvec( struct list_head *lv_chain) @@ -733,7 +863,7 @@ xlog_cil_committed( spin_unlock(&ctx->cil->xc_push_lock); } - xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, &ctx->lv_chain, + xlog_cil_ail_insert(ctx->cil->xc_log, &ctx->lv_chain, ctx->start_lsn, abort); xfs_extent_busy_sort(&ctx->busy_extents.extent_list); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 828da4ac4316..bdf3704dc301 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -725,135 +725,6 @@ xfs_trans_free_items( } } -static inline void -xfs_log_item_batch_insert( - struct xfs_ail *ailp, - struct xfs_ail_cursor *cur, - struct xfs_log_item **log_items, - int nr_items, - xfs_lsn_t commit_lsn) -{ - int i; - - spin_lock(&ailp->ail_lock); - /* xfs_trans_ail_update_bulk drops ailp->ail_lock */ - xfs_trans_ail_update_bulk(ailp, cur, log_items, nr_items, commit_lsn); - - for (i = 0; i < nr_items; i++) { - struct xfs_log_item *lip = log_items[i]; - - if (lip->li_ops->iop_unpin) - lip->li_ops->iop_unpin(lip, 0); - } -} - -/* - * Bulk operation version of xfs_trans_committed that takes a log vector of - * items to insert into the AIL. This uses bulk AIL insertion techniques to - * minimise lock traffic. - * - * If we are called with the aborted flag set, it is because a log write during - * a CIL checkpoint commit has failed. In this case, all the items in the - * checkpoint have already gone through iop_committed and iop_committing, which - * means that checkpoint commit abort handling is treated exactly the same - * as an iclog write error even though we haven't started any IO yet. Hence in - * this case all we need to do is iop_committed processing, followed by an - * iop_unpin(aborted) call. - * - * The AIL cursor is used to optimise the insert process. If commit_lsn is not - * at the end of the AIL, the insert cursor avoids the need to walk - * the AIL to find the insertion point on every xfs_log_item_batch_insert() - * call. This saves a lot of needless list walking and is a net win, even - * though it slightly increases that amount of AIL lock traffic to set it up - * and tear it down. - */ -void -xfs_trans_committed_bulk( - struct xfs_ail *ailp, - struct list_head *lv_chain, - xfs_lsn_t commit_lsn, - bool aborted) -{ -#define LOG_ITEM_BATCH_SIZE 32 - struct xfs_log_item *log_items[LOG_ITEM_BATCH_SIZE]; - struct xfs_log_vec *lv; - struct xfs_ail_cursor cur; - int i = 0; - - spin_lock(&ailp->ail_lock); - xfs_trans_ail_cursor_last(ailp, &cur, commit_lsn); - spin_unlock(&ailp->ail_lock); - - /* unpin all the log items */ - list_for_each_entry(lv, lv_chain, lv_list) { - struct xfs_log_item *lip = lv->lv_item; - xfs_lsn_t item_lsn; - - if (aborted) - set_bit(XFS_LI_ABORTED, &lip->li_flags); - - if (lip->li_ops->flags & XFS_ITEM_RELEASE_WHEN_COMMITTED) { - lip->li_ops->iop_release(lip); - continue; - } - - if (lip->li_ops->iop_committed) - item_lsn = lip->li_ops->iop_committed(lip, commit_lsn); - else - item_lsn = commit_lsn; - - /* item_lsn of -1 means the item needs no further processing */ - if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) - continue; - - /* - * if we are aborting the operation, no point in inserting the - * object into the AIL as we are in a shutdown situation. - */ - if (aborted) { - ASSERT(xlog_is_shutdown(ailp->ail_log)); - if (lip->li_ops->iop_unpin) - lip->li_ops->iop_unpin(lip, 1); - continue; - } - - if (item_lsn != commit_lsn) { - - /* - * Not a bulk update option due to unusual item_lsn. - * Push into AIL immediately, rechecking the lsn once - * we have the ail lock. Then unpin the item. This does - * not affect the AIL cursor the bulk insert path is - * using. - */ - spin_lock(&ailp->ail_lock); - if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) - xfs_trans_ail_update(ailp, lip, item_lsn); - else - spin_unlock(&ailp->ail_lock); - if (lip->li_ops->iop_unpin) - lip->li_ops->iop_unpin(lip, 0); - continue; - } - - /* Item is a candidate for bulk AIL insert. */ - log_items[i++] = lv->lv_item; - if (i >= LOG_ITEM_BATCH_SIZE) { - xfs_log_item_batch_insert(ailp, &cur, log_items, - LOG_ITEM_BATCH_SIZE, commit_lsn); - i = 0; - } - } - - /* make sure we insert the remainder! */ - if (i) - xfs_log_item_batch_insert(ailp, &cur, log_items, i, commit_lsn); - - spin_lock(&ailp->ail_lock); - xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->ail_lock); -} - /* * Sort transaction items prior to running precommit operations. This will * attempt to order the items such that they will always be locked in the same diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index d5400150358e..52a45f0a5ef1 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -19,9 +19,6 @@ void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); void xfs_trans_del_item(struct xfs_log_item *); void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); -void xfs_trans_committed_bulk(struct xfs_ail *ailp, - struct list_head *lv_chain, - xfs_lsn_t commit_lsn, bool aborted); /* * AIL traversal cursor. * -- cgit v1.2.3-70-g09d2 From 9adf40249e6cfd7231c2973bb305f6c20902bfd9 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 20 Jun 2024 09:21:20 +0200 Subject: xfs: AIL doesn't need manual pushing We have a mechanism that checks the amount of log space remaining available every time we make a transaction reservation. If the amount of space is below a threshold (25% free) we push on the AIL to tell it to do more work. To do this, we end up calculating the LSN that the AIL needs to push to on every reservation and updating the push target for the AIL with that new target LSN. This is silly and expensive. The AIL is perfectly capable of calculating the push target itself, and it will always be running when the AIL contains objects. What the target does is determine if the AIL needs to do any work before it goes back to sleep. If we haven't run out of reservation space or memory (or some other push all trigger), it will simply go back to sleep for a while if there is more than 25% of the journal space free without doing anything. If there are items in the AIL at a lower LSN than the target, it will try to push up to the target or to the point of getting stuck before going back to sleep and trying again soon after.` Hence we can modify the AIL to calculate it's own 25% push target before it starts a push using the same reserve grant head based calculation as is currently used, and remove all the places where we ask the AIL to push to a new 25% free target. We can also drop the minimum free space size of 256BBs from the calculation because the 25% of a minimum sized log is *always going to be larger than 256BBs. This does still require a manual push in certain circumstances. These circumstances arise when the AIL is not full, but the reservation grants consume the entire of the free space in the log. In this case, we still need to push on the AIL to free up space, so when we hit this condition (i.e. reservation going to sleep to wait on log space) we do a single push to tell the AIL it should empty itself. This will keep the AIL moving as new reservations come in and want more space, rather than keep queuing them and having to push the AIL repeatedly. The reason for using the "push all" when grant space runs out is that we can run out of grant space when there is more than 25% of the log free. Small logs are notorious for this, and we have a hack in the log callback code (xlog_state_set_callback()) where we push the AIL because the *head* moved) to ensure that we kick the AIL when we consume space in it because that can push us over the "less than 25% available" available that starts tail pushing back up again. Hence when we run out of grant space and are going to sleep, we have to consider that the grant space may be consuming almost all the log space and there is almost nothing in the AIL. In this situation, the AIL pins the tail and moving the tail forwards is the only way the grant space will come available, so we have to force the AIL to push everything to guarantee grant space will eventually be returned. Hence triggering a "push all" just before sleeping removes all the nasty corner cases we have in other parts of the code that work around the "we didn't ask the AIL to push enough to free grant space" condition that leads to log space hangs... Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_defer.c | 4 +- fs/xfs/xfs_log.c | 135 ++------------------------------------ fs/xfs/xfs_log.h | 1 - fs/xfs/xfs_log_priv.h | 2 + fs/xfs/xfs_trans_ail.c | 162 ++++++++++++++++++++-------------------------- fs/xfs/xfs_trans_priv.h | 33 ++++++++-- 6 files changed, 108 insertions(+), 229 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 4a078e07e1a0..e2c8308d518b 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -12,12 +12,14 @@ #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_trans.h" +#include "xfs_trans_priv.h" #include "xfs_buf_item.h" #include "xfs_inode.h" #include "xfs_inode_item.h" #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_log.h" +#include "xfs_log_priv.h" #include "xfs_rmap.h" #include "xfs_refcount.h" #include "xfs_bmap.h" @@ -556,7 +558,7 @@ xfs_defer_relog( * the log threshold once per call. */ if (threshold_lsn == NULLCOMMITLSN) { - threshold_lsn = xlog_grant_push_threshold(log, 0); + threshold_lsn = xfs_ail_push_target(log->l_ailp); if (threshold_lsn == NULLCOMMITLSN) break; } diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 416c15494983..235fcf6dc4ee 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -30,10 +30,6 @@ xlog_alloc_log( struct xfs_buftarg *log_target, xfs_daddr_t blk_offset, int num_bblks); -STATIC int -xlog_space_left( - struct xlog *log, - atomic64_t *head); STATIC void xlog_dealloc_log( struct xlog *log); @@ -51,10 +47,6 @@ xlog_state_get_iclog_space( struct xlog_ticket *ticket, int *logoffsetp); STATIC void -xlog_grant_push_ail( - struct xlog *log, - int need_bytes); -STATIC void xlog_sync( struct xlog *log, struct xlog_in_core *iclog, @@ -242,42 +234,15 @@ xlog_grant_head_wake( { struct xlog_ticket *tic; int need_bytes; - bool woken_task = false; list_for_each_entry(tic, &head->waiters, t_queue) { - - /* - * There is a chance that the size of the CIL checkpoints in - * progress at the last AIL push target calculation resulted in - * limiting the target to the log head (l_last_sync_lsn) at the - * time. This may not reflect where the log head is now as the - * CIL checkpoints may have completed. - * - * Hence when we are woken here, it may be that the head of the - * log that has moved rather than the tail. As the tail didn't - * move, there still won't be space available for the - * reservation we require. However, if the AIL has already - * pushed to the target defined by the old log head location, we - * will hang here waiting for something else to update the AIL - * push target. - * - * Therefore, if there isn't space to wake the first waiter on - * the grant head, we need to push the AIL again to ensure the - * target reflects both the current log tail and log head - * position before we wait for the tail to move again. - */ - need_bytes = xlog_ticket_reservation(log, head, tic); - if (*free_bytes < need_bytes) { - if (!woken_task) - xlog_grant_push_ail(log, need_bytes); + if (*free_bytes < need_bytes) return false; - } *free_bytes -= need_bytes; trace_xfs_log_grant_wake_up(log, tic); wake_up_process(tic->t_task); - woken_task = true; } return true; @@ -296,13 +261,15 @@ xlog_grant_head_wait( do { if (xlog_is_shutdown(log)) goto shutdown; - xlog_grant_push_ail(log, need_bytes); __set_current_state(TASK_UNINTERRUPTIBLE); spin_unlock(&head->lock); XFS_STATS_INC(log->l_mp, xs_sleep_logspace); + /* Push on the AIL to free up all the log space. */ + xfs_ail_push_all(log->l_ailp); + trace_xfs_log_grant_sleep(log, tic); schedule(); trace_xfs_log_grant_wake(log, tic); @@ -418,9 +385,6 @@ xfs_log_regrant( * of rolling transactions in the log easily. */ tic->t_tid++; - - xlog_grant_push_ail(log, tic->t_unit_res); - tic->t_curr_res = tic->t_unit_res; if (tic->t_cnt > 0) return 0; @@ -477,12 +441,7 @@ xfs_log_reserve( ASSERT(*ticp == NULL); tic = xlog_ticket_alloc(log, unit_bytes, cnt, permanent); *ticp = tic; - - xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt - : tic->t_unit_res); - trace_xfs_log_reserve(log, tic); - error = xlog_grant_head_check(log, &log->l_reserve_head, tic, &need_bytes); if (error) @@ -1330,7 +1289,7 @@ xlog_assign_tail_lsn( * shortcut invalidity asserts in this case so that we don't trigger them * falsely. */ -STATIC int +int xlog_space_left( struct xlog *log, atomic64_t *head) @@ -1667,89 +1626,6 @@ out: return ERR_PTR(error); } /* xlog_alloc_log */ -/* - * Compute the LSN that we'd need to push the log tail towards in order to have - * (a) enough on-disk log space to log the number of bytes specified, (b) at - * least 25% of the log space free, and (c) at least 256 blocks free. If the - * log free space already meets all three thresholds, this function returns - * NULLCOMMITLSN. - */ -xfs_lsn_t -xlog_grant_push_threshold( - struct xlog *log, - int need_bytes) -{ - xfs_lsn_t threshold_lsn = 0; - xfs_lsn_t last_sync_lsn; - int free_blocks; - int free_bytes; - int threshold_block; - int threshold_cycle; - int free_threshold; - - ASSERT(BTOBB(need_bytes) < log->l_logBBsize); - - free_bytes = xlog_space_left(log, &log->l_reserve_head.grant); - free_blocks = BTOBBT(free_bytes); - - /* - * Set the threshold for the minimum number of free blocks in the - * log to the maximum of what the caller needs, one quarter of the - * log, and 256 blocks. - */ - free_threshold = BTOBB(need_bytes); - free_threshold = max(free_threshold, (log->l_logBBsize >> 2)); - free_threshold = max(free_threshold, 256); - if (free_blocks >= free_threshold) - return NULLCOMMITLSN; - - xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle, - &threshold_block); - threshold_block += free_threshold; - if (threshold_block >= log->l_logBBsize) { - threshold_block -= log->l_logBBsize; - threshold_cycle += 1; - } - threshold_lsn = xlog_assign_lsn(threshold_cycle, - threshold_block); - /* - * Don't pass in an lsn greater than the lsn of the last - * log record known to be on disk. Use a snapshot of the last sync lsn - * so that it doesn't change between the compare and the set. - */ - last_sync_lsn = atomic64_read(&log->l_last_sync_lsn); - if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0) - threshold_lsn = last_sync_lsn; - - return threshold_lsn; -} - -/* - * Push the tail of the log if we need to do so to maintain the free log space - * thresholds set out by xlog_grant_push_threshold. We may need to adopt a - * policy which pushes on an lsn which is further along in the log once we - * reach the high water mark. In this manner, we would be creating a low water - * mark. - */ -STATIC void -xlog_grant_push_ail( - struct xlog *log, - int need_bytes) -{ - xfs_lsn_t threshold_lsn; - - threshold_lsn = xlog_grant_push_threshold(log, need_bytes); - if (threshold_lsn == NULLCOMMITLSN || xlog_is_shutdown(log)) - return; - - /* - * Get the transaction layer to kick the dirty buffers out to - * disk asynchronously. No point in trying to do this if - * the filesystem is shutting down. - */ - xfs_ail_push(log->l_ailp, threshold_lsn); -} - /* * Stamp cycle number in every block */ @@ -2712,7 +2588,6 @@ xlog_state_set_callback( return; atomic64_set(&log->l_last_sync_lsn, header_lsn); - xlog_grant_push_ail(log, 0); } /* diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index d69acf881153..67c539cc9305 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -156,7 +156,6 @@ int xfs_log_quiesce(struct xfs_mount *mp); void xfs_log_clean(struct xfs_mount *mp); bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t); -xfs_lsn_t xlog_grant_push_threshold(struct xlog *log, int need_bytes); bool xlog_force_shutdown(struct xlog *log, uint32_t shutdown_flags); int xfs_attr_use_log_assist(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 40e22ec0fbe6..0482b11965e2 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -575,6 +575,8 @@ xlog_assign_grant_head(atomic64_t *head, int cycle, int space) atomic64_set(head, xlog_assign_grant_head_val(cycle, space)); } +int xlog_space_left(struct xlog *log, atomic64_t *head); + /* * Committed Item List interfaces */ diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index e4c343096f95..a6b6fca1d138 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -134,25 +134,6 @@ xfs_ail_min_lsn( return lsn; } -/* - * Return the maximum lsn held in the AIL, or zero if the AIL is empty. - */ -static xfs_lsn_t -xfs_ail_max_lsn( - struct xfs_ail *ailp) -{ - xfs_lsn_t lsn = 0; - struct xfs_log_item *lip; - - spin_lock(&ailp->ail_lock); - lip = xfs_ail_max(ailp); - if (lip) - lsn = lip->li_lsn; - spin_unlock(&ailp->ail_lock); - - return lsn; -} - /* * The cursor keeps track of where our current traversal is up to by tracking * the next item in the list for us. However, for this to be safe, removing an @@ -414,6 +395,56 @@ xfsaild_push_item( return lip->li_ops->iop_push(lip, &ailp->ail_buf_list); } +/* + * Compute the LSN that we'd need to push the log tail towards in order to have + * at least 25% of the log space free. If the log free space already meets this + * threshold, this function returns NULLCOMMITLSN. + */ +xfs_lsn_t +__xfs_ail_push_target( + struct xfs_ail *ailp) +{ + struct xlog *log = ailp->ail_log; + xfs_lsn_t threshold_lsn = 0; + xfs_lsn_t last_sync_lsn; + int free_blocks; + int free_bytes; + int threshold_block; + int threshold_cycle; + int free_threshold; + + free_bytes = xlog_space_left(log, &log->l_reserve_head.grant); + free_blocks = BTOBBT(free_bytes); + + /* + * The threshold for the minimum number of free blocks is one quarter of + * the entire log space. + */ + free_threshold = log->l_logBBsize >> 2; + if (free_blocks >= free_threshold) + return NULLCOMMITLSN; + + xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle, + &threshold_block); + threshold_block += free_threshold; + if (threshold_block >= log->l_logBBsize) { + threshold_block -= log->l_logBBsize; + threshold_cycle += 1; + } + threshold_lsn = xlog_assign_lsn(threshold_cycle, + threshold_block); + /* + * Don't pass in an lsn greater than the lsn of the last + * log record known to be on disk. Use a snapshot of the last sync lsn + * so that it doesn't change between the compare and the set. + */ + last_sync_lsn = atomic64_read(&log->l_last_sync_lsn); + if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0) + threshold_lsn = last_sync_lsn; + + return threshold_lsn; +} + static long xfsaild_push( struct xfs_ail *ailp) @@ -454,21 +485,24 @@ xfsaild_push( * capture updates that occur after the sync push waiter has gone to * sleep. */ - if (waitqueue_active(&ailp->ail_empty)) { + if (test_bit(XFS_AIL_OPSTATE_PUSH_ALL, &ailp->ail_opstate) || + waitqueue_active(&ailp->ail_empty)) { lip = xfs_ail_max(ailp); if (lip) target = lip->li_lsn; + else + clear_bit(XFS_AIL_OPSTATE_PUSH_ALL, &ailp->ail_opstate); } else { - /* barrier matches the ail_target update in xfs_ail_push() */ - smp_rmb(); - target = ailp->ail_target; - ailp->ail_target_prev = target; + target = __xfs_ail_push_target(ailp); } + if (target == NULLCOMMITLSN) + goto out_done; + /* we're done if the AIL is empty or our push has reached the end */ lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->ail_last_pushed_lsn); if (!lip) - goto out_done; + goto out_done_cursor; XFS_STATS_INC(mp, xs_push_ail); @@ -553,8 +587,9 @@ xfsaild_push( lsn = lip->li_lsn; } -out_done: +out_done_cursor: xfs_trans_ail_cursor_done(&cur); +out_done: spin_unlock(&ailp->ail_lock); if (xfs_buf_delwri_submit_nowait(&ailp->ail_buf_list)) @@ -603,7 +638,7 @@ xfsaild( set_freezable(); while (1) { - if (tout && tout <= 20) + if (tout) set_current_state(TASK_KILLABLE|TASK_FREEZABLE); else set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); @@ -639,21 +674,9 @@ xfsaild( break; } + /* Idle if the AIL is empty. */ spin_lock(&ailp->ail_lock); - - /* - * Idle if the AIL is empty and we are not racing with a target - * update. We check the AIL after we set the task to a sleep - * state to guarantee that we either catch an ail_target update - * or that a wake_up resets the state to TASK_RUNNING. - * Otherwise, we run the risk of sleeping indefinitely. - * - * The barrier matches the ail_target update in xfs_ail_push(). - */ - smp_rmb(); - if (!xfs_ail_min(ailp) && - ailp->ail_target == ailp->ail_target_prev && - list_empty(&ailp->ail_buf_list)) { + if (!xfs_ail_min(ailp) && list_empty(&ailp->ail_buf_list)) { spin_unlock(&ailp->ail_lock); schedule(); tout = 0; @@ -675,56 +698,6 @@ xfsaild( return 0; } -/* - * This routine is called to move the tail of the AIL forward. It does this by - * trying to flush items in the AIL whose lsns are below the given - * threshold_lsn. - * - * The push is run asynchronously in a workqueue, which means the caller needs - * to handle waiting on the async flush for space to become available. - * We don't want to interrupt any push that is in progress, hence we only queue - * work if we set the pushing bit appropriately. - * - * We do this unlocked - we only need to know whether there is anything in the - * AIL at the time we are called. We don't need to access the contents of - * any of the objects, so the lock is not needed. - */ -void -xfs_ail_push( - struct xfs_ail *ailp, - xfs_lsn_t threshold_lsn) -{ - struct xfs_log_item *lip; - - lip = xfs_ail_min(ailp); - if (!lip || xlog_is_shutdown(ailp->ail_log) || - XFS_LSN_CMP(threshold_lsn, ailp->ail_target) <= 0) - return; - - /* - * Ensure that the new target is noticed in push code before it clears - * the XFS_AIL_PUSHING_BIT. - */ - smp_wmb(); - xfs_trans_ail_copy_lsn(ailp, &ailp->ail_target, &threshold_lsn); - smp_wmb(); - - wake_up_process(ailp->ail_task); -} - -/* - * Push out all items in the AIL immediately - */ -void -xfs_ail_push_all( - struct xfs_ail *ailp) -{ - xfs_lsn_t threshold_lsn = xfs_ail_max_lsn(ailp); - - if (threshold_lsn) - xfs_ail_push(ailp, threshold_lsn); -} - /* * Push out all items in the AIL immediately and wait until the AIL is empty. */ @@ -829,6 +802,13 @@ xfs_trans_ail_update_bulk( if (!list_empty(&tmp)) xfs_ail_splice(ailp, cur, &tmp, lsn); + /* + * If this is the first insert, wake up the push daemon so it can + * actively scan for items to push. + */ + if (!mlip) + wake_up_process(ailp->ail_task); + xfs_ail_update_finish(ailp, tail_lsn); } diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 52a45f0a5ef1..9a131e7fae94 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -52,16 +52,18 @@ struct xfs_ail { struct xlog *ail_log; struct task_struct *ail_task; struct list_head ail_head; - xfs_lsn_t ail_target; - xfs_lsn_t ail_target_prev; struct list_head ail_cursors; spinlock_t ail_lock; xfs_lsn_t ail_last_pushed_lsn; int ail_log_flush; + unsigned long ail_opstate; struct list_head ail_buf_list; wait_queue_head_t ail_empty; }; +/* Push all items out of the AIL immediately. */ +#define XFS_AIL_OPSTATE_PUSH_ALL 0u + /* * From xfs_trans_ail.c */ @@ -98,10 +100,29 @@ void xfs_ail_update_finish(struct xfs_ail *ailp, xfs_lsn_t old_lsn) __releases(ailp->ail_lock); void xfs_trans_ail_delete(struct xfs_log_item *lip, int shutdown_type); -void xfs_ail_push(struct xfs_ail *, xfs_lsn_t); -void xfs_ail_push_all(struct xfs_ail *); -void xfs_ail_push_all_sync(struct xfs_ail *); -struct xfs_log_item *xfs_ail_min(struct xfs_ail *ailp); +static inline void xfs_ail_push(struct xfs_ail *ailp) +{ + wake_up_process(ailp->ail_task); +} + +static inline void xfs_ail_push_all(struct xfs_ail *ailp) +{ + if (!test_and_set_bit(XFS_AIL_OPSTATE_PUSH_ALL, &ailp->ail_opstate)) + xfs_ail_push(ailp); +} + +xfs_lsn_t __xfs_ail_push_target(struct xfs_ail *ailp); +static inline xfs_lsn_t xfs_ail_push_target(struct xfs_ail *ailp) +{ + xfs_lsn_t lsn; + + spin_lock(&ailp->ail_lock); + lsn = __xfs_ail_push_target(ailp); + spin_unlock(&ailp->ail_lock); + return lsn; +} + +void xfs_ail_push_all_sync(struct xfs_ail *ailp); xfs_lsn_t xfs_ail_min_lsn(struct xfs_ail *ailp); struct xfs_log_item * xfs_trans_ail_cursor_first(struct xfs_ail *ailp, -- cgit v1.2.3-70-g09d2 From b50b4c49d8d79af05ac3bb3587f58589713139cc Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 20 Jun 2024 09:21:21 +0200 Subject: xfs: background AIL push should target physical space Currently the AIL attempts to keep 25% of the "log space" free, where the current used space is tracked by the reserve grant head. That is, it tracks both physical space used plus the amount reserved by transactions in progress. When we start tail pushing, we are trying to make space for new reservations by writing back older metadata and the log is generally physically full of dirty metadata, and reservations for modifications in flight take up whatever space the AIL can physically free up. Hence we don't really need to take into account the reservation space that has been used - we just need to keep the log tail moving as fast as we can to free up space for more reservations to be made. We know exactly how much physical space the journal is consuming in the AIL (i.e. max LSN - min LSN) so we can base push thresholds directly on this state rather than have to look at grant head reservations to determine how much to physically push out of the log. This also allows code that needs to know if log items in the current transaction need to be pushed or re-logged to simply sample the current target - they don't need to calculate the current target themselves. This avoids the need for any locking when doing such checks. Further, moving to a physical target means we don't need "push all until empty semantics" like were introduced in the previous patch. We can now test and clear the "push all" as a one-shot command to set the target to the current head of the AIL. This allows the xfsaild to maximise the use of log space right up to the point where conditions indicate that the xfsaild is not keeping up with load and it needs to work harder, and as soon as those constraints go away (i.e. external code no longer needs everything pushed) the xfsaild will return to maintaining the normal 25% free space thresholds. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_defer.c | 2 +- fs/xfs/xfs_log_priv.h | 18 +++++++ fs/xfs/xfs_trans_ail.c | 116 +++++++++++++++++++++++----------------------- fs/xfs/xfs_trans_priv.h | 11 ++--- 4 files changed, 80 insertions(+), 67 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index e2c8308d518b..40021849b42f 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -558,7 +558,7 @@ xfs_defer_relog( * the log threshold once per call. */ if (threshold_lsn == NULLCOMMITLSN) { - threshold_lsn = xfs_ail_push_target(log->l_ailp); + threshold_lsn = xfs_ail_get_push_target(log->l_ailp); if (threshold_lsn == NULLCOMMITLSN) break; } diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 0482b11965e2..971871b84d84 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -625,6 +625,24 @@ xlog_wait( int xlog_wait_on_iclog(struct xlog_in_core *iclog) __releases(iclog->ic_log->l_icloglock); +/* Calculate the distance between two LSNs in bytes */ +static inline uint64_t +xlog_lsn_sub( + struct xlog *log, + xfs_lsn_t high, + xfs_lsn_t low) +{ + uint32_t hi_cycle = CYCLE_LSN(high); + uint32_t hi_block = BLOCK_LSN(high); + uint32_t lo_cycle = CYCLE_LSN(low); + uint32_t lo_block = BLOCK_LSN(low); + + if (hi_cycle == lo_cycle) + return BBTOB(hi_block - lo_block); + ASSERT((hi_cycle == lo_cycle + 1) || xlog_is_shutdown(log)); + return (uint64_t)log->l_logsize - BBTOB(lo_block - hi_block); +} + /* * The LSN is valid so long as it is behind the current LSN. If it isn't, this * means that the next log record that includes this metadata could have a diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index a6b6fca1d138..26d4d9b3e357 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -398,51 +398,69 @@ xfsaild_push_item( /* * Compute the LSN that we'd need to push the log tail towards in order to have * at least 25% of the log space free. If the log free space already meets this - * threshold, this function returns NULLCOMMITLSN. + * threshold, this function returns the lowest LSN in the AIL to slowly keep + * writeback ticking over and the tail of the log moving forward. */ -xfs_lsn_t -__xfs_ail_push_target( +static xfs_lsn_t +xfs_ail_calc_push_target( struct xfs_ail *ailp) { - struct xlog *log = ailp->ail_log; - xfs_lsn_t threshold_lsn = 0; - xfs_lsn_t last_sync_lsn; - int free_blocks; - int free_bytes; - int threshold_block; - int threshold_cycle; - int free_threshold; - - free_bytes = xlog_space_left(log, &log->l_reserve_head.grant); - free_blocks = BTOBBT(free_bytes); + struct xlog *log = ailp->ail_log; + struct xfs_log_item *lip; + xfs_lsn_t target_lsn; + xfs_lsn_t max_lsn; + xfs_lsn_t min_lsn; + int32_t free_bytes; + uint32_t target_block; + uint32_t target_cycle; + + lockdep_assert_held(&ailp->ail_lock); + + lip = xfs_ail_max(ailp); + if (!lip) + return NULLCOMMITLSN; + + max_lsn = lip->li_lsn; + min_lsn = __xfs_ail_min_lsn(ailp); /* - * The threshold for the minimum number of free blocks is one quarter of - * the entire log space. + * If we are supposed to push all the items in the AIL, we want to push + * to the current head. We then clear the push flag so that we don't + * keep pushing newly queued items beyond where the push all command was + * run. If the push waiter wants to empty the ail, it should queue + * itself on the ail_empty wait queue. */ - free_threshold = log->l_logBBsize >> 2; - if (free_blocks >= free_threshold) - return NULLCOMMITLSN; + if (test_and_clear_bit(XFS_AIL_OPSTATE_PUSH_ALL, &ailp->ail_opstate)) + return max_lsn; + + /* If someone wants the AIL empty, keep pushing everything we have. */ + if (waitqueue_active(&ailp->ail_empty)) + return max_lsn; - xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle, - &threshold_block); - threshold_block += free_threshold; - if (threshold_block >= log->l_logBBsize) { - threshold_block -= log->l_logBBsize; - threshold_cycle += 1; - } - threshold_lsn = xlog_assign_lsn(threshold_cycle, - threshold_block); /* - * Don't pass in an lsn greater than the lsn of the last - * log record known to be on disk. Use a snapshot of the last sync lsn - * so that it doesn't change between the compare and the set. + * Background pushing - attempt to keep 25% of the log free and if we + * have that much free retain the existing target. */ - last_sync_lsn = atomic64_read(&log->l_last_sync_lsn); - if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0) - threshold_lsn = last_sync_lsn; + free_bytes = log->l_logsize - xlog_lsn_sub(log, max_lsn, min_lsn); + if (free_bytes >= log->l_logsize >> 2) + return ailp->ail_target; + + target_cycle = CYCLE_LSN(min_lsn); + target_block = BLOCK_LSN(min_lsn) + (log->l_logBBsize >> 2); + if (target_block >= log->l_logBBsize) { + target_block -= log->l_logBBsize; + target_cycle += 1; + } + target_lsn = xlog_assign_lsn(target_cycle, target_block); + + /* Cap the target to the highest LSN known to be in the AIL. */ + if (XFS_LSN_CMP(target_lsn, max_lsn) > 0) + return max_lsn; - return threshold_lsn; + /* If the existing target is higher than the new target, keep it. */ + if (XFS_LSN_CMP(ailp->ail_target, target_lsn) >= 0) + return ailp->ail_target; + return target_lsn; } static long @@ -453,7 +471,6 @@ xfsaild_push( struct xfs_ail_cursor cur; struct xfs_log_item *lip; xfs_lsn_t lsn; - xfs_lsn_t target = NULLCOMMITLSN; long tout; int stuck = 0; int flushing = 0; @@ -478,25 +495,8 @@ xfsaild_push( } spin_lock(&ailp->ail_lock); - - /* - * If we have a sync push waiter, we always have to push till the AIL is - * empty. Update the target to point to the end of the AIL so that - * capture updates that occur after the sync push waiter has gone to - * sleep. - */ - if (test_bit(XFS_AIL_OPSTATE_PUSH_ALL, &ailp->ail_opstate) || - waitqueue_active(&ailp->ail_empty)) { - lip = xfs_ail_max(ailp); - if (lip) - target = lip->li_lsn; - else - clear_bit(XFS_AIL_OPSTATE_PUSH_ALL, &ailp->ail_opstate); - } else { - target = __xfs_ail_push_target(ailp); - } - - if (target == NULLCOMMITLSN) + WRITE_ONCE(ailp->ail_target, xfs_ail_calc_push_target(ailp)); + if (ailp->ail_target == NULLCOMMITLSN) goto out_done; /* we're done if the AIL is empty or our push has reached the end */ @@ -506,10 +506,10 @@ xfsaild_push( XFS_STATS_INC(mp, xs_push_ail); - ASSERT(target != NULLCOMMITLSN); + ASSERT(ailp->ail_target != NULLCOMMITLSN); lsn = lip->li_lsn; - while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) { + while ((XFS_LSN_CMP(lip->li_lsn, ailp->ail_target) <= 0)) { int lock_result; /* @@ -595,7 +595,7 @@ out_done: if (xfs_buf_delwri_submit_nowait(&ailp->ail_buf_list)) ailp->ail_log_flush++; - if (!count || XFS_LSN_CMP(lsn, target) >= 0) { + if (!count || XFS_LSN_CMP(lsn, ailp->ail_target) >= 0) { /* * We reached the target or the AIL is empty, so wait a bit * longer for I/O to complete and remove pushed items from the diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 9a131e7fae94..60b4707c3a65 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -59,6 +59,7 @@ struct xfs_ail { unsigned long ail_opstate; struct list_head ail_buf_list; wait_queue_head_t ail_empty; + xfs_lsn_t ail_target; }; /* Push all items out of the AIL immediately. */ @@ -111,15 +112,9 @@ static inline void xfs_ail_push_all(struct xfs_ail *ailp) xfs_ail_push(ailp); } -xfs_lsn_t __xfs_ail_push_target(struct xfs_ail *ailp); -static inline xfs_lsn_t xfs_ail_push_target(struct xfs_ail *ailp) +static inline xfs_lsn_t xfs_ail_get_push_target(struct xfs_ail *ailp) { - xfs_lsn_t lsn; - - spin_lock(&ailp->ail_lock); - lsn = __xfs_ail_push_target(ailp); - spin_unlock(&ailp->ail_lock); - return lsn; + return READ_ONCE(ailp->ail_target); } void xfs_ail_push_all_sync(struct xfs_ail *ailp); -- cgit v1.2.3-70-g09d2 From a07776ab814d432190a902c2c3fac867c4e76934 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 20 Jun 2024 09:21:22 +0200 Subject: xfs: ensure log tail is always up to date Whenever we write an iclog, we call xlog_assign_tail_lsn() to update the current tail before we write it into the iclog header. This means we have to take the AIL lock on every iclog write just to check if the tail of the log has moved. This doesn't avoid races with log tail updates - the log tail could move immediately after we assign the tail to the iclog header and hence by the time the iclog reaches stable storage the tail LSN has moved forward in memory. Hence the log tail LSN in the iclog header is really just a point in time snapshot of the current state of the AIL. With this in mind, if we simply update the in memory log->l_tail_lsn every time it changes in the AIL, there is no need to update the in memory value when we are writing it into an iclog - it will already be up-to-date in memory and checking the AIL again will not change this. Hence xlog_state_release_iclog() does not need to check the AIL to update the tail lsn and can just sample it directly without needing to take the AIL lock. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/xfs_log.c | 5 ++--- fs/xfs/xfs_trans_ail.c | 17 +++++++++++++++-- 2 files changed, 17 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 235fcf6dc4ee..ae22f361627f 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -530,7 +530,6 @@ xlog_state_release_iclog( struct xlog_in_core *iclog, struct xlog_ticket *ticket) { - xfs_lsn_t tail_lsn; bool last_ref; lockdep_assert_held(&log->l_icloglock); @@ -545,8 +544,8 @@ xlog_state_release_iclog( if ((iclog->ic_state == XLOG_STATE_WANT_SYNC || (iclog->ic_flags & XLOG_ICL_NEED_FUA)) && !iclog->ic_header.h_tail_lsn) { - tail_lsn = xlog_assign_tail_lsn(log->l_mp); - iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); + iclog->ic_header.h_tail_lsn = + cpu_to_be64(atomic64_read(&log->l_tail_lsn)); } last_ref = atomic_dec_and_test(&iclog->ic_refcnt); diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 26d4d9b3e357..7d6ccd21aae2 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -720,6 +720,13 @@ xfs_ail_push_all_sync( finish_wait(&ailp->ail_empty, &wait); } +/* + * Callers should pass the original tail lsn so that we can detect if the tail + * has moved as a result of the operation that was performed. If the caller + * needs to force a tail LSN update, it should pass NULLCOMMITLSN to bypass the + * "did the tail LSN change?" checks. If the caller wants to avoid a tail update + * (e.g. it knows the tail did not change) it should pass an @old_lsn of 0. + */ void xfs_ail_update_finish( struct xfs_ail *ailp, @@ -804,10 +811,16 @@ xfs_trans_ail_update_bulk( /* * If this is the first insert, wake up the push daemon so it can - * actively scan for items to push. + * actively scan for items to push. We also need to do a log tail + * LSN update to ensure that it is correctly tracked by the log, so + * set the tail_lsn to NULLCOMMITLSN so that xfs_ail_update_finish() + * will see that the tail lsn has changed and will update the tail + * appropriately. */ - if (!mlip) + if (!mlip) { wake_up_process(ailp->ail_task); + tail_lsn = NULLCOMMITLSN; + } xfs_ail_update_finish(ailp, tail_lsn); } -- cgit v1.2.3-70-g09d2 From 0dcd5a10d9878dc5a3fd17c0331646a69ebb5da6 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 20 Jun 2024 09:21:23 +0200 Subject: xfs: l_last_sync_lsn is really AIL state The current implementation of xlog_assign_tail_lsn() assumes that when the AIL is empty, the log tail matches the LSN of the last written commit record. This is recorded in xlog_state_set_callback() as log->l_last_sync_lsn when the iclog state changes to XLOG_STATE_CALLBACK. This change is then immediately followed by running the callbacks on the iclog which then insert the log items into the AIL at the "commit lsn" of that checkpoint. The AIL tracks log items via the start record LSN of the checkpoint, not the commit record LSN. This is because we can pipeline multiple checkpoints, and so the start record of checkpoint N+1 can be written before the commit record of checkpoint N. i.e: start N commit N +-------------+------------+----------------+ start N+1 commit N+1 The tail of the log cannot be moved to the LSN of commit N when all the items of that checkpoint are written back, because then the start record for N+1 is no longer in the active portion of the log and recovery will fail/corrupt the filesystem. Hence when all the log items in checkpoint N are written back, the tail of the log most now only move as far forwards as the start LSN of checkpoint N+1. Hence we cannot use the maximum start record LSN the AIL sees as a replacement the pointer to the current head of the on-disk log records. However, we currently only use the l_last_sync_lsn when the AIL is empty - when there is no start LSN remaining, the tail of the log moves to the LSN of the last commit record as this is where recovery needs to start searching for recoverable records. THe next checkpoint will have a start record LSN that is higher than l_last_sync_lsn, and so everything still works correctly when new checkpoints are written to an otherwise empty log. l_last_sync_lsn is an atomic variable because it is currently updated when an iclog with callbacks attached moves to the CALLBACK state. While we hold the icloglock at this point, we don't hold the AIL lock. When we assign the log tail, we hold the AIL lock, not the icloglock because we have to look up the AIL. Hence it is an atomic variable so it's not bound to a specific lock context. However, the iclog callbacks are only used for CIL checkpoints. We don't use callbacks with unmount record writes, so the l_last_sync_lsn variable only gets updated when we are processing CIL checkpoint callbacks. And those callbacks run under AIL lock contexts, not icloglock context. The CIL checkpoint already knows what the LSN of the iclog the commit record was written to (obtained when written into the iclog before submission) and so we can update the l_last_sync_lsn under the AIL lock in this callback. No other iclog callbacks will run until the currently executing one completes, and hence we can update the l_last_sync_lsn under the AIL lock safely. This means l_last_sync_lsn can move to the AIL as the "ail_head_lsn" and it can be used to replace the atomic l_last_sync_lsn in the iclog code. This makes tracking the log tail belong entirely to the AIL, rather than being smeared across log, iclog and AIL state and locking. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/xfs_log.c | 81 ++++++------------------------------------------ fs/xfs/xfs_log_cil.c | 54 ++++++++++++++++++++++++-------- fs/xfs/xfs_log_priv.h | 9 ++---- fs/xfs/xfs_log_recover.c | 19 ++++++------ fs/xfs/xfs_trace.c | 1 + fs/xfs/xfs_trace.h | 8 ++--- fs/xfs/xfs_trans_ail.c | 26 +++++++++++++--- fs/xfs/xfs_trans_priv.h | 13 ++++++++ 8 files changed, 102 insertions(+), 109 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index ae22f361627f..1977afecd385 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1230,47 +1230,6 @@ xfs_log_cover( return error; } -/* - * We may be holding the log iclog lock upon entering this routine. - */ -xfs_lsn_t -xlog_assign_tail_lsn_locked( - struct xfs_mount *mp) -{ - struct xlog *log = mp->m_log; - struct xfs_log_item *lip; - xfs_lsn_t tail_lsn; - - assert_spin_locked(&mp->m_ail->ail_lock); - - /* - * To make sure we always have a valid LSN for the log tail we keep - * track of the last LSN which was committed in log->l_last_sync_lsn, - * and use that when the AIL was empty. - */ - lip = xfs_ail_min(mp->m_ail); - if (lip) - tail_lsn = lip->li_lsn; - else - tail_lsn = atomic64_read(&log->l_last_sync_lsn); - trace_xfs_log_assign_tail_lsn(log, tail_lsn); - atomic64_set(&log->l_tail_lsn, tail_lsn); - return tail_lsn; -} - -xfs_lsn_t -xlog_assign_tail_lsn( - struct xfs_mount *mp) -{ - xfs_lsn_t tail_lsn; - - spin_lock(&mp->m_ail->ail_lock); - tail_lsn = xlog_assign_tail_lsn_locked(mp); - spin_unlock(&mp->m_ail->ail_lock); - - return tail_lsn; -} - /* * Return the space in the log between the tail and the head. The head * is passed in the cycle/bytes formal parms. In the special case where @@ -1501,7 +1460,6 @@ xlog_alloc_log( log->l_prev_block = -1; /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0); - xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0); log->l_curr_cycle = 1; /* 0 is bad since this is initial value */ if (xfs_has_logv2(mp) && mp->m_sb.sb_logsunit > 1) @@ -2549,44 +2507,23 @@ xlog_get_lowest_lsn( return lowest_lsn; } -/* - * Completion of a iclog IO does not imply that a transaction has completed, as - * transactions can be large enough to span many iclogs. We cannot change the - * tail of the log half way through a transaction as this may be the only - * transaction in the log and moving the tail to point to the middle of it - * will prevent recovery from finding the start of the transaction. Hence we - * should only update the last_sync_lsn if this iclog contains transaction - * completion callbacks on it. - * - * We have to do this before we drop the icloglock to ensure we are the only one - * that can update it. - * - * If we are moving the last_sync_lsn forwards, we also need to ensure we kick - * the reservation grant head pushing. This is due to the fact that the push - * target is bound by the current last_sync_lsn value. Hence if we have a large - * amount of log space bound up in this committing transaction then the - * last_sync_lsn value may be the limiting factor preventing tail pushing from - * freeing space in the log. Hence once we've updated the last_sync_lsn we - * should push the AIL to ensure the push target (and hence the grant head) is - * no longer bound by the old log head location and can move forwards and make - * progress again. - */ static void xlog_state_set_callback( struct xlog *log, struct xlog_in_core *iclog, xfs_lsn_t header_lsn) { + /* + * If there are no callbacks on this iclog, we can mark it clean + * immediately and return. Otherwise we need to run the + * callbacks. + */ + if (list_empty(&iclog->ic_callbacks)) { + xlog_state_clean_iclog(log, iclog); + return; + } trace_xlog_iclog_callback(iclog, _RET_IP_); iclog->ic_state = XLOG_STATE_CALLBACK; - - ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), - header_lsn) <= 0); - - if (list_empty_careful(&iclog->ic_callbacks)) - return; - - atomic64_set(&log->l_last_sync_lsn, header_lsn); } /* diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 141bde08bd6e..482955f1fa1f 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -721,6 +721,24 @@ xlog_cil_ail_insert_batch( * items into the AIL. This uses bulk insertion techniques to minimise AIL lock * traffic. * + * The AIL tracks log items via the start record LSN of the checkpoint, + * not the commit record LSN. This is because we can pipeline multiple + * checkpoints, and so the start record of checkpoint N+1 can be + * written before the commit record of checkpoint N. i.e: + * + * start N commit N + * +-------------+------------+----------------+ + * start N+1 commit N+1 + * + * The tail of the log cannot be moved to the LSN of commit N when all + * the items of that checkpoint are written back, because then the + * start record for N+1 is no longer in the active portion of the log + * and recovery will fail/corrupt the filesystem. + * + * Hence when all the log items in checkpoint N are written back, the + * tail of the log most now only move as far forwards as the start LSN + * of checkpoint N+1. + * * If we are called with the aborted flag set, it is because a log write during * a CIL checkpoint commit has failed. In this case, all the items in the * checkpoint have already gone through iop_committed and iop_committing, which @@ -738,24 +756,33 @@ xlog_cil_ail_insert_batch( */ static void xlog_cil_ail_insert( - struct xlog *log, - struct list_head *lv_chain, - xfs_lsn_t commit_lsn, + struct xfs_cil_ctx *ctx, bool aborted) { #define LOG_ITEM_BATCH_SIZE 32 - struct xfs_ail *ailp = log->l_ailp; + struct xfs_ail *ailp = ctx->cil->xc_log->l_ailp; struct xfs_log_item *log_items[LOG_ITEM_BATCH_SIZE]; struct xfs_log_vec *lv; struct xfs_ail_cursor cur; int i = 0; + /* + * Update the AIL head LSN with the commit record LSN of this + * checkpoint. As iclogs are always completed in order, this should + * always be the same (as iclogs can contain multiple commit records) or + * higher LSN than the current head. We do this before insertion of the + * items so that log space checks during insertion will reflect the + * space that this checkpoint has already consumed. + */ + ASSERT(XFS_LSN_CMP(ctx->commit_lsn, ailp->ail_head_lsn) >= 0 || + aborted); spin_lock(&ailp->ail_lock); - xfs_trans_ail_cursor_last(ailp, &cur, commit_lsn); + ailp->ail_head_lsn = ctx->commit_lsn; + xfs_trans_ail_cursor_last(ailp, &cur, ctx->start_lsn); spin_unlock(&ailp->ail_lock); /* unpin all the log items */ - list_for_each_entry(lv, lv_chain, lv_list) { + list_for_each_entry(lv, &ctx->lv_chain, lv_list) { struct xfs_log_item *lip = lv->lv_item; xfs_lsn_t item_lsn; @@ -768,9 +795,10 @@ xlog_cil_ail_insert( } if (lip->li_ops->iop_committed) - item_lsn = lip->li_ops->iop_committed(lip, commit_lsn); + item_lsn = lip->li_ops->iop_committed(lip, + ctx->start_lsn); else - item_lsn = commit_lsn; + item_lsn = ctx->start_lsn; /* item_lsn of -1 means the item needs no further processing */ if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) @@ -787,7 +815,7 @@ xlog_cil_ail_insert( continue; } - if (item_lsn != commit_lsn) { + if (item_lsn != ctx->start_lsn) { /* * Not a bulk update option due to unusual item_lsn. @@ -810,14 +838,15 @@ xlog_cil_ail_insert( log_items[i++] = lv->lv_item; if (i >= LOG_ITEM_BATCH_SIZE) { xlog_cil_ail_insert_batch(ailp, &cur, log_items, - LOG_ITEM_BATCH_SIZE, commit_lsn); + LOG_ITEM_BATCH_SIZE, ctx->start_lsn); i = 0; } } /* make sure we insert the remainder! */ if (i) - xlog_cil_ail_insert_batch(ailp, &cur, log_items, i, commit_lsn); + xlog_cil_ail_insert_batch(ailp, &cur, log_items, i, + ctx->start_lsn); spin_lock(&ailp->ail_lock); xfs_trans_ail_cursor_done(&cur); @@ -863,8 +892,7 @@ xlog_cil_committed( spin_unlock(&ctx->cil->xc_push_lock); } - xlog_cil_ail_insert(ctx->cil->xc_log, &ctx->lv_chain, - ctx->start_lsn, abort); + xlog_cil_ail_insert(ctx, abort); xfs_extent_busy_sort(&ctx->busy_extents.extent_list); xfs_extent_busy_clear(mp, &ctx->busy_extents.extent_list, diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 971871b84d84..4b8ef9260445 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -431,13 +431,10 @@ struct xlog { int l_prev_block; /* previous logical log block */ /* - * l_last_sync_lsn and l_tail_lsn are atomics so they can be set and - * read without needing to hold specific locks. To avoid operations - * contending with other hot objects, place each of them on a separate - * cacheline. + * l_tail_lsn is atomic so it can be set and read without needing to + * hold specific locks. To avoid operations contending with other hot + * objects, it on a separate cacheline. */ - /* lsn of last LR on disk */ - atomic64_t l_last_sync_lsn ____cacheline_aligned_in_smp; /* lsn of 1st LR with unflushed * buffers */ atomic64_t l_tail_lsn ____cacheline_aligned_in_smp; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 409b645ce799..0d4563bd129e 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1177,8 +1177,8 @@ xlog_check_unmount_rec( */ xlog_assign_atomic_lsn(&log->l_tail_lsn, log->l_curr_cycle, after_umount_blk); - xlog_assign_atomic_lsn(&log->l_last_sync_lsn, - log->l_curr_cycle, after_umount_blk); + log->l_ailp->ail_head_lsn = + atomic64_read(&log->l_tail_lsn); *tail_blk = after_umount_blk; *clean = true; @@ -1212,7 +1212,7 @@ xlog_set_state( if (bump_cycle) log->l_curr_cycle++; atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn)); - atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn)); + log->l_ailp->ail_head_lsn = be64_to_cpu(rhead->h_lsn); xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle, BBTOB(log->l_curr_block)); xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle, @@ -3366,14 +3366,13 @@ xlog_do_recover( /* * We now update the tail_lsn since much of the recovery has completed - * and there may be space available to use. If there were no extent - * or iunlinks, we can free up the entire log and set the tail_lsn to - * be the last_sync_lsn. This was set in xlog_find_tail to be the - * lsn of the last known good LR on disk. If there are extent frees - * or iunlinks they will have some entries in the AIL; so we look at - * the AIL to determine how to set the tail_lsn. + * and there may be space available to use. If there were no extent or + * iunlinks, we can free up the entire log. This was set in + * xlog_find_tail to be the lsn of the last known good LR on disk. If + * there are extent frees or iunlinks they will have some entries in the + * AIL; so we look at the AIL to determine how to set the tail_lsn. */ - xlog_assign_tail_lsn(mp); + xfs_ail_assign_tail_lsn(log->l_ailp); /* * Now that we've finished replaying all buffer and inode updates, diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index f98fb86ff8d7..2af9f274e872 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -22,6 +22,7 @@ #include "xfs_trans.h" #include "xfs_log.h" #include "xfs_log_priv.h" +#include "xfs_trans_priv.h" #include "xfs_buf_item.h" #include "xfs_quota.h" #include "xfs_dquot_item.h" diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 56c8333a470b..16e0635177ac 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1407,19 +1407,19 @@ TRACE_EVENT(xfs_log_assign_tail_lsn, __field(dev_t, dev) __field(xfs_lsn_t, new_lsn) __field(xfs_lsn_t, old_lsn) - __field(xfs_lsn_t, last_sync_lsn) + __field(xfs_lsn_t, head_lsn) ), TP_fast_assign( __entry->dev = log->l_mp->m_super->s_dev; __entry->new_lsn = new_lsn; __entry->old_lsn = atomic64_read(&log->l_tail_lsn); - __entry->last_sync_lsn = atomic64_read(&log->l_last_sync_lsn); + __entry->head_lsn = log->l_ailp->ail_head_lsn; ), - TP_printk("dev %d:%d new tail lsn %d/%d, old lsn %d/%d, last sync %d/%d", + TP_printk("dev %d:%d new tail lsn %d/%d, old lsn %d/%d, head lsn %d/%d", MAJOR(__entry->dev), MINOR(__entry->dev), CYCLE_LSN(__entry->new_lsn), BLOCK_LSN(__entry->new_lsn), CYCLE_LSN(__entry->old_lsn), BLOCK_LSN(__entry->old_lsn), - CYCLE_LSN(__entry->last_sync_lsn), BLOCK_LSN(__entry->last_sync_lsn)) + CYCLE_LSN(__entry->head_lsn), BLOCK_LSN(__entry->head_lsn)) ) DECLARE_EVENT_CLASS(xfs_file_class, diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 7d6ccd21aae2..5f03f82c4683 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -720,6 +720,26 @@ xfs_ail_push_all_sync( finish_wait(&ailp->ail_empty, &wait); } +void +__xfs_ail_assign_tail_lsn( + struct xfs_ail *ailp) +{ + struct xlog *log = ailp->ail_log; + xfs_lsn_t tail_lsn; + + assert_spin_locked(&ailp->ail_lock); + + if (xlog_is_shutdown(log)) + return; + + tail_lsn = __xfs_ail_min_lsn(ailp); + if (!tail_lsn) + tail_lsn = ailp->ail_head_lsn; + + trace_xfs_log_assign_tail_lsn(log, tail_lsn); + atomic64_set(&log->l_tail_lsn, tail_lsn); +} + /* * Callers should pass the original tail lsn so that we can detect if the tail * has moved as a result of the operation that was performed. If the caller @@ -734,15 +754,13 @@ xfs_ail_update_finish( { struct xlog *log = ailp->ail_log; - /* if the tail lsn hasn't changed, don't do updates or wakeups. */ + /* If the tail lsn hasn't changed, don't do updates or wakeups. */ if (!old_lsn || old_lsn == __xfs_ail_min_lsn(ailp)) { spin_unlock(&ailp->ail_lock); return; } - if (!xlog_is_shutdown(log)) - xlog_assign_tail_lsn_locked(log->l_mp); - + __xfs_ail_assign_tail_lsn(ailp); if (list_empty(&ailp->ail_head)) wake_up_all(&ailp->ail_empty); spin_unlock(&ailp->ail_lock); diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 60b4707c3a65..bd841df93021 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -55,6 +55,7 @@ struct xfs_ail { struct list_head ail_cursors; spinlock_t ail_lock; xfs_lsn_t ail_last_pushed_lsn; + xfs_lsn_t ail_head_lsn; int ail_log_flush; unsigned long ail_opstate; struct list_head ail_buf_list; @@ -130,6 +131,18 @@ struct xfs_log_item * xfs_trans_ail_cursor_next(struct xfs_ail *ailp, struct xfs_ail_cursor *cur); void xfs_trans_ail_cursor_done(struct xfs_ail_cursor *cur); +void __xfs_ail_assign_tail_lsn(struct xfs_ail *ailp); + +static inline void +xfs_ail_assign_tail_lsn( + struct xfs_ail *ailp) +{ + + spin_lock(&ailp->ail_lock); + __xfs_ail_assign_tail_lsn(ailp); + spin_unlock(&ailp->ail_lock); +} + #if BITS_PER_LONG != 64 static inline void xfs_trans_ail_copy_lsn( -- cgit v1.2.3-70-g09d2 From be5abd323bf4bee137d80d605ff30a7a66dad96d Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 20 Jun 2024 09:21:24 +0200 Subject: xfs: collapse xlog_state_set_callback in caller The function is called from a single place, and it isn't just setting the iclog state to XLOG_STATE_CALLBACK - it can mark iclogs clean, which moves them to states after CALLBACK. Hence the function is now badly named, and should just be folded into the caller where the iclog completion logic makes a whole lot more sense. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/xfs_log.c | 31 +++++++++++-------------------- 1 file changed, 11 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 1977afecd385..381d6143a787 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -2507,25 +2507,6 @@ xlog_get_lowest_lsn( return lowest_lsn; } -static void -xlog_state_set_callback( - struct xlog *log, - struct xlog_in_core *iclog, - xfs_lsn_t header_lsn) -{ - /* - * If there are no callbacks on this iclog, we can mark it clean - * immediately and return. Otherwise we need to run the - * callbacks. - */ - if (list_empty(&iclog->ic_callbacks)) { - xlog_state_clean_iclog(log, iclog); - return; - } - trace_xlog_iclog_callback(iclog, _RET_IP_); - iclog->ic_state = XLOG_STATE_CALLBACK; -} - /* * Return true if we need to stop processing, false to continue to the next * iclog. The caller will need to run callbacks if the iclog is returned in the @@ -2557,7 +2538,17 @@ xlog_state_iodone_process_iclog( lowest_lsn = xlog_get_lowest_lsn(log); if (lowest_lsn && XFS_LSN_CMP(lowest_lsn, header_lsn) < 0) return false; - xlog_state_set_callback(log, iclog, header_lsn); + /* + * If there are no callbacks on this iclog, we can mark it clean + * immediately and return. Otherwise we need to run the + * callbacks. + */ + if (list_empty(&iclog->ic_callbacks)) { + xlog_state_clean_iclog(log, iclog); + return false; + } + trace_xlog_iclog_callback(iclog, _RET_IP_); + iclog->ic_state = XLOG_STATE_CALLBACK; return false; default: /* -- cgit v1.2.3-70-g09d2 From 551bf13ba8b24a9b938e85061c9e03bd452ea28d Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 20 Jun 2024 09:21:25 +0200 Subject: xfs: track log space pinned by the AIL Currently we track space used in the log by grant heads. These store the reserved space as a physical log location and combine both space reserved for future use with space already used in the log in a single variable. The amount of space consumed in the log is then calculated as the distance between the log tail and the grant head. The problem with tracking the grant head as a physical location comes from the fact that it tracks both log cycle count and offset into the log in bytes in a single 64 bit variable. because the cycle count on disk is a 32 bit number, this also limits the offset into the log to 32 bits. ANd because that is in bytes, we are limited to being able to track only 2GB of log space in the grant head. Hence to support larger physical logs, we need to track used space differently in the grant head. We no longer use the grant head for guiding AIL pushing, so the only thing it is now used for is determining if we've run out of reservation space via the calculation in xlog_space_left(). What we really need to do is move the grant heads away from tracking physical space in the log. The issue here is that space consumed in the log is not directly tracked by the current mechanism - the space consumed in the log by grant head reservations gets returned to the free pool by the tail of the log moving forward. i.e. the space isn't directly tracked or calculated, but the used grant space gets "freed" as the physical limits of the log are updated without actually needing to update the grant heads. Hence to move away from implicit, zero-update log space tracking we need to explicitly track the amount of physical space the log actually consumes separately to the in-memory reservations for operations that will be committed to the journal. Luckily, we already track the information we need to calculate this in the AIL itself. That is, the space currently consumed by the journal is the maximum LSN that the AIL has seen minus the current log tail. As we update both of these items dynamically as the head and tail of the log moves, we always know exactly how much space the journal consumes. This means that we also know exactly how much space the currently active reservations require, and exactly how much free space we have remaining for new reservations to be made. Most importantly, we know what these spaces are indepedently of the physical locations of the head and tail of the log. Hence by separating out the physical space consumed by the journal, we can now track reservations in the grant heads purely as a byte count, and the log can be considered full when the tail space + reservation space exceeds the size of the log. This means we can use the full 64 bits of grant head space for reservation space, completely removing the 32 bit byte count limitation on log size that they impose. Hence the first step in this conversion is to track and update the "log tail space" every time the AIL tail or maximum seen LSN changes. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/xfs_log_cil.c | 9 ++++++--- fs/xfs/xfs_log_priv.h | 1 + fs/xfs/xfs_trans_ail.c | 9 ++++++--- 3 files changed, 13 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 482955f1fa1f..92ccac7f9054 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -772,14 +772,17 @@ xlog_cil_ail_insert( * always be the same (as iclogs can contain multiple commit records) or * higher LSN than the current head. We do this before insertion of the * items so that log space checks during insertion will reflect the - * space that this checkpoint has already consumed. + * space that this checkpoint has already consumed. We call + * xfs_ail_update_finish() so that tail space and space-based wakeups + * will be recalculated appropriately. */ ASSERT(XFS_LSN_CMP(ctx->commit_lsn, ailp->ail_head_lsn) >= 0 || aborted); spin_lock(&ailp->ail_lock); - ailp->ail_head_lsn = ctx->commit_lsn; xfs_trans_ail_cursor_last(ailp, &cur, ctx->start_lsn); - spin_unlock(&ailp->ail_lock); + ailp->ail_head_lsn = ctx->commit_lsn; + /* xfs_ail_update_finish() drops the ail_lock */ + xfs_ail_update_finish(ailp, NULLCOMMITLSN); /* unpin all the log items */ list_for_each_entry(lv, &ctx->lv_chain, lv_list) { diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 4b8ef9260445..289674598979 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -440,6 +440,7 @@ struct xlog { struct xlog_grant_head l_reserve_head; struct xlog_grant_head l_write_head; + uint64_t l_tail_space; struct xfs_kobj l_kobj; diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 5f03f82c4683..6a106a05fae0 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -736,6 +736,8 @@ __xfs_ail_assign_tail_lsn( if (!tail_lsn) tail_lsn = ailp->ail_head_lsn; + WRITE_ONCE(log->l_tail_space, + xlog_lsn_sub(log, ailp->ail_head_lsn, tail_lsn)); trace_xfs_log_assign_tail_lsn(log, tail_lsn); atomic64_set(&log->l_tail_lsn, tail_lsn); } @@ -743,9 +745,10 @@ __xfs_ail_assign_tail_lsn( /* * Callers should pass the original tail lsn so that we can detect if the tail * has moved as a result of the operation that was performed. If the caller - * needs to force a tail LSN update, it should pass NULLCOMMITLSN to bypass the - * "did the tail LSN change?" checks. If the caller wants to avoid a tail update - * (e.g. it knows the tail did not change) it should pass an @old_lsn of 0. + * needs to force a tail space update, it should pass NULLCOMMITLSN to bypass + * the "did the tail LSN change?" checks. If the caller wants to avoid a tail + * update (e.g. it knows the tail did not change) it should pass an @old_lsn of + * 0. */ void xfs_ail_update_finish( -- cgit v1.2.3-70-g09d2 From de302cea1e3b812e89a15b4eb349d063b2ab3aa1 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 20 Jun 2024 09:21:26 +0200 Subject: xfs: pass the full grant head to accounting functions Because we are going to need them soon. API change only, no logic changes. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/xfs_log.c | 157 +++++++++++++++++++++++++------------------------- fs/xfs/xfs_log_priv.h | 2 - 2 files changed, 77 insertions(+), 82 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 381d6143a787..0e50b370f0e4 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -136,10 +136,10 @@ xlog_prepare_iovec( static void xlog_grant_sub_space( struct xlog *log, - atomic64_t *head, + struct xlog_grant_head *head, int bytes) { - int64_t head_val = atomic64_read(head); + int64_t head_val = atomic64_read(&head->grant); int64_t new, old; do { @@ -155,17 +155,17 @@ xlog_grant_sub_space( old = head_val; new = xlog_assign_grant_head_val(cycle, space); - head_val = atomic64_cmpxchg(head, old, new); + head_val = atomic64_cmpxchg(&head->grant, old, new); } while (head_val != old); } static void xlog_grant_add_space( struct xlog *log, - atomic64_t *head, + struct xlog_grant_head *head, int bytes) { - int64_t head_val = atomic64_read(head); + int64_t head_val = atomic64_read(&head->grant); int64_t new, old; do { @@ -184,7 +184,7 @@ xlog_grant_add_space( old = head_val; new = xlog_assign_grant_head_val(cycle, space); - head_val = atomic64_cmpxchg(head, old, new); + head_val = atomic64_cmpxchg(&head->grant, old, new); } while (head_val != old); } @@ -197,6 +197,63 @@ xlog_grant_head_init( spin_lock_init(&head->lock); } +/* + * Return the space in the log between the tail and the head. The head + * is passed in the cycle/bytes formal parms. In the special case where + * the reserve head has wrapped passed the tail, this calculation is no + * longer valid. In this case, just return 0 which means there is no space + * in the log. This works for all places where this function is called + * with the reserve head. Of course, if the write head were to ever + * wrap the tail, we should blow up. Rather than catch this case here, + * we depend on other ASSERTions in other parts of the code. XXXmiken + * + * If reservation head is behind the tail, we have a problem. Warn about it, + * but then treat it as if the log is empty. + * + * If the log is shut down, the head and tail may be invalid or out of whack, so + * shortcut invalidity asserts in this case so that we don't trigger them + * falsely. + */ +static int +xlog_grant_space_left( + struct xlog *log, + struct xlog_grant_head *head) +{ + int tail_bytes; + int tail_cycle; + int head_cycle; + int head_bytes; + + xlog_crack_grant_head(&head->grant, &head_cycle, &head_bytes); + xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes); + tail_bytes = BBTOB(tail_bytes); + if (tail_cycle == head_cycle && head_bytes >= tail_bytes) + return log->l_logsize - (head_bytes - tail_bytes); + if (tail_cycle + 1 < head_cycle) + return 0; + + /* Ignore potential inconsistency when shutdown. */ + if (xlog_is_shutdown(log)) + return log->l_logsize; + + if (tail_cycle < head_cycle) { + ASSERT(tail_cycle == (head_cycle - 1)); + return tail_bytes - head_bytes; + } + + /* + * The reservation head is behind the tail. In this case we just want to + * return the size of the log as the amount of space left. + */ + xfs_alert(log->l_mp, "xlog_grant_space_left: head behind tail"); + xfs_alert(log->l_mp, " tail_cycle = %d, tail_bytes = %d", + tail_cycle, tail_bytes); + xfs_alert(log->l_mp, " GH cycle = %d, GH bytes = %d", + head_cycle, head_bytes); + ASSERT(0); + return log->l_logsize; +} + STATIC void xlog_grant_head_wake_all( struct xlog_grant_head *head) @@ -277,7 +334,7 @@ xlog_grant_head_wait( spin_lock(&head->lock); if (xlog_is_shutdown(log)) goto shutdown; - } while (xlog_space_left(log, &head->grant) < need_bytes); + } while (xlog_grant_space_left(log, head) < need_bytes); list_del_init(&tic->t_queue); return 0; @@ -322,7 +379,7 @@ xlog_grant_head_check( * otherwise try to get some space for this transaction. */ *need_bytes = xlog_ticket_reservation(log, head, tic); - free_bytes = xlog_space_left(log, &head->grant); + free_bytes = xlog_grant_space_left(log, head); if (!list_empty_careful(&head->waiters)) { spin_lock(&head->lock); if (!xlog_grant_head_wake(log, head, &free_bytes) || @@ -396,7 +453,7 @@ xfs_log_regrant( if (error) goto out_error; - xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes); + xlog_grant_add_space(log, &log->l_write_head, need_bytes); trace_xfs_log_regrant_exit(log, tic); xlog_verify_grant_tail(log); return 0; @@ -447,8 +504,8 @@ xfs_log_reserve( if (error) goto out_error; - xlog_grant_add_space(log, &log->l_reserve_head.grant, need_bytes); - xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes); + xlog_grant_add_space(log, &log->l_reserve_head, need_bytes); + xlog_grant_add_space(log, &log->l_write_head, need_bytes); trace_xfs_log_reserve_exit(log, tic); xlog_verify_grant_tail(log); return 0; @@ -1107,7 +1164,7 @@ xfs_log_space_wake( ASSERT(!xlog_in_recovery(log)); spin_lock(&log->l_write_head.lock); - free_bytes = xlog_space_left(log, &log->l_write_head.grant); + free_bytes = xlog_grant_space_left(log, &log->l_write_head); xlog_grant_head_wake(log, &log->l_write_head, &free_bytes); spin_unlock(&log->l_write_head.lock); } @@ -1116,7 +1173,7 @@ xfs_log_space_wake( ASSERT(!xlog_in_recovery(log)); spin_lock(&log->l_reserve_head.lock); - free_bytes = xlog_space_left(log, &log->l_reserve_head.grant); + free_bytes = xlog_grant_space_left(log, &log->l_reserve_head); xlog_grant_head_wake(log, &log->l_reserve_head, &free_bytes); spin_unlock(&log->l_reserve_head.lock); } @@ -1230,64 +1287,6 @@ xfs_log_cover( return error; } -/* - * Return the space in the log between the tail and the head. The head - * is passed in the cycle/bytes formal parms. In the special case where - * the reserve head has wrapped passed the tail, this calculation is no - * longer valid. In this case, just return 0 which means there is no space - * in the log. This works for all places where this function is called - * with the reserve head. Of course, if the write head were to ever - * wrap the tail, we should blow up. Rather than catch this case here, - * we depend on other ASSERTions in other parts of the code. XXXmiken - * - * If reservation head is behind the tail, we have a problem. Warn about it, - * but then treat it as if the log is empty. - * - * If the log is shut down, the head and tail may be invalid or out of whack, so - * shortcut invalidity asserts in this case so that we don't trigger them - * falsely. - */ -int -xlog_space_left( - struct xlog *log, - atomic64_t *head) -{ - int tail_bytes; - int tail_cycle; - int head_cycle; - int head_bytes; - - xlog_crack_grant_head(head, &head_cycle, &head_bytes); - xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes); - tail_bytes = BBTOB(tail_bytes); - if (tail_cycle == head_cycle && head_bytes >= tail_bytes) - return log->l_logsize - (head_bytes - tail_bytes); - if (tail_cycle + 1 < head_cycle) - return 0; - - /* Ignore potential inconsistency when shutdown. */ - if (xlog_is_shutdown(log)) - return log->l_logsize; - - if (tail_cycle < head_cycle) { - ASSERT(tail_cycle == (head_cycle - 1)); - return tail_bytes - head_bytes; - } - - /* - * The reservation head is behind the tail. In this case we just want to - * return the size of the log as the amount of space left. - */ - xfs_alert(log->l_mp, "xlog_space_left: head behind tail"); - xfs_alert(log->l_mp, " tail_cycle = %d, tail_bytes = %d", - tail_cycle, tail_bytes); - xfs_alert(log->l_mp, " GH cycle = %d, GH bytes = %d", - head_cycle, head_bytes); - ASSERT(0); - return log->l_logsize; -} - - static void xlog_ioend_work( struct work_struct *work) @@ -1881,8 +1880,8 @@ xlog_sync( if (ticket) { ticket->t_curr_res -= roundoff; } else { - xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff); - xlog_grant_add_space(log, &log->l_write_head.grant, roundoff); + xlog_grant_add_space(log, &log->l_reserve_head, roundoff); + xlog_grant_add_space(log, &log->l_write_head, roundoff); } /* put cycle number in every block */ @@ -2802,17 +2801,15 @@ xfs_log_ticket_regrant( if (ticket->t_cnt > 0) ticket->t_cnt--; - xlog_grant_sub_space(log, &log->l_reserve_head.grant, - ticket->t_curr_res); - xlog_grant_sub_space(log, &log->l_write_head.grant, - ticket->t_curr_res); + xlog_grant_sub_space(log, &log->l_reserve_head, ticket->t_curr_res); + xlog_grant_sub_space(log, &log->l_write_head, ticket->t_curr_res); ticket->t_curr_res = ticket->t_unit_res; trace_xfs_log_ticket_regrant_sub(log, ticket); /* just return if we still have some of the pre-reserved space */ if (!ticket->t_cnt) { - xlog_grant_add_space(log, &log->l_reserve_head.grant, + xlog_grant_add_space(log, &log->l_reserve_head, ticket->t_unit_res); trace_xfs_log_ticket_regrant_exit(log, ticket); @@ -2860,8 +2857,8 @@ xfs_log_ticket_ungrant( bytes += ticket->t_unit_res*ticket->t_cnt; } - xlog_grant_sub_space(log, &log->l_reserve_head.grant, bytes); - xlog_grant_sub_space(log, &log->l_write_head.grant, bytes); + xlog_grant_sub_space(log, &log->l_reserve_head, bytes); + xlog_grant_sub_space(log, &log->l_write_head, bytes); trace_xfs_log_ticket_ungrant_exit(log, ticket); diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 289674598979..0838c57ca8ac 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -573,8 +573,6 @@ xlog_assign_grant_head(atomic64_t *head, int cycle, int space) atomic64_set(head, xlog_assign_grant_head_val(cycle, space)); } -int xlog_space_left(struct xlog *log, atomic64_t *head); - /* * Committed Item List interfaces */ -- cgit v1.2.3-70-g09d2 From c1220522ef405a9ebf19447330c9e9de5dfc649c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 20 Jun 2024 09:21:27 +0200 Subject: xfs: grant heads track byte counts, not LSNs The grant heads in the log track the space reserved in the log for running transactions. They do this by tracking how far ahead of the tail that the reservation has reached, and the units for doing this are {cycle,bytes} for the reserve head rather than {cycle,blocks} which are normal used by LSNs. This is annoyingly complex because we have to split, crack and combined these tuples for any calculation we do to determine log space and targets. This is computationally expensive as well as difficult to do atomically and locklessly, as well as limiting the size of the log to 2^32 bytes. Really, though, all the grant heads are tracking is how much space is currently available for use in the log. We can track this as a simply byte count - we just don't care what the actual physical location in the log the head and tail are at, just how much space we have remaining before the head and tail overlap. So, convert the grant heads to track the byte reservations that are active rather than the current (cycle, offset) tuples. This means an empty log has zero bytes consumed, and a full log is when the reservations reach the size of the log minus the space consumed by the AIL. This greatly simplifies the accounting and checks for whether there is space available. We no longer need to crack or combine LSNs to determine how much space the log has left, nor do we need to look at the head or tail of the log to determine how close to full we are. There is, however, a complexity that needs to be handled. We know how much space is being tracked in the AIL now via log->l_tail_space and the log tickets track active reservations and return the unused portions to the grant heads when ungranted. Unfortunately, we don't track the used portion of the grant, so when we transfer log items from the CIL to the AIL, the space accounted to the grant heads is transferred to the log tail space. Hence when we move the AIL head forwards on item insert, we have to remove that space from the grant heads. We also remove the xlog_verify_grant_tail() debug function as it is no longer useful. The check it performs has been racy since delayed logging was introduced, but now it is clearly only detecting false positives so remove it. The result of this substantially simpler accounting algorithm is an increase in sustained transaction rate from ~1.3 million transactions/s to ~1.9 million transactions/s with no increase in CPU usage. We also remove the 32 bit space limitation on the grant heads, which will allow us to increase the journal size beyond 2GB in future. Note that this renames the sysfs files exposing the log grant space now that the values are exported in bytes. This allows xfstests to auto-detect the old or new ABI. [hch: move xlog_grant_sub_space out of line, update the xlog_grant_{add,sub}_space prototypes, rename the sysfs files to allow auto-detection in xfstests] Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Chandan Babu R --- Documentation/ABI/testing/sysfs-fs-xfs | 18 ++- fs/xfs/xfs_log.c | 246 ++++++++++++--------------------- fs/xfs/xfs_log_cil.c | 12 ++ fs/xfs/xfs_log_priv.h | 33 +---- fs/xfs/xfs_log_recover.c | 4 - fs/xfs/xfs_sysfs.c | 29 ++-- fs/xfs/xfs_trace.h | 34 ++--- 7 files changed, 138 insertions(+), 238 deletions(-) (limited to 'fs') diff --git a/Documentation/ABI/testing/sysfs-fs-xfs b/Documentation/ABI/testing/sysfs-fs-xfs index 82d8e2f79834..7da4de948b46 100644 --- a/Documentation/ABI/testing/sysfs-fs-xfs +++ b/Documentation/ABI/testing/sysfs-fs-xfs @@ -15,25 +15,23 @@ Description: The log sequence number (LSN) of the current tail of the log. The LSN is exported in "cycle:basic block" format. -What: /sys/fs/xfs//log/reserve_grant_head -Date: July 2014 -KernelVersion: 3.17 +What: /sys/fs/xfs//log/reserve_grant_head_bytes +Date: June 2024 +KernelVersion: 6.11 Contact: linux-xfs@vger.kernel.org Description: The current state of the log reserve grant head. It represents the total log reservation of all currently - outstanding transactions. The grant head is exported in - "cycle:bytes" format. + outstanding transactions in bytes. Users: xfstests -What: /sys/fs/xfs//log/write_grant_head -Date: July 2014 -KernelVersion: 3.17 +What: /sys/fs/xfs//log/write_grant_head_bytes +Date: June 2024 +KernelVersion: 6.11 Contact: linux-xfs@vger.kernel.org Description: The current state of the log write grant head. It represents the total log reservation of all currently outstanding transactions, including regrants due to - rolling transactions. The grant head is exported in - "cycle:bytes" format. + rolling transactions in bytes. Users: xfstests diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 0e50b370f0e4..817ea7e0a8ab 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -53,9 +53,6 @@ xlog_sync( struct xlog_ticket *ticket); #if defined(DEBUG) STATIC void -xlog_verify_grant_tail( - struct xlog *log); -STATIC void xlog_verify_iclog( struct xlog *log, struct xlog_in_core *iclog, @@ -65,7 +62,6 @@ xlog_verify_tail_lsn( struct xlog *log, struct xlog_in_core *iclog); #else -#define xlog_verify_grant_tail(a) #define xlog_verify_iclog(a,b,c) #define xlog_verify_tail_lsn(a,b) #endif @@ -133,125 +129,64 @@ xlog_prepare_iovec( return buf; } -static void +static inline void xlog_grant_sub_space( - struct xlog *log, struct xlog_grant_head *head, - int bytes) + int64_t bytes) { - int64_t head_val = atomic64_read(&head->grant); - int64_t new, old; - - do { - int cycle, space; - - xlog_crack_grant_head_val(head_val, &cycle, &space); - - space -= bytes; - if (space < 0) { - space += log->l_logsize; - cycle--; - } - - old = head_val; - new = xlog_assign_grant_head_val(cycle, space); - head_val = atomic64_cmpxchg(&head->grant, old, new); - } while (head_val != old); + atomic64_sub(bytes, &head->grant); } -static void +static inline void xlog_grant_add_space( - struct xlog *log, struct xlog_grant_head *head, - int bytes) + int64_t bytes) { - int64_t head_val = atomic64_read(&head->grant); - int64_t new, old; - - do { - int tmp; - int cycle, space; - - xlog_crack_grant_head_val(head_val, &cycle, &space); - - tmp = log->l_logsize - space; - if (tmp > bytes) - space += bytes; - else { - space = bytes - tmp; - cycle++; - } - - old = head_val; - new = xlog_assign_grant_head_val(cycle, space); - head_val = atomic64_cmpxchg(&head->grant, old, new); - } while (head_val != old); + atomic64_add(bytes, &head->grant); } -STATIC void +static void xlog_grant_head_init( struct xlog_grant_head *head) { - xlog_assign_grant_head(&head->grant, 1, 0); + atomic64_set(&head->grant, 0); INIT_LIST_HEAD(&head->waiters); spin_lock_init(&head->lock); } +void +xlog_grant_return_space( + struct xlog *log, + xfs_lsn_t old_head, + xfs_lsn_t new_head) +{ + int64_t diff = xlog_lsn_sub(log, new_head, old_head); + + xlog_grant_sub_space(&log->l_reserve_head, diff); + xlog_grant_sub_space(&log->l_write_head, diff); +} + /* - * Return the space in the log between the tail and the head. The head - * is passed in the cycle/bytes formal parms. In the special case where - * the reserve head has wrapped passed the tail, this calculation is no - * longer valid. In this case, just return 0 which means there is no space - * in the log. This works for all places where this function is called - * with the reserve head. Of course, if the write head were to ever - * wrap the tail, we should blow up. Rather than catch this case here, - * we depend on other ASSERTions in other parts of the code. XXXmiken - * - * If reservation head is behind the tail, we have a problem. Warn about it, - * but then treat it as if the log is empty. - * - * If the log is shut down, the head and tail may be invalid or out of whack, so - * shortcut invalidity asserts in this case so that we don't trigger them - * falsely. + * Return the space in the log between the tail and the head. In the case where + * we have overrun available reservation space, return 0. The memory barrier + * pairs with the smp_wmb() in xlog_cil_ail_insert() to ensure that grant head + * vs tail space updates are seen in the correct order and hence avoid + * transients as space is transferred from the grant heads to the AIL on commit + * completion. */ -static int +static uint64_t xlog_grant_space_left( struct xlog *log, struct xlog_grant_head *head) { - int tail_bytes; - int tail_cycle; - int head_cycle; - int head_bytes; - - xlog_crack_grant_head(&head->grant, &head_cycle, &head_bytes); - xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes); - tail_bytes = BBTOB(tail_bytes); - if (tail_cycle == head_cycle && head_bytes >= tail_bytes) - return log->l_logsize - (head_bytes - tail_bytes); - if (tail_cycle + 1 < head_cycle) - return 0; - - /* Ignore potential inconsistency when shutdown. */ - if (xlog_is_shutdown(log)) - return log->l_logsize; - - if (tail_cycle < head_cycle) { - ASSERT(tail_cycle == (head_cycle - 1)); - return tail_bytes - head_bytes; - } + int64_t free_bytes; - /* - * The reservation head is behind the tail. In this case we just want to - * return the size of the log as the amount of space left. - */ - xfs_alert(log->l_mp, "xlog_grant_space_left: head behind tail"); - xfs_alert(log->l_mp, " tail_cycle = %d, tail_bytes = %d", - tail_cycle, tail_bytes); - xfs_alert(log->l_mp, " GH cycle = %d, GH bytes = %d", - head_cycle, head_bytes); - ASSERT(0); - return log->l_logsize; + smp_rmb(); /* paired with smp_wmb in xlog_cil_ail_insert() */ + free_bytes = log->l_logsize - READ_ONCE(log->l_tail_space) - + atomic64_read(&head->grant); + if (free_bytes > 0) + return free_bytes; + return 0; } STATIC void @@ -453,9 +388,8 @@ xfs_log_regrant( if (error) goto out_error; - xlog_grant_add_space(log, &log->l_write_head, need_bytes); + xlog_grant_add_space(&log->l_write_head, need_bytes); trace_xfs_log_regrant_exit(log, tic); - xlog_verify_grant_tail(log); return 0; out_error: @@ -504,10 +438,9 @@ xfs_log_reserve( if (error) goto out_error; - xlog_grant_add_space(log, &log->l_reserve_head, need_bytes); - xlog_grant_add_space(log, &log->l_write_head, need_bytes); + xlog_grant_add_space(&log->l_reserve_head, need_bytes); + xlog_grant_add_space(&log->l_write_head, need_bytes); trace_xfs_log_reserve_exit(log, tic); - xlog_verify_grant_tail(log); return 0; out_error: @@ -1880,8 +1813,8 @@ xlog_sync( if (ticket) { ticket->t_curr_res -= roundoff; } else { - xlog_grant_add_space(log, &log->l_reserve_head, roundoff); - xlog_grant_add_space(log, &log->l_write_head, roundoff); + xlog_grant_add_space(&log->l_reserve_head, roundoff); + xlog_grant_add_space(&log->l_write_head, roundoff); } /* put cycle number in every block */ @@ -2801,16 +2734,15 @@ xfs_log_ticket_regrant( if (ticket->t_cnt > 0) ticket->t_cnt--; - xlog_grant_sub_space(log, &log->l_reserve_head, ticket->t_curr_res); - xlog_grant_sub_space(log, &log->l_write_head, ticket->t_curr_res); + xlog_grant_sub_space(&log->l_reserve_head, ticket->t_curr_res); + xlog_grant_sub_space(&log->l_write_head, ticket->t_curr_res); ticket->t_curr_res = ticket->t_unit_res; trace_xfs_log_ticket_regrant_sub(log, ticket); /* just return if we still have some of the pre-reserved space */ if (!ticket->t_cnt) { - xlog_grant_add_space(log, &log->l_reserve_head, - ticket->t_unit_res); + xlog_grant_add_space(&log->l_reserve_head, ticket->t_unit_res); trace_xfs_log_ticket_regrant_exit(log, ticket); ticket->t_curr_res = ticket->t_unit_res; @@ -2857,8 +2789,8 @@ xfs_log_ticket_ungrant( bytes += ticket->t_unit_res*ticket->t_cnt; } - xlog_grant_sub_space(log, &log->l_reserve_head, bytes); - xlog_grant_sub_space(log, &log->l_write_head, bytes); + xlog_grant_sub_space(&log->l_reserve_head, bytes); + xlog_grant_sub_space(&log->l_write_head, bytes); trace_xfs_log_ticket_ungrant_exit(log, ticket); @@ -3331,42 +3263,27 @@ xlog_ticket_alloc( } #if defined(DEBUG) -/* - * Check to make sure the grant write head didn't just over lap the tail. If - * the cycles are the same, we can't be overlapping. Otherwise, make sure that - * the cycles differ by exactly one and check the byte count. - * - * This check is run unlocked, so can give false positives. Rather than assert - * on failures, use a warn-once flag and a panic tag to allow the admin to - * determine if they want to panic the machine when such an error occurs. For - * debug kernels this will have the same effect as using an assert but, unlinke - * an assert, it can be turned off at runtime. - */ -STATIC void -xlog_verify_grant_tail( - struct xlog *log) +static void +xlog_verify_dump_tail( + struct xlog *log, + struct xlog_in_core *iclog) { - int tail_cycle, tail_blocks; - int cycle, space; - - xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &space); - xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks); - if (tail_cycle != cycle) { - if (cycle - 1 != tail_cycle && - !test_and_set_bit(XLOG_TAIL_WARN, &log->l_opstate)) { - xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, - "%s: cycle - 1 != tail_cycle", __func__); - } - - if (space > BBTOB(tail_blocks) && - !test_and_set_bit(XLOG_TAIL_WARN, &log->l_opstate)) { - xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, - "%s: space > BBTOB(tail_blocks)", __func__); - } - } -} - -/* check if it will fit */ + xfs_alert(log->l_mp, +"ran out of log space tail 0x%llx/0x%llx, head lsn 0x%llx, head 0x%x/0x%x, prev head 0x%x/0x%x", + iclog ? be64_to_cpu(iclog->ic_header.h_tail_lsn) : -1, + atomic64_read(&log->l_tail_lsn), + log->l_ailp->ail_head_lsn, + log->l_curr_cycle, log->l_curr_block, + log->l_prev_cycle, log->l_prev_block); + xfs_alert(log->l_mp, +"write grant 0x%llx, reserve grant 0x%llx, tail_space 0x%llx, size 0x%x, iclog flags 0x%x", + atomic64_read(&log->l_write_head.grant), + atomic64_read(&log->l_reserve_head.grant), + log->l_tail_space, log->l_logsize, + iclog ? iclog->ic_flags : -1); +} + +/* Check if the new iclog will fit in the log. */ STATIC void xlog_verify_tail_lsn( struct xlog *log, @@ -3375,21 +3292,34 @@ xlog_verify_tail_lsn( xfs_lsn_t tail_lsn = be64_to_cpu(iclog->ic_header.h_tail_lsn); int blocks; - if (CYCLE_LSN(tail_lsn) == log->l_prev_cycle) { - blocks = - log->l_logBBsize - (log->l_prev_block - BLOCK_LSN(tail_lsn)); - if (blocks < BTOBB(iclog->ic_offset)+BTOBB(log->l_iclog_hsize)) - xfs_emerg(log->l_mp, "%s: ran out of log space", __func__); - } else { - ASSERT(CYCLE_LSN(tail_lsn)+1 == log->l_prev_cycle); + if (CYCLE_LSN(tail_lsn) == log->l_prev_cycle) { + blocks = log->l_logBBsize - + (log->l_prev_block - BLOCK_LSN(tail_lsn)); + if (blocks < BTOBB(iclog->ic_offset) + + BTOBB(log->l_iclog_hsize)) { + xfs_emerg(log->l_mp, + "%s: ran out of log space", __func__); + xlog_verify_dump_tail(log, iclog); + } + return; + } - if (BLOCK_LSN(tail_lsn) == log->l_prev_block) + if (CYCLE_LSN(tail_lsn) + 1 != log->l_prev_cycle) { + xfs_emerg(log->l_mp, "%s: head has wrapped tail.", __func__); + xlog_verify_dump_tail(log, iclog); + return; + } + if (BLOCK_LSN(tail_lsn) == log->l_prev_block) { xfs_emerg(log->l_mp, "%s: tail wrapped", __func__); + xlog_verify_dump_tail(log, iclog); + return; + } blocks = BLOCK_LSN(tail_lsn) - log->l_prev_block; - if (blocks < BTOBB(iclog->ic_offset) + 1) - xfs_emerg(log->l_mp, "%s: ran out of log space", __func__); - } + if (blocks < BTOBB(iclog->ic_offset) + 1) { + xfs_emerg(log->l_mp, "%s: ran out of iclog space", __func__); + xlog_verify_dump_tail(log, iclog); + } } /* diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 92ccac7f9054..391a938d690c 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -764,6 +764,7 @@ xlog_cil_ail_insert( struct xfs_log_item *log_items[LOG_ITEM_BATCH_SIZE]; struct xfs_log_vec *lv; struct xfs_ail_cursor cur; + xfs_lsn_t old_head; int i = 0; /* @@ -780,10 +781,21 @@ xlog_cil_ail_insert( aborted); spin_lock(&ailp->ail_lock); xfs_trans_ail_cursor_last(ailp, &cur, ctx->start_lsn); + old_head = ailp->ail_head_lsn; ailp->ail_head_lsn = ctx->commit_lsn; /* xfs_ail_update_finish() drops the ail_lock */ xfs_ail_update_finish(ailp, NULLCOMMITLSN); + /* + * We move the AIL head forwards to account for the space used in the + * log before we remove that space from the grant heads. This prevents a + * transient condition where reservation space appears to become + * available on return, only for it to disappear again immediately as + * the AIL head update accounts in the log tail space. + */ + smp_wmb(); /* paired with smp_rmb in xlog_grant_space_left */ + xlog_grant_return_space(ailp->ail_log, old_head, ailp->ail_head_lsn); + /* unpin all the log items */ list_for_each_entry(lv, &ctx->lv_chain, lv_list) { struct xfs_log_item *lip = lv->lv_item; diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 0838c57ca8ac..b8778a4fd6b6 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -543,36 +543,6 @@ xlog_assign_atomic_lsn(atomic64_t *lsn, uint cycle, uint block) atomic64_set(lsn, xlog_assign_lsn(cycle, block)); } -/* - * When we crack the grant head, we sample it first so that the value will not - * change while we are cracking it into the component values. This means we - * will always get consistent component values to work from. - */ -static inline void -xlog_crack_grant_head_val(int64_t val, int *cycle, int *space) -{ - *cycle = val >> 32; - *space = val & 0xffffffff; -} - -static inline void -xlog_crack_grant_head(atomic64_t *head, int *cycle, int *space) -{ - xlog_crack_grant_head_val(atomic64_read(head), cycle, space); -} - -static inline int64_t -xlog_assign_grant_head_val(int cycle, int space) -{ - return ((int64_t)cycle << 32) | space; -} - -static inline void -xlog_assign_grant_head(atomic64_t *head, int cycle, int space) -{ - atomic64_set(head, xlog_assign_grant_head_val(cycle, space)); -} - /* * Committed Item List interfaces */ @@ -639,6 +609,9 @@ xlog_lsn_sub( return (uint64_t)log->l_logsize - BBTOB(lo_block - hi_block); } +void xlog_grant_return_space(struct xlog *log, xfs_lsn_t old_head, + xfs_lsn_t new_head); + /* * The LSN is valid so long as it is behind the current LSN. If it isn't, this * means that the next log record that includes this metadata could have a diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 0d4563bd129e..4423dd344239 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1213,10 +1213,6 @@ xlog_set_state( log->l_curr_cycle++; atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn)); log->l_ailp->ail_head_lsn = be64_to_cpu(rhead->h_lsn); - xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle, - BBTOB(log->l_curr_block)); - xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle, - BBTOB(log->l_curr_block)); } /* diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index d2391eec37fe..60cb5318fdae 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -432,39 +432,30 @@ log_tail_lsn_show( XFS_SYSFS_ATTR_RO(log_tail_lsn); STATIC ssize_t -reserve_grant_head_show( +reserve_grant_head_bytes_show( struct kobject *kobject, char *buf) - { - int cycle; - int bytes; - struct xlog *log = to_xlog(kobject); - - xlog_crack_grant_head(&log->l_reserve_head.grant, &cycle, &bytes); - return sysfs_emit(buf, "%d:%d\n", cycle, bytes); + return sysfs_emit(buf, "%lld\n", + atomic64_read(&to_xlog(kobject)->l_reserve_head.grant)); } -XFS_SYSFS_ATTR_RO(reserve_grant_head); +XFS_SYSFS_ATTR_RO(reserve_grant_head_bytes); STATIC ssize_t -write_grant_head_show( +write_grant_head_bytes_show( struct kobject *kobject, char *buf) { - int cycle; - int bytes; - struct xlog *log = to_xlog(kobject); - - xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &bytes); - return sysfs_emit(buf, "%d:%d\n", cycle, bytes); + return sysfs_emit(buf, "%lld\n", + atomic64_read(&to_xlog(kobject)->l_write_head.grant)); } -XFS_SYSFS_ATTR_RO(write_grant_head); +XFS_SYSFS_ATTR_RO(write_grant_head_bytes); static struct attribute *xfs_log_attrs[] = { ATTR_LIST(log_head_lsn), ATTR_LIST(log_tail_lsn), - ATTR_LIST(reserve_grant_head), - ATTR_LIST(write_grant_head), + ATTR_LIST(reserve_grant_head_bytes), + ATTR_LIST(write_grant_head_bytes), NULL, }; ATTRIBUTE_GROUPS(xfs_log); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 16e0635177ac..5646d300b286 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1230,6 +1230,7 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class, TP_ARGS(log, tic), TP_STRUCT__entry( __field(dev_t, dev) + __field(unsigned long, tic) __field(char, ocnt) __field(char, cnt) __field(int, curr_res) @@ -1237,16 +1238,16 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class, __field(unsigned int, flags) __field(int, reserveq) __field(int, writeq) - __field(int, grant_reserve_cycle) - __field(int, grant_reserve_bytes) - __field(int, grant_write_cycle) - __field(int, grant_write_bytes) + __field(uint64_t, grant_reserve_bytes) + __field(uint64_t, grant_write_bytes) + __field(uint64_t, tail_space) __field(int, curr_cycle) __field(int, curr_block) __field(xfs_lsn_t, tail_lsn) ), TP_fast_assign( __entry->dev = log->l_mp->m_super->s_dev; + __entry->tic = (unsigned long)tic; __entry->ocnt = tic->t_ocnt; __entry->cnt = tic->t_cnt; __entry->curr_res = tic->t_curr_res; @@ -1254,23 +1255,22 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class, __entry->flags = tic->t_flags; __entry->reserveq = list_empty(&log->l_reserve_head.waiters); __entry->writeq = list_empty(&log->l_write_head.waiters); - xlog_crack_grant_head(&log->l_reserve_head.grant, - &__entry->grant_reserve_cycle, - &__entry->grant_reserve_bytes); - xlog_crack_grant_head(&log->l_write_head.grant, - &__entry->grant_write_cycle, - &__entry->grant_write_bytes); + __entry->tail_space = READ_ONCE(log->l_tail_space); + __entry->grant_reserve_bytes = __entry->tail_space + + atomic64_read(&log->l_reserve_head.grant); + __entry->grant_write_bytes = __entry->tail_space + + atomic64_read(&log->l_write_head.grant); __entry->curr_cycle = log->l_curr_cycle; __entry->curr_block = log->l_curr_block; __entry->tail_lsn = atomic64_read(&log->l_tail_lsn); ), - TP_printk("dev %d:%d t_ocnt %u t_cnt %u t_curr_res %u " - "t_unit_res %u t_flags %s reserveq %s " - "writeq %s grant_reserve_cycle %d " - "grant_reserve_bytes %d grant_write_cycle %d " - "grant_write_bytes %d curr_cycle %d curr_block %d " + TP_printk("dev %d:%d tic 0x%lx t_ocnt %u t_cnt %u t_curr_res %u " + "t_unit_res %u t_flags %s reserveq %s writeq %s " + "tail space %llu grant_reserve_bytes %llu " + "grant_write_bytes %llu curr_cycle %d curr_block %d " "tail_cycle %d tail_block %d", MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->tic, __entry->ocnt, __entry->cnt, __entry->curr_res, @@ -1278,9 +1278,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class, __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS), __entry->reserveq ? "empty" : "active", __entry->writeq ? "empty" : "active", - __entry->grant_reserve_cycle, + __entry->tail_space, __entry->grant_reserve_bytes, - __entry->grant_write_cycle, __entry->grant_write_bytes, __entry->curr_cycle, __entry->curr_block, @@ -1308,6 +1307,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant); DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_sub); DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_exit); DEFINE_LOGGRANT_EVENT(xfs_log_cil_wait); +DEFINE_LOGGRANT_EVENT(xfs_log_cil_return); DECLARE_EVENT_CLASS(xfs_log_item_class, TP_PROTO(struct xfs_log_item *lip), -- cgit v1.2.3-70-g09d2 From f3f7ae68a4ea23aa9c49530733f1faaa6996b03a Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 20 Jun 2024 09:21:28 +0200 Subject: xfs: skip flushing log items during push The AIL pushing code spends a huge amount of time skipping over items that are already marked as flushing. It is not uncommon to see hundreds of thousands of items skipped every second due to inode clustering marking all the inodes in a cluster as flushing when the first one is flushed. However, to discover an item is already flushing and should be skipped we have to call the iop_push() method for it to try to flush the item. For inodes (where this matters most), we have to first check that inode is flushable first. We can optimise this overhead away by tracking whether the log item is flushing internally. This allows xfsaild_push() to check the log item directly for flushing state and immediately skip the log item. Whilst this doesn't remove the CPU cache misses for loading the log item, it does avoid the overhead of an indirect function call and the cache misses involved in accessing inode and backing cluster buffer structures to determine flushing state. When trying to flush hundreds of thousands of inodes each second, this CPU overhead saving adds up quickly. It's so noticeable that the biggest issue with pushing on the AIL on fast storage becomes the 10ms back-off wait when we hit enough pinned buffers to break out of the push loop but not enough for the AIL pushing to be considered stuck. This limits the xfsaild to about 70% total CPU usage, and on fast storage this isn't enough to keep the storage 100% busy. The xfsaild will block on IO submission on slow storage and so is self throttling - it does not need a backoff in the case where we are really just breaking out of the walk to submit the IO we have gathered. Further with no backoff we don't need to gather huge delwri lists to mitigate the impact of backoffs, so we can submit IO more frequently and reduce the time log items spend in flushing state by breaking out of the item push loop once we've gathered enough IO to batch submission effectively. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/xfs_inode.c | 1 + fs/xfs/xfs_inode_item.c | 6 +++++- fs/xfs/xfs_trans.h | 4 +++- fs/xfs/xfs_trans_ail.c | 8 +++++++- 4 files changed, 16 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 62ca6c75117c..7dc6f326936c 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2555,6 +2555,7 @@ flush_out: iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; iip->ili_fsync_fields = 0; + set_bit(XFS_LI_FLUSHING, &iip->ili_item.li_flags); spin_unlock(&iip->ili_lock); /* diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index ef05cbbe116c..b509cbd191f4 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -965,6 +965,7 @@ xfs_iflush_finish( } iip->ili_last_fields = 0; iip->ili_flush_lsn = 0; + clear_bit(XFS_LI_FLUSHING, &lip->li_flags); spin_unlock(&iip->ili_lock); xfs_iflags_clear(iip->ili_inode, XFS_IFLUSHING); if (drop_buffer) @@ -1023,8 +1024,10 @@ xfs_buf_inode_io_fail( { struct xfs_log_item *lip; - list_for_each_entry(lip, &bp->b_li_list, li_bio_list) + list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { set_bit(XFS_LI_FAILED, &lip->li_flags); + clear_bit(XFS_LI_FLUSHING, &lip->li_flags); + } } /* @@ -1043,6 +1046,7 @@ xfs_iflush_abort_clean( iip->ili_flush_lsn = 0; iip->ili_item.li_buf = NULL; list_del_init(&iip->ili_item.li_bio_list); + clear_bit(XFS_LI_FLUSHING, &iip->ili_item.li_flags); } /* diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index f97e8c68641f..f06cc0f41665 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -58,13 +58,15 @@ struct xfs_log_item { #define XFS_LI_FAILED 2 #define XFS_LI_DIRTY 3 #define XFS_LI_WHITEOUT 4 +#define XFS_LI_FLUSHING 5 #define XFS_LI_FLAGS \ { (1u << XFS_LI_IN_AIL), "IN_AIL" }, \ { (1u << XFS_LI_ABORTED), "ABORTED" }, \ { (1u << XFS_LI_FAILED), "FAILED" }, \ { (1u << XFS_LI_DIRTY), "DIRTY" }, \ - { (1u << XFS_LI_WHITEOUT), "WHITEOUT" } + { (1u << XFS_LI_WHITEOUT), "WHITEOUT" }, \ + { (1u << XFS_LI_FLUSHING), "FLUSHING" } struct xfs_item_ops { unsigned flags; diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 6a106a05fae0..0fafcc9f3dbe 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -512,6 +512,9 @@ xfsaild_push( while ((XFS_LSN_CMP(lip->li_lsn, ailp->ail_target) <= 0)) { int lock_result; + if (test_bit(XFS_LI_FLUSHING, &lip->li_flags)) + goto next_item; + /* * Note that iop_push may unlock and reacquire the AIL lock. We * rely on the AIL cursor implementation to be able to deal with @@ -581,9 +584,12 @@ xfsaild_push( if (stuck > 100) break; +next_item: lip = xfs_trans_ail_cursor_next(ailp, &cur); if (lip == NULL) break; + if (lip->li_lsn != lsn && count > 1000) + break; lsn = lip->li_lsn; } @@ -620,7 +626,7 @@ out_done: /* * Assume we have more work to do in a short while. */ - tout = 10; + tout = 0; } return tout; -- cgit v1.2.3-70-g09d2 From 49cdc4e834e46d7c11a91d7adcfa04f56d19efaf Mon Sep 17 00:00:00 2001 From: Long Li Date: Wed, 3 Jul 2024 14:42:26 +0800 Subject: xfs: get rid of xfs_ag_resv_rmapbt_alloc The pag in xfs_ag_resv_rmapbt_alloc() is already held when the struct xfs_btree_cur is initialized in xfs_rmapbt_init_cursor(), so there is no need to get pag again. On the other hand, in xfs_rmapbt_free_block(), the similar function xfs_ag_resv_rmapbt_free() was removed in commit 92a005448f6f ("xfs: get rid of unnecessary xfs_perag_{get,put} pairs"), xfs_ag_resv_rmapbt_alloc() was left because scrub used it, but now scrub has removed it. Therefore, we could get rid of xfs_ag_resv_rmapbt_alloc() just like the rmap free block, make the code cleaner. Signed-off-by: Long Li Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_ag_resv.h | 19 ------------------- fs/xfs/libxfs/xfs_rmap_btree.c | 7 ++++++- 2 files changed, 6 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_ag_resv.h b/fs/xfs/libxfs/xfs_ag_resv.h index ff20ed93de77..f247eeff7358 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.h +++ b/fs/xfs/libxfs/xfs_ag_resv.h @@ -33,23 +33,4 @@ xfs_perag_resv( } } -/* - * RMAPBT reservation accounting wrappers. Since rmapbt blocks are sourced from - * the AGFL, they are allocated one at a time and the reservation updates don't - * require a transaction. - */ -static inline void -xfs_ag_resv_rmapbt_alloc( - struct xfs_mount *mp, - xfs_agnumber_t agno) -{ - struct xfs_alloc_arg args = { NULL }; - struct xfs_perag *pag; - - args.len = 1; - pag = xfs_perag_get(mp, agno); - xfs_ag_resv_alloc_extent(pag, XFS_AG_RESV_RMAPBT, &args); - xfs_perag_put(pag); -} - #endif /* __XFS_AG_RESV_H__ */ diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 9e759efa81cc..56fd6c4bd8b4 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -88,6 +88,7 @@ xfs_rmapbt_alloc_block( struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; struct xfs_perag *pag = cur->bc_ag.pag; + struct xfs_alloc_arg args = { .len = 1 }; int error; xfs_agblock_t bno; @@ -107,7 +108,11 @@ xfs_rmapbt_alloc_block( be32_add_cpu(&agf->agf_rmap_blocks, 1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS); - xfs_ag_resv_rmapbt_alloc(cur->bc_mp, pag->pag_agno); + /* + * Since rmapbt blocks are sourced from the AGFL, they are allocated one + * at a time and the reservation updates don't require a transaction. + */ + xfs_ag_resv_alloc_extent(pag, XFS_AG_RESV_RMAPBT, &args); *stat = 1; return 0; -- cgit v1.2.3-70-g09d2 From 2bf6e353542d233486195953dc9c346331f82dcb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Jul 2024 14:02:57 +0200 Subject: xfs: fix rtalloc rotoring when delalloc is in use If we're trying to allocate real space for a delalloc reservation at offset 0, we should use the rotor to spread files across the rt volume. Switch the rtalloc to use the XFS_ALLOC_INITIAL_USER_DATA flag that is set for any write at startoff to make it match the behavior for the main data device. Based on a patch from Darrick J. Wong. Fixes: 6a94b1acda7e ("xfs: reinstate delalloc for RT inodes (if sb_rextsize == 1)") Reported-by: Darrick J. Wong Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/xfs_rtalloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 5a7ddfed1bb8..0c3e96c621a6 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -12,6 +12,7 @@ #include "xfs_bit.h" #include "xfs_mount.h" #include "xfs_inode.h" +#include "xfs_alloc.h" #include "xfs_bmap.h" #include "xfs_bmap_btree.h" #include "xfs_bmap_util.h" @@ -1382,7 +1383,7 @@ retry: start = 0; } else if (xfs_bmap_adjacent(ap)) { start = xfs_rtb_to_rtx(mp, ap->blkno); - } else if (ap->eof && ap->offset == 0) { + } else if (ap->datatype & XFS_ALLOC_INITIAL_USER_DATA) { /* * If it's an allocation to an empty file at offset 0, pick an * extent that will space things out in the rt area. -- cgit v1.2.3-70-g09d2