diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2024-07-17 12:57:48 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2024-07-17 12:57:48 -0700 |
commit | bf3aa9de7ba57c2c7b5ea70c1ad3a6670cd6fcb0 (patch) | |
tree | 791228dc4eb6d90e2c27295930449b06f6952ad3 /fs/xfs/xfs_discard.c | |
parent | 0260b0a7445c62a08938fa66fad256e5d0779817 (diff) | |
parent | 2bf6e353542d233486195953dc9c346331f82dcb (diff) |
Merge tag 'xfs-6.11-merge-3' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux
Pull xfs updates from Chandan Babu:
"Major changes in this release are limited to enabling FITRIM on
realtime devices and byte-based grant head log reservation tracking.
The remaining changes are limited to fixes and cleanups included in
this pull request.
Core:
- Enable FITRIM on the realtime device
- Introduce byte-based grant head log reservation tracking instead of
physical log location tracking.
This allows the grant heads to track a full 64-bit byte space and hence
overcome the 4GB indexing limit that has been present until now.
Fixes:
- xfs_flush_unmap_range() and xfs_prepare_shift() should consider RT
extents in the flush unmap range
- Implement bounds check when traversing log operations during log
replay
- Prevent out of bounds access when traversing a directory data block
- Prevent incorrect ENOSPC when concurrently performing file creation
and file writes
- Fix rtalloc rotoring when delalloc is in use
Cleanups:
- Clean up I/O path inode locking helpers and the page fault handler
- xfs: hoist inode operations to libxfs in anticipation of the
metadata inode directory feature, which maintains a directory tree
of metadata inodes. This will be necessary for further enhancements
to the realtime feature and subvolume support
- Clean up some warts in the extent freeing log intent code
- Clean up the refcount and rmap intent code before adding support
for realtime devices
- Provide the correct email address for sysfs ABI documentation"
* tag 'xfs-6.11-merge-3' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux: (80 commits)
xfs: fix rtalloc rotoring when delalloc is in use
xfs: get rid of xfs_ag_resv_rmapbt_alloc
xfs: skip flushing log items during push
xfs: grant heads track byte counts, not LSNs
xfs: pass the full grant head to accounting functions
xfs: track log space pinned by the AIL
xfs: collapse xlog_state_set_callback in caller
xfs: l_last_sync_lsn is really AIL state
xfs: ensure log tail is always up to date
xfs: background AIL push should target physical space
xfs: AIL doesn't need manual pushing
xfs: move and rename xfs_trans_committed_bulk
xfs: fix the contact address for the sysfs ABI documentation
xfs: Avoid races with cnt_btree lastrec updates
xfs: move xfs_refcount_update_defer_add to xfs_refcount_item.c
xfs: simplify usage of the rcur local variable in xfs_refcount_finish_one
xfs: don't bother calling xfs_refcount_finish_one_cleanup in xfs_refcount_finish_one
xfs: reuse xfs_refcount_update_cancel_item
xfs: add a ci_entry helper
xfs: remove xfs_trans_set_refcount_flags
...
Diffstat (limited to 'fs/xfs/xfs_discard.c')
-rw-r--r-- | fs/xfs/xfs_discard.c | 303 |
1 files changed, 279 insertions, 24 deletions
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 25fe3b932b5a..6f0fc7fe1f2b 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -20,6 +20,7 @@ #include "xfs_log.h" #include "xfs_ag.h" #include "xfs_health.h" +#include "xfs_rtbitmap.h" /* * Notes on an efficient, low latency fstrim algorithm @@ -322,7 +323,7 @@ xfs_trim_should_stop(void) * we found in the last batch as the key to start the next. */ static int -xfs_trim_extents( +xfs_trim_perag_extents( struct xfs_perag *pag, xfs_agblock_t start, xfs_agblock_t end, @@ -383,6 +384,259 @@ xfs_trim_extents( } +static int +xfs_trim_datadev_extents( + struct xfs_mount *mp, + xfs_daddr_t start, + xfs_daddr_t end, + xfs_extlen_t minlen, + uint64_t *blocks_trimmed) +{ + xfs_agnumber_t start_agno, end_agno; + xfs_agblock_t start_agbno, end_agbno; + xfs_daddr_t ddev_end; + struct xfs_perag *pag; + int last_error = 0, error; + + ddev_end = min_t(xfs_daddr_t, end, + XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1); + + start_agno = xfs_daddr_to_agno(mp, start); + start_agbno = xfs_daddr_to_agbno(mp, start); + end_agno = xfs_daddr_to_agno(mp, ddev_end); + end_agbno = xfs_daddr_to_agbno(mp, ddev_end); + + for_each_perag_range(mp, start_agno, end_agno, pag) { + xfs_agblock_t agend = pag->block_count; + + if (start_agno == end_agno) + agend = end_agbno; + error = xfs_trim_perag_extents(pag, start_agbno, agend, minlen, + blocks_trimmed); + if (error) + last_error = error; + + if (xfs_trim_should_stop()) { + xfs_perag_rele(pag); + break; + } + start_agbno = 0; + } + + return last_error; +} + +#ifdef CONFIG_XFS_RT +struct xfs_trim_rtdev { + /* list of rt extents to free */ + struct list_head extent_list; + + /* pointer to count of blocks trimmed */ + uint64_t *blocks_trimmed; + + /* minimum length that caller allows us to trim */ + xfs_rtblock_t minlen_fsb; + + /* restart point for the rtbitmap walk */ + xfs_rtxnum_t restart_rtx; + + /* stopping point for the current rtbitmap walk */ + xfs_rtxnum_t stop_rtx; +}; + 
+struct xfs_rtx_busy { + struct list_head list; + xfs_rtblock_t bno; + xfs_rtblock_t length; +}; + +static void +xfs_discard_free_rtdev_extents( + struct xfs_trim_rtdev *tr) +{ + struct xfs_rtx_busy *busyp, *n; + + list_for_each_entry_safe(busyp, n, &tr->extent_list, list) { + list_del_init(&busyp->list); + kfree(busyp); + } +} + +/* + * Walk the discard list and issue discards on all the busy extents in the + * list. We plug and chain the bios so that we only need a single completion + * call to clear all the busy extents once the discards are complete. + */ +static int +xfs_discard_rtdev_extents( + struct xfs_mount *mp, + struct xfs_trim_rtdev *tr) +{ + struct block_device *bdev = mp->m_rtdev_targp->bt_bdev; + struct xfs_rtx_busy *busyp; + struct bio *bio = NULL; + struct blk_plug plug; + xfs_rtblock_t start = NULLRTBLOCK, length = 0; + int error = 0; + + blk_start_plug(&plug); + list_for_each_entry(busyp, &tr->extent_list, list) { + if (start == NULLRTBLOCK) + start = busyp->bno; + length += busyp->length; + + trace_xfs_discard_rtextent(mp, busyp->bno, busyp->length); + + error = __blkdev_issue_discard(bdev, + XFS_FSB_TO_BB(mp, busyp->bno), + XFS_FSB_TO_BB(mp, busyp->length), + GFP_NOFS, &bio); + if (error) + break; + } + xfs_discard_free_rtdev_extents(tr); + + if (bio) { + error = submit_bio_wait(bio); + if (error == -EOPNOTSUPP) + error = 0; + if (error) + xfs_info(mp, + "discard failed for rtextent [0x%llx,%llu], error %d", + (unsigned long long)start, + (unsigned long long)length, + error); + bio_put(bio); + } + blk_finish_plug(&plug); + + return error; +} + +static int +xfs_trim_gather_rtextent( + struct xfs_mount *mp, + struct xfs_trans *tp, + const struct xfs_rtalloc_rec *rec, + void *priv) +{ + struct xfs_trim_rtdev *tr = priv; + struct xfs_rtx_busy *busyp; + xfs_rtblock_t rbno, rlen; + + if (rec->ar_startext > tr->stop_rtx) { + /* + * If we've scanned a large number of rtbitmap blocks, update + * the cursor to point at this extent so we restart the next 
+ * batch from this extent. + */ + tr->restart_rtx = rec->ar_startext; + return -ECANCELED; + } + + rbno = xfs_rtx_to_rtb(mp, rec->ar_startext); + rlen = xfs_rtx_to_rtb(mp, rec->ar_extcount); + + /* Ignore too small. */ + if (rlen < tr->minlen_fsb) { + trace_xfs_discard_rttoosmall(mp, rbno, rlen); + return 0; + } + + busyp = kzalloc(sizeof(struct xfs_rtx_busy), GFP_KERNEL); + if (!busyp) + return -ENOMEM; + + busyp->bno = rbno; + busyp->length = rlen; + INIT_LIST_HEAD(&busyp->list); + list_add_tail(&busyp->list, &tr->extent_list); + *tr->blocks_trimmed += rlen; + + tr->restart_rtx = rec->ar_startext + rec->ar_extcount; + return 0; +} + +static int +xfs_trim_rtdev_extents( + struct xfs_mount *mp, + xfs_daddr_t start, + xfs_daddr_t end, + xfs_daddr_t minlen, + uint64_t *blocks_trimmed) +{ + struct xfs_rtalloc_rec low = { }; + struct xfs_rtalloc_rec high = { }; + struct xfs_trim_rtdev tr = { + .blocks_trimmed = blocks_trimmed, + .minlen_fsb = XFS_BB_TO_FSB(mp, minlen), + }; + struct xfs_trans *tp; + xfs_daddr_t rtdev_daddr; + int error; + + INIT_LIST_HEAD(&tr.extent_list); + + /* Shift the start and end downwards to match the rt device. */ + rtdev_daddr = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); + if (start > rtdev_daddr) + start -= rtdev_daddr; + else + start = 0; + + if (end <= rtdev_daddr) + return 0; + end -= rtdev_daddr; + + error = xfs_trans_alloc_empty(mp, &tp); + if (error) + return error; + + end = min_t(xfs_daddr_t, end, + XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks) - 1); + + /* Convert the rt blocks to rt extents */ + low.ar_startext = xfs_rtb_to_rtxup(mp, XFS_BB_TO_FSB(mp, start)); + high.ar_startext = xfs_rtb_to_rtx(mp, XFS_BB_TO_FSBT(mp, end)); + + /* + * Walk the free ranges between low and high. The query_range function + * trims the extents returned. 
+ */ + do { + tr.stop_rtx = low.ar_startext + (mp->m_sb.sb_blocksize * NBBY); + xfs_rtbitmap_lock_shared(mp, XFS_RBMLOCK_BITMAP); + error = xfs_rtalloc_query_range(mp, tp, &low, &high, + xfs_trim_gather_rtextent, &tr); + + if (error == -ECANCELED) + error = 0; + if (error) { + xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP); + xfs_discard_free_rtdev_extents(&tr); + break; + } + + if (list_empty(&tr.extent_list)) { + xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP); + break; + } + + error = xfs_discard_rtdev_extents(mp, &tr); + xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP); + if (error) + break; + + low.ar_startext = tr.restart_rtx; + } while (!xfs_trim_should_stop() && low.ar_startext <= high.ar_startext); + + xfs_trans_cancel(tp); + return error; +} +#else +# define xfs_trim_rtdev_extents(m,s,e,n,b) (-EOPNOTSUPP) +#endif /* CONFIG_XFS_RT */ + /* * trim a range of the filesystem. * @@ -391,28 +645,37 @@ xfs_trim_extents( * addressing. FSB addressing is sparse (AGNO|AGBNO), while the incoming format * is a linear address range. Hence we need to use DADDR based conversions and * comparisons for determining the correct offset and regions to trim. + * + * The realtime device is mapped into the FITRIM "address space" immediately + * after the data device. 
*/ int xfs_ioc_trim( struct xfs_mount *mp, struct fstrim_range __user *urange) { - struct xfs_perag *pag; unsigned int granularity = bdev_discard_granularity(mp->m_ddev_targp->bt_bdev); + struct block_device *rt_bdev = NULL; struct fstrim_range range; xfs_daddr_t start, end; xfs_extlen_t minlen; - xfs_agnumber_t start_agno, end_agno; - xfs_agblock_t start_agbno, end_agbno; + xfs_rfsblock_t max_blocks; uint64_t blocks_trimmed = 0; int error, last_error = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev)) + if (mp->m_rtdev_targp && + bdev_max_discard_sectors(mp->m_rtdev_targp->bt_bdev)) + rt_bdev = mp->m_rtdev_targp->bt_bdev; + if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev) && !rt_bdev) return -EOPNOTSUPP; + if (rt_bdev) + granularity = max(granularity, + bdev_discard_granularity(rt_bdev)); + /* * We haven't recovered the log, so we cannot use our bnobt-guided * storage zapping commands. @@ -433,35 +696,27 @@ xfs_ioc_trim( * used by the fstrim application. In the end it really doesn't * matter as trimming blocks is an advisory interface. 
*/ - if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) || + max_blocks = mp->m_sb.sb_dblocks + mp->m_sb.sb_rblocks; + if (range.start >= XFS_FSB_TO_B(mp, max_blocks) || range.minlen > XFS_FSB_TO_B(mp, mp->m_ag_max_usable) || range.len < mp->m_sb.sb_blocksize) return -EINVAL; start = BTOBB(range.start); - end = min_t(xfs_daddr_t, start + BTOBBT(range.len), - XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) - 1; + end = start + BTOBBT(range.len) - 1; - start_agno = xfs_daddr_to_agno(mp, start); - start_agbno = xfs_daddr_to_agbno(mp, start); - end_agno = xfs_daddr_to_agno(mp, end); - end_agbno = xfs_daddr_to_agbno(mp, end); - - for_each_perag_range(mp, start_agno, end_agno, pag) { - xfs_agblock_t agend = pag->block_count; - - if (start_agno == end_agno) - agend = end_agbno; - error = xfs_trim_extents(pag, start_agbno, agend, minlen, + if (bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev)) { + error = xfs_trim_datadev_extents(mp, start, end, minlen, &blocks_trimmed); if (error) last_error = error; + } - if (xfs_trim_should_stop()) { - xfs_perag_rele(pag); - break; - } - start_agbno = 0; + if (rt_bdev && !xfs_trim_should_stop()) { + error = xfs_trim_rtdev_extents(mp, start, end, minlen, + &blocks_trimmed); + if (error) + last_error = error; } if (last_error) |