summaryrefslogtreecommitdiff
path: root/fs/xfs/scrub
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs/scrub')
-rw-r--r--fs/xfs/scrub/agheader_repair.c101
-rw-r--r--fs/xfs/scrub/bitmap.c78
-rw-r--r--fs/xfs/scrub/bitmap.h10
-rw-r--r--fs/xfs/scrub/bmap.c42
-rw-r--r--fs/xfs/scrub/common.c215
-rw-r--r--fs/xfs/scrub/common.h39
-rw-r--r--fs/xfs/scrub/health.c10
-rw-r--r--fs/xfs/scrub/ialloc.c3
-rw-r--r--fs/xfs/scrub/inode.c11
-rw-r--r--fs/xfs/scrub/parent.c4
-rw-r--r--fs/xfs/scrub/quota.c15
-rw-r--r--fs/xfs/scrub/reap.c498
-rw-r--r--fs/xfs/scrub/reap.h12
-rw-r--r--fs/xfs/scrub/repair.c377
-rw-r--r--fs/xfs/scrub/repair.h25
-rw-r--r--fs/xfs/scrub/rtbitmap.c48
-rw-r--r--fs/xfs/scrub/rtsummary.c264
-rw-r--r--fs/xfs/scrub/scrub.c46
-rw-r--r--fs/xfs/scrub/scrub.h4
-rw-r--r--fs/xfs/scrub/stats.c405
-rw-r--r--fs/xfs/scrub/stats.h59
-rw-r--r--fs/xfs/scrub/trace.c4
-rw-r--r--fs/xfs/scrub/trace.h391
-rw-r--r--fs/xfs/scrub/xfarray.c1083
-rw-r--r--fs/xfs/scrub/xfarray.h141
-rw-r--r--fs/xfs/scrub/xfile.c420
-rw-r--r--fs/xfs/scrub/xfile.h77
27 files changed, 3789 insertions, 593 deletions
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index bbaa65422c4f..876a2f41b063 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -26,6 +26,7 @@
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
+#include "scrub/reap.h"
/* Superblock */
@@ -48,6 +49,10 @@ xrep_superblock(
if (error)
return error;
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
/* Copy AG 0's superblock to this one. */
xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
@@ -423,6 +428,10 @@ xrep_agf(
if (error)
return error;
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
/* Start rewriting the header and implant the btrees we found. */
xrep_agf_init_header(sc, agf_bp, &old_agf);
xrep_agf_set_roots(sc, agf, fab);
@@ -444,13 +453,13 @@ out_revert:
struct xrep_agfl {
/* Bitmap of alleged AGFL blocks that we're not going to add. */
- struct xbitmap crossed;
+ struct xagb_bitmap crossed;
/* Bitmap of other OWN_AG metadata blocks. */
- struct xbitmap agmetablocks;
+ struct xagb_bitmap agmetablocks;
/* Bitmap of free space. */
- struct xbitmap *freesp;
+ struct xagb_bitmap *freesp;
/* rmapbt cursor for finding crosslinked blocks */
struct xfs_btree_cur *rmap_cur;
@@ -466,7 +475,6 @@ xrep_agfl_walk_rmap(
void *priv)
{
struct xrep_agfl *ra = priv;
- xfs_fsblock_t fsb;
int error = 0;
if (xchk_should_terminate(ra->sc, &error))
@@ -474,14 +482,13 @@ xrep_agfl_walk_rmap(
/* Record all the OWN_AG blocks. */
if (rec->rm_owner == XFS_RMAP_OWN_AG) {
- fsb = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno,
- rec->rm_startblock);
- error = xbitmap_set(ra->freesp, fsb, rec->rm_blockcount);
+ error = xagb_bitmap_set(ra->freesp, rec->rm_startblock,
+ rec->rm_blockcount);
if (error)
return error;
}
- return xbitmap_set_btcur_path(&ra->agmetablocks, cur);
+ return xagb_bitmap_set_btcur_path(&ra->agmetablocks, cur);
}
/* Strike out the blocks that are cross-linked according to the rmapbt. */
@@ -492,12 +499,10 @@ xrep_agfl_check_extent(
void *priv)
{
struct xrep_agfl *ra = priv;
- xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(ra->sc->mp, start);
+ xfs_agblock_t agbno = start;
xfs_agblock_t last_agbno = agbno + len - 1;
int error;
- ASSERT(XFS_FSB_TO_AGNO(ra->sc->mp, start) == ra->sc->sa.pag->pag_agno);
-
while (agbno <= last_agbno) {
bool other_owners;
@@ -507,7 +512,7 @@ xrep_agfl_check_extent(
return error;
if (other_owners) {
- error = xbitmap_set(&ra->crossed, agbno, 1);
+ error = xagb_bitmap_set(&ra->crossed, agbno, 1);
if (error)
return error;
}
@@ -533,7 +538,7 @@ STATIC int
xrep_agfl_collect_blocks(
struct xfs_scrub *sc,
struct xfs_buf *agf_bp,
- struct xbitmap *agfl_extents,
+ struct xagb_bitmap *agfl_extents,
xfs_agblock_t *flcount)
{
struct xrep_agfl ra;
@@ -543,8 +548,8 @@ xrep_agfl_collect_blocks(
ra.sc = sc;
ra.freesp = agfl_extents;
- xbitmap_init(&ra.agmetablocks);
- xbitmap_init(&ra.crossed);
+ xagb_bitmap_init(&ra.agmetablocks);
+ xagb_bitmap_init(&ra.crossed);
/* Find all space used by the free space btrees & rmapbt. */
cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
@@ -556,7 +561,7 @@ xrep_agfl_collect_blocks(
/* Find all blocks currently being used by the bnobt. */
cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp,
sc->sa.pag, XFS_BTNUM_BNO);
- error = xbitmap_set_btblocks(&ra.agmetablocks, cur);
+ error = xagb_bitmap_set_btblocks(&ra.agmetablocks, cur);
xfs_btree_del_cursor(cur, error);
if (error)
goto out_bmp;
@@ -564,7 +569,7 @@ xrep_agfl_collect_blocks(
/* Find all blocks currently being used by the cntbt. */
cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp,
sc->sa.pag, XFS_BTNUM_CNT);
- error = xbitmap_set_btblocks(&ra.agmetablocks, cur);
+ error = xagb_bitmap_set_btblocks(&ra.agmetablocks, cur);
xfs_btree_del_cursor(cur, error);
if (error)
goto out_bmp;
@@ -573,17 +578,17 @@ xrep_agfl_collect_blocks(
* Drop the freesp meta blocks that are in use by btrees.
* The remaining blocks /should/ be AGFL blocks.
*/
- error = xbitmap_disunion(agfl_extents, &ra.agmetablocks);
+ error = xagb_bitmap_disunion(agfl_extents, &ra.agmetablocks);
if (error)
goto out_bmp;
/* Strike out the blocks that are cross-linked. */
ra.rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
- error = xbitmap_walk(agfl_extents, xrep_agfl_check_extent, &ra);
+ error = xagb_bitmap_walk(agfl_extents, xrep_agfl_check_extent, &ra);
xfs_btree_del_cursor(ra.rmap_cur, error);
if (error)
goto out_bmp;
- error = xbitmap_disunion(agfl_extents, &ra.crossed);
+ error = xagb_bitmap_disunion(agfl_extents, &ra.crossed);
if (error)
goto out_bmp;
@@ -591,12 +596,12 @@ xrep_agfl_collect_blocks(
* Calculate the new AGFL size. If we found more blocks than fit in
* the AGFL we'll free them later.
*/
- *flcount = min_t(uint64_t, xbitmap_hweight(agfl_extents),
+ *flcount = min_t(uint64_t, xagb_bitmap_hweight(agfl_extents),
xfs_agfl_size(mp));
out_bmp:
- xbitmap_destroy(&ra.crossed);
- xbitmap_destroy(&ra.agmetablocks);
+ xagb_bitmap_destroy(&ra.crossed);
+ xagb_bitmap_destroy(&ra.agmetablocks);
return error;
}
@@ -615,18 +620,24 @@ xrep_agfl_update_agf(
xfs_force_summary_recalc(sc->mp);
/* Update the AGF counters. */
- if (xfs_perag_initialised_agf(sc->sa.pag))
+ if (xfs_perag_initialised_agf(sc->sa.pag)) {
sc->sa.pag->pagf_flcount = flcount;
+ clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET,
+ &sc->sa.pag->pag_opstate);
+ }
agf->agf_flfirst = cpu_to_be32(0);
agf->agf_flcount = cpu_to_be32(flcount);
- agf->agf_fllast = cpu_to_be32(flcount - 1);
+ if (flcount)
+ agf->agf_fllast = cpu_to_be32(flcount - 1);
+ else
+ agf->agf_fllast = cpu_to_be32(xfs_agfl_size(sc->mp) - 1);
xfs_alloc_log_agf(sc->tp, agf_bp,
XFS_AGF_FLFIRST | XFS_AGF_FLLAST | XFS_AGF_FLCOUNT);
}
struct xrep_agfl_fill {
- struct xbitmap used_extents;
+ struct xagb_bitmap used_extents;
struct xfs_scrub *sc;
__be32 *agfl_bno;
xfs_agblock_t flcount;
@@ -642,17 +653,15 @@ xrep_agfl_fill(
{
struct xrep_agfl_fill *af = priv;
struct xfs_scrub *sc = af->sc;
- xfs_fsblock_t fsbno = start;
+ xfs_agblock_t agbno = start;
int error;
- while (fsbno < start + len && af->fl_off < af->flcount)
- af->agfl_bno[af->fl_off++] =
- cpu_to_be32(XFS_FSB_TO_AGBNO(sc->mp, fsbno++));
+ trace_xrep_agfl_insert(sc->sa.pag, agbno, len);
- trace_xrep_agfl_insert(sc->mp, sc->sa.pag->pag_agno,
- XFS_FSB_TO_AGBNO(sc->mp, start), len);
+ while (agbno < start + len && af->fl_off < af->flcount)
+ af->agfl_bno[af->fl_off++] = cpu_to_be32(agbno++);
- error = xbitmap_set(&af->used_extents, start, fsbno - 1);
+ error = xagb_bitmap_set(&af->used_extents, start, agbno - 1);
if (error)
return error;
@@ -667,7 +676,7 @@ STATIC int
xrep_agfl_init_header(
struct xfs_scrub *sc,
struct xfs_buf *agfl_bp,
- struct xbitmap *agfl_extents,
+ struct xagb_bitmap *agfl_extents,
xfs_agblock_t flcount)
{
struct xrep_agfl_fill af = {
@@ -695,17 +704,17 @@ xrep_agfl_init_header(
* blocks than fit in the AGFL, they will be freed in a subsequent
* step.
*/
- xbitmap_init(&af.used_extents);
+ xagb_bitmap_init(&af.used_extents);
af.agfl_bno = xfs_buf_to_agfl_bno(agfl_bp),
- xbitmap_walk(agfl_extents, xrep_agfl_fill, &af);
- error = xbitmap_disunion(agfl_extents, &af.used_extents);
+ xagb_bitmap_walk(agfl_extents, xrep_agfl_fill, &af);
+ error = xagb_bitmap_disunion(agfl_extents, &af.used_extents);
if (error)
return error;
/* Write new AGFL to disk. */
xfs_trans_buf_set_type(sc->tp, agfl_bp, XFS_BLFT_AGFL_BUF);
xfs_trans_log_buf(sc->tp, agfl_bp, 0, BBTOB(agfl_bp->b_length) - 1);
- xbitmap_destroy(&af.used_extents);
+ xagb_bitmap_destroy(&af.used_extents);
return 0;
}
@@ -714,7 +723,7 @@ int
xrep_agfl(
struct xfs_scrub *sc)
{
- struct xbitmap agfl_extents;
+ struct xagb_bitmap agfl_extents;
struct xfs_mount *mp = sc->mp;
struct xfs_buf *agf_bp;
struct xfs_buf *agfl_bp;
@@ -725,7 +734,7 @@ xrep_agfl(
if (!xfs_has_rmapbt(mp))
return -EOPNOTSUPP;
- xbitmap_init(&agfl_extents);
+ xagb_bitmap_init(&agfl_extents);
/*
* Read the AGF so that we can query the rmapbt. We hope that there's
@@ -753,6 +762,10 @@ xrep_agfl(
if (error)
goto err;
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ goto err;
+
/*
* Update AGF and AGFL. We reset the global free block counter when
* we adjust the AGF flcount (which can fail) so avoid updating any
@@ -774,10 +787,10 @@ xrep_agfl(
goto err;
/* Dump any AGFL overflow. */
- error = xrep_reap_extents(sc, &agfl_extents, &XFS_RMAP_OINFO_AG,
+ error = xrep_reap_agblocks(sc, &agfl_extents, &XFS_RMAP_OINFO_AG,
XFS_AG_RESV_AGFL);
err:
- xbitmap_destroy(&agfl_extents);
+ xagb_bitmap_destroy(&agfl_extents);
return error;
}
@@ -1000,6 +1013,10 @@ xrep_agi(
if (error)
return error;
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
/* Start rewriting the header and implant the btrees we found. */
xrep_agi_init_header(sc, agi_bp, &old_agi);
xrep_agi_set_roots(sc, agi, fab);
diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c
index 0c959be396ea..e0c89a9a0ca0 100644
--- a/fs/xfs/scrub/bitmap.c
+++ b/fs/xfs/scrub/bitmap.c
@@ -301,21 +301,15 @@ xagb_bitmap_set_btblocks(
* blocks going from the leaf towards the root.
*/
int
-xbitmap_set_btcur_path(
- struct xbitmap *bitmap,
+xagb_bitmap_set_btcur_path(
+ struct xagb_bitmap *bitmap,
struct xfs_btree_cur *cur)
{
- struct xfs_buf *bp;
- xfs_fsblock_t fsb;
int i;
int error;
for (i = 0; i < cur->bc_nlevels && cur->bc_levels[i].ptr == 1; i++) {
- xfs_btree_get_block(cur, i, &bp);
- if (!bp)
- continue;
- fsb = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
- error = xbitmap_set(bitmap, fsb, 1);
+ error = xagb_bitmap_visit_btblock(cur, i, bitmap);
if (error)
return error;
}
@@ -323,35 +317,6 @@ xbitmap_set_btcur_path(
return 0;
}
-/* Collect a btree's block in the bitmap. */
-STATIC int
-xbitmap_collect_btblock(
- struct xfs_btree_cur *cur,
- int level,
- void *priv)
-{
- struct xbitmap *bitmap = priv;
- struct xfs_buf *bp;
- xfs_fsblock_t fsbno;
-
- xfs_btree_get_block(cur, level, &bp);
- if (!bp)
- return 0;
-
- fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
- return xbitmap_set(bitmap, fsbno, 1);
-}
-
-/* Walk the btree and mark the bitmap wherever a btree block is found. */
-int
-xbitmap_set_btblocks(
- struct xbitmap *bitmap,
- struct xfs_btree_cur *cur)
-{
- return xfs_btree_visit_blocks(cur, xbitmap_collect_btblock,
- XFS_BTREE_VISIT_ALL, bitmap);
-}
-
/* How many bits are set in this bitmap? */
uint64_t
xbitmap_hweight(
@@ -385,43 +350,6 @@ xbitmap_walk(
return error;
}
-struct xbitmap_walk_bits {
- xbitmap_walk_bits_fn fn;
- void *priv;
-};
-
-/* Walk all the bits in a run. */
-static int
-xbitmap_walk_bits_in_run(
- uint64_t start,
- uint64_t len,
- void *priv)
-{
- struct xbitmap_walk_bits *wb = priv;
- uint64_t i;
- int error = 0;
-
- for (i = start; i < start + len; i++) {
- error = wb->fn(i, wb->priv);
- if (error)
- break;
- }
-
- return error;
-}
-
-/* Call a function for every set bit in this bitmap. */
-int
-xbitmap_walk_bits(
- struct xbitmap *bitmap,
- xbitmap_walk_bits_fn fn,
- void *priv)
-{
- struct xbitmap_walk_bits wb = {.fn = fn, .priv = priv};
-
- return xbitmap_walk(bitmap, xbitmap_walk_bits_in_run, &wb);
-}
-
/* Does this bitmap have no bits set at all? */
bool
xbitmap_empty(
diff --git a/fs/xfs/scrub/bitmap.h b/fs/xfs/scrub/bitmap.h
index 84981724ecaf..4fe58bad6734 100644
--- a/fs/xfs/scrub/bitmap.h
+++ b/fs/xfs/scrub/bitmap.h
@@ -16,10 +16,6 @@ void xbitmap_destroy(struct xbitmap *bitmap);
int xbitmap_clear(struct xbitmap *bitmap, uint64_t start, uint64_t len);
int xbitmap_set(struct xbitmap *bitmap, uint64_t start, uint64_t len);
int xbitmap_disunion(struct xbitmap *bitmap, struct xbitmap *sub);
-int xbitmap_set_btcur_path(struct xbitmap *bitmap,
- struct xfs_btree_cur *cur);
-int xbitmap_set_btblocks(struct xbitmap *bitmap,
- struct xfs_btree_cur *cur);
uint64_t xbitmap_hweight(struct xbitmap *bitmap);
/*
@@ -33,10 +29,6 @@ typedef int (*xbitmap_walk_fn)(uint64_t start, uint64_t len, void *priv);
int xbitmap_walk(struct xbitmap *bitmap, xbitmap_walk_fn fn,
void *priv);
-typedef int (*xbitmap_walk_bits_fn)(uint64_t bit, void *priv);
-int xbitmap_walk_bits(struct xbitmap *bitmap, xbitmap_walk_bits_fn fn,
- void *priv);
-
bool xbitmap_empty(struct xbitmap *bitmap);
bool xbitmap_test(struct xbitmap *bitmap, uint64_t start, uint64_t *len);
@@ -110,5 +102,7 @@ static inline int xagb_bitmap_walk(struct xagb_bitmap *bitmap,
int xagb_bitmap_set_btblocks(struct xagb_bitmap *bitmap,
struct xfs_btree_cur *cur);
+int xagb_bitmap_set_btcur_path(struct xagb_bitmap *bitmap,
+ struct xfs_btree_cur *cur);
#endif /* __XFS_SCRUB_BITMAP_H__ */
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 5bf4326e9783..75588915572e 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -38,8 +38,7 @@ xchk_setup_inode_bmap(
if (error)
goto out;
- sc->ilock_flags = XFS_IOLOCK_EXCL;
- xfs_ilock(sc->ip, XFS_IOLOCK_EXCL);
+ xchk_ilock(sc, XFS_IOLOCK_EXCL);
/*
* We don't want any ephemeral data/cow fork updates sitting around
@@ -50,8 +49,7 @@ xchk_setup_inode_bmap(
sc->sm->sm_type != XFS_SCRUB_TYPE_BMBTA) {
struct address_space *mapping = VFS_I(sc->ip)->i_mapping;
- sc->ilock_flags |= XFS_MMAPLOCK_EXCL;
- xfs_ilock(sc->ip, XFS_MMAPLOCK_EXCL);
+ xchk_ilock(sc, XFS_MMAPLOCK_EXCL);
inode_dio_wait(VFS_I(sc->ip));
@@ -79,9 +77,8 @@ xchk_setup_inode_bmap(
error = xchk_trans_alloc(sc, 0);
if (error)
goto out;
- sc->ilock_flags |= XFS_ILOCK_EXCL;
- xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
out:
/* scrub teardown will unlock and release the inode */
return error;
@@ -844,7 +841,7 @@ xchk_bmap(
/* Non-existent forks can be ignored. */
if (!ifp)
- goto out;
+ return -ENOENT;
info.is_rt = whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip);
info.whichfork = whichfork;
@@ -853,10 +850,10 @@ xchk_bmap(
switch (whichfork) {
case XFS_COW_FORK:
- /* No CoW forks on non-reflink inodes/filesystems. */
- if (!xfs_is_reflink_inode(ip)) {
+ /* No CoW forks on non-reflink filesystems. */
+ if (!xfs_has_reflink(mp)) {
xchk_ino_set_corrupt(sc, sc->ip->i_ino);
- goto out;
+ return 0;
}
break;
case XFS_ATTR_FORK:
@@ -876,31 +873,31 @@ xchk_bmap(
/* No mappings to check. */
if (whichfork == XFS_COW_FORK)
xchk_fblock_set_corrupt(sc, whichfork, 0);
- goto out;
+ return 0;
case XFS_DINODE_FMT_EXTENTS:
break;
case XFS_DINODE_FMT_BTREE:
if (whichfork == XFS_COW_FORK) {
xchk_fblock_set_corrupt(sc, whichfork, 0);
- goto out;
+ return 0;
}
error = xchk_bmap_btree(sc, whichfork, &info);
if (error)
- goto out;
+ return error;
break;
default:
xchk_fblock_set_corrupt(sc, whichfork, 0);
- goto out;
+ return 0;
}
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
- goto out;
+ return 0;
/* Find the offset of the last extent in the mapping. */
error = xfs_bmap_last_offset(ip, &endoff, whichfork);
if (!xchk_fblock_process_error(sc, whichfork, 0, &error))
- goto out;
+ return error;
/*
* Scrub extent records. We use a special iterator function here that
@@ -913,12 +910,12 @@ xchk_bmap(
while (xchk_bmap_iext_iter(&info, &irec)) {
if (xchk_should_terminate(sc, &error) ||
(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
- goto out;
+ return 0;
if (irec.br_startoff >= endoff) {
xchk_fblock_set_corrupt(sc, whichfork,
irec.br_startoff);
- goto out;
+ return 0;
}
if (isnullstartblock(irec.br_startblock))
@@ -931,10 +928,10 @@ xchk_bmap(
if (xchk_bmap_want_check_rmaps(&info)) {
error = xchk_bmap_check_rmaps(sc, whichfork);
if (!xchk_fblock_xref_process_error(sc, whichfork, 0, &error))
- goto out;
+ return error;
}
-out:
- return error;
+
+ return 0;
}
/* Scrub an inode's data fork. */
@@ -958,8 +955,5 @@ int
xchk_bmap_cow(
struct xfs_scrub *sc)
{
- if (!xfs_is_reflink_inode(sc->ip))
- return -ENOENT;
-
return xchk_bmap(sc, XFS_COW_FORK);
}
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 7a20256be969..de24532fe083 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -832,6 +832,25 @@ xchk_install_handle_inode(
}
/*
+ * Install an already-referenced inode for scrubbing. Get our own reference to
+ * the inode to make disposal simpler. The inode must not be in I_FREEING or
+ * I_WILL_FREE state!
+ */
+int
+xchk_install_live_inode(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip)
+{
+ if (!igrab(VFS_I(ip))) {
+ xchk_ino_set_corrupt(sc, ip->i_ino);
+ return -EFSCORRUPTED;
+ }
+
+ sc->ip = ip;
+ return 0;
+}
+
+/*
* In preparation to scrub metadata structures that hang off of an inode,
* grab either the inode referenced in the scrub control structure or the
* inode passed in. If the inumber does not reference an allocated inode
@@ -854,10 +873,8 @@ xchk_iget_for_scrubbing(
ASSERT(sc->tp == NULL);
/* We want to scan the inode we already had opened. */
- if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) {
- sc->ip = ip_in;
- return 0;
- }
+ if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino)
+ return xchk_install_live_inode(sc, ip_in);
/* Reject internal metadata files and obviously bad inode numbers. */
if (xfs_internal_inum(mp, sc->sm->sm_ino))
@@ -1005,20 +1022,48 @@ xchk_setup_inode_contents(
return error;
/* Lock the inode so the VFS cannot touch this file. */
- sc->ilock_flags = XFS_IOLOCK_EXCL;
- xfs_ilock(sc->ip, sc->ilock_flags);
+ xchk_ilock(sc, XFS_IOLOCK_EXCL);
error = xchk_trans_alloc(sc, resblks);
if (error)
goto out;
- sc->ilock_flags |= XFS_ILOCK_EXCL;
- xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
-
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
out:
/* scrub teardown will unlock and release the inode for us */
return error;
}
+void
+xchk_ilock(
+ struct xfs_scrub *sc,
+ unsigned int ilock_flags)
+{
+ xfs_ilock(sc->ip, ilock_flags);
+ sc->ilock_flags |= ilock_flags;
+}
+
+bool
+xchk_ilock_nowait(
+ struct xfs_scrub *sc,
+ unsigned int ilock_flags)
+{
+ if (xfs_ilock_nowait(sc->ip, ilock_flags)) {
+ sc->ilock_flags |= ilock_flags;
+ return true;
+ }
+
+ return false;
+}
+
+void
+xchk_iunlock(
+ struct xfs_scrub *sc,
+ unsigned int ilock_flags)
+{
+ sc->ilock_flags &= ~ilock_flags;
+ xfs_iunlock(sc->ip, ilock_flags);
+}
+
/*
* Predicate that decides if we need to evaluate the cross-reference check.
* If there was an error accessing the cross-reference btree, just delete
@@ -1185,3 +1230,155 @@ xchk_fsgates_enable(
sc->flags |= scrub_fsgates;
}
+
+/*
+ * Decide if this is this a cached inode that's also allocated. The caller
+ * must hold a reference to an AG and the AGI buffer lock to prevent inodes
+ * from being allocated or freed.
+ *
+ * Look up an inode by number in the given file system. If the inode number
+ * is invalid, return -EINVAL. If the inode is not in cache, return -ENODATA.
+ * If the inode is being reclaimed, return -ENODATA because we know the inode
+ * cache cannot be updating the ondisk metadata.
+ *
+ * Otherwise, the incore inode is the one we want, and it is either live,
+ * somewhere in the inactivation machinery, or reclaimable. The inode is
+ * allocated if i_mode is nonzero. In all three cases, the cached inode will
+ * be more up to date than the ondisk inode buffer, so we must use the incore
+ * i_mode.
+ */
+int
+xchk_inode_is_allocated(
+ struct xfs_scrub *sc,
+ xfs_agino_t agino,
+ bool *inuse)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_perag *pag = sc->sa.pag;
+ xfs_ino_t ino;
+ struct xfs_inode *ip;
+ int error;
+
+ /* caller must hold perag reference */
+ if (pag == NULL) {
+ ASSERT(pag != NULL);
+ return -EINVAL;
+ }
+
+ /* caller must have AGI buffer */
+ if (sc->sa.agi_bp == NULL) {
+ ASSERT(sc->sa.agi_bp != NULL);
+ return -EINVAL;
+ }
+
+ /* reject inode numbers outside existing AGs */
+ ino = XFS_AGINO_TO_INO(sc->mp, pag->pag_agno, agino);
+ if (!xfs_verify_ino(mp, ino))
+ return -EINVAL;
+
+ error = -ENODATA;
+ rcu_read_lock();
+ ip = radix_tree_lookup(&pag->pag_ici_root, agino);
+ if (!ip) {
+ /* cache miss */
+ goto out_rcu;
+ }
+
+ /*
+ * If the inode number doesn't match, the incore inode got reused
+ * during an RCU grace period and the radix tree hasn't been updated.
+ * This isn't the inode we want.
+ */
+ spin_lock(&ip->i_flags_lock);
+ if (ip->i_ino != ino)
+ goto out_skip;
+
+ trace_xchk_inode_is_allocated(ip);
+
+ /*
+ * We have an incore inode that matches the inode we want, and the
+ * caller holds the perag structure and the AGI buffer. Let's check
+ * our assumptions below:
+ */
+
+#ifdef DEBUG
+ /*
+ * (1) If the incore inode is live (i.e. referenced from the dcache),
+ * it will not be INEW, nor will it be in the inactivation or reclaim
+ * machinery. The ondisk inode had better be allocated. This is the
+ * most trivial case.
+ */
+ if (!(ip->i_flags & (XFS_NEED_INACTIVE | XFS_INEW | XFS_IRECLAIMABLE |
+ XFS_INACTIVATING))) {
+ /* live inode */
+ ASSERT(VFS_I(ip)->i_mode != 0);
+ }
+
+ /*
+ * If the incore inode is INEW, there are several possibilities:
+ *
+ * (2) For a file that is being created, note that we allocate the
+ * ondisk inode before allocating, initializing, and adding the incore
+ * inode to the radix tree.
+ *
+ * (3) If the incore inode is being recycled, the inode has to be
+ * allocated because we don't allow freed inodes to be recycled.
+ * Recycling doesn't touch i_mode.
+ */
+ if (ip->i_flags & XFS_INEW) {
+ /* created on disk already or recycling */
+ ASSERT(VFS_I(ip)->i_mode != 0);
+ }
+
+ /*
+ * (4) If the inode is queued for inactivation (NEED_INACTIVE) but
+ * inactivation has not started (!INACTIVATING), it is still allocated.
+ */
+ if ((ip->i_flags & XFS_NEED_INACTIVE) &&
+ !(ip->i_flags & XFS_INACTIVATING)) {
+ /* definitely before difree */
+ ASSERT(VFS_I(ip)->i_mode != 0);
+ }
+#endif
+
+ /*
+ * If the incore inode is undergoing inactivation (INACTIVATING), there
+ * are two possibilities:
+ *
+ * (5) It is before the point where it would get freed ondisk, in which
+ * case i_mode is still nonzero.
+ *
+ * (6) It has already been freed, in which case i_mode is zero.
+ *
+ * We don't take the ILOCK here, but difree and dialloc update the AGI,
+ * and we've taken the AGI buffer lock, which prevents that from
+ * happening.
+ */
+
+ /*
+ * (7) Inodes undergoing inactivation (INACTIVATING) or queued for
+ * reclaim (IRECLAIMABLE) could be allocated or free. i_mode still
+ * reflects the ondisk state.
+ */
+
+ /*
+ * (8) If the inode is in IFLUSHING, it's safe to query i_mode because
+ * the flush code uses i_mode to format the ondisk inode.
+ */
+
+ /*
+ * (9) If the inode is in IRECLAIM and was reachable via the radix
+ * tree, it still has the same i_mode as it did before it entered
+ * reclaim. The inode object is still alive because we hold the RCU
+ * read lock.
+ */
+
+ *inuse = VFS_I(ip)->i_mode != 0;
+ error = 0;
+
+out_skip:
+ spin_unlock(&ip->i_flags_lock);
+out_rcu:
+ rcu_read_unlock();
+ return error;
+}
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index 791235cd9b00..cabdc0e16838 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -88,10 +88,16 @@ int xchk_setup_xattr(struct xfs_scrub *sc);
int xchk_setup_symlink(struct xfs_scrub *sc);
int xchk_setup_parent(struct xfs_scrub *sc);
#ifdef CONFIG_XFS_RT
-int xchk_setup_rt(struct xfs_scrub *sc);
+int xchk_setup_rtbitmap(struct xfs_scrub *sc);
+int xchk_setup_rtsummary(struct xfs_scrub *sc);
#else
static inline int
-xchk_setup_rt(struct xfs_scrub *sc)
+xchk_setup_rtbitmap(struct xfs_scrub *sc)
+{
+ return -ENOENT;
+}
+static inline int
+xchk_setup_rtsummary(struct xfs_scrub *sc)
{
return -ENOENT;
}
@@ -137,6 +143,12 @@ int xchk_count_rmap_ownedby_ag(struct xfs_scrub *sc, struct xfs_btree_cur *cur,
int xchk_setup_ag_btree(struct xfs_scrub *sc, bool force_log);
int xchk_iget_for_scrubbing(struct xfs_scrub *sc);
int xchk_setup_inode_contents(struct xfs_scrub *sc, unsigned int resblks);
+int xchk_install_live_inode(struct xfs_scrub *sc, struct xfs_inode *ip);
+
+void xchk_ilock(struct xfs_scrub *sc, unsigned int ilock_flags);
+bool xchk_ilock_nowait(struct xfs_scrub *sc, unsigned int ilock_flags);
+void xchk_iunlock(struct xfs_scrub *sc, unsigned int ilock_flags);
+
void xchk_buffer_recheck(struct xfs_scrub *sc, struct xfs_buf *bp);
int xchk_iget(struct xfs_scrub *sc, xfs_ino_t inum, struct xfs_inode **ipp);
@@ -155,9 +167,29 @@ static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm)
XFS_SCRUB_OFLAG_XCORRUPT);
}
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+/* Decide if a repair is required. */
+static inline bool xchk_needs_repair(const struct xfs_scrub_metadata *sm)
+{
+ return sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
+ XFS_SCRUB_OFLAG_XCORRUPT |
+ XFS_SCRUB_OFLAG_PREEN);
+}
+#else
+# define xchk_needs_repair(sc) (false)
+#endif /* CONFIG_XFS_ONLINE_REPAIR */
+
int xchk_metadata_inode_forks(struct xfs_scrub *sc);
/*
+ * Helper macros to allocate and format xfile description strings.
+ * Callers must kfree the pointer returned.
+ */
+#define xchk_xfile_descr(sc, fmt, ...) \
+ kasprintf(XCHK_GFP_FLAGS, "XFS (%s): " fmt, \
+ (sc)->mp->m_super->s_id, ##__VA_ARGS__)
+
+/*
* Setting up a hook to wait for intents to drain is costly -- we have to take
* the CPU hotplug lock and force an i-cache flush on all CPUs once to set it
* up, and again to tear it down. These costs add up quickly, so we only want
@@ -171,4 +203,7 @@ static inline bool xchk_need_intent_drain(struct xfs_scrub *sc)
void xchk_fsgates_enable(struct xfs_scrub *sc, unsigned int scrub_fshooks);
+int xchk_inode_is_allocated(struct xfs_scrub *sc, xfs_agino_t agino,
+ bool *inuse);
+
#endif /* __XFS_SCRUB_COMMON_H__ */
diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c
index d2b2a1cb6533..5e2b09ed6e29 100644
--- a/fs/xfs/scrub/health.c
+++ b/fs/xfs/scrub/health.c
@@ -226,6 +226,16 @@ xchk_ag_btree_healthy_enough(
return true;
}
+ /*
+ * If we just repaired some AG metadata, sc->sick_mask will reflect all
+ * the per-AG metadata types that were repaired. Exclude these from
+ * the filesystem health query because we have not yet updated the
+ * health status and we want everything to be scanned.
+ */
+ if ((sc->flags & XREP_ALREADY_FIXED) &&
+ type_to_health_flag[sc->sm->sm_type].group == XHG_AG)
+ mask &= ~sc->sick_mask;
+
if (xfs_ag_has_sickness(pag, mask)) {
sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL;
return false;
diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c
index 575f22a02ebe..fb7bbf47ae5d 100644
--- a/fs/xfs/scrub/ialloc.c
+++ b/fs/xfs/scrub/ialloc.c
@@ -328,8 +328,7 @@ xchk_iallocbt_check_cluster_ifree(
goto out;
}
- error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp, fsino,
- &ino_inuse);
+ error = xchk_inode_is_allocated(bs->sc, agino, &ino_inuse);
if (error == -ENODATA) {
/* Not cached, just read the disk buffer */
freemask_ok = irec_free ^ !!(dip->di_mode);
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index 3e1e02e340a6..59d7912fb75f 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -32,15 +32,13 @@ xchk_prepare_iscrub(
{
int error;
- sc->ilock_flags = XFS_IOLOCK_EXCL;
- xfs_ilock(sc->ip, sc->ilock_flags);
+ xchk_ilock(sc, XFS_IOLOCK_EXCL);
error = xchk_trans_alloc(sc, 0);
if (error)
return error;
- sc->ilock_flags |= XFS_ILOCK_EXCL;
- xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
return 0;
}
@@ -83,7 +81,10 @@ xchk_setup_inode(
/* We want to scan the opened inode, so lock it and exit. */
if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) {
- sc->ip = ip_in;
+ error = xchk_install_live_inode(sc, ip_in);
+ if (error)
+ return error;
+
return xchk_prepare_iscrub(sc);
}
diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c
index 58d5dfb7ea21..e6155d86f791 100644
--- a/fs/xfs/scrub/parent.c
+++ b/fs/xfs/scrub/parent.c
@@ -150,8 +150,8 @@ xchk_parent_validate(
lock_mode = xchk_parent_ilock_dir(dp);
if (!lock_mode) {
- xfs_iunlock(sc->ip, XFS_ILOCK_EXCL);
- xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
error = -EAGAIN;
goto out_rele;
}
diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c
index e6caa358cbda..5671c8153433 100644
--- a/fs/xfs/scrub/quota.c
+++ b/fs/xfs/scrub/quota.c
@@ -59,9 +59,12 @@ xchk_setup_quota(
error = xchk_setup_fs(sc);
if (error)
return error;
- sc->ip = xfs_quota_inode(sc->mp, dqtype);
- xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
- sc->ilock_flags = XFS_ILOCK_EXCL;
+
+ error = xchk_install_live_inode(sc, xfs_quota_inode(sc->mp, dqtype));
+ if (error)
+ return error;
+
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
return 0;
}
@@ -235,13 +238,11 @@ xchk_quota(
* data fork we have to drop ILOCK_EXCL to use the regular dquot
* functions.
*/
- xfs_iunlock(sc->ip, sc->ilock_flags);
- sc->ilock_flags = 0;
+ xchk_iunlock(sc, sc->ilock_flags);
sqi.sc = sc;
sqi.last_id = 0;
error = xfs_qm_dqiterate(mp, dqtype, xchk_quota_item, &sqi);
- sc->ilock_flags = XFS_ILOCK_EXCL;
- xfs_ilock(sc->ip, sc->ilock_flags);
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
if (error == -ECANCELED)
error = 0;
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK,
diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c
new file mode 100644
index 000000000000..86a62420e02c
--- /dev/null
+++ b/fs/xfs/scrub/reap.c
@@ -0,0 +1,498 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2022-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_extent_busy.h"
+#include "xfs_ag.h"
+#include "xfs_ag_resv.h"
+#include "xfs_quota.h"
+#include "xfs_qm.h"
+#include "xfs_bmap.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr.h"
+#include "xfs_attr_remote.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/bitmap.h"
+#include "scrub/reap.h"
+
+/*
+ * Disposal of Blocks from Old Metadata
+ *
+ * Now that we've constructed a new btree to replace the damaged one, we want
+ * to dispose of the blocks that (we think) the old btree was using.
+ * Previously, we used the rmapbt to collect the extents (bitmap) with the
+ * rmap owner corresponding to the tree we rebuilt, collected extents for any
+ * blocks with the same rmap owner that are owned by another data structure
+ * (sublist), and subtracted sublist from bitmap. In theory the extents
+ * remaining in bitmap are the old btree's blocks.
+ *
+ * Unfortunately, it's possible that the btree was crosslinked with other
+ * blocks on disk. The rmap data can tell us if there are multiple owners, so
+ * if the rmapbt says there is an owner of this block other than @oinfo, then
+ * the block is crosslinked. Remove the reverse mapping and continue.
+ *
+ * If there is one rmap record, we can free the block, which removes the
+ * reverse mapping but doesn't add the block to the free space. Our repair
+ * strategy is to hope the other metadata objects crosslinked on this block
+ * will be rebuilt (atop different blocks), thereby removing all the cross
+ * links.
+ *
+ * If there are no rmap records at all, we also free the block. If the btree
+ * being rebuilt lives in the free space (bnobt/cntbt/rmapbt) then there isn't
+ * supposed to be a rmap record and everything is ok. For other btrees there
+ * had to have been an rmap entry for the block to have ended up on @bitmap,
+ * so if it's gone now there's something wrong and the fs will shut down.
+ *
+ * Note: If there are multiple rmap records with only the same rmap owner as
+ * the btree we're trying to rebuild and the block is indeed owned by another
+ * data structure with the same rmap owner, then the block will be in sublist
+ * and therefore doesn't need disposal. If there are multiple rmap records
+ * with only the same rmap owner but the block is not owned by something with
+ * the same rmap owner, the block will be freed.
+ *
+ * The caller is responsible for locking the AG headers for the entire rebuild
+ * operation so that nothing else can sneak in and change the AG state while
+ * we're not looking. We must also invalidate any buffers associated with
+ * @bitmap.
+ */
+
+/* Information about reaping extents after a repair. */
+struct xreap_state {
+ struct xfs_scrub *sc;
+
+ /* Reverse mapping owner and metadata reservation type. */
+ const struct xfs_owner_info *oinfo;
+ enum xfs_ag_resv_type resv;
+
+ /* If true, roll the transaction before reaping the next extent. */
+ bool force_roll;
+
+ /* Number of deferred reaps attached to the current transaction. */
+ unsigned int deferred;
+
+ /* Number of invalidated buffers logged to the current transaction. */
+ unsigned int invalidated;
+
+ /* Number of deferred reaps queued during the whole reap sequence. */
+ unsigned long long total_deferred;
+};
+
+/* Put a block back on the AGFL. */
+STATIC int
+xreap_put_freelist(
+ struct xfs_scrub *sc,
+ xfs_agblock_t agbno)
+{
+ struct xfs_buf *agfl_bp;
+ int error;
+
+ /* Make sure there's space on the freelist. */
+ error = xrep_fix_freelist(sc, true);
+ if (error)
+ return error;
+
+ /*
+ * Since we're "freeing" a lost block onto the AGFL, we have to
+ * create an rmap for the block prior to merging it or else other
+ * parts will break.
+ */
+ error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, 1,
+ &XFS_RMAP_OINFO_AG);
+ if (error)
+ return error;
+
+ /* Put the block on the AGFL. */
+ error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
+ if (error)
+ return error;
+
+ error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp,
+ agfl_bp, agbno, 0);
+ if (error)
+ return error;
+ xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1,
+ XFS_EXTENT_BUSY_SKIP_DISCARD);
+
+ return 0;
+}
+
+/* Are there any uncommitted reap operations? */
+static inline bool xreap_dirty(const struct xreap_state *rs)
+{
+ if (rs->force_roll)
+ return true;
+ if (rs->deferred)
+ return true;
+ if (rs->invalidated)
+ return true;
+ if (rs->total_deferred)
+ return true;
+ return false;
+}
+
+#define XREAP_MAX_BINVAL (2048)
+
+/*
+ * Decide if we want to roll the transaction after reaping an extent. We don't
+ * want to overrun the transaction reservation, so we prohibit more than
+ * 128 EFIs per transaction. For the same reason, we limit the number
+ * of buffer invalidations to 2048.
+ */
+static inline bool xreap_want_roll(const struct xreap_state *rs)
+{
+ if (rs->force_roll)
+ return true;
+ if (rs->deferred > XREP_MAX_ITRUNCATE_EFIS)
+ return true;
+ if (rs->invalidated > XREAP_MAX_BINVAL)
+ return true;
+ return false;
+}
+
+static inline void xreap_reset(struct xreap_state *rs)
+{
+ rs->total_deferred += rs->deferred;
+ rs->deferred = 0;
+ rs->invalidated = 0;
+ rs->force_roll = false;
+}
+
+#define XREAP_MAX_DEFER_CHAIN (2048)
+
+/*
+ * Decide if we want to finish the deferred ops that are attached to the scrub
+ * transaction. We don't want to queue huge chains of deferred ops because
+ * that can consume a lot of log space and kernel memory. Hence we trigger a
+ * xfs_defer_finish if there are more than 2048 deferred reap operations or the
+ * caller did some real work.
+ */
+static inline bool
+xreap_want_defer_finish(const struct xreap_state *rs)
+{
+ if (rs->force_roll)
+ return true;
+ if (rs->total_deferred > XREAP_MAX_DEFER_CHAIN)
+ return true;
+ return false;
+}
+
+static inline void xreap_defer_finish_reset(struct xreap_state *rs)
+{
+ rs->total_deferred = 0;
+ rs->deferred = 0;
+ rs->invalidated = 0;
+ rs->force_roll = false;
+}
+
+/* Try to invalidate the incore buffers for an extent that we're freeing. */
+STATIC void
+xreap_agextent_binval(
+ struct xreap_state *rs,
+ xfs_agblock_t agbno,
+ xfs_extlen_t *aglenp)
+{
+ struct xfs_scrub *sc = rs->sc;
+ struct xfs_perag *pag = sc->sa.pag;
+ struct xfs_mount *mp = sc->mp;
+ xfs_agnumber_t agno = sc->sa.pag->pag_agno;
+ xfs_agblock_t agbno_next = agbno + *aglenp;
+ xfs_agblock_t bno = agbno;
+
+ /*
+ * Avoid invalidating AG headers and post-EOFS blocks because we never
+ * own those.
+ */
+ if (!xfs_verify_agbno(pag, agbno) ||
+ !xfs_verify_agbno(pag, agbno_next - 1))
+ return;
+
+ /*
+ * If there are incore buffers for these blocks, invalidate them. We
+ * assume that the lack of any other known owners means that the buffer
+ * can be locked without risk of deadlocking. The buffer cache cannot
+ * detect aliasing, so employ nested loops to scan for incore buffers
+ * of any plausible size.
+ */
+ while (bno < agbno_next) {
+ xfs_agblock_t fsbcount;
+ xfs_agblock_t max_fsbs;
+
+ /*
+ * Max buffer size is the max remote xattr buffer size, which
+ * is one fs block larger than 64k.
+ */
+ max_fsbs = min_t(xfs_agblock_t, agbno_next - bno,
+ xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX));
+
+ for (fsbcount = 1; fsbcount < max_fsbs; fsbcount++) {
+ struct xfs_buf *bp = NULL;
+ xfs_daddr_t daddr;
+ int error;
+
+ daddr = XFS_AGB_TO_DADDR(mp, agno, bno);
+ error = xfs_buf_incore(mp->m_ddev_targp, daddr,
+ XFS_FSB_TO_BB(mp, fsbcount),
+ XBF_LIVESCAN, &bp);
+ if (error)
+ continue;
+
+ xfs_trans_bjoin(sc->tp, bp);
+ xfs_trans_binval(sc->tp, bp);
+ rs->invalidated++;
+
+ /*
+ * Stop invalidating if we've hit the limit; we should
+ * still have enough reservation left to free however
+ * far we've gotten.
+ */
+ if (rs->invalidated > XREAP_MAX_BINVAL) {
+ *aglenp -= agbno_next - bno;
+ goto out;
+ }
+ }
+
+ bno++;
+ }
+
+out:
+ trace_xreap_agextent_binval(sc->sa.pag, agbno, *aglenp);
+}
+
+/*
+ * Figure out the longest run of blocks that we can dispose of with a single
+ * call. Cross-linked blocks should have their reverse mappings removed, but
+ * single-owner extents can be freed. AGFL blocks can only be put back one at
+ * a time.
+ */
+STATIC int
+xreap_agextent_select(
+ struct xreap_state *rs,
+ xfs_agblock_t agbno,
+ xfs_agblock_t agbno_next,
+ bool *crosslinked,
+ xfs_extlen_t *aglenp)
+{
+ struct xfs_scrub *sc = rs->sc;
+ struct xfs_btree_cur *cur;
+ xfs_agblock_t bno = agbno + 1;
+ xfs_extlen_t len = 1;
+ int error;
+
+ /*
+ * Determine if there are any other rmap records covering the first
+ * block of this extent. If so, the block is crosslinked.
+ */
+ cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+ sc->sa.pag);
+ error = xfs_rmap_has_other_keys(cur, agbno, 1, rs->oinfo,
+ crosslinked);
+ if (error)
+ goto out_cur;
+
+ /* AGFL blocks can only be deal with one at a time. */
+ if (rs->resv == XFS_AG_RESV_AGFL)
+ goto out_found;
+
+ /*
+ * Figure out how many of the subsequent blocks have the same crosslink
+ * status.
+ */
+ while (bno < agbno_next) {
+ bool also_crosslinked;
+
+ error = xfs_rmap_has_other_keys(cur, bno, 1, rs->oinfo,
+ &also_crosslinked);
+ if (error)
+ goto out_cur;
+
+ if (*crosslinked != also_crosslinked)
+ break;
+
+ len++;
+ bno++;
+ }
+
+out_found:
+ *aglenp = len;
+ trace_xreap_agextent_select(sc->sa.pag, agbno, len, *crosslinked);
+out_cur:
+ xfs_btree_del_cursor(cur, error);
+ return error;
+}
+
+/*
+ * Dispose of as much of the beginning of this AG extent as possible. The
+ * number of blocks disposed of will be returned in @aglenp.
+ */
+STATIC int
+xreap_agextent_iter(
+ struct xreap_state *rs,
+ xfs_agblock_t agbno,
+ xfs_extlen_t *aglenp,
+ bool crosslinked)
+{
+ struct xfs_scrub *sc = rs->sc;
+ xfs_fsblock_t fsbno;
+ int error = 0;
+
+ fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, agbno);
+
+ /*
+ * If there are other rmappings, this block is cross linked and must
+ * not be freed. Remove the reverse mapping and move on. Otherwise,
+ * we were the only owner of the block, so free the extent, which will
+ * also remove the rmap.
+ *
+ * XXX: XFS doesn't support detecting the case where a single block
+ * metadata structure is crosslinked with a multi-block structure
+ * because the buffer cache doesn't detect aliasing problems, so we
+ * can't fix 100% of crosslinking problems (yet). The verifiers will
+ * blow on writeout, the filesystem will shut down, and the admin gets
+ * to run xfs_repair.
+ */
+ if (crosslinked) {
+ trace_xreap_dispose_unmap_extent(sc->sa.pag, agbno, *aglenp);
+
+ rs->force_roll = true;
+ return xfs_rmap_free(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno,
+ *aglenp, rs->oinfo);
+ }
+
+ trace_xreap_dispose_free_extent(sc->sa.pag, agbno, *aglenp);
+
+ /*
+ * Invalidate as many buffers as we can, starting at agbno. If this
+ * function sets *aglenp to zero, the transaction is full of logged
+ * buffer invalidations, so we need to return early so that we can
+ * roll and retry.
+ */
+ xreap_agextent_binval(rs, agbno, aglenp);
+ if (*aglenp == 0) {
+ ASSERT(xreap_want_roll(rs));
+ return 0;
+ }
+
+ /* Put blocks back on the AGFL one at a time. */
+ if (rs->resv == XFS_AG_RESV_AGFL) {
+ ASSERT(*aglenp == 1);
+ error = xreap_put_freelist(sc, agbno);
+ if (error)
+ return error;
+
+ rs->force_roll = true;
+ return 0;
+ }
+
+ /*
+ * Use deferred frees to get rid of the old btree blocks to try to
+ * minimize the window in which we could crash and lose the old blocks.
+ */
+ error = __xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo,
+ rs->resv, true);
+ if (error)
+ return error;
+
+ rs->deferred++;
+ return 0;
+}
+
+/*
+ * Break an AG metadata extent into sub-extents by fate (crosslinked, not
+ * crosslinked), and dispose of each sub-extent separately.
+ */
+STATIC int
+xreap_agmeta_extent(
+ uint64_t fsbno,
+ uint64_t len,
+ void *priv)
+{
+ struct xreap_state *rs = priv;
+ struct xfs_scrub *sc = rs->sc;
+ xfs_agblock_t agbno = fsbno;
+ xfs_agblock_t agbno_next = agbno + len;
+ int error = 0;
+
+ ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
+ ASSERT(sc->ip == NULL);
+
+ while (agbno < agbno_next) {
+ xfs_extlen_t aglen;
+ bool crosslinked;
+
+ error = xreap_agextent_select(rs, agbno, agbno_next,
+ &crosslinked, &aglen);
+ if (error)
+ return error;
+
+ error = xreap_agextent_iter(rs, agbno, &aglen, crosslinked);
+ if (error)
+ return error;
+
+ if (xreap_want_defer_finish(rs)) {
+ error = xrep_defer_finish(sc);
+ if (error)
+ return error;
+ xreap_defer_finish_reset(rs);
+ } else if (xreap_want_roll(rs)) {
+ error = xrep_roll_ag_trans(sc);
+ if (error)
+ return error;
+ xreap_reset(rs);
+ }
+
+ agbno += aglen;
+ }
+
+ return 0;
+}
+
+/* Dispose of every block of every AG metadata extent in the bitmap. */
+int
+xrep_reap_agblocks(
+ struct xfs_scrub *sc,
+ struct xagb_bitmap *bitmap,
+ const struct xfs_owner_info *oinfo,
+ enum xfs_ag_resv_type type)
+{
+ struct xreap_state rs = {
+ .sc = sc,
+ .oinfo = oinfo,
+ .resv = type,
+ };
+ int error;
+
+ ASSERT(xfs_has_rmapbt(sc->mp));
+ ASSERT(sc->ip == NULL);
+
+ error = xagb_bitmap_walk(bitmap, xreap_agmeta_extent, &rs);
+ if (error)
+ return error;
+
+ if (xreap_dirty(&rs))
+ return xrep_defer_finish(sc);
+
+ return 0;
+}
diff --git a/fs/xfs/scrub/reap.h b/fs/xfs/scrub/reap.h
new file mode 100644
index 000000000000..fe24626af164
--- /dev/null
+++ b/fs/xfs/scrub/reap.h
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2022-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_REAP_H__
+#define __XFS_SCRUB_REAP_H__
+
+int xrep_reap_agblocks(struct xfs_scrub *sc, struct xagb_bitmap *bitmap,
+ const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type);
+
+#endif /* __XFS_SCRUB_REAP_H__ */
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index ac6d8803e660..1b8b5439f2d7 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -26,11 +26,13 @@
#include "xfs_ag_resv.h"
#include "xfs_quota.h"
#include "xfs_qm.h"
+#include "xfs_defer.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
+#include "scrub/stats.h"
/*
* Attempt to repair some metadata, if the metadata is corrupt and userspace
@@ -39,8 +41,10 @@
*/
int
xrep_attempt(
- struct xfs_scrub *sc)
+ struct xfs_scrub *sc,
+ struct xchk_stats_run *run)
{
+ u64 repair_start;
int error = 0;
trace_xrep_attempt(XFS_I(file_inode(sc->file)), sc->sm, error);
@@ -49,8 +53,11 @@ xrep_attempt(
/* Repair whatever's broken. */
ASSERT(sc->ops->repair);
+ run->repair_attempted = true;
+ repair_start = xchk_stats_now();
error = sc->ops->repair(sc);
trace_xrep_done(XFS_I(file_inode(sc->file)), sc->sm, error);
+ run->repair_ns += xchk_stats_elapsed_ns(repair_start);
switch (error) {
case 0:
/*
@@ -59,14 +66,17 @@ xrep_attempt(
*/
sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
sc->flags |= XREP_ALREADY_FIXED;
+ run->repair_succeeded = true;
return -EAGAIN;
case -ECHRNG:
sc->flags |= XCHK_NEED_DRAIN;
+ run->retries++;
return -EAGAIN;
case -EDEADLOCK:
/* Tell the caller to try again having grabbed all the locks. */
if (!(sc->flags & XCHK_TRY_HARDER)) {
sc->flags |= XCHK_TRY_HARDER;
+ run->retries++;
return -EAGAIN;
}
/*
@@ -166,6 +176,56 @@ xrep_roll_ag_trans(
return 0;
}
+/* Finish all deferred work attached to the repair transaction. */
+int
+xrep_defer_finish(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ /*
+ * Keep the AG header buffers locked while we complete deferred work
+ * items. Ensure that both AG buffers are dirty and held when we roll
+ * the transaction so that they move forward in the log without losing
+ * the bli (and hence the bli type) when the transaction commits.
+ *
+ * Normal code would never hold clean buffers across a roll, but repair
+ * needs both buffers to maintain a total lock on the AG.
+ */
+ if (sc->sa.agi_bp) {
+ xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp, XFS_AGI_MAGICNUM);
+ xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
+ }
+
+ if (sc->sa.agf_bp) {
+ xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_MAGICNUM);
+ xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
+ }
+
+ /*
+ * Finish all deferred work items. We still hold the AG header buffers
+ * locked regardless of whether or not that succeeds. On failure, the
+ * buffers will be released during teardown on our way out of the
+ * kernel. If successful, join the buffers to the new transaction
+ * and move on.
+ */
+ error = xfs_defer_finish(&sc->tp);
+ if (error)
+ return error;
+
+ /*
+ * Release the hold that we set above because defer_finish won't do
+ * that for us. The defer roll code redirties held buffers after each
+ * roll, so the AG header buffers should be ready for logging.
+ */
+ if (sc->sa.agi_bp)
+ xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp);
+ if (sc->sa.agf_bp)
+ xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp);
+
+ return 0;
+}
+
/*
* Does the given AG have enough space to rebuild a btree? Neither AG
* reservation can be critical, and we must have enough space (factoring
@@ -297,89 +357,6 @@ xrep_calc_ag_resblks(
return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz));
}
-/* Allocate a block in an AG. */
-int
-xrep_alloc_ag_block(
- struct xfs_scrub *sc,
- const struct xfs_owner_info *oinfo,
- xfs_fsblock_t *fsbno,
- enum xfs_ag_resv_type resv)
-{
- struct xfs_alloc_arg args = {0};
- xfs_agblock_t bno;
- int error;
-
- switch (resv) {
- case XFS_AG_RESV_AGFL:
- case XFS_AG_RESV_RMAPBT:
- error = xfs_alloc_get_freelist(sc->sa.pag, sc->tp,
- sc->sa.agf_bp, &bno, 1);
- if (error)
- return error;
- if (bno == NULLAGBLOCK)
- return -ENOSPC;
- xfs_extent_busy_reuse(sc->mp, sc->sa.pag, bno, 1, false);
- *fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, bno);
- if (resv == XFS_AG_RESV_RMAPBT)
- xfs_ag_resv_rmapbt_alloc(sc->mp, sc->sa.pag->pag_agno);
- return 0;
- default:
- break;
- }
-
- args.tp = sc->tp;
- args.mp = sc->mp;
- args.pag = sc->sa.pag;
- args.oinfo = *oinfo;
- args.minlen = 1;
- args.maxlen = 1;
- args.prod = 1;
- args.resv = resv;
-
- error = xfs_alloc_vextent_this_ag(&args, sc->sa.pag->pag_agno);
- if (error)
- return error;
- if (args.fsbno == NULLFSBLOCK)
- return -ENOSPC;
- ASSERT(args.len == 1);
- *fsbno = args.fsbno;
-
- return 0;
-}
-
-/* Initialize a new AG btree root block with zero entries. */
-int
-xrep_init_btblock(
- struct xfs_scrub *sc,
- xfs_fsblock_t fsb,
- struct xfs_buf **bpp,
- xfs_btnum_t btnum,
- const struct xfs_buf_ops *ops)
-{
- struct xfs_trans *tp = sc->tp;
- struct xfs_mount *mp = sc->mp;
- struct xfs_buf *bp;
- int error;
-
- trace_xrep_init_btblock(mp, XFS_FSB_TO_AGNO(mp, fsb),
- XFS_FSB_TO_AGBNO(mp, fsb), btnum);
-
- ASSERT(XFS_FSB_TO_AGNO(mp, fsb) == sc->sa.pag->pag_agno);
- error = xfs_trans_get_buf(tp, mp->m_ddev_targp,
- XFS_FSB_TO_DADDR(mp, fsb), XFS_FSB_TO_BB(mp, 1), 0,
- &bp);
- if (error)
- return error;
- xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
- xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.pag->pag_agno);
- xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF);
- xfs_trans_log_buf(tp, bp, 0, BBTOB(bp->b_length) - 1);
- bp->b_ops = ops;
- *bpp = bp;
-
- return 0;
-}
-
/*
* Reconstructing per-AG Btrees
*
@@ -404,91 +381,8 @@ xrep_init_btblock(
* sublist. As with the other btrees we subtract sublist from bitmap, and the
* result (since the rmapbt lives in the free space) are the blocks from the
* old rmapbt.
- *
- * Disposal of Blocks from Old per-AG Btrees
- *
- * Now that we've constructed a new btree to replace the damaged one, we want
- * to dispose of the blocks that (we think) the old btree was using.
- * Previously, we used the rmapbt to collect the extents (bitmap) with the
- * rmap owner corresponding to the tree we rebuilt, collected extents for any
- * blocks with the same rmap owner that are owned by another data structure
- * (sublist), and subtracted sublist from bitmap. In theory the extents
- * remaining in bitmap are the old btree's blocks.
- *
- * Unfortunately, it's possible that the btree was crosslinked with other
- * blocks on disk. The rmap data can tell us if there are multiple owners, so
- * if the rmapbt says there is an owner of this block other than @oinfo, then
- * the block is crosslinked. Remove the reverse mapping and continue.
- *
- * If there is one rmap record, we can free the block, which removes the
- * reverse mapping but doesn't add the block to the free space. Our repair
- * strategy is to hope the other metadata objects crosslinked on this block
- * will be rebuilt (atop different blocks), thereby removing all the cross
- * links.
- *
- * If there are no rmap records at all, we also free the block. If the btree
- * being rebuilt lives in the free space (bnobt/cntbt/rmapbt) then there isn't
- * supposed to be a rmap record and everything is ok. For other btrees there
- * had to have been an rmap entry for the block to have ended up on @bitmap,
- * so if it's gone now there's something wrong and the fs will shut down.
- *
- * Note: If there are multiple rmap records with only the same rmap owner as
- * the btree we're trying to rebuild and the block is indeed owned by another
- * data structure with the same rmap owner, then the block will be in sublist
- * and therefore doesn't need disposal. If there are multiple rmap records
- * with only the same rmap owner but the block is not owned by something with
- * the same rmap owner, the block will be freed.
- *
- * The caller is responsible for locking the AG headers for the entire rebuild
- * operation so that nothing else can sneak in and change the AG state while
- * we're not looking. We also assume that the caller already invalidated any
- * buffers associated with @bitmap.
*/
-static int
-xrep_invalidate_block(
- uint64_t fsbno,
- void *priv)
-{
- struct xfs_scrub *sc = priv;
- struct xfs_buf *bp;
- int error;
-
- /* Skip AG headers and post-EOFS blocks */
- if (!xfs_verify_fsbno(sc->mp, fsbno))
- return 0;
-
- error = xfs_buf_incore(sc->mp->m_ddev_targp,
- XFS_FSB_TO_DADDR(sc->mp, fsbno),
- XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK, &bp);
- if (error)
- return 0;
-
- xfs_trans_bjoin(sc->tp, bp);
- xfs_trans_binval(sc->tp, bp);
- return 0;
-}
-
-/*
- * Invalidate buffers for per-AG btree blocks we're dumping. This function
- * is not intended for use with file data repairs; we have bunmapi for that.
- */
-int
-xrep_invalidate_blocks(
- struct xfs_scrub *sc,
- struct xbitmap *bitmap)
-{
- /*
- * For each block in each extent, see if there's an incore buffer for
- * exactly that block; if so, invalidate it. The buffer cache only
- * lets us look for one buffer at a time, so we have to look one block
- * at a time. Avoid invalidating AG headers and post-EOFS blocks
- * because we never own those; and if we can't TRYLOCK the buffer we
- * assume it's owned by someone else.
- */
- return xbitmap_walk_bits(bitmap, xrep_invalidate_block, sc);
-}
-
/* Ensure the freelist is the correct size. */
int
xrep_fix_freelist(
@@ -507,155 +401,6 @@ xrep_fix_freelist(
can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK);
}
-/* Information about reaping extents after a repair. */
-struct xrep_reap_state {
- struct xfs_scrub *sc;
-
- /* Reverse mapping owner and metadata reservation type. */
- const struct xfs_owner_info *oinfo;
- enum xfs_ag_resv_type resv;
-};
-
-/*
- * Put a block back on the AGFL.
- */
-STATIC int
-xrep_put_freelist(
- struct xfs_scrub *sc,
- xfs_agblock_t agbno)
-{
- struct xfs_buf *agfl_bp;
- int error;
-
- /* Make sure there's space on the freelist. */
- error = xrep_fix_freelist(sc, true);
- if (error)
- return error;
-
- /*
- * Since we're "freeing" a lost block onto the AGFL, we have to
- * create an rmap for the block prior to merging it or else other
- * parts will break.
- */
- error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, 1,
- &XFS_RMAP_OINFO_AG);
- if (error)
- return error;
-
- /* Put the block on the AGFL. */
- error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
- if (error)
- return error;
-
- error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp,
- agfl_bp, agbno, 0);
- if (error)
- return error;
- xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1,
- XFS_EXTENT_BUSY_SKIP_DISCARD);
-
- return 0;
-}
-
-/* Dispose of a single block. */
-STATIC int
-xrep_reap_block(
- uint64_t fsbno,
- void *priv)
-{
- struct xrep_reap_state *rs = priv;
- struct xfs_scrub *sc = rs->sc;
- struct xfs_btree_cur *cur;
- struct xfs_buf *agf_bp = NULL;
- xfs_agblock_t agbno;
- bool has_other_rmap;
- int error;
-
- ASSERT(sc->ip != NULL ||
- XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno);
- trace_xrep_dispose_btree_extent(sc->mp,
- XFS_FSB_TO_AGNO(sc->mp, fsbno),
- XFS_FSB_TO_AGBNO(sc->mp, fsbno), 1);
-
- agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
- ASSERT(XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno);
-
- /*
- * If we are repairing per-inode metadata, we need to read in the AGF
- * buffer. Otherwise, we're repairing a per-AG structure, so reuse
- * the AGF buffer that the setup functions already grabbed.
- */
- if (sc->ip) {
- error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &agf_bp);
- if (error)
- return error;
- } else {
- agf_bp = sc->sa.agf_bp;
- }
- cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf_bp, sc->sa.pag);
-
- /* Can we find any other rmappings? */
- error = xfs_rmap_has_other_keys(cur, agbno, 1, rs->oinfo,
- &has_other_rmap);
- xfs_btree_del_cursor(cur, error);
- if (error)
- goto out_free;
-
- /*
- * If there are other rmappings, this block is cross linked and must
- * not be freed. Remove the reverse mapping and move on. Otherwise,
- * we were the only owner of the block, so free the extent, which will
- * also remove the rmap.
- *
- * XXX: XFS doesn't support detecting the case where a single block
- * metadata structure is crosslinked with a multi-block structure
- * because the buffer cache doesn't detect aliasing problems, so we
- * can't fix 100% of crosslinking problems (yet). The verifiers will
- * blow on writeout, the filesystem will shut down, and the admin gets
- * to run xfs_repair.
- */
- if (has_other_rmap)
- error = xfs_rmap_free(sc->tp, agf_bp, sc->sa.pag, agbno,
- 1, rs->oinfo);
- else if (rs->resv == XFS_AG_RESV_AGFL)
- error = xrep_put_freelist(sc, agbno);
- else
- error = xfs_free_extent(sc->tp, sc->sa.pag, agbno, 1, rs->oinfo,
- rs->resv);
- if (agf_bp != sc->sa.agf_bp)
- xfs_trans_brelse(sc->tp, agf_bp);
- if (error)
- return error;
-
- if (sc->ip)
- return xfs_trans_roll_inode(&sc->tp, sc->ip);
- return xrep_roll_ag_trans(sc);
-
-out_free:
- if (agf_bp != sc->sa.agf_bp)
- xfs_trans_brelse(sc->tp, agf_bp);
- return error;
-}
-
-/* Dispose of every block of every extent in the bitmap. */
-int
-xrep_reap_extents(
- struct xfs_scrub *sc,
- struct xbitmap *bitmap,
- const struct xfs_owner_info *oinfo,
- enum xfs_ag_resv_type type)
-{
- struct xrep_reap_state rs = {
- .sc = sc,
- .oinfo = oinfo,
- .resv = type,
- };
-
- ASSERT(xfs_has_rmapbt(sc->mp));
-
- return xbitmap_walk_bits(bitmap, xrep_reap_block, &rs);
-}
-
/*
* Finding per-AG Btree Roots for AGF/AGI Reconstruction
*
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index dce791c679ee..60d2a9ae5f2e 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -8,6 +8,8 @@
#include "xfs_quota_defs.h"
+struct xchk_stats_run;
+
static inline int xrep_notsupported(struct xfs_scrub *sc)
{
return -EOPNOTSUPP;
@@ -15,28 +17,28 @@ static inline int xrep_notsupported(struct xfs_scrub *sc)
#ifdef CONFIG_XFS_ONLINE_REPAIR
+/*
+ * This is the maximum number of deferred extent freeing item extents (EFIs)
+ * that we'll attach to a transaction without rolling the transaction to avoid
+ * overrunning a tr_itruncate reservation.
+ */
+#define XREP_MAX_ITRUNCATE_EFIS (128)
+
+
/* Repair helpers */
-int xrep_attempt(struct xfs_scrub *sc);
+int xrep_attempt(struct xfs_scrub *sc, struct xchk_stats_run *run);
void xrep_failure(struct xfs_mount *mp);
int xrep_roll_ag_trans(struct xfs_scrub *sc);
+int xrep_defer_finish(struct xfs_scrub *sc);
bool xrep_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks,
enum xfs_ag_resv_type type);
xfs_extlen_t xrep_calc_ag_resblks(struct xfs_scrub *sc);
-int xrep_alloc_ag_block(struct xfs_scrub *sc,
- const struct xfs_owner_info *oinfo, xfs_fsblock_t *fsbno,
- enum xfs_ag_resv_type resv);
-int xrep_init_btblock(struct xfs_scrub *sc, xfs_fsblock_t fsb,
- struct xfs_buf **bpp, xfs_btnum_t btnum,
- const struct xfs_buf_ops *ops);
struct xbitmap;
struct xagb_bitmap;
int xrep_fix_freelist(struct xfs_scrub *sc, bool can_shrink);
-int xrep_invalidate_blocks(struct xfs_scrub *sc, struct xbitmap *btlist);
-int xrep_reap_extents(struct xfs_scrub *sc, struct xbitmap *exlist,
- const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type);
struct xrep_find_ag_btree {
/* in: rmap owner of the btree we're looking for */
@@ -70,7 +72,8 @@ int xrep_agi(struct xfs_scrub *sc);
static inline int
xrep_attempt(
- struct xfs_scrub *sc)
+ struct xfs_scrub *sc,
+ struct xchk_stats_run *run)
{
return -EOPNOTSUPP;
}
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index e7dace7b4be8..008ddb599e13 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -19,19 +19,20 @@
/* Set us up with the realtime metadata locked. */
int
-xchk_setup_rt(
+xchk_setup_rtbitmap(
struct xfs_scrub *sc)
{
int error;
- error = xchk_setup_fs(sc);
+ error = xchk_trans_alloc(sc, 0);
if (error)
return error;
- sc->ilock_flags = XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP;
- sc->ip = sc->mp->m_rbmip;
- xfs_ilock(sc->ip, sc->ilock_flags);
+ error = xchk_install_live_inode(sc, sc->mp->m_rbmip);
+ if (error)
+ return error;
+ xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP);
return 0;
}
@@ -123,43 +124,6 @@ out:
return error;
}
-/* Scrub the realtime summary. */
-int
-xchk_rtsummary(
- struct xfs_scrub *sc)
-{
- struct xfs_inode *rsumip = sc->mp->m_rsumip;
- struct xfs_inode *old_ip = sc->ip;
- uint old_ilock_flags = sc->ilock_flags;
- int error = 0;
-
- /*
- * We ILOCK'd the rt bitmap ip in the setup routine, now lock the
- * rt summary ip in compliance with the rt inode locking rules.
- *
- * Since we switch sc->ip to rsumip we have to save the old ilock
- * flags so that we don't mix up the inode state that @sc tracks.
- */
- sc->ip = rsumip;
- sc->ilock_flags = XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM;
- xfs_ilock(sc->ip, sc->ilock_flags);
-
- /* Invoke the fork scrubber. */
- error = xchk_metadata_inode_forks(sc);
- if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
- goto out;
-
- /* XXX: implement this some day */
- xchk_set_incomplete(sc);
-out:
- /* Switch back to the rtbitmap inode and lock flags. */
- xfs_iunlock(sc->ip, sc->ilock_flags);
- sc->ilock_flags = old_ilock_flags;
- sc->ip = old_ip;
- return error;
-}
-
-
/* xref check that the extent is not free in the rtbitmap */
void
xchk_xref_is_used_rt_space(
diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c
new file mode 100644
index 000000000000..437ed9acbb27
--- /dev/null
+++ b/fs/xfs/scrub/rtsummary.c
@@ -0,0 +1,264 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2017-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_inode.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_rtalloc.h"
+#include "xfs_bit.h"
+#include "xfs_bmap.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/xfile.h"
+
+/*
+ * Realtime Summary
+ * ================
+ *
+ * We check the realtime summary by scanning the realtime bitmap file to create
+ * a new summary file incore, and then we compare the computed version against
+ * the ondisk version. We use the 'xfile' functionality to store this
+ * (potentially large) amount of data in pageable memory.
+ */
+
+/* Set us up to check the rtsummary file. */
+int
+xchk_setup_rtsummary(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ char *descr;
+ int error;
+
+ /*
+ * Create an xfile to construct a new rtsummary file. The xfile allows
+ * us to avoid pinning kernel memory for this purpose.
+ */
+ descr = xchk_xfile_descr(sc, "realtime summary file");
+ error = xfile_create(descr, mp->m_rsumsize, &sc->xfile);
+ kfree(descr);
+ if (error)
+ return error;
+
+ error = xchk_trans_alloc(sc, 0);
+ if (error)
+ return error;
+
+ /* Allocate a memory buffer for the summary comparison. */
+ sc->buf = kvmalloc(mp->m_sb.sb_blocksize, XCHK_GFP_FLAGS);
+ if (!sc->buf)
+ return -ENOMEM;
+
+ error = xchk_install_live_inode(sc, mp->m_rsumip);
+ if (error)
+ return error;
+
+ /*
+ * Locking order requires us to take the rtbitmap first. We must be
+ * careful to unlock it ourselves when we are done with the rtbitmap
+ * file since the scrub infrastructure won't do that for us. Only
+ * then we can lock the rtsummary inode.
+ */
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+ xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM);
+ return 0;
+}
+
+/* Helper functions to record suminfo words in an xfile. */
+
+typedef unsigned int xchk_rtsumoff_t;
+
+static inline int
+xfsum_load(
+ struct xfs_scrub *sc,
+ xchk_rtsumoff_t sumoff,
+ xfs_suminfo_t *info)
+{
+ return xfile_obj_load(sc->xfile, info, sizeof(xfs_suminfo_t),
+ sumoff << XFS_WORDLOG);
+}
+
+static inline int
+xfsum_store(
+ struct xfs_scrub *sc,
+ xchk_rtsumoff_t sumoff,
+ const xfs_suminfo_t info)
+{
+ return xfile_obj_store(sc->xfile, &info, sizeof(xfs_suminfo_t),
+ sumoff << XFS_WORDLOG);
+}
+
+static inline int
+xfsum_copyout(
+ struct xfs_scrub *sc,
+ xchk_rtsumoff_t sumoff,
+ xfs_suminfo_t *info,
+ unsigned int nr_words)
+{
+ return xfile_obj_load(sc->xfile, info, nr_words << XFS_WORDLOG,
+ sumoff << XFS_WORDLOG);
+}
+
+/* Update the summary file to reflect the free extent that we've accumulated. */
+STATIC int
+xchk_rtsum_record_free(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ const struct xfs_rtalloc_rec *rec,
+ void *priv)
+{
+ struct xfs_scrub *sc = priv;
+ xfs_fileoff_t rbmoff;
+ xfs_rtblock_t rtbno;
+ xfs_filblks_t rtlen;
+ xchk_rtsumoff_t offs;
+ unsigned int lenlog;
+ xfs_suminfo_t v = 0;
+ int error = 0;
+
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ /* Compute the relevant location in the rtsum file. */
+ rbmoff = XFS_BITTOBLOCK(mp, rec->ar_startext);
+ lenlog = XFS_RTBLOCKLOG(rec->ar_extcount);
+ offs = XFS_SUMOFFS(mp, lenlog, rbmoff);
+
+ rtbno = rec->ar_startext * mp->m_sb.sb_rextsize;
+ rtlen = rec->ar_extcount * mp->m_sb.sb_rextsize;
+
+ if (!xfs_verify_rtext(mp, rtbno, rtlen)) {
+ xchk_ino_xref_set_corrupt(sc, mp->m_rbmip->i_ino);
+ return -EFSCORRUPTED;
+ }
+
+ /* Bump the summary count. */
+ error = xfsum_load(sc, offs, &v);
+ if (error)
+ return error;
+
+ v++;
+ trace_xchk_rtsum_record_free(mp, rec->ar_startext, rec->ar_extcount,
+ lenlog, offs, v);
+
+ return xfsum_store(sc, offs, v);
+}
+
+/* Compute the realtime summary from the realtime bitmap. */
+STATIC int
+xchk_rtsum_compute(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ unsigned long long rtbmp_bytes;
+
+ /* If the bitmap size doesn't match the computed size, bail. */
+ rtbmp_bytes = howmany_64(mp->m_sb.sb_rextents, NBBY);
+ if (roundup_64(rtbmp_bytes, mp->m_sb.sb_blocksize) !=
+ mp->m_rbmip->i_disk_size)
+ return -EFSCORRUPTED;
+
+ return xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_rtsum_record_free,
+ sc);
+}
+
+/* Compare the rtsummary file against the one we computed. */
+STATIC int
+xchk_rtsum_compare(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_buf *bp;
+ struct xfs_bmbt_irec map;
+ xfs_fileoff_t off;
+ xchk_rtsumoff_t sumoff = 0;
+ int nmap;
+
+ for (off = 0; off < XFS_B_TO_FSB(mp, mp->m_rsumsize); off++) {
+ int error = 0;
+
+ if (xchk_should_terminate(sc, &error))
+ return error;
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return 0;
+
+ /* Make sure we have a written extent. */
+ nmap = 1;
+ error = xfs_bmapi_read(mp->m_rsumip, off, 1, &map, &nmap,
+ XFS_DATA_FORK);
+ if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error))
+ return error;
+
+ if (nmap != 1 || !xfs_bmap_is_written_extent(&map)) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, off);
+ return 0;
+ }
+
+ /* Read a block's worth of ondisk rtsummary file. */
+ error = xfs_rtbuf_get(mp, sc->tp, off, 1, &bp);
+ if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error))
+ return error;
+
+ /* Read a block's worth of computed rtsummary file. */
+ error = xfsum_copyout(sc, sumoff, sc->buf, mp->m_blockwsize);
+ if (error) {
+ xfs_trans_brelse(sc->tp, bp);
+ return error;
+ }
+
+ if (memcmp(bp->b_addr, sc->buf,
+ mp->m_blockwsize << XFS_WORDLOG) != 0)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, off);
+
+ xfs_trans_brelse(sc->tp, bp);
+ sumoff += mp->m_blockwsize;
+ }
+
+ return 0;
+}
+
+/* Scrub the realtime summary. */
+int
+xchk_rtsummary(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ int error = 0;
+
+ /* Invoke the fork scrubber. */
+ error = xchk_metadata_inode_forks(sc);
+ if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ goto out_rbm;
+
+ /* Construct the new summary file from the rtbitmap. */
+ error = xchk_rtsum_compute(sc);
+ if (error == -EFSCORRUPTED) {
+ /*
+ * EFSCORRUPTED means the rtbitmap is corrupt, which is an xref
+ * error since we're checking the summary file.
+ */
+ xchk_ino_xref_set_corrupt(sc, mp->m_rbmip->i_ino);
+ error = 0;
+ goto out_rbm;
+ }
+ if (error)
+ goto out_rbm;
+
+ /* Does the computed summary file match the actual rtsummary file? */
+ error = xchk_rtsum_compare(sc);
+
+out_rbm:
+ /* Unlock the rtbitmap since we're done with it. */
+ xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+ return error;
+}
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index a0fffbcd022b..7d3aa14d81b5 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -22,6 +22,8 @@
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/health.h"
+#include "scrub/stats.h"
+#include "scrub/xfile.h"
/*
* Online Scrub and Repair
@@ -166,8 +168,6 @@ xchk_teardown(
struct xfs_scrub *sc,
int error)
{
- struct xfs_inode *ip_in = XFS_I(file_inode(sc->file));
-
xchk_ag_free(sc, &sc->sa);
if (sc->tp) {
if (error == 0 && (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
@@ -178,16 +178,18 @@ xchk_teardown(
}
if (sc->ip) {
if (sc->ilock_flags)
- xfs_iunlock(sc->ip, sc->ilock_flags);
- if (sc->ip != ip_in &&
- !xfs_internal_inum(sc->mp, sc->ip->i_ino))
- xchk_irele(sc, sc->ip);
+ xchk_iunlock(sc, sc->ilock_flags);
+ xchk_irele(sc, sc->ip);
sc->ip = NULL;
}
if (sc->flags & XCHK_HAVE_FREEZE_PROT) {
sc->flags &= ~XCHK_HAVE_FREEZE_PROT;
mnt_drop_write_file(sc->file);
}
+ if (sc->xfile) {
+ xfile_destroy(sc->xfile);
+ sc->xfile = NULL;
+ }
if (sc->buf) {
if (sc->buf_cleanup)
sc->buf_cleanup(sc->buf);
@@ -322,14 +324,14 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
},
[XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */
.type = ST_FS,
- .setup = xchk_setup_rt,
+ .setup = xchk_setup_rtbitmap,
.scrub = xchk_rtbitmap,
.has = xfs_has_realtime,
.repair = xrep_notsupported,
},
[XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */
.type = ST_FS,
- .setup = xchk_setup_rt,
+ .setup = xchk_setup_rtsummary,
.scrub = xchk_rtsummary,
.has = xfs_has_realtime,
.repair = xrep_notsupported,
@@ -409,6 +411,11 @@ xchk_validate_inputs(
goto out;
}
+ /* No rebuild without repair. */
+ if ((sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) &&
+ !(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
+ return -EINVAL;
+
/*
* We only want to repair read-write v5+ filesystems. Defer the check
* for ops->repair until after our scrub confirms that we need to
@@ -463,8 +470,10 @@ xfs_scrub_metadata(
struct file *file,
struct xfs_scrub_metadata *sm)
{
+ struct xchk_stats_run run = { };
struct xfs_scrub *sc;
struct xfs_mount *mp = XFS_I(file_inode(file))->i_mount;
+ u64 check_start;
int error = 0;
BUILD_BUG_ON(sizeof(meta_scrub_ops) !=
@@ -521,7 +530,9 @@ retry_op:
goto out_teardown;
/* Scrub for errors. */
+ check_start = xchk_stats_now();
error = sc->ops->scrub(sc);
+ run.scrub_ns += xchk_stats_elapsed_ns(check_start);
if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER))
goto try_harder;
if (error == -ECHRNG && !(sc->flags & XCHK_NEED_DRAIN))
@@ -533,15 +544,16 @@ retry_op:
if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) &&
!(sc->flags & XREP_ALREADY_FIXED)) {
- bool needs_fix;
+ bool needs_fix = xchk_needs_repair(sc->sm);
+
+ /* Userspace asked us to rebuild the structure regardless. */
+ if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD)
+ needs_fix = true;
/* Let debug users force us into the repair routines. */
- if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR))
- sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+ if (XFS_TEST_ERROR(needs_fix, mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR))
+ needs_fix = true;
- needs_fix = (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
- XFS_SCRUB_OFLAG_XCORRUPT |
- XFS_SCRUB_OFLAG_PREEN));
/*
* If userspace asked for a repair but it wasn't necessary,
* report that back to userspace.
@@ -555,7 +567,7 @@ retry_op:
* If it's broken, userspace wants us to fix it, and we haven't
* already tried to fix it, then attempt a repair.
*/
- error = xrep_attempt(sc);
+ error = xrep_attempt(sc, &run);
if (error == -EAGAIN) {
/*
* Either the repair function succeeded or it couldn't
@@ -583,12 +595,15 @@ out:
sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
error = 0;
}
+ if (error != -ENOENT)
+ xchk_stats_merge(mp, sm, &run);
return error;
need_drain:
error = xchk_teardown(sc, 0);
if (error)
goto out_sc;
sc->flags |= XCHK_NEED_DRAIN;
+ run.retries++;
goto retry_op;
try_harder:
/*
@@ -600,5 +615,6 @@ try_harder:
if (error)
goto out_sc;
sc->flags |= XCHK_TRY_HARDER;
+ run.retries++;
goto retry_op;
}
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index f8ba00e51ca9..1ef9c6b4842a 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -88,6 +88,10 @@ struct xfs_scrub {
*/
void (*buf_cleanup)(void *buf);
+ /* xfile used by the scrubbers; freed at teardown. */
+ struct xfile *xfile;
+
+ /* Lock flags for @ip. */
uint ilock_flags;
/* See the XCHK/XREP state flags below. */
diff --git a/fs/xfs/scrub/stats.c b/fs/xfs/scrub/stats.c
new file mode 100644
index 000000000000..aeb92624176b
--- /dev/null
+++ b/fs/xfs/scrub/stats.c
@@ -0,0 +1,405 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_sysfs.h"
+#include "xfs_btree.h"
+#include "xfs_super.h"
+#include "scrub/scrub.h"
+#include "scrub/stats.h"
+#include "scrub/trace.h"
+
+struct xchk_scrub_stats {
+ /* all 32-bit counters here */
+
+ /* checking stats */
+ uint32_t invocations;
+ uint32_t clean;
+ uint32_t corrupt;
+ uint32_t preen;
+ uint32_t xfail;
+ uint32_t xcorrupt;
+ uint32_t incomplete;
+ uint32_t warning;
+ uint32_t retries;
+
+ /* repair stats */
+ uint32_t repair_invocations;
+ uint32_t repair_success;
+
+ /* all 64-bit items here */
+
+ /* runtimes */
+ uint64_t checktime_us;
+ uint64_t repairtime_us;
+
+ /* non-counter state must go at the end for clearall */
+ spinlock_t css_lock;
+};
+
+struct xchk_stats {
+ struct dentry *cs_debugfs;
+ struct xchk_scrub_stats cs_stats[XFS_SCRUB_TYPE_NR];
+};
+
+
+static struct xchk_stats global_stats;
+
+static const char *name_map[XFS_SCRUB_TYPE_NR] = {
+ [XFS_SCRUB_TYPE_SB] = "sb",
+ [XFS_SCRUB_TYPE_AGF] = "agf",
+ [XFS_SCRUB_TYPE_AGFL] = "agfl",
+ [XFS_SCRUB_TYPE_AGI] = "agi",
+ [XFS_SCRUB_TYPE_BNOBT] = "bnobt",
+ [XFS_SCRUB_TYPE_CNTBT] = "cntbt",
+ [XFS_SCRUB_TYPE_INOBT] = "inobt",
+ [XFS_SCRUB_TYPE_FINOBT] = "finobt",
+ [XFS_SCRUB_TYPE_RMAPBT] = "rmapbt",
+ [XFS_SCRUB_TYPE_REFCNTBT] = "refcountbt",
+ [XFS_SCRUB_TYPE_INODE] = "inode",
+ [XFS_SCRUB_TYPE_BMBTD] = "bmapbtd",
+ [XFS_SCRUB_TYPE_BMBTA] = "bmapbta",
+ [XFS_SCRUB_TYPE_BMBTC] = "bmapbtc",
+ [XFS_SCRUB_TYPE_DIR] = "directory",
+ [XFS_SCRUB_TYPE_XATTR] = "xattr",
+ [XFS_SCRUB_TYPE_SYMLINK] = "symlink",
+ [XFS_SCRUB_TYPE_PARENT] = "parent",
+ [XFS_SCRUB_TYPE_RTBITMAP] = "rtbitmap",
+ [XFS_SCRUB_TYPE_RTSUM] = "rtsummary",
+ [XFS_SCRUB_TYPE_UQUOTA] = "usrquota",
+ [XFS_SCRUB_TYPE_GQUOTA] = "grpquota",
+ [XFS_SCRUB_TYPE_PQUOTA] = "prjquota",
+ [XFS_SCRUB_TYPE_FSCOUNTERS] = "fscounters",
+};
+
+/* Format the scrub stats into a text buffer, similar to pcp style. */
+STATIC ssize_t
+xchk_stats_format(
+ struct xchk_stats *cs,
+ char *buf,
+ size_t remaining)
+{
+ struct xchk_scrub_stats *css = &cs->cs_stats[0];
+ unsigned int i;
+ ssize_t copied = 0;
+ int ret = 0;
+
+ for (i = 0; i < XFS_SCRUB_TYPE_NR; i++, css++) {
+ if (!name_map[i])
+ continue;
+
+ ret = scnprintf(buf, remaining,
+ "%s %u %u %u %u %u %u %u %u %u %llu %u %u %llu\n",
+ name_map[i],
+ (unsigned int)css->invocations,
+ (unsigned int)css->clean,
+ (unsigned int)css->corrupt,
+ (unsigned int)css->preen,
+ (unsigned int)css->xfail,
+ (unsigned int)css->xcorrupt,
+ (unsigned int)css->incomplete,
+ (unsigned int)css->warning,
+ (unsigned int)css->retries,
+ (unsigned long long)css->checktime_us,
+ (unsigned int)css->repair_invocations,
+ (unsigned int)css->repair_success,
+ (unsigned long long)css->repairtime_us);
+ if (ret <= 0)
+ break;
+
+ remaining -= ret;
+ copied += ret;
+ buf += ret;
+ }
+
+ return copied > 0 ? copied : ret;
+}
+
+/* Estimate the worst case buffer size required to hold the whole report. */
+STATIC size_t
+xchk_stats_estimate_bufsize(
+ struct xchk_stats *cs)
+{
+ struct xchk_scrub_stats *css = &cs->cs_stats[0];
+ unsigned int i;
+ size_t field_width;
+ size_t ret = 0;
+
+ /* 4294967296 plus one space for each u32 field */
+ field_width = 11 * (offsetof(struct xchk_scrub_stats, checktime_us) /
+ sizeof(uint32_t));
+
+ /* 18446744073709551615 plus one space for each u64 field */
+ field_width += 21 * ((offsetof(struct xchk_scrub_stats, css_lock) -
+ offsetof(struct xchk_scrub_stats, checktime_us)) /
+ sizeof(uint64_t));
+
+ for (i = 0; i < XFS_SCRUB_TYPE_NR; i++, css++) {
+ if (!name_map[i])
+ continue;
+
+ /* name plus one space */
+ ret += 1 + strlen(name_map[i]);
+
+ /* all fields, plus newline */
+ ret += field_width + 1;
+ }
+
+ return ret;
+}
+
+/* Clear all counters. */
+STATIC void
+xchk_stats_clearall(
+ struct xchk_stats *cs)
+{
+ struct xchk_scrub_stats *css = &cs->cs_stats[0];
+ unsigned int i;
+
+ for (i = 0; i < XFS_SCRUB_TYPE_NR; i++, css++) {
+ spin_lock(&css->css_lock);
+ memset(css, 0, offsetof(struct xchk_scrub_stats, css_lock));
+ spin_unlock(&css->css_lock);
+ }
+}
+
+#define XFS_SCRUB_OFLAG_UNCLEAN (XFS_SCRUB_OFLAG_CORRUPT | \
+ XFS_SCRUB_OFLAG_PREEN | \
+ XFS_SCRUB_OFLAG_XFAIL | \
+ XFS_SCRUB_OFLAG_XCORRUPT | \
+ XFS_SCRUB_OFLAG_INCOMPLETE | \
+ XFS_SCRUB_OFLAG_WARNING)
+
+STATIC void
+xchk_stats_merge_one(
+ struct xchk_stats *cs,
+ const struct xfs_scrub_metadata *sm,
+ const struct xchk_stats_run *run)
+{
+ struct xchk_scrub_stats *css;
+
+ ASSERT(sm->sm_type < XFS_SCRUB_TYPE_NR);
+
+ css = &cs->cs_stats[sm->sm_type];
+ spin_lock(&css->css_lock);
+ css->invocations++;
+ if (!(sm->sm_flags & XFS_SCRUB_OFLAG_UNCLEAN))
+ css->clean++;
+ if (sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ css->corrupt++;
+ if (sm->sm_flags & XFS_SCRUB_OFLAG_PREEN)
+ css->preen++;
+ if (sm->sm_flags & XFS_SCRUB_OFLAG_XFAIL)
+ css->xfail++;
+ if (sm->sm_flags & XFS_SCRUB_OFLAG_XCORRUPT)
+ css->xcorrupt++;
+ if (sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
+ css->incomplete++;
+ if (sm->sm_flags & XFS_SCRUB_OFLAG_WARNING)
+ css->warning++;
+ css->retries += run->retries;
+ css->checktime_us += howmany_64(run->scrub_ns, NSEC_PER_USEC);
+
+ if (run->repair_attempted)
+ css->repair_invocations++;
+ if (run->repair_succeeded)
+ css->repair_success++;
+ css->repairtime_us += howmany_64(run->repair_ns, NSEC_PER_USEC);
+ spin_unlock(&css->css_lock);
+}
+
+/* Merge these scrub-run stats into the global and mount stat data. */
+void
+xchk_stats_merge(
+ struct xfs_mount *mp,
+ const struct xfs_scrub_metadata *sm,
+ const struct xchk_stats_run *run)
+{
+ xchk_stats_merge_one(&global_stats, sm, run);
+ xchk_stats_merge_one(mp->m_scrub_stats, sm, run);
+}
+
+/* debugfs boilerplate */
+
+static ssize_t
+xchk_scrub_stats_read(
+ struct file *file,
+ char __user *ubuf,
+ size_t count,
+ loff_t *ppos)
+{
+ struct xchk_stats *cs = file->private_data;
+ char *buf;
+ size_t bufsize;
+ ssize_t avail, ret;
+
+ /*
+ * This generates stringly snapshot of all the scrub counters, so we
+ * do not want userspace to receive garbled text from multiple calls.
+ * If the file position is greater than 0, return a short read.
+ */
+ if (*ppos > 0)
+ return 0;
+
+ bufsize = xchk_stats_estimate_bufsize(cs);
+
+ buf = kvmalloc(bufsize, XCHK_GFP_FLAGS);
+ if (!buf)
+ return -ENOMEM;
+
+ avail = xchk_stats_format(cs, buf, bufsize);
+ if (avail < 0) {
+ ret = avail;
+ goto out;
+ }
+
+ ret = simple_read_from_buffer(ubuf, count, ppos, buf, avail);
+out:
+ kvfree(buf);
+ return ret;
+}
+
+static const struct file_operations scrub_stats_fops = {
+ .open = simple_open,
+ .read = xchk_scrub_stats_read,
+};
+
+static ssize_t
+xchk_clear_scrub_stats_write(
+ struct file *file,
+ const char __user *ubuf,
+ size_t count,
+ loff_t *ppos)
+{
+ struct xchk_stats *cs = file->private_data;
+ unsigned int val;
+ int ret;
+
+ ret = kstrtouint_from_user(ubuf, count, 0, &val);
+ if (ret)
+ return ret;
+
+ if (val != 1)
+ return -EINVAL;
+
+ xchk_stats_clearall(cs);
+ return count;
+}
+
+static const struct file_operations clear_scrub_stats_fops = {
+ .open = simple_open,
+ .write = xchk_clear_scrub_stats_write,
+};
+
+/* Initialize the stats object. */
+STATIC int
+xchk_stats_init(
+ struct xchk_stats *cs,
+ struct xfs_mount *mp)
+{
+ struct xchk_scrub_stats *css = &cs->cs_stats[0];
+ unsigned int i;
+
+ for (i = 0; i < XFS_SCRUB_TYPE_NR; i++, css++)
+ spin_lock_init(&css->css_lock);
+
+ return 0;
+}
+
+/* Connect the stats object to debugfs. */
+void
+xchk_stats_register(
+ struct xchk_stats *cs,
+ struct dentry *parent)
+{
+ if (!parent)
+ return;
+
+ cs->cs_debugfs = xfs_debugfs_mkdir("scrub", parent);
+ if (!cs->cs_debugfs)
+ return;
+
+ debugfs_create_file("stats", 0644, cs->cs_debugfs, cs,
+ &scrub_stats_fops);
+ debugfs_create_file("clear_stats", 0400, cs->cs_debugfs, cs,
+ &clear_scrub_stats_fops);
+}
+
+/* Free all resources related to the stats object. */
+STATIC int
+xchk_stats_teardown(
+ struct xchk_stats *cs)
+{
+ return 0;
+}
+
+/* Disconnect the stats object from debugfs. */
+void
+xchk_stats_unregister(
+ struct xchk_stats *cs)
+{
+ debugfs_remove(cs->cs_debugfs);
+}
+
+/* Initialize global stats and register them */
+int __init
+xchk_global_stats_setup(
+ struct dentry *parent)
+{
+ int error;
+
+ error = xchk_stats_init(&global_stats, NULL);
+ if (error)
+ return error;
+
+ xchk_stats_register(&global_stats, parent);
+ return 0;
+}
+
+/* Unregister global stats and tear them down */
+void
+xchk_global_stats_teardown(void)
+{
+ xchk_stats_unregister(&global_stats);
+ xchk_stats_teardown(&global_stats);
+}
+
+/* Allocate per-mount stats */
+int
+xchk_mount_stats_alloc(
+ struct xfs_mount *mp)
+{
+ struct xchk_stats *cs;
+ int error;
+
+ cs = kvzalloc(sizeof(struct xchk_stats), GFP_KERNEL);
+ if (!cs)
+ return -ENOMEM;
+
+ error = xchk_stats_init(cs, mp);
+ if (error)
+ goto out_free;
+
+ mp->m_scrub_stats = cs;
+ return 0;
+out_free:
+ kvfree(cs);
+ return error;
+}
+
+/* Free per-mount stats */
+void
+xchk_mount_stats_free(
+ struct xfs_mount *mp)
+{
+ xchk_stats_teardown(mp->m_scrub_stats);
+ kvfree(mp->m_scrub_stats);
+ mp->m_scrub_stats = NULL;
+}
diff --git a/fs/xfs/scrub/stats.h b/fs/xfs/scrub/stats.h
new file mode 100644
index 000000000000..b358ad8d8b90
--- /dev/null
+++ b/fs/xfs/scrub/stats.h
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_STATS_H__
+#define __XFS_SCRUB_STATS_H__
+
+struct xchk_stats_run {
+ u64 scrub_ns;
+ u64 repair_ns;
+ unsigned int retries;
+ bool repair_attempted;
+ bool repair_succeeded;
+};
+
+#ifdef CONFIG_XFS_ONLINE_SCRUB_STATS
+struct xchk_stats;
+
+int __init xchk_global_stats_setup(struct dentry *parent);
+void xchk_global_stats_teardown(void);
+
+int xchk_mount_stats_alloc(struct xfs_mount *mp);
+void xchk_mount_stats_free(struct xfs_mount *mp);
+
+void xchk_stats_register(struct xchk_stats *cs, struct dentry *parent);
+void xchk_stats_unregister(struct xchk_stats *cs);
+
+void xchk_stats_merge(struct xfs_mount *mp, const struct xfs_scrub_metadata *sm,
+ const struct xchk_stats_run *run);
+
+static inline u64 xchk_stats_now(void) { return ktime_get_ns(); }
+static inline u64 xchk_stats_elapsed_ns(u64 since)
+{
+ u64 now = xchk_stats_now();
+
+ /*
+ * If the system doesn't have a high enough resolution clock, charge at
+ * least one nanosecond so that our stats don't report instantaneous
+ * runtimes.
+ */
+ if (now == since)
+ return 1;
+
+ return now - since;
+}
+#else
+# define xchk_global_stats_setup(parent) (0)
+# define xchk_global_stats_teardown() ((void)0)
+# define xchk_mount_stats_alloc(mp) (0)
+# define xchk_mount_stats_free(mp) ((void)0)
+# define xchk_stats_register(cs, parent) ((void)0)
+# define xchk_stats_unregister(cs) ((void)0)
+# define xchk_stats_now() (0)
+# define xchk_stats_elapsed_ns(x) (0 * (x))
+# define xchk_stats_merge(mp, sm, run) ((void)0)
+#endif /* CONFIG_XFS_ONLINE_SCRUB_STATS */
+
+#endif /* __XFS_SCRUB_STATS_H__ */
diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c
index 0a975439d2b6..46249e7b17e0 100644
--- a/fs/xfs/scrub/trace.c
+++ b/fs/xfs/scrub/trace.c
@@ -12,8 +12,10 @@
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
-#include "scrub/scrub.h"
#include "xfs_ag.h"
+#include "scrub/scrub.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
/* Figure out which block the btree cursor was pointing to. */
static inline xfs_fsblock_t
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 0b54f1a1cf0c..cbd4d01e253c 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -16,6 +16,10 @@
#include <linux/tracepoint.h>
#include "xfs_bit.h"
+struct xfile;
+struct xfarray;
+struct xfarray_sortinfo;
+
/*
* ftrace's __print_symbolic requires that all enum values be wrapped in the
* TRACE_DEFINE_ENUM macro so that the enum value can be encoded in the ftrace
@@ -94,7 +98,8 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS);
{ XFS_SCRUB_OFLAG_XCORRUPT, "xcorrupt" }, \
{ XFS_SCRUB_OFLAG_INCOMPLETE, "incomplete" }, \
{ XFS_SCRUB_OFLAG_WARNING, "warning" }, \
- { XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED, "norepair" }
+ { XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED, "norepair" }, \
+ { XFS_SCRUB_IFLAG_FORCE_REBUILD, "rebuild" }
#define XFS_SCRUB_STATE_STRINGS \
{ XCHK_TRY_HARDER, "try_harder" }, \
@@ -636,6 +641,28 @@ TRACE_EVENT(xchk_iallocbt_check_cluster,
__entry->cluster_ino)
)
+TRACE_EVENT(xchk_inode_is_allocated,
+ TP_PROTO(struct xfs_inode *ip),
+ TP_ARGS(ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(unsigned long, iflags)
+ __field(umode_t, mode)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->iflags = ip->i_flags;
+ __entry->mode = VFS_I(ip)->i_mode;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx iflags 0x%lx mode 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->iflags,
+ __entry->mode)
+);
+
TRACE_EVENT(xchk_fscounters_calc,
TP_PROTO(struct xfs_mount *mp, uint64_t icount, uint64_t ifree,
uint64_t fdblocks, uint64_t delalloc),
@@ -751,13 +778,302 @@ TRACE_EVENT(xchk_refcount_incorrect,
__entry->seen)
)
+TRACE_EVENT(xfile_create,
+ TP_PROTO(struct xfile *xf),
+ TP_ARGS(xf),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned long, ino)
+ __array(char, pathname, 256)
+ ),
+ TP_fast_assign(
+ char pathname[257];
+ char *path;
+
+ __entry->ino = file_inode(xf->file)->i_ino;
+ memset(pathname, 0, sizeof(pathname));
+ path = file_path(xf->file, pathname, sizeof(pathname) - 1);
+ if (IS_ERR(path))
+ path = "(unknown)";
+ strncpy(__entry->pathname, path, sizeof(__entry->pathname));
+ ),
+ TP_printk("xfino 0x%lx path '%s'",
+ __entry->ino,
+ __entry->pathname)
+);
+
+TRACE_EVENT(xfile_destroy,
+ TP_PROTO(struct xfile *xf),
+ TP_ARGS(xf),
+ TP_STRUCT__entry(
+ __field(unsigned long, ino)
+ __field(unsigned long long, bytes)
+ __field(loff_t, size)
+ ),
+ TP_fast_assign(
+ struct xfile_stat statbuf;
+ int ret;
+
+ ret = xfile_stat(xf, &statbuf);
+ if (!ret) {
+ __entry->bytes = statbuf.bytes;
+ __entry->size = statbuf.size;
+ } else {
+ __entry->bytes = -1;
+ __entry->size = -1;
+ }
+ __entry->ino = file_inode(xf->file)->i_ino;
+ ),
+ TP_printk("xfino 0x%lx mem_bytes 0x%llx isize 0x%llx",
+ __entry->ino,
+ __entry->bytes,
+ __entry->size)
+);
+
+DECLARE_EVENT_CLASS(xfile_class,
+ TP_PROTO(struct xfile *xf, loff_t pos, unsigned long long bytecount),
+ TP_ARGS(xf, pos, bytecount),
+ TP_STRUCT__entry(
+ __field(unsigned long, ino)
+ __field(unsigned long long, bytes_used)
+ __field(loff_t, pos)
+ __field(loff_t, size)
+ __field(unsigned long long, bytecount)
+ ),
+ TP_fast_assign(
+ struct xfile_stat statbuf;
+ int ret;
+
+ ret = xfile_stat(xf, &statbuf);
+ if (!ret) {
+ __entry->bytes_used = statbuf.bytes;
+ __entry->size = statbuf.size;
+ } else {
+ __entry->bytes_used = -1;
+ __entry->size = -1;
+ }
+ __entry->ino = file_inode(xf->file)->i_ino;
+ __entry->pos = pos;
+ __entry->bytecount = bytecount;
+ ),
+ TP_printk("xfino 0x%lx mem_bytes 0x%llx pos 0x%llx bytecount 0x%llx isize 0x%llx",
+ __entry->ino,
+ __entry->bytes_used,
+ __entry->pos,
+ __entry->bytecount,
+ __entry->size)
+);
+#define DEFINE_XFILE_EVENT(name) \
+DEFINE_EVENT(xfile_class, name, \
+ TP_PROTO(struct xfile *xf, loff_t pos, unsigned long long bytecount), \
+ TP_ARGS(xf, pos, bytecount))
+DEFINE_XFILE_EVENT(xfile_pread);
+DEFINE_XFILE_EVENT(xfile_pwrite);
+DEFINE_XFILE_EVENT(xfile_seek_data);
+DEFINE_XFILE_EVENT(xfile_get_page);
+DEFINE_XFILE_EVENT(xfile_put_page);
+
+TRACE_EVENT(xfarray_create,
+ TP_PROTO(struct xfarray *xfa, unsigned long long required_capacity),
+ TP_ARGS(xfa, required_capacity),
+ TP_STRUCT__entry(
+ __field(unsigned long, ino)
+ __field(uint64_t, max_nr)
+ __field(size_t, obj_size)
+ __field(int, obj_size_log)
+ __field(unsigned long long, required_capacity)
+ ),
+ TP_fast_assign(
+ __entry->max_nr = xfa->max_nr;
+ __entry->obj_size = xfa->obj_size;
+ __entry->obj_size_log = xfa->obj_size_log;
+ __entry->ino = file_inode(xfa->xfile->file)->i_ino;
+ __entry->required_capacity = required_capacity;
+ ),
+ TP_printk("xfino 0x%lx max_nr %llu reqd_nr %llu objsz %zu objszlog %d",
+ __entry->ino,
+ __entry->max_nr,
+ __entry->required_capacity,
+ __entry->obj_size,
+ __entry->obj_size_log)
+);
+
+TRACE_EVENT(xfarray_isort,
+ TP_PROTO(struct xfarray_sortinfo *si, uint64_t lo, uint64_t hi),
+ TP_ARGS(si, lo, hi),
+ TP_STRUCT__entry(
+ __field(unsigned long, ino)
+ __field(unsigned long long, lo)
+ __field(unsigned long long, hi)
+ ),
+ TP_fast_assign(
+ __entry->ino = file_inode(si->array->xfile->file)->i_ino;
+ __entry->lo = lo;
+ __entry->hi = hi;
+ ),
+ TP_printk("xfino 0x%lx lo %llu hi %llu elts %llu",
+ __entry->ino,
+ __entry->lo,
+ __entry->hi,
+ __entry->hi - __entry->lo)
+);
+
+TRACE_EVENT(xfarray_pagesort,
+ TP_PROTO(struct xfarray_sortinfo *si, uint64_t lo, uint64_t hi),
+ TP_ARGS(si, lo, hi),
+ TP_STRUCT__entry(
+ __field(unsigned long, ino)
+ __field(unsigned long long, lo)
+ __field(unsigned long long, hi)
+ ),
+ TP_fast_assign(
+ __entry->ino = file_inode(si->array->xfile->file)->i_ino;
+ __entry->lo = lo;
+ __entry->hi = hi;
+ ),
+ TP_printk("xfino 0x%lx lo %llu hi %llu elts %llu",
+ __entry->ino,
+ __entry->lo,
+ __entry->hi,
+ __entry->hi - __entry->lo)
+);
+
+TRACE_EVENT(xfarray_qsort,
+ TP_PROTO(struct xfarray_sortinfo *si, uint64_t lo, uint64_t hi),
+ TP_ARGS(si, lo, hi),
+ TP_STRUCT__entry(
+ __field(unsigned long, ino)
+ __field(unsigned long long, lo)
+ __field(unsigned long long, hi)
+ __field(int, stack_depth)
+ __field(int, max_stack_depth)
+ ),
+ TP_fast_assign(
+ __entry->ino = file_inode(si->array->xfile->file)->i_ino;
+ __entry->lo = lo;
+ __entry->hi = hi;
+ __entry->stack_depth = si->stack_depth;
+ __entry->max_stack_depth = si->max_stack_depth;
+ ),
+ TP_printk("xfino 0x%lx lo %llu hi %llu elts %llu stack %d/%d",
+ __entry->ino,
+ __entry->lo,
+ __entry->hi,
+ __entry->hi - __entry->lo,
+ __entry->stack_depth,
+ __entry->max_stack_depth)
+);
+
+TRACE_EVENT(xfarray_sort,
+ TP_PROTO(struct xfarray_sortinfo *si, size_t bytes),
+ TP_ARGS(si, bytes),
+ TP_STRUCT__entry(
+ __field(unsigned long, ino)
+ __field(unsigned long long, nr)
+ __field(size_t, obj_size)
+ __field(size_t, bytes)
+ __field(unsigned int, max_stack_depth)
+ ),
+ TP_fast_assign(
+ __entry->nr = si->array->nr;
+ __entry->obj_size = si->array->obj_size;
+ __entry->ino = file_inode(si->array->xfile->file)->i_ino;
+ __entry->bytes = bytes;
+ __entry->max_stack_depth = si->max_stack_depth;
+ ),
+ TP_printk("xfino 0x%lx nr %llu objsz %zu stack %u bytes %zu",
+ __entry->ino,
+ __entry->nr,
+ __entry->obj_size,
+ __entry->max_stack_depth,
+ __entry->bytes)
+);
+
+TRACE_EVENT(xfarray_sort_stats,
+ TP_PROTO(struct xfarray_sortinfo *si, int error),
+ TP_ARGS(si, error),
+ TP_STRUCT__entry(
+ __field(unsigned long, ino)
+#ifdef DEBUG
+ __field(unsigned long long, loads)
+ __field(unsigned long long, stores)
+ __field(unsigned long long, compares)
+ __field(unsigned long long, heapsorts)
+#endif
+ __field(unsigned int, max_stack_depth)
+ __field(unsigned int, max_stack_used)
+ __field(int, error)
+ ),
+ TP_fast_assign(
+ __entry->ino = file_inode(si->array->xfile->file)->i_ino;
+#ifdef DEBUG
+ __entry->loads = si->loads;
+ __entry->stores = si->stores;
+ __entry->compares = si->compares;
+ __entry->heapsorts = si->heapsorts;
+#endif
+ __entry->max_stack_depth = si->max_stack_depth;
+ __entry->max_stack_used = si->max_stack_used;
+ __entry->error = error;
+ ),
+ TP_printk(
+#ifdef DEBUG
+ "xfino 0x%lx loads %llu stores %llu compares %llu heapsorts %llu stack_depth %u/%u error %d",
+#else
+ "xfino 0x%lx stack_depth %u/%u error %d",
+#endif
+ __entry->ino,
+#ifdef DEBUG
+ __entry->loads,
+ __entry->stores,
+ __entry->compares,
+ __entry->heapsorts,
+#endif
+ __entry->max_stack_used,
+ __entry->max_stack_depth,
+ __entry->error)
+);
+
+#ifdef CONFIG_XFS_RT
+TRACE_EVENT(xchk_rtsum_record_free,
+ TP_PROTO(struct xfs_mount *mp, xfs_rtblock_t start,
+ uint64_t len, unsigned int log, loff_t pos, xfs_suminfo_t v),
+ TP_ARGS(mp, start, len, log, pos, v),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(dev_t, rtdev)
+ __field(xfs_rtblock_t, start)
+ __field(unsigned long long, len)
+ __field(unsigned int, log)
+ __field(loff_t, pos)
+ __field(xfs_suminfo_t, v)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->rtdev = mp->m_rtdev_targp->bt_dev;
+ __entry->start = start;
+ __entry->len = len;
+ __entry->log = log;
+ __entry->pos = pos;
+ __entry->v = v;
+ ),
+ TP_printk("dev %d:%d rtdev %d:%d rtx 0x%llx rtxcount 0x%llx log %u rsumpos 0x%llx sumcount %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ MAJOR(__entry->rtdev), MINOR(__entry->rtdev),
+ __entry->start,
+ __entry->len,
+ __entry->log,
+ __entry->pos,
+ __entry->v)
+);
+#endif /* CONFIG_XFS_RT */
+
/* repair tracepoints */
#if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR)
DECLARE_EVENT_CLASS(xrep_extent_class,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
- xfs_agblock_t agbno, xfs_extlen_t len),
- TP_ARGS(mp, agno, agbno, len),
+ TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len),
+ TP_ARGS(pag, agbno, len),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
@@ -765,8 +1081,8 @@ DECLARE_EVENT_CLASS(xrep_extent_class,
__field(xfs_extlen_t, len)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
+ __entry->dev = pag->pag_mount->m_super->s_dev;
+ __entry->agno = pag->pag_agno;
__entry->agbno = agbno;
__entry->len = len;
),
@@ -778,12 +1094,45 @@ DECLARE_EVENT_CLASS(xrep_extent_class,
);
#define DEFINE_REPAIR_EXTENT_EVENT(name) \
DEFINE_EVENT(xrep_extent_class, name, \
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
- xfs_agblock_t agbno, xfs_extlen_t len), \
- TP_ARGS(mp, agno, agbno, len))
-DEFINE_REPAIR_EXTENT_EVENT(xrep_dispose_btree_extent);
+ TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len), \
+ TP_ARGS(pag, agbno, len))
+DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_unmap_extent);
+DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_free_extent);
+DEFINE_REPAIR_EXTENT_EVENT(xreap_agextent_binval);
DEFINE_REPAIR_EXTENT_EVENT(xrep_agfl_insert);
+DECLARE_EVENT_CLASS(xrep_reap_find_class,
+ TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len,
+ bool crosslinked),
+ TP_ARGS(pag, agbno, len, crosslinked),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
+ __field(bool, crosslinked)
+ ),
+ TP_fast_assign(
+ __entry->dev = pag->pag_mount->m_super->s_dev;
+ __entry->agno = pag->pag_agno;
+ __entry->agbno = agbno;
+ __entry->len = len;
+ __entry->crosslinked = crosslinked;
+ ),
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x crosslinked %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->len,
+ __entry->crosslinked ? 1 : 0)
+);
+#define DEFINE_REPAIR_REAP_FIND_EVENT(name) \
+DEFINE_EVENT(xrep_reap_find_class, name, \
+ TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len, \
+ bool crosslinked), \
+ TP_ARGS(pag, agbno, len, crosslinked))
+DEFINE_REPAIR_REAP_FIND_EVENT(xreap_agextent_select);
+
DECLARE_EVENT_CLASS(xrep_rmap_class,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
xfs_agblock_t agbno, xfs_extlen_t len,
@@ -853,28 +1202,6 @@ TRACE_EVENT(xrep_refcount_extent_fn,
__entry->refcount)
)
-TRACE_EVENT(xrep_init_btblock,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
- xfs_btnum_t btnum),
- TP_ARGS(mp, agno, agbno, btnum),
- TP_STRUCT__entry(
- __field(dev_t, dev)
- __field(xfs_agnumber_t, agno)
- __field(xfs_agblock_t, agbno)
- __field(uint32_t, btnum)
- ),
- TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
- __entry->agbno = agbno;
- __entry->btnum = btnum;
- ),
- TP_printk("dev %d:%d agno 0x%x agbno 0x%x btree %s",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->agno,
- __entry->agbno,
- __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS))
-)
TRACE_EVENT(xrep_findroot_block,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
uint32_t magic, uint16_t level),
diff --git a/fs/xfs/scrub/xfarray.c b/fs/xfs/scrub/xfarray.c
new file mode 100644
index 000000000000..f0f532c10a5a
--- /dev/null
+++ b/fs/xfs/scrub/xfarray.c
@@ -0,0 +1,1083 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2021-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/scrub.h"
+#include "scrub/trace.h"
+
+/*
+ * Large Arrays of Fixed-Size Records
+ * ==================================
+ *
+ * This memory array uses an xfile (which itself is a memfd "file") to store
+ * large numbers of fixed-size records in memory that can be paged out. This
+ * puts less stress on the memory reclaim algorithms during an online repair
+ * because we don't have to pin so much memory. However, array access is less
+ * direct than would be in a regular memory array. Access to the array is
+ * performed via indexed load and store methods, and an append method is
+ * provided for convenience. Array elements can be unset, which sets them to
+ * all zeroes. Unset entries are skipped during iteration, though direct loads
+ * will return a zeroed buffer. Callers are responsible for concurrency
+ * control.
+ */
+
+/*
+ * Pointer to scratch space. Because we can't access the xfile data directly,
+ * we allocate a small amount of memory on the end of the xfarray structure to
+ * buffer array items when we need space to store values temporarily.
+ */
+static inline void *xfarray_scratch(struct xfarray *array)
+{
+ return (array + 1);
+}
+
+/* Compute array index given an xfile offset. */
+static xfarray_idx_t
+xfarray_idx(
+ struct xfarray *array,
+ loff_t pos)
+{
+ if (array->obj_size_log >= 0)
+ return (xfarray_idx_t)pos >> array->obj_size_log;
+
+ return div_u64((xfarray_idx_t)pos, array->obj_size);
+}
+
+/* Compute xfile offset of array element. */
+static inline loff_t xfarray_pos(struct xfarray *array, xfarray_idx_t idx)
+{
+ if (array->obj_size_log >= 0)
+ return idx << array->obj_size_log;
+
+ return idx * array->obj_size;
+}
+
+/*
+ * Initialize a big memory array. Array records cannot be larger than a
+ * page, and the array cannot span more bytes than the page cache supports.
+ * If @required_capacity is nonzero, the maximum array size will be set to this
+ * quantity and the array creation will fail if the underlying storage cannot
+ * support that many records.
+ */
+int
+xfarray_create(
+ const char *description,
+ unsigned long long required_capacity,
+ size_t obj_size,
+ struct xfarray **arrayp)
+{
+ struct xfarray *array;
+ struct xfile *xfile;
+ int error;
+
+ ASSERT(obj_size < PAGE_SIZE);
+
+ error = xfile_create(description, 0, &xfile);
+ if (error)
+ return error;
+
+ error = -ENOMEM;
+ array = kzalloc(sizeof(struct xfarray) + obj_size, XCHK_GFP_FLAGS);
+ if (!array)
+ goto out_xfile;
+
+ array->xfile = xfile;
+ array->obj_size = obj_size;
+
+ if (is_power_of_2(obj_size))
+ array->obj_size_log = ilog2(obj_size);
+ else
+ array->obj_size_log = -1;
+
+ array->max_nr = xfarray_idx(array, MAX_LFS_FILESIZE);
+ trace_xfarray_create(array, required_capacity);
+
+ if (required_capacity > 0) {
+ if (array->max_nr < required_capacity) {
+ error = -ENOMEM;
+ goto out_xfarray;
+ }
+ array->max_nr = required_capacity;
+ }
+
+ *arrayp = array;
+ return 0;
+
+out_xfarray:
+ kfree(array);
+out_xfile:
+ xfile_destroy(xfile);
+ return error;
+}
+
+/* Destroy the array. */
+void
+xfarray_destroy(
+ struct xfarray *array)
+{
+ xfile_destroy(array->xfile);
+ kfree(array);
+}
+
+/* Load an element from the array. */
+int
+xfarray_load(
+ struct xfarray *array,
+ xfarray_idx_t idx,
+ void *ptr)
+{
+ if (idx >= array->nr)
+ return -ENODATA;
+
+ return xfile_obj_load(array->xfile, ptr, array->obj_size,
+ xfarray_pos(array, idx));
+}
+
+/* Is this array element potentially unset? */
+static inline bool
+xfarray_is_unset(
+ struct xfarray *array,
+ loff_t pos)
+{
+ void *temp = xfarray_scratch(array);
+ int error;
+
+ if (array->unset_slots == 0)
+ return false;
+
+ error = xfile_obj_load(array->xfile, temp, array->obj_size, pos);
+ if (!error && xfarray_element_is_null(array, temp))
+ return true;
+
+ return false;
+}
+
+/*
+ * Unset an array element. If @idx is the last element in the array, the
+ * array will be truncated. Otherwise, the entry will be zeroed.
+ */
+int
+xfarray_unset(
+ struct xfarray *array,
+ xfarray_idx_t idx)
+{
+ void *temp = xfarray_scratch(array);
+ loff_t pos = xfarray_pos(array, idx);
+ int error;
+
+ if (idx >= array->nr)
+ return -ENODATA;
+
+ if (idx == array->nr - 1) {
+ array->nr--;
+ return 0;
+ }
+
+ if (xfarray_is_unset(array, pos))
+ return 0;
+
+ memset(temp, 0, array->obj_size);
+ error = xfile_obj_store(array->xfile, temp, array->obj_size, pos);
+ if (error)
+ return error;
+
+ array->unset_slots++;
+ return 0;
+}
+
+/*
+ * Store an element in the array. The element must not be completely zeroed,
+ * because those are considered unset sparse elements.
+ */
+int
+xfarray_store(
+ struct xfarray *array,
+ xfarray_idx_t idx,
+ const void *ptr)
+{
+ int ret;
+
+ if (idx >= array->max_nr)
+ return -EFBIG;
+
+ ASSERT(!xfarray_element_is_null(array, ptr));
+
+ ret = xfile_obj_store(array->xfile, ptr, array->obj_size,
+ xfarray_pos(array, idx));
+ if (ret)
+ return ret;
+
+ array->nr = max(array->nr, idx + 1);
+ return 0;
+}
+
+/* Is this array element NULL? */
+bool
+xfarray_element_is_null(
+ struct xfarray *array,
+ const void *ptr)
+{
+ return !memchr_inv(ptr, 0, array->obj_size);
+}
+
+/*
+ * Store an element anywhere in the array that is unset. If there are no
+ * unset slots, append the element to the array.
+ */
+int
+xfarray_store_anywhere(
+ struct xfarray *array,
+ const void *ptr)
+{
+ void *temp = xfarray_scratch(array);
+ loff_t endpos = xfarray_pos(array, array->nr);
+ loff_t pos;
+ int error;
+
+ /* Find an unset slot to put it in. */
+ for (pos = 0;
+ pos < endpos && array->unset_slots > 0;
+ pos += array->obj_size) {
+ error = xfile_obj_load(array->xfile, temp, array->obj_size,
+ pos);
+ if (error || !xfarray_element_is_null(array, temp))
+ continue;
+
+ error = xfile_obj_store(array->xfile, ptr, array->obj_size,
+ pos);
+ if (error)
+ return error;
+
+ array->unset_slots--;
+ return 0;
+ }
+
+ /* No unset slots found; attach it on the end. */
+ array->unset_slots = 0;
+ return xfarray_append(array, ptr);
+}
+
+/* Return length of array. */
+uint64_t
+xfarray_length(
+ struct xfarray *array)
+{
+ return array->nr;
+}
+
+/*
+ * Decide which array item we're going to read as part of an _iter_get.
+ * @cur is the array index, and @pos is the file offset of that array index in
+ * the backing xfile. Returns ENODATA if we reach the end of the records.
+ *
+ * Reading from a hole in a sparse xfile causes page instantiation, so for
+ * iterating a (possibly sparse) array we need to figure out if the cursor is
+ * pointing at a totally uninitialized hole and move the cursor up if
+ * necessary.
+ */
+static inline int
+xfarray_find_data(
+ struct xfarray *array,
+ xfarray_idx_t *cur,
+ loff_t *pos)
+{
+ unsigned int pgoff = offset_in_page(*pos);
+ loff_t end_pos = *pos + array->obj_size - 1;
+ loff_t new_pos;
+
+ /*
+ * If the current array record is not adjacent to a page boundary, we
+ * are in the middle of the page. We do not need to move the cursor.
+ */
+ if (pgoff != 0 && pgoff + array->obj_size - 1 < PAGE_SIZE)
+ return 0;
+
+ /*
+ * Call SEEK_DATA on the last byte in the record we're about to read.
+ * If the record ends at (or crosses) the end of a page then we know
+ * that the first byte of the record is backed by pages and don't need
+ * to query it. If instead the record begins at the start of the page
+ * then we know that querying the last byte is just as good as querying
+ * the first byte, since records cannot be larger than a page.
+ *
+ * If the call returns the same file offset, we know this record is
+ * backed by real pages. We do not need to move the cursor.
+ */
+ new_pos = xfile_seek_data(array->xfile, end_pos);
+ if (new_pos == -ENXIO)
+ return -ENODATA;
+ if (new_pos < 0)
+ return new_pos;
+ if (new_pos == end_pos)
+ return 0;
+
+ /*
+ * Otherwise, SEEK_DATA told us how far up to move the file pointer to
+ * find more data. Move the array index to the first record past the
+ * byte offset we were given.
+ */
+ new_pos = roundup_64(new_pos, array->obj_size);
+ *cur = xfarray_idx(array, new_pos);
+ *pos = xfarray_pos(array, *cur);
+ return 0;
+}
+
+/*
+ * Starting at *idx, fetch the next non-null array entry and advance the index
+ * to set up the next _load_next call. Returns ENODATA if we reach the end of
+ * the array. Callers must set @*idx to XFARRAY_CURSOR_INIT before the first
+ * call to this function.
+ */
+int
+xfarray_load_next(
+ struct xfarray *array,
+ xfarray_idx_t *idx,
+ void *rec)
+{
+ xfarray_idx_t cur = *idx;
+ loff_t pos = xfarray_pos(array, cur);
+ int error;
+
+ do {
+ if (cur >= array->nr)
+ return -ENODATA;
+
+ /*
+ * Ask the backing store for the location of next possible
+ * written record, then retrieve that record.
+ */
+ error = xfarray_find_data(array, &cur, &pos);
+ if (error)
+ return error;
+ error = xfarray_load(array, cur, rec);
+ if (error)
+ return error;
+
+ cur++;
+ pos += array->obj_size;
+ } while (xfarray_element_is_null(array, rec));
+
+ *idx = cur;
+ return 0;
+}
+
+/* Sorting functions */
+
+#ifdef DEBUG
+# define xfarray_sort_bump_loads(si) do { (si)->loads++; } while (0)
+# define xfarray_sort_bump_stores(si) do { (si)->stores++; } while (0)
+# define xfarray_sort_bump_compares(si) do { (si)->compares++; } while (0)
+# define xfarray_sort_bump_heapsorts(si) do { (si)->heapsorts++; } while (0)
+#else
+# define xfarray_sort_bump_loads(si)
+# define xfarray_sort_bump_stores(si)
+# define xfarray_sort_bump_compares(si)
+# define xfarray_sort_bump_heapsorts(si)
+#endif /* DEBUG */
+
+/* Load an array element for sorting. */
+static inline int
+xfarray_sort_load(
+ struct xfarray_sortinfo *si,
+ xfarray_idx_t idx,
+ void *ptr)
+{
+ xfarray_sort_bump_loads(si);
+ return xfarray_load(si->array, idx, ptr);
+}
+
+/* Store an array element for sorting. */
+static inline int
+xfarray_sort_store(
+ struct xfarray_sortinfo *si,
+ xfarray_idx_t idx,
+ void *ptr)
+{
+ xfarray_sort_bump_stores(si);
+ return xfarray_store(si->array, idx, ptr);
+}
+
+/* Compare an array element for sorting. */
+static inline int
+xfarray_sort_cmp(
+ struct xfarray_sortinfo *si,
+ const void *a,
+ const void *b)
+{
+ xfarray_sort_bump_compares(si);
+ return si->cmp_fn(a, b);
+}
+
+/* Return a pointer to the low index stack for quicksort partitioning. */
+static inline xfarray_idx_t *xfarray_sortinfo_lo(struct xfarray_sortinfo *si)
+{
+ return (xfarray_idx_t *)(si + 1);
+}
+
+/* Return a pointer to the high index stack for quicksort partitioning. */
+static inline xfarray_idx_t *xfarray_sortinfo_hi(struct xfarray_sortinfo *si)
+{
+ return xfarray_sortinfo_lo(si) + si->max_stack_depth;
+}
+
+/* Size of each element in the quicksort pivot array. */
+static inline size_t
+xfarray_pivot_rec_sz(
+ struct xfarray *array)
+{
+ return round_up(array->obj_size, 8) + sizeof(xfarray_idx_t);
+}
+
+/* Allocate memory to handle the sort. */
+static inline int
+xfarray_sortinfo_alloc(
+ struct xfarray *array,
+ xfarray_cmp_fn cmp_fn,
+ unsigned int flags,
+ struct xfarray_sortinfo **infop)
+{
+ struct xfarray_sortinfo *si;
+ size_t nr_bytes = sizeof(struct xfarray_sortinfo);
+ size_t pivot_rec_sz = xfarray_pivot_rec_sz(array);
+ int max_stack_depth;
+
+ /*
+ * The median-of-nine pivot algorithm doesn't work if a subset has
+ * fewer than 9 items. Make sure the in-memory sort will always take
+ * over for subsets where this wouldn't be the case.
+ */
+ BUILD_BUG_ON(XFARRAY_QSORT_PIVOT_NR >= XFARRAY_ISORT_NR);
+
+ /*
+ * Tail-call recursion during the partitioning phase means that
+ * quicksort will never recurse more than log2(nr) times. We need one
+ * extra level of stack to hold the initial parameters. In-memory
+ * sort will always take care of the last few levels of recursion for
+ * us, so we can reduce the stack depth by that much.
+ */
+ max_stack_depth = ilog2(array->nr) + 1 - (XFARRAY_ISORT_SHIFT - 1);
+ if (max_stack_depth < 1)
+ max_stack_depth = 1;
+
+ /* Each level of quicksort uses a lo and a hi index */
+ nr_bytes += max_stack_depth * sizeof(xfarray_idx_t) * 2;
+
+ /* Scratchpad for in-memory sort, or finding the pivot */
+ nr_bytes += max_t(size_t,
+ (XFARRAY_QSORT_PIVOT_NR + 1) * pivot_rec_sz,
+ XFARRAY_ISORT_NR * array->obj_size);
+
+ si = kvzalloc(nr_bytes, XCHK_GFP_FLAGS);
+ if (!si)
+ return -ENOMEM;
+
+ si->array = array;
+ si->cmp_fn = cmp_fn;
+ si->flags = flags;
+ si->max_stack_depth = max_stack_depth;
+ si->max_stack_used = 1;
+
+ xfarray_sortinfo_lo(si)[0] = 0;
+ xfarray_sortinfo_hi(si)[0] = array->nr - 1;
+
+ trace_xfarray_sort(si, nr_bytes);
+ *infop = si;
+ return 0;
+}
+
+/* Should this sort be terminated by a fatal signal? */
+static inline bool
+xfarray_sort_terminated(
+ struct xfarray_sortinfo *si,
+ int *error)
+{
+ /*
+ * If preemption is disabled, we need to yield to the scheduler every
+ * few seconds so that we don't run afoul of the soft lockup watchdog
+ * or RCU stall detector.
+ */
+ cond_resched();
+
+ if ((si->flags & XFARRAY_SORT_KILLABLE) &&
+ fatal_signal_pending(current)) {
+ if (*error == 0)
+ *error = -EINTR;
+ return true;
+ }
+ return false;
+}
+
+/* Do we want an in-memory sort? */
+static inline bool
+xfarray_want_isort(
+ struct xfarray_sortinfo *si,
+ xfarray_idx_t start,
+ xfarray_idx_t end)
+{
+ /*
+ * For array subsets that fit in the scratchpad, it's much faster to
+ * use the kernel's heapsort than quicksort's stack machine.
+ */
+ return (end - start) < XFARRAY_ISORT_NR;
+}
+
+/* Return the scratch space within the sortinfo structure. */
+static inline void *xfarray_sortinfo_isort_scratch(struct xfarray_sortinfo *si)
+{
+ return xfarray_sortinfo_hi(si) + si->max_stack_depth;
+}
+
+/*
+ * Sort a small number of array records using scratchpad memory. The records
+ * need not be contiguous in the xfile's memory pages.
+ */
+STATIC int
+xfarray_isort(
+ struct xfarray_sortinfo *si,
+ xfarray_idx_t lo,
+ xfarray_idx_t hi)
+{
+ void *scratch = xfarray_sortinfo_isort_scratch(si);
+ loff_t lo_pos = xfarray_pos(si->array, lo);
+ loff_t len = xfarray_pos(si->array, hi - lo + 1);
+ int error;
+
+ trace_xfarray_isort(si, lo, hi);
+
+ xfarray_sort_bump_loads(si);
+ error = xfile_obj_load(si->array->xfile, scratch, len, lo_pos);
+ if (error)
+ return error;
+
+ xfarray_sort_bump_heapsorts(si);
+ sort(scratch, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL);
+
+ xfarray_sort_bump_stores(si);
+ return xfile_obj_store(si->array->xfile, scratch, len, lo_pos);
+}
+
+/* Grab a page for sorting records. */
+static inline int
+xfarray_sort_get_page(
+ struct xfarray_sortinfo *si,
+ loff_t pos,
+ uint64_t len)
+{
+ int error;
+
+ error = xfile_get_page(si->array->xfile, pos, len, &si->xfpage);
+ if (error)
+ return error;
+
+ /*
+ * xfile pages must never be mapped into userspace, so we skip the
+ * dcache flush when mapping the page.
+ */
+ si->page_kaddr = kmap_local_page(si->xfpage.page);
+ return 0;
+}
+
+/* Release a page we grabbed for sorting records. */
+static inline int
+xfarray_sort_put_page(
+ struct xfarray_sortinfo *si)
+{
+ if (!si->page_kaddr)
+ return 0;
+
+ kunmap_local(si->page_kaddr);
+ si->page_kaddr = NULL;
+
+ return xfile_put_page(si->array->xfile, &si->xfpage);
+}
+
+/* Decide if these records are eligible for in-page sorting. */
+static inline bool
+xfarray_want_pagesort(
+ struct xfarray_sortinfo *si,
+ xfarray_idx_t lo,
+ xfarray_idx_t hi)
+{
+ pgoff_t lo_page;
+ pgoff_t hi_page;
+ loff_t end_pos;
+
+ /* We can only map one page at a time. */
+ lo_page = xfarray_pos(si->array, lo) >> PAGE_SHIFT;
+ end_pos = xfarray_pos(si->array, hi) + si->array->obj_size - 1;
+ hi_page = end_pos >> PAGE_SHIFT;
+
+ return lo_page == hi_page;
+}
+
+/* Sort a bunch of records that all live in the same memory page. */
+STATIC int
+xfarray_pagesort(
+ struct xfarray_sortinfo *si,
+ xfarray_idx_t lo,
+ xfarray_idx_t hi)
+{
+ void *startp;
+ loff_t lo_pos = xfarray_pos(si->array, lo);
+ uint64_t len = xfarray_pos(si->array, hi - lo);
+ int error = 0;
+
+ trace_xfarray_pagesort(si, lo, hi);
+
+ xfarray_sort_bump_loads(si);
+ error = xfarray_sort_get_page(si, lo_pos, len);
+ if (error)
+ return error;
+
+ xfarray_sort_bump_heapsorts(si);
+ startp = si->page_kaddr + offset_in_page(lo_pos);
+ sort(startp, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL);
+
+ xfarray_sort_bump_stores(si);
+ return xfarray_sort_put_page(si);
+}
+
+/* Return a pointer to the xfarray pivot record within the sortinfo struct. */
+static inline void *xfarray_sortinfo_pivot(struct xfarray_sortinfo *si)
+{
+ return xfarray_sortinfo_hi(si) + si->max_stack_depth;
+}
+
+/* Return a pointer to the start of the pivot array. */
+static inline void *
+xfarray_sortinfo_pivot_array(
+ struct xfarray_sortinfo *si)
+{
+ return xfarray_sortinfo_pivot(si) + si->array->obj_size;
+}
+
+/* The xfarray record is stored at the start of each pivot array element. */
+static inline void *
+xfarray_pivot_array_rec(
+ void *pa,
+ size_t pa_recsz,
+ unsigned int pa_idx)
+{
+ return pa + (pa_recsz * pa_idx);
+}
+
+/* The xfarray index is stored at the end of each pivot array element. */
+static inline xfarray_idx_t *
+xfarray_pivot_array_idx(
+ void *pa,
+ size_t pa_recsz,
+ unsigned int pa_idx)
+{
+ return xfarray_pivot_array_rec(pa, pa_recsz, pa_idx + 1) -
+ sizeof(xfarray_idx_t);
+}
+
+/*
+ * Find a pivot value for quicksort partitioning, swap it with a[lo], and save
+ * the cached pivot record for the next step.
+ *
+ * Load evenly-spaced records within the given range into memory, sort them,
+ * and choose the pivot from the median record. Using multiple points will
+ * improve the quality of the pivot selection, and hopefully avoid the worst
+ * quicksort behavior, since our array values are nearly always evenly sorted.
+ */
+STATIC int
+xfarray_qsort_pivot(
+ struct xfarray_sortinfo *si,
+ xfarray_idx_t lo,
+ xfarray_idx_t hi)
+{
+ void *pivot = xfarray_sortinfo_pivot(si);
+ void *parray = xfarray_sortinfo_pivot_array(si);
+ void *recp;
+ xfarray_idx_t *idxp;
+ xfarray_idx_t step = (hi - lo) / (XFARRAY_QSORT_PIVOT_NR - 1);
+ size_t pivot_rec_sz = xfarray_pivot_rec_sz(si->array);
+ int i, j;
+ int error;
+
+ ASSERT(step > 0);
+
+ /*
+ * Load the xfarray indexes of the records we intend to sample into the
+ * pivot array.
+ */
+ idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, 0);
+ *idxp = lo;
+ for (i = 1; i < XFARRAY_QSORT_PIVOT_NR - 1; i++) {
+ idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, i);
+ *idxp = lo + (i * step);
+ }
+ idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz,
+ XFARRAY_QSORT_PIVOT_NR - 1);
+ *idxp = hi;
+
+ /* Load the selected xfarray records into the pivot array. */
+ for (i = 0; i < XFARRAY_QSORT_PIVOT_NR; i++) {
+ xfarray_idx_t idx;
+
+ recp = xfarray_pivot_array_rec(parray, pivot_rec_sz, i);
+ idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, i);
+
+ /* No unset records; load directly into the array. */
+ if (likely(si->array->unset_slots == 0)) {
+ error = xfarray_sort_load(si, *idxp, recp);
+ if (error)
+ return error;
+ continue;
+ }
+
+ /*
+ * Load non-null records into the scratchpad without changing
+ * the xfarray_idx_t in the pivot array.
+ */
+ idx = *idxp;
+ xfarray_sort_bump_loads(si);
+ error = xfarray_load_next(si->array, &idx, recp);
+ if (error)
+ return error;
+ }
+
+ xfarray_sort_bump_heapsorts(si);
+ sort(parray, XFARRAY_QSORT_PIVOT_NR, pivot_rec_sz, si->cmp_fn, NULL);
+
+ /*
+ * We sorted the pivot array records (which includes the xfarray
+ * indices) in xfarray record order. The median element of the pivot
+ * array contains the xfarray record that we will use as the pivot.
+ * Copy that xfarray record to the designated space.
+ */
+ recp = xfarray_pivot_array_rec(parray, pivot_rec_sz,
+ XFARRAY_QSORT_PIVOT_NR / 2);
+ memcpy(pivot, recp, si->array->obj_size);
+
+ /* If the pivot record we chose was already in a[lo] then we're done. */
+ idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz,
+ XFARRAY_QSORT_PIVOT_NR / 2);
+ if (*idxp == lo)
+ return 0;
+
+ /*
+ * Find the cached copy of a[lo] in the pivot array so that we can swap
+ * a[lo] and a[pivot].
+ */
+ for (i = 0, j = -1; i < XFARRAY_QSORT_PIVOT_NR; i++) {
+ idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, i);
+ if (*idxp == lo)
+ j = i;
+ }
+ if (j < 0) {
+ ASSERT(j >= 0);
+ return -EFSCORRUPTED;
+ }
+
+ /* Swap a[lo] and a[pivot]. */
+ error = xfarray_sort_store(si, lo, pivot);
+ if (error)
+ return error;
+
+ recp = xfarray_pivot_array_rec(parray, pivot_rec_sz, j);
+ idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz,
+ XFARRAY_QSORT_PIVOT_NR / 2);
+ return xfarray_sort_store(si, *idxp, recp);
+}
+
+/*
+ * Set up the pointers for the next iteration. We push onto the stack all of
+ * the unsorted values between a[lo + 1] and a[end[i]], and we tweak the
+ * current stack frame to point to the unsorted values between a[beg[i]] and
+ * a[lo] so that those values will be sorted when we pop the stack.
+ */
+static inline int
+xfarray_qsort_push(
+ struct xfarray_sortinfo *si,
+ xfarray_idx_t *si_lo,
+ xfarray_idx_t *si_hi,
+ xfarray_idx_t lo,
+ xfarray_idx_t hi)
+{
+ /* Check for stack overflows */
+ if (si->stack_depth >= si->max_stack_depth - 1) {
+ ASSERT(si->stack_depth < si->max_stack_depth - 1);
+ return -EFSCORRUPTED;
+ }
+
+ si->max_stack_used = max_t(uint8_t, si->max_stack_used,
+ si->stack_depth + 2);
+
+ si_lo[si->stack_depth + 1] = lo + 1;
+ si_hi[si->stack_depth + 1] = si_hi[si->stack_depth];
+ si_hi[si->stack_depth++] = lo - 1;
+
+ /*
+ * Always start with the smaller of the two partitions to keep the
+ * amount of recursion in check.
+ */
+ if (si_hi[si->stack_depth] - si_lo[si->stack_depth] >
+ si_hi[si->stack_depth - 1] - si_lo[si->stack_depth - 1]) {
+ swap(si_lo[si->stack_depth], si_lo[si->stack_depth - 1]);
+ swap(si_hi[si->stack_depth], si_hi[si->stack_depth - 1]);
+ }
+
+ return 0;
+}
+
+/*
+ * Load an element from the array into the first scratchpad and cache the page,
+ * if possible.
+ */
+static inline int
+xfarray_sort_load_cached(
+ struct xfarray_sortinfo *si,
+ xfarray_idx_t idx,
+ void *ptr)
+{
+ loff_t idx_pos = xfarray_pos(si->array, idx);
+ pgoff_t startpage;
+ pgoff_t endpage;
+ int error = 0;
+
+ /*
+ * If this load would split a page, release the cached page, if any,
+ * and perform a traditional read.
+ */
+ startpage = idx_pos >> PAGE_SHIFT;
+ endpage = (idx_pos + si->array->obj_size - 1) >> PAGE_SHIFT;
+ if (startpage != endpage) {
+ error = xfarray_sort_put_page(si);
+ if (error)
+ return error;
+
+ if (xfarray_sort_terminated(si, &error))
+ return error;
+
+ return xfile_obj_load(si->array->xfile, ptr,
+ si->array->obj_size, idx_pos);
+ }
+
+ /* If the cached page is not the one we want, release it. */
+ if (xfile_page_cached(&si->xfpage) &&
+ xfile_page_index(&si->xfpage) != startpage) {
+ error = xfarray_sort_put_page(si);
+ if (error)
+ return error;
+ }
+
+ /*
+ * If we don't have a cached page (and we know the load is contained
+ * in a single page) then grab it.
+ */
+ if (!xfile_page_cached(&si->xfpage)) {
+ if (xfarray_sort_terminated(si, &error))
+ return error;
+
+ error = xfarray_sort_get_page(si, startpage << PAGE_SHIFT,
+ PAGE_SIZE);
+ if (error)
+ return error;
+ }
+
+ memcpy(ptr, si->page_kaddr + offset_in_page(idx_pos),
+ si->array->obj_size);
+ return 0;
+}
+
+/*
+ * Sort the array elements via quicksort. This implementation incorporates
+ * four optimizations discussed in Sedgewick:
+ *
+ * 1. Use an explicit stack of array indices to store the next array partition
+ * to sort. This helps us to avoid recursion in the call stack, which is
+ * particularly expensive in the kernel.
+ *
+ * 2. For arrays with records in arbitrary or user-controlled order, choose the
+ * pivot element using a median-of-nine decision tree. This reduces the
+ * probability of selecting a bad pivot value which causes worst case
+ * behavior (i.e. partition sizes of 1).
+ *
+ * 3. The smaller of the two sub-partitions is pushed onto the stack to start
+ * the next level of recursion, and the larger sub-partition replaces the
+ * current stack frame. This guarantees that we won't need more than
+ * log2(nr) stack space.
+ *
+ * 4. For small sets, load the records into the scratchpad and run heapsort on
+ * them because that is very fast. In the author's experience, this yields
+ * a ~10% reduction in runtime.
+ *
+ * If a small set is contained entirely within a single xfile memory page,
+ * map the page directly and run heap sort directly on the xfile page
+ * instead of using the load/store interface. This halves the runtime.
+ *
+ * 5. This optimization is specific to the implementation. When converging lo
+ * and hi after selecting a pivot, we will try to retain the xfile memory
+ * page between load calls, which reduces run time by 50%.
+ */
+
+/*
+ * Due to the use of signed indices, we can only support up to 2^63 records.
+ * Files can only grow to 2^63 bytes, so this is not much of a limitation.
+ */
+#define QSORT_MAX_RECS (1ULL << 63)
+
+int
+xfarray_sort(
+ struct xfarray *array,
+ xfarray_cmp_fn cmp_fn,
+ unsigned int flags)
+{
+ struct xfarray_sortinfo *si;
+ xfarray_idx_t *si_lo, *si_hi;
+ void *pivot;
+ void *scratch = xfarray_scratch(array);
+ xfarray_idx_t lo, hi;
+ int error = 0;
+
+ if (array->nr < 2)
+ return 0;
+ if (array->nr >= QSORT_MAX_RECS)
+ return -E2BIG;
+
+ error = xfarray_sortinfo_alloc(array, cmp_fn, flags, &si);
+ if (error)
+ return error;
+ si_lo = xfarray_sortinfo_lo(si);
+ si_hi = xfarray_sortinfo_hi(si);
+ pivot = xfarray_sortinfo_pivot(si);
+
+ while (si->stack_depth >= 0) {
+ lo = si_lo[si->stack_depth];
+ hi = si_hi[si->stack_depth];
+
+ trace_xfarray_qsort(si, lo, hi);
+
+ /* Nothing left in this partition to sort; pop stack. */
+ if (lo >= hi) {
+ si->stack_depth--;
+ continue;
+ }
+
+ /*
+ * If directly mapping the page and sorting can solve our
+ * problems, we're done.
+ */
+ if (xfarray_want_pagesort(si, lo, hi)) {
+ error = xfarray_pagesort(si, lo, hi);
+ if (error)
+ goto out_free;
+ si->stack_depth--;
+ continue;
+ }
+
+ /* If insertion sort can solve our problems, we're done. */
+ if (xfarray_want_isort(si, lo, hi)) {
+ error = xfarray_isort(si, lo, hi);
+ if (error)
+ goto out_free;
+ si->stack_depth--;
+ continue;
+ }
+
+ /* Pick a pivot, move it to a[lo] and stash it. */
+ error = xfarray_qsort_pivot(si, lo, hi);
+ if (error)
+ goto out_free;
+
+ /*
+ * Rearrange a[lo..hi] such that everything smaller than the
+ * pivot is on the left side of the range and everything larger
+ * than the pivot is on the right side of the range.
+ */
+ while (lo < hi) {
+ /*
+ * Decrement hi until it finds an a[hi] less than the
+ * pivot value.
+ */
+ error = xfarray_sort_load_cached(si, hi, scratch);
+ if (error)
+ goto out_free;
+ while (xfarray_sort_cmp(si, scratch, pivot) >= 0 &&
+ lo < hi) {
+ hi--;
+ error = xfarray_sort_load_cached(si, hi,
+ scratch);
+ if (error)
+ goto out_free;
+ }
+ error = xfarray_sort_put_page(si);
+ if (error)
+ goto out_free;
+
+ if (xfarray_sort_terminated(si, &error))
+ goto out_free;
+
+ /* Copy that item (a[hi]) to a[lo]. */
+ if (lo < hi) {
+ error = xfarray_sort_store(si, lo++, scratch);
+ if (error)
+ goto out_free;
+ }
+
+ /*
+ * Increment lo until it finds an a[lo] greater than
+ * the pivot value.
+ */
+ error = xfarray_sort_load_cached(si, lo, scratch);
+ if (error)
+ goto out_free;
+ while (xfarray_sort_cmp(si, scratch, pivot) <= 0 &&
+ lo < hi) {
+ lo++;
+ error = xfarray_sort_load_cached(si, lo,
+ scratch);
+ if (error)
+ goto out_free;
+ }
+ error = xfarray_sort_put_page(si);
+ if (error)
+ goto out_free;
+
+ if (xfarray_sort_terminated(si, &error))
+ goto out_free;
+
+ /* Copy that item (a[lo]) to a[hi]. */
+ if (lo < hi) {
+ error = xfarray_sort_store(si, hi--, scratch);
+ if (error)
+ goto out_free;
+ }
+
+ if (xfarray_sort_terminated(si, &error))
+ goto out_free;
+ }
+
+ /*
+ * Put our pivot value in the correct place at a[lo]. All
+ * values between a[beg[i]] and a[lo - 1] should be less than
+ * the pivot; and all values between a[lo + 1] and a[end[i]-1]
+ * should be greater than the pivot.
+ */
+ error = xfarray_sort_store(si, lo, pivot);
+ if (error)
+ goto out_free;
+
+ /* Set up the stack frame to process the two partitions. */
+ error = xfarray_qsort_push(si, si_lo, si_hi, lo, hi);
+ if (error)
+ goto out_free;
+
+ if (xfarray_sort_terminated(si, &error))
+ goto out_free;
+ }
+
+out_free:
+ trace_xfarray_sort_stats(si, error);
+ kvfree(si);
+ return error;
+}
diff --git a/fs/xfs/scrub/xfarray.h b/fs/xfs/scrub/xfarray.h
new file mode 100644
index 000000000000..4ecac01363d9
--- /dev/null
+++ b/fs/xfs/scrub/xfarray.h
@@ -0,0 +1,141 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2021-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_XFARRAY_H__
+#define __XFS_SCRUB_XFARRAY_H__
+
+/* xfile array index type, along with cursor initialization */
+typedef uint64_t xfarray_idx_t;
+#define XFARRAY_CURSOR_INIT ((__force xfarray_idx_t)0)
+
+/* Iterate each index of an xfile array. */
+#define foreach_xfarray_idx(array, idx) \
+ for ((idx) = XFARRAY_CURSOR_INIT; \
+ (idx) < xfarray_length(array); \
+ (idx)++)
+
+struct xfarray {
+ /* Underlying file that backs the array. */
+ struct xfile *xfile;
+
+ /* Number of array elements. */
+ xfarray_idx_t nr;
+
+ /* Maximum possible array size. */
+ xfarray_idx_t max_nr;
+
+ /* Number of unset slots in the array below @nr. */
+ uint64_t unset_slots;
+
+ /* Size of an array element. */
+ size_t obj_size;
+
+ /* log2 of array element size, if possible. */
+ int obj_size_log;
+};
+
+int xfarray_create(const char *descr, unsigned long long required_capacity,
+ size_t obj_size, struct xfarray **arrayp);
+void xfarray_destroy(struct xfarray *array);
+int xfarray_load(struct xfarray *array, xfarray_idx_t idx, void *ptr);
+int xfarray_unset(struct xfarray *array, xfarray_idx_t idx);
+int xfarray_store(struct xfarray *array, xfarray_idx_t idx, const void *ptr);
+int xfarray_store_anywhere(struct xfarray *array, const void *ptr);
+bool xfarray_element_is_null(struct xfarray *array, const void *ptr);
+
+/* Append an element to the array. */
+static inline int xfarray_append(struct xfarray *array, const void *ptr)
+{
+ return xfarray_store(array, array->nr, ptr);
+}
+
+uint64_t xfarray_length(struct xfarray *array);
+int xfarray_load_next(struct xfarray *array, xfarray_idx_t *idx, void *rec);
+
+/* Declarations for xfile array sort functionality. */
+
+typedef cmp_func_t xfarray_cmp_fn;
+
+/* Perform an in-memory heapsort for small subsets. */
+#define XFARRAY_ISORT_SHIFT (4)
+#define XFARRAY_ISORT_NR (1U << XFARRAY_ISORT_SHIFT)
+
+/* Evalulate this many points to find the qsort pivot. */
+#define XFARRAY_QSORT_PIVOT_NR (9)
+
+struct xfarray_sortinfo {
+ struct xfarray *array;
+
+ /* Comparison function for the sort. */
+ xfarray_cmp_fn cmp_fn;
+
+ /* Maximum height of the partition stack. */
+ uint8_t max_stack_depth;
+
+ /* Current height of the partition stack. */
+ int8_t stack_depth;
+
+ /* Maximum stack depth ever used. */
+ uint8_t max_stack_used;
+
+ /* XFARRAY_SORT_* flags; see below. */
+ unsigned int flags;
+
+ /* Cache a page here for faster access. */
+ struct xfile_page xfpage;
+ void *page_kaddr;
+
+#ifdef DEBUG
+ /* Performance statistics. */
+ uint64_t loads;
+ uint64_t stores;
+ uint64_t compares;
+ uint64_t heapsorts;
+#endif
+ /*
+ * Extra bytes are allocated beyond the end of the structure to store
+ * quicksort information. C does not permit multiple VLAs per struct,
+ * so we document all of this in a comment.
+ *
+ * Pretend that we have a typedef for array records:
+ *
+ * typedef char[array->obj_size] xfarray_rec_t;
+ *
+ * First comes the quicksort partition stack:
+ *
+ * xfarray_idx_t lo[max_stack_depth];
+ * xfarray_idx_t hi[max_stack_depth];
+ *
+ * union {
+ *
+ * If for a given subset we decide to use an in-memory sort, we use a
+ * block of scratchpad records here to compare items:
+ *
+ * xfarray_rec_t scratch[ISORT_NR];
+ *
+ * Otherwise, we want to partition the records to partition the array.
+ * We store the chosen pivot record at the start of the scratchpad area
+ * and use the rest to sample some records to estimate the median.
+ * The format of the qsort_pivot array enables us to use the kernel
+ * heapsort function to place the median value in the middle.
+ *
+ * struct {
+ * xfarray_rec_t pivot;
+ * struct {
+ * xfarray_rec_t rec; (rounded up to 8 bytes)
+ * xfarray_idx_t idx;
+ * } qsort_pivot[QSORT_PIVOT_NR];
+ * };
+ * }
+ */
+};
+
+/* Sort can be interrupted by a fatal signal. */
+#define XFARRAY_SORT_KILLABLE (1U << 0)
+
+int xfarray_sort(struct xfarray *array, xfarray_cmp_fn cmp_fn,
+ unsigned int flags);
+
+#endif /* __XFS_SCRUB_XFARRAY_H__ */
diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c
new file mode 100644
index 000000000000..d98e8e77c684
--- /dev/null
+++ b/fs/xfs/scrub/xfile.c
@@ -0,0 +1,420 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_format.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/scrub.h"
+#include "scrub/trace.h"
+#include <linux/shmem_fs.h>
+
+/*
+ * Swappable Temporary Memory
+ * ==========================
+ *
+ * Online checking sometimes needs to be able to stage a large amount of data
+ * in memory. This information might not fit in the available memory and it
+ * doesn't all need to be accessible at all times. In other words, we want an
+ * indexed data buffer to store data that can be paged out.
+ *
+ * When CONFIG_TMPFS=y, shmemfs is enough of a filesystem to meet those
+ * requirements. Therefore, the xfile mechanism uses an unlinked shmem file to
+ * store our staging data. This file is not installed in the file descriptor
+ * table so that user programs cannot access the data, which means that the
+ * xfile must be freed with xfile_destroy.
+ *
+ * xfiles assume that the caller will handle all required concurrency
+ * management; standard vfs locks (freezer and inode) are not taken. Reads
+ * and writes are satisfied directly from the page cache.
+ *
+ * NOTE: The current shmemfs implementation has a quirk that in-kernel reads
+ * of a hole cause a page to be mapped into the file. If you are going to
+ * create a sparse xfile, please be careful about reading from uninitialized
+ * parts of the file. These pages are !Uptodate and will eventually be
+ * reclaimed if not written, but in the short term this boosts memory
+ * consumption.
+ */
+
+/*
+ * xfiles must not be exposed to userspace and require upper layers to
+ * coordinate access to the one handle returned by the constructor, so
+ * establish a separate lock class for xfiles to avoid confusing lockdep.
+ */
+static struct lock_class_key xfile_i_mutex_key;
+
+/*
+ * Create an xfile of the given size. The description will be used in the
+ * trace output.
+ */
+int
+xfile_create(
+ const char *description,
+ loff_t isize,
+ struct xfile **xfilep)
+{
+ struct inode *inode;
+ struct xfile *xf;
+ int error = -ENOMEM;
+
+ xf = kmalloc(sizeof(struct xfile), XCHK_GFP_FLAGS);
+ if (!xf)
+ return -ENOMEM;
+
+ xf->file = shmem_file_setup(description, isize, 0);
+ if (!xf->file)
+ goto out_xfile;
+ if (IS_ERR(xf->file)) {
+ error = PTR_ERR(xf->file);
+ goto out_xfile;
+ }
+
+ /*
+ * We want a large sparse file that we can pread, pwrite, and seek.
+ * xfile users are responsible for keeping the xfile hidden away from
+ * all other callers, so we skip timestamp updates and security checks.
+ * Make the inode only accessible by root, just in case the xfile ever
+ * escapes.
+ */
+ xf->file->f_mode |= FMODE_PREAD | FMODE_PWRITE | FMODE_NOCMTIME |
+ FMODE_LSEEK;
+ xf->file->f_flags |= O_RDWR | O_LARGEFILE | O_NOATIME;
+ inode = file_inode(xf->file);
+ inode->i_flags |= S_PRIVATE | S_NOCMTIME | S_NOATIME;
+ inode->i_mode &= ~0177;
+ inode->i_uid = GLOBAL_ROOT_UID;
+ inode->i_gid = GLOBAL_ROOT_GID;
+
+ lockdep_set_class(&inode->i_rwsem, &xfile_i_mutex_key);
+
+ trace_xfile_create(xf);
+
+ *xfilep = xf;
+ return 0;
+out_xfile:
+ kfree(xf);
+ return error;
+}
+
+/* Close the file and release all resources. */
+void
+xfile_destroy(
+ struct xfile *xf)
+{
+ struct inode *inode = file_inode(xf->file);
+
+ trace_xfile_destroy(xf);
+
+ lockdep_set_class(&inode->i_rwsem, &inode->i_sb->s_type->i_mutex_key);
+ fput(xf->file);
+ kfree(xf);
+}
+
+/*
+ * Read a memory object directly from the xfile's page cache. Unlike regular
+ * pread, we return -E2BIG and -EFBIG for reads that are too large or at too
+ * high an offset, instead of truncating the read. Otherwise, we return
+ * bytes read or an error code, like regular pread.
+ */
+ssize_t
+xfile_pread(
+ struct xfile *xf,
+ void *buf,
+ size_t count,
+ loff_t pos)
+{
+ struct inode *inode = file_inode(xf->file);
+ struct address_space *mapping = inode->i_mapping;
+ struct page *page = NULL;
+ ssize_t read = 0;
+ unsigned int pflags;
+ int error = 0;
+
+ if (count > MAX_RW_COUNT)
+ return -E2BIG;
+ if (inode->i_sb->s_maxbytes - pos < count)
+ return -EFBIG;
+
+ trace_xfile_pread(xf, pos, count);
+
+ pflags = memalloc_nofs_save();
+ while (count > 0) {
+ void *p, *kaddr;
+ unsigned int len;
+
+ len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos));
+
+ /*
+ * In-kernel reads of a shmem file cause it to allocate a page
+ * if the mapping shows a hole. Therefore, if we hit ENOMEM
+ * we can continue by zeroing the caller's buffer.
+ */
+ page = shmem_read_mapping_page_gfp(mapping, pos >> PAGE_SHIFT,
+ __GFP_NOWARN);
+ if (IS_ERR(page)) {
+ error = PTR_ERR(page);
+ if (error != -ENOMEM)
+ break;
+
+ memset(buf, 0, len);
+ goto advance;
+ }
+
+ if (PageUptodate(page)) {
+ /*
+ * xfile pages must never be mapped into userspace, so
+ * we skip the dcache flush.
+ */
+ kaddr = kmap_local_page(page);
+ p = kaddr + offset_in_page(pos);
+ memcpy(buf, p, len);
+ kunmap_local(kaddr);
+ } else {
+ memset(buf, 0, len);
+ }
+ put_page(page);
+
+advance:
+ count -= len;
+ pos += len;
+ buf += len;
+ read += len;
+ }
+ memalloc_nofs_restore(pflags);
+
+ if (read > 0)
+ return read;
+ return error;
+}
+
+/*
+ * Write a memory object directly to the xfile's page cache. Unlike regular
+ * pwrite, we return -E2BIG and -EFBIG for writes that are too large or at too
+ * high an offset, instead of truncating the write. Otherwise, we return
+ * bytes written or an error code, like regular pwrite.
+ */
+ssize_t
+xfile_pwrite(
+ struct xfile *xf,
+ const void *buf,
+ size_t count,
+ loff_t pos)
+{
+ struct inode *inode = file_inode(xf->file);
+ struct address_space *mapping = inode->i_mapping;
+ const struct address_space_operations *aops = mapping->a_ops;
+ struct page *page = NULL;
+ ssize_t written = 0;
+ unsigned int pflags;
+ int error = 0;
+
+ if (count > MAX_RW_COUNT)
+ return -E2BIG;
+ if (inode->i_sb->s_maxbytes - pos < count)
+ return -EFBIG;
+
+ trace_xfile_pwrite(xf, pos, count);
+
+ pflags = memalloc_nofs_save();
+ while (count > 0) {
+ void *fsdata = NULL;
+ void *p, *kaddr;
+ unsigned int len;
+ int ret;
+
+ len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos));
+
+ /*
+ * We call write_begin directly here to avoid all the freezer
+ * protection lock-taking that happens in the normal path.
+ * shmem doesn't support fs freeze, but lockdep doesn't know
+ * that and will trip over that.
+ */
+ error = aops->write_begin(NULL, mapping, pos, len, &page,
+ &fsdata);
+ if (error)
+ break;
+
+ /*
+ * xfile pages must never be mapped into userspace, so we skip
+ * the dcache flush. If the page is not uptodate, zero it
+ * before writing data.
+ */
+ kaddr = kmap_local_page(page);
+ if (!PageUptodate(page)) {
+ memset(kaddr, 0, PAGE_SIZE);
+ SetPageUptodate(page);
+ }
+ p = kaddr + offset_in_page(pos);
+ memcpy(p, buf, len);
+ kunmap_local(kaddr);
+
+ ret = aops->write_end(NULL, mapping, pos, len, len, page,
+ fsdata);
+ if (ret < 0) {
+ error = ret;
+ break;
+ }
+
+ written += ret;
+ if (ret != len)
+ break;
+
+ count -= ret;
+ pos += ret;
+ buf += ret;
+ }
+ memalloc_nofs_restore(pflags);
+
+ if (written > 0)
+ return written;
+ return error;
+}
+
+/* Find the next written area in the xfile data for a given offset. */
+loff_t
+xfile_seek_data(
+ struct xfile *xf,
+ loff_t pos)
+{
+ loff_t ret;
+
+ ret = vfs_llseek(xf->file, pos, SEEK_DATA);
+ trace_xfile_seek_data(xf, pos, ret);
+ return ret;
+}
+
+/* Query stat information for an xfile. */
+int
+xfile_stat(
+ struct xfile *xf,
+ struct xfile_stat *statbuf)
+{
+ struct kstat ks;
+ int error;
+
+ error = vfs_getattr_nosec(&xf->file->f_path, &ks,
+ STATX_SIZE | STATX_BLOCKS, AT_STATX_DONT_SYNC);
+ if (error)
+ return error;
+
+ statbuf->size = ks.size;
+ statbuf->bytes = ks.blocks << SECTOR_SHIFT;
+ return 0;
+}
+
+/*
+ * Grab the (locked) page for a memory object. The object cannot span a page
+ * boundary. Returns 0 (and a locked page) if successful, -ENOTBLK if we
+ * cannot grab the page, or the usual negative errno.
+ */
+int
+xfile_get_page(
+ struct xfile *xf,
+ loff_t pos,
+ unsigned int len,
+ struct xfile_page *xfpage)
+{
+ struct inode *inode = file_inode(xf->file);
+ struct address_space *mapping = inode->i_mapping;
+ const struct address_space_operations *aops = mapping->a_ops;
+ struct page *page = NULL;
+ void *fsdata = NULL;
+ loff_t key = round_down(pos, PAGE_SIZE);
+ unsigned int pflags;
+ int error;
+
+ if (inode->i_sb->s_maxbytes - pos < len)
+ return -ENOMEM;
+ if (len > PAGE_SIZE - offset_in_page(pos))
+ return -ENOTBLK;
+
+ trace_xfile_get_page(xf, pos, len);
+
+ pflags = memalloc_nofs_save();
+
+ /*
+ * We call write_begin directly here to avoid all the freezer
+ * protection lock-taking that happens in the normal path. shmem
+ * doesn't support fs freeze, but lockdep doesn't know that and will
+ * trip over that.
+ */
+ error = aops->write_begin(NULL, mapping, key, PAGE_SIZE, &page,
+ &fsdata);
+ if (error)
+ goto out_pflags;
+
+ /* We got the page, so make sure we push out EOF. */
+ if (i_size_read(inode) < pos + len)
+ i_size_write(inode, pos + len);
+
+ /*
+ * If the page isn't up to date, fill it with zeroes before we hand it
+ * to the caller and make sure the backing store will hold on to them.
+ */
+ if (!PageUptodate(page)) {
+ void *kaddr;
+
+ kaddr = kmap_local_page(page);
+ memset(kaddr, 0, PAGE_SIZE);
+ kunmap_local(kaddr);
+ SetPageUptodate(page);
+ }
+
+ /*
+ * Mark each page dirty so that the contents are written to some
+ * backing store when we drop this buffer, and take an extra reference
+ * to prevent the xfile page from being swapped or removed from the
+ * page cache by reclaim if the caller unlocks the page.
+ */
+ set_page_dirty(page);
+ get_page(page);
+
+ xfpage->page = page;
+ xfpage->fsdata = fsdata;
+ xfpage->pos = key;
+out_pflags:
+ memalloc_nofs_restore(pflags);
+ return error;
+}
+
+/*
+ * Release the (locked) page for a memory object. Returns 0 or a negative
+ * errno.
+ */
+int
+xfile_put_page(
+ struct xfile *xf,
+ struct xfile_page *xfpage)
+{
+ struct inode *inode = file_inode(xf->file);
+ struct address_space *mapping = inode->i_mapping;
+ const struct address_space_operations *aops = mapping->a_ops;
+ unsigned int pflags;
+ int ret;
+
+ trace_xfile_put_page(xf, xfpage->pos, PAGE_SIZE);
+
+ /* Give back the reference that we took in xfile_get_page. */
+ put_page(xfpage->page);
+
+ pflags = memalloc_nofs_save();
+ ret = aops->write_end(NULL, mapping, xfpage->pos, PAGE_SIZE, PAGE_SIZE,
+ xfpage->page, xfpage->fsdata);
+ memalloc_nofs_restore(pflags);
+ memset(xfpage, 0, sizeof(struct xfile_page));
+
+ if (ret < 0)
+ return ret;
+ if (ret != PAGE_SIZE)
+ return -EIO;
+ return 0;
+}
diff --git a/fs/xfs/scrub/xfile.h b/fs/xfs/scrub/xfile.h
new file mode 100644
index 000000000000..d56643b0f429
--- /dev/null
+++ b/fs/xfs/scrub/xfile.h
@@ -0,0 +1,77 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_XFILE_H__
+#define __XFS_SCRUB_XFILE_H__
+
+struct xfile_page {
+ struct page *page;
+ void *fsdata;
+ loff_t pos;
+};
+
+static inline bool xfile_page_cached(const struct xfile_page *xfpage)
+{
+ return xfpage->page != NULL;
+}
+
+static inline pgoff_t xfile_page_index(const struct xfile_page *xfpage)
+{
+ return xfpage->page->index;
+}
+
+struct xfile {
+ struct file *file;
+};
+
+int xfile_create(const char *description, loff_t isize, struct xfile **xfilep);
+void xfile_destroy(struct xfile *xf);
+
+ssize_t xfile_pread(struct xfile *xf, void *buf, size_t count, loff_t pos);
+ssize_t xfile_pwrite(struct xfile *xf, const void *buf, size_t count,
+ loff_t pos);
+
+/*
+ * Load an object. Since we're treating this file as "memory", any error or
+ * short IO is treated as a failure to allocate memory.
+ */
+static inline int
+xfile_obj_load(struct xfile *xf, void *buf, size_t count, loff_t pos)
+{
+ ssize_t ret = xfile_pread(xf, buf, count, pos);
+
+ if (ret < 0 || ret != count)
+ return -ENOMEM;
+ return 0;
+}
+
+/*
+ * Store an object. Since we're treating this file as "memory", any error or
+ * short IO is treated as a failure to allocate memory.
+ */
+static inline int
+xfile_obj_store(struct xfile *xf, const void *buf, size_t count, loff_t pos)
+{
+ ssize_t ret = xfile_pwrite(xf, buf, count, pos);
+
+ if (ret < 0 || ret != count)
+ return -ENOMEM;
+ return 0;
+}
+
+loff_t xfile_seek_data(struct xfile *xf, loff_t pos);
+
+struct xfile_stat {
+ loff_t size;
+ unsigned long long bytes;
+};
+
+int xfile_stat(struct xfile *xf, struct xfile_stat *statbuf);
+
+int xfile_get_page(struct xfile *xf, loff_t offset, unsigned int len,
+ struct xfile_page *xbuf);
+int xfile_put_page(struct xfile *xf, struct xfile_page *xbuf);
+
+#endif /* __XFS_SCRUB_XFILE_H__ */