diff options
Diffstat (limited to 'fs')
42 files changed, 3910 insertions, 647 deletions
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 52e1823241fb..c9d653168ad0 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -128,6 +128,7 @@ config XFS_ONLINE_SCRUB bool "XFS online metadata check support" default n depends on XFS_FS + depends on TMPFS && SHMEM select XFS_DRAIN_INTENTS help If you say Y here you will be able to check metadata on a @@ -142,6 +143,23 @@ config XFS_ONLINE_SCRUB If unsure, say N. +config XFS_ONLINE_SCRUB_STATS + bool "XFS online metadata check usage data collection" + default y + depends on XFS_ONLINE_SCRUB + select FS_DEBUG + help + If you say Y here, the kernel will gather usage data about + the online metadata check subsystem. This includes the number + of invocations, the outcomes, and the results of repairs, if any. + This may slow down scrub slightly due to the use of high precision + timers and the need to merge per-invocation information into the + filesystem counters. + + Usage data are collected in /sys/kernel/debug/xfs/scrub. + + If unsure, say N. + config XFS_ONLINE_REPAIR bool "XFS online metadata repair support" default n diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 16e4eb431230..7762c01a85cf 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -164,15 +164,24 @@ xfs-y += $(addprefix scrub/, \ rmap.o \ scrub.o \ symlink.o \ + xfarray.o \ + xfile.o \ + ) + +xfs-$(CONFIG_XFS_ONLINE_SCRUB_STATS) += scrub/stats.o + +xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \ + rtbitmap.o \ + rtsummary.o \ ) -xfs-$(CONFIG_XFS_RT) += scrub/rtbitmap.o xfs-$(CONFIG_XFS_QUOTA) += scrub/quota.o # online repair ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y) xfs-y += $(addprefix scrub/, \ agheader_repair.o \ + reap.o \ repair.o \ ) endif diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 2cbf9ea39b8c..6360073865db 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -743,7 +743,11 @@ struct xfs_scrub_metadata { */ #define XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED (1u << 7) -#define XFS_SCRUB_FLAGS_IN (XFS_SCRUB_IFLAG_REPAIR) +/* i: Rebuild the data structure. */ +#define XFS_SCRUB_IFLAG_FORCE_REBUILD (1u << 8) + +#define XFS_SCRUB_FLAGS_IN (XFS_SCRUB_IFLAG_REPAIR | \ + XFS_SCRUB_IFLAG_FORCE_REBUILD) #define XFS_SCRUB_FLAGS_OUT (XFS_SCRUB_OFLAG_CORRUPT | \ XFS_SCRUB_OFLAG_PREEN | \ XFS_SCRUB_OFLAG_XFAIL | \ diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index bbaa65422c4f..876a2f41b063 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -26,6 +26,7 @@ #include "scrub/trace.h" #include "scrub/repair.h" #include "scrub/bitmap.h" +#include "scrub/reap.h" /* Superblock */ @@ -48,6 +49,10 @@ xrep_superblock( if (error) return error; + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + return error; + /* Copy AG 0's superblock to this one. */ xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); xfs_sb_to_disk(bp->b_addr, &mp->m_sb); @@ -423,6 +428,10 @@ xrep_agf( if (error) return error; + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + return error; + /* Start rewriting the header and implant the btrees we found. */ xrep_agf_init_header(sc, agf_bp, &old_agf); xrep_agf_set_roots(sc, agf, fab); @@ -444,13 +453,13 @@ out_revert: struct xrep_agfl { /* Bitmap of alleged AGFL blocks that we're not going to add. */ - struct xbitmap crossed; + struct xagb_bitmap crossed; /* Bitmap of other OWN_AG metadata blocks. */ - struct xbitmap agmetablocks; + struct xagb_bitmap agmetablocks; /* Bitmap of free space. */ - struct xbitmap *freesp; + struct xagb_bitmap *freesp; /* rmapbt cursor for finding crosslinked blocks */ struct xfs_btree_cur *rmap_cur; @@ -466,7 +475,6 @@ xrep_agfl_walk_rmap( void *priv) { struct xrep_agfl *ra = priv; - xfs_fsblock_t fsb; int error = 0; if (xchk_should_terminate(ra->sc, &error)) @@ -474,14 +482,13 @@ xrep_agfl_walk_rmap( /* Record all the OWN_AG blocks. */ if (rec->rm_owner == XFS_RMAP_OWN_AG) { - fsb = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno, - rec->rm_startblock); - error = xbitmap_set(ra->freesp, fsb, rec->rm_blockcount); + error = xagb_bitmap_set(ra->freesp, rec->rm_startblock, + rec->rm_blockcount); if (error) return error; } - return xbitmap_set_btcur_path(&ra->agmetablocks, cur); + return xagb_bitmap_set_btcur_path(&ra->agmetablocks, cur); } /* Strike out the blocks that are cross-linked according to the rmapbt. */ @@ -492,12 +499,10 @@ xrep_agfl_check_extent( void *priv) { struct xrep_agfl *ra = priv; - xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(ra->sc->mp, start); + xfs_agblock_t agbno = start; xfs_agblock_t last_agbno = agbno + len - 1; int error; - ASSERT(XFS_FSB_TO_AGNO(ra->sc->mp, start) == ra->sc->sa.pag->pag_agno); - while (agbno <= last_agbno) { bool other_owners; @@ -507,7 +512,7 @@ xrep_agfl_check_extent( return error; if (other_owners) { - error = xbitmap_set(&ra->crossed, agbno, 1); + error = xagb_bitmap_set(&ra->crossed, agbno, 1); if (error) return error; } @@ -533,7 +538,7 @@ STATIC int xrep_agfl_collect_blocks( struct xfs_scrub *sc, struct xfs_buf *agf_bp, - struct xbitmap *agfl_extents, + struct xagb_bitmap *agfl_extents, xfs_agblock_t *flcount) { struct xrep_agfl ra; @@ -543,8 +548,8 @@ xrep_agfl_collect_blocks( ra.sc = sc; ra.freesp = agfl_extents; - xbitmap_init(&ra.agmetablocks); - xbitmap_init(&ra.crossed); + xagb_bitmap_init(&ra.agmetablocks); + xagb_bitmap_init(&ra.crossed); /* Find all space used by the free space btrees & rmapbt. */ cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); @@ -556,7 +561,7 @@ xrep_agfl_collect_blocks( /* Find all blocks currently being used by the bnobt. */ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag, XFS_BTNUM_BNO); - error = xbitmap_set_btblocks(&ra.agmetablocks, cur); + error = xagb_bitmap_set_btblocks(&ra.agmetablocks, cur); xfs_btree_del_cursor(cur, error); if (error) goto out_bmp; @@ -564,7 +569,7 @@ xrep_agfl_collect_blocks( /* Find all blocks currently being used by the cntbt. */ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag, XFS_BTNUM_CNT); - error = xbitmap_set_btblocks(&ra.agmetablocks, cur); + error = xagb_bitmap_set_btblocks(&ra.agmetablocks, cur); xfs_btree_del_cursor(cur, error); if (error) goto out_bmp; @@ -573,17 +578,17 @@ xrep_agfl_collect_blocks( * Drop the freesp meta blocks that are in use by btrees. * The remaining blocks /should/ be AGFL blocks. */ - error = xbitmap_disunion(agfl_extents, &ra.agmetablocks); + error = xagb_bitmap_disunion(agfl_extents, &ra.agmetablocks); if (error) goto out_bmp; /* Strike out the blocks that are cross-linked. */ ra.rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); - error = xbitmap_walk(agfl_extents, xrep_agfl_check_extent, &ra); + error = xagb_bitmap_walk(agfl_extents, xrep_agfl_check_extent, &ra); xfs_btree_del_cursor(ra.rmap_cur, error); if (error) goto out_bmp; - error = xbitmap_disunion(agfl_extents, &ra.crossed); + error = xagb_bitmap_disunion(agfl_extents, &ra.crossed); if (error) goto out_bmp; @@ -591,12 +596,12 @@ xrep_agfl_collect_blocks( * Calculate the new AGFL size. If we found more blocks than fit in * the AGFL we'll free them later. */ - *flcount = min_t(uint64_t, xbitmap_hweight(agfl_extents), + *flcount = min_t(uint64_t, xagb_bitmap_hweight(agfl_extents), xfs_agfl_size(mp)); out_bmp: - xbitmap_destroy(&ra.crossed); - xbitmap_destroy(&ra.agmetablocks); + xagb_bitmap_destroy(&ra.crossed); + xagb_bitmap_destroy(&ra.agmetablocks); return error; } @@ -615,18 +620,24 @@ xrep_agfl_update_agf( xfs_force_summary_recalc(sc->mp); /* Update the AGF counters. */ - if (xfs_perag_initialised_agf(sc->sa.pag)) + if (xfs_perag_initialised_agf(sc->sa.pag)) { sc->sa.pag->pagf_flcount = flcount; + clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, + &sc->sa.pag->pag_opstate); + } agf->agf_flfirst = cpu_to_be32(0); agf->agf_flcount = cpu_to_be32(flcount); - agf->agf_fllast = cpu_to_be32(flcount - 1); + if (flcount) + agf->agf_fllast = cpu_to_be32(flcount - 1); + else + agf->agf_fllast = cpu_to_be32(xfs_agfl_size(sc->mp) - 1); xfs_alloc_log_agf(sc->tp, agf_bp, XFS_AGF_FLFIRST | XFS_AGF_FLLAST | XFS_AGF_FLCOUNT); } struct xrep_agfl_fill { - struct xbitmap used_extents; + struct xagb_bitmap used_extents; struct xfs_scrub *sc; __be32 *agfl_bno; xfs_agblock_t flcount; @@ -642,17 +653,15 @@ xrep_agfl_fill( { struct xrep_agfl_fill *af = priv; struct xfs_scrub *sc = af->sc; - xfs_fsblock_t fsbno = start; + xfs_agblock_t agbno = start; int error; - while (fsbno < start + len && af->fl_off < af->flcount) - af->agfl_bno[af->fl_off++] = - cpu_to_be32(XFS_FSB_TO_AGBNO(sc->mp, fsbno++)); + trace_xrep_agfl_insert(sc->sa.pag, agbno, len); - trace_xrep_agfl_insert(sc->mp, sc->sa.pag->pag_agno, - XFS_FSB_TO_AGBNO(sc->mp, start), len); + while (agbno < start + len && af->fl_off < af->flcount) + af->agfl_bno[af->fl_off++] = cpu_to_be32(agbno++); - error = xbitmap_set(&af->used_extents, start, fsbno - 1); + error = xagb_bitmap_set(&af->used_extents, start, agbno - 1); if (error) return error; @@ -667,7 +676,7 @@ STATIC int xrep_agfl_init_header( struct xfs_scrub *sc, struct xfs_buf *agfl_bp, - struct xbitmap *agfl_extents, + struct xagb_bitmap *agfl_extents, xfs_agblock_t flcount) { struct xrep_agfl_fill af = { @@ -695,17 +704,17 @@ xrep_agfl_init_header( * blocks than fit in the AGFL, they will be freed in a subsequent * step. */ - xbitmap_init(&af.used_extents); + xagb_bitmap_init(&af.used_extents); af.agfl_bno = xfs_buf_to_agfl_bno(agfl_bp), - xbitmap_walk(agfl_extents, xrep_agfl_fill, &af); - error = xbitmap_disunion(agfl_extents, &af.used_extents); + xagb_bitmap_walk(agfl_extents, xrep_agfl_fill, &af); + error = xagb_bitmap_disunion(agfl_extents, &af.used_extents); if (error) return error; /* Write new AGFL to disk. */ xfs_trans_buf_set_type(sc->tp, agfl_bp, XFS_BLFT_AGFL_BUF); xfs_trans_log_buf(sc->tp, agfl_bp, 0, BBTOB(agfl_bp->b_length) - 1); - xbitmap_destroy(&af.used_extents); + xagb_bitmap_destroy(&af.used_extents); return 0; } @@ -714,7 +723,7 @@ int xrep_agfl( struct xfs_scrub *sc) { - struct xbitmap agfl_extents; + struct xagb_bitmap agfl_extents; struct xfs_mount *mp = sc->mp; struct xfs_buf *agf_bp; struct xfs_buf *agfl_bp; @@ -725,7 +734,7 @@ xrep_agfl( if (!xfs_has_rmapbt(mp)) return -EOPNOTSUPP; - xbitmap_init(&agfl_extents); + xagb_bitmap_init(&agfl_extents); /* * Read the AGF so that we can query the rmapbt. We hope that there's @@ -753,6 +762,10 @@ xrep_agfl( if (error) goto err; + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + goto err; + /* * Update AGF and AGFL. We reset the global free block counter when * we adjust the AGF flcount (which can fail) so avoid updating any @@ -774,10 +787,10 @@ xrep_agfl( goto err; /* Dump any AGFL overflow. */ - error = xrep_reap_extents(sc, &agfl_extents, &XFS_RMAP_OINFO_AG, + error = xrep_reap_agblocks(sc, &agfl_extents, &XFS_RMAP_OINFO_AG, XFS_AG_RESV_AGFL); err: - xbitmap_destroy(&agfl_extents); + xagb_bitmap_destroy(&agfl_extents); return error; } @@ -1000,6 +1013,10 @@ xrep_agi( if (error) return error; + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + return error; + /* Start rewriting the header and implant the btrees we found. */ xrep_agi_init_header(sc, agi_bp, &old_agi); xrep_agi_set_roots(sc, agi, fab); diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c index 0c959be396ea..e0c89a9a0ca0 100644 --- a/fs/xfs/scrub/bitmap.c +++ b/fs/xfs/scrub/bitmap.c @@ -301,21 +301,15 @@ xagb_bitmap_set_btblocks( * blocks going from the leaf towards the root. */ int -xbitmap_set_btcur_path( - struct xbitmap *bitmap, +xagb_bitmap_set_btcur_path( + struct xagb_bitmap *bitmap, struct xfs_btree_cur *cur) { - struct xfs_buf *bp; - xfs_fsblock_t fsb; int i; int error; for (i = 0; i < cur->bc_nlevels && cur->bc_levels[i].ptr == 1; i++) { - xfs_btree_get_block(cur, i, &bp); - if (!bp) - continue; - fsb = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); - error = xbitmap_set(bitmap, fsb, 1); + error = xagb_bitmap_visit_btblock(cur, i, bitmap); if (error) return error; } @@ -323,35 +317,6 @@ xbitmap_set_btcur_path( return 0; } -/* Collect a btree's block in the bitmap. */ -STATIC int -xbitmap_collect_btblock( - struct xfs_btree_cur *cur, - int level, - void *priv) -{ - struct xbitmap *bitmap = priv; - struct xfs_buf *bp; - xfs_fsblock_t fsbno; - - xfs_btree_get_block(cur, level, &bp); - if (!bp) - return 0; - - fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); - return xbitmap_set(bitmap, fsbno, 1); -} - -/* Walk the btree and mark the bitmap wherever a btree block is found. */ -int -xbitmap_set_btblocks( - struct xbitmap *bitmap, - struct xfs_btree_cur *cur) -{ - return xfs_btree_visit_blocks(cur, xbitmap_collect_btblock, - XFS_BTREE_VISIT_ALL, bitmap); -} - /* How many bits are set in this bitmap? */ uint64_t xbitmap_hweight( @@ -385,43 +350,6 @@ xbitmap_walk( return error; } -struct xbitmap_walk_bits { - xbitmap_walk_bits_fn fn; - void *priv; -}; - -/* Walk all the bits in a run. */ -static int -xbitmap_walk_bits_in_run( - uint64_t start, - uint64_t len, - void *priv) -{ - struct xbitmap_walk_bits *wb = priv; - uint64_t i; - int error = 0; - - for (i = start; i < start + len; i++) { - error = wb->fn(i, wb->priv); - if (error) - break; - } - - return error; -} - -/* Call a function for every set bit in this bitmap. */ -int -xbitmap_walk_bits( - struct xbitmap *bitmap, - xbitmap_walk_bits_fn fn, - void *priv) -{ - struct xbitmap_walk_bits wb = {.fn = fn, .priv = priv}; - - return xbitmap_walk(bitmap, xbitmap_walk_bits_in_run, &wb); -} - /* Does this bitmap have no bits set at all? */ bool xbitmap_empty( diff --git a/fs/xfs/scrub/bitmap.h b/fs/xfs/scrub/bitmap.h index 84981724ecaf..4fe58bad6734 100644 --- a/fs/xfs/scrub/bitmap.h +++ b/fs/xfs/scrub/bitmap.h @@ -16,10 +16,6 @@ void xbitmap_destroy(struct xbitmap *bitmap); int xbitmap_clear(struct xbitmap *bitmap, uint64_t start, uint64_t len); int xbitmap_set(struct xbitmap *bitmap, uint64_t start, uint64_t len); int xbitmap_disunion(struct xbitmap *bitmap, struct xbitmap *sub); -int xbitmap_set_btcur_path(struct xbitmap *bitmap, - struct xfs_btree_cur *cur); -int xbitmap_set_btblocks(struct xbitmap *bitmap, - struct xfs_btree_cur *cur); uint64_t xbitmap_hweight(struct xbitmap *bitmap); /* @@ -33,10 +29,6 @@ typedef int (*xbitmap_walk_fn)(uint64_t start, uint64_t len, void *priv); int xbitmap_walk(struct xbitmap *bitmap, xbitmap_walk_fn fn, void *priv); -typedef int (*xbitmap_walk_bits_fn)(uint64_t bit, void *priv); -int xbitmap_walk_bits(struct xbitmap *bitmap, xbitmap_walk_bits_fn fn, - void *priv); - bool xbitmap_empty(struct xbitmap *bitmap); bool xbitmap_test(struct xbitmap *bitmap, uint64_t start, uint64_t *len); @@ -110,5 +102,7 @@ static inline int xagb_bitmap_walk(struct xagb_bitmap *bitmap, int xagb_bitmap_set_btblocks(struct xagb_bitmap *bitmap, struct xfs_btree_cur *cur); +int xagb_bitmap_set_btcur_path(struct xagb_bitmap *bitmap, + struct xfs_btree_cur *cur); #endif /* __XFS_SCRUB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 5bf4326e9783..75588915572e 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -38,8 +38,7 @@ xchk_setup_inode_bmap( if (error) goto out; - sc->ilock_flags = XFS_IOLOCK_EXCL; - xfs_ilock(sc->ip, XFS_IOLOCK_EXCL); + xchk_ilock(sc, XFS_IOLOCK_EXCL); /* * We don't want any ephemeral data/cow fork updates sitting around @@ -50,8 +49,7 @@ xchk_setup_inode_bmap( sc->sm->sm_type != XFS_SCRUB_TYPE_BMBTA) { struct address_space *mapping = VFS_I(sc->ip)->i_mapping; - sc->ilock_flags |= XFS_MMAPLOCK_EXCL; - xfs_ilock(sc->ip, XFS_MMAPLOCK_EXCL); + xchk_ilock(sc, XFS_MMAPLOCK_EXCL); inode_dio_wait(VFS_I(sc->ip)); @@ -79,9 +77,8 @@ xchk_setup_inode_bmap( error = xchk_trans_alloc(sc, 0); if (error) goto out; - sc->ilock_flags |= XFS_ILOCK_EXCL; - xfs_ilock(sc->ip, XFS_ILOCK_EXCL); + xchk_ilock(sc, XFS_ILOCK_EXCL); out: /* scrub teardown will unlock and release the inode */ return error; @@ -844,7 +841,7 @@ xchk_bmap( /* Non-existent forks can be ignored. */ if (!ifp) - goto out; + return -ENOENT; info.is_rt = whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip); info.whichfork = whichfork; @@ -853,10 +850,10 @@ xchk_bmap( switch (whichfork) { case XFS_COW_FORK: - /* No CoW forks on non-reflink inodes/filesystems. */ - if (!xfs_is_reflink_inode(ip)) { + /* No CoW forks on non-reflink filesystems. */ + if (!xfs_has_reflink(mp)) { xchk_ino_set_corrupt(sc, sc->ip->i_ino); - goto out; + return 0; } break; case XFS_ATTR_FORK: @@ -876,31 +873,31 @@ xchk_bmap( /* No mappings to check. */ if (whichfork == XFS_COW_FORK) xchk_fblock_set_corrupt(sc, whichfork, 0); - goto out; + return 0; case XFS_DINODE_FMT_EXTENTS: break; case XFS_DINODE_FMT_BTREE: if (whichfork == XFS_COW_FORK) { xchk_fblock_set_corrupt(sc, whichfork, 0); - goto out; + return 0; } error = xchk_bmap_btree(sc, whichfork, &info); if (error) - goto out; + return error; break; default: xchk_fblock_set_corrupt(sc, whichfork, 0); - goto out; + return 0; } if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - goto out; + return 0; /* Find the offset of the last extent in the mapping. */ error = xfs_bmap_last_offset(ip, &endoff, whichfork); if (!xchk_fblock_process_error(sc, whichfork, 0, &error)) - goto out; + return error; /* * Scrub extent records. We use a special iterator function here that @@ -913,12 +910,12 @@ xchk_bmap( while (xchk_bmap_iext_iter(&info, &irec)) { if (xchk_should_terminate(sc, &error) || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) - goto out; + return 0; if (irec.br_startoff >= endoff) { xchk_fblock_set_corrupt(sc, whichfork, irec.br_startoff); - goto out; + return 0; } if (isnullstartblock(irec.br_startblock)) @@ -931,10 +928,10 @@ xchk_bmap( if (xchk_bmap_want_check_rmaps(&info)) { error = xchk_bmap_check_rmaps(sc, whichfork); if (!xchk_fblock_xref_process_error(sc, whichfork, 0, &error)) - goto out; + return error; } -out: - return error; + + return 0; } /* Scrub an inode's data fork. */ @@ -958,8 +955,5 @@ int xchk_bmap_cow( struct xfs_scrub *sc) { - if (!xfs_is_reflink_inode(sc->ip)) - return -ENOENT; - return xchk_bmap(sc, XFS_COW_FORK); } diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 7a20256be969..de24532fe083 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -832,6 +832,25 @@ xchk_install_handle_inode( } /* + * Install an already-referenced inode for scrubbing. Get our own reference to + * the inode to make disposal simpler. The inode must not be in I_FREEING or + * I_WILL_FREE state! + */ +int +xchk_install_live_inode( + struct xfs_scrub *sc, + struct xfs_inode *ip) +{ + if (!igrab(VFS_I(ip))) { + xchk_ino_set_corrupt(sc, ip->i_ino); + return -EFSCORRUPTED; + } + + sc->ip = ip; + return 0; +} + +/* * In preparation to scrub metadata structures that hang off of an inode, * grab either the inode referenced in the scrub control structure or the * inode passed in. If the inumber does not reference an allocated inode @@ -854,10 +873,8 @@ xchk_iget_for_scrubbing( ASSERT(sc->tp == NULL); /* We want to scan the inode we already had opened. */ - if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) { - sc->ip = ip_in; - return 0; - } + if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) + return xchk_install_live_inode(sc, ip_in); /* Reject internal metadata files and obviously bad inode numbers. */ if (xfs_internal_inum(mp, sc->sm->sm_ino)) @@ -1005,20 +1022,48 @@ xchk_setup_inode_contents( return error; /* Lock the inode so the VFS cannot touch this file. */ - sc->ilock_flags = XFS_IOLOCK_EXCL; - xfs_ilock(sc->ip, sc->ilock_flags); + xchk_ilock(sc, XFS_IOLOCK_EXCL); error = xchk_trans_alloc(sc, resblks); if (error) goto out; - sc->ilock_flags |= XFS_ILOCK_EXCL; - xfs_ilock(sc->ip, XFS_ILOCK_EXCL); - + xchk_ilock(sc, XFS_ILOCK_EXCL); out: /* scrub teardown will unlock and release the inode for us */ return error; } +void +xchk_ilock( + struct xfs_scrub *sc, + unsigned int ilock_flags) +{ + xfs_ilock(sc->ip, ilock_flags); + sc->ilock_flags |= ilock_flags; +} + +bool +xchk_ilock_nowait( + struct xfs_scrub *sc, + unsigned int ilock_flags) +{ + if (xfs_ilock_nowait(sc->ip, ilock_flags)) { + sc->ilock_flags |= ilock_flags; + return true; + } + + return false; +} + +void +xchk_iunlock( + struct xfs_scrub *sc, + unsigned int ilock_flags) +{ + sc->ilock_flags &= ~ilock_flags; + xfs_iunlock(sc->ip, ilock_flags); +} + /* * Predicate that decides if we need to evaluate the cross-reference check. * If there was an error accessing the cross-reference btree, just delete @@ -1185,3 +1230,155 @@ xchk_fsgates_enable( sc->flags |= scrub_fsgates; } + +/* + * Decide if this is this a cached inode that's also allocated. The caller + * must hold a reference to an AG and the AGI buffer lock to prevent inodes + * from being allocated or freed. + * + * Look up an inode by number in the given file system. If the inode number + * is invalid, return -EINVAL. If the inode is not in cache, return -ENODATA. + * If the inode is being reclaimed, return -ENODATA because we know the inode + * cache cannot be updating the ondisk metadata. + * + * Otherwise, the incore inode is the one we want, and it is either live, + * somewhere in the inactivation machinery, or reclaimable. The inode is + * allocated if i_mode is nonzero. In all three cases, the cached inode will + * be more up to date than the ondisk inode buffer, so we must use the incore + * i_mode. + */ +int +xchk_inode_is_allocated( + struct xfs_scrub *sc, + xfs_agino_t agino, + bool *inuse) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_perag *pag = sc->sa.pag; + xfs_ino_t ino; + struct xfs_inode *ip; + int error; + + /* caller must hold perag reference */ + if (pag == NULL) { + ASSERT(pag != NULL); + return -EINVAL; + } + + /* caller must have AGI buffer */ + if (sc->sa.agi_bp == NULL) { + ASSERT(sc->sa.agi_bp != NULL); + return -EINVAL; + } + + /* reject inode numbers outside existing AGs */ + ino = XFS_AGINO_TO_INO(sc->mp, pag->pag_agno, agino); + if (!xfs_verify_ino(mp, ino)) + return -EINVAL; + + error = -ENODATA; + rcu_read_lock(); + ip = radix_tree_lookup(&pag->pag_ici_root, agino); + if (!ip) { + /* cache miss */ + goto out_rcu; + } + + /* + * If the inode number doesn't match, the incore inode got reused + * during an RCU grace period and the radix tree hasn't been updated. + * This isn't the inode we want. + */ + spin_lock(&ip->i_flags_lock); + if (ip->i_ino != ino) + goto out_skip; + + trace_xchk_inode_is_allocated(ip); + + /* + * We have an incore inode that matches the inode we want, and the + * caller holds the perag structure and the AGI buffer. Let's check + * our assumptions below: + */ + +#ifdef DEBUG + /* + * (1) If the incore inode is live (i.e. referenced from the dcache), + * it will not be INEW, nor will it be in the inactivation or reclaim + * machinery. The ondisk inode had better be allocated. This is the + * most trivial case. + */ + if (!(ip->i_flags & (XFS_NEED_INACTIVE | XFS_INEW | XFS_IRECLAIMABLE | + XFS_INACTIVATING))) { + /* live inode */ + ASSERT(VFS_I(ip)->i_mode != 0); + } + + /* + * If the incore inode is INEW, there are several possibilities: + * + * (2) For a file that is being created, note that we allocate the + * ondisk inode before allocating, initializing, and adding the incore + * inode to the radix tree. + * + * (3) If the incore inode is being recycled, the inode has to be + * allocated because we don't allow freed inodes to be recycled. + * Recycling doesn't touch i_mode. + */ + if (ip->i_flags & XFS_INEW) { + /* created on disk already or recycling */ + ASSERT(VFS_I(ip)->i_mode != 0); + } + + /* + * (4) If the inode is queued for inactivation (NEED_INACTIVE) but + * inactivation has not started (!INACTIVATING), it is still allocated. + */ + if ((ip->i_flags & XFS_NEED_INACTIVE) && + !(ip->i_flags & XFS_INACTIVATING)) { + /* definitely before difree */ + ASSERT(VFS_I(ip)->i_mode != 0); + } +#endif + + /* + * If the incore inode is undergoing inactivation (INACTIVATING), there + * are two possibilities: + * + * (5) It is before the point where it would get freed ondisk, in which + * case i_mode is still nonzero. + * + * (6) It has already been freed, in which case i_mode is zero. + * + * We don't take the ILOCK here, but difree and dialloc update the AGI, + * and we've taken the AGI buffer lock, which prevents that from + * happening. + */ + + /* + * (7) Inodes undergoing inactivation (INACTIVATING) or queued for + * reclaim (IRECLAIMABLE) could be allocated or free. i_mode still + * reflects the ondisk state. + */ + + /* + * (8) If the inode is in IFLUSHING, it's safe to query i_mode because + * the flush code uses i_mode to format the ondisk inode. + */ + + /* + * (9) If the inode is in IRECLAIM and was reachable via the radix + * tree, it still has the same i_mode as it did before it entered + * reclaim. The inode object is still alive because we hold the RCU + * read lock. + */ + + *inuse = VFS_I(ip)->i_mode != 0; + error = 0; + +out_skip: + spin_unlock(&ip->i_flags_lock); +out_rcu: + rcu_read_unlock(); + return error; +} diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index 791235cd9b00..cabdc0e16838 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -88,10 +88,16 @@ int xchk_setup_xattr(struct xfs_scrub *sc); int xchk_setup_symlink(struct xfs_scrub *sc); int xchk_setup_parent(struct xfs_scrub *sc); #ifdef CONFIG_XFS_RT -int xchk_setup_rt(struct xfs_scrub *sc); +int xchk_setup_rtbitmap(struct xfs_scrub *sc); +int xchk_setup_rtsummary(struct xfs_scrub *sc); #else static inline int -xchk_setup_rt(struct xfs_scrub *sc) +xchk_setup_rtbitmap(struct xfs_scrub *sc) +{ + return -ENOENT; +} +static inline int +xchk_setup_rtsummary(struct xfs_scrub *sc) { return -ENOENT; } @@ -137,6 +143,12 @@ int xchk_count_rmap_ownedby_ag(struct xfs_scrub *sc, struct xfs_btree_cur *cur, int xchk_setup_ag_btree(struct xfs_scrub *sc, bool force_log); int xchk_iget_for_scrubbing(struct xfs_scrub *sc); int xchk_setup_inode_contents(struct xfs_scrub *sc, unsigned int resblks); +int xchk_install_live_inode(struct xfs_scrub *sc, struct xfs_inode *ip); + +void xchk_ilock(struct xfs_scrub *sc, unsigned int ilock_flags); +bool xchk_ilock_nowait(struct xfs_scrub *sc, unsigned int ilock_flags); +void xchk_iunlock(struct xfs_scrub *sc, unsigned int ilock_flags); + void xchk_buffer_recheck(struct xfs_scrub *sc, struct xfs_buf *bp); int xchk_iget(struct xfs_scrub *sc, xfs_ino_t inum, struct xfs_inode **ipp); @@ -155,9 +167,29 @@ static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm) XFS_SCRUB_OFLAG_XCORRUPT); } +#ifdef CONFIG_XFS_ONLINE_REPAIR +/* Decide if a repair is required. */ +static inline bool xchk_needs_repair(const struct xfs_scrub_metadata *sm) +{ + return sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | + XFS_SCRUB_OFLAG_XCORRUPT | + XFS_SCRUB_OFLAG_PREEN); +} +#else +# define xchk_needs_repair(sc) (false) +#endif /* CONFIG_XFS_ONLINE_REPAIR */ + int xchk_metadata_inode_forks(struct xfs_scrub *sc); /* + * Helper macros to allocate and format xfile description strings. + * Callers must kfree the pointer returned. + */ +#define xchk_xfile_descr(sc, fmt, ...) \ + kasprintf(XCHK_GFP_FLAGS, "XFS (%s): " fmt, \ + (sc)->mp->m_super->s_id, ##__VA_ARGS__) + +/* * Setting up a hook to wait for intents to drain is costly -- we have to take * the CPU hotplug lock and force an i-cache flush on all CPUs once to set it * up, and again to tear it down. These costs add up quickly, so we only want @@ -171,4 +203,7 @@ static inline bool xchk_need_intent_drain(struct xfs_scrub *sc) void xchk_fsgates_enable(struct xfs_scrub *sc, unsigned int scrub_fshooks); +int xchk_inode_is_allocated(struct xfs_scrub *sc, xfs_agino_t agino, + bool *inuse); + #endif /* __XFS_SCRUB_COMMON_H__ */ diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c index d2b2a1cb6533..5e2b09ed6e29 100644 --- a/fs/xfs/scrub/health.c +++ b/fs/xfs/scrub/health.c @@ -226,6 +226,16 @@ xchk_ag_btree_healthy_enough( return true; } + /* + * If we just repaired some AG metadata, sc->sick_mask will reflect all + * the per-AG metadata types that were repaired. Exclude these from + * the filesystem health query because we have not yet updated the + * health status and we want everything to be scanned. + */ + if ((sc->flags & XREP_ALREADY_FIXED) && + type_to_health_flag[sc->sm->sm_type].group == XHG_AG) + mask &= ~sc->sick_mask; + if (xfs_ag_has_sickness(pag, mask)) { sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL; return false; diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index 575f22a02ebe..fb7bbf47ae5d 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -328,8 +328,7 @@ xchk_iallocbt_check_cluster_ifree( goto out; } - error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp, fsino, - &ino_inuse); + error = xchk_inode_is_allocated(bs->sc, agino, &ino_inuse); if (error == -ENODATA) { /* Not cached, just read the disk buffer */ freemask_ok = irec_free ^ !!(dip->di_mode); diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 3e1e02e340a6..59d7912fb75f 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -32,15 +32,13 @@ xchk_prepare_iscrub( { int error; - sc->ilock_flags = XFS_IOLOCK_EXCL; - xfs_ilock(sc->ip, sc->ilock_flags); + xchk_ilock(sc, XFS_IOLOCK_EXCL); error = xchk_trans_alloc(sc, 0); if (error) return error; - sc->ilock_flags |= XFS_ILOCK_EXCL; - xfs_ilock(sc->ip, XFS_ILOCK_EXCL); + xchk_ilock(sc, XFS_ILOCK_EXCL); return 0; } @@ -83,7 +81,10 @@ xchk_setup_inode( /* We want to scan the opened inode, so lock it and exit. */ if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) { - sc->ip = ip_in; + error = xchk_install_live_inode(sc, ip_in); + if (error) + return error; + return xchk_prepare_iscrub(sc); } diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c index 58d5dfb7ea21..e6155d86f791 100644 --- a/fs/xfs/scrub/parent.c +++ b/fs/xfs/scrub/parent.c @@ -150,8 +150,8 @@ xchk_parent_validate( lock_mode = xchk_parent_ilock_dir(dp); if (!lock_mode) { - xfs_iunlock(sc->ip, XFS_ILOCK_EXCL); - xfs_ilock(sc->ip, XFS_ILOCK_EXCL); + xchk_iunlock(sc, XFS_ILOCK_EXCL); + xchk_ilock(sc, XFS_ILOCK_EXCL); error = -EAGAIN; goto out_rele; } diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index e6caa358cbda..5671c8153433 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -59,9 +59,12 @@ xchk_setup_quota( error = xchk_setup_fs(sc); if (error) return error; - sc->ip = xfs_quota_inode(sc->mp, dqtype); - xfs_ilock(sc->ip, XFS_ILOCK_EXCL); - sc->ilock_flags = XFS_ILOCK_EXCL; + + error = xchk_install_live_inode(sc, xfs_quota_inode(sc->mp, dqtype)); + if (error) + return error; + + xchk_ilock(sc, XFS_ILOCK_EXCL); return 0; } @@ -235,13 +238,11 @@ xchk_quota( * data fork we have to drop ILOCK_EXCL to use the regular dquot * functions. */ - xfs_iunlock(sc->ip, sc->ilock_flags); - sc->ilock_flags = 0; + xchk_iunlock(sc, sc->ilock_flags); sqi.sc = sc; sqi.last_id = 0; error = xfs_qm_dqiterate(mp, dqtype, xchk_quota_item, &sqi); - sc->ilock_flags = XFS_ILOCK_EXCL; - xfs_ilock(sc->ip, sc->ilock_flags); + xchk_ilock(sc, XFS_ILOCK_EXCL); if (error == -ECANCELED) error = 0; if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c new file mode 100644 index 000000000000..86a62420e02c --- /dev/null +++ b/fs/xfs/scrub/reap.c @@ -0,0 +1,498 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_refcount_btree.h" +#include "xfs_extent_busy.h" +#include "xfs_ag.h" +#include "xfs_ag_resv.h" +#include "xfs_quota.h" +#include "xfs_qm.h" +#include "xfs_bmap.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_attr.h" +#include "xfs_attr_remote.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/reap.h" + +/* + * Disposal of Blocks from Old Metadata + * + * Now that we've constructed a new btree to replace the damaged one, we want + * to dispose of the blocks that (we think) the old btree was using. + * Previously, we used the rmapbt to collect the extents (bitmap) with the + * rmap owner corresponding to the tree we rebuilt, collected extents for any + * blocks with the same rmap owner that are owned by another data structure + * (sublist), and subtracted sublist from bitmap. In theory the extents + * remaining in bitmap are the old btree's blocks. + * + * Unfortunately, it's possible that the btree was crosslinked with other + * blocks on disk. The rmap data can tell us if there are multiple owners, so + * if the rmapbt says there is an owner of this block other than @oinfo, then + * the block is crosslinked. Remove the reverse mapping and continue. + * + * If there is one rmap record, we can free the block, which removes the + * reverse mapping but doesn't add the block to the free space. Our repair + * strategy is to hope the other metadata objects crosslinked on this block + * will be rebuilt (atop different blocks), thereby removing all the cross + * links. + * + * If there are no rmap records at all, we also free the block. If the btree + * being rebuilt lives in the free space (bnobt/cntbt/rmapbt) then there isn't + * supposed to be a rmap record and everything is ok. For other btrees there + * had to have been an rmap entry for the block to have ended up on @bitmap, + * so if it's gone now there's something wrong and the fs will shut down. + * + * Note: If there are multiple rmap records with only the same rmap owner as + * the btree we're trying to rebuild and the block is indeed owned by another + * data structure with the same rmap owner, then the block will be in sublist + * and therefore doesn't need disposal. If there are multiple rmap records + * with only the same rmap owner but the block is not owned by something with + * the same rmap owner, the block will be freed. + * + * The caller is responsible for locking the AG headers for the entire rebuild + * operation so that nothing else can sneak in and change the AG state while + * we're not looking. We must also invalidate any buffers associated with + * @bitmap. + */ + +/* Information about reaping extents after a repair. */ +struct xreap_state { + struct xfs_scrub *sc; + + /* Reverse mapping owner and metadata reservation type. */ + const struct xfs_owner_info *oinfo; + enum xfs_ag_resv_type resv; + + /* If true, roll the transaction before reaping the next extent. */ + bool force_roll; + + /* Number of deferred reaps attached to the current transaction. */ + unsigned int deferred; + + /* Number of invalidated buffers logged to the current transaction. */ + unsigned int invalidated; + + /* Number of deferred reaps queued during the whole reap sequence. */ + unsigned long long total_deferred; +}; + +/* Put a block back on the AGFL. */ +STATIC int +xreap_put_freelist( + struct xfs_scrub *sc, + xfs_agblock_t agbno) +{ + struct xfs_buf *agfl_bp; + int error; + + /* Make sure there's space on the freelist. */ + error = xrep_fix_freelist(sc, true); + if (error) + return error; + + /* + * Since we're "freeing" a lost block onto the AGFL, we have to + * create an rmap for the block prior to merging it or else other + * parts will break. + */ + error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, 1, + &XFS_RMAP_OINFO_AG); + if (error) + return error; + + /* Put the block on the AGFL. */ + error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp); + if (error) + return error; + + error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp, + agfl_bp, agbno, 0); + if (error) + return error; + xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1, + XFS_EXTENT_BUSY_SKIP_DISCARD); + + return 0; +} + +/* Are there any uncommitted reap operations? */ +static inline bool xreap_dirty(const struct xreap_state *rs) +{ + if (rs->force_roll) + return true; + if (rs->deferred) + return true; + if (rs->invalidated) + return true; + if (rs->total_deferred) + return true; + return false; +} + +#define XREAP_MAX_BINVAL (2048) + +/* + * Decide if we want to roll the transaction after reaping an extent. We don't + * want to overrun the transaction reservation, so we prohibit more than + * 128 EFIs per transaction. For the same reason, we limit the number + * of buffer invalidations to 2048. + */ +static inline bool xreap_want_roll(const struct xreap_state *rs) +{ + if (rs->force_roll) + return true; + if (rs->deferred > XREP_MAX_ITRUNCATE_EFIS) + return true; + if (rs->invalidated > XREAP_MAX_BINVAL) + return true; + return false; +} + +static inline void xreap_reset(struct xreap_state *rs) +{ + rs->total_deferred += rs->deferred; + rs->deferred = 0; + rs->invalidated = 0; + rs->force_roll = false; +} + +#define XREAP_MAX_DEFER_CHAIN (2048) + +/* + * Decide if we want to finish the deferred ops that are attached to the scrub + * transaction. We don't want to queue huge chains of deferred ops because + * that can consume a lot of log space and kernel memory. Hence we trigger a + * xfs_defer_finish if there are more than 2048 deferred reap operations or the + * caller did some real work. + */ +static inline bool +xreap_want_defer_finish(const struct xreap_state *rs) +{ + if (rs->force_roll) + return true; + if (rs->total_deferred > XREAP_MAX_DEFER_CHAIN) + return true; + return false; +} + +static inline void xreap_defer_finish_reset(struct xreap_state *rs) +{ + rs->total_deferred = 0; + rs->deferred = 0; + rs->invalidated = 0; + rs->force_roll = false; +} + +/* Try to invalidate the incore buffers for an extent that we're freeing. */ +STATIC void +xreap_agextent_binval( + struct xreap_state *rs, + xfs_agblock_t agbno, + xfs_extlen_t *aglenp) +{ + struct xfs_scrub *sc = rs->sc; + struct xfs_perag *pag = sc->sa.pag; + struct xfs_mount *mp = sc->mp; + xfs_agnumber_t agno = sc->sa.pag->pag_agno; + xfs_agblock_t agbno_next = agbno + *aglenp; + xfs_agblock_t bno = agbno; + + /* + * Avoid invalidating AG headers and post-EOFS blocks because we never + * own those. + */ + if (!xfs_verify_agbno(pag, agbno) || + !xfs_verify_agbno(pag, agbno_next - 1)) + return; + + /* + * If there are incore buffers for these blocks, invalidate them. We + * assume that the lack of any other known owners means that the buffer + * can be locked without risk of deadlocking. The buffer cache cannot + * detect aliasing, so employ nested loops to scan for incore buffers + * of any plausible size. + */ + while (bno < agbno_next) { + xfs_agblock_t fsbcount; + xfs_agblock_t max_fsbs; + + /* + * Max buffer size is the max remote xattr buffer size, which + * is one fs block larger than 64k. + */ + max_fsbs = min_t(xfs_agblock_t, agbno_next - bno, + xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX)); + + for (fsbcount = 1; fsbcount < max_fsbs; fsbcount++) { + struct xfs_buf *bp = NULL; + xfs_daddr_t daddr; + int error; + + daddr = XFS_AGB_TO_DADDR(mp, agno, bno); + error = xfs_buf_incore(mp->m_ddev_targp, daddr, + XFS_FSB_TO_BB(mp, fsbcount), + XBF_LIVESCAN, &bp); + if (error) + continue; + + xfs_trans_bjoin(sc->tp, bp); + xfs_trans_binval(sc->tp, bp); + rs->invalidated++; + + /* + * Stop invalidating if we've hit the limit; we should + * still have enough reservation left to free however + * far we've gotten. + */ + if (rs->invalidated > XREAP_MAX_BINVAL) { + *aglenp -= agbno_next - bno; + goto out; + } + } + + bno++; + } + +out: + trace_xreap_agextent_binval(sc->sa.pag, agbno, *aglenp); +} + +/* + * Figure out the longest run of blocks that we can dispose of with a single + * call. Cross-linked blocks should have their reverse mappings removed, but + * single-owner extents can be freed. AGFL blocks can only be put back one at + * a time. + */ +STATIC int +xreap_agextent_select( + struct xreap_state *rs, + xfs_agblock_t agbno, + xfs_agblock_t agbno_next, + bool *crosslinked, + xfs_extlen_t *aglenp) +{ + struct xfs_scrub *sc = rs->sc; + struct xfs_btree_cur *cur; + xfs_agblock_t bno = agbno + 1; + xfs_extlen_t len = 1; + int error; + + /* + * Determine if there are any other rmap records covering the first + * block of this extent. If so, the block is crosslinked. + */ + cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, + sc->sa.pag); + error = xfs_rmap_has_other_keys(cur, agbno, 1, rs->oinfo, + crosslinked); + if (error) + goto out_cur; + + /* AGFL blocks can only be deal with one at a time. */ + if (rs->resv == XFS_AG_RESV_AGFL) + goto out_found; + + /* + * Figure out how many of the subsequent blocks have the same crosslink + * status. + */ + while (bno < agbno_next) { + bool also_crosslinked; + + error = xfs_rmap_has_other_keys(cur, bno, 1, rs->oinfo, + &also_crosslinked); + if (error) + goto out_cur; + + if (*crosslinked != also_crosslinked) + break; + + len++; + bno++; + } + +out_found: + *aglenp = len; + trace_xreap_agextent_select(sc->sa.pag, agbno, len, *crosslinked); +out_cur: + xfs_btree_del_cursor(cur, error); + return error; +} + +/* + * Dispose of as much of the beginning of this AG extent as possible. The + * number of blocks disposed of will be returned in @aglenp. + */ +STATIC int +xreap_agextent_iter( + struct xreap_state *rs, + xfs_agblock_t agbno, + xfs_extlen_t *aglenp, + bool crosslinked) +{ + struct xfs_scrub *sc = rs->sc; + xfs_fsblock_t fsbno; + int error = 0; + + fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, agbno); + + /* + * If there are other rmappings, this block is cross linked and must + * not be freed. Remove the reverse mapping and move on. Otherwise, + * we were the only owner of the block, so free the extent, which will + * also remove the rmap. + * + * XXX: XFS doesn't support detecting the case where a single block + * metadata structure is crosslinked with a multi-block structure + * because the buffer cache doesn't detect aliasing problems, so we + * can't fix 100% of crosslinking problems (yet). The verifiers will + * blow on writeout, the filesystem will shut down, and the admin gets + * to run xfs_repair. + */ + if (crosslinked) { + trace_xreap_dispose_unmap_extent(sc->sa.pag, agbno, *aglenp); + + rs->force_roll = true; + return xfs_rmap_free(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, + *aglenp, rs->oinfo); + } + + trace_xreap_dispose_free_extent(sc->sa.pag, agbno, *aglenp); + + /* + * Invalidate as many buffers as we can, starting at agbno. If this + * function sets *aglenp to zero, the transaction is full of logged + * buffer invalidations, so we need to return early so that we can + * roll and retry. + */ + xreap_agextent_binval(rs, agbno, aglenp); + if (*aglenp == 0) { + ASSERT(xreap_want_roll(rs)); + return 0; + } + + /* Put blocks back on the AGFL one at a time. */ + if (rs->resv == XFS_AG_RESV_AGFL) { + ASSERT(*aglenp == 1); + error = xreap_put_freelist(sc, agbno); + if (error) + return error; + + rs->force_roll = true; + return 0; + } + + /* + * Use deferred frees to get rid of the old btree blocks to try to + * minimize the window in which we could crash and lose the old blocks. + */ + error = __xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo, + rs->resv, true); + if (error) + return error; + + rs->deferred++; + return 0; +} + +/* + * Break an AG metadata extent into sub-extents by fate (crosslinked, not + * crosslinked), and dispose of each sub-extent separately. + */ +STATIC int +xreap_agmeta_extent( + uint64_t fsbno, + uint64_t len, + void *priv) +{ + struct xreap_state *rs = priv; + struct xfs_scrub *sc = rs->sc; + xfs_agblock_t agbno = fsbno; + xfs_agblock_t agbno_next = agbno + len; + int error = 0; + + ASSERT(len <= XFS_MAX_BMBT_EXTLEN); + ASSERT(sc->ip == NULL); + + while (agbno < agbno_next) { + xfs_extlen_t aglen; + bool crosslinked; + + error = xreap_agextent_select(rs, agbno, agbno_next, + &crosslinked, &aglen); + if (error) + return error; + + error = xreap_agextent_iter(rs, agbno, &aglen, crosslinked); + if (error) + return error; + + if (xreap_want_defer_finish(rs)) { + error = xrep_defer_finish(sc); + if (error) + return error; + xreap_defer_finish_reset(rs); + } else if (xreap_want_roll(rs)) { + error = xrep_roll_ag_trans(sc); + if (error) + return error; + xreap_reset(rs); + } + + agbno += aglen; + } + + return 0; +} + +/* Dispose of every block of every AG metadata extent in the bitmap. */ +int +xrep_reap_agblocks( + struct xfs_scrub *sc, + struct xagb_bitmap *bitmap, + const struct xfs_owner_info *oinfo, + enum xfs_ag_resv_type type) +{ + struct xreap_state rs = { + .sc = sc, + .oinfo = oinfo, + .resv = type, + }; + int error; + + ASSERT(xfs_has_rmapbt(sc->mp)); + ASSERT(sc->ip == NULL); + + error = xagb_bitmap_walk(bitmap, xreap_agmeta_extent, &rs); + if (error) + return error; + + if (xreap_dirty(&rs)) + return xrep_defer_finish(sc); + + return 0; +} diff --git a/fs/xfs/scrub/reap.h b/fs/xfs/scrub/reap.h new file mode 100644 index 000000000000..fe24626af164 --- /dev/null +++ b/fs/xfs/scrub/reap.h @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_REAP_H__ +#define __XFS_SCRUB_REAP_H__ + +int xrep_reap_agblocks(struct xfs_scrub *sc, struct xagb_bitmap *bitmap, + const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type); + +#endif /* __XFS_SCRUB_REAP_H__ */ diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index ac6d8803e660..1b8b5439f2d7 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -26,11 +26,13 @@ #include "xfs_ag_resv.h" #include "xfs_quota.h" #include "xfs_qm.h" +#include "xfs_defer.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" #include "scrub/repair.h" #include "scrub/bitmap.h" +#include "scrub/stats.h" /* * Attempt to repair some metadata, if the metadata is corrupt and userspace @@ -39,8 +41,10 @@ */ int xrep_attempt( - struct xfs_scrub *sc) + struct xfs_scrub *sc, + struct xchk_stats_run *run) { + u64 repair_start; int error = 0; trace_xrep_attempt(XFS_I(file_inode(sc->file)), sc->sm, error); @@ -49,8 +53,11 @@ xrep_attempt( /* Repair whatever's broken. */ ASSERT(sc->ops->repair); + run->repair_attempted = true; + repair_start = xchk_stats_now(); error = sc->ops->repair(sc); trace_xrep_done(XFS_I(file_inode(sc->file)), sc->sm, error); + run->repair_ns += xchk_stats_elapsed_ns(repair_start); switch (error) { case 0: /* @@ -59,14 +66,17 @@ xrep_attempt( */ sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; sc->flags |= XREP_ALREADY_FIXED; + run->repair_succeeded = true; return -EAGAIN; case -ECHRNG: sc->flags |= XCHK_NEED_DRAIN; + run->retries++; return -EAGAIN; case -EDEADLOCK: /* Tell the caller to try again having grabbed all the locks. */ if (!(sc->flags & XCHK_TRY_HARDER)) { sc->flags |= XCHK_TRY_HARDER; + run->retries++; return -EAGAIN; } /* @@ -166,6 +176,56 @@ xrep_roll_ag_trans( return 0; } +/* Finish all deferred work attached to the repair transaction. */ +int +xrep_defer_finish( + struct xfs_scrub *sc) +{ + int error; + + /* + * Keep the AG header buffers locked while we complete deferred work + * items. Ensure that both AG buffers are dirty and held when we roll + * the transaction so that they move forward in the log without losing + * the bli (and hence the bli type) when the transaction commits. + * + * Normal code would never hold clean buffers across a roll, but repair + * needs both buffers to maintain a total lock on the AG. + */ + if (sc->sa.agi_bp) { + xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp, XFS_AGI_MAGICNUM); + xfs_trans_bhold(sc->tp, sc->sa.agi_bp); + } + + if (sc->sa.agf_bp) { + xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_MAGICNUM); + xfs_trans_bhold(sc->tp, sc->sa.agf_bp); + } + + /* + * Finish all deferred work items. We still hold the AG header buffers + * locked regardless of whether or not that succeeds. On failure, the + * buffers will be released during teardown on our way out of the + * kernel. If successful, join the buffers to the new transaction + * and move on. + */ + error = xfs_defer_finish(&sc->tp); + if (error) + return error; + + /* + * Release the hold that we set above because defer_finish won't do + * that for us. The defer roll code redirties held buffers after each + * roll, so the AG header buffers should be ready for logging. + */ + if (sc->sa.agi_bp) + xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp); + if (sc->sa.agf_bp) + xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp); + + return 0; +} + /* * Does the given AG have enough space to rebuild a btree? Neither AG * reservation can be critical, and we must have enough space (factoring @@ -297,89 +357,6 @@ xrep_calc_ag_resblks( return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz)); } -/* Allocate a block in an AG. */ -int -xrep_alloc_ag_block( - struct xfs_scrub *sc, - const struct xfs_owner_info *oinfo, - xfs_fsblock_t *fsbno, - enum xfs_ag_resv_type resv) -{ - struct xfs_alloc_arg args = {0}; - xfs_agblock_t bno; - int error; - - switch (resv) { - case XFS_AG_RESV_AGFL: - case XFS_AG_RESV_RMAPBT: - error = xfs_alloc_get_freelist(sc->sa.pag, sc->tp, - sc->sa.agf_bp, &bno, 1); - if (error) - return error; - if (bno == NULLAGBLOCK) - return -ENOSPC; - xfs_extent_busy_reuse(sc->mp, sc->sa.pag, bno, 1, false); - *fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, bno); - if (resv == XFS_AG_RESV_RMAPBT) - xfs_ag_resv_rmapbt_alloc(sc->mp, sc->sa.pag->pag_agno); - return 0; - default: - break; - } - - args.tp = sc->tp; - args.mp = sc->mp; - args.pag = sc->sa.pag; - args.oinfo = *oinfo; - args.minlen = 1; - args.maxlen = 1; - args.prod = 1; - args.resv = resv; - - error = xfs_alloc_vextent_this_ag(&args, sc->sa.pag->pag_agno); - if (error) - return error; - if (args.fsbno == NULLFSBLOCK) - return -ENOSPC; - ASSERT(args.len == 1); - *fsbno = args.fsbno; - - return 0; -} - -/* Initialize a new AG btree root block with zero entries. */ -int -xrep_init_btblock( - struct xfs_scrub *sc, - xfs_fsblock_t fsb, - struct xfs_buf **bpp, - xfs_btnum_t btnum, - const struct xfs_buf_ops *ops) -{ - struct xfs_trans *tp = sc->tp; - struct xfs_mount *mp = sc->mp; - struct xfs_buf *bp; - int error; - - trace_xrep_init_btblock(mp, XFS_FSB_TO_AGNO(mp, fsb), - XFS_FSB_TO_AGBNO(mp, fsb), btnum); - - ASSERT(XFS_FSB_TO_AGNO(mp, fsb) == sc->sa.pag->pag_agno); - error = xfs_trans_get_buf(tp, mp->m_ddev_targp, - XFS_FSB_TO_DADDR(mp, fsb), XFS_FSB_TO_BB(mp, 1), 0, - &bp); - if (error) - return error; - xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); - xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.pag->pag_agno); - xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF); - xfs_trans_log_buf(tp, bp, 0, BBTOB(bp->b_length) - 1); - bp->b_ops = ops; - *bpp = bp; - - return 0; -} - /* * Reconstructing per-AG Btrees * @@ -404,91 +381,8 @@ xrep_init_btblock( * sublist. As with the other btrees we subtract sublist from bitmap, and the * result (since the rmapbt lives in the free space) are the blocks from the * old rmapbt. - * - * Disposal of Blocks from Old per-AG Btrees - * - * Now that we've constructed a new btree to replace the damaged one, we want - * to dispose of the blocks that (we think) the old btree was using. - * Previously, we used the rmapbt to collect the extents (bitmap) with the - * rmap owner corresponding to the tree we rebuilt, collected extents for any - * blocks with the same rmap owner that are owned by another data structure - * (sublist), and subtracted sublist from bitmap. In theory the extents - * remaining in bitmap are the old btree's blocks. - * - * Unfortunately, it's possible that the btree was crosslinked with other - * blocks on disk. The rmap data can tell us if there are multiple owners, so - * if the rmapbt says there is an owner of this block other than @oinfo, then - * the block is crosslinked. Remove the reverse mapping and continue. - * - * If there is one rmap record, we can free the block, which removes the - * reverse mapping but doesn't add the block to the free space. Our repair - * strategy is to hope the other metadata objects crosslinked on this block - * will be rebuilt (atop different blocks), thereby removing all the cross - * links. - * - * If there are no rmap records at all, we also free the block. If the btree - * being rebuilt lives in the free space (bnobt/cntbt/rmapbt) then there isn't - * supposed to be a rmap record and everything is ok. For other btrees there - * had to have been an rmap entry for the block to have ended up on @bitmap, - * so if it's gone now there's something wrong and the fs will shut down. - * - * Note: If there are multiple rmap records with only the same rmap owner as - * the btree we're trying to rebuild and the block is indeed owned by another - * data structure with the same rmap owner, then the block will be in sublist - * and therefore doesn't need disposal. If there are multiple rmap records - * with only the same rmap owner but the block is not owned by something with - * the same rmap owner, the block will be freed. - * - * The caller is responsible for locking the AG headers for the entire rebuild - * operation so that nothing else can sneak in and change the AG state while - * we're not looking. We also assume that the caller already invalidated any - * buffers associated with @bitmap. */ -static int -xrep_invalidate_block( - uint64_t fsbno, - void *priv) -{ - struct xfs_scrub *sc = priv; - struct xfs_buf *bp; - int error; - - /* Skip AG headers and post-EOFS blocks */ - if (!xfs_verify_fsbno(sc->mp, fsbno)) - return 0; - - error = xfs_buf_incore(sc->mp->m_ddev_targp, - XFS_FSB_TO_DADDR(sc->mp, fsbno), - XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK, &bp); - if (error) - return 0; - - xfs_trans_bjoin(sc->tp, bp); - xfs_trans_binval(sc->tp, bp); - return 0; -} - -/* - * Invalidate buffers for per-AG btree blocks we're dumping. This function - * is not intended for use with file data repairs; we have bunmapi for that. - */ -int -xrep_invalidate_blocks( - struct xfs_scrub *sc, - struct xbitmap *bitmap) -{ - /* - * For each block in each extent, see if there's an incore buffer for - * exactly that block; if so, invalidate it. The buffer cache only - * lets us look for one buffer at a time, so we have to look one block - * at a time. Avoid invalidating AG headers and post-EOFS blocks - * because we never own those; and if we can't TRYLOCK the buffer we - * assume it's owned by someone else. - */ - return xbitmap_walk_bits(bitmap, xrep_invalidate_block, sc); -} - /* Ensure the freelist is the correct size. */ int xrep_fix_freelist( @@ -507,155 +401,6 @@ xrep_fix_freelist( can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK); } -/* Information about reaping extents after a repair. */ -struct xrep_reap_state { - struct xfs_scrub *sc; - - /* Reverse mapping owner and metadata reservation type. */ - const struct xfs_owner_info *oinfo; - enum xfs_ag_resv_type resv; -}; - -/* - * Put a block back on the AGFL. - */ -STATIC int -xrep_put_freelist( - struct xfs_scrub *sc, - xfs_agblock_t agbno) -{ - struct xfs_buf *agfl_bp; - int error; - - /* Make sure there's space on the freelist. */ - error = xrep_fix_freelist(sc, true); - if (error) - return error; - - /* - * Since we're "freeing" a lost block onto the AGFL, we have to - * create an rmap for the block prior to merging it or else other - * parts will break. - */ - error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, 1, - &XFS_RMAP_OINFO_AG); - if (error) - return error; - - /* Put the block on the AGFL. */ - error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp); - if (error) - return error; - - error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp, - agfl_bp, agbno, 0); - if (error) - return error; - xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1, - XFS_EXTENT_BUSY_SKIP_DISCARD); - - return 0; -} - -/* Dispose of a single block. */ -STATIC int -xrep_reap_block( - uint64_t fsbno, - void *priv) -{ - struct xrep_reap_state *rs = priv; - struct xfs_scrub *sc = rs->sc; - struct xfs_btree_cur *cur; - struct xfs_buf *agf_bp = NULL; - xfs_agblock_t agbno; - bool has_other_rmap; - int error; - - ASSERT(sc->ip != NULL || - XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno); - trace_xrep_dispose_btree_extent(sc->mp, - XFS_FSB_TO_AGNO(sc->mp, fsbno), - XFS_FSB_TO_AGBNO(sc->mp, fsbno), 1); - - agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno); - ASSERT(XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno); - - /* - * If we are repairing per-inode metadata, we need to read in the AGF - * buffer. Otherwise, we're repairing a per-AG structure, so reuse - * the AGF buffer that the setup functions already grabbed. - */ - if (sc->ip) { - error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &agf_bp); - if (error) - return error; - } else { - agf_bp = sc->sa.agf_bp; - } - cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf_bp, sc->sa.pag); - - /* Can we find any other rmappings? */ - error = xfs_rmap_has_other_keys(cur, agbno, 1, rs->oinfo, - &has_other_rmap); - xfs_btree_del_cursor(cur, error); - if (error) - goto out_free; - - /* - * If there are other rmappings, this block is cross linked and must - * not be freed. Remove the reverse mapping and move on. Otherwise, - * we were the only owner of the block, so free the extent, which will - * also remove the rmap. - * - * XXX: XFS doesn't support detecting the case where a single block - * metadata structure is crosslinked with a multi-block structure - * because the buffer cache doesn't detect aliasing problems, so we - * can't fix 100% of crosslinking problems (yet). The verifiers will - * blow on writeout, the filesystem will shut down, and the admin gets - * to run xfs_repair. - */ - if (has_other_rmap) - error = xfs_rmap_free(sc->tp, agf_bp, sc->sa.pag, agbno, - 1, rs->oinfo); - else if (rs->resv == XFS_AG_RESV_AGFL) - error = xrep_put_freelist(sc, agbno); - else - error = xfs_free_extent(sc->tp, sc->sa.pag, agbno, 1, rs->oinfo, - rs->resv); - if (agf_bp != sc->sa.agf_bp) - xfs_trans_brelse(sc->tp, agf_bp); - if (error) - return error; - - if (sc->ip) - return xfs_trans_roll_inode(&sc->tp, sc->ip); - return xrep_roll_ag_trans(sc); - -out_free: - if (agf_bp != sc->sa.agf_bp) - xfs_trans_brelse(sc->tp, agf_bp); - return error; -} - -/* Dispose of every block of every extent in the bitmap. */ -int -xrep_reap_extents( - struct xfs_scrub *sc, - struct xbitmap *bitmap, - const struct xfs_owner_info *oinfo, - enum xfs_ag_resv_type type) -{ - struct xrep_reap_state rs = { - .sc = sc, - .oinfo = oinfo, - .resv = type, - }; - - ASSERT(xfs_has_rmapbt(sc->mp)); - - return xbitmap_walk_bits(bitmap, xrep_reap_block, &rs); -} - /* * Finding per-AG Btree Roots for AGF/AGI Reconstruction * diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index dce791c679ee..60d2a9ae5f2e 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -8,6 +8,8 @@ #include "xfs_quota_defs.h" +struct xchk_stats_run; + static inline int xrep_notsupported(struct xfs_scrub *sc) { return -EOPNOTSUPP; @@ -15,28 +17,28 @@ static inline int xrep_notsupported(struct xfs_scrub *sc) #ifdef CONFIG_XFS_ONLINE_REPAIR +/* + * This is the maximum number of deferred extent freeing item extents (EFIs) + * that we'll attach to a transaction without rolling the transaction to avoid + * overrunning a tr_itruncate reservation. + */ +#define XREP_MAX_ITRUNCATE_EFIS (128) + + /* Repair helpers */ -int xrep_attempt(struct xfs_scrub *sc); +int xrep_attempt(struct xfs_scrub *sc, struct xchk_stats_run *run); void xrep_failure(struct xfs_mount *mp); int xrep_roll_ag_trans(struct xfs_scrub *sc); +int xrep_defer_finish(struct xfs_scrub *sc); bool xrep_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks, enum xfs_ag_resv_type type); xfs_extlen_t xrep_calc_ag_resblks(struct xfs_scrub *sc); -int xrep_alloc_ag_block(struct xfs_scrub *sc, - const struct xfs_owner_info *oinfo, xfs_fsblock_t *fsbno, - enum xfs_ag_resv_type resv); -int xrep_init_btblock(struct xfs_scrub *sc, xfs_fsblock_t fsb, - struct xfs_buf **bpp, xfs_btnum_t btnum, - const struct xfs_buf_ops *ops); struct xbitmap; struct xagb_bitmap; int xrep_fix_freelist(struct xfs_scrub *sc, bool can_shrink); -int xrep_invalidate_blocks(struct xfs_scrub *sc, struct xbitmap *btlist); -int xrep_reap_extents(struct xfs_scrub *sc, struct xbitmap *exlist, - const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type); struct xrep_find_ag_btree { /* in: rmap owner of the btree we're looking for */ @@ -70,7 +72,8 @@ int xrep_agi(struct xfs_scrub *sc); static inline int xrep_attempt( - struct xfs_scrub *sc) + struct xfs_scrub *sc, + struct xchk_stats_run *run) { return -EOPNOTSUPP; } diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index e7dace7b4be8..008ddb599e13 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -19,19 +19,20 @@ /* Set us up with the realtime metadata locked. */ int -xchk_setup_rt( +xchk_setup_rtbitmap( struct xfs_scrub *sc) { int error; - error = xchk_setup_fs(sc); + error = xchk_trans_alloc(sc, 0); if (error) return error; - sc->ilock_flags = XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP; - sc->ip = sc->mp->m_rbmip; - xfs_ilock(sc->ip, sc->ilock_flags); + error = xchk_install_live_inode(sc, sc->mp->m_rbmip); + if (error) + return error; + xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP); return 0; } @@ -123,43 +124,6 @@ out: return error; } -/* Scrub the realtime summary. */ -int -xchk_rtsummary( - struct xfs_scrub *sc) -{ - struct xfs_inode *rsumip = sc->mp->m_rsumip; - struct xfs_inode *old_ip = sc->ip; - uint old_ilock_flags = sc->ilock_flags; - int error = 0; - - /* - * We ILOCK'd the rt bitmap ip in the setup routine, now lock the - * rt summary ip in compliance with the rt inode locking rules. - * - * Since we switch sc->ip to rsumip we have to save the old ilock - * flags so that we don't mix up the inode state that @sc tracks. - */ - sc->ip = rsumip; - sc->ilock_flags = XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM; - xfs_ilock(sc->ip, sc->ilock_flags); - - /* Invoke the fork scrubber. */ - error = xchk_metadata_inode_forks(sc); - if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) - goto out; - - /* XXX: implement this some day */ - xchk_set_incomplete(sc); -out: - /* Switch back to the rtbitmap inode and lock flags. */ - xfs_iunlock(sc->ip, sc->ilock_flags); - sc->ilock_flags = old_ilock_flags; - sc->ip = old_ip; - return error; -} - - /* xref check that the extent is not free in the rtbitmap */ void xchk_xref_is_used_rt_space( diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c new file mode 100644 index 000000000000..437ed9acbb27 --- /dev/null +++ b/fs/xfs/scrub/rtsummary.c @@ -0,0 +1,264 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_inode.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_rtalloc.h" +#include "xfs_bit.h" +#include "xfs_bmap.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/xfile.h" + +/* + * Realtime Summary + * ================ + * + * We check the realtime summary by scanning the realtime bitmap file to create + * a new summary file incore, and then we compare the computed version against + * the ondisk version. We use the 'xfile' functionality to store this + * (potentially large) amount of data in pageable memory. + */ + +/* Set us up to check the rtsummary file. */ +int +xchk_setup_rtsummary( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + char *descr; + int error; + + /* + * Create an xfile to construct a new rtsummary file. The xfile allows + * us to avoid pinning kernel memory for this purpose. + */ + descr = xchk_xfile_descr(sc, "realtime summary file"); + error = xfile_create(descr, mp->m_rsumsize, &sc->xfile); + kfree(descr); + if (error) + return error; + + error = xchk_trans_alloc(sc, 0); + if (error) + return error; + + /* Allocate a memory buffer for the summary comparison. */ + sc->buf = kvmalloc(mp->m_sb.sb_blocksize, XCHK_GFP_FLAGS); + if (!sc->buf) + return -ENOMEM; + + error = xchk_install_live_inode(sc, mp->m_rsumip); + if (error) + return error; + + /* + * Locking order requires us to take the rtbitmap first. We must be + * careful to unlock it ourselves when we are done with the rtbitmap + * file since the scrub infrastructure won't do that for us. Only + * then we can lock the rtsummary inode. + */ + xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); + xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM); + return 0; +} + +/* Helper functions to record suminfo words in an xfile. */ + +typedef unsigned int xchk_rtsumoff_t; + +static inline int +xfsum_load( + struct xfs_scrub *sc, + xchk_rtsumoff_t sumoff, + xfs_suminfo_t *info) +{ + return xfile_obj_load(sc->xfile, info, sizeof(xfs_suminfo_t), + sumoff << XFS_WORDLOG); +} + +static inline int +xfsum_store( + struct xfs_scrub *sc, + xchk_rtsumoff_t sumoff, + const xfs_suminfo_t info) +{ + return xfile_obj_store(sc->xfile, &info, sizeof(xfs_suminfo_t), + sumoff << XFS_WORDLOG); +} + +static inline int +xfsum_copyout( + struct xfs_scrub *sc, + xchk_rtsumoff_t sumoff, + xfs_suminfo_t *info, + unsigned int nr_words) +{ + return xfile_obj_load(sc->xfile, info, nr_words << XFS_WORDLOG, + sumoff << XFS_WORDLOG); +} + +/* Update the summary file to reflect the free extent that we've accumulated. */ +STATIC int +xchk_rtsum_record_free( + struct xfs_mount *mp, + struct xfs_trans *tp, + const struct xfs_rtalloc_rec *rec, + void *priv) +{ + struct xfs_scrub *sc = priv; + xfs_fileoff_t rbmoff; + xfs_rtblock_t rtbno; + xfs_filblks_t rtlen; + xchk_rtsumoff_t offs; + unsigned int lenlog; + xfs_suminfo_t v = 0; + int error = 0; + + if (xchk_should_terminate(sc, &error)) + return error; + + /* Compute the relevant location in the rtsum file. */ + rbmoff = XFS_BITTOBLOCK(mp, rec->ar_startext); + lenlog = XFS_RTBLOCKLOG(rec->ar_extcount); + offs = XFS_SUMOFFS(mp, lenlog, rbmoff); + + rtbno = rec->ar_startext * mp->m_sb.sb_rextsize; + rtlen = rec->ar_extcount * mp->m_sb.sb_rextsize; + + if (!xfs_verify_rtext(mp, rtbno, rtlen)) { + xchk_ino_xref_set_corrupt(sc, mp->m_rbmip->i_ino); + return -EFSCORRUPTED; + } + + /* Bump the summary count. */ + error = xfsum_load(sc, offs, &v); + if (error) + return error; + + v++; + trace_xchk_rtsum_record_free(mp, rec->ar_startext, rec->ar_extcount, + lenlog, offs, v); + + return xfsum_store(sc, offs, v); +} + +/* Compute the realtime summary from the realtime bitmap. */ +STATIC int +xchk_rtsum_compute( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + unsigned long long rtbmp_bytes; + + /* If the bitmap size doesn't match the computed size, bail. */ + rtbmp_bytes = howmany_64(mp->m_sb.sb_rextents, NBBY); + if (roundup_64(rtbmp_bytes, mp->m_sb.sb_blocksize) != + mp->m_rbmip->i_disk_size) + return -EFSCORRUPTED; + + return xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_rtsum_record_free, + sc); +} + +/* Compare the rtsummary file against the one we computed. */ +STATIC int +xchk_rtsum_compare( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_buf *bp; + struct xfs_bmbt_irec map; + xfs_fileoff_t off; + xchk_rtsumoff_t sumoff = 0; + int nmap; + + for (off = 0; off < XFS_B_TO_FSB(mp, mp->m_rsumsize); off++) { + int error = 0; + + if (xchk_should_terminate(sc, &error)) + return error; + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + /* Make sure we have a written extent. */ + nmap = 1; + error = xfs_bmapi_read(mp->m_rsumip, off, 1, &map, &nmap, + XFS_DATA_FORK); + if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error)) + return error; + + if (nmap != 1 || !xfs_bmap_is_written_extent(&map)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, off); + return 0; + } + + /* Read a block's worth of ondisk rtsummary file. */ + error = xfs_rtbuf_get(mp, sc->tp, off, 1, &bp); + if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error)) + return error; + + /* Read a block's worth of computed rtsummary file. */ + error = xfsum_copyout(sc, sumoff, sc->buf, mp->m_blockwsize); + if (error) { + xfs_trans_brelse(sc->tp, bp); + return error; + } + + if (memcmp(bp->b_addr, sc->buf, + mp->m_blockwsize << XFS_WORDLOG) != 0) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, off); + + xfs_trans_brelse(sc->tp, bp); + sumoff += mp->m_blockwsize; + } + + return 0; +} + +/* Scrub the realtime summary. */ +int +xchk_rtsummary( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + int error = 0; + + /* Invoke the fork scrubber. */ + error = xchk_metadata_inode_forks(sc); + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + goto out_rbm; + + /* Construct the new summary file from the rtbitmap. */ + error = xchk_rtsum_compute(sc); + if (error == -EFSCORRUPTED) { + /* + * EFSCORRUPTED means the rtbitmap is corrupt, which is an xref + * error since we're checking the summary file. + */ + xchk_ino_xref_set_corrupt(sc, mp->m_rbmip->i_ino); + error = 0; + goto out_rbm; + } + if (error) + goto out_rbm; + + /* Does the computed summary file match the actual rtsummary file? */ + error = xchk_rtsum_compare(sc); + +out_rbm: + /* Unlock the rtbitmap since we're done with it. */ + xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); + return error; +} diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index a0fffbcd022b..7d3aa14d81b5 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -22,6 +22,8 @@ #include "scrub/trace.h" #include "scrub/repair.h" #include "scrub/health.h" +#include "scrub/stats.h" +#include "scrub/xfile.h" /* * Online Scrub and Repair @@ -166,8 +168,6 @@ xchk_teardown( struct xfs_scrub *sc, int error) { - struct xfs_inode *ip_in = XFS_I(file_inode(sc->file)); - xchk_ag_free(sc, &sc->sa); if (sc->tp) { if (error == 0 && (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)) @@ -178,16 +178,18 @@ xchk_teardown( } if (sc->ip) { if (sc->ilock_flags) - xfs_iunlock(sc->ip, sc->ilock_flags); - if (sc->ip != ip_in && - !xfs_internal_inum(sc->mp, sc->ip->i_ino)) - xchk_irele(sc, sc->ip); + xchk_iunlock(sc, sc->ilock_flags); + xchk_irele(sc, sc->ip); sc->ip = NULL; } if (sc->flags & XCHK_HAVE_FREEZE_PROT) { sc->flags &= ~XCHK_HAVE_FREEZE_PROT; mnt_drop_write_file(sc->file); } + if (sc->xfile) { + xfile_destroy(sc->xfile); + sc->xfile = NULL; + } if (sc->buf) { if (sc->buf_cleanup) sc->buf_cleanup(sc->buf); @@ -322,14 +324,14 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { }, [XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */ .type = ST_FS, - .setup = xchk_setup_rt, + .setup = xchk_setup_rtbitmap, .scrub = xchk_rtbitmap, .has = xfs_has_realtime, .repair = xrep_notsupported, }, [XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */ .type = ST_FS, - .setup = xchk_setup_rt, + .setup = xchk_setup_rtsummary, .scrub = xchk_rtsummary, .has = xfs_has_realtime, .repair = xrep_notsupported, @@ -409,6 +411,11 @@ xchk_validate_inputs( goto out; } + /* No rebuild without repair. */ + if ((sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) && + !(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)) + return -EINVAL; + /* * We only want to repair read-write v5+ filesystems. Defer the check * for ops->repair until after our scrub confirms that we need to @@ -463,8 +470,10 @@ xfs_scrub_metadata( struct file *file, struct xfs_scrub_metadata *sm) { + struct xchk_stats_run run = { }; struct xfs_scrub *sc; struct xfs_mount *mp = XFS_I(file_inode(file))->i_mount; + u64 check_start; int error = 0; BUILD_BUG_ON(sizeof(meta_scrub_ops) != @@ -521,7 +530,9 @@ retry_op: goto out_teardown; /* Scrub for errors. */ + check_start = xchk_stats_now(); error = sc->ops->scrub(sc); + run.scrub_ns += xchk_stats_elapsed_ns(check_start); if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER)) goto try_harder; if (error == -ECHRNG && !(sc->flags & XCHK_NEED_DRAIN)) @@ -533,15 +544,16 @@ retry_op: if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && !(sc->flags & XREP_ALREADY_FIXED)) { - bool needs_fix; + bool needs_fix = xchk_needs_repair(sc->sm); + + /* Userspace asked us to rebuild the structure regardless. */ + if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) + needs_fix = true; /* Let debug users force us into the repair routines. */ - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) - sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + if (XFS_TEST_ERROR(needs_fix, mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) + needs_fix = true; - needs_fix = (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | - XFS_SCRUB_OFLAG_XCORRUPT | - XFS_SCRUB_OFLAG_PREEN)); /* * If userspace asked for a repair but it wasn't necessary, * report that back to userspace. @@ -555,7 +567,7 @@ retry_op: * If it's broken, userspace wants us to fix it, and we haven't * already tried to fix it, then attempt a repair. */ - error = xrep_attempt(sc); + error = xrep_attempt(sc, &run); if (error == -EAGAIN) { /* * Either the repair function succeeded or it couldn't @@ -583,12 +595,15 @@ out: sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; error = 0; } + if (error != -ENOENT) + xchk_stats_merge(mp, sm, &run); return error; need_drain: error = xchk_teardown(sc, 0); if (error) goto out_sc; sc->flags |= XCHK_NEED_DRAIN; + run.retries++; goto retry_op; try_harder: /* @@ -600,5 +615,6 @@ try_harder: if (error) goto out_sc; sc->flags |= XCHK_TRY_HARDER; + run.retries++; goto retry_op; } diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index f8ba00e51ca9..1ef9c6b4842a 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -88,6 +88,10 @@ struct xfs_scrub { */ void (*buf_cleanup)(void *buf); + /* xfile used by the scrubbers; freed at teardown. */ + struct xfile *xfile; + + /* Lock flags for @ip. */ uint ilock_flags; /* See the XCHK/XREP state flags below. */ diff --git a/fs/xfs/scrub/stats.c b/fs/xfs/scrub/stats.c new file mode 100644 index 000000000000..aeb92624176b --- /dev/null +++ b/fs/xfs/scrub/stats.c @@ -0,0 +1,405 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_sysfs.h" +#include "xfs_btree.h" +#include "xfs_super.h" +#include "scrub/scrub.h" +#include "scrub/stats.h" +#include "scrub/trace.h" + +struct xchk_scrub_stats { + /* all 32-bit counters here */ + + /* checking stats */ + uint32_t invocations; + uint32_t clean; + uint32_t corrupt; + uint32_t preen; + uint32_t xfail; + uint32_t xcorrupt; + uint32_t incomplete; + uint32_t warning; + uint32_t retries; + + /* repair stats */ + uint32_t repair_invocations; + uint32_t repair_success; + + /* all 64-bit items here */ + + /* runtimes */ + uint64_t checktime_us; + uint64_t repairtime_us; + + /* non-counter state must go at the end for clearall */ + spinlock_t css_lock; +}; + +struct xchk_stats { + struct dentry *cs_debugfs; + struct xchk_scrub_stats cs_stats[XFS_SCRUB_TYPE_NR]; +}; + + +static struct xchk_stats global_stats; + +static const char *name_map[XFS_SCRUB_TYPE_NR] = { + [XFS_SCRUB_TYPE_SB] = "sb", + [XFS_SCRUB_TYPE_AGF] = "agf", + [XFS_SCRUB_TYPE_AGFL] = "agfl", + [XFS_SCRUB_TYPE_AGI] = "agi", + [XFS_SCRUB_TYPE_BNOBT] = "bnobt", + [XFS_SCRUB_TYPE_CNTBT] = "cntbt", + [XFS_SCRUB_TYPE_INOBT] = "inobt", + [XFS_SCRUB_TYPE_FINOBT] = "finobt", + [XFS_SCRUB_TYPE_RMAPBT] = "rmapbt", + [XFS_SCRUB_TYPE_REFCNTBT] = "refcountbt", + [XFS_SCRUB_TYPE_INODE] = "inode", + [XFS_SCRUB_TYPE_BMBTD] = "bmapbtd", + [XFS_SCRUB_TYPE_BMBTA] = "bmapbta", + [XFS_SCRUB_TYPE_BMBTC] = "bmapbtc", + [XFS_SCRUB_TYPE_DIR] = "directory", + [XFS_SCRUB_TYPE_XATTR] = "xattr", + [XFS_SCRUB_TYPE_SYMLINK] = "symlink", + [XFS_SCRUB_TYPE_PARENT] = "parent", + [XFS_SCRUB_TYPE_RTBITMAP] = "rtbitmap", + [XFS_SCRUB_TYPE_RTSUM] = "rtsummary", + [XFS_SCRUB_TYPE_UQUOTA] = "usrquota", + [XFS_SCRUB_TYPE_GQUOTA] = "grpquota", + [XFS_SCRUB_TYPE_PQUOTA] = "prjquota", + [XFS_SCRUB_TYPE_FSCOUNTERS] = "fscounters", +}; + +/* Format the scrub stats into a text buffer, similar to pcp style. */ +STATIC ssize_t +xchk_stats_format( + struct xchk_stats *cs, + char *buf, + size_t remaining) +{ + struct xchk_scrub_stats *css = &cs->cs_stats[0]; + unsigned int i; + ssize_t copied = 0; + int ret = 0; + + for (i = 0; i < XFS_SCRUB_TYPE_NR; i++, css++) { + if (!name_map[i]) + continue; + + ret = scnprintf(buf, remaining, + "%s %u %u %u %u %u %u %u %u %u %llu %u %u %llu\n", + name_map[i], + (unsigned int)css->invocations, + (unsigned int)css->clean, + (unsigned int)css->corrupt, + (unsigned int)css->preen, + (unsigned int)css->xfail, + (unsigned int)css->xcorrupt, + (unsigned int)css->incomplete, + (unsigned int)css->warning, + (unsigned int)css->retries, + (unsigned long long)css->checktime_us, + (unsigned int)css->repair_invocations, + (unsigned int)css->repair_success, + (unsigned long long)css->repairtime_us); + if (ret <= 0) + break; + + remaining -= ret; + copied += ret; + buf += ret; + } + + return copied > 0 ? copied : ret; +} + +/* Estimate the worst case buffer size required to hold the whole report. */ +STATIC size_t +xchk_stats_estimate_bufsize( + struct xchk_stats *cs) +{ + struct xchk_scrub_stats *css = &cs->cs_stats[0]; + unsigned int i; + size_t field_width; + size_t ret = 0; + + /* 4294967296 plus one space for each u32 field */ + field_width = 11 * (offsetof(struct xchk_scrub_stats, checktime_us) / + sizeof(uint32_t)); + + /* 18446744073709551615 plus one space for each u64 field */ + field_width += 21 * ((offsetof(struct xchk_scrub_stats, css_lock) - + offsetof(struct xchk_scrub_stats, checktime_us)) / + sizeof(uint64_t)); + + for (i = 0; i < XFS_SCRUB_TYPE_NR; i++, css++) { + if (!name_map[i]) + continue; + + /* name plus one space */ + ret += 1 + strlen(name_map[i]); + + /* all fields, plus newline */ + ret += field_width + 1; + } + + return ret; +} + +/* Clear all counters. */ +STATIC void +xchk_stats_clearall( + struct xchk_stats *cs) +{ + struct xchk_scrub_stats *css = &cs->cs_stats[0]; + unsigned int i; + + for (i = 0; i < XFS_SCRUB_TYPE_NR; i++, css++) { + spin_lock(&css->css_lock); + memset(css, 0, offsetof(struct xchk_scrub_stats, css_lock)); + spin_unlock(&css->css_lock); + } +} + +#define XFS_SCRUB_OFLAG_UNCLEAN (XFS_SCRUB_OFLAG_CORRUPT | \ + XFS_SCRUB_OFLAG_PREEN | \ + XFS_SCRUB_OFLAG_XFAIL | \ + XFS_SCRUB_OFLAG_XCORRUPT | \ + XFS_SCRUB_OFLAG_INCOMPLETE | \ + XFS_SCRUB_OFLAG_WARNING) + +STATIC void +xchk_stats_merge_one( + struct xchk_stats *cs, + const struct xfs_scrub_metadata *sm, + const struct xchk_stats_run *run) +{ + struct xchk_scrub_stats *css; + + ASSERT(sm->sm_type < XFS_SCRUB_TYPE_NR); + + css = &cs->cs_stats[sm->sm_type]; + spin_lock(&css->css_lock); + css->invocations++; + if (!(sm->sm_flags & XFS_SCRUB_OFLAG_UNCLEAN)) + css->clean++; + if (sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + css->corrupt++; + if (sm->sm_flags & XFS_SCRUB_OFLAG_PREEN) + css->preen++; + if (sm->sm_flags & XFS_SCRUB_OFLAG_XFAIL) + css->xfail++; + if (sm->sm_flags & XFS_SCRUB_OFLAG_XCORRUPT) + css->xcorrupt++; + if (sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) + css->incomplete++; + if (sm->sm_flags & XFS_SCRUB_OFLAG_WARNING) + css->warning++; + css->retries += run->retries; + css->checktime_us += howmany_64(run->scrub_ns, NSEC_PER_USEC); + + if (run->repair_attempted) + css->repair_invocations++; + if (run->repair_succeeded) + css->repair_success++; + css->repairtime_us += howmany_64(run->repair_ns, NSEC_PER_USEC); + spin_unlock(&css->css_lock); +} + +/* Merge these scrub-run stats into the global and mount stat data. */ +void +xchk_stats_merge( + struct xfs_mount *mp, + const struct xfs_scrub_metadata *sm, + const struct xchk_stats_run *run) +{ + xchk_stats_merge_one(&global_stats, sm, run); + xchk_stats_merge_one(mp->m_scrub_stats, sm, run); +} + +/* debugfs boilerplate */ + +static ssize_t +xchk_scrub_stats_read( + struct file *file, + char __user *ubuf, + size_t count, + loff_t *ppos) +{ + struct xchk_stats *cs = file->private_data; + char *buf; + size_t bufsize; + ssize_t avail, ret; + + /* + * This generates stringly snapshot of all the scrub counters, so we + * do not want userspace to receive garbled text from multiple calls. + * If the file position is greater than 0, return a short read. + */ + if (*ppos > 0) + return 0; + + bufsize = xchk_stats_estimate_bufsize(cs); + + buf = kvmalloc(bufsize, XCHK_GFP_FLAGS); + if (!buf) + return -ENOMEM; + + avail = xchk_stats_format(cs, buf, bufsize); + if (avail < 0) { + ret = avail; + goto out; + } + + ret = simple_read_from_buffer(ubuf, count, ppos, buf, avail); +out: + kvfree(buf); + return ret; +} + +static const struct file_operations scrub_stats_fops = { + .open = simple_open, + .read = xchk_scrub_stats_read, +}; + +static ssize_t +xchk_clear_scrub_stats_write( + struct file *file, + const char __user *ubuf, + size_t count, + loff_t *ppos) +{ + struct xchk_stats *cs = file->private_data; + unsigned int val; + int ret; + + ret = kstrtouint_from_user(ubuf, count, 0, &val); + if (ret) + return ret; + + if (val != 1) + return -EINVAL; + + xchk_stats_clearall(cs); + return count; +} + +static const struct file_operations clear_scrub_stats_fops = { + .open = simple_open, + .write = xchk_clear_scrub_stats_write, +}; + +/* Initialize the stats object. */ +STATIC int +xchk_stats_init( + struct xchk_stats *cs, + struct xfs_mount *mp) +{ + struct xchk_scrub_stats *css = &cs->cs_stats[0]; + unsigned int i; + + for (i = 0; i < XFS_SCRUB_TYPE_NR; i++, css++) + spin_lock_init(&css->css_lock); + + return 0; +} + +/* Connect the stats object to debugfs. */ +void +xchk_stats_register( + struct xchk_stats *cs, + struct dentry *parent) +{ + if (!parent) + return; + + cs->cs_debugfs = xfs_debugfs_mkdir("scrub", parent); + if (!cs->cs_debugfs) + return; + + debugfs_create_file("stats", 0644, cs->cs_debugfs, cs, + &scrub_stats_fops); + debugfs_create_file("clear_stats", 0400, cs->cs_debugfs, cs, + &clear_scrub_stats_fops); +} + +/* Free all resources related to the stats object. */ +STATIC int +xchk_stats_teardown( + struct xchk_stats *cs) +{ + return 0; +} + +/* Disconnect the stats object from debugfs. */ +void +xchk_stats_unregister( + struct xchk_stats *cs) +{ + debugfs_remove(cs->cs_debugfs); +} + +/* Initialize global stats and register them */ +int __init +xchk_global_stats_setup( + struct dentry *parent) +{ + int error; + + error = xchk_stats_init(&global_stats, NULL); + if (error) + return error; + + xchk_stats_register(&global_stats, parent); + return 0; +} + +/* Unregister global stats and tear them down */ +void +xchk_global_stats_teardown(void) +{ + xchk_stats_unregister(&global_stats); + xchk_stats_teardown(&global_stats); +} + +/* Allocate per-mount stats */ +int +xchk_mount_stats_alloc( + struct xfs_mount *mp) +{ + struct xchk_stats *cs; + int error; + + cs = kvzalloc(sizeof(struct xchk_stats), GFP_KERNEL); + if (!cs) + return -ENOMEM; + + error = xchk_stats_init(cs, mp); + if (error) + goto out_free; + + mp->m_scrub_stats = cs; + return 0; +out_free: + kvfree(cs); + return error; +} + +/* Free per-mount stats */ +void +xchk_mount_stats_free( + struct xfs_mount *mp) +{ + xchk_stats_teardown(mp->m_scrub_stats); + kvfree(mp->m_scrub_stats); + mp->m_scrub_stats = NULL; +} diff --git a/fs/xfs/scrub/stats.h b/fs/xfs/scrub/stats.h new file mode 100644 index 000000000000..b358ad8d8b90 --- /dev/null +++ b/fs/xfs/scrub/stats.h @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_STATS_H__ +#define __XFS_SCRUB_STATS_H__ + +struct xchk_stats_run { + u64 scrub_ns; + u64 repair_ns; + unsigned int retries; + bool repair_attempted; + bool repair_succeeded; +}; + +#ifdef CONFIG_XFS_ONLINE_SCRUB_STATS +struct xchk_stats; + +int __init xchk_global_stats_setup(struct dentry *parent); +void xchk_global_stats_teardown(void); + +int xchk_mount_stats_alloc(struct xfs_mount *mp); +void xchk_mount_stats_free(struct xfs_mount *mp); + +void xchk_stats_register(struct xchk_stats *cs, struct dentry *parent); +void xchk_stats_unregister(struct xchk_stats *cs); + +void xchk_stats_merge(struct xfs_mount *mp, const struct xfs_scrub_metadata *sm, + const struct xchk_stats_run *run); + +static inline u64 xchk_stats_now(void) { return ktime_get_ns(); } +static inline u64 xchk_stats_elapsed_ns(u64 since) +{ + u64 now = xchk_stats_now(); + + /* + * If the system doesn't have a high enough resolution clock, charge at + * least one nanosecond so that our stats don't report instantaneous + * runtimes. + */ + if (now == since) + return 1; + + return now - since; +} +#else +# define xchk_global_stats_setup(parent) (0) +# define xchk_global_stats_teardown() ((void)0) +# define xchk_mount_stats_alloc(mp) (0) +# define xchk_mount_stats_free(mp) ((void)0) +# define xchk_stats_register(cs, parent) ((void)0) +# define xchk_stats_unregister(cs) ((void)0) +# define xchk_stats_now() (0) +# define xchk_stats_elapsed_ns(x) (0 * (x)) +# define xchk_stats_merge(mp, sm, run) ((void)0) +#endif /* CONFIG_XFS_ONLINE_SCRUB_STATS */ + +#endif /* __XFS_SCRUB_STATS_H__ */ diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 0a975439d2b6..46249e7b17e0 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -12,8 +12,10 @@ #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_btree.h" -#include "scrub/scrub.h" #include "xfs_ag.h" +#include "scrub/scrub.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" /* Figure out which block the btree cursor was pointing to. */ static inline xfs_fsblock_t diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 0b54f1a1cf0c..cbd4d01e253c 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -16,6 +16,10 @@ #include <linux/tracepoint.h> #include "xfs_bit.h" +struct xfile; +struct xfarray; +struct xfarray_sortinfo; + /* * ftrace's __print_symbolic requires that all enum values be wrapped in the * TRACE_DEFINE_ENUM macro so that the enum value can be encoded in the ftrace @@ -94,7 +98,8 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS); { XFS_SCRUB_OFLAG_XCORRUPT, "xcorrupt" }, \ { XFS_SCRUB_OFLAG_INCOMPLETE, "incomplete" }, \ { XFS_SCRUB_OFLAG_WARNING, "warning" }, \ - { XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED, "norepair" } + { XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED, "norepair" }, \ + { XFS_SCRUB_IFLAG_FORCE_REBUILD, "rebuild" } #define XFS_SCRUB_STATE_STRINGS \ { XCHK_TRY_HARDER, "try_harder" }, \ @@ -636,6 +641,28 @@ TRACE_EVENT(xchk_iallocbt_check_cluster, __entry->cluster_ino) ) +TRACE_EVENT(xchk_inode_is_allocated, + TP_PROTO(struct xfs_inode *ip), + TP_ARGS(ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(unsigned long, iflags) + __field(umode_t, mode) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->iflags = ip->i_flags; + __entry->mode = VFS_I(ip)->i_mode; + ), + TP_printk("dev %d:%d ino 0x%llx iflags 0x%lx mode 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->iflags, + __entry->mode) +); + TRACE_EVENT(xchk_fscounters_calc, TP_PROTO(struct xfs_mount *mp, uint64_t icount, uint64_t ifree, uint64_t fdblocks, uint64_t delalloc), @@ -751,13 +778,302 @@ TRACE_EVENT(xchk_refcount_incorrect, __entry->seen) ) +TRACE_EVENT(xfile_create, + TP_PROTO(struct xfile *xf), + TP_ARGS(xf), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned long, ino) + __array(char, pathname, 256) + ), + TP_fast_assign( + char pathname[257]; + char *path; + + __entry->ino = file_inode(xf->file)->i_ino; + memset(pathname, 0, sizeof(pathname)); + path = file_path(xf->file, pathname, sizeof(pathname) - 1); + if (IS_ERR(path)) + path = "(unknown)"; + strncpy(__entry->pathname, path, sizeof(__entry->pathname)); + ), + TP_printk("xfino 0x%lx path '%s'", + __entry->ino, + __entry->pathname) +); + +TRACE_EVENT(xfile_destroy, + TP_PROTO(struct xfile *xf), + TP_ARGS(xf), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long long, bytes) + __field(loff_t, size) + ), + TP_fast_assign( + struct xfile_stat statbuf; + int ret; + + ret = xfile_stat(xf, &statbuf); + if (!ret) { + __entry->bytes = statbuf.bytes; + __entry->size = statbuf.size; + } else { + __entry->bytes = -1; + __entry->size = -1; + } + __entry->ino = file_inode(xf->file)->i_ino; + ), + TP_printk("xfino 0x%lx mem_bytes 0x%llx isize 0x%llx", + __entry->ino, + __entry->bytes, + __entry->size) +); + +DECLARE_EVENT_CLASS(xfile_class, + TP_PROTO(struct xfile *xf, loff_t pos, unsigned long long bytecount), + TP_ARGS(xf, pos, bytecount), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long long, bytes_used) + __field(loff_t, pos) + __field(loff_t, size) + __field(unsigned long long, bytecount) + ), + TP_fast_assign( + struct xfile_stat statbuf; + int ret; + + ret = xfile_stat(xf, &statbuf); + if (!ret) { + __entry->bytes_used = statbuf.bytes; + __entry->size = statbuf.size; + } else { + __entry->bytes_used = -1; + __entry->size = -1; + } + __entry->ino = file_inode(xf->file)->i_ino; + __entry->pos = pos; + __entry->bytecount = bytecount; + ), + TP_printk("xfino 0x%lx mem_bytes 0x%llx pos 0x%llx bytecount 0x%llx isize 0x%llx", + __entry->ino, + __entry->bytes_used, + __entry->pos, + __entry->bytecount, + __entry->size) +); +#define DEFINE_XFILE_EVENT(name) \ +DEFINE_EVENT(xfile_class, name, \ + TP_PROTO(struct xfile *xf, loff_t pos, unsigned long long bytecount), \ + TP_ARGS(xf, pos, bytecount)) +DEFINE_XFILE_EVENT(xfile_pread); +DEFINE_XFILE_EVENT(xfile_pwrite); +DEFINE_XFILE_EVENT(xfile_seek_data); +DEFINE_XFILE_EVENT(xfile_get_page); +DEFINE_XFILE_EVENT(xfile_put_page); + +TRACE_EVENT(xfarray_create, + TP_PROTO(struct xfarray *xfa, unsigned long long required_capacity), + TP_ARGS(xfa, required_capacity), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(uint64_t, max_nr) + __field(size_t, obj_size) + __field(int, obj_size_log) + __field(unsigned long long, required_capacity) + ), + TP_fast_assign( + __entry->max_nr = xfa->max_nr; + __entry->obj_size = xfa->obj_size; + __entry->obj_size_log = xfa->obj_size_log; + __entry->ino = file_inode(xfa->xfile->file)->i_ino; + __entry->required_capacity = required_capacity; + ), + TP_printk("xfino 0x%lx max_nr %llu reqd_nr %llu objsz %zu objszlog %d", + __entry->ino, + __entry->max_nr, + __entry->required_capacity, + __entry->obj_size, + __entry->obj_size_log) +); + +TRACE_EVENT(xfarray_isort, + TP_PROTO(struct xfarray_sortinfo *si, uint64_t lo, uint64_t hi), + TP_ARGS(si, lo, hi), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long long, lo) + __field(unsigned long long, hi) + ), + TP_fast_assign( + __entry->ino = file_inode(si->array->xfile->file)->i_ino; + __entry->lo = lo; + __entry->hi = hi; + ), + TP_printk("xfino 0x%lx lo %llu hi %llu elts %llu", + __entry->ino, + __entry->lo, + __entry->hi, + __entry->hi - __entry->lo) +); + +TRACE_EVENT(xfarray_pagesort, + TP_PROTO(struct xfarray_sortinfo *si, uint64_t lo, uint64_t hi), + TP_ARGS(si, lo, hi), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long long, lo) + __field(unsigned long long, hi) + ), + TP_fast_assign( + __entry->ino = file_inode(si->array->xfile->file)->i_ino; + __entry->lo = lo; + __entry->hi = hi; + ), + TP_printk("xfino 0x%lx lo %llu hi %llu elts %llu", + __entry->ino, + __entry->lo, + __entry->hi, + __entry->hi - __entry->lo) +); + +TRACE_EVENT(xfarray_qsort, + TP_PROTO(struct xfarray_sortinfo *si, uint64_t lo, uint64_t hi), + TP_ARGS(si, lo, hi), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long long, lo) + __field(unsigned long long, hi) + __field(int, stack_depth) + __field(int, max_stack_depth) + ), + TP_fast_assign( + __entry->ino = file_inode(si->array->xfile->file)->i_ino; + __entry->lo = lo; + __entry->hi = hi; + __entry->stack_depth = si->stack_depth; + __entry->max_stack_depth = si->max_stack_depth; + ), + TP_printk("xfino 0x%lx lo %llu hi %llu elts %llu stack %d/%d", + __entry->ino, + __entry->lo, + __entry->hi, + __entry->hi - __entry->lo, + __entry->stack_depth, + __entry->max_stack_depth) +); + +TRACE_EVENT(xfarray_sort, + TP_PROTO(struct xfarray_sortinfo *si, size_t bytes), + TP_ARGS(si, bytes), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long long, nr) + __field(size_t, obj_size) + __field(size_t, bytes) + __field(unsigned int, max_stack_depth) + ), + TP_fast_assign( + __entry->nr = si->array->nr; + __entry->obj_size = si->array->obj_size; + __entry->ino = file_inode(si->array->xfile->file)->i_ino; + __entry->bytes = bytes; + __entry->max_stack_depth = si->max_stack_depth; + ), + TP_printk("xfino 0x%lx nr %llu objsz %zu stack %u bytes %zu", + __entry->ino, + __entry->nr, + __entry->obj_size, + __entry->max_stack_depth, + __entry->bytes) +); + +TRACE_EVENT(xfarray_sort_stats, + TP_PROTO(struct xfarray_sortinfo *si, int error), + TP_ARGS(si, error), + TP_STRUCT__entry( + __field(unsigned long, ino) +#ifdef DEBUG + __field(unsigned long long, loads) + __field(unsigned long long, stores) + __field(unsigned long long, compares) + __field(unsigned long long, heapsorts) +#endif + __field(unsigned int, max_stack_depth) + __field(unsigned int, max_stack_used) + __field(int, error) + ), + TP_fast_assign( + __entry->ino = file_inode(si->array->xfile->file)->i_ino; +#ifdef DEBUG + __entry->loads = si->loads; + __entry->stores = si->stores; + __entry->compares = si->compares; + __entry->heapsorts = si->heapsorts; +#endif + __entry->max_stack_depth = si->max_stack_depth; + __entry->max_stack_used = si->max_stack_used; + __entry->error = error; + ), + TP_printk( +#ifdef DEBUG + "xfino 0x%lx loads %llu stores %llu compares %llu heapsorts %llu stack_depth %u/%u error %d", +#else + "xfino 0x%lx stack_depth %u/%u error %d", +#endif + __entry->ino, +#ifdef DEBUG + __entry->loads, + __entry->stores, + __entry->compares, + __entry->heapsorts, +#endif + __entry->max_stack_used, + __entry->max_stack_depth, + __entry->error) +); + +#ifdef CONFIG_XFS_RT +TRACE_EVENT(xchk_rtsum_record_free, + TP_PROTO(struct xfs_mount *mp, xfs_rtblock_t start, + uint64_t len, unsigned int log, loff_t pos, xfs_suminfo_t v), + TP_ARGS(mp, start, len, log, pos, v), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(dev_t, rtdev) + __field(xfs_rtblock_t, start) + __field(unsigned long long, len) + __field(unsigned int, log) + __field(loff_t, pos) + __field(xfs_suminfo_t, v) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->rtdev = mp->m_rtdev_targp->bt_dev; + __entry->start = start; + __entry->len = len; + __entry->log = log; + __entry->pos = pos; + __entry->v = v; + ), + TP_printk("dev %d:%d rtdev %d:%d rtx 0x%llx rtxcount 0x%llx log %u rsumpos 0x%llx sumcount %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + MAJOR(__entry->rtdev), MINOR(__entry->rtdev), + __entry->start, + __entry->len, + __entry->log, + __entry->pos, + __entry->v) +); +#endif /* CONFIG_XFS_RT */ + /* repair tracepoints */ #if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) DECLARE_EVENT_CLASS(xrep_extent_class, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agblock_t agbno, xfs_extlen_t len), - TP_ARGS(mp, agno, agbno, len), + TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len), + TP_ARGS(pag, agbno, len), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -765,8 +1081,8 @@ DECLARE_EVENT_CLASS(xrep_extent_class, __field(xfs_extlen_t, len) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; __entry->agbno = agbno; __entry->len = len; ), @@ -778,12 +1094,45 @@ DECLARE_EVENT_CLASS(xrep_extent_class, ); #define DEFINE_REPAIR_EXTENT_EVENT(name) \ DEFINE_EVENT(xrep_extent_class, name, \ - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ - xfs_agblock_t agbno, xfs_extlen_t len), \ - TP_ARGS(mp, agno, agbno, len)) -DEFINE_REPAIR_EXTENT_EVENT(xrep_dispose_btree_extent); + TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len), \ + TP_ARGS(pag, agbno, len)) +DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_unmap_extent); +DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_free_extent); +DEFINE_REPAIR_EXTENT_EVENT(xreap_agextent_binval); DEFINE_REPAIR_EXTENT_EVENT(xrep_agfl_insert); +DECLARE_EVENT_CLASS(xrep_reap_find_class, + TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len, + bool crosslinked), + TP_ARGS(pag, agbno, len, crosslinked), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + __field(bool, crosslinked) + ), + TP_fast_assign( + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->agbno = agbno; + __entry->len = len; + __entry->crosslinked = crosslinked; + ), + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x crosslinked %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len, + __entry->crosslinked ? 1 : 0) +); +#define DEFINE_REPAIR_REAP_FIND_EVENT(name) \ +DEFINE_EVENT(xrep_reap_find_class, name, \ + TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len, \ + bool crosslinked), \ + TP_ARGS(pag, agbno, len, crosslinked)) +DEFINE_REPAIR_REAP_FIND_EVENT(xreap_agextent_select); + DECLARE_EVENT_CLASS(xrep_rmap_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t len, @@ -853,28 +1202,6 @@ TRACE_EVENT(xrep_refcount_extent_fn, __entry->refcount) ) -TRACE_EVENT(xrep_init_btblock, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, - xfs_btnum_t btnum), - TP_ARGS(mp, agno, agbno, btnum), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(xfs_agnumber_t, agno) - __field(xfs_agblock_t, agbno) - __field(uint32_t, btnum) - ), - TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; - __entry->agbno = agbno; - __entry->btnum = btnum; - ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x btree %s", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->agno, - __entry->agbno, - __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS)) -) TRACE_EVENT(xrep_findroot_block, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, uint32_t magic, uint16_t level), diff --git a/fs/xfs/scrub/xfarray.c b/fs/xfs/scrub/xfarray.c new file mode 100644 index 000000000000..f0f532c10a5a --- /dev/null +++ b/fs/xfs/scrub/xfarray.c @@ -0,0 +1,1083 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2021-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/scrub.h" +#include "scrub/trace.h" + +/* + * Large Arrays of Fixed-Size Records + * ================================== + * + * This memory array uses an xfile (which itself is a memfd "file") to store + * large numbers of fixed-size records in memory that can be paged out. This + * puts less stress on the memory reclaim algorithms during an online repair + * because we don't have to pin so much memory. However, array access is less + * direct than would be in a regular memory array. Access to the array is + * performed via indexed load and store methods, and an append method is + * provided for convenience. Array elements can be unset, which sets them to + * all zeroes. Unset entries are skipped during iteration, though direct loads + * will return a zeroed buffer. Callers are responsible for concurrency + * control. + */ + +/* + * Pointer to scratch space. Because we can't access the xfile data directly, + * we allocate a small amount of memory on the end of the xfarray structure to + * buffer array items when we need space to store values temporarily. + */ +static inline void *xfarray_scratch(struct xfarray *array) +{ + return (array + 1); +} + +/* Compute array index given an xfile offset. */ +static xfarray_idx_t +xfarray_idx( + struct xfarray *array, + loff_t pos) +{ + if (array->obj_size_log >= 0) + return (xfarray_idx_t)pos >> array->obj_size_log; + + return div_u64((xfarray_idx_t)pos, array->obj_size); +} + +/* Compute xfile offset of array element. */ +static inline loff_t xfarray_pos(struct xfarray *array, xfarray_idx_t idx) +{ + if (array->obj_size_log >= 0) + return idx << array->obj_size_log; + + return idx * array->obj_size; +} + +/* + * Initialize a big memory array. Array records cannot be larger than a + * page, and the array cannot span more bytes than the page cache supports. + * If @required_capacity is nonzero, the maximum array size will be set to this + * quantity and the array creation will fail if the underlying storage cannot + * support that many records. + */ +int +xfarray_create( + const char *description, + unsigned long long required_capacity, + size_t obj_size, + struct xfarray **arrayp) +{ + struct xfarray *array; + struct xfile *xfile; + int error; + + ASSERT(obj_size < PAGE_SIZE); + + error = xfile_create(description, 0, &xfile); + if (error) + return error; + + error = -ENOMEM; + array = kzalloc(sizeof(struct xfarray) + obj_size, XCHK_GFP_FLAGS); + if (!array) + goto out_xfile; + + array->xfile = xfile; + array->obj_size = obj_size; + + if (is_power_of_2(obj_size)) + array->obj_size_log = ilog2(obj_size); + else + array->obj_size_log = -1; + + array->max_nr = xfarray_idx(array, MAX_LFS_FILESIZE); + trace_xfarray_create(array, required_capacity); + + if (required_capacity > 0) { + if (array->max_nr < required_capacity) { + error = -ENOMEM; + goto out_xfarray; + } + array->max_nr = required_capacity; + } + + *arrayp = array; + return 0; + +out_xfarray: + kfree(array); +out_xfile: + xfile_destroy(xfile); + return error; +} + +/* Destroy the array. */ +void +xfarray_destroy( + struct xfarray *array) +{ + xfile_destroy(array->xfile); + kfree(array); +} + +/* Load an element from the array. */ +int +xfarray_load( + struct xfarray *array, + xfarray_idx_t idx, + void *ptr) +{ + if (idx >= array->nr) + return -ENODATA; + + return xfile_obj_load(array->xfile, ptr, array->obj_size, + xfarray_pos(array, idx)); +} + +/* Is this array element potentially unset? */ +static inline bool +xfarray_is_unset( + struct xfarray *array, + loff_t pos) +{ + void *temp = xfarray_scratch(array); + int error; + + if (array->unset_slots == 0) + return false; + + error = xfile_obj_load(array->xfile, temp, array->obj_size, pos); + if (!error && xfarray_element_is_null(array, temp)) + return true; + + return false; +} + +/* + * Unset an array element. If @idx is the last element in the array, the + * array will be truncated. Otherwise, the entry will be zeroed. + */ +int +xfarray_unset( + struct xfarray *array, + xfarray_idx_t idx) +{ + void *temp = xfarray_scratch(array); + loff_t pos = xfarray_pos(array, idx); + int error; + + if (idx >= array->nr) + return -ENODATA; + + if (idx == array->nr - 1) { + array->nr--; + return 0; + } + + if (xfarray_is_unset(array, pos)) + return 0; + + memset(temp, 0, array->obj_size); + error = xfile_obj_store(array->xfile, temp, array->obj_size, pos); + if (error) + return error; + + array->unset_slots++; + return 0; +} + +/* + * Store an element in the array. The element must not be completely zeroed, + * because those are considered unset sparse elements. + */ +int +xfarray_store( + struct xfarray *array, + xfarray_idx_t idx, + const void *ptr) +{ + int ret; + + if (idx >= array->max_nr) + return -EFBIG; + + ASSERT(!xfarray_element_is_null(array, ptr)); + + ret = xfile_obj_store(array->xfile, ptr, array->obj_size, + xfarray_pos(array, idx)); + if (ret) + return ret; + + array->nr = max(array->nr, idx + 1); + return 0; +} + +/* Is this array element NULL? */ +bool +xfarray_element_is_null( + struct xfarray *array, + const void *ptr) +{ + return !memchr_inv(ptr, 0, array->obj_size); +} + +/* + * Store an element anywhere in the array that is unset. If there are no + * unset slots, append the element to the array. + */ +int +xfarray_store_anywhere( + struct xfarray *array, + const void *ptr) +{ + void *temp = xfarray_scratch(array); + loff_t endpos = xfarray_pos(array, array->nr); + loff_t pos; + int error; + + /* Find an unset slot to put it in. */ + for (pos = 0; + pos < endpos && array->unset_slots > 0; + pos += array->obj_size) { + error = xfile_obj_load(array->xfile, temp, array->obj_size, + pos); + if (error || !xfarray_element_is_null(array, temp)) + continue; + + error = xfile_obj_store(array->xfile, ptr, array->obj_size, + pos); + if (error) + return error; + + array->unset_slots--; + return 0; + } + + /* No unset slots found; attach it on the end. */ + array->unset_slots = 0; + return xfarray_append(array, ptr); +} + +/* Return length of array. */ +uint64_t +xfarray_length( + struct xfarray *array) +{ + return array->nr; +} + +/* + * Decide which array item we're going to read as part of an _iter_get. + * @cur is the array index, and @pos is the file offset of that array index in + * the backing xfile. Returns ENODATA if we reach the end of the records. + * + * Reading from a hole in a sparse xfile causes page instantiation, so for + * iterating a (possibly sparse) array we need to figure out if the cursor is + * pointing at a totally uninitialized hole and move the cursor up if + * necessary. + */ +static inline int +xfarray_find_data( + struct xfarray *array, + xfarray_idx_t *cur, + loff_t *pos) +{ + unsigned int pgoff = offset_in_page(*pos); + loff_t end_pos = *pos + array->obj_size - 1; + loff_t new_pos; + + /* + * If the current array record is not adjacent to a page boundary, we + * are in the middle of the page. We do not need to move the cursor. + */ + if (pgoff != 0 && pgoff + array->obj_size - 1 < PAGE_SIZE) + return 0; + + /* + * Call SEEK_DATA on the last byte in the record we're about to read. + * If the record ends at (or crosses) the end of a page then we know + * that the first byte of the record is backed by pages and don't need + * to query it. If instead the record begins at the start of the page + * then we know that querying the last byte is just as good as querying + * the first byte, since records cannot be larger than a page. + * + * If the call returns the same file offset, we know this record is + * backed by real pages. We do not need to move the cursor. + */ + new_pos = xfile_seek_data(array->xfile, end_pos); + if (new_pos == -ENXIO) + return -ENODATA; + if (new_pos < 0) + return new_pos; + if (new_pos == end_pos) + return 0; + + /* + * Otherwise, SEEK_DATA told us how far up to move the file pointer to + * find more data. Move the array index to the first record past the + * byte offset we were given. + */ + new_pos = roundup_64(new_pos, array->obj_size); + *cur = xfarray_idx(array, new_pos); + *pos = xfarray_pos(array, *cur); + return 0; +} + +/* + * Starting at *idx, fetch the next non-null array entry and advance the index + * to set up the next _load_next call. Returns ENODATA if we reach the end of + * the array. Callers must set @*idx to XFARRAY_CURSOR_INIT before the first + * call to this function. + */ +int +xfarray_load_next( + struct xfarray *array, + xfarray_idx_t *idx, + void *rec) +{ + xfarray_idx_t cur = *idx; + loff_t pos = xfarray_pos(array, cur); + int error; + + do { + if (cur >= array->nr) + return -ENODATA; + + /* + * Ask the backing store for the location of next possible + * written record, then retrieve that record. + */ + error = xfarray_find_data(array, &cur, &pos); + if (error) + return error; + error = xfarray_load(array, cur, rec); + if (error) + return error; + + cur++; + pos += array->obj_size; + } while (xfarray_element_is_null(array, rec)); + + *idx = cur; + return 0; +} + +/* Sorting functions */ + +#ifdef DEBUG +# define xfarray_sort_bump_loads(si) do { (si)->loads++; } while (0) +# define xfarray_sort_bump_stores(si) do { (si)->stores++; } while (0) +# define xfarray_sort_bump_compares(si) do { (si)->compares++; } while (0) +# define xfarray_sort_bump_heapsorts(si) do { (si)->heapsorts++; } while (0) +#else +# define xfarray_sort_bump_loads(si) +# define xfarray_sort_bump_stores(si) +# define xfarray_sort_bump_compares(si) +# define xfarray_sort_bump_heapsorts(si) +#endif /* DEBUG */ + +/* Load an array element for sorting. */ +static inline int +xfarray_sort_load( + struct xfarray_sortinfo *si, + xfarray_idx_t idx, + void *ptr) +{ + xfarray_sort_bump_loads(si); + return xfarray_load(si->array, idx, ptr); +} + +/* Store an array element for sorting. */ +static inline int +xfarray_sort_store( + struct xfarray_sortinfo *si, + xfarray_idx_t idx, + void *ptr) +{ + xfarray_sort_bump_stores(si); + return xfarray_store(si->array, idx, ptr); +} + +/* Compare an array element for sorting. */ +static inline int +xfarray_sort_cmp( + struct xfarray_sortinfo *si, + const void *a, + const void *b) +{ + xfarray_sort_bump_compares(si); + return si->cmp_fn(a, b); +} + +/* Return a pointer to the low index stack for quicksort partitioning. */ +static inline xfarray_idx_t *xfarray_sortinfo_lo(struct xfarray_sortinfo *si) +{ + return (xfarray_idx_t *)(si + 1); +} + +/* Return a pointer to the high index stack for quicksort partitioning. */ +static inline xfarray_idx_t *xfarray_sortinfo_hi(struct xfarray_sortinfo *si) +{ + return xfarray_sortinfo_lo(si) + si->max_stack_depth; +} + +/* Size of each element in the quicksort pivot array. */ +static inline size_t +xfarray_pivot_rec_sz( + struct xfarray *array) +{ + return round_up(array->obj_size, 8) + sizeof(xfarray_idx_t); +} + +/* Allocate memory to handle the sort. */ +static inline int +xfarray_sortinfo_alloc( + struct xfarray *array, + xfarray_cmp_fn cmp_fn, + unsigned int flags, + struct xfarray_sortinfo **infop) +{ + struct xfarray_sortinfo *si; + size_t nr_bytes = sizeof(struct xfarray_sortinfo); + size_t pivot_rec_sz = xfarray_pivot_rec_sz(array); + int max_stack_depth; + + /* + * The median-of-nine pivot algorithm doesn't work if a subset has + * fewer than 9 items. Make sure the in-memory sort will always take + * over for subsets where this wouldn't be the case. + */ + BUILD_BUG_ON(XFARRAY_QSORT_PIVOT_NR >= XFARRAY_ISORT_NR); + + /* + * Tail-call recursion during the partitioning phase means that + * quicksort will never recurse more than log2(nr) times. We need one + * extra level of stack to hold the initial parameters. In-memory + * sort will always take care of the last few levels of recursion for + * us, so we can reduce the stack depth by that much. + */ + max_stack_depth = ilog2(array->nr) + 1 - (XFARRAY_ISORT_SHIFT - 1); + if (max_stack_depth < 1) + max_stack_depth = 1; + + /* Each level of quicksort uses a lo and a hi index */ + nr_bytes += max_stack_depth * sizeof(xfarray_idx_t) * 2; + + /* Scratchpad for in-memory sort, or finding the pivot */ + nr_bytes += max_t(size_t, + (XFARRAY_QSORT_PIVOT_NR + 1) * pivot_rec_sz, + XFARRAY_ISORT_NR * array->obj_size); + + si = kvzalloc(nr_bytes, XCHK_GFP_FLAGS); + if (!si) + return -ENOMEM; + + si->array = array; + si->cmp_fn = cmp_fn; + si->flags = flags; + si->max_stack_depth = max_stack_depth; + si->max_stack_used = 1; + + xfarray_sortinfo_lo(si)[0] = 0; + xfarray_sortinfo_hi(si)[0] = array->nr - 1; + + trace_xfarray_sort(si, nr_bytes); + *infop = si; + return 0; +} + +/* Should this sort be terminated by a fatal signal? */ +static inline bool +xfarray_sort_terminated( + struct xfarray_sortinfo *si, + int *error) +{ + /* + * If preemption is disabled, we need to yield to the scheduler every + * few seconds so that we don't run afoul of the soft lockup watchdog + * or RCU stall detector. + */ + cond_resched(); + + if ((si->flags & XFARRAY_SORT_KILLABLE) && + fatal_signal_pending(current)) { + if (*error == 0) + *error = -EINTR; + return true; + } + return false; +} + +/* Do we want an in-memory sort? */ +static inline bool +xfarray_want_isort( + struct xfarray_sortinfo *si, + xfarray_idx_t start, + xfarray_idx_t end) +{ + /* + * For array subsets that fit in the scratchpad, it's much faster to + * use the kernel's heapsort than quicksort's stack machine. + */ + return (end - start) < XFARRAY_ISORT_NR; +} + +/* Return the scratch space within the sortinfo structure. */ +static inline void *xfarray_sortinfo_isort_scratch(struct xfarray_sortinfo *si) +{ + return xfarray_sortinfo_hi(si) + si->max_stack_depth; +} + +/* + * Sort a small number of array records using scratchpad memory. The records + * need not be contiguous in the xfile's memory pages. + */ +STATIC int +xfarray_isort( + struct xfarray_sortinfo *si, + xfarray_idx_t lo, + xfarray_idx_t hi) +{ + void *scratch = xfarray_sortinfo_isort_scratch(si); + loff_t lo_pos = xfarray_pos(si->array, lo); + loff_t len = xfarray_pos(si->array, hi - lo + 1); + int error; + + trace_xfarray_isort(si, lo, hi); + + xfarray_sort_bump_loads(si); + error = xfile_obj_load(si->array->xfile, scratch, len, lo_pos); + if (error) + return error; + + xfarray_sort_bump_heapsorts(si); + sort(scratch, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL); + + xfarray_sort_bump_stores(si); + return xfile_obj_store(si->array->xfile, scratch, len, lo_pos); +} + +/* Grab a page for sorting records. */ +static inline int +xfarray_sort_get_page( + struct xfarray_sortinfo *si, + loff_t pos, + uint64_t len) +{ + int error; + + error = xfile_get_page(si->array->xfile, pos, len, &si->xfpage); + if (error) + return error; + + /* + * xfile pages must never be mapped into userspace, so we skip the + * dcache flush when mapping the page. + */ + si->page_kaddr = kmap_local_page(si->xfpage.page); + return 0; +} + +/* Release a page we grabbed for sorting records. */ +static inline int +xfarray_sort_put_page( + struct xfarray_sortinfo *si) +{ + if (!si->page_kaddr) + return 0; + + kunmap_local(si->page_kaddr); + si->page_kaddr = NULL; + + return xfile_put_page(si->array->xfile, &si->xfpage); +} + +/* Decide if these records are eligible for in-page sorting. */ +static inline bool +xfarray_want_pagesort( + struct xfarray_sortinfo *si, + xfarray_idx_t lo, + xfarray_idx_t hi) +{ + pgoff_t lo_page; + pgoff_t hi_page; + loff_t end_pos; + + /* We can only map one page at a time. */ + lo_page = xfarray_pos(si->array, lo) >> PAGE_SHIFT; + end_pos = xfarray_pos(si->array, hi) + si->array->obj_size - 1; + hi_page = end_pos >> PAGE_SHIFT; + + return lo_page == hi_page; +} + +/* Sort a bunch of records that all live in the same memory page. */ +STATIC int +xfarray_pagesort( + struct xfarray_sortinfo *si, + xfarray_idx_t lo, + xfarray_idx_t hi) +{ + void *startp; + loff_t lo_pos = xfarray_pos(si->array, lo); + uint64_t len = xfarray_pos(si->array, hi - lo); + int error = 0; + + trace_xfarray_pagesort(si, lo, hi); + + xfarray_sort_bump_loads(si); + error = xfarray_sort_get_page(si, lo_pos, len); + if (error) + return error; + + xfarray_sort_bump_heapsorts(si); + startp = si->page_kaddr + offset_in_page(lo_pos); + sort(startp, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL); + + xfarray_sort_bump_stores(si); + return xfarray_sort_put_page(si); +} + +/* Return a pointer to the xfarray pivot record within the sortinfo struct. */ +static inline void *xfarray_sortinfo_pivot(struct xfarray_sortinfo *si) +{ + return xfarray_sortinfo_hi(si) + si->max_stack_depth; +} + +/* Return a pointer to the start of the pivot array. */ +static inline void * +xfarray_sortinfo_pivot_array( + struct xfarray_sortinfo *si) +{ + return xfarray_sortinfo_pivot(si) + si->array->obj_size; +} + +/* The xfarray record is stored at the start of each pivot array element. */ +static inline void * +xfarray_pivot_array_rec( + void *pa, + size_t pa_recsz, + unsigned int pa_idx) +{ + return pa + (pa_recsz * pa_idx); +} + +/* The xfarray index is stored at the end of each pivot array element. */ +static inline xfarray_idx_t * +xfarray_pivot_array_idx( + void *pa, + size_t pa_recsz, + unsigned int pa_idx) +{ + return xfarray_pivot_array_rec(pa, pa_recsz, pa_idx + 1) - + sizeof(xfarray_idx_t); +} + +/* + * Find a pivot value for quicksort partitioning, swap it with a[lo], and save + * the cached pivot record for the next step. + * + * Load evenly-spaced records within the given range into memory, sort them, + * and choose the pivot from the median record. Using multiple points will + * improve the quality of the pivot selection, and hopefully avoid the worst + * quicksort behavior, since our array values are nearly always evenly sorted. + */ +STATIC int +xfarray_qsort_pivot( + struct xfarray_sortinfo *si, + xfarray_idx_t lo, + xfarray_idx_t hi) +{ + void *pivot = xfarray_sortinfo_pivot(si); + void *parray = xfarray_sortinfo_pivot_array(si); + void *recp; + xfarray_idx_t *idxp; + xfarray_idx_t step = (hi - lo) / (XFARRAY_QSORT_PIVOT_NR - 1); + size_t pivot_rec_sz = xfarray_pivot_rec_sz(si->array); + int i, j; + int error; + + ASSERT(step > 0); + + /* + * Load the xfarray indexes of the records we intend to sample into the + * pivot array. + */ + idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, 0); + *idxp = lo; + for (i = 1; i < XFARRAY_QSORT_PIVOT_NR - 1; i++) { + idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, i); + *idxp = lo + (i * step); + } + idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, + XFARRAY_QSORT_PIVOT_NR - 1); + *idxp = hi; + + /* Load the selected xfarray records into the pivot array. */ + for (i = 0; i < XFARRAY_QSORT_PIVOT_NR; i++) { + xfarray_idx_t idx; + + recp = xfarray_pivot_array_rec(parray, pivot_rec_sz, i); + idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, i); + + /* No unset records; load directly into the array. */ + if (likely(si->array->unset_slots == 0)) { + error = xfarray_sort_load(si, *idxp, recp); + if (error) + return error; + continue; + } + + /* + * Load non-null records into the scratchpad without changing + * the xfarray_idx_t in the pivot array. + */ + idx = *idxp; + xfarray_sort_bump_loads(si); + error = xfarray_load_next(si->array, &idx, recp); + if (error) + return error; + } + + xfarray_sort_bump_heapsorts(si); + sort(parray, XFARRAY_QSORT_PIVOT_NR, pivot_rec_sz, si->cmp_fn, NULL); + + /* + * We sorted the pivot array records (which includes the xfarray + * indices) in xfarray record order. The median element of the pivot + * array contains the xfarray record that we will use as the pivot. + * Copy that xfarray record to the designated space. + */ + recp = xfarray_pivot_array_rec(parray, pivot_rec_sz, + XFARRAY_QSORT_PIVOT_NR / 2); + memcpy(pivot, recp, si->array->obj_size); + + /* If the pivot record we chose was already in a[lo] then we're done. */ + idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, + XFARRAY_QSORT_PIVOT_NR / 2); + if (*idxp == lo) + return 0; + + /* + * Find the cached copy of a[lo] in the pivot array so that we can swap + * a[lo] and a[pivot]. + */ + for (i = 0, j = -1; i < XFARRAY_QSORT_PIVOT_NR; i++) { + idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, i); + if (*idxp == lo) + j = i; + } + if (j < 0) { + ASSERT(j >= 0); + return -EFSCORRUPTED; + } + + /* Swap a[lo] and a[pivot]. */ + error = xfarray_sort_store(si, lo, pivot); + if (error) + return error; + + recp = xfarray_pivot_array_rec(parray, pivot_rec_sz, j); + idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, + XFARRAY_QSORT_PIVOT_NR / 2); + return xfarray_sort_store(si, *idxp, recp); +} + +/* + * Set up the pointers for the next iteration. We push onto the stack all of + * the unsorted values between a[lo + 1] and a[end[i]], and we tweak the + * current stack frame to point to the unsorted values between a[beg[i]] and + * a[lo] so that those values will be sorted when we pop the stack. + */ +static inline int +xfarray_qsort_push( + struct xfarray_sortinfo *si, + xfarray_idx_t *si_lo, + xfarray_idx_t *si_hi, + xfarray_idx_t lo, + xfarray_idx_t hi) +{ + /* Check for stack overflows */ + if (si->stack_depth >= si->max_stack_depth - 1) { + ASSERT(si->stack_depth < si->max_stack_depth - 1); + return -EFSCORRUPTED; + } + + si->max_stack_used = max_t(uint8_t, si->max_stack_used, + si->stack_depth + 2); + + si_lo[si->stack_depth + 1] = lo + 1; + si_hi[si->stack_depth + 1] = si_hi[si->stack_depth]; + si_hi[si->stack_depth++] = lo - 1; + + /* + * Always start with the smaller of the two partitions to keep the + * amount of recursion in check. + */ + if (si_hi[si->stack_depth] - si_lo[si->stack_depth] > + si_hi[si->stack_depth - 1] - si_lo[si->stack_depth - 1]) { + swap(si_lo[si->stack_depth], si_lo[si->stack_depth - 1]); + swap(si_hi[si->stack_depth], si_hi[si->stack_depth - 1]); + } + + return 0; +} + +/* + * Load an element from the array into the first scratchpad and cache the page, + * if possible. + */ +static inline int +xfarray_sort_load_cached( + struct xfarray_sortinfo *si, + xfarray_idx_t idx, + void *ptr) +{ + loff_t idx_pos = xfarray_pos(si->array, idx); + pgoff_t startpage; + pgoff_t endpage; + int error = 0; + + /* + * If this load would split a page, release the cached page, if any, + * and perform a traditional read. + */ + startpage = idx_pos >> PAGE_SHIFT; + endpage = (idx_pos + si->array->obj_size - 1) >> PAGE_SHIFT; + if (startpage != endpage) { + error = xfarray_sort_put_page(si); + if (error) + return error; + + if (xfarray_sort_terminated(si, &error)) + return error; + + return xfile_obj_load(si->array->xfile, ptr, + si->array->obj_size, idx_pos); + } + + /* If the cached page is not the one we want, release it. */ + if (xfile_page_cached(&si->xfpage) && + xfile_page_index(&si->xfpage) != startpage) { + error = xfarray_sort_put_page(si); + if (error) + return error; + } + + /* + * If we don't have a cached page (and we know the load is contained + * in a single page) then grab it. + */ + if (!xfile_page_cached(&si->xfpage)) { + if (xfarray_sort_terminated(si, &error)) + return error; + + error = xfarray_sort_get_page(si, startpage << PAGE_SHIFT, + PAGE_SIZE); + if (error) + return error; + } + + memcpy(ptr, si->page_kaddr + offset_in_page(idx_pos), + si->array->obj_size); + return 0; +} + +/* + * Sort the array elements via quicksort. This implementation incorporates + * four optimizations discussed in Sedgewick: + * + * 1. Use an explicit stack of array indices to store the next array partition + * to sort. This helps us to avoid recursion in the call stack, which is + * particularly expensive in the kernel. + * + * 2. For arrays with records in arbitrary or user-controlled order, choose the + * pivot element using a median-of-nine decision tree. This reduces the + * probability of selecting a bad pivot value which causes worst case + * behavior (i.e. partition sizes of 1). + * + * 3. The smaller of the two sub-partitions is pushed onto the stack to start + * the next level of recursion, and the larger sub-partition replaces the + * current stack frame. This guarantees that we won't need more than + * log2(nr) stack space. + * + * 4. For small sets, load the records into the scratchpad and run heapsort on + * them because that is very fast. In the author's experience, this yields + * a ~10% reduction in runtime. + * + * If a small set is contained entirely within a single xfile memory page, + * map the page directly and run heap sort directly on the xfile page + * instead of using the load/store interface. This halves the runtime. + * + * 5. This optimization is specific to the implementation. When converging lo + * and hi after selecting a pivot, we will try to retain the xfile memory + * page between load calls, which reduces run time by 50%. + */ + +/* + * Due to the use of signed indices, we can only support up to 2^63 records. + * Files can only grow to 2^63 bytes, so this is not much of a limitation. + */ +#define QSORT_MAX_RECS (1ULL << 63) + +int +xfarray_sort( + struct xfarray *array, + xfarray_cmp_fn cmp_fn, + unsigned int flags) +{ + struct xfarray_sortinfo *si; + xfarray_idx_t *si_lo, *si_hi; + void *pivot; + void *scratch = xfarray_scratch(array); + xfarray_idx_t lo, hi; + int error = 0; + + if (array->nr < 2) + return 0; + if (array->nr >= QSORT_MAX_RECS) + return -E2BIG; + + error = xfarray_sortinfo_alloc(array, cmp_fn, flags, &si); + if (error) + return error; + si_lo = xfarray_sortinfo_lo(si); + si_hi = xfarray_sortinfo_hi(si); + pivot = xfarray_sortinfo_pivot(si); + + while (si->stack_depth >= 0) { + lo = si_lo[si->stack_depth]; + hi = si_hi[si->stack_depth]; + + trace_xfarray_qsort(si, lo, hi); + + /* Nothing left in this partition to sort; pop stack. */ + if (lo >= hi) { + si->stack_depth--; + continue; + } + + /* + * If directly mapping the page and sorting can solve our + * problems, we're done. + */ + if (xfarray_want_pagesort(si, lo, hi)) { + error = xfarray_pagesort(si, lo, hi); + if (error) + goto out_free; + si->stack_depth--; + continue; + } + + /* If insertion sort can solve our problems, we're done. */ + if (xfarray_want_isort(si, lo, hi)) { + error = xfarray_isort(si, lo, hi); + if (error) + goto out_free; + si->stack_depth--; + continue; + } + + /* Pick a pivot, move it to a[lo] and stash it. */ + error = xfarray_qsort_pivot(si, lo, hi); + if (error) + goto out_free; + + /* + * Rearrange a[lo..hi] such that everything smaller than the + * pivot is on the left side of the range and everything larger + * than the pivot is on the right side of the range. + */ + while (lo < hi) { + /* + * Decrement hi until it finds an a[hi] less than the + * pivot value. + */ + error = xfarray_sort_load_cached(si, hi, scratch); + if (error) + goto out_free; + while (xfarray_sort_cmp(si, scratch, pivot) >= 0 && + lo < hi) { + hi--; + error = xfarray_sort_load_cached(si, hi, + scratch); + if (error) + goto out_free; + } + error = xfarray_sort_put_page(si); + if (error) + goto out_free; + + if (xfarray_sort_terminated(si, &error)) + goto out_free; + + /* Copy that item (a[hi]) to a[lo]. */ + if (lo < hi) { + error = xfarray_sort_store(si, lo++, scratch); + if (error) + goto out_free; + } + + /* + * Increment lo until it finds an a[lo] greater than + * the pivot value. + */ + error = xfarray_sort_load_cached(si, lo, scratch); + if (error) + goto out_free; + while (xfarray_sort_cmp(si, scratch, pivot) <= 0 && + lo < hi) { + lo++; + error = xfarray_sort_load_cached(si, lo, + scratch); + if (error) + goto out_free; + } + error = xfarray_sort_put_page(si); + if (error) + goto out_free; + + if (xfarray_sort_terminated(si, &error)) + goto out_free; + + /* Copy that item (a[lo]) to a[hi]. */ + if (lo < hi) { + error = xfarray_sort_store(si, hi--, scratch); + if (error) + goto out_free; + } + + if (xfarray_sort_terminated(si, &error)) + goto out_free; + } + + /* + * Put our pivot value in the correct place at a[lo]. All + * values between a[beg[i]] and a[lo - 1] should be less than + * the pivot; and all values between a[lo + 1] and a[end[i]-1] + * should be greater than the pivot. + */ + error = xfarray_sort_store(si, lo, pivot); + if (error) + goto out_free; + + /* Set up the stack frame to process the two partitions. */ + error = xfarray_qsort_push(si, si_lo, si_hi, lo, hi); + if (error) + goto out_free; + + if (xfarray_sort_terminated(si, &error)) + goto out_free; + } + +out_free: + trace_xfarray_sort_stats(si, error); + kvfree(si); + return error; +} diff --git a/fs/xfs/scrub/xfarray.h b/fs/xfs/scrub/xfarray.h new file mode 100644 index 000000000000..4ecac01363d9 --- /dev/null +++ b/fs/xfs/scrub/xfarray.h @@ -0,0 +1,141 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2021-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_XFARRAY_H__ +#define __XFS_SCRUB_XFARRAY_H__ + +/* xfile array index type, along with cursor initialization */ +typedef uint64_t xfarray_idx_t; +#define XFARRAY_CURSOR_INIT ((__force xfarray_idx_t)0) + +/* Iterate each index of an xfile array. */ +#define foreach_xfarray_idx(array, idx) \ + for ((idx) = XFARRAY_CURSOR_INIT; \ + (idx) < xfarray_length(array); \ + (idx)++) + +struct xfarray { + /* Underlying file that backs the array. */ + struct xfile *xfile; + + /* Number of array elements. */ + xfarray_idx_t nr; + + /* Maximum possible array size. */ + xfarray_idx_t max_nr; + + /* Number of unset slots in the array below @nr. */ + uint64_t unset_slots; + + /* Size of an array element. */ + size_t obj_size; + + /* log2 of array element size, if possible. */ + int obj_size_log; +}; + +int xfarray_create(const char *descr, unsigned long long required_capacity, + size_t obj_size, struct xfarray **arrayp); +void xfarray_destroy(struct xfarray *array); +int xfarray_load(struct xfarray *array, xfarray_idx_t idx, void *ptr); +int xfarray_unset(struct xfarray *array, xfarray_idx_t idx); +int xfarray_store(struct xfarray *array, xfarray_idx_t idx, const void *ptr); +int xfarray_store_anywhere(struct xfarray *array, const void *ptr); +bool xfarray_element_is_null(struct xfarray *array, const void *ptr); + +/* Append an element to the array. */ +static inline int xfarray_append(struct xfarray *array, const void *ptr) +{ + return xfarray_store(array, array->nr, ptr); +} + +uint64_t xfarray_length(struct xfarray *array); +int xfarray_load_next(struct xfarray *array, xfarray_idx_t *idx, void *rec); + +/* Declarations for xfile array sort functionality. */ + +typedef cmp_func_t xfarray_cmp_fn; + +/* Perform an in-memory heapsort for small subsets. */ +#define XFARRAY_ISORT_SHIFT (4) +#define XFARRAY_ISORT_NR (1U << XFARRAY_ISORT_SHIFT) + +/* Evalulate this many points to find the qsort pivot. */ +#define XFARRAY_QSORT_PIVOT_NR (9) + +struct xfarray_sortinfo { + struct xfarray *array; + + /* Comparison function for the sort. */ + xfarray_cmp_fn cmp_fn; + + /* Maximum height of the partition stack. */ + uint8_t max_stack_depth; + + /* Current height of the partition stack. */ + int8_t stack_depth; + + /* Maximum stack depth ever used. */ + uint8_t max_stack_used; + + /* XFARRAY_SORT_* flags; see below. */ + unsigned int flags; + + /* Cache a page here for faster access. */ + struct xfile_page xfpage; + void *page_kaddr; + +#ifdef DEBUG + /* Performance statistics. */ + uint64_t loads; + uint64_t stores; + uint64_t compares; + uint64_t heapsorts; +#endif + /* + * Extra bytes are allocated beyond the end of the structure to store + * quicksort information. C does not permit multiple VLAs per struct, + * so we document all of this in a comment. + * + * Pretend that we have a typedef for array records: + * + * typedef char[array->obj_size] xfarray_rec_t; + * + * First comes the quicksort partition stack: + * + * xfarray_idx_t lo[max_stack_depth]; + * xfarray_idx_t hi[max_stack_depth]; + * + * union { + * + * If for a given subset we decide to use an in-memory sort, we use a + * block of scratchpad records here to compare items: + * + * xfarray_rec_t scratch[ISORT_NR]; + * + * Otherwise, we want to partition the records to partition the array. + * We store the chosen pivot record at the start of the scratchpad area + * and use the rest to sample some records to estimate the median. + * The format of the qsort_pivot array enables us to use the kernel + * heapsort function to place the median value in the middle. + * + * struct { + * xfarray_rec_t pivot; + * struct { + * xfarray_rec_t rec; (rounded up to 8 bytes) + * xfarray_idx_t idx; + * } qsort_pivot[QSORT_PIVOT_NR]; + * }; + * } + */ +}; + +/* Sort can be interrupted by a fatal signal. */ +#define XFARRAY_SORT_KILLABLE (1U << 0) + +int xfarray_sort(struct xfarray *array, xfarray_cmp_fn cmp_fn, + unsigned int flags); + +#endif /* __XFS_SCRUB_XFARRAY_H__ */ diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c new file mode 100644 index 000000000000..d98e8e77c684 --- /dev/null +++ b/fs/xfs/scrub/xfile.c @@ -0,0 +1,420 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_format.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/scrub.h" +#include "scrub/trace.h" +#include <linux/shmem_fs.h> + +/* + * Swappable Temporary Memory + * ========================== + * + * Online checking sometimes needs to be able to stage a large amount of data + * in memory. This information might not fit in the available memory and it + * doesn't all need to be accessible at all times. In other words, we want an + * indexed data buffer to store data that can be paged out. + * + * When CONFIG_TMPFS=y, shmemfs is enough of a filesystem to meet those + * requirements. Therefore, the xfile mechanism uses an unlinked shmem file to + * store our staging data. This file is not installed in the file descriptor + * table so that user programs cannot access the data, which means that the + * xfile must be freed with xfile_destroy. + * + * xfiles assume that the caller will handle all required concurrency + * management; standard vfs locks (freezer and inode) are not taken. Reads + * and writes are satisfied directly from the page cache. + * + * NOTE: The current shmemfs implementation has a quirk that in-kernel reads + * of a hole cause a page to be mapped into the file. If you are going to + * create a sparse xfile, please be careful about reading from uninitialized + * parts of the file. These pages are !Uptodate and will eventually be + * reclaimed if not written, but in the short term this boosts memory + * consumption. + */ + +/* + * xfiles must not be exposed to userspace and require upper layers to + * coordinate access to the one handle returned by the constructor, so + * establish a separate lock class for xfiles to avoid confusing lockdep. + */ +static struct lock_class_key xfile_i_mutex_key; + +/* + * Create an xfile of the given size. The description will be used in the + * trace output. + */ +int +xfile_create( + const char *description, + loff_t isize, + struct xfile **xfilep) +{ + struct inode *inode; + struct xfile *xf; + int error = -ENOMEM; + + xf = kmalloc(sizeof(struct xfile), XCHK_GFP_FLAGS); + if (!xf) + return -ENOMEM; + + xf->file = shmem_file_setup(description, isize, 0); + if (!xf->file) + goto out_xfile; + if (IS_ERR(xf->file)) { + error = PTR_ERR(xf->file); + goto out_xfile; + } + + /* + * We want a large sparse file that we can pread, pwrite, and seek. + * xfile users are responsible for keeping the xfile hidden away from + * all other callers, so we skip timestamp updates and security checks. + * Make the inode only accessible by root, just in case the xfile ever + * escapes. + */ + xf->file->f_mode |= FMODE_PREAD | FMODE_PWRITE | FMODE_NOCMTIME | + FMODE_LSEEK; + xf->file->f_flags |= O_RDWR | O_LARGEFILE | O_NOATIME; + inode = file_inode(xf->file); + inode->i_flags |= S_PRIVATE | S_NOCMTIME | S_NOATIME; + inode->i_mode &= ~0177; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + + lockdep_set_class(&inode->i_rwsem, &xfile_i_mutex_key); + + trace_xfile_create(xf); + + *xfilep = xf; + return 0; +out_xfile: + kfree(xf); + return error; +} + +/* Close the file and release all resources. */ +void +xfile_destroy( + struct xfile *xf) +{ + struct inode *inode = file_inode(xf->file); + + trace_xfile_destroy(xf); + + lockdep_set_class(&inode->i_rwsem, &inode->i_sb->s_type->i_mutex_key); + fput(xf->file); + kfree(xf); +} + +/* + * Read a memory object directly from the xfile's page cache. Unlike regular + * pread, we return -E2BIG and -EFBIG for reads that are too large or at too + * high an offset, instead of truncating the read. Otherwise, we return + * bytes read or an error code, like regular pread. + */ +ssize_t +xfile_pread( + struct xfile *xf, + void *buf, + size_t count, + loff_t pos) +{ + struct inode *inode = file_inode(xf->file); + struct address_space *mapping = inode->i_mapping; + struct page *page = NULL; + ssize_t read = 0; + unsigned int pflags; + int error = 0; + + if (count > MAX_RW_COUNT) + return -E2BIG; + if (inode->i_sb->s_maxbytes - pos < count) + return -EFBIG; + + trace_xfile_pread(xf, pos, count); + + pflags = memalloc_nofs_save(); + while (count > 0) { + void *p, *kaddr; + unsigned int len; + + len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos)); + + /* + * In-kernel reads of a shmem file cause it to allocate a page + * if the mapping shows a hole. Therefore, if we hit ENOMEM + * we can continue by zeroing the caller's buffer. + */ + page = shmem_read_mapping_page_gfp(mapping, pos >> PAGE_SHIFT, + __GFP_NOWARN); + if (IS_ERR(page)) { + error = PTR_ERR(page); + if (error != -ENOMEM) + break; + + memset(buf, 0, len); + goto advance; + } + + if (PageUptodate(page)) { + /* + * xfile pages must never be mapped into userspace, so + * we skip the dcache flush. + */ + kaddr = kmap_local_page(page); + p = kaddr + offset_in_page(pos); + memcpy(buf, p, len); + kunmap_local(kaddr); + } else { + memset(buf, 0, len); + } + put_page(page); + +advance: + count -= len; + pos += len; + buf += len; + read += len; + } + memalloc_nofs_restore(pflags); + + if (read > 0) + return read; + return error; +} + +/* + * Write a memory object directly to the xfile's page cache. Unlike regular + * pwrite, we return -E2BIG and -EFBIG for writes that are too large or at too + * high an offset, instead of truncating the write. Otherwise, we return + * bytes written or an error code, like regular pwrite. + */ +ssize_t +xfile_pwrite( + struct xfile *xf, + const void *buf, + size_t count, + loff_t pos) +{ + struct inode *inode = file_inode(xf->file); + struct address_space *mapping = inode->i_mapping; + const struct address_space_operations *aops = mapping->a_ops; + struct page *page = NULL; + ssize_t written = 0; + unsigned int pflags; + int error = 0; + + if (count > MAX_RW_COUNT) + return -E2BIG; + if (inode->i_sb->s_maxbytes - pos < count) + return -EFBIG; + + trace_xfile_pwrite(xf, pos, count); + + pflags = memalloc_nofs_save(); + while (count > 0) { + void *fsdata = NULL; + void *p, *kaddr; + unsigned int len; + int ret; + + len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos)); + + /* + * We call write_begin directly here to avoid all the freezer + * protection lock-taking that happens in the normal path. + * shmem doesn't support fs freeze, but lockdep doesn't know + * that and will trip over that. + */ + error = aops->write_begin(NULL, mapping, pos, len, &page, + &fsdata); + if (error) + break; + + /* + * xfile pages must never be mapped into userspace, so we skip + * the dcache flush. If the page is not uptodate, zero it + * before writing data. + */ + kaddr = kmap_local_page(page); + if (!PageUptodate(page)) { + memset(kaddr, 0, PAGE_SIZE); + SetPageUptodate(page); + } + p = kaddr + offset_in_page(pos); + memcpy(p, buf, len); + kunmap_local(kaddr); + + ret = aops->write_end(NULL, mapping, pos, len, len, page, + fsdata); + if (ret < 0) { + error = ret; + break; + } + + written += ret; + if (ret != len) + break; + + count -= ret; + pos += ret; + buf += ret; + } + memalloc_nofs_restore(pflags); + + if (written > 0) + return written; + return error; +} + +/* Find the next written area in the xfile data for a given offset. */ +loff_t +xfile_seek_data( + struct xfile *xf, + loff_t pos) +{ + loff_t ret; + + ret = vfs_llseek(xf->file, pos, SEEK_DATA); + trace_xfile_seek_data(xf, pos, ret); + return ret; +} + +/* Query stat information for an xfile. */ +int +xfile_stat( + struct xfile *xf, + struct xfile_stat *statbuf) +{ + struct kstat ks; + int error; + + error = vfs_getattr_nosec(&xf->file->f_path, &ks, + STATX_SIZE | STATX_BLOCKS, AT_STATX_DONT_SYNC); + if (error) + return error; + + statbuf->size = ks.size; + statbuf->bytes = ks.blocks << SECTOR_SHIFT; + return 0; +} + +/* + * Grab the (locked) page for a memory object. The object cannot span a page + * boundary. Returns 0 (and a locked page) if successful, -ENOTBLK if we + * cannot grab the page, or the usual negative errno. + */ +int +xfile_get_page( + struct xfile *xf, + loff_t pos, + unsigned int len, + struct xfile_page *xfpage) +{ + struct inode *inode = file_inode(xf->file); + struct address_space *mapping = inode->i_mapping; + const struct address_space_operations *aops = mapping->a_ops; + struct page *page = NULL; + void *fsdata = NULL; + loff_t key = round_down(pos, PAGE_SIZE); + unsigned int pflags; + int error; + + if (inode->i_sb->s_maxbytes - pos < len) + return -ENOMEM; + if (len > PAGE_SIZE - offset_in_page(pos)) + return -ENOTBLK; + + trace_xfile_get_page(xf, pos, len); + + pflags = memalloc_nofs_save(); + + /* + * We call write_begin directly here to avoid all the freezer + * protection lock-taking that happens in the normal path. shmem + * doesn't support fs freeze, but lockdep doesn't know that and will + * trip over that. + */ + error = aops->write_begin(NULL, mapping, key, PAGE_SIZE, &page, + &fsdata); + if (error) + goto out_pflags; + + /* We got the page, so make sure we push out EOF. */ + if (i_size_read(inode) < pos + len) + i_size_write(inode, pos + len); + + /* + * If the page isn't up to date, fill it with zeroes before we hand it + * to the caller and make sure the backing store will hold on to them. + */ + if (!PageUptodate(page)) { + void *kaddr; + + kaddr = kmap_local_page(page); + memset(kaddr, 0, PAGE_SIZE); + kunmap_local(kaddr); + SetPageUptodate(page); + } + + /* + * Mark each page dirty so that the contents are written to some + * backing store when we drop this buffer, and take an extra reference + * to prevent the xfile page from being swapped or removed from the + * page cache by reclaim if the caller unlocks the page. + */ + set_page_dirty(page); + get_page(page); + + xfpage->page = page; + xfpage->fsdata = fsdata; + xfpage->pos = key; +out_pflags: + memalloc_nofs_restore(pflags); + return error; +} + +/* + * Release the (locked) page for a memory object. Returns 0 or a negative + * errno. + */ +int +xfile_put_page( + struct xfile *xf, + struct xfile_page *xfpage) +{ + struct inode *inode = file_inode(xf->file); + struct address_space *mapping = inode->i_mapping; + const struct address_space_operations *aops = mapping->a_ops; + unsigned int pflags; + int ret; + + trace_xfile_put_page(xf, xfpage->pos, PAGE_SIZE); + + /* Give back the reference that we took in xfile_get_page. */ + put_page(xfpage->page); + + pflags = memalloc_nofs_save(); + ret = aops->write_end(NULL, mapping, xfpage->pos, PAGE_SIZE, PAGE_SIZE, + xfpage->page, xfpage->fsdata); + memalloc_nofs_restore(pflags); + memset(xfpage, 0, sizeof(struct xfile_page)); + + if (ret < 0) + return ret; + if (ret != PAGE_SIZE) + return -EIO; + return 0; +} diff --git a/fs/xfs/scrub/xfile.h b/fs/xfs/scrub/xfile.h new file mode 100644 index 000000000000..d56643b0f429 --- /dev/null +++ b/fs/xfs/scrub/xfile.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_XFILE_H__ +#define __XFS_SCRUB_XFILE_H__ + +struct xfile_page { + struct page *page; + void *fsdata; + loff_t pos; +}; + +static inline bool xfile_page_cached(const struct xfile_page *xfpage) +{ + return xfpage->page != NULL; +} + +static inline pgoff_t xfile_page_index(const struct xfile_page *xfpage) +{ + return xfpage->page->index; +} + +struct xfile { + struct file *file; +}; + +int xfile_create(const char *description, loff_t isize, struct xfile **xfilep); +void xfile_destroy(struct xfile *xf); + +ssize_t xfile_pread(struct xfile *xf, void *buf, size_t count, loff_t pos); +ssize_t xfile_pwrite(struct xfile *xf, const void *buf, size_t count, + loff_t pos); + +/* + * Load an object. Since we're treating this file as "memory", any error or + * short IO is treated as a failure to allocate memory. + */ +static inline int +xfile_obj_load(struct xfile *xf, void *buf, size_t count, loff_t pos) +{ + ssize_t ret = xfile_pread(xf, buf, count, pos); + + if (ret < 0 || ret != count) + return -ENOMEM; + return 0; +} + +/* + * Store an object. Since we're treating this file as "memory", any error or + * short IO is treated as a failure to allocate memory. + */ +static inline int +xfile_obj_store(struct xfile *xf, const void *buf, size_t count, loff_t pos) +{ + ssize_t ret = xfile_pwrite(xf, buf, count, pos); + + if (ret < 0 || ret != count) + return -ENOMEM; + return 0; +} + +loff_t xfile_seek_data(struct xfile *xf, loff_t pos); + +struct xfile_stat { + loff_t size; + unsigned long long bytes; +}; + +int xfile_stat(struct xfile *xf, struct xfile_stat *statbuf); + +int xfile_get_page(struct xfile *xf, loff_t offset, unsigned int len, + struct xfile_page *xbuf); +int xfile_put_page(struct xfile *xf, struct xfile_page *xbuf); + +#endif /* __XFS_SCRUB_XFILE_H__ */ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 2fca4b4e7fd8..465d7630bb21 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -478,7 +478,7 @@ xfs_discard_folio( folio, ip->i_ino, pos); /* - * The end of the punch range is always the offset of the the first + * The end of the punch range is always the offset of the first * byte of the next folio. Hence the end offset is only dependent on the * folio itself and not the start offset that is passed in. */ diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 3b903f6bce98..c1ece4a08ff4 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -481,7 +481,8 @@ _xfs_buf_obj_cmp( * reallocating a busy extent. Skip this buffer and * continue searching for an exact match. */ - ASSERT(bp->b_flags & XBF_STALE); + if (!(map->bm_flags & XBM_LIVESCAN)) + ASSERT(bp->b_flags & XBF_STALE); return 1; } return 0; @@ -559,6 +560,10 @@ xfs_buf_find_lock( * intact here. */ if (bp->b_flags & XBF_STALE) { + if (flags & XBF_LIVESCAN) { + xfs_buf_unlock(bp); + return -ENOENT; + } ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); bp->b_flags &= _XBF_KMEM | _XBF_PAGES; bp->b_ops = NULL; @@ -682,6 +687,8 @@ xfs_buf_get_map( int error; int i; + if (flags & XBF_LIVESCAN) + cmap.bm_flags |= XBM_LIVESCAN; for (i = 0; i < nmaps; i++) cmap.bm_len += map[i].bm_len; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 549c60942208..df8f47953bb4 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -44,6 +44,11 @@ struct xfs_buf; #define _XBF_DELWRI_Q (1u << 22)/* buffer on a delwri queue */ /* flags used only as arguments to access routines */ +/* + * Online fsck is scanning the buffer cache for live buffers. Do not warn + * about length mismatches during lookups and do not return stale buffers. + */ +#define XBF_LIVESCAN (1u << 28) #define XBF_INCORE (1u << 29)/* lookup only, return if found in cache */ #define XBF_TRYLOCK (1u << 30)/* lock requested, but do not wait */ #define XBF_UNMAPPED (1u << 31)/* do not map the buffer */ @@ -67,6 +72,7 @@ typedef unsigned int xfs_buf_flags_t; { _XBF_KMEM, "KMEM" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" }, \ /* The following interface flags should never be set */ \ + { XBF_LIVESCAN, "LIVESCAN" }, \ { XBF_INCORE, "INCORE" }, \ { XBF_TRYLOCK, "TRYLOCK" }, \ { XBF_UNMAPPED, "UNMAPPED" } @@ -114,8 +120,15 @@ typedef struct xfs_buftarg { struct xfs_buf_map { xfs_daddr_t bm_bn; /* block number for I/O */ int bm_len; /* size of I/O */ + unsigned int bm_flags; }; +/* + * Online fsck is scanning the buffer cache for live buffers. Do not warn + * about length mismatches during lookups and do not return stale buffers. + */ +#define XBM_LIVESCAN (1U << 0) + #define DEFINE_SINGLE_BUF_MAP(map, blkno, numblk) \ struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) }; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 7f071757f278..ac6ba646624d 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -1386,7 +1386,7 @@ xfs_qm_dqiterate( return error; error = iter_fn(dq, type, priv); - id = dq->q_id; + id = dq->q_id + 1; xfs_qm_dqput(dq); } while (error == 0 && id != 0); diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 453890942d9f..e541f5c0bc25 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -803,44 +803,6 @@ out_error_or_again: } /* - * "Is this a cached inode that's also allocated?" - * - * Look up an inode by number in the given file system. If the inode is - * in cache and isn't in purgatory, return 1 if the inode is allocated - * and 0 if it is not. For all other cases (not in cache, being torn - * down, etc.), return a negative error code. - * - * The caller has to prevent inode allocation and freeing activity, - * presumably by locking the AGI buffer. This is to ensure that an - * inode cannot transition from allocated to freed until the caller is - * ready to allow that. If the inode is in an intermediate state (new, - * reclaimable, or being reclaimed), -EAGAIN will be returned; if the - * inode is not in the cache, -ENOENT will be returned. The caller must - * deal with these scenarios appropriately. - * - * This is a specialized use case for the online scrubber; if you're - * reading this, you probably want xfs_iget. - */ -int -xfs_icache_inode_is_allocated( - struct xfs_mount *mp, - struct xfs_trans *tp, - xfs_ino_t ino, - bool *inuse) -{ - struct xfs_inode *ip; - int error; - - error = xfs_iget(mp, tp, ino, XFS_IGET_INCORE, 0, &ip); - if (error) - return error; - - *inuse = !!(VFS_I(ip)->i_mode); - xfs_irele(ip); - return 0; -} - -/* * Grab the inode for reclaim exclusively. * * We have found this inode via a lookup under RCU, so the inode may have diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 1dcdcb23796e..2fa6f2e09d07 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -71,10 +71,6 @@ void xfs_inode_set_cowblocks_tag(struct xfs_inode *ip); void xfs_inode_clear_cowblocks_tag(struct xfs_inode *ip); void xfs_blockgc_worker(struct work_struct *work); - -int xfs_icache_inode_is_allocated(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_ino_t ino, bool *inuse); - void xfs_blockgc_stop(struct xfs_mount *mp); void xfs_blockgc_start(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 74dcb05069e8..e9d317a3dafe 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -63,6 +63,7 @@ typedef __u32 xfs_nlink_t; #include <linux/rhashtable.h> #include <linux/xattr.h> #include <linux/mnt_idmapping.h> +#include <linux/debugfs.h> #include <asm/page.h> #include <asm/div64.h> diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index fb87ffb48f7f..0a0fd19573d8 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -34,6 +34,7 @@ #include "xfs_health.h" #include "xfs_trace.h" #include "xfs_ag.h" +#include "scrub/stats.h" static DEFINE_MUTEX(xfs_uuid_table_mutex); static int xfs_uuid_table_size; @@ -716,9 +717,11 @@ xfs_mountfs( if (error) goto out_remove_sysfs; + xchk_stats_register(mp->m_scrub_stats, mp->m_debugfs); + error = xfs_error_sysfs_init(mp); if (error) - goto out_del_stats; + goto out_remove_scrub_stats; error = xfs_errortag_init(mp); if (error) @@ -1033,7 +1036,8 @@ xfs_mountfs( xfs_errortag_del(mp); out_remove_error_sysfs: xfs_error_sysfs_del(mp); - out_del_stats: + out_remove_scrub_stats: + xchk_stats_unregister(mp->m_scrub_stats); xfs_sysfs_del(&mp->m_stats.xs_kobj); out_remove_sysfs: xfs_sysfs_del(&mp->m_kobj); @@ -1105,6 +1109,7 @@ xfs_unmountfs( xfs_errortag_del(mp); xfs_error_sysfs_del(mp); + xchk_stats_unregister(mp->m_scrub_stats); xfs_sysfs_del(&mp->m_stats.xs_kobj); xfs_sysfs_del(&mp->m_kobj); } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index e2866e7fa60c..a25eece3be2b 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -208,11 +208,15 @@ typedef struct xfs_mount { uint64_t m_resblks_avail;/* available reserved blocks */ uint64_t m_resblks_save; /* reserved blks @ remount,ro */ struct delayed_work m_reclaim_work; /* background inode reclaim */ + struct dentry *m_debugfs; /* debugfs parent */ struct xfs_kobj m_kobj; struct xfs_kobj m_error_kobj; struct xfs_kobj m_error_meta_kobj; struct xfs_error_cfg m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX]; struct xstats m_stats; /* per-fs stats */ +#ifdef CONFIG_XFS_ONLINE_SCRUB_STATS + struct xchk_stats *m_scrub_stats; +#endif xfs_agnumber_t m_agfrotor; /* last ag where space found */ atomic_t m_agirotor; /* last ag dir inode alloced */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index c79eac048456..1f77014c6e1a 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -42,6 +42,7 @@ #include "xfs_xattr.h" #include "xfs_iunlink_item.h" #include "xfs_dahash_test.h" +#include "scrub/stats.h" #include <linux/magic.h> #include <linux/fs_context.h> @@ -49,6 +50,7 @@ static const struct super_operations xfs_super_operations; +static struct dentry *xfs_debugfs; /* top-level xfs debugfs dir */ static struct kset *xfs_kset; /* top-level xfs sysfs dir */ #ifdef DEBUG static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */ @@ -783,6 +785,7 @@ xfs_mount_free( if (mp->m_ddev_targp) xfs_free_buftarg(mp->m_ddev_targp); + debugfs_remove(mp->m_debugfs); kfree(mp->m_rtname); kfree(mp->m_logname); kmem_free(mp); @@ -1163,6 +1166,7 @@ xfs_fs_put_super( xfs_unmountfs(mp); xfs_freesb(mp); + xchk_mount_stats_free(mp); free_percpu(mp->m_stats.xs_stats); xfs_mount_list_del(mp); xfs_inodegc_free_percpu(mp); @@ -1497,6 +1501,21 @@ xfs_fs_validate_params( return 0; } +struct dentry * +xfs_debugfs_mkdir( + const char *name, + struct dentry *parent) +{ + struct dentry *child; + + /* Apparently we're expected to ignore error returns?? */ + child = debugfs_create_dir(name, parent); + if (IS_ERR(child)) + return NULL; + + return child; +} + static int xfs_fs_fill_super( struct super_block *sb, @@ -1539,6 +1558,13 @@ xfs_fs_fill_super( if (error) return error; + if (xfs_debugfs) { + mp->m_debugfs = xfs_debugfs_mkdir(mp->m_super->s_id, + xfs_debugfs); + } else { + mp->m_debugfs = NULL; + } + error = xfs_init_mount_workqueues(mp); if (error) goto out_shutdown_devices; @@ -1565,10 +1591,14 @@ xfs_fs_fill_super( goto out_destroy_inodegc; } - error = xfs_readsb(mp, flags); + error = xchk_mount_stats_alloc(mp); if (error) goto out_free_stats; + error = xfs_readsb(mp, flags); + if (error) + goto out_free_scrub_stats; + error = xfs_finish_flags(mp); if (error) goto out_free_sb; @@ -1746,6 +1776,8 @@ xfs_fs_fill_super( xfs_filestream_unmount(mp); out_free_sb: xfs_freesb(mp); + out_free_scrub_stats: + xchk_mount_stats_free(mp); out_free_stats: free_percpu(mp->m_stats.xs_stats); out_destroy_inodegc: @@ -2377,10 +2409,12 @@ init_xfs_fs(void) if (error) goto out_cleanup_procfs; + xfs_debugfs = xfs_debugfs_mkdir("xfs", NULL); + xfs_kset = kset_create_and_add("xfs", NULL, fs_kobj); if (!xfs_kset) { error = -ENOMEM; - goto out_sysctl_unregister; + goto out_debugfs_unregister; } xfsstats.xs_kobj.kobject.kset = xfs_kset; @@ -2396,11 +2430,15 @@ init_xfs_fs(void) if (error) goto out_free_stats; + error = xchk_global_stats_setup(xfs_debugfs); + if (error) + goto out_remove_stats_kobj; + #ifdef DEBUG xfs_dbg_kobj.kobject.kset = xfs_kset; error = xfs_sysfs_init(&xfs_dbg_kobj, &xfs_dbg_ktype, NULL, "debug"); if (error) - goto out_remove_stats_kobj; + goto out_remove_scrub_stats; #endif error = xfs_qm_init(); @@ -2417,14 +2455,17 @@ init_xfs_fs(void) out_remove_dbg_kobj: #ifdef DEBUG xfs_sysfs_del(&xfs_dbg_kobj); - out_remove_stats_kobj: + out_remove_scrub_stats: #endif + xchk_global_stats_teardown(); + out_remove_stats_kobj: xfs_sysfs_del(&xfsstats.xs_kobj); out_free_stats: free_percpu(xfsstats.xs_stats); out_kset_unregister: kset_unregister(xfs_kset); - out_sysctl_unregister: + out_debugfs_unregister: + debugfs_remove(xfs_debugfs); xfs_sysctl_unregister(); out_cleanup_procfs: xfs_cleanup_procfs(); @@ -2448,9 +2489,11 @@ exit_xfs_fs(void) #ifdef DEBUG xfs_sysfs_del(&xfs_dbg_kobj); #endif + xchk_global_stats_teardown(); xfs_sysfs_del(&xfsstats.xs_kobj); free_percpu(xfsstats.xs_stats); kset_unregister(xfs_kset); + debugfs_remove(xfs_debugfs); xfs_sysctl_unregister(); xfs_cleanup_procfs(); xfs_mru_cache_uninit(); diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 364e2c2648a8..302e6e5d6c7e 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -100,4 +100,6 @@ extern struct workqueue_struct *xfs_discard_wq; #define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info)) +struct dentry *xfs_debugfs_mkdir(const char *name, struct dentry *parent); + #endif /* __XFS_SUPER_H__ */ diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index fd789e00dfd6..902c7f67a117 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -22,6 +22,9 @@ * daddr: physical block number in 512b blocks * bbcount: number of blocks in a physical extent, in 512b blocks * + * rtx: physical rt extent number for extent mappings + * rtxcount: number of rt extents in an extent mapping + * * owner: reverse-mapping owner, usually inodes * * fileoff: file offset, in fs blocks |