61 files changed, 2457 insertions, 2898 deletions
diff --git a/block/blk-merge.c b/block/blk-merge.c index b7c193d67185..64bf7d9dd8e8 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -276,7 +276,7 @@ static bool bvec_split_segs(const struct queue_limits *lim, * responsible for ensuring that @bs is only destroyed after processing of the * split bio has finished. */ -static struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, +struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, unsigned *segs, struct bio_set *bs, unsigned max_bytes) { struct bio_vec bv, bvprv, *bvprvp = NULL; @@ -336,6 +336,7 @@ split: bio_clear_polled(bio); return bio_split(bio, bytes >> SECTOR_SHIFT, GFP_NOIO, bs); } +EXPORT_SYMBOL_GPL(bio_split_rw); /** * __bio_split_to_limits - split a bio to fit the queue limits diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 555c962fdad6..90d53209755b 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -11,7 +11,8 @@ condflags := \ $(call cc-option, -Wunused-but-set-variable) \ $(call cc-option, -Wunused-const-variable) \ $(call cc-option, -Wpacked-not-aligned) \ - $(call cc-option, -Wstringop-truncation) + $(call cc-option, -Wstringop-truncation) \ + $(call cc-option, -Wmaybe-uninitialized) subdir-ccflags-y += $(condflags) # The following turn off the warnings enabled by -Wextra subdir-ccflags-y += -Wno-missing-field-initializers @@ -31,7 +32,8 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \ - subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o + subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o \ + lru_cache.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 46851511b661..90e40d5ceccd 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1252,8 +1252,12 @@ static bool lookup_backref_shared_cache(struct btrfs_backref_share_check_ctx *ct struct btrfs_root *root, u64 bytenr, int level, bool *is_shared) { + const struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_backref_shared_cache_entry *entry; + if (!current->journal_info) + lockdep_assert_held(&fs_info->commit_root_sem); + if (!ctx->use_path_cache) return false; @@ -1288,7 +1292,7 @@ static bool lookup_backref_shared_cache(struct btrfs_backref_share_check_ctx *ct * could be a snapshot sharing this extent buffer. 
*/ if (entry->is_shared && - entry->gen != btrfs_get_last_root_drop_gen(root->fs_info)) + entry->gen != btrfs_get_last_root_drop_gen(fs_info)) return false; *is_shared = entry->is_shared; @@ -1318,9 +1322,13 @@ static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx struct btrfs_root *root, u64 bytenr, int level, bool is_shared) { + const struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_backref_shared_cache_entry *entry; u64 gen; + if (!current->journal_info) + lockdep_assert_held(&fs_info->commit_root_sem); + if (!ctx->use_path_cache) return; @@ -1336,7 +1344,7 @@ static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx ASSERT(level >= 0); if (is_shared) - gen = btrfs_get_last_root_drop_gen(root->fs_info); + gen = btrfs_get_last_root_drop_gen(fs_info); else gen = btrfs_root_last_snapshot(&root->root_item); @@ -1864,6 +1872,8 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, .have_delayed_delete_refs = false, }; int level; + bool leaf_cached; + bool leaf_is_shared; for (int i = 0; i < BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE; i++) { if (ctx->prev_extents_cache[i].bytenr == bytenr) @@ -1885,6 +1895,23 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, walk_ctx.time_seq = elem.seq; } + ctx->use_path_cache = true; + + /* + * We may have previously determined that the current leaf is shared. + * If it is, then we have a data extent that is shared due to a shared + * subtree (caused by snapshotting) and we don't need to check for data + * backrefs. If the leaf is not shared, then we must do backref walking + * to determine if the data extent is shared through reflinks. + */ + leaf_cached = lookup_backref_shared_cache(ctx, root, + ctx->curr_leaf_bytenr, 0, + &leaf_is_shared); + if (leaf_cached && leaf_is_shared) { + ret = 1; + goto out_trans; + } + walk_ctx.ignore_extent_item_pos = true; walk_ctx.trans = trans; walk_ctx.fs_info = fs_info; @@ -1893,7 +1920,6 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, /* -1 means we are in the bytenr of the data extent. */ level = -1; ULIST_ITER_INIT(&uiter); - ctx->use_path_cache = true; while (1) { bool is_shared; bool cached; @@ -1964,6 +1990,7 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, ctx->prev_extents_cache_slot = slot; } +out_trans: if (trans) { btrfs_put_tree_mod_seq(fs_info, &elem); btrfs_end_transaction(trans); diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 8affc88b0e0a..d8b90f95b157 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -14,19 +14,31 @@ #include "dev-replace.h" #include "rcu-string.h" #include "zoned.h" +#include "file-item.h" static struct bio_set btrfs_bioset; +static struct bio_set btrfs_clone_bioset; +static struct bio_set btrfs_repair_bioset; +static mempool_t btrfs_failed_bio_pool; + +struct btrfs_failed_bio { + struct btrfs_bio *bbio; + int num_copies; + atomic_t repair_count; +}; /* * Initialize a btrfs_bio structure. This skips the embedded bio itself as it * is already initialized by the block layer. 
*/ -static inline void btrfs_bio_init(struct btrfs_bio *bbio, - btrfs_bio_end_io_t end_io, void *private) +void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, + btrfs_bio_end_io_t end_io, void *private) { memset(bbio, 0, offsetof(struct btrfs_bio, bio)); + bbio->inode = inode; bbio->end_io = end_io; bbio->private = private; + atomic_set(&bbio->pending_ios, 1); } /* @@ -37,32 +49,235 @@ static inline void btrfs_bio_init(struct btrfs_bio *bbio, * a mempool. */ struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, + struct btrfs_inode *inode, btrfs_bio_end_io_t end_io, void *private) { struct bio *bio; bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset); - btrfs_bio_init(btrfs_bio(bio), end_io, private); + btrfs_bio_init(btrfs_bio(bio), inode, end_io, private); return bio; } -struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size, - btrfs_bio_end_io_t end_io, void *private) +static struct bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, + struct bio *orig, u64 map_length, + bool use_append) { + struct btrfs_bio *orig_bbio = btrfs_bio(orig); struct bio *bio; - struct btrfs_bio *bbio; - ASSERT(offset <= UINT_MAX && size <= UINT_MAX); + if (use_append) { + unsigned int nr_segs; + + bio = bio_split_rw(orig, &fs_info->limits, &nr_segs, + &btrfs_clone_bioset, map_length); + } else { + bio = bio_split(orig, map_length >> SECTOR_SHIFT, GFP_NOFS, + &btrfs_clone_bioset); + } + btrfs_bio_init(btrfs_bio(bio), orig_bbio->inode, NULL, orig_bbio); - bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset); - bbio = btrfs_bio(bio); - btrfs_bio_init(bbio, end_io, private); + btrfs_bio(bio)->file_offset = orig_bbio->file_offset; + if (!(orig->bi_opf & REQ_BTRFS_ONE_ORDERED)) + orig_bbio->file_offset += map_length; - bio_trim(bio, offset >> 9, size >> 9); - bbio->iter = bio->bi_iter; + atomic_inc(&orig_bbio->pending_ios); return bio; } +static void btrfs_orig_write_end_io(struct bio *bio); + +static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio, + struct btrfs_bio *orig_bbio) +{ + /* + * For writes we tolerate nr_mirrors - 1 write failures, so we can't + * just blindly propagate a write failure here. Instead increment the + * error count in the original I/O context so that it is guaranteed to + * be larger than the error tolerance. 
+ */ + if (bbio->bio.bi_end_io == &btrfs_orig_write_end_io) { + struct btrfs_io_stripe *orig_stripe = orig_bbio->bio.bi_private; + struct btrfs_io_context *orig_bioc = orig_stripe->bioc; + + atomic_add(orig_bioc->max_errors, &orig_bioc->error); + } else { + orig_bbio->bio.bi_status = bbio->bio.bi_status; + } +} + +static void btrfs_orig_bbio_end_io(struct btrfs_bio *bbio) +{ + if (bbio->bio.bi_pool == &btrfs_clone_bioset) { + struct btrfs_bio *orig_bbio = bbio->private; + + if (bbio->bio.bi_status) + btrfs_bbio_propagate_error(bbio, orig_bbio); + bio_put(&bbio->bio); + bbio = orig_bbio; + } + + if (atomic_dec_and_test(&bbio->pending_ios)) + bbio->end_io(bbio); +} + +static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) +{ + if (cur_mirror == fbio->num_copies) + return cur_mirror + 1 - fbio->num_copies; + return cur_mirror + 1; +} + +static int prev_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) +{ + if (cur_mirror == 1) + return fbio->num_copies; + return cur_mirror - 1; +} + +static void btrfs_repair_done(struct btrfs_failed_bio *fbio) +{ + if (atomic_dec_and_test(&fbio->repair_count)) { + btrfs_orig_bbio_end_io(fbio->bbio); + mempool_free(fbio, &btrfs_failed_bio_pool); + } +} + +static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, + struct btrfs_device *dev) +{ + struct btrfs_failed_bio *fbio = repair_bbio->private; + struct btrfs_inode *inode = repair_bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio); + int mirror = repair_bbio->mirror_num; + + if (repair_bbio->bio.bi_status || + !btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) { + bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ); + repair_bbio->bio.bi_iter = repair_bbio->saved_iter; + + mirror = next_repair_mirror(fbio, mirror); + if (mirror == fbio->bbio->mirror_num) { + btrfs_debug(fs_info, "no mirror left"); + fbio->bbio->bio.bi_status = BLK_STS_IOERR; + goto done; + } + + btrfs_submit_bio(&repair_bbio->bio, mirror); + return; + } + + do { + mirror = prev_repair_mirror(fbio, mirror); + btrfs_repair_io_failure(fs_info, btrfs_ino(inode), + repair_bbio->file_offset, fs_info->sectorsize, + repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT, + bv->bv_page, bv->bv_offset, mirror); + } while (mirror != fbio->bbio->mirror_num); + +done: + btrfs_repair_done(fbio); + bio_put(&repair_bbio->bio); +} + +/* + * Try to kick off a repair read to the next available mirror for a bad sector. + * + * This primarily tries to recover good data to serve the actual read request, + * but also tries to write the good data back to the bad mirror(s) when a + * read succeeded to restore the redundancy. 
+ */ +static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio, + u32 bio_offset, + struct bio_vec *bv, + struct btrfs_failed_bio *fbio) +{ + struct btrfs_inode *inode = failed_bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + const u32 sectorsize = fs_info->sectorsize; + const u64 logical = (failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT); + struct btrfs_bio *repair_bbio; + struct bio *repair_bio; + int num_copies; + int mirror; + + btrfs_debug(fs_info, "repair read error: read error at %llu", + failed_bbio->file_offset + bio_offset); + + num_copies = btrfs_num_copies(fs_info, logical, sectorsize); + if (num_copies == 1) { + btrfs_debug(fs_info, "no copy to repair from"); + failed_bbio->bio.bi_status = BLK_STS_IOERR; + return fbio; + } + + if (!fbio) { + fbio = mempool_alloc(&btrfs_failed_bio_pool, GFP_NOFS); + fbio->bbio = failed_bbio; + fbio->num_copies = num_copies; + atomic_set(&fbio->repair_count, 1); + } + + atomic_inc(&fbio->repair_count); + + repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, + &btrfs_repair_bioset); + repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector; + bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset); + + repair_bbio = btrfs_bio(repair_bio); + btrfs_bio_init(repair_bbio, failed_bbio->inode, NULL, fbio); + repair_bbio->file_offset = failed_bbio->file_offset + bio_offset; + + mirror = next_repair_mirror(fbio, failed_bbio->mirror_num); + btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror); + btrfs_submit_bio(repair_bio, mirror); + return fbio; +} + +static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *dev) +{ + struct btrfs_inode *inode = bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + u32 sectorsize = fs_info->sectorsize; + struct bvec_iter *iter = &bbio->saved_iter; + blk_status_t status = bbio->bio.bi_status; + struct btrfs_failed_bio *fbio = NULL; + u32 offset = 0; + + /* + * Hand off repair bios to the repair code as there is no upper level + * submitter for them. + */ + if (bbio->bio.bi_pool == &btrfs_repair_bioset) { + btrfs_end_repair_bio(bbio, dev); + return; + } + + /* Clear the I/O error. A failed repair will reset it. */ + bbio->bio.bi_status = BLK_STS_OK; + + while (iter->bi_size) { + struct bio_vec bv = bio_iter_iovec(&bbio->bio, *iter); + + bv.bv_len = min(bv.bv_len, sectorsize); + if (status || !btrfs_data_csum_ok(bbio, dev, offset, &bv)) + fbio = repair_one_sector(bbio, offset, &bv, fbio); + + bio_advance_iter_single(&bbio->bio, iter, sectorsize); + offset += sectorsize; + } + + if (bbio->csum != bbio->csum_inline) + kfree(bbio->csum); + + if (fbio) + btrfs_repair_done(fbio); + else + btrfs_orig_bbio_end_io(bbio); +} + static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev) { if (!dev || !dev->bdev) @@ -90,24 +305,31 @@ static void btrfs_end_bio_work(struct work_struct *work) { struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work); - bbio->end_io(bbio); + /* Metadata reads are checked and repaired by the submitter. 
*/ + if (bbio->bio.bi_opf & REQ_META) + bbio->end_io(bbio); + else + btrfs_check_read_bio(bbio, bbio->bio.bi_private); } static void btrfs_simple_end_io(struct bio *bio) { - struct btrfs_fs_info *fs_info = bio->bi_private; struct btrfs_bio *bbio = btrfs_bio(bio); + struct btrfs_device *dev = bio->bi_private; + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; btrfs_bio_counter_dec(fs_info); if (bio->bi_status) - btrfs_log_dev_io_error(bio, bbio->device); + btrfs_log_dev_io_error(bio, dev); if (bio_op(bio) == REQ_OP_READ) { INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); } else { - bbio->end_io(bbio); + if (bio_op(bio) == REQ_OP_ZONE_APPEND) + btrfs_record_physical_zoned(bbio); + btrfs_orig_bbio_end_io(bbio); } } @@ -118,7 +340,10 @@ static void btrfs_raid56_end_io(struct bio *bio) btrfs_bio_counter_dec(bioc->fs_info); bbio->mirror_num = bioc->mirror_num; - bbio->end_io(bbio); + if (bio_op(bio) == REQ_OP_READ && !(bbio->bio.bi_opf & REQ_META)) + btrfs_check_read_bio(bbio, NULL); + else + btrfs_orig_bbio_end_io(bbio); btrfs_put_bioc(bioc); } @@ -145,7 +370,7 @@ static void btrfs_orig_write_end_io(struct bio *bio) else bio->bi_status = BLK_STS_OK; - bbio->end_io(bbio); + btrfs_orig_bbio_end_io(bbio); btrfs_put_bioc(bioc); } @@ -181,16 +406,10 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) */ if (bio_op(bio) == REQ_OP_ZONE_APPEND) { u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; + u64 zone_start = round_down(physical, dev->fs_info->zone_size); - if (btrfs_dev_is_sequential(dev, physical)) { - u64 zone_start = round_down(physical, - dev->fs_info->zone_size); - - bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; - } else { - bio->bi_opf &= ~REQ_OP_ZONE_APPEND; - bio->bi_opf |= REQ_OP_WRITE; - } + ASSERT(btrfs_dev_is_sequential(dev, physical)); + bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; } btrfs_debug_in_rcu(dev->fs_info, "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", @@ -224,41 +443,21 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio); } -void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num) +static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc, + struct btrfs_io_stripe *smap, int mirror_num) { - u64 logical = bio->bi_iter.bi_sector << 9; - u64 length = bio->bi_iter.bi_size; - u64 map_length = length; - struct btrfs_io_context *bioc = NULL; - struct btrfs_io_stripe smap; - int ret; - - btrfs_bio_counter_inc_blocked(fs_info); - ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, - &bioc, &smap, &mirror_num, 1); - if (ret) { - btrfs_bio_counter_dec(fs_info); - btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); - return; - } - - if (map_length < length) { - btrfs_crit(fs_info, - "mapping failed logical %llu bio len %llu len %llu", - logical, length, map_length); - BUG(); - } + /* Do not leak our private flag into the block layer. */ + bio->bi_opf &= ~REQ_BTRFS_ONE_ORDERED; if (!bioc) { - /* Single mirror read/write fast path */ + /* Single mirror read/write fast path. 
*/ btrfs_bio(bio)->mirror_num = mirror_num; - btrfs_bio(bio)->device = smap.dev; - bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; - bio->bi_private = fs_info; + bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT; + bio->bi_private = smap->dev; bio->bi_end_io = btrfs_simple_end_io; - btrfs_submit_dev_bio(smap.dev, bio); + btrfs_submit_dev_bio(smap->dev, bio); } else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - /* Parity RAID write or read recovery */ + /* Parity RAID write or read recovery. */ bio->bi_private = bioc; bio->bi_end_io = btrfs_raid56_end_io; if (bio_op(bio) == REQ_OP_READ) @@ -266,16 +465,233 @@ void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror else raid56_parity_write(bio, bioc); } else { - /* Write to multiple mirrors */ + /* Write to multiple mirrors. */ int total_devs = bioc->num_stripes; - int dev_nr; bioc->orig_bio = bio; - for (dev_nr = 0; dev_nr < total_devs; dev_nr++) + for (int dev_nr = 0; dev_nr < total_devs; dev_nr++) btrfs_submit_mirrored_bio(bioc, dev_nr); } } +static blk_status_t btrfs_bio_csum(struct btrfs_bio *bbio) +{ + if (bbio->bio.bi_opf & REQ_META) + return btree_csum_one_bio(bbio); + return btrfs_csum_one_bio(bbio); +} + +/* + * Async submit bios are used to offload expensive checksumming onto the worker + * threads. + */ +struct async_submit_bio { + struct btrfs_bio *bbio; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe smap; + int mirror_num; + struct btrfs_work work; +}; + +/* + * In order to insert checksums into the metadata in large chunks, we wait + * until bio submission time. All the pages in the bio are checksummed and + * sums are attached onto the ordered extent record. + * + * At IO completion time the csums attached on the ordered extent record are + * inserted into the btree. + */ +static void run_one_async_start(struct btrfs_work *work) +{ + struct async_submit_bio *async = + container_of(work, struct async_submit_bio, work); + blk_status_t ret; + + ret = btrfs_bio_csum(async->bbio); + if (ret) + async->bbio->bio.bi_status = ret; +} + +/* + * In order to insert checksums into the metadata in large chunks, we wait + * until bio submission time. All the pages in the bio are checksummed and + * sums are attached onto the ordered extent record. + * + * At IO completion time the csums attached on the ordered extent record are + * inserted into the tree. + */ +static void run_one_async_done(struct btrfs_work *work) +{ + struct async_submit_bio *async = + container_of(work, struct async_submit_bio, work); + struct bio *bio = &async->bbio->bio; + + /* If an error occurred we just want to clean up the bio and move on. */ + if (bio->bi_status) { + btrfs_orig_bbio_end_io(async->bbio); + return; + } + + /* + * All of the bios that pass through here are from async helpers. + * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context. + * This changes nothing when cgroups aren't in use. + */ + bio->bi_opf |= REQ_CGROUP_PUNT; + __btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num); +} + +static void run_one_async_free(struct btrfs_work *work) +{ + kfree(container_of(work, struct async_submit_bio, work)); +} + +static bool should_async_write(struct btrfs_bio *bbio) +{ + /* + * If the I/O is not issued by fsync and friends, (->sync_writers != 0), + * then try to defer the submission to a workqueue to parallelize the + * checksum calculation. 
+ */ + if (atomic_read(&bbio->inode->sync_writers)) + return false; + + /* + * Submit metadata writes synchronously if the checksum implementation + * is fast, or we are on a zoned device that wants I/O to be submitted + * in order. + */ + if (bbio->bio.bi_opf & REQ_META) { + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; + + if (btrfs_is_zoned(fs_info)) + return false; + if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags)) + return false; + } + + return true; +} + +/* + * Submit bio to an async queue. + * + * Return true if the work has been succesfuly submitted, else false. + */ +static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, + struct btrfs_io_context *bioc, + struct btrfs_io_stripe *smap, int mirror_num) +{ + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; + struct async_submit_bio *async; + + async = kmalloc(sizeof(*async), GFP_NOFS); + if (!async) + return false; + + async->bbio = bbio; + async->bioc = bioc; + async->smap = *smap; + async->mirror_num = mirror_num; + + btrfs_init_work(&async->work, run_one_async_start, run_one_async_done, + run_one_async_free); + if (op_is_sync(bbio->bio.bi_opf)) + btrfs_queue_work(fs_info->hipri_workers, &async->work); + else + btrfs_queue_work(fs_info->workers, &async->work); + return true; +} + +static bool btrfs_submit_chunk(struct bio *bio, int mirror_num) +{ + struct btrfs_bio *bbio = btrfs_bio(bio); + struct btrfs_inode *inode = bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_bio *orig_bbio = bbio; + u64 logical = bio->bi_iter.bi_sector << 9; + u64 length = bio->bi_iter.bi_size; + u64 map_length = length; + bool use_append = btrfs_use_zone_append(bbio); + struct btrfs_io_context *bioc = NULL; + struct btrfs_io_stripe smap; + blk_status_t ret; + int error; + + btrfs_bio_counter_inc_blocked(fs_info); + error = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, + &bioc, &smap, &mirror_num, 1); + if (error) { + ret = errno_to_blk_status(error); + goto fail; + } + + map_length = min(map_length, length); + if (use_append) + map_length = min(map_length, fs_info->max_zone_append_size); + + if (map_length < length) { + bio = btrfs_split_bio(fs_info, bio, map_length, use_append); + bbio = btrfs_bio(bio); + } + + /* + * Save the iter for the end_io handler and preload the checksums for + * data reads. + */ + if (bio_op(bio) == REQ_OP_READ && !(bio->bi_opf & REQ_META)) { + bbio->saved_iter = bio->bi_iter; + ret = btrfs_lookup_bio_sums(bbio); + if (ret) + goto fail_put_bio; + } + + if (btrfs_op(bio) == BTRFS_MAP_WRITE) { + if (use_append) { + bio->bi_opf &= ~REQ_OP_WRITE; + bio->bi_opf |= REQ_OP_ZONE_APPEND; + ret = btrfs_extract_ordered_extent(btrfs_bio(bio)); + if (ret) + goto fail_put_bio; + } + + /* + * Csum items for reloc roots have already been cloned at this + * point, so they are handled as part of the no-checksum case. 
+ */ + if (!(inode->flags & BTRFS_INODE_NODATASUM) && + !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && + !btrfs_is_data_reloc_root(inode->root)) { + if (should_async_write(bbio) && + btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num)) + goto done; + + ret = btrfs_bio_csum(bbio); + if (ret) + goto fail_put_bio; + } + } + + __btrfs_submit_bio(bio, bioc, &smap, mirror_num); +done: + return map_length == length; + +fail_put_bio: + if (map_length < length) + bio_put(bio); +fail: + btrfs_bio_counter_dec(fs_info); + btrfs_bio_end_io(orig_bbio, ret); + /* Do not submit another chunk */ + return true; +} + +void btrfs_submit_bio(struct bio *bio, int mirror_num) +{ + while (!btrfs_submit_chunk(bio, mirror_num)) + ; +} + /* * Submit a repair write. * @@ -283,7 +699,7 @@ void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror * RAID setup. Here we only want to write the one bad copy, so we do the * mapping ourselves and submit the bio directly. * - * The I/O is issued sychronously to block the repair read completion from + * The I/O is issued synchronously to block the repair read completion from * freeing the bio. */ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, @@ -381,10 +797,31 @@ int __init btrfs_bioset_init(void) offsetof(struct btrfs_bio, bio), BIOSET_NEED_BVECS)) return -ENOMEM; + if (bioset_init(&btrfs_clone_bioset, BIO_POOL_SIZE, + offsetof(struct btrfs_bio, bio), 0)) + goto out_free_bioset; + if (bioset_init(&btrfs_repair_bioset, BIO_POOL_SIZE, + offsetof(struct btrfs_bio, bio), + BIOSET_NEED_BVECS)) + goto out_free_clone_bioset; + if (mempool_init_kmalloc_pool(&btrfs_failed_bio_pool, BIO_POOL_SIZE, + sizeof(struct btrfs_failed_bio))) + goto out_free_repair_bioset; return 0; + +out_free_repair_bioset: + bioset_exit(&btrfs_repair_bioset); +out_free_clone_bioset: + bioset_exit(&btrfs_clone_bioset); +out_free_bioset: + bioset_exit(&btrfs_bioset); + return -ENOMEM; } void __cold btrfs_bioset_exit(void) { + mempool_exit(&btrfs_failed_bio_pool); + bioset_exit(&btrfs_repair_bioset); + bioset_exit(&btrfs_clone_bioset); bioset_exit(&btrfs_bioset); } diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index b12f84b3b341..873ff85817f0 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -26,32 +26,23 @@ struct btrfs_fs_info; typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio); /* - * Additional info to pass along bio. - * - * Mostly for btrfs specific features like csum and mirror_num. + * Highlevel btrfs I/O structure. It is allocated by btrfs_bio_alloc and + * passed to btrfs_submit_bio for mapping to the physical devices. */ struct btrfs_bio { - unsigned int mirror_num:7; - - /* - * Extra indicator for metadata bios. - * For some btrfs bios they use pages without a mapping, thus - * we can not rely on page->mapping->host to determine if - * it's a metadata bio. - */ - unsigned int is_metadata:1; - struct bvec_iter iter; - - /* for direct I/O */ + /* Inode and offset into it that this I/O operates on. */ + struct btrfs_inode *inode; u64 file_offset; - /* @device is for stripe IO submission. */ - struct btrfs_device *device; union { - /* For data checksum verification. */ + /* + * Data checksumming and original I/O information for internal + * use in the btrfs_submit_bio machinery. + */ struct { u8 *csum; u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; + struct bvec_iter saved_iter; }; /* For metadata parentness verification. 
*/ @@ -62,7 +53,9 @@ struct btrfs_bio { btrfs_bio_end_io_t end_io; void *private; - /* For read end I/O handling */ + /* For internal use in read end I/O handling */ + unsigned int mirror_num; + atomic_t pending_ios; struct work_struct end_io_work; /* @@ -80,11 +73,11 @@ static inline struct btrfs_bio *btrfs_bio(struct bio *bio) int __init btrfs_bioset_init(void); void __cold btrfs_bioset_exit(void); +void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, + btrfs_bio_end_io_t end_io, void *private); struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, + struct btrfs_inode *inode, btrfs_bio_end_io_t end_io, void *private); -struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size, - btrfs_bio_end_io_t end_io, void *private); - static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) { @@ -92,34 +85,10 @@ static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) bbio->end_io(bbio); } -static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio) -{ - if (bbio->is_metadata) - return; - if (bbio->csum != bbio->csum_inline) { - kfree(bbio->csum); - bbio->csum = NULL; - } -} +/* Bio only refers to one ordered extent. */ +#define REQ_BTRFS_ONE_ORDERED REQ_DRV -/* - * Iterate through a btrfs_bio (@bbio) on a per-sector basis. - * - * bvl - struct bio_vec - * bbio - struct btrfs_bio - * iters - struct bvec_iter - * bio_offset - unsigned int - */ -#define btrfs_bio_for_each_sector(fs_info, bvl, bbio, iter, bio_offset) \ - for ((iter) = (bbio)->iter, (bio_offset) = 0; \ - (iter).bi_size && \ - (((bvl) = bio_iter_iovec((&(bbio)->bio), (iter))), 1); \ - (bio_offset) += fs_info->sectorsize, \ - bio_advance_iter_single(&(bbio)->bio, &(iter), \ - (fs_info)->sectorsize)) - -void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, - int mirror_num); +void btrfs_submit_bio(struct bio *bio, int mirror_num); int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, u64 length, u64 logical, struct page *page, unsigned int pg_offset, int mirror_num); diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 708d843daa72..5b10401d803b 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/sizes.h> #include <linux/list_sort.h> #include "misc.h" #include "ctree.h" @@ -539,6 +540,153 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end return total_added; } +/* + * Get an arbitrary extent item index / max_index through the block group + * + * @block_group the block group to sample from + * @index: the integral step through the block group to grab from + * @max_index: the granularity of the sampling + * @key: return value parameter for the item we find + * + * Pre-conditions on indices: + * 0 <= index <= max_index + * 0 < max_index + * + * Returns: 0 on success, 1 if the search didn't yield a useful item, negative + * error code on error. 
+ */ +static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ctl, + struct btrfs_block_group *block_group, + int index, int max_index, + struct btrfs_key *key) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_root *extent_root; + int ret = 0; + u64 search_offset; + u64 search_end = block_group->start + block_group->length; + struct btrfs_path *path; + + ASSERT(index >= 0); + ASSERT(index <= max_index); + ASSERT(max_index > 0); + lockdep_assert_held(&caching_ctl->mutex); + lockdep_assert_held_read(&fs_info->commit_root_sem); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + extent_root = btrfs_extent_root(fs_info, max_t(u64, block_group->start, + BTRFS_SUPER_INFO_OFFSET)); + + path->skip_locking = 1; + path->search_commit_root = 1; + path->reada = READA_FORWARD; + + search_offset = index * div_u64(block_group->length, max_index); + key->objectid = block_group->start + search_offset; + key->type = BTRFS_EXTENT_ITEM_KEY; + key->offset = 0; + + while (1) { + ret = btrfs_search_forward(extent_root, key, path, 0); + if (ret != 0) + goto out; + /* Success; sampled an extent item in the block group */ + if (key->type == BTRFS_EXTENT_ITEM_KEY && + key->objectid >= block_group->start && + key->objectid + key->offset <= search_end) + goto out; + + /* We can't possibly find a valid extent item anymore */ + if (key->objectid >= search_end) { + ret = 1; + break; + } + if (key->type < BTRFS_EXTENT_ITEM_KEY) + key->type = BTRFS_EXTENT_ITEM_KEY; + else + key->objectid++; + btrfs_release_path(path); + up_read(&fs_info->commit_root_sem); + mutex_unlock(&caching_ctl->mutex); + cond_resched(); + mutex_lock(&caching_ctl->mutex); + down_read(&fs_info->commit_root_sem); + } +out: + lockdep_assert_held(&caching_ctl->mutex); + lockdep_assert_held_read(&fs_info->commit_root_sem); + btrfs_free_path(path); + return ret; +} + +/* + * Best effort attempt to compute a block group's size class while caching it. + * + * @block_group: the block group we are caching + * + * We cannot infer the size class while adding free space extents, because that + * logic doesn't care about contiguous file extents (it doesn't differentiate + * between a 100M extent and 100 contiguous 1M extents). So we need to read the + * file extent items. Reading all of them is quite wasteful, because usually + * only a handful are enough to give a good answer. Therefore, we just grab 5 of + * them at even steps through the block group and pick the smallest size class + * we see. Since size class is best effort, and not guaranteed in general, + * inaccuracy is acceptable. + * + * To be more explicit about why this algorithm makes sense: + * + * If we are caching in a block group from disk, then there are three major cases + * to consider: + * 1. the block group is well behaved and all extents in it are the same size + * class. + * 2. the block group is mostly one size class with rare exceptions for last + * ditch allocations + * 3. the block group was populated before size classes and can have a totally + * arbitrary mix of size classes. + * + * In case 1, looking at any extent in the block group will yield the correct + * result. For the mixed cases, taking the minimum size class seems like a good + * approximation, since gaps from frees will be usable to the size class. For + * 2., a small handful of file extents is likely to yield the right answer. For + * 3, we can either read every file extent, or admit that this is best effort + * anyway and try to stay fast. 
+ * + * Returns: 0 on success, negative error code on error. + */ +static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl, + struct btrfs_block_group *block_group) +{ + struct btrfs_key key; + int i; + u64 min_size = block_group->length; + enum btrfs_block_group_size_class size_class = BTRFS_BG_SZ_NONE; + int ret; + + if (!btrfs_block_group_should_use_size_class(block_group)) + return 0; + + for (i = 0; i < 5; ++i) { + ret = sample_block_group_extent_item(caching_ctl, block_group, i, 5, &key); + if (ret < 0) + goto out; + if (ret > 0) + continue; + min_size = min_t(u64, min_size, key.offset); + size_class = btrfs_calc_block_group_size_class(min_size); + } + if (size_class != BTRFS_BG_SZ_NONE) { + spin_lock(&block_group->lock); + block_group->size_class = size_class; + spin_unlock(&block_group->lock); + } + +out: + return ret; +} + static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) { struct btrfs_block_group *block_group = caching_ctl->block_group; @@ -683,6 +831,7 @@ static noinline void caching_thread(struct btrfs_work *work) mutex_lock(&caching_ctl->mutex); down_read(&fs_info->commit_root_sem); + load_block_group_size_class(caching_ctl, block_group); if (btrfs_test_opt(fs_info, SPACE_CACHE)) { ret = load_free_space_cache(block_group); if (ret == 1) { @@ -1816,7 +1965,6 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) * * @fs_info: the filesystem * @chunk_start: logical address of block group - * @bdev: physical device to resolve, can be NULL to indicate any device * @physical: physical address to map to logical addresses * @logical: return array of logical addresses which map to @physical * @naddrs: length of @logical @@ -1827,8 +1975,7 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) * block copies. */ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, - struct block_device *bdev, u64 physical, u64 **logical, - int *naddrs, int *stripe_len) + u64 physical, u64 **logical, int *naddrs, int *stripe_len) { struct extent_map *em; struct map_lookup *map; @@ -1868,9 +2015,6 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, data_stripe_length)) continue; - if (bdev && map->stripes[i].dev->bdev != bdev) - continue; - stripe_nr = physical - map->stripes[i].physical; stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset); @@ -1927,7 +2071,7 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { bytenr = btrfs_sb_offset(i); - ret = btrfs_rmap_block(fs_info, cache->start, NULL, + ret = btrfs_rmap_block(fs_info, cache->start, bytenr, &logical, &nr, &stripe_len); if (ret) return ret; @@ -3330,7 +3474,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, spin_unlock(&info->delalloc_root_lock); while (total) { - bool reclaim; + bool reclaim = false; cache = btrfs_lookup_block_group(info, bytenr); if (!cache) { @@ -3379,6 +3523,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, cache->space_info->disk_used -= num_bytes * factor; reclaim = should_reclaim_block_group(cache, num_bytes); + spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); @@ -3433,32 +3578,42 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, * reservation and return -EAGAIN, otherwise this function always succeeds. 
*/ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, - u64 ram_bytes, u64 num_bytes, int delalloc) + u64 ram_bytes, u64 num_bytes, int delalloc, + bool force_wrong_size_class) { struct btrfs_space_info *space_info = cache->space_info; + enum btrfs_block_group_size_class size_class; int ret = 0; spin_lock(&space_info->lock); spin_lock(&cache->lock); if (cache->ro) { ret = -EAGAIN; - } else { - cache->reserved += num_bytes; - space_info->bytes_reserved += num_bytes; - trace_btrfs_space_reservation(cache->fs_info, "space_info", - space_info->flags, num_bytes, 1); - btrfs_space_info_update_bytes_may_use(cache->fs_info, - space_info, -ram_bytes); - if (delalloc) - cache->delalloc_bytes += num_bytes; + goto out; + } - /* - * Compression can use less space than we reserved, so wake - * tickets if that happens - */ - if (num_bytes < ram_bytes) - btrfs_try_granting_tickets(cache->fs_info, space_info); + if (btrfs_block_group_should_use_size_class(cache)) { + size_class = btrfs_calc_block_group_size_class(num_bytes); + ret = btrfs_use_block_group_size_class(cache, size_class, force_wrong_size_class); + if (ret) + goto out; } + cache->reserved += num_bytes; + space_info->bytes_reserved += num_bytes; + trace_btrfs_space_reservation(cache->fs_info, "space_info", + space_info->flags, num_bytes, 1); + btrfs_space_info_update_bytes_may_use(cache->fs_info, + space_info, -ram_bytes); + if (delalloc) + cache->delalloc_bytes += num_bytes; + + /* + * Compression can use less space than we reserved, so wake tickets if + * that happens. + */ + if (num_bytes < ram_bytes) + btrfs_try_granting_tickets(cache->fs_info, space_info); +out: spin_unlock(&cache->lock); spin_unlock(&space_info->lock); return ret; @@ -4218,3 +4373,73 @@ void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount bg->swap_extents -= amount; spin_unlock(&bg->lock); } + +enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size) +{ + if (size <= SZ_128K) + return BTRFS_BG_SZ_SMALL; + if (size <= SZ_8M) + return BTRFS_BG_SZ_MEDIUM; + return BTRFS_BG_SZ_LARGE; +} + +/* + * Handle a block group allocating an extent in a size class + * + * @bg: The block group we allocated in. + * @size_class: The size class of the allocation. + * @force_wrong_size_class: Whether we are desperate enough to allow + * mismatched size classes. + * + * Returns: 0 if the size class was valid for this block_group, -EAGAIN in the + * case of a race that leads to the wrong size class without + * force_wrong_size_class set. + * + * find_free_extent will skip block groups with a mismatched size class until + * it really needs to avoid ENOSPC. In that case it will set + * force_wrong_size_class. However, if a block group is newly allocated and + * doesn't yet have a size class, then it is possible for two allocations of + * different sizes to race and both try to use it. The loser is caught here and + * has to retry. + */ +int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, + enum btrfs_block_group_size_class size_class, + bool force_wrong_size_class) +{ + ASSERT(size_class != BTRFS_BG_SZ_NONE); + + /* The new allocation is in the right size class, do nothing */ + if (bg->size_class == size_class) + return 0; + /* + * The new allocation is in a mismatched size class. + * This means one of two things: + * + * 1. Two tasks in find_free_extent for different size_classes raced + * and hit the same empty block_group. Make the loser try again. + * 2. 
A call to find_free_extent got desperate enough to set + * 'force_wrong_slab'. Don't change the size_class, but allow the + * allocation. + */ + if (bg->size_class != BTRFS_BG_SZ_NONE) { + if (force_wrong_size_class) + return 0; + return -EAGAIN; + } + /* + * The happy new block group case: the new allocation is the first + * one in the block_group so we set size_class. + */ + bg->size_class = size_class; + + return 0; +} + +bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg) +{ + if (btrfs_is_zoned(bg->fs_info)) + return false; + if (!btrfs_is_block_group_data_only(bg)) + return false; + return true; +} diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index a02ea76fd6cf..6e4a0b429ac3 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -12,6 +12,17 @@ enum btrfs_disk_cache_state { BTRFS_DC_SETUP, }; +enum btrfs_block_group_size_class { + /* Unset */ + BTRFS_BG_SZ_NONE, + /* 0 < size <= 128K */ + BTRFS_BG_SZ_SMALL, + /* 128K < size <= 8M */ + BTRFS_BG_SZ_MEDIUM, + /* 8M < size < BG_LENGTH */ + BTRFS_BG_SZ_LARGE, +}; + /* * This describes the state of the block_group for async discard. This is due * to the two pass nature of it where extent discarding is prioritized over @@ -233,6 +244,7 @@ struct btrfs_block_group { struct list_head active_bg_list; struct work_struct zone_finish_work; struct extent_buffer *last_eb; + enum btrfs_block_group_size_class size_class; }; static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group) @@ -302,7 +314,8 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans); int btrfs_update_block_group(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, bool alloc); int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, - u64 ram_bytes, u64 num_bytes, int delalloc); + u64 ram_bytes, u64 num_bytes, int delalloc, + bool force_wrong_size_class); void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, u64 num_bytes, int delalloc); int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, @@ -315,8 +328,7 @@ u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags); void btrfs_put_block_group_cache(struct btrfs_fs_info *info); int btrfs_free_block_groups(struct btrfs_fs_info *info); int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, - struct block_device *bdev, u64 physical, u64 **logical, - int *naddrs, int *stripe_len); + u64 physical, u64 **logical, int *naddrs, int *stripe_len); static inline u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info) { @@ -346,4 +358,10 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *cache); bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg); void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount); +enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size); +int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, + enum btrfs_block_group_size_class size_class, + bool force_wrong_size_class); +bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg); + #endif /* BTRFS_BLOCK_GROUP_H */ diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 7c1527fcc7b3..9dc21622806e 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -93,12 +93,6 @@ struct btrfs_inode { /* the io_tree does range state (DIRTY, LOCKED etc) */ struct extent_io_tree io_tree; - /* special utility tree used to record which mirrors have already been - * tried when checksums fail for a given block - 
*/ - struct rb_root io_failure_tree; - spinlock_t io_failure_lock; - /* * Keep track of where the inode has extent items mapped in order to * make sure the i_size adjustments are accurate @@ -411,21 +405,11 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags, #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes -void btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); -void btrfs_submit_data_read_bio(struct btrfs_inode *inode, struct bio *bio, - int mirror_num, enum btrfs_compression_type compress_type); -void btrfs_submit_dio_repair_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); -blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio); -blk_status_t btrfs_submit_bio_start_direct_io(struct btrfs_inode *inode, - struct bio *bio, - u64 dio_file_offset); int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, u32 pgoff, u8 *csum, const u8 * const csum_expected); -int btrfs_check_data_csum(struct btrfs_inode *inode, struct btrfs_bio *bbio, - u32 bio_offset, struct page *page, u32 pgoff); -unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, - u32 bio_offset, struct page *page, - u64 start, u64 end); +blk_status_t btrfs_extract_ordered_extent(struct btrfs_bio *bbio); +bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, + u32 bio_offset, struct bio_vec *bv); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, u64 *ram_bytes, bool nowait, bool strict); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 5122ca79f7ea..f42f31f22d13 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -141,12 +141,15 @@ static int compression_decompress(int type, struct list_head *ws, static int btrfs_decompress_bio(struct compressed_bio *cb); -static void finish_compressed_bio_read(struct compressed_bio *cb) +static void end_compressed_bio_read(struct btrfs_bio *bbio) { + struct compressed_bio *cb = bbio->private; unsigned int index; struct page *page; - if (cb->status == BLK_STS_OK) + if (bbio->bio.bi_status) + cb->status = bbio->bio.bi_status; + else cb->status = errno_to_blk_status(btrfs_decompress_bio(cb)); /* Release the compressed pages */ @@ -162,54 +165,6 @@ static void finish_compressed_bio_read(struct compressed_bio *cb) /* Finally free the cb struct */ kfree(cb->compressed_pages); kfree(cb); -} - -/* - * Verify the checksums and kick off repair if needed on the uncompressed data - * before decompressing it into the original bio and freeing the uncompressed - * pages. 
- */ -static void end_compressed_bio_read(struct btrfs_bio *bbio) -{ - struct compressed_bio *cb = bbio->private; - struct inode *inode = cb->inode; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_inode *bi = BTRFS_I(inode); - bool csum = !(bi->flags & BTRFS_INODE_NODATASUM) && - !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); - blk_status_t status = bbio->bio.bi_status; - struct bvec_iter iter; - struct bio_vec bv; - u32 offset; - - btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) { - u64 start = bbio->file_offset + offset; - - if (!status && - (!csum || !btrfs_check_data_csum(bi, bbio, offset, - bv.bv_page, bv.bv_offset))) { - btrfs_clean_io_failure(bi, start, bv.bv_page, - bv.bv_offset); - } else { - int ret; - - refcount_inc(&cb->pending_ios); - ret = btrfs_repair_one_sector(BTRFS_I(inode), bbio, offset, - bv.bv_page, bv.bv_offset, - true); - if (ret) { - refcount_dec(&cb->pending_ios); - status = errno_to_blk_status(ret); - } - } - } - - if (status) - cb->status = status; - - if (refcount_dec_and_test(&cb->pending_ios)) - finish_compressed_bio_read(cb); - btrfs_bio_free_csum(bbio); bio_put(&bbio->bio); } @@ -303,68 +258,12 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work) static void end_compressed_bio_write(struct btrfs_bio *bbio) { struct compressed_bio *cb = bbio->private; - - if (bbio->bio.bi_status) - cb->status = bbio->bio.bi_status; - - if (refcount_dec_and_test(&cb->pending_ios)) { - struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); - - btrfs_record_physical_zoned(cb->inode, cb->start, &bbio->bio); - queue_work(fs_info->compressed_write_workers, &cb->write_end_work); - } - bio_put(&bbio->bio); -} - -/* - * Allocate a compressed_bio, which will be used to read/write on-disk - * (aka, compressed) * data. - * - * @cb: The compressed_bio structure, which records all the needed - * information to bind the compressed data to the uncompressed - * page cache. - * @disk_byten: The logical bytenr where the compressed data will be read - * from or written to. - * @endio_func: The endio function to call after the IO for compressed data - * is finished. - * @next_stripe_start: Return value of logical bytenr of where next stripe starts. - * Let the caller know to only fill the bio up to the stripe - * boundary. 
- */ - - -static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_bytenr, - blk_opf_t opf, - btrfs_bio_end_io_t endio_func, - u64 *next_stripe_start) -{ struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); - struct btrfs_io_geometry geom; - struct extent_map *em; - struct bio *bio; - int ret; - bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, endio_func, cb); - bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; + cb->status = bbio->bio.bi_status; + queue_work(fs_info->compressed_write_workers, &cb->write_end_work); - em = btrfs_get_chunk_map(fs_info, disk_bytenr, fs_info->sectorsize); - if (IS_ERR(em)) { - bio_put(bio); - return ERR_CAST(em); - } - - if (bio_op(bio) == REQ_OP_ZONE_APPEND) - bio_set_dev(bio, em->map_lookup->stripes[0].dev->bdev); - - ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), disk_bytenr, &geom); - free_extent_map(em); - if (ret < 0) { - bio_put(bio); - return ERR_PTR(ret); - } - *next_stripe_start = disk_bytenr + geom.len; - refcount_inc(&cb->pending_ios); - return bio; + bio_put(&bbio->bio); } /* @@ -389,18 +288,13 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, struct bio *bio = NULL; struct compressed_bio *cb; u64 cur_disk_bytenr = disk_start; - u64 next_stripe_start; blk_status_t ret = BLK_STS_OK; - int skip_sum = inode->flags & BTRFS_INODE_NODATASUM; - const bool use_append = btrfs_use_zone_append(inode, disk_start); - const enum req_op bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE; ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && IS_ALIGNED(len, fs_info->sectorsize)); cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS); if (!cb) return BLK_STS_RESOURCE; - refcount_set(&cb->pending_ios, 1); cb->status = BLK_STS_OK; cb->inode = &inode->vfs_inode; cb->start = start; @@ -411,8 +305,16 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work); cb->nr_pages = nr_pages; - if (blkcg_css) + if (blkcg_css) { kthread_associate_blkcg(blkcg_css); + write_flags |= REQ_CGROUP_PUNT; + } + + write_flags |= REQ_BTRFS_ONE_ORDERED; + bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_WRITE | write_flags, + BTRFS_I(cb->inode), end_compressed_bio_write, cb); + bio->bi_iter.bi_sector = cur_disk_bytenr >> SECTOR_SHIFT; + btrfs_bio(bio)->file_offset = start; while (cur_disk_bytenr < disk_start + compressed_len) { u64 offset = cur_disk_bytenr - disk_start; @@ -420,77 +322,30 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, unsigned int real_size; unsigned int added; struct page *page = compressed_pages[index]; - bool submit = false; - - /* Allocate new bio if submitted or not yet allocated */ - if (!bio) { - bio = alloc_compressed_bio(cb, cur_disk_bytenr, - bio_op | write_flags, end_compressed_bio_write, - &next_stripe_start); - if (IS_ERR(bio)) { - ret = errno_to_blk_status(PTR_ERR(bio)); - break; - } - if (blkcg_css) - bio->bi_opf |= REQ_CGROUP_PUNT; - } - /* - * We should never reach next_stripe_start start as we will - * submit comp_bio when reach the boundary immediately. 
- */ - ASSERT(cur_disk_bytenr != next_stripe_start); /* * We have various limits on the real read size: - * - stripe boundary * - page boundary * - compressed length boundary */ - real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_bytenr); - real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset)); + real_size = min_t(u64, U32_MAX, PAGE_SIZE - offset_in_page(offset)); real_size = min_t(u64, real_size, compressed_len - offset); ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize)); - if (use_append) - added = bio_add_zone_append_page(bio, page, real_size, - offset_in_page(offset)); - else - added = bio_add_page(bio, page, real_size, - offset_in_page(offset)); - /* Reached zoned boundary */ - if (added == 0) - submit = true; - + added = bio_add_page(bio, page, real_size, offset_in_page(offset)); + /* + * Maximum compressed extent is smaller than bio size limit, + * thus bio_add_page() should always success. + */ + ASSERT(added == real_size); cur_disk_bytenr += added; - /* Reached stripe boundary */ - if (cur_disk_bytenr == next_stripe_start) - submit = true; - - /* Finished the range */ - if (cur_disk_bytenr == disk_start + compressed_len) - submit = true; - - if (submit) { - if (!skip_sum) { - ret = btrfs_csum_one_bio(inode, bio, start, true); - if (ret) { - btrfs_bio_end_io(btrfs_bio(bio), ret); - break; - } - } - - ASSERT(bio->bi_iter.bi_size); - btrfs_submit_bio(fs_info, bio, 0); - bio = NULL; - } - cond_resched(); } + /* Finished the range. */ + ASSERT(bio->bi_iter.bi_size); + btrfs_submit_bio(bio, 0); if (blkcg_css) kthread_associate_blkcg(NULL); - - if (refcount_dec_and_test(&cb->pending_ios)) - finish_compressed_bio_write(cb); return ret; } @@ -667,10 +522,9 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, struct extent_map_tree *em_tree; struct compressed_bio *cb; unsigned int compressed_len; - struct bio *comp_bio = NULL; + struct bio *comp_bio; const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; u64 cur_disk_byte = disk_bytenr; - u64 next_stripe_start; u64 file_offset; u64 em_len; u64 em_start; @@ -703,7 +557,6 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, goto out; } - refcount_set(&cb->pending_ios, 1); cb->status = BLK_STS_OK; cb->inode = inode; @@ -737,37 +590,23 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, /* include any pages we added in add_ra-bio_pages */ cb->len = bio->bi_iter.bi_size; + comp_bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, BTRFS_I(cb->inode), + end_compressed_bio_read, cb); + comp_bio->bi_iter.bi_sector = (cur_disk_byte >> SECTOR_SHIFT); + while (cur_disk_byte < disk_bytenr + compressed_len) { u64 offset = cur_disk_byte - disk_bytenr; unsigned int index = offset >> PAGE_SHIFT; unsigned int real_size; unsigned int added; struct page *page = cb->compressed_pages[index]; - bool submit = false; - - /* Allocate new bio if submitted or not yet allocated */ - if (!comp_bio) { - comp_bio = alloc_compressed_bio(cb, cur_disk_byte, - REQ_OP_READ, end_compressed_bio_read, - &next_stripe_start); - if (IS_ERR(comp_bio)) { - cb->status = errno_to_blk_status(PTR_ERR(comp_bio)); - break; - } - } - /* - * We should never reach next_stripe_start start as we will - * submit comp_bio when reach the boundary immediately. 
- */ - ASSERT(cur_disk_byte != next_stripe_start); + /* * We have various limit on the real read size: - * - stripe boundary * - page boundary * - compressed length boundary */ - real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_byte); - real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset)); + real_size = min_t(u64, U32_MAX, PAGE_SIZE - offset_in_page(offset)); real_size = min_t(u64, real_size, compressed_len - offset); ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize)); @@ -778,45 +617,20 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, */ ASSERT(added == real_size); cur_disk_byte += added; - - /* Reached stripe boundary, need to submit */ - if (cur_disk_byte == next_stripe_start) - submit = true; - - /* Has finished the range, need to submit */ - if (cur_disk_byte == disk_bytenr + compressed_len) - submit = true; - - if (submit) { - /* Save the original iter for read repair */ - if (bio_op(comp_bio) == REQ_OP_READ) - btrfs_bio(comp_bio)->iter = comp_bio->bi_iter; - - /* - * Save the initial offset of this chunk, as there - * is no direct correlation between compressed pages and - * the original file offset. The field is only used for - * priting error messages. - */ - btrfs_bio(comp_bio)->file_offset = file_offset; - - ret = btrfs_lookup_bio_sums(inode, comp_bio, NULL); - if (ret) { - btrfs_bio_end_io(btrfs_bio(comp_bio), ret); - break; - } - - ASSERT(comp_bio->bi_iter.bi_size); - btrfs_submit_bio(fs_info, comp_bio, mirror_num); - comp_bio = NULL; - } } if (memstall) psi_memstall_leave(&pflags); - if (refcount_dec_and_test(&cb->pending_ios)) - finish_compressed_bio_read(cb); + /* + * Stash the initial offset of this chunk, as there is no direct + * correlation between compressed pages and the original file offset. + * The field is only used for printing error messages anyway. + */ + btrfs_bio(comp_bio)->file_offset = file_offset; + + ASSERT(comp_bio->bi_iter.bi_size); + btrfs_submit_bio(comp_bio, mirror_num); return; fail: @@ -1609,7 +1423,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, index_end = end >> PAGE_SHIFT; /* Don't miss unaligned end */ - if (!IS_ALIGNED(end, PAGE_SIZE)) + if (!PAGE_ALIGNED(end)) index_end++; curr_sample_pos = 0; @@ -1642,7 +1456,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, * * For now is's a naive and optimistic 'return true', we'll extend the logic to * quickly (compared to direct compression) detect data characteristics - * (compressible/uncompressible) to avoid wasting CPU time on uncompressible + * (compressible/incompressible) to avoid wasting CPU time on incompressible * data. 
* * The following types of analysis can be performed: diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 6209d40a1e08..a5e3377db9ad 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -31,9 +31,6 @@ static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0); #define BTRFS_ZLIB_DEFAULT_LEVEL 3 struct compressed_bio { - /* Number of outstanding bios */ - refcount_t pending_ios; - /* Number of compressed pages in the array */ unsigned int nr_pages; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 4754c9101a4c..a5b6bb54545f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -484,7 +484,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (ret) return ret; } - btrfs_clean_tree_block(buf); + btrfs_clear_buffer_dirty(trans, buf); *last_ref = 1; } return 0; @@ -853,8 +853,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, /* * Search for a key in the given extent_buffer. * - * The lower boundary for the search is specified by the slot number @low. Use a - * value of 0 to search over the whole extent buffer. + * The lower boundary for the search is specified by the slot number @first_slot. + * Use a value of 0 to search over the whole extent buffer. * * The slot in the extent buffer is returned via @slot. If the key exists in the * extent buffer, then @slot will point to the slot where the key is, otherwise @@ -863,18 +863,23 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, * Slot may point to the total number of items (i.e. one position beyond the last * key) if the key is bigger than the last key in the extent buffer. */ -static noinline int generic_bin_search(struct extent_buffer *eb, int low, - const struct btrfs_key *key, int *slot) +int btrfs_generic_bin_search(struct extent_buffer *eb, int first_slot, + const struct btrfs_key *key, int *slot) { unsigned long p; int item_size; - int high = btrfs_header_nritems(eb); + /* + * Use unsigned types for the low and high slots, so that we get a more + * efficient division in the search loop below. + */ + u32 low = first_slot; + u32 high = btrfs_header_nritems(eb); int ret; const int key_size = sizeof(struct btrfs_disk_key); - if (low > high) { + if (unlikely(low > high)) { btrfs_err(eb->fs_info, - "%s: low (%d) > high (%d) eb %llu owner %llu level %d", + "%s: low (%u) > high (%u) eb %llu owner %llu level %d", __func__, low, high, eb->start, btrfs_header_owner(eb), btrfs_header_level(eb)); return -EINVAL; @@ -925,16 +930,6 @@ static noinline int generic_bin_search(struct extent_buffer *eb, int low, return 1; } -/* - * Simple binary search on an extent buffer. Works for both leaves and nodes, and - * always searches over the whole range of keys (slot 0 to slot 'nritems - 1'). 
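Aside, not part of the patch: btrfs_generic_bin_search() above switches the low/high slots to u32 so the midpoint arithmetic in the (unshown) search loop uses cheap unsigned division. A self-contained userspace sketch with the same shape, searching an array of fixed-size keys; all toy_* names are invented here.

#include <stdint.h>
#include <stdio.h>

struct toy_key {
    uint64_t objectid;
};

/* Compare like btrfs_comp_keys(): <0, 0 or >0. */
static int toy_comp(const struct toy_key *a, const struct toy_key *b)
{
    if (a->objectid < b->objectid)
        return -1;
    if (a->objectid > b->objectid)
        return 1;
    return 0;
}

/*
 * Binary search over keys[first_slot..nr - 1].  Returns 0 and sets *slot if
 * the key exists, otherwise returns 1 with *slot at the insertion position
 * (possibly nr, one past the last key).  low/high are unsigned, so
 * (low + high) / 2 compiles to a shift.
 */
static int toy_bin_search(const struct toy_key *keys, uint32_t nr,
                          uint32_t first_slot, const struct toy_key *key,
                          uint32_t *slot)
{
    uint32_t low = first_slot;
    uint32_t high = nr;

    while (low < high) {
        uint32_t mid = (low + high) / 2;
        int cmp = toy_comp(&keys[mid], key);

        if (cmp < 0) {
            low = mid + 1;
        } else if (cmp > 0) {
            high = mid;
        } else {
            *slot = mid;
            return 0;
        }
    }
    *slot = low;
    return 1;
}

int main(void)
{
    struct toy_key keys[] = { {1}, {4}, {9}, {16} };
    struct toy_key want = { 9 };
    uint32_t slot;
    int ret = toy_bin_search(keys, 4, 0, &want, &slot);

    printf("ret=%d slot=%u\n", ret, (unsigned)slot);    /* ret=0 slot=2 */
    return 0;
}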
- */ -int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, - int *slot) -{ - return generic_bin_search(eb, 0, key, slot); -} - static void root_add_used(struct btrfs_root *root, u32 size) { spin_lock(&root->accounting_lock); @@ -1054,7 +1049,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, path->locks[level] = 0; path->nodes[level] = NULL; - btrfs_clean_tree_block(mid); + btrfs_clear_buffer_dirty(trans, mid); btrfs_tree_unlock(mid); /* once for the path */ free_extent_buffer(mid); @@ -1115,7 +1110,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (wret < 0 && wret != -ENOSPC) ret = wret; if (btrfs_header_nritems(right) == 0) { - btrfs_clean_tree_block(right); + btrfs_clear_buffer_dirty(trans, right); btrfs_tree_unlock(right); del_ptr(root, path, level + 1, pslot + 1); root_sub_used(root, right->len); @@ -1161,7 +1156,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BUG_ON(wret == 1); } if (btrfs_header_nritems(mid) == 0) { - btrfs_clean_tree_block(mid); + btrfs_clear_buffer_dirty(trans, mid); btrfs_tree_unlock(mid); del_ptr(root, path, level + 1, pslot); root_sub_used(root, mid->len); @@ -1869,7 +1864,7 @@ static inline int search_for_key_slot(struct extent_buffer *eb, return 0; } - return generic_bin_search(eb, search_low_slot, key, slot); + return btrfs_generic_bin_search(eb, search_low_slot, key, slot); } static int search_leaf(struct btrfs_trans_handle *trans, @@ -3041,7 +3036,8 @@ noinline int btrfs_leaf_free_space(struct extent_buffer *leaf) * min slot controls the lowest index we're willing to push to the * right. We'll push up to and including min_slot, but no lower */ -static noinline int __push_leaf_right(struct btrfs_path *path, +static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, + struct btrfs_path *path, int data_size, int empty, struct extent_buffer *right, int free_space, u32 left_nritems, @@ -3139,7 +3135,7 @@ static noinline int __push_leaf_right(struct btrfs_path *path, if (left_nritems) btrfs_mark_buffer_dirty(left); else - btrfs_clean_tree_block(left); + btrfs_clear_buffer_dirty(trans, left); btrfs_mark_buffer_dirty(right); @@ -3151,7 +3147,7 @@ static noinline int __push_leaf_right(struct btrfs_path *path, if (path->slots[0] >= left_nritems) { path->slots[0] -= left_nritems; if (btrfs_header_nritems(path->nodes[0]) == 0) - btrfs_clean_tree_block(path->nodes[0]); + btrfs_clear_buffer_dirty(trans, path->nodes[0]); btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; @@ -3243,8 +3239,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root return 0; } - return __push_leaf_right(path, min_data_size, empty, - right, free_space, left_nritems, min_slot); + return __push_leaf_right(trans, path, min_data_size, empty, right, + free_space, left_nritems, min_slot); out_unlock: btrfs_tree_unlock(right); free_extent_buffer(right); @@ -3259,7 +3255,8 @@ out_unlock: * item at 'max_slot' won't be touched. 
Use (u32)-1 to make us do all the * items */ -static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, +static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, + struct btrfs_path *path, int data_size, int empty, struct extent_buffer *left, int free_space, u32 right_nritems, u32 max_slot) @@ -3363,7 +3360,7 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, if (right_nritems) btrfs_mark_buffer_dirty(right); else - btrfs_clean_tree_block(right); + btrfs_clear_buffer_dirty(trans, right); btrfs_item_key(right, &disk_key, 0); fixup_low_keys(path, &disk_key, 1); @@ -3449,9 +3446,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root ret = -EUCLEAN; goto out; } - return __push_leaf_left(path, min_data_size, - empty, left, free_space, right_nritems, - max_slot); + return __push_leaf_left(trans, path, min_data_size, empty, left, + free_space, right_nritems, max_slot); out: btrfs_tree_unlock(left); free_extent_buffer(left); @@ -4400,7 +4396,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (leaf == root->node) { btrfs_set_header_level(leaf, 0); } else { - btrfs_clean_tree_block(leaf); + btrfs_clear_buffer_dirty(trans, leaf); btrfs_del_leaf(trans, root, path, leaf); } } else { diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6965703a81b6..97897107fab5 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -507,6 +507,21 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); /* ctree.c */ int __init btrfs_ctree_init(void); void __cold btrfs_ctree_exit(void); + +int btrfs_generic_bin_search(struct extent_buffer *eb, int first_slot, + const struct btrfs_key *key, int *slot); + +/* + * Simple binary search on an extent buffer. Works for both leaves and nodes, and + * always searches over the whole range of keys (slot 0 to slot 'nritems - 1'). + */ +static inline int btrfs_bin_search(struct extent_buffer *eb, + const struct btrfs_key *key, + int *slot) +{ + return btrfs_generic_bin_search(eb, 0, key, slot); +} + int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, int *slot); int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2); diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index d81b764a7644..8065341d831a 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -765,7 +765,7 @@ again: break; unlock_page(page); - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); lock_page(page); /* @@ -999,7 +999,7 @@ next: } #define CLUSTER_SIZE (SZ_256K) -static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); +static_assert(PAGE_ALIGNED(CLUSTER_SIZE)); /* * Defrag one contiguous target range. 
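Aside, not part of the patch: the IS_ALIGNED(x, PAGE_SIZE) to PAGE_ALIGNED(x) conversions in this series are cosmetic; both boil down to the same power-of-two mask test. A standalone model with an assumed 4 KiB page and invented TOY_* names:

#include <assert.h>
#include <stdint.h>

#define TOY_PAGE_SIZE 4096ULL

/* Same shape as the kernel macros: the alignment must be a power of two. */
#define TOY_IS_ALIGNED(x, a)  ((((uint64_t)(x)) & ((uint64_t)(a) - 1)) == 0)
#define TOY_PAGE_ALIGNED(x)   TOY_IS_ALIGNED(x, TOY_PAGE_SIZE)

int main(void)
{
    assert(TOY_PAGE_ALIGNED(8192));
    assert(!TOY_PAGE_ALIGNED(8193));
    /* The two spellings agree for any value. */
    assert(TOY_IS_ALIGNED(12345, TOY_PAGE_SIZE) == TOY_PAGE_ALIGNED(12345));
    return 0;
}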
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 573ebab886e2..886ffb232eac 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -437,8 +437,7 @@ int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, return 0; } -static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_root *delayed_refs, +static inline void drop_delayed_ref(struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *ref) { @@ -452,8 +451,7 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, atomic_dec(&delayed_refs->num_entries); } -static bool merge_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_root *delayed_refs, +static bool merge_ref(struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *ref, u64 seq) @@ -482,10 +480,10 @@ static bool merge_ref(struct btrfs_trans_handle *trans, mod = -next->ref_mod; } - drop_delayed_ref(trans, delayed_refs, head, next); + drop_delayed_ref(delayed_refs, head, next); ref->ref_mod += mod; if (ref->ref_mod == 0) { - drop_delayed_ref(trans, delayed_refs, head, ref); + drop_delayed_ref(delayed_refs, head, ref); done = true; } else { /* @@ -499,11 +497,10 @@ static bool merge_ref(struct btrfs_trans_handle *trans, return done; } -void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, +void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head) { - struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_node *ref; struct rb_node *node; u64 seq = 0; @@ -524,7 +521,7 @@ again: ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); if (seq && ref->seq >= seq) continue; - if (merge_ref(trans, delayed_refs, head, ref, seq)) + if (merge_ref(delayed_refs, head, ref, seq)) goto again; } } @@ -601,8 +598,7 @@ void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, * Return 0 for insert. * Return >0 for merge. 
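Aside, not part of the patch (the hunks above just stop passing the transaction handle into these helpers): merge_ref() folds a later delayed ref into an earlier one by adding its ref_mod, negated when the actions differ, and frees any entry whose net modification reaches zero. A toy userspace model of that arithmetic, with invented names:

#include <stdio.h>

enum toy_action { TOY_ADD_REF, TOY_DROP_REF };

struct toy_ref {
    enum toy_action action;
    int ref_mod;            /* how many references this entry adds or drops */
};

/*
 * Fold @next into @ref.  Returns 1 if @ref now has a net modification of
 * zero and can be dropped as well, 0 otherwise.
 */
static int toy_merge(struct toy_ref *ref, const struct toy_ref *next)
{
    int mod = (ref->action == next->action) ? next->ref_mod
                                            : -next->ref_mod;

    ref->ref_mod += mod;
    return ref->ref_mod == 0;
}

int main(void)
{
    struct toy_ref add  = { TOY_ADD_REF, 1 };
    struct toy_ref drop = { TOY_DROP_REF, 1 };

    /* An add followed by a matching drop cancels out completely. */
    printf("net zero: %d\n", toy_merge(&add, &drop));    /* prints 1 */
    return 0;
}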
*/ -static int insert_delayed_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_root *root, +static int insert_delayed_ref(struct btrfs_delayed_ref_root *root, struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *ref) { @@ -641,7 +637,7 @@ static int insert_delayed_ref(struct btrfs_trans_handle *trans, /* remove existing tail if its ref_mod is zero */ if (exist->ref_mod == 0) - drop_delayed_ref(trans, root, href, exist); + drop_delayed_ref(root, href, exist); spin_unlock(&href->lock); return ret; inserted: @@ -978,7 +974,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, head_ref = add_delayed_ref_head(trans, head_ref, record, action, &qrecord_inserted); - ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); + ret = insert_delayed_ref(delayed_refs, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); /* @@ -1070,7 +1066,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, head_ref = add_delayed_ref_head(trans, head_ref, record, action, &qrecord_inserted); - ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); + ret = insert_delayed_ref(delayed_refs, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); /* diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index d6304b690ec4..2eb34abf700f 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -357,7 +357,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, struct btrfs_delayed_extent_op *extent_op); -void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, +void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head); diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index ff2e524d9937..317aeff6c1da 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -78,6 +78,7 @@ static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl, static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) { + lockdep_assert_held(&discard_ctl->lock); if (!btrfs_run_discard_work(discard_ctl)) return; @@ -89,6 +90,8 @@ static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, BTRFS_DISCARD_DELAY); block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR; } + if (list_empty(&block_group->discard_list)) + btrfs_get_block_group(block_group); list_move_tail(&block_group->discard_list, get_discard_list(discard_ctl, block_group)); @@ -108,8 +111,12 @@ static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) { + bool queued; + spin_lock(&discard_ctl->lock); + queued = !list_empty(&block_group->discard_list); + if (!btrfs_run_discard_work(discard_ctl)) { spin_unlock(&discard_ctl->lock); return; @@ -121,6 +128,8 @@ static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl, block_group->discard_eligible_time = (ktime_get_ns() + BTRFS_DISCARD_UNUSED_DELAY); block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR; + if (!queued) + btrfs_get_block_group(block_group); list_add_tail(&block_group->discard_list, &discard_ctl->discard_list[BTRFS_DISCARD_INDEX_UNUSED]); @@ -131,6 +140,7 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) { bool running = 
false; + bool queued = false; spin_lock(&discard_ctl->lock); @@ -140,7 +150,16 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl, } block_group->discard_eligible_time = 0; + queued = !list_empty(&block_group->discard_list); list_del_init(&block_group->discard_list); + /* + * If the block group is currently running in the discard workfn, we + * don't want to deref it, since it's still being used by the workfn. + * The workfn will notice this case and deref the block group when it is + * finished. + */ + if (queued && !running) + btrfs_put_block_group(block_group); spin_unlock(&discard_ctl->lock); @@ -214,10 +233,12 @@ again: if (block_group && now >= block_group->discard_eligible_time) { if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED && block_group->used != 0) { - if (btrfs_is_block_group_data_only(block_group)) + if (btrfs_is_block_group_data_only(block_group)) { __add_to_discard_list(discard_ctl, block_group); - else + } else { list_del_init(&block_group->discard_list); + btrfs_put_block_group(block_group); + } goto again; } if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) { @@ -511,6 +532,15 @@ static void btrfs_discard_workfn(struct work_struct *work) spin_lock(&discard_ctl->lock); discard_ctl->prev_discard = trimmed; discard_ctl->prev_discard_time = now; + /* + * If the block group was removed from the discard list while it was + * running in this workfn, then we didn't deref it, since this function + * still owned that reference. But we set the discard_ctl->block_group + * back to NULL, so we can use that condition to know that now we need + * to deref the block_group. + */ + if (discard_ctl->block_group == NULL) + btrfs_put_block_group(block_group); discard_ctl->block_group = NULL; __btrfs_discard_schedule_work(discard_ctl, now, false); spin_unlock(&discard_ctl->lock); @@ -651,8 +681,12 @@ void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info) list_for_each_entry_safe(block_group, next, &fs_info->unused_bgs, bg_list) { list_del_init(&block_group->bg_list); - btrfs_put_block_group(block_group); btrfs_discard_queue_work(&fs_info->discard_ctl, block_group); + /* + * This put is for the get done by btrfs_mark_bg_unused. + * Queueing discard incremented it for discard's reference. + */ + btrfs_put_block_group(block_group); } spin_unlock(&fs_info->unused_bgs_lock); } @@ -683,6 +717,7 @@ static void btrfs_discard_purge_list(struct btrfs_discard_ctl *discard_ctl) if (block_group->used == 0) btrfs_mark_bg_unused(block_group); spin_lock(&discard_ctl->lock); + btrfs_put_block_group(block_group); } } spin_unlock(&discard_ctl->lock); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3aa04224315e..b53f0e30ce2b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -79,23 +79,6 @@ static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) } /* - * async submit bios are used to offload expensive checksumming - * onto the worker threads. They checksum file and metadata bios - * just before they are sent down the IO stack. - */ -struct async_submit_bio { - struct btrfs_inode *inode; - struct bio *bio; - enum btrfs_wq_submit_cmd submit_cmd; - int mirror_num; - - /* Optional parameter for used by direct io */ - u64 dio_file_offset; - struct btrfs_work work; - blk_status_t status; -}; - -/* * Compute the csum of a btree block and store the result to provided buffer. 
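Aside, not part of the patch: the discard.c hunks above take a block group reference the first time the group is queued on a discard list and drop it when the group leaves the list, deferring the drop to the workfn when the workfn still owns it. A stripped-down userspace model of the get-on-first-queue / put-on-dequeue idea, ignoring the locking and the workfn hand-off; all toy_* names are invented.

#include <stdbool.h>
#include <stdio.h>

struct toy_bg {
    int refs;
    bool queued;        /* stands in for !list_empty(&bg->discard_list) */
};

static void toy_get(struct toy_bg *bg) { bg->refs++; }
static void toy_put(struct toy_bg *bg) { bg->refs--; }

/* Take a reference only when the block group first enters a list. */
static void toy_queue_discard(struct toy_bg *bg)
{
    if (!bg->queued)
        toy_get(bg);
    bg->queued = true;
}

/* Drop the list's reference only when it actually leaves the list. */
static void toy_unqueue_discard(struct toy_bg *bg)
{
    if (bg->queued)
        toy_put(bg);
    bg->queued = false;
}

int main(void)
{
    struct toy_bg bg = { .refs = 1, .queued = false };

    toy_queue_discard(&bg);      /* refs: 2 */
    toy_queue_discard(&bg);      /* moved between lists, still 2 */
    toy_unqueue_discard(&bg);    /* back to 1 */
    printf("refs=%d\n", bg.refs);
    return 0;
}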
*/ static void csum_tree_block(struct extent_buffer *buf, u8 *result) @@ -455,6 +438,22 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec return csum_one_extent_buffer(eb); } +blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) +{ + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; + struct bvec_iter iter; + struct bio_vec bv; + int ret = 0; + + bio_for_each_segment(bv, &bbio->bio, iter) { + ret = csum_dirty_buffer(fs_info, &bv); + if (ret) + break; + } + + return errno_to_blk_status(ret); +} + static int check_tree_block_fsid(struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; @@ -700,172 +699,6 @@ err: return ret; } -static void run_one_async_start(struct btrfs_work *work) -{ - struct async_submit_bio *async; - blk_status_t ret; - - async = container_of(work, struct async_submit_bio, work); - switch (async->submit_cmd) { - case WQ_SUBMIT_METADATA: - ret = btree_submit_bio_start(async->bio); - break; - case WQ_SUBMIT_DATA: - ret = btrfs_submit_bio_start(async->inode, async->bio); - break; - case WQ_SUBMIT_DATA_DIO: - ret = btrfs_submit_bio_start_direct_io(async->inode, - async->bio, async->dio_file_offset); - break; - } - if (ret) - async->status = ret; -} - -/* - * In order to insert checksums into the metadata in large chunks, we wait - * until bio submission time. All the pages in the bio are checksummed and - * sums are attached onto the ordered extent record. - * - * At IO completion time the csums attached on the ordered extent record are - * inserted into the tree. - */ -static void run_one_async_done(struct btrfs_work *work) -{ - struct async_submit_bio *async = - container_of(work, struct async_submit_bio, work); - struct btrfs_inode *inode = async->inode; - struct btrfs_bio *bbio = btrfs_bio(async->bio); - - /* If an error occurred we just want to clean up the bio and move on */ - if (async->status) { - btrfs_bio_end_io(bbio, async->status); - return; - } - - /* - * All of the bios that pass through here are from async helpers. - * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context. - * This changes nothing when cgroups aren't in use. - */ - async->bio->bi_opf |= REQ_CGROUP_PUNT; - btrfs_submit_bio(inode->root->fs_info, async->bio, async->mirror_num); -} - -static void run_one_async_free(struct btrfs_work *work) -{ - struct async_submit_bio *async; - - async = container_of(work, struct async_submit_bio, work); - kfree(async); -} - -/* - * Submit bio to an async queue. 
- * - * Retrun: - * - true if the work has been succesfuly submitted - * - false in case of error - */ -bool btrfs_wq_submit_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num, - u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct async_submit_bio *async; - - async = kmalloc(sizeof(*async), GFP_NOFS); - if (!async) - return false; - - async->inode = inode; - async->bio = bio; - async->mirror_num = mirror_num; - async->submit_cmd = cmd; - - btrfs_init_work(&async->work, run_one_async_start, run_one_async_done, - run_one_async_free); - - async->dio_file_offset = dio_file_offset; - - async->status = 0; - - if (op_is_sync(bio->bi_opf)) - btrfs_queue_work(fs_info->hipri_workers, &async->work); - else - btrfs_queue_work(fs_info->workers, &async->work); - return true; -} - -static blk_status_t btree_csum_one_bio(struct bio *bio) -{ - struct bio_vec *bvec; - struct btrfs_root *root; - int ret = 0; - struct bvec_iter_all iter_all; - - ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, iter_all) { - root = BTRFS_I(bvec->bv_page->mapping->host)->root; - ret = csum_dirty_buffer(root->fs_info, bvec); - if (ret) - break; - } - - return errno_to_blk_status(ret); -} - -blk_status_t btree_submit_bio_start(struct bio *bio) -{ - /* - * when we're called for a write, we're already in the async - * submission context. Just jump into btrfs_submit_bio. - */ - return btree_csum_one_bio(bio); -} - -static bool should_async_write(struct btrfs_fs_info *fs_info, - struct btrfs_inode *bi) -{ - if (btrfs_is_zoned(fs_info)) - return false; - if (atomic_read(&bi->sync_writers)) - return false; - if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags)) - return false; - return true; -} - -void btrfs_submit_metadata_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_bio *bbio = btrfs_bio(bio); - blk_status_t ret; - - bio->bi_opf |= REQ_META; - bbio->is_metadata = 1; - - if (btrfs_op(bio) != BTRFS_MAP_WRITE) { - btrfs_submit_bio(fs_info, bio, mirror_num); - return; - } - - /* - * Kthread helpers are used to submit writes so that checksumming can - * happen in parallel across all CPUs. 
- */ - if (should_async_write(fs_info, inode) && - btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_METADATA)) - return; - - ret = btree_csum_one_bio(bio); - if (ret) { - btrfs_bio_end_io(bbio, ret); - return; - } - - btrfs_submit_bio(fs_info, bio, mirror_num); -} - #ifdef CONFIG_MIGRATION static int btree_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) @@ -1035,22 +868,6 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, } -void btrfs_clean_tree_block(struct extent_buffer *buf) -{ - struct btrfs_fs_info *fs_info = buf->fs_info; - if (btrfs_header_generation(buf) == - fs_info->running_transaction->transid) { - btrfs_assert_tree_write_locked(buf); - - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { - percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, - -buf->len, - fs_info->dirty_metadata_batch); - clear_extent_buffer_dirty(buf); - } - } -} - static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, u64 objectid) { @@ -1910,6 +1727,9 @@ static int cleaner_kthread(void *arg) goto sleep; } + if (test_and_clear_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags)) + btrfs_sysfs_feature_update(fs_info); + btrfs_run_delayed_iputs(fs_info); again = btrfs_clean_one_deleted_snapshot(fs_info); @@ -5159,11 +4979,12 @@ static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, start += fs_info->nodesize; if (!eb) continue; + + btrfs_tree_lock(eb); wait_on_extent_buffer_writeback(eb); + btrfs_clear_buffer_dirty(NULL, eb); + btrfs_tree_unlock(eb); - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, - &eb->bflags)) - clear_extent_buffer_dirty(eb); free_extent_buffer_stale(eb); } } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index f2f295eb6103..4d5772330110 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -39,7 +39,8 @@ struct extent_buffer *btrfs_find_create_tree_block( struct btrfs_fs_info *fs_info, u64 bytenr, u64 owner_root, int level); -void btrfs_clean_tree_block(struct extent_buffer *buf); +void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, + struct extent_buffer *buf); void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info); int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info); int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, @@ -86,7 +87,6 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, struct page *page, u64 start, u64 end, int mirror); -void btrfs_submit_metadata_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); #endif @@ -114,15 +114,7 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int btrfs_read_extent_buffer(struct extent_buffer *buf, struct btrfs_tree_parent_check *check); -enum btrfs_wq_submit_cmd { - WQ_SUBMIT_METADATA, - WQ_SUBMIT_DATA, - WQ_SUBMIT_DATA_DIO, -}; - -bool btrfs_wq_submit_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num, - u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd); -blk_status_t btree_submit_bio_start(struct bio *bio); +blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio); int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 3c7766dfaa69..29a225836e28 
100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -972,8 +972,8 @@ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, { struct extent_state *state; struct extent_state *prealloc = NULL; - struct rb_node **p; - struct rb_node *parent; + struct rb_node **p = NULL; + struct rb_node *parent = NULL; int err = 0; u64 last_start; u64 last_end; @@ -1218,8 +1218,8 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, { struct extent_state *state; struct extent_state *prealloc = NULL; - struct rb_node **p; - struct rb_node *parent; + struct rb_node **p = NULL; + struct rb_node *parent = NULL; int err = 0; u64 last_start; u64 last_end; @@ -1625,7 +1625,7 @@ search: } /* - * Searche a range in the state tree for a given mask. If 'filled' == 1, this + * Search a range in the state tree for a given mask. If 'filled' == 1, this * returns 1 only if every extent in the tree has the bits set. Otherwise, 1 * is returned if any bit in the range is found set. */ diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index e3eeec380844..21766e49ec02 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -6,7 +6,6 @@ #include "misc.h" struct extent_changeset; -struct io_failure_record; /* Bits for the extent state */ enum { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 72ba13b027a9..824c657f59e8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -16,7 +16,8 @@ #include <linux/percpu_counter.h> #include <linux/lockdep.h> #include <linux/crc32c.h> -#include "misc.h" +#include "ctree.h" +#include "extent-tree.h" #include "tree-log.h" #include "disk-io.h" #include "print-tree.h" @@ -31,14 +32,12 @@ #include "space-info.h" #include "block-rsv.h" #include "delalloc-space.h" -#include "block-group.h" #include "discard.h" #include "rcu-string.h" #include "zoned.h" #include "dev-replace.h" #include "fs.h" #include "accessors.h" -#include "extent-tree.h" #include "root-tree.h" #include "file-item.h" #include "orphan.h" @@ -1966,7 +1965,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, cond_resched(); spin_lock(&locked_ref->lock); - btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref); + btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); } return 0; @@ -2013,7 +2012,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, * insert_inline_extent_backref()). */ spin_lock(&locked_ref->lock); - btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref); + btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, &actual_count); @@ -3385,7 +3384,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) enum btrfs_loop_type { LOOP_CACHING_NOWAIT, LOOP_CACHING_WAIT, + LOOP_UNSET_SIZE_CLASS, LOOP_ALLOC_CHUNK, + LOOP_WRONG_SIZE_CLASS, LOOP_NO_EMPTY_SIZE, }; @@ -3453,81 +3454,6 @@ btrfs_release_block_group(struct btrfs_block_group *cache, btrfs_put_block_group(cache); } -enum btrfs_extent_allocation_policy { - BTRFS_EXTENT_ALLOC_CLUSTERED, - BTRFS_EXTENT_ALLOC_ZONED, -}; - -/* - * Structure used internally for find_free_extent() function. Wraps needed - * parameters. 
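Aside, not part of the patch: initializing p and parent to NULL in the extent-io-tree.c hunks above does not change behaviour; it presumably exists to keep maybe-uninitialized warnings quiet for flows where the pointers are only assigned and used under the same condition. The shape of that pattern in a toy userspace function, with invented names:

#include <stdio.h>

/*
 * 'cached' is written and read only when have_hint is set, which is fine at
 * runtime but is the kind of flow -Wmaybe-uninitialized can flag.  The
 * explicit "= 0" (NULL in the kernel hunk) silences it without changing
 * behaviour.
 */
static int toy_lookup(int have_hint, int hint)
{
    int cached = 0;        /* previously left uninitialized */

    if (have_hint)
        cached = hint * 2;

    if (have_hint)
        return cached;
    return -1;
}

int main(void)
{
    printf("%d %d\n", toy_lookup(1, 21), toy_lookup(0, 21));    /* 42 -1 */
    return 0;
}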
- */ -struct find_free_extent_ctl { - /* Basic allocation info */ - u64 ram_bytes; - u64 num_bytes; - u64 min_alloc_size; - u64 empty_size; - u64 flags; - int delalloc; - - /* Where to start the search inside the bg */ - u64 search_start; - - /* For clustered allocation */ - u64 empty_cluster; - struct btrfs_free_cluster *last_ptr; - bool use_cluster; - - bool have_caching_bg; - bool orig_have_caching_bg; - - /* Allocation is called for tree-log */ - bool for_treelog; - - /* Allocation is called for data relocation */ - bool for_data_reloc; - - /* RAID index, converted from flags */ - int index; - - /* - * Current loop number, check find_free_extent_update_loop() for details - */ - int loop; - - /* - * Whether we're refilling a cluster, if true we need to re-search - * current block group but don't try to refill the cluster again. - */ - bool retry_clustered; - - /* - * Whether we're updating free space cache, if true we need to re-search - * current block group but don't try updating free space cache again. - */ - bool retry_unclustered; - - /* If current block group is cached */ - int cached; - - /* Max contiguous hole found */ - u64 max_extent_size; - - /* Total free space from free space cache, not always contiguous */ - u64 total_free_space; - - /* Found result */ - u64 found_offset; - - /* Hint where to start looking for an empty space */ - u64 hint_byte; - - /* Allocation policy */ - enum btrfs_extent_allocation_policy policy; -}; - - /* * Helper function for find_free_extent(). * @@ -3559,8 +3485,7 @@ static int find_free_extent_clustered(struct btrfs_block_group *bg, if (offset) { /* We have a block, we're done */ spin_unlock(&last_ptr->refill_lock); - trace_btrfs_reserve_extent_cluster(cluster_bg, - ffe_ctl->search_start, ffe_ctl->num_bytes); + trace_btrfs_reserve_extent_cluster(cluster_bg, ffe_ctl); *cluster_bg_ret = cluster_bg; ffe_ctl->found_offset = offset; return 0; @@ -3610,10 +3535,8 @@ refill_cluster: if (offset) { /* We found one, proceed */ spin_unlock(&last_ptr->refill_lock); - trace_btrfs_reserve_extent_cluster(bg, - ffe_ctl->search_start, - ffe_ctl->num_bytes); ffe_ctl->found_offset = offset; + trace_btrfs_reserve_extent_cluster(bg, ffe_ctl); return 0; } } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT && @@ -4028,24 +3951,6 @@ static int can_allocate_chunk(struct btrfs_fs_info *fs_info, } } -static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl) -{ - switch (ffe_ctl->policy) { - case BTRFS_EXTENT_ALLOC_CLUSTERED: - /* - * If we can't allocate a new chunk we've already looped through - * at least once, move on to the NO_EMPTY_SIZE case. - */ - ffe_ctl->loop = LOOP_NO_EMPTY_SIZE; - return 0; - case BTRFS_EXTENT_ALLOC_ZONED: - /* Give up here */ - return -ENOSPC; - default: - BUG(); - } -} - /* * Return >0 means caller needs to re-search for free extent * Return 0 means we have the needed free extent. 
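Aside, not part of the patch: the allocator loop gains two relaxation stages, LOOP_UNSET_SIZE_CLASS and LOOP_WRONG_SIZE_CLASS (enum change above, find_free_extent_check_size_class() below). Early passes only accept block groups whose size class matches the allocation, later passes first admit unclassified groups and eventually any group. A compact userspace model of that gating, leaving out the zoned and should-use-size-class escape hatches of the real function; the toy_* names are invented.

#include <stdbool.h>
#include <stdio.h>

enum toy_loop {
    TOY_LOOP_CACHING_NOWAIT,
    TOY_LOOP_CACHING_WAIT,
    TOY_LOOP_UNSET_SIZE_CLASS,
    TOY_LOOP_ALLOC_CHUNK,
    TOY_LOOP_WRONG_SIZE_CLASS,
    TOY_LOOP_NO_EMPTY_SIZE,
};

enum toy_size_class { TOY_SZ_NONE, TOY_SZ_SMALL, TOY_SZ_MEDIUM, TOY_SZ_LARGE };

/*
 * Strict match first, unclassified block groups allowed once the loop
 * reaches the UNSET stage, anything allowed from the WRONG stage on.
 */
static bool toy_size_class_ok(enum toy_loop loop, enum toy_size_class want,
                              enum toy_size_class bg)
{
    if (loop >= TOY_LOOP_WRONG_SIZE_CLASS)
        return true;
    if (loop >= TOY_LOOP_UNSET_SIZE_CLASS && bg == TOY_SZ_NONE)
        return true;
    return want == bg;
}

int main(void)
{
    printf("%d %d %d\n",
           toy_size_class_ok(TOY_LOOP_CACHING_WAIT, TOY_SZ_SMALL, TOY_SZ_LARGE),      /* 0 */
           toy_size_class_ok(TOY_LOOP_UNSET_SIZE_CLASS, TOY_SZ_SMALL, TOY_SZ_NONE),   /* 1 */
           toy_size_class_ok(TOY_LOOP_WRONG_SIZE_CLASS, TOY_SZ_SMALL, TOY_SZ_LARGE)); /* 1 */
    return 0;
}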
@@ -4079,31 +3984,28 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking * caching kthreads as we move along * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching + * LOOP_UNSET_SIZE_CLASS, allow unset size class * LOOP_ALLOC_CHUNK, force a chunk allocation and try again * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try * again */ if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) { ffe_ctl->index = 0; - if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) { - /* - * We want to skip the LOOP_CACHING_WAIT step if we - * don't have any uncached bgs and we've already done a - * full search through. - */ - if (ffe_ctl->orig_have_caching_bg || !full_search) - ffe_ctl->loop = LOOP_CACHING_WAIT; - else - ffe_ctl->loop = LOOP_ALLOC_CHUNK; - } else { + /* + * We want to skip the LOOP_CACHING_WAIT step if we don't have + * any uncached bgs and we've already done a full search + * through. + */ + if (ffe_ctl->loop == LOOP_CACHING_NOWAIT && + (!ffe_ctl->orig_have_caching_bg && full_search)) ffe_ctl->loop++; - } + ffe_ctl->loop++; if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) { struct btrfs_trans_handle *trans; int exist = 0; - /*Check if allocation policy allows to create a new chunk */ + /* Check if allocation policy allows to create a new chunk */ ret = can_allocate_chunk(fs_info, ffe_ctl); if (ret) return ret; @@ -4123,8 +4025,10 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, CHUNK_ALLOC_FORCE_FOR_EXTENT); /* Do not bail out on ENOSPC since we can do more. */ - if (ret == -ENOSPC) - ret = chunk_allocation_failed(ffe_ctl); + if (ret == -ENOSPC) { + ret = 0; + ffe_ctl->loop++; + } else if (ret < 0) btrfs_abort_transaction(trans, ret); else @@ -4154,6 +4058,21 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, return -ENOSPC; } +static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_ctl, + struct btrfs_block_group *bg) +{ + if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED) + return true; + if (!btrfs_block_group_should_use_size_class(bg)) + return true; + if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS) + return true; + if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS && + bg->size_class == BTRFS_BG_SZ_NONE) + return true; + return ffe_ctl->size_class == bg->size_class; +} + static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl, struct btrfs_space_info *space_info, @@ -4288,6 +4207,7 @@ static noinline int find_free_extent(struct btrfs_root *root, ffe_ctl->total_free_space = 0; ffe_ctl->found_offset = 0; ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED; + ffe_ctl->size_class = btrfs_calc_block_group_size_class(ffe_ctl->num_bytes); if (btrfs_is_zoned(fs_info)) ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED; @@ -4296,8 +4216,7 @@ static noinline int find_free_extent(struct btrfs_root *root, ins->objectid = 0; ins->offset = 0; - trace_find_free_extent(root, ffe_ctl->num_bytes, ffe_ctl->empty_size, - ffe_ctl->flags); + trace_find_free_extent(root, ffe_ctl); space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags); if (!space_info) { @@ -4340,6 +4259,7 @@ static noinline int find_free_extent(struct btrfs_root *root, block_group->flags); btrfs_lock_block_group(block_group, ffe_ctl->delalloc); + ffe_ctl->hinted = true; goto have_block_group; } } else if (block_group) { @@ -4347,6 +4267,7 @@ static noinline int find_free_extent(struct btrfs_root *root, } } search: + 
trace_find_free_extent_search_loop(root, ffe_ctl); ffe_ctl->have_caching_bg = false; if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) || ffe_ctl->index == 0) @@ -4356,6 +4277,7 @@ search: &space_info->block_groups[ffe_ctl->index], list) { struct btrfs_block_group *bg_ret; + ffe_ctl->hinted = false; /* If the block group is read-only, we can skip it entirely. */ if (unlikely(block_group->ro)) { if (ffe_ctl->for_treelog) @@ -4397,6 +4319,7 @@ search: } have_block_group: + trace_find_free_extent_have_block_group(root, ffe_ctl, block_group); ffe_ctl->cached = btrfs_block_group_done(block_group); if (unlikely(!ffe_ctl->cached)) { ffe_ctl->have_caching_bg = true; @@ -4421,6 +4344,9 @@ have_block_group: if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) goto loop; + if (!find_free_extent_check_size_class(ffe_ctl, block_group)) + goto loop; + bg_ret = NULL; ret = do_allocation(block_group, ffe_ctl, &bg_ret); if (ret == 0) { @@ -4455,7 +4381,8 @@ have_block_group: ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes, ffe_ctl->num_bytes, - ffe_ctl->delalloc); + ffe_ctl->delalloc, + ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS); if (ret == -EAGAIN) { btrfs_add_free_space_unused(block_group, ffe_ctl->found_offset, @@ -4468,8 +4395,7 @@ have_block_group: ins->objectid = ffe_ctl->search_start; ins->offset = ffe_ctl->num_bytes; - trace_btrfs_reserve_extent(block_group, ffe_ctl->search_start, - ffe_ctl->num_bytes); + trace_btrfs_reserve_extent(block_group, ffe_ctl); btrfs_release_block_group(block_group, ffe_ctl->delalloc); break; loop: @@ -4912,7 +4838,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level); __btrfs_tree_lock(buf, nest); - btrfs_clean_tree_block(buf); + btrfs_clear_buffer_dirty(trans, buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags); @@ -5542,13 +5468,12 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, } } } - /* make block locked assertion in btrfs_clean_tree_block happy */ - if (!path->locks[level] && - btrfs_header_generation(eb) == trans->transid) { + /* Make block locked assertion in btrfs_clear_buffer_dirty happy. 
*/ + if (!path->locks[level]) { btrfs_tree_lock(eb); path->locks[level] = BTRFS_WRITE_LOCK; } - btrfs_clean_tree_block(eb); + btrfs_clear_buffer_dirty(trans, eb); } if (eb == root->node) { diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index ae5425253603..0c958fc1b3b8 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -3,6 +3,87 @@ #ifndef BTRFS_EXTENT_TREE_H #define BTRFS_EXTENT_TREE_H +#include "misc.h" +#include "block-group.h" + +struct btrfs_free_cluster; + +enum btrfs_extent_allocation_policy { + BTRFS_EXTENT_ALLOC_CLUSTERED, + BTRFS_EXTENT_ALLOC_ZONED, +}; + +struct find_free_extent_ctl { + /* Basic allocation info */ + u64 ram_bytes; + u64 num_bytes; + u64 min_alloc_size; + u64 empty_size; + u64 flags; + int delalloc; + + /* Where to start the search inside the bg */ + u64 search_start; + + /* For clustered allocation */ + u64 empty_cluster; + struct btrfs_free_cluster *last_ptr; + bool use_cluster; + + bool have_caching_bg; + bool orig_have_caching_bg; + + /* Allocation is called for tree-log */ + bool for_treelog; + + /* Allocation is called for data relocation */ + bool for_data_reloc; + + /* RAID index, converted from flags */ + int index; + + /* + * Current loop number, check find_free_extent_update_loop() for details + */ + int loop; + + /* + * Whether we're refilling a cluster, if true we need to re-search + * current block group but don't try to refill the cluster again. + */ + bool retry_clustered; + + /* + * Whether we're updating free space cache, if true we need to re-search + * current block group but don't try updating free space cache again. + */ + bool retry_unclustered; + + /* If current block group is cached */ + int cached; + + /* Max contiguous hole found */ + u64 max_extent_size; + + /* Total free space from free space cache, not always contiguous */ + u64 total_free_space; + + /* Found result */ + u64 found_offset; + + /* Hint where to start looking for an empty space */ + u64 hint_byte; + + /* Allocation policy */ + enum btrfs_extent_allocation_policy policy; + + /* Whether or not the allocator is currently following a hint */ + bool hinted; + + /* Size class of block groups to prefer in early loops */ + enum btrfs_block_group_size_class size_class; +}; + enum btrfs_inline_ref_type { BTRFS_REF_TYPE_INVALID, BTRFS_REF_TYPE_BLOCK, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3bbf8703db2a..c25fa74d7615 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -36,6 +36,7 @@ #include "file.h" #include "dev-replace.h" #include "super.h" +#include "transaction.h" static struct kmem_cache *extent_buffer_cache; @@ -99,7 +100,6 @@ struct btrfs_bio_ctrl { struct bio *bio; int mirror_num; enum btrfs_compression_type compress_type; - u32 len_to_stripe_boundary; u32 len_to_oe_boundary; btrfs_bio_end_io_t end_io_func; @@ -126,7 +126,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) { struct bio *bio; struct bio_vec *bv; - struct btrfs_inode *inode; + struct inode *inode; int mirror_num; if (!bio_ctrl->bio) @@ -134,15 +134,13 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) bio = bio_ctrl->bio; bv = bio_first_bvec_all(bio); - inode = BTRFS_I(bv->bv_page->mapping->host); + inode = bv->bv_page->mapping->host; mirror_num = bio_ctrl->mirror_num; /* Caller should ensure the bio has at least some range added */ ASSERT(bio->bi_iter.bi_size); - btrfs_bio(bio)->file_offset = page_offset(bv->bv_page) + bv->bv_offset; - - if (!is_data_inode(&inode->vfs_inode)) { + if (!is_data_inode(inode)) { if 
(btrfs_op(bio) != BTRFS_MAP_WRITE) { /* * For metadata read, we should have the parent_check, @@ -153,14 +151,15 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) bio_ctrl->parent_check, sizeof(struct btrfs_tree_parent_check)); } - btrfs_submit_metadata_bio(inode, bio, mirror_num); - } else if (btrfs_op(bio) == BTRFS_MAP_WRITE) { - btrfs_submit_data_write_bio(inode, bio, mirror_num); - } else { - btrfs_submit_data_read_bio(inode, bio, mirror_num, - bio_ctrl->compress_type); + bio->bi_opf |= REQ_META; } + if (btrfs_op(bio) == BTRFS_MAP_READ && + bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) + btrfs_submit_compressed_read(inode, bio, mirror_num); + else + btrfs_submit_bio(bio, mirror_num); + /* The bio is owned by the end_io handler now */ bio_ctrl->bio = NULL; } @@ -515,266 +514,6 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, start, end, page_ops, NULL); } -static int insert_failrec(struct btrfs_inode *inode, - struct io_failure_record *failrec) -{ - struct rb_node *exist; - - spin_lock(&inode->io_failure_lock); - exist = rb_simple_insert(&inode->io_failure_tree, failrec->bytenr, - &failrec->rb_node); - spin_unlock(&inode->io_failure_lock); - - return (exist == NULL) ? 0 : -EEXIST; -} - -static struct io_failure_record *get_failrec(struct btrfs_inode *inode, u64 start) -{ - struct rb_node *node; - struct io_failure_record *failrec = ERR_PTR(-ENOENT); - - spin_lock(&inode->io_failure_lock); - node = rb_simple_search(&inode->io_failure_tree, start); - if (node) - failrec = rb_entry(node, struct io_failure_record, rb_node); - spin_unlock(&inode->io_failure_lock); - return failrec; -} - -static void free_io_failure(struct btrfs_inode *inode, - struct io_failure_record *rec) -{ - spin_lock(&inode->io_failure_lock); - rb_erase(&rec->rb_node, &inode->io_failure_tree); - spin_unlock(&inode->io_failure_lock); - - kfree(rec); -} - -static int next_mirror(const struct io_failure_record *failrec, int cur_mirror) -{ - if (cur_mirror == failrec->num_copies) - return cur_mirror + 1 - failrec->num_copies; - return cur_mirror + 1; -} - -static int prev_mirror(const struct io_failure_record *failrec, int cur_mirror) -{ - if (cur_mirror == 1) - return failrec->num_copies; - return cur_mirror - 1; -} - -/* - * each time an IO finishes, we do a fast check in the IO failure tree - * to see if we need to process or clean up an io_failure_record - */ -int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start, - struct page *page, unsigned int pg_offset) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct extent_io_tree *io_tree = &inode->io_tree; - u64 ino = btrfs_ino(inode); - u64 locked_start, locked_end; - struct io_failure_record *failrec; - int mirror; - int ret; - - failrec = get_failrec(inode, start); - if (IS_ERR(failrec)) - return 0; - - BUG_ON(!failrec->this_mirror); - - if (sb_rdonly(fs_info->sb)) - goto out; - - ret = find_first_extent_bit(io_tree, failrec->bytenr, &locked_start, - &locked_end, EXTENT_LOCKED, NULL); - if (ret || locked_start > failrec->bytenr || - locked_end < failrec->bytenr + failrec->len - 1) - goto out; - - mirror = failrec->this_mirror; - do { - mirror = prev_mirror(failrec, mirror); - btrfs_repair_io_failure(fs_info, ino, start, failrec->len, - failrec->logical, page, pg_offset, mirror); - } while (mirror != failrec->failed_mirror); - -out: - free_io_failure(inode, failrec); - return 0; -} - -/* - * Can be called when - * - hold extent lock - * - under ordered extent - * - the inode is freeing - */ -void 
btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end) -{ - struct io_failure_record *failrec; - struct rb_node *node, *next; - - if (RB_EMPTY_ROOT(&inode->io_failure_tree)) - return; - - spin_lock(&inode->io_failure_lock); - node = rb_simple_search_first(&inode->io_failure_tree, start); - while (node) { - failrec = rb_entry(node, struct io_failure_record, rb_node); - if (failrec->bytenr > end) - break; - - next = rb_next(node); - rb_erase(&failrec->rb_node, &inode->io_failure_tree); - kfree(failrec); - - node = next; - } - spin_unlock(&inode->io_failure_lock); -} - -static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode, - struct btrfs_bio *bbio, - unsigned int bio_offset) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - u64 start = bbio->file_offset + bio_offset; - struct io_failure_record *failrec; - const u32 sectorsize = fs_info->sectorsize; - int ret; - - failrec = get_failrec(BTRFS_I(inode), start); - if (!IS_ERR(failrec)) { - btrfs_debug(fs_info, - "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu", - failrec->logical, failrec->bytenr, failrec->len); - /* - * when data can be on disk more than twice, add to failrec here - * (e.g. with a list for failed_mirror) to make - * clean_io_failure() clean all those errors at once. - */ - ASSERT(failrec->this_mirror == bbio->mirror_num); - ASSERT(failrec->len == fs_info->sectorsize); - return failrec; - } - - failrec = kzalloc(sizeof(*failrec), GFP_NOFS); - if (!failrec) - return ERR_PTR(-ENOMEM); - - RB_CLEAR_NODE(&failrec->rb_node); - failrec->bytenr = start; - failrec->len = sectorsize; - failrec->failed_mirror = bbio->mirror_num; - failrec->this_mirror = bbio->mirror_num; - failrec->logical = (bbio->iter.bi_sector << SECTOR_SHIFT) + bio_offset; - - btrfs_debug(fs_info, - "new io failure record logical %llu start %llu", - failrec->logical, start); - - failrec->num_copies = btrfs_num_copies(fs_info, failrec->logical, sectorsize); - if (failrec->num_copies == 1) { - /* - * We only have a single copy of the data, so don't bother with - * all the retry and error correction code that follows. No - * matter what the error is, it is very likely to persist. 
- */ - btrfs_debug(fs_info, - "cannot repair logical %llu num_copies %d", - failrec->logical, failrec->num_copies); - kfree(failrec); - return ERR_PTR(-EIO); - } - - /* Set the bits in the private failure tree */ - ret = insert_failrec(BTRFS_I(inode), failrec); - if (ret) { - kfree(failrec); - return ERR_PTR(ret); - } - - return failrec; -} - -int btrfs_repair_one_sector(struct btrfs_inode *inode, struct btrfs_bio *failed_bbio, - u32 bio_offset, struct page *page, unsigned int pgoff, - bool submit_buffered) -{ - u64 start = failed_bbio->file_offset + bio_offset; - struct io_failure_record *failrec; - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct bio *failed_bio = &failed_bbio->bio; - const int icsum = bio_offset >> fs_info->sectorsize_bits; - struct bio *repair_bio; - struct btrfs_bio *repair_bbio; - - btrfs_debug(fs_info, - "repair read error: read error at %llu", start); - - BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); - - failrec = btrfs_get_io_failure_record(&inode->vfs_inode, failed_bbio, bio_offset); - if (IS_ERR(failrec)) - return PTR_ERR(failrec); - - /* - * There are two premises: - * a) deliver good data to the caller - * b) correct the bad sectors on disk - * - * Since we're only doing repair for one sector, we only need to get - * a good copy of the failed sector and if we succeed, we have setup - * everything for btrfs_repair_io_failure to do the rest for us. - */ - failrec->this_mirror = next_mirror(failrec, failrec->this_mirror); - if (failrec->this_mirror == failrec->failed_mirror) { - btrfs_debug(fs_info, - "failed to repair num_copies %d this_mirror %d failed_mirror %d", - failrec->num_copies, failrec->this_mirror, failrec->failed_mirror); - free_io_failure(inode, failrec); - return -EIO; - } - - repair_bio = btrfs_bio_alloc(1, REQ_OP_READ, failed_bbio->end_io, - failed_bbio->private); - repair_bbio = btrfs_bio(repair_bio); - repair_bbio->file_offset = start; - repair_bio->bi_iter.bi_sector = failrec->logical >> 9; - - if (failed_bbio->csum) { - const u32 csum_size = fs_info->csum_size; - - repair_bbio->csum = repair_bbio->csum_inline; - memcpy(repair_bbio->csum, - failed_bbio->csum + csum_size * icsum, csum_size); - } - - bio_add_page(repair_bio, page, failrec->len, pgoff); - repair_bbio->iter = repair_bio->bi_iter; - - btrfs_debug(fs_info, - "repair read error: submitting new read to mirror %d", - failrec->this_mirror); - - /* - * At this point we have a bio, so any errors from bio submission will - * be handled by the endio on the repair_bio, so we can't return an - * error here. 
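Aside, not part of the patch: the repair path being removed here walked the extra copies in a ring, next_mirror() wrapping from num_copies back to 1, and gave up once the rotation returned to the mirror that had failed. A standalone model of that rotation with invented names; this hunk itself only deletes the old implementation.

#include <stdio.h>

/* Mirrors are numbered 1..num_copies, as in the removed io_failure_record. */
static int toy_next_mirror(int cur, int num_copies)
{
    return (cur == num_copies) ? 1 : cur + 1;
}

/*
 * Enumerate the order in which the other copies would be retried, stopping
 * when the rotation comes back to the failed mirror, the same termination
 * condition the removed btrfs_repair_one_sector() used.
 */
static void toy_retry_order(int failed_mirror, int num_copies)
{
    for (int m = toy_next_mirror(failed_mirror, num_copies);
         m != failed_mirror;
         m = toy_next_mirror(m, num_copies))
        printf("would retry mirror %d\n", m);
}

int main(void)
{
    toy_retry_order(2, 3);    /* mirror 3, then mirror 1 */
    return 0;
}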
- */ - if (submit_buffered) - btrfs_submit_data_read_bio(inode, repair_bio, - failrec->this_mirror, 0); - else - btrfs_submit_dio_repair_bio(inode, repair_bio, failrec->this_mirror); - - return BLK_STS_OK; -} - static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) { struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); @@ -803,79 +542,6 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) btrfs_subpage_end_reader(fs_info, page, start, len); } -static void end_sector_io(struct page *page, u64 offset, bool uptodate) -{ - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); - const u32 sectorsize = inode->root->fs_info->sectorsize; - - end_page_read(page, uptodate, offset, sectorsize); - unlock_extent(&inode->io_tree, offset, offset + sectorsize - 1, NULL); -} - -static void submit_data_read_repair(struct inode *inode, - struct btrfs_bio *failed_bbio, - u32 bio_offset, const struct bio_vec *bvec, - unsigned int error_bitmap) -{ - const unsigned int pgoff = bvec->bv_offset; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct page *page = bvec->bv_page; - const u64 start = page_offset(bvec->bv_page) + bvec->bv_offset; - const u64 end = start + bvec->bv_len - 1; - const u32 sectorsize = fs_info->sectorsize; - const int nr_bits = (end + 1 - start) >> fs_info->sectorsize_bits; - int i; - - BUG_ON(bio_op(&failed_bbio->bio) == REQ_OP_WRITE); - - /* This repair is only for data */ - ASSERT(is_data_inode(inode)); - - /* We're here because we had some read errors or csum mismatch */ - ASSERT(error_bitmap); - - /* - * We only get called on buffered IO, thus page must be mapped and bio - * must not be cloned. - */ - ASSERT(page->mapping && !bio_flagged(&failed_bbio->bio, BIO_CLONED)); - - /* Iterate through all the sectors in the range */ - for (i = 0; i < nr_bits; i++) { - const unsigned int offset = i * sectorsize; - bool uptodate = false; - int ret; - - if (!(error_bitmap & (1U << i))) { - /* - * This sector has no error, just end the page read - * and unlock the range. - */ - uptodate = true; - goto next; - } - - ret = btrfs_repair_one_sector(BTRFS_I(inode), failed_bbio, - bio_offset + offset, page, pgoff + offset, - true); - if (!ret) { - /* - * We have submitted the read repair, the page release - * will be handled by the endio function of the - * submitted repair bio. - * Thus we don't need to do any thing here. - */ - continue; - } - /* - * Continue on failed repair, otherwise the remaining sectors - * will not be properly unlocked. 
- */ -next: - end_sector_io(page, start + offset, uptodate); - } -} - /* lots and lots of room for performance fixes in the end_bio funcs */ void end_extent_writepage(struct page *page, int err, u64 start, u64 end) @@ -919,7 +585,6 @@ static void end_bio_extent_writepage(struct btrfs_bio *bbio) u64 start; u64 end; struct bvec_iter_all iter_all; - bool first_bvec = true; ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_segment_all(bvec, bio, iter_all) { @@ -941,11 +606,6 @@ static void end_bio_extent_writepage(struct btrfs_bio *bbio) start = page_offset(page) + bvec->bv_offset; end = start + bvec->bv_len - 1; - if (first_bvec) { - btrfs_record_physical_zoned(inode, start, bio); - first_bvec = false; - } - end_extent_writepage(page, error, start, end); btrfs_page_clear_writeback(fs_info, page, start, bvec->bv_len); @@ -1093,8 +753,6 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const u32 sectorsize = fs_info->sectorsize; - unsigned int error_bitmap = (unsigned int)-1; - bool repair = false; u64 start; u64 end; u32 len; @@ -1126,25 +784,14 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) len = bvec->bv_len; mirror = bbio->mirror_num; - if (likely(uptodate)) { - if (is_data_inode(inode)) { - error_bitmap = btrfs_verify_data_csum(bbio, - bio_offset, page, start, end); - if (error_bitmap) - uptodate = false; - } else { - if (btrfs_validate_metadata_buffer(bbio, - page, start, end, mirror)) - uptodate = false; - } - } + if (uptodate && !is_data_inode(inode) && + btrfs_validate_metadata_buffer(bbio, page, start, end, mirror)) + uptodate = false; if (likely(uptodate)) { loff_t i_size = i_size_read(inode); pgoff_t end_index = i_size >> PAGE_SHIFT; - btrfs_clean_io_failure(BTRFS_I(inode), start, page, 0); - /* * Zero out the remaining part if this range straddles * i_size. @@ -1161,19 +808,7 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) zero_user_segment(page, zero_start, offset_in_page(end) + 1); } - } else if (is_data_inode(inode)) { - /* - * Only try to repair bios that actually made it to a - * device. If the bio failed to be submitted mirror - * is 0 and we need to fail it without retrying. - * - * This also includes the high level bios for compressed - * extents - these never make it to a device and repair - * is already handled on the lower compressed bio. - */ - if (mirror > 0) - repair = true; - } else { + } else if (!is_data_inode(inode)) { struct extent_buffer *eb; eb = find_extent_buffer_readpage(fs_info, page, start); @@ -1182,19 +817,10 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) atomic_dec(&eb->io_pages); } - if (repair) { - /* - * submit_data_read_repair() will handle all the good - * and bad sectors, we just continue to the next bvec. - */ - submit_data_read_repair(inode, bbio, bio_offset, bvec, - error_bitmap); - } else { - /* Update page status and unlock */ - end_page_read(page, uptodate, start, len); - endio_readpage_release_extent(&processed, BTRFS_I(inode), - start, end, PageUptodate(page)); - } + /* Update page status and unlock. 
*/ + end_page_read(page, uptodate, start, len); + endio_readpage_release_extent(&processed, BTRFS_I(inode), + start, end, PageUptodate(page)); ASSERT(bio_offset + len > bio_offset); bio_offset += len; @@ -1202,7 +828,6 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) } /* Release the last extent */ endio_readpage_release_extent(&processed, NULL, 0, 0, false); - btrfs_bio_free_csum(bbio); bio_put(bio); } @@ -1270,11 +895,10 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, u32 real_size; const sector_t sector = disk_bytenr >> SECTOR_SHIFT; bool contig = false; - int ret; ASSERT(bio); /* The limit should be calculated when bio_ctrl->bio is allocated */ - ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary); + ASSERT(bio_ctrl->len_to_oe_boundary); if (bio_ctrl->compress_type != compress_type) return 0; @@ -1310,9 +934,7 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, if (!contig) return 0; - real_size = min(bio_ctrl->len_to_oe_boundary, - bio_ctrl->len_to_stripe_boundary) - bio_size; - real_size = min(real_size, size); + real_size = min(bio_ctrl->len_to_oe_boundary - bio_size, size); /* * If real_size is 0, never call bio_add_*_page(), as even size is 0, @@ -1321,82 +943,45 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, if (real_size == 0) return 0; - if (bio_op(bio) == REQ_OP_ZONE_APPEND) - ret = bio_add_zone_append_page(bio, page, real_size, pg_offset); - else - ret = bio_add_page(bio, page, real_size, pg_offset); - - return ret; + return bio_add_page(bio, page, real_size, pg_offset); } -static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, - struct btrfs_inode *inode, u64 file_offset) +static void calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, + struct btrfs_inode *inode, u64 file_offset) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_io_geometry geom; struct btrfs_ordered_extent *ordered; - struct extent_map *em; - u64 logical = (bio_ctrl->bio->bi_iter.bi_sector << SECTOR_SHIFT); - int ret; /* - * Pages for compressed extent are never submitted to disk directly, - * thus it has no real boundary, just set them to U32_MAX. - * - * The split happens for real compressed bio, which happens in - * btrfs_submit_compressed_read/write(). + * Limit the extent to the ordered boundary for Zone Append. + * Compressed bios aren't submitted directly, so it doesn't apply to + * them. 
*/ - if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) { - bio_ctrl->len_to_oe_boundary = U32_MAX; - bio_ctrl->len_to_stripe_boundary = U32_MAX; - return 0; - } - em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize); - if (IS_ERR(em)) - return PTR_ERR(em); - ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio_ctrl->bio), - logical, &geom); - free_extent_map(em); - if (ret < 0) { - return ret; - } - if (geom.len > U32_MAX) - bio_ctrl->len_to_stripe_boundary = U32_MAX; - else - bio_ctrl->len_to_stripe_boundary = (u32)geom.len; - - if (bio_op(bio_ctrl->bio) != REQ_OP_ZONE_APPEND) { - bio_ctrl->len_to_oe_boundary = U32_MAX; - return 0; - } - - /* Ordered extent not yet created, so we're good */ - ordered = btrfs_lookup_ordered_extent(inode, file_offset); - if (!ordered) { - bio_ctrl->len_to_oe_boundary = U32_MAX; - return 0; + if (bio_ctrl->compress_type == BTRFS_COMPRESS_NONE && + btrfs_use_zone_append(btrfs_bio(bio_ctrl->bio))) { + ordered = btrfs_lookup_ordered_extent(inode, file_offset); + if (ordered) { + bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX, + ordered->file_offset + + ordered->disk_num_bytes - file_offset); + btrfs_put_ordered_extent(ordered); + return; + } } - bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX, - ordered->disk_bytenr + ordered->disk_num_bytes - logical); - btrfs_put_ordered_extent(ordered); - return 0; + bio_ctrl->len_to_oe_boundary = U32_MAX; } -static int alloc_new_bio(struct btrfs_inode *inode, - struct btrfs_bio_ctrl *bio_ctrl, - struct writeback_control *wbc, - blk_opf_t opf, - u64 disk_bytenr, u32 offset, u64 file_offset, - enum btrfs_compression_type compress_type) +static void alloc_new_bio(struct btrfs_inode *inode, + struct btrfs_bio_ctrl *bio_ctrl, + struct writeback_control *wbc, blk_opf_t opf, + u64 disk_bytenr, u32 offset, u64 file_offset, + enum btrfs_compression_type compress_type) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct bio *bio; - int ret; - ASSERT(bio_ctrl->end_io_func); - - bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, bio_ctrl->end_io_func, NULL); + bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, inode, bio_ctrl->end_io_func, + NULL); /* * For compressed page range, its disk_bytenr is always @disk_bytenr * passed in, no matter if we have added any range into previous bio. @@ -1405,48 +990,21 @@ static int alloc_new_bio(struct btrfs_inode *inode, bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; else bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT; + btrfs_bio(bio)->file_offset = file_offset; bio_ctrl->bio = bio; bio_ctrl->compress_type = compress_type; - ret = calc_bio_boundaries(bio_ctrl, inode, file_offset); - if (ret < 0) - goto error; + calc_bio_boundaries(bio_ctrl, inode, file_offset); if (wbc) { /* - * For Zone append we need the correct block_device that we are - * going to write to set in the bio to be able to respect the - * hardware limitation. Look it up here: + * Pick the last added device to support cgroup writeback. For + * multi-device file systems this means blk-cgroup policies have + * to always be set on the last added/replaced device. + * This is a bit odd but has been like that for a long time. */ - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - struct btrfs_device *dev; - - dev = btrfs_zoned_get_device(fs_info, disk_bytenr, - fs_info->sectorsize); - if (IS_ERR(dev)) { - ret = PTR_ERR(dev); - goto error; - } - - bio_set_dev(bio, dev->bdev); - } else { - /* - * Otherwise pick the last added device to support - * cgroup writeback. 
For multi-device file systems this - * means blk-cgroup policies have to always be set on the - * last added/replaced device. This is a bit odd but has - * been like that for a long time. - */ - bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev); - } + bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev); wbc_init_bio(wbc, bio); - } else { - ASSERT(bio_op(bio) != REQ_OP_ZONE_APPEND); } - return 0; -error: - bio_ctrl->bio = NULL; - btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); - return ret; } /* @@ -1472,7 +1030,6 @@ static int submit_extent_page(blk_opf_t opf, enum btrfs_compression_type compress_type, bool force_bio_submit) { - int ret = 0; struct btrfs_inode *inode = BTRFS_I(page->mapping->host); unsigned int cur = pg_offset; @@ -1492,12 +1049,9 @@ static int submit_extent_page(blk_opf_t opf, /* Allocate new bio if needed */ if (!bio_ctrl->bio) { - ret = alloc_new_bio(inode, bio_ctrl, wbc, opf, - disk_bytenr, offset, - page_offset(page) + cur, - compress_type); - if (ret < 0) - return ret; + alloc_new_bio(inode, bio_ctrl, wbc, opf, disk_bytenr, + offset, page_offset(page) + cur, + compress_type); } /* * We must go through btrfs_bio_add_page() to ensure each @@ -2054,10 +1608,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, * find_next_dirty_byte() are all exclusive */ iosize = min(min(em_end, end + 1), dirty_range_end) - cur; - - if (btrfs_use_zone_append(inode, em->block_start)) - op = REQ_OP_ZONE_APPEND; - free_extent_map(em); em = NULL; @@ -2361,13 +1911,6 @@ static void set_btree_ioerr(struct page *page, struct extent_buffer *eb) mapping_set_error(page->mapping, -EIO); /* - * If we error out, we should add back the dirty_metadata_bytes - * to make it consistent. - */ - percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, - eb->len, fs_info->dirty_metadata_batch); - - /* * If writeback for a btree extent that doesn't belong to a log tree * failed, increment the counter transaction->eb_write_errors. 
* We do this because while the transaction is running and before it's @@ -4724,12 +4267,25 @@ static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb) WARN_ON(atomic_read(&eb->refs) == 0); } -void clear_extent_buffer_dirty(const struct extent_buffer *eb) +void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, + struct extent_buffer *eb) { + struct btrfs_fs_info *fs_info = eb->fs_info; int i; int num_pages; struct page *page; + btrfs_assert_tree_write_locked(eb); + + if (trans && btrfs_header_generation(eb) != trans->transid) + return; + + if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) + return; + + percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -eb->len, + fs_info->dirty_metadata_batch); + if (eb->fs_info->nodesize < PAGE_SIZE) return clear_subpage_extent_buffer_dirty(eb); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index a2c82448b2e0..4341ad978fb8 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -11,6 +11,8 @@ #include "ulist.h" #include "misc.h" +struct btrfs_trans_handle; + enum { EXTENT_BUFFER_UPTODATE, EXTENT_BUFFER_DIRTY, @@ -60,11 +62,9 @@ enum { #define BITMAP_LAST_BYTE_MASK(nbits) \ (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1))) -struct btrfs_bio; struct btrfs_root; struct btrfs_inode; struct btrfs_fs_info; -struct io_failure_record; struct extent_io_tree; struct btrfs_tree_parent_check; @@ -262,7 +262,6 @@ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long star void extent_buffer_bitmap_clear(const struct extent_buffer *eb, unsigned long start, unsigned long pos, unsigned long len); -void clear_extent_buffer_dirty(const struct extent_buffer *eb); bool set_extent_buffer_dirty(struct extent_buffer *eb); void set_extent_buffer_uptodate(struct extent_buffer *eb); void clear_extent_buffer_uptodate(struct extent_buffer *eb); @@ -274,40 +273,13 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, u32 bits_to_clear, unsigned long page_ops); int extent_invalidate_folio(struct extent_io_tree *tree, struct folio *folio, size_t offset); +void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, + struct extent_buffer *buf); int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array); void end_extent_writepage(struct page *page, int err, u64 start, u64 end); -/* - * When IO fails, either with EIO or csum verification fails, we - * try other mirrors that might have a good copy of the data. This - * io_failure_record is used to record state as we go through all the - * mirrors. If another mirror has good data, the sector is set up to date - * and things continue. If a good mirror can't be found, the original - * bio end_io callback is called to indicate things have failed. 
- */ -struct io_failure_record { - /* Use rb_simple_node for search/insert */ - struct { - struct rb_node rb_node; - u64 bytenr; - }; - struct page *page; - u64 len; - u64 logical; - int this_mirror; - int failed_mirror; - int num_copies; -}; - -int btrfs_repair_one_sector(struct btrfs_inode *inode, struct btrfs_bio *failed_bbio, - u32 bio_offset, struct page *page, unsigned int pgoff, - bool submit_buffered); -void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end); -int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start, - struct page *page, unsigned int pg_offset); - #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS bool find_lock_delalloc_range(struct inode *inode, struct page *locked_page, u64 *start, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 5de73466b2ca..41c77a100853 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -380,32 +380,25 @@ static int search_file_offset_in_bio(struct bio *bio, struct inode *inode, /* * Lookup the checksum for the read bio in csum tree. * - * @inode: inode that the bio is for. - * @bio: bio to look up. - * @dst: Buffer of size nblocks * btrfs_super_csum_size() used to return - * checksum (nblocks = bio->bi_iter.bi_size / fs_info->sectorsize). If - * NULL, the checksum buffer is allocated and returned in - * btrfs_bio(bio)->csum instead. - * * Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise. */ -blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst) +blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct btrfs_bio *bbio = NULL; + struct btrfs_inode *inode = bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_io_tree *io_tree = &inode->io_tree; + struct bio *bio = &bbio->bio; struct btrfs_path *path; const u32 sectorsize = fs_info->sectorsize; const u32 csum_size = fs_info->csum_size; u32 orig_len = bio->bi_iter.bi_size; u64 orig_disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; u64 cur_disk_bytenr; - u8 *csum; const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits; int count = 0; blk_status_t ret = BLK_STS_OK; - if ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) || + if ((inode->flags & BTRFS_INODE_NODATASUM) || test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) return BLK_STS_OK; @@ -426,21 +419,14 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst if (!path) return BLK_STS_RESOURCE; - if (!dst) { - bbio = btrfs_bio(bio); - - if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { - bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS); - if (!bbio->csum) { - btrfs_free_path(path); - return BLK_STS_RESOURCE; - } - } else { - bbio->csum = bbio->csum_inline; + if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { + bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS); + if (!bbio->csum) { + btrfs_free_path(path); + return BLK_STS_RESOURCE; } - csum = bbio->csum; } else { - csum = dst; + bbio->csum = bbio->csum_inline; } /* @@ -456,7 +442,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst * read from the commit root and sidestep a nasty deadlock * between reading the free space cache and updating the csum tree. 
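btrfs_lookup_bio_sums() above now always fills bbio->csum, using the small inline array when the checksums fit and kmalloc_array() otherwise. A standalone sketch of that sizing decision; the 4-byte crc32c checksum, 4 KiB sector size and 64-byte inline array are assumptions chosen for illustration:

#include <stdio.h>

#define SECTORSIZE_BITS              12   /* 4 KiB sectors (assumption) */
#define CSUM_SIZE                     4   /* crc32c (assumption) */
#define BTRFS_BIO_INLINE_CSUM_SIZE   64   /* assumed inline array size */

int main(void)
{
	unsigned int bio_bytes = 256 * 1024;                  /* read bio size */
	unsigned int nblocks = bio_bytes >> SECTORSIZE_BITS;  /* 64 sectors */
	unsigned int needed = nblocks * CSUM_SIZE;            /* 256 csum bytes */

	if (needed > BTRFS_BIO_INLINE_CSUM_SIZE)
		printf("%u csum bytes: allocate bbio->csum with kmalloc_array()\n", needed);
	else
		printf("%u csum bytes: point bbio->csum at bbio->csum_inline\n", needed);
	return 0;
}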
*/ - if (btrfs_is_free_space_inode(BTRFS_I(inode))) { + if (btrfs_is_free_space_inode(inode)) { path->search_commit_root = 1; path->skip_locking = 1; } @@ -479,14 +465,15 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst ASSERT(cur_disk_bytenr - orig_disk_bytenr < UINT_MAX); sector_offset = (cur_disk_bytenr - orig_disk_bytenr) >> fs_info->sectorsize_bits; - csum_dst = csum + sector_offset * csum_size; + csum_dst = bbio->csum + sector_offset * csum_size; count = search_csum_tree(fs_info, path, cur_disk_bytenr, search_len, csum_dst); if (count < 0) { ret = errno_to_blk_status(count); - if (bbio) - btrfs_bio_free_csum(bbio); + if (bbio->csum != bbio->csum_inline) + kfree(bbio->csum); + bbio->csum = NULL; break; } @@ -504,12 +491,13 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst memset(csum_dst, 0, csum_size); count = 1; - if (BTRFS_I(inode)->root->root_key.objectid == + if (inode->root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) { u64 file_offset; int ret; - ret = search_file_offset_in_bio(bio, inode, + ret = search_file_offset_in_bio(bio, + &inode->vfs_inode, cur_disk_bytenr, &file_offset); if (ret) set_extent_bits(io_tree, file_offset, @@ -784,23 +772,16 @@ fail: /* * Calculate checksums of the data contained inside a bio. - * - * @inode: Owner of the data inside the bio - * @bio: Contains the data to be checksummed - * @offset: If (u64)-1, @bio may contain discontiguous bio vecs, so the - * file offsets are determined from the page offsets in the bio. - * Otherwise, this is the starting file offset of the bio vecs in - * @bio, which must be contiguous. - * @one_ordered: If true, @bio only refers to one ordered extent. */ -blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, - u64 offset, bool one_ordered) +blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio) { + struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); + struct bio *bio = &bbio->bio; + u64 offset = bbio->file_offset; struct btrfs_ordered_sum *sums; struct btrfs_ordered_extent *ordered = NULL; - const bool use_page_offsets = (offset == (u64)-1); char *data; struct bvec_iter iter; struct bio_vec bvec; @@ -828,9 +809,6 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, shash->tfm = fs_info->csum_shash; bio_for_each_segment(bvec, bio, iter) { - if (use_page_offsets) - offset = page_offset(bvec.bv_page) + bvec.bv_offset; - if (!ordered) { ordered = btrfs_lookup_ordered_extent(inode, offset); /* @@ -852,7 +830,7 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, - 1); for (i = 0; i < blockcount; i++) { - if (!one_ordered && + if (!(bio->bi_opf & REQ_BTRFS_ONE_ORDERED) && !in_range(offset, ordered->file_offset, ordered->num_bytes)) { unsigned long bytes_left; diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index 031225668434..cd7f2ae515c0 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -38,7 +38,7 @@ static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len); -blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst); +blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio); int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 pos, u64 num_bytes); @@ -49,8 +49,10 @@ int 
btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums); -blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, - u64 offset, bool one_ordered); +blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio); +int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, + struct list_head *list, int search_commit, + bool nowait); int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit, bool nowait); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index af046d22300e..5cc5a1faaef5 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1017,7 +1017,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, unlock_page(pages[i]); put_page(pages[i]); } - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); return -EAGAIN; } diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index c667e878ef1a..4d155a48ec59 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1283,7 +1283,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) list_del(&free_space_root->dirty_list); btrfs_tree_lock(free_space_root->node); - btrfs_clean_tree_block(free_space_root->node); + btrfs_clear_buffer_dirty(trans, free_space_root->node); btrfs_tree_unlock(free_space_root->node); btrfs_free_tree_block(trans, btrfs_root_id(free_space_root), free_space_root->node, 0, 1); diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c index 5553e1f8afe8..31c1648bc0b4 100644 --- a/fs/btrfs/fs.c +++ b/fs/btrfs/fs.c @@ -24,6 +24,7 @@ void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, name, flag); } spin_unlock(&fs_info->super_lock); + set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags); } } @@ -46,6 +47,7 @@ void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, name, flag); } spin_unlock(&fs_info->super_lock); + set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags); } } @@ -68,6 +70,7 @@ void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, name, flag); } spin_unlock(&fs_info->super_lock); + set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags); } } @@ -90,5 +93,6 @@ void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, name, flag); } spin_unlock(&fs_info->super_lock); + set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags); } } diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 37b86acfcbcf..4c477eae6891 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -3,6 +3,7 @@ #ifndef BTRFS_FS_H #define BTRFS_FS_H +#include <linux/blkdev.h> #include <linux/fs.h> #include <linux/btrfs_tree.h> #include <linux/sizes.h> @@ -125,6 +126,12 @@ enum { */ BTRFS_FS_NO_OVERCOMMIT, + /* + * Indicate if we have some features changed, this is mostly for + * cleaner thread to update the sysfs interface. 
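The four helpers above now raise BTRFS_FS_FEATURE_CHANGED whenever a compat/incompat bit flips, so a background thread can refresh sysfs lazily. The consuming side is not part of the hunks shown here; a hedged sketch of what it might look like:

/* Sketch only: how a consumer (e.g. the cleaner thread) might react to the flag. */
static void maybe_refresh_feature_sysfs(struct btrfs_fs_info *fs_info)
{
	/*
	 * The setters publish changes with set_bit(); clearing the bit here
	 * folds any number of feature flips into a single sysfs refresh.
	 */
	if (test_and_clear_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags)) {
		/* refresh /sys/fs/btrfs/<fsid>/features here */
	}
}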
+ */ + BTRFS_FS_FEATURE_CHANGED, + #if BITS_PER_LONG == 32 /* Indicate if we have error/warn message printed on 32bit systems */ BTRFS_FS_32BIT_ERROR, @@ -742,8 +749,10 @@ struct btrfs_fs_info { */ u64 zone_size; - /* Max size to emit ZONE_APPEND write command */ + /* Constraints for ZONE_APPEND commands: */ + struct queue_limits limits; u64 max_zone_append_size; + struct mutex zoned_meta_io_lock; spinlock_t treelog_bg_lock; u64 treelog_bg; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index efee6d35af52..6c18dc9a1831 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -84,27 +84,12 @@ struct btrfs_dio_data { }; struct btrfs_dio_private { - struct btrfs_inode *inode; - - /* - * Since DIO can use anonymous page, we cannot use page_offset() to - * grab the file offset, thus need a dedicated member for file offset. - */ + /* Range of I/O */ u64 file_offset; - /* Used for bio::bi_size */ u32 bytes; - /* - * References to this structure. There is one reference per in-flight - * bio plus one while we're still setting up. - */ - refcount_t refs; - - /* Array of checksums */ - u8 *csums; - /* This must be last */ - struct bio bio; + struct btrfs_bio bbio; }; static struct bio_set btrfs_dio_bioset; @@ -228,7 +213,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, { unsigned long index = offset >> PAGE_SHIFT; unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; - u64 page_start, page_end; + u64 page_start = 0, page_end = 0; struct page *page; if (locked_page) { @@ -2536,19 +2521,6 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, } /* - * in order to insert checksums into the metadata in large chunks, - * we wait until bio submission time. All the pages in the bio are - * checksummed and sums are attached onto the ordered extent record. - * - * At IO completion time the cums attached on the ordered extent record - * are inserted into the btree - */ -blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio) -{ - return btrfs_csum_one_bio(inode, bio, (u64)-1, false); -} - -/* * Split an extent_map at [start, start + len] * * This function is intended to be used only for extract_ordered_extent(). 
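The reworked btrfs_dio_private above embeds a struct btrfs_bio (with the struct bio inside it) as its last member, so completion code can recover the private data from the bio with container_of(), and the bioset only needs the offset of the embedded bio as front pad (see the btrfs_init_cachep() hunk later in this diff). A small userspace demo of the pattern; the fake_* structures below are stand-ins, not the real definitions:

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* Stand-ins for the kernel structures; names here are made up. */
struct fake_bio  { int status; };
struct fake_bbio { struct fake_bio bio; unsigned long long file_offset; };
struct fake_dip  { unsigned long long file_offset; unsigned int bytes; struct fake_bbio bbio; };

int main(void)
{
	struct fake_dip dip = { .file_offset = 1 << 20, .bytes = 8192 };
	struct fake_bio *bio = &dip.bbio.bio;

	/* End-io only gets the bio; walk back out to the containing structures. */
	struct fake_bbio *bbio = container_of(bio, struct fake_bbio, bio);
	struct fake_dip *d = container_of(bbio, struct fake_dip, bbio);

	/* This offset is what a bioset would need as front pad for the dip. */
	printf("front pad: %zu bytes\n", offsetof(struct fake_dip, bbio.bio));
	printf("recovered range: %llu+%u\n", d->file_offset, d->bytes);
	return 0;
}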
@@ -2663,19 +2635,19 @@ out: return ret; } -static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, - struct bio *bio, loff_t file_offset) +blk_status_t btrfs_extract_ordered_extent(struct btrfs_bio *bbio) { + u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; + u64 len = bbio->bio.bi_iter.bi_size; + struct btrfs_inode *inode = bbio->inode; struct btrfs_ordered_extent *ordered; - u64 start = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT; u64 file_len; - u64 len = bio->bi_iter.bi_size; u64 end = start + len; u64 ordered_end; u64 pre, post; int ret = 0; - ordered = btrfs_lookup_ordered_extent(inode, file_offset); + ordered = btrfs_lookup_ordered_extent(inode, bbio->file_offset); if (WARN_ON_ONCE(!ordered)) return BLK_STS_IOERR; @@ -2715,7 +2687,7 @@ static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, ret = btrfs_split_ordered_extent(ordered, pre, post); if (ret) goto out; - ret = split_zoned_em(inode, file_offset, file_len, pre, post); + ret = split_zoned_em(inode, bbio->file_offset, file_len, pre, post); out: btrfs_put_ordered_extent(ordered); @@ -2723,75 +2695,6 @@ out: return errno_to_blk_status(ret); } -void btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - blk_status_t ret; - - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - ret = extract_ordered_extent(inode, bio, - page_offset(bio_first_bvec_all(bio)->bv_page)); - if (ret) { - btrfs_bio_end_io(btrfs_bio(bio), ret); - return; - } - } - - /* - * If we need to checksum, and the I/O is not issued by fsync and - * friends, that is ->sync_writers != 0, defer the submission to a - * workqueue to parallelize it. - * - * Csum items for reloc roots have already been cloned at this point, - * so they are handled as part of the no-checksum case. - */ - if (!(inode->flags & BTRFS_INODE_NODATASUM) && - !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && - !btrfs_is_data_reloc_root(inode->root)) { - if (!atomic_read(&inode->sync_writers) && - btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_DATA)) - return; - - ret = btrfs_csum_one_bio(inode, bio, (u64)-1, false); - if (ret) { - btrfs_bio_end_io(btrfs_bio(bio), ret); - return; - } - } - btrfs_submit_bio(fs_info, bio, mirror_num); -} - -void btrfs_submit_data_read_bio(struct btrfs_inode *inode, struct bio *bio, - int mirror_num, enum btrfs_compression_type compress_type) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - blk_status_t ret; - - if (compress_type != BTRFS_COMPRESS_NONE) { - /* - * btrfs_submit_compressed_read will handle completing the bio - * if there were any errors, so just return here. - */ - btrfs_submit_compressed_read(&inode->vfs_inode, bio, mirror_num); - return; - } - - /* Save the original iter for read repair */ - btrfs_bio(bio)->iter = bio->bi_iter; - - /* - * Lookup bio sums does extra checks around whether we need to csum or - * not, which is why we ignore skip_sum here. - */ - ret = btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL); - if (ret) { - btrfs_bio_end_io(btrfs_bio(bio), ret); - return; - } - - btrfs_submit_bio(fs_info, bio, mirror_num); -} - /* * given a list of ordered sums record them in the inode. This happens * at IO completion time based on sums calculated at bio submission time. 
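btrfs_extract_ordered_extent() above trims the ordered extent to the range the bio actually covers, splitting off a head (pre) and a tail (post). The exact computation of pre and post is elided from the hunk, so the following standalone sketch only illustrates the assumed idea: they are the uncovered head and tail of the ordered extent's disk range, with numbers chosen for illustration:

#include <stdio.h>

int main(void)
{
	/* Ordered extent as reserved on disk (illustrative numbers only). */
	unsigned long long ordered_disk_bytenr = 1ULL << 20;
	unsigned long long ordered_disk_num_bytes = 256 * 1024;
	unsigned long long ordered_end = ordered_disk_bytenr + ordered_disk_num_bytes;

	/* The bio that actually completed. */
	unsigned long long start = ordered_disk_bytenr + 64 * 1024;
	unsigned long long len = 128 * 1024;
	unsigned long long end = start + len;

	/* Head and tail of the ordered extent the bio did not cover. */
	unsigned long long pre = start - ordered_disk_bytenr;
	unsigned long long post = ordered_end - end;

	printf("split off pre=%llu and post=%llu bytes\n", pre, post);
	return 0;
}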
@@ -2969,7 +2872,7 @@ again: unlock_extent(&inode->io_tree, page_start, page_end, &cached_state); unlock_page(page); - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; } @@ -3259,15 +3162,13 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } - /* A valid bdev implies a write on a sequential zone */ - if (ordered_extent->bdev) { + /* A valid ->physical implies a write on a sequential zone. */ + if (ordered_extent->physical != (u64)-1) { btrfs_rewrite_logical_zoned(ordered_extent); btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, ordered_extent->disk_num_bytes); } - btrfs_free_io_failure_record(inode, start, end); - if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { truncated = true; logical_len = ordered_extent->truncated_len; @@ -3474,109 +3375,55 @@ static u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums, u64 of } /* - * check_data_csum - verify checksum of one sector of uncompressed data - * @inode: inode - * @bbio: btrfs_bio which contains the csum + * Verify the checksum of a single data sector. + * + * @bbio: btrfs_io_bio which contains the csum + * @dev: device the sector is on * @bio_offset: offset to the beginning of the bio (in bytes) - * @page: page where is the data to be verified - * @pgoff: offset inside the page + * @bv: bio_vec to check * - * The length of such check is always one sector size. + * Check if the checksum on a data block is valid. When a checksum mismatch is + * detected, report the error and fill the corrupted range with zero. * - * When csum mismatch is detected, we will also report the error and fill the - * corrupted range with zero. (Thus it needs the extra parameters) + * Return %true if the sector is ok or had no checksum to start with, else %false. */ -int btrfs_check_data_csum(struct btrfs_inode *inode, struct btrfs_bio *bbio, - u32 bio_offset, struct page *page, u32 pgoff) +bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, + u32 bio_offset, struct bio_vec *bv) { + struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; - u32 len = fs_info->sectorsize; + u64 file_offset = bbio->file_offset + bio_offset; + u64 end = file_offset + bv->bv_len - 1; u8 *csum_expected; u8 csum[BTRFS_CSUM_SIZE]; - ASSERT(pgoff + len <= PAGE_SIZE); + ASSERT(bv->bv_len == fs_info->sectorsize); - csum_expected = btrfs_csum_ptr(fs_info, bbio->csum, bio_offset); + if (!bbio->csum) + return true; - if (btrfs_check_sector_csum(fs_info, page, pgoff, csum, csum_expected)) + if (btrfs_is_data_reloc_root(inode->root) && + test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM, + 1, NULL)) { + /* Skip the range without csum for data reloc inode */ + clear_extent_bits(&inode->io_tree, file_offset, end, + EXTENT_NODATASUM); + return true; + } + + csum_expected = btrfs_csum_ptr(fs_info, bbio->csum, bio_offset); + if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum, + csum_expected)) goto zeroit; - return 0; + return true; zeroit: - btrfs_print_data_csum_error(inode, bbio->file_offset + bio_offset, - csum, csum_expected, bbio->mirror_num); - if (bbio->device) - btrfs_dev_stat_inc_and_print(bbio->device, - BTRFS_DEV_STAT_CORRUPTION_ERRS); - memzero_page(page, pgoff, len); - return -EIO; -} - -/* - * When reads are done, we need to check csums to verify the data is correct. - * if there's a match, we allow the bio to finish. 
If not, the code in - * extent_io.c will try to find good copies for us. - * - * @bio_offset: offset to the beginning of the bio (in bytes) - * @start: file offset of the range start - * @end: file offset of the range end (inclusive) - * - * Return a bitmap where bit set means a csum mismatch, and bit not set means - * csum match. - */ -unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, - u32 bio_offset, struct page *page, - u64 start, u64 end) -{ - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); - struct btrfs_root *root = inode->root; - struct btrfs_fs_info *fs_info = root->fs_info; - struct extent_io_tree *io_tree = &inode->io_tree; - const u32 sectorsize = root->fs_info->sectorsize; - u32 pg_off; - unsigned int result = 0; - - /* - * This only happens for NODATASUM or compressed read. - * Normally this should be covered by above check for compressed read - * or the next check for NODATASUM. Just do a quicker exit here. - */ - if (bbio->csum == NULL) - return 0; - - if (inode->flags & BTRFS_INODE_NODATASUM) - return 0; - - if (unlikely(test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))) - return 0; - - ASSERT(page_offset(page) <= start && - end <= page_offset(page) + PAGE_SIZE - 1); - for (pg_off = offset_in_page(start); - pg_off < offset_in_page(end); - pg_off += sectorsize, bio_offset += sectorsize) { - u64 file_offset = pg_off + page_offset(page); - int ret; - - if (btrfs_is_data_reloc_root(root) && - test_range_bit(io_tree, file_offset, - file_offset + sectorsize - 1, - EXTENT_NODATASUM, 1, NULL)) { - /* Skip the range without csum for data reloc inode */ - clear_extent_bits(io_tree, file_offset, - file_offset + sectorsize - 1, - EXTENT_NODATASUM); - continue; - } - ret = btrfs_check_data_csum(inode, bbio, bio_offset, page, pg_off); - if (ret < 0) { - const int nr_bit = (pg_off - offset_in_page(start)) >> - root->fs_info->sectorsize_bits; - - result |= (1U << nr_bit); - } - } - return result; + btrfs_print_data_csum_error(inode, file_offset, csum, csum_expected, + bbio->mirror_num); + if (dev) + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); + memzero_bvec(bv); + return false; } /* @@ -4987,7 +4834,7 @@ again: unlock_extent(io_tree, block_start, block_end, &cached_state); unlock_page(page); put_page(page); - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; } @@ -5466,8 +5313,6 @@ void btrfs_evict_inode(struct inode *inode) if (is_bad_inode(inode)) goto no_delete; - btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1); - if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) goto no_delete; @@ -7392,7 +7237,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, */ if (writing || test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); else ret = nowait ? 
-EAGAIN : -ENOTBLK; btrfs_put_ordered_extent(ordered); @@ -7833,10 +7678,6 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, iomap->offset = start; iomap->bdev = fs_info->fs_devices->latest_dev->bdev; iomap->length = len; - - if (write && btrfs_use_zone_append(BTRFS_I(inode), em->block_start)) - iomap->flags |= IOMAP_F_ZONE_APPEND; - free_extent_map(em); return 0; @@ -7888,267 +7729,47 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, return ret; } -static void btrfs_dio_private_put(struct btrfs_dio_private *dip) -{ - /* - * This implies a barrier so that stores to dio_bio->bi_status before - * this and loads of dio_bio->bi_status after this are fully ordered. - */ - if (!refcount_dec_and_test(&dip->refs)) - return; - - if (btrfs_op(&dip->bio) == BTRFS_MAP_WRITE) { - btrfs_mark_ordered_io_finished(dip->inode, NULL, - dip->file_offset, dip->bytes, - !dip->bio.bi_status); - } else { - unlock_extent(&dip->inode->io_tree, - dip->file_offset, - dip->file_offset + dip->bytes - 1, NULL); - } - - kfree(dip->csums); - bio_endio(&dip->bio); -} - -void btrfs_submit_dio_repair_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) -{ - struct btrfs_dio_private *dip = btrfs_bio(bio)->private; - - BUG_ON(bio_op(bio) == REQ_OP_WRITE); - - refcount_inc(&dip->refs); - btrfs_submit_bio(inode->root->fs_info, bio, mirror_num); -} - -static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, - struct btrfs_bio *bbio, - const bool uptodate) -{ - struct inode *inode = &dip->inode->vfs_inode; - struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; - const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); - blk_status_t err = BLK_STS_OK; - struct bvec_iter iter; - struct bio_vec bv; - u32 offset; - - btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) { - u64 start = bbio->file_offset + offset; - - if (uptodate && - (!csum || !btrfs_check_data_csum(BTRFS_I(inode), bbio, offset, - bv.bv_page, bv.bv_offset))) { - btrfs_clean_io_failure(BTRFS_I(inode), start, - bv.bv_page, bv.bv_offset); - } else { - int ret; - - ret = btrfs_repair_one_sector(BTRFS_I(inode), bbio, offset, - bv.bv_page, bv.bv_offset, false); - if (ret) - err = errno_to_blk_status(ret); - } - } - - return err; -} - -blk_status_t btrfs_submit_bio_start_direct_io(struct btrfs_inode *inode, - struct bio *bio, - u64 dio_file_offset) +static void btrfs_dio_end_io(struct btrfs_bio *bbio) { - return btrfs_csum_one_bio(inode, bio, dio_file_offset, false); -} - -static void btrfs_end_dio_bio(struct btrfs_bio *bbio) -{ - struct btrfs_dio_private *dip = bbio->private; + struct btrfs_dio_private *dip = + container_of(bbio, struct btrfs_dio_private, bbio); + struct btrfs_inode *inode = bbio->inode; struct bio *bio = &bbio->bio; - blk_status_t err = bio->bi_status; - - if (err) - btrfs_warn(dip->inode->root->fs_info, - "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d", - btrfs_ino(dip->inode), bio_op(bio), - bio->bi_opf, bio->bi_iter.bi_sector, - bio->bi_iter.bi_size, err); - - if (bio_op(bio) == REQ_OP_READ) - err = btrfs_check_read_dio_bio(dip, bbio, !err); - - if (err) - dip->bio.bi_status = err; - - btrfs_record_physical_zoned(&dip->inode->vfs_inode, bbio->file_offset, bio); - - bio_put(bio); - btrfs_dio_private_put(dip); -} -static void btrfs_submit_dio_bio(struct bio *bio, struct btrfs_inode *inode, - u64 file_offset, int async_submit) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_dio_private *dip = 
btrfs_bio(bio)->private; - blk_status_t ret; - - /* Save the original iter for read repair */ - if (btrfs_op(bio) == BTRFS_MAP_READ) - btrfs_bio(bio)->iter = bio->bi_iter; - - if (inode->flags & BTRFS_INODE_NODATASUM) - goto map; + if (bio->bi_status) { + btrfs_warn(inode->root->fs_info, + "direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d", + btrfs_ino(inode), bio->bi_opf, + dip->file_offset, dip->bytes, bio->bi_status); + } - if (btrfs_op(bio) == BTRFS_MAP_WRITE) { - /* Check btrfs_submit_data_write_bio() for async submit rules */ - if (async_submit && !atomic_read(&inode->sync_writers) && - btrfs_wq_submit_bio(inode, bio, 0, file_offset, - WQ_SUBMIT_DATA_DIO)) - return; + if (btrfs_op(bio) == BTRFS_MAP_WRITE) + btrfs_mark_ordered_io_finished(inode, NULL, dip->file_offset, + dip->bytes, !bio->bi_status); + else + unlock_extent(&inode->io_tree, dip->file_offset, + dip->file_offset + dip->bytes - 1, NULL); - /* - * If we aren't doing async submit, calculate the csum of the - * bio now. - */ - ret = btrfs_csum_one_bio(inode, bio, file_offset, false); - if (ret) { - btrfs_bio_end_io(btrfs_bio(bio), ret); - return; - } - } else { - btrfs_bio(bio)->csum = btrfs_csum_ptr(fs_info, dip->csums, - file_offset - dip->file_offset); - } -map: - btrfs_submit_bio(fs_info, bio, 0); + bbio->bio.bi_private = bbio->private; + iomap_dio_bio_end_io(bio); } -static void btrfs_submit_direct(const struct iomap_iter *iter, - struct bio *dio_bio, loff_t file_offset) +static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio, + loff_t file_offset) { + struct btrfs_bio *bbio = btrfs_bio(bio); struct btrfs_dio_private *dip = - container_of(dio_bio, struct btrfs_dio_private, bio); - struct inode *inode = iter->inode; - const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - const bool raid56 = (btrfs_data_alloc_profile(fs_info) & - BTRFS_BLOCK_GROUP_RAID56_MASK); - struct bio *bio; - u64 start_sector; - int async_submit = 0; - u64 submit_len; - u64 clone_offset = 0; - u64 clone_len; - u64 logical; - int ret; - blk_status_t status; - struct btrfs_io_geometry geom; + container_of(bbio, struct btrfs_dio_private, bbio); struct btrfs_dio_data *dio_data = iter->private; - struct extent_map *em = NULL; - - dip->inode = BTRFS_I(inode); - dip->file_offset = file_offset; - dip->bytes = dio_bio->bi_iter.bi_size; - refcount_set(&dip->refs, 1); - dip->csums = NULL; - - if (!write && !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { - unsigned int nr_sectors = - (dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits); - - /* - * Load the csums up front to reduce csum tree searches and - * contention when submitting bios. 
- */ - status = BLK_STS_RESOURCE; - dip->csums = kcalloc(nr_sectors, fs_info->csum_size, GFP_NOFS); - if (!dip->csums) - goto out_err; - - status = btrfs_lookup_bio_sums(inode, dio_bio, dip->csums); - if (status != BLK_STS_OK) - goto out_err; - } - - start_sector = dio_bio->bi_iter.bi_sector; - submit_len = dio_bio->bi_iter.bi_size; - - do { - logical = start_sector << 9; - em = btrfs_get_chunk_map(fs_info, logical, submit_len); - if (IS_ERR(em)) { - status = errno_to_blk_status(PTR_ERR(em)); - em = NULL; - goto out_err_em; - } - ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(dio_bio), - logical, &geom); - if (ret) { - status = errno_to_blk_status(ret); - goto out_err_em; - } - clone_len = min(submit_len, geom.len); - ASSERT(clone_len <= UINT_MAX); + btrfs_bio_init(bbio, BTRFS_I(iter->inode), btrfs_dio_end_io, bio->bi_private); + bbio->file_offset = file_offset; - /* - * This will never fail as it's passing GPF_NOFS and - * the allocation is backed by btrfs_bioset. - */ - bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len, - btrfs_end_dio_bio, dip); - btrfs_bio(bio)->file_offset = file_offset; - - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - status = extract_ordered_extent(BTRFS_I(inode), bio, - file_offset); - if (status) { - bio_put(bio); - goto out_err; - } - } - - ASSERT(submit_len >= clone_len); - submit_len -= clone_len; - - /* - * Increase the count before we submit the bio so we know - * the end IO handler won't happen before we increase the - * count. Otherwise, the dip might get freed before we're - * done setting it up. - * - * We transfer the initial reference to the last bio, so we - * don't need to increment the reference count for the last one. - */ - if (submit_len > 0) { - refcount_inc(&dip->refs); - /* - * If we are submitting more than one bio, submit them - * all asynchronously. The exception is RAID 5 or 6, as - * asynchronous checksums make it difficult to collect - * full stripe writes. 
- */ - if (!raid56) - async_submit = 1; - } - - btrfs_submit_dio_bio(bio, BTRFS_I(inode), file_offset, async_submit); - - dio_data->submitted += clone_len; - clone_offset += clone_len; - start_sector += clone_len >> 9; - file_offset += clone_len; - - free_extent_map(em); - } while (submit_len > 0); - return; + dip->file_offset = file_offset; + dip->bytes = bio->bi_iter.bi_size; -out_err_em: - free_extent_map(em); -out_err: - dio_bio->bi_status = status; - btrfs_dio_private_put(dip); + dio_data->submitted += bio->bi_iter.bi_size; + btrfs_submit_bio(bio, 0); } static const struct iomap_ops btrfs_dio_iomap_ops = { @@ -8157,7 +7778,7 @@ static const struct iomap_ops btrfs_dio_iomap_ops = { }; static const struct iomap_dio_ops btrfs_dio_ops = { - .submit_io = btrfs_submit_direct, + .submit_io = btrfs_dio_submit_io, .bio_set = &btrfs_dio_bioset, }; @@ -8552,7 +8173,7 @@ again: unlock_extent(io_tree, page_start, page_end, &cached_state); unlock_page(page); up_read(&BTRFS_I(inode)->i_mmap_lock); - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; } @@ -8850,7 +8471,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->last_log_commit = 0; spin_lock_init(&ei->lock); - spin_lock_init(&ei->io_failure_lock); ei->outstanding_extents = 0; if (sb->s_magic != BTRFS_TEST_MAGIC) btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv, @@ -8870,7 +8490,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->io_tree.inode = ei; extent_io_tree_init(fs_info, &ei->file_extent_tree, IO_TREE_INODE_FILE_EXTENT); - ei->io_failure_tree = RB_ROOT; atomic_set(&ei->sync_writers, 0); mutex_init(&ei->log_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); @@ -8994,7 +8613,7 @@ int __init btrfs_init_cachep(void) goto fail; if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE, - offsetof(struct btrfs_dio_private, bio), + offsetof(struct btrfs_dio_private, bbio.bio), BIOSET_NEED_BVECS)) goto fail; @@ -10289,65 +9908,13 @@ struct btrfs_encoded_read_private { wait_queue_head_t wait; atomic_t pending; blk_status_t status; - bool skip_csum; }; -static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode, - struct bio *bio, int mirror_num) -{ - struct btrfs_encoded_read_private *priv = btrfs_bio(bio)->private; - struct btrfs_fs_info *fs_info = inode->root->fs_info; - blk_status_t ret; - - if (!priv->skip_csum) { - ret = btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL); - if (ret) - return ret; - } - - atomic_inc(&priv->pending); - btrfs_submit_bio(fs_info, bio, mirror_num); - return BLK_STS_OK; -} - -static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio) -{ - const bool uptodate = (bbio->bio.bi_status == BLK_STS_OK); - struct btrfs_encoded_read_private *priv = bbio->private; - struct btrfs_inode *inode = priv->inode; - struct btrfs_fs_info *fs_info = inode->root->fs_info; - u32 sectorsize = fs_info->sectorsize; - struct bio_vec *bvec; - struct bvec_iter_all iter_all; - u32 bio_offset = 0; - - if (priv->skip_csum || !uptodate) - return bbio->bio.bi_status; - - bio_for_each_segment_all(bvec, &bbio->bio, iter_all) { - unsigned int i, nr_sectors, pgoff; - - nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len); - pgoff = bvec->bv_offset; - for (i = 0; i < nr_sectors; i++) { - ASSERT(pgoff < PAGE_SIZE); - if (btrfs_check_data_csum(inode, bbio, bio_offset, - bvec->bv_page, pgoff)) - return BLK_STS_IOERR; - bio_offset += sectorsize; - pgoff += sectorsize; - } - } - return BLK_STS_OK; -} - static void 
btrfs_encoded_read_endio(struct btrfs_bio *bbio) { struct btrfs_encoded_read_private *priv = bbio->private; - blk_status_t status; - status = btrfs_encoded_read_verify_csum(bbio); - if (status) { + if (bbio->bio.bi_status) { /* * The memory barrier implied by the atomic_dec_return() here * pairs with the memory barrier implied by the @@ -10356,11 +9923,10 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) * write is observed before the load of status in * btrfs_encoded_read_regular_fill_pages(). */ - WRITE_ONCE(priv->status, status); + WRITE_ONCE(priv->status, bbio->bio.bi_status); } if (!atomic_dec_return(&priv->pending)) wake_up(&priv->wait); - btrfs_bio_free_csum(bbio); bio_put(&bbio->bio); } @@ -10368,47 +9934,26 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 disk_io_size, struct page **pages) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_encoded_read_private priv = { .inode = inode, .file_offset = file_offset, .pending = ATOMIC_INIT(1), - .skip_csum = (inode->flags & BTRFS_INODE_NODATASUM), }; unsigned long i = 0; u64 cur = 0; - int ret; init_waitqueue_head(&priv.wait); - /* - * Submit bios for the extent, splitting due to bio or stripe limits as - * necessary. - */ + /* Submit bios for the extent, splitting due to bio limits as necessary. */ while (cur < disk_io_size) { - struct extent_map *em; - struct btrfs_io_geometry geom; struct bio *bio = NULL; - u64 remaining; + u64 remaining = disk_io_size - cur; - em = btrfs_get_chunk_map(fs_info, disk_bytenr + cur, - disk_io_size - cur); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - } else { - ret = btrfs_get_io_geometry(fs_info, em, BTRFS_MAP_READ, - disk_bytenr + cur, &geom); - free_extent_map(em); - } - if (ret) { - WRITE_ONCE(priv.status, errno_to_blk_status(ret)); - break; - } - remaining = min(geom.len, disk_io_size - cur); while (bio || remaining) { size_t bytes = min_t(u64, remaining, PAGE_SIZE); if (!bio) { bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, + inode, btrfs_encoded_read_endio, &priv); bio->bi_iter.bi_sector = @@ -10417,14 +9962,8 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, if (!bytes || bio_add_page(bio, pages[i], bytes, 0) < bytes) { - blk_status_t status; - - status = submit_encoded_read_bio(inode, bio, 0); - if (status) { - WRITE_ONCE(priv.status, status); - bio_put(bio); - goto out; - } + atomic_inc(&priv.pending); + btrfs_submit_bio(bio, 0); bio = NULL; continue; } @@ -10435,7 +9974,6 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, } } -out: if (atomic_dec_return(&priv.pending)) io_wait_event(priv.wait, !atomic_read(&priv.pending)); /* See btrfs_encoded_read_endio() for ordering. */ @@ -10995,9 +10533,8 @@ static int btrfs_add_swap_extent(struct swap_info_struct *sis, return 0; max_pages = sis->max - bsi->nr_pages; - first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT; - next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len, - PAGE_SIZE) >> PAGE_SHIFT; + first_ppage = PAGE_ALIGN(bsi->block_start) >> PAGE_SHIFT; + next_ppage = PAGE_ALIGN_DOWN(bsi->block_start + bsi->block_len) >> PAGE_SHIFT; if (first_ppage >= next_ppage) return 0; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 5ba1ff31713b..84626c8ad5bf 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -707,7 +707,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap, * exists). 
*/ btrfs_tree_lock(leaf); - btrfs_clean_tree_block(leaf); + btrfs_clear_buffer_dirty(trans, leaf); btrfs_tree_unlock(leaf); btrfs_free_tree_block(trans, objectid, leaf, 0, 1); free_extent_buffer(leaf); diff --git a/fs/btrfs/lru_cache.c b/fs/btrfs/lru_cache.c new file mode 100644 index 000000000000..0fe0ae54ac67 --- /dev/null +++ b/fs/btrfs/lru_cache.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/mm.h> +#include "lru_cache.h" +#include "messages.h" + +/* + * Initialize a cache object. + * + * @cache: The cache. + * @max_size: Maximum size (number of entries) for the cache. + * Use 0 for unlimited size, it's the user's responsability to + * trim the cache in that case. + */ +void btrfs_lru_cache_init(struct btrfs_lru_cache *cache, unsigned int max_size) +{ + INIT_LIST_HEAD(&cache->lru_list); + mt_init(&cache->entries); + cache->size = 0; + cache->max_size = max_size; +} + +static struct btrfs_lru_cache_entry *match_entry(struct list_head *head, u64 key, + u64 gen) +{ + struct btrfs_lru_cache_entry *entry; + + list_for_each_entry(entry, head, list) { + if (entry->key == key && entry->gen == gen) + return entry; + } + + return NULL; +} + +/* + * Lookup for an entry in the cache. + * + * @cache: The cache. + * @key: The key of the entry we are looking for. + * @gen: Generation associated to the key. + * + * Returns the entry associated with the key or NULL if none found. + */ +struct btrfs_lru_cache_entry *btrfs_lru_cache_lookup(struct btrfs_lru_cache *cache, + u64 key, u64 gen) +{ + struct list_head *head; + struct btrfs_lru_cache_entry *entry; + + head = mtree_load(&cache->entries, key); + if (!head) + return NULL; + + entry = match_entry(head, key, gen); + if (entry) + list_move_tail(&entry->lru_list, &cache->lru_list); + + return entry; +} + +/* + * Remove an entry from the cache. + * + * @cache: The cache to remove from. + * @entry: The entry to remove from the cache. + * + * Note: this also frees the memory used by the entry. + */ +void btrfs_lru_cache_remove(struct btrfs_lru_cache *cache, + struct btrfs_lru_cache_entry *entry) +{ + struct list_head *prev = entry->list.prev; + + ASSERT(cache->size > 0); + ASSERT(!mtree_empty(&cache->entries)); + + list_del(&entry->list); + list_del(&entry->lru_list); + + if (list_empty(prev)) { + struct list_head *head; + + /* + * If previous element in the list entry->list is now empty, it + * means it's a head entry not pointing to any cached entries, + * so remove it from the maple tree and free it. + */ + head = mtree_erase(&cache->entries, entry->key); + ASSERT(head == prev); + kfree(head); + } + + kfree(entry); + cache->size--; +} + +/* + * Store an entry in the cache. + * + * @cache: The cache. + * @entry: The entry to store. + * + * Returns 0 on success and < 0 on error. 
+ */ +int btrfs_lru_cache_store(struct btrfs_lru_cache *cache, + struct btrfs_lru_cache_entry *new_entry, + gfp_t gfp) +{ + const u64 key = new_entry->key; + struct list_head *head; + int ret; + + head = kmalloc(sizeof(*head), gfp); + if (!head) + return -ENOMEM; + + ret = mtree_insert(&cache->entries, key, head, gfp); + if (ret == 0) { + INIT_LIST_HEAD(head); + list_add_tail(&new_entry->list, head); + } else if (ret == -EEXIST) { + kfree(head); + head = mtree_load(&cache->entries, key); + ASSERT(head != NULL); + if (match_entry(head, key, new_entry->gen) != NULL) + return -EEXIST; + list_add_tail(&new_entry->list, head); + } else if (ret < 0) { + kfree(head); + return ret; + } + + if (cache->max_size > 0 && cache->size == cache->max_size) { + struct btrfs_lru_cache_entry *lru_entry; + + lru_entry = list_first_entry(&cache->lru_list, + struct btrfs_lru_cache_entry, + lru_list); + btrfs_lru_cache_remove(cache, lru_entry); + } + + list_add_tail(&new_entry->lru_list, &cache->lru_list); + cache->size++; + + return 0; +} + +/* + * Empty a cache. + * + * @cache: The cache to empty. + * + * Removes all entries from the cache. + */ +void btrfs_lru_cache_clear(struct btrfs_lru_cache *cache) +{ + struct btrfs_lru_cache_entry *entry; + struct btrfs_lru_cache_entry *tmp; + + list_for_each_entry_safe(entry, tmp, &cache->lru_list, lru_list) + btrfs_lru_cache_remove(cache, entry); + + ASSERT(cache->size == 0); + ASSERT(mtree_empty(&cache->entries)); +} diff --git a/fs/btrfs/lru_cache.h b/fs/btrfs/lru_cache.h new file mode 100644 index 000000000000..de3e18bce24a --- /dev/null +++ b/fs/btrfs/lru_cache.h @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_LRU_CACHE_H +#define BTRFS_LRU_CACHE_H + +#include <linux/maple_tree.h> +#include <linux/list.h> + +/* + * A cache entry. This is meant to be embedded in a structure of a user of + * this module. Similar to how struct list_head and struct rb_node are used. + * + * Note: it should be embedded as the first element in a struct (offset 0), and + * this module assumes it was allocated with kmalloc(), so it calls kfree() when + * it needs to free an entry. + */ +struct btrfs_lru_cache_entry { + struct list_head lru_list; + u64 key; + /* + * Optional generation associated to a key. Use 0 if not needed/used. + * Entries with the same key and different generations are stored in a + * linked list, so use this only for cases where there's a small number + * of different generations. + */ + u64 gen; + /* + * The maple tree uses unsigned long type for the keys, which is 32 bits + * on 32 bits systems, and 64 bits on 64 bits systems. So if we want to + * use something like inode numbers as keys, which are always a u64, we + * have to deal with this in a special way - we store the key in the + * entry itself, as a u64, and the values inserted into the maple tree + * are linked lists of entries - so in case we are on a 64 bits system, + * that list always has a single entry, while on 32 bits systems it + * may have more than one, with each entry having the same value for + * their lower 32 bits of the u64 key. + */ + struct list_head list; +}; + +struct btrfs_lru_cache { + struct list_head lru_list; + struct maple_tree entries; + /* Number of entries stored in the cache. */ + unsigned int size; + /* Maximum number of entries the cache can have. 
*/ + unsigned int max_size; +}; + +#define btrfs_lru_cache_for_each_entry_safe(cache, entry, tmp) \ + list_for_each_entry_safe_reverse((entry), (tmp), &(cache)->lru_list, lru_list) + +static inline unsigned int btrfs_lru_cache_size(const struct btrfs_lru_cache *cache) +{ + return cache->size; +} + +static inline bool btrfs_lru_cache_is_full(const struct btrfs_lru_cache *cache) +{ + return cache->size >= cache->max_size; +} + +static inline struct btrfs_lru_cache_entry *btrfs_lru_cache_lru_entry( + struct btrfs_lru_cache *cache) +{ + return list_first_entry_or_null(&cache->lru_list, + struct btrfs_lru_cache_entry, lru_list); +} + +void btrfs_lru_cache_init(struct btrfs_lru_cache *cache, unsigned int max_size); +struct btrfs_lru_cache_entry *btrfs_lru_cache_lookup(struct btrfs_lru_cache *cache, + u64 key, u64 gen); +int btrfs_lru_cache_store(struct btrfs_lru_cache *cache, + struct btrfs_lru_cache_entry *new_entry, + gfp_t gfp); +void btrfs_lru_cache_remove(struct btrfs_lru_cache *cache, + struct btrfs_lru_cache_entry *entry); +void btrfs_lru_cache_clear(struct btrfs_lru_cache *cache); + +#endif diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index d5e78cbc8fbc..71f6d8302d50 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -280,7 +280,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, } /* Check if we have reached page boundary */ - if (IS_ALIGNED(cur_in, PAGE_SIZE)) { + if (PAGE_ALIGNED(cur_in)) { put_page(page_in); page_in = NULL; } diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c index 625bbbbb2608..fde5aaa6e7c9 100644 --- a/fs/btrfs/messages.c +++ b/fs/btrfs/messages.c @@ -293,36 +293,6 @@ void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info) #endif /* - * We only mark the transaction aborted and then set the file system read-only. - * This will prevent new transactions from starting or trying to join this - * one. - * - * This means that error recovery at the call site is limited to freeing - * any local memory allocations and passing the error code up without - * further cleanup. The transaction should complete as it normally would - * in the call path but will return -EIO. - * - * We'll complete the cleanup in btrfs_end_transaction and - * btrfs_commit_transaction. - */ -__cold -void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, - const char *function, - unsigned int line, int errno, bool first_hit) -{ - struct btrfs_fs_info *fs_info = trans->fs_info; - - WRITE_ONCE(trans->aborted, errno); - WRITE_ONCE(trans->transaction->aborted, errno); - if (first_hit && errno == -ENOSPC) - btrfs_dump_space_info_for_trans_abort(fs_info); - /* Wake up anybody who may be waiting on this transaction */ - wake_up(&fs_info->transaction_wait); - wake_up(&fs_info->transaction_blocked_wait); - __btrfs_handle_fs_error(fs_info, function, line, errno, NULL); -} - -/* * __btrfs_panic decodes unexpected, fatal errors from the caller, issues an * alert, and either panics or BUGs, depending on mount options. */ diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index 190af1f698d9..8c516ee58ff9 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -6,7 +6,6 @@ #include <linux/types.h> struct btrfs_fs_info; -struct btrfs_trans_handle; static inline __printf(2, 3) __cold void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) 
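Stepping back to the lru_cache added above: a minimal sketch of how a caller might embed and use it, assuming kernel context (kmalloc/pr_info). The struct and helper names below are hypothetical; only the btrfs_lru_cache_* API comes from the new files.

struct my_cache_entry {
	struct btrfs_lru_cache_entry entry;	/* must be the first member */
	u64 value;
};

static int lru_cache_demo(void)
{
	struct btrfs_lru_cache cache;
	struct my_cache_entry *e;
	struct btrfs_lru_cache_entry *hit;
	int ret;

	btrfs_lru_cache_init(&cache, 64);	/* keep at most 64 entries */

	e = kmalloc(sizeof(*e), GFP_KERNEL);	/* the cache frees it with kfree() */
	if (!e)
		return -ENOMEM;
	e->entry.key = 257;			/* e.g. an inode number */
	e->entry.gen = 0;			/* generation not used here */
	e->value = 42;

	ret = btrfs_lru_cache_store(&cache, &e->entry, GFP_KERNEL);
	if (ret < 0) {				/* includes -EEXIST: entry was not stored */
		kfree(e);
		return ret;
	}

	hit = btrfs_lru_cache_lookup(&cache, 257, 0);
	if (hit)
		pr_info("cached value %llu\n", ((struct my_cache_entry *)hit)->value);

	btrfs_lru_cache_clear(&cache);		/* drops and frees every entry */
	return 0;
}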
@@ -178,39 +177,6 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function const char * __attribute_const__ btrfs_decode_error(int errno); -__cold -void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, - const char *function, - unsigned int line, int errno, bool first_hit); - -bool __cold abort_should_print_stack(int errno); - -/* - * Call btrfs_abort_transaction as early as possible when an error condition is - * detected, that way the exact stack trace is reported for some errors. - */ -#define btrfs_abort_transaction(trans, errno) \ -do { \ - bool first = false; \ - /* Report first abort since mount */ \ - if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ - &((trans)->fs_info->fs_state))) { \ - first = true; \ - if (WARN(abort_should_print_stack(errno), \ - KERN_ERR \ - "BTRFS: Transaction aborted (error %d)\n", \ - (errno))) { \ - /* Stack trace printed. */ \ - } else { \ - btrfs_err((trans)->fs_info, \ - "Transaction aborted (error %d)", \ - (errno)); \ - } \ - } \ - __btrfs_abort_transaction((trans), __func__, \ - __LINE__, (errno), first); \ -} while (0) - #define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ (errno), fmt, ##args) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 57d8c72737e1..6c24b69e2d0a 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -616,7 +616,7 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work) struct btrfs_ordered_extent *ordered; ordered = container_of(work, struct btrfs_ordered_extent, flush_work); - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); complete(&ordered->completion); } @@ -716,13 +716,12 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, } /* - * Used to start IO or wait for a given ordered extent to finish. + * Start IO and wait for a given ordered extent to finish. * - * If wait is one, this effectively waits on page writeback for all the pages - * in the extent, and it waits on the io completion code to insert - * metadata into the btree corresponding to the extent + * Wait on page writeback for all the pages in the extent and the IO completion + * code to insert metadata into the btree corresponding to the extent. 
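As the rewritten comment above says, btrfs_start_ordered_extent() now always waits, which is why the wait argument disappears from every caller in this series. A hedged sketch of the lock/lookup/wait/retry pattern those callers follow, written as a hypothetical helper (the lock_extent()/unlock_extent() signatures are assumed from the surrounding hunks):

static void wait_out_ordered_range(struct btrfs_inode *inode, u64 start, u64 end)
{
	struct btrfs_ordered_extent *ordered;

again:
	lock_extent(&inode->io_tree, start, end, NULL);
	ordered = btrfs_lookup_ordered_extent(inode, start);
	if (ordered) {
		unlock_extent(&inode->io_tree, start, end, NULL);
		/* Always waits for BTRFS_ORDERED_COMPLETE now. */
		btrfs_start_ordered_extent(ordered);
		btrfs_put_ordered_extent(ordered);
		goto again;
	}
	unlock_extent(&inode->io_tree, start, end, NULL);
}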
*/ -void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait) +void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry) { u64 start = entry->file_offset; u64 end = start + entry->num_bytes - 1; @@ -744,12 +743,10 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait) */ if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end); - if (wait) { - if (!freespace_inode) - btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent); - wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, - &entry->flags)); - } + + if (!freespace_inode) + btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent); + wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags)); } /* @@ -800,7 +797,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) btrfs_put_ordered_extent(ordered); break; } - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); end = ordered->file_offset; /* * If the ordered extent had an error save the error but don't @@ -1061,7 +1058,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, break; } unlock_extent(&inode->io_tree, start, end, cachedp); - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); } } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 89f82b78f590..eb40cb39f842 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -157,7 +157,6 @@ struct btrfs_ordered_extent { * command in a workqueue context */ u64 physical; - struct block_device *bdev; }; static inline void @@ -187,7 +186,7 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, u64 file_offset); -void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait); +void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry); int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); struct btrfs_ordered_extent * btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index af97413abcf4..52a7d2fa2284 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1304,7 +1304,7 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) list_del("a_root->dirty_list); btrfs_tree_lock(quota_root->node); - btrfs_clean_tree_block(quota_root->node); + btrfs_clear_buffer_dirty(trans, quota_root->node); btrfs_tree_unlock(quota_root->node); btrfs_free_tree_block(trans, btrfs_root_id(quota_root), quota_root->node, 0, 1); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index ff4b1d583788..642828c1b299 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -998,7 +998,7 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) } /* - * Return the total numer of errors found in the vertical stripe of @sector_nr. + * Return the total number of errors found in the vertical stripe of @sector_nr. * * @faila and @failb will also be updated to the first and second stripe * number of the errors. @@ -1183,7 +1183,15 @@ not_found: trace_info->stripe_nr = -1; } -/* Generate PQ for one veritical stripe. */ +static inline void bio_list_put(struct bio_list *bio_list) +{ + struct bio *bio; + + while ((bio = bio_list_pop(bio_list))) + bio_put(bio); +} + +/* Generate PQ for one vertical stripe. 
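bio_list_put(), added above, exists so that the raid56 error paths can drain a bio_list and drop every reference with a single call instead of repeating the open-coded pop/put loop in each cleanup branch. Below is a minimal user-space sketch of the same drain-and-release idea; the fake_bio* types and functions are invented stand-ins, not the block layer's struct bio / struct bio_list API.

/* bio_list_put_sketch.c - illustration only, not the block layer API */
#include <stdio.h>
#include <stdlib.h>

struct fake_bio {
        int sector;
        struct fake_bio *next;
};

struct fake_bio_list {                          /* stand-in for struct bio_list */
        struct fake_bio *head;
};

static struct fake_bio *fake_bio_list_pop(struct fake_bio_list *list)
{
        struct fake_bio *bio = list->head;

        if (bio)
                list->head = bio->next;
        return bio;
}

static void fake_bio_put(struct fake_bio *bio)
{
        free(bio);                              /* drop the last reference */
}

/* One call per error path instead of an open-coded pop/put loop. */
static void fake_bio_list_put(struct fake_bio_list *list)
{
        struct fake_bio *bio;

        while ((bio = fake_bio_list_pop(list)))
                fake_bio_put(bio);
}

int main(void)
{
        struct fake_bio_list list = { NULL };
        int i;

        for (i = 0; i < 3; i++) {
                struct fake_bio *bio = calloc(1, sizeof(*bio));

                if (!bio)
                        break;
                bio->sector = i;
                bio->next = list.head;          /* push to the front */
                list.head = bio;
        }
        fake_bio_list_put(&list);               /* error path: release everything */
        printf("list drained: %d\n", list.head == NULL);
        return 0;
}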
*/ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) { void **pointers = rbio->finish_pointers; @@ -1228,7 +1236,6 @@ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, struct bio_list *bio_list) { - struct bio *bio; /* The total sector number inside the full stripe. */ int total_sector_nr; int sectornr; @@ -1317,8 +1324,7 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, return 0; error: - while ((bio = bio_list_pop(bio_list))) - bio_put(bio); + bio_list_put(bio_list); return -EIO; } @@ -1357,7 +1363,7 @@ static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) } /* - * For subpage case, we can no longer set page Uptodate directly for + * For subpage case, we can no longer set page Up-to-date directly for * stripe_pages[], thus we need to locate the sector. */ static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio, @@ -1425,10 +1431,9 @@ static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bi int total_sector_nr = get_bio_sector_nr(rbio, bio); u32 bio_size = 0; struct bio_vec *bvec; - struct bvec_iter_all iter_all; int i; - bio_for_each_segment_all(bvec, bio, iter_all) + bio_for_each_bvec_all(bvec, bio, i) bio_size += bvec->bv_len; /* @@ -1498,7 +1503,7 @@ static void raid_wait_read_end_io(struct bio *bio) wake_up(&rbio->io_wait); } -static void submit_read_bios(struct btrfs_raid_bio *rbio, +static void submit_read_wait_bio_list(struct btrfs_raid_bio *rbio, struct bio_list *bio_list) { struct bio *bio; @@ -1515,41 +1520,8 @@ static void submit_read_bios(struct btrfs_raid_bio *rbio, } submit_bio(bio); } -} - -static int rmw_assemble_read_bios(struct btrfs_raid_bio *rbio, - struct bio_list *bio_list) -{ - struct bio *bio; - int total_sector_nr; - int ret = 0; - - ASSERT(bio_list_size(bio_list) == 0); - - /* - * Build a list of bios to read all sectors (including data and P/Q). - * - * This behaviro is to compensate the later csum verification and - * recovery. 
- */ - for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; - total_sector_nr++) { - struct sector_ptr *sector; - int stripe = total_sector_nr / rbio->stripe_nsectors; - int sectornr = total_sector_nr % rbio->stripe_nsectors; - - sector = rbio_stripe_sector(rbio, stripe, sectornr); - ret = rbio_add_io_sector(rbio, bio_list, sector, - stripe, sectornr, REQ_OP_READ); - if (ret) - goto cleanup; - } - return 0; -cleanup: - while ((bio = bio_list_pop(bio_list))) - bio_put(bio); - return ret; + wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); } static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio) @@ -1668,12 +1640,12 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) struct btrfs_raid_bio *rbio; struct btrfs_plug_cb *plug = NULL; struct blk_plug_cb *cb; - int ret = 0; rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) { - ret = PTR_ERR(rbio); - goto fail; + bio->bi_status = errno_to_blk_status(PTR_ERR(rbio)); + bio_endio(bio); + return; } rbio->operation = BTRFS_RBIO_WRITE; rbio_add_bio(rbio, bio); @@ -1682,31 +1654,24 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) * Don't plug on full rbios, just get them out the door * as quickly as we can */ - if (rbio_is_full(rbio)) - goto queue_rbio; - - cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug)); - if (cb) { - plug = container_of(cb, struct btrfs_plug_cb, cb); - if (!plug->info) { - plug->info = fs_info; - INIT_LIST_HEAD(&plug->rbio_list); + if (!rbio_is_full(rbio)) { + cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug)); + if (cb) { + plug = container_of(cb, struct btrfs_plug_cb, cb); + if (!plug->info) { + plug->info = fs_info; + INIT_LIST_HEAD(&plug->rbio_list); + } + list_add_tail(&rbio->plug_list, &plug->rbio_list); + return; } - list_add_tail(&rbio->plug_list, &plug->rbio_list); - return; } -queue_rbio: + /* * Either we don't have any existing plug, or we're doing a full stripe, - * can queue the rmw work now. + * queue the rmw work now. */ start_async_work(rbio, rmw_rbio_work); - - return; - -fail: - bio->bi_status = errno_to_blk_status(ret); - bio_endio(bio); } static int verify_one_sector(struct btrfs_raid_bio *rbio, @@ -1773,7 +1738,7 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila, &failb); /* - * No errors in the veritical stripe, skip it. Can happen for recovery + * No errors in the vertical stripe, skip it. Can happen for recovery * which only part of a stripe failed csum check. */ if (!found_errors) @@ -1949,14 +1914,25 @@ out: return ret; } -static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio, - struct bio_list *bio_list) +static void recover_rbio(struct btrfs_raid_bio *rbio) { - struct bio *bio; + struct bio_list bio_list = BIO_EMPTY_LIST; int total_sector_nr; int ret = 0; - ASSERT(bio_list_size(bio_list) == 0); + /* + * Either we're doing recover for a read failure or degraded write, + * caller should have set error bitmap correctly. + */ + ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors)); + + /* For recovery, we need to read all sectors including P/Q. */ + ret = alloc_rbio_pages(rbio); + if (ret < 0) + goto out; + + index_rbio_pages(rbio); + /* * Read everything that hasn't failed. However this time we will * not trust any cached sector. 
@@ -1987,78 +1963,32 @@ static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio, } sector = rbio_stripe_sector(rbio, stripe, sectornr); - ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, + ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, sectornr, REQ_OP_READ); - if (ret < 0) - goto error; + if (ret < 0) { + bio_list_put(&bio_list); + goto out; + } } - return 0; -error: - while ((bio = bio_list_pop(bio_list))) - bio_put(bio); - - return -EIO; -} - -static int recover_rbio(struct btrfs_raid_bio *rbio) -{ - struct bio_list bio_list; - struct bio *bio; - int ret; - - /* - * Either we're doing recover for a read failure or degraded write, - * caller should have set error bitmap correctly. - */ - ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors)); - bio_list_init(&bio_list); - - /* For recovery, we need to read all sectors including P/Q. */ - ret = alloc_rbio_pages(rbio); - if (ret < 0) - goto out; - - index_rbio_pages(rbio); - - ret = recover_assemble_read_bios(rbio, &bio_list); - if (ret < 0) - goto out; - - submit_read_bios(rbio, &bio_list); - wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + submit_read_wait_bio_list(rbio, &bio_list); ret = recover_sectors(rbio); - out: - while ((bio = bio_list_pop(&bio_list))) - bio_put(bio); - - return ret; + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); } static void recover_rbio_work(struct work_struct *work) { struct btrfs_raid_bio *rbio; - int ret; rbio = container_of(work, struct btrfs_raid_bio, work); - - ret = lock_stripe_add(rbio); - if (ret == 0) { - ret = recover_rbio(rbio); - rbio_orig_end_io(rbio, errno_to_blk_status(ret)); - } + if (!lock_stripe_add(rbio)) + recover_rbio(rbio); } static void recover_rbio_work_locked(struct work_struct *work) { - struct btrfs_raid_bio *rbio; - int ret; - - rbio = container_of(work, struct btrfs_raid_bio, work); - - ret = recover_rbio(rbio); - rbio_orig_end_io(rbio, errno_to_blk_status(ret)); + recover_rbio(container_of(work, struct btrfs_raid_bio, work)); } static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num) @@ -2204,11 +2134,9 @@ no_csum: static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio) { - struct bio_list bio_list; - struct bio *bio; - int ret; - - bio_list_init(&bio_list); + struct bio_list bio_list = BIO_EMPTY_LIST; + int total_sector_nr; + int ret = 0; /* * Fill the data csums we need for data verification. We need to fill @@ -2217,24 +2145,32 @@ static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio) */ fill_data_csums(rbio); - ret = rmw_assemble_read_bios(rbio, &bio_list); - if (ret < 0) - goto out; + /* + * Build a list of bios to read all sectors (including data and P/Q). + * + * This behavior is to compensate the later csum verification and recovery. + */ + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + struct sector_ptr *sector; + int stripe = total_sector_nr / rbio->stripe_nsectors; + int sectornr = total_sector_nr % rbio->stripe_nsectors; - submit_read_bios(rbio, &bio_list); - wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + sector = rbio_stripe_sector(rbio, stripe, sectornr); + ret = rbio_add_io_sector(rbio, &bio_list, sector, + stripe, sectornr, REQ_OP_READ); + if (ret) { + bio_list_put(&bio_list); + return ret; + } + } /* * We may or may not have any corrupted sectors (including missing dev * and csum mismatch), just let recover_sectors() to handle them all. 
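submit_read_wait_bio_list() above folds the old submit_read_bios() and the follow-up wait_event() into one helper: every submitted bio bumps rbio->stripes_pending, each completion drops it, and the caller sleeps until the counter reaches zero. The sketch below is a rough user-space analogue of that submit-then-wait accounting, with pthreads standing in for bios, end_io callbacks and wait_event(); fake_endio and the other names are invented for the sketch, this is not kernel code.

/* pending_io_sketch.c - illustration only, not kernel code */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t io_wait = PTHREAD_COND_INITIALIZER;
static int stripes_pending;

/* Completion side: drop the pending count and wake the waiter at zero. */
static void *fake_endio(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&lock);
        if (--stripes_pending == 0)
                pthread_cond_signal(&io_wait);
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t workers[4];
        int i;

        /* "Submit" four reads: account for them before any completion can run. */
        stripes_pending = 4;
        for (i = 0; i < 4; i++)
                pthread_create(&workers[i], NULL, fake_endio, NULL);

        /* Wait until every completion has run, like waiting for stripes_pending == 0. */
        pthread_mutex_lock(&lock);
        while (stripes_pending > 0)
                pthread_cond_wait(&io_wait, &lock);
        pthread_mutex_unlock(&lock);
        printf("all reads completed\n");

        for (i = 0; i < 4; i++)
                pthread_join(workers[i], NULL);
        return 0;
}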
*/ - ret = recover_sectors(rbio); - return ret; -out: - while ((bio = bio_list_pop(&bio_list))) - bio_put(bio); - - return ret; + submit_read_wait_bio_list(rbio, &bio_list); + return recover_sectors(rbio); } static void raid_wait_write_end_io(struct bio *bio) @@ -2290,7 +2226,7 @@ static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio) return false; } -static int rmw_rbio(struct btrfs_raid_bio *rbio) +static void rmw_rbio(struct btrfs_raid_bio *rbio) { struct bio_list bio_list; int sectornr; @@ -2302,30 +2238,28 @@ static int rmw_rbio(struct btrfs_raid_bio *rbio) */ ret = alloc_rbio_parity_pages(rbio); if (ret < 0) - return ret; + goto out; /* * Either full stripe write, or we have every data sector already * cached, can go to write path immediately. */ - if (rbio_is_full(rbio) || !need_read_stripe_sectors(rbio)) - goto write; - - /* - * Now we're doing sub-stripe write, also need all data stripes to do - * the full RMW. - */ - ret = alloc_rbio_data_pages(rbio); - if (ret < 0) - return ret; + if (!rbio_is_full(rbio) && need_read_stripe_sectors(rbio)) { + /* + * Now we're doing sub-stripe write, also need all data stripes + * to do the full RMW. + */ + ret = alloc_rbio_data_pages(rbio); + if (ret < 0) + goto out; - index_rbio_pages(rbio); + index_rbio_pages(rbio); - ret = rmw_read_wait_recover(rbio); - if (ret < 0) - return ret; + ret = rmw_read_wait_recover(rbio); + if (ret < 0) + goto out; + } -write: /* * At this stage we're not allowed to add any new bios to the * bio list any more, anyone else that wants to change this stripe @@ -2356,7 +2290,7 @@ write: bio_list_init(&bio_list); ret = rmw_assemble_write_bios(rbio, &bio_list); if (ret < 0) - return ret; + goto out; /* We should have at least one bio assembled. */ ASSERT(bio_list_size(&bio_list)); @@ -2373,32 +2307,22 @@ write: break; } } - return ret; +out: + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); } static void rmw_rbio_work(struct work_struct *work) { struct btrfs_raid_bio *rbio; - int ret; rbio = container_of(work, struct btrfs_raid_bio, work); - - ret = lock_stripe_add(rbio); - if (ret == 0) { - ret = rmw_rbio(rbio); - rbio_orig_end_io(rbio, errno_to_blk_status(ret)); - } + if (lock_stripe_add(rbio) == 0) + rmw_rbio(rbio); } static void rmw_rbio_work_locked(struct work_struct *work) { - struct btrfs_raid_bio *rbio; - int ret; - - rbio = container_of(work, struct btrfs_raid_bio, work); - - ret = rmw_rbio(rbio); - rbio_orig_end_io(rbio, errno_to_blk_status(ret)); + rmw_rbio(container_of(work, struct btrfs_raid_bio, work)); } /* @@ -2506,7 +2430,6 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) struct sector_ptr p_sector = { 0 }; struct sector_ptr q_sector = { 0 }; struct bio_list bio_list; - struct bio *bio; int is_replace = 0; int ret; @@ -2637,8 +2560,7 @@ submit_write: return 0; cleanup: - while ((bio = bio_list_pop(&bio_list))) - bio_put(bio); + bio_list_put(&bio_list); return ret; } @@ -2733,15 +2655,12 @@ out: return ret; } -static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio, - struct bio_list *bio_list) +static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio) { - struct bio *bio; + struct bio_list bio_list = BIO_EMPTY_LIST; int total_sector_nr; int ret = 0; - ASSERT(bio_list_size(bio_list) == 0); - /* Build a list of bios to read all the missing parts. 
*/ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { @@ -2770,45 +2689,38 @@ static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio, if (sector->uptodate) continue; - ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, + ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, sectornr, REQ_OP_READ); - if (ret) - goto error; + if (ret) { + bio_list_put(&bio_list); + return ret; + } } + + submit_read_wait_bio_list(rbio, &bio_list); return 0; -error: - while ((bio = bio_list_pop(bio_list))) - bio_put(bio); - return ret; } -static int scrub_rbio(struct btrfs_raid_bio *rbio) +static void scrub_rbio(struct btrfs_raid_bio *rbio) { bool need_check = false; - struct bio_list bio_list; int sector_nr; int ret; - struct bio *bio; - - bio_list_init(&bio_list); ret = alloc_rbio_essential_pages(rbio); if (ret) - goto cleanup; + goto out; bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); - ret = scrub_assemble_read_bios(rbio, &bio_list); + ret = scrub_assemble_read_bios(rbio); if (ret < 0) - goto cleanup; - - submit_read_bios(rbio, &bio_list); - wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + goto out; /* We may have some failures, recover the failed sectors first. */ ret = recover_scrub_rbio(rbio); if (ret < 0) - goto cleanup; + goto out; /* * We have every sector properly prepared. Can finish the scrub @@ -2825,23 +2737,13 @@ static int scrub_rbio(struct btrfs_raid_bio *rbio) break; } } - return ret; - -cleanup: - while ((bio = bio_list_pop(&bio_list))) - bio_put(bio); - - return ret; +out: + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); } static void scrub_rbio_work_locked(struct work_struct *work) { - struct btrfs_raid_bio *rbio; - int ret; - - rbio = container_of(work, struct btrfs_raid_bio, work); - ret = scrub_rbio(rbio); - rbio_orig_end_io(rbio, errno_to_blk_status(ret)); + scrub_rbio(container_of(work, struct btrfs_raid_bio, work)); } void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 7c73a443939e..df0e0abdeb1f 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -65,7 +65,7 @@ struct btrfs_raid_bio { /* Number of data stripes (no p/q) */ u8 nr_data; - /* Numer of all stripes (including P/Q) */ + /* Number of all stripes (including P/Q) */ u8 real_stripes; /* How many pages there are for each stripe */ @@ -132,7 +132,7 @@ struct btrfs_raid_bio { /* * Checksum buffer if the rbio is for data. The buffer should cover - * all data sectors (exlcuding P/Q sectors). + * all data sectors (excluding P/Q sectors). */ u8 *csum_buf; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 31ec4a7658ce..ef13a9d4e370 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2825,7 +2825,7 @@ static noinline_for_stack int prealloc_file_extent_cluster( * * Here we have to manually invalidate the range (i_size, PAGE_END + 1). 
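Several hunks in this series (lzo.c above, relocation.c and send.c below) replace open-coded IS_ALIGNED(x, PAGE_SIZE) / ALIGN(x, PAGE_SIZE) with the PAGE_ALIGNED() and PAGE_ALIGN() helpers. The sketch below paraphrases the power-of-two alignment arithmetic those macros rely on for a 4 KiB page; the definitions are functional equivalents written for illustration, not copied from the kernel headers.

/* page_align_sketch.c - paraphrased definitions, not copied from kernel headers */
#include <stdio.h>

#define PAGE_SIZE          4096UL
#define IS_ALIGNED(x, a)   (((x) & ((a) - 1)) == 0)            /* a must be a power of two */
#define ALIGN(x, a)        (((x) + (a) - 1) & ~((a) - 1))
#define PAGE_ALIGNED(x)    IS_ALIGNED((unsigned long)(x), PAGE_SIZE)
#define PAGE_ALIGN(x)      ALIGN((unsigned long)(x), PAGE_SIZE)

int main(void)
{
        unsigned long i_size = 6000;

        /* PAGE_ALIGNED(x) is shorthand for IS_ALIGNED(x, PAGE_SIZE), which is
         * exactly what the converted call sites used to spell out. */
        printf("aligned:  %d\n", (int)PAGE_ALIGNED(i_size));   /* 0 */
        printf("round up: %lu\n", PAGE_ALIGN(i_size));         /* 8192 */
        return 0;
}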
*/ - if (!IS_ALIGNED(i_size, PAGE_SIZE)) { + if (!PAGE_ALIGNED(i_size)) { struct address_space *mapping = inode->vfs_inode.i_mapping; struct btrfs_fs_info *fs_info = inode->root->fs_info; const u32 sectorsize = fs_info->sectorsize; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 52b346795f66..69c93ae333f6 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -229,7 +229,7 @@ struct full_stripe_lock { }; #ifndef CONFIG_64BIT -/* This structure is for archtectures whose (void *) is smaller than u64 */ +/* This structure is for architectures whose (void *) is smaller than u64 */ struct scrub_page_private { u64 logical; }; @@ -2053,20 +2053,33 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) * a) don't have an extent buffer and * b) the page is already kmapped */ - if (sblock->logical != btrfs_stack_header_bytenr(h)) + if (sblock->logical != btrfs_stack_header_bytenr(h)) { sblock->header_error = 1; - - if (sector->generation != btrfs_stack_header_generation(h)) { - sblock->header_error = 1; - sblock->generation_error = 1; + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad bytenr, has %llu want %llu", + sblock->logical, sblock->mirror_num, + btrfs_stack_header_bytenr(h), + sblock->logical); + goto out; } - if (!scrub_check_fsid(h->fsid, sector)) + if (!scrub_check_fsid(h->fsid, sector)) { sblock->header_error = 1; + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad fsid, has %pU want %pU", + sblock->logical, sblock->mirror_num, + h->fsid, sblock->dev->fs_devices->fsid); + goto out; + } - if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, - BTRFS_UUID_SIZE)) + if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, BTRFS_UUID_SIZE)) { sblock->header_error = 1; + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU", + sblock->logical, sblock->mirror_num, + h->chunk_tree_uuid, fs_info->chunk_tree_uuid); + goto out; + } shash->tfm = fs_info->csum_shash; crypto_shash_init(shash); @@ -2079,9 +2092,27 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) } crypto_shash_final(shash, calculated_csum); - if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size)) + if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size)) { sblock->checksum_error = 1; + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT, + sblock->logical, sblock->mirror_num, + CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum), + CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum)); + goto out; + } + + if (sector->generation != btrfs_stack_header_generation(h)) { + sblock->header_error = 1; + sblock->generation_error = 1; + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad generation, has %llu want %llu", + sblock->logical, sblock->mirror_num, + btrfs_stack_header_generation(h), + sector->generation); + } +out: return sblock->header_error || sblock->checksum_error; } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index d50182b6deec..e5c963bb873d 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -32,6 +32,7 @@ #include "file-item.h" #include "ioctl.h" #include "verity.h" +#include "lru_cache.h" /* * Maximum number of references an extent can have in order for us to attempt to @@ -80,23 +81,23 @@ struct clone_root { bool found_ref; }; -#define SEND_CTX_MAX_NAME_CACHE_SIZE 128 -#define SEND_CTX_NAME_CACHE_CLEAN_SIZE (SEND_CTX_MAX_NAME_CACHE_SIZE * 2) +#define SEND_MAX_NAME_CACHE_SIZE 256 /* - * Limit the root_ids array of struct backref_cache_entry to 
12 elements. - * This makes the size of a cache entry to be exactly 128 bytes on x86_64. + * Limit the root_ids array of struct backref_cache_entry to 17 elements. + * This makes the size of a cache entry to be exactly 192 bytes on x86_64, which + * can be satisfied from the kmalloc-192 slab, without wasting any space. * The most common case is to have a single root for cloning, which corresponds - * to the send root. Having the user specify more than 11 clone roots is not + * to the send root. Having the user specify more than 16 clone roots is not * common, and in such rare cases we simply don't use caching if the number of - * cloning roots that lead down to a leaf is more than 12. + * cloning roots that lead down to a leaf is more than 17. */ -#define SEND_MAX_BACKREF_CACHE_ROOTS 12 +#define SEND_MAX_BACKREF_CACHE_ROOTS 17 /* * Max number of entries in the cache. - * With SEND_MAX_BACKREF_CACHE_ROOTS as 12, the size in bytes, excluding - * maple tree's internal nodes, is 16K. + * With SEND_MAX_BACKREF_CACHE_ROOTS as 17, the size in bytes, excluding + * maple tree's internal nodes, is 24K. */ #define SEND_MAX_BACKREF_CACHE_SIZE 128 @@ -107,15 +108,31 @@ struct clone_root { * x86_64). */ struct backref_cache_entry { - /* List to link to the cache's lru list. */ - struct list_head list; - /* The key for this entry in the cache. */ - u64 key; + struct btrfs_lru_cache_entry entry; u64 root_ids[SEND_MAX_BACKREF_CACHE_ROOTS]; /* Number of valid elements in the root_ids array. */ int num_roots; }; +/* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */ +static_assert(offsetof(struct backref_cache_entry, entry) == 0); + +/* + * Max number of entries in the cache that stores directories that were already + * created. The cache uses raw struct btrfs_lru_cache_entry entries, so it uses + * at most 4096 bytes - sizeof(struct btrfs_lru_cache_entry) is 48 bytes, but + * the kmalloc-64 slab is used, so we get 4096 bytes (64 bytes * 64). + */ +#define SEND_MAX_DIR_CREATED_CACHE_SIZE 64 + +/* + * Max number of entries in the cache that stores directories that were already + * created. The cache uses raw struct btrfs_lru_cache_entry entries, so it uses + * at most 4096 bytes - sizeof(struct btrfs_lru_cache_entry) is 48 bytes, but + * the kmalloc-64 slab is used, so we get 4096 bytes (64 bytes * 64). + */ +#define SEND_MAX_DIR_UTIMES_CACHE_SIZE 64 + struct send_ctx { struct file *send_filp; loff_t send_off; @@ -174,9 +191,7 @@ struct send_ctx { struct list_head new_refs; struct list_head deleted_refs; - struct radix_tree_root name_cache; - struct list_head name_cache_list; - int name_cache_size; + struct btrfs_lru_cache name_cache; /* * The inode we are currently processing. It's not NULL only when we @@ -285,13 +300,11 @@ struct send_ctx { struct rb_root rbtree_new_refs; struct rb_root rbtree_deleted_refs; - struct { - u64 last_reloc_trans; - struct list_head lru_list; - struct maple_tree entries; - /* Number of entries stored in the cache. */ - int size; - } backref_cache; + struct btrfs_lru_cache backref_cache; + u64 backref_cache_last_reloc_trans; + + struct btrfs_lru_cache dir_created_cache; + struct btrfs_lru_cache dir_utimes_cache; }; struct pending_dir_move { @@ -321,21 +334,15 @@ struct orphan_dir_info { u64 ino; u64 gen; u64 last_dir_index_offset; + u64 dir_high_seq_ino; }; struct name_cache_entry { - struct list_head list; /* - * radix_tree has only 32bit entries but we need to handle 64bit inums. - * We use the lower 32bit of the 64bit inum to store it in the tree. 
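The sizing comment above can be checked with plain sizeof/offsetof arithmetic: 48 bytes of btrfs_lru_cache_entry (the figure quoted in the dir-created cache comment) plus 17 u64 root ids plus an int pads out to exactly 192 bytes on an LP64 target, i.e. a perfect fit for the kmalloc-192 slab. The stub below reproduces only the sizes; the internal field layout of btrfs_lru_cache_entry is an assumption made for the sketch, and the *_stub names are invented.

/* backref_entry_size_sketch.c - size arithmetic only; field layout is assumed */
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* Stand-in with the 48-byte footprint quoted for btrfs_lru_cache_entry above
 * (the real field layout may differ, only the total size matters here). */
struct lru_cache_entry_stub {
        void *lru_prev, *lru_next;              /* LRU list linkage */
        uint64_t key;
        uint64_t gen;
        void *idx_prev, *idx_next;              /* linkage used by the index */
};

#define MAX_BACKREF_CACHE_ROOTS 17

struct backref_cache_entry_stub {
        struct lru_cache_entry_stub entry;      /* must stay the first member */
        uint64_t root_ids[MAX_BACKREF_CACHE_ROOTS];
        int num_roots;
};

_Static_assert(offsetof(struct backref_cache_entry_stub, entry) == 0,
               "embedded cache entry must be at offset 0");

int main(void)
{
        /* 48 + 17 * 8 + 4, padded to 8-byte alignment, is 192 bytes on LP64. */
        printf("entry header: %zu bytes\n", sizeof(struct lru_cache_entry_stub));
        printf("cache entry:  %zu bytes\n", sizeof(struct backref_cache_entry_stub));
        return 0;
}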
If - * more then one inum would fall into the same entry, we use radix_list - * to store the additional entries. radix_list is also used to store - * entries where two entries have the same inum but different - * generations. + * The key in the entry is an inode number, and the generation matches + * the inode's generation. */ - struct list_head radix_list; - u64 ino; - u64 gen; + struct btrfs_lru_cache_entry entry; u64 parent_ino; u64 parent_gen; int ret; @@ -344,6 +351,9 @@ struct name_cache_entry { char name[]; }; +/* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */ +static_assert(offsetof(struct name_cache_entry, entry) == 0); + #define ADVANCE 1 #define ADVANCE_ONLY_NEXT -1 @@ -956,14 +966,12 @@ out: static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen) { int ret; - struct btrfs_inode_info info; + struct btrfs_inode_info info = { 0 }; - if (!gen) - return -EPERM; + ASSERT(gen); ret = get_inode_info(root, ino, &info); - if (!ret) - *gen = info.gen; + *gen = info.gen; return ret; } @@ -1388,19 +1396,6 @@ static int iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root_id, return 0; } -static void empty_backref_cache(struct send_ctx *sctx) -{ - struct backref_cache_entry *entry; - struct backref_cache_entry *tmp; - - list_for_each_entry_safe(entry, tmp, &sctx->backref_cache.lru_list, list) - kfree(entry); - - INIT_LIST_HEAD(&sctx->backref_cache.lru_list); - mtree_destroy(&sctx->backref_cache.entries); - sctx->backref_cache.size = 0; -} - static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, const u64 **root_ids_ret, int *root_count_ret) { @@ -1408,9 +1403,10 @@ static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, struct send_ctx *sctx = bctx->sctx; struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; const u64 key = leaf_bytenr >> fs_info->sectorsize_bits; + struct btrfs_lru_cache_entry *raw_entry; struct backref_cache_entry *entry; - if (sctx->backref_cache.size == 0) + if (btrfs_lru_cache_size(&sctx->backref_cache) == 0) return false; /* @@ -1424,18 +1420,18 @@ static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, * transaction handle or holding fs_info->commit_root_sem, so no need * to take any lock here. */ - if (fs_info->last_reloc_trans > sctx->backref_cache.last_reloc_trans) { - empty_backref_cache(sctx); + if (fs_info->last_reloc_trans > sctx->backref_cache_last_reloc_trans) { + btrfs_lru_cache_clear(&sctx->backref_cache); return false; } - entry = mtree_load(&sctx->backref_cache.entries, key); - if (!entry) + raw_entry = btrfs_lru_cache_lookup(&sctx->backref_cache, key, 0); + if (!raw_entry) return false; + entry = container_of(raw_entry, struct backref_cache_entry, entry); *root_ids_ret = entry->root_ids; *root_count_ret = entry->num_roots; - list_move_tail(&entry->list, &sctx->backref_cache.lru_list); return true; } @@ -1461,7 +1457,8 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, if (!new_entry) return; - new_entry->key = leaf_bytenr >> fs_info->sectorsize_bits; + new_entry->entry.key = leaf_bytenr >> fs_info->sectorsize_bits; + new_entry->entry.gen = 0; new_entry->num_roots = 0; ULIST_ITER_INIT(&uiter); while ((node = ulist_next(root_ids, &uiter)) != NULL) { @@ -1489,23 +1486,12 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, * none of the roots is part of the list of roots from which we are * allowed to clone. Cache the new entry as it's still useful to avoid * backref walking to determine which roots have a path to the leaf. 
+ * + * Also use GFP_NOFS because we're called while holding a transaction + * handle or while holding fs_info->commit_root_sem. */ - - if (sctx->backref_cache.size >= SEND_MAX_BACKREF_CACHE_SIZE) { - struct backref_cache_entry *lru_entry; - struct backref_cache_entry *mt_entry; - - lru_entry = list_first_entry(&sctx->backref_cache.lru_list, - struct backref_cache_entry, list); - mt_entry = mtree_erase(&sctx->backref_cache.entries, lru_entry->key); - ASSERT(mt_entry == lru_entry); - list_del(&mt_entry->list); - kfree(mt_entry); - sctx->backref_cache.size--; - } - - ret = mtree_insert(&sctx->backref_cache.entries, new_entry->key, - new_entry, GFP_NOFS); + ret = btrfs_lru_cache_store(&sctx->backref_cache, &new_entry->entry, + GFP_NOFS); ASSERT(ret == 0 || ret == -ENOMEM); if (ret) { /* Caching is optional, no worries. */ @@ -1513,17 +1499,13 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, return; } - list_add_tail(&new_entry->list, &sctx->backref_cache.lru_list); - /* * We are called from iterate_extent_inodes() while either holding a * transaction handle or holding fs_info->commit_root_sem, so no need * to take any lock here. */ - if (sctx->backref_cache.size == 0) - sctx->backref_cache.last_reloc_trans = fs_info->last_reloc_trans; - - sctx->backref_cache.size++; + if (btrfs_lru_cache_size(&sctx->backref_cache) == 1) + sctx->backref_cache_last_reloc_trans = fs_info->last_reloc_trans; } static int check_extent_item(u64 bytenr, const struct btrfs_extent_item *ei, @@ -1886,7 +1868,8 @@ enum inode_state { inode_state_did_delete, }; -static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) +static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen, + u64 *send_gen, u64 *parent_gen) { int ret; int left_ret; @@ -1900,6 +1883,8 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) goto out; left_ret = (info.nlink == 0) ? -ENOENT : ret; left_gen = info.gen; + if (send_gen) + *send_gen = ((left_ret == -ENOENT) ? 0 : info.gen); if (!sctx->parent_root) { right_ret = -ENOENT; @@ -1909,6 +1894,8 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) goto out; right_ret = (info.nlink == 0) ? -ENOENT : ret; right_gen = info.gen; + if (parent_gen) + *parent_gen = ((right_ret == -ENOENT) ? 0 : info.gen); } if (!left_ret && !right_ret) { @@ -1953,14 +1940,15 @@ out: return ret; } -static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen) +static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen, + u64 *send_gen, u64 *parent_gen) { int ret; if (ino == BTRFS_FIRST_FREE_OBJECTID) return 1; - ret = get_cur_inode_state(sctx, ino, gen); + ret = get_cur_inode_state(sctx, ino, gen, send_gen, parent_gen); if (ret < 0) goto out; @@ -2121,43 +2109,36 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, const char *name, int name_len, u64 *who_ino, u64 *who_gen, u64 *who_mode) { - int ret = 0; - u64 gen; + int ret; + u64 parent_root_dir_gen; u64 other_inode = 0; struct btrfs_inode_info info; if (!sctx->parent_root) - goto out; + return 0; - ret = is_inode_existent(sctx, dir, dir_gen); + ret = is_inode_existent(sctx, dir, dir_gen, NULL, &parent_root_dir_gen); if (ret <= 0) - goto out; + return 0; /* * If we have a parent root we need to verify that the parent dir was * not deleted and then re-created, if it was then we have no overwrite * and we can just unlink this entry. + * + * @parent_root_dir_gen was set to 0 if the inode does not exist in the + * parent root. 
*/ - if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID) { - ret = get_inode_gen(sctx->parent_root, dir, &gen); - if (ret < 0 && ret != -ENOENT) - goto out; - if (ret) { - ret = 0; - goto out; - } - if (gen != dir_gen) - goto out; - } + if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID && + parent_root_dir_gen != dir_gen) + return 0; ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len, &other_inode); - if (ret < 0 && ret != -ENOENT) - goto out; - if (ret) { - ret = 0; - goto out; - } + if (ret == -ENOENT) + return 0; + else if (ret < 0) + return ret; /* * Check if the overwritten ref was already processed. If yes, the ref @@ -2168,18 +2149,15 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, is_waiting_for_move(sctx, other_inode)) { ret = get_inode_info(sctx->parent_root, other_inode, &info); if (ret < 0) - goto out; + return ret; - ret = 1; *who_ino = other_inode; *who_gen = info.gen; *who_mode = info.mode; - } else { - ret = 0; + return 1; } -out: - return ret; + return 0; } /* @@ -2194,47 +2172,43 @@ static int did_overwrite_ref(struct send_ctx *sctx, u64 ino, u64 ino_gen, const char *name, int name_len) { - int ret = 0; - u64 gen; + int ret; u64 ow_inode; + u64 ow_gen = 0; + u64 send_root_dir_gen; if (!sctx->parent_root) - goto out; + return 0; - ret = is_inode_existent(sctx, dir, dir_gen); + ret = is_inode_existent(sctx, dir, dir_gen, &send_root_dir_gen, NULL); if (ret <= 0) - goto out; + return ret; - if (dir != BTRFS_FIRST_FREE_OBJECTID) { - ret = get_inode_gen(sctx->send_root, dir, &gen); - if (ret < 0 && ret != -ENOENT) - goto out; - if (ret) { - ret = 0; - goto out; - } - if (gen != dir_gen) - goto out; - } + /* + * @send_root_dir_gen was set to 0 if the inode does not exist in the + * send root. + */ + if (dir != BTRFS_FIRST_FREE_OBJECTID && send_root_dir_gen != dir_gen) + return 0; /* check if the ref was overwritten by another ref */ ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len, &ow_inode); - if (ret < 0 && ret != -ENOENT) - goto out; - if (ret) { + if (ret == -ENOENT) { /* was never and will never be overwritten */ - ret = 0; - goto out; + return 0; + } else if (ret < 0) { + return ret; } - ret = get_inode_gen(sctx->send_root, ow_inode, &gen); - if (ret < 0) - goto out; + if (ow_inode == ino) { + ret = get_inode_gen(sctx->send_root, ow_inode, &ow_gen); + if (ret < 0) + return ret; - if (ow_inode == ino && gen == ino_gen) { - ret = 0; - goto out; + /* It's the same inode, so no overwrite happened. */ + if (ow_gen == ino_gen) + return 0; } /* @@ -2243,15 +2217,20 @@ static int did_overwrite_ref(struct send_ctx *sctx, * inode 'ino' to be orphanized, therefore check if ow_inode matches * the current inode being processed. */ - if ((ow_inode < sctx->send_progress) || - (ino != sctx->cur_ino && ow_inode == sctx->cur_ino && - gen == sctx->cur_inode_gen)) - ret = 1; - else - ret = 0; + if (ow_inode < sctx->send_progress) + return 1; -out: - return ret; + if (ino != sctx->cur_ino && ow_inode == sctx->cur_ino) { + if (ow_gen == 0) { + ret = get_inode_gen(sctx->send_root, ow_inode, &ow_gen); + if (ret < 0) + return ret; + } + if (ow_gen == sctx->cur_inode_gen) + return 1; + } + + return 0; } /* @@ -2285,113 +2264,16 @@ out: return ret; } -/* - * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit, - * so we need to do some special handling in case we have clashes. This function - * takes care of this with the help of name_cache_entry::radix_list. - * In case of error, nce is kfreed. 
- */ -static int name_cache_insert(struct send_ctx *sctx, - struct name_cache_entry *nce) +static inline struct name_cache_entry *name_cache_search(struct send_ctx *sctx, + u64 ino, u64 gen) { - int ret = 0; - struct list_head *nce_head; - - nce_head = radix_tree_lookup(&sctx->name_cache, - (unsigned long)nce->ino); - if (!nce_head) { - nce_head = kmalloc(sizeof(*nce_head), GFP_KERNEL); - if (!nce_head) { - kfree(nce); - return -ENOMEM; - } - INIT_LIST_HEAD(nce_head); - - ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head); - if (ret < 0) { - kfree(nce_head); - kfree(nce); - return ret; - } - } - list_add_tail(&nce->radix_list, nce_head); - list_add_tail(&nce->list, &sctx->name_cache_list); - sctx->name_cache_size++; - - return ret; -} + struct btrfs_lru_cache_entry *entry; -static void name_cache_delete(struct send_ctx *sctx, - struct name_cache_entry *nce) -{ - struct list_head *nce_head; - - nce_head = radix_tree_lookup(&sctx->name_cache, - (unsigned long)nce->ino); - if (!nce_head) { - btrfs_err(sctx->send_root->fs_info, - "name_cache_delete lookup failed ino %llu cache size %d, leaking memory", - nce->ino, sctx->name_cache_size); - } - - list_del(&nce->radix_list); - list_del(&nce->list); - sctx->name_cache_size--; - - /* - * We may not get to the final release of nce_head if the lookup fails - */ - if (nce_head && list_empty(nce_head)) { - radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino); - kfree(nce_head); - } -} - -static struct name_cache_entry *name_cache_search(struct send_ctx *sctx, - u64 ino, u64 gen) -{ - struct list_head *nce_head; - struct name_cache_entry *cur; - - nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino); - if (!nce_head) + entry = btrfs_lru_cache_lookup(&sctx->name_cache, ino, gen); + if (!entry) return NULL; - list_for_each_entry(cur, nce_head, radix_list) { - if (cur->ino == ino && cur->gen == gen) - return cur; - } - return NULL; -} - -/* - * Remove some entries from the beginning of name_cache_list. - */ -static void name_cache_clean_unused(struct send_ctx *sctx) -{ - struct name_cache_entry *nce; - - if (sctx->name_cache_size < SEND_CTX_NAME_CACHE_CLEAN_SIZE) - return; - - while (sctx->name_cache_size > SEND_CTX_MAX_NAME_CACHE_SIZE) { - nce = list_entry(sctx->name_cache_list.next, - struct name_cache_entry, list); - name_cache_delete(sctx, nce); - kfree(nce); - } -} - -static void name_cache_free(struct send_ctx *sctx) -{ - struct name_cache_entry *nce; - - while (!list_empty(&sctx->name_cache_list)) { - nce = list_entry(sctx->name_cache_list.next, - struct name_cache_entry, list); - name_cache_delete(sctx, nce); - kfree(nce); - } + return container_of(entry, struct name_cache_entry, entry); } /* @@ -2410,7 +2292,7 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, { int ret; int nce_ret; - struct name_cache_entry *nce = NULL; + struct name_cache_entry *nce; /* * First check if we already did a call to this function with the same @@ -2420,17 +2302,9 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, nce = name_cache_search(sctx, ino, gen); if (nce) { if (ino < sctx->send_progress && nce->need_later_update) { - name_cache_delete(sctx, nce); - kfree(nce); + btrfs_lru_cache_remove(&sctx->name_cache, &nce->entry); nce = NULL; } else { - /* - * Removes the entry from the list and adds it back to - * the end. This marks the entry as recently used so - * that name_cache_clean_unused does not remove it. 
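The reworked name_cache_search() above relies on struct name_cache_entry embedding a struct btrfs_lru_cache_entry as its first member: the cache hands back a pointer to the embedded entry and the caller converts it to the outer structure with container_of(). That is presumably also why the static_assert(offsetof(...) == 0) lines were added, since the cache can then release the whole object through the embedded pointer. Below is a self-contained user-space illustration of that conversion; cache_entry, name_entry and the values used are stand-ins invented for the sketch.

/* container_of_sketch.c - illustration only */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct cache_entry {                            /* stand-in for btrfs_lru_cache_entry */
        unsigned long long key;
        unsigned long long gen;
};

struct name_entry {                             /* stand-in for struct name_cache_entry */
        struct cache_entry entry;               /* first member, offset 0 */
        int name_len;
        char name[32];
};

int main(void)
{
        struct name_entry n = {
                .entry = { .key = 257, .gen = 1 },
                .name_len = 3,
                .name = "foo",
        };
        struct cache_entry *raw = &n.entry;     /* what a cache lookup hands back */
        struct name_entry *nce = container_of(raw, struct name_entry, entry);

        /* With the entry at offset 0, freeing through the embedded pointer
         * also frees the whole object, hence the offsetof() assertions. */
        printf("%.*s (ino %llu gen %llu)\n", nce->name_len, nce->name,
               nce->entry.key, nce->entry.gen);
        return 0;
}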
- */ - list_move_tail(&nce->list, &sctx->name_cache_list); - *parent_ino = nce->parent_ino; *parent_gen = nce->parent_gen; ret = fs_path_add(dest, nce->name, nce->name_len); @@ -2446,7 +2320,7 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, * This should only happen for the parent dir that we determine in * record_new_ref_if_needed(). */ - ret = is_inode_existent(sctx, ino, gen); + ret = is_inode_existent(sctx, ino, gen, NULL, NULL); if (ret < 0) goto out; @@ -2497,8 +2371,8 @@ out_cache: goto out; } - nce->ino = ino; - nce->gen = gen; + nce->entry.key = ino; + nce->entry.gen = gen; nce->parent_ino = *parent_ino; nce->parent_gen = *parent_gen; nce->name_len = fs_path_len(dest); @@ -2510,10 +2384,11 @@ out_cache: else nce->need_later_update = 1; - nce_ret = name_cache_insert(sctx, nce); - if (nce_ret < 0) + nce_ret = btrfs_lru_cache_store(&sctx->name_cache, &nce->entry, GFP_KERNEL); + if (nce_ret < 0) { + kfree(nce); ret = nce_ret; - name_cache_clean_unused(sctx); + } out: return ret; @@ -2884,6 +2759,63 @@ out: } /* + * If the cache is full, we can't remove entries from it and do a call to + * send_utimes() for each respective inode, because we might be finishing + * processing an inode that is a directory and it just got renamed, and existing + * entries in the cache may refer to inodes that have the directory in their + * full path - in which case we would generate outdated paths (pre-rename) + * for the inodes that the cache entries point to. Instead of prunning the + * cache when inserting, do it after we finish processing each inode at + * finish_inode_if_needed(). + */ +static int cache_dir_utimes(struct send_ctx *sctx, u64 dir, u64 gen) +{ + struct btrfs_lru_cache_entry *entry; + int ret; + + entry = btrfs_lru_cache_lookup(&sctx->dir_utimes_cache, dir, gen); + if (entry != NULL) + return 0; + + /* Caching is optional, don't fail if we can't allocate memory. */ + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return send_utimes(sctx, dir, gen); + + entry->key = dir; + entry->gen = gen; + + ret = btrfs_lru_cache_store(&sctx->dir_utimes_cache, entry, GFP_KERNEL); + ASSERT(ret != -EEXIST); + if (ret) { + kfree(entry); + return send_utimes(sctx, dir, gen); + } + + return 0; +} + +static int trim_dir_utimes_cache(struct send_ctx *sctx) +{ + while (btrfs_lru_cache_size(&sctx->dir_utimes_cache) > + SEND_MAX_DIR_UTIMES_CACHE_SIZE) { + struct btrfs_lru_cache_entry *lru; + int ret; + + lru = btrfs_lru_cache_lru_entry(&sctx->dir_utimes_cache); + ASSERT(lru != NULL); + + ret = send_utimes(sctx, lru->key, lru->gen); + if (ret) + return ret; + + btrfs_lru_cache_remove(&sctx->dir_utimes_cache, lru); + } + + return 0; +} + +/* * Sends a BTRFS_SEND_C_MKXXX or SYMLINK command to user space. We don't have * a valid path yet because we did not process the refs yet. So, the inode * is created as orphan. @@ -2971,6 +2903,23 @@ out: return ret; } +static void cache_dir_created(struct send_ctx *sctx, u64 dir) +{ + struct btrfs_lru_cache_entry *entry; + int ret; + + /* Caching is optional, ignore any failures. */ + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return; + + entry->key = dir; + entry->gen = 0; + ret = btrfs_lru_cache_store(&sctx->dir_created_cache, entry, GFP_KERNEL); + if (ret < 0) + kfree(entry); +} + /* * We need some special handling for inodes that get processed before the parent * directory got created. See process_recorded_refs for details. 
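cache_dir_utimes() and trim_dir_utimes_cache() above implement a defer/trim/flush scheme: directory utimes commands are queued rather than sent immediately, the queue is shrunk back under a limit only after the current inode has been fully processed, and anything still queued is flushed at the end of the send stream (see the loop added to btrfs_ioctl_send() further below). The following much-simplified user-space sketch shows that flow with a flat array keyed only by inode number instead of the (ino, gen) LRU cache; the CAP/LIMIT values and the flush_dir_utimes_cache() name are invented for the sketch.

/* deferred_utimes_sketch.c - illustration only */
#include <stdio.h>
#include <string.h>

#define CAP   8                                 /* arbitrary sketch capacity */
#define LIMIT 4                                 /* stands in for SEND_MAX_DIR_UTIMES_CACHE_SIZE */

static unsigned long long pending[CAP];
static int npending;

static void send_utimes(unsigned long long dir)
{
        printf("utimes for dir %llu\n", dir);   /* the deferred command */
}

/* Queue the directory instead of emitting utimes immediately. */
static void cache_dir_utimes(unsigned long long dir)
{
        int i;

        for (i = 0; i < npending; i++)
                if (pending[i] == dir)
                        return;                 /* already queued */
        if (npending == CAP) {
                send_utimes(dir);               /* cannot defer, emit right away */
                return;
        }
        pending[npending++] = dir;
}

/* After finishing an inode, shrink the queue back under the limit,
 * oldest entries first. */
static void trim_dir_utimes_cache(void)
{
        while (npending > LIMIT) {
                send_utimes(pending[0]);
                memmove(&pending[0], &pending[1],
                        (size_t)(--npending) * sizeof(pending[0]));
        }
}

/* At the very end of the stream, emit whatever is still deferred. */
static void flush_dir_utimes_cache(void)
{
        int i;

        for (i = 0; i < npending; i++)
                send_utimes(pending[i]);
        npending = 0;
}

int main(void)
{
        unsigned long long dir;

        for (dir = 256; dir < 262; dir++)
                cache_dir_utimes(dir);
        trim_dir_utimes_cache();                /* emits dirs 256 and 257 */
        flush_dir_utimes_cache();               /* emits the remaining four */
        return 0;
}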
@@ -2986,6 +2935,9 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir) struct btrfs_key di_key; struct btrfs_dir_item *di; + if (btrfs_lru_cache_lookup(&sctx->dir_created_cache, dir, 0)) + return 1; + path = alloc_path_for_send(); if (!path) return -ENOMEM; @@ -3009,6 +2961,7 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir) if (di_key.type != BTRFS_ROOT_ITEM_KEY && di_key.objectid < sctx->send_progress) { ret = 1; + cache_dir_created(sctx, dir); break; } } @@ -3038,7 +2991,12 @@ static int send_create_inode_if_needed(struct send_ctx *sctx) return 0; } - return send_create_inode(sctx, sctx->cur_ino); + ret = send_create_inode(sctx, sctx->cur_ino); + + if (ret == 0 && S_ISDIR(sctx->cur_inode_mode)) + cache_dir_created(sctx, sctx->cur_ino); + + return ret; } struct recorded_ref { @@ -3166,6 +3124,7 @@ static struct orphan_dir_info *add_orphan_dir_info(struct send_ctx *sctx, odi->ino = dir_ino; odi->gen = dir_gen; odi->last_dir_index_offset = 0; + odi->dir_high_seq_ino = 0; rb_link_node(&odi->node, parent, p); rb_insert_color(&odi->node, &sctx->orphan_dirs); @@ -3215,8 +3174,7 @@ static void free_orphan_dir_info(struct send_ctx *sctx, * We check this by iterating all dir items and checking if the inode behind * the dir item was already processed. */ -static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, - u64 send_progress) +static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen) { int ret = 0; int iter_ret = 0; @@ -3227,6 +3185,8 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, struct btrfs_key loc; struct btrfs_dir_item *di; struct orphan_dir_info *odi = NULL; + u64 dir_high_seq_ino = 0; + u64 last_dir_index_offset = 0; /* * Don't try to rmdir the top/root subvolume dir. @@ -3234,17 +3194,62 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, if (dir == BTRFS_FIRST_FREE_OBJECTID) return 0; + odi = get_orphan_dir_info(sctx, dir, dir_gen); + if (odi && sctx->cur_ino < odi->dir_high_seq_ino) + return 0; + path = alloc_path_for_send(); if (!path) return -ENOMEM; + if (!odi) { + /* + * Find the inode number associated with the last dir index + * entry. This is very likely the inode with the highest number + * of all inodes that have an entry in the directory. We can + * then use it to avoid future calls to can_rmdir(), when + * processing inodes with a lower number, from having to search + * the parent root b+tree for dir index keys. + */ + key.objectid = dir; + key.type = BTRFS_DIR_INDEX_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + goto out; + } else if (ret > 0) { + /* Can't happen, the root is never empty. */ + ASSERT(path->slots[0] > 0); + if (WARN_ON(path->slots[0] == 0)) { + ret = -EUCLEAN; + goto out; + } + path->slots[0]--; + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != dir || key.type != BTRFS_DIR_INDEX_KEY) { + /* No index keys, dir can be removed. */ + ret = 1; + goto out; + } + + di = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_dir_item); + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc); + dir_high_seq_ino = loc.objectid; + if (sctx->cur_ino < dir_high_seq_ino) { + ret = 0; + goto out; + } + + btrfs_release_path(path); + } + key.objectid = dir; key.type = BTRFS_DIR_INDEX_KEY; - key.offset = 0; - - odi = get_orphan_dir_info(sctx, dir, dir_gen); - if (odi) - key.offset = odi->last_dir_index_offset; + key.offset = (odi ? 
odi->last_dir_index_offset : 0); btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { struct waiting_dir_move *dm; @@ -3257,29 +3262,18 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, struct btrfs_dir_item); btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc); + dir_high_seq_ino = max(dir_high_seq_ino, loc.objectid); + last_dir_index_offset = found_key.offset; + dm = get_waiting_dir_move(sctx, loc.objectid); if (dm) { - odi = add_orphan_dir_info(sctx, dir, dir_gen); - if (IS_ERR(odi)) { - ret = PTR_ERR(odi); - goto out; - } - odi->gen = dir_gen; - odi->last_dir_index_offset = found_key.offset; dm->rmdir_ino = dir; dm->rmdir_gen = dir_gen; ret = 0; goto out; } - if (loc.objectid > send_progress) { - odi = add_orphan_dir_info(sctx, dir, dir_gen); - if (IS_ERR(odi)) { - ret = PTR_ERR(odi); - goto out; - } - odi->gen = dir_gen; - odi->last_dir_index_offset = found_key.offset; + if (loc.objectid > sctx->cur_ino) { ret = 0; goto out; } @@ -3294,7 +3288,22 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, out: btrfs_free_path(path); - return ret; + + if (ret) + return ret; + + if (!odi) { + odi = add_orphan_dir_info(sctx, dir, dir_gen); + if (IS_ERR(odi)) + return PTR_ERR(odi); + + odi->gen = dir_gen; + } + + odi->last_dir_index_offset = last_dir_index_offset; + odi->dir_high_seq_ino = max(odi->dir_high_seq_ino, dir_high_seq_ino); + + return 0; } static int is_waiting_for_move(struct send_ctx *sctx, u64 ino) @@ -3579,7 +3588,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) } gen = odi->gen; - ret = can_rmdir(sctx, rmdir_ino, gen, sctx->cur_ino); + ret = can_rmdir(sctx, rmdir_ino, gen); if (ret < 0) goto out; if (!ret) @@ -3599,7 +3608,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) } finish: - ret = send_utimes(sctx, pm->ino, pm->gen); + ret = cache_dir_utimes(sctx, pm->ino, pm->gen); if (ret < 0) goto out; @@ -3619,7 +3628,7 @@ finish: if (ret < 0) goto out; - ret = send_utimes(sctx, cur->dir, cur->dir_gen); + ret = cache_dir_utimes(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; } @@ -4242,7 +4251,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * "testdir_2". */ list_for_each_entry(cur, &sctx->new_refs, list) { - ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); + ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL); if (ret < 0) goto out; if (ret == inode_state_will_create) @@ -4288,12 +4297,9 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * the source path when performing its rename * operation. */ - if (is_waiting_for_move(sctx, ow_inode)) { - wdm = get_waiting_dir_move(sctx, - ow_inode); - ASSERT(wdm); + wdm = get_waiting_dir_move(sctx, ow_inode); + if (wdm) wdm->orphanized = true; - } /* * Make sure we clear our orphanized inode's @@ -4306,10 +4312,9 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * and get instead the orphan name. */ nce = name_cache_search(sctx, ow_inode, ow_gen); - if (nce) { - name_cache_delete(sctx, nce); - kfree(nce); - } + if (nce) + btrfs_lru_cache_remove(&sctx->name_cache, + &nce->entry); /* * ow_inode might currently be an ancestor of @@ -4358,7 +4363,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * parent directory out of order. But we need to check if this * did already happen before due to other refs in the same dir. 
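The dir_high_seq_ino field introduced above acts as a high-water mark: once can_rmdir() has recorded the highest inode number that still has an entry in the directory, later calls made while processing lower-numbered inodes return early without searching the parent root again. The sketch below shows only that early-exit idea; the real code derives the watermark from the last dir index key rather than scanning every child, and all names and numbers here are invented.

/* rmdir_watermark_sketch.c - illustration only */
#include <stdio.h>

/* Inode numbers that still have a dir index entry in the directory. */
static const unsigned long long children[] = { 260, 261, 270 };
static const int nr_children = 3;

static unsigned long long dir_high_seq_ino;     /* cached high-water mark */

/* Expensive part: the kernel walks dir index keys in the parent root here. */
static int all_children_processed(unsigned long long progress)
{
        int i;

        for (i = 0; i < nr_children; i++)
                if (children[i] > progress)
                        return 0;
        return 1;
}

static int can_rmdir(unsigned long long cur_ino)
{
        int i;

        if (dir_high_seq_ino == 0)
                for (i = 0; i < nr_children; i++)
                        if (children[i] > dir_high_seq_ino)
                                dir_high_seq_ino = children[i];

        /* Inodes are processed in increasing order, so until the highest
         * inode referenced by this directory has been reached the answer is
         * always "no" and the expensive scan is skipped entirely. */
        if (cur_ino < dir_high_seq_ino)
                return 0;

        return all_children_processed(cur_ino);
}

int main(void)
{
        printf("after inode 261: %d\n", can_rmdir(261));    /* 0, cheap early exit */
        printf("after inode 270: %d\n", can_rmdir(270));    /* 1, full check runs */
        return 0;
}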
*/ - ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); + ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL); if (ret < 0) goto out; if (ret == inode_state_will_create) { @@ -4388,6 +4393,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) ret = send_create_inode(sctx, cur->dir); if (ret < 0) goto out; + cache_dir_created(sctx, cur->dir); } } @@ -4470,8 +4476,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * later, we do this check again and rmdir it then if possible. * See the use of check_dirs for more details. */ - ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen, - sctx->cur_ino); + ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen); if (ret < 0) goto out; if (ret) { @@ -4564,20 +4569,18 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) if (cur->dir > sctx->cur_ino) continue; - ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); + ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL); if (ret < 0) goto out; if (ret == inode_state_did_create || ret == inode_state_no_change) { - /* TODO delayed utimes */ - ret = send_utimes(sctx, cur->dir, cur->dir_gen); + ret = cache_dir_utimes(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; } else if (ret == inode_state_did_delete && cur->dir != last_dir_ino_rm) { - ret = can_rmdir(sctx, cur->dir, cur->dir_gen, - sctx->cur_ino); + ret = can_rmdir(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; if (ret) { @@ -5635,7 +5638,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, * boundary in the send buffer. This means that there may be a gap * between the beginning of the command and the file data. */ - data_offset = ALIGN(sctx->send_size, PAGE_SIZE); + data_offset = PAGE_ALIGN(sctx->send_size); if (data_offset > sctx->send_max_size || sctx->send_max_size - data_offset < disk_num_bytes) { ret = -EOVERFLOW; @@ -5759,7 +5762,7 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, sent += size; } - if (sctx->clean_page_cache && IS_ALIGNED(end, PAGE_SIZE)) { + if (sctx->clean_page_cache && PAGE_ALIGNED(end)) { /* * Always operate only on ranges that are a multiple of the page * size. This is not only to prevent zeroing parts of a page in @@ -6754,12 +6757,26 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) * it's moved/renamed, therefore we don't need to do it here. */ sctx->send_progress = sctx->cur_ino + 1; - ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); + + /* + * If the current inode is a non-empty directory, delay issuing + * the utimes command for it, as it's very likely we have inodes + * with an higher number inside it. We want to issue the utimes + * command only after adding all dentries to it. 
+ */ + if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_size > 0) + ret = cache_dir_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); + else + ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); + if (ret < 0) goto out; } out: + if (!ret) + ret = trim_dir_utimes_cache(sctx); + return ret; } @@ -8044,6 +8061,8 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) int clone_sources_to_rollback = 0; size_t alloc_size; int sort_clone_roots = 0; + struct btrfs_lru_cache_entry *entry; + struct btrfs_lru_cache_entry *tmp; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -8094,11 +8113,22 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) INIT_LIST_HEAD(&sctx->new_refs); INIT_LIST_HEAD(&sctx->deleted_refs); - INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL); - INIT_LIST_HEAD(&sctx->name_cache_list); - INIT_LIST_HEAD(&sctx->backref_cache.lru_list); - mt_init(&sctx->backref_cache.entries); + btrfs_lru_cache_init(&sctx->name_cache, SEND_MAX_NAME_CACHE_SIZE); + btrfs_lru_cache_init(&sctx->backref_cache, SEND_MAX_BACKREF_CACHE_SIZE); + btrfs_lru_cache_init(&sctx->dir_created_cache, + SEND_MAX_DIR_CREATED_CACHE_SIZE); + /* + * This cache is periodically trimmed to a fixed size elsewhere, see + * cache_dir_utimes() and trim_dir_utimes_cache(). + */ + btrfs_lru_cache_init(&sctx->dir_utimes_cache, 0); + + sctx->pending_dir_moves = RB_ROOT; + sctx->waiting_dir_moves = RB_ROOT; + sctx->orphan_dirs = RB_ROOT; + sctx->rbtree_new_refs = RB_ROOT; + sctx->rbtree_deleted_refs = RB_ROOT; sctx->flags = arg->flags; @@ -8165,12 +8195,6 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) goto out; } - sctx->pending_dir_moves = RB_ROOT; - sctx->waiting_dir_moves = RB_ROOT; - sctx->orphan_dirs = RB_ROOT; - sctx->rbtree_new_refs = RB_ROOT; - sctx->rbtree_deleted_refs = RB_ROOT; - sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots), arg->clone_sources_count + 1, GFP_KERNEL); @@ -8279,6 +8303,13 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) if (ret < 0) goto out; + btrfs_lru_cache_for_each_entry_safe(&sctx->dir_utimes_cache, entry, tmp) { + ret = send_utimes(sctx, entry->key, entry->gen); + if (ret < 0) + goto out; + btrfs_lru_cache_remove(&sctx->dir_utimes_cache, entry); + } + if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_END_CMD)) { ret = begin_cmd(sctx, BTRFS_SEND_C_END); if (ret < 0) @@ -8358,11 +8389,12 @@ out: kvfree(sctx->send_buf); kvfree(sctx->verity_descriptor); - name_cache_free(sctx); - close_current_inode(sctx); - empty_backref_cache(sctx); + btrfs_lru_cache_clear(&sctx->name_cache); + btrfs_lru_cache_clear(&sctx->backref_cache); + btrfs_lru_cache_clear(&sctx->dir_created_cache); + btrfs_lru_cache_clear(&sctx->dir_utimes_cache); kfree(sctx); } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 433ce221dc5c..581845bc206a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -58,6 +58,7 @@ #include "scrub.h" #include "verity.h" #include "super.h" +#include "extent-tree.h" #define CREATE_TRACE_POINTS #include <trace/events/btrfs.h> @@ -2049,7 +2050,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) } /* - * Metadata in mixed block goup profiles are accounted in data + * Metadata in mixed block group profiles are accounted in data */ if (!mixed && found->flags & BTRFS_BLOCK_GROUP_METADATA) { if (found->flags & BTRFS_BLOCK_GROUP_DATA) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 45615ce36498..8c5efa5813b3 100644 --- a/fs/btrfs/sysfs.c +++ 
b/fs/btrfs/sysfs.c @@ -702,7 +702,7 @@ static void release_raid_kobj(struct kobject *kobj) kfree(to_raid_kobj(kobj)); } -static struct kobj_type btrfs_raid_ktype = { +static const struct kobj_type btrfs_raid_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = release_raid_kobj, .default_groups = raid_groups, @@ -900,7 +900,7 @@ static void space_info_release(struct kobject *kobj) kfree(sinfo); } -static struct kobj_type space_info_ktype = { +static const struct kobj_type space_info_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = space_info_release, .default_groups = space_info_groups, @@ -1259,7 +1259,7 @@ static void btrfs_release_fsid_kobj(struct kobject *kobj) complete(&fs_devs->kobj_unregister); } -static struct kobj_type btrfs_ktype = { +static const struct kobj_type btrfs_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = btrfs_release_fsid_kobj, }; @@ -1789,7 +1789,7 @@ static void btrfs_release_devid_kobj(struct kobject *kobj) complete(&device->kobj_unregister); } -static struct kobj_type devid_ktype = { +static const struct kobj_type devid_ktype = { .sysfs_ops = &kobj_sysfs_ops, .default_groups = devid_groups, .release = btrfs_release_devid_kobj, @@ -2103,7 +2103,7 @@ static void qgroups_release(struct kobject *kobj) kfree(kobj); } -static struct kobj_type qgroups_ktype = { +static const struct kobj_type qgroups_ktype = { .sysfs_ops = &kobj_sysfs_ops, .default_groups = qgroups_groups, .release = qgroups_release, @@ -2173,7 +2173,7 @@ static void qgroup_release(struct kobject *kobj) memset(&qgroup->kobj, 0, sizeof(*kobj)); } -static struct kobj_type qgroup_ktype = { +static const struct kobj_type qgroup_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = qgroup_release, .default_groups = qgroup_groups, @@ -2272,36 +2272,23 @@ void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info, * Change per-fs features in /sys/fs/btrfs/UUID/features to match current * values in superblock. Call after any changes to incompat/compat_ro flags */ -void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info, - u64 bit, enum btrfs_feature_set set) +void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info) { - struct btrfs_fs_devices *fs_devs; struct kobject *fsid_kobj; - u64 __maybe_unused features; - int __maybe_unused ret; + int ret; if (!fs_info) return; - /* - * See 14e46e04958df74 and e410e34fad913dd, feature bit updates are not - * safe when called from some contexts (eg. balance) - */ - features = get_features(fs_info, set); - ASSERT(bit & supported_feature_masks[set]); - - fs_devs = fs_info->fs_devices; - fsid_kobj = &fs_devs->fsid_kobj; - + fsid_kobj = &fs_info->fs_devices->fsid_kobj; if (!fsid_kobj->state_initialized) return; - /* - * FIXME: this is too heavy to update just one value, ideally we'd like - * to use sysfs_update_group but some refactoring is needed first. 
- */ - sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group); - ret = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group); + ret = sysfs_update_group(fsid_kobj, &btrfs_feature_attr_group); + if (ret < 0) + btrfs_warn(fs_info, + "failed to update /sys/fs/btrfs/%pU/features: %d", + fs_info->fs_devices->fsid, ret); } int __init btrfs_init_sysfs(void) diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index bacef43f7267..86c7eef12873 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -19,8 +19,7 @@ void btrfs_sysfs_remove_device(struct btrfs_device *device); int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices); -void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info, - u64 bit, enum btrfs_feature_set set); +void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info); void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action); int __init btrfs_init_sysfs(void); diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index c5b3a631bf4f..f2f2e11dac4c 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -509,7 +509,7 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info, goto out_free; } - ret = btrfs_rmap_block(fs_info, em->start, NULL, btrfs_sb_offset(1), + ret = btrfs_rmap_block(fs_info, em->start, btrfs_sb_offset(1), &logical, &out_ndaddrs, &out_stripe_len); if (ret || (out_ndaddrs == 0 && test->expected_mapped_addr)) { test_err("didn't rmap anything but expected %d", diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index b8c52e89688c..18329ebcb1cb 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2464,6 +2464,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) wake_up(&fs_info->transaction_wait); btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED); + /* If we have features changed, wake up the cleaner to update sysfs. */ + if (test_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags) && + fs_info->cleaner_kthread) + wake_up_process(fs_info->cleaner_kthread); + ret = btrfs_write_and_wait_transaction(trans); if (ret) { btrfs_handle_fs_error(fs_info, ret, @@ -2604,6 +2609,35 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info) return (ret < 0) ? 0 : 1; } +/* + * We only mark the transaction aborted and then set the file system read-only. + * This will prevent new transactions from starting or trying to join this + * one. + * + * This means that error recovery at the call site is limited to freeing + * any local memory allocations and passing the error code up without + * further cleanup. The transaction should complete as it normally would + * in the call path but will return -EIO. + * + * We'll complete the cleanup in btrfs_end_transaction and + * btrfs_commit_transaction. 
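/*
 * A minimal sketch (not from the patch) of the caller pattern the comment
 * above describes: abort at the point of failure, release only what this
 * function allocated, and return the errno; btrfs_end_transaction() /
 * btrfs_commit_transaction() finish the cleanup. The helper
 * do_tree_modification() is purely illustrative.
 */
static int example_update_item(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = do_tree_modification(trans, root, path);	/* hypothetical */
	if (ret < 0)
		/* Abort as early as possible so the stack trace is useful. */
		btrfs_abort_transaction(trans, ret);

	btrfs_free_path(path);
	return ret;
}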
+ */ +void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans, + const char *function, + unsigned int line, int errno, bool first_hit) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + + WRITE_ONCE(trans->aborted, errno); + WRITE_ONCE(trans->transaction->aborted, errno); + if (first_hit && errno == -ENOSPC) + btrfs_dump_space_info_for_trans_abort(fs_info); + /* Wake up anybody who may be waiting on this transaction */ + wake_up(&fs_info->transaction_wait); + wake_up(&fs_info->transaction_blocked_wait); + __btrfs_handle_fs_error(fs_info, function, line, errno, NULL); +} + int __init btrfs_transaction_init(void) { btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 97f6c39f59c8..fa728ab80826 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -202,6 +202,34 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans) delayed_refs->qgroup_to_skip = 0; } +bool __cold abort_should_print_stack(int errno); + +/* + * Call btrfs_abort_transaction as early as possible when an error condition is + * detected, that way the exact stack trace is reported for some errors. + */ +#define btrfs_abort_transaction(trans, errno) \ +do { \ + bool first = false; \ + /* Report first abort since mount */ \ + if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ + &((trans)->fs_info->fs_state))) { \ + first = true; \ + if (WARN(abort_should_print_stack(errno), \ + KERN_ERR \ + "BTRFS: Transaction aborted (error %d)\n", \ + (errno))) { \ + /* Stack trace printed. */ \ + } else { \ + btrfs_debug((trans)->fs_info, \ + "Transaction aborted (error %d)", \ + (errno)); \ + } \ + } \ + __btrfs_abort_transaction((trans), __func__, \ + __LINE__, (errno), first); \ +} while (0) + int btrfs_end_transaction(struct btrfs_trans_handle *trans); struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, unsigned int num_items); @@ -236,6 +264,9 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction); void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); +void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans, + const char *function, + unsigned int line, int errno, bool first_hit); int __init btrfs_transaction_init(void); void __cold btrfs_transaction_exit(void); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 58599189bd18..200cea6e49e5 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -279,12 +279,6 @@ void btrfs_end_log_trans(struct btrfs_root *root) } } -static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf) -{ - filemap_fdatawait_range(buf->pages[0]->mapping, - buf->start, buf->start + buf->len - 1); -} - /* * the walk control struct is used to pass state down the chain when * processing the log tree. 
The stage field tells us which part @@ -2623,11 +2617,12 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, return ret; } + btrfs_tree_lock(next); + btrfs_clear_buffer_dirty(trans, next); + wait_on_extent_buffer_writeback(next); + btrfs_tree_unlock(next); + if (trans) { - btrfs_tree_lock(next); - btrfs_clean_tree_block(next); - btrfs_wait_tree_block_writeback(next); - btrfs_tree_unlock(next); ret = btrfs_pin_reserved_extent(trans, bytenr, blocksize); if (ret) { @@ -2637,8 +2632,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, btrfs_redirty_list_add( trans->transaction, next); } else { - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) - clear_extent_buffer_dirty(next); unaccount_log_buffer(fs_info, bytenr); } } @@ -2693,11 +2686,12 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, next = path->nodes[*level]; + btrfs_tree_lock(next); + btrfs_clear_buffer_dirty(trans, next); + wait_on_extent_buffer_writeback(next); + btrfs_tree_unlock(next); + if (trans) { - btrfs_tree_lock(next); - btrfs_clean_tree_block(next); - btrfs_wait_tree_block_writeback(next); - btrfs_tree_unlock(next); ret = btrfs_pin_reserved_extent(trans, path->nodes[*level]->start, path->nodes[*level]->len); @@ -2706,9 +2700,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, btrfs_redirty_list_add(trans->transaction, next); } else { - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) - clear_extent_buffer_dirty(next); - unaccount_log_buffer(fs_info, path->nodes[*level]->start); } @@ -2776,19 +2767,18 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, next = path->nodes[orig_level]; + btrfs_tree_lock(next); + btrfs_clear_buffer_dirty(trans, next); + wait_on_extent_buffer_writeback(next); + btrfs_tree_unlock(next); + if (trans) { - btrfs_tree_lock(next); - btrfs_clean_tree_block(next); - btrfs_wait_tree_block_writeback(next); - btrfs_tree_unlock(next); ret = btrfs_pin_reserved_extent(trans, next->start, next->len); if (ret) goto out; btrfs_redirty_list_add(trans->transaction, next); } else { - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) - clear_extent_buffer_dirty(next); unaccount_log_buffer(fs_info, next->start); } } @@ -3652,11 +3642,10 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, /* * If for some unexpected reason the last item's index is not greater - * than the last index we logged, warn and return an error to fallback - * to a transaction commit. + * than the last index we logged, warn and force a transaction commit. 
*/ if (WARN_ON(last_index <= inode->last_dir_index_offset)) - ret = -EUCLEAN; + ret = BTRFS_LOG_FORCE_COMMIT; else inode->last_dir_index_offset = last_index; out: @@ -3794,7 +3783,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, struct btrfs_key min_key; struct btrfs_root *root = inode->root; struct btrfs_root *log = root->log_root; - int err = 0; int ret; u64 last_old_dentry_offset = min_offset - 1; u64 last_offset = (u64)-1; @@ -3835,8 +3823,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, path->slots[0]); if (tmp.type == BTRFS_DIR_INDEX_KEY) last_old_dentry_offset = tmp.offset; - } else if (ret < 0) { - err = ret; + } else if (ret > 0) { + ret = 0; } goto done; @@ -3859,7 +3847,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, if (tmp.type == BTRFS_DIR_INDEX_KEY) last_old_dentry_offset = tmp.offset; } else if (ret < 0) { - err = ret; goto done; } @@ -3881,12 +3868,15 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, */ search: ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); - if (ret > 0) + if (ret > 0) { ret = btrfs_next_item(root, path); + if (ret > 0) { + /* There are no more keys in the inode's root. */ + ret = 0; + goto done; + } + } if (ret < 0) - err = ret; - /* If ret is 1, there are no more keys in the inode's root. */ - if (ret != 0) goto done; /* @@ -3897,8 +3887,8 @@ search: ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx, &last_old_dentry_offset); if (ret != 0) { - if (ret < 0) - err = ret; + if (ret > 0) + ret = 0; goto done; } path->slots[0] = btrfs_header_nritems(path->nodes[0]); @@ -3909,10 +3899,10 @@ search: */ ret = btrfs_next_leaf(root, path); if (ret) { - if (ret == 1) + if (ret == 1) { last_offset = (u64)-1; - else - err = ret; + ret = 0; + } goto done; } btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]); @@ -3943,7 +3933,7 @@ done: btrfs_release_path(path); btrfs_release_path(dst_path); - if (err == 0) { + if (ret == 0) { *last_offset_ret = last_offset; /* * In case the leaf was changed in the current transaction but @@ -3954,15 +3944,13 @@ done: * a range, last_old_dentry_offset is == to last_offset. */ ASSERT(last_old_dentry_offset <= last_offset); - if (last_old_dentry_offset < last_offset) { + if (last_old_dentry_offset < last_offset) ret = insert_dir_log_key(trans, log, path, ino, last_old_dentry_offset + 1, last_offset); - if (ret) - err = ret; - } } - return err; + + return ret; } /* @@ -5604,10 +5592,8 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans, * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction * commits. */ - if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) { - btrfs_set_log_full_commit(trans); + if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) return BTRFS_LOG_FORCE_COMMIT; - } inode = btrfs_iget(root->fs_info->sb, ino, root); /* @@ -6466,7 +6452,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, * result in losing the file after a log replay. 
*/ if (full_dir_logging && inode->last_unlink_trans >= trans->transid) { - btrfs_set_log_full_commit(trans); ret = BTRFS_LOG_FORCE_COMMIT; goto out_unlock; } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 85cd24cb0540..bdeb5216718f 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -13,8 +13,13 @@ /* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ #define BTRFS_NO_LOG_SYNC 256 -/* We can't use the tree log for whatever reason, force a transaction commit */ -#define BTRFS_LOG_FORCE_COMMIT (1) +/* + * We can't use the tree log for whatever reason, force a transaction commit. + * We use a negative value because there are functions through the logging code + * that need to return an error (< 0 value), false (0) or true (1). Any negative + * value will do, as it will cause the log to be marked for a full sync. + */ +#define BTRFS_LOG_FORCE_COMMIT (-(MAX_ERRNO + 1)) struct btrfs_log_ctx { int log_ret; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index df43093b7a46..7823168c08a6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -728,7 +728,7 @@ static struct btrfs_fs_devices *find_fsid_reverted_metadata( /* * Handle the case where the scanned device is part of an fs whose last * metadata UUID change reverted it to the original FSID. At the same - * time * fs_devices was first created by another constitutent device + * time fs_devices was first created by another constituent device * which didn't fully observe the operation. This results in an * btrfs_fs_devices created with metadata/fsid different AND * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the @@ -6284,91 +6284,42 @@ static bool need_full_stripe(enum btrfs_map_op op) return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); } -/* - * Calculate the geometry of a particular (address, len) tuple. This - * information is used to calculate how big a particular bio can get before it - * straddles a stripe. - * - * @fs_info: the filesystem - * @em: mapping containing the logical extent - * @op: type of operation - write or read - * @logical: address that we want to figure out the geometry of - * @io_geom: pointer used to return values - * - * Returns < 0 in case a chunk for the given logical address cannot be found, - * usually shouldn't happen unless @logical is corrupted, 0 otherwise. - */ -int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, - enum btrfs_map_op op, u64 logical, - struct btrfs_io_geometry *io_geom) +static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, + u64 offset, u64 *stripe_nr, u64 *stripe_offset, + u64 *full_stripe_start) { - struct map_lookup *map; - u64 len; - u64 offset; - u64 stripe_offset; - u64 stripe_nr; - u32 stripe_len; - u64 raid56_full_stripe_start = (u64)-1; - int data_stripes; + u32 stripe_len = map->stripe_len; ASSERT(op != BTRFS_MAP_DISCARD); - map = em->map_lookup; - /* Offset of this logical address in the chunk */ - offset = logical - em->start; - /* Len of a stripe in a chunk */ - stripe_len = map->stripe_len; /* - * Stripe_nr is where this block falls in - * stripe_offset is the offset of this block in its stripe. + * Stripe_nr is the stripe where this block falls. stripe_offset is + * the offset of this block in its stripe. 
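/*
 * A small sketch (not from the patch) of why -(MAX_ERRNO + 1) is a safe
 * sentinel: kernel errnos occupy -1 .. -MAX_ERRNO (<linux/err.h>), so the
 * new BTRFS_LOG_FORCE_COMMIT value is negative, as the logging helpers
 * require, yet can never collide with a real error code. The helper name
 * below is illustrative.
 */
#include <linux/err.h>

#define EXAMPLE_LOG_FORCE_COMMIT	(-(MAX_ERRNO + 1))

static inline bool log_ret_is_real_errno(int ret)
{
	return ret < 0 && ret >= -MAX_ERRNO;	/* false for the sentinel */
}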
*/ - stripe_nr = div64_u64_rem(offset, stripe_len, &stripe_offset); - ASSERT(stripe_offset < U32_MAX); + *stripe_nr = div64_u64_rem(offset, stripe_len, stripe_offset); + ASSERT(*stripe_offset < U32_MAX); - data_stripes = nr_data_stripes(map); + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); - /* Only stripe based profiles needs to check against stripe length. */ - if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) { - u64 max_len = stripe_len - stripe_offset; + *full_stripe_start = + div64_u64(offset, full_stripe_len) * full_stripe_len; /* - * In case of raid56, we need to know the stripe aligned start + * For writes to RAID56, allow to write a full stripe set, but + * no straddling of stripe sets. */ - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - unsigned long full_stripe_len = stripe_len * data_stripes; - raid56_full_stripe_start = offset; - - /* - * Allow a write of a full stripe, but make sure we - * don't allow straddling of stripes - */ - raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, - full_stripe_len); - raid56_full_stripe_start *= full_stripe_len; - - /* - * For writes to RAID[56], allow a full stripeset across - * all disks. For other RAID types and for RAID[56] - * reads, just allow a single stripe (on a single disk). - */ - if (op == BTRFS_MAP_WRITE) { - max_len = stripe_len * data_stripes - - (offset - raid56_full_stripe_start); - } - } - len = min_t(u64, em->len - offset, max_len); - } else { - len = em->len - offset; + if (op == BTRFS_MAP_WRITE) + return full_stripe_len - (offset - *full_stripe_start); } - io_geom->len = len; - io_geom->offset = offset; - io_geom->stripe_len = stripe_len; - io_geom->stripe_nr = stripe_nr; - io_geom->stripe_offset = stripe_offset; - io_geom->raid56_stripe_offset = raid56_full_stripe_start; - - return 0; + /* + * For other RAID types and for RAID56 reads, allow a single stripe (on + * a single disk). 
+ */ + if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) + return stripe_len - *stripe_offset; + return U64_MAX; } static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map, @@ -6387,6 +6338,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, { struct extent_map *em; struct map_lookup *map; + u64 map_offset; u64 stripe_offset; u64 stripe_nr; u64 stripe_len; @@ -6405,7 +6357,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, int patch_the_first_stripe_for_dev_replace = 0; u64 physical_to_patch_in_first_stripe = 0; u64 raid56_full_stripe_start = (u64)-1; - struct btrfs_io_geometry geom; + u64 max_len; ASSERT(bioc_ret); ASSERT(op != BTRFS_MAP_DISCARD); @@ -6413,18 +6365,14 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, em = btrfs_get_chunk_map(fs_info, logical, *length); ASSERT(!IS_ERR(em)); - ret = btrfs_get_io_geometry(fs_info, em, op, logical, &geom); - if (ret < 0) - return ret; - map = em->map_lookup; - - *length = geom.len; - stripe_len = geom.stripe_len; - stripe_nr = geom.stripe_nr; - stripe_offset = geom.stripe_offset; - raid56_full_stripe_start = geom.raid56_stripe_offset; data_stripes = nr_data_stripes(map); + stripe_len = map->stripe_len; + + map_offset = logical - em->start; + max_len = btrfs_max_io_len(map, op, map_offset, &stripe_nr, + &stripe_offset, &raid56_full_stripe_start); + *length = min_t(u64, em->len - map_offset, max_len); down_read(&dev_replace->rwsem); dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 6b7a05f6cf82..7e51f2238f72 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -53,21 +53,6 @@ enum btrfs_raid_types { BTRFS_NR_RAID_TYPES }; -struct btrfs_io_geometry { - /* remaining bytes before crossing a stripe */ - u64 len; - /* offset of logical address in chunk */ - u64 offset; - /* length of single IO stripe */ - u32 stripe_len; - /* offset of address in stripe */ - u32 stripe_offset; - /* number of stripe where address falls */ - u64 stripe_nr; - /* offset of raid56 stripe into the chunk */ - u64 raid56_stripe_offset; -}; - /* * Use sequence counter to get consistent device stat data on * 32-bit processors. 
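/*
 * A standalone sketch (not from the patch) of the arithmetic behind the new
 * btrfs_max_io_len(), with plain integers instead of struct map_lookup and
 * the non-striped case left out. Worked example: stripe_len = 64K, three
 * data stripes, offset = 200K -> stripe_nr = 3, stripe_offset = 8K, full
 * stripe set starts at 192K; a RAID56 write may cover 184K before straddling
 * the next stripe set, while a read is limited to the 56K left in one stripe.
 */
#include <stdbool.h>
#include <stdint.h>

static uint64_t max_io_len(uint64_t offset, uint32_t stripe_len,
			   uint32_t nr_data_stripes, bool is_raid56,
			   bool is_write, uint64_t *stripe_nr,
			   uint64_t *stripe_offset, uint64_t *full_stripe_start)
{
	/* Which stripe the offset falls in, and where inside that stripe. */
	*stripe_nr = offset / stripe_len;
	*stripe_offset = offset % stripe_len;

	if (is_raid56) {
		uint64_t full_stripe_len = (uint64_t)stripe_len * nr_data_stripes;

		*full_stripe_start = offset / full_stripe_len * full_stripe_len;
		/* Writes may span a full stripe set but must not straddle two. */
		if (is_write)
			return full_stripe_len - (offset - *full_stripe_start);
	}
	/* Reads and other striped profiles stay within a single stripe. */
	return stripe_len - *stripe_offset;
}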
@@ -545,9 +530,6 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, u64 logical, u64 *length_ret, u32 *num_stripes); -int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *map, - enum btrfs_map_op op, u64 logical, - struct btrfs_io_geometry *io_geom); int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 1f503e8e42d4..f95b2c94d619 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -17,6 +17,7 @@ #include "space-info.h" #include "fs.h" #include "accessors.h" +#include "bio.h" /* Maximum number of zones to report per blkdev_report_zones() call */ #define BTRFS_REPORT_NR_ZONES 4096 @@ -160,7 +161,7 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, */ static inline u32 sb_zone_number(int shift, int mirror) { - u64 zone; + u64 zone = U64_MAX; ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX); switch (mirror) { @@ -220,7 +221,6 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, struct blk_zone *zones, unsigned int *nr_zones) { struct btrfs_zoned_device_info *zinfo = device->zone_info; - u32 zno; int ret; if (!*nr_zones) @@ -235,6 +235,7 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, /* Check cache */ if (zinfo->zone_cache) { unsigned int i; + u32 zno; ASSERT(IS_ALIGNED(pos, zinfo->zone_size)); zno = pos >> zinfo->zone_size_shift; @@ -274,9 +275,12 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, return -EIO; /* Populate cache */ - if (zinfo->zone_cache) + if (zinfo->zone_cache) { + u32 zno = pos >> zinfo->zone_size_shift; + memcpy(zinfo->zone_cache + zno, zones, sizeof(*zinfo->zone_cache) * *nr_zones); + } return 0; } @@ -417,25 +421,6 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) nr_sectors = bdev_nr_sectors(bdev); zone_info->zone_size_shift = ilog2(zone_info->zone_size); zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors); - /* - * We limit max_zone_append_size also by max_segments * - * PAGE_SIZE. Technically, we can have multiple pages per segment. But, - * since btrfs adds the pages one by one to a bio, and btrfs cannot - * increase the metadata reservation even if it increases the number of - * extents, it is safe to stick with the limit. - * - * With the zoned emulation, we can have non-zoned device on the zoned - * mode. In this case, we don't have a valid max zone append size. So, - * use max_segments * PAGE_SIZE as the pseudo max_zone_append_size. 
- */ - if (bdev_is_zoned(bdev)) { - zone_info->max_zone_append_size = min_t(u64, - (u64)bdev_max_zone_append_sectors(bdev) << SECTOR_SHIFT, - (u64)bdev_max_segments(bdev) << PAGE_SHIFT); - } else { - zone_info->max_zone_append_size = - (u64)bdev_max_segments(bdev) << PAGE_SHIFT; - } if (!IS_ALIGNED(nr_sectors, zone_sectors)) zone_info->nr_zones++; @@ -715,9 +700,9 @@ static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info) int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) { + struct queue_limits *lim = &fs_info->limits; struct btrfs_device *device; u64 zone_size = 0; - u64 max_zone_append_size = 0; int ret; /* @@ -727,6 +712,8 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) if (!btrfs_fs_incompat(fs_info, ZONED)) return btrfs_check_for_zoned_device(fs_info); + blk_set_stacking_limits(lim); + list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { struct btrfs_zoned_device_info *zone_info = device->zone_info; @@ -741,10 +728,17 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) zone_info->zone_size, zone_size); return -EINVAL; } - if (!max_zone_append_size || - (zone_info->max_zone_append_size && - zone_info->max_zone_append_size < max_zone_append_size)) - max_zone_append_size = zone_info->max_zone_append_size; + + /* + * With the zoned emulation, we can have non-zoned device on the + * zoned mode. In this case, we don't have a valid max zone + * append size. + */ + if (bdev_is_zoned(device->bdev)) { + blk_stack_limits(lim, + &bdev_get_queue(device->bdev)->limits, + 0); + } } /* @@ -765,8 +759,18 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) } fs_info->zone_size = zone_size; - fs_info->max_zone_append_size = ALIGN_DOWN(max_zone_append_size, - fs_info->sectorsize); + /* + * Also limit max_zone_append_size by max_segments * PAGE_SIZE. + * Technically, we can have multiple pages per segment. But, since + * we add the pages one by one to a bio, and cannot increase the + * metadata reservation even if it increases the number of extents, it + * is safe to stick with the limit. + */ + fs_info->max_zone_append_size = ALIGN_DOWN( + min3((u64)lim->max_zone_append_sectors << SECTOR_SHIFT, + (u64)lim->max_sectors << SECTOR_SHIFT, + (u64)lim->max_segments << PAGE_SHIFT), + fs_info->sectorsize); fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED; if (fs_info->max_zone_append_size < fs_info->max_extent_size) fs_info->max_extent_size = fs_info->max_zone_append_size; @@ -1623,8 +1627,10 @@ void btrfs_free_redirty_list(struct btrfs_transaction *trans) spin_unlock(&trans->releasing_ebs_lock); } -bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) +bool btrfs_use_zone_append(struct btrfs_bio *bbio) { + u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT); + struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_block_group *cache; bool ret = false; @@ -1635,6 +1641,9 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) if (!is_data_inode(&inode->vfs_inode)) return false; + if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE) + return false; + /* * Using REQ_OP_ZONE_APPNED for relocation can break assumptions on the * extent layout the relocation code has. 
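/*
 * A standalone sketch (not from the patch) of the clamp above: the stacked
 * queue limits are reduced with min3() and aligned down to the sector size.
 * Example: max_zone_append_sectors = 128, max_sectors = 2560,
 * max_segments = 256 with 4K pages and a 4K sectorsize gives
 * min(64K, 1280K, 1M) = 64K.
 */
#include <stdint.h>

#define EX_SECTOR_SHIFT	9

static uint64_t max_zone_append_size(uint64_t max_zone_append_sectors,
				     uint64_t max_sectors,
				     uint64_t max_segments,
				     uint64_t page_size, uint64_t sectorsize)
{
	uint64_t a = max_zone_append_sectors << EX_SECTOR_SHIFT;
	uint64_t b = max_sectors << EX_SECTOR_SHIFT;
	uint64_t c = max_segments * page_size;
	uint64_t m = a < b ? a : b;

	if (c < m)
		m = c;
	return m - (m % sectorsize);	/* ALIGN_DOWN to the fs sector size */
}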
@@ -1657,22 +1666,16 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) return ret; } -void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset, - struct bio *bio) +void btrfs_record_physical_zoned(struct btrfs_bio *bbio) { + const u64 physical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; struct btrfs_ordered_extent *ordered; - const u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; - if (bio_op(bio) != REQ_OP_ZONE_APPEND) - return; - - ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), file_offset); + ordered = btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset); if (WARN_ON(!ordered)) return; ordered->physical = physical; - ordered->bdev = bio->bi_bdev; - btrfs_put_ordered_extent(ordered); } @@ -1684,43 +1687,46 @@ void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered) struct extent_map *em; struct btrfs_ordered_sum *sum; u64 orig_logical = ordered->disk_bytenr; - u64 *logical = NULL; - int nr, stripe_len; + struct map_lookup *map; + u64 physical = ordered->physical; + u64 chunk_start_phys; + u64 logical; - /* Zoned devices should not have partitions. So, we can assume it is 0 */ - ASSERT(!bdev_is_partition(ordered->bdev)); - if (WARN_ON(!ordered->bdev)) + em = btrfs_get_chunk_map(fs_info, orig_logical, 1); + if (IS_ERR(em)) return; + map = em->map_lookup; + chunk_start_phys = map->stripes[0].physical; - if (WARN_ON(btrfs_rmap_block(fs_info, orig_logical, ordered->bdev, - ordered->physical, &logical, &nr, - &stripe_len))) - goto out; - - WARN_ON(nr != 1); + if (WARN_ON_ONCE(map->num_stripes > 1) || + WARN_ON_ONCE((map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0) || + WARN_ON_ONCE(physical < chunk_start_phys) || + WARN_ON_ONCE(physical > chunk_start_phys + em->orig_block_len)) { + free_extent_map(em); + return; + } + logical = em->start + (physical - map->stripes[0].physical); + free_extent_map(em); - if (orig_logical == *logical) - goto out; + if (orig_logical == logical) + return; - ordered->disk_bytenr = *logical; + ordered->disk_bytenr = logical; em_tree = &inode->extent_tree; write_lock(&em_tree->lock); em = search_extent_mapping(em_tree, ordered->file_offset, ordered->num_bytes); - em->block_start = *logical; + em->block_start = logical; free_extent_map(em); write_unlock(&em_tree->lock); list_for_each_entry(sum, &ordered->list, list) { - if (*logical < orig_logical) - sum->bytenr -= orig_logical - *logical; + if (logical < orig_logical) + sum->bytenr -= orig_logical - logical; else - sum->bytenr += *logical - orig_logical; + sum->bytenr += logical - orig_logical; } - -out: - kfree(logical); } bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, @@ -1845,26 +1851,6 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length); } -struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, - u64 logical, u64 length) -{ - struct btrfs_device *device; - struct extent_map *em; - struct map_lookup *map; - - em = btrfs_get_chunk_map(fs_info, logical, length); - if (IS_ERR(em)) - return ERR_CAST(em); - - map = em->map_lookup; - /* We only support single profile for now */ - device = map->stripes[0].dev; - - free_extent_map(em); - - return device; -} - /* * Activate block group and underlying device zones * diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index f43990985d80..c0570d35fea2 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -20,7 +20,6 @@ struct btrfs_zoned_device_info { */ u64 zone_size; u8 
zone_size_shift; - u64 max_zone_append_size; u32 nr_zones; unsigned int max_active_zones; atomic_t active_zones_left; @@ -56,9 +55,8 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache); void btrfs_redirty_list_add(struct btrfs_transaction *trans, struct extent_buffer *eb); void btrfs_free_redirty_list(struct btrfs_transaction *trans); -bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start); -void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset, - struct bio *bio); +bool btrfs_use_zone_append(struct btrfs_bio *bbio); +void btrfs_record_physical_zoned(struct btrfs_bio *bbio); void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered); bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, @@ -68,8 +66,6 @@ void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache, int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length); int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, u64 physical_start, u64 physical_pos); -struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, - u64 logical, u64 length); bool btrfs_zone_activate(struct btrfs_block_group *block_group); int btrfs_zone_finish(struct btrfs_block_group *block_group); bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags); @@ -185,13 +181,12 @@ static inline void btrfs_redirty_list_add(struct btrfs_transaction *trans, struct extent_buffer *eb) { } static inline void btrfs_free_redirty_list(struct btrfs_transaction *trans) { } -static inline bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) +static inline bool btrfs_use_zone_append(struct btrfs_bio *bbio) { return false; } -static inline void btrfs_record_physical_zoned(struct inode *inode, - u64 file_offset, struct bio *bio) +static inline void btrfs_record_physical_zoned(struct btrfs_bio *bbio) { } @@ -224,13 +219,6 @@ static inline int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, return -EOPNOTSUPP; } -static inline struct btrfs_device *btrfs_zoned_get_device( - struct btrfs_fs_info *fs_info, - u64 logical, u64 length) -{ - return ERR_PTR(-EOPNOTSUPP); -} - static inline bool btrfs_zone_activate(struct btrfs_block_group *block_group) { return true; diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 9804714b1751..f771001574d0 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -217,16 +217,10 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio, { blk_opf_t opflags = REQ_SYNC | REQ_IDLE; - if (!(dio->flags & IOMAP_DIO_WRITE)) { - WARN_ON_ONCE(iomap->flags & IOMAP_F_ZONE_APPEND); + if (!(dio->flags & IOMAP_DIO_WRITE)) return REQ_OP_READ; - } - - if (iomap->flags & IOMAP_F_ZONE_APPEND) - opflags |= REQ_OP_ZONE_APPEND; - else - opflags |= REQ_OP_WRITE; + opflags |= REQ_OP_WRITE; if (use_fua) opflags |= REQ_FUA; else diff --git a/include/linux/bio.h b/include/linux/bio.h index c1da63f6c808..d766be7152e1 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -12,6 +12,8 @@ #define BIO_MAX_VECS 256U +struct queue_limits; + static inline unsigned int bio_max_segs(unsigned int nr_segs) { return min(nr_segs, BIO_MAX_VECS); @@ -375,6 +377,8 @@ static inline void bip_set_seed(struct bio_integrity_payload *bip, void bio_trim(struct bio *bio, sector_t offset, sector_t size); extern struct bio *bio_split(struct bio *bio, int sectors, gfp_t gfp, struct bio_set *bs); +struct bio *bio_split_rw(struct bio *bio, const struct queue_limits 
*lim, + unsigned *segs, struct bio_set *bs, unsigned max_bytes); /** * bio_next_split - get next @sectors from a bio, splitting if necessary diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 0983dfc9a203..fca43a4bd96b 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -58,8 +58,7 @@ struct vm_fault; #define IOMAP_F_SHARED (1U << 2) #define IOMAP_F_MERGED (1U << 3) #define IOMAP_F_BUFFER_HEAD (1U << 4) -#define IOMAP_F_ZONE_APPEND (1U << 5) -#define IOMAP_F_XATTR (1U << 6) +#define IOMAP_F_XATTR (1U << 5) /* * Flags set by the core iomap code during operations: diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 6548b5b5aa60..75d7d22c3a27 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -32,6 +32,7 @@ struct prelim_ref; struct btrfs_space_info; struct btrfs_raid_bio; struct raid56_bio_trace_info; +struct find_free_extent_ctl; #define show_ref_type(type) \ __print_symbolic(type, \ @@ -1241,76 +1242,156 @@ DEFINE_EVENT(btrfs__reserved_extent, btrfs_reserved_extent_free, TRACE_EVENT(find_free_extent, - TP_PROTO(const struct btrfs_root *root, u64 num_bytes, - u64 empty_size, u64 data), + TP_PROTO(const struct btrfs_root *root, + const struct find_free_extent_ctl *ffe_ctl), - TP_ARGS(root, num_bytes, empty_size, data), + TP_ARGS(root, ffe_ctl), TP_STRUCT__entry_btrfs( __field( u64, root_objectid ) __field( u64, num_bytes ) __field( u64, empty_size ) - __field( u64, data ) + __field( u64, flags ) ), TP_fast_assign_btrfs(root->fs_info, __entry->root_objectid = root->root_key.objectid; - __entry->num_bytes = num_bytes; - __entry->empty_size = empty_size; - __entry->data = data; + __entry->num_bytes = ffe_ctl->num_bytes; + __entry->empty_size = ffe_ctl->empty_size; + __entry->flags = ffe_ctl->flags; ), TP_printk_btrfs("root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s)", show_root_type(__entry->root_objectid), - __entry->num_bytes, __entry->empty_size, __entry->data, - __print_flags((unsigned long)__entry->data, "|", + __entry->num_bytes, __entry->empty_size, __entry->flags, + __print_flags((unsigned long)__entry->flags, "|", + BTRFS_GROUP_FLAGS)) +); + +TRACE_EVENT(find_free_extent_search_loop, + + TP_PROTO(const struct btrfs_root *root, + const struct find_free_extent_ctl *ffe_ctl), + + TP_ARGS(root, ffe_ctl), + + TP_STRUCT__entry_btrfs( + __field( u64, root_objectid ) + __field( u64, num_bytes ) + __field( u64, empty_size ) + __field( u64, flags ) + __field( u64, loop ) + ), + + TP_fast_assign_btrfs(root->fs_info, + __entry->root_objectid = root->root_key.objectid; + __entry->num_bytes = ffe_ctl->num_bytes; + __entry->empty_size = ffe_ctl->empty_size; + __entry->flags = ffe_ctl->flags; + __entry->loop = ffe_ctl->loop; + ), + + TP_printk_btrfs("root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s) loop=%llu", + show_root_type(__entry->root_objectid), + __entry->num_bytes, __entry->empty_size, __entry->flags, + __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS), + __entry->loop) +); + +TRACE_EVENT(find_free_extent_have_block_group, + + TP_PROTO(const struct btrfs_root *root, + const struct find_free_extent_ctl *ffe_ctl, + const struct btrfs_block_group *block_group), + + TP_ARGS(root, ffe_ctl, block_group), + + TP_STRUCT__entry_btrfs( + __field( u64, root_objectid ) + __field( u64, num_bytes ) + __field( u64, empty_size ) + __field( u64, flags ) + __field( u64, loop ) + __field( bool, hinted ) + __field( u64, bg_start ) + __field( u64, bg_flags ) + ), + + TP_fast_assign_btrfs(root->fs_info, 
+ __entry->root_objectid = root->root_key.objectid; + __entry->num_bytes = ffe_ctl->num_bytes; + __entry->empty_size = ffe_ctl->empty_size; + __entry->flags = ffe_ctl->flags; + __entry->loop = ffe_ctl->loop; + __entry->hinted = ffe_ctl->hinted; + __entry->bg_start = block_group->start; + __entry->bg_flags = block_group->flags; + ), + + TP_printk_btrfs( +"root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s) loop=%llu hinted=%d block_group=%llu bg_flags=%llu(%s)", + show_root_type(__entry->root_objectid), + __entry->num_bytes, __entry->empty_size, __entry->flags, + __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS), + __entry->loop, __entry->hinted, + __entry->bg_start, __entry->bg_flags, + __print_flags((unsigned long)__entry->bg_flags, "|", BTRFS_GROUP_FLAGS)) ); DECLARE_EVENT_CLASS(btrfs__reserve_extent, - TP_PROTO(const struct btrfs_block_group *block_group, u64 start, - u64 len), + TP_PROTO(const struct btrfs_block_group *block_group, + const struct find_free_extent_ctl *ffe_ctl), - TP_ARGS(block_group, start, len), + TP_ARGS(block_group, ffe_ctl), TP_STRUCT__entry_btrfs( __field( u64, bg_objectid ) __field( u64, flags ) + __field( int, bg_size_class ) __field( u64, start ) __field( u64, len ) + __field( u64, loop ) + __field( bool, hinted ) + __field( int, size_class ) ), TP_fast_assign_btrfs(block_group->fs_info, __entry->bg_objectid = block_group->start; __entry->flags = block_group->flags; - __entry->start = start; - __entry->len = len; + __entry->bg_size_class = block_group->size_class; + __entry->start = ffe_ctl->search_start; + __entry->len = ffe_ctl->num_bytes; + __entry->loop = ffe_ctl->loop; + __entry->hinted = ffe_ctl->hinted; + __entry->size_class = ffe_ctl->size_class; ), - TP_printk_btrfs("root=%llu(%s) block_group=%llu flags=%llu(%s) " - "start=%llu len=%llu", + TP_printk_btrfs( +"root=%llu(%s) block_group=%llu flags=%llu(%s) bg_size_class=%d start=%llu len=%llu loop=%llu hinted=%d size_class=%d", show_root_type(BTRFS_EXTENT_TREE_OBJECTID), __entry->bg_objectid, __entry->flags, __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS), - __entry->start, __entry->len) + __entry->bg_size_class, __entry->start, __entry->len, + __entry->loop, __entry->hinted, __entry->size_class) ); DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent, - TP_PROTO(const struct btrfs_block_group *block_group, u64 start, - u64 len), + TP_PROTO(const struct btrfs_block_group *block_group, + const struct find_free_extent_ctl *ffe_ctl), - TP_ARGS(block_group, start, len) + TP_ARGS(block_group, ffe_ctl) ); DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent_cluster, - TP_PROTO(const struct btrfs_block_group *block_group, u64 start, - u64 len), + TP_PROTO(const struct btrfs_block_group *block_group, + const struct find_free_extent_ctl *ffe_ctl), - TP_ARGS(block_group, start, len) + TP_ARGS(block_group, ffe_ctl) ); TRACE_EVENT(btrfs_find_cluster, |