summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-04-26 11:10:42 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2022-04-26 11:10:42 -0700
commitfd574a2f841c8f07b20e5b55391e0af5d39d82ff (patch)
treeec285862c28e8b38961cc9982a7b68c67f818ca7
parentd615b5416f8a1afeb82d13b238f8152c572d59c0 (diff)
parent5f0addf7b89085f8e0a2593faa419d6111612b9b (diff)
Merge tag 'for-5.18-rc4-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs fixes from David Sterba: - direct IO fixes: - restore passing file offset to correctly calculate checksums when repairing on read and bio split happens - use correct bio when submitting IO on zoned filesystem - zoned mode fixes: - fix selection of device to correctly calculate device capabilities when allocating a new bio - use a dedicated lock for exclusion during relocation - fix leaked plug after failure syncing log - fix assertion during scrub and relocation * tag 'for-5.18-rc4-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: btrfs: zoned: use dedicated lock for data relocation btrfs: fix assertion failure during scrub due to block group reallocation btrfs: fix direct I/O writes for split bios on zoned devices btrfs: fix direct I/O read repair for split bios btrfs: fix and document the zoned device choice in alloc_new_bio btrfs: fix leaked plug after failure syncing log on zoned filesystems
-rw-r--r--fs/btrfs/ctree.h1
-rw-r--r--fs/btrfs/dev-replace.c7
-rw-r--r--fs/btrfs/disk-io.c1
-rw-r--r--fs/btrfs/extent_io.c44
-rw-r--r--fs/btrfs/inode.c18
-rw-r--r--fs/btrfs/scrub.c26
-rw-r--r--fs/btrfs/tree-log.c1
-rw-r--r--fs/btrfs/volumes.h3
-rw-r--r--fs/btrfs/zoned.h4
9 files changed, 76 insertions, 29 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b7631b88426e..077c95e9baa5 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1060,6 +1060,7 @@ struct btrfs_fs_info {
*/
spinlock_t relocation_bg_lock;
u64 data_reloc_bg;
+ struct mutex zoned_data_reloc_io_lock;
u64 nr_global_roots;
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 71fd99b48283..f26202621989 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -734,7 +734,12 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
- /* Commit dev_replace state and reserve 1 item for it. */
+ /*
+ * Commit dev_replace state and reserve 1 item for it.
+ * This is crucial to ensure we won't miss copying extents for new block
+ * groups that are allocated after we started the device replace, and
+ * must be done after setting up the device replace state.
+ */
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 126f244cdf88..ed8e288cc369 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3157,6 +3157,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
mutex_init(&fs_info->reloc_mutex);
mutex_init(&fs_info->delalloc_root_mutex);
mutex_init(&fs_info->zoned_meta_io_lock);
+ mutex_init(&fs_info->zoned_data_reloc_io_lock);
seqlock_init(&fs_info->profiles_lock);
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 724e8fe06aa0..33c19f51d79b 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2658,6 +2658,7 @@ int btrfs_repair_one_sector(struct inode *inode,
repair_bio = btrfs_bio_alloc(1);
repair_bbio = btrfs_bio(repair_bio);
+ repair_bbio->file_offset = start;
repair_bio->bi_opf = REQ_OP_READ;
repair_bio->bi_end_io = failed_bio->bi_end_io;
repair_bio->bi_iter.bi_sector = failrec->logical >> 9;
@@ -3333,24 +3334,37 @@ static int alloc_new_bio(struct btrfs_inode *inode,
ret = calc_bio_boundaries(bio_ctrl, inode, file_offset);
if (ret < 0)
goto error;
- if (wbc) {
- struct block_device *bdev;
- bdev = fs_info->fs_devices->latest_dev->bdev;
- bio_set_dev(bio, bdev);
- wbc_init_bio(wbc, bio);
- }
- if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
- struct btrfs_device *device;
+ if (wbc) {
+ /*
+ * For Zone append we need the correct block_device that we are
+ * going to write to set in the bio to be able to respect the
+ * hardware limitation. Look it up here:
+ */
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ struct btrfs_device *dev;
+
+ dev = btrfs_zoned_get_device(fs_info, disk_bytenr,
+ fs_info->sectorsize);
+ if (IS_ERR(dev)) {
+ ret = PTR_ERR(dev);
+ goto error;
+ }
- device = btrfs_zoned_get_device(fs_info, disk_bytenr,
- fs_info->sectorsize);
- if (IS_ERR(device)) {
- ret = PTR_ERR(device);
- goto error;
+ bio_set_dev(bio, dev->bdev);
+ } else {
+ /*
+ * Otherwise pick the last added device to support
+ * cgroup writeback. For multi-device file systems this
+ * means blk-cgroup policies have to always be set on the
+ * last added/replaced device. This is a bit odd but has
+ * been like that for a long time.
+ */
+ bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev);
}
-
- btrfs_bio(bio)->device = device;
+ wbc_init_bio(wbc, bio);
+ } else {
+ ASSERT(bio_op(bio) != REQ_OP_ZONE_APPEND);
}
return 0;
error:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5082b9c70f8c..1c8a43ecfb9f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7810,8 +7810,6 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
struct bio_vec bvec;
struct bvec_iter iter;
- const u64 orig_file_offset = dip->file_offset;
- u64 start = orig_file_offset;
u32 bio_offset = 0;
blk_status_t err = BLK_STS_OK;
@@ -7821,6 +7819,8 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
pgoff = bvec.bv_offset;
for (i = 0; i < nr_sectors; i++) {
+ u64 start = bbio->file_offset + bio_offset;
+
ASSERT(pgoff < PAGE_SIZE);
if (uptodate &&
(!csum || !check_data_csum(inode, bbio,
@@ -7833,17 +7833,13 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
} else {
int ret;
- ASSERT((start - orig_file_offset) < UINT_MAX);
- ret = btrfs_repair_one_sector(inode,
- &bbio->bio,
- start - orig_file_offset,
- bvec.bv_page, pgoff,
+ ret = btrfs_repair_one_sector(inode, &bbio->bio,
+ bio_offset, bvec.bv_page, pgoff,
start, bbio->mirror_num,
submit_dio_repair_bio);
if (ret)
err = errno_to_blk_status(ret);
}
- start += sectorsize;
ASSERT(bio_offset + sectorsize > bio_offset);
bio_offset += sectorsize;
pgoff += sectorsize;
@@ -7870,6 +7866,7 @@ static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode,
static void btrfs_end_dio_bio(struct bio *bio)
{
struct btrfs_dio_private *dip = bio->bi_private;
+ struct btrfs_bio *bbio = btrfs_bio(bio);
blk_status_t err = bio->bi_status;
if (err)
@@ -7880,12 +7877,12 @@ static void btrfs_end_dio_bio(struct bio *bio)
bio->bi_iter.bi_size, err);
if (bio_op(bio) == REQ_OP_READ)
- err = btrfs_check_read_dio_bio(dip, btrfs_bio(bio), !err);
+ err = btrfs_check_read_dio_bio(dip, bbio, !err);
if (err)
dip->dio_bio->bi_status = err;
- btrfs_record_physical_zoned(dip->inode, dip->file_offset, bio);
+ btrfs_record_physical_zoned(dip->inode, bbio->file_offset, bio);
bio_put(bio);
btrfs_dio_private_put(dip);
@@ -8046,6 +8043,7 @@ static void btrfs_submit_direct(const struct iomap_iter *iter,
bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len);
bio->bi_private = dip;
bio->bi_end_io = btrfs_end_dio_bio;
+ btrfs_bio(bio)->file_offset = file_offset;
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
status = extract_ordered_extent(BTRFS_I(inode), bio,
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 11089568b287..8cd713d37ad2 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3699,6 +3699,31 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
if (!cache)
goto skip;
+ ASSERT(cache->start <= chunk_offset);
+ /*
+ * We are using the commit root to search for device extents, so
+ * that means we could have found a device extent item from a
+ * block group that was deleted in the current transaction. The
+ * logical start offset of the deleted block group, stored at
+ * @chunk_offset, might be part of the logical address range of
+ * a new block group (which uses different physical extents).
+ * In this case btrfs_lookup_block_group() has returned the new
+ * block group, and its start address is less than @chunk_offset.
+ *
+ * We skip such new block groups, because it's pointless to
+ * process them, as we won't find their extents because we search
+ * for them using the commit root of the extent tree. For a device
+ * replace it's also fine to skip it, we won't miss copying them
+ * to the target device because we have the write duplication
+ * setup through the regular write path (by btrfs_map_block()),
+ * and we have committed a transaction when we started the device
+ * replace, right after setting up the device replace state.
+ */
+ if (cache->start < chunk_offset) {
+ btrfs_put_block_group(cache);
+ goto skip;
+ }
+
if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
spin_lock(&cache->lock);
if (!cache->to_copy) {
@@ -3822,7 +3847,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
dev_replace->item_needs_writeback = 1;
up_write(&dev_replace->rwsem);
- ASSERT(cache->start == chunk_offset);
ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset,
dev_extent_len);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 571dae8ad65e..09e4f1a04e6f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3188,6 +3188,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
if (ret) {
mutex_unlock(&fs_info->tree_root->log_mutex);
+ blk_finish_plug(&plug);
goto out;
}
}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index bd297f23d19e..f3e28f11cfb6 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -328,6 +328,9 @@ struct btrfs_fs_devices {
struct btrfs_bio {
unsigned int mirror_num;
+ /* for direct I/O */
+ u64 file_offset;
+
/* @device is for stripe IO submission. */
struct btrfs_device *device;
u8 *csum;
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index cbf016a7bb5d..6dee76248cb4 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -359,7 +359,7 @@ static inline void btrfs_zoned_data_reloc_lock(struct btrfs_inode *inode)
struct btrfs_root *root = inode->root;
if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info))
- btrfs_inode_lock(&inode->vfs_inode, 0);
+ mutex_lock(&root->fs_info->zoned_data_reloc_io_lock);
}
static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode)
@@ -367,7 +367,7 @@ static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode)
struct btrfs_root *root = inode->root;
if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info))
- btrfs_inode_unlock(&inode->vfs_inode, 0);
+ mutex_unlock(&root->fs_info->zoned_data_reloc_io_lock);
}
#endif