From 143823cf4d5a36cb8c83f5a6adb291bc45f40bc3 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 25 May 2022 16:27:25 +0200 Subject: btrfs: fix typos in comments Codespell has found a few typos. Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/raid56.c') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index a5b623ee6fac..e03a38af12cd 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -132,7 +132,7 @@ struct btrfs_raid_bio { /* Number of data stripes (no p/q) */ u8 nr_data; - /* Numer of all stripes (including P/Q) */ + /* Number of all stripes (including P/Q) */ u8 real_stripes; /* How many pages there are for each stripe */ -- cgit v1.2.3-70-g09d2 From c67c68eb57f1343dd7e315156ff0334ab72158fd Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 27 May 2022 15:28:17 +0800 Subject: btrfs: use integrated bitmaps for btrfs_raid_bio::dbitmap and finish_pbitmap Previously we used "unsigned long *" for those two bitmaps. But since we only support fixed stripe length (64KiB, already checked in tree-checker), "unsigned long *" is really a waste of memory, while we can just use "unsigned long". This saves us 8 bytes in total for btrfs_raid_bio. To be extra safe, add an ASSERT() making sure the calculated @stripe_nsectors is never larger than BITS_PER_LONG. 
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) (limited to 'fs/btrfs/raid56.c') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index e03a38af12cd..90f6ae49fd7b 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -164,6 +164,13 @@ struct btrfs_raid_bio { atomic_t stripes_pending; atomic_t error; + + /* Bitmap to record which horizontal stripe has data */ + unsigned long dbitmap; + + /* Allocated with stripe_nsectors-many bits for finish_*() calls */ + unsigned long finish_pbitmap; + /* * these are two arrays of pointers. We allocate the * rbio big enough to hold them both and setup their @@ -184,14 +191,8 @@ struct btrfs_raid_bio { */ struct sector_ptr *stripe_sectors; - /* Bitmap to record which horizontal stripe has data */ - unsigned long *dbitmap; - /* allocated with real_stripes-many pointers for finish_*() calls */ void **finish_pointers; - - /* Allocated with stripe_nsectors-many bits for finish_*() calls */ - unsigned long *finish_pbitmap; }; static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); @@ -1038,14 +1039,17 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, ASSERT(IS_ALIGNED(stripe_len, PAGE_SIZE)); /* PAGE_SIZE must also be aligned to sectorsize for subpage support */ ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize)); + /* + * Our current stripe len should be fixed to 64k thus stripe_nsectors + * (at most 16) should be no larger than BITS_PER_LONG. 
+ */ + ASSERT(stripe_nsectors <= BITS_PER_LONG); rbio = kzalloc(sizeof(*rbio) + sizeof(*rbio->stripe_pages) * num_pages + sizeof(*rbio->bio_sectors) * num_sectors + sizeof(*rbio->stripe_sectors) * num_sectors + - sizeof(*rbio->finish_pointers) * real_stripes + - sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_nsectors) + - sizeof(*rbio->finish_pbitmap) * BITS_TO_LONGS(stripe_nsectors), + sizeof(*rbio->finish_pointers) * real_stripes, GFP_NOFS); if (!rbio) return ERR_PTR(-ENOMEM); @@ -1081,8 +1085,6 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, CONSUME_ALLOC(rbio->bio_sectors, num_sectors); CONSUME_ALLOC(rbio->stripe_sectors, num_sectors); CONSUME_ALLOC(rbio->finish_pointers, real_stripes); - CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_nsectors)); - CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_nsectors)); #undef CONSUME_ALLOC if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5) @@ -1939,7 +1941,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) * which we have data when doing parity scrub. 
*/ if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && - !test_bit(sectornr, rbio->dbitmap)) + !test_bit(sectornr, &rbio->dbitmap)) continue; /* @@ -2374,7 +2376,7 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, } ASSERT(i < rbio->real_stripes); - bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors); + bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors); /* * We have already increased bio_counter when getting bioc, record it @@ -2412,7 +2414,7 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) int stripe; int sectornr; - for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) { + for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { for (stripe = 0; stripe < rbio->real_stripes; stripe++) { struct page *page; int index = (stripe * rbio->stripe_nsectors + sectornr) * @@ -2437,7 +2439,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, struct btrfs_io_context *bioc = rbio->bioc; const u32 sectorsize = bioc->fs_info->sectorsize; void **pointers = rbio->finish_pointers; - unsigned long *pbitmap = rbio->finish_pbitmap; + unsigned long *pbitmap = &rbio->finish_pbitmap; int nr_data = rbio->nr_data; int stripe; int sectornr; @@ -2460,7 +2462,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) { is_replace = 1; - bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_nsectors); + bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors); } /* @@ -2497,7 +2499,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, /* Map the parity stripe just once */ pointers[nr_data] = kmap_local_page(p_sector.page); - for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) { + for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { struct sector_ptr *sector; void *parity; @@ -2525,7 +2527,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, memcpy(parity, 
pointers[rbio->scrubp], sectorsize); else /* Parity is right, needn't writeback */ - bitmap_clear(rbio->dbitmap, sectornr, 1); + bitmap_clear(&rbio->dbitmap, sectornr, 1); kunmap_local(parity); for (stripe = nr_data - 1; stripe >= 0; stripe--) @@ -2547,7 +2549,7 @@ writeback: * higher layers (the bio_list in our rbio) and our p/q. Ignore * everything else. */ - for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) { + for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { struct sector_ptr *sector; sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); @@ -2714,7 +2716,7 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) * stripe */ for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - for_each_set_bit(sectornr , rbio->dbitmap, rbio->stripe_nsectors) { + for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { struct sector_ptr *sector; /* * We want to find all the sectors missing from the -- cgit v1.2.3-70-g09d2 From bd8f7e627703ca5707833d623efcd43f104c7b3f Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 27 May 2022 15:28:19 +0800 Subject: btrfs: only write the sectors in the vertical stripe which has data stripes If we have only 8K partial write at the beginning of a full RAID56 stripe, we will write the following contents: 0 8K 32K 64K Disk 1 (data): |XX| | | Disk 2 (data): | | | Disk 3 (parity): |XXXXXXXXXXXXXXX|XXXXXXXXXXXXXXX| |X| means the sector will be written back to disk. Note that, although we won't write any sectors from disk 2, but we will write the full 64KiB of parity to disk. This behavior is fine for now, but not for the future (especially for RAID56J, as we waste quite some space to journal the unused parity stripes). So here we will also utilize the btrfs_raid_bio::dbitmap, anytime we queue a higher level bio into an rbio, we will update rbio::dbitmap to indicate which vertical stripes we need to writeback. 
And at finish_rmw(), we also check dbitmap to see if we need to write any sector in the vertical stripe. So after the patch, above example will only lead to the following writeback pattern: 0 8K 32K 64K Disk 1 (data): |XX| | | Disk 2 (data): | | | Disk 3 (parity): |XX| | | Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 49 insertions(+), 4 deletions(-) (limited to 'fs/btrfs/raid56.c') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 90f6ae49fd7b..454ceee6bab5 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -392,6 +392,9 @@ static void merge_rbio(struct btrfs_raid_bio *dest, { bio_list_merge(&dest->bio_list, &victim->bio_list); dest->bio_list_bytes += victim->bio_list_bytes; + /* Also inherit the bitmaps from @victim. */ + bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap, + dest->stripe_nsectors); dest->generic_bio_cnt += victim->generic_bio_cnt; bio_list_init(&victim->bio_list); } @@ -933,6 +936,12 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) if (rbio->generic_bio_cnt) btrfs_bio_counter_sub(rbio->bioc->fs_info, rbio->generic_bio_cnt); + /* + * Clear the data bitmap, as the rbio may be cached for later usage. + * do this before before unlock_stripe() so there will be no new bio + * for this bio. + */ + bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors); /* * At this moment, rbio->bio_list is empty, however since rbio does not @@ -1284,6 +1293,9 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) else BUG(); + /* We should have at least one data sector. */ + ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors)); + /* at this point we either have a full stripe, * or we've read the full stripe from the drive. * recalculate the parity and write the new results. 
@@ -1358,6 +1370,10 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { struct sector_ptr *sector; + /* This vertical stripe has no data, skip it. */ + if (!test_bit(sectornr, &rbio->dbitmap)) + continue; + if (stripe < rbio->nr_data) { sector = sector_in_rbio(rbio, stripe, sectornr, 1); if (!sector) @@ -1384,6 +1400,10 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { struct sector_ptr *sector; + /* This vertical stripe has no data, skip it. */ + if (!test_bit(sectornr, &rbio->dbitmap)) + continue; + if (stripe < rbio->nr_data) { sector = sector_in_rbio(rbio, stripe, sectornr, 1); if (!sector) @@ -1835,6 +1855,33 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) run_plug(plug); } +/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */ +static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio) +{ + const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT; + const u64 full_stripe_start = rbio->bioc->raid_map[0]; + const u32 orig_len = orig_bio->bi_iter.bi_size; + const u32 sectorsize = fs_info->sectorsize; + u64 cur_logical; + + ASSERT(orig_logical >= full_stripe_start && + orig_logical + orig_len <= full_stripe_start + + rbio->nr_data * rbio->stripe_len); + + bio_list_add(&rbio->bio_list, orig_bio); + rbio->bio_list_bytes += orig_bio->bi_iter.bi_size; + + /* Update the dbitmap. */ + for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len; + cur_logical += sectorsize) { + int bit = ((u32)(cur_logical - full_stripe_start) >> + fs_info->sectorsize_bits) % rbio->stripe_nsectors; + + set_bit(bit, &rbio->dbitmap); + } +} + /* * our main entry point for writes from the rest of the FS. 
*/ @@ -1851,9 +1898,8 @@ int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stri btrfs_put_bioc(bioc); return PTR_ERR(rbio); } - bio_list_add(&rbio->bio_list, bio); - rbio->bio_list_bytes = bio->bi_iter.bi_size; rbio->operation = BTRFS_RBIO_WRITE; + rbio_add_bio(rbio, bio); btrfs_bio_counter_inc_noblocked(fs_info); rbio->generic_bio_cnt = 1; @@ -2258,8 +2304,7 @@ int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, } rbio->operation = BTRFS_RBIO_READ_REBUILD; - bio_list_add(&rbio->bio_list, bio); - rbio->bio_list_bytes = bio->bi_iter.bi_size; + rbio_add_bio(rbio, bio); rbio->faila = find_logical_bio_stripe(rbio, bio); if (rbio->faila == -1) { -- cgit v1.2.3-70-g09d2 From 4d10046613333508d31fe926c545c8c0b620508a Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 1 Jun 2022 13:54:28 +0800 Subject: btrfs: update stripe_sectors::uptodate in steal_rbio [BUG] With added debugging, it turns out the following write sequence would cause extra read which is unnecessary: # xfs_io -f -s -c "pwrite -b 32k 0 32k" -c "pwrite -b 32k 32k 32k" \ -c "pwrite -b 32k 64k 32k" -c "pwrite -b 32k 96k 32k" \ $mnt/file The debug message looks like this (btrfs header skipped): partial rmw, full stripe=389152768 opf=0x0 devid=3 type=1 offset=32768 physical=323059712 len=32768 partial rmw, full stripe=389152768 opf=0x0 devid=1 type=2 offset=0 physical=67174400 len=65536 full stripe rmw, full stripe=389152768 opf=0x1 devid=3 type=1 offset=0 physical=323026944 len=32768 full stripe rmw, full stripe=389152768 opf=0x1 devid=2 type=-1 offset=0 physical=323026944 len=32768 partial rmw, full stripe=298844160 opf=0x0 devid=1 type=1 offset=32768 physical=22052864 len=32768 partial rmw, full stripe=298844160 opf=0x0 devid=2 type=2 offset=0 physical=277872640 len=65536 full stripe rmw, full stripe=298844160 opf=0x1 devid=1 type=1 offset=0 physical=22020096 len=32768 full stripe rmw, full stripe=298844160 opf=0x1 devid=3 type=-1 offset=0 physical=277872640 
len=32768 partial rmw, full stripe=389152768 opf=0x0 devid=3 type=1 offset=0 physical=323026944 len=32768 partial rmw, full stripe=389152768 opf=0x0 devid=1 type=2 offset=0 physical=67174400 len=65536 ^^^^ Still partial read, even though 389152768 is already cached by the first write. full stripe rmw, full stripe=389152768 opf=0x1 devid=3 type=1 offset=32768 physical=323059712 len=32768 full stripe rmw, full stripe=389152768 opf=0x1 devid=2 type=-1 offset=32768 physical=323059712 len=32768 partial rmw, full stripe=298844160 opf=0x0 devid=1 type=1 offset=0 physical=22020096 len=32768 partial rmw, full stripe=298844160 opf=0x0 devid=2 type=2 offset=0 physical=277872640 len=65536 ^^^^ Still partial read for 298844160. full stripe rmw, full stripe=298844160 opf=0x1 devid=1 type=1 offset=32768 physical=22052864 len=32768 full stripe rmw, full stripe=298844160 opf=0x1 devid=3 type=-1 offset=32768 physical=277905408 len=32768 This means every 32K write, even when it is in the same full stripe, still triggers a read of previously cached data. This would cause extra RAID56 IO, making the btrfs raid56 cache useless. [CAUSE] Commit d4e28d9b5f04 ("btrfs: raid56: make steal_rbio() subpage compatible") tries to make steal_rbio() subpage compatible, but during that conversion, there is one thing missing. We no longer rely on PageUptodate(rbio->stripe_pages[i]), but rbio->stripe_sectors[i].uptodate to determine if a sector is uptodate. This means, previously if we switch the pointer, everything is done, as the PageUptodate flag is still bound to that page. But now we have to manually mark the involved sectors uptodate, or later raid56_rmw_stripe() will find the stolen sector is not uptodate, and assemble the read bio for it, wasting IO. [FIX] We can easily fix the bug, by also updating the rbio->stripe_sectors[].uptodate in steal_rbio(). 
With this fixed, now the same write pattern no longer leads to the same unnecessary read: partial rmw, full stripe=389152768 opf=0x0 devid=3 type=1 offset=32768 physical=323059712 len=32768 partial rmw, full stripe=389152768 opf=0x0 devid=1 type=2 offset=0 physical=67174400 len=65536 full stripe rmw, full stripe=389152768 opf=0x1 devid=3 type=1 offset=0 physical=323026944 len=32768 full stripe rmw, full stripe=389152768 opf=0x1 devid=2 type=-1 offset=0 physical=323026944 len=32768 partial rmw, full stripe=298844160 opf=0x0 devid=1 type=1 offset=32768 physical=22052864 len=32768 partial rmw, full stripe=298844160 opf=0x0 devid=2 type=2 offset=0 physical=277872640 len=65536 full stripe rmw, full stripe=298844160 opf=0x1 devid=1 type=1 offset=0 physical=22020096 len=32768 full stripe rmw, full stripe=298844160 opf=0x1 devid=3 type=-1 offset=0 physical=277872640 len=32768 ^^^ No more partial read, directly into the write path. full stripe rmw, full stripe=389152768 opf=0x1 devid=3 type=1 offset=32768 physical=323059712 len=32768 full stripe rmw, full stripe=389152768 opf=0x1 devid=2 type=-1 offset=32768 physical=323059712 len=32768 full stripe rmw, full stripe=298844160 opf=0x1 devid=1 type=1 offset=32768 physical=22052864 len=32768 full stripe rmw, full stripe=298844160 opf=0x1 devid=3 type=-1 offset=32768 physical=277905408 len=32768 Fixes: d4e28d9b5f04 ("btrfs: raid56: make steal_rbio() subpage compatible") Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'fs/btrfs/raid56.c') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 454ceee6bab5..c48b7a0992f6 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -348,6 +348,24 @@ static void index_stripe_sectors(struct btrfs_raid_bio *rbio) } } +static void steal_rbio_page(struct btrfs_raid_bio *src, + struct btrfs_raid_bio *dest, int page_nr) +{ + const u32 sectorsize = 
src->bioc->fs_info->sectorsize; + const u32 sectors_per_page = PAGE_SIZE / sectorsize; + int i; + + if (dest->stripe_pages[page_nr]) + __free_page(dest->stripe_pages[page_nr]); + dest->stripe_pages[page_nr] = src->stripe_pages[page_nr]; + src->stripe_pages[page_nr] = NULL; + + /* Also update the sector->uptodate bits. */ + for (i = sectors_per_page * page_nr; + i < sectors_per_page * page_nr + sectors_per_page; i++) + dest->stripe_sectors[i].uptodate = true; +} + /* * Stealing an rbio means taking all the uptodate pages from the stripe array * in the source rbio and putting them into the destination rbio. @@ -359,7 +377,6 @@ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) { int i; struct page *s; - struct page *d; if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags)) return; @@ -369,12 +386,7 @@ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) if (!s || !full_page_sectors_uptodate(src, i)) continue; - d = dest->stripe_pages[i]; - if (d) - __free_page(d); - - dest->stripe_pages[i] = s; - src->stripe_pages[i] = NULL; + steal_rbio_page(src, dest, i); } index_stripe_sectors(dest); index_stripe_sectors(src); -- cgit v1.2.3-70-g09d2 From b8bea09a456fc31af8f10029e69d105cac7e530e Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 1 Jun 2022 17:46:59 +0800 Subject: btrfs: add trace event for submitted RAID56 bio Add tracepoint for better insight to how the RAID56 data are submitted. 
The output looks like this: (trace event header and UUID skipped) raid56_read_partial: full_stripe=389152768 devid=3 type=DATA1 offset=32768 opf=0x0 physical=323059712 len=32768 raid56_read_partial: full_stripe=389152768 devid=1 type=DATA2 offset=0 opf=0x0 physical=67174400 len=65536 raid56_write_stripe: full_stripe=389152768 devid=3 type=DATA1 offset=0 opf=0x1 physical=323026944 len=32768 raid56_write_stripe: full_stripe=389152768 devid=2 type=PQ1 offset=0 opf=0x1 physical=323026944 len=32768 The above debug output is from a 32K data write into an empty RAID56 data chunk. Some explanation on the event output: full_stripe: the logical bytenr of the full stripe devid: btrfs devid type: raid stripe type. DATA1: the first data stripe DATA2: the second data stripe PQ1: the P stripe PQ2: the Q stripe offset: the offset inside the stripe. opf: the bio op type physical: the physical offset the bio is for len: the length of the bio The first two lines are from partial RMW read, which is reading the remaining data stripes from disks. The last two lines are for full stripe RMW write, which is writing the involved two 16K stripes (one for DATA1 stripe, one for P stripe). The stripe for DATA2 doesn't need to be written. There are 5 types of trace events: - raid56_read_partial Read remaining data for regular read/write path. - raid56_write_stripe Write the modified stripes for regular read/write path. - raid56_scrub_read_recover Read remaining data for scrub recovery path. - raid56_scrub_write_stripe Write the modified stripes for scrub path. - raid56_scrub_read Read remaining data for scrub path. Also, since the trace events are included at super.c, we have to export needed structure definitions to 'raid56.h' and include the header in super.c, or we're unable to access those members. 
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba [ reformat comments ] Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 190 +++++++++++++------------------------------ fs/btrfs/raid56.h | 148 ++++++++++++++++++++++++++++++++- fs/btrfs/super.c | 1 + include/trace/events/btrfs.h | 94 +++++++++++++++++++++ 4 files changed, 300 insertions(+), 133 deletions(-) (limited to 'fs/btrfs/raid56.c') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index c48b7a0992f6..baba435692d2 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -63,138 +63,6 @@ struct sector_ptr { unsigned int uptodate:8; }; -enum btrfs_rbio_ops { - BTRFS_RBIO_WRITE, - BTRFS_RBIO_READ_REBUILD, - BTRFS_RBIO_PARITY_SCRUB, - BTRFS_RBIO_REBUILD_MISSING, -}; - -struct btrfs_raid_bio { - struct btrfs_io_context *bioc; - - /* while we're doing rmw on a stripe - * we put it into a hash table so we can - * lock the stripe and merge more rbios - * into it. - */ - struct list_head hash_list; - - /* - * LRU list for the stripe cache - */ - struct list_head stripe_cache; - - /* - * for scheduling work in the helper threads - */ - struct work_struct work; - - /* - * bio list and bio_list_lock are used - * to add more bios into the stripe - * in hopes of avoiding the full rmw - */ - struct bio_list bio_list; - spinlock_t bio_list_lock; - - /* also protected by the bio_list_lock, the - * plug list is used by the plugging code - * to collect partial bios while plugged. 
The - * stripe locking code also uses it to hand off - * the stripe lock to the next pending IO - */ - struct list_head plug_list; - - /* - * flags that tell us if it is safe to - * merge with this bio - */ - unsigned long flags; - - /* - * set if we're doing a parity rebuild - * for a read from higher up, which is handled - * differently from a parity rebuild as part of - * rmw - */ - enum btrfs_rbio_ops operation; - - /* Size of each individual stripe on disk */ - u32 stripe_len; - - /* How many pages there are for the full stripe including P/Q */ - u16 nr_pages; - - /* How many sectors there are for the full stripe including P/Q */ - u16 nr_sectors; - - /* Number of data stripes (no p/q) */ - u8 nr_data; - - /* Number of all stripes (including P/Q) */ - u8 real_stripes; - - /* How many pages there are for each stripe */ - u8 stripe_npages; - - /* How many sectors there are for each stripe */ - u8 stripe_nsectors; - - /* First bad stripe, -1 means no corruption */ - s8 faila; - - /* Second bad stripe (for RAID6 use) */ - s8 failb; - - /* Stripe number that we're scrubbing */ - u8 scrubp; - - /* - * size of all the bios in the bio_list. This - * helps us decide if the rbio maps to a full - * stripe or not - */ - int bio_list_bytes; - - int generic_bio_cnt; - - refcount_t refs; - - atomic_t stripes_pending; - - atomic_t error; - - /* Bitmap to record which horizontal stripe has data */ - unsigned long dbitmap; - - /* Allocated with stripe_nsectors-many bits for finish_*() calls */ - unsigned long finish_pbitmap; - - /* - * these are two arrays of pointers. 
We allocate the - * rbio big enough to hold them both and setup their - * locations when the rbio is allocated - */ - - /* pointers to pages that we allocated for - * reading/writing stripes directly from the disk (including P/Q) - */ - struct page **stripe_pages; - - /* Pointers to the sectors in the bio_list, for faster lookup */ - struct sector_ptr *bio_sectors; - - /* - * For subpage support, we need to map each sector to above - * stripe_pages. - */ - struct sector_ptr *stripe_sectors; - - /* allocated with real_stripes-many pointers for finish_*() calls */ - void **finish_pointers; -}; - static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); static noinline void finish_rmw(struct btrfs_raid_bio *rbio); static void rmw_work(struct work_struct *work); @@ -1275,6 +1143,34 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio) spin_unlock_irq(&rbio->bio_list_lock); } +static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio, + struct raid56_bio_trace_info *trace_info) +{ + const struct btrfs_io_context *bioc = rbio->bioc; + int i; + + ASSERT(bioc); + + /* We rely on bio->bi_bdev to find the stripe number. */ + if (!bio->bi_bdev) + goto not_found; + + for (i = 0; i < bioc->num_stripes; i++) { + if (bio->bi_bdev != bioc->stripes[i].dev->bdev) + continue; + trace_info->stripe_nr = i; + trace_info->devid = bioc->stripes[i].dev->devid; + trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - + bioc->stripes[i].physical; + return; + } + +not_found: + trace_info->devid = -1; + trace_info->offset = -1; + trace_info->stripe_nr = -1; +} + /* * this is called from one of two situations. 
We either * have a full stripe from the higher layers, or we've read all @@ -1440,6 +1336,12 @@ write_data: while ((bio = bio_list_pop(&bio_list))) { bio->bi_end_io = raid_write_end_io; + if (trace_raid56_write_stripe_enabled()) { + struct raid56_bio_trace_info trace_info = { 0 }; + + bio_get_trace_info(rbio, bio, &trace_info); + trace_raid56_write_stripe(rbio, bio, &trace_info); + } submit_bio(bio); } return; @@ -1701,6 +1603,12 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); + if (trace_raid56_read_partial_enabled()) { + struct raid56_bio_trace_info trace_info = { 0 }; + + bio_get_trace_info(rbio, bio, &trace_info); + trace_raid56_read_partial(rbio, bio, &trace_info); + } submit_bio(bio); } /* the actual write will happen once the reads are done */ @@ -2274,6 +2182,12 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); + if (trace_raid56_scrub_read_recover_enabled()) { + struct raid56_bio_trace_info trace_info = { 0 }; + + bio_get_trace_info(rbio, bio, &trace_info); + trace_raid56_scrub_read_recover(rbio, bio, &trace_info); + } submit_bio(bio); } @@ -2643,6 +2557,12 @@ submit_write: while ((bio = bio_list_pop(&bio_list))) { bio->bi_end_io = raid_write_end_io; + if (trace_raid56_scrub_write_stripe_enabled()) { + struct raid56_bio_trace_info trace_info = { 0 }; + + bio_get_trace_info(rbio, bio, &trace_info); + trace_raid56_scrub_write_stripe(rbio, bio, &trace_info); + } submit_bio(bio); } return; @@ -2822,6 +2742,12 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); + if (trace_raid56_scrub_read_enabled()) { + struct raid56_bio_trace_info trace_info = { 0 }; + + bio_get_trace_info(rbio, bio, &trace_info); + trace_raid56_scrub_read(rbio, bio, &trace_info); + } submit_bio(bio); } /* the actual write will happen 
once the reads are done */ diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index aaad08aefd7d..3badde24dcbf 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -7,6 +7,152 @@ #ifndef BTRFS_RAID56_H #define BTRFS_RAID56_H +#include +#include "volumes.h" + +enum btrfs_rbio_ops { + BTRFS_RBIO_WRITE, + BTRFS_RBIO_READ_REBUILD, + BTRFS_RBIO_PARITY_SCRUB, + BTRFS_RBIO_REBUILD_MISSING, +}; + +struct btrfs_raid_bio { + struct btrfs_io_context *bioc; + + /* + * While we're doing RMW on a stripe we put it into a hash table so we + * can lock the stripe and merge more rbios into it. + */ + struct list_head hash_list; + + /* LRU list for the stripe cache */ + struct list_head stripe_cache; + + /* For scheduling work in the helper threads */ + struct work_struct work; + + /* + * bio_list and bio_list_lock are used to add more bios into the stripe + * in hopes of avoiding the full RMW + */ + struct bio_list bio_list; + spinlock_t bio_list_lock; + + /* + * Also protected by the bio_list_lock, the plug list is used by the + * plugging code to collect partial bios while plugged. The stripe + * locking code also uses it to hand off the stripe lock to the next + * pending IO. + */ + struct list_head plug_list; + + /* Flags that tell us if it is safe to merge with this bio. */ + unsigned long flags; + + /* + * Set if we're doing a parity rebuild for a read from higher up, which + * is handled differently from a parity rebuild as part of RMW. 
+ */ + enum btrfs_rbio_ops operation; + + /* Size of each individual stripe on disk */ + u32 stripe_len; + + /* How many pages there are for the full stripe including P/Q */ + u16 nr_pages; + + /* How many sectors there are for the full stripe including P/Q */ + u16 nr_sectors; + + /* Number of data stripes (no p/q) */ + u8 nr_data; + + /* Numer of all stripes (including P/Q) */ + u8 real_stripes; + + /* How many pages there are for each stripe */ + u8 stripe_npages; + + /* How many sectors there are for each stripe */ + u8 stripe_nsectors; + + /* First bad stripe, -1 means no corruption */ + s8 faila; + + /* Second bad stripe (for RAID6 use) */ + s8 failb; + + /* Stripe number that we're scrubbing */ + u8 scrubp; + + /* + * Size of all the bios in the bio_list. This helps us decide if the + * rbio maps to a full stripe or not. + */ + int bio_list_bytes; + + int generic_bio_cnt; + + refcount_t refs; + + atomic_t stripes_pending; + + atomic_t error; + + /* Bitmap to record which horizontal stripe has data */ + unsigned long dbitmap; + + /* Allocated with stripe_nsectors-many bits for finish_*() calls */ + unsigned long finish_pbitmap; + + /* + * These are two arrays of pointers. We allocate the rbio big enough + * to hold them both and setup their locations when the rbio is + * allocated. + */ + + /* + * Pointers to pages that we allocated for reading/writing stripes + * directly from the disk (including P/Q). + */ + struct page **stripe_pages; + + /* Pointers to the sectors in the bio_list, for faster lookup */ + struct sector_ptr *bio_sectors; + + /* + * For subpage support, we need to map each sector to above + * stripe_pages. + */ + struct sector_ptr *stripe_sectors; + + /* Allocated with real_stripes-many pointers for finish_*() calls */ + void **finish_pointers; +}; + +/* + * For trace event usage only. Records useful debug info for each bio submitted + * by RAID56 to each physical device. 
+ * + * No matter signed or not, (-1) is always the one indicating we can not grab + * the proper stripe number. + */ +struct raid56_bio_trace_info { + u64 devid; + + /* The offset inside the stripe. (<= STRIPE_LEN) */ + u32 offset; + + /* + * Stripe number. + * 0 is the first data stripe, and nr_data for P stripe, + * nr_data + 1 for Q stripe. + * >= real_stripes for + */ + u8 stripe_nr; +}; + static inline int nr_parity_stripes(const struct map_lookup *map) { if (map->type & BTRFS_BLOCK_GROUP_RAID5) @@ -21,13 +167,13 @@ static inline int nr_data_stripes(const struct map_lookup *map) { return map->num_stripes - nr_parity_stripes(map); } + #define RAID5_P_STRIPE ((u64)-2) #define RAID6_Q_STRIPE ((u64)-1) #define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \ ((x) == RAID6_Q_STRIPE)) -struct btrfs_raid_bio; struct btrfs_device; int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 24b86061c5df..8539ee2dc79f 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -48,6 +48,7 @@ #include "block-group.h" #include "discard.h" #include "qgroup.h" +#include "raid56.h" #define CREATE_TRACE_POINTS #include diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 9ae94ef3e270..29fa8ea2cc0f 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -30,6 +30,8 @@ struct btrfs_qgroup; struct extent_io_tree; struct prelim_ref; struct btrfs_space_info; +struct btrfs_raid_bio; +struct raid56_bio_trace_info; #define show_ref_type(type) \ __print_symbolic(type, \ @@ -2258,6 +2260,98 @@ DEFINE_EVENT(btrfs__space_info_update, update_bytes_pinned, TP_ARGS(fs_info, sinfo, old, diff) ); +DECLARE_EVENT_CLASS(btrfs_raid56_bio, + + TP_PROTO(const struct btrfs_raid_bio *rbio, + const struct bio *bio, + const struct raid56_bio_trace_info *trace_info), + + TP_ARGS(rbio, bio, trace_info), + + TP_STRUCT__entry_btrfs( + __field( u64, full_stripe ) + __field( u64, physical ) + 
__field( u64, devid ) + __field( u32, offset ) + __field( u32, len ) + __field( u8, opf ) + __field( u8, total_stripes ) + __field( u8, real_stripes ) + __field( u8, nr_data ) + __field( u8, stripe_nr ) + ), + + TP_fast_assign_btrfs(rbio->bioc->fs_info, + __entry->full_stripe = rbio->bioc->raid_map[0]; + __entry->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; + __entry->len = bio->bi_iter.bi_size; + __entry->opf = bio_op(bio); + __entry->devid = trace_info->devid; + __entry->offset = trace_info->offset; + __entry->stripe_nr = trace_info->stripe_nr; + __entry->total_stripes = rbio->bioc->num_stripes; + __entry->real_stripes = rbio->real_stripes; + __entry->nr_data = rbio->nr_data; + ), + /* + * For type output, we need to output things like "DATA1" + * (the first data stripe), "DATA2" (the second data stripe), + * "PQ1" (P stripe),"PQ2" (Q stripe), "REPLACE0" (replace target device). + */ + TP_printk_btrfs( +"full_stripe=%llu devid=%lld type=%s%d offset=%d opf=0x%x physical=%llu len=%u", + __entry->full_stripe, __entry->devid, + (__entry->stripe_nr < __entry->nr_data) ? "DATA" : + ((__entry->stripe_nr < __entry->real_stripes) ? "PQ" : + "REPLACE"), + (__entry->stripe_nr < __entry->nr_data) ? + (__entry->stripe_nr + 1) : + ((__entry->stripe_nr < __entry->real_stripes) ? 
+ (__entry->stripe_nr - __entry->nr_data + 1) : 0), + __entry->offset, __entry->opf, __entry->physical, __entry->len) +); + +DEFINE_EVENT(btrfs_raid56_bio, raid56_read_partial, + TP_PROTO(const struct btrfs_raid_bio *rbio, + const struct bio *bio, + const struct raid56_bio_trace_info *trace_info), + + TP_ARGS(rbio, bio, trace_info) +); + +DEFINE_EVENT(btrfs_raid56_bio, raid56_write_stripe, + TP_PROTO(const struct btrfs_raid_bio *rbio, + const struct bio *bio, + const struct raid56_bio_trace_info *trace_info), + + TP_ARGS(rbio, bio, trace_info) +); + + +DEFINE_EVENT(btrfs_raid56_bio, raid56_scrub_write_stripe, + TP_PROTO(const struct btrfs_raid_bio *rbio, + const struct bio *bio, + const struct raid56_bio_trace_info *trace_info), + + TP_ARGS(rbio, bio, trace_info) +); + +DEFINE_EVENT(btrfs_raid56_bio, raid56_scrub_read, + TP_PROTO(const struct btrfs_raid_bio *rbio, + const struct bio *bio, + const struct raid56_bio_trace_info *trace_info), + + TP_ARGS(rbio, bio, trace_info) +); + +DEFINE_EVENT(btrfs_raid56_bio, raid56_scrub_read_recover, + TP_PROTO(const struct btrfs_raid_bio *rbio, + const struct bio *bio, + const struct raid56_bio_trace_info *trace_info), + + TP_ARGS(rbio, bio, trace_info) +); + #endif /* _TRACE_BTRFS_H */ /* This part must be outside protection */ -- cgit v1.2.3-70-g09d2 From d34e123de1e66061051cd19e61b62fad6027fc4a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 May 2022 09:36:36 +0200 Subject: btrfs: defer I/O completion based on the btrfs_raid_bio Instead of attaching an extra allocation an indirect call to each low-level bio issued by the RAID code, add a work_struct to struct btrfs_raid_bio and only defer the per-rbio completion action. The per-bio action for all the I/Os are trivial and can be safely done from interrupt context. 
As a nice side effect this also allows sharing the boilerplate code for the per-bio completions Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/disk-io.c | 12 +++---- fs/btrfs/disk-io.h | 1 - fs/btrfs/raid56.c | 102 +++++++++++++++++++++-------------------------------- fs/btrfs/raid56.h | 2 ++ 5 files changed, 47 insertions(+), 72 deletions(-) (limited to 'fs/btrfs/raid56.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6d4e71f52910..1d5b38f3aa5f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -852,7 +852,7 @@ struct btrfs_fs_info { struct btrfs_workqueue *flush_workers; struct btrfs_workqueue *endio_workers; struct btrfs_workqueue *endio_meta_workers; - struct btrfs_workqueue *endio_raid56_workers; + struct workqueue_struct *endio_raid56_workers; struct workqueue_struct *rmw_workers; struct btrfs_workqueue *endio_meta_write_workers; struct btrfs_workqueue *endio_write_workers; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 018510188a0d..1c9c6c2980dd 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -754,14 +754,10 @@ static void end_workqueue_bio(struct bio *bio) wq = fs_info->endio_meta_write_workers; else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) wq = fs_info->endio_freespace_worker; - else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) - wq = fs_info->endio_raid56_workers; else wq = fs_info->endio_write_workers; } else { - if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) - wq = fs_info->endio_raid56_workers; - else if (end_io_wq->metadata) + if (end_io_wq->metadata) wq = fs_info->endio_meta_workers; else wq = fs_info->endio_workers; @@ -2281,7 +2277,8 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) btrfs_destroy_workqueue(fs_info->hipri_workers); btrfs_destroy_workqueue(fs_info->workers); btrfs_destroy_workqueue(fs_info->endio_workers); - btrfs_destroy_workqueue(fs_info->endio_raid56_workers); + if (fs_info->endio_raid56_workers) + 
destroy_workqueue(fs_info->endio_raid56_workers); if (fs_info->rmw_workers) destroy_workqueue(fs_info->rmw_workers); btrfs_destroy_workqueue(fs_info->endio_write_workers); @@ -2490,8 +2487,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags, max_active, 2); fs_info->endio_raid56_workers = - btrfs_alloc_workqueue(fs_info, "endio-raid56", flags, - max_active, 4); + alloc_workqueue("btrfs-endio-raid56", flags, max_active); fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active); fs_info->endio_write_workers = btrfs_alloc_workqueue(fs_info, "endio-write", flags, diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 4ee8c42c9f78..809ef065f166 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -21,7 +21,6 @@ enum btrfs_wq_endio_type { BTRFS_WQ_ENDIO_DATA, BTRFS_WQ_ENDIO_METADATA, BTRFS_WQ_ENDIO_FREE_SPACE, - BTRFS_WQ_ENDIO_RAID56, }; static inline u64 btrfs_sb_offset(int mirror) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index baba435692d2..00cd9e8db7ae 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1488,15 +1488,7 @@ static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) } } -/* - * end io for the read phase of the rmw cycle. All the bios here are physical - * stripe bios we've read from the disk so we can recalculate the parity of the - * stripe. 
- * - * This will usually kick off finish_rmw once all the bios are read in, but it - * may trigger parity reconstruction if we had any errors along the way - */ -static void raid_rmw_end_io(struct bio *bio) +static void raid56_bio_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; @@ -1507,23 +1499,34 @@ static void raid_rmw_end_io(struct bio *bio) bio_put(bio); - if (!atomic_dec_and_test(&rbio->stripes_pending)) - return; + if (atomic_dec_and_test(&rbio->stripes_pending)) + queue_work(rbio->bioc->fs_info->endio_raid56_workers, + &rbio->end_io_work); +} - if (atomic_read(&rbio->error) > rbio->bioc->max_errors) - goto cleanup; +/* + * End io handler for the read phase of the RMW cycle. All the bios here are + * physical stripe bios we've read from the disk so we can recalculate the + * parity of the stripe. + * + * This will usually kick off finish_rmw once all the bios are read in, but it + * may trigger parity reconstruction if we had any errors along the way + */ +static void raid56_rmw_end_io_work(struct work_struct *work) +{ + struct btrfs_raid_bio *rbio = + container_of(work, struct btrfs_raid_bio, end_io_work); + + if (atomic_read(&rbio->error) > rbio->bioc->max_errors) { + rbio_orig_end_io(rbio, BLK_STS_IOERR); + return; + } /* - * this will normally call finish_rmw to start our write - * but if there are any failed stripes we'll reconstruct - * from parity first + * This will normally call finish_rmw to start our write but if there + * are any failed stripes we'll reconstruct from parity first. */ validate_rbio_for_rmw(rbio); - return; - -cleanup: - - rbio_orig_end_io(rbio, BLK_STS_IOERR); } /* @@ -1598,10 +1601,9 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) * touch it after that. 
*/ atomic_set(&rbio->stripes_pending, bios_to_read); + INIT_WORK(&rbio->end_io_work, raid56_rmw_end_io_work); while ((bio = bio_list_pop(&bio_list))) { - bio->bi_end_io = raid_rmw_end_io; - - btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); + bio->bi_end_io = raid56_bio_end_io; if (trace_raid56_read_partial_enabled()) { struct raid56_bio_trace_info trace_info = { 0 }; @@ -2076,25 +2078,13 @@ cleanup_io: } /* - * This is called only for stripes we've read from disk to - * reconstruct the parity. + * This is called only for stripes we've read from disk to reconstruct the + * parity. */ -static void raid_recover_end_io(struct bio *bio) +static void raid_recover_end_io_work(struct work_struct *work) { - struct btrfs_raid_bio *rbio = bio->bi_private; - - /* - * we only read stripe pages off the disk, set them - * up to date if there were no errors - */ - if (bio->bi_status) - fail_bio_stripe(rbio, bio); - else - set_bio_pages_uptodate(rbio, bio); - bio_put(bio); - - if (!atomic_dec_and_test(&rbio->stripes_pending)) - return; + struct btrfs_raid_bio *rbio = + container_of(work, struct btrfs_raid_bio, end_io_work); if (atomic_read(&rbio->error) > rbio->bioc->max_errors) rbio_orig_end_io(rbio, BLK_STS_IOERR); @@ -2177,10 +2167,9 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) * touch it after that. 
*/ atomic_set(&rbio->stripes_pending, bios_to_read); + INIT_WORK(&rbio->end_io_work, raid_recover_end_io_work); while ((bio = bio_list_pop(&bio_list))) { - bio->bi_end_io = raid_recover_end_io; - - btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); + bio->bi_end_io = raid56_bio_end_io; if (trace_raid56_scrub_read_recover_enabled()) { struct raid56_bio_trace_info trace_info = { 0 }; @@ -2650,24 +2639,14 @@ cleanup: * This will usually kick off finish_rmw once all the bios are read in, but it * may trigger parity reconstruction if we had any errors along the way */ -static void raid56_parity_scrub_end_io(struct bio *bio) +static void raid56_parity_scrub_end_io_work(struct work_struct *work) { - struct btrfs_raid_bio *rbio = bio->bi_private; - - if (bio->bi_status) - fail_bio_stripe(rbio, bio); - else - set_bio_pages_uptodate(rbio, bio); - - bio_put(bio); - - if (!atomic_dec_and_test(&rbio->stripes_pending)) - return; + struct btrfs_raid_bio *rbio = + container_of(work, struct btrfs_raid_bio, end_io_work); /* - * this will normally call finish_rmw to start our write - * but if there are any failed stripes we'll reconstruct - * from parity first + * This will normally call finish_rmw to start our write, but if there + * are any failed stripes we'll reconstruct from parity first */ validate_rbio_for_parity_scrub(rbio); } @@ -2737,10 +2716,9 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) * touch it after that. 
*/ atomic_set(&rbio->stripes_pending, bios_to_read); + INIT_WORK(&rbio->end_io_work, raid56_parity_scrub_end_io_work); while ((bio = bio_list_pop(&bio_list))) { - bio->bi_end_io = raid56_parity_scrub_end_io; - - btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); + bio->bi_end_io = raid56_bio_end_io; if (trace_raid56_scrub_read_enabled()) { struct raid56_bio_trace_info trace_info = { 0 }; diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 3badde24dcbf..3b22657ca857 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -100,6 +100,8 @@ struct btrfs_raid_bio { atomic_t error; + struct work_struct end_io_work; + /* Bitmap to record which horizontal stripe has data */ unsigned long dbitmap; -- cgit v1.2.3-70-g09d2 From 5eecef7108350f6506a55d8fd9508ea32caeecad Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 3 Jun 2022 08:57:42 +0200 Subject: btrfs: stop looking at btrfs_bio->iter in index_one_bio All the bios that index_one_bio operates on are the bios submitted by the upper layer. These are never resubmitted to an actual device by the raid56 code, and thus the iter never changes from the initial state. Thus we can always just use bi_iter directly as it will be the same as the saved copy. 
Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs/btrfs/raid56.c') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 00cd9e8db7ae..3c5886977937 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1106,9 +1106,6 @@ static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio) u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - rbio->bioc->raid_map[0]; - if (bio_flagged(bio, BIO_CLONED)) - bio->bi_iter = btrfs_bio(bio)->iter; - bio_for_each_segment(bvec, bio, iter) { u32 bvec_offset; -- cgit v1.2.3-70-g09d2 From 369200446564f04b5cd14596d69e17880be7d926 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 2 Jun 2022 15:51:18 +0800 Subject: btrfs: raid56: avoid double for loop inside finish_rmw() We can easily calculate the stripe number and sector number inside the stripe. Thus there is not much need for a double for loop. For the only case we want to skip the whole stripe, we can manually increase @total_sector_nr. This is not a recommended behavior, thus every time the iterator gets modified there will be a comment along with an ASSERT() for it. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 97 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 55 insertions(+), 42 deletions(-) (limited to 'fs/btrfs/raid56.c') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 3c5886977937..c63845c036df 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1182,7 +1182,10 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) const u32 sectorsize = bioc->fs_info->sectorsize; void **pointers = rbio->finish_pointers; int nr_data = rbio->nr_data; + /* The total sector number inside the full stripe. */ + int total_sector_nr; int stripe; + /* Sector number inside a stripe. 
*/ int sectornr; bool has_qstripe; struct bio_list bio_list; @@ -1267,63 +1270,73 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) } /* - * time to start writing. Make bios for everything from the - * higher layers (the bio_list in our rbio) and our p/q. Ignore - * everything else. + * Start writing. Make bios for everything from the higher layers (the + * bio_list in our rbio) and our P/Q. Ignore everything else. */ - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { - struct sector_ptr *sector; + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + struct sector_ptr *sector; - /* This vertical stripe has no data, skip it. */ - if (!test_bit(sectornr, &rbio->dbitmap)) - continue; + stripe = total_sector_nr / rbio->stripe_nsectors; + sectornr = total_sector_nr % rbio->stripe_nsectors; - if (stripe < rbio->nr_data) { - sector = sector_in_rbio(rbio, stripe, sectornr, 1); - if (!sector) - continue; - } else { - sector = rbio_stripe_sector(rbio, stripe, sectornr); - } + /* This vertical stripe has no data, skip it. 
*/ + if (!test_bit(sectornr, &rbio->dbitmap)) + continue; - ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, - sectornr, rbio->stripe_len, - REQ_OP_WRITE); - if (ret) - goto cleanup; + if (stripe < rbio->nr_data) { + sector = sector_in_rbio(rbio, stripe, sectornr, 1); + if (!sector) + continue; + } else { + sector = rbio_stripe_sector(rbio, stripe, sectornr); } + + ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, + sectornr, rbio->stripe_len, + REQ_OP_WRITE); + if (ret) + goto cleanup; } if (likely(!bioc->num_tgtdevs)) goto write_data; - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - if (!bioc->tgtdev_map[stripe]) - continue; + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + struct sector_ptr *sector; - for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { - struct sector_ptr *sector; + stripe = total_sector_nr / rbio->stripe_nsectors; + sectornr = total_sector_nr % rbio->stripe_nsectors; - /* This vertical stripe has no data, skip it. */ - if (!test_bit(sectornr, &rbio->dbitmap)) - continue; + if (!bioc->tgtdev_map[stripe]) { + /* + * We can skip the whole stripe completely, note + * total_sector_nr will be increased by one anyway. + */ + ASSERT(sectornr == 0); + total_sector_nr += rbio->stripe_nsectors - 1; + continue; + } - if (stripe < rbio->nr_data) { - sector = sector_in_rbio(rbio, stripe, sectornr, 1); - if (!sector) - continue; - } else { - sector = rbio_stripe_sector(rbio, stripe, sectornr); - } + /* This vertical stripe has no data, skip it. 
*/ + if (!test_bit(sectornr, &rbio->dbitmap)) + continue; - ret = rbio_add_io_sector(rbio, &bio_list, sector, - rbio->bioc->tgtdev_map[stripe], - sectornr, rbio->stripe_len, - REQ_OP_WRITE); - if (ret) - goto cleanup; + if (stripe < rbio->nr_data) { + sector = sector_in_rbio(rbio, stripe, sectornr, 1); + if (!sector) + continue; + } else { + sector = rbio_stripe_sector(rbio, stripe, sectornr); } + + ret = rbio_add_io_sector(rbio, &bio_list, sector, + rbio->bioc->tgtdev_map[stripe], + sectornr, rbio->stripe_len, + REQ_OP_WRITE); + if (ret) + goto cleanup; } write_data: -- cgit v1.2.3-70-g09d2 From ef340fccbe982a14588ed15eb3a20b2e5b78a3e1 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 2 Jun 2022 15:51:19 +0800 Subject: btrfs: raid56: avoid double for loop inside __raid56_parity_recover() The double for loop can be easily converted to single for loop as we're really iterating the sectors in their bytenr order. The only exception is the full stripe skip, however that can also easily be done inside the loop. Add an ASSERT() along with a comment for that specific case. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) (limited to 'fs/btrfs/raid56.c') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index c63845c036df..ae4556b98060 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -2115,8 +2115,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) int bios_to_read = 0; struct bio_list bio_list; int ret; - int sectornr; - int stripe; + int total_sector_nr; struct bio *bio; bio_list_init(&bio_list); @@ -2132,29 +2131,29 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) * stripe cache, it is possible that some or all of these * pages are going to be uptodate. 
*/ - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + int stripe = total_sector_nr / rbio->stripe_nsectors; + int sectornr = total_sector_nr % rbio->stripe_nsectors; + struct sector_ptr *sector; + if (rbio->faila == stripe || rbio->failb == stripe) { atomic_inc(&rbio->error); + /* Skip the current stripe. */ + ASSERT(sectornr == 0); + total_sector_nr += rbio->stripe_nsectors - 1; continue; } + /* The RMW code may have already read this page in. */ + sector = rbio_stripe_sector(rbio, stripe, sectornr); + if (sector->uptodate) + continue; - for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { - struct sector_ptr *sector; - - /* - * the rmw code may have already read this - * page in - */ - sector = rbio_stripe_sector(rbio, stripe, sectornr); - if (sector->uptodate) - continue; - - ret = rbio_add_io_sector(rbio, &bio_list, sector, - stripe, sectornr, rbio->stripe_len, - REQ_OP_READ); - if (ret < 0) - goto cleanup; - } + ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, + sectornr, rbio->stripe_len, + REQ_OP_READ); + if (ret < 0) + goto cleanup; } bios_to_read = bio_list_size(&bio_list); -- cgit v1.2.3-70-g09d2 From aee35e4bcc3029fc3be15f696043107125b7209f Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 8 Jun 2022 08:34:34 +0800 Subject: btrfs: raid56: avoid double for loop inside alloc_rbio_essential_pages() The double loop is just checking if the page for the vertical stripe is allocated. We can easily convert it to single loop and get rid of @stripe variable. 
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) (limited to 'fs/btrfs/raid56.c') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index ae4556b98060..41cdeff63a6b 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -2380,23 +2380,22 @@ void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) { const u32 sectorsize = rbio->bioc->fs_info->sectorsize; - int stripe; - int sectornr; - - for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - struct page *page; - int index = (stripe * rbio->stripe_nsectors + sectornr) * - sectorsize >> PAGE_SHIFT; + int total_sector_nr; - if (rbio->stripe_pages[index]) - continue; + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + struct page *page; + int sectornr = total_sector_nr % rbio->stripe_nsectors; + int index = (total_sector_nr * sectorsize) >> PAGE_SHIFT; - page = alloc_page(GFP_NOFS); - if (!page) - return -ENOMEM; - rbio->stripe_pages[index] = page; - } + if (!test_bit(sectornr, &rbio->dbitmap)) + continue; + if (rbio->stripe_pages[index]) + continue; + page = alloc_page(GFP_NOFS); + if (!page) + return -ENOMEM; + rbio->stripe_pages[index] = page; } index_stripe_sectors(rbio); return 0; -- cgit v1.2.3-70-g09d2 From 550cdeb3e09808540454012ddf896dae466d8822 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 8 Jun 2022 08:34:35 +0800 Subject: btrfs: raid56: avoid double for loop inside raid56_rmw_stripe() This function doesn't even utilize full stripe skip, just iterate all the data sectors is definitely enough. 
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 59 ++++++++++++++++++++++++++----------------------------- 1 file changed, 28 insertions(+), 31 deletions(-) (limited to 'fs/btrfs/raid56.c') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 41cdeff63a6b..7ddcac96e844 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1547,9 +1547,9 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) { int bios_to_read = 0; struct bio_list bio_list; + const int nr_data_sectors = rbio->stripe_nsectors * rbio->nr_data; int ret; - int sectornr; - int stripe; + int total_sector_nr; struct bio *bio; bio_list_init(&bio_list); @@ -1561,38 +1561,35 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) index_rbio_pages(rbio); atomic_set(&rbio->error, 0); - /* - * build a list of bios to read all the missing parts of this - * stripe - */ - for (stripe = 0; stripe < rbio->nr_data; stripe++) { - for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { - struct sector_ptr *sector; + /* Build a list of bios to read all the missing data sectors. */ + for (total_sector_nr = 0; total_sector_nr < nr_data_sectors; + total_sector_nr++) { + struct sector_ptr *sector; + int stripe = total_sector_nr / rbio->stripe_nsectors; + int sectornr = total_sector_nr % rbio->stripe_nsectors; - /* - * We want to find all the sectors missing from the - * rbio and read them from the disk. If * sector_in_rbio() - * finds a page in the bio list we don't need to read - * it off the stripe. - */ - sector = sector_in_rbio(rbio, stripe, sectornr, 1); - if (sector) - continue; + /* + * We want to find all the sectors missing from the rbio and + * read them from the disk. If sector_in_rbio() finds a page + * in the bio list we don't need to read it off the stripe. 
+ */ + sector = sector_in_rbio(rbio, stripe, sectornr, 1); + if (sector) + continue; - sector = rbio_stripe_sector(rbio, stripe, sectornr); - /* - * The bio cache may have handed us an uptodate page. - * If so, be happy and use it. - */ - if (sector->uptodate) - continue; + sector = rbio_stripe_sector(rbio, stripe, sectornr); + /* + * The bio cache may have handed us an uptodate page. If so, + * use it. + */ + if (sector->uptodate) + continue; - ret = rbio_add_io_sector(rbio, &bio_list, sector, - stripe, sectornr, rbio->stripe_len, - REQ_OP_READ); - if (ret) - goto cleanup; - } + ret = rbio_add_io_sector(rbio, &bio_list, sector, + stripe, sectornr, rbio->stripe_len, + REQ_OP_READ); + if (ret) + goto cleanup; } bios_to_read = bio_list_size(&bio_list); -- cgit v1.2.3-70-g09d2 From 1c10702e7cb9ddecdcf032f83dad7a3583689a8e Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 8 Jun 2022 08:34:36 +0800 Subject: btrfs: raid56: avoid double for loop inside raid56_parity_scrub_stripe() Originally it's iterating all the sectors which has dbitmap sector for the vertical stripe. It can be easily converted to sector bytenr iteration with an test_bit() call. 
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 62 +++++++++++++++++++++++++++---------------------------- 1 file changed, 31 insertions(+), 31 deletions(-) (limited to 'fs/btrfs/raid56.c') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 7ddcac96e844..f002334d244a 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -2661,8 +2661,7 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) int bios_to_read = 0; struct bio_list bio_list; int ret; - int sectornr; - int stripe; + int total_sector_nr; struct bio *bio; bio_list_init(&bio_list); @@ -2672,37 +2671,38 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) goto cleanup; atomic_set(&rbio->error, 0); - /* - * build a list of bios to read all the missing parts of this - * stripe - */ - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { - struct sector_ptr *sector; - /* - * We want to find all the sectors missing from the - * rbio and read them from the disk. If * sector_in_rbio() - * finds a sector in the bio list we don't need to read - * it off the stripe. - */ - sector = sector_in_rbio(rbio, stripe, sectornr, 1); - if (sector) - continue; + /* Build a list of bios to read all the missing parts. */ + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + int sectornr = total_sector_nr % rbio->stripe_nsectors; + int stripe = total_sector_nr / rbio->stripe_nsectors; + struct sector_ptr *sector; - sector = rbio_stripe_sector(rbio, stripe, sectornr); - /* - * The bio cache may have handed us an uptodate sector. - * If so, be happy and use it. - */ - if (sector->uptodate) - continue; + /* No data in the vertical stripe, no need to read. 
*/ + if (!test_bit(sectornr, &rbio->dbitmap)) + continue; - ret = rbio_add_io_sector(rbio, &bio_list, sector, - stripe, sectornr, rbio->stripe_len, - REQ_OP_READ); - if (ret) - goto cleanup; - } + /* + * We want to find all the sectors missing from the rbio and + * read them from the disk. If sector_in_rbio() finds a sector + * in the bio list we don't need to read it off the stripe. + */ + sector = sector_in_rbio(rbio, stripe, sectornr, 1); + if (sector) + continue; + + sector = rbio_stripe_sector(rbio, stripe, sectornr); + /* + * The bio cache may have handed us an uptodate sector. If so, + * use it. + */ + if (sector->uptodate) + continue; + + ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, + sectornr, rbio->stripe_len, REQ_OP_READ); + if (ret) + goto cleanup; } bios_to_read = bio_list_size(&bio_list); -- cgit v1.2.3-70-g09d2 From 0b30f719451ebbf313cdb444a27b00c10cf6e8a5 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 13 May 2022 16:34:30 +0800 Subject: btrfs: use btrfs_raid_array to calculate number of parity stripes Use the raid table instead of hard coded values and rename the helper as it is exported. This could make later extension on RAID56 based profiles easier. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 10 ++-------- fs/btrfs/raid56.h | 12 +----------- fs/btrfs/volumes.c | 7 +++++++ fs/btrfs/volumes.h | 1 + 4 files changed, 11 insertions(+), 19 deletions(-) (limited to 'fs/btrfs/raid56.c') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index f002334d244a..0f0368e63e5a 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -922,7 +922,6 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, const unsigned int stripe_nsectors = stripe_len >> fs_info->sectorsize_bits; const unsigned int num_sectors = stripe_nsectors * real_stripes; struct btrfs_raid_bio *rbio; - int nr_data = 0; void *p; ASSERT(IS_ALIGNED(stripe_len, PAGE_SIZE)); @@ -976,14 +975,9 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, CONSUME_ALLOC(rbio->finish_pointers, real_stripes); #undef CONSUME_ALLOC - if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5) - nr_data = real_stripes - 1; - else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) - nr_data = real_stripes - 2; - else - BUG(); + ASSERT(btrfs_nr_parity_stripes(bioc->map_type)); + rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type); - rbio->nr_data = nr_data; return rbio; } diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 3b22657ca857..c73bceb2b461 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -155,19 +155,9 @@ struct raid56_bio_trace_info { u8 stripe_nr; }; -static inline int nr_parity_stripes(const struct map_lookup *map) -{ - if (map->type & BTRFS_BLOCK_GROUP_RAID5) - return 1; - else if (map->type & BTRFS_BLOCK_GROUP_RAID6) - return 2; - else - return 0; -} - static inline int nr_data_stripes(const struct map_lookup *map) { - return map->num_stripes - nr_parity_stripes(map); + return map->num_stripes - btrfs_nr_parity_stripes(map->type); } #define RAID5_P_STRIPE ((u64)-2) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c 
index 75a59423a1bf..e40c0d59c4a0 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -182,6 +182,13 @@ const char *btrfs_bg_type_to_raid_name(u64 flags) return btrfs_raid_array[index].raid_name; } +int btrfs_nr_parity_stripes(u64 type) +{ + enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(type); + + return btrfs_raid_array[index].nparity; +} + /* * Fill @buf with textual description of @bg_flags, no more than @size_buf * bytes including terminating null byte. diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index f19916a69bea..b61508723d5d 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -634,6 +634,7 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, u64 logical); u64 btrfs_calc_stripe_length(const struct extent_map *em); +int btrfs_nr_parity_stripes(u64 type); int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, struct btrfs_block_group *bg); int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset); -- cgit v1.2.3-70-g09d2 From f6065f8edeb25f4a9dfe0b446030ad995a84a088 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 9 Jun 2022 13:18:44 +0800 Subject: btrfs: raid56: don't trust any cached sector in __raid56_parity_recover() [BUG] There is a small workload which will always fail with recent kernel: (A simplified version from btrfs/125 test case) mkfs.btrfs -f -m raid5 -d raid5 -b 1G $dev1 $dev2 $dev3 mount $dev1 $mnt xfs_io -f -c "pwrite -S 0xee 0 1M" $mnt/file1 sync umount $mnt btrfs dev scan -u $dev3 mount -o degraded $dev1 $mnt xfs_io -f -c "pwrite -S 0xff 0 128M" $mnt/file2 umount $mnt btrfs dev scan mount $dev1 $mnt btrfs balance start --full-balance $mnt umount $mnt The failure is always failed to read some tree blocks: BTRFS info (device dm-4): relocating block group 217710592 flags data|raid5 BTRFS error (device dm-4): parent transid verify failed on 38993920 wanted 9 found 7 BTRFS error (device dm-4): parent 
transid verify failed on 38993920 wanted 9 found 7 ... [CAUSE] With the recently added debug output, we can see all RAID56 operations related to full stripe 38928384: 56.1183: raid56_read_partial: full_stripe=38928384 devid=2 type=DATA1 offset=0 opf=0x0 physical=9502720 len=65536 56.1185: raid56_read_partial: full_stripe=38928384 devid=3 type=DATA2 offset=16384 opf=0x0 physical=9519104 len=16384 56.1185: raid56_read_partial: full_stripe=38928384 devid=3 type=DATA2 offset=49152 opf=0x0 physical=9551872 len=16384 56.1187: raid56_write_stripe: full_stripe=38928384 devid=3 type=DATA2 offset=0 opf=0x1 physical=9502720 len=16384 56.1188: raid56_write_stripe: full_stripe=38928384 devid=3 type=DATA2 offset=32768 opf=0x1 physical=9535488 len=16384 56.1188: raid56_write_stripe: full_stripe=38928384 devid=1 type=PQ1 offset=0 opf=0x1 physical=30474240 len=16384 56.1189: raid56_write_stripe: full_stripe=38928384 devid=1 type=PQ1 offset=32768 opf=0x1 physical=30507008 len=16384 56.1218: raid56_write_stripe: full_stripe=38928384 devid=3 type=DATA2 offset=49152 opf=0x1 physical=9551872 len=16384 56.1219: raid56_write_stripe: full_stripe=38928384 devid=1 type=PQ1 offset=49152 opf=0x1 physical=30523392 len=16384 56.2721: raid56_parity_recover: full stripe=38928384 eb=39010304 mirror=2 56.2723: raid56_parity_recover: full stripe=38928384 eb=39010304 mirror=2 56.2724: raid56_parity_recover: full stripe=38928384 eb=39010304 mirror=2 Before we enter raid56_parity_recover(), we have triggered some metadata write for the full stripe 38928384, this leads to us to read all the sectors from disk. Furthermore, btrfs raid56 write will cache its calculated P/Q sectors to avoid unnecessary read. This means, for that full stripe, after any partial write, we will have stale data, along with P/Q calculated using that stale data. Thankfully due to patch "btrfs: only write the sectors in the vertical stripe which has data stripes" we haven't submitted all the corrupted P/Q to disk. 
When we really need to recover a certain range, aka in raid56_parity_recover(), we will use the cached rbio, along with its cached sectors (the full stripe is all cached). This explains why we have no raid56_scrub_read_recover() event triggered. Since we have the cached P/Q which is calculated using the stale data, the recovered one will just be stale. In our particular test case, it will always return the same incorrect metadata, thus causing the same error message "parent transid verify failed on 39010304 wanted 9 found 7" again and again. [BTRFS DESTRUCTIVE RMW PROBLEM] Test case btrfs/125 (and the above workload) always has its trouble with the destructive read-modify-write (RMW) cycle: 0 32K 64K Data1: | Good | Good | Data2: | Bad | Bad | Parity: | Good | Good | In the above case, if we trigger any write into Data1, we will use the bad data in Data2 to re-generate parity, killing the only chance to recover Data2, thus Data2 is lost forever. This destructive RMW cycle is not specific to btrfs RAID56, but there are some btrfs specific behaviors making the case even worse: - Btrfs will cache sectors for unrelated vertical stripes. In the above example, if we're only writing into the 0~32K range, btrfs will still read data range (32K ~ 64K) of Data1, and (64K~128K) of Data2. This behavior is to cache sectors for later update. Incidentally commit d4e28d9b5f04 ("btrfs: raid56: make steal_rbio() subpage compatible") has a bug which makes RAID56 never trust the cached sectors, thus slightly improving the situation for recovery. Unfortunately, follow up fix "btrfs: update stripe_sectors::uptodate in steal_rbio" will revert the behavior back to the old one. - Btrfs raid56 partial write will update all P/Q sectors and cache them. This means that even if data at (64K ~ 96K) of Data2 is free space, and only (96K ~ 128K) of Data2 is really stale data, when we write into that (96K ~ 128K), we will update all the parity sectors for the full stripe.
This unnecessary behavior will completely kill the chance of recovery. Thankfully, an unrelated optimization "btrfs: only write the sectors in the vertical stripe which has data stripes" will prevent submitting the write bio for untouched vertical sectors. That optimization will keep the on-disk P/Q untouched for a chance for later recovery. [FIX] Although we have no good way to completely fix the destructive RMW (unless we go full scrub for each partial write), we can still limit the damage. With patch "btrfs: only write the sectors in the vertical stripe which has data stripes" now we won't really submit the P/Q of unrelated vertical stripes, so the on-disk P/Q should still be fine. Now all we really need to do is just drop all the cached sectors when doing recovery. By this, we have a chance to read the original P/Q from disk, and have a chance to recover the stale data, while still keeping the cache to speed up the regular write path. In fact, just dropping all the cache for the recovery path is good enough to allow the test case btrfs/125 along with the small script to pass reliably. The lack of metadata write after the degraded mount, and forced metadata COW is saving us this time. So this patch will fix the behavior by not trusting any cache in __raid56_parity_recover(), to solve the problem while still keeping the cache useful. But please note that this test pass DOES NOT mean we have solved the destructive RMW problem; we just do damage control a little better.
Related patches: - btrfs: only write the sectors in the vertical stripe - d4e28d9b5f04 ("btrfs: raid56: make steal_rbio() subpage compatible") - btrfs: update stripe_sectors::uptodate in steal_rbio Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'fs/btrfs/raid56.c') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 0f0368e63e5a..c6411c849fea 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -2118,9 +2118,12 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) atomic_set(&rbio->error, 0); /* - * read everything that hasn't failed. Thanks to the - * stripe cache, it is possible that some or all of these - * pages are going to be uptodate. + * Read everything that hasn't failed. However this time we will + * not trust any cached sector. + * As we may read out some stale data but higher layer is not reading + * that stale part. + * + * So here we always re-read everything in recovery path. */ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { @@ -2135,11 +2138,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) total_sector_nr += rbio->stripe_nsectors - 1; continue; } - /* The RMW code may have already read this page in. */ sector = rbio_stripe_sector(rbio, stripe, sectornr); - if (sector->uptodate) - continue; - ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, sectornr, rbio->stripe_len, REQ_OP_READ); -- cgit v1.2.3-70-g09d2 From ff18a4afebdd9b4441983a777b88095250e9de1d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Jun 2022 12:04:05 +0200 Subject: btrfs: raid56: use fixed stripe length everywhere The raid56 code assumes a fixed stripe length BTRFS_STRIPE_LEN but there are functions passing it as arguments, this is not necessary. 
The fixed value has been used for a long time and though the stripe length should be configurable by super block member stripesize, this hasn't been implemented and would require more changes so we don't need to keep this code around until then. Partially based on a patch from Qu Wenruo. Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: Christoph Hellwig [ update changelog ] Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 61 ++++++++++++++++++++++++------------------------------ fs/btrfs/raid56.h | 12 ++++------- fs/btrfs/scrub.c | 9 +++----- fs/btrfs/volumes.c | 13 +++++------- 4 files changed, 39 insertions(+), 56 deletions(-) (limited to 'fs/btrfs/raid56.c') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index c6411c849fea..f4d3200a14dc 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -474,9 +474,9 @@ static int rbio_is_full(struct btrfs_raid_bio *rbio) int ret = 1; spin_lock_irqsave(&rbio->bio_list_lock, flags); - if (size != rbio->nr_data * rbio->stripe_len) + if (size != rbio->nr_data * BTRFS_STRIPE_LEN) ret = 0; - BUG_ON(size > rbio->nr_data * rbio->stripe_len); + BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN); spin_unlock_irqrestore(&rbio->bio_list_lock, flags); return ret; @@ -913,18 +913,17 @@ static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio, * this does not allocate any pages for rbio->pages. 
*/ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, - struct btrfs_io_context *bioc, - u32 stripe_len) + struct btrfs_io_context *bioc) { const unsigned int real_stripes = bioc->num_stripes - bioc->num_tgtdevs; - const unsigned int stripe_npages = stripe_len >> PAGE_SHIFT; + const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT; const unsigned int num_pages = stripe_npages * real_stripes; - const unsigned int stripe_nsectors = stripe_len >> fs_info->sectorsize_bits; + const unsigned int stripe_nsectors = + BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits; const unsigned int num_sectors = stripe_nsectors * real_stripes; struct btrfs_raid_bio *rbio; void *p; - ASSERT(IS_ALIGNED(stripe_len, PAGE_SIZE)); /* PAGE_SIZE must also be aligned to sectorsize for subpage support */ ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize)); /* @@ -948,7 +947,6 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, INIT_LIST_HEAD(&rbio->stripe_cache); INIT_LIST_HEAD(&rbio->hash_list); rbio->bioc = bioc; - rbio->stripe_len = stripe_len; rbio->nr_pages = num_pages; rbio->nr_sectors = num_sectors; rbio->real_stripes = real_stripes; @@ -1020,7 +1018,6 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, struct sector_ptr *sector, unsigned int stripe_nr, unsigned int sector_nr, - unsigned long bio_max_len, unsigned int opf) { const u32 sectorsize = rbio->bioc->fs_info->sectorsize; @@ -1065,7 +1062,8 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, } /* put a new bio on the list */ - bio = bio_alloc(stripe->dev->bdev, max(bio_max_len >> PAGE_SHIFT, 1UL), + bio = bio_alloc(stripe->dev->bdev, + max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1), opf, GFP_NOFS); bio->bi_iter.bi_sector = disk_start >> 9; bio->bi_private = rbio; @@ -1287,8 +1285,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) } ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, - sectornr, rbio->stripe_len, - REQ_OP_WRITE); + sectornr, 
REQ_OP_WRITE); if (ret) goto cleanup; } @@ -1327,8 +1324,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->bioc->tgtdev_map[stripe], - sectornr, rbio->stripe_len, - REQ_OP_WRITE); + sectornr, REQ_OP_WRITE); if (ret) goto cleanup; } @@ -1373,7 +1369,7 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio, for (i = 0; i < rbio->bioc->num_stripes; i++) { stripe = &rbio->bioc->stripes[i]; - if (in_range(physical, stripe->physical, rbio->stripe_len) && + if (in_range(physical, stripe->physical, BTRFS_STRIPE_LEN) && stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) { return i; } @@ -1395,7 +1391,7 @@ static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, for (i = 0; i < rbio->nr_data; i++) { u64 stripe_start = rbio->bioc->raid_map[i]; - if (in_range(logical, stripe_start, rbio->stripe_len)) + if (in_range(logical, stripe_start, BTRFS_STRIPE_LEN)) return i; } return -1; @@ -1580,8 +1576,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) continue; ret = rbio_add_io_sector(rbio, &bio_list, sector, - stripe, sectornr, rbio->stripe_len, - REQ_OP_READ); + stripe, sectornr, REQ_OP_READ); if (ret) goto cleanup; } @@ -1790,7 +1785,7 @@ static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio) ASSERT(orig_logical >= full_stripe_start && orig_logical + orig_len <= full_stripe_start + - rbio->nr_data * rbio->stripe_len); + rbio->nr_data * BTRFS_STRIPE_LEN); bio_list_add(&rbio->bio_list, orig_bio); rbio->bio_list_bytes += orig_bio->bi_iter.bi_size; @@ -1808,7 +1803,7 @@ static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio) /* * our main entry point for writes from the rest of the FS. 
*/ -int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stripe_len) +int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) { struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; @@ -1816,7 +1811,7 @@ int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stri struct blk_plug_cb *cb; int ret; - rbio = alloc_rbio(fs_info, bioc, stripe_len); + rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) { btrfs_put_bioc(bioc); return PTR_ERR(rbio); @@ -2140,8 +2135,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) } sector = rbio_stripe_sector(rbio, stripe, sectornr); ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, - sectornr, rbio->stripe_len, - REQ_OP_READ); + sectornr, REQ_OP_READ); if (ret < 0) goto cleanup; } @@ -2199,7 +2193,7 @@ cleanup: * of the drive. */ int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, - u32 stripe_len, int mirror_num, int generic_io) + int mirror_num, int generic_io) { struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; @@ -2210,7 +2204,7 @@ int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, btrfs_bio(bio)->mirror_num = mirror_num; } - rbio = alloc_rbio(fs_info, bioc, stripe_len); + rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) { if (generic_io) btrfs_put_bioc(bioc); @@ -2304,14 +2298,14 @@ static void read_rebuild_work(struct work_struct *work) struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, struct btrfs_io_context *bioc, - u32 stripe_len, struct btrfs_device *scrub_dev, + struct btrfs_device *scrub_dev, unsigned long *dbitmap, int stripe_nsectors) { struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; int i; - rbio = alloc_rbio(fs_info, bioc, stripe_len); + rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) return NULL; bio_list_add(&rbio->bio_list, bio); @@ -2356,7 +2350,7 @@ void raid56_add_scrub_pages(struct 
btrfs_raid_bio *rbio, struct page *page, ASSERT(logical >= rbio->bioc->raid_map[0]); ASSERT(logical + sectorsize <= rbio->bioc->raid_map[0] + - rbio->stripe_len * rbio->nr_data); + BTRFS_STRIPE_LEN * rbio->nr_data); stripe_offset = (int)(logical - rbio->bioc->raid_map[0]); index = stripe_offset / sectorsize; rbio->bio_sectors[index].page = page; @@ -2512,7 +2506,7 @@ writeback: sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp, - sectornr, rbio->stripe_len, REQ_OP_WRITE); + sectornr, REQ_OP_WRITE); if (ret) goto cleanup; } @@ -2526,7 +2520,7 @@ writeback: sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); ret = rbio_add_io_sector(rbio, &bio_list, sector, bioc->tgtdev_map[rbio->scrubp], - sectornr, rbio->stripe_len, REQ_OP_WRITE); + sectornr, REQ_OP_WRITE); if (ret) goto cleanup; } @@ -2693,7 +2687,7 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) continue; ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, - sectornr, rbio->stripe_len, REQ_OP_READ); + sectornr, REQ_OP_READ); if (ret) goto cleanup; } @@ -2758,13 +2752,12 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) /* The following code is used for dev replace of a missing RAID 5/6 device. 
*/ struct btrfs_raid_bio * -raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc, - u64 length) +raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc) { struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; - rbio = alloc_rbio(fs_info, bioc, length); + rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) return NULL; diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index c73bceb2b461..1dce205b79bf 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -56,9 +56,6 @@ struct btrfs_raid_bio { */ enum btrfs_rbio_ops operation; - /* Size of each individual stripe on disk */ - u32 stripe_len; - /* How many pages there are for the full stripe including P/Q */ u16 nr_pages; @@ -169,21 +166,20 @@ static inline int nr_data_stripes(const struct map_lookup *map) struct btrfs_device; int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, - u32 stripe_len, int mirror_num, int generic_io); -int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stripe_len); + int mirror_num, int generic_io); +int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc); void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, unsigned int pgoff, u64 logical); struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, - struct btrfs_io_context *bioc, u32 stripe_len, + struct btrfs_io_context *bioc, struct btrfs_device *scrub_dev, unsigned long *dbitmap, int stripe_nsectors); void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio); struct btrfs_raid_bio * -raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc, - u64 length); +raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc); void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio); int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a0c45e92bd6c..ad7958d18158 100644 --- 
a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1216,7 +1216,6 @@ static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc) static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type, u64 *raid_map, - u64 mapped_length, int nstripes, int mirror, int *stripe_index, u64 *stripe_offset) @@ -1231,7 +1230,7 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type, continue; if (logical >= raid_map[i] && - logical < raid_map[i] + mapped_length) + logical < raid_map[i] + BTRFS_STRIPE_LEN) break; } @@ -1335,7 +1334,6 @@ leave_nomem: scrub_stripe_index_and_offset(logical, bioc->map_type, bioc->raid_map, - mapped_length, bioc->num_stripes - bioc->num_tgtdevs, mirror_index, @@ -1387,7 +1385,6 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, mirror_num = sector->sblock->sectors[0]->mirror_num; ret = raid56_parity_recover(bio, sector->recover->bioc, - sector->recover->map_length, mirror_num, 0); if (ret) return ret; @@ -2195,7 +2192,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) bio->bi_private = sblock; bio->bi_end_io = scrub_missing_raid56_end_io; - rbio = raid56_alloc_missing_rbio(bio, bioc, length); + rbio = raid56_alloc_missing_rbio(bio, bioc); if (!rbio) goto rbio_out; @@ -2829,7 +2826,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) bio->bi_private = sparity; bio->bi_end_io = scrub_parity_bio_endio; - rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, length, + rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, sparity->scrub_dev, &sparity->dbitmap, sparity->nsectors); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2d788a351c1f..36a5466266c4 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6461,6 +6461,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, } } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + ASSERT(map->stripe_len == BTRFS_STRIPE_LEN); if (need_raid_map && (need_full_stripe(op) || mirror_num > 
1)) { /* push stripe_nr back to the start of the full stripe */ stripe_nr = div64_u64(raid56_full_stripe_start, @@ -6758,14 +6759,10 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) { - /* In this case, map_length has been set to the length of - a single stripe; not the whole write */ - if (btrfs_op(bio) == BTRFS_MAP_WRITE) { - ret = raid56_parity_write(bio, bioc, map_length); - } else { - ret = raid56_parity_recover(bio, bioc, map_length, - mirror_num, 1); - } + if (btrfs_op(bio) == BTRFS_MAP_WRITE) + ret = raid56_parity_write(bio, bioc); + else + ret = raid56_parity_recover(bio, bioc, mirror_num, 1); goto out_dec; } -- cgit v1.2.3-70-g09d2 From 31683f4aae4def0ecf07c77b5440833cd686bc7a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Jun 2022 12:04:08 +0200 Subject: btrfs: do not return errors from raid56_parity_write Always consume the bio and call the end_io handler on error instead of returning an error and letting the caller handle it. This matches what the block layer submission does and avoids any confusion on who needs to handle errors. Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 23 +++++++++++++++-------- fs/btrfs/raid56.h | 2 +- fs/btrfs/volumes.c | 2 +- 3 files changed, 17 insertions(+), 10 deletions(-) (limited to 'fs/btrfs/raid56.c') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index f4d3200a14dc..0408ef29bd02 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1803,18 +1803,19 @@ static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio) /* * our main entry point for writes from the rest of the FS. 
*/ -int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) +void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) { struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; struct btrfs_plug_cb *plug = NULL; struct blk_plug_cb *cb; - int ret; + int ret = 0; rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) { btrfs_put_bioc(bioc); - return PTR_ERR(rbio); + ret = PTR_ERR(rbio); + goto out; } rbio->operation = BTRFS_RBIO_WRITE; rbio_add_bio(rbio, bio); @@ -1829,8 +1830,8 @@ int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) if (rbio_is_full(rbio)) { ret = full_stripe_write(rbio); if (ret) - btrfs_bio_counter_dec(fs_info); - return ret; + goto out_dec_counter; + return; } cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug)); @@ -1841,13 +1842,19 @@ int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) INIT_LIST_HEAD(&plug->rbio_list); } list_add_tail(&rbio->plug_list, &plug->rbio_list); - ret = 0; } else { ret = __raid56_parity_write(rbio); if (ret) - btrfs_bio_counter_dec(fs_info); + goto out_dec_counter; } - return ret; + + return; + +out_dec_counter: + btrfs_bio_counter_dec(fs_info); +out: + bio->bi_status = errno_to_blk_status(ret); + bio_endio(bio); } /* diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 1dce205b79bf..3f223ae39462 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -167,7 +167,7 @@ struct btrfs_device; int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, int mirror_num, int generic_io); -int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc); +void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc); void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, unsigned int pgoff, u64 logical); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6b2ad30e0221..ed440b5a300c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6762,7 +6762,7 @@ void 
btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) { if (btrfs_op(bio) == BTRFS_MAP_WRITE) - ret = raid56_parity_write(bio, bioc); + raid56_parity_write(bio, bioc); else ret = raid56_parity_recover(bio, bioc, mirror_num, 1); goto out_dec; -- cgit v1.2.3-70-g09d2 From 6065fd95dae1013f339c78d067eb71f0761c654b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Jun 2022 12:04:09 +0200 Subject: btrfs: do not return errors from raid56_parity_recover Always consume the bio and call the end_io handler on error instead of returning an error and letting the caller handle it. This matches what the block layer submission does and avoids any confusion on who needs to handle errors. Also use the proper bool type for the generic_io argument. Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 39 ++++++++++++++++----------------------- fs/btrfs/raid56.h | 4 ++-- fs/btrfs/scrub.c | 10 ++-------- fs/btrfs/volumes.c | 2 +- 4 files changed, 21 insertions(+), 34 deletions(-) (limited to 'fs/btrfs/raid56.c') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 0408ef29bd02..84d0e073b409 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -2199,12 +2199,11 @@ cleanup: * so we assume the bio they send down corresponds to a failed part * of the drive. 
*/ -int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, - int mirror_num, int generic_io) +void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, + int mirror_num, bool generic_io) { struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; - int ret; if (generic_io) { ASSERT(bioc->mirror_num == mirror_num); @@ -2213,9 +2212,8 @@ int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) { - if (generic_io) - btrfs_put_bioc(bioc); - return PTR_ERR(rbio); + bio->bi_status = errno_to_blk_status(PTR_ERR(rbio)); + goto out_end_bio; } rbio->operation = BTRFS_RBIO_READ_REBUILD; @@ -2227,10 +2225,9 @@ int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, "%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)", __func__, bio->bi_iter.bi_sector << 9, (u64)bio->bi_iter.bi_size, bioc->map_type); - if (generic_io) - btrfs_put_bioc(bioc); kfree(rbio); - return -EIO; + bio->bi_status = BLK_STS_IOERR; + goto out_end_bio; } if (generic_io) { @@ -2257,24 +2254,20 @@ int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, rbio->failb--; } - ret = lock_stripe_add(rbio); + if (lock_stripe_add(rbio)) + return; /* - * __raid56_parity_recover will end the bio with - * any errors it hits. We don't want to return - * its error value up the stack because our caller - * will end up calling bio_endio with any nonzero - * return + * This adds our rbio to the list of rbios that will be handled after + * the current lock owner is done. 
*/ - if (ret == 0) - __raid56_parity_recover(rbio); - /* - * our rbio has been added to the list of - * rbios that will be handled after the - * currently lock owner is done - */ - return 0; + __raid56_parity_recover(rbio); + return; +out_end_bio: + if (generic_io) + btrfs_put_bioc(bioc); + bio_endio(bio); } static void rmw_work(struct work_struct *work) diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 3f223ae39462..6f48f9e4c869 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -165,8 +165,8 @@ static inline int nr_data_stripes(const struct map_lookup *map) struct btrfs_device; -int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, - int mirror_num, int generic_io); +void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, + int mirror_num, bool generic_io); void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc); void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index ad7958d18158..3afe5fa50a63 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1376,18 +1376,12 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, struct scrub_sector *sector) { DECLARE_COMPLETION_ONSTACK(done); - int ret; - int mirror_num; bio->bi_iter.bi_sector = sector->logical >> 9; bio->bi_private = &done; bio->bi_end_io = scrub_bio_wait_endio; - - mirror_num = sector->sblock->sectors[0]->mirror_num; - ret = raid56_parity_recover(bio, sector->recover->bioc, - mirror_num, 0); - if (ret) - return ret; + raid56_parity_recover(bio, sector->recover->bioc, + sector->sblock->sectors[0]->mirror_num, false); wait_for_completion_io(&done); return blk_status_to_errno(bio->bi_status); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index ed440b5a300c..c9328cbd7fe9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6764,7 +6764,7 @@ void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror if 
(btrfs_op(bio) == BTRFS_MAP_WRITE) raid56_parity_write(bio, bioc); else - ret = raid56_parity_recover(bio, bioc, mirror_num, 1); + raid56_parity_recover(bio, bioc, mirror_num, true); goto out_dec; } -- cgit v1.2.3-70-g09d2 From b9af128d1e81645e7d9030e30def06ea5032f201 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Jun 2022 12:04:10 +0200 Subject: btrfs: raid56: transfer the bio counter reference to the raid submission helpers Transfer the bio counter reference acquired by btrfs_submit_bio to raid56_parity_write and raid56_parity_recovery together with the bio that the reference was acquired for instead of acquiring another reference in those helpers and dropping the original one in btrfs_submit_bio. Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 16 ++++++---------- fs/btrfs/volumes.c | 15 +++++++-------- 2 files changed, 13 insertions(+), 18 deletions(-) (limited to 'fs/btrfs/raid56.c') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 84d0e073b409..1afe32d5ab01 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1815,12 +1815,11 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) if (IS_ERR(rbio)) { btrfs_put_bioc(bioc); ret = PTR_ERR(rbio); - goto out; + goto out_dec_counter; } rbio->operation = BTRFS_RBIO_WRITE; rbio_add_bio(rbio, bio); - btrfs_bio_counter_inc_noblocked(fs_info); rbio->generic_bio_cnt = 1; /* @@ -1852,7 +1851,6 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) out_dec_counter: btrfs_bio_counter_dec(fs_info); -out: bio->bi_status = errno_to_blk_status(ret); bio_endio(bio); } @@ -2208,6 +2206,8 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, if (generic_io) { ASSERT(bioc->mirror_num == mirror_num); btrfs_bio(bio)->mirror_num = mirror_num; + } else { + btrfs_get_bioc(bioc); } rbio = alloc_rbio(fs_info, bioc); @@ -2230,12 +2230,8 @@ void 
raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, goto out_end_bio; } - if (generic_io) { - btrfs_bio_counter_inc_noblocked(fs_info); + if (generic_io) rbio->generic_bio_cnt = 1; - } else { - btrfs_get_bioc(bioc); - } /* * Loop retry: @@ -2265,8 +2261,8 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, return; out_end_bio: - if (generic_io) - btrfs_put_bioc(bioc); + btrfs_bio_counter_dec(fs_info); + btrfs_put_bioc(bioc); bio_endio(bio); } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c9328cbd7fe9..bf4e140f6bfc 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6750,8 +6750,12 @@ void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror btrfs_bio_counter_inc_blocked(fs_info); ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, &bioc, mirror_num, 1); - if (ret) - goto out_dec; + if (ret) { + btrfs_bio_counter_dec(fs_info); + bio->bi_status = errno_to_blk_status(ret); + bio_endio(bio); + return; + } total_devs = bioc->num_stripes; bioc->orig_bio = bio; @@ -6765,7 +6769,7 @@ void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror raid56_parity_write(bio, bioc); else raid56_parity_recover(bio, bioc, mirror_num, true); - goto out_dec; + return; } if (map_length < length) { @@ -6780,12 +6784,7 @@ void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror submit_stripe_bio(bioc, bio, dev_nr, should_clone); } -out_dec: btrfs_bio_counter_dec(fs_info); - if (ret) { - bio->bi_status = errno_to_blk_status(ret); - bio_endio(bio); - } } static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args, -- cgit v1.2.3-70-g09d2