diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-05-26 21:13:45 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-05-26 21:13:45 -0700 |
commit | 7e284070abe53d448517b80493863595af4ab5f0 (patch) | |
tree | 47c568d01f3a64f346362fc687165983e04d65bc | |
parent | 780d8ce7162818cfe03e9a5e23b3af192a1d37bc (diff) | |
parent | ca522482e3eafd005b8d4e8b1331c911505a58d5 (diff) |
Merge tag 'for-5.19/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device mapper updates from Mike Snitzer:
- Enable DM core bioset's per-cpu bio cache if QUEUE_FLAG_POLL set.
This change improves DM's hipri bio polling (REQ_POLLED) performance
by 7 - 20% depending on the system.
- Update DM core to use jump_labels to further reduce cost of unlikely
branches for zoned block devices, dm-stats and swap_bios throttling.
- Various DM core changes to reduce bio-based DM overhead and simplify
IO accounting.
- Fundamental DM core improvements to dm_io reference counting and the
elimination of using bio_split()+bio_chain() -- instead DM's
bio-based IO accounting is updated to account that a split occurred.
- Improve DM core's abnormal bio processing to do less work.
- Improve DM core's hipri polling support to use a single list rather
than an hlist.
- Update DM core to pass NULL bdev to bio_alloc_clone() so that
initialization that isn't useful for DM can be elided.
- Add cond_resched to DM stats' various loops that loop over all
entries.
- Fix incorrect error code return from DM integrity's constructor.
- Make DM crypt's printing of the key constant-time.
- Update bio-based DM multipath to provide high-resolution timer to the
Historical Service Time (HST) path selector.
* tag 'for-5.19/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (26 commits)
dm: pass NULL bdev to bio_alloc_clone
dm cache metadata: remove unnecessary variable in __dump_mapping
dm mpath: provide high-resolution timer to HST for bio-based
dm crypt: make printing of the key constant-time
dm integrity: fix error code in dm_integrity_ctr()
dm stats: add cond_resched when looping over entries
dm: improve abnormal bio processing
dm: simplify bio-based IO accounting further
dm: put all polled dm_io instances into a single list
dm: improve dm_io reference counting
dm: don't grab target io reference in dm_zone_map_bio
dm: improve bio splitting and associated IO accounting
dm: switch to bdev based IO accounting interfaces
dm: pass dm_io instance to dm_io_acct directly
dm: don't pass bio to __dm_start_io_acct and dm_end_io_acct
dm: use bio_sectors in dm_aceept_partial_bio
dm: simplify basic targets
dm: conditionally enable branching for less used features
dm: introduce dm_{get,put}_live_table_bio called from dm_submit_bio
dm: move hot dm_io members to same cacheline as dm_target_io
...
-rw-r--r-- | drivers/md/dm-cache-metadata.c | 3 | ||||
-rw-r--r-- | drivers/md/dm-core.h | 38 | ||||
-rw-r--r-- | drivers/md/dm-crypt.c | 14 | ||||
-rw-r--r-- | drivers/md/dm-delay.c | 3 | ||||
-rw-r--r-- | drivers/md/dm-flakey.c | 4 | ||||
-rw-r--r-- | drivers/md/dm-integrity.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-linear.c | 11 | ||||
-rw-r--r-- | drivers/md/dm-mpath.c | 8 | ||||
-rw-r--r-- | drivers/md/dm-path-selector.h | 15 | ||||
-rw-r--r-- | drivers/md/dm-ps-historical-service-time.c | 1 | ||||
-rw-r--r-- | drivers/md/dm-stats.c | 11 | ||||
-rw-r--r-- | drivers/md/dm-table.c | 16 | ||||
-rw-r--r-- | drivers/md/dm-zone.c | 10 | ||||
-rw-r--r-- | drivers/md/dm.c | 556 | ||||
-rw-r--r-- | drivers/md/dm.h | 4 |
15 files changed, 409 insertions, 287 deletions
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 2874f222c313..ab13b7380265 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -1509,7 +1509,6 @@ int dm_cache_load_mappings(struct dm_cache_metadata *cmd, static int __dump_mapping(void *context, uint64_t cblock, void *leaf) { - int r = 0; __le64 value; dm_oblock_t oblock; unsigned flags; @@ -1517,7 +1516,7 @@ static int __dump_mapping(void *context, uint64_t cblock, void *leaf) memcpy(&value, leaf, sizeof(value)); unpack_value(value, &oblock, &flags); - return r; + return 0; } static int __dump_mappings(struct dm_cache_metadata *cmd) diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 4277853c7535..d21648a923ea 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -13,6 +13,7 @@ #include <linux/ktime.h> #include <linux/blk-mq.h> #include <linux/blk-crypto-profile.h> +#include <linux/jump_label.h> #include <trace/events/block.h> @@ -154,6 +155,10 @@ static inline struct dm_stats *dm_get_stats(struct mapped_device *md) return &md->stats; } +DECLARE_STATIC_KEY_FALSE(stats_enabled); +DECLARE_STATIC_KEY_FALSE(swap_bios_enabled); +DECLARE_STATIC_KEY_FALSE(zoned_enabled); + static inline bool dm_emulate_zone_append(struct mapped_device *md) { if (blk_queue_is_zoned(md->queue)) @@ -237,6 +242,12 @@ static inline void dm_tio_set_flag(struct dm_target_io *tio, unsigned int bit) tio->flags |= (1U << bit); } +static inline bool dm_tio_is_normal(struct dm_target_io *tio) +{ + return (dm_tio_flagged(tio, DM_TIO_INSIDE_DM_IO) && + !dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO)); +} + /* * One of these is allocated per original bio. * It contains the first clone used for that original. @@ -245,16 +256,20 @@ static inline void dm_tio_set_flag(struct dm_target_io *tio, unsigned int bit) struct dm_io { unsigned short magic; blk_short_t flags; - atomic_t io_count; - struct mapped_device *md; - struct bio *orig_bio; - blk_status_t status; spinlock_t lock; unsigned long start_time; void *data; - struct hlist_node node; - struct task_struct *map_task; + struct dm_io *next; struct dm_stats_aux stats_aux; + blk_status_t status; + atomic_t io_count; + struct mapped_device *md; + + /* The three fields represent mapped part of original bio */ + struct bio *orig_bio; + unsigned int sector_offset; /* offset to end of orig_bio */ + unsigned int sectors; + /* last member of dm_target_io is 'struct bio' */ struct dm_target_io tio; }; @@ -263,8 +278,8 @@ struct dm_io { * dm_io flags */ enum { - DM_IO_START_ACCT, - DM_IO_ACCOUNTED + DM_IO_ACCOUNTED, + DM_IO_WAS_SPLIT }; static inline bool dm_io_flagged(struct dm_io *io, unsigned int bit) @@ -277,13 +292,6 @@ static inline void dm_io_set_flag(struct dm_io *io, unsigned int bit) io->flags |= (1U << bit); } -static inline void dm_io_inc_pending(struct dm_io *io) -{ - atomic_inc(&io->io_count); -} - -void dm_io_dec_pending(struct dm_io *io, blk_status_t error); - static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj) { return &container_of(kobj, struct dm_kobject_holder, kobj)->completion; diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index fb80539865d7..159c6806c19b 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -3439,6 +3439,11 @@ static int crypt_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_SUBMITTED; } +static char hex2asc(unsigned char c) +{ + return c + '0' + ((unsigned)(9 - c) >> 4 & 0x27); +} + static void crypt_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen) { @@ -3457,9 +3462,12 @@ static void crypt_status(struct dm_target *ti, status_type_t type, if (cc->key_size > 0) { if (cc->key_string) DMEMIT(":%u:%s", cc->key_size, cc->key_string); - else - for (i = 0; i < cc->key_size; i++) - DMEMIT("%02x", cc->key[i]); + else { + for (i = 0; i < cc->key_size; i++) { + DMEMIT("%c%c", hex2asc(cc->key[i] >> 4), + hex2asc(cc->key[i] & 0xf)); + } + } } else DMEMIT("-"); diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 9a51bf51a859..869afef5654a 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -296,8 +296,7 @@ static int delay_map(struct dm_target *ti, struct bio *bio) } delayed->class = c; bio_set_dev(bio, c->dev->bdev); - if (bio_sectors(bio)) - bio->bi_iter.bi_sector = c->start + dm_target_offset(ti, bio->bi_iter.bi_sector); + bio->bi_iter.bi_sector = c->start + dm_target_offset(ti, bio->bi_iter.bi_sector); return delay_bio(dc, c, bio); } diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index 345229d7e59c..f2305eb758a2 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -280,9 +280,7 @@ static void flakey_map_bio(struct dm_target *ti, struct bio *bio) struct flakey_c *fc = ti->private; bio_set_dev(bio, fc->dev->bdev); - if (bio_sectors(bio) || op_is_zone_mgmt(bio_op(bio))) - bio->bi_iter.bi_sector = - flakey_map_sector(ti, bio->bi_iter.bi_sector); + bio->bi_iter.bi_sector = flakey_map_sector(ti, bio->bi_iter.bi_sector); } static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 36ae30b73a6e..3d5a0ce123c9 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -4494,8 +4494,6 @@ try_smaller_buffer: } if (should_write_sb) { - int r; - init_journal(ic, 0, ic->journal_sections, 0); r = dm_integrity_failed(ic); if (unlikely(r)) { diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 76b486e4d2be..0a6abbbe3745 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -84,19 +84,12 @@ static sector_t linear_map_sector(struct dm_target *ti, sector_t bi_sector) return lc->start + dm_target_offset(ti, bi_sector); } -static void linear_map_bio(struct dm_target *ti, struct bio *bio) +static int linear_map(struct dm_target *ti, struct bio *bio) { struct linear_c *lc = ti->private; bio_set_dev(bio, lc->dev->bdev); - if (bio_sectors(bio) || op_is_zone_mgmt(bio_op(bio))) - bio->bi_iter.bi_sector = - linear_map_sector(ti, bio->bi_iter.bi_sector); -} - -static int linear_map(struct dm_target *ti, struct bio *bio) -{ - linear_map_bio(ti, bio); + bio->bi_iter.bi_sector = linear_map_sector(ti, bio->bi_iter.bi_sector); return DM_MAPIO_REMAPPED; } diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 6ed9d2731254..0e325469a252 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -105,6 +105,7 @@ struct multipath { struct dm_mpath_io { struct pgpath *pgpath; size_t nr_bytes; + u64 start_time_ns; }; typedef int (*action_fn) (struct pgpath *pgpath); @@ -295,6 +296,7 @@ static void multipath_init_per_bio_data(struct bio *bio, struct dm_mpath_io **mp mpio->nr_bytes = bio->bi_iter.bi_size; mpio->pgpath = NULL; + mpio->start_time_ns = 0; *mpio_p = mpio; dm_bio_record(bio_details, bio); @@ -647,6 +649,9 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, mpio->pgpath = pgpath; + if (dm_ps_use_hr_timer(pgpath->pg->ps.type)) + mpio->start_time_ns = ktime_get_ns(); + bio->bi_status = 0; bio_set_dev(bio, pgpath->path.dev->bdev); bio->bi_opf |= REQ_FAILFAST_TRANSPORT; @@ -1713,7 +1718,8 @@ done: if (ps->type->end_io) ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes, - dm_start_time_ns_from_clone(clone)); + (mpio->start_time_ns ?: + dm_start_time_ns_from_clone(clone))); } return r; diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h index c47bc0e20275..83cac2b04b66 100644 --- a/drivers/md/dm-path-selector.h +++ b/drivers/md/dm-path-selector.h @@ -26,11 +26,26 @@ struct path_selector { void *context; }; +/* + * If a path selector uses this flag, a high resolution timer is used + * (via ktime_get_ns) to account for IO start time in BIO-based mpath. + * This improves performance of some path selectors (i.e. HST), in + * exchange for slightly higher overhead when submitting the BIO. + * The extra cost is usually offset by improved path selection for + * some benchmarks. + * + * This has no effect for request-based mpath, since it already uses a + * higher precision timer by default. + */ +#define DM_PS_USE_HR_TIMER 0x00000001 +#define dm_ps_use_hr_timer(type) ((type)->features & DM_PS_USE_HR_TIMER) + /* Information about a path selector type */ struct path_selector_type { char *name; struct module *module; + unsigned int features; unsigned int table_args; unsigned int info_args; diff --git a/drivers/md/dm-ps-historical-service-time.c b/drivers/md/dm-ps-historical-service-time.c index 82f2a06153dc..1d82c95d323d 100644 --- a/drivers/md/dm-ps-historical-service-time.c +++ b/drivers/md/dm-ps-historical-service-time.c @@ -523,6 +523,7 @@ static int hst_end_io(struct path_selector *ps, struct dm_path *path, static struct path_selector_type hst_ps = { .name = "historical-service-time", .module = THIS_MODULE, + .features = DM_PS_USE_HR_TIMER, .table_args = 1, .info_args = 3, .create = hst_create, diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c index 0e039a8c0bf2..8326f9fe0e91 100644 --- a/drivers/md/dm-stats.c +++ b/drivers/md/dm-stats.c @@ -225,6 +225,7 @@ void dm_stats_cleanup(struct dm_stats *stats) atomic_read(&shared->in_flight[READ]), atomic_read(&shared->in_flight[WRITE])); } + cond_resched(); } dm_stat_free(&s->rcu_head); } @@ -330,6 +331,7 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end, for (ni = 0; ni < n_entries; ni++) { atomic_set(&s->stat_shared[ni].in_flight[READ], 0); atomic_set(&s->stat_shared[ni].in_flight[WRITE], 0); + cond_resched(); } if (s->n_histogram_entries) { @@ -342,6 +344,7 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end, for (ni = 0; ni < n_entries; ni++) { s->stat_shared[ni].tmp.histogram = hi; hi += s->n_histogram_entries + 1; + cond_resched(); } } @@ -362,6 +365,7 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end, for (ni = 0; ni < n_entries; ni++) { p[ni].histogram = hi; hi += s->n_histogram_entries + 1; + cond_resched(); } } } @@ -396,6 +400,9 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end, dm_stats_recalc_precise_timestamps(stats); + if (!static_key_enabled(&stats_enabled.key)) + static_branch_enable(&stats_enabled); + mutex_unlock(&stats->mutex); resume_callback(md); @@ -497,6 +504,7 @@ static int dm_stats_list(struct dm_stats *stats, const char *program, } DMEMIT("\n"); } + cond_resched(); } mutex_unlock(&stats->mutex); @@ -774,6 +782,7 @@ static void __dm_stat_clear(struct dm_stat *s, size_t idx_start, size_t idx_end, local_irq_enable(); } } + cond_resched(); } } @@ -889,6 +898,8 @@ static int dm_stats_print(struct dm_stats *stats, int id, if (unlikely(sz + 1 >= maxlen)) goto buffer_overflow; + + cond_resched(); } if (clear) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index e7d42f6335a2..a37c7b763643 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -719,6 +719,9 @@ int dm_table_add_target(struct dm_table *t, const char *type, DMWARN("%s: %s: ignoring discards_supported because num_discard_bios is zero.", dm_device_name(t->md), type); + if (tgt->limit_swap_bios && !static_key_enabled(&swap_bios_enabled.key)) + static_branch_enable(&swap_bios_enabled); + return 0; bad: @@ -1002,6 +1005,8 @@ bool dm_table_request_based(struct dm_table *t) return __table_type_request_based(dm_table_get_type(t)); } +static int dm_table_supports_poll(struct dm_table *t); + static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md) { enum dm_queue_mode type = dm_table_get_type(t); @@ -1009,21 +1014,24 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device * unsigned min_pool_size = 0; struct dm_target *ti; unsigned i; + bool poll_supported = false; if (unlikely(type == DM_TYPE_NONE)) { DMWARN("no table type is set, can't allocate mempools"); return -EINVAL; } - if (__table_type_bio_based(type)) + if (__table_type_bio_based(type)) { for (i = 0; i < t->num_targets; i++) { ti = t->targets + i; per_io_data_size = max(per_io_data_size, ti->per_io_data_size); min_pool_size = max(min_pool_size, ti->num_flush_bios); } + poll_supported = !!dm_table_supports_poll(t); + } - t->mempools = dm_alloc_md_mempools(md, type, t->integrity_supported, - per_io_data_size, min_pool_size); + t->mempools = dm_alloc_md_mempools(md, type, per_io_data_size, min_pool_size, + t->integrity_supported, poll_supported); if (!t->mempools) return -ENOMEM; @@ -2035,6 +2043,8 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, r = dm_set_zones_restrictions(t, q); if (r) return r; + if (!static_key_enabled(&zoned_enabled.key)) + static_branch_enable(&zoned_enabled); } dm_update_crypto_profile(q, t); diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index 57daa86c19cf..3e7b1fe1580b 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -550,13 +550,6 @@ int dm_zone_map_bio(struct dm_target_io *tio) return DM_MAPIO_KILL; } - /* - * The target map function may issue and complete the IO quickly. - * Take an extra reference on the IO to make sure it does disappear - * until we run dm_zone_map_bio_end(). - */ - dm_io_inc_pending(io); - /* Let the target do its work */ r = ti->type->map(ti, clone); switch (r) { @@ -587,9 +580,6 @@ int dm_zone_map_bio(struct dm_target_io *tio) break; } - /* Drop the extra reference on the IO */ - dm_io_dec_pending(io, sts); - if (sts != BLK_STS_OK) return DM_MAPIO_KILL; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 39081338ca61..d62f1354ecbf 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -71,6 +71,10 @@ void dm_issue_global_event(void) wake_up(&dm_global_eventq); } +DEFINE_STATIC_KEY_FALSE(stats_enabled); +DEFINE_STATIC_KEY_FALSE(swap_bios_enabled); +DEFINE_STATIC_KEY_FALSE(zoned_enabled); + /* * One of these is allocated (on-stack) per original bio. */ @@ -80,7 +84,8 @@ struct clone_info { struct dm_io *io; sector_t sector; unsigned sector_count; - bool submit_as_polled; + bool is_abnormal_io:1; + bool submit_as_polled:1; }; #define DM_TARGET_IO_BIO_OFFSET (offsetof(struct dm_target_io, clone)) @@ -498,69 +503,76 @@ static bool bio_is_flush_with_data(struct bio *bio) return ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size); } -static void dm_io_acct(bool end, struct mapped_device *md, struct bio *bio, - unsigned long start_time, struct dm_stats_aux *stats_aux) +static void dm_io_acct(struct dm_io *io, bool end) { - bool is_flush_with_data; - unsigned int bi_size; + struct dm_stats_aux *stats_aux = &io->stats_aux; + unsigned long start_time = io->start_time; + struct mapped_device *md = io->md; + struct bio *bio = io->orig_bio; + unsigned int sectors; - /* If REQ_PREFLUSH set save any payload but do not account it */ - is_flush_with_data = bio_is_flush_with_data(bio); - if (is_flush_with_data) { - bi_size = bio->bi_iter.bi_size; - bio->bi_iter.bi_size = 0; - } + /* + * If REQ_PREFLUSH set, don't account payload, it will be + * submitted (and accounted) after this flush completes. + */ + if (bio_is_flush_with_data(bio)) + sectors = 0; + else if (likely(!(dm_io_flagged(io, DM_IO_WAS_SPLIT)))) + sectors = bio_sectors(bio); + else + sectors = io->sectors; if (!end) - bio_start_io_acct_time(bio, start_time); + bdev_start_io_acct(bio->bi_bdev, sectors, bio_op(bio), + start_time); else - bio_end_io_acct(bio, start_time); + bdev_end_io_acct(bio->bi_bdev, bio_op(bio), start_time); + + if (static_branch_unlikely(&stats_enabled) && + unlikely(dm_stats_used(&md->stats))) { + sector_t sector; + + if (likely(!dm_io_flagged(io, DM_IO_WAS_SPLIT))) + sector = bio->bi_iter.bi_sector; + else + sector = bio_end_sector(bio) - io->sector_offset; - if (unlikely(dm_stats_used(&md->stats))) dm_stats_account_io(&md->stats, bio_data_dir(bio), - bio->bi_iter.bi_sector, bio_sectors(bio), + sector, sectors, end, start_time, stats_aux); - - /* Restore bio's payload so it does get accounted upon requeue */ - if (is_flush_with_data) - bio->bi_iter.bi_size = bi_size; + } } -static void __dm_start_io_acct(struct dm_io *io, struct bio *bio) +static void __dm_start_io_acct(struct dm_io *io) { - dm_io_acct(false, io->md, bio, io->start_time, &io->stats_aux); + dm_io_acct(io, false); } static void dm_start_io_acct(struct dm_io *io, struct bio *clone) { - /* Must account IO to DM device in terms of orig_bio */ - struct bio *bio = io->orig_bio; - /* * Ensure IO accounting is only ever started once. - * Expect no possibility for race unless DM_TIO_IS_DUPLICATE_BIO. */ - if (!clone || - likely(!dm_tio_flagged(clone_to_tio(clone), DM_TIO_IS_DUPLICATE_BIO))) { - if (WARN_ON_ONCE(dm_io_flagged(io, DM_IO_ACCOUNTED))) - return; + if (dm_io_flagged(io, DM_IO_ACCOUNTED)) + return; + + /* Expect no possibility for race unless DM_TIO_IS_DUPLICATE_BIO. */ + if (!clone || likely(dm_tio_is_normal(clone_to_tio(clone)))) { dm_io_set_flag(io, DM_IO_ACCOUNTED); } else { unsigned long flags; - if (dm_io_flagged(io, DM_IO_ACCOUNTED)) - return; /* Can afford locking given DM_TIO_IS_DUPLICATE_BIO */ spin_lock_irqsave(&io->lock, flags); dm_io_set_flag(io, DM_IO_ACCOUNTED); spin_unlock_irqrestore(&io->lock, flags); } - __dm_start_io_acct(io, bio); + __dm_start_io_acct(io); } -static void dm_end_io_acct(struct dm_io *io, struct bio *bio) +static void dm_end_io_acct(struct dm_io *io) { - dm_io_acct(true, io->md, bio, io->start_time, &io->stats_aux); + dm_io_acct(io, true); } static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio) @@ -569,7 +581,9 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio) struct dm_target_io *tio; struct bio *clone; - clone = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO, &md->io_bs); + clone = bio_alloc_clone(NULL, bio, GFP_NOIO, &md->io_bs); + /* Set default bdev, but target must bio_set_dev() before issuing IO */ + clone->bi_bdev = md->disk->part0; tio = clone_to_tio(clone); tio->flags = 0; @@ -578,17 +592,19 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio) io = container_of(tio, struct dm_io, tio); io->magic = DM_IO_MAGIC; - io->status = 0; - atomic_set(&io->io_count, 1); + io->status = BLK_STS_OK; + + /* one ref is for submission, the other is for completion */ + atomic_set(&io->io_count, 2); this_cpu_inc(*md->pending_io); - io->orig_bio = NULL; + io->orig_bio = bio; io->md = md; - io->map_task = current; spin_lock_init(&io->lock); io->start_time = jiffies; io->flags = 0; - dm_stats_record_start(&md->stats, &io->stats_aux); + if (static_branch_unlikely(&stats_enabled)) + dm_stats_record_start(&md->stats, &io->stats_aux); return io; } @@ -599,7 +615,7 @@ static void free_io(struct dm_io *io) } static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti, - unsigned target_bio_nr, unsigned *len, gfp_t gfp_mask) + unsigned target_bio_nr, unsigned *len, gfp_t gfp_mask) { struct dm_target_io *tio; struct bio *clone; @@ -610,10 +626,13 @@ static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti, /* alloc_io() already initialized embedded clone */ clone = &tio->clone; } else { - clone = bio_alloc_clone(ci->bio->bi_bdev, ci->bio, - gfp_mask, &ci->io->md->bs); + struct mapped_device *md = ci->io->md; + + clone = bio_alloc_clone(NULL, ci->bio, gfp_mask, &md->bs); if (!clone) return NULL; + /* Set default bdev, but target must bio_set_dev() before issuing IO */ + clone->bi_bdev = md->disk->part0; /* REQ_DM_POLL_LIST shouldn't be inherited */ clone->bi_opf &= ~REQ_DM_POLL_LIST; @@ -663,14 +682,16 @@ static void queue_io(struct mapped_device *md, struct bio *bio) * function to access the md->map field, and make sure they call * dm_put_live_table() when finished. */ -struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier) +struct dm_table *dm_get_live_table(struct mapped_device *md, + int *srcu_idx) __acquires(md->io_barrier) { *srcu_idx = srcu_read_lock(&md->io_barrier); return srcu_dereference(md->map, &md->io_barrier); } -void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier) +void dm_put_live_table(struct mapped_device *md, + int srcu_idx) __releases(md->io_barrier) { srcu_read_unlock(&md->io_barrier, srcu_idx); } @@ -696,6 +717,24 @@ static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU) rcu_read_unlock(); } +static inline struct dm_table *dm_get_live_table_bio(struct mapped_device *md, + int *srcu_idx, struct bio *bio) +{ + if (bio->bi_opf & REQ_NOWAIT) + return dm_get_live_table_fast(md); + else + return dm_get_live_table(md, srcu_idx); +} + +static inline void dm_put_live_table_bio(struct mapped_device *md, int srcu_idx, + struct bio *bio) +{ + if (bio->bi_opf & REQ_NOWAIT) + dm_put_live_table_fast(md); + else + dm_put_live_table(md, srcu_idx); +} + static char *_dm_claim_ptr = "I belong to device-mapper"; /* @@ -875,14 +914,14 @@ static void dm_io_complete(struct dm_io *io) io_error = io->status; if (dm_io_flagged(io, DM_IO_ACCOUNTED)) - dm_end_io_acct(io, bio); + dm_end_io_acct(io); else if (!io_error) { /* * Must handle target that DM_MAPIO_SUBMITTED only to * then bio_endio() rather than dm_submit_bio_remap() */ - __dm_start_io_acct(io, bio); - dm_end_io_acct(io, bio); + __dm_start_io_acct(io); + dm_end_io_acct(io); } free_io(io); smp_wmb(); @@ -899,7 +938,7 @@ static void dm_io_complete(struct dm_io *io) * may only reflect a subset of the pre-split original) * so clear REQ_POLLED in case of requeue. */ - bio->bi_opf &= ~REQ_POLLED; + bio_clear_polled(bio); if (io_error == BLK_STS_AGAIN) { /* io_uring doesn't handle BLK_STS_AGAIN (yet) */ queue_io(md, bio); @@ -923,30 +962,35 @@ static void dm_io_complete(struct dm_io *io) } } -static inline bool dm_tio_is_normal(struct dm_target_io *tio) -{ - return (dm_tio_flagged(tio, DM_TIO_INSIDE_DM_IO) && - !dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO)); -} - /* * Decrements the number of outstanding ios that a bio has been * cloned into, completing the original io if necc. */ -void dm_io_dec_pending(struct dm_io *io, blk_status_t error) +static inline void __dm_io_dec_pending(struct dm_io *io) { + if (atomic_dec_and_test(&io->io_count)) + dm_io_complete(io); +} + +static void dm_io_set_error(struct dm_io *io, blk_status_t error) +{ + unsigned long flags; + /* Push-back supersedes any I/O errors */ - if (unlikely(error)) { - unsigned long flags; - spin_lock_irqsave(&io->lock, flags); - if (!(io->status == BLK_STS_DM_REQUEUE && - __noflush_suspending(io->md))) - io->status = error; - spin_unlock_irqrestore(&io->lock, flags); + spin_lock_irqsave(&io->lock, flags); + if (!(io->status == BLK_STS_DM_REQUEUE && + __noflush_suspending(io->md))) { + io->status = error; } + spin_unlock_irqrestore(&io->lock, flags); +} - if (atomic_dec_and_test(&io->io_count)) - dm_io_complete(io); +static void dm_io_dec_pending(struct dm_io *io, blk_status_t error) +{ + if (unlikely(error)) + dm_io_set_error(io, error); + + __dm_io_dec_pending(io); } void disable_discard(struct mapped_device *md) @@ -974,35 +1018,43 @@ static void clone_endio(struct bio *bio) { blk_status_t error = bio->bi_status; struct dm_target_io *tio = clone_to_tio(bio); + struct dm_target *ti = tio->ti; + dm_endio_fn endio = ti->type->end_io; struct dm_io *io = tio->io; - struct mapped_device *md = tio->io->md; - dm_endio_fn endio = tio->ti->type->end_io; - struct request_queue *q = bio->bi_bdev->bd_disk->queue; + struct mapped_device *md = io->md; - if (unlikely(error == BLK_STS_TARGET)) { - if (bio_op(bio) == REQ_OP_DISCARD && - !bdev_max_discard_sectors(bio->bi_bdev)) - disable_discard(md); - else if (bio_op(bio) == REQ_OP_WRITE_ZEROES && - !q->limits.max_write_zeroes_sectors) - disable_write_zeroes(md); - } + if (likely(bio->bi_bdev != md->disk->part0)) { + struct request_queue *q = bdev_get_queue(bio->bi_bdev); + + if (unlikely(error == BLK_STS_TARGET)) { + if (bio_op(bio) == REQ_OP_DISCARD && + !bdev_max_discard_sectors(bio->bi_bdev)) + disable_discard(md); + else if (bio_op(bio) == REQ_OP_WRITE_ZEROES && + !q->limits.max_write_zeroes_sectors) + disable_write_zeroes(md); + } - if (blk_queue_is_zoned(q)) - dm_zone_endio(io, bio); + if (static_branch_unlikely(&zoned_enabled) && + unlikely(blk_queue_is_zoned(q))) + dm_zone_endio(io, bio); + } if (endio) { - int r = endio(tio->ti, bio, &error); + int r = endio(ti, bio, &error); switch (r) { case DM_ENDIO_REQUEUE: - /* - * Requeuing writes to a sequential zone of a zoned - * target will break the sequential write pattern: - * fail such IO. - */ - if (WARN_ON_ONCE(dm_is_zone_write(md, bio))) - error = BLK_STS_IOERR; - else + if (static_branch_unlikely(&zoned_enabled)) { + /* + * Requeuing writes to a sequential zone of a zoned + * target will break the sequential write pattern: + * fail such IO. + */ + if (WARN_ON_ONCE(dm_is_zone_write(md, bio))) + error = BLK_STS_IOERR; + else + error = BLK_STS_DM_REQUEUE; + } else error = BLK_STS_DM_REQUEUE; fallthrough; case DM_ENDIO_DONE: @@ -1016,10 +1068,9 @@ static void clone_endio(struct bio *bio) } } - if (unlikely(swap_bios_limit(tio->ti, bio))) { - struct mapped_device *md = io->md; + if (static_branch_unlikely(&swap_bios_enabled) && + unlikely(swap_bios_limit(ti, bio))) up(&md->swap_bios_semaphore); - } free_tio(bio); dm_io_dec_pending(io, error); @@ -1161,7 +1212,7 @@ static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, * +--------------------+---------------+-------+ * * <-------------- *tio->len_ptr ---------------> - * <------- bi_size -------> + * <----- bio_sectors -----> * <-- n_sectors --> * * Region 1 was already iterated over with bio_advance or similar function. @@ -1178,25 +1229,25 @@ static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors) { struct dm_target_io *tio = clone_to_tio(bio); - unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT; + unsigned bio_sectors = bio_sectors(bio); BUG_ON(dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO)); BUG_ON(op_is_zone_mgmt(bio_op(bio))); BUG_ON(bio_op(bio) == REQ_OP_ZONE_APPEND); - BUG_ON(bi_size > *tio->len_ptr); - BUG_ON(n_sectors > bi_size); + BUG_ON(bio_sectors > *tio->len_ptr); + BUG_ON(n_sectors > bio_sectors); - *tio->len_ptr -= bi_size - n_sectors; + *tio->len_ptr -= bio_sectors - n_sectors; bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT; -} -EXPORT_SYMBOL_GPL(dm_accept_partial_bio); -static inline void __dm_submit_bio_remap(struct bio *clone, - dev_t dev, sector_t old_sector) -{ - trace_block_bio_remap(clone, dev, old_sector); - submit_bio_noacct(clone); + /* + * __split_and_process_bio() may have already saved mapped part + * for accounting but it is being reduced so update accordingly. + */ + dm_io_set_flag(tio->io, DM_IO_WAS_SPLIT); + tio->io->sectors = n_sectors; } +EXPORT_SYMBOL_GPL(dm_accept_partial_bio); /* * @clone: clone bio that DM core passed to target's .map function @@ -1212,8 +1263,6 @@ void dm_submit_bio_remap(struct bio *clone, struct bio *tgt_clone) struct dm_target_io *tio = clone_to_tio(clone); struct dm_io *io = tio->io; - WARN_ON_ONCE(!tio->ti->accounts_remapped_io); - /* establish bio that will get submitted */ if (!tgt_clone) tgt_clone = clone; @@ -1222,22 +1271,11 @@ void dm_submit_bio_remap(struct bio *clone, struct bio *tgt_clone) * Account io->origin_bio to DM dev on behalf of target * that took ownership of IO with DM_MAPIO_SUBMITTED. */ - if (io->map_task == current) { - /* Still in target's map function */ - dm_io_set_flag(io, DM_IO_START_ACCT); - } else { - /* - * Called by another thread, managed by DM target, - * wait for dm_split_and_process_bio() to store - * io->orig_bio - */ - while (unlikely(!smp_load_acquire(&io->orig_bio))) - msleep(1); - dm_start_io_acct(io, clone); - } + dm_start_io_acct(io, clone); - __dm_submit_bio_remap(tgt_clone, disk_devt(io->md->disk), + trace_block_bio_remap(tgt_clone, disk_devt(io->md->disk), tio->old_sector); + submit_bio_noacct(tgt_clone); } EXPORT_SYMBOL_GPL(dm_submit_bio_remap); @@ -1260,55 +1298,53 @@ static noinline void __set_swap_bios_limit(struct mapped_device *md, int latch) static void __map_bio(struct bio *clone) { struct dm_target_io *tio = clone_to_tio(clone); - int r; - struct dm_io *io = tio->io; struct dm_target *ti = tio->ti; + struct dm_io *io = tio->io; + struct mapped_device *md = io->md; + int r; clone->bi_end_io = clone_endio; /* * Map the clone. */ - dm_io_inc_pending(io); tio->old_sector = clone->bi_iter.bi_sector; - if (unlikely(swap_bios_limit(ti, clone))) { - struct mapped_device *md = io->md; + if (static_branch_unlikely(&swap_bios_enabled) && + unlikely(swap_bios_limit(ti, clone))) { int latch = get_swap_bios(); if (unlikely(latch != md->swap_bios)) __set_swap_bios_limit(md, latch); down(&md->swap_bios_semaphore); } - /* - * Check if the IO needs a special mapping due to zone append emulation - * on zoned target. In this case, dm_zone_map_bio() calls the target - * map operation. - */ - if (dm_emulate_zone_append(io->md)) - r = dm_zone_map_bio(tio); - else + if (static_branch_unlikely(&zoned_enabled)) { + /* + * Check if the IO needs a special mapping due to zone append + * emulation on zoned target. In this case, dm_zone_map_bio() + * calls the target map operation. + */ + if (unlikely(dm_emulate_zone_append(md))) + r = dm_zone_map_bio(tio); + else + r = ti->type->map(ti, clone); + } else r = ti->type->map(ti, clone); switch (r) { case DM_MAPIO_SUBMITTED: /* target has assumed ownership of this io */ if (!ti->accounts_remapped_io) - dm_io_set_flag(io, DM_IO_START_ACCT); + dm_start_io_acct(io, clone); break; case DM_MAPIO_REMAPPED: - /* - * the bio has been remapped so dispatch it, but defer - * dm_start_io_acct() until after possible bio_split(). - */ - __dm_submit_bio_remap(clone, disk_devt(io->md->disk), - tio->old_sector); - dm_io_set_flag(io, DM_IO_START_ACCT); + dm_submit_bio_remap(clone, NULL); break; case DM_MAPIO_KILL: case DM_MAPIO_REQUEUE: - if (unlikely(swap_bios_limit(ti, clone))) - up(&io->md->swap_bios_semaphore); + if (static_branch_unlikely(&swap_bios_enabled) && + unlikely(swap_bios_limit(ti, clone))) + up(&md->swap_bios_semaphore); free_tio(clone); if (r == DM_MAPIO_KILL) dm_io_dec_pending(io, BLK_STS_IOERR); @@ -1321,6 +1357,31 @@ static void __map_bio(struct bio *clone) } } +static void setup_split_accounting(struct clone_info *ci, unsigned len) +{ + struct dm_io *io = ci->io; + + if (ci->sector_count > len) { + /* + * Split needed, save the mapped part for accounting. + * NOTE: dm_accept_partial_bio() will update accordingly. + */ + dm_io_set_flag(io, DM_IO_WAS_SPLIT); + io->sectors = len; + } + + if (static_branch_unlikely(&stats_enabled) && + unlikely(dm_stats_used(&io->md->stats))) { + /* + * Save bi_sector in terms of its offset from end of + * original bio, only needed for DM-stats' benefit. + * - saved regardless of whether split needed so that + * dm_accept_partial_bio() doesn't need to. + */ + io->sector_offset = bio_end_sector(ci->bio) - ci->sector; + } +} + static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci, struct dm_target *ti, unsigned num_bios) { @@ -1350,18 +1411,22 @@ static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci, } } -static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, +static int __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, unsigned num_bios, unsigned *len) { struct bio_list blist = BIO_EMPTY_LIST; struct bio *clone; + int ret = 0; switch (num_bios) { case 0: break; case 1: + if (len) + setup_split_accounting(ci, *len); clone = alloc_tio(ci, ti, 0, len, GFP_NOIO); __map_bio(clone); + ret = 1; break; default: /* dm_accept_partial_bio() is not supported with shared tio->len_ptr */ @@ -1369,9 +1434,12 @@ static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, while ((clone = bio_list_pop(&blist))) { dm_tio_set_flag(clone_to_tio(clone), DM_TIO_IS_DUPLICATE_BIO); __map_bio(clone); + ret += 1; } break; } + + return ret; } static void __send_empty_flush(struct clone_info *ci) @@ -1392,8 +1460,19 @@ static void __send_empty_flush(struct clone_info *ci) ci->sector_count = 0; ci->io->tio.clone.bi_iter.bi_size = 0; - while ((ti = dm_table_get_target(ci->map, target_nr++))) - __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL); + while ((ti = dm_table_get_target(ci->map, target_nr++))) { + int bios; + + atomic_add(ti->num_flush_bios, &ci->io->io_count); + bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL); + atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count); + } + + /* + * alloc_io() takes one extra reference for submission, so the + * reference won't reach 0 without the following subtraction + */ + atomic_sub(1, &ci->io->io_count); bio_uninit(ci->bio); } @@ -1402,11 +1481,18 @@ static void __send_changing_extent_only(struct clone_info *ci, struct dm_target unsigned num_bios) { unsigned len; + int bios; len = min_t(sector_t, ci->sector_count, max_io_len_target_boundary(ti, dm_target_offset(ti, ci->sector))); - __send_duplicate_bios(ci, ti, num_bios, &len); + atomic_add(num_bios, &ci->io->io_count); + bios = __send_duplicate_bios(ci, ti, num_bios, &len); + /* + * alloc_io() takes one extra reference for submission, so the + * reference won't reach 0 without the following (+1) subtraction + */ + atomic_sub(num_bios - bios + 1, &ci->io->io_count); ci->sector += len; ci->sector_count -= len; @@ -1414,21 +1500,24 @@ static void __send_changing_extent_only(struct clone_info *ci, struct dm_target static bool is_abnormal_io(struct bio *bio) { - bool r = false; + unsigned int op = bio_op(bio); - switch (bio_op(bio)) { - case REQ_OP_DISCARD: - case REQ_OP_SECURE_ERASE: - case REQ_OP_WRITE_ZEROES: - r = true; - break; + if (op != REQ_OP_READ && op != REQ_OP_WRITE && op != REQ_OP_FLUSH) { + switch (op) { + case REQ_OP_DISCARD: + case REQ_OP_SECURE_ERASE: + case REQ_OP_WRITE_ZEROES: + return true; + default: + break; + } } - return r; + return false; } -static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti, - int *result) +static blk_status_t __process_abnormal_io(struct clone_info *ci, + struct dm_target *ti) { unsigned num_bios = 0; @@ -1442,8 +1531,6 @@ static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti, case REQ_OP_WRITE_ZEROES: num_bios = ti->num_write_zeroes_bios; break; - default: - return false; } /* @@ -1452,17 +1539,15 @@ static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti, * reconfiguration might also have changed that since the * check was performed. */ - if (!num_bios) - *result = -EOPNOTSUPP; - else { - __send_changing_extent_only(ci, ti, num_bios); - *result = 0; - } - return true; + if (unlikely(!num_bios)) + return BLK_STS_NOTSUPP; + + __send_changing_extent_only(ci, ti, num_bios); + return BLK_STS_OK; } /* - * Reuse ->bi_private as hlist head for storing all dm_io instances + * Reuse ->bi_private as dm_io list head for storing all dm_io instances * associated with this bio, and this bio's bi_private needs to be * stored in dm_io->data before the reuse. * @@ -1470,54 +1555,53 @@ static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti, * touch it after splitting. Meantime it won't be changed by anyone after * bio is submitted. So this reuse is safe. */ -static inline struct hlist_head *dm_get_bio_hlist_head(struct bio *bio) +static inline struct dm_io **dm_poll_list_head(struct bio *bio) { - return (struct hlist_head *)&bio->bi_private; + return (struct dm_io **)&bio->bi_private; } static void dm_queue_poll_io(struct bio *bio, struct dm_io *io) { - struct hlist_head *head = dm_get_bio_hlist_head(bio); + struct dm_io **head = dm_poll_list_head(bio); if (!(bio->bi_opf & REQ_DM_POLL_LIST)) { bio->bi_opf |= REQ_DM_POLL_LIST; /* * Save .bi_private into dm_io, so that we can reuse - * .bi_private as hlist head for storing dm_io list + * .bi_private as dm_io list head for storing dm_io list */ io->data = bio->bi_private; - INIT_HLIST_HEAD(head); - /* tell block layer to poll for completion */ bio->bi_cookie = ~BLK_QC_T_NONE; + + io->next = NULL; } else { /* * bio recursed due to split, reuse original poll list, * and save bio->bi_private too. */ - io->data = hlist_entry(head->first, struct dm_io, node)->data; + io->data = (*head)->data; + io->next = *head; } - hlist_add_head(&io->node, head); + *head = io; } /* * Select the correct strategy for processing a non-flush bio. */ -static int __split_and_process_bio(struct clone_info *ci) +static blk_status_t __split_and_process_bio(struct clone_info *ci) { struct bio *clone; struct dm_target *ti; unsigned len; - int r; ti = dm_table_find_target(ci->map, ci->sector); - if (!ti) - return -EIO; - - if (__process_abnormal_io(ci, ti, &r)) - return r; + if (unlikely(!ti)) + return BLK_STS_IOERR; + else if (unlikely(ci->is_abnormal_io)) + return __process_abnormal_io(ci, ti); /* * Only support bio polling for normal IO, and the target io is @@ -1526,27 +1610,30 @@ static int __split_and_process_bio(struct clone_info *ci) ci->submit_as_polled = ci->bio->bi_opf & REQ_POLLED; len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count); + setup_split_accounting(ci, len); clone = alloc_tio(ci, ti, 0, &len, GFP_NOIO); __map_bio(clone); ci->sector += len; ci->sector_count -= len; - return 0; + return BLK_STS_OK; } static void init_clone_info(struct clone_info *ci, struct mapped_device *md, - struct dm_table *map, struct bio *bio) + struct dm_table *map, struct bio *bio, bool is_abnormal) { ci->map = map; ci->io = alloc_io(md, bio); ci->bio = bio; + ci->is_abnormal_io = is_abnormal; ci->submit_as_polled = false; ci->sector = bio->bi_iter.bi_sector; ci->sector_count = bio_sectors(bio); /* Shouldn't happen but sector_count was being set to 0 so... */ - if (WARN_ON_ONCE(op_is_zone_mgmt(bio_op(bio)) && ci->sector_count)) + if (static_branch_unlikely(&zoned_enabled) && + WARN_ON_ONCE(op_is_zone_mgmt(bio_op(bio)) && ci->sector_count)) ci->sector_count = 0; } @@ -1557,10 +1644,21 @@ static void dm_split_and_process_bio(struct mapped_device *md, struct dm_table *map, struct bio *bio) { struct clone_info ci; - struct bio *orig_bio = NULL; - int error = 0; + struct dm_io *io; + blk_status_t error = BLK_STS_OK; + bool is_abnormal; - init_clone_info(&ci, md, map, bio); + is_abnormal = is_abnormal_io(bio); + if (unlikely(is_abnormal)) { + /* + * Use blk_queue_split() for abnormal IO (e.g. discard, etc) + * otherwise associated queue_limits won't be imposed. + */ + blk_queue_split(&bio); + } + + init_clone_info(&ci, md, map, bio, is_abnormal); + io = ci.io; if (bio->bi_opf & REQ_PREFLUSH) { __send_empty_flush(&ci); @@ -1569,40 +1667,34 @@ static void dm_split_and_process_bio(struct mapped_device *md, } error = __split_and_process_bio(&ci); - ci.io->map_task = NULL; if (error || !ci.sector_count) goto out; - /* * Remainder must be passed to submit_bio_noacct() so it gets handled * *after* bios already submitted have been completely processed. - * We take a clone of the original to store in ci.io->orig_bio to be - * used by dm_end_io_acct() and for dm_io_complete() to use for - * completion handling. */ - orig_bio = bio_split(bio, bio_sectors(bio) - ci.sector_count, - GFP_NOIO, &md->queue->bio_split); - bio_chain(orig_bio, bio); - trace_block_split(orig_bio, bio->bi_iter.bi_sector); + bio_trim(bio, io->sectors, ci.sector_count); + trace_block_split(bio, bio->bi_iter.bi_sector); + bio_inc_remaining(bio); submit_bio_noacct(bio); out: - if (!orig_bio) - orig_bio = bio; - smp_store_release(&ci.io->orig_bio, orig_bio); - if (dm_io_flagged(ci.io, DM_IO_START_ACCT)) - dm_start_io_acct(ci.io, NULL); - /* * Drop the extra reference count for non-POLLED bio, and hold one * reference for POLLED bio, which will be released in dm_poll_bio * - * Add every dm_io instance into the hlist_head which is stored in - * bio->bi_private, so that dm_poll_bio can poll them all. + * Add every dm_io instance into the dm_io list head which is stored + * in bio->bi_private, so that dm_poll_bio can poll them all. */ - if (error || !ci.submit_as_polled) - dm_io_dec_pending(ci.io, errno_to_blk_status(error)); - else - dm_queue_poll_io(bio, ci.io); + if (error || !ci.submit_as_polled) { + /* + * In case of submission failure, the extra reference for + * submitting io isn't consumed yet + */ + if (error) + atomic_dec(&io->io_count); + dm_io_dec_pending(io, error); + } else + dm_queue_poll_io(bio, io); } static void dm_submit_bio(struct bio *bio) @@ -1611,7 +1703,7 @@ static void dm_submit_bio(struct bio *bio) int srcu_idx; struct dm_table *map; - map = dm_get_live_table(md, &srcu_idx); + map = dm_get_live_table_bio(md, &srcu_idx, bio); /* If suspended, or map not yet available, queue this IO for later */ if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) || @@ -1625,16 +1717,9 @@ static void dm_submit_bio(struct bio *bio) goto out; } - /* - * Use blk_queue_split() for abnormal IO (e.g. discard, writesame, etc) - * otherwise associated queue_limits won't be imposed. - */ - if (is_abnormal_io(bio)) - blk_queue_split(&bio); - dm_split_and_process_bio(md, map, bio); out: - dm_put_live_table(md, srcu_idx); + dm_put_live_table_bio(md, srcu_idx, bio); } static bool dm_poll_dm_io(struct dm_io *io, struct io_comp_batch *iob, @@ -1653,18 +1738,16 @@ static bool dm_poll_dm_io(struct dm_io *io, struct io_comp_batch *iob, static int dm_poll_bio(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) { - struct hlist_head *head = dm_get_bio_hlist_head(bio); - struct hlist_head tmp = HLIST_HEAD_INIT; - struct hlist_node *next; - struct dm_io *io; + struct dm_io **head = dm_poll_list_head(bio); + struct dm_io *list = *head; + struct dm_io *tmp = NULL; + struct dm_io *curr, *next; /* Only poll normal bio which was marked as REQ_DM_POLL_LIST */ if (!(bio->bi_opf & REQ_DM_POLL_LIST)) return 0; - WARN_ON_ONCE(hlist_empty(head)); - - hlist_move_list(head, &tmp); + WARN_ON_ONCE(!list); /* * Restore .bi_private before possibly completing dm_io. @@ -1675,24 +1758,27 @@ static int dm_poll_bio(struct bio *bio, struct io_comp_batch *iob, * clearing REQ_DM_POLL_LIST here. */ bio->bi_opf &= ~REQ_DM_POLL_LIST; - bio->bi_private = hlist_entry(tmp.first, struct dm_io, node)->data; + bio->bi_private = list->data; - hlist_for_each_entry_safe(io, next, &tmp, node) { - if (dm_poll_dm_io(io, iob, flags)) { - hlist_del_init(&io->node); + for (curr = list, next = curr->next; curr; curr = next, next = + curr ? curr->next : NULL) { + if (dm_poll_dm_io(curr, iob, flags)) { /* - * clone_endio() has already occurred, so passing - * error as 0 here doesn't override io->status + * clone_endio() has already occurred, so no + * error handling is needed here. */ - dm_io_dec_pending(io, 0); + __dm_io_dec_pending(curr); + } else { + curr->next = tmp; + tmp = curr; } } /* Not done? */ - if (!hlist_empty(&tmp)) { + if (tmp) { bio->bi_opf |= REQ_DM_POLL_LIST; /* Reset bio->bi_private to dm_io list head */ - hlist_move_list(&tmp, head); + *head = tmp; return 0; } return 1; @@ -2901,8 +2987,8 @@ int dm_noflush_suspending(struct dm_target *ti) EXPORT_SYMBOL_GPL(dm_noflush_suspending); struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type, - unsigned integrity, unsigned per_io_data_size, - unsigned min_pool_size) + unsigned per_io_data_size, unsigned min_pool_size, + bool integrity, bool poll) { struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id); unsigned int pool_size = 0; @@ -2918,7 +3004,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_qu pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size); front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + DM_TARGET_IO_BIO_OFFSET; io_front_pad = roundup(per_io_data_size, __alignof__(struct dm_io)) + DM_IO_BIO_OFFSET; - ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, 0); + ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, poll ? BIOSET_PERCPU_CACHE : 0); if (ret) goto out; if (integrity && bioset_integrity_create(&pools->io_bs, pool_size)) diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 9013dc1a7b00..3f89664fea01 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -221,8 +221,8 @@ void dm_kcopyd_exit(void); * Mempool operations */ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type, - unsigned integrity, unsigned per_bio_data_size, - unsigned min_pool_size); + unsigned per_io_data_size, unsigned min_pool_size, + bool integrity, bool poll); void dm_free_md_mempools(struct dm_md_mempools *pools); /* |