From 2e315dc07df009c3e29d6926871f62a30cfae394 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 11 May 2021 23:22:34 +0800 Subject: blk-mq: grab rq->refcount before calling ->fn in blk_mq_tagset_busy_iter Grab rq->refcount before calling ->fn in blk_mq_tagset_busy_iter(), and this way will prevent the request from being re-used when ->fn is running. The approach is same as what we do during handling timeout. Fix request use-after-free(UAF) related with completion race or queue releasing: - If one rq is referred before rq->q is frozen, then queue won't be frozen before the request is released during iteration. - If one rq is referred after rq->q is frozen, refcount_inc_not_zero() will return false, and we won't iterate over this request. However, still one request UAF not covered: refcount_inc_not_zero() may read one freed request, and it will be handled in next patch. Tested-by: John Garry Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20210511152236.763464-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index c86c01bfecdb..debfa5cd8025 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -909,6 +909,14 @@ static bool blk_mq_req_expired(struct request *rq, unsigned long *next) return false; } +void blk_mq_put_rq_ref(struct request *rq) +{ + if (is_flush_rq(rq, rq->mq_hctx)) + rq->end_io(rq, 0); + else if (refcount_dec_and_test(&rq->ref)) + __blk_mq_free_request(rq); +} + static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, struct request *rq, void *priv, bool reserved) { @@ -942,11 +950,7 @@ static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, if (blk_mq_req_expired(rq, next)) blk_mq_rq_timed_out(rq, reserved); - if (is_flush_rq(rq, hctx)) - rq->end_io(rq, 0); - else if (refcount_dec_and_test(&rq->ref)) - __blk_mq_free_request(rq); - + blk_mq_put_rq_ref(rq); return true; } -- cgit v1.2.3-70-g09d2 From bd63141d585bef14f4caf111f6d0e27fe2300ec6 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 11 May 2021 23:22:35 +0800 Subject: blk-mq: clear stale request in tags->rq[] before freeing one request pool refcount_inc_not_zero() in bt_tags_iter() still may read one freed request. Fix the issue by the following approach: 1) hold a per-tags spinlock when reading ->rqs[tag] and calling refcount_inc_not_zero in bt_tags_iter() 2) clearing stale request referred via ->rqs[tag] before freeing request pool, the per-tags spinlock is held for clearing stale ->rq[tag] So after we cleared stale requests, bt_tags_iter() won't observe freed request any more, also the clearing will wait for pending request reference. The idea of clearing ->rqs[] is borrowed from John Garry's previous patch and one recent David's patch. 
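The bt_tags_iter() changes themselves are not quoted in this log. A simplified sketch of how the iterator ends up combining blk_mq_find_and_get_req() (added in the hunk below) with blk_mq_put_rq_ref() from the previous patch; reserved-tag offsets, static_rqs iteration and the started-request filter are omitted, and this is illustrative rather than the verbatim kernel code:

/*
 * Simplified sketch of the tag-iterator side.  The real bt_tags_iter()
 * also handles reserved-tag offsets, static_rqs iteration and the
 * "started requests only" filter, which are omitted here.
 */
static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
{
	struct bt_tags_iter_data *iter_data = data;
	struct blk_mq_tags *tags = iter_data->tags;
	struct request *rq;
	bool ret = true;

	/* Takes tags->lock and a reference, or returns NULL for a stale tag. */
	rq = blk_mq_find_and_get_req(tags, bitnr);
	if (!rq)
		return true;

	/* ->fn runs with the request pinned, so it cannot be reused underneath us. */
	ret = iter_data->fn(rq, iter_data->data, false /* reserved flag elided */);

	blk_mq_put_rq_ref(rq);		/* may free the request or end a flush rq */
	return ret;
}
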
Tested-by: John Garry Reviewed-by: David Jeffery Reviewed-by: Bart Van Assche Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20210511152236.763464-4-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 9 +++++++-- block/blk-mq-tag.h | 6 ++++++ block/blk-mq.c | 46 +++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 54 insertions(+), 7 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 544edf2c56a5..1671dae43030 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -202,10 +202,14 @@ struct bt_iter_data { static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags, unsigned int bitnr) { - struct request *rq = tags->rqs[bitnr]; + struct request *rq; + unsigned long flags; + spin_lock_irqsave(&tags->lock, flags); + rq = tags->rqs[bitnr]; if (!rq || !refcount_inc_not_zero(&rq->ref)) - return NULL; + rq = NULL; + spin_unlock_irqrestore(&tags->lock, flags); return rq; } @@ -538,6 +542,7 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, tags->nr_tags = total_tags; tags->nr_reserved_tags = reserved_tags; + spin_lock_init(&tags->lock); if (blk_mq_is_sbitmap_shared(flags)) return tags; diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 7d3e6b333a4a..f887988e5ef6 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -20,6 +20,12 @@ struct blk_mq_tags { struct request **rqs; struct request **static_rqs; struct list_head page_list; + + /* + * used to clear request reference in rqs[] before freeing one + * request pool + */ + spinlock_t lock; }; extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, diff --git a/block/blk-mq.c b/block/blk-mq.c index debfa5cd8025..dd371f321d35 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2307,6 +2307,45 @@ queue_exit: return BLK_QC_T_NONE; } +static size_t order_to_size(unsigned int order) +{ + return (size_t)PAGE_SIZE << order; +} + +/* called before freeing request pool in @tags */ +static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set, + struct blk_mq_tags *tags, unsigned int hctx_idx) +{ + struct blk_mq_tags *drv_tags = set->tags[hctx_idx]; + struct page *page; + unsigned long flags; + + list_for_each_entry(page, &tags->page_list, lru) { + unsigned long start = (unsigned long)page_address(page); + unsigned long end = start + order_to_size(page->private); + int i; + + for (i = 0; i < set->queue_depth; i++) { + struct request *rq = drv_tags->rqs[i]; + unsigned long rq_addr = (unsigned long)rq; + + if (rq_addr >= start && rq_addr < end) { + WARN_ON_ONCE(refcount_read(&rq->ref) != 0); + cmpxchg(&drv_tags->rqs[i], rq, NULL); + } + } + } + + /* + * Wait until all pending iteration is done. + * + * Request reference is cleared and it is guaranteed to be observed + * after the ->lock is released. 
+ */ + spin_lock_irqsave(&drv_tags->lock, flags); + spin_unlock_irqrestore(&drv_tags->lock, flags); +} + void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx) { @@ -2325,6 +2364,8 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, } } + blk_mq_clear_rq_mapping(set, tags, hctx_idx); + while (!list_empty(&tags->page_list)) { page = list_first_entry(&tags->page_list, struct page, lru); list_del_init(&page->lru); @@ -2384,11 +2425,6 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, return tags; } -static size_t order_to_size(unsigned int order) -{ - return (size_t)PAGE_SIZE << order; -} - static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, unsigned int hctx_idx, int node) { -- cgit v1.2.3-70-g09d2 From 364b61818f65045479e42e76ed8dd6f051778280 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 11 May 2021 23:22:36 +0800 Subject: blk-mq: clearing flush request reference in tags->rqs[] Before we free request queue, clearing flush request reference in tags->rqs[], so that potential UAF can be avoided. Based on one patch written by David Jeffery. Tested-by: John Garry Reviewed-by: Bart Van Assche Reviewed-by: David Jeffery Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20210511152236.763464-5-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index dd371f321d35..fbb165393790 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2643,16 +2643,49 @@ static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) &hctx->cpuhp_dead); } +/* + * Before freeing hw queue, clearing the flush request reference in + * tags->rqs[] for avoiding potential UAF. + */ +static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags, + unsigned int queue_depth, struct request *flush_rq) +{ + int i; + unsigned long flags; + + /* The hw queue may not be mapped yet */ + if (!tags) + return; + + WARN_ON_ONCE(refcount_read(&flush_rq->ref) != 0); + + for (i = 0; i < queue_depth; i++) + cmpxchg(&tags->rqs[i], flush_rq, NULL); + + /* + * Wait until all pending iteration is done. + * + * Request reference is cleared and it is guaranteed to be observed + * after the ->lock is released. + */ + spin_lock_irqsave(&tags->lock, flags); + spin_unlock_irqrestore(&tags->lock, flags); +} + /* hctx->ctxs will be freed in queue's release handler */ static void blk_mq_exit_hctx(struct request_queue *q, struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { + struct request *flush_rq = hctx->fq->flush_rq; + if (blk_mq_hw_queue_mapped(hctx)) blk_mq_tag_idle(hctx); + blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx], + set->queue_depth, flush_rq); if (set->ops->exit_request) - set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx); + set->ops->exit_request(set, flush_rq, hctx_idx); if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); -- cgit v1.2.3-70-g09d2 From 56b68085e536eff2676108f2f8356889a7dbbf55 Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 13 May 2021 20:00:57 +0800 Subject: blk-mq: Some tag allocation code refactoring The tag allocation code to alloc the sbitmap pairs is common for regular bitmaps tags and shared sbitmap, so refactor into a common function. Also remove superfluous "flags" argument from blk_mq_init_shared_sbitmap(). 
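As a usage illustration of the refactored blk_mq_init_bitmaps() helper, a hypothetical standalone caller could set up and tear down a regular/reserved sbitmap pair as below; the example_* names are assumptions, not from the patch:

/*
 * Hypothetical standalone caller of the refactored helper (illustrative
 * only; the example_* names are not from the patch).
 */
static int example_init_tag_bitmaps(struct sbitmap_queue *tags,
				    struct sbitmap_queue *resv,
				    unsigned int depth, unsigned int nr_resv,
				    int node)
{
	/* On failure the helper has already freed whatever it allocated. */
	return blk_mq_init_bitmaps(tags, resv, depth, nr_resv, node,
				   BLK_TAG_ALLOC_FIFO);
}

static void example_exit_tag_bitmaps(struct sbitmap_queue *tags,
				     struct sbitmap_queue *resv)
{
	sbitmap_queue_free(tags);
	sbitmap_queue_free(resv);
}
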
Signed-off-by: John Garry Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/1620907258-30910-2-git-send-email-john.garry@huawei.com Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 54 +++++++++++++++++++++++++++++++++--------------------- block/blk-mq-tag.h | 9 ++++++--- block/blk-mq.c | 2 +- 3 files changed, 40 insertions(+), 25 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 1671dae43030..f597d40de10b 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -471,39 +471,54 @@ static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth, node); } -static int blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, - int node, int alloc_policy) +int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags, + struct sbitmap_queue *breserved_tags, + unsigned int queue_depth, unsigned int reserved, + int node, int alloc_policy) { - unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; + unsigned int depth = queue_depth - reserved; bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR; - if (bt_alloc(&tags->__bitmap_tags, depth, round_robin, node)) + if (bt_alloc(bitmap_tags, depth, round_robin, node)) return -ENOMEM; - if (bt_alloc(&tags->__breserved_tags, tags->nr_reserved_tags, - round_robin, node)) + if (bt_alloc(breserved_tags, reserved, round_robin, node)) goto free_bitmap_tags; + return 0; + +free_bitmap_tags: + sbitmap_queue_free(bitmap_tags); + return -ENOMEM; +} + +static int blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, + int node, int alloc_policy) +{ + int ret; + + ret = blk_mq_init_bitmaps(&tags->__bitmap_tags, + &tags->__breserved_tags, + tags->nr_tags, tags->nr_reserved_tags, + node, alloc_policy); + if (ret) + return ret; + tags->bitmap_tags = &tags->__bitmap_tags; tags->breserved_tags = &tags->__breserved_tags; return 0; -free_bitmap_tags: - sbitmap_queue_free(&tags->__bitmap_tags); - return -ENOMEM; } -int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int flags) +int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set) { - unsigned int depth = set->queue_depth - set->reserved_tags; int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags); - bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR; - int i, node = set->numa_node; + int i, ret; - if (bt_alloc(&set->__bitmap_tags, depth, round_robin, node)) - return -ENOMEM; - if (bt_alloc(&set->__breserved_tags, set->reserved_tags, - round_robin, node)) - goto free_bitmap_tags; + ret = blk_mq_init_bitmaps(&set->__bitmap_tags, &set->__breserved_tags, + set->queue_depth, set->reserved_tags, + set->numa_node, alloc_policy); + if (ret) + return ret; for (i = 0; i < set->nr_hw_queues; i++) { struct blk_mq_tags *tags = set->tags[i]; @@ -513,9 +528,6 @@ int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int flags) } return 0; -free_bitmap_tags: - sbitmap_queue_free(&set->__bitmap_tags); - return -ENOMEM; } void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set) diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index f887988e5ef6..8ed55af08427 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -32,11 +32,14 @@ extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node, unsigned int flags); extern void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags); +extern int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags, + struct sbitmap_queue *breserved_tags, + unsigned int queue_depth, + unsigned int reserved, + int node, int alloc_policy); -extern int 
blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set, - unsigned int flags); +extern int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set); extern void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set); - extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); extern void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, unsigned int tag); diff --git a/block/blk-mq.c b/block/blk-mq.c index fbb165393790..001e196bdebd 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3564,7 +3564,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (blk_mq_is_sbitmap_shared(set->flags)) { atomic_set(&set->active_queues_shared_sbitmap, 0); - if (blk_mq_init_shared_sbitmap(set, set->flags)) { + if (blk_mq_init_shared_sbitmap(set)) { ret = -ENOMEM; goto out_free_mq_rq_maps; } -- cgit v1.2.3-70-g09d2 From d97e594c51660bea510a387731637b894651e4b5 Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 13 May 2021 20:00:58 +0800 Subject: blk-mq: Use request queue-wide tags for tagset-wide sbitmap The tags used for an IO scheduler are currently per hctx. As such, when q->nr_hw_queues grows, so does the request queue total IO scheduler tag depth. This may cause problems for SCSI MQ HBAs whose total driver depth is fixed. Ming and Yanhui report higher CPU usage and lower throughput in scenarios where the fixed total driver tag depth is appreciably lower than the total scheduler tag depth: https://lore.kernel.org/linux-block/440dfcfc-1a2c-bd98-1161-cec4d78c6dfc@huawei.com/T/#mc0d6d4f95275a2743d1c8c3e4dc9ff6c9aa3a76b In that scenario, since the scheduler tag is got first, much contention is introduced since a driver tag may not be available after we have got the sched tag. Improve this scenario by introducing request queue-wide tags for when a tagset-wide sbitmap is used. The static sched requests are still allocated per hctx, as requests are initialised per hctx, as in blk_mq_init_request(..., hctx_idx, ...) -> set->ops->init_request(.., hctx_idx, ...). For simplicity of resizing the request queue sbitmap when updating the request queue depth, just init at the max possible size, so we don't need to deal with the possibly with swapping out a new sbitmap for old if we need to grow. 
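To make that sizing choice concrete: BLKDEV_MAX_RQ was 128 at the time, so MAX_SCHED_RQ works out to 16 * 128 = 2048 tags. A condensed sketch of the scheme implemented in the hunks below (allocate the queue-wide pair once at that ceiling, then only resize the visible depth; the per-hctx pointer redirection is elided):

#define MAX_SCHED_RQ	(16 * BLKDEV_MAX_RQ)	/* 16 * 128 = 2048 tags */

	/* Allocate the queue-wide pair once, at the ceiling ... */
	ret = blk_mq_init_bitmaps(&queue->sched_bitmap_tags,
				  &queue->sched_breserved_tags,
				  MAX_SCHED_RQ, set->reserved_tags,
				  set->numa_node, alloc_policy);
	if (ret)
		return ret;

	/*
	 * Point every hctx's sched_tags at the shared pair (loop omitted),
	 * then expose only the currently configured depth.
	 */
	sbitmap_queue_resize(&queue->sched_bitmap_tags,
			     queue->nr_requests - set->reserved_tags);
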
Signed-off-by: John Garry Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/1620907258-30910-3-git-send-email-john.garry@huawei.com Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 67 ++++++++++++++++++++++++++++++++++++++++---------- block/blk-mq-sched.h | 2 ++ block/blk-mq-tag.c | 11 ++++----- block/blk-mq.c | 13 ++++++++-- include/linux/blkdev.h | 4 +++ 5 files changed, 76 insertions(+), 21 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 996a4b2f73aa..045b6878b8c5 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -509,11 +509,9 @@ static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { - unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED; - if (hctx->sched_tags) { blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx); - blk_mq_free_rq_map(hctx->sched_tags, flags); + blk_mq_free_rq_map(hctx->sched_tags, set->flags); hctx->sched_tags = NULL; } } @@ -523,12 +521,10 @@ static int blk_mq_sched_alloc_tags(struct request_queue *q, unsigned int hctx_idx) { struct blk_mq_tag_set *set = q->tag_set; - /* Clear HCTX_SHARED so tags are init'ed */ - unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED; int ret; hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests, - set->reserved_tags, flags); + set->reserved_tags, set->flags); if (!hctx->sched_tags) return -ENOMEM; @@ -546,16 +542,50 @@ static void blk_mq_sched_tags_teardown(struct request_queue *q) int i; queue_for_each_hw_ctx(q, hctx, i) { - /* Clear HCTX_SHARED so tags are freed */ - unsigned int flags = hctx->flags & ~BLK_MQ_F_TAG_HCTX_SHARED; - if (hctx->sched_tags) { - blk_mq_free_rq_map(hctx->sched_tags, flags); + blk_mq_free_rq_map(hctx->sched_tags, hctx->flags); hctx->sched_tags = NULL; } } } +static int blk_mq_init_sched_shared_sbitmap(struct request_queue *queue) +{ + struct blk_mq_tag_set *set = queue->tag_set; + int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags); + struct blk_mq_hw_ctx *hctx; + int ret, i; + + /* + * Set initial depth at max so that we don't need to reallocate for + * updating nr_requests. 
+ */ + ret = blk_mq_init_bitmaps(&queue->sched_bitmap_tags, + &queue->sched_breserved_tags, + MAX_SCHED_RQ, set->reserved_tags, + set->numa_node, alloc_policy); + if (ret) + return ret; + + queue_for_each_hw_ctx(queue, hctx, i) { + hctx->sched_tags->bitmap_tags = + &queue->sched_bitmap_tags; + hctx->sched_tags->breserved_tags = + &queue->sched_breserved_tags; + } + + sbitmap_queue_resize(&queue->sched_bitmap_tags, + queue->nr_requests - set->reserved_tags); + + return 0; +} + +static void blk_mq_exit_sched_shared_sbitmap(struct request_queue *queue) +{ + sbitmap_queue_free(&queue->sched_bitmap_tags); + sbitmap_queue_free(&queue->sched_breserved_tags); +} + int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) { struct blk_mq_hw_ctx *hctx; @@ -580,12 +610,18 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) queue_for_each_hw_ctx(q, hctx, i) { ret = blk_mq_sched_alloc_tags(q, hctx, i); if (ret) - goto err; + goto err_free_tags; + } + + if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) { + ret = blk_mq_init_sched_shared_sbitmap(q); + if (ret) + goto err_free_tags; } ret = e->ops.init_sched(q, e); if (ret) - goto err; + goto err_free_sbitmap; blk_mq_debugfs_register_sched(q); @@ -605,7 +641,10 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) return 0; -err: +err_free_sbitmap: + if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) + blk_mq_exit_sched_shared_sbitmap(q); +err_free_tags: blk_mq_sched_free_requests(q); blk_mq_sched_tags_teardown(q); q->elevator = NULL; @@ -643,5 +682,7 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) if (e->type->ops.exit_sched) e->type->ops.exit_sched(e); blk_mq_sched_tags_teardown(q); + if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) + blk_mq_exit_sched_shared_sbitmap(q); q->elevator = NULL; } diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 5b18ab915c65..aff037cfd8e7 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -5,6 +5,8 @@ #include "blk-mq.h" #include "blk-mq-tag.h" +#define MAX_SCHED_RQ (16 * BLKDEV_MAX_RQ) + void blk_mq_sched_assign_ioc(struct request *rq); bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index f597d40de10b..86f87346232a 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -13,6 +13,7 @@ #include #include "blk.h" #include "blk-mq.h" +#include "blk-mq-sched.h" #include "blk-mq-tag.h" /* @@ -590,8 +591,6 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, */ if (tdepth > tags->nr_tags) { struct blk_mq_tag_set *set = hctx->queue->tag_set; - /* Only sched tags can grow, so clear HCTX_SHARED flag */ - unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED; struct blk_mq_tags *new; bool ret; @@ -602,21 +601,21 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, * We need some sort of upper limit, set it high enough that * no valid use cases should require more. 
*/ - if (tdepth > 16 * BLKDEV_MAX_RQ) + if (tdepth > MAX_SCHED_RQ) return -EINVAL; new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth, - tags->nr_reserved_tags, flags); + tags->nr_reserved_tags, set->flags); if (!new) return -ENOMEM; ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth); if (ret) { - blk_mq_free_rq_map(new, flags); + blk_mq_free_rq_map(new, set->flags); return -ENOMEM; } blk_mq_free_rqs(set, *tagsptr, hctx->queue_num); - blk_mq_free_rq_map(*tagsptr, flags); + blk_mq_free_rq_map(*tagsptr, set->flags); *tagsptr = new; } else { /* diff --git a/block/blk-mq.c b/block/blk-mq.c index 001e196bdebd..f11d4018ce2e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3640,15 +3640,24 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) } else { ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, nr, true); + if (blk_mq_is_sbitmap_shared(set->flags)) { + hctx->sched_tags->bitmap_tags = + &q->sched_bitmap_tags; + hctx->sched_tags->breserved_tags = + &q->sched_breserved_tags; + } } if (ret) break; if (q->elevator && q->elevator->type->ops.depth_updated) q->elevator->type->ops.depth_updated(hctx); } - - if (!ret) + if (!ret) { q->nr_requests = nr; + if (q->elevator && blk_mq_is_sbitmap_shared(set->flags)) + sbitmap_queue_resize(&q->sched_bitmap_tags, + nr - set->reserved_tags); + } blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f69c75bd6d27..2c28577b50f4 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -25,6 +25,7 @@ #include #include #include +#include struct module; struct scsi_ioctl_command; @@ -493,6 +494,9 @@ struct request_queue { atomic_t nr_active_requests_shared_sbitmap; + struct sbitmap_queue sched_bitmap_tags; + struct sbitmap_queue sched_breserved_tags; + struct list_head icq_list; #ifdef CONFIG_BLK_CGROUP DECLARE_BITMAP (blkcg_pols, BLKCG_MAX_POLS); -- cgit v1.2.3-70-g09d2 From 613471549f366cdf4170b81ce0f99f3867ec4d16 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 3 Jun 2021 12:47:21 +0200 Subject: block: Do not pull requests from the scheduler when we cannot dispatch them Provided the device driver does not implement dispatch budget accounting (which only SCSI does) the loop in __blk_mq_do_dispatch_sched() pulls requests from the IO scheduler as long as it is willing to give out any. That defeats scheduling heuristics inside the scheduler by creating false impression that the device can take more IO when it in fact cannot. For example with BFQ IO scheduler on top of virtio-blk device setting blkio cgroup weight has barely any impact on observed throughput of async IO because __blk_mq_do_dispatch_sched() always sucks out all the IO queued in BFQ. BFQ first submits IO from higher weight cgroups but when that is all dispatched, it will give out IO of lower weight cgroups as well. And then we have to wait for all this IO to be dispatched to the disk (which means lot of it actually has to complete) before the IO scheduler is queried again for dispatching more requests. This completely destroys any service differentiation. So grab request tag for a request pulled out of the IO scheduler already in __blk_mq_do_dispatch_sched() and do not pull any more requests if we cannot get it because we are unlikely to be able to dispatch it. That way only single request is going to wait in the dispatch list for some tag to free. 
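A condensed sketch of the resulting dequeue loop in __blk_mq_do_dispatch_sched(); budget handling, has_work checks and the multi-hctx bookkeeping are omitted:

	do {
		rq = e->type->ops.dispatch_request(hctx);
		if (!rq)
			break;			/* scheduler gave us nothing */

		list_add_tail(&rq->queuelist, &rq_list);
		count++;

		/*
		 * No driver tag available: stop pulling, so at most this one
		 * request waits on the dispatch list for a tag to free up.
		 */
		if (!blk_mq_get_driver_tag(rq))
			break;
	} while (count < max_dispatch);
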
Reviewed-by: Ming Lei Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20210603104721.6309-1-jack@suse.cz Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 12 +++++++++++- block/blk-mq.c | 2 +- block/blk-mq.h | 2 ++ 3 files changed, 14 insertions(+), 2 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 045b6878b8c5..a9182d2f8ad3 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -168,9 +168,19 @@ static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) * in blk_mq_dispatch_rq_list(). */ list_add_tail(&rq->queuelist, &rq_list); + count++; if (rq->mq_hctx != hctx) multi_hctxs = true; - } while (++count < max_dispatch); + + /* + * If we cannot get tag for the request, stop dequeueing + * requests from the IO scheduler. We are unlikely to be able + * to submit them anyway and it creates false impression for + * scheduling heuristics that the device can take more IO. + */ + if (!blk_mq_get_driver_tag(rq)) + break; + } while (count < max_dispatch); if (!count) { if (run_queue) diff --git a/block/blk-mq.c b/block/blk-mq.c index f11d4018ce2e..4261adee9964 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1104,7 +1104,7 @@ static bool __blk_mq_get_driver_tag(struct request *rq) return true; } -static bool blk_mq_get_driver_tag(struct request *rq) +bool blk_mq_get_driver_tag(struct request *rq) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; diff --git a/block/blk-mq.h b/block/blk-mq.h index 556368d2c5b6..4b1ca7b7bbeb 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -260,6 +260,8 @@ static inline void blk_mq_put_driver_tag(struct request *rq) __blk_mq_put_driver_tag(rq->mq_hctx, rq); } +bool blk_mq_get_driver_tag(struct request *rq); + static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap) { int cpu; -- cgit v1.2.3-70-g09d2 From cdb14e0f7775e767484843e8ecd736bb21754c58 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Jun 2021 09:53:16 +0300 Subject: blk-mq: factor out a blk_mq_alloc_sq_tag_set helper Factour out a helper to initialize a simple single hw queue tag_set from blk_mq_init_sq_queue. This will allow to phase out blk_mq_init_sq_queue in favor of a more symmetric and general API. 
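For illustration, a hypothetical single-hw-queue driver would use the new helper roughly as follows; the example_* names and the depth of 64 are assumptions, not from the patch:

static int example_init_queue(struct example_dev *dev)
{
	int ret;

	ret = blk_mq_alloc_sq_tag_set(&dev->tag_set, &example_mq_ops,
				      64, BLK_MQ_F_SHOULD_MERGE);
	if (ret)
		return ret;

	dev->queue = blk_mq_init_queue(&dev->tag_set);
	if (IS_ERR(dev->queue)) {
		ret = PTR_ERR(dev->queue);
		blk_mq_free_tag_set(&dev->tag_set);
		return ret;
	}
	dev->queue->queuedata = dev;
	return 0;
}
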
Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210602065345.355274-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 32 ++++++++++++++++++-------------- include/linux/blk-mq.h | 3 +++ 2 files changed, 21 insertions(+), 14 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index 4261adee9964..867e5faf4f5b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3152,24 +3152,12 @@ struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set, struct request_queue *q; int ret; - memset(set, 0, sizeof(*set)); - set->ops = ops; - set->nr_hw_queues = 1; - set->nr_maps = 1; - set->queue_depth = queue_depth; - set->numa_node = NUMA_NO_NODE; - set->flags = set_flags; - - ret = blk_mq_alloc_tag_set(set); + ret = blk_mq_alloc_sq_tag_set(set, ops, queue_depth, set_flags); if (ret) return ERR_PTR(ret); - q = blk_mq_init_queue(set); - if (IS_ERR(q)) { + if (IS_ERR(q)) blk_mq_free_tag_set(set); - return q; - } - return q; } EXPORT_SYMBOL(blk_mq_init_sq_queue); @@ -3589,6 +3577,22 @@ out_free_mq_map: } EXPORT_SYMBOL(blk_mq_alloc_tag_set); +/* allocate and initialize a tagset for a simple single-queue device */ +int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set, + const struct blk_mq_ops *ops, unsigned int queue_depth, + unsigned int set_flags) +{ + memset(set, 0, sizeof(*set)); + set->ops = ops; + set->nr_hw_queues = 1; + set->nr_maps = 1; + set->queue_depth = queue_depth; + set->numa_node = NUMA_NO_NODE; + set->flags = set_flags; + return blk_mq_alloc_tag_set(set); +} +EXPORT_SYMBOL_GPL(blk_mq_alloc_sq_tag_set); + void blk_mq_free_tag_set(struct blk_mq_tag_set *set) { int i, j; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 359486940fa0..bb950fc669ef 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -439,6 +439,9 @@ struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set, void blk_mq_unregister_dev(struct device *, struct request_queue *); int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set); +int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set, + const struct blk_mq_ops *ops, unsigned int queue_depth, + unsigned int set_flags); void blk_mq_free_tag_set(struct blk_mq_tag_set *set); void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); -- cgit v1.2.3-70-g09d2 From 26a9750aa875126e4b7fc5ee6de652a529c5b7ee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Jun 2021 09:53:17 +0300 Subject: blk-mq: improve the blk_mq_init_allocated_queue interface Don't return the passed in request_queue but a normal error code, and drop the elevator_init argument in favor of just calling elevator_init_mq directly from dm-rq. 
Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210602065345.355274-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 36 ++++++++++++++---------------------- block/blk.h | 1 - block/elevator.c | 2 +- drivers/md/dm-rq.c | 9 +++------ include/linux/blk-mq.h | 5 ++--- include/linux/elevator.h | 1 + 6 files changed, 21 insertions(+), 33 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index 867e5faf4f5b..8550ad64982f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3115,21 +3115,18 @@ void blk_mq_release(struct request_queue *q) struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, void *queuedata) { - struct request_queue *uninit_q, *q; + struct request_queue *q; + int ret; - uninit_q = blk_alloc_queue(set->numa_node); - if (!uninit_q) + q = blk_alloc_queue(set->numa_node); + if (!q) return ERR_PTR(-ENOMEM); - uninit_q->queuedata = queuedata; - - /* - * Initialize the queue without an elevator. device_add_disk() will do - * the initialization. - */ - q = blk_mq_init_allocated_queue(set, uninit_q, false); - if (IS_ERR(q)) - blk_cleanup_queue(uninit_q); - + q->queuedata = queuedata; + ret = blk_mq_init_allocated_queue(set, q); + if (ret) { + blk_cleanup_queue(q); + return ERR_PTR(ret); + } return q; } EXPORT_SYMBOL_GPL(blk_mq_init_queue_data); @@ -3273,9 +3270,8 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, mutex_unlock(&q->sysfs_lock); } -struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, - struct request_queue *q, - bool elevator_init) +int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, + struct request_queue *q) { /* mark the queue as mq asap */ q->mq_ops = set->ops; @@ -3325,11 +3321,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, blk_mq_init_cpu_queues(q, set->nr_hw_queues); blk_mq_add_queue_tag_set(set, q); blk_mq_map_swqueue(q); - - if (elevator_init) - elevator_init_mq(q); - - return q; + return 0; err_hctxs: kfree(q->queue_hw_ctx); @@ -3340,7 +3332,7 @@ err_poll: q->poll_cb = NULL; err_exit: q->mq_ops = NULL; - return ERR_PTR(-ENOMEM); + return -ENOMEM; } EXPORT_SYMBOL(blk_mq_init_allocated_queue); diff --git a/block/blk.h b/block/blk.h index 3440142f029b..d3fa47af3607 100644 --- a/block/blk.h +++ b/block/blk.h @@ -192,7 +192,6 @@ void blk_account_io_done(struct request *req, u64 now); void blk_insert_flush(struct request *rq); -void elevator_init_mq(struct request_queue *q); int elevator_switch_mq(struct request_queue *q, struct elevator_type *new_e); void __elevator_exit(struct request_queue *, struct elevator_queue *); diff --git a/block/elevator.c b/block/elevator.c index 440699c28119..06e203426410 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -693,7 +693,7 @@ void elevator_init_mq(struct request_queue *q) elevator_put(e); } } - +EXPORT_SYMBOL_GPL(elevator_init_mq); /* only for dm-rq */ /* * switch to new_e io scheduler. 
be careful not to introduce deadlocks - diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 9c3bc3711b33..0dbd48cbdff9 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -530,7 +530,6 @@ static const struct blk_mq_ops dm_mq_ops = { int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t) { - struct request_queue *q; struct dm_target *immutable_tgt; int err; @@ -557,12 +556,10 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t) if (err) goto out_kfree_tag_set; - q = blk_mq_init_allocated_queue(md->tag_set, md->queue, true); - if (IS_ERR(q)) { - err = PTR_ERR(q); + err = blk_mq_init_allocated_queue(md->tag_set, md->queue); + if (err) goto out_tag_set; - } - + elevator_init_mq(md->queue); return 0; out_tag_set: diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index bb950fc669ef..73750b2838d2 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -429,9 +429,8 @@ enum { struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, void *queuedata); -struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, - struct request_queue *q, - bool elevator_init); +int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, + struct request_queue *q); struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set, const struct blk_mq_ops *ops, unsigned int queue_depth, diff --git a/include/linux/elevator.h b/include/linux/elevator.h index dcb2f9022c1d..783ecb3cb77a 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -120,6 +120,7 @@ extern void elv_merged_request(struct request_queue *, struct request *, extern bool elv_attempt_insert_merge(struct request_queue *, struct request *); extern struct request *elv_former_request(struct request_queue *, struct request *); extern struct request *elv_latter_request(struct request_queue *, struct request *); +void elevator_init_mq(struct request_queue *q); /* * io scheduler registration -- cgit v1.2.3-70-g09d2 From b461dfc49eb6fbabc60b9dad476e787ada56b7b4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Jun 2021 09:53:18 +0300 Subject: blk-mq: add the blk_mq_alloc_disk APIs Add a new API to allocate a gendisk including the request_queue for use with blk-mq based drivers. This is to avoid boilerplate code in drivers. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210602065345.355274-4-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 19 +++++++++++++++++++ include/linux/blk-mq.h | 12 ++++++++++++ 2 files changed, 31 insertions(+) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index 8550ad64982f..b123077a0dc4 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3137,6 +3137,25 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) } EXPORT_SYMBOL(blk_mq_init_queue); +struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata) +{ + struct request_queue *q; + struct gendisk *disk; + + q = blk_mq_init_queue_data(set, queuedata); + if (IS_ERR(q)) + return ERR_CAST(q); + + disk = __alloc_disk_node(0, set->numa_node); + if (!disk) { + blk_cleanup_queue(q); + return ERR_PTR(-ENOMEM); + } + disk->queue = q; + return disk; +} +EXPORT_SYMBOL(__blk_mq_alloc_disk); + /* * Helper for setting up a queue with mq ops, given queue depth, and * the passed in mq ops flags. 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 73750b2838d2..f496c6c5b5d2 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -426,6 +426,18 @@ enum { ((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \ << BLK_MQ_F_ALLOC_POLICY_START_BIT) +#define blk_mq_alloc_disk(set, queuedata) \ +({ \ + static struct lock_class_key __key; \ + struct gendisk *__disk = __blk_mq_alloc_disk(set, queuedata); \ + \ + if (__disk) \ + lockdep_init_map(&__disk->lockdep_map, \ + "(bio completion)", &__key, 0); \ + __disk; \ +}) +struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, + void *queuedata); struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, void *queuedata); -- cgit v1.2.3-70-g09d2 From 08c1d480ed38995690a7d83f2c6a505f6cbbed9f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Jun 2021 09:53:30 +0300 Subject: blk-mq: remove blk_mq_init_sq_queue All users are gone now. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210602065345.355274-16-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 22 ---------------------- include/linux/blk-mq.h | 4 ---- 2 files changed, 26 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index b123077a0dc4..3115ea2d0990 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3156,28 +3156,6 @@ struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata) } EXPORT_SYMBOL(__blk_mq_alloc_disk); -/* - * Helper for setting up a queue with mq ops, given queue depth, and - * the passed in mq ops flags. - */ -struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set, - const struct blk_mq_ops *ops, - unsigned int queue_depth, - unsigned int set_flags) -{ - struct request_queue *q; - int ret; - - ret = blk_mq_alloc_sq_tag_set(set, ops, queue_depth, set_flags); - if (ret) - return ERR_PTR(ret); - q = blk_mq_init_queue(set); - if (IS_ERR(q)) - blk_mq_free_tag_set(set); - return q; -} -EXPORT_SYMBOL(blk_mq_init_sq_queue); - static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( struct blk_mq_tag_set *set, struct request_queue *q, int hctx_idx, int node) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index f496c6c5b5d2..02a4aab0aeac 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -443,10 +443,6 @@ struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, void *queuedata); int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q); -struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set, - const struct blk_mq_ops *ops, - unsigned int queue_depth, - unsigned int set_flags); void blk_mq_unregister_dev(struct device *, struct request_queue *); int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set); -- cgit v1.2.3-70-g09d2 From cb9516be7708a2a18ec0a19fe3a225b5b3bc92c7 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 25 Jun 2021 10:02:48 +0800 Subject: blk-mq: update hctx->dispatch_busy in case of real scheduler Commit 6e6fcbc27e77 ("blk-mq: support batching dispatch in case of io") starts to support io batching submission by using hctx->dispatch_busy. However, blk_mq_update_dispatch_busy() isn't changed to update hctx->dispatch_busy in that commit, so fix the issue by updating hctx->dispatch_busy in case of real scheduler. 
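For reference, the EWMA bookkeeping that now also runs for queues with an elevator looks roughly like this; lightly condensed from the mainline blk_mq_update_dispatch_busy() of that era, with the weight/factor constants as found there:

#define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT	8
#define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR	4

static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
{
	unsigned int ewma = hctx->dispatch_busy;

	if (!ewma && !busy)
		return;

	/* Exponentially weighted moving average of "dispatch saw BUSY". */
	ewma *= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1;
	if (busy)
		ewma += 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR;
	ewma /= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT;

	hctx->dispatch_busy = ewma;
}
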
Reported-by: Jan Kara
Reviewed-by: Jan Kara
Fixes: 6e6fcbc27e77 ("blk-mq: support batching dispatch in case of io")
Signed-off-by: Ming Lei
Link: https://lore.kernel.org/r/20210625020248.1630497-1-ming.lei@redhat.com
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'block/blk-mq.c')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3115ea2d0990..c2f3550337f7 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1224,9 +1224,6 @@ static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
 {
 	unsigned int ewma;
 
-	if (hctx->queue->elevator)
-		return;
-
 	ewma = hctx->dispatch_busy;
 
 	if (!ewma && !busy)
-- cgit v1.2.3-70-g09d2
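As a closing illustration of the allocation APIs introduced earlier in this series, a hypothetical driver probe built on blk_mq_alloc_disk() might look as follows, with the tag set assumed to have been allocated as in the earlier blk_mq_alloc_sq_tag_set() sketch; the example_* names, the major number handling and the capacity are assumptions, not taken from the patches:

static int example_add_disk(struct example_dev *dev)
{
	struct gendisk *disk;

	/* Allocates the request_queue and the gendisk in one call. */
	disk = blk_mq_alloc_disk(&dev->tag_set, dev);
	if (IS_ERR(disk))
		return PTR_ERR(disk);

	disk->major = example_major;		/* assumed: registered elsewhere */
	disk->first_minor = 0;
	disk->minors = 1;			/* no partition support in this sketch */
	disk->fops = &example_fops;
	disk->private_data = dev;
	snprintf(disk->disk_name, sizeof(disk->disk_name), "example0");
	set_capacity(disk, dev->nr_sectors);

	dev->disk = disk;
	add_disk(disk);
	return 0;
}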