From 3b7cb745473aec7255d66e3854abaa9c3f46f952 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 8 Jan 2024 11:50:16 -0700 Subject: block: move __get_task_ioprio() into header file We call this once per IO, which can be millions of times per second. Since nobody really uses io priorities, or at least it isn't very common, this is all wasted time and can amount to as much as 3% of the total kernel time. Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/ioprio.c | 26 -------------------------- 1 file changed, 26 deletions(-) (limited to 'block') diff --git a/block/ioprio.c b/block/ioprio.c index b5a942519a79..73301a261429 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -139,32 +139,6 @@ out: return ret; } -/* - * If the task has set an I/O priority, use that. Otherwise, return - * the default I/O priority. - * - * Expected to be called for current task or with task_lock() held to keep - * io_context stable. - */ -int __get_task_ioprio(struct task_struct *p) -{ - struct io_context *ioc = p->io_context; - int prio; - - if (p != current) - lockdep_assert_held(&p->alloc_lock); - if (ioc) - prio = ioc->ioprio; - else - prio = IOPRIO_DEFAULT; - - if (IOPRIO_PRIO_CLASS(prio) == IOPRIO_CLASS_NONE) - prio = IOPRIO_PRIO_VALUE(task_nice_ioclass(p), - task_nice_ioprio(p)); - return prio; -} -EXPORT_SYMBOL_GPL(__get_task_ioprio); - static int get_task_ioprio(struct task_struct *p) { int ret; -- cgit v1.2.3-70-g09d2 From 742e324a0679ce271f7475a40056bac6917a950c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 10 Jan 2024 08:29:53 -0700 Subject: block/iocost: silence warning on 'last_period' potentially being unused If CONFIG_TRACEPOINTS isn't enabled, we assign this variable but then never use it. This can cause the compiler to complain about that: block/blk-iocost.c:1264:6: warning: variable 'last_period' set but not used [-Wunused-but-set-variable] 1264 | u64 last_period, cur_period; | ^ Rather than add ifdefs to guard this, just mark it __maybe_unused. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202401102335.GiWdeIo9-lkp@intel.com/ Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-iocost.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 089fcb9cfce3..c8beec6d7df0 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -1261,7 +1261,7 @@ static void weight_updated(struct ioc_gq *iocg, struct ioc_now *now) static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; - u64 last_period, cur_period; + u64 __maybe_unused last_period, cur_period; u64 vtime, vtarget; int i; -- cgit v1.2.3-70-g09d2 From 748dc0b65ec2b4b7b3dbd7befcc4a54fdcac7988 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 10 Jan 2024 18:29:42 +0900 Subject: block: fix partial zone append completion handling in req_bio_endio() Partial completions of zone append request is not allowed but if a zone append completion indicates a number of completed bytes different from the original BIO size, only the BIO status is set to error. This leads to bio_advance() not setting the BIO size to 0 and thus to not call bio_endio() at the end of req_bio_endio(). Make sure a partially completed zone append is failed and completed immediately by forcing the completed number of bytes (nbytes) to be equal to the BIO size, thus ensuring that bio_endio() is called. Fixes: 297db731847e ("block: fix req_bio_endio append error handling") Cc: stable@kernel.vger.org Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240110092942.442334-1-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-mq.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index fb29ff5cc281..aa9a05fdd023 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -772,11 +772,16 @@ static void req_bio_endio(struct request *rq, struct bio *bio, /* * Partial zone append completions cannot be supported as the * BIO fragments may end up not being written sequentially. + * For such case, force the completed nbytes to be equal to + * the BIO size so that bio_advance() sets the BIO remaining + * size to 0 and we end up calling bio_endio() before returning. */ - if (bio->bi_iter.bi_size != nbytes) + if (bio->bi_iter.bi_size != nbytes) { bio->bi_status = BLK_STS_IOERR; - else + nbytes = bio->bi_iter.bi_size; + } else { bio->bi_iter.bi_sector = rq->__sector; + } } bio_advance(bio, nbytes); -- cgit v1.2.3-70-g09d2 From 5266caaf5660529e3da53004b8b7174cab6374ed Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 12 Jan 2024 20:26:26 +0800 Subject: blk-mq: fix IO hang from sbitmap wakeup race In blk_mq_mark_tag_wait(), __add_wait_queue() may be re-ordered with the following blk_mq_get_driver_tag() in case of getting driver tag failure. Then in __sbitmap_queue_wake_up(), waitqueue_active() may not observe the added waiter in blk_mq_mark_tag_wait() and wake up nothing, meantime blk_mq_mark_tag_wait() can't get driver tag successfully. This issue can be reproduced by running the following test in loop, and fio hang can be observed in < 30min when running it on my test VM in laptop. modprobe -r scsi_debug modprobe scsi_debug delay=0 dev_size_mb=4096 max_queue=1 host_max_queue=1 submit_queues=4 dev=`ls -d /sys/bus/pseudo/drivers/scsi_debug/adapter*/host*/target*/*/block/* | head -1 | xargs basename` fio --filename=/dev/"$dev" --direct=1 --rw=randrw --bs=4k --iodepth=1 \ --runtime=100 --numjobs=40 --time_based --name=test \ --ioengine=libaio Fix the issue by adding one explicit barrier in blk_mq_mark_tag_wait(), which is just fine in case of running out of tag. Cc: Jan Kara Cc: Kemeng Shi Reported-by: Changhui Zhong Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20240112122626.4181044-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index aa9a05fdd023..a4c54c5895a1 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1852,6 +1852,22 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, wait->flags &= ~WQ_FLAG_EXCLUSIVE; __add_wait_queue(wq, wait); + /* + * Add one explicit barrier since blk_mq_get_driver_tag() may + * not imply barrier in case of failure. + * + * Order adding us to wait queue and allocating driver tag. + * + * The pair is the one implied in sbitmap_queue_wake_up() which + * orders clearing sbitmap tag bits and waitqueue_active() in + * __sbitmap_queue_wake_up(), since waitqueue_active() is lockless + * + * Otherwise, re-order of adding wait queue and getting driver tag + * may cause __sbitmap_queue_wake_up() to wake up nothing because + * the waitqueue_active() may not observe us in wait queue. + */ + smp_mb(); + /* * It's possible that a tag was freed in the window between the * allocation failure and adding the hardware queue to the wait -- cgit v1.2.3-70-g09d2 From 25c1772a0493463408489b1fae65cf77fe46cac1 Mon Sep 17 00:00:00 2001 From: Christian Heusel Date: Fri, 12 Jan 2024 00:15:18 +0100 Subject: block: print symbolic error name instead of error code Utilize the %pe print specifier to get the symbolic error name as a string (i.e "-ENOMEM") in the log message instead of the error code to increase its readablility. This change was suggested in https://lore.kernel.org/all/92972476-0b1f-4d0a-9951-af3fc8bc6e65@suswa.mountain/ Signed-off-by: Christian Heusel Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20240111231521.1596838-1-christian@heusel.eu Signed-off-by: Jens Axboe --- block/partitions/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/partitions/core.c b/block/partitions/core.c index e6ac73617f3e..cab0d76a828e 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -562,8 +562,8 @@ static bool blk_add_partition(struct gendisk *disk, part = add_partition(disk, p, from, size, state->parts[p].flags, &state->parts[p].info); if (IS_ERR(part) && PTR_ERR(part) != -ENXIO) { - printk(KERN_ERR " %s: p%d could not be added: %ld\n", - disk->disk_name, p, -PTR_ERR(part)); + printk(KERN_ERR " %s: p%d could not be added: %pe\n", + disk->disk_name, p, part); return true; } -- cgit v1.2.3-70-g09d2 From 309ce6741430b5a7b5e85cd1a7569647f8d9dfa6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jan 2024 14:57:04 +0100 Subject: blk-mq: rename blk_mq_can_use_cached_rq blk_mq_can_use_cached_rq doesn't just check if we can use the request, but also performs the work to actually use it. Remove the _can in the naming, and improve the comment describing the function. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240111135705.2155518-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index a4c54c5895a1..f57b86d6de6a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2900,8 +2900,11 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, return NULL; } -/* return true if this @rq can be used for @bio */ -static bool blk_mq_can_use_cached_rq(struct request *rq, struct blk_plug *plug, +/* + * Check if we can use the passed on request for submitting the passed in bio, + * and remove it from the request list if it can be used. + */ +static bool blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug, struct bio *bio) { enum hctx_type type = blk_mq_get_hctx_type(bio->bi_opf); @@ -2979,7 +2982,7 @@ void blk_mq_submit_bio(struct bio *bio) return; if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) return; - if (blk_mq_can_use_cached_rq(rq, plug, bio)) + if (blk_mq_use_cached_rq(rq, plug, bio)) goto done; percpu_ref_get(&q->q_usage_counter); } else { -- cgit v1.2.3-70-g09d2 From 7b4f36cd22a65b750b4cb6ac14804fb7d6e6c67d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 12 Jan 2024 09:12:20 -0700 Subject: block: ensure we hold a queue reference when using queue limits q_usage_counter is the only thing preventing us from the limits changing under us in __bio_split_to_limits, but blk_mq_submit_bio doesn't hold it while calling into it. Move the splitting inside the region where we know we've got a queue reference. Ideally this could still remain a shared section of code, but let's keep the fix simple and defer any refactoring here to later. Reported-by: Christoph Hellwig Fixes: 900e08075202 ("block: move queue enter logic into blk_mq_submit_bio()") Reviewed-by: Christoph Hellwig Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index f57b86d6de6a..e02c4b1af8c5 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2964,12 +2964,6 @@ void blk_mq_submit_bio(struct bio *bio) blk_status_t ret; bio = blk_queue_bounce(bio, q); - if (bio_may_exceed_limits(bio, &q->limits)) { - bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); - if (!bio) - return; - } - bio_set_ioprio(bio); if (plug) { @@ -2978,6 +2972,11 @@ void blk_mq_submit_bio(struct bio *bio) rq = NULL; } if (rq) { + if (unlikely(bio_may_exceed_limits(bio, &q->limits))) { + bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); + if (!bio) + return; + } if (!bio_integrity_prep(bio)) return; if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) @@ -2988,6 +2987,11 @@ void blk_mq_submit_bio(struct bio *bio) } else { if (unlikely(bio_queue_enter(bio))) return; + if (unlikely(bio_may_exceed_limits(bio, &q->limits))) { + bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); + if (!bio) + goto fail; + } if (!bio_integrity_prep(bio)) goto fail; } -- cgit v1.2.3-70-g09d2 From 521277d12b5a75982d4f642d2ee22db8d7f986dd Mon Sep 17 00:00:00 2001 From: Nicky Chorley Date: Sun, 14 Jan 2024 19:10:56 +0000 Subject: block: Correct a documentation comment in blk-cgroup.c Commit 99e603874366 ("blk-cgroup: pass a gendisk to the blkg allocation helpers") changed blkg_alloc() to take a struct gendisk instead of a struct request_queue, but the documentation comment still referred to q. So, update that comment to refer to disk instead and fix a typo. Signed-off-by: Nicky Chorley Link: https://lore.kernel.org/r/20240114191056.6992-1-ndchorley@gmail.com Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index e303fd317313..ff93c385ba5a 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -300,7 +300,7 @@ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) * @disk: gendisk the new blkg is associated with * @gfp_mask: allocation mask to use * - * Allocate a new blkg assocating @blkcg and @q. + * Allocate a new blkg associating @blkcg and @disk. */ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk, gfp_t gfp_mask) -- cgit v1.2.3-70-g09d2 From be50df31c4e2a69f961a3bb759346d299eaa2b23 Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Tue, 16 Jan 2024 17:34:31 +0300 Subject: block: bio-integrity: fix kcalloc() arguments order When compiling with gcc version 14.0.1 20240116 (experimental) and W=1, I've noticed the following warning: block/bio-integrity.c: In function 'bio_integrity_map_user': block/bio-integrity.c:339:38: warning: 'kcalloc' sizes specified with 'sizeof' in the earlier argument and not in the later argument [-Wcalloc-transposed-args] 339 | bvec = kcalloc(sizeof(*bvec), nr_vecs, GFP_KERNEL); | ^ block/bio-integrity.c:339:38: note: earlier argument should specify number of elements, later size of each element Since 'n' and 'size' arguments of 'kcalloc()' are multiplied to calculate the final size, their actual order doesn't affect the result and so this is not a bug. But it's still worth to fix it. Fixes: 492c5d455969 ("block: bio-integrity: directly map user buffers") Signed-off-by: Dmitry Antipov Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20240116143437.89060-1-dmantipov@yandex.ru Signed-off-by: Jens Axboe --- block/bio-integrity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/bio-integrity.c b/block/bio-integrity.c index feef615e2c9c..c9a16fba58b9 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -336,7 +336,7 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes, if (nr_vecs > BIO_MAX_VECS) return -E2BIG; if (nr_vecs > UIO_FASTIOV) { - bvec = kcalloc(sizeof(*bvec), nr_vecs, GFP_KERNEL); + bvec = kcalloc(nr_vecs, sizeof(*bvec), GFP_KERNEL); if (!bvec) return -ENOMEM; pages = NULL; -- cgit v1.2.3-70-g09d2 From 49e60333d743ae32db3bdde2f93bc818482dd741 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 17 Jan 2024 12:36:09 -0800 Subject: blk-mq: Remove the hctx 'run' debugfs attribute Nobody uses the debugfs hctx 'run' attribute. Hence remove this attribute and also the code that updates the corresponding member variable. Suggested-by: Jens Axboe Cc: Gabriel Ryan Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20240117203609.4122520-1-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 18 ------------------ block/blk-mq-sched.c | 2 -- include/linux/blk-mq.h | 3 --- 3 files changed, 23 deletions(-) (limited to 'block') diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 5cbeb9344f2f..94668e72ab09 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -479,23 +479,6 @@ out: return res; } -static int hctx_run_show(void *data, struct seq_file *m) -{ - struct blk_mq_hw_ctx *hctx = data; - - seq_printf(m, "%lu\n", hctx->run); - return 0; -} - -static ssize_t hctx_run_write(void *data, const char __user *buf, size_t count, - loff_t *ppos) -{ - struct blk_mq_hw_ctx *hctx = data; - - hctx->run = 0; - return count; -} - static int hctx_active_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; @@ -624,7 +607,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { {"tags_bitmap", 0400, hctx_tags_bitmap_show}, {"sched_tags", 0400, hctx_sched_tags_show}, {"sched_tags_bitmap", 0400, hctx_sched_tags_bitmap_show}, - {"run", 0600, hctx_run_show, hctx_run_write}, {"active", 0400, hctx_active_show}, {"dispatch_busy", 0400, hctx_dispatch_busy_show}, {"type", 0400, hctx_type_show}, diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 67c95f31b15b..451a2c1f1f32 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -324,8 +324,6 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q))) return; - hctx->run++; - /* * A return of -EAGAIN is an indication that hctx->dispatch is not * empty and we must run again in order to avoid starving flushes. diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index a676e116085f..7a8150a5f051 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -391,9 +391,6 @@ struct blk_mq_hw_ctx { */ struct blk_mq_tags *sched_tags; - /** @run: Number of dispatched requests. */ - unsigned long run; - /** @numa_node: NUMA node the storage adapter has been connected to. */ unsigned int numa_node; /** @queue_num: Index of this hardware queue. */ -- cgit v1.2.3-70-g09d2