diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2023-04-26 12:52:58 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2023-04-26 12:52:58 -0700 |
commit | 9dd6956b38923dc1b7b349ca1eee3c0bb1f0163a (patch) | |
tree | c70bb7d65a50a51686378b6113a8663e0e60d9b8 /block | |
parent | 5b9a7bb72fddbc5247f56ede55d485fab7abdf92 (diff) | |
parent | 55793ea54d77719a071b1ccc05a05056e3b5e009 (diff) |
Merge tag 'for-6.4/block-2023-04-21' of git://git.kernel.dk/linux
Pull block updates from Jens Axboe:
- drbd patches, bringing us closer to unifying the out-of-tree version
and the in-tree one (Andreas, Christoph)
- support for auto-quiesce for the s390 dasd driver (Stefan)
- MD pull request via Song:
- md/bitmap: Optimal last page size (Jon Derrick)
- Various raid10 fixes (Yu Kuai, Li Nan)
- md: add error_handlers for raid0 and linear (Mariusz Tkaczyk)
- NVMe pull request via Christoph:
- Drop redundant pci_enable_pcie_error_reporting (Bjorn Helgaas)
- Validate nvmet module parameters (Chaitanya Kulkarni)
- Fence TCP socket on receive error (Chris Leech)
- Fix async event trace event (Keith Busch)
- Minor cleanups (Chaitanya Kulkarni, zhenwei pi)
- Fix and cleanup nvmet Identify handling (Damien Le Moal,
Christoph Hellwig)
- Fix double blk_mq_complete_request race in the timeout handler
(Lei Yin)
- Fix irq locking in nvme-fcloop (Ming Lei)
- Remove queue mapping helper for rdma devices (Sagi Grimberg)
- use structured request attribute checks for nbd (Jakub)
- fix blk-crypto race conditions between keyslot management (Eric)
- add sed-opal support for reading read locking range attributes
(Ondrej)
- make fault injection configurable for null_blk (Akinobu)
- clean up the request insertion API (Christoph)
- clean up the queue running API (Christoph)
- blkg config helper cleanups (Tejun)
- lazy init support for blk-iolatency (Tejun)
- various fixes and tweaks to ublk (Ming)
- remove hybrid polling. It hasn't really been useful since we got
async polled IO support, and these days we don't support sync polled
IO at all (Keith)
- misc fixes, cleanups, improvements (Zhong, Ondrej, Colin, Chengming,
Chaitanya, me)
* tag 'for-6.4/block-2023-04-21' of git://git.kernel.dk/linux: (118 commits)
nbd: fix incomplete validation of ioctl arg
ublk: don't return 0 in case of any failure
sed-opal: geometry feature reporting command
null_blk: Always check queue mode setting from configfs
block: ublk: switch to ioctl command encoding
blk-mq: fix the blk_mq_add_to_requeue_list call in blk_kick_flush
block, bfq: Fix division by zero error on zero wsum
fault-inject: fix build error when FAULT_INJECTION_CONFIGFS=y and CONFIGFS_FS=m
block: store bdev->bd_disk->fops->submit_bio state in bdev
block: re-arrange the struct block_device fields for better layout
md/raid5: remove unused working_disks variable
md/raid10: don't call bio_start_io_acct twice for bio which experienced read error
md/raid10: fix memleak of md thread
md/raid10: fix memleak for 'conf->bio_split'
md/raid10: fix leak of 'r10bio->remaining' for recovery
md/raid10: don't BUG_ON() in raise_barrier()
md: fix soft lockup in status_resync
md: add error_handlers for raid0 and linear
md: Use optimal I/O size for last bitmap page
md: Fix types in sb writer
...
Diffstat (limited to 'block')
40 files changed, 920 insertions, 1069 deletions
diff --git a/block/Kconfig b/block/Kconfig index 69ccf7457ae1..86122e459fe0 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -215,11 +215,6 @@ config BLK_MQ_VIRTIO depends on VIRTIO default y -config BLK_MQ_RDMA - bool - depends on INFINIBAND - default y - config BLK_PM def_bool PM diff --git a/block/Makefile b/block/Makefile index 4e01bb71ad6e..b31b05390749 100644 --- a/block/Makefile +++ b/block/Makefile @@ -30,7 +30,6 @@ obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o obj-$(CONFIG_BLK_DEV_INTEGRITY_T10) += t10-pi.o obj-$(CONFIG_BLK_MQ_PCI) += blk-mq-pci.o obj-$(CONFIG_BLK_MQ_VIRTIO) += blk-mq-virtio.o -obj-$(CONFIG_BLK_MQ_RDMA) += blk-mq-rdma.o obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o obj-$(CONFIG_BLK_WBT) += blk-wbt.o obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o diff --git a/block/bdev.c b/block/bdev.c index 1795c7d4b99e..850852fe4b78 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -419,6 +419,7 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) bdev->bd_inode = inode; bdev->bd_queue = disk->queue; bdev->bd_stats = alloc_percpu(struct disk_stats); + bdev->bd_has_submit_bio = false; if (!bdev->bd_stats) { iput(inode); return NULL; diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 89ffb3aa992c..2c90e5de0acd 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -497,15 +497,9 @@ static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) bgd = kzalloc(sizeof(*bgd), gfp); if (!bgd) return NULL; - return &bgd->pd; -} - -static void bfq_cpd_init(struct blkcg_policy_data *cpd) -{ - struct bfq_group_data *d = cpd_to_bfqgd(cpd); - d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ? 
- CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL; + bgd->weight = CGROUP_WEIGHT_DFL; + return &bgd->pd; } static void bfq_cpd_free(struct blkcg_policy_data *cpd) @@ -1111,9 +1105,11 @@ static ssize_t bfq_io_set_device_weight(struct kernfs_open_file *of, struct bfq_group *bfqg; u64 v; - ret = blkg_conf_prep(blkcg, &blkcg_policy_bfq, buf, &ctx); + blkg_conf_init(&ctx, buf); + + ret = blkg_conf_prep(blkcg, &blkcg_policy_bfq, &ctx); if (ret) - return ret; + goto out; if (sscanf(ctx.body, "%llu", &v) == 1) { /* require "default" on dfl */ @@ -1135,7 +1131,7 @@ static ssize_t bfq_io_set_device_weight(struct kernfs_open_file *of, ret = 0; } out: - blkg_conf_finish(&ctx); + blkg_conf_exit(&ctx); return ret ?: nbytes; } @@ -1301,8 +1297,6 @@ struct blkcg_policy blkcg_policy_bfq = { .legacy_cftypes = bfq_blkcg_legacy_files, .cpd_alloc_fn = bfq_cpd_alloc, - .cpd_init_fn = bfq_cpd_init, - .cpd_bind_fn = bfq_cpd_init, .cpd_free_fn = bfq_cpd_free, .pd_alloc_fn = bfq_pd_alloc, diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index d9ed3108c17a..3164e3177965 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -129,7 +129,6 @@ #include "elevator.h" #include "blk.h" #include "blk-mq.h" -#include "blk-mq-tag.h" #include "blk-mq-sched.h" #include "bfq-iosched.h" #include "blk-wbt.h" @@ -649,6 +648,8 @@ retry: sched_data->service_tree[i].wsum; } } + if (!wsum) + continue; limit = DIV_ROUND_CLOSEST(limit * entity->weight, wsum); if (entity->allocated >= limit) { bfq_log_bfqq(bfqq->bfqd, bfqq, @@ -6232,7 +6233,7 @@ static inline void bfq_update_insert_stats(struct request_queue *q, static struct bfq_queue *bfq_init_rq(struct request *rq); static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - bool at_head) + blk_insert_t flags) { struct request_queue *q = hctx->queue; struct bfq_data *bfqd = q->elevator->elevator_data; @@ -6255,11 +6256,10 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, trace_block_rq_insert(rq); - if 
(!bfqq || at_head) { - if (at_head) - list_add(&rq->queuelist, &bfqd->dispatch); - else - list_add_tail(&rq->queuelist, &bfqd->dispatch); + if (flags & BLK_MQ_INSERT_AT_HEAD) { + list_add(&rq->queuelist, &bfqd->dispatch); + } else if (!bfqq) { + list_add_tail(&rq->queuelist, &bfqd->dispatch); } else { idle_timer_disabled = __bfq_insert_request(bfqd, rq); /* @@ -6289,14 +6289,15 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, } static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, - struct list_head *list, bool at_head) + struct list_head *list, + blk_insert_t flags) { while (!list_empty(list)) { struct request *rq; rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); - bfq_insert_request(hctx, rq, at_head); + bfq_insert_request(hctx, rq, flags); } } diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 69aaee52285a..467e8cfc41a2 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -20,7 +20,6 @@ #define BFQ_DEFAULT_QUEUE_IOPRIO 4 -#define BFQ_WEIGHT_LEGACY_DFL 100 #define BFQ_DEFAULT_GRP_IOPRIO 0 #define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 18c922579719..ff45649361e7 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -33,7 +33,6 @@ #include "blk-cgroup.h" #include "blk-ioprio.h" #include "blk-throttle.h" -#include "blk-rq-qos.h" /* * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation. @@ -693,69 +692,93 @@ u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v) EXPORT_SYMBOL_GPL(__blkg_prfill_u64); /** - * blkcg_conf_open_bdev - parse and open bdev for per-blkg config update - * @inputp: input string pointer + * blkg_conf_init - initialize a blkg_conf_ctx + * @ctx: blkg_conf_ctx to initialize + * @input: input string * - * Parse the device node prefix part, MAJ:MIN, of per-blkg config update - * from @input and get and return the matching bdev. 
*@inputp is - * updated to point past the device node prefix. Returns an ERR_PTR() - * value on error. + * Initialize @ctx which can be used to parse blkg config input string @input. + * Once initialized, @ctx can be used with blkg_conf_open_bdev() and + * blkg_conf_prep(), and must be cleaned up with blkg_conf_exit(). + */ +void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input) +{ + *ctx = (struct blkg_conf_ctx){ .input = input }; +} +EXPORT_SYMBOL_GPL(blkg_conf_init); + +/** + * blkg_conf_open_bdev - parse and open bdev for per-blkg config update + * @ctx: blkg_conf_ctx initialized with blkg_conf_init() * - * Use this function iff blkg_conf_prep() can't be used for some reason. + * Parse the device node prefix part, MAJ:MIN, of per-blkg config update from + * @ctx->input and get and store the matching bdev in @ctx->bdev. @ctx->body is + * set to point past the device node prefix. + * + * This function may be called multiple times on @ctx and the extra calls become + * NOOPs. blkg_conf_prep() implicitly calls this function. Use this function + * explicitly if bdev access is needed without resolving the blkcg / policy part + * of @ctx->input. Returns -errno on error. 
*/ -struct block_device *blkcg_conf_open_bdev(char **inputp) +int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx) { - char *input = *inputp; + char *input = ctx->input; unsigned int major, minor; struct block_device *bdev; int key_len; + if (ctx->bdev) + return 0; + if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2) - return ERR_PTR(-EINVAL); + return -EINVAL; input += key_len; if (!isspace(*input)) - return ERR_PTR(-EINVAL); + return -EINVAL; input = skip_spaces(input); bdev = blkdev_get_no_open(MKDEV(major, minor)); if (!bdev) - return ERR_PTR(-ENODEV); + return -ENODEV; if (bdev_is_partition(bdev)) { blkdev_put_no_open(bdev); - return ERR_PTR(-ENODEV); + return -ENODEV; } - *inputp = input; - return bdev; + ctx->body = input; + ctx->bdev = bdev; + return 0; } /** * blkg_conf_prep - parse and prepare for per-blkg config update * @blkcg: target block cgroup * @pol: target policy - * @input: input string - * @ctx: blkg_conf_ctx to be filled + * @ctx: blkg_conf_ctx initialized with blkg_conf_init() + * + * Parse per-blkg config update from @ctx->input and initialize @ctx + * accordingly. On success, @ctx->body points to the part of @ctx->input + * following MAJ:MIN, @ctx->bdev points to the target block device and + * @ctx->blkg to the blkg being configured. * - * Parse per-blkg config update from @input and initialize @ctx with the - * result. @ctx->blkg points to the blkg to be updated and @ctx->body the - * part of @input following MAJ:MIN. This function returns with RCU read - * lock and queue lock held and must be paired with blkg_conf_finish(). + * blkg_conf_open_bdev() may be called on @ctx beforehand. On success, this + * function returns with queue lock held and must be followed by + * blkg_conf_exit(). 
*/ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, - char *input, struct blkg_conf_ctx *ctx) - __acquires(rcu) __acquires(&bdev->bd_queue->queue_lock) + struct blkg_conf_ctx *ctx) + __acquires(&bdev->bd_queue->queue_lock) { - struct block_device *bdev; struct gendisk *disk; struct request_queue *q; struct blkcg_gq *blkg; int ret; - bdev = blkcg_conf_open_bdev(&input); - if (IS_ERR(bdev)) - return PTR_ERR(bdev); - disk = bdev->bd_disk; + ret = blkg_conf_open_bdev(ctx); + if (ret) + return ret; + + disk = ctx->bdev->bd_disk; q = disk->queue; /* @@ -766,7 +789,6 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, if (ret) goto fail; - rcu_read_lock(); spin_lock_irq(&q->queue_lock); if (!blkcg_policy_enabled(q, pol)) { @@ -795,7 +817,6 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, /* Drop locks to do new blkg allocation with GFP_KERNEL. */ spin_unlock_irq(&q->queue_lock); - rcu_read_unlock(); new_blkg = blkg_alloc(pos, disk, GFP_KERNEL); if (unlikely(!new_blkg)) { @@ -809,7 +830,6 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, goto fail_exit_queue; } - rcu_read_lock(); spin_lock_irq(&q->queue_lock); if (!blkcg_policy_enabled(q, pol)) { @@ -836,20 +856,16 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, } success: blk_queue_exit(q); - ctx->bdev = bdev; ctx->blkg = blkg; - ctx->body = input; return 0; fail_preloaded: radix_tree_preload_end(); fail_unlock: spin_unlock_irq(&q->queue_lock); - rcu_read_unlock(); fail_exit_queue: blk_queue_exit(q); fail: - blkdev_put_no_open(bdev); /* * If queue was bypassing, we should retry. Do so after a * short msleep(). 
It isn't strictly necessary but queue @@ -865,20 +881,27 @@ fail: EXPORT_SYMBOL_GPL(blkg_conf_prep); /** - * blkg_conf_finish - finish up per-blkg config update - * @ctx: blkg_conf_ctx initialized by blkg_conf_prep() + * blkg_conf_exit - clean up per-blkg config update + * @ctx: blkg_conf_ctx initialized with blkg_conf_init() * - * Finish up after per-blkg config update. This function must be paired - * with blkg_conf_prep(). + * Clean up after per-blkg config update. This function must be called on all + * blkg_conf_ctx's initialized with blkg_conf_init(). */ -void blkg_conf_finish(struct blkg_conf_ctx *ctx) - __releases(&ctx->bdev->bd_queue->queue_lock) __releases(rcu) +void blkg_conf_exit(struct blkg_conf_ctx *ctx) + __releases(&ctx->bdev->bd_queue->queue_lock) { - spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock); - rcu_read_unlock(); - blkdev_put_no_open(ctx->bdev); + if (ctx->blkg) { + spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock); + ctx->blkg = NULL; + } + + if (ctx->bdev) { + blkdev_put_no_open(ctx->bdev); + ctx->body = NULL; + ctx->bdev = NULL; + } } -EXPORT_SYMBOL_GPL(blkg_conf_finish); +EXPORT_SYMBOL_GPL(blkg_conf_exit); static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src) { @@ -1289,8 +1312,6 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) blkcg->cpd[i] = cpd; cpd->blkcg = blkcg; cpd->plid = i; - if (pol->cpd_init_fn) - pol->cpd_init_fn(cpd); } spin_lock_init(&blkcg->lock); @@ -1368,14 +1389,8 @@ int blkcg_init_disk(struct gendisk *disk) if (ret) goto err_ioprio_exit; - ret = blk_iolatency_init(disk); - if (ret) - goto err_throtl_exit; - return 0; -err_throtl_exit: - blk_throtl_exit(disk); err_ioprio_exit: blk_ioprio_exit(disk); err_destroy_all: @@ -1391,30 +1406,9 @@ err_unlock: void blkcg_exit_disk(struct gendisk *disk) { blkg_destroy_all(disk); - rq_qos_exit(disk->queue); blk_throtl_exit(disk); } -static void blkcg_bind(struct cgroup_subsys_state *root_css) -{ - int i; - - 
mutex_lock(&blkcg_pol_mutex); - - for (i = 0; i < BLKCG_MAX_POLS; i++) { - struct blkcg_policy *pol = blkcg_policy[i]; - struct blkcg *blkcg; - - if (!pol || !pol->cpd_bind_fn) - continue; - - list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) - if (blkcg->cpd[pol->plid]) - pol->cpd_bind_fn(blkcg->cpd[pol->plid]); - } - mutex_unlock(&blkcg_pol_mutex); -} - static void blkcg_exit(struct task_struct *tsk) { if (tsk->throttle_disk) @@ -1428,7 +1422,6 @@ struct cgroup_subsys io_cgrp_subsys = { .css_offline = blkcg_css_offline, .css_free = blkcg_css_free, .css_rstat_flush = blkcg_rstat_flush, - .bind = blkcg_bind, .dfl_cftypes = blkcg_files, .legacy_cftypes = blkcg_legacy_files, .legacy_name = "blkio", @@ -1666,8 +1659,6 @@ int blkcg_policy_register(struct blkcg_policy *pol) blkcg->cpd[pol->plid] = cpd; cpd->blkcg = blkcg; cpd->plid = pol->plid; - if (pol->cpd_init_fn) - pol->cpd_init_fn(cpd); } } diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index e98d2c1be354..624c03c8fe64 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -174,9 +174,7 @@ struct blkcg_policy { /* operations */ blkcg_pol_alloc_cpd_fn *cpd_alloc_fn; - blkcg_pol_init_cpd_fn *cpd_init_fn; blkcg_pol_free_cpd_fn *cpd_free_fn; - blkcg_pol_bind_cpd_fn *cpd_bind_fn; blkcg_pol_alloc_pd_fn *pd_alloc_fn; blkcg_pol_init_pd_fn *pd_init_fn; @@ -209,15 +207,17 @@ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); struct blkg_conf_ctx { + char *input; + char *body; struct block_device *bdev; struct blkcg_gq *blkg; - char *body; }; -struct block_device *blkcg_conf_open_bdev(char **inputp); +void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input); +int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx); int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, - char *input, struct blkg_conf_ctx *ctx); -void blkg_conf_finish(struct blkg_conf_ctx *ctx); + struct blkg_conf_ctx *ctx); +void 
blkg_conf_exit(struct blkg_conf_ctx *ctx); /** * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg diff --git a/block/blk-core.c b/block/blk-core.c index 478978dcb2bd..00c74330fa92 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -263,13 +263,7 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head) static void blk_free_queue(struct request_queue *q) { - if (q->poll_stat) - blk_stat_remove_callback(q, q->poll_cb); - blk_stat_free_callback(q->poll_cb); - blk_free_queue_stats(q->stats); - kfree(q->poll_stat); - if (queue_is_mq(q)) blk_mq_release(q); @@ -593,14 +587,14 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q, static void __submit_bio(struct bio *bio) { - struct gendisk *disk = bio->bi_bdev->bd_disk; - if (unlikely(!blk_crypto_bio_prep(&bio))) return; - if (!disk->fops->submit_bio) { + if (!bio->bi_bdev->bd_has_submit_bio) { blk_mq_submit_bio(bio); } else if (likely(bio_queue_enter(bio) == 0)) { + struct gendisk *disk = bio->bi_bdev->bd_disk; + disk->fops->submit_bio(bio); blk_queue_exit(disk->queue); } @@ -704,7 +698,7 @@ void submit_bio_noacct_nocheck(struct bio *bio) */ if (current->bio_list) bio_list_add(&current->bio_list[0], bio); - else if (!bio->bi_bdev->bd_disk->fops->submit_bio) + else if (!bio->bi_bdev->bd_has_submit_bio) __submit_bio_noacct_mq(bio); else __submit_bio_noacct(bio); diff --git a/block/blk-crypto-internal.h b/block/blk-crypto-internal.h index a8cdaf26851e..93a141979694 100644 --- a/block/blk-crypto-internal.h +++ b/block/blk-crypto-internal.h @@ -65,6 +65,11 @@ static inline bool blk_crypto_rq_is_encrypted(struct request *rq) return rq->crypt_ctx; } +static inline bool blk_crypto_rq_has_keyslot(struct request *rq) +{ + return rq->crypt_keyslot; +} + blk_status_t blk_crypto_get_keyslot(struct blk_crypto_profile *profile, const struct blk_crypto_key *key, struct blk_crypto_keyslot **slot_ptr); @@ -119,6 +124,11 @@ static inline bool blk_crypto_rq_is_encrypted(struct request *rq) 
return false; } +static inline bool blk_crypto_rq_has_keyslot(struct request *rq) +{ + return false; +} + #endif /* CONFIG_BLK_INLINE_ENCRYPTION */ void __bio_crypt_advance(struct bio *bio, unsigned int bytes); @@ -153,14 +163,21 @@ static inline bool blk_crypto_bio_prep(struct bio **bio_ptr) return true; } -blk_status_t __blk_crypto_init_request(struct request *rq); -static inline blk_status_t blk_crypto_init_request(struct request *rq) +blk_status_t __blk_crypto_rq_get_keyslot(struct request *rq); +static inline blk_status_t blk_crypto_rq_get_keyslot(struct request *rq) { if (blk_crypto_rq_is_encrypted(rq)) - return __blk_crypto_init_request(rq); + return __blk_crypto_rq_get_keyslot(rq); return BLK_STS_OK; } +void __blk_crypto_rq_put_keyslot(struct request *rq); +static inline void blk_crypto_rq_put_keyslot(struct request *rq) +{ + if (blk_crypto_rq_has_keyslot(rq)) + __blk_crypto_rq_put_keyslot(rq); +} + void __blk_crypto_free_request(struct request *rq); static inline void blk_crypto_free_request(struct request *rq) { @@ -188,21 +205,6 @@ static inline int blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, return 0; } -/** - * blk_crypto_insert_cloned_request - Prepare a cloned request to be inserted - * into a request queue. - * @rq: the request being queued - * - * Return: BLK_STS_OK on success, nonzero on error. 
- */ -static inline blk_status_t blk_crypto_insert_cloned_request(struct request *rq) -{ - - if (blk_crypto_rq_is_encrypted(rq)) - return blk_crypto_init_request(rq); - return BLK_STS_OK; -} - #ifdef CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num); diff --git a/block/blk-crypto-profile.c b/block/blk-crypto-profile.c index 0307fb0d95d3..2a67d3fb63e5 100644 --- a/block/blk-crypto-profile.c +++ b/block/blk-crypto-profile.c @@ -227,14 +227,13 @@ EXPORT_SYMBOL_GPL(blk_crypto_keyslot_index); * @profile: the crypto profile of the device the key will be used on * @key: the key that will be used * @slot_ptr: If a keyslot is allocated, an opaque pointer to the keyslot struct - * will be stored here; otherwise NULL will be stored here. + * will be stored here. blk_crypto_put_keyslot() must be called + * later to release it. Otherwise, NULL will be stored here. * * If the device has keyslots, this gets a keyslot that's been programmed with * the specified key. If the key is already in a slot, this reuses it; * otherwise this waits for a slot to become idle and programs the key into it. * - * This must be paired with a call to blk_crypto_put_keyslot(). - * * Context: Process context. Takes and releases profile->lock. * Return: BLK_STS_OK on success, meaning that either a keyslot was allocated or * one wasn't needed; or a blk_status_t error on failure. @@ -312,20 +311,15 @@ success: /** * blk_crypto_put_keyslot() - Release a reference to a keyslot - * @slot: The keyslot to release the reference of (may be NULL). + * @slot: The keyslot to release the reference of * * Context: Any context. 
*/ void blk_crypto_put_keyslot(struct blk_crypto_keyslot *slot) { - struct blk_crypto_profile *profile; + struct blk_crypto_profile *profile = slot->profile; unsigned long flags; - if (!slot) - return; - - profile = slot->profile; - if (atomic_dec_and_lock_irqsave(&slot->slot_refs, &profile->idle_slots_lock, flags)) { list_add_tail(&slot->idle_slot_node, &profile->idle_slots); @@ -354,28 +348,16 @@ bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile, return true; } -/** - * __blk_crypto_evict_key() - Evict a key from a device. - * @profile: the crypto profile of the device - * @key: the key to evict. It must not still be used in any I/O. - * - * If the device has keyslots, this finds the keyslot (if any) that contains the - * specified key and calls the driver's keyslot_evict function to evict it. - * - * Otherwise, this just calls the driver's keyslot_evict function if it is - * implemented, passing just the key (without any particular keyslot). This - * allows layered devices to evict the key from their underlying devices. - * - * Context: Process context. Takes and releases profile->lock. - * Return: 0 on success or if there's no keyslot with the specified key, -EBUSY - * if the keyslot is still in use, or another -errno value on other - * error. +/* + * This is an internal function that evicts a key from an inline encryption + * device that can be either a real device or the blk-crypto-fallback "device". + * It is used only by blk_crypto_evict_key(); see that function for details. 
*/ int __blk_crypto_evict_key(struct blk_crypto_profile *profile, const struct blk_crypto_key *key) { struct blk_crypto_keyslot *slot; - int err = 0; + int err; if (profile->num_slots == 0) { if (profile->ll_ops.keyslot_evict) { @@ -389,22 +371,30 @@ int __blk_crypto_evict_key(struct blk_crypto_profile *profile, blk_crypto_hw_enter(profile); slot = blk_crypto_find_keyslot(profile, key); - if (!slot) - goto out_unlock; + if (!slot) { + /* + * Not an error, since a key not in use by I/O is not guaranteed + * to be in a keyslot. There can be more keys than keyslots. + */ + err = 0; + goto out; + } if (WARN_ON_ONCE(atomic_read(&slot->slot_refs) != 0)) { + /* BUG: key is still in use by I/O */ err = -EBUSY; - goto out_unlock; + goto out_remove; } err = profile->ll_ops.keyslot_evict(profile, key, blk_crypto_keyslot_index(slot)); - if (err) - goto out_unlock; - +out_remove: + /* + * Callers free the key even on error, so unlink the key from the hash + * table and clear slot->key even on error. + */ hlist_del(&slot->hash_node); slot->key = NULL; - err = 0; -out_unlock: +out: blk_crypto_hw_exit(profile); return err; } diff --git a/block/blk-crypto.c b/block/blk-crypto.c index 45378586151f..4d760b092deb 100644 --- a/block/blk-crypto.c +++ b/block/blk-crypto.c @@ -13,6 +13,7 @@ #include <linux/blkdev.h> #include <linux/blk-crypto-profile.h> #include <linux/module.h> +#include <linux/ratelimit.h> #include <linux/slab.h> #include "blk-crypto-internal.h" @@ -224,27 +225,27 @@ static bool bio_crypt_check_alignment(struct bio *bio) return true; } -blk_status_t __blk_crypto_init_request(struct request *rq) +blk_status_t __blk_crypto_rq_get_keyslot(struct request *rq) { return blk_crypto_get_keyslot(rq->q->crypto_profile, rq->crypt_ctx->bc_key, &rq->crypt_keyslot); } -/** - * __blk_crypto_free_request - Uninitialize the crypto fields of a request. - * - * @rq: The request whose crypto fields to uninitialize. - * - * Completely uninitializes the crypto fields of a request. 
If a keyslot has - * been programmed into some inline encryption hardware, that keyslot is - * released. The rq->crypt_ctx is also freed. - */ -void __blk_crypto_free_request(struct request *rq) +void __blk_crypto_rq_put_keyslot(struct request *rq) { blk_crypto_put_keyslot(rq->crypt_keyslot); + rq->crypt_keyslot = NULL; +} + +void __blk_crypto_free_request(struct request *rq) +{ + /* The keyslot, if one was needed, should have been released earlier. */ + if (WARN_ON_ONCE(rq->crypt_keyslot)) + __blk_crypto_rq_put_keyslot(rq); + mempool_free(rq->crypt_ctx, bio_crypt_ctx_pool); - blk_crypto_rq_set_defaults(rq); + rq->crypt_ctx = NULL; } /** @@ -399,30 +400,39 @@ int blk_crypto_start_using_key(struct block_device *bdev, } /** - * blk_crypto_evict_key() - Evict a key from any inline encryption hardware - * it may have been programmed into - * @bdev: The block_device who's associated inline encryption hardware this key - * might have been programmed into - * @key: The key to evict + * blk_crypto_evict_key() - Evict a blk_crypto_key from a block_device + * @bdev: a block_device on which I/O using the key may have been done + * @key: the key to evict + * + * For a given block_device, this function removes the given blk_crypto_key from + * the keyslot management structures and evicts it from any underlying hardware + * keyslot(s) or blk-crypto-fallback keyslot it may have been programmed into. * - * Upper layers (filesystems) must call this function to ensure that a key is - * evicted from any hardware that it might have been programmed into. The key - * must not be in use by any in-flight IO when this function is called. + * Upper layers must call this before freeing the blk_crypto_key. It must be + * called for every block_device the key may have been used on. The key must no + * longer be in use by any I/O when this function is called. * - * Return: 0 on success or if the key wasn't in any keyslot; -errno on error. + * Context: May sleep. 
*/ -int blk_crypto_evict_key(struct block_device *bdev, - const struct blk_crypto_key *key) +void blk_crypto_evict_key(struct block_device *bdev, + const struct blk_crypto_key *key) { struct request_queue *q = bdev_get_queue(bdev); + int err; if (blk_crypto_config_supported_natively(bdev, &key->crypto_cfg)) - return __blk_crypto_evict_key(q->crypto_profile, key); - + err = __blk_crypto_evict_key(q->crypto_profile, key); + else + err = blk_crypto_fallback_evict_key(key); /* - * If the block_device didn't support the key, then blk-crypto-fallback - * may have been used, so try to evict the key from blk-crypto-fallback. + * An error can only occur here if the key failed to be evicted from a + * keyslot (due to a hardware or driver issue) or is allegedly still in + * use by I/O (due to a kernel bug). Even in these cases, the key is + * still unlinked from the keyslot management structures, and the caller + * is allowed and expected to free it right away. There's nothing + * callers can do to handle errors, so just log them and return void. 
*/ - return blk_crypto_fallback_evict_key(key); + if (err) + pr_warn_ratelimited("%pg: error %d evicting key\n", bdev, err); } EXPORT_SYMBOL_GPL(blk_crypto_evict_key); diff --git a/block/blk-flush.c b/block/blk-flush.c index 53202eff545e..04698ed9bcd4 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -68,12 +68,10 @@ #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/gfp.h> -#include <linux/blk-mq.h> #include <linux/part_stat.h> #include "blk.h" #include "blk-mq.h" -#include "blk-mq-tag.h" #include "blk-mq-sched.h" /* PREFLUSH/FUA sequences */ @@ -138,11 +136,6 @@ static void blk_flush_restore_request(struct request *rq) rq->end_io = rq->flush.saved_end_io; } -static void blk_flush_queue_rq(struct request *rq, bool add_front) -{ - blk_mq_add_to_requeue_list(rq, add_front, true); -} - static void blk_account_io_flush(struct request *rq) { struct block_device *part = rq->q->disk->part0; @@ -195,7 +188,8 @@ static void blk_flush_complete_seq(struct request *rq, case REQ_FSEQ_DATA: list_move_tail(&rq->flush.list, &fq->flush_data_in_flight); - blk_flush_queue_rq(rq, true); + blk_mq_add_to_requeue_list(rq, BLK_MQ_INSERT_AT_HEAD); + blk_mq_kick_requeue_list(q); break; case REQ_FSEQ_DONE: @@ -352,7 +346,8 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, smp_wmb(); req_ref_set(flush_rq, 1); - blk_flush_queue_rq(flush_rq, false); + blk_mq_add_to_requeue_list(flush_rq, 0); + blk_mq_kick_requeue_list(q); } static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq, @@ -396,6 +391,7 @@ void blk_insert_flush(struct request *rq) unsigned long fflags = q->queue_flags; /* may change, cache */ unsigned int policy = blk_flush_policy(fflags, rq); struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx); + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; /* * @policy now records what operations need to be done. 
Adjust @@ -432,7 +428,8 @@ void blk_insert_flush(struct request *rq) */ if ((policy & REQ_FSEQ_DATA) && !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { - blk_mq_request_bypass_insert(rq, false, true); + blk_mq_request_bypass_insert(rq, 0); + blk_mq_run_hw_queue(hctx, false); return; } diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 4442c7a85112..285ced3467ab 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -3106,9 +3106,11 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, return nbytes; } - ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx); + blkg_conf_init(&ctx, buf); + + ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, &ctx); if (ret) - return ret; + goto err; iocg = blkg_to_iocg(ctx.blkg); @@ -3127,12 +3129,14 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, weight_updated(iocg, &now); spin_unlock(&iocg->ioc->lock); - blkg_conf_finish(&ctx); + blkg_conf_exit(&ctx); return nbytes; einval: - blkg_conf_finish(&ctx); - return -EINVAL; + ret = -EINVAL; +err: + blkg_conf_exit(&ctx); + return ret; } static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd, @@ -3189,19 +3193,22 @@ static const match_table_t qos_tokens = { static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, size_t nbytes, loff_t off) { - struct block_device *bdev; + struct blkg_conf_ctx ctx; struct gendisk *disk; struct ioc *ioc; u32 qos[NR_QOS_PARAMS]; bool enable, user; - char *p; + char *body, *p; int ret; - bdev = blkcg_conf_open_bdev(&input); - if (IS_ERR(bdev)) - return PTR_ERR(bdev); + blkg_conf_init(&ctx, input); - disk = bdev->bd_disk; + ret = blkg_conf_open_bdev(&ctx); + if (ret) + goto err; + + body = ctx.body; + disk = ctx.bdev->bd_disk; if (!queue_is_mq(disk->queue)) { ret = -EOPNOTSUPP; goto err; @@ -3223,7 +3230,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, enable = ioc->enabled; user = ioc->user_qos_params; - while ((p = 
strsep(&input, " \t\n"))) { + while ((p = strsep(&body, " \t\n"))) { substring_t args[MAX_OPT_ARGS]; char buf[32]; int tok; @@ -3313,7 +3320,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, blk_mq_unquiesce_queue(disk->queue); blk_mq_unfreeze_queue(disk->queue); - blkdev_put_no_open(bdev); + blkg_conf_exit(&ctx); return nbytes; einval: spin_unlock_irq(&ioc->lock); @@ -3323,7 +3330,7 @@ einval: ret = -EINVAL; err: - blkdev_put_no_open(bdev); + blkg_conf_exit(&ctx); return ret; } @@ -3376,19 +3383,22 @@ static const match_table_t i_lcoef_tokens = { static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, size_t nbytes, loff_t off) { - struct block_device *bdev; + struct blkg_conf_ctx ctx; struct request_queue *q; struct ioc *ioc; u64 u[NR_I_LCOEFS]; bool user; - char *p; + char *body, *p; int ret; - bdev = blkcg_conf_open_bdev(&input); - if (IS_ERR(bdev)) - return PTR_ERR(bdev); + blkg_conf_init(&ctx, input); + + ret = blkg_conf_open_bdev(&ctx); + if (ret) + goto err; - q = bdev_get_queue(bdev); + body = ctx.body; + q = bdev_get_queue(ctx.bdev); if (!queue_is_mq(q)) { ret = -EOPNOTSUPP; goto err; @@ -3396,7 +3406,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, ioc = q_to_ioc(q); if (!ioc) { - ret = blk_iocost_init(bdev->bd_disk); + ret = blk_iocost_init(ctx.bdev->bd_disk); if (ret) goto err; ioc = q_to_ioc(q); @@ -3409,7 +3419,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, memcpy(u, ioc->params.i_lcoefs, sizeof(u)); user = ioc->user_cost_model; - while ((p = strsep(&input, " \t\n"))) { + while ((p = strsep(&body, " \t\n"))) { substring_t args[MAX_OPT_ARGS]; char buf[32]; int tok; @@ -3456,7 +3466,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q); - blkdev_put_no_open(bdev); + blkg_conf_exit(&ctx); return nbytes; einval: @@ -3467,7 +3477,7 @@ einval: ret = -EINVAL; 
err: - blkdev_put_no_open(bdev); + blkg_conf_exit(&ctx); return ret; } diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 0dc910568b31..fd5fec989e39 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -755,7 +755,7 @@ static void blkiolatency_enable_work_fn(struct work_struct *work) } } -int blk_iolatency_init(struct gendisk *disk) +static int blk_iolatency_init(struct gendisk *disk) { struct blk_iolatency *blkiolat; int ret; @@ -824,6 +824,29 @@ static void iolatency_clear_scaling(struct blkcg_gq *blkg) } } +static int blk_iolatency_try_init(struct blkg_conf_ctx *ctx) +{ + static DEFINE_MUTEX(init_mutex); + int ret; + + ret = blkg_conf_open_bdev(ctx); + if (ret) + return ret; + + /* + * blk_iolatency_init() may fail after rq_qos_add() succeeds which can + * confuse iolat_rq_qos() test. Make the test and init atomic. + */ + mutex_lock(&init_mutex); + + if (!iolat_rq_qos(ctx->bdev->bd_queue)) + ret = blk_iolatency_init(ctx->bdev->bd_disk); + + mutex_unlock(&init_mutex); + + return ret; +} + static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { @@ -836,9 +859,15 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, u64 oldval; int ret; - ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, buf, &ctx); + blkg_conf_init(&ctx, buf); + + ret = blk_iolatency_try_init(&ctx); if (ret) - return ret; + goto out; + + ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, &ctx); + if (ret) + goto out; iolat = blkg_to_lat(ctx.blkg); p = ctx.body; @@ -874,7 +903,7 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, iolatency_clear_scaling(blkg); ret = 0; out: - blkg_conf_finish(&ctx); + blkg_conf_exit(&ctx); return ret ?: nbytes; } @@ -967,7 +996,7 @@ static void iolatency_pd_init(struct blkg_policy_data *pd) { struct iolatency_grp *iolat = pd_to_lat(pd); struct blkcg_gq *blkg = lat_to_blkg(iolat); - struct rq_qos *rqos = blkcg_rq_qos(blkg->q); + struct 
rq_qos *rqos = iolat_rq_qos(blkg->q); struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); u64 now = ktime_to_ns(ktime_get()); int cpu; diff --git a/block/blk-merge.c b/block/blk-merge.c index 6460abdb2426..65e75efa9bd3 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -867,6 +867,8 @@ static struct request *attempt_merge(struct request_queue *q, if (!blk_discard_mergable(req)) elv_merge_requests(q, req, next); + blk_crypto_rq_put_keyslot(next); + /* * 'next' is going away, so update stats accordingly */ diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index 0c612c19feb8..9638b25fd521 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -12,7 +12,6 @@ #include <linux/cpu.h> #include <linux/group_cpus.h> -#include <linux/blk-mq.h> #include "blk.h" #include "blk-mq.h" diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index b01818f8e216..d23a8554ec4a 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -7,41 +7,14 @@ #include <linux/blkdev.h> #include <linux/debugfs.h> -#include <linux/blk-mq.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-mq-sched.h" -#include "blk-mq-tag.h" #include "blk-rq-qos.h" -static void print_stat(struct seq_file *m, struct blk_rq_stat *stat) -{ - if (stat->nr_samples) { - seq_printf(m, "samples=%d, mean=%llu, min=%llu, max=%llu", - stat->nr_samples, stat->mean, stat->min, stat->max); - } else { - seq_puts(m, "samples=0"); - } -} - static int queue_poll_stat_show(void *data, struct seq_file *m) { - struct request_queue *q = data; - int bucket; - - if (!q->poll_stat) - return 0; - - for (bucket = 0; bucket < (BLK_MQ_POLL_STATS_BKTS / 2); bucket++) { - seq_printf(m, "read (%d Bytes): ", 1 << (9 + bucket)); - print_stat(m, &q->poll_stat[2 * bucket]); - seq_puts(m, "\n"); - - seq_printf(m, "write (%d Bytes): ", 1 << (9 + bucket)); - print_stat(m, &q->poll_stat[2 * bucket + 1]); - seq_puts(m, "\n"); - } return 0; } @@ -282,7 +255,6 @@ static const char 
*const rqf_name[] = { RQF_NAME(STATS), RQF_NAME(SPECIAL_PAYLOAD), RQF_NAME(ZONE_WRITE_LOCKED), - RQF_NAME(MQ_POLL_SLEPT), RQF_NAME(TIMED_OUT), RQF_NAME(ELV), RQF_NAME(RESV), diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c index a90b88fd1332..d47b5c73c9eb 100644 --- a/block/blk-mq-pci.c +++ b/block/blk-mq-pci.c @@ -4,7 +4,6 @@ */ #include <linux/kobject.h> #include <linux/blkdev.h> -#include <linux/blk-mq.h> #include <linux/blk-mq-pci.h> #include <linux/pci.h> #include <linux/module.h> diff --git a/block/blk-mq-rdma.c b/block/blk-mq-rdma.c deleted file mode 100644 index 29c1f4d6eb04..000000000000 --- a/block/blk-mq-rdma.c +++ /dev/null @@ -1,44 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (c) 2017 Sagi Grimberg. - */ -#include <linux/blk-mq.h> -#include <linux/blk-mq-rdma.h> -#include <rdma/ib_verbs.h> - -/** - * blk_mq_rdma_map_queues - provide a default queue mapping for rdma device - * @map: CPU to hardware queue map. - * @dev: rdma device to provide a mapping for. - * @first_vec: first interrupt vectors to use for queues (usually 0) - * - * This function assumes the rdma device @dev has at least as many available - * interrupt vetors as @set has queues. It will then query it's affinity mask - * and built queue mapping that maps a queue to the CPUs that have irq affinity - * for the corresponding vector. - * - * In case either the driver passed a @dev with less vectors than - * @set->nr_hw_queues, or @dev does not provide an affinity mask for a - * vector, we fallback to the naive mapping. 
- */ -void blk_mq_rdma_map_queues(struct blk_mq_queue_map *map, - struct ib_device *dev, int first_vec) -{ - const struct cpumask *mask; - unsigned int queue, cpu; - - for (queue = 0; queue < map->nr_queues; queue++) { - mask = ib_get_vector_affinity(dev, first_vec + queue); - if (!mask) - goto fallback; - - for_each_cpu(cpu, mask) - map->mq_map[cpu] = map->queue_offset + queue; - } - - return; - -fallback: - blk_mq_map_queues(map); -} -EXPORT_SYMBOL_GPL(blk_mq_rdma_map_queues); diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 06b312c69114..67c95f31b15b 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -6,7 +6,6 @@ */ #include <linux/kernel.h> #include <linux/module.h> -#include <linux/blk-mq.h> #include <linux/list_sort.h> #include <trace/events/block.h> @@ -15,7 +14,6 @@ #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-mq-sched.h" -#include "blk-mq-tag.h" #include "blk-wbt.h" /* @@ -271,9 +269,7 @@ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { - struct request_queue *q = hctx->queue; - const bool has_sched = q->elevator; - int ret = 0; + bool need_dispatch = false; LIST_HEAD(rq_list); /* @@ -302,23 +298,22 @@ static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) */ if (!list_empty(&rq_list)) { blk_mq_sched_mark_restart_hctx(hctx); - if (blk_mq_dispatch_rq_list(hctx, &rq_list, 0)) { - if (has_sched) - ret = blk_mq_do_dispatch_sched(hctx); - else - ret = blk_mq_do_dispatch_ctx(hctx); - } - } else if (has_sched) { - ret = blk_mq_do_dispatch_sched(hctx); - } else if (hctx->dispatch_busy) { - /* dequeue request one by one from sw queue if queue is busy */ - ret = blk_mq_do_dispatch_ctx(hctx); + if (!blk_mq_dispatch_rq_list(hctx, &rq_list, 0)) + return 0; + need_dispatch = true; } else { - blk_mq_flush_busy_ctxs(hctx, &rq_list); - blk_mq_dispatch_rq_list(hctx, &rq_list, 0); + need_dispatch = hctx->dispatch_busy; } - return 
ret; + if (hctx->queue->elevator) + return blk_mq_do_dispatch_sched(hctx); + + /* dequeue request one by one from sw queue if queue is busy */ + if (need_dispatch) + return blk_mq_do_dispatch_ctx(hctx); + blk_mq_flush_busy_ctxs(hctx, &rq_list); + blk_mq_dispatch_rq_list(hctx, &rq_list, 0); + return 0; } void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) @@ -384,116 +379,6 @@ bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq, } EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge); -static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, - struct request *rq) -{ - /* - * dispatch flush and passthrough rq directly - * - * passthrough request has to be added to hctx->dispatch directly. - * For some reason, device may be in one situation which can't - * handle FS request, so STS_RESOURCE is always returned and the - * FS request will be added to hctx->dispatch. However passthrough - * request may be required at that time for fixing the problem. If - * passthrough request is added to scheduler queue, there isn't any - * chance to dispatch it given we prioritize requests in hctx->dispatch. - */ - if ((rq->rq_flags & RQF_FLUSH_SEQ) || blk_rq_is_passthrough(rq)) - return true; - - return false; -} - -void blk_mq_sched_insert_request(struct request *rq, bool at_head, - bool run_queue, bool async) -{ - struct request_queue *q = rq->q; - struct elevator_queue *e = q->elevator; - struct blk_mq_ctx *ctx = rq->mq_ctx; - struct blk_mq_hw_ctx *hctx = rq->mq_hctx; - - WARN_ON(e && (rq->tag != BLK_MQ_NO_TAG)); - - if (blk_mq_sched_bypass_insert(hctx, rq)) { - /* - * Firstly normal IO request is inserted to scheduler queue or - * sw queue, meantime we add flush request to dispatch queue( - * hctx->dispatch) directly and there is at most one in-flight - * flush request for each hw queue, so it doesn't matter to add - * flush request to tail or front of the dispatch queue. 
- * - * Secondly in case of NCQ, flush request belongs to non-NCQ - * command, and queueing it will fail when there is any - * in-flight normal IO request(NCQ command). When adding flush - * rq to the front of hctx->dispatch, it is easier to introduce - * extra time to flush rq's latency because of S_SCHED_RESTART - * compared with adding to the tail of dispatch queue, then - * chance of flush merge is increased, and less flush requests - * will be issued to controller. It is observed that ~10% time - * is saved in blktests block/004 on disk attached to AHCI/NCQ - * drive when adding flush rq to the front of hctx->dispatch. - * - * Simply queue flush rq to the front of hctx->dispatch so that - * intensive flush workloads can benefit in case of NCQ HW. - */ - at_head = (rq->rq_flags & RQF_FLUSH_SEQ) ? true : at_head; - blk_mq_request_bypass_insert(rq, at_head, false); - goto run; - } - - if (e) { - LIST_HEAD(list); - - list_add(&rq->queuelist, &list); - e->type->ops.insert_requests(hctx, &list, at_head); - } else { - spin_lock(&ctx->lock); - __blk_mq_insert_request(hctx, rq, at_head); - spin_unlock(&ctx->lock); - } - -run: - if (run_queue) - blk_mq_run_hw_queue(hctx, async); -} - -void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx, - struct blk_mq_ctx *ctx, - struct list_head *list, bool run_queue_async) -{ - struct elevator_queue *e; - struct request_queue *q = hctx->queue; - - /* - * blk_mq_sched_insert_requests() is called from flush plug - * context only, and hold one usage counter to prevent queue - * from being released. - */ - percpu_ref_get(&q->q_usage_counter); - - e = hctx->queue->elevator; - if (e) { - e->type->ops.insert_requests(hctx, list, false); - } else { - /* - * try to issue requests directly if the hw queue isn't - * busy in case of 'none' scheduler, and this way may save - * us one extra enqueue & dequeue to sw queue. 
- */ - if (!hctx->dispatch_busy && !run_queue_async) { - blk_mq_run_dispatch_ops(hctx->queue, - blk_mq_try_issue_list_directly(hctx, list)); - if (list_empty(list)) - goto out; - } - blk_mq_insert_requests(hctx, ctx, list); - } - - blk_mq_run_hw_queue(hctx, run_queue_async); - out: - percpu_ref_put(&q->q_usage_counter); -} - static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 025013972453..7c3cbad17f30 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -4,7 +4,6 @@ #include "elevator.h" #include "blk-mq.h" -#include "blk-mq-tag.h" #define MAX_SCHED_RQ (16 * BLKDEV_DEFAULT_RQ) @@ -17,12 +16,6 @@ bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq, void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx); void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); -void blk_mq_sched_insert_request(struct request *rq, bool at_head, - bool run_queue, bool async); -void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx, - struct blk_mq_ctx *ctx, - struct list_head *list, bool run_queue_async); - void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e); diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 1b2b0d258e46..156e9bb07abf 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -10,10 +10,8 @@ #include <linux/workqueue.h> #include <linux/smp.h> -#include <linux/blk-mq.h> #include "blk.h" #include "blk-mq.h" -#include "blk-mq-tag.h" static void blk_mq_sysfs_release(struct kobject *kobj) { diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 9eb968e14d31..d6af9d431dc6 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -9,12 +9,10 @@ #include <linux/kernel.h> #include <linux/module.h> -#include <linux/blk-mq.h> #include <linux/delay.h> #include "blk.h" #include "blk-mq.h" 
#include "blk-mq-sched.h" -#include "blk-mq-tag.h" /* * Recalculate wakeup batch when tag is shared by hctx. diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h deleted file mode 100644 index 91ff37e3b43d..000000000000 --- a/block/blk-mq-tag.h +++ /dev/null @@ -1,73 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef INT_BLK_MQ_TAG_H -#define INT_BLK_MQ_TAG_H - -struct blk_mq_alloc_data; - -extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, - unsigned int reserved_tags, - int node, int alloc_policy); -extern void blk_mq_free_tags(struct blk_mq_tags *tags); -extern int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags, - struct sbitmap_queue *breserved_tags, - unsigned int queue_depth, - unsigned int reserved, - int node, int alloc_policy); - -extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); -unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags, - unsigned int *offset); -extern void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, - unsigned int tag); -void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags); -extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, - struct blk_mq_tags **tags, - unsigned int depth, bool can_grow); -extern void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, - unsigned int size); -extern void blk_mq_tag_update_sched_shared_tags(struct request_queue *q); - -extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); -void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, - void *priv); -void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, - void *priv); - -static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt, - struct blk_mq_hw_ctx *hctx) -{ - if (!hctx) - return &bt->ws[0]; - return sbq_wait_ptr(bt, &hctx->wait_index); -} - -enum { - BLK_MQ_NO_TAG = -1U, - BLK_MQ_TAG_MIN = 1, - BLK_MQ_TAG_MAX = BLK_MQ_NO_TAG - 1, -}; - -extern void 
__blk_mq_tag_busy(struct blk_mq_hw_ctx *); -extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *); - -static inline void blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) -{ - if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) - __blk_mq_tag_busy(hctx); -} - -static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) -{ - if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) - return; - - __blk_mq_tag_idle(hctx); -} - -static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags, - unsigned int tag) -{ - return tag < tags->nr_reserved_tags; -} - -#endif diff --git a/block/blk-mq-virtio.c b/block/blk-mq-virtio.c index 6589f076a096..68d0945c0b08 100644 --- a/block/blk-mq-virtio.c +++ b/block/blk-mq-virtio.c @@ -3,7 +3,6 @@ * Copyright (c) 2016 Christoph Hellwig. */ #include <linux/device.h> -#include <linux/blk-mq.h> #include <linux/blk-mq-virtio.h> #include <linux/virtio_config.h> #include <linux/module.h> diff --git a/block/blk-mq.c b/block/blk-mq.c index 2831f78f86a0..f6dad0886a2f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -32,12 +32,10 @@ #include <trace/events/block.h> -#include <linux/blk-mq.h> #include <linux/t10-pi.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" -#include "blk-mq-tag.h" #include "blk-pm.h" #include "blk-stat.h" #include "blk-mq-sched.h" @@ -46,51 +44,19 @@ static DEFINE_PER_CPU(struct llist_head, blk_cpu_done); -static void blk_mq_poll_stats_start(struct request_queue *q); -static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); - -static int blk_mq_poll_stats_bkt(const struct request *rq) -{ - int ddir, sectors, bucket; - - ddir = rq_data_dir(rq); - sectors = blk_rq_stats_sectors(rq); - - bucket = ddir + 2 * ilog2(sectors); - - if (bucket < 0) - return -1; - else if (bucket >= BLK_MQ_POLL_STATS_BKTS) - return ddir + BLK_MQ_POLL_STATS_BKTS - 2; - - return bucket; -} - -#define BLK_QC_T_SHIFT 16 -#define BLK_QC_T_INTERNAL (1U << 31) +static void blk_mq_insert_request(struct request *rq, blk_insert_t flags); +static 
void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, + struct list_head *list); static inline struct blk_mq_hw_ctx *blk_qc_to_hctx(struct request_queue *q, blk_qc_t qc) { - return xa_load(&q->hctx_table, - (qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT); -} - -static inline struct request *blk_qc_to_rq(struct blk_mq_hw_ctx *hctx, - blk_qc_t qc) -{ - unsigned int tag = qc & ((1U << BLK_QC_T_SHIFT) - 1); - - if (qc & BLK_QC_T_INTERNAL) - return blk_mq_tag_to_rq(hctx->sched_tags, tag); - return blk_mq_tag_to_rq(hctx->tags, tag); + return xa_load(&q->hctx_table, qc); } static inline blk_qc_t blk_rq_to_qc(struct request *rq) { - return (rq->mq_hctx->queue_num << BLK_QC_T_SHIFT) | - (rq->tag != -1 ? - rq->tag : (rq->internal_tag | BLK_QC_T_INTERNAL)); + return rq->mq_hctx->queue_num; } /* @@ -840,6 +806,12 @@ static void blk_complete_request(struct request *req) req->q->integrity.profile->complete_fn(req, total_bytes); #endif + /* + * Upper layers may call blk_crypto_evict_key() anytime after the last + * bio_endio(). Therefore, the keyslot must be released before that. + */ + blk_crypto_rq_put_keyslot(req); + blk_account_io_completion(req, total_bytes); do { @@ -905,6 +877,13 @@ bool blk_update_request(struct request *req, blk_status_t error, req->q->integrity.profile->complete_fn(req, nr_bytes); #endif + /* + * Upper layers may call blk_crypto_evict_key() anytime after the last + * bio_endio(). Therefore, the keyslot must be released before that. 
+ */ + if (blk_crypto_rq_has_keyslot(req) && nr_bytes >= blk_rq_bytes(req)) + __blk_crypto_rq_put_keyslot(req); + if (unlikely(error && !blk_rq_is_passthrough(req) && !(req->rq_flags & RQF_QUIET)) && !test_bit(GD_DEAD, &req->q->disk->state)) { @@ -976,17 +955,6 @@ bool blk_update_request(struct request *req, blk_status_t error, } EXPORT_SYMBOL_GPL(blk_update_request); -static void __blk_account_io_done(struct request *req, u64 now) -{ - const int sgrp = op_stat_group(req_op(req)); - - part_stat_lock(); - update_io_ticks(req->part, jiffies, true); - part_stat_inc(req->part, ios[sgrp]); - part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns); - part_stat_unlock(); -} - static inline void blk_account_io_done(struct request *req, u64 now) { /* @@ -995,40 +963,41 @@ static inline void blk_account_io_done(struct request *req, u64 now) * containing request is enough. */ if (blk_do_io_stat(req) && req->part && - !(req->rq_flags & RQF_FLUSH_SEQ)) - __blk_account_io_done(req, now); -} - -static void __blk_account_io_start(struct request *rq) -{ - /* - * All non-passthrough requests are created from a bio with one - * exception: when a flush command that is part of a flush sequence - * generated by the state machine in blk-flush.c is cloned onto the - * lower device by dm-multipath we can get here without a bio. 
- */ - if (rq->bio) - rq->part = rq->bio->bi_bdev; - else - rq->part = rq->q->disk->part0; + !(req->rq_flags & RQF_FLUSH_SEQ)) { + const int sgrp = op_stat_group(req_op(req)); - part_stat_lock(); - update_io_ticks(rq->part, jiffies, false); - part_stat_unlock(); + part_stat_lock(); + update_io_ticks(req->part, jiffies, true); + part_stat_inc(req->part, ios[sgrp]); + part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns); + part_stat_unlock(); + } } static inline void blk_account_io_start(struct request *req) { - if (blk_do_io_stat(req)) - __blk_account_io_start(req); + if (blk_do_io_stat(req)) { + /* + * All non-passthrough requests are created from a bio with one + * exception: when a flush command that is part of a flush sequence + * generated by the state machine in blk-flush.c is cloned onto the + * lower device by dm-multipath we can get here without a bio. + */ + if (req->bio) + req->part = req->bio->bi_bdev; + else + req->part = req->q->disk->part0; + + part_stat_lock(); + update_io_ticks(req->part, jiffies, false); + part_stat_unlock(); + } } static inline void __blk_mq_end_request_acct(struct request *rq, u64 now) { - if (rq->rq_flags & RQF_STATS) { - blk_mq_poll_stats_start(rq->q); + if (rq->rq_flags & RQF_STATS) blk_stat_add(rq, now); - } blk_mq_sched_completed_request(rq, now); blk_account_io_done(rq, now); @@ -1322,6 +1291,8 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) */ void blk_execute_rq_nowait(struct request *rq, bool at_head) { + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; + WARN_ON(irqs_disabled()); WARN_ON(!blk_rq_is_passthrough(rq)); @@ -1332,10 +1303,13 @@ void blk_execute_rq_nowait(struct request *rq, bool at_head) * device, directly accessing the plug instead of using blk_mq_plug() * should not have any consequences. 
*/ - if (current->plug) + if (current->plug && !at_head) { blk_add_rq_to_plug(current->plug, rq); - else - blk_mq_sched_insert_request(rq, at_head, true, false); + return; + } + + blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0); + blk_mq_run_hw_queue(hctx, false); } EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); @@ -1383,6 +1357,7 @@ static void blk_rq_poll_completion(struct request *rq, struct completion *wait) */ blk_status_t blk_execute_rq(struct request *rq, bool at_head) { + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; struct blk_rq_wait wait = { .done = COMPLETION_INITIALIZER_ONSTACK(wait.done), }; @@ -1394,7 +1369,8 @@ blk_status_t blk_execute_rq(struct request *rq, bool at_head) rq->end_io = blk_end_sync_rq; blk_account_io_start(rq); - blk_mq_sched_insert_request(rq, at_head, true, false); + blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0); + blk_mq_run_hw_queue(hctx, false); if (blk_rq_is_poll(rq)) { blk_rq_poll_completion(rq, &wait.done); @@ -1434,12 +1410,17 @@ static void __blk_mq_requeue_request(struct request *rq) void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list) { + struct request_queue *q = rq->q; + __blk_mq_requeue_request(rq); /* this request will be re-inserted to io scheduler queue */ blk_mq_sched_requeue_request(rq); - blk_mq_add_to_requeue_list(rq, true, kick_requeue_list); + blk_mq_add_to_requeue_list(rq, BLK_MQ_INSERT_AT_HEAD); + + if (kick_requeue_list) + blk_mq_kick_requeue_list(q); } EXPORT_SYMBOL(blk_mq_requeue_request); @@ -1455,33 +1436,33 @@ static void blk_mq_requeue_work(struct work_struct *work) spin_unlock_irq(&q->requeue_lock); list_for_each_entry_safe(rq, next, &rq_list, queuelist) { - if (!(rq->rq_flags & (RQF_SOFTBARRIER | RQF_DONTPREP))) - continue; - - rq->rq_flags &= ~RQF_SOFTBARRIER; - list_del_init(&rq->queuelist); /* - * If RQF_DONTPREP, rq has contained some driver specific - * data, so insert it to hctx dispatch list to avoid any - * merge. 
+ * If RQF_DONTPREP ist set, the request has been started by the + * driver already and might have driver-specific data allocated + * already. Insert it into the hctx dispatch list to avoid + * block layer merges for the request. */ - if (rq->rq_flags & RQF_DONTPREP) - blk_mq_request_bypass_insert(rq, false, false); - else - blk_mq_sched_insert_request(rq, true, false, false); + if (rq->rq_flags & RQF_DONTPREP) { + rq->rq_flags &= ~RQF_SOFTBARRIER; + list_del_init(&rq->queuelist); + blk_mq_request_bypass_insert(rq, 0); + } else if (rq->rq_flags & RQF_SOFTBARRIER) { + rq->rq_flags &= ~RQF_SOFTBARRIER; + list_del_init(&rq->queuelist); + blk_mq_insert_request(rq, BLK_MQ_INSERT_AT_HEAD); + } } while (!list_empty(&rq_list)) { rq = list_entry(rq_list.next, struct request, queuelist); list_del_init(&rq->queuelist); - blk_mq_sched_insert_request(rq, false, false, false); + blk_mq_insert_request(rq, 0); } blk_mq_run_hw_queues(q, false); } -void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, - bool kick_requeue_list) +void blk_mq_add_to_requeue_list(struct request *rq, blk_insert_t insert_flags) { struct request_queue *q = rq->q; unsigned long flags; @@ -1493,16 +1474,13 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, BUG_ON(rq->rq_flags & RQF_SOFTBARRIER); spin_lock_irqsave(&q->requeue_lock, flags); - if (at_head) { + if (insert_flags & BLK_MQ_INSERT_AT_HEAD) { rq->rq_flags |= RQF_SOFTBARRIER; list_add(&rq->queuelist, &q->requeue_list); } else { list_add_tail(&rq->queuelist, &q->requeue_list); } spin_unlock_irqrestore(&q->requeue_lock, flags); - - if (kick_requeue_list) - blk_mq_kick_requeue_list(q); } void blk_mq_kick_requeue_list(struct request_queue *q) @@ -2158,24 +2136,6 @@ out: return true; } -/** - * __blk_mq_run_hw_queue - Run a hardware queue. - * @hctx: Pointer to the hardware queue to run. - * - * Send pending requests to the hardware. 
- */ -static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) -{ - /* - * We can't run the queue inline with ints disabled. Ensure that - * we catch bad users of this early. - */ - WARN_ON_ONCE(in_interrupt()); - - blk_mq_run_dispatch_ops(hctx->queue, - blk_mq_sched_dispatch_requests(hctx)); -} - static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx) { int cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask); @@ -2232,42 +2192,19 @@ select_cpu: } /** - * __blk_mq_delay_run_hw_queue - Run (or schedule to run) a hardware queue. + * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously. * @hctx: Pointer to the hardware queue to run. - * @async: If we want to run the queue asynchronously. * @msecs: Milliseconds of delay to wait before running the queue. * - * If !@async, try to run the queue now. Else, run the queue asynchronously and - * with a delay of @msecs. + * Run a hardware queue asynchronously with a delay of @msecs. */ -static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, - unsigned long msecs) +void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) { if (unlikely(blk_mq_hctx_stopped(hctx))) return; - - if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) { - if (cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) { - __blk_mq_run_hw_queue(hctx); - return; - } - } - kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work, msecs_to_jiffies(msecs)); } - -/** - * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously. - * @hctx: Pointer to the hardware queue to run. - * @msecs: Milliseconds of delay to wait before running the queue. - * - * Run a hardware queue asynchronously with a delay of @msecs. 
- */ -void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) -{ - __blk_mq_delay_run_hw_queue(hctx, true, msecs); -} EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); /** @@ -2284,6 +2221,11 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) bool need_run; /* + * We can't run the queue inline with interrupts disabled. + */ + WARN_ON_ONCE(!async && in_interrupt()); + + /* * When queue is quiesced, we may be switching io scheduler, or * updating nr_hw_queues, or other things, and we can't run queue * any more, even __blk_mq_hctx_has_pending() can't be called safely. @@ -2295,8 +2237,17 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) need_run = !blk_queue_quiesced(hctx->queue) && blk_mq_hctx_has_pending(hctx)); - if (need_run) - __blk_mq_delay_run_hw_queue(hctx, async, 0); + if (!need_run) + return; + + if (async || (hctx->flags & BLK_MQ_F_BLOCKING) || + !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) { + blk_mq_delay_run_hw_queue(hctx, 0); + return; + } + + blk_mq_run_dispatch_ops(hctx->queue, + blk_mq_sched_dispatch_requests(hctx)); } EXPORT_SYMBOL(blk_mq_run_hw_queue); @@ -2461,80 +2412,52 @@ EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); static void blk_mq_run_work_fn(struct work_struct *work) { - struct blk_mq_hw_ctx *hctx; - - hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work); - - /* - * If we are stopped, don't run the queue. 
- */ - if (blk_mq_hctx_stopped(hctx)) - return; - - __blk_mq_run_hw_queue(hctx); -} - -static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx, - struct request *rq, - bool at_head) -{ - struct blk_mq_ctx *ctx = rq->mq_ctx; - enum hctx_type type = hctx->type; - - lockdep_assert_held(&ctx->lock); - - trace_block_rq_insert(rq); - - if (at_head) - list_add(&rq->queuelist, &ctx->rq_lists[type]); - else - list_add_tail(&rq->queuelist, &ctx->rq_lists[type]); -} - -void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - bool at_head) -{ - struct blk_mq_ctx *ctx = rq->mq_ctx; - - lockdep_assert_held(&ctx->lock); + struct blk_mq_hw_ctx *hctx = + container_of(work, struct blk_mq_hw_ctx, run_work.work); - __blk_mq_insert_req_list(hctx, rq, at_head); - blk_mq_hctx_mark_pending(hctx, ctx); + blk_mq_run_dispatch_ops(hctx->queue, + blk_mq_sched_dispatch_requests(hctx)); } /** * blk_mq_request_bypass_insert - Insert a request at dispatch list. * @rq: Pointer to request to be inserted. - * @at_head: true if the request should be inserted at the head of the list. - * @run_queue: If we should run the hardware queue after inserting the request. + * @flags: BLK_MQ_INSERT_* * * Should only be used carefully, when the caller knows we want to * bypass a potential IO scheduler on the target device. 
*/ -void blk_mq_request_bypass_insert(struct request *rq, bool at_head, - bool run_queue) +void blk_mq_request_bypass_insert(struct request *rq, blk_insert_t flags) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; spin_lock(&hctx->lock); - if (at_head) + if (flags & BLK_MQ_INSERT_AT_HEAD) list_add(&rq->queuelist, &hctx->dispatch); else list_add_tail(&rq->queuelist, &hctx->dispatch); spin_unlock(&hctx->lock); - - if (run_queue) - blk_mq_run_hw_queue(hctx, false); } -void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, - struct list_head *list) - +static void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx, struct list_head *list, + bool run_queue_async) { struct request *rq; enum hctx_type type = hctx->type; /* + * Try to issue requests directly if the hw queue isn't busy to save an + * extra enqueue & dequeue to the sw queue. + */ + if (!hctx->dispatch_busy && !run_queue_async) { + blk_mq_run_dispatch_ops(hctx->queue, + blk_mq_try_issue_list_directly(hctx, list)); + if (list_empty(list)) + goto out; + } + + /* * preemption doesn't flush plug list, so it's possible ctx->cpu is * offline now */ @@ -2547,6 +2470,70 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, list_splice_tail_init(list, &ctx->rq_lists[type]); blk_mq_hctx_mark_pending(hctx, ctx); spin_unlock(&ctx->lock); +out: + blk_mq_run_hw_queue(hctx, run_queue_async); +} + +static void blk_mq_insert_request(struct request *rq, blk_insert_t flags) +{ + struct request_queue *q = rq->q; + struct blk_mq_ctx *ctx = rq->mq_ctx; + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; + + if (blk_rq_is_passthrough(rq)) { + /* + * Passthrough request have to be added to hctx->dispatch + * directly. The device may be in a situation where it can't + * handle FS request, and always returns BLK_STS_RESOURCE for + * them, which gets them added to hctx->dispatch. 
+ * + * If a passthrough request is required to unblock the queues, + * and it is added to the scheduler queue, there is no chance to + * dispatch it given we prioritize requests in hctx->dispatch. + */ + blk_mq_request_bypass_insert(rq, flags); + } else if (rq->rq_flags & RQF_FLUSH_SEQ) { + /* + * Firstly normal IO request is inserted to scheduler queue or + * sw queue, meantime we add flush request to dispatch queue( + * hctx->dispatch) directly and there is at most one in-flight + * flush request for each hw queue, so it doesn't matter to add + * flush request to tail or front of the dispatch queue. + * + * Secondly in case of NCQ, flush request belongs to non-NCQ + * command, and queueing it will fail when there is any + * in-flight normal IO request(NCQ command). When adding flush + * rq to the front of hctx->dispatch, it is easier to introduce + * extra time to flush rq's latency because of S_SCHED_RESTART + * compared with adding to the tail of dispatch queue, then + * chance of flush merge is increased, and less flush requests + * will be issued to controller. It is observed that ~10% time + * is saved in blktests block/004 on disk attached to AHCI/NCQ + * drive when adding flush rq to the front of hctx->dispatch. + * + * Simply queue flush rq to the front of hctx->dispatch so that + * intensive flush workloads can benefit in case of NCQ HW. 
+ */ + blk_mq_request_bypass_insert(rq, BLK_MQ_INSERT_AT_HEAD); + } else if (q->elevator) { + LIST_HEAD(list); + + WARN_ON_ONCE(rq->tag != BLK_MQ_NO_TAG); + + list_add(&rq->queuelist, &list); + q->elevator->type->ops.insert_requests(hctx, &list, flags); + } else { + trace_block_rq_insert(rq); + + spin_lock(&ctx->lock); + if (flags & BLK_MQ_INSERT_AT_HEAD) + list_add(&rq->queuelist, &ctx->rq_lists[hctx->type]); + else + list_add_tail(&rq->queuelist, + &ctx->rq_lists[hctx->type]); + blk_mq_hctx_mark_pending(hctx, ctx); + spin_unlock(&ctx->lock); + } } static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, @@ -2600,49 +2587,19 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, return ret; } -static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, - struct request *rq, - bool bypass_insert, bool last) +static bool blk_mq_get_budget_and_tag(struct request *rq) { - struct request_queue *q = rq->q; - bool run_queue = true; int budget_token; - /* - * RCU or SRCU read lock is needed before checking quiesced flag. - * - * When queue is stopped or quiesced, ignore 'bypass_insert' from - * blk_mq_request_issue_directly(), and return BLK_STS_OK to caller, - * and avoid driver to try to dispatch again. 
- */ - if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) { - run_queue = false; - bypass_insert = false; - goto insert; - } - - if ((rq->rq_flags & RQF_ELV) && !bypass_insert) - goto insert; - - budget_token = blk_mq_get_dispatch_budget(q); + budget_token = blk_mq_get_dispatch_budget(rq->q); if (budget_token < 0) - goto insert; - + return false; blk_mq_set_rq_budget_token(rq, budget_token); - if (!blk_mq_get_driver_tag(rq)) { - blk_mq_put_dispatch_budget(q, budget_token); - goto insert; + blk_mq_put_dispatch_budget(rq->q, budget_token); + return false; } - - return __blk_mq_issue_directly(hctx, rq, last); -insert: - if (bypass_insert) - return BLK_STS_RESOURCE; - - blk_mq_sched_insert_request(rq, false, run_queue, false); - - return BLK_STS_OK; + return true; } /** @@ -2658,18 +2615,46 @@ insert: static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq) { - blk_status_t ret = - __blk_mq_try_issue_directly(hctx, rq, false, true); + blk_status_t ret; + + if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) { + blk_mq_insert_request(rq, 0); + return; + } + + if ((rq->rq_flags & RQF_ELV) || !blk_mq_get_budget_and_tag(rq)) { + blk_mq_insert_request(rq, 0); + blk_mq_run_hw_queue(hctx, false); + return; + } - if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) - blk_mq_request_bypass_insert(rq, false, true); - else if (ret != BLK_STS_OK) + ret = __blk_mq_issue_directly(hctx, rq, true); + switch (ret) { + case BLK_STS_OK: + break; + case BLK_STS_RESOURCE: + case BLK_STS_DEV_RESOURCE: + blk_mq_request_bypass_insert(rq, 0); + blk_mq_run_hw_queue(hctx, false); + break; + default: blk_mq_end_request(rq, ret); + break; + } } static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) { - return __blk_mq_try_issue_directly(rq->mq_hctx, rq, true, last); + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; + + if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) { + blk_mq_insert_request(rq, 0); + return 
BLK_STS_OK; + } + + if (!blk_mq_get_budget_and_tag(rq)) + return BLK_STS_RESOURCE; + return __blk_mq_issue_directly(hctx, rq, last); } static void blk_mq_plug_issue_direct(struct blk_plug *plug) @@ -2697,7 +2682,8 @@ static void blk_mq_plug_issue_direct(struct blk_plug *plug) break; case BLK_STS_RESOURCE: case BLK_STS_DEV_RESOURCE: - blk_mq_request_bypass_insert(rq, false, true); + blk_mq_request_bypass_insert(rq, 0); + blk_mq_run_hw_queue(hctx, false); goto out; default: blk_mq_end_request(rq, ret); @@ -2743,7 +2729,16 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched) plug->mq_list = requeue_list; trace_block_unplug(this_hctx->queue, depth, !from_sched); - blk_mq_sched_insert_requests(this_hctx, this_ctx, &list, from_sched); + + percpu_ref_get(&this_hctx->queue->q_usage_counter); + if (this_hctx->queue->elevator) { + this_hctx->queue->elevator->type->ops.insert_requests(this_hctx, + &list, 0); + blk_mq_run_hw_queue(this_hctx, from_sched); + } else { + blk_mq_insert_requests(this_hctx, this_ctx, &list, from_sched); + } + percpu_ref_put(&this_hctx->queue->q_usage_counter); } void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) @@ -2789,7 +2784,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) } while (!rq_list_empty(plug->mq_list)); } -void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, +static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, struct list_head *list) { int queued = 0; @@ -2807,8 +2802,9 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, break; case BLK_STS_RESOURCE: case BLK_STS_DEV_RESOURCE: - blk_mq_request_bypass_insert(rq, false, - list_empty(list)); + blk_mq_request_bypass_insert(rq, 0); + if (list_empty(list)) + blk_mq_run_hw_queue(hctx, false); goto out; default: blk_mq_end_request(rq, ret); @@ -2934,6 +2930,7 @@ void blk_mq_submit_bio(struct bio *bio) struct request_queue *q = bdev_get_queue(bio->bi_bdev); struct blk_plug 
*plug = blk_mq_plug(bio); const int is_sync = op_is_sync(bio->bi_opf); + struct blk_mq_hw_ctx *hctx; struct request *rq; unsigned int nr_segs = 1; blk_status_t ret; @@ -2965,7 +2962,7 @@ void blk_mq_submit_bio(struct bio *bio) blk_mq_bio_to_request(rq, bio, nr_segs); - ret = blk_crypto_init_request(rq); + ret = blk_crypto_rq_get_keyslot(rq); if (ret != BLK_STS_OK) { bio->bi_status = ret; bio_endio(bio); @@ -2978,15 +2975,19 @@ void blk_mq_submit_bio(struct bio *bio) return; } - if (plug) + if (plug) { blk_add_rq_to_plug(plug, rq); - else if ((rq->rq_flags & RQF_ELV) || - (rq->mq_hctx->dispatch_busy && - (q->nr_hw_queues == 1 || !is_sync))) - blk_mq_sched_insert_request(rq, false, true, true); - else - blk_mq_run_dispatch_ops(rq->q, - blk_mq_try_issue_directly(rq->mq_hctx, rq)); + return; + } + + hctx = rq->mq_hctx; + if ((rq->rq_flags & RQF_ELV) || + (hctx->dispatch_busy && (q->nr_hw_queues == 1 || !is_sync))) { + blk_mq_insert_request(rq, 0); + blk_mq_run_hw_queue(hctx, true); + } else { + blk_mq_run_dispatch_ops(q, blk_mq_try_issue_directly(hctx, rq)); + } } #ifdef CONFIG_BLK_MQ_STACKING @@ -3034,8 +3035,9 @@ blk_status_t blk_insert_cloned_request(struct request *rq) if (q->disk && should_fail_request(q->disk->part0, blk_rq_bytes(rq))) return BLK_STS_IOERR; - if (blk_crypto_insert_cloned_request(rq)) - return BLK_STS_IOERR; + ret = blk_crypto_rq_get_keyslot(rq); + if (ret != BLK_STS_OK) + return ret; blk_account_io_start(rq); @@ -4206,14 +4208,8 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, /* mark the queue as mq asap */ q->mq_ops = set->ops; - q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn, - blk_mq_poll_stats_bkt, - BLK_MQ_POLL_STATS_BKTS, q); - if (!q->poll_cb) - goto err_exit; - if (blk_mq_alloc_ctxs(q)) - goto err_poll; + goto err_exit; /* init q->mq_kobj and sw queues' kobjects */ blk_mq_sysfs_init(q); @@ -4241,11 +4237,6 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, q->nr_requests = set->queue_depth; - /* - * 
Default to classic polling - */ - q->poll_nsec = BLK_MQ_POLL_CLASSIC; - blk_mq_init_cpu_queues(q, set->nr_hw_queues); blk_mq_add_queue_tag_set(set, q); blk_mq_map_swqueue(q); @@ -4253,9 +4244,6 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, err_hctxs: blk_mq_release(q); -err_poll: - blk_stat_free_callback(q->poll_cb); - q->poll_cb = NULL; err_exit: q->mq_ops = NULL; return -ENOMEM; @@ -4752,138 +4740,8 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) } EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); -/* Enable polling stats and return whether they were already enabled. */ -static bool blk_poll_stats_enable(struct request_queue *q) -{ - if (q->poll_stat) - return true; - - return blk_stats_alloc_enable(q); -} - -static void blk_mq_poll_stats_start(struct request_queue *q) -{ - /* - * We don't arm the callback if polling stats are not enabled or the - * callback is already active. - */ - if (!q->poll_stat || blk_stat_is_active(q->poll_cb)) - return; - - blk_stat_activate_msecs(q->poll_cb, 100); -} - -static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb) -{ - struct request_queue *q = cb->data; - int bucket; - - for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) { - if (cb->stat[bucket].nr_samples) - q->poll_stat[bucket] = cb->stat[bucket]; - } -} - -static unsigned long blk_mq_poll_nsecs(struct request_queue *q, - struct request *rq) -{ - unsigned long ret = 0; - int bucket; - - /* - * If stats collection isn't on, don't sleep but turn it on for - * future users - */ - if (!blk_poll_stats_enable(q)) - return 0; - - /* - * As an optimistic guess, use half of the mean service time - * for this type of request. We can (and should) make this smarter. - * For instance, if the completion latencies are tight, we can - * get closer than just half the mean. This is especially - * important on devices where the completion latencies are longer - * than ~10 usec. 
We do use the stats for the relevant IO size - * if available which does lead to better estimates. - */ - bucket = blk_mq_poll_stats_bkt(rq); - if (bucket < 0) - return ret; - - if (q->poll_stat[bucket].nr_samples) - ret = (q->poll_stat[bucket].mean + 1) / 2; - - return ret; -} - -static bool blk_mq_poll_hybrid(struct request_queue *q, blk_qc_t qc) -{ - struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, qc); - struct request *rq = blk_qc_to_rq(hctx, qc); - struct hrtimer_sleeper hs; - enum hrtimer_mode mode; - unsigned int nsecs; - ktime_t kt; - - /* - * If a request has completed on queue that uses an I/O scheduler, we - * won't get back a request from blk_qc_to_rq. - */ - if (!rq || (rq->rq_flags & RQF_MQ_POLL_SLEPT)) - return false; - - /* - * If we get here, hybrid polling is enabled. Hence poll_nsec can be: - * - * 0: use half of prev avg - * >0: use this specific value - */ - if (q->poll_nsec > 0) - nsecs = q->poll_nsec; - else - nsecs = blk_mq_poll_nsecs(q, rq); - - if (!nsecs) - return false; - - rq->rq_flags |= RQF_MQ_POLL_SLEPT; - - /* - * This will be replaced with the stats tracking code, using - * 'avg_completion_time / 2' as the pre-sleep target. - */ - kt = nsecs; - - mode = HRTIMER_MODE_REL; - hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode); - hrtimer_set_expires(&hs.timer, kt); - - do { - if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE) - break; - set_current_state(TASK_UNINTERRUPTIBLE); - hrtimer_sleeper_start_expires(&hs, mode); - if (hs.task) - io_schedule(); - hrtimer_cancel(&hs.timer); - mode = HRTIMER_MODE_ABS; - } while (hs.task && !signal_pending(current)); - - __set_current_state(TASK_RUNNING); - destroy_hrtimer_on_stack(&hs.timer); - - /* - * If we sleep, have the caller restart the poll loop to reset the - * state. Like for the other success return cases, the caller is - * responsible for checking if the IO completed. If the IO isn't - * complete, we'll get called again and will go straight to the busy - * poll loop. 
- */ - return true; -} - -static int blk_mq_poll_classic(struct request_queue *q, blk_qc_t cookie, - struct io_comp_batch *iob, unsigned int flags) +int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob, + unsigned int flags) { struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, cookie); long state = get_current_state(); @@ -4910,17 +4768,6 @@ static int blk_mq_poll_classic(struct request_queue *q, blk_qc_t cookie, return 0; } -int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob, - unsigned int flags) -{ - if (!(flags & BLK_POLL_NOSLEEP) && - q->poll_nsec != BLK_MQ_POLL_CLASSIC) { - if (blk_mq_poll_hybrid(q, cookie)) - return 1; - } - return blk_mq_poll_classic(q, cookie, iob, flags); -} - unsigned int blk_mq_rq_cpu(struct request *rq) { return rq->mq_ctx->cpu; diff --git a/block/blk-mq.h b/block/blk-mq.h index a7482d2cc82e..e876584d3516 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -2,8 +2,8 @@ #ifndef INT_BLK_MQ_H #define INT_BLK_MQ_H +#include <linux/blk-mq.h> #include "blk-stat.h" -#include "blk-mq-tag.h" struct blk_mq_tag_set; @@ -30,6 +30,15 @@ struct blk_mq_ctx { struct kobject kobj; } ____cacheline_aligned_in_smp; +enum { + BLK_MQ_NO_TAG = -1U, + BLK_MQ_TAG_MIN = 1, + BLK_MQ_TAG_MAX = BLK_MQ_NO_TAG - 1, +}; + +typedef unsigned int __bitwise blk_insert_t; +#define BLK_MQ_INSERT_AT_HEAD ((__force blk_insert_t)0x01) + void blk_mq_submit_bio(struct bio *bio); int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob, unsigned int flags); @@ -38,8 +47,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *, unsigned int); -void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, - bool kick_requeue_list); +void blk_mq_add_to_requeue_list(struct request *rq, blk_insert_t insert_flags); void blk_mq_flush_busy_ctxs(struct 
blk_mq_hw_ctx *hctx, struct list_head *list); struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *start); @@ -59,14 +67,7 @@ void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, /* * Internal helpers for request insertion into sw queues */ -void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - bool at_head); -void blk_mq_request_bypass_insert(struct request *rq, bool at_head, - bool run_queue); -void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, - struct list_head *list); -void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, - struct list_head *list); +void blk_mq_request_bypass_insert(struct request *rq, blk_insert_t flags); /* * CPU -> queue mappings @@ -164,6 +165,60 @@ struct blk_mq_alloc_data { struct blk_mq_hw_ctx *hctx; }; +struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, + unsigned int reserved_tags, int node, int alloc_policy); +void blk_mq_free_tags(struct blk_mq_tags *tags); +int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags, + struct sbitmap_queue *breserved_tags, unsigned int queue_depth, + unsigned int reserved, int node, int alloc_policy); + +unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); +unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags, + unsigned int *offset); +void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, + unsigned int tag); +void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags); +int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, + struct blk_mq_tags **tags, unsigned int depth, bool can_grow); +void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, + unsigned int size); +void blk_mq_tag_update_sched_shared_tags(struct request_queue *q); + +void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); +void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, + void *priv); +void blk_mq_all_tag_iter(struct 
blk_mq_tags *tags, busy_tag_iter_fn *fn, + void *priv); + +static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt, + struct blk_mq_hw_ctx *hctx) +{ + if (!hctx) + return &bt->ws[0]; + return sbq_wait_ptr(bt, &hctx->wait_index); +} + +void __blk_mq_tag_busy(struct blk_mq_hw_ctx *); +void __blk_mq_tag_idle(struct blk_mq_hw_ctx *); + +static inline void blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) +{ + if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) + __blk_mq_tag_busy(hctx); +} + +static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) +{ + if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) + __blk_mq_tag_idle(hctx); +} + +static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags, + unsigned int tag) +{ + return tag < tags->nr_reserved_tags; +} + static inline bool blk_mq_is_shared_tags(unsigned int flags) { return flags & BLK_MQ_F_TAG_HCTX_SHARED; diff --git a/block/blk-pm.c b/block/blk-pm.c index 2dad62cc1572..6b72b2e03fc8 100644 --- a/block/blk-pm.c +++ b/block/blk-pm.c @@ -1,11 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/blk-mq.h> #include <linux/blk-pm.h> #include <linux/blkdev.h> #include <linux/pm_runtime.h> #include "blk-mq.h" -#include "blk-mq-tag.h" /** * blk_pm_runtime_init - Block layer runtime PM initialization routine diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index b02a1a3d33a8..f48ee150d667 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -74,7 +74,7 @@ static inline struct rq_qos *wbt_rq_qos(struct request_queue *q) return rq_qos_id(q, RQ_QOS_WBT); } -static inline struct rq_qos *blkcg_rq_qos(struct request_queue *q) +static inline struct rq_qos *iolat_rq_qos(struct request_queue *q) { return rq_qos_id(q, RQ_QOS_LATENCY); } diff --git a/block/blk-stat.c b/block/blk-stat.c index c6ca16abf911..7ff76ae6c76a 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -6,7 +6,6 @@ */ #include <linux/kernel.h> #include <linux/rculist.h> -#include <linux/blk-mq.h> #include "blk-stat.h" #include 
"blk-mq.h" @@ -190,7 +189,7 @@ void blk_stat_disable_accounting(struct request_queue *q) unsigned long flags; spin_lock_irqsave(&q->stats->lock, flags); - if (!--q->stats->accounting) + if (!--q->stats->accounting && list_empty(&q->stats->callbacks)) blk_queue_flag_clear(QUEUE_FLAG_STATS, q); spin_unlock_irqrestore(&q->stats->lock, flags); } @@ -201,7 +200,7 @@ void blk_stat_enable_accounting(struct request_queue *q) unsigned long flags; spin_lock_irqsave(&q->stats->lock, flags); - if (!q->stats->accounting++) + if (!q->stats->accounting++ && list_empty(&q->stats->callbacks)) blk_queue_flag_set(QUEUE_FLAG_STATS, q); spin_unlock_irqrestore(&q->stats->lock, flags); } @@ -231,21 +230,3 @@ void blk_free_queue_stats(struct blk_queue_stats *stats) kfree(stats); } - -bool blk_stats_alloc_enable(struct request_queue *q) -{ - struct blk_rq_stat *poll_stat; - - poll_stat = kcalloc(BLK_MQ_POLL_STATS_BKTS, sizeof(*poll_stat), - GFP_ATOMIC); - if (!poll_stat) - return false; - - if (cmpxchg(&q->poll_stat, NULL, poll_stat) != NULL) { - kfree(poll_stat); - return true; - } - - blk_stat_add_callback(q, q->poll_cb); - return false; -} diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index f1fce1c7fa44..a64208583853 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -9,7 +9,6 @@ #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/blktrace_api.h> -#include <linux/blk-mq.h> #include <linux/debugfs.h> #include "blk.h" @@ -408,35 +407,12 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count) static ssize_t queue_poll_delay_show(struct request_queue *q, char *page) { - int val; - - if (q->poll_nsec == BLK_MQ_POLL_CLASSIC) - val = BLK_MQ_POLL_CLASSIC; - else - val = q->poll_nsec / 1000; - - return sprintf(page, "%d\n", val); + return sprintf(page, "%d\n", -1); } static ssize_t queue_poll_delay_store(struct request_queue *q, const char *page, size_t count) { - int err, val; - - if (!q->mq_ops || !q->mq_ops->poll) - return -EINVAL; 
- - err = kstrtoint(page, 10, &val); - if (err < 0) - return err; - - if (val == BLK_MQ_POLL_CLASSIC) - q->poll_nsec = BLK_MQ_POLL_CLASSIC; - else if (val >= 0) - q->poll_nsec = val * 1000; - else - return -EINVAL; - return count; } diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 47e9d8be68f3..9d010d867fbf 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1368,9 +1368,11 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, int ret; u64 v; - ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); + blkg_conf_init(&ctx, buf); + + ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx); if (ret) - return ret; + goto out_finish; ret = -EINVAL; if (sscanf(ctx.body, "%llu", &v) != 1) @@ -1389,7 +1391,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, tg_conf_updated(tg, false); ret = 0; out_finish: - blkg_conf_finish(&ctx); + blkg_conf_exit(&ctx); return ret ?: nbytes; } @@ -1561,9 +1563,11 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, int ret; int index = of_cft(of)->private; - ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); + blkg_conf_init(&ctx, buf); + + ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx); if (ret) - return ret; + goto out_finish; tg = blkg_to_tg(ctx.blkg); tg_update_carryover(tg); @@ -1662,7 +1666,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, tg->td->limit_valid[LIMIT_LOW]); ret = 0; out_finish: - blkg_conf_finish(&ctx); + blkg_conf_exit(&ctx); return ret ?: nbytes; } @@ -2439,11 +2443,12 @@ void blk_throtl_register(struct gendisk *disk) #ifndef CONFIG_BLK_DEV_THROTTLING_LOW /* if no low limit, use previous default */ td->throtl_slice = DFL_THROTL_SLICE_HD; -#endif +#else td->track_bio_latency = !queue_is_mq(q); if (!td->track_bio_latency) blk_stat_enable_accounting(q); +#endif } #ifdef CONFIG_BLK_DEV_THROTTLING_LOW diff --git a/block/blk.h b/block/blk.h index cc4e8873dfde..2da831103471 100644 --- a/block/blk.h +++ b/block/blk.h @@ -399,12 +399,6 @@ 
static inline struct bio *blk_queue_bounce(struct bio *bio, return bio; } -#ifdef CONFIG_BLK_CGROUP_IOLATENCY -int blk_iolatency_init(struct gendisk *disk); -#else -static inline int blk_iolatency_init(struct gendisk *disk) { return 0; }; -#endif - #ifdef CONFIG_BLK_DEV_ZONED void disk_free_zone_bitmaps(struct gendisk *disk); void disk_clear_zone_settings(struct gendisk *disk); diff --git a/block/elevator.h b/block/elevator.h index 774a8f6b99e6..7ca3d7b6ed82 100644 --- a/block/elevator.h +++ b/block/elevator.h @@ -4,6 +4,7 @@ #include <linux/percpu.h> #include <linux/hashtable.h> +#include "blk-mq.h" struct io_cq; struct elevator_type; @@ -37,7 +38,8 @@ struct elevator_mq_ops { void (*limit_depth)(blk_opf_t, struct blk_mq_alloc_data *); void (*prepare_request)(struct request *); void (*finish_request)(struct request *); - void (*insert_requests)(struct blk_mq_hw_ctx *, struct list_head *, bool); + void (*insert_requests)(struct blk_mq_hw_ctx *hctx, struct list_head *list, + blk_insert_t flags); struct request *(*dispatch_request)(struct blk_mq_hw_ctx *); bool (*has_work)(struct blk_mq_hw_ctx *); void (*completed_request)(struct request *, u64); diff --git a/block/genhd.c b/block/genhd.c index 7f874737af68..2d58ac54043a 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -426,6 +426,9 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, */ elevator_init_mq(disk->queue); + /* Mark bdev as having a submit_bio, if needed */ + disk->part0->bd_has_submit_bio = disk->fops->submit_bio != NULL; + /* * If the driver provides an explicit major number it also must provide * the number of minors numbers supported, and those will be used to diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index 2146969237bf..4155594aefc6 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -8,7 +8,6 @@ #include <linux/kernel.h> #include <linux/blkdev.h> -#include <linux/blk-mq.h> #include <linux/module.h> #include <linux/sbitmap.h> @@ -19,7 +18,6 
@@ #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-mq-sched.h" -#include "blk-mq-tag.h" #define CREATE_TRACE_POINTS #include <trace/events/kyber.h> @@ -590,7 +588,8 @@ static void kyber_prepare_request(struct request *rq) } static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx, - struct list_head *rq_list, bool at_head) + struct list_head *rq_list, + blk_insert_t flags) { struct kyber_hctx_data *khd = hctx->sched_data; struct request *rq, *next; @@ -602,7 +601,7 @@ static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx, spin_lock(&kcq->lock); trace_block_rq_insert(rq); - if (at_head) + if (flags & BLK_MQ_INSERT_AT_HEAD) list_move(&rq->queuelist, head); else list_move_tail(&rq->queuelist, head); diff --git a/block/mq-deadline.c b/block/mq-deadline.c index f10c2a0d18d4..5839a027e0f0 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -8,7 +8,6 @@ #include <linux/kernel.h> #include <linux/fs.h> #include <linux/blkdev.h> -#include <linux/blk-mq.h> #include <linux/bio.h> #include <linux/module.h> #include <linux/slab.h> @@ -23,7 +22,6 @@ #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" -#include "blk-mq-tag.h" #include "blk-mq-sched.h" /* @@ -768,7 +766,7 @@ static bool dd_bio_merge(struct request_queue *q, struct bio *bio, * add rq to rbtree and fifo */ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - bool at_head) + blk_insert_t flags) { struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; @@ -801,7 +799,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, trace_block_rq_insert(rq); - if (at_head) { + if (flags & BLK_MQ_INSERT_AT_HEAD) { list_add(&rq->queuelist, &per_prio->dispatch); rq->fifo_time = jiffies; } else { @@ -822,10 +820,11 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, } /* - * Called from blk_mq_sched_insert_request() or blk_mq_sched_insert_requests(). 
+ * Called from blk_mq_insert_request() or blk_mq_dispatch_plug_list(). */ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, - struct list_head *list, bool at_head) + struct list_head *list, + blk_insert_t flags) { struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; @@ -836,7 +835,7 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); - dd_insert_request(hctx, rq, at_head); + dd_insert_request(hctx, rq, flags); } spin_unlock(&dd->lock); } diff --git a/block/opal_proto.h b/block/opal_proto.h index 7152aa1f1a49..a4e56845dd82 100644 --- a/block/opal_proto.h +++ b/block/opal_proto.h @@ -86,6 +86,15 @@ enum opal_response_token { #define OPAL_MSID_KEYLEN 15 #define OPAL_UID_LENGTH_HALF 4 +/* + * Boolean operators from TCG Core spec 2.01 Section: + * 5.1.3.11 + * Table 61 + */ +#define OPAL_BOOLEAN_AND 0 +#define OPAL_BOOLEAN_OR 1 +#define OPAL_BOOLEAN_NOT 2 + /* Enum to index OPALUID array */ enum opal_uid { /* users */ @@ -105,6 +114,7 @@ enum opal_uid { /* tables */ OPAL_TABLE_TABLE, OPAL_LOCKINGRANGE_GLOBAL, + OPAL_LOCKINGRANGE_ACE_START_TO_KEY, OPAL_LOCKINGRANGE_ACE_RDLOCKED, OPAL_LOCKINGRANGE_ACE_WRLOCKED, OPAL_MBRCONTROL, diff --git a/block/sed-opal.c b/block/sed-opal.c index c320093c14f1..c18339446ef3 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -83,8 +83,10 @@ struct opal_dev { u16 comid; u32 hsn; u32 tsn; - u64 align; + u64 align; /* alignment granularity */ u64 lowest_lba; + u32 logical_block_size; + u8 align_required; /* ALIGN: 0 or 1 */ size_t pos; u8 *cmd; @@ -132,6 +134,8 @@ static const u8 opaluid[][OPAL_UID_LENGTH] = { { 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01 }, [OPAL_LOCKINGRANGE_GLOBAL] = { 0x00, 0x00, 0x08, 0x02, 0x00, 0x00, 0x00, 0x01 }, + [OPAL_LOCKINGRANGE_ACE_START_TO_KEY] = + { 0x00, 0x00, 0x00, 0x08, 0x00, 0x03, 0xD0, 0x01 }, [OPAL_LOCKINGRANGE_ACE_RDLOCKED] = { 0x00, 0x00, 0x00, 
0x08, 0x00, 0x03, 0xE0, 0x01 }, [OPAL_LOCKINGRANGE_ACE_WRLOCKED] = @@ -407,6 +411,8 @@ static void check_geometry(struct opal_dev *dev, const void *data) dev->align = be64_to_cpu(geo->alignment_granularity); dev->lowest_lba = be64_to_cpu(geo->lowest_aligned_lba); + dev->logical_block_size = be32_to_cpu(geo->logical_block_size); + dev->align_required = geo->reserved01 & 1; } static int execute_step(struct opal_dev *dev, @@ -1147,12 +1153,8 @@ static int finalize_and_send(struct opal_dev *dev, cont_fn cont) return opal_send_recv(dev, cont); } -/* - * request @column from table @table on device @dev. On success, the column - * data will be available in dev->resp->tok[4] - */ -static int generic_get_column(struct opal_dev *dev, const u8 *table, - u64 column) +static int generic_get_columns(struct opal_dev *dev, const u8 *table, + u64 start_column, u64 end_column) { int err; @@ -1162,12 +1164,12 @@ static int generic_get_column(struct opal_dev *dev, const u8 *table, add_token_u8(&err, dev, OPAL_STARTNAME); add_token_u8(&err, dev, OPAL_STARTCOLUMN); - add_token_u64(&err, dev, column); + add_token_u64(&err, dev, start_column); add_token_u8(&err, dev, OPAL_ENDNAME); add_token_u8(&err, dev, OPAL_STARTNAME); add_token_u8(&err, dev, OPAL_ENDCOLUMN); - add_token_u64(&err, dev, column); + add_token_u64(&err, dev, end_column); add_token_u8(&err, dev, OPAL_ENDNAME); add_token_u8(&err, dev, OPAL_ENDLIST); @@ -1179,6 +1181,16 @@ static int generic_get_column(struct opal_dev *dev, const u8 *table, } /* + * request @column from table @table on device @dev. 
On success, the column + * data will be available in dev->resp->tok[4] + */ +static int generic_get_column(struct opal_dev *dev, const u8 *table, + u64 column) +{ + return generic_get_columns(dev, table, column, column); +} + +/* * see TCG SAS 5.3.2.3 for a description of the available columns * * the result is provided in dev->resp->tok[4] @@ -1437,6 +1449,129 @@ static int setup_locking_range(struct opal_dev *dev, void *data) return finalize_and_send(dev, parse_and_check_status); } +static int response_get_column(const struct parsed_resp *resp, + int *iter, + u8 column, + u64 *value) +{ + const struct opal_resp_tok *tok; + int n = *iter; + u64 val; + + tok = response_get_token(resp, n); + if (IS_ERR(tok)) + return PTR_ERR(tok); + + if (!response_token_matches(tok, OPAL_STARTNAME)) { + pr_debug("Unexpected response token type %d.\n", n); + return OPAL_INVAL_PARAM; + } + n++; + + if (response_get_u64(resp, n) != column) { + pr_debug("Token %d does not match expected column %u.\n", + n, column); + return OPAL_INVAL_PARAM; + } + n++; + + val = response_get_u64(resp, n); + n++; + + tok = response_get_token(resp, n); + if (IS_ERR(tok)) + return PTR_ERR(tok); + + if (!response_token_matches(tok, OPAL_ENDNAME)) { + pr_debug("Unexpected response token type %d.\n", n); + return OPAL_INVAL_PARAM; + } + n++; + + *value = val; + *iter = n; + + return 0; +} + +static int locking_range_status(struct opal_dev *dev, void *data) +{ + u8 lr_buffer[OPAL_UID_LENGTH]; + u64 resp; + bool rlocked, wlocked; + int err, tok_n = 2; + struct opal_lr_status *lrst = data; + + err = build_locking_range(lr_buffer, sizeof(lr_buffer), + lrst->session.opal_key.lr); + if (err) + return err; + + err = generic_get_columns(dev, lr_buffer, OPAL_RANGESTART, + OPAL_WRITELOCKED); + if (err) { + pr_debug("Couldn't get lr %u table columns %d to %d.\n", + lrst->session.opal_key.lr, OPAL_RANGESTART, + OPAL_WRITELOCKED); + return err; + } + + /* range start */ + err = response_get_column(&dev->parsed, &tok_n, 
OPAL_RANGESTART, + &lrst->range_start); + if (err) + return err; + + /* range length */ + err = response_get_column(&dev->parsed, &tok_n, OPAL_RANGELENGTH, + &lrst->range_length); + if (err) + return err; + + /* RLE */ + err = response_get_column(&dev->parsed, &tok_n, OPAL_READLOCKENABLED, + &resp); + if (err) + return err; + + lrst->RLE = !!resp; + + /* WLE */ + err = response_get_column(&dev->parsed, &tok_n, OPAL_WRITELOCKENABLED, + &resp); + if (err) + return err; + + lrst->WLE = !!resp; + + /* read locked */ + err = response_get_column(&dev->parsed, &tok_n, OPAL_READLOCKED, &resp); + if (err) + return err; + + rlocked = !!resp; + + /* write locked */ + err = response_get_column(&dev->parsed, &tok_n, OPAL_WRITELOCKED, &resp); + if (err) + return err; + + wlocked = !!resp; + + /* opal_lock_state can not map 'read locked' only state. */ + lrst->l_state = OPAL_RW; + if (rlocked && wlocked) + lrst->l_state = OPAL_LK; + else if (wlocked) + lrst->l_state = OPAL_RO; + else if (rlocked) { + pr_debug("Can not report read locked only state.\n"); + return -EINVAL; + } + + return 0; +} + static int start_generic_opal_session(struct opal_dev *dev, enum opal_uid auth, enum opal_uid sp_type, @@ -1759,25 +1894,43 @@ static int set_sid_cpin_pin(struct opal_dev *dev, void *data) return finalize_and_send(dev, parse_and_check_status); } -static int add_user_to_lr(struct opal_dev *dev, void *data) +static void add_authority_object_ref(int *err, + struct opal_dev *dev, + const u8 *uid, + size_t uid_len) +{ + add_token_u8(err, dev, OPAL_STARTNAME); + add_token_bytestring(err, dev, + opaluid[OPAL_HALF_UID_AUTHORITY_OBJ_REF], + OPAL_UID_LENGTH/2); + add_token_bytestring(err, dev, uid, uid_len); + add_token_u8(err, dev, OPAL_ENDNAME); +} + +static void add_boolean_object_ref(int *err, + struct opal_dev *dev, + u8 boolean_op) +{ + add_token_u8(err, dev, OPAL_STARTNAME); + add_token_bytestring(err, dev, opaluid[OPAL_HALF_UID_BOOLEAN_ACE], + OPAL_UID_LENGTH/2); + add_token_u8(err, dev, 
boolean_op); + add_token_u8(err, dev, OPAL_ENDNAME); +} + +static int set_lr_boolean_ace(struct opal_dev *dev, + unsigned int opal_uid, + u8 lr, + const u8 *users, + size_t users_len) { u8 lr_buffer[OPAL_UID_LENGTH]; u8 user_uid[OPAL_UID_LENGTH]; - struct opal_lock_unlock *lkul = data; + u8 u; int err; - memcpy(lr_buffer, opaluid[OPAL_LOCKINGRANGE_ACE_RDLOCKED], - OPAL_UID_LENGTH); - - if (lkul->l_state == OPAL_RW) - memcpy(lr_buffer, opaluid[OPAL_LOCKINGRANGE_ACE_WRLOCKED], - OPAL_UID_LENGTH); - - lr_buffer[7] = lkul->session.opal_key.lr; - - memcpy(user_uid, opaluid[OPAL_USER1_UID], OPAL_UID_LENGTH); - - user_uid[7] = lkul->session.who; + memcpy(lr_buffer, opaluid[opal_uid], OPAL_UID_LENGTH); + lr_buffer[7] = lr; err = cmd_start(dev, lr_buffer, opalmethod[OPAL_SET]); @@ -1790,35 +1943,49 @@ static int add_user_to_lr(struct opal_dev *dev, void *data) add_token_u8(&err, dev, OPAL_STARTLIST); + for (u = 0; u < users_len; u++) { + if (users[u] == OPAL_ADMIN1) + memcpy(user_uid, opaluid[OPAL_ADMIN1_UID], + OPAL_UID_LENGTH); + else { + memcpy(user_uid, opaluid[OPAL_USER1_UID], + OPAL_UID_LENGTH); + user_uid[7] = users[u]; + } - add_token_u8(&err, dev, OPAL_STARTNAME); - add_token_bytestring(&err, dev, - opaluid[OPAL_HALF_UID_AUTHORITY_OBJ_REF], - OPAL_UID_LENGTH/2); - add_token_bytestring(&err, dev, user_uid, OPAL_UID_LENGTH); - add_token_u8(&err, dev, OPAL_ENDNAME); - - - add_token_u8(&err, dev, OPAL_STARTNAME); - add_token_bytestring(&err, dev, - opaluid[OPAL_HALF_UID_AUTHORITY_OBJ_REF], - OPAL_UID_LENGTH/2); - add_token_bytestring(&err, dev, user_uid, OPAL_UID_LENGTH); - add_token_u8(&err, dev, OPAL_ENDNAME); - - - add_token_u8(&err, dev, OPAL_STARTNAME); - add_token_bytestring(&err, dev, opaluid[OPAL_HALF_UID_BOOLEAN_ACE], - OPAL_UID_LENGTH/2); - add_token_u8(&err, dev, 1); - add_token_u8(&err, dev, OPAL_ENDNAME); + add_authority_object_ref(&err, dev, user_uid, sizeof(user_uid)); + /* + * Add boolean operator in postfix only with + * two or more authorities being 
added in ACE + * expresion. + * */ + if (u > 0) + add_boolean_object_ref(&err, dev, OPAL_BOOLEAN_OR); + } add_token_u8(&err, dev, OPAL_ENDLIST); add_token_u8(&err, dev, OPAL_ENDNAME); add_token_u8(&err, dev, OPAL_ENDLIST); add_token_u8(&err, dev, OPAL_ENDNAME); + return err; +} + +static int add_user_to_lr(struct opal_dev *dev, void *data) +{ + int err; + struct opal_lock_unlock *lkul = data; + const u8 users[] = { + lkul->session.who + }; + + err = set_lr_boolean_ace(dev, + lkul->l_state == OPAL_RW ? + OPAL_LOCKINGRANGE_ACE_WRLOCKED : + OPAL_LOCKINGRANGE_ACE_RDLOCKED, + lkul->session.opal_key.lr, users, + ARRAY_SIZE(users)); if (err) { pr_debug("Error building add user to locking range command.\n"); return err; @@ -1827,6 +1994,27 @@ static int add_user_to_lr(struct opal_dev *dev, void *data) return finalize_and_send(dev, parse_and_check_status); } +static int add_user_to_lr_ace(struct opal_dev *dev, void *data) +{ + int err; + struct opal_lock_unlock *lkul = data; + const u8 users[] = { + OPAL_ADMIN1, + lkul->session.who + }; + + err = set_lr_boolean_ace(dev, OPAL_LOCKINGRANGE_ACE_START_TO_KEY, + lkul->session.opal_key.lr, users, + ARRAY_SIZE(users)); + + if (err) { + pr_debug("Error building add user to locking ranges ACEs.\n"); + return err; + } + + return finalize_and_send(dev, parse_and_check_status); +} + static int lock_unlock_locking_range(struct opal_dev *dev, void *data) { u8 lr_buffer[OPAL_UID_LENGTH]; @@ -2364,6 +2552,7 @@ static int opal_add_user_to_lr(struct opal_dev *dev, const struct opal_step steps[] = { { start_admin1LSP_opal_session, &lk_unlk->session.opal_key }, { add_user_to_lr, lk_unlk }, + { add_user_to_lr_ace, lk_unlk }, { end_opal_session, } }; int ret; @@ -2580,6 +2769,33 @@ static int opal_setup_locking_range(struct opal_dev *dev, return ret; } +static int opal_locking_range_status(struct opal_dev *dev, + struct opal_lr_status *opal_lrst, + void __user *data) +{ + const struct opal_step lr_steps[] = { + { start_auth_opal_session, 
&opal_lrst->session }, + { locking_range_status, opal_lrst }, + { end_opal_session, } + }; + int ret; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + ret = execute_steps(dev, lr_steps, ARRAY_SIZE(lr_steps)); + mutex_unlock(&dev->dev_lock); + + /* skip session info when copying back to uspace */ + if (!ret && copy_to_user(data + offsetof(struct opal_lr_status, range_start), + (void *)opal_lrst + offsetof(struct opal_lr_status, range_start), + sizeof(*opal_lrst) - offsetof(struct opal_lr_status, range_start))) { + pr_debug("Error copying status to userspace\n"); + return -EFAULT; + } + + return ret; +} + static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw) { const struct opal_step pw_steps[] = { @@ -2744,6 +2960,26 @@ static int opal_get_status(struct opal_dev *dev, void __user *data) return 0; } +static int opal_get_geometry(struct opal_dev *dev, void __user *data) +{ + struct opal_geometry geo = {0}; + + if (check_opal_support(dev)) + return -EINVAL; + + geo.align = dev->align_required; + geo.logical_block_size = dev->logical_block_size; + geo.alignment_granularity = dev->align; + geo.lowest_aligned_lba = dev->lowest_lba; + + if (copy_to_user(data, &geo, sizeof(geo))) { + pr_debug("Error copying geometry data to userspace\n"); + return -EFAULT; + } + + return 0; +} + int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) { void *p; @@ -2814,6 +3050,12 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) case IOC_OPAL_GET_STATUS: ret = opal_get_status(dev, arg); break; + case IOC_OPAL_GET_LR_STATUS: + ret = opal_locking_range_status(dev, p, arg); + break; + case IOC_OPAL_GET_GEOMETRY: + ret = opal_get_geometry(dev, arg); + break; default: break; } |