summaryrefslogtreecommitdiff
path: root/block
diff options
context:
space:
mode:
Diffstat (limited to 'block')
-rw-r--r--block/Kconfig1
-rw-r--r--block/bdev.c10
-rw-r--r--block/bfq-cgroup.c34
-rw-r--r--block/bfq-iosched.c22
-rw-r--r--block/bfq-iosched.h8
-rw-r--r--block/bfq-wf2q.c5
-rw-r--r--block/bio.c146
-rw-r--r--block/blk-cgroup-rwstat.h8
-rw-r--r--block/blk-cgroup.c76
-rw-r--r--block/blk-cgroup.h12
-rw-r--r--block/blk-core.c93
-rw-r--r--block/blk-flush.c6
-rw-r--r--block/blk-ia-ranges.c66
-rw-r--r--block/blk-ioc.c2
-rw-r--r--block/blk-iocost.c22
-rw-r--r--block/blk-iolatency.c30
-rw-r--r--block/blk-ioprio.c57
-rw-r--r--block/blk-ioprio.h9
-rw-r--r--block/blk-lib.c6
-rw-r--r--block/blk-merge.c80
-rw-r--r--block/blk-mq-debugfs-zoned.c6
-rw-r--r--block/blk-mq-debugfs.c48
-rw-r--r--block/blk-mq-debugfs.h10
-rw-r--r--block/blk-mq-sched.c12
-rw-r--r--block/blk-mq-sysfs.c45
-rw-r--r--block/blk-mq-tag.c31
-rw-r--r--block/blk-mq-tag.h10
-rw-r--r--block/blk-mq.c145
-rw-r--r--block/blk-mq.h39
-rw-r--r--block/blk-rq-qos.c12
-rw-r--r--block/blk-rq-qos.h18
-rw-r--r--block/blk-settings.c11
-rw-r--r--block/blk-sysfs.c84
-rw-r--r--block/blk-throttle.c7
-rw-r--r--block/blk-wbt.c30
-rw-r--r--block/blk-zoned.c92
-rw-r--r--block/blk.h34
-rw-r--r--block/bounce.c13
-rw-r--r--block/bsg-lib.c6
-rw-r--r--block/elevator.h2
-rw-r--r--block/fops.c30
-rw-r--r--block/genhd.c120
-rw-r--r--block/holder.c4
-rw-r--r--block/ioctl.c2
-rw-r--r--block/ioprio.c58
-rw-r--r--block/kyber-iosched.c11
-rw-r--r--block/mq-deadline.c7
-rw-r--r--block/partitions/check.h4
-rw-r--r--block/partitions/core.c23
49 files changed, 776 insertions, 831 deletions
diff --git a/block/Kconfig b/block/Kconfig
index 50b17e260fa2..444c5ab3b67e 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -147,7 +147,6 @@ config BLK_CGROUP_FC_APPID
config BLK_CGROUP_IOCOST
bool "Enable support for cost model based cgroup IO controller"
depends on BLK_CGROUP
- select BLK_RQ_IO_DATA_LEN
select BLK_RQ_ALLOC_TIME
help
Enabling this option enables the .weight interface for cost
diff --git a/block/bdev.c b/block/bdev.c
index 5fe06c1f2def..ce05175e71ce 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -54,12 +54,10 @@ static void bdev_write_inode(struct block_device *bdev)
while (inode->i_state & I_DIRTY) {
spin_unlock(&inode->i_lock);
ret = write_inode_now(inode, true);
- if (ret) {
- char name[BDEVNAME_SIZE];
- pr_warn_ratelimited("VFS: Dirty inode writeback failed "
- "for block device %s (err=%d).\n",
- bdevname(bdev, name), ret);
- }
+ if (ret)
+ pr_warn_ratelimited(
+ "VFS: Dirty inode writeback failed for block device %pg (err=%d).\n",
+ bdev, ret);
spin_lock(&inode->i_lock);
}
spin_unlock(&inode->i_lock);
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 09574af83566..30b15a9a47c4 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -220,46 +220,46 @@ void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg)
}
void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
- unsigned int op)
+ blk_opf_t opf)
{
- blkg_rwstat_add(&bfqg->stats.queued, op, 1);
+ blkg_rwstat_add(&bfqg->stats.queued, opf, 1);
bfqg_stats_end_empty_time(&bfqg->stats);
if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue))
bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));
}
-void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op)
+void bfqg_stats_update_io_remove(struct bfq_group *bfqg, blk_opf_t opf)
{
- blkg_rwstat_add(&bfqg->stats.queued, op, -1);
+ blkg_rwstat_add(&bfqg->stats.queued, opf, -1);
}
-void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op)
+void bfqg_stats_update_io_merged(struct bfq_group *bfqg, blk_opf_t opf)
{
- blkg_rwstat_add(&bfqg->stats.merged, op, 1);
+ blkg_rwstat_add(&bfqg->stats.merged, opf, 1);
}
void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns,
- u64 io_start_time_ns, unsigned int op)
+ u64 io_start_time_ns, blk_opf_t opf)
{
struct bfqg_stats *stats = &bfqg->stats;
u64 now = ktime_get_ns();
if (now > io_start_time_ns)
- blkg_rwstat_add(&stats->service_time, op,
+ blkg_rwstat_add(&stats->service_time, opf,
now - io_start_time_ns);
if (io_start_time_ns > start_time_ns)
- blkg_rwstat_add(&stats->wait_time, op,
+ blkg_rwstat_add(&stats->wait_time, opf,
io_start_time_ns - start_time_ns);
}
#else /* CONFIG_BFQ_CGROUP_DEBUG */
void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
- unsigned int op) { }
-void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { }
-void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { }
+ blk_opf_t opf) { }
+void bfqg_stats_update_io_remove(struct bfq_group *bfqg, blk_opf_t opf) { }
+void bfqg_stats_update_io_merged(struct bfq_group *bfqg, blk_opf_t opf) { }
void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns,
- u64 io_start_time_ns, unsigned int op) { }
+ u64 io_start_time_ns, blk_opf_t opf) { }
void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { }
void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { }
void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { }
@@ -706,10 +706,10 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
}
/**
- * __bfq_bic_change_cgroup - move @bic to @cgroup.
+ * __bfq_bic_change_cgroup - move @bic to @bfqg.
* @bfqd: the queue descriptor.
* @bic: the bic to move.
- * @blkcg: the blk-cgroup to move to.
+ * @bfqg: the group to move to.
*
* Move bic to blkcg, assuming that bfqd->lock is held; which makes
* sure that the reference to cgroup is valid across the call (see
@@ -863,6 +863,7 @@ static void bfq_flush_idle_tree(struct bfq_service_tree *st)
* @bfqd: the device data structure with the root group.
* @entity: the entity to move, if entity is a leaf; or the parent entity
* of an active leaf entity to move, if entity is not a leaf.
+ * @ioprio_class: I/O priority class to reparent.
*/
static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
struct bfq_entity *entity,
@@ -892,6 +893,7 @@ static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
* @bfqd: the device data structure with the root group.
* @bfqg: the group to move from.
* @st: the service tree to start the search from.
+ * @ioprio_class: I/O priority class to reparent.
*/
static void bfq_reparent_active_queues(struct bfq_data *bfqd,
struct bfq_group *bfqg,
@@ -1471,8 +1473,6 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
return bfqq->bfqd->root_group;
}
-void bfqg_and_blkg_get(struct bfq_group *bfqg) {}
-
void bfqg_and_blkg_put(struct bfq_group *bfqg) {}
struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 0d46cb728bbf..c740b41fe0a4 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -668,19 +668,19 @@ static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit)
* significantly affect service guarantees coming from the BFQ scheduling
* algorithm.
*/
-static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
+static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
{
struct bfq_data *bfqd = data->q->elevator->elevator_data;
struct bfq_io_cq *bic = bfq_bic_lookup(data->q);
- struct bfq_queue *bfqq = bic ? bic_to_bfqq(bic, op_is_sync(op)) : NULL;
+ struct bfq_queue *bfqq = bic ? bic_to_bfqq(bic, op_is_sync(opf)) : NULL;
int depth;
unsigned limit = data->q->nr_requests;
/* Sync reads have full depth available */
- if (op_is_sync(op) && !op_is_write(op)) {
+ if (op_is_sync(opf) && !op_is_write(opf)) {
depth = 0;
} else {
- depth = bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)];
+ depth = bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(opf)];
limit = (limit * depth) >> bfqd->full_depth_shift;
}
@@ -693,7 +693,7 @@ static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
depth = 1;
bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u",
- __func__, bfqd->wr_busy_queues, op_is_sync(op), depth);
+ __func__, bfqd->wr_busy_queues, op_is_sync(opf), depth);
if (depth)
data->shallow_depth = depth;
}
@@ -6104,7 +6104,7 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
static void bfq_update_insert_stats(struct request_queue *q,
struct bfq_queue *bfqq,
bool idle_timer_disabled,
- unsigned int cmd_flags)
+ blk_opf_t cmd_flags)
{
if (!bfqq)
return;
@@ -6129,7 +6129,7 @@ static void bfq_update_insert_stats(struct request_queue *q,
static inline void bfq_update_insert_stats(struct request_queue *q,
struct bfq_queue *bfqq,
bool idle_timer_disabled,
- unsigned int cmd_flags) {}
+ blk_opf_t cmd_flags) {}
#endif /* CONFIG_BFQ_CGROUP_DEBUG */
static struct bfq_queue *bfq_init_rq(struct request *rq);
@@ -6141,7 +6141,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
struct bfq_data *bfqd = q->elevator->elevator_data;
struct bfq_queue *bfqq;
bool idle_timer_disabled = false;
- unsigned int cmd_flags;
+ blk_opf_t cmd_flags;
LIST_HEAD(free);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
@@ -7046,6 +7046,7 @@ static void bfq_exit_queue(struct elevator_queue *e)
spin_unlock_irq(&bfqd->lock);
#endif
+ blk_stat_disable_accounting(bfqd->queue);
wbt_enable_default(bfqd->queue);
kfree(bfqd);
@@ -7188,7 +7189,12 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
bfq_init_root_group(bfqd->root_group, bfqd);
bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
+ /* We dispatch from request queue wide instead of hw queue */
+ blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q);
+
wbt_disable_default(q);
+ blk_stat_enable_accounting(q);
+
return 0;
out_free:
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index ca8177d7bf7c..ad8e513d7e87 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -994,11 +994,11 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
void bfqg_stats_update_legacy_io(struct request_queue *q, struct request *rq);
void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
- unsigned int op);
-void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op);
-void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op);
+ blk_opf_t opf);
+void bfqg_stats_update_io_remove(struct bfq_group *bfqg, blk_opf_t opf);
+void bfqg_stats_update_io_merged(struct bfq_group *bfqg, blk_opf_t opf);
void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns,
- u64 io_start_time_ns, unsigned int op);
+ u64 io_start_time_ns, blk_opf_t opf);
void bfqg_stats_update_dequeue(struct bfq_group *bfqg);
void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);
void bfqg_stats_update_idle_time(struct bfq_group *bfqg);
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index f8eb340381cf..983413cdefad 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -1360,6 +1360,8 @@ left:
/**
* __bfq_lookup_next_entity - return the first eligible entity in @st.
* @st: the service tree.
+ * @in_service: whether or not there is an in-service entity for the sched_data
+ * this active tree belongs to.
*
* If there is no in-service entity for the sched_data st belongs to,
* then return the entity that will be set in service if:
@@ -1472,9 +1474,6 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
break;
}
- if (!entity)
- return NULL;
-
return entity;
}
diff --git a/block/bio.c b/block/bio.c
index f92d0223247b..6f9f883f9a65 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -239,7 +239,7 @@ static void bio_free(struct bio *bio)
* when IO has completed, or when the bio is released.
*/
void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
- unsigned short max_vecs, unsigned int opf)
+ unsigned short max_vecs, blk_opf_t opf)
{
bio->bi_next = NULL;
bio->bi_bdev = bdev;
@@ -292,7 +292,7 @@ EXPORT_SYMBOL(bio_init);
* preserved are the ones that are initialized by bio_alloc_bioset(). See
* comment in struct bio.
*/
-void bio_reset(struct bio *bio, struct block_device *bdev, unsigned int opf)
+void bio_reset(struct bio *bio, struct block_device *bdev, blk_opf_t opf)
{
bio_uninit(bio);
memset(bio, 0, BIO_RESET_BYTES);
@@ -341,7 +341,7 @@ void bio_chain(struct bio *bio, struct bio *parent)
EXPORT_SYMBOL(bio_chain);
struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev,
- unsigned int nr_pages, unsigned int opf, gfp_t gfp)
+ unsigned int nr_pages, blk_opf_t opf, gfp_t gfp)
{
struct bio *new = bio_alloc(bdev, nr_pages, opf, gfp);
@@ -409,7 +409,7 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
}
static struct bio *bio_alloc_percpu_cache(struct block_device *bdev,
- unsigned short nr_vecs, unsigned int opf, gfp_t gfp,
+ unsigned short nr_vecs, blk_opf_t opf, gfp_t gfp,
struct bio_set *bs)
{
struct bio_alloc_cache *cache;
@@ -468,7 +468,7 @@ static struct bio *bio_alloc_percpu_cache(struct block_device *bdev,
* Returns: Pointer to new bio on success, NULL on failure.
*/
struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs,
- unsigned int opf, gfp_t gfp_mask,
+ blk_opf_t opf, gfp_t gfp_mask,
struct bio_set *bs)
{
gfp_t saved_gfp = gfp_mask;
@@ -1033,7 +1033,7 @@ int bio_add_zone_append_page(struct bio *bio, struct page *page,
if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_ZONE_APPEND))
return 0;
- if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
+ if (WARN_ON_ONCE(!bdev_is_zoned(bio->bi_bdev)))
return 0;
return bio_add_hw_page(q, bio, page, len, offset,
@@ -1159,6 +1159,37 @@ static void bio_put_pages(struct page **pages, size_t size, size_t off)
put_page(pages[i]);
}
+static int bio_iov_add_page(struct bio *bio, struct page *page,
+ unsigned int len, unsigned int offset)
+{
+ bool same_page = false;
+
+ if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) {
+ if (WARN_ON_ONCE(bio_full(bio, len)))
+ return -EINVAL;
+ __bio_add_page(bio, page, len, offset);
+ return 0;
+ }
+
+ if (same_page)
+ put_page(page);
+ return 0;
+}
+
+static int bio_iov_add_zone_append_page(struct bio *bio, struct page *page,
+ unsigned int len, unsigned int offset)
+{
+ struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+ bool same_page = false;
+
+ if (bio_add_hw_page(q, bio, page, len, offset,
+ queue_max_zone_append_sectors(q), &same_page) != len)
+ return -EINVAL;
+ if (same_page)
+ put_page(page);
+ return 0;
+}
+
#define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *))
/**
@@ -1177,7 +1208,6 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
struct page **pages = (struct page **)bv;
- bool same_page = false;
ssize_t size, left;
unsigned len, i;
size_t offset;
@@ -1186,82 +1216,43 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
* Move page array up in the allocated memory for the bio vecs as far as
* possible so that we can start filling biovecs from the beginning
* without overwriting the temporary page array.
- */
+ */
BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
- size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
- if (unlikely(size <= 0))
- return size ? size : -EFAULT;
-
- for (left = size, i = 0; left > 0; left -= len, i++) {
- struct page *page = pages[i];
-
- len = min_t(size_t, PAGE_SIZE - offset, left);
-
- if (__bio_try_merge_page(bio, page, len, offset, &same_page)) {
- if (same_page)
- put_page(page);
- } else {
- if (WARN_ON_ONCE(bio_full(bio, len))) {
- bio_put_pages(pages + i, left, offset);
- return -EINVAL;
- }
- __bio_add_page(bio, page, len, offset);
- }
- offset = 0;
- }
-
- iov_iter_advance(iter, size);
- return 0;
-}
-
-static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
-{
- unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
- unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
- struct request_queue *q = bdev_get_queue(bio->bi_bdev);
- unsigned int max_append_sectors = queue_max_zone_append_sectors(q);
- struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
- struct page **pages = (struct page **)bv;
- ssize_t size, left;
- unsigned len, i;
- size_t offset;
- int ret = 0;
-
- if (WARN_ON_ONCE(!max_append_sectors))
- return 0;
-
/*
- * Move page array up in the allocated memory for the bio vecs as far as
- * possible so that we can start filling biovecs from the beginning
- * without overwriting the temporary page array.
+ * Each segment in the iov is required to be a block size multiple.
+ * However, we may not be able to get the entire segment if it spans
+ * more pages than bi_max_vecs allows, so we have to ALIGN_DOWN the
+ * result to ensure the bio's total size is correct. The remainder of
+ * the iov data will be picked up in the next bio iteration.
*/
- BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
- pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
-
size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
+ if (size > 0)
+ size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev));
if (unlikely(size <= 0))
return size ? size : -EFAULT;
for (left = size, i = 0; left > 0; left -= len, i++) {
struct page *page = pages[i];
- bool same_page = false;
+ int ret;
len = min_t(size_t, PAGE_SIZE - offset, left);
- if (bio_add_hw_page(q, bio, page, len, offset,
- max_append_sectors, &same_page) != len) {
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND)
+ ret = bio_iov_add_zone_append_page(bio, page, len,
+ offset);
+ else
+ ret = bio_iov_add_page(bio, page, len, offset);
+
+ if (ret) {
bio_put_pages(pages + i, left, offset);
- ret = -EINVAL;
- break;
+ return ret;
}
- if (same_page)
- put_page(page);
offset = 0;
}
- iov_iter_advance(iter, size - left);
- return ret;
+ iov_iter_advance(iter, size);
+ return 0;
}
/**
@@ -1298,10 +1289,7 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
}
do {
- if (bio_op(bio) == REQ_OP_ZONE_APPEND)
- ret = __bio_iov_append_get_pages(bio, iter);
- else
- ret = __bio_iov_iter_get_pages(bio, iter);
+ ret = __bio_iov_iter_get_pages(bio, iter);
} while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
/* don't account direct I/O as memory stall */
@@ -1747,26 +1735,6 @@ bad:
}
EXPORT_SYMBOL(bioset_init);
-/*
- * Initialize and setup a new bio_set, based on the settings from
- * another bio_set.
- */
-int bioset_init_from_src(struct bio_set *bs, struct bio_set *src)
-{
- int flags;
-
- flags = 0;
- if (src->bvec_pool.min_nr)
- flags |= BIOSET_NEED_BVECS;
- if (src->rescue_workqueue)
- flags |= BIOSET_NEED_RESCUER;
- if (src->cache)
- flags |= BIOSET_PERCPU_CACHE;
-
- return bioset_init(bs, src->bio_pool.min_nr, src->front_pad, flags);
-}
-EXPORT_SYMBOL(bioset_init_from_src);
-
static int __init init_bio(void)
{
int i;
diff --git a/block/blk-cgroup-rwstat.h b/block/blk-cgroup-rwstat.h
index 9f2723b34b75..022527b0b043 100644
--- a/block/blk-cgroup-rwstat.h
+++ b/block/blk-cgroup-rwstat.h
@@ -59,20 +59,20 @@ void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol,
* caller is responsible for synchronizing calls to this function.
*/
static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
- unsigned int op, uint64_t val)
+ blk_opf_t opf, uint64_t val)
{
struct percpu_counter *cnt;
- if (op_is_discard(op))
+ if (op_is_discard(opf))
cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_DISCARD];
- else if (op_is_write(op))
+ else if (op_is_write(opf))
cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE];
else
cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ];
percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH);
- if (op_is_sync(op))
+ if (op_is_sync(opf))
cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC];
else
cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC];
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 764e740b0c0f..869af9d72bcf 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -846,6 +846,21 @@ static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src)
}
}
+static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur,
+ struct blkg_iostat *last)
+{
+ struct blkg_iostat delta;
+ unsigned long flags;
+
+ /* propagate percpu delta to global */
+ flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
+ blkg_iostat_set(&delta, cur);
+ blkg_iostat_sub(&delta, last);
+ blkg_iostat_add(&blkg->iostat.cur, &delta);
+ blkg_iostat_add(last, &delta);
+ u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
+}
+
static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
{
struct blkcg *blkcg = css_to_blkcg(css);
@@ -860,8 +875,7 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
struct blkcg_gq *parent = blkg->parent;
struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu);
- struct blkg_iostat cur, delta;
- unsigned long flags;
+ struct blkg_iostat cur;
unsigned int seq;
/* fetch the current per-cpu values */
@@ -870,23 +884,12 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
blkg_iostat_set(&cur, &bisc->cur);
} while (u64_stats_fetch_retry(&bisc->sync, seq));
- /* propagate percpu delta to global */
- flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
- blkg_iostat_set(&delta, &cur);
- blkg_iostat_sub(&delta, &bisc->last);
- blkg_iostat_add(&blkg->iostat.cur, &delta);
- blkg_iostat_add(&bisc->last, &delta);
- u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
+ blkcg_iostat_update(blkg, &cur, &bisc->last);
/* propagate global delta to parent (unless that's root) */
- if (parent && parent->parent) {
- flags = u64_stats_update_begin_irqsave(&parent->iostat.sync);
- blkg_iostat_set(&delta, &blkg->iostat.cur);
- blkg_iostat_sub(&delta, &blkg->iostat.last);
- blkg_iostat_add(&parent->iostat.cur, &delta);
- blkg_iostat_add(&blkg->iostat.last, &delta);
- u64_stats_update_end_irqrestore(&parent->iostat.sync, flags);
- }
+ if (parent && parent->parent)
+ blkcg_iostat_update(parent, &blkg->iostat.cur,
+ &blkg->iostat.last);
}
rcu_read_unlock();
@@ -1299,6 +1302,7 @@ int blkcg_init_queue(struct request_queue *q)
ret = blk_iolatency_init(q);
if (ret) {
blk_throtl_exit(q);
+ blk_ioprio_exit(q);
goto err_destroy_all;
}
@@ -1529,6 +1533,18 @@ void blkcg_deactivate_policy(struct request_queue *q,
}
EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
+static void blkcg_free_all_cpd(struct blkcg_policy *pol)
+{
+ struct blkcg *blkcg;
+
+ list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
+ if (blkcg->cpd[pol->plid]) {
+ pol->cpd_free_fn(blkcg->cpd[pol->plid]);
+ blkcg->cpd[pol->plid] = NULL;
+ }
+ }
+}
+
/**
* blkcg_policy_register - register a blkcg policy
* @pol: blkcg policy to register
@@ -1593,14 +1609,9 @@ int blkcg_policy_register(struct blkcg_policy *pol)
return 0;
err_free_cpds:
- if (pol->cpd_free_fn) {
- list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
- if (blkcg->cpd[pol->plid]) {
- pol->cpd_free_fn(blkcg->cpd[pol->plid]);
- blkcg->cpd[pol->plid] = NULL;
- }
- }
- }
+ if (pol->cpd_free_fn)
+ blkcg_free_all_cpd(pol);
+
blkcg_policy[pol->plid] = NULL;
err_unlock:
mutex_unlock(&blkcg_pol_mutex);
@@ -1617,8 +1628,6 @@ EXPORT_SYMBOL_GPL(blkcg_policy_register);
*/
void blkcg_policy_unregister(struct blkcg_policy *pol)
{
- struct blkcg *blkcg;
-
mutex_lock(&blkcg_pol_register_mutex);
if (WARN_ON(blkcg_policy[pol->plid] != pol))
@@ -1633,14 +1642,9 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)
/* remove cpds and unregister */
mutex_lock(&blkcg_pol_mutex);
- if (pol->cpd_free_fn) {
- list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
- if (blkcg->cpd[pol->plid]) {
- pol->cpd_free_fn(blkcg->cpd[pol->plid]);
- blkcg->cpd[pol->plid] = NULL;
- }
- }
- }
+ if (pol->cpd_free_fn)
+ blkcg_free_all_cpd(pol);
+
blkcg_policy[pol->plid] = NULL;
mutex_unlock(&blkcg_pol_mutex);
@@ -1696,7 +1700,7 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
* everybody is happy with their IO latencies.
*/
if (time_before64(old + NSEC_PER_SEC, now) &&
- atomic64_cmpxchg(&blkg->delay_start, old, now) == old) {
+ atomic64_try_cmpxchg(&blkg->delay_start, &old, now)) {
u64 cur = atomic64_read(&blkg->delay_nsec);
u64 sub = min_t(u64, blkg->last_delay, now - old);
int cur_use = atomic_read(&blkg->use_delay);
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index d4de0a35e066..d2724d1dd7c9 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -430,12 +430,8 @@ static inline int blkcg_unuse_delay(struct blkcg_gq *blkg)
* then check to see if we were the last delay so we can drop the
* congestion count on the cgroup.
*/
- while (old) {
- int cur = atomic_cmpxchg(&blkg->use_delay, old, old - 1);
- if (cur == old)
- break;
- old = cur;
- }
+ while (old && !atomic_try_cmpxchg(&blkg->use_delay, &old, old - 1))
+ ;
if (old == 0)
return 0;
@@ -458,7 +454,7 @@ static inline void blkcg_set_delay(struct blkcg_gq *blkg, u64 delay)
int old = atomic_read(&blkg->use_delay);
/* We only want 1 person setting the congestion count for this blkg. */
- if (!old && atomic_cmpxchg(&blkg->use_delay, old, -1) == old)
+ if (!old && atomic_try_cmpxchg(&blkg->use_delay, &old, -1))
atomic_inc(&blkg->blkcg->css.cgroup->congestion_count);
atomic64_set(&blkg->delay_nsec, delay);
@@ -475,7 +471,7 @@ static inline void blkcg_clear_delay(struct blkcg_gq *blkg)
int old = atomic_read(&blkg->use_delay);
/* We only want 1 person clearing the congestion count for this blkg. */
- if (old && atomic_cmpxchg(&blkg->use_delay, old, 0) == old)
+ if (old && atomic_try_cmpxchg(&blkg->use_delay, &old, 0))
atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
}
diff --git a/block/blk-core.c b/block/blk-core.c
index 06ff5bbfe8f6..3d286a256d3d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -136,7 +136,7 @@ static const char *const blk_op_name[] = {
* string format. Useful in the debugging and tracing bio or request. For
* invalid REQ_OP_XXX it returns string "UNKNOWN".
*/
-inline const char *blk_op_str(unsigned int op)
+inline const char *blk_op_str(enum req_op op)
{
const char *op_str = "UNKNOWN";
@@ -285,62 +285,6 @@ void blk_queue_start_drain(struct request_queue *q)
}
/**
- * blk_cleanup_queue - shutdown a request queue
- * @q: request queue to shutdown
- *
- * Mark @q DYING, drain all pending requests, mark @q DEAD, destroy and
- * put it. All future requests will be failed immediately with -ENODEV.
- *
- * Context: can sleep
- */
-void blk_cleanup_queue(struct request_queue *q)
-{
- /* cannot be called from atomic context */
- might_sleep();
-
- WARN_ON_ONCE(blk_queue_registered(q));
-
- /* mark @q DYING, no new request or merges will be allowed afterwards */
- blk_queue_flag_set(QUEUE_FLAG_DYING, q);
- blk_queue_start_drain(q);
-
- blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q);
- blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
-
- /*
- * Drain all requests queued before DYING marking. Set DEAD flag to
- * prevent that blk_mq_run_hw_queues() accesses the hardware queues
- * after draining finished.
- */
- blk_freeze_queue(q);
-
- blk_queue_flag_set(QUEUE_FLAG_DEAD, q);
-
- blk_sync_queue(q);
- if (queue_is_mq(q)) {
- blk_mq_cancel_work_sync(q);
- blk_mq_exit_queue(q);
- }
-
- /*
- * In theory, request pool of sched_tags belongs to request queue.
- * However, the current implementation requires tag_set for freeing
- * requests, so free the pool now.
- *
- * Queue has become frozen, there can't be any in-queue requests, so
- * it is safe to free requests now.
- */
- mutex_lock(&q->sysfs_lock);
- if (q->elevator)
- blk_mq_sched_free_rqs(q);
- mutex_unlock(&q->sysfs_lock);
-
- /* @q is and will stay empty, shutdown and put */
- blk_put_queue(q);
-}
-EXPORT_SYMBOL(blk_cleanup_queue);
-
-/**
* blk_queue_enter() - try to increase q->q_usage_counter
* @q: request queue pointer
* @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PM
@@ -448,7 +392,7 @@ struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
q->last_merge = NULL;
- q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL);
+ q->id = ida_alloc(&blk_queue_ida, GFP_KERNEL);
if (q->id < 0)
goto fail_srcu;
@@ -498,7 +442,7 @@ fail_stats:
fail_split:
bioset_exit(&q->bio_split);
fail_id:
- ida_simple_remove(&blk_queue_ida, q->id);
+ ida_free(&blk_queue_ida, q->id);
fail_srcu:
if (alloc_srcu)
cleanup_srcu_struct(q->srcu);
@@ -517,12 +461,10 @@ fail_q:
*/
bool blk_get_queue(struct request_queue *q)
{
- if (likely(!blk_queue_dying(q))) {
- __blk_get_queue(q);
- return true;
- }
-
- return false;
+ if (unlikely(blk_queue_dying(q)))
+ return false;
+ kobject_get(&q->kobj);
+ return true;
}
EXPORT_SYMBOL(blk_get_queue);
@@ -621,16 +563,15 @@ static int blk_partition_remap(struct bio *bio)
static inline blk_status_t blk_check_zone_append(struct request_queue *q,
struct bio *bio)
{
- sector_t pos = bio->bi_iter.bi_sector;
int nr_sectors = bio_sectors(bio);
/* Only applicable to zoned block devices */
- if (!blk_queue_is_zoned(q))
+ if (!bdev_is_zoned(bio->bi_bdev))
return BLK_STS_NOTSUPP;
/* The bio sector must point to the start of a sequential zone */
- if (pos & (blk_queue_zone_sectors(q) - 1) ||
- !blk_queue_zone_is_seq(q, pos))
+ if (bio->bi_iter.bi_sector & (bdev_zone_sectors(bio->bi_bdev) - 1) ||
+ !bio_zone_is_seq(bio))
return BLK_STS_IOERR;
/*
@@ -775,7 +716,7 @@ void submit_bio_noacct(struct bio *bio)
might_sleep();
- plug = blk_mq_plug(q, bio);
+ plug = blk_mq_plug(bio);
if (plug && plug->nowait)
bio->bi_opf |= REQ_NOWAIT;
@@ -831,11 +772,11 @@ void submit_bio_noacct(struct bio *bio)
case REQ_OP_ZONE_OPEN:
case REQ_OP_ZONE_CLOSE:
case REQ_OP_ZONE_FINISH:
- if (!blk_queue_is_zoned(q))
+ if (!bdev_is_zoned(bio->bi_bdev))
goto not_supported;
break;
case REQ_OP_ZONE_RESET_ALL:
- if (!blk_queue_is_zoned(q) || !blk_queue_zone_resetall(q))
+ if (!bdev_is_zoned(bio->bi_bdev) || !blk_queue_zone_resetall(q))
goto not_supported;
break;
case REQ_OP_WRITE_ZEROES:
@@ -1000,7 +941,7 @@ void update_io_ticks(struct block_device *part, unsigned long now, bool end)
again:
stamp = READ_ONCE(part->bd_stamp);
if (unlikely(time_after(now, stamp))) {
- if (likely(cmpxchg(&part->bd_stamp, stamp, now) == stamp))
+ if (likely(try_cmpxchg(&part->bd_stamp, &stamp, now)))
__part_stat_add(part, io_ticks, end ? now - stamp : 1);
}
if (part->bd_partno) {
@@ -1010,7 +951,7 @@ again:
}
unsigned long bdev_start_io_acct(struct block_device *bdev,
- unsigned int sectors, unsigned int op,
+ unsigned int sectors, enum req_op op,
unsigned long start_time)
{
const int sgrp = op_stat_group(op);
@@ -1051,7 +992,7 @@ unsigned long bio_start_io_acct(struct bio *bio)
}
EXPORT_SYMBOL_GPL(bio_start_io_acct);
-void bdev_end_io_acct(struct block_device *bdev, unsigned int op,
+void bdev_end_io_acct(struct block_device *bdev, enum req_op op,
unsigned long start_time)
{
const int sgrp = op_stat_group(op);
@@ -1260,7 +1201,7 @@ EXPORT_SYMBOL_GPL(blk_io_schedule);
int __init blk_dev_init(void)
{
- BUILD_BUG_ON(REQ_OP_LAST >= (1 << REQ_OP_BITS));
+ BUILD_BUG_ON((__force u32)REQ_OP_LAST >= (1 << REQ_OP_BITS));
BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
sizeof_field(struct request, cmd_flags));
BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
diff --git a/block/blk-flush.c b/block/blk-flush.c
index c68968724870..d20a0c6b2c66 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -94,7 +94,7 @@ enum {
};
static void blk_kick_flush(struct request_queue *q,
- struct blk_flush_queue *fq, unsigned int flags);
+ struct blk_flush_queue *fq, blk_opf_t flags);
static inline struct blk_flush_queue *
blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx)
@@ -173,7 +173,7 @@ static void blk_flush_complete_seq(struct request *rq,
{
struct request_queue *q = rq->q;
struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
- unsigned int cmd_flags;
+ blk_opf_t cmd_flags;
BUG_ON(rq->flush.seq & seq);
rq->flush.seq |= seq;
@@ -290,7 +290,7 @@ bool is_flush_rq(struct request *rq)
*
*/
static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
- unsigned int flags)
+ blk_opf_t flags)
{
struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
struct request *first_rq =
diff --git a/block/blk-ia-ranges.c b/block/blk-ia-ranges.c
index 56ed48d2954e..2bd1d311033b 100644
--- a/block/blk-ia-ranges.c
+++ b/block/blk-ia-ranges.c
@@ -102,31 +102,18 @@ static struct kobj_type blk_ia_ranges_ktype = {
* disk_register_independent_access_ranges - register with sysfs a set of
* independent access ranges
* @disk: Target disk
- * @new_iars: New set of independent access ranges
*
* Register with sysfs a set of independent access ranges for @disk.
- * If @new_iars is not NULL, this set of ranges is registered and the old set
- * specified by q->ia_ranges is unregistered. Otherwise, q->ia_ranges is
- * registered if it is not already.
*/
-int disk_register_independent_access_ranges(struct gendisk *disk,
- struct blk_independent_access_ranges *new_iars)
+int disk_register_independent_access_ranges(struct gendisk *disk)
{
+ struct blk_independent_access_ranges *iars = disk->ia_ranges;
struct request_queue *q = disk->queue;
- struct blk_independent_access_ranges *iars;
int i, ret;
lockdep_assert_held(&q->sysfs_dir_lock);
lockdep_assert_held(&q->sysfs_lock);
- /* If a new range set is specified, unregister the old one */
- if (new_iars) {
- if (q->ia_ranges)
- disk_unregister_independent_access_ranges(disk);
- q->ia_ranges = new_iars;
- }
-
- iars = q->ia_ranges;
if (!iars)
return 0;
@@ -138,13 +125,12 @@ int disk_register_independent_access_ranges(struct gendisk *disk,
ret = kobject_init_and_add(&iars->kobj, &blk_ia_ranges_ktype,
&q->kobj, "%s", "independent_access_ranges");
if (ret) {
- q->ia_ranges = NULL;
+ disk->ia_ranges = NULL;
kobject_put(&iars->kobj);
return ret;
}
for (i = 0; i < iars->nr_ia_ranges; i++) {
- iars->ia_range[i].queue = q;
ret = kobject_init_and_add(&iars->ia_range[i].kobj,
&blk_ia_range_ktype, &iars->kobj,
"%d", i);
@@ -165,7 +151,7 @@ int disk_register_independent_access_ranges(struct gendisk *disk,
void disk_unregister_independent_access_ranges(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
- struct blk_independent_access_ranges *iars = q->ia_ranges;
+ struct blk_independent_access_ranges *iars = disk->ia_ranges;
int i;
lockdep_assert_held(&q->sysfs_dir_lock);
@@ -183,7 +169,7 @@ void disk_unregister_independent_access_ranges(struct gendisk *disk)
kfree(iars);
}
- q->ia_ranges = NULL;
+ disk->ia_ranges = NULL;
}
static struct blk_independent_access_range *
@@ -211,6 +197,9 @@ static bool disk_check_ia_ranges(struct gendisk *disk,
sector_t sector = 0;
int i;
+ if (WARN_ON_ONCE(!iars->nr_ia_ranges))
+ return false;
+
/*
* While sorting the ranges in increasing LBA order, check that the
* ranges do not overlap, that there are no sector holes and that all
@@ -243,7 +232,7 @@ static bool disk_check_ia_ranges(struct gendisk *disk,
static bool disk_ia_ranges_changed(struct gendisk *disk,
struct blk_independent_access_ranges *new)
{
- struct blk_independent_access_ranges *old = disk->queue->ia_ranges;
+ struct blk_independent_access_ranges *old = disk->ia_ranges;
int i;
if (!old)
@@ -299,25 +288,15 @@ void disk_set_independent_access_ranges(struct gendisk *disk,
{
struct request_queue *q = disk->queue;
- if (WARN_ON_ONCE(iars && !iars->nr_ia_ranges)) {
+ mutex_lock(&q->sysfs_dir_lock);
+ mutex_lock(&q->sysfs_lock);
+ if (iars && !disk_check_ia_ranges(disk, iars)) {
kfree(iars);
iars = NULL;
}
-
- mutex_lock(&q->sysfs_dir_lock);
- mutex_lock(&q->sysfs_lock);
-
- if (iars) {
- if (!disk_check_ia_ranges(disk, iars)) {
- kfree(iars);
- iars = NULL;
- goto reg;
- }
-
- if (!disk_ia_ranges_changed(disk, iars)) {
- kfree(iars);
- goto unlock;
- }
+ if (iars && !disk_ia_ranges_changed(disk, iars)) {
+ kfree(iars);
+ goto unlock;
}
/*
@@ -325,17 +304,12 @@ void disk_set_independent_access_ranges(struct gendisk *disk,
* revalidation. If that is the case, we need to unregister the old
* set of independent access ranges and register the new set. If the
* queue is not registered, registration of the device request queue
- * will register the independent access ranges, so only swap in the
- * new set and free the old one.
+ * will register the independent access ranges.
*/
-reg:
- if (blk_queue_registered(q)) {
- disk_register_independent_access_ranges(disk, iars);
- } else {
- swap(q->ia_ranges, iars);
- kfree(iars);
- }
-
+ disk_unregister_independent_access_ranges(disk);
+ disk->ia_ranges = iars;
+ if (blk_queue_registered(q))
+ disk_register_independent_access_ranges(disk);
unlock:
mutex_unlock(&q->sysfs_lock);
mutex_unlock(&q->sysfs_dir_lock);
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index df9cfe4ca532..63fc02042408 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -247,6 +247,8 @@ static struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
INIT_HLIST_HEAD(&ioc->icq_list);
INIT_WORK(&ioc->release_work, ioc_release_fn);
#endif
+ ioc->ioprio = IOPRIO_DEFAULT;
+
return ioc;
}
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 33a11ba971ea..7936e5f5821c 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -2769,7 +2769,7 @@ static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
return;
- switch (req_op(rq) & REQ_OP_MASK) {
+ switch (req_op(rq)) {
case REQ_OP_READ:
pidx = QOS_RLAT;
rw = READ;
@@ -2886,15 +2886,21 @@ static int blk_iocost_init(struct request_queue *q)
* called before policy activation completion, can't assume that the
* target bio has an iocg associated and need to test for NULL iocg.
*/
- rq_qos_add(q, rqos);
+ ret = rq_qos_add(q, rqos);
+ if (ret)
+ goto err_free_ioc;
+
ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
- if (ret) {
- rq_qos_del(q, rqos);
- free_percpu(ioc->pcpu_stat);
- kfree(ioc);
- return ret;
- }
+ if (ret)
+ goto err_del_qos;
return 0;
+
+err_del_qos:
+ rq_qos_del(q, rqos);
+err_free_ioc:
+ free_percpu(ioc->pcpu_stat);
+ kfree(ioc);
+ return ret;
}
static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 9568bf8dfe82..e285152345a2 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -401,7 +401,6 @@ static void check_scale_change(struct iolatency_grp *iolat)
unsigned int cur_cookie;
unsigned int our_cookie = atomic_read(&iolat->scale_cookie);
u64 scale_lat;
- unsigned int old;
int direction = 0;
if (lat_to_blkg(iolat)->parent == NULL)
@@ -422,11 +421,10 @@ static void check_scale_change(struct iolatency_grp *iolat)
else
return;
- old = atomic_cmpxchg(&iolat->scale_cookie, our_cookie, cur_cookie);
-
- /* Somebody beat us to the punch, just bail. */
- if (old != our_cookie)
+ if (!atomic_try_cmpxchg(&iolat->scale_cookie, &our_cookie, cur_cookie)) {
+ /* Somebody beat us to the punch, just bail. */
return;
+ }
if (direction < 0 && iolat->min_lat_nsec) {
u64 samples_thresh;
@@ -633,8 +631,8 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
window_start = atomic64_read(&iolat->window_start);
if (now > window_start &&
(now - window_start) >= iolat->cur_win_nsec) {
- if (atomic64_cmpxchg(&iolat->window_start,
- window_start, now) == window_start)
+ if (atomic64_try_cmpxchg(&iolat->window_start,
+ &window_start, now))
iolatency_check_latencies(iolat, now);
}
}
@@ -773,19 +771,23 @@ int blk_iolatency_init(struct request_queue *q)
rqos->ops = &blkcg_iolatency_ops;
rqos->q = q;
- rq_qos_add(q, rqos);
-
+ ret = rq_qos_add(q, rqos);
+ if (ret)
+ goto err_free;
ret = blkcg_activate_policy(q, &blkcg_policy_iolatency);
- if (ret) {
- rq_qos_del(q, rqos);
- kfree(blkiolat);
- return ret;
- }
+ if (ret)
+ goto err_qos_del;
timer_setup(&blkiolat->timer, blkiolatency_timer_fn, 0);
INIT_WORK(&blkiolat->enable_work, blkiolatency_enable_work_fn);
return 0;
+
+err_qos_del:
+ rq_qos_del(q, rqos);
+err_free:
+ kfree(blkiolat);
+ return ret;
}
static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val)
diff --git a/block/blk-ioprio.c b/block/blk-ioprio.c
index 79e797f5d194..c00060a02c6e 100644
--- a/block/blk-ioprio.c
+++ b/block/blk-ioprio.c
@@ -62,7 +62,6 @@ struct ioprio_blkg {
struct ioprio_blkcg {
struct blkcg_policy_data cpd;
enum prio_policy prio_policy;
- bool prio_set;
};
static inline struct ioprio_blkg *pd_to_ioprio(struct blkg_policy_data *pd)
@@ -113,7 +112,6 @@ static ssize_t ioprio_set_prio_policy(struct kernfs_open_file *of, char *buf,
if (ret < 0)
return ret;
blkcg->prio_policy = ret;
- blkcg->prio_set = true;
return nbytes;
}
@@ -183,26 +181,20 @@ static struct blkcg_policy ioprio_policy = {
.pd_free_fn = ioprio_free_pd,
};
-struct blk_ioprio {
- struct rq_qos rqos;
-};
-
-static void blkcg_ioprio_track(struct rq_qos *rqos, struct request *rq,
- struct bio *bio)
+void blkcg_set_ioprio(struct bio *bio)
{
struct ioprio_blkcg *blkcg = ioprio_blkcg_from_bio(bio);
u16 prio;
- if (!blkcg->prio_set)
+ if (!blkcg || blkcg->prio_policy == POLICY_NO_CHANGE)
return;
/*
* Except for IOPRIO_CLASS_NONE, higher I/O priority numbers
* correspond to a lower priority. Hence, the max_t() below selects
* the lower priority of bi_ioprio and the cgroup I/O priority class.
- * If the cgroup policy has been set to POLICY_NO_CHANGE == 0, the
- * bio I/O priority is not modified. If the bio I/O priority equals
- * IOPRIO_CLASS_NONE, the cgroup I/O priority is assigned to the bio.
+ * If the bio I/O priority equals IOPRIO_CLASS_NONE, the cgroup I/O
+ * priority is assigned to the bio.
*/
prio = max_t(u16, bio->bi_ioprio,
IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0));
@@ -210,49 +202,14 @@ static void blkcg_ioprio_track(struct rq_qos *rqos, struct request *rq,
bio->bi_ioprio = prio;
}
-static void blkcg_ioprio_exit(struct rq_qos *rqos)
+void blk_ioprio_exit(struct request_queue *q)
{
- struct blk_ioprio *blkioprio_blkg =
- container_of(rqos, typeof(*blkioprio_blkg), rqos);
-
- blkcg_deactivate_policy(rqos->q, &ioprio_policy);
- kfree(blkioprio_blkg);
+ blkcg_deactivate_policy(q, &ioprio_policy);
}
-static struct rq_qos_ops blkcg_ioprio_ops = {
- .track = blkcg_ioprio_track,
- .exit = blkcg_ioprio_exit,
-};
-
int blk_ioprio_init(struct request_queue *q)
{
- struct blk_ioprio *blkioprio_blkg;
- struct rq_qos *rqos;
- int ret;
-
- blkioprio_blkg = kzalloc(sizeof(*blkioprio_blkg), GFP_KERNEL);
- if (!blkioprio_blkg)
- return -ENOMEM;
-
- ret = blkcg_activate_policy(q, &ioprio_policy);
- if (ret) {
- kfree(blkioprio_blkg);
- return ret;
- }
-
- rqos = &blkioprio_blkg->rqos;
- rqos->id = RQ_QOS_IOPRIO;
- rqos->ops = &blkcg_ioprio_ops;
- rqos->q = q;
-
- /*
- * Registering the rq-qos policy after activating the blk-cgroup
- * policy guarantees that ioprio_blkcg_from_bio(bio) != NULL in the
- * rq-qos callbacks.
- */
- rq_qos_add(q, rqos);
-
- return 0;
+ return blkcg_activate_policy(q, &ioprio_policy);
}
static int __init ioprio_init(void)
diff --git a/block/blk-ioprio.h b/block/blk-ioprio.h
index a7785c2f1aea..5a1eb550e178 100644
--- a/block/blk-ioprio.h
+++ b/block/blk-ioprio.h
@@ -6,14 +6,23 @@
#include <linux/kconfig.h>
struct request_queue;
+struct bio;
#ifdef CONFIG_BLK_CGROUP_IOPRIO
int blk_ioprio_init(struct request_queue *q);
+void blk_ioprio_exit(struct request_queue *q);
+void blkcg_set_ioprio(struct bio *bio);
#else
static inline int blk_ioprio_init(struct request_queue *q)
{
return 0;
}
+static inline void blk_ioprio_exit(struct request_queue *q)
+{
+}
+static inline void blkcg_set_ioprio(struct bio *bio)
+{
+}
#endif
#endif /* _BLK_IOPRIO_H_ */
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 09b7e1200c0f..67e6dbc1ae81 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -48,10 +48,8 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
/* In case the discard granularity isn't set by buggy device driver */
if (WARN_ON_ONCE(!bdev_discard_granularity(bdev))) {
- char dev_name[BDEVNAME_SIZE];
-
- bdevname(bdev, dev_name);
- pr_err_ratelimited("%s: Error: discard_granularity is 0.\n", dev_name);
+ pr_err_ratelimited("%pg: Error: discard_granularity is 0.\n",
+ bdev);
return -EOPNOTSUPP;
}
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 7771dacc99cb..4c8a699754c9 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -164,18 +164,21 @@ static struct bio *blk_bio_write_zeroes_split(struct request_queue *q,
static inline unsigned get_max_io_size(struct request_queue *q,
struct bio *bio)
{
- unsigned sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector, 0);
- unsigned max_sectors = sectors;
unsigned pbs = queue_physical_block_size(q) >> SECTOR_SHIFT;
unsigned lbs = queue_logical_block_size(q) >> SECTOR_SHIFT;
- unsigned start_offset = bio->bi_iter.bi_sector & (pbs - 1);
+ unsigned max_sectors = queue_max_sectors(q), start, end;
- max_sectors += start_offset;
- max_sectors &= ~(pbs - 1);
- if (max_sectors > start_offset)
- return max_sectors - start_offset;
+ if (q->limits.chunk_sectors) {
+ max_sectors = min(max_sectors,
+ blk_chunk_sectors_left(bio->bi_iter.bi_sector,
+ q->limits.chunk_sectors));
+ }
- return sectors & ~(lbs - 1);
+ start = bio->bi_iter.bi_sector & (pbs - 1);
+ end = (start + max_sectors) & ~(pbs - 1);
+ if (end > start)
+ return end - start;
+ return max_sectors & ~(lbs - 1);
}
static inline unsigned get_max_segment_size(const struct request_queue *q,
@@ -201,11 +204,11 @@ static inline unsigned get_max_segment_size(const struct request_queue *q,
* @nsegs: [in,out] Number of segments in the bio being built. Incremented
* by the number of segments from @bv that may be appended to that
* bio without exceeding @max_segs
- * @sectors: [in,out] Number of sectors in the bio being built. Incremented
- * by the number of sectors from @bv that may be appended to that
- * bio without exceeding @max_sectors
+ * @bytes: [in,out] Number of bytes in the bio being built. Incremented
+ * by the number of bytes from @bv that may be appended to that
+ * bio without exceeding @max_bytes
* @max_segs: [in] upper bound for *@nsegs
- * @max_sectors: [in] upper bound for *@sectors
+ * @max_bytes: [in] upper bound for *@bytes
*
* When splitting a bio, it can happen that a bvec is encountered that is too
* big to fit in a single segment and hence that it has to be split in the
@@ -216,10 +219,10 @@ static inline unsigned get_max_segment_size(const struct request_queue *q,
*/
static bool bvec_split_segs(const struct request_queue *q,
const struct bio_vec *bv, unsigned *nsegs,
- unsigned *sectors, unsigned max_segs,
- unsigned max_sectors)
+ unsigned *bytes, unsigned max_segs,
+ unsigned max_bytes)
{
- unsigned max_len = (min(max_sectors, UINT_MAX >> 9) - *sectors) << 9;
+ unsigned max_len = min(max_bytes, UINT_MAX) - *bytes;
unsigned len = min(bv->bv_len, max_len);
unsigned total_len = 0;
unsigned seg_size = 0;
@@ -237,7 +240,7 @@ static bool bvec_split_segs(const struct request_queue *q,
break;
}
- *sectors += total_len >> 9;
+ *bytes += total_len;
/* tell the caller to split the bvec if it is too big to fit */
return len > 0 || bv->bv_len > max_len;
@@ -269,8 +272,8 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
{
struct bio_vec bv, bvprv, *bvprvp = NULL;
struct bvec_iter iter;
- unsigned nsegs = 0, sectors = 0;
- const unsigned max_sectors = get_max_io_size(q, bio);
+ unsigned nsegs = 0, bytes = 0;
+ const unsigned max_bytes = get_max_io_size(q, bio) << 9;
const unsigned max_segs = queue_max_segments(q);
bio_for_each_bvec(bv, bio, iter) {
@@ -282,12 +285,12 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
goto split;
if (nsegs < max_segs &&
- sectors + (bv.bv_len >> 9) <= max_sectors &&
+ bytes + bv.bv_len <= max_bytes &&
bv.bv_offset + bv.bv_len <= PAGE_SIZE) {
nsegs++;
- sectors += bv.bv_len >> 9;
- } else if (bvec_split_segs(q, &bv, &nsegs, &sectors, max_segs,
- max_sectors)) {
+ bytes += bv.bv_len;
+ } else if (bvec_split_segs(q, &bv, &nsegs, &bytes, max_segs,
+ max_bytes)) {
goto split;
}
@@ -301,12 +304,19 @@ split:
*segs = nsegs;
/*
+ * Individual bvecs might not be logical block aligned. Round down the
+ * split size so that each bio is properly block size aligned, even if
+ * we do not use the full hardware limits.
+ */
+ bytes = ALIGN_DOWN(bytes, queue_logical_block_size(q));
+
+ /*
* Bio splitting may cause subtle trouble such as hang when doing sync
* iopoll in direct IO routine. Given performance gain of iopoll for
* big IO can be trival, disable iopoll when split needed.
*/
bio_clear_polled(bio);
- return bio_split(bio, sectors, GFP_NOIO, bs);
+ return bio_split(bio, bytes >> SECTOR_SHIFT, GFP_NOIO, bs);
}
/**
@@ -345,6 +355,7 @@ void __blk_queue_split(struct request_queue *q, struct bio **bio,
/* there isn't chance to merge the splitted bio */
split->bi_opf |= REQ_NOMERGE;
+ blkcg_bio_issue_init(split);
bio_chain(split, *bio);
trace_block_split(split, (*bio)->bi_iter.bi_sector);
submit_bio_noacct(*bio);
@@ -375,7 +386,7 @@ EXPORT_SYMBOL(blk_queue_split);
unsigned int blk_recalc_rq_segments(struct request *rq)
{
unsigned int nr_phys_segs = 0;
- unsigned int nr_sectors = 0;
+ unsigned int bytes = 0;
struct req_iterator iter;
struct bio_vec bv;
@@ -395,10 +406,12 @@ unsigned int blk_recalc_rq_segments(struct request *rq)
return 1;
case REQ_OP_WRITE_ZEROES:
return 0;
+ default:
+ break;
}
rq_for_each_bvec(bv, rq, iter)
- bvec_split_segs(rq->q, &bv, &nr_phys_segs, &nr_sectors,
+ bvec_split_segs(rq->q, &bv, &nr_phys_segs, &bytes,
UINT_MAX, UINT_MAX);
return nr_phys_segs;
}
@@ -559,17 +572,18 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq,
sector_t offset)
{
struct request_queue *q = rq->q;
+ unsigned int max_sectors;
if (blk_rq_is_passthrough(rq))
return q->limits.max_hw_sectors;
+ max_sectors = blk_queue_get_max_sectors(q, req_op(rq));
if (!q->limits.chunk_sectors ||
req_op(rq) == REQ_OP_DISCARD ||
req_op(rq) == REQ_OP_SECURE_ERASE)
- return blk_queue_get_max_sectors(q, req_op(rq));
-
- return min(blk_max_size_offset(q, offset, 0),
- blk_queue_get_max_sectors(q, req_op(rq)));
+ return max_sectors;
+ return min(max_sectors,
+ blk_chunk_sectors_left(offset, q->limits.chunk_sectors));
}
static inline int ll_new_hw_segment(struct request *req, struct bio *bio,
@@ -699,7 +713,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
*/
void blk_rq_set_mixed_merge(struct request *rq)
{
- unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK;
+ blk_opf_t ff = rq->cmd_flags & REQ_FAILFAST_MASK;
struct bio *bio;
if (rq->rq_flags & RQF_MIXED_MERGE)
@@ -915,7 +929,7 @@ enum bio_merge_status {
static enum bio_merge_status bio_attempt_back_merge(struct request *req,
struct bio *bio, unsigned int nr_segs)
{
- const int ff = bio->bi_opf & REQ_FAILFAST_MASK;
+ const blk_opf_t ff = bio->bi_opf & REQ_FAILFAST_MASK;
if (!ll_back_merge_fn(req, bio, nr_segs))
return BIO_MERGE_FAILED;
@@ -939,7 +953,7 @@ static enum bio_merge_status bio_attempt_back_merge(struct request *req,
static enum bio_merge_status bio_attempt_front_merge(struct request *req,
struct bio *bio, unsigned int nr_segs)
{
- const int ff = bio->bi_opf & REQ_FAILFAST_MASK;
+ const blk_opf_t ff = bio->bi_opf & REQ_FAILFAST_MASK;
if (!ll_front_merge_fn(req, bio, nr_segs))
return BIO_MERGE_FAILED;
@@ -1040,7 +1054,7 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
struct blk_plug *plug;
struct request *rq;
- plug = blk_mq_plug(q, bio);
+ plug = blk_mq_plug(bio);
if (!plug || rq_list_empty(plug->mq_list))
return false;
diff --git a/block/blk-mq-debugfs-zoned.c b/block/blk-mq-debugfs-zoned.c
index 038cb627c868..a77b099c34b7 100644
--- a/block/blk-mq-debugfs-zoned.c
+++ b/block/blk-mq-debugfs-zoned.c
@@ -11,11 +11,11 @@ int queue_zone_wlock_show(void *data, struct seq_file *m)
struct request_queue *q = data;
unsigned int i;
- if (!q->seq_zones_wlock)
+ if (!q->disk->seq_zones_wlock)
return 0;
- for (i = 0; i < q->nr_zones; i++)
- if (test_bit(i, q->seq_zones_wlock))
+ for (i = 0; i < q->disk->nr_zones; i++)
+ if (test_bit(i, q->disk->seq_zones_wlock))
seq_printf(m, "%u\n", i);
return 0;
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 7e4136a60e1c..8559cea7f300 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -116,7 +116,6 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(NOXMERGES),
QUEUE_FLAG_NAME(ADD_RANDOM),
QUEUE_FLAG_NAME(SAME_FORCE),
- QUEUE_FLAG_NAME(DEAD),
QUEUE_FLAG_NAME(INIT_DONE),
QUEUE_FLAG_NAME(STABLE_WRITES),
QUEUE_FLAG_NAME(POLL),
@@ -151,11 +150,10 @@ static ssize_t queue_state_write(void *data, const char __user *buf,
char opbuf[16] = { }, *op;
/*
- * The "state" attribute is removed after blk_cleanup_queue() has called
- * blk_mq_free_queue(). Return if QUEUE_FLAG_DEAD has been set to avoid
- * triggering a use-after-free.
+ * The "state" attribute is removed when the queue is removed. Don't
+ * allow setting the state on a dying queue to avoid a use-after-free.
*/
- if (blk_queue_dead(q))
+ if (blk_queue_dying(q))
return -ENOENT;
if (count >= sizeof(opbuf)) {
@@ -306,7 +304,7 @@ static const char *blk_mq_rq_state_name(enum mq_rq_state rq_state)
int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
{
const struct blk_mq_ops *const mq_ops = rq->q->mq_ops;
- const unsigned int op = req_op(rq);
+ const enum req_op op = req_op(rq);
const char *op_str = blk_op_str(op);
seq_printf(m, "%p {.op=", rq);
@@ -315,8 +313,8 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
else
seq_printf(m, "%s", op_str);
seq_puts(m, ", .cmd_flags=");
- blk_flags_show(m, rq->cmd_flags & ~REQ_OP_MASK, cmd_flag_name,
- ARRAY_SIZE(cmd_flag_name));
+ blk_flags_show(m, (__force unsigned int)(rq->cmd_flags & ~REQ_OP_MASK),
+ cmd_flag_name, ARRAY_SIZE(cmd_flag_name));
seq_puts(m, ", .rq_flags=");
blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name,
ARRAY_SIZE(rqf_name));
@@ -377,7 +375,7 @@ struct show_busy_params {
* e.g. due to a concurrent blk_mq_finish_request() call. Returns true to
* keep iterating requests.
*/
-static bool hctx_show_busy_rq(struct request *rq, void *data, bool reserved)
+static bool hctx_show_busy_rq(struct request *rq, void *data)
{
const struct show_busy_params *params = data;
@@ -711,11 +709,6 @@ void blk_mq_debugfs_register(struct request_queue *q)
}
}
-void blk_mq_debugfs_unregister(struct request_queue *q)
-{
- q->sched_debugfs_dir = NULL;
-}
-
static void blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx,
struct blk_mq_ctx *ctx)
{
@@ -735,6 +728,9 @@ void blk_mq_debugfs_register_hctx(struct request_queue *q,
char name[20];
int i;
+ if (!q->debugfs_dir)
+ return;
+
snprintf(name, sizeof(name), "hctx%u", hctx->queue_num);
hctx->debugfs_dir = debugfs_create_dir(name, q->debugfs_dir);
@@ -746,6 +742,8 @@ void blk_mq_debugfs_register_hctx(struct request_queue *q,
void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx)
{
+ if (!hctx->queue->debugfs_dir)
+ return;
debugfs_remove_recursive(hctx->debugfs_dir);
hctx->sched_debugfs_dir = NULL;
hctx->debugfs_dir = NULL;
@@ -773,6 +771,8 @@ void blk_mq_debugfs_register_sched(struct request_queue *q)
{
struct elevator_type *e = q->elevator->type;
+ lockdep_assert_held(&q->debugfs_mutex);
+
/*
* If the parent directory has not been created yet, return, we will be
* called again later on and the directory/files will be created then.
@@ -790,6 +790,8 @@ void blk_mq_debugfs_register_sched(struct request_queue *q)
void blk_mq_debugfs_unregister_sched(struct request_queue *q)
{
+ lockdep_assert_held(&q->debugfs_mutex);
+
debugfs_remove_recursive(q->sched_debugfs_dir);
q->sched_debugfs_dir = NULL;
}
@@ -811,6 +813,10 @@ static const char *rq_qos_id_to_name(enum rq_qos_id id)
void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos)
{
+ lockdep_assert_held(&rqos->q->debugfs_mutex);
+
+ if (!rqos->q->debugfs_dir)
+ return;
debugfs_remove_recursive(rqos->debugfs_dir);
rqos->debugfs_dir = NULL;
}
@@ -820,6 +826,8 @@ void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
struct request_queue *q = rqos->q;
const char *dir_name = rq_qos_id_to_name(rqos->id);
+ lockdep_assert_held(&q->debugfs_mutex);
+
if (rqos->debugfs_dir || !rqos->ops->debugfs_attrs)
return;
@@ -833,17 +841,13 @@ void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
debugfs_create_files(rqos->debugfs_dir, rqos, rqos->ops->debugfs_attrs);
}
-void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q)
-{
- debugfs_remove_recursive(q->rqos_debugfs_dir);
- q->rqos_debugfs_dir = NULL;
-}
-
void blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
struct blk_mq_hw_ctx *hctx)
{
struct elevator_type *e = q->elevator->type;
+ lockdep_assert_held(&q->debugfs_mutex);
+
/*
* If the parent debugfs directory has not been created yet, return;
* We will be called again later on with appropriate parent debugfs
@@ -863,6 +867,10 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx)
{
+ lockdep_assert_held(&hctx->queue->debugfs_mutex);
+
+ if (!hctx->queue->debugfs_dir)
+ return;
debugfs_remove_recursive(hctx->sched_debugfs_dir);
hctx->sched_debugfs_dir = NULL;
}
diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h
index 69918f4170d6..9c7d4b6117d4 100644
--- a/block/blk-mq-debugfs.h
+++ b/block/blk-mq-debugfs.h
@@ -21,7 +21,6 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq);
int blk_mq_debugfs_rq_show(struct seq_file *m, void *v);
void blk_mq_debugfs_register(struct request_queue *q);
-void blk_mq_debugfs_unregister(struct request_queue *q);
void blk_mq_debugfs_register_hctx(struct request_queue *q,
struct blk_mq_hw_ctx *hctx);
void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx);
@@ -36,16 +35,11 @@ void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx);
void blk_mq_debugfs_register_rqos(struct rq_qos *rqos);
void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos);
-void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q);
#else
static inline void blk_mq_debugfs_register(struct request_queue *q)
{
}
-static inline void blk_mq_debugfs_unregister(struct request_queue *q)
-{
-}
-
static inline void blk_mq_debugfs_register_hctx(struct request_queue *q,
struct blk_mq_hw_ctx *hctx)
{
@@ -87,10 +81,6 @@ static inline void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
static inline void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos)
{
}
-
-static inline void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q)
-{
-}
#endif
#ifdef CONFIG_BLK_DEBUG_FS_ZONED
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 9e56a69422b6..a4f7c101b53b 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -564,6 +564,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
int ret;
if (!e) {
+ blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q);
q->elevator = NULL;
q->nr_requests = q->tag_set->queue_depth;
return 0;
@@ -593,7 +594,9 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
if (ret)
goto err_free_map_and_rqs;
+ mutex_lock(&q->debugfs_mutex);
blk_mq_debugfs_register_sched(q);
+ mutex_unlock(&q->debugfs_mutex);
queue_for_each_hw_ctx(q, hctx, i) {
if (e->ops.init_hctx) {
@@ -606,7 +609,9 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
return ret;
}
}
+ mutex_lock(&q->debugfs_mutex);
blk_mq_debugfs_register_sched_hctx(q, hctx);
+ mutex_unlock(&q->debugfs_mutex);
}
return 0;
@@ -647,14 +652,21 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
unsigned int flags = 0;
queue_for_each_hw_ctx(q, hctx, i) {
+ mutex_lock(&q->debugfs_mutex);
blk_mq_debugfs_unregister_sched_hctx(hctx);
+ mutex_unlock(&q->debugfs_mutex);
+
if (e->type->ops.exit_hctx && hctx->sched_data) {
e->type->ops.exit_hctx(hctx, i);
hctx->sched_data = NULL;
}
flags = hctx->flags;
}
+
+ mutex_lock(&q->debugfs_mutex);
blk_mq_debugfs_unregister_sched(q);
+ mutex_unlock(&q->debugfs_mutex);
+
if (e->type->ops.exit_sched)
e->type->ops.exit_sched(e);
blk_mq_sched_tags_teardown(q, flags);
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index c08426975856..93997d297d42 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -203,23 +203,6 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
return ret;
}
-void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
-{
- struct blk_mq_hw_ctx *hctx;
- unsigned long i;
-
- lockdep_assert_held(&q->sysfs_dir_lock);
-
- queue_for_each_hw_ctx(q, hctx, i)
- blk_mq_unregister_hctx(hctx);
-
- kobject_uevent(q->mq_kobj, KOBJ_REMOVE);
- kobject_del(q->mq_kobj);
- kobject_put(&dev->kobj);
-
- q->mq_sysfs_init_done = false;
-}
-
void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx)
{
kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
@@ -252,16 +235,16 @@ void blk_mq_sysfs_init(struct request_queue *q)
}
}
-int __blk_mq_register_dev(struct device *dev, struct request_queue *q)
+int blk_mq_sysfs_register(struct gendisk *disk)
{
+ struct request_queue *q = disk->queue;
struct blk_mq_hw_ctx *hctx;
unsigned long i, j;
int ret;
- WARN_ON_ONCE(!q->kobj.parent);
lockdep_assert_held(&q->sysfs_dir_lock);
- ret = kobject_add(q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
+ ret = kobject_add(q->mq_kobj, &disk_to_dev(disk)->kobj, "mq");
if (ret < 0)
goto out;
@@ -286,11 +269,27 @@ unreg:
kobject_uevent(q->mq_kobj, KOBJ_REMOVE);
kobject_del(q->mq_kobj);
- kobject_put(&dev->kobj);
return ret;
}
-void blk_mq_sysfs_unregister(struct request_queue *q)
+void blk_mq_sysfs_unregister(struct gendisk *disk)
+{
+ struct request_queue *q = disk->queue;
+ struct blk_mq_hw_ctx *hctx;
+ unsigned long i;
+
+ lockdep_assert_held(&q->sysfs_dir_lock);
+
+ queue_for_each_hw_ctx(q, hctx, i)
+ blk_mq_unregister_hctx(hctx);
+
+ kobject_uevent(q->mq_kobj, KOBJ_REMOVE);
+ kobject_del(q->mq_kobj);
+
+ q->mq_sysfs_init_done = false;
+}
+
+void blk_mq_sysfs_unregister_hctxs(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
unsigned long i;
@@ -306,7 +305,7 @@ unlock:
mutex_unlock(&q->sysfs_dir_lock);
}
-int blk_mq_sysfs_register(struct request_queue *q)
+int blk_mq_sysfs_register_hctxs(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
unsigned long i;
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 2dcd738c6952..8e3b36d1cb57 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -37,29 +37,25 @@ static void blk_mq_update_wake_batch(struct blk_mq_tags *tags,
* to get tag when first time, the other shared-tag users could reserve
* budget for it.
*/
-bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
+void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
unsigned int users;
if (blk_mq_is_shared_tags(hctx->flags)) {
struct request_queue *q = hctx->queue;
- if (test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) ||
- test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) {
- return true;
- }
+ if (test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
+ return;
+ set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags);
} else {
- if (test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) ||
- test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) {
- return true;
- }
+ if (test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
+ return;
+ set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state);
}
users = atomic_inc_return(&hctx->tags->active_queues);
blk_mq_update_wake_batch(hctx->tags, users);
-
- return true;
}
/*
@@ -266,7 +262,6 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
struct blk_mq_hw_ctx *hctx = iter_data->hctx;
struct request_queue *q = iter_data->q;
struct blk_mq_tag_set *set = q->tag_set;
- bool reserved = iter_data->reserved;
struct blk_mq_tags *tags;
struct request *rq;
bool ret = true;
@@ -276,7 +271,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
else
tags = hctx->tags;
- if (!reserved)
+ if (!iter_data->reserved)
bitnr += tags->nr_reserved_tags;
/*
* We can hit rq == NULL here, because the tagging functions
@@ -287,7 +282,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
return true;
if (rq->q == q && (!hctx || rq->mq_hctx == hctx))
- ret = iter_data->fn(rq, iter_data->data, reserved);
+ ret = iter_data->fn(rq, iter_data->data);
blk_mq_put_rq_ref(rq);
return ret;
}
@@ -337,12 +332,11 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
{
struct bt_tags_iter_data *iter_data = data;
struct blk_mq_tags *tags = iter_data->tags;
- bool reserved = iter_data->flags & BT_TAG_ITER_RESERVED;
struct request *rq;
bool ret = true;
bool iter_static_rqs = !!(iter_data->flags & BT_TAG_ITER_STATIC_RQS);
- if (!reserved)
+ if (!(iter_data->flags & BT_TAG_ITER_RESERVED))
bitnr += tags->nr_reserved_tags;
/*
@@ -358,7 +352,7 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
if (!(iter_data->flags & BT_TAG_ITER_STARTED) ||
blk_mq_request_started(rq))
- ret = iter_data->fn(rq, iter_data->data, reserved);
+ ret = iter_data->fn(rq, iter_data->data);
if (!iter_static_rqs)
blk_mq_put_rq_ref(rq);
return ret;
@@ -448,8 +442,7 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
}
EXPORT_SYMBOL(blk_mq_tagset_busy_iter);
-static bool blk_mq_tagset_count_completed_rqs(struct request *rq,
- void *data, bool reserved)
+static bool blk_mq_tagset_count_completed_rqs(struct request *rq, void *data)
{
unsigned *count = data;
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 5668e28be0b7..91ff37e3b43d 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -47,15 +47,13 @@ enum {
BLK_MQ_TAG_MAX = BLK_MQ_NO_TAG - 1,
};
-extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
+extern void __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *);
-static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
+static inline void blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
- if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED))
- return false;
-
- return __blk_mq_tag_busy(hctx);
+ if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
+ __blk_mq_tag_busy(hctx);
}
static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index e9bf950983c7..70177ee74295 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -42,6 +42,7 @@
#include "blk-stat.h"
#include "blk-mq-sched.h"
#include "blk-rq-qos.h"
+#include "blk-ioprio.h"
static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);
@@ -128,8 +129,7 @@ struct mq_inflight {
unsigned int inflight[2];
};
-static bool blk_mq_check_inflight(struct request *rq, void *priv,
- bool reserved)
+static bool blk_mq_check_inflight(struct request *rq, void *priv)
{
struct mq_inflight *mi = priv;
@@ -474,6 +474,9 @@ retry:
if (!(data->rq_flags & RQF_ELV))
blk_mq_tag_busy(data->hctx);
+ if (data->flags & BLK_MQ_REQ_RESERVED)
+ data->rq_flags |= RQF_RESV;
+
/*
* Try batched alloc if we want more than 1 tag.
*/
@@ -507,13 +510,13 @@ retry:
alloc_time_ns);
}
-struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
+struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
blk_mq_req_flags_t flags)
{
struct blk_mq_alloc_data data = {
.q = q,
.flags = flags,
- .cmd_flags = op,
+ .cmd_flags = opf,
.nr_tags = 1,
};
struct request *rq;
@@ -537,12 +540,12 @@ out_queue_exit:
EXPORT_SYMBOL(blk_mq_alloc_request);
struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
- unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx)
+ blk_opf_t opf, blk_mq_req_flags_t flags, unsigned int hctx_idx)
{
struct blk_mq_alloc_data data = {
.q = q,
.flags = flags,
- .cmd_flags = op,
+ .cmd_flags = opf,
.nr_tags = 1,
};
u64 alloc_time_ns = 0;
@@ -579,6 +582,8 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
if (!blk_mq_hw_queue_mapped(data.hctx))
goto out_queue_exit;
cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
+ if (cpu >= nr_cpu_ids)
+ goto out_queue_exit;
data.ctx = __blk_mq_get_ctx(q, cpu);
if (!q->elevator)
@@ -586,6 +591,9 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
else
data.rq_flags |= RQF_ELV;
+ if (flags & BLK_MQ_REQ_RESERVED)
+ data.rq_flags |= RQF_RESV;
+
ret = -EWOULDBLOCK;
tag = blk_mq_get_tag(&data);
if (tag == BLK_MQ_NO_TAG)
@@ -652,7 +660,7 @@ void blk_dump_rq_flags(struct request *rq, char *msg)
{
printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg,
rq->q->disk ? rq->q->disk->disk_name : "?",
- (unsigned long long) rq->cmd_flags);
+ (__force unsigned long long) rq->cmd_flags);
printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n",
(unsigned long long)blk_rq_pos(rq),
@@ -705,8 +713,9 @@ static void blk_print_req_error(struct request *req, blk_status_t status)
"phys_seg %u prio class %u\n",
blk_status_to_str(status),
req->q->disk ? req->q->disk->disk_name : "?",
- blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)),
- req->cmd_flags & ~REQ_OP_MASK,
+ blk_rq_pos(req), (__force u32)req_op(req),
+ blk_op_str(req_op(req)),
+ (__force u32)(req->cmd_flags & ~REQ_OP_MASK),
req->nr_phys_segments,
IOPRIO_PRIO_CLASS(req->ioprio));
}
@@ -1391,8 +1400,7 @@ void blk_mq_delay_kick_requeue_list(struct request_queue *q,
}
EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
-static bool blk_mq_rq_inflight(struct request *rq, void *priv,
- bool reserved)
+static bool blk_mq_rq_inflight(struct request *rq, void *priv)
{
/*
* If we find a request that isn't idle we know the queue is busy
@@ -1418,13 +1426,13 @@ bool blk_mq_queue_inflight(struct request_queue *q)
}
EXPORT_SYMBOL_GPL(blk_mq_queue_inflight);
-static void blk_mq_rq_timed_out(struct request *req, bool reserved)
+static void blk_mq_rq_timed_out(struct request *req)
{
req->rq_flags |= RQF_TIMED_OUT;
if (req->q->mq_ops->timeout) {
enum blk_eh_timer_return ret;
- ret = req->q->mq_ops->timeout(req, reserved);
+ ret = req->q->mq_ops->timeout(req);
if (ret == BLK_EH_DONE)
return;
WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER);
@@ -1461,7 +1469,7 @@ void blk_mq_put_rq_ref(struct request *rq)
__blk_mq_free_request(rq);
}
-static bool blk_mq_check_expired(struct request *rq, void *priv, bool reserved)
+static bool blk_mq_check_expired(struct request *rq, void *priv)
{
unsigned long *next = priv;
@@ -1473,7 +1481,7 @@ static bool blk_mq_check_expired(struct request *rq, void *priv, bool reserved)
* from blk_mq_check_expired().
*/
if (blk_mq_req_expired(rq, next))
- blk_mq_rq_timed_out(rq, reserved);
+ blk_mq_rq_timed_out(rq);
return true;
}
@@ -2083,14 +2091,10 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
return;
if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
- int cpu = get_cpu();
- if (cpumask_test_cpu(cpu, hctx->cpumask)) {
+ if (cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) {
__blk_mq_run_hw_queue(hctx);
- put_cpu();
return;
}
-
- put_cpu();
}
kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work,
@@ -2141,20 +2145,6 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
EXPORT_SYMBOL(blk_mq_run_hw_queue);
/*
- * Is the request queue handled by an IO scheduler that does not respect
- * hardware queues when dispatching?
- */
-static bool blk_mq_has_sqsched(struct request_queue *q)
-{
- struct elevator_queue *e = q->elevator;
-
- if (e && e->type->ops.dispatch_request &&
- !(e->type->elevator_features & ELEVATOR_F_MQ_AWARE))
- return true;
- return false;
-}
-
-/*
* Return prefered queue to dispatch from (if any) for non-mq aware IO
* scheduler.
*/
@@ -2168,7 +2158,7 @@ static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q)
* just causes lock contention inside the scheduler and pointless cache
* bouncing.
*/
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, 0, ctx);
+ struct blk_mq_hw_ctx *hctx = ctx->hctxs[HCTX_TYPE_DEFAULT];
if (!blk_mq_hctx_stopped(hctx))
return hctx;
@@ -2186,7 +2176,7 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async)
unsigned long i;
sq_hctx = NULL;
- if (blk_mq_has_sqsched(q))
+ if (blk_queue_sq_sched(q))
sq_hctx = blk_mq_get_sq_hctx(q);
queue_for_each_hw_ctx(q, hctx, i) {
if (blk_mq_hctx_stopped(hctx))
@@ -2214,7 +2204,7 @@ void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
unsigned long i;
sq_hctx = NULL;
- if (blk_mq_has_sqsched(q))
+ if (blk_queue_sq_sched(q))
sq_hctx = blk_mq_get_sq_hctx(q);
queue_for_each_hw_ctx(q, hctx, i) {
if (blk_mq_hctx_stopped(hctx))
@@ -2777,19 +2767,32 @@ static inline struct request *blk_mq_get_cached_request(struct request_queue *q,
return NULL;
}
- rq_qos_throttle(q, *bio);
-
if (blk_mq_get_hctx_type((*bio)->bi_opf) != rq->mq_hctx->type)
return NULL;
if (op_is_flush(rq->cmd_flags) != op_is_flush((*bio)->bi_opf))
return NULL;
- rq->cmd_flags = (*bio)->bi_opf;
+ /*
+ * If any qos ->throttle() end up blocking, we will have flushed the
+ * plug and hence killed the cached_rq list as well. Pop this entry
+ * before we throttle.
+ */
plug->cached_rq = rq_list_next(rq);
+ rq_qos_throttle(q, *bio);
+
+ rq->cmd_flags = (*bio)->bi_opf;
INIT_LIST_HEAD(&rq->queuelist);
return rq;
}
+static void bio_set_ioprio(struct bio *bio)
+{
+ /* Nobody set ioprio so far? Initialize it based on task's nice value */
+ if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) == IOPRIO_CLASS_NONE)
+ bio->bi_ioprio = get_current_ioprio();
+ blkcg_set_ioprio(bio);
+}
+
/**
* blk_mq_submit_bio - Create and send a request to block device.
* @bio: Bio pointer.
@@ -2806,7 +2809,7 @@ static inline struct request *blk_mq_get_cached_request(struct request_queue *q,
void blk_mq_submit_bio(struct bio *bio)
{
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
- struct blk_plug *plug = blk_mq_plug(q, bio);
+ struct blk_plug *plug = blk_mq_plug(bio);
const int is_sync = op_is_sync(bio->bi_opf);
struct request *rq;
unsigned int nr_segs = 1;
@@ -2819,6 +2822,8 @@ void blk_mq_submit_bio(struct bio *bio)
if (!bio_integrity_prep(bio))
return;
+ bio_set_ioprio(bio);
+
rq = blk_mq_get_cached_request(q, plug, &bio, nr_segs);
if (!rq) {
if (!bio)
@@ -3283,7 +3288,7 @@ struct rq_iter_data {
bool has_rq;
};
-static bool blk_mq_has_request(struct request *rq, void *data, bool reserved)
+static bool blk_mq_has_request(struct request *rq, void *data)
{
struct rq_iter_data *iter_data = data;
@@ -3443,8 +3448,9 @@ static void blk_mq_exit_hctx(struct request_queue *q,
if (blk_mq_hw_queue_mapped(hctx))
blk_mq_tag_idle(hctx);
- blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx],
- set->queue_depth, flush_rq);
+ if (blk_queue_init_done(q))
+ blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx],
+ set->queue_depth, flush_rq);
if (set->ops->exit_request)
set->ops->exit_request(set, flush_rq, hctx_idx);
@@ -3901,7 +3907,7 @@ static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
q->queuedata = queuedata;
ret = blk_mq_init_allocated_queue(set, q);
if (ret) {
- blk_cleanup_queue(q);
+ blk_put_queue(q);
return ERR_PTR(ret);
}
return q;
@@ -3913,6 +3919,35 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
}
EXPORT_SYMBOL(blk_mq_init_queue);
+/**
+ * blk_mq_destroy_queue - shutdown a request queue
+ * @q: request queue to shutdown
+ *
+ * This shuts down a request queue allocated by blk_mq_init_queue() and drops
+ * the initial reference. All future requests will failed with -ENODEV.
+ *
+ * Context: can sleep
+ */
+void blk_mq_destroy_queue(struct request_queue *q)
+{
+ WARN_ON_ONCE(!queue_is_mq(q));
+ WARN_ON_ONCE(blk_queue_registered(q));
+
+ might_sleep();
+
+ blk_queue_flag_set(QUEUE_FLAG_DYING, q);
+ blk_queue_start_drain(q);
+ blk_freeze_queue(q);
+
+ blk_sync_queue(q);
+ blk_mq_cancel_work_sync(q);
+ blk_mq_exit_queue(q);
+
+ /* @q is and will stay empty, shutdown and put */
+ blk_put_queue(q);
+}
+EXPORT_SYMBOL(blk_mq_destroy_queue);
+
struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata,
struct lock_class_key *lkclass)
{
@@ -3925,13 +3960,23 @@ struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata,
disk = __alloc_disk_node(q, set->numa_node, lkclass);
if (!disk) {
- blk_cleanup_queue(q);
+ blk_mq_destroy_queue(q);
return ERR_PTR(-ENOMEM);
}
+ set_bit(GD_OWNS_QUEUE, &disk->state);
return disk;
}
EXPORT_SYMBOL(__blk_mq_alloc_disk);
+struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q,
+ struct lock_class_key *lkclass)
+{
+ if (!blk_get_queue(q))
+ return NULL;
+ return __alloc_disk_node(q, NUMA_NO_NODE, lkclass);
+}
+EXPORT_SYMBOL(blk_mq_alloc_disk_for_queue);
+
static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
struct blk_mq_tag_set *set, struct request_queue *q,
int hctx_idx, int node)
@@ -4438,12 +4483,14 @@ static bool blk_mq_elv_switch_none(struct list_head *head,
if (!qe)
return false;
+ /* q->elevator needs protection from ->sysfs_lock */
+ mutex_lock(&q->sysfs_lock);
+
INIT_LIST_HEAD(&qe->node);
qe->q = q;
qe->type = q->elevator->type;
list_add(&qe->node, head);
- mutex_lock(&q->sysfs_lock);
/*
* After elevator_switch_mq, the previous elevator_queue will be
* released by elevator_release. The reference of the io scheduler
@@ -4517,7 +4564,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_debugfs_unregister_hctxs(q);
- blk_mq_sysfs_unregister(q);
+ blk_mq_sysfs_unregister_hctxs(q);
}
prev_nr_hw_queues = set->nr_hw_queues;
@@ -4548,7 +4595,7 @@ fallback:
reregister:
list_for_each_entry(q, &set->tag_list, tag_set_list) {
- blk_mq_sysfs_register(q);
+ blk_mq_sysfs_register_hctxs(q);
blk_mq_debugfs_register_hctxs(q);
}
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 2615bd58bad3..8ca453ac243d 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -86,16 +86,16 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue *
return xa_load(&q->hctx_table, q->tag_set->map[type].mq_map[cpu]);
}
-static inline enum hctx_type blk_mq_get_hctx_type(unsigned int flags)
+static inline enum hctx_type blk_mq_get_hctx_type(blk_opf_t opf)
{
enum hctx_type type = HCTX_TYPE_DEFAULT;
/*
* The caller ensure that if REQ_POLLED, poll must be enabled.
*/
- if (flags & REQ_POLLED)
+ if (opf & REQ_POLLED)
type = HCTX_TYPE_POLL;
- else if ((flags & REQ_OP_MASK) == REQ_OP_READ)
+ else if ((opf & REQ_OP_MASK) == REQ_OP_READ)
type = HCTX_TYPE_READ;
return type;
}
@@ -103,14 +103,14 @@ static inline enum hctx_type blk_mq_get_hctx_type(unsigned int flags)
/*
* blk_mq_map_queue() - map (cmd_flags,type) to hardware queue
* @q: request queue
- * @flags: request command flags
+ * @opf: operation type (REQ_OP_*) and flags (e.g. REQ_POLLED).
* @ctx: software queue cpu ctx
*/
static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
- unsigned int flags,
+ blk_opf_t opf,
struct blk_mq_ctx *ctx)
{
- return ctx->hctxs[blk_mq_get_hctx_type(flags)];
+ return ctx->hctxs[blk_mq_get_hctx_type(opf)];
}
/*
@@ -118,9 +118,10 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
*/
extern void blk_mq_sysfs_init(struct request_queue *q);
extern void blk_mq_sysfs_deinit(struct request_queue *q);
-extern int __blk_mq_register_dev(struct device *dev, struct request_queue *q);
-extern int blk_mq_sysfs_register(struct request_queue *q);
-extern void blk_mq_sysfs_unregister(struct request_queue *q);
+int blk_mq_sysfs_register(struct gendisk *disk);
+void blk_mq_sysfs_unregister(struct gendisk *disk);
+int blk_mq_sysfs_register_hctxs(struct request_queue *q);
+void blk_mq_sysfs_unregister_hctxs(struct request_queue *q);
extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
void blk_mq_free_plug_rqs(struct blk_plug *plug);
void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
@@ -151,7 +152,7 @@ struct blk_mq_alloc_data {
struct request_queue *q;
blk_mq_req_flags_t flags;
unsigned int shallow_depth;
- unsigned int cmd_flags;
+ blk_opf_t cmd_flags;
req_flags_t rq_flags;
/* allocate multiple requests/tags in one go */
@@ -293,7 +294,6 @@ static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
/*
* blk_mq_plug() - Get caller context plug
- * @q: request queue
* @bio : the bio being submitted by the caller context
*
* Plugging, by design, may delay the insertion of BIOs into the elevator in
@@ -304,23 +304,22 @@ static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
* order. While this is not a problem with regular block devices, this ordering
* change can cause write BIO failures with zoned block devices as these
* require sequential write patterns to zones. Prevent this from happening by
- * ignoring the plug state of a BIO issuing context if the target request queue
- * is for a zoned block device and the BIO to plug is a write operation.
+ * ignoring the plug state of a BIO issuing context if it is for a zoned block
+ * device and the BIO to plug is a write operation.
*
* Return current->plug if the bio can be plugged and NULL otherwise
*/
-static inline struct blk_plug *blk_mq_plug(struct request_queue *q,
- struct bio *bio)
+static inline struct blk_plug *blk_mq_plug( struct bio *bio)
{
+ /* Zoned block device write operation case: do not plug the BIO */
+ if (bdev_is_zoned(bio->bi_bdev) && op_is_write(bio_op(bio)))
+ return NULL;
+
/*
* For regular block devices or read operations, use the context plug
* which may be NULL if blk_start_plug() was not executed.
*/
- if (!blk_queue_is_zoned(q) || !op_is_write(bio_op(bio)))
- return current->plug;
-
- /* Zoned block device write operation case: do not plug the BIO */
- return NULL;
+ return current->plug;
}
/* Free all requests on the list */
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index e83af7bc7591..88f0fe7dcf54 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -10,16 +10,10 @@ static bool atomic_inc_below(atomic_t *v, unsigned int below)
{
unsigned int cur = atomic_read(v);
- for (;;) {
- unsigned int old;
-
+ do {
if (cur >= below)
return false;
- old = atomic_cmpxchg(v, cur, cur + 1);
- if (old == cur)
- break;
- cur = old;
- }
+ } while (!atomic_try_cmpxchg(v, &cur, cur + 1));
return true;
}
@@ -294,8 +288,6 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data,
void rq_qos_exit(struct request_queue *q)
{
- blk_mq_debugfs_unregister_queue_rqos(q);
-
while (q->rq_qos) {
struct rq_qos *rqos = q->rq_qos;
q->rq_qos = rqos->next;
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index 68267007da1c..08b856570ad1 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -86,7 +86,7 @@ static inline void rq_wait_init(struct rq_wait *rq_wait)
init_waitqueue_head(&rq_wait->wait);
}
-static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
+static inline int rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
{
/*
* No IO can be in-flight when adding rqos, so freeze queue, which
@@ -98,14 +98,26 @@ static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
blk_mq_freeze_queue(q);
spin_lock_irq(&q->queue_lock);
+ if (rq_qos_id(q, rqos->id))
+ goto ebusy;
rqos->next = q->rq_qos;
q->rq_qos = rqos;
spin_unlock_irq(&q->queue_lock);
blk_mq_unfreeze_queue(q);
- if (rqos->ops->debugfs_attrs)
+ if (rqos->ops->debugfs_attrs) {
+ mutex_lock(&q->debugfs_mutex);
blk_mq_debugfs_register_rqos(rqos);
+ mutex_unlock(&q->debugfs_mutex);
+ }
+
+ return 0;
+ebusy:
+ spin_unlock_irq(&q->queue_lock);
+ blk_mq_unfreeze_queue(q);
+ return -EBUSY;
+
}
static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos)
@@ -129,7 +141,9 @@ static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos)
blk_mq_unfreeze_queue(q);
+ mutex_lock(&q->debugfs_mutex);
blk_mq_debugfs_unregister_rqos(rqos);
+ mutex_unlock(&q->debugfs_mutex);
}
typedef bool (acquire_inflight_cb_t)(struct rq_wait *rqw, void *private_data);
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 6ccceb421ed2..8bb9eef5310e 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -893,18 +893,19 @@ static bool disk_has_partitions(struct gendisk *disk)
}
/**
- * blk_queue_set_zoned - configure a disk queue zoned model.
+ * disk_set_zoned - configure the zoned model for a disk
* @disk: the gendisk of the queue to configure
* @model: the zoned model to set
*
- * Set the zoned model of the request queue of @disk according to @model.
+ * Set the zoned model of @disk to @model.
+ *
* When @model is BLK_ZONED_HM (host managed), this should be called only
* if zoned block device support is enabled (CONFIG_BLK_DEV_ZONED option).
* If @model specifies BLK_ZONED_HA (host aware), the effective model used
* depends on CONFIG_BLK_DEV_ZONED settings and on the existence of partitions
* on the disk.
*/
-void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model)
+void disk_set_zoned(struct gendisk *disk, enum blk_zoned_model model)
{
struct request_queue *q = disk->queue;
@@ -945,10 +946,10 @@ void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model)
blk_queue_zone_write_granularity(q,
queue_logical_block_size(q));
} else {
- blk_queue_clear_zone_settings(q);
+ disk_clear_zone_settings(disk);
}
}
-EXPORT_SYMBOL_GPL(blk_queue_set_zoned);
+EXPORT_SYMBOL_GPL(disk_set_zoned);
int bdev_alignment_offset(struct block_device *bdev)
{
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 88bd41d4cb59..c0303026752d 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -274,6 +274,11 @@ static ssize_t queue_virt_boundary_mask_show(struct request_queue *q, char *page
return queue_var_show(q->limits.virt_boundary_mask, page);
}
+static ssize_t queue_dma_alignment_show(struct request_queue *q, char *page)
+{
+ return queue_var_show(queue_dma_alignment(q), page);
+}
+
#define QUEUE_SYSFS_BIT_FNS(name, flag, neg) \
static ssize_t \
queue_##name##_show(struct request_queue *q, char *page) \
@@ -320,17 +325,17 @@ static ssize_t queue_zoned_show(struct request_queue *q, char *page)
static ssize_t queue_nr_zones_show(struct request_queue *q, char *page)
{
- return queue_var_show(blk_queue_nr_zones(q), page);
+ return queue_var_show(disk_nr_zones(q->disk), page);
}
static ssize_t queue_max_open_zones_show(struct request_queue *q, char *page)
{
- return queue_var_show(queue_max_open_zones(q), page);
+ return queue_var_show(bdev_max_open_zones(q->disk->part0), page);
}
static ssize_t queue_max_active_zones_show(struct request_queue *q, char *page)
{
- return queue_var_show(queue_max_active_zones(q), page);
+ return queue_var_show(bdev_max_active_zones(q->disk->part0), page);
}
static ssize_t queue_nomerges_show(struct request_queue *q, char *page)
@@ -606,6 +611,7 @@ QUEUE_RO_ENTRY(queue_dax, "dax");
QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout");
QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec");
QUEUE_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask");
+QUEUE_RO_ENTRY(queue_dma_alignment, "dma_alignment");
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
QUEUE_RW_ENTRY(blk_throtl_sample_time, "throttle_sample_time");
@@ -667,6 +673,7 @@ static struct attribute *queue_attrs[] = {
&blk_throtl_sample_time_entry.attr,
#endif
&queue_virt_boundary_mask_entry.attr,
+ &queue_dma_alignment_entry.attr,
NULL,
};
@@ -748,11 +755,6 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head)
* decremented with blk_put_queue(). Once the refcount reaches 0 this function
* is called.
*
- * For drivers that have a request_queue on a gendisk and added with
- * __device_add_disk() the refcount to request_queue will reach 0 with
- * the last put_disk() called by the driver. For drivers which don't use
- * __device_add_disk() this happens with blk_cleanup_queue().
- *
* Drivers exist which depend on the release of the request_queue to be
* synchronous, it should not be deferred.
*
@@ -774,25 +776,15 @@ static void blk_release_queue(struct kobject *kobj)
blk_free_queue_stats(q->stats);
kfree(q->poll_stat);
- blk_queue_free_zone_bitmaps(q);
-
if (queue_is_mq(q))
blk_mq_release(q);
- blk_trace_shutdown(q);
- mutex_lock(&q->debugfs_mutex);
- debugfs_remove_recursive(q->debugfs_dir);
- mutex_unlock(&q->debugfs_mutex);
-
- if (queue_is_mq(q))
- blk_mq_debugfs_unregister(q);
-
bioset_exit(&q->bio_split);
if (blk_queue_has_srcu(q))
cleanup_srcu_struct(q->srcu);
- ida_simple_remove(&blk_queue_ida, q->id);
+ ida_free(&blk_queue_ida, q->id);
call_rcu(&q->rcu_head, blk_free_queue_rcu);
}
@@ -801,7 +793,13 @@ static const struct sysfs_ops queue_sysfs_ops = {
.store = queue_attr_store,
};
+static const struct attribute_group *blk_queue_attr_groups[] = {
+ &queue_attr_group,
+ NULL
+};
+
struct kobj_type blk_queue_ktype = {
+ .default_groups = blk_queue_attr_groups,
.sysfs_ops = &queue_sysfs_ops,
.release = blk_release_queue,
};
@@ -812,43 +810,27 @@ struct kobj_type blk_queue_ktype = {
*/
int blk_register_queue(struct gendisk *disk)
{
- int ret;
- struct device *dev = disk_to_dev(disk);
struct request_queue *q = disk->queue;
-
- ret = blk_trace_init_sysfs(dev);
- if (ret)
- return ret;
+ int ret;
mutex_lock(&q->sysfs_dir_lock);
- ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue");
- if (ret < 0) {
- blk_trace_remove_sysfs(dev);
+ ret = kobject_add(&q->kobj, &disk_to_dev(disk)->kobj, "queue");
+ if (ret < 0)
goto unlock;
- }
- ret = sysfs_create_group(&q->kobj, &queue_attr_group);
- if (ret) {
- blk_trace_remove_sysfs(dev);
- kobject_del(&q->kobj);
- kobject_put(&dev->kobj);
- goto unlock;
- }
+ if (queue_is_mq(q))
+ blk_mq_sysfs_register(disk);
+ mutex_lock(&q->sysfs_lock);
mutex_lock(&q->debugfs_mutex);
q->debugfs_dir = debugfs_create_dir(kobject_name(q->kobj.parent),
blk_debugfs_root);
- mutex_unlock(&q->debugfs_mutex);
-
- if (queue_is_mq(q)) {
- __blk_mq_register_dev(dev, q);
+ if (queue_is_mq(q))
blk_mq_debugfs_register(q);
- }
-
- mutex_lock(&q->sysfs_lock);
+ mutex_unlock(&q->debugfs_mutex);
- ret = disk_register_independent_access_ranges(disk, NULL);
+ ret = disk_register_independent_access_ranges(disk);
if (ret)
goto put_dev;
@@ -897,8 +879,6 @@ put_dev:
mutex_unlock(&q->sysfs_lock);
mutex_unlock(&q->sysfs_dir_lock);
kobject_del(&q->kobj);
- blk_trace_remove_sysfs(dev);
- kobject_put(&dev->kobj);
return ret;
}
@@ -936,9 +916,8 @@ void blk_unregister_queue(struct gendisk *disk)
* structures that can be modified through sysfs.
*/
if (queue_is_mq(q))
- blk_mq_unregister_dev(disk_to_dev(disk), q);
+ blk_mq_sysfs_unregister(disk);
blk_crypto_sysfs_unregister(q);
- blk_trace_remove_sysfs(disk_to_dev(disk));
mutex_lock(&q->sysfs_lock);
elv_unregister_queue(q);
@@ -948,8 +927,13 @@ void blk_unregister_queue(struct gendisk *disk)
/* Now that we've deleted all child objects, we can delete the queue. */
kobject_uevent(&q->kobj, KOBJ_REMOVE);
kobject_del(&q->kobj);
-
mutex_unlock(&q->sysfs_dir_lock);
- kobject_put(&disk_to_dev(disk)->kobj);
+ mutex_lock(&q->debugfs_mutex);
+ blk_trace_shutdown(q);
+ debugfs_remove_recursive(q->debugfs_dir);
+ q->debugfs_dir = NULL;
+ q->sched_debugfs_dir = NULL;
+ q->rqos_debugfs_dir = NULL;
+ mutex_unlock(&q->debugfs_mutex);
}
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 139b2d7a99e2..9f5fe62afff9 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -2203,8 +2203,9 @@ out_unlock:
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
static void throtl_track_latency(struct throtl_data *td, sector_t size,
- int op, unsigned long time)
+ enum req_op op, unsigned long time)
{
+ const bool rw = op_is_write(op);
struct latency_bucket *latency;
int index;
@@ -2215,10 +2216,10 @@ static void throtl_track_latency(struct throtl_data *td, sector_t size,
index = request_bucket_index(size);
- latency = get_cpu_ptr(td->latency_buckets[op]);
+ latency = get_cpu_ptr(td->latency_buckets[rw]);
latency[index].total_latency += time;
latency[index].samples++;
- put_cpu_ptr(td->latency_buckets[op]);
+ put_cpu_ptr(td->latency_buckets[rw]);
}
void blk_throtl_stat_add(struct request *rq, u64 time_ns)
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 0c119be0e813..a9982000b667 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -451,7 +451,7 @@ static bool close_io(struct rq_wb *rwb)
#define REQ_HIPRIO (REQ_SYNC | REQ_META | REQ_PRIO)
-static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
+static inline unsigned int get_limit(struct rq_wb *rwb, blk_opf_t opf)
{
unsigned int limit;
@@ -462,7 +462,7 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
if (!rwb_enabled(rwb))
return UINT_MAX;
- if ((rw & REQ_OP_MASK) == REQ_OP_DISCARD)
+ if ((opf & REQ_OP_MASK) == REQ_OP_DISCARD)
return rwb->wb_background;
/*
@@ -473,9 +473,9 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
* the idle limit, or go to normal if we haven't had competing
* IO for a bit.
*/
- if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd())
+ if ((opf & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd())
limit = rwb->rq_depth.max_depth;
- else if ((rw & REQ_BACKGROUND) || close_io(rwb)) {
+ else if ((opf & REQ_BACKGROUND) || close_io(rwb)) {
/*
* If less than 100ms since we completed unrelated IO,
* limit us to half the depth for background writeback.
@@ -490,13 +490,13 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
struct wbt_wait_data {
struct rq_wb *rwb;
enum wbt_flags wb_acct;
- unsigned long rw;
+ blk_opf_t opf;
};
static bool wbt_inflight_cb(struct rq_wait *rqw, void *private_data)
{
struct wbt_wait_data *data = private_data;
- return rq_wait_inc_below(rqw, get_limit(data->rwb, data->rw));
+ return rq_wait_inc_below(rqw, get_limit(data->rwb, data->opf));
}
static void wbt_cleanup_cb(struct rq_wait *rqw, void *private_data)
@@ -510,13 +510,13 @@ static void wbt_cleanup_cb(struct rq_wait *rqw, void *private_data)
* the timer to kick off queuing again.
*/
static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
- unsigned long rw)
+ blk_opf_t opf)
{
struct rq_wait *rqw = get_rq_wait(rwb, wb_acct);
struct wbt_wait_data data = {
.rwb = rwb,
.wb_acct = wb_acct,
- .rw = rw,
+ .opf = opf,
};
rq_qos_wait(rqw, &data, wbt_inflight_cb, wbt_cleanup_cb);
@@ -670,7 +670,7 @@ u64 wbt_default_latency_nsec(struct request_queue *q)
static int wbt_data_dir(const struct request *rq)
{
- const int op = req_op(rq);
+ const enum req_op op = req_op(rq);
if (op == REQ_OP_READ)
return READ;
@@ -820,6 +820,7 @@ int wbt_init(struct request_queue *q)
{
struct rq_wb *rwb;
int i;
+ int ret;
rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
if (!rwb)
@@ -846,7 +847,10 @@ int wbt_init(struct request_queue *q)
/*
* Assign rwb and add the stats callback.
*/
- rq_qos_add(q, &rwb->rqos);
+ ret = rq_qos_add(q, &rwb->rqos);
+ if (ret)
+ goto err_free;
+
blk_stat_add_callback(q, rwb->cb);
rwb->min_lat_nsec = wbt_default_latency_nsec(q);
@@ -855,4 +859,10 @@ int wbt_init(struct request_queue *q)
wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
return 0;
+
+err_free:
+ blk_stat_free_callback(rwb->cb);
+ kfree(rwb);
+ return ret;
+
}
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 38cd840d8838..a264621d4905 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -57,10 +57,10 @@ EXPORT_SYMBOL_GPL(blk_zone_cond_str);
*/
bool blk_req_needs_zone_write_lock(struct request *rq)
{
- if (!rq->q->seq_zones_wlock)
+ if (blk_rq_is_passthrough(rq))
return false;
- if (blk_rq_is_passthrough(rq))
+ if (!rq->q->disk->seq_zones_wlock)
return false;
switch (req_op(rq)) {
@@ -77,7 +77,7 @@ bool blk_req_zone_write_trylock(struct request *rq)
{
unsigned int zno = blk_rq_zone_no(rq);
- if (test_and_set_bit(zno, rq->q->seq_zones_wlock))
+ if (test_and_set_bit(zno, rq->q->disk->seq_zones_wlock))
return false;
WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED);
@@ -90,7 +90,7 @@ EXPORT_SYMBOL_GPL(blk_req_zone_write_trylock);
void __blk_req_zone_write_lock(struct request *rq)
{
if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq),
- rq->q->seq_zones_wlock)))
+ rq->q->disk->seq_zones_wlock)))
return;
WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED);
@@ -101,28 +101,29 @@ EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock);
void __blk_req_zone_write_unlock(struct request *rq)
{
rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED;
- if (rq->q->seq_zones_wlock)
+ if (rq->q->disk->seq_zones_wlock)
WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq),
- rq->q->seq_zones_wlock));
+ rq->q->disk->seq_zones_wlock));
}
EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);
/**
- * blkdev_nr_zones - Get number of zones
- * @disk: Target gendisk
+ * bdev_nr_zones - Get number of zones
+ * @bdev: Target device
*
* Return the total number of zones of a zoned block device. For a block
* device without zone capabilities, the number of zones is always 0.
*/
-unsigned int blkdev_nr_zones(struct gendisk *disk)
+unsigned int bdev_nr_zones(struct block_device *bdev)
{
- sector_t zone_sectors = blk_queue_zone_sectors(disk->queue);
+ sector_t zone_sectors = bdev_zone_sectors(bdev);
- if (!blk_queue_is_zoned(disk->queue))
+ if (!bdev_is_zoned(bdev))
return 0;
- return (get_capacity(disk) + zone_sectors - 1) >> ilog2(zone_sectors);
+ return (bdev_nr_sectors(bdev) + zone_sectors - 1) >>
+ ilog2(zone_sectors);
}
-EXPORT_SYMBOL_GPL(blkdev_nr_zones);
+EXPORT_SYMBOL_GPL(bdev_nr_zones);
/**
* blkdev_report_zones - Get zones information
@@ -149,8 +150,7 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector,
struct gendisk *disk = bdev->bd_disk;
sector_t capacity = get_capacity(disk);
- if (!blk_queue_is_zoned(bdev_get_queue(bdev)) ||
- WARN_ON_ONCE(!disk->fops->report_zones))
+ if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones))
return -EOPNOTSUPP;
if (!nr_zones || sector >= capacity)
@@ -189,27 +189,26 @@ static int blk_zone_need_reset_cb(struct blk_zone *zone, unsigned int idx,
static int blkdev_zone_reset_all_emulated(struct block_device *bdev,
gfp_t gfp_mask)
{
- struct request_queue *q = bdev_get_queue(bdev);
- sector_t capacity = get_capacity(bdev->bd_disk);
- sector_t zone_sectors = blk_queue_zone_sectors(q);
+ struct gendisk *disk = bdev->bd_disk;
+ sector_t capacity = bdev_nr_sectors(bdev);
+ sector_t zone_sectors = bdev_zone_sectors(bdev);
unsigned long *need_reset;
struct bio *bio = NULL;
sector_t sector = 0;
int ret;
- need_reset = blk_alloc_zone_bitmap(q->node, q->nr_zones);
+ need_reset = blk_alloc_zone_bitmap(disk->queue->node, disk->nr_zones);
if (!need_reset)
return -ENOMEM;
- ret = bdev->bd_disk->fops->report_zones(bdev->bd_disk, 0,
- q->nr_zones, blk_zone_need_reset_cb,
- need_reset);
+ ret = disk->fops->report_zones(disk, 0, disk->nr_zones,
+ blk_zone_need_reset_cb, need_reset);
if (ret < 0)
goto out_free_need_reset;
ret = 0;
while (sector < capacity) {
- if (!test_bit(blk_queue_zone_no(q, sector), need_reset)) {
+ if (!test_bit(disk_zone_no(disk, sector), need_reset)) {
sector += zone_sectors;
continue;
}
@@ -257,18 +256,17 @@ static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask)
* The operation to execute on each zone can be a zone reset, open, close
* or finish request.
*/
-int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op,
- sector_t sector, sector_t nr_sectors,
- gfp_t gfp_mask)
+int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
+ sector_t sector, sector_t nr_sectors, gfp_t gfp_mask)
{
struct request_queue *q = bdev_get_queue(bdev);
- sector_t zone_sectors = blk_queue_zone_sectors(q);
- sector_t capacity = get_capacity(bdev->bd_disk);
+ sector_t zone_sectors = bdev_zone_sectors(bdev);
+ sector_t capacity = bdev_nr_sectors(bdev);
sector_t end_sector = sector + nr_sectors;
struct bio *bio = NULL;
int ret = 0;
- if (!blk_queue_is_zoned(q))
+ if (!bdev_is_zoned(bdev))
return -EOPNOTSUPP;
if (bdev_read_only(bdev))
@@ -350,7 +348,7 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
if (!q)
return -ENXIO;
- if (!blk_queue_is_zoned(q))
+ if (!bdev_is_zoned(bdev))
return -ENOTTY;
if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report)))
@@ -398,7 +396,7 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
void __user *argp = (void __user *)arg;
struct request_queue *q;
struct blk_zone_range zrange;
- enum req_opf op;
+ enum req_op op;
int ret;
if (!argp)
@@ -408,7 +406,7 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
if (!q)
return -ENXIO;
- if (!blk_queue_is_zoned(q))
+ if (!bdev_is_zoned(bdev))
return -ENOTTY;
if (!(mode & FMODE_WRITE))
@@ -450,12 +448,12 @@ fail:
return ret;
}
-void blk_queue_free_zone_bitmaps(struct request_queue *q)
+void disk_free_zone_bitmaps(struct gendisk *disk)
{
- kfree(q->conv_zones_bitmap);
- q->conv_zones_bitmap = NULL;
- kfree(q->seq_zones_wlock);
- q->seq_zones_wlock = NULL;
+ kfree(disk->conv_zones_bitmap);
+ disk->conv_zones_bitmap = NULL;
+ kfree(disk->seq_zones_wlock);
+ disk->seq_zones_wlock = NULL;
}
struct blk_revalidate_zone_args {
@@ -605,15 +603,15 @@ int blk_revalidate_disk_zones(struct gendisk *disk,
blk_mq_freeze_queue(q);
if (ret > 0) {
blk_queue_chunk_sectors(q, args.zone_sectors);
- q->nr_zones = args.nr_zones;
- swap(q->seq_zones_wlock, args.seq_zones_wlock);
- swap(q->conv_zones_bitmap, args.conv_zones_bitmap);
+ disk->nr_zones = args.nr_zones;
+ swap(disk->seq_zones_wlock, args.seq_zones_wlock);
+ swap(disk->conv_zones_bitmap, args.conv_zones_bitmap);
if (update_driver_data)
update_driver_data(disk);
ret = 0;
} else {
pr_warn("%s: failed to revalidate zones\n", disk->disk_name);
- blk_queue_free_zone_bitmaps(q);
+ disk_free_zone_bitmaps(disk);
}
blk_mq_unfreeze_queue(q);
@@ -623,16 +621,18 @@ int blk_revalidate_disk_zones(struct gendisk *disk,
}
EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
-void blk_queue_clear_zone_settings(struct request_queue *q)
+void disk_clear_zone_settings(struct gendisk *disk)
{
+ struct request_queue *q = disk->queue;
+
blk_mq_freeze_queue(q);
- blk_queue_free_zone_bitmaps(q);
+ disk_free_zone_bitmaps(disk);
blk_queue_flag_clear(QUEUE_FLAG_ZONE_RESETALL, q);
q->required_elevator_features &= ~ELEVATOR_F_ZBD_SEQ_WRITE;
- q->nr_zones = 0;
- q->max_open_zones = 0;
- q->max_active_zones = 0;
+ disk->nr_zones = 0;
+ disk->max_open_zones = 0;
+ disk->max_active_zones = 0;
q->limits.chunk_sectors = 0;
q->limits.zone_write_granularity = 0;
q->limits.max_zone_append_sectors = 0;
diff --git a/block/blk.h b/block/blk.h
index 434017701403..1d83b1d41cd1 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -31,11 +31,6 @@ extern struct kmem_cache *blk_requestq_srcu_cachep;
extern struct kobj_type blk_queue_ktype;
extern struct ida blk_queue_ida;
-static inline void __blk_get_queue(struct request_queue *q)
-{
- kobject_get(&q->kobj);
-}
-
bool is_flush_rq(struct request *req);
struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size,
@@ -159,6 +154,19 @@ static inline bool blk_discard_mergable(struct request *req)
return false;
}
+static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
+ enum req_op op)
+{
+ if (unlikely(op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE))
+ return min(q->limits.max_discard_sectors,
+ UINT_MAX >> SECTOR_SHIFT);
+
+ if (unlikely(op == REQ_OP_WRITE_ZEROES))
+ return q->limits.max_write_zeroes_sectors;
+
+ return q->limits.max_sectors;
+}
+
#ifdef CONFIG_BLK_DEV_INTEGRITY
void blk_flush_integrity(void);
bool __bio_integrity_endio(struct bio *);
@@ -392,11 +400,11 @@ static inline int blk_iolatency_init(struct request_queue *q) { return 0; }
#endif
#ifdef CONFIG_BLK_DEV_ZONED
-void blk_queue_free_zone_bitmaps(struct request_queue *q);
-void blk_queue_clear_zone_settings(struct request_queue *q);
+void disk_free_zone_bitmaps(struct gendisk *disk);
+void disk_clear_zone_settings(struct gendisk *disk);
#else
-static inline void blk_queue_free_zone_bitmaps(struct request_queue *q) {}
-static inline void blk_queue_clear_zone_settings(struct request_queue *q) {}
+static inline void disk_free_zone_bitmaps(struct gendisk *disk) {}
+static inline void disk_clear_zone_settings(struct gendisk *disk) {}
#endif
int blk_alloc_ext_minor(void);
@@ -411,6 +419,9 @@ int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start,
sector_t length);
void blk_drop_partitions(struct gendisk *disk);
+struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
+ struct lock_class_key *lkclass);
+
int bio_add_hw_page(struct request_queue *q, struct bio *bio,
struct page *page, unsigned int len, unsigned int offset,
unsigned int max_sectors, bool *same_page);
@@ -436,13 +447,14 @@ extern struct device_attribute dev_attr_events;
extern struct device_attribute dev_attr_events_async;
extern struct device_attribute dev_attr_events_poll_msecs;
+extern struct attribute_group blk_trace_attr_group;
+
long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);
long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);
extern const struct address_space_operations def_blk_aops;
-int disk_register_independent_access_ranges(struct gendisk *disk,
- struct blk_independent_access_ranges *new_iars);
+int disk_register_independent_access_ranges(struct gendisk *disk);
void disk_unregister_independent_access_ranges(struct gendisk *disk);
#ifdef CONFIG_FAIL_MAKE_REQUEST
diff --git a/block/bounce.c b/block/bounce.c
index 8f7b6fe3b4db..c8f487af7be3 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -205,19 +205,26 @@ void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
int rw = bio_data_dir(*bio_orig);
struct bio_vec *to, from;
struct bvec_iter iter;
- unsigned i = 0;
+ unsigned i = 0, bytes = 0;
bool bounce = false;
- int sectors = 0;
+ int sectors;
bio_for_each_segment(from, *bio_orig, iter) {
if (i++ < BIO_MAX_VECS)
- sectors += from.bv_len >> 9;
+ bytes += from.bv_len;
if (PageHighMem(from.bv_page))
bounce = true;
}
if (!bounce)
return;
+ /*
+ * Individual bvecs might not be logical block aligned. Round down
+ * the split size so that each bio is properly block size aligned,
+ * even if we do not use the full hardware limits.
+ */
+ sectors = ALIGN_DOWN(bytes, queue_logical_block_size(q)) >>
+ SECTOR_SHIFT;
if (sectors < bio_sectors(*bio_orig)) {
bio = bio_split(*bio_orig, sectors, GFP_NOIO, &bounce_bio_split);
bio_chain(bio, *bio_orig);
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index acfe1357bf6c..d6f5dcdce748 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -324,14 +324,14 @@ void bsg_remove_queue(struct request_queue *q)
container_of(q->tag_set, struct bsg_set, tag_set);
bsg_unregister_queue(bset->bd);
- blk_cleanup_queue(q);
+ blk_mq_destroy_queue(q);
blk_mq_free_tag_set(&bset->tag_set);
kfree(bset);
}
}
EXPORT_SYMBOL_GPL(bsg_remove_queue);
-static enum blk_eh_timer_return bsg_timeout(struct request *rq, bool reserved)
+static enum blk_eh_timer_return bsg_timeout(struct request *rq)
{
struct bsg_set *bset =
container_of(rq->q->tag_set, struct bsg_set, tag_set);
@@ -399,7 +399,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
return q;
out_cleanup_queue:
- blk_cleanup_queue(q);
+ blk_mq_destroy_queue(q);
out_queue:
blk_mq_free_tag_set(set);
out_tag_set:
diff --git a/block/elevator.h b/block/elevator.h
index 16cd8bdedb7e..3f0593b3bf9d 100644
--- a/block/elevator.h
+++ b/block/elevator.h
@@ -34,7 +34,7 @@ struct elevator_mq_ops {
int (*request_merge)(struct request_queue *q, struct request **, struct bio *);
void (*request_merged)(struct request_queue *, struct request *, enum elv_merge);
void (*requests_merged)(struct request_queue *, struct request *, struct request *);
- void (*limit_depth)(unsigned int, struct blk_mq_alloc_data *);
+ void (*limit_depth)(blk_opf_t, struct blk_mq_alloc_data *);
void (*prepare_request)(struct request *);
void (*finish_request)(struct request *);
void (*insert_requests)(struct blk_mq_hw_ctx *, struct list_head *, bool);
diff --git a/block/fops.c b/block/fops.c
index 6e86931ab847..a564cd81340c 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -32,14 +32,21 @@ static int blkdev_get_block(struct inode *inode, sector_t iblock,
return 0;
}
-static unsigned int dio_bio_write_op(struct kiocb *iocb)
+static blk_opf_t dio_bio_write_op(struct kiocb *iocb)
{
- unsigned int op = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
+ blk_opf_t opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
/* avoid the need for a I/O completion work item */
if (iocb_is_dsync(iocb))
- op |= REQ_FUA;
- return op;
+ opf |= REQ_FUA;
+ return opf;
+}
+
+static bool blkdev_dio_unaligned(struct block_device *bdev, loff_t pos,
+ struct iov_iter *iter)
+{
+ return pos & (bdev_logical_block_size(bdev) - 1) ||
+ !bdev_iter_is_aligned(bdev, iter);
}
#define DIO_INLINE_BIO_VECS 4
@@ -54,8 +61,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
struct bio bio;
ssize_t ret;
- if ((pos | iov_iter_alignment(iter)) &
- (bdev_logical_block_size(bdev) - 1))
+ if (blkdev_dio_unaligned(bdev, pos, iter))
return -EINVAL;
if (nr_pages <= DIO_INLINE_BIO_VECS)
@@ -169,12 +175,11 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
struct blkdev_dio *dio;
struct bio *bio;
bool is_read = (iov_iter_rw(iter) == READ), is_sync;
- unsigned int opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
+ blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
loff_t pos = iocb->ki_pos;
int ret = 0;
- if ((pos | iov_iter_alignment(iter)) &
- (bdev_logical_block_size(bdev) - 1))
+ if (blkdev_dio_unaligned(bdev, pos, iter))
return -EINVAL;
if (iocb->ki_flags & IOCB_ALLOC_CACHE)
@@ -292,14 +297,13 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
{
struct block_device *bdev = iocb->ki_filp->private_data;
bool is_read = iov_iter_rw(iter) == READ;
- unsigned int opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
+ blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
struct blkdev_dio *dio;
struct bio *bio;
loff_t pos = iocb->ki_pos;
int ret = 0;
- if ((pos | iov_iter_alignment(iter)) &
- (bdev_logical_block_size(bdev) - 1))
+ if (blkdev_dio_unaligned(bdev, pos, iter))
return -EINVAL;
if (iocb->ki_flags & IOCB_ALLOC_CACHE)
@@ -417,7 +421,7 @@ const struct address_space_operations def_blk_aops = {
.write_end = blkdev_write_end,
.writepages = blkdev_writepages,
.direct_IO = blkdev_direct_IO,
- .migratepage = buffer_migrate_page_norefs,
+ .migrate_folio = buffer_migrate_folio_norefs,
.is_dirty_writeback = buffer_check_dirty_writeback,
};
diff --git a/block/genhd.c b/block/genhd.c
index 27205ae47d59..e1d5b10ac193 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -101,29 +101,6 @@ bool set_capacity_and_notify(struct gendisk *disk, sector_t size)
}
EXPORT_SYMBOL_GPL(set_capacity_and_notify);
-/*
- * Format the device name of the indicated block device into the supplied buffer
- * and return a pointer to that same buffer for convenience.
- *
- * Note: do not use this in new code, use the %pg specifier to sprintf and
- * printk insted.
- */
-const char *bdevname(struct block_device *bdev, char *buf)
-{
- struct gendisk *hd = bdev->bd_disk;
- int partno = bdev->bd_partno;
-
- if (!partno)
- snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name);
- else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1]))
- snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno);
- else
- snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno);
-
- return buf;
-}
-EXPORT_SYMBOL(bdevname);
-
static void part_stat_read_all(struct block_device *part,
struct disk_stats *stat)
{
@@ -617,12 +594,15 @@ void del_gendisk(struct gendisk *disk)
* Fail any new I/O.
*/
set_bit(GD_DEAD, &disk->state);
+ if (test_bit(GD_OWNS_QUEUE, &disk->state))
+ blk_queue_flag_set(QUEUE_FLAG_DYING, q);
set_capacity(disk, 0);
/*
* Prevent new I/O from crossing bio_queue_enter().
*/
blk_queue_start_drain(q);
+ blk_mq_freeze_queue_wait(q);
if (!(disk->flags & GENHD_FL_HIDDEN)) {
sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
@@ -646,18 +626,32 @@ void del_gendisk(struct gendisk *disk)
pm_runtime_set_memalloc_noio(disk_to_dev(disk), false);
device_del(disk_to_dev(disk));
- blk_mq_freeze_queue_wait(q);
-
blk_throtl_cancel_bios(disk->queue);
blk_sync_queue(q);
blk_flush_integrity();
+ blk_mq_cancel_work_sync(q);
+
+ blk_mq_quiesce_queue(q);
+ if (q->elevator) {
+ mutex_lock(&q->sysfs_lock);
+ elevator_exit(q);
+ mutex_unlock(&q->sysfs_lock);
+ }
+ rq_qos_exit(q);
+ blk_mq_unquiesce_queue(q);
+
/*
- * Allow using passthrough request again after the queue is torn down.
+ * If the disk does not own the queue, allow using passthrough requests
+ * again. Else leave the queue frozen to fail all I/O.
*/
- blk_queue_flag_clear(QUEUE_FLAG_INIT_DONE, q);
- __blk_mq_unfreeze_queue(q, true);
-
+ if (!test_bit(GD_OWNS_QUEUE, &disk->state)) {
+ blk_queue_flag_clear(QUEUE_FLAG_INIT_DONE, q);
+ __blk_mq_unfreeze_queue(q, true);
+ } else {
+ if (queue_is_mq(q))
+ blk_mq_exit_queue(q);
+ }
}
EXPORT_SYMBOL(del_gendisk);
@@ -1117,34 +1111,12 @@ static struct attribute_group disk_attr_group = {
static const struct attribute_group *disk_attr_groups[] = {
&disk_attr_group,
+#ifdef CONFIG_BLK_DEV_IO_TRACE
+ &blk_trace_attr_group,
+#endif
NULL
};
-static void disk_release_mq(struct request_queue *q)
-{
- blk_mq_cancel_work_sync(q);
-
- /*
- * There can't be any non non-passthrough bios in flight here, but
- * requests stay around longer, including passthrough ones so we
- * still need to freeze the queue here.
- */
- blk_mq_freeze_queue(q);
-
- /*
- * Since the I/O scheduler exit code may access cgroup information,
- * perform I/O scheduler exit before disassociating from the block
- * cgroup controller.
- */
- if (q->elevator) {
- mutex_lock(&q->sysfs_lock);
- elevator_exit(q);
- mutex_unlock(&q->sysfs_lock);
- }
- rq_qos_exit(q);
- __blk_mq_unfreeze_queue(q, true);
-}
-
/**
* disk_release - releases all allocated resources of the gendisk
* @dev: the device representing this disk
@@ -1166,13 +1138,23 @@ static void disk_release(struct device *dev)
might_sleep();
WARN_ON_ONCE(disk_live(disk));
- if (queue_is_mq(disk->queue))
- disk_release_mq(disk->queue);
+ /*
+ * To undo the all initialization from blk_mq_init_allocated_queue in
+ * case of a probe failure where add_disk is never called we have to
+ * call blk_mq_exit_queue here. We can't do this for the more common
+ * teardown case (yet) as the tagset can be gone by the time the disk
+ * is released once it was added.
+ */
+ if (queue_is_mq(disk->queue) &&
+ test_bit(GD_OWNS_QUEUE, &disk->state) &&
+ !test_bit(GD_ADDED, &disk->state))
+ blk_mq_exit_queue(disk->queue);
blkcg_exit_queue(disk->queue);
disk_release_events(disk);
kfree(disk->random);
+ disk_free_zone_bitmaps(disk);
xa_destroy(&disk->part_tbl);
disk->queue->disk = NULL;
@@ -1356,9 +1338,6 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
{
struct gendisk *disk;
- if (!blk_get_queue(q))
- return NULL;
-
disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id);
if (!disk)
goto out_put_queue;
@@ -1409,7 +1388,6 @@ out_put_queue:
blk_put_queue(q);
return NULL;
}
-EXPORT_SYMBOL(__alloc_disk_node);
struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass)
{
@@ -1422,9 +1400,10 @@ struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass)
disk = __alloc_disk_node(q, node, lkclass);
if (!disk) {
- blk_cleanup_queue(q);
+ blk_put_queue(q);
return NULL;
}
+ set_bit(GD_OWNS_QUEUE, &disk->state);
return disk;
}
EXPORT_SYMBOL(__blk_alloc_disk);
@@ -1436,6 +1415,9 @@ EXPORT_SYMBOL(__blk_alloc_disk);
* This decrements the refcount for the struct gendisk. When this reaches 0
* we'll have disk_release() called.
*
+ * Note: for blk-mq disk put_disk must be called before freeing the tag_set
+ * when handling probe errors (that is before add_disk() is called).
+ *
* Context: Any context, but the last reference must not be dropped from
* atomic context.
*/
@@ -1446,22 +1428,6 @@ void put_disk(struct gendisk *disk)
}
EXPORT_SYMBOL(put_disk);
-/**
- * blk_cleanup_disk - shutdown a gendisk allocated by blk_alloc_disk
- * @disk: gendisk to shutdown
- *
- * Mark the queue hanging off @disk DYING, drain all pending requests, then mark
- * the queue DEAD, destroy and put it and the gendisk structure.
- *
- * Context: can sleep
- */
-void blk_cleanup_disk(struct gendisk *disk)
-{
- blk_cleanup_queue(disk->queue);
- put_disk(disk);
-}
-EXPORT_SYMBOL(blk_cleanup_disk);
-
static void set_disk_ro_uevent(struct gendisk *gd, int ro)
{
char event[] = "DISK_RO=1";
diff --git a/block/holder.c b/block/holder.c
index 8d750281a1cd..5283bc804cc1 100644
--- a/block/holder.c
+++ b/block/holder.c
@@ -79,10 +79,6 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
WARN_ON_ONCE(!bdev->bd_holder);
- /* FIXME: remove the following once add_disk() handles errors */
- if (WARN_ON(!bdev->bd_holder_dir))
- goto out_unlock;
-
holder = bd_find_holder_disk(bdev, disk);
if (holder) {
holder->refcnt++;
diff --git a/block/ioctl.c b/block/ioctl.c
index 46949f1b0dba..60121e89052b 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -495,7 +495,7 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode,
case BLKGETZONESZ:
return put_uint(argp, bdev_zone_sectors(bdev));
case BLKGETNRZONES:
- return put_uint(argp, blkdev_nr_zones(bdev->bd_disk));
+ return put_uint(argp, bdev_nr_zones(bdev));
case BLKROGET:
return put_int(argp, bdev_read_only(bdev) != 0);
case BLKSSZGET: /* get block device logical block size */
diff --git a/block/ioprio.c b/block/ioprio.c
index 2fe068fcaad5..32a456b45804 100644
--- a/block/ioprio.c
+++ b/block/ioprio.c
@@ -138,6 +138,32 @@ out:
return ret;
}
+/*
+ * If the task has set an I/O priority, use that. Otherwise, return
+ * the default I/O priority.
+ *
+ * Expected to be called for current task or with task_lock() held to keep
+ * io_context stable.
+ */
+int __get_task_ioprio(struct task_struct *p)
+{
+ struct io_context *ioc = p->io_context;
+ int prio;
+
+ if (p != current)
+ lockdep_assert_held(&p->alloc_lock);
+ if (ioc)
+ prio = ioc->ioprio;
+ else
+ prio = IOPRIO_DEFAULT;
+
+ if (IOPRIO_PRIO_CLASS(prio) == IOPRIO_CLASS_NONE)
+ prio = IOPRIO_PRIO_VALUE(task_nice_ioclass(p),
+ task_nice_ioprio(p));
+ return prio;
+}
+EXPORT_SYMBOL_GPL(__get_task_ioprio);
+
static int get_task_ioprio(struct task_struct *p)
{
int ret;
@@ -145,22 +171,38 @@ static int get_task_ioprio(struct task_struct *p)
ret = security_task_getioprio(p);
if (ret)
goto out;
- ret = IOPRIO_DEFAULT;
+ task_lock(p);
+ ret = __get_task_ioprio(p);
+ task_unlock(p);
+out:
+ return ret;
+}
+
+/*
+ * Return raw IO priority value as set by userspace. We use this for
+ * ioprio_get(pid, IOPRIO_WHO_PROCESS) so that we keep historical behavior and
+ * also so that userspace can distinguish unset IO priority (which just gets
+ * overriden based on task's nice value) from IO priority set to some value.
+ */
+static int get_task_raw_ioprio(struct task_struct *p)
+{
+ int ret;
+
+ ret = security_task_getioprio(p);
+ if (ret)
+ goto out;
task_lock(p);
if (p->io_context)
ret = p->io_context->ioprio;
+ else
+ ret = IOPRIO_DEFAULT;
task_unlock(p);
out:
return ret;
}
-int ioprio_best(unsigned short aprio, unsigned short bprio)
+static int ioprio_best(unsigned short aprio, unsigned short bprio)
{
- if (!ioprio_valid(aprio))
- aprio = IOPRIO_DEFAULT;
- if (!ioprio_valid(bprio))
- bprio = IOPRIO_DEFAULT;
-
return min(aprio, bprio);
}
@@ -181,7 +223,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
else
p = find_task_by_vpid(who);
if (p)
- ret = get_task_ioprio(p);
+ ret = get_task_raw_ioprio(p);
break;
case IOPRIO_WHO_PGRP:
if (!who)
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index 70ff2a599ef6..b05357bced99 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -195,9 +195,9 @@ struct kyber_hctx_data {
static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
void *key);
-static unsigned int kyber_sched_domain(unsigned int op)
+static unsigned int kyber_sched_domain(blk_opf_t opf)
{
- switch (op & REQ_OP_MASK) {
+ switch (opf & REQ_OP_MASK) {
case REQ_OP_READ:
return KYBER_READ;
case REQ_OP_WRITE:
@@ -421,6 +421,8 @@ static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
blk_stat_enable_accounting(q);
+ blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q);
+
eq->elevator_data = kqd;
q->elevator = eq;
@@ -551,13 +553,13 @@ static void rq_clear_domain_token(struct kyber_queue_data *kqd,
}
}
-static void kyber_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
+static void kyber_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
{
/*
* We use the scheduler tags as per-hardware queue queueing tokens.
* Async requests can be limited at this stage.
*/
- if (!op_is_sync(op)) {
+ if (!op_is_sync(opf)) {
struct kyber_queue_data *kqd = data->q->elevator->elevator_data;
data->shallow_depth = kqd->async_depth;
@@ -1033,7 +1035,6 @@ static struct elevator_type kyber_sched = {
#endif
.elevator_attrs = kyber_sched_attrs,
.elevator_name = "kyber",
- .elevator_features = ELEVATOR_F_MQ_AWARE,
.elevator_owner = THIS_MODULE,
};
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 6ed602b2f80a..5639921dfa92 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -543,12 +543,12 @@ unlock:
* Called by __blk_mq_alloc_request(). The shallow_depth value set by this
* function is used by __blk_mq_get_tag().
*/
-static void dd_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
+static void dd_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
{
struct deadline_data *dd = data->q->elevator->elevator_data;
/* Do not throttle synchronous reads. */
- if (op_is_sync(op) && !op_is_write(op))
+ if (op_is_sync(opf) && !op_is_write(opf))
return;
/*
@@ -642,6 +642,9 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
spin_lock_init(&dd->lock);
spin_lock_init(&dd->zone_lock);
+ /* We dispatch from request queue wide instead of hw queue */
+ blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q);
+
q->elevator = eq;
return 0;
diff --git a/block/partitions/check.h b/block/partitions/check.h
index 4ffa2359b1a3..8d70a880c372 100644
--- a/block/partitions/check.h
+++ b/block/partitions/check.h
@@ -24,13 +24,13 @@ struct parsed_partitions {
};
typedef struct {
- struct page *v;
+ struct folio *v;
} Sector;
void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p);
static inline void put_dev_sector(Sector p)
{
- put_page(p.v);
+ folio_put(p.v);
}
static inline void
diff --git a/block/partitions/core.c b/block/partitions/core.c
index 8a0ec929023b..fc1d70384825 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -9,7 +9,6 @@
#include <linux/slab.h>
#include <linux/ctype.h>
#include <linux/vmalloc.h>
-#include <linux/blktrace_api.h>
#include <linux/raid/detect.h>
#include "check.h"
@@ -331,7 +330,7 @@ static struct block_device *add_partition(struct gendisk *disk, int partno,
case BLK_ZONED_HA:
pr_info("%s: disabling host aware zoned block device support due to partitions\n",
disk->disk_name);
- blk_queue_set_zoned(disk, BLK_ZONED_NONE);
+ disk_set_zoned(disk, BLK_ZONED_NONE);
break;
case BLK_ZONED_NONE:
break;
@@ -705,25 +704,19 @@ EXPORT_SYMBOL_GPL(bdev_disk_changed);
void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p)
{
struct address_space *mapping = state->disk->part0->bd_inode->i_mapping;
- struct page *page;
+ struct folio *folio;
if (n >= get_capacity(state->disk)) {
state->access_beyond_eod = true;
- return NULL;
+ goto out;
}
- page = read_mapping_page(mapping,
- (pgoff_t)(n >> (PAGE_SHIFT - 9)), NULL);
- if (IS_ERR(page))
+ folio = read_mapping_folio(mapping, n >> PAGE_SECTORS_SHIFT, NULL);
+ if (IS_ERR(folio))
goto out;
- if (PageError(page))
- goto out_put_page;
-
- p->v = page;
- return (unsigned char *)page_address(page) +
- ((n & ((1 << (PAGE_SHIFT - 9)) - 1)) << SECTOR_SHIFT);
-out_put_page:
- put_page(page);
+
+ p->v = folio;
+ return folio_address(folio) + offset_in_folio(folio, n * SECTOR_SIZE);
out:
p->v = NULL;
return NULL;