From 1e0dcca9e1aa3caa1a0dc4300db1a091078fe40b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 9 Jan 2021 11:42:49 +0100 Subject: dm: use bdev_read_only to check if a device is read-only dm-thin and dm-cache also work on partitions, so use the proper interface to check if the device is read-only. Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/md/dm-cache-metadata.c | 2 +- drivers/md/dm-thin-metadata.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index af6d4f898e4c..89a73204dbf4 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -449,7 +449,7 @@ static int __check_incompat_features(struct cache_disk_superblock *disk_super, /* * Check for read-only metadata to skip the following RDWR checks. */ - if (get_disk_ro(cmd->bdev->bd_disk)) + if (bdev_read_only(cmd->bdev)) return 0; features = le32_to_cpu(disk_super->compat_ro_flags) & ~DM_CACHE_FEATURE_COMPAT_RO_SUPP; diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 6ebb2127f3e2..e75b20480e46 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -636,7 +636,7 @@ static int __check_incompat_features(struct thin_disk_superblock *disk_super, /* * Check for read-only metadata to skip the following RDWR checks. */ - if (get_disk_ro(pmd->bdev->bd_disk)) + if (bdev_read_only(pmd->bdev)) return 0; features = le32_to_cpu(disk_super->compat_ro_flags) & ~THIN_FEATURE_COMPAT_RO_SUPP; -- cgit v1.2.3-70-g09d2 From 6f0d9689b670bc9f9640ff87b3f9226b7806dea2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 9 Jan 2021 11:42:50 +0100 Subject: block: remove the NULL bdev check in bdev_read_only Only a single caller can end up in bdev_read_only, so move the check there. Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/genhd.c | 3 --- fs/super.c | 3 ++- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 419548e92d82..484a474648d5 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1657,11 +1657,8 @@ EXPORT_SYMBOL(set_disk_ro); int bdev_read_only(struct block_device *bdev) { - if (!bdev) - return 0; return bdev->bd_read_only; } - EXPORT_SYMBOL(bdev_read_only); /* diff --git a/fs/super.c b/fs/super.c index 2c6cdea2ab2d..5a1f384ffc74 100644 --- a/fs/super.c +++ b/fs/super.c @@ -865,7 +865,8 @@ int reconfigure_super(struct fs_context *fc) if (fc->sb_flags_mask & SB_RDONLY) { #ifdef CONFIG_BLOCK - if (!(fc->sb_flags & SB_RDONLY) && bdev_read_only(sb->s_bdev)) + if (!(fc->sb_flags & SB_RDONLY) && sb->s_bdev && + bdev_read_only(sb->s_bdev)) return -EACCES; #endif -- cgit v1.2.3-70-g09d2 From 52f019d43c229afd65dc11c8c1b05b6436bf6765 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 9 Jan 2021 11:42:51 +0100 Subject: block: add a hard-readonly flag to struct gendisk Commit 20bd1d026aac ("scsi: sd: Keep disk read-only when re-reading partition") addressed a long-standing problem with user read-only policy being overridden as a result of a device-initiated revalidate. The commit has since been reverted due to a regression that left some USB devices read-only indefinitely. To fix the underlying problems with revalidate we need to keep track of hardware state and user policy separately. The gendisk has been updated to reflect the current hardware state set by the device driver. This is done to allow returning the device to the hardware state once the user clears the BLKROSET flag. The resulting semantics are as follows: - If BLKROSET sets a given partition read-only, that partition will remain read-only even if the underlying storage stack initiates a revalidate. However, the BLKRRPART ioctl will cause the partition table to be dropped and any user policy on partitions will be lost. - If BLKROSET has not been set, both the whole disk device and any partitions will reflect the current write-protect state of the underlying device. Based on a patch from Martin K. Petersen . Reported-by: Oleksii Kurochko Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=201221 Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-core.c | 4 +--- block/genhd.c | 33 +++++++++++++++++++-------------- block/partitions/core.c | 3 +-- include/linux/genhd.h | 6 ++++-- 4 files changed, 25 insertions(+), 21 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 7663a9b94b80..08ff8ca32529 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -694,9 +694,7 @@ static inline bool should_fail_request(struct block_device *part, static inline bool bio_check_ro(struct bio *bio, struct block_device *part) { - const int op = bio_op(bio); - - if (part->bd_read_only && op_is_write(op)) { + if (op_is_write(bio_op(bio)) && bdev_read_only(part)) { char b[BDEVNAME_SIZE]; if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) diff --git a/block/genhd.c b/block/genhd.c index 484a474648d5..1873e4571328 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1637,27 +1637,32 @@ static void set_disk_ro_uevent(struct gendisk *gd, int ro) kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp); } -void set_disk_ro(struct gendisk *disk, int flag) +/** + * set_disk_ro - set a gendisk read-only + * @disk: gendisk to operate on + * @ready_only: %true to set the disk read-only, %false set the disk read/write + * + * This function is used to indicate whether a given disk device should have its + * read-only flag set. set_disk_ro() is typically used by device drivers to + * indicate whether the underlying physical device is write-protected. + */ +void set_disk_ro(struct gendisk *disk, bool read_only) { - struct disk_part_iter piter; - struct block_device *part; - - if (disk->part0->bd_read_only != flag) { - set_disk_ro_uevent(disk, flag); - disk->part0->bd_read_only = flag; + if (read_only) { + if (test_and_set_bit(GD_READ_ONLY, &disk->state)) + return; + } else { + if (!test_and_clear_bit(GD_READ_ONLY, &disk->state)) + return; } - - disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); - while ((part = disk_part_iter_next(&piter))) - part->bd_read_only = flag; - disk_part_iter_exit(&piter); + set_disk_ro_uevent(disk, read_only); } - EXPORT_SYMBOL(set_disk_ro); int bdev_read_only(struct block_device *bdev) { - return bdev->bd_read_only; + return bdev->bd_read_only || + test_bit(GD_READ_ONLY, &bdev->bd_disk->state); } EXPORT_SYMBOL(bdev_read_only); diff --git a/block/partitions/core.c b/block/partitions/core.c index e7d776db803b..168d5906077c 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -195,7 +195,7 @@ static ssize_t part_start_show(struct device *dev, static ssize_t part_ro_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", dev_to_bdev(dev)->bd_read_only); + return sprintf(buf, "%d\n", bdev_read_only(dev_to_bdev(dev))); } static ssize_t part_alignment_offset_show(struct device *dev, @@ -361,7 +361,6 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, bdev->bd_start_sect = start; bdev_set_nr_sectors(bdev, len); - bdev->bd_read_only = get_disk_ro(disk); if (info) { err = -ENOMEM; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 809aaa32d53c..a62ccbfac54b 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -163,6 +163,7 @@ struct gendisk { int flags; unsigned long state; #define GD_NEED_PART_SCAN 0 +#define GD_READ_ONLY 1 struct kobject *slave_dir; struct timer_rand_state *random; @@ -249,11 +250,12 @@ static inline void add_disk_no_queue_reg(struct gendisk *disk) extern void del_gendisk(struct gendisk *gp); extern struct block_device *bdget_disk(struct gendisk *disk, int partno); -extern void set_disk_ro(struct gendisk *disk, int flag); +void set_disk_ro(struct gendisk *disk, bool read_only); static inline int get_disk_ro(struct gendisk *disk) { - return disk->part0->bd_read_only; + return disk->part0->bd_read_only || + test_bit(GD_READ_ONLY, &disk->state); } extern void disk_block_events(struct gendisk *disk); -- cgit v1.2.3-70-g09d2 From 947139bf3cce097739380c9782a35de504f24203 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 9 Jan 2021 11:42:52 +0100 Subject: block: propagate BLKROSET on the whole device to all partitions Change the policy so that a BLKROSET on the whole device also affects partitions. To quote Martin K. Petersen: It's very common for database folks to twiddle the read-only state of block devices and partitions. I know that our users will find it very counter-intuitive that setting /dev/sda read-only won't prevent writes to /dev/sda1. The existing behavior is inconsistent in the sense that doing: # blockdev --setro /dev/sda # echo foo > /dev/sda1 permits writes. But: # blockdev --setro /dev/sda # echo foo > /dev/sda1 doesn't. And a subsequent: # blockdev --setrw /dev/sda # echo foo > /dev/sda1 doesn't work either since sda1's read-only policy has been inherited from the whole-disk device. You need to do: # blockdev --rereadpt after setting the whole-disk device rw to effectuate the same change on the partitions, otherwise they are stuck being read-only indefinitely. However, setting the read-only policy on a partition does *not* require the revalidate step. As a matter of fact, doing the revalidate will blow away the policy setting you just made. So the user needs to take different actions depending on whether they are trying to read-protect a whole-disk device or a partition. Despite using the same ioctl. That is really confusing. I have lost count how many times our customers have had data clobbered because of ambiguity of the existing whole-disk device policy. The current behavior violates the principle of least surprise by letting the user think they write protected the whole disk when they actually didn't. Suggested-by: Martin K. Petersen Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/genhd.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 1873e4571328..ca5d880af512 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1661,8 +1661,7 @@ EXPORT_SYMBOL(set_disk_ro); int bdev_read_only(struct block_device *bdev) { - return bdev->bd_read_only || - test_bit(GD_READ_ONLY, &bdev->bd_disk->state); + return bdev->bd_read_only || get_disk_ro(bdev->bd_disk); } EXPORT_SYMBOL(bdev_read_only); -- cgit v1.2.3-70-g09d2 From cbf72cce6370b3ec1a6073cf777ab9b6ba5bf5b9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 9 Jan 2021 11:42:53 +0100 Subject: rbd: remove the ->set_read_only method Now that the hardware read-only state can't be changed by the BLKROSET ioctl, the code in this method is not required anymore. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Acked-by: Ilya Dryomov Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- drivers/block/rbd.c | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 59cfe71d0b3a..bbb88eb009e0 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -692,29 +692,10 @@ static void rbd_release(struct gendisk *disk, fmode_t mode) put_device(&rbd_dev->dev); } -static int rbd_set_read_only(struct block_device *bdev, bool ro) -{ - struct rbd_device *rbd_dev = bdev->bd_disk->private_data; - - /* - * Both images mapped read-only and snapshots can't be marked - * read-write. - */ - if (!ro) { - if (rbd_is_ro(rbd_dev)) - return -EROFS; - - rbd_assert(!rbd_is_snap(rbd_dev)); - } - - return 0; -} - static const struct block_device_operations rbd_bd_ops = { .owner = THIS_MODULE, .open = rbd_open, .release = rbd_release, - .set_read_only = rbd_set_read_only, }; /* -- cgit v1.2.3-70-g09d2 From d11cd28998e9d25389d8c20e7cce0e4b4f17bee1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 9 Jan 2021 11:42:54 +0100 Subject: nvme: allow revalidate to set a namespace read-only Unconditionally call set_disk_ro now that it only updates the hardware state. This allows to properly set up the Linux devices read-only when the controller turns a previously writable namespace read-only. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 8caf9b34734d..566788ba4e7d 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2125,9 +2125,8 @@ static void nvme_update_disk_info(struct gendisk *disk, nvme_config_discard(disk, ns); nvme_config_write_zeroes(disk, ns); - if ((id->nsattr & NVME_NS_ATTR_RO) || - test_bit(NVME_NS_FORCE_RO, &ns->flags)) - set_disk_ro(disk, true); + set_disk_ro(disk, (id->nsattr & NVME_NS_ATTR_RO) || + test_bit(NVME_NS_FORCE_RO, &ns->flags)); } static inline bool nvme_first_scan(struct gendisk *disk) -- cgit v1.2.3-70-g09d2 From 74cb8994b22ad7b95ac38dad9c9609ae49e88ec1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 24 Jan 2021 11:02:32 +0100 Subject: brd: remove the end of device check in brd_do_bvec The block layer already checks for this conditions in bio_check_eod before calling the driver. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- drivers/block/brd.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/block/brd.c b/drivers/block/brd.c index c43a6ab4b1f3..c7c821419079 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -285,14 +285,10 @@ out: static blk_qc_t brd_submit_bio(struct bio *bio) { struct brd_device *brd = bio->bi_disk->private_data; + sector_t sector = bio->bi_iter.bi_sector; struct bio_vec bvec; - sector_t sector; struct bvec_iter iter; - sector = bio->bi_iter.bi_sector; - if (bio_end_sector(bio) > get_capacity(bio->bi_disk)) - goto io_error; - bio_for_each_segment(bvec, bio, iter) { unsigned int len = bvec.bv_len; int err; -- cgit v1.2.3-70-g09d2 From cf9a978f9781fb30b778ee61ef6bd164c655d9ff Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 24 Jan 2021 11:02:33 +0100 Subject: dcssblk: remove the end of device check in dcssblk_submit_bio The block layer already checks for this conditions in bio_check_eod before calling the driver. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- drivers/s390/block/dcssblk.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 299e77ec2c41..5c5cff3f2374 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -886,10 +886,6 @@ dcssblk_submit_bio(struct bio *bio) (bio->bi_iter.bi_size & 4095) != 0) /* Request is not page-aligned. */ goto fail; - if (bio_end_sector(bio) > get_capacity(bio->bi_disk)) { - /* Request beyond end of DCSS segment. */ - goto fail; - } /* verify data transfer direction */ if (dev_info->is_shared) { switch (dev_info->segment_type) { -- cgit v1.2.3-70-g09d2 From 309dca309fc39a9e3c31b916393b74bd174fd74e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 24 Jan 2021 11:02:34 +0100 Subject: block: store a block_device pointer in struct bio Replace the gendisk pointer in struct bio with a pointer to the newly improved struct block device. From that the gendisk can be trivially accessed with an extra indirection, but it also allows to directly look up all information related to partition remapping. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- arch/m68k/emu/nfblock.c | 2 +- arch/xtensa/platforms/iss/simdisk.c | 2 +- block/bio-integrity.c | 18 +++++++++--------- block/bio.c | 31 +++++++++++-------------------- block/blk-cgroup.c | 7 ++++--- block/blk-core.c | 37 +++++++++++++++++-------------------- block/blk-crypto-fallback.c | 2 +- block/blk-crypto.c | 2 +- block/blk-merge.c | 17 ++++++++--------- block/blk-mq.c | 2 +- block/blk-throttle.c | 2 +- block/blk.h | 2 -- block/bounce.c | 2 +- block/genhd.c | 2 +- drivers/block/brd.c | 2 +- drivers/block/drbd/drbd_int.h | 4 ++-- drivers/block/drbd/drbd_req.c | 2 +- drivers/block/null_blk/main.c | 2 +- drivers/block/pktcdvd.c | 4 ++-- drivers/block/ps3vram.c | 2 +- drivers/block/rsxx/dev.c | 2 +- drivers/block/umem.c | 2 +- drivers/block/zram/zram_drv.c | 2 +- drivers/lightnvm/pblk-init.c | 2 +- drivers/md/bcache/debug.c | 2 +- drivers/md/bcache/request.c | 7 ++++--- drivers/md/dm-bio-record.h | 9 +++------ drivers/md/dm-raid1.c | 10 +++++----- drivers/md/dm.c | 14 +++++++------- drivers/md/md-linear.c | 2 +- drivers/md/md.c | 2 +- drivers/md/md.h | 6 +++--- drivers/md/raid1.c | 6 +++--- drivers/md/raid10.c | 12 ++++++------ drivers/md/raid5.c | 2 +- drivers/nvdimm/blk.c | 4 ++-- drivers/nvdimm/btt.c | 4 ++-- drivers/nvdimm/pmem.c | 4 ++-- drivers/nvme/host/core.c | 6 +++--- drivers/nvme/host/lightnvm.c | 3 +-- drivers/nvme/host/multipath.c | 6 +++--- drivers/nvme/host/rdma.c | 2 +- drivers/s390/block/dcssblk.c | 2 +- drivers/s390/block/xpram.c | 2 +- fs/btrfs/check-integrity.c | 10 +++++----- fs/btrfs/raid56.c | 7 ++----- fs/btrfs/scrub.c | 2 +- fs/direct-io.c | 2 +- fs/f2fs/data.c | 12 +----------- include/linux/bio.h | 18 ++++++++---------- include/linux/blk-mq.h | 4 ++-- include/linux/blk_types.h | 3 +-- include/linux/blkdev.h | 5 +++-- kernel/trace/blktrace.c | 16 +++++++++------- mm/page_io.c | 2 +- 55 files changed, 154 insertions(+), 184 deletions(-) diff --git a/arch/m68k/emu/nfblock.c b/arch/m68k/emu/nfblock.c index 92d26c812441..ba808543161a 100644 --- a/arch/m68k/emu/nfblock.c +++ b/arch/m68k/emu/nfblock.c @@ -61,7 +61,7 @@ struct nfhd_device { static blk_qc_t nfhd_submit_bio(struct bio *bio) { - struct nfhd_device *dev = bio->bi_disk->private_data; + struct nfhd_device *dev = bio->bi_bdev->bd_disk->private_data; struct bio_vec bvec; struct bvec_iter iter; int dir, len, shift; diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c index 3447556d276d..fc09be7b1347 100644 --- a/arch/xtensa/platforms/iss/simdisk.c +++ b/arch/xtensa/platforms/iss/simdisk.c @@ -103,7 +103,7 @@ static void simdisk_transfer(struct simdisk *dev, unsigned long sector, static blk_qc_t simdisk_submit_bio(struct bio *bio) { - struct simdisk *dev = bio->bi_disk->private_data; + struct simdisk *dev = bio->bi_bdev->bd_disk->private_data; struct bio_vec bvec; struct bvec_iter iter; sector_t sector = bio->bi_iter.bi_sector; diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 9ffd7e289554..c3e5abcfdc98 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -140,7 +140,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, iv = bip->bip_vec + bip->bip_vcnt; if (bip->bip_vcnt && - bvec_gap_to_prev(bio->bi_disk->queue, + bvec_gap_to_prev(bio->bi_bdev->bd_disk->queue, &bip->bip_vec[bip->bip_vcnt - 1], offset)) return 0; @@ -162,7 +162,7 @@ EXPORT_SYMBOL(bio_integrity_add_page); static blk_status_t bio_integrity_process(struct bio *bio, struct bvec_iter *proc_iter, integrity_processing_fn *proc_fn) { - struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); struct blk_integrity_iter iter; struct bvec_iter bviter; struct bio_vec bv; @@ -171,7 +171,7 @@ static blk_status_t bio_integrity_process(struct bio *bio, void *prot_buf = page_address(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset; - iter.disk_name = bio->bi_disk->disk_name; + iter.disk_name = bio->bi_bdev->bd_disk->disk_name; iter.interval = 1 << bi->interval_exp; iter.seed = proc_iter->bi_sector; iter.prot_buf = prot_buf; @@ -208,8 +208,8 @@ static blk_status_t bio_integrity_process(struct bio *bio, bool bio_integrity_prep(struct bio *bio) { struct bio_integrity_payload *bip; - struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); - struct request_queue *q = bio->bi_disk->queue; + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); + struct request_queue *q = bio->bi_bdev->bd_disk->queue; void *buf; unsigned long start, end; unsigned int len, nr_pages; @@ -329,7 +329,7 @@ static void bio_integrity_verify_fn(struct work_struct *work) struct bio_integrity_payload *bip = container_of(work, struct bio_integrity_payload, bip_work); struct bio *bio = bip->bip_bio; - struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); /* * At the moment verify is called bio's iterator was advanced @@ -355,7 +355,7 @@ static void bio_integrity_verify_fn(struct work_struct *work) */ bool __bio_integrity_endio(struct bio *bio) { - struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); struct bio_integrity_payload *bip = bio_integrity(bio); if (bio_op(bio) == REQ_OP_READ && !bio->bi_status && @@ -381,7 +381,7 @@ bool __bio_integrity_endio(struct bio *bio) void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) { struct bio_integrity_payload *bip = bio_integrity(bio); - struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9); bip->bip_iter.bi_sector += bytes_done >> 9; @@ -397,7 +397,7 @@ void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) void bio_integrity_trim(struct bio *bio) { struct bio_integrity_payload *bip = bio_integrity(bio); - struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); bip->bip_iter.bi_size = bio_integrity_bytes(bi, bio_sectors(bio)); } diff --git a/block/bio.c b/block/bio.c index 1f2cc1fbe283..0b70ade17da6 100644 --- a/block/bio.c +++ b/block/bio.c @@ -607,16 +607,7 @@ void bio_truncate(struct bio *bio, unsigned new_size) */ void guard_bio_eod(struct bio *bio) { - sector_t maxsector; - struct block_device *part; - - rcu_read_lock(); - part = __disk_get_part(bio->bi_disk, bio->bi_partno); - if (part) - maxsector = bdev_nr_sectors(part); - else - maxsector = get_capacity(bio->bi_disk); - rcu_read_unlock(); + sector_t maxsector = bdev_nr_sectors(bio->bi_bdev); if (!maxsector) return; @@ -676,11 +667,10 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src) BUG_ON(bio->bi_pool && BVEC_POOL_IDX(bio)); /* - * most users will be overriding ->bi_disk with a new target, + * most users will be overriding ->bi_bdev with a new target, * so we don't set nor calculate new physical/hw segment counts here */ - bio->bi_disk = bio_src->bi_disk; - bio->bi_partno = bio_src->bi_partno; + bio->bi_bdev = bio_src->bi_bdev; bio_set_flag(bio, BIO_CLONED); if (bio_flagged(bio_src, BIO_THROTTLED)) bio_set_flag(bio, BIO_THROTTLED); @@ -730,7 +720,7 @@ EXPORT_SYMBOL(bio_clone_fast); const char *bio_devname(struct bio *bio, char *buf) { - return disk_name(bio->bi_disk, bio->bi_partno, buf); + return bdevname(bio->bi_bdev, buf); } EXPORT_SYMBOL(bio_devname); @@ -1037,7 +1027,7 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) { unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt; unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt; - struct request_queue *q = bio->bi_disk->queue; + struct request_queue *q = bio->bi_bdev->bd_disk->queue; unsigned int max_append_sectors = queue_max_zone_append_sectors(q); struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; struct page **pages = (struct page **)bv; @@ -1145,7 +1135,8 @@ static void submit_bio_wait_endio(struct bio *bio) */ int submit_bio_wait(struct bio *bio) { - DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_disk->lockdep_map); + DECLARE_COMPLETION_ONSTACK_MAP(done, + bio->bi_bdev->bd_disk->lockdep_map); unsigned long hang_check; bio->bi_private = &done; @@ -1422,8 +1413,8 @@ again: if (!bio_integrity_endio(bio)) return; - if (bio->bi_disk) - rq_qos_done_bio(bio->bi_disk->queue, bio); + if (bio->bi_bdev) + rq_qos_done_bio(bio->bi_bdev->bd_disk->queue, bio); /* * Need to have a real endio function for chained bios, otherwise @@ -1438,8 +1429,8 @@ again: goto again; } - if (bio->bi_disk && bio_flagged(bio, BIO_TRACE_COMPLETION)) { - trace_block_bio_complete(bio->bi_disk->queue, bio); + if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) { + trace_block_bio_complete(bio->bi_bdev->bd_disk->queue, bio); bio_clear_flag(bio, BIO_TRACE_COMPLETION); } diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 031114d454a6..3465d6ee708e 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1800,7 +1800,8 @@ static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio, struct blkcg_gq *blkg, *ret_blkg = NULL; rcu_read_lock(); - blkg = blkg_lookup_create(css_to_blkcg(css), bio->bi_disk->queue); + blkg = blkg_lookup_create(css_to_blkcg(css), + bio->bi_bdev->bd_disk->queue); while (blkg) { if (blkg_tryget(blkg)) { ret_blkg = blkg; @@ -1836,8 +1837,8 @@ void bio_associate_blkg_from_css(struct bio *bio, if (css && css->parent) { bio->bi_blkg = blkg_tryget_closest(bio, css); } else { - blkg_get(bio->bi_disk->queue->root_blkg); - bio->bi_blkg = bio->bi_disk->queue->root_blkg; + blkg_get(bio->bi_bdev->bd_disk->queue->root_blkg); + bio->bi_blkg = bio->bi_bdev->bd_disk->queue->root_blkg; } } EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); diff --git a/block/blk-core.c b/block/blk-core.c index 08ff8ca32529..a3a54cd86c9c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -476,7 +476,7 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) static inline int bio_queue_enter(struct bio *bio) { - struct request_queue *q = bio->bi_disk->queue; + struct request_queue *q = bio->bi_bdev->bd_disk->queue; bool nowait = bio->bi_opf & REQ_NOWAIT; int ret; @@ -712,7 +712,7 @@ static inline bool bio_check_ro(struct bio *bio, struct block_device *part) static noinline int should_fail_bio(struct bio *bio) { - if (should_fail_request(bio->bi_disk->part0, bio->bi_iter.bi_size)) + if (should_fail_request(bdev_whole(bio->bi_bdev), bio->bi_iter.bi_size)) return -EIO; return 0; } @@ -741,13 +741,9 @@ static inline int bio_check_eod(struct bio *bio, sector_t maxsector) */ static inline int blk_partition_remap(struct bio *bio) { - struct block_device *p; + struct block_device *p = bio->bi_bdev; int ret = -EIO; - rcu_read_lock(); - p = __disk_get_part(bio->bi_disk, bio->bi_partno); - if (unlikely(!p)) - goto out; if (unlikely(should_fail_request(p, bio->bi_iter.bi_size))) goto out; if (unlikely(bio_check_ro(bio, p))) @@ -761,10 +757,9 @@ static inline int blk_partition_remap(struct bio *bio) bio->bi_iter.bi_sector - p->bd_start_sect); } - bio->bi_partno = 0; + bio->bi_bdev = bdev_whole(p); ret = 0; out: - rcu_read_unlock(); return ret; } @@ -805,7 +800,8 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q, static noinline_for_stack bool submit_bio_checks(struct bio *bio) { - struct request_queue *q = bio->bi_disk->queue; + struct block_device *bdev = bio->bi_bdev; + struct request_queue *q = bdev->bd_disk->queue; blk_status_t status = BLK_STS_IOERR; struct blk_plug *plug; @@ -825,13 +821,13 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio) if (should_fail_bio(bio)) goto end_io; - if (bio->bi_partno) { + if (bio->bi_bdev->bd_partno) { if (unlikely(blk_partition_remap(bio))) goto end_io; } else { - if (unlikely(bio_check_ro(bio, bio->bi_disk->part0))) + if (unlikely(bio_check_ro(bio, bdev_whole(bdev)))) goto end_io; - if (unlikely(bio_check_eod(bio, get_capacity(bio->bi_disk)))) + if (unlikely(bio_check_eod(bio, get_capacity(bdev->bd_disk)))) goto end_io; } @@ -924,7 +920,7 @@ end_io: static blk_qc_t __submit_bio(struct bio *bio) { - struct gendisk *disk = bio->bi_disk; + struct gendisk *disk = bio->bi_bdev->bd_disk; blk_qc_t ret = BLK_QC_T_NONE; if (blk_crypto_bio_prep(&bio)) { @@ -966,7 +962,7 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio) current->bio_list = bio_list_on_stack; do { - struct request_queue *q = bio->bi_disk->queue; + struct request_queue *q = bio->bi_bdev->bd_disk->queue; struct bio_list lower, same; if (unlikely(bio_queue_enter(bio) != 0)) @@ -987,7 +983,7 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio) bio_list_init(&lower); bio_list_init(&same); while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL) - if (q == bio->bi_disk->queue) + if (q == bio->bi_bdev->bd_disk->queue) bio_list_add(&same, bio); else bio_list_add(&lower, bio); @@ -1012,7 +1008,7 @@ static blk_qc_t __submit_bio_noacct_mq(struct bio *bio) current->bio_list = bio_list; do { - struct gendisk *disk = bio->bi_disk; + struct gendisk *disk = bio->bi_bdev->bd_disk; if (unlikely(bio_queue_enter(bio) != 0)) continue; @@ -1055,7 +1051,7 @@ blk_qc_t submit_bio_noacct(struct bio *bio) return BLK_QC_T_NONE; } - if (!bio->bi_disk->fops->submit_bio) + if (!bio->bi_bdev->bd_disk->fops->submit_bio) return __submit_bio_noacct_mq(bio); return __submit_bio_noacct(bio); } @@ -1067,7 +1063,7 @@ EXPORT_SYMBOL(submit_bio_noacct); * * submit_bio() is used to submit I/O requests to block devices. It is passed a * fully set up &struct bio that describes the I/O that needs to be done. The - * bio will be send to the device described by the bi_disk and bi_partno fields. + * bio will be send to the device described by the bi_bdev field. * * The success/failure status of the request, along with notification of * completion, is delivered asynchronously through the ->bi_end_io() callback @@ -1087,7 +1083,8 @@ blk_qc_t submit_bio(struct bio *bio) unsigned int count; if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME)) - count = queue_logical_block_size(bio->bi_disk->queue) >> 9; + count = queue_logical_block_size( + bio->bi_bdev->bd_disk->queue) >> 9; else count = bio_sectors(bio); diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index c162b754efbd..8f1e18176731 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -167,7 +167,7 @@ static struct bio *blk_crypto_clone_bio(struct bio *bio_src) bio = bio_alloc_bioset(GFP_NOIO, bio_segments(bio_src), NULL); if (!bio) return NULL; - bio->bi_disk = bio_src->bi_disk; + bio->bi_bdev = bio_src->bi_bdev; bio->bi_opf = bio_src->bi_opf; bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_write_hint = bio_src->bi_write_hint; diff --git a/block/blk-crypto.c b/block/blk-crypto.c index 5da43f0973b4..09fcb18fa778 100644 --- a/block/blk-crypto.c +++ b/block/blk-crypto.c @@ -280,7 +280,7 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr) * Success if device supports the encryption context, or if we succeeded * in falling back to the crypto API. */ - if (blk_ksm_crypto_cfg_supported(bio->bi_disk->queue->ksm, + if (blk_ksm_crypto_cfg_supported(bio->bi_bdev->bd_disk->queue->ksm, &bc_key->crypto_cfg)) return true; diff --git a/block/blk-merge.c b/block/blk-merge.c index 808768f6b174..ffb4aa0ea68b 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -298,14 +298,13 @@ split: * Split a bio into two bios, chain the two bios, submit the second half and * store a pointer to the first half in *@bio. If the second bio is still too * big it will be split by a recursive call to this function. Since this - * function may allocate a new bio from @bio->bi_disk->queue->bio_split, it is - * the responsibility of the caller to ensure that - * @bio->bi_disk->queue->bio_split is only released after processing of the - * split bio has finished. + * function may allocate a new bio from q->bio_split, it is the responsibility + * of the caller to ensure that q->bio_split is only released after processing + * of the split bio has finished. */ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs) { - struct request_queue *q = (*bio)->bi_disk->queue; + struct request_queue *q = (*bio)->bi_bdev->bd_disk->queue; struct bio *split = NULL; switch (bio_op(*bio)) { @@ -358,9 +357,9 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs) * * Split a bio into two bios, chains the two bios, submit the second half and * store a pointer to the first half in *@bio. Since this function may allocate - * a new bio from @bio->bi_disk->queue->bio_split, it is the responsibility of - * the caller to ensure that @bio->bi_disk->queue->bio_split is only released - * after processing of the split bio has finished. + * a new bio from q->bio_split, it is the responsibility of the caller to ensure + * that q->bio_split is only released after processing of the split bio has + * finished. */ void blk_queue_split(struct bio **bio) { @@ -866,7 +865,7 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) return false; /* must be same device */ - if (rq->rq_disk != bio->bi_disk) + if (rq->rq_disk != bio->bi_bdev->bd_disk) return false; /* only merge integrity protected bio into ditto rq */ diff --git a/block/blk-mq.c b/block/blk-mq.c index f285a9123a8b..74b17b396f4c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2128,7 +2128,7 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) */ blk_qc_t blk_mq_submit_bio(struct bio *bio) { - struct request_queue *q = bio->bi_disk->queue; + struct request_queue *q = bio->bi_bdev->bd_disk->queue; const int is_sync = op_is_sync(bio->bi_opf); const int is_flush_fua = op_is_flush(bio->bi_opf); struct blk_mq_alloc_data data = { diff --git a/block/blk-throttle.c b/block/blk-throttle.c index d52cac9f3a7c..b1b22d863bdf 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -2178,7 +2178,7 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td) bool blk_throtl_bio(struct bio *bio) { - struct request_queue *q = bio->bi_disk->queue; + struct request_queue *q = bio->bi_bdev->bd_disk->queue; struct blkcg_gq *blkg = bio->bi_blkg; struct throtl_qnode *qn = NULL; struct throtl_grp *tg = blkg_to_tg(blkg); diff --git a/block/blk.h b/block/blk.h index 7550364c326c..10ab7c0d0766 100644 --- a/block/blk.h +++ b/block/blk.h @@ -202,8 +202,6 @@ static inline void elevator_exit(struct request_queue *q, __elevator_exit(q, e); } -struct block_device *__disk_get_part(struct gendisk *disk, int partno); - ssize_t part_size_show(struct device *dev, struct device_attribute *attr, char *buf); ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, diff --git a/block/bounce.c b/block/bounce.c index d3f51acd6e3b..a22a8a1942b2 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -246,7 +246,7 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask, bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs); if (!bio) return NULL; - bio->bi_disk = bio_src->bi_disk; + bio->bi_bdev = bio_src->bi_bdev; bio->bi_opf = bio_src->bi_opf; bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_write_hint = bio_src->bi_write_hint; diff --git a/block/genhd.c b/block/genhd.c index ca5d880af512..e536d0b4bbae 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -161,7 +161,7 @@ static void part_in_flight_rw(struct block_device *part, inflight[1] = 0; } -struct block_device *__disk_get_part(struct gendisk *disk, int partno) +static struct block_device *__disk_get_part(struct gendisk *disk, int partno) { struct disk_part_tbl *ptbl = rcu_dereference(disk->part_tbl); diff --git a/drivers/block/brd.c b/drivers/block/brd.c index c7c821419079..18bf99906662 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -284,7 +284,7 @@ out: static blk_qc_t brd_submit_bio(struct bio *bio) { - struct brd_device *brd = bio->bi_disk->private_data; + struct brd_device *brd = bio->bi_bdev->bd_disk->private_data; sector_t sector = bio->bi_iter.bi_sector; struct bio_vec bvec; struct bvec_iter iter; diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 8f879e5c2f67..b2c93a29c251 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1579,8 +1579,8 @@ static inline void drbd_submit_bio_noacct(struct drbd_device *device, int fault_type, struct bio *bio) { __release(local); - if (!bio->bi_disk) { - drbd_err(device, "drbd_submit_bio_noacct: bio->bi_disk == NULL\n"); + if (!bio->bi_bdev) { + drbd_err(device, "drbd_submit_bio_noacct: bio->bi_bdev == NULL\n"); bio->bi_status = BLK_STS_IOERR; bio_endio(bio); return; diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 330f851cb8f0..ea0f31ab3343 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -1595,7 +1595,7 @@ void do_submit(struct work_struct *ws) blk_qc_t drbd_submit_bio(struct bio *bio) { - struct drbd_device *device = bio->bi_disk->private_data; + struct drbd_device *device = bio->bi_bdev->bd_disk->private_data; unsigned long start_jif; blk_queue_split(&bio); diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 5357c3a4a36f..d6c821d48090 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1420,7 +1420,7 @@ static blk_qc_t null_submit_bio(struct bio *bio) { sector_t sector = bio->bi_iter.bi_sector; sector_t nr_sectors = bio_sectors(bio); - struct nullb *nullb = bio->bi_disk->private_data; + struct nullb *nullb = bio->bi_bdev->bd_disk->private_data; struct nullb_queue *nq = nullb_to_queue(nullb); struct nullb_cmd *cmd; diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index b8bb8ec7538d..658a0981cb54 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2374,7 +2374,7 @@ static blk_qc_t pkt_submit_bio(struct bio *bio) blk_queue_split(&bio); - pd = bio->bi_disk->queue->queuedata; + pd = bio->bi_bdev->bd_disk->queue->queuedata; if (!pd) { pr_err("%s incorrect request queue\n", bio_devname(bio, b)); goto end_io; @@ -2418,7 +2418,7 @@ static blk_qc_t pkt_submit_bio(struct bio *bio) split = bio; } - pkt_make_request_write(bio->bi_disk->queue, split); + pkt_make_request_write(bio->bi_bdev->bd_disk->queue, split); } while (split != bio); return BLK_QC_T_NONE; diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index b71d28372ef3..1d738999fb69 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -581,7 +581,7 @@ out: static blk_qc_t ps3vram_submit_bio(struct bio *bio) { - struct ps3_system_bus_device *dev = bio->bi_disk->private_data; + struct ps3_system_bus_device *dev = bio->bi_bdev->bd_disk->private_data; struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); int busy; diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c index edacefff6e35..9a28322a8cd8 100644 --- a/drivers/block/rsxx/dev.c +++ b/drivers/block/rsxx/dev.c @@ -122,7 +122,7 @@ static void bio_dma_done_cb(struct rsxx_cardinfo *card, static blk_qc_t rsxx_submit_bio(struct bio *bio) { - struct rsxx_cardinfo *card = bio->bi_disk->private_data; + struct rsxx_cardinfo *card = bio->bi_bdev->bd_disk->private_data; struct rsxx_bio_meta *bio_meta; blk_status_t st = BLK_STS_IOERR; diff --git a/drivers/block/umem.c b/drivers/block/umem.c index 2b95d7b33b91..982732dbe82e 100644 --- a/drivers/block/umem.c +++ b/drivers/block/umem.c @@ -521,7 +521,7 @@ static int mm_check_plugged(struct cardinfo *card) static blk_qc_t mm_submit_bio(struct bio *bio) { - struct cardinfo *card = bio->bi_disk->private_data; + struct cardinfo *card = bio->bi_bdev->bd_disk->private_data; pr_debug("mm_make_request %llu %u\n", (unsigned long long)bio->bi_iter.bi_sector, diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index e2933cb7a82a..d6243dbc53cc 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1596,7 +1596,7 @@ static void __zram_make_request(struct zram *zram, struct bio *bio) */ static blk_qc_t zram_submit_bio(struct bio *bio) { - struct zram *zram = bio->bi_disk->private_data; + struct zram *zram = bio->bi_bdev->bd_disk->private_data; if (!valid_io_request(zram, bio->bi_iter.bi_sector, bio->bi_iter.bi_size)) { diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index b6246f73895c..5924f09c217b 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -49,7 +49,7 @@ struct bio_set pblk_bio_set; static blk_qc_t pblk_submit_bio(struct bio *bio) { - struct pblk *pblk = bio->bi_disk->queue->queuedata; + struct pblk *pblk = bio->bi_bdev->bd_disk->queue->queuedata; if (bio_op(bio) == REQ_OP_DISCARD) { pblk_discard(pblk, bio); diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index b00fd08d696b..058dd8014428 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -114,7 +114,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) check = bio_kmalloc(GFP_NOIO, bio_segments(bio)); if (!check) return; - check->bi_disk = bio->bi_disk; + check->bi_bdev = bio->bi_bdev; check->bi_opf = REQ_OP_READ; check->bi_iter.bi_sector = bio->bi_iter.bi_sector; check->bi_iter.bi_size = bio->bi_iter.bi_size; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 85b1f2a9b72d..dfc35d6d05ed 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -894,7 +894,8 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, !(bio->bi_opf & (REQ_META|REQ_PRIO)) && s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA) reada = min_t(sector_t, dc->readahead >> 9, - get_capacity(bio->bi_disk) - bio_end_sector(bio)); + get_capacity(bio->bi_bdev->bd_disk) - + bio_end_sector(bio)); s->insert_bio_sectors = min(sectors, bio_sectors(bio) + reada); @@ -1167,7 +1168,7 @@ static void quit_max_writeback_rate(struct cache_set *c, blk_qc_t cached_dev_submit_bio(struct bio *bio) { struct search *s; - struct bcache_device *d = bio->bi_disk->private_data; + struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; struct cached_dev *dc = container_of(d, struct cached_dev, disk); int rw = bio_data_dir(bio); @@ -1274,7 +1275,7 @@ blk_qc_t flash_dev_submit_bio(struct bio *bio) { struct search *s; struct closure *cl; - struct bcache_device *d = bio->bi_disk->private_data; + struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) { bio->bi_status = BLK_STS_IOERR; diff --git a/drivers/md/dm-bio-record.h b/drivers/md/dm-bio-record.h index 2ea0360108e1..a3b71350eec8 100644 --- a/drivers/md/dm-bio-record.h +++ b/drivers/md/dm-bio-record.h @@ -18,8 +18,7 @@ */ struct dm_bio_details { - struct gendisk *bi_disk; - u8 bi_partno; + struct block_device *bi_bdev; int __bi_remaining; unsigned long bi_flags; struct bvec_iter bi_iter; @@ -31,8 +30,7 @@ struct dm_bio_details { static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio) { - bd->bi_disk = bio->bi_disk; - bd->bi_partno = bio->bi_partno; + bd->bi_bdev = bio->bi_bdev; bd->bi_flags = bio->bi_flags; bd->bi_iter = bio->bi_iter; bd->__bi_remaining = atomic_read(&bio->__bi_remaining); @@ -44,8 +42,7 @@ static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio) static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio) { - bio->bi_disk = bd->bi_disk; - bio->bi_partno = bd->bi_partno; + bio->bi_bdev = bd->bi_bdev; bio->bi_flags = bd->bi_flags; bio->bi_iter = bd->bi_iter; atomic_set(&bio->__bi_remaining, bd->__bi_remaining); diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index fa09bc4e4c54..b0a82f29a2e4 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -145,7 +145,7 @@ static void dispatch_bios(void *context, struct bio_list *bio_list) struct dm_raid1_bio_record { struct mirror *m; - /* if details->bi_disk == NULL, details were not saved */ + /* if details->bi_bdev == NULL, details were not saved */ struct dm_bio_details details; region_t write_region; }; @@ -1190,7 +1190,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio) struct dm_raid1_bio_record *bio_record = dm_per_bio_data(bio, sizeof(struct dm_raid1_bio_record)); - bio_record->details.bi_disk = NULL; + bio_record->details.bi_bdev = NULL; if (rw == WRITE) { /* Save region for mirror_end_io() handler */ @@ -1257,7 +1257,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, goto out; if (unlikely(*error)) { - if (!bio_record->details.bi_disk) { + if (!bio_record->details.bi_bdev) { /* * There wasn't enough memory to record necessary * information for a retry or there was no other @@ -1282,7 +1282,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, bd = &bio_record->details; dm_bio_restore(bd, bio); - bio_record->details.bi_disk = NULL; + bio_record->details.bi_bdev = NULL; bio->bi_status = 0; queue_bio(ms, bio, rw); @@ -1292,7 +1292,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, } out: - bio_record->details.bi_disk = NULL; + bio_record->details.bi_bdev = NULL; return DM_ENDIO_DONE; } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 7bac564f3faa..479ec5bea09e 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -977,16 +977,17 @@ static void clone_endio(struct bio *bio) struct mapped_device *md = tio->io->md; dm_endio_fn endio = tio->ti->type->end_io; struct bio *orig_bio = io->orig_bio; + struct request_queue *q = bio->bi_bdev->bd_disk->queue; if (unlikely(error == BLK_STS_TARGET)) { if (bio_op(bio) == REQ_OP_DISCARD && - !bio->bi_disk->queue->limits.max_discard_sectors) + !q->limits.max_discard_sectors) disable_discard(md); else if (bio_op(bio) == REQ_OP_WRITE_SAME && - !bio->bi_disk->queue->limits.max_write_same_sectors) + !q->limits.max_write_same_sectors) disable_write_same(md); else if (bio_op(bio) == REQ_OP_WRITE_ZEROES && - !bio->bi_disk->queue->limits.max_write_zeroes_sectors) + !q->limits.max_write_zeroes_sectors) disable_write_zeroes(md); } @@ -996,7 +997,7 @@ static void clone_endio(struct bio *bio) */ if (bio_op(orig_bio) == REQ_OP_ZONE_APPEND) { sector_t written_sector = bio->bi_iter.bi_sector; - struct request_queue *q = orig_bio->bi_disk->queue; + struct request_queue *q = orig_bio->bi_bdev->bd_disk->queue; u64 mask = (u64)blk_queue_zone_sectors(q) - 1; orig_bio->bi_iter.bi_sector += written_sector & mask; @@ -1422,8 +1423,7 @@ static int __send_empty_flush(struct clone_info *ci) */ bio_init(&flush_bio, NULL, 0); flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; - flush_bio.bi_disk = ci->io->md->disk; - bio_associate_blkg(&flush_bio); + bio_set_dev(&flush_bio, ci->io->md->disk->part0); ci->bio = &flush_bio; ci->sector_count = 0; @@ -1626,7 +1626,7 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md, static blk_qc_t dm_submit_bio(struct bio *bio) { - struct mapped_device *md = bio->bi_disk->private_data; + struct mapped_device *md = bio->bi_bdev->bd_disk->private_data; blk_qc_t ret = BLK_QC_T_NONE; int srcu_idx; struct dm_table *map; diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index 68cac7d19278..63ed8329a98d 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -252,7 +252,7 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio) start_sector + data_offset; if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !blk_queue_discard(bio->bi_disk->queue))) { + !blk_queue_discard(bio->bi_bdev->bd_disk->queue))) { /* Just ignore it */ bio_endio(bio); } else { diff --git a/drivers/md/md.c b/drivers/md/md.c index 04384452a7ab..cf06dbb1aa53 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -486,7 +486,7 @@ static void md_end_io(struct bio *bio) static blk_qc_t md_submit_bio(struct bio *bio) { const int rw = bio_data_dir(bio); - struct mddev *mddev = bio->bi_disk->private_data; + struct mddev *mddev = bio->bi_bdev->bd_disk->private_data; if (mddev == NULL || mddev->pers == NULL) { bio_io_error(bio); diff --git a/drivers/md/md.h b/drivers/md/md.h index 34070ab30a8a..f13290ccc1c2 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -556,7 +556,7 @@ static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sect static inline void md_sync_acct_bio(struct bio *bio, unsigned long nr_sectors) { - atomic_add(nr_sectors, &bio->bi_disk->sync_io); + md_sync_acct(bio->bi_bdev, nr_sectors); } struct md_personality @@ -793,14 +793,14 @@ static inline void mddev_clear_unsupported_flags(struct mddev *mddev, static inline void mddev_check_writesame(struct mddev *mddev, struct bio *bio) { if (bio_op(bio) == REQ_OP_WRITE_SAME && - !bio->bi_disk->queue->limits.max_write_same_sectors) + !bio->bi_bdev->bd_disk->queue->limits.max_write_same_sectors) mddev->queue->limits.max_write_same_sectors = 0; } static inline void mddev_check_write_zeroes(struct mddev *mddev, struct bio *bio) { if (bio_op(bio) == REQ_OP_WRITE_ZEROES && - !bio->bi_disk->queue->limits.max_write_zeroes_sectors) + !bio->bi_bdev->bd_disk->queue->limits.max_write_zeroes_sectors) mddev->queue->limits.max_write_zeroes_sectors = 0; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index c0347997f6ff..3b19141cdb4b 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -794,13 +794,13 @@ static void flush_bio_list(struct r1conf *conf, struct bio *bio) while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; - struct md_rdev *rdev = (void *)bio->bi_disk; + struct md_rdev *rdev = (void *)bio->bi_bdev; bio->bi_next = NULL; bio_set_dev(bio, rdev->bdev); if (test_bit(Faulty, &rdev->flags)) { bio_io_error(bio); } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !blk_queue_discard(bio->bi_disk->queue))) + !blk_queue_discard(bio->bi_bdev->bd_disk->queue))) /* Just ignore it */ bio_endio(bio); else @@ -1520,7 +1520,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, trace_block_bio_remap(mbio, disk_devt(mddev->gendisk), r1_bio->sector); /* flush_pending_writes() needs access to the rdev so...*/ - mbio->bi_disk = (void *)conf->mirrors[i].rdev; + mbio->bi_bdev = (void *)conf->mirrors[i].rdev; cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug)); if (cb) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index c5d88ef6a45c..be8f14afb6d1 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -882,13 +882,13 @@ static void flush_pending_writes(struct r10conf *conf) while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; - struct md_rdev *rdev = (void*)bio->bi_disk; + struct md_rdev *rdev = (void*)bio->bi_bdev; bio->bi_next = NULL; bio_set_dev(bio, rdev->bdev); if (test_bit(Faulty, &rdev->flags)) { bio_io_error(bio); } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !blk_queue_discard(bio->bi_disk->queue))) + !blk_queue_discard(bio->bi_bdev->bd_disk->queue))) /* Just ignore it */ bio_endio(bio); else @@ -1075,13 +1075,13 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; - struct md_rdev *rdev = (void*)bio->bi_disk; + struct md_rdev *rdev = (void*)bio->bi_bdev; bio->bi_next = NULL; bio_set_dev(bio, rdev->bdev); if (test_bit(Faulty, &rdev->flags)) { bio_io_error(bio); } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !blk_queue_discard(bio->bi_disk->queue))) + !blk_queue_discard(bio->bi_bdev->bd_disk->queue))) /* Just ignore it */ bio_endio(bio); else @@ -1253,7 +1253,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, trace_block_bio_remap(mbio, disk_devt(conf->mddev->gendisk), r10_bio->sector); /* flush_pending_writes() needs access to the rdev so...*/ - mbio->bi_disk = (void *)rdev; + mbio->bi_bdev = (void *)rdev; atomic_inc(&r10_bio->remaining); @@ -3003,7 +3003,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, /* Again, very different code for resync and recovery. * Both must result in an r10bio with a list of bios that - * have bi_end_io, bi_sector, bi_disk set, + * have bi_end_io, bi_sector, bi_bdev set, * and bi_private set to the r10bio. * For recovery, we may actually create several r10bios * with 2 bios in each, that correspond to the bios in the main one. diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 3a90cc0e43ca..f411b9e5c332 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5310,7 +5310,7 @@ static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) unsigned int chunk_sectors; unsigned int bio_sectors = bio_sectors(bio); - WARN_ON_ONCE(bio->bi_partno); + WARN_ON_ONCE(bio->bi_bdev->bd_partno); chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors); return chunk_sectors >= diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c index 22e5617b2cea..e03a1f38d750 100644 --- a/drivers/nvdimm/blk.c +++ b/drivers/nvdimm/blk.c @@ -165,7 +165,7 @@ static int nsblk_do_bvec(struct nd_namespace_blk *nsblk, static blk_qc_t nd_blk_submit_bio(struct bio *bio) { struct bio_integrity_payload *bip; - struct nd_namespace_blk *nsblk = bio->bi_disk->private_data; + struct nd_namespace_blk *nsblk = bio->bi_bdev->bd_disk->private_data; struct bvec_iter iter; unsigned long start; struct bio_vec bvec; @@ -177,7 +177,7 @@ static blk_qc_t nd_blk_submit_bio(struct bio *bio) bip = bio_integrity(bio); rw = bio_data_dir(bio); - do_acct = blk_queue_io_stat(bio->bi_disk->queue); + do_acct = blk_queue_io_stat(bio->bi_bdev->bd_disk->queue); if (do_acct) start = bio_start_io_acct(bio); bio_for_each_segment(bvec, bio, iter) { diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index 12ff6f8784ac..41aa1f01fc07 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -1442,7 +1442,7 @@ static int btt_do_bvec(struct btt *btt, struct bio_integrity_payload *bip, static blk_qc_t btt_submit_bio(struct bio *bio) { struct bio_integrity_payload *bip = bio_integrity(bio); - struct btt *btt = bio->bi_disk->private_data; + struct btt *btt = bio->bi_bdev->bd_disk->private_data; struct bvec_iter iter; unsigned long start; struct bio_vec bvec; @@ -1452,7 +1452,7 @@ static blk_qc_t btt_submit_bio(struct bio *bio) if (!bio_integrity_prep(bio)) return BLK_QC_T_NONE; - do_acct = blk_queue_io_stat(bio->bi_disk->queue); + do_acct = blk_queue_io_stat(bio->bi_bdev->bd_disk->queue); if (do_acct) start = bio_start_io_acct(bio); bio_for_each_segment(bvec, bio, iter) { diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 875076b0ea6c..72740835c85c 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -197,13 +197,13 @@ static blk_qc_t pmem_submit_bio(struct bio *bio) unsigned long start; struct bio_vec bvec; struct bvec_iter iter; - struct pmem_device *pmem = bio->bi_disk->private_data; + struct pmem_device *pmem = bio->bi_bdev->bd_disk->private_data; struct nd_region *nd_region = to_region(pmem); if (bio->bi_opf & REQ_PREFLUSH) ret = nvdimm_flush(nd_region, bio); - do_acct = blk_queue_io_stat(bio->bi_disk->queue); + do_acct = blk_queue_io_stat(bio->bi_bdev->bd_disk->queue); if (do_acct) start = bio_start_io_acct(bio); bio_for_each_segment(bvec, bio, iter) { diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 566788ba4e7d..a39befb4deba 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1113,7 +1113,7 @@ static int nvme_submit_user_cmd(struct request_queue *q, { bool write = nvme_is_write(cmd); struct nvme_ns *ns = q->queuedata; - struct gendisk *disk = ns ? ns->disk : NULL; + struct block_device *bdev = ns ? ns->disk->part0 : NULL; struct request *req; struct bio *bio = NULL; void *meta = NULL; @@ -1133,8 +1133,8 @@ static int nvme_submit_user_cmd(struct request_queue *q, if (ret) goto out; bio = req->bio; - bio->bi_disk = disk; - if (disk && meta_buffer && meta_len) { + bio->bi_bdev = bdev; + if (bdev && meta_buffer && meta_len) { meta = nvme_add_user_metadata(bio, meta_buffer, meta_len, meta_seed, write); if (IS_ERR(meta)) { diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 470cef3abec3..6c8eab8de288 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -757,7 +757,6 @@ static int nvme_nvm_submit_user_cmd(struct request_queue *q, { bool write = nvme_is_write((struct nvme_command *)vcmd); struct nvm_dev *dev = ns->ndev; - struct gendisk *disk = ns->disk; struct request *rq; struct bio *bio = NULL; __le64 *ppa_list = NULL; @@ -817,7 +816,7 @@ static int nvme_nvm_submit_user_cmd(struct request_queue *q, vcmd->ph_rw.metadata = cpu_to_le64(metadata_dma); } - bio->bi_disk = disk; + bio->bi_bdev = ns->disk->part0; } blk_execute_rq(q, NULL, rq, 0); diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 9ac762b28811..a6d44e7a775f 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -296,7 +296,7 @@ static bool nvme_available_path(struct nvme_ns_head *head) blk_qc_t nvme_ns_head_submit_bio(struct bio *bio) { - struct nvme_ns_head *head = bio->bi_disk->private_data; + struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data; struct device *dev = disk_to_dev(head->disk); struct nvme_ns *ns; blk_qc_t ret = BLK_QC_T_NONE; @@ -312,7 +312,7 @@ blk_qc_t nvme_ns_head_submit_bio(struct bio *bio) srcu_idx = srcu_read_lock(&head->srcu); ns = nvme_find_path(head); if (likely(ns)) { - bio->bi_disk = ns->disk; + bio->bi_bdev = ns->disk->part0; bio->bi_opf |= REQ_NVME_MPATH; trace_block_bio_remap(bio, disk_devt(ns->head->disk), bio->bi_iter.bi_sector); @@ -352,7 +352,7 @@ static void nvme_requeue_work(struct work_struct *work) * Reset disk to the mpath node and resubmit to select a new * path. */ - bio->bi_disk = head->disk; + bio->bi_bdev = head->disk->part0; submit_bio_noacct(bio); } } diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index b7ce4f221d99..f5ef3edeb2fd 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1468,7 +1468,7 @@ static int nvme_rdma_map_sg_pi(struct nvme_rdma_queue *queue, if (unlikely(nr)) goto mr_put; - nvme_rdma_set_sig_attrs(blk_get_integrity(bio->bi_disk), c, + nvme_rdma_set_sig_attrs(blk_get_integrity(bio->bi_bdev->bd_disk), c, req->mr->sig_attrs, ns->pi_type); nvme_rdma_set_prot_checks(c, &req->mr->sig_attrs->check_mask); diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 5c5cff3f2374..da33cb4cba28 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -879,7 +879,7 @@ dcssblk_submit_bio(struct bio *bio) blk_queue_split(&bio); bytes_done = 0; - dev_info = bio->bi_disk->private_data; + dev_info = bio->bi_bdev->bd_disk->private_data; if (dev_info == NULL) goto fail; if ((bio->bi_iter.bi_sector & 7) != 0 || diff --git a/drivers/s390/block/xpram.c b/drivers/s390/block/xpram.c index c2536f7767b3..d1ed39162943 100644 --- a/drivers/s390/block/xpram.c +++ b/drivers/s390/block/xpram.c @@ -184,7 +184,7 @@ static unsigned long xpram_highest_page_index(void) */ static blk_qc_t xpram_submit_bio(struct bio *bio) { - xpram_device_t *xdev = bio->bi_disk->private_data; + xpram_device_t *xdev = bio->bi_bdev->bd_disk->private_data; struct bio_vec bvec; struct bvec_iter iter; unsigned int index; diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 6ff44e53814c..113cb85c1fd4 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -2674,7 +2674,7 @@ static void __btrfsic_submit_bio(struct bio *bio) mutex_lock(&btrfsic_mutex); /* since btrfsic_submit_bio() is also called before * btrfsic_mount(), this might return NULL */ - dev_state = btrfsic_dev_state_lookup(bio_dev(bio) + bio->bi_partno); + dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev); if (NULL != dev_state && (bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) { unsigned int i = 0; @@ -2690,9 +2690,9 @@ static void __btrfsic_submit_bio(struct bio *bio) bio_is_patched = 0; if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - pr_info("submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_disk=%p)\n", + pr_info("submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n", bio_op(bio), bio->bi_opf, segs, - bio->bi_iter.bi_sector, dev_bytenr, bio->bi_disk); + bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev); mapped_datav = kmalloc_array(segs, sizeof(*mapped_datav), GFP_NOFS); @@ -2721,8 +2721,8 @@ static void __btrfsic_submit_bio(struct bio *bio) } else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) { if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - pr_info("submit_bio(rw=%d,0x%x FLUSH, disk=%p)\n", - bio_op(bio), bio->bi_opf, bio->bi_disk); + pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n", + bio_op(bio), bio->bi_opf, bio->bi_bdev); if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { if ((dev_state->state->print_mask & (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 93fbf87bdc8d..b2204a2942cb 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1105,8 +1105,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, * devices or if they are not contiguous */ if (last_end == disk_start && !last->bi_status && - last->bi_disk == stripe->dev->bdev->bd_disk && - last->bi_partno == stripe->dev->bdev->bd_partno) { + last->bi_bdev == stripe->dev->bdev) { ret = bio_add_page(last, page, PAGE_SIZE, 0); if (ret == PAGE_SIZE) return 0; @@ -1357,9 +1356,7 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio, for (i = 0; i < rbio->bbio->num_stripes; i++) { stripe = &rbio->bbio->stripes[i]; if (in_range(physical, stripe->physical, rbio->stripe_len) && - stripe->dev->bdev && - bio->bi_disk == stripe->dev->bdev->bd_disk && - bio->bi_partno == stripe->dev->bdev->bd_partno) { + stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) { return i; } } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 5f4f88a4d2c8..33f8f0f108bf 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1695,7 +1695,7 @@ static void scrub_wr_submit(struct scrub_ctx *sctx) sbio = sctx->wr_curr_bio; sctx->wr_curr_bio = NULL; - WARN_ON(!sbio->bio->bi_disk); + WARN_ON(!sbio->bio->bi_bdev); scrub_pending_bio_inc(sctx); /* process all writes in a single worker thread. Then the block layer * orders the requests before sending them to the driver which diff --git a/fs/direct-io.c b/fs/direct-io.c index d53fa92a1ab6..2660e744da2d 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -434,7 +434,7 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty) bio_set_pages_dirty(bio); - dio->bio_disk = bio->bi_disk; + dio->bio_disk = bio->bi_bdev->bd_disk; if (sdio->submit_io) { sdio->submit_io(bio, dio->inode, sdio->logical_offset_in_bio); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index aa34d620bec9..8cbf03159752 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -427,16 +427,6 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) return 0; } -/* - * Return true, if pre_bio's bdev is same as its target device. - */ -static bool __same_bdev(struct f2fs_sb_info *sbi, - block_t blk_addr, struct bio *bio) -{ - struct block_device *b = f2fs_target_device(sbi, blk_addr, NULL); - return bio->bi_disk == b->bd_disk && bio->bi_partno == b->bd_partno; -} - static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages) { struct f2fs_sb_info *sbi = fio->sbi; @@ -741,7 +731,7 @@ static bool page_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio, return false; if (last_blkaddr + 1 != cur_blkaddr) return false; - return __same_bdev(sbi, cur_blkaddr, bio); + return bio->bi_bdev == f2fs_target_device(sbi, cur_blkaddr, NULL); } static bool io_type_is_mergeable(struct f2fs_bio_info *io, diff --git a/include/linux/bio.h b/include/linux/bio.h index 1edda614f7ce..12af7aa5db37 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -483,24 +483,22 @@ extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int); extern unsigned int bvec_nr_vecs(unsigned short idx); extern const char *bio_devname(struct bio *bio, char *buffer); -#define bio_set_dev(bio, bdev) \ -do { \ - if ((bio)->bi_disk != (bdev)->bd_disk) \ - bio_clear_flag(bio, BIO_THROTTLED);\ - (bio)->bi_disk = (bdev)->bd_disk; \ - (bio)->bi_partno = (bdev)->bd_partno; \ - bio_associate_blkg(bio); \ +#define bio_set_dev(bio, bdev) \ +do { \ + if ((bio)->bi_bdev != (bdev)) \ + bio_clear_flag(bio, BIO_THROTTLED); \ + (bio)->bi_bdev = (bdev); \ + bio_associate_blkg(bio); \ } while (0) #define bio_copy_dev(dst, src) \ do { \ - (dst)->bi_disk = (src)->bi_disk; \ - (dst)->bi_partno = (src)->bi_partno; \ + (dst)->bi_bdev = (src)->bi_bdev; \ bio_clone_blkg_association(dst, src); \ } while (0) #define bio_dev(bio) \ - disk_devt((bio)->bi_disk) + disk_devt((bio)->bi_bdev->bd_disk) #ifdef CONFIG_BLK_CGROUP void bio_associate_blkg(struct bio *bio); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index d705b174d346..6b410dab48ee 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -602,8 +602,8 @@ static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio, rq->bio = rq->biotail = bio; rq->ioprio = bio_prio(bio); - if (bio->bi_disk) - rq->rq_disk = bio->bi_disk; + if (bio->bi_bdev) + rq->rq_disk = bio->bi_bdev->bd_disk; } blk_qc_t blk_mq_submit_bio(struct bio *bio); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 866f74261b3b..8ebd8be3e050 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -222,7 +222,7 @@ static inline void bio_issue_init(struct bio_issue *issue, */ struct bio { struct bio *bi_next; /* request queue link */ - struct gendisk *bi_disk; + struct block_device *bi_bdev; unsigned int bi_opf; /* bottom bits req flags, * top bits REQ_OP. Use * accessors. @@ -231,7 +231,6 @@ struct bio { unsigned short bi_ioprio; unsigned short bi_write_hint; blk_status_t bi_status; - u8 bi_partno; atomic_t __bi_remaining; struct bvec_iter bi_iter; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f94ee3089e01..b55bd534b2e1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1967,7 +1967,8 @@ void part_end_io_acct(struct block_device *part, struct bio *bio, */ static inline unsigned long bio_start_io_acct(struct bio *bio) { - return disk_start_io_acct(bio->bi_disk, bio_sectors(bio), bio_op(bio)); + return disk_start_io_acct(bio->bi_bdev->bd_disk, bio_sectors(bio), + bio_op(bio)); } /** @@ -1977,7 +1978,7 @@ static inline unsigned long bio_start_io_acct(struct bio *bio) */ static inline void bio_end_io_acct(struct bio *bio, unsigned long start_time) { - return disk_end_io_acct(bio->bi_disk, bio_op(bio), start_time); + return disk_end_io_acct(bio->bi_bdev->bd_disk, bio_op(bio), start_time); } int bdev_read_only(struct block_device *bdev); diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index fb0fe4c66b84..9e9ee4945043 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -903,7 +903,7 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, static void blk_add_trace_bio_bounce(void *ignore, struct bio *bio) { - blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_BOUNCE, 0); + blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_BOUNCE, 0); } static void blk_add_trace_bio_complete(void *ignore, @@ -915,22 +915,24 @@ static void blk_add_trace_bio_complete(void *ignore, static void blk_add_trace_bio_backmerge(void *ignore, struct bio *bio) { - blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_BACKMERGE, 0); + blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_BACKMERGE, + 0); } static void blk_add_trace_bio_frontmerge(void *ignore, struct bio *bio) { - blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_FRONTMERGE, 0); + blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_FRONTMERGE, + 0); } static void blk_add_trace_bio_queue(void *ignore, struct bio *bio) { - blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_QUEUE, 0); + blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_QUEUE, 0); } static void blk_add_trace_getrq(void *ignore, struct bio *bio) { - blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_GETRQ, 0); + blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_GETRQ, 0); } static void blk_add_trace_plug(void *ignore, struct request_queue *q) @@ -967,7 +969,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu) { - struct request_queue *q = bio->bi_disk->queue; + struct request_queue *q = bio->bi_bdev->bd_disk->queue; struct blk_trace *bt; rcu_read_lock(); @@ -997,7 +999,7 @@ static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu) static void blk_add_trace_bio_remap(void *ignore, struct bio *bio, dev_t dev, sector_t from) { - struct request_queue *q = bio->bi_disk->queue; + struct request_queue *q = bio->bi_bdev->bd_disk->queue; struct blk_trace *bt; struct blk_io_trace_remap r; diff --git a/mm/page_io.c b/mm/page_io.c index 9bca17ecc4df..a75f35464a4e 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -433,7 +433,7 @@ int swap_readpage(struct page *page, bool synchronous) ret = -ENOMEM; goto out; } - disk = bio->bi_disk; + disk = bio->bi_bdev->bd_disk; /* * Keep this task valid during swap readpage because the oom killer may * attempt to access it in the page fault retry time check. -- cgit v1.2.3-70-g09d2 From 2f9f6221b9b9944e96c80455b469a6f0269c558b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 24 Jan 2021 11:02:35 +0100 Subject: block: simplify submit_bio_checks a bit Merge a few checks for whole devices vs partitions to streamline the sanity checks. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-core.c | 39 ++++++++++++++------------------------- 1 file changed, 14 insertions(+), 25 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index a3a54cd86c9c..64f69022de96 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -692,9 +692,9 @@ static inline bool should_fail_request(struct block_device *part, #endif /* CONFIG_FAIL_MAKE_REQUEST */ -static inline bool bio_check_ro(struct bio *bio, struct block_device *part) +static inline bool bio_check_ro(struct bio *bio) { - if (op_is_write(bio_op(bio)) && bdev_read_only(part)) { + if (op_is_write(bio_op(bio)) && bdev_read_only(bio->bi_bdev)) { char b[BDEVNAME_SIZE]; if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) @@ -702,7 +702,7 @@ static inline bool bio_check_ro(struct bio *bio, struct block_device *part) WARN_ONCE(1, "Trying to write to read-only block-device %s (partno %d)\n", - bio_devname(bio, b), part->bd_partno); + bio_devname(bio, b), bio->bi_bdev->bd_partno); /* Older lvm-tools actually trigger this */ return false; } @@ -723,8 +723,9 @@ ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO); * This may well happen - the kernel calls bread() without checking the size of * the device, e.g., when mounting a file system. */ -static inline int bio_check_eod(struct bio *bio, sector_t maxsector) +static inline int bio_check_eod(struct bio *bio) { + sector_t maxsector = bdev_nr_sectors(bio->bi_bdev); unsigned int nr_sectors = bio_sectors(bio); if (nr_sectors && maxsector && @@ -739,28 +740,20 @@ static inline int bio_check_eod(struct bio *bio, sector_t maxsector) /* * Remap block n of partition p to block n+start(p) of the disk. */ -static inline int blk_partition_remap(struct bio *bio) +static int blk_partition_remap(struct bio *bio) { struct block_device *p = bio->bi_bdev; - int ret = -EIO; if (unlikely(should_fail_request(p, bio->bi_iter.bi_size))) - goto out; - if (unlikely(bio_check_ro(bio, p))) - goto out; - + return -EIO; if (bio_sectors(bio)) { - if (bio_check_eod(bio, bdev_nr_sectors(p))) - goto out; bio->bi_iter.bi_sector += p->bd_start_sect; trace_block_bio_remap(bio, p->bd_dev, bio->bi_iter.bi_sector - p->bd_start_sect); } bio->bi_bdev = bdev_whole(p); - ret = 0; -out: - return ret; + return 0; } /* @@ -820,16 +813,12 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio) if (should_fail_bio(bio)) goto end_io; - - if (bio->bi_bdev->bd_partno) { - if (unlikely(blk_partition_remap(bio))) - goto end_io; - } else { - if (unlikely(bio_check_ro(bio, bdev_whole(bdev)))) - goto end_io; - if (unlikely(bio_check_eod(bio, get_capacity(bdev->bd_disk)))) - goto end_io; - } + if (unlikely(bio_check_ro(bio))) + goto end_io; + if (unlikely(bio_check_eod(bio))) + goto end_io; + if (bio->bi_bdev->bd_partno && unlikely(blk_partition_remap(bio))) + goto end_io; /* * Filter flush bio's early so that bio based drivers without flush -- cgit v1.2.3-70-g09d2 From 30c5d3456c272f0de0d7e7eb9fc355fa64a5f649 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 24 Jan 2021 11:02:36 +0100 Subject: block: do not reassig ->bi_bdev when partition remapping There is no good reason to reassign ->bi_bdev when remapping the partition-relative block number to the device wide one, as all the information required by the drivers comes from the gendisk anyway. Keeping the original ->bi_bdev alive will allow to greatly simplify the partition-away I/O accounting. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-core.c | 5 +++-- include/linux/bio.h | 2 ++ include/linux/blk_types.h | 1 + 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 64f69022de96..1c1b97a82caa 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -752,7 +752,7 @@ static int blk_partition_remap(struct bio *bio) bio->bi_iter.bi_sector - p->bd_start_sect); } - bio->bi_bdev = bdev_whole(p); + bio_set_flag(bio, BIO_REMAPPED); return 0; } @@ -817,7 +817,8 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio) goto end_io; if (unlikely(bio_check_eod(bio))) goto end_io; - if (bio->bi_bdev->bd_partno && unlikely(blk_partition_remap(bio))) + if (bio->bi_bdev->bd_partno && !bio_flagged(bio, BIO_REMAPPED) && + unlikely(blk_partition_remap(bio))) goto end_io; /* diff --git a/include/linux/bio.h b/include/linux/bio.h index 12af7aa5db37..2f1155eabaff 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -485,6 +485,7 @@ extern const char *bio_devname(struct bio *bio, char *buffer); #define bio_set_dev(bio, bdev) \ do { \ + bio_clear_flag(bio, BIO_REMAPPED); \ if ((bio)->bi_bdev != (bdev)) \ bio_clear_flag(bio, BIO_THROTTLED); \ (bio)->bi_bdev = (bdev); \ @@ -493,6 +494,7 @@ do { \ #define bio_copy_dev(dst, src) \ do { \ + bio_clear_flag(dst, BIO_REMAPPED); \ (dst)->bi_bdev = (src)->bi_bdev; \ bio_clone_blkg_association(dst, src); \ } while (0) diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 8ebd8be3e050..1bc6f6a01070 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -303,6 +303,7 @@ enum { * of this bio. */ BIO_CGROUP_ACCT, /* has been accounted to a cgroup */ BIO_TRACKED, /* set if bio goes through the rq_qos path */ + BIO_REMAPPED, BIO_FLAG_LAST }; -- cgit v1.2.3-70-g09d2 From 99dfc43ecbf67f12a06512918aaba61d55863efc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 24 Jan 2021 11:02:37 +0100 Subject: block: use ->bi_bdev for bio based I/O accounting Rework the I/O accounting for bio based drivers to use ->bi_bdev. This means all drivers can now simply use bio_start_io_acct to start accounting, and it will take partitions into account automatically. To end I/O account either bio_end_io_acct can be used if the driver never remaps I/O to a different device, or bio_end_io_acct_remapped if the driver did remap the I/O. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-core.c | 23 +++++++++++++---------- drivers/md/bcache/request.c | 34 +++++++++++++++++++++------------- drivers/md/md.c | 8 ++++---- include/linux/blkdev.h | 21 ++++----------------- 4 files changed, 42 insertions(+), 44 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 1c1b97a82caa..9315311c27a9 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1321,14 +1321,17 @@ static unsigned long __part_start_io_acct(struct block_device *part, return now; } -unsigned long part_start_io_acct(struct gendisk *disk, struct block_device **part, - struct bio *bio) +/** + * bio_start_io_acct - start I/O accounting for bio based drivers + * @bio: bio to start account for + * + * Returns the start time that should be passed back to bio_end_io_acct(). + */ +unsigned long bio_start_io_acct(struct bio *bio) { - *part = disk_map_sector_rcu(disk, bio->bi_iter.bi_sector); - - return __part_start_io_acct(*part, bio_sectors(bio), bio_op(bio)); + return __part_start_io_acct(bio->bi_bdev, bio_sectors(bio), bio_op(bio)); } -EXPORT_SYMBOL_GPL(part_start_io_acct); +EXPORT_SYMBOL_GPL(bio_start_io_acct); unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, unsigned int op) @@ -1351,12 +1354,12 @@ static void __part_end_io_acct(struct block_device *part, unsigned int op, part_stat_unlock(); } -void part_end_io_acct(struct block_device *part, struct bio *bio, - unsigned long start_time) +void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time, + struct block_device *orig_bdev) { - __part_end_io_acct(part, bio_op(bio), start_time); + __part_end_io_acct(orig_bdev, bio_op(bio), start_time); } -EXPORT_SYMBOL_GPL(part_end_io_acct); +EXPORT_SYMBOL_GPL(bio_end_io_acct_remapped); void disk_end_io_acct(struct gendisk *disk, unsigned int op, unsigned long start_time) diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index dfc35d6d05ed..29c231758293 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -475,7 +475,7 @@ struct search { unsigned int read_dirty_data:1; unsigned int cache_missed:1; - struct block_device *part; + struct block_device *orig_bdev; unsigned long start_time; struct btree_op op; @@ -670,8 +670,8 @@ static void bio_complete(struct search *s) { if (s->orig_bio) { /* Count on bcache device */ - part_end_io_acct(s->part, s->orig_bio, s->start_time); - + bio_end_io_acct_remapped(s->orig_bio, s->start_time, + s->orig_bdev); trace_bcache_request_end(s->d, s->orig_bio); s->orig_bio->bi_status = s->iop.status; bio_endio(s->orig_bio); @@ -714,7 +714,8 @@ static void search_free(struct closure *cl) } static inline struct search *search_alloc(struct bio *bio, - struct bcache_device *d) + struct bcache_device *d, struct block_device *orig_bdev, + unsigned long start_time) { struct search *s; @@ -732,7 +733,8 @@ static inline struct search *search_alloc(struct bio *bio, s->write = op_is_write(bio_op(bio)); s->read_dirty_data = 0; /* Count on the bcache device */ - s->start_time = part_start_io_acct(d->disk, &s->part, bio); + s->orig_bdev = orig_bdev; + s->start_time = start_time; s->iop.c = d->c; s->iop.bio = NULL; s->iop.inode = d->id; @@ -1074,7 +1076,7 @@ struct detached_dev_io_private { unsigned long start_time; bio_end_io_t *bi_end_io; void *bi_private; - struct block_device *part; + struct block_device *orig_bdev; }; static void detached_dev_end_io(struct bio *bio) @@ -1086,7 +1088,7 @@ static void detached_dev_end_io(struct bio *bio) bio->bi_private = ddip->bi_private; /* Count on the bcache device */ - part_end_io_acct(ddip->part, bio, ddip->start_time); + bio_end_io_acct_remapped(bio, ddip->start_time, ddip->orig_bdev); if (bio->bi_status) { struct cached_dev *dc = container_of(ddip->d, @@ -1099,7 +1101,8 @@ static void detached_dev_end_io(struct bio *bio) bio->bi_end_io(bio); } -static void detached_dev_do_request(struct bcache_device *d, struct bio *bio) +static void detached_dev_do_request(struct bcache_device *d, struct bio *bio, + struct block_device *orig_bdev, unsigned long start_time) { struct detached_dev_io_private *ddip; struct cached_dev *dc = container_of(d, struct cached_dev, disk); @@ -1112,7 +1115,8 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio) ddip = kzalloc(sizeof(struct detached_dev_io_private), GFP_NOIO); ddip->d = d; /* Count on the bcache device */ - ddip->start_time = part_start_io_acct(d->disk, &ddip->part, bio); + ddip->orig_bdev = orig_bdev; + ddip->start_time = start_time; ddip->bi_end_io = bio->bi_end_io; ddip->bi_private = bio->bi_private; bio->bi_end_io = detached_dev_end_io; @@ -1168,8 +1172,10 @@ static void quit_max_writeback_rate(struct cache_set *c, blk_qc_t cached_dev_submit_bio(struct bio *bio) { struct search *s; - struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; + struct block_device *orig_bdev = bio->bi_bdev; + struct bcache_device *d = orig_bdev->bd_disk->private_data; struct cached_dev *dc = container_of(d, struct cached_dev, disk); + unsigned long start_time; int rw = bio_data_dir(bio); if (unlikely((d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags)) || @@ -1194,11 +1200,13 @@ blk_qc_t cached_dev_submit_bio(struct bio *bio) } } + start_time = bio_start_io_acct(bio); + bio_set_dev(bio, dc->bdev); bio->bi_iter.bi_sector += dc->sb.data_offset; if (cached_dev_get(dc)) { - s = search_alloc(bio, d); + s = search_alloc(bio, d, orig_bdev, start_time); trace_bcache_request_start(s->d, bio); if (!bio->bi_iter.bi_size) { @@ -1219,7 +1227,7 @@ blk_qc_t cached_dev_submit_bio(struct bio *bio) } } else /* I/O request sent to backing device */ - detached_dev_do_request(d, bio); + detached_dev_do_request(d, bio, orig_bdev, start_time); return BLK_QC_T_NONE; } @@ -1283,7 +1291,7 @@ blk_qc_t flash_dev_submit_bio(struct bio *bio) return BLK_QC_T_NONE; } - s = search_alloc(bio, d); + s = search_alloc(bio, d, bio->bi_bdev, bio_start_io_acct(bio)); cl = &s->cl; bio = &s->bio.bio; diff --git a/drivers/md/md.c b/drivers/md/md.c index cf06dbb1aa53..7d1bb24add31 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -463,8 +463,8 @@ struct md_io { struct mddev *mddev; bio_end_io_t *orig_bi_end_io; void *orig_bi_private; + struct block_device *orig_bi_bdev; unsigned long start_time; - struct block_device *part; }; static void md_end_io(struct bio *bio) @@ -472,7 +472,7 @@ static void md_end_io(struct bio *bio) struct md_io *md_io = bio->bi_private; struct mddev *mddev = md_io->mddev; - part_end_io_acct(md_io->part, bio, md_io->start_time); + bio_end_io_acct_remapped(bio, md_io->start_time, md_io->orig_bi_bdev); bio->bi_end_io = md_io->orig_bi_end_io; bio->bi_private = md_io->orig_bi_private; @@ -514,12 +514,12 @@ static blk_qc_t md_submit_bio(struct bio *bio) md_io->mddev = mddev; md_io->orig_bi_end_io = bio->bi_end_io; md_io->orig_bi_private = bio->bi_private; + md_io->orig_bi_bdev = bio->bi_bdev; bio->bi_end_io = md_end_io; bio->bi_private = md_io; - md_io->start_time = part_start_io_acct(mddev->gendisk, - &md_io->part, bio); + md_io->start_time = bio_start_io_acct(bio); } /* bio could be mergeable after passing to underlayer */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b55bd534b2e1..4526b9ef8edb 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1954,22 +1954,9 @@ unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, void disk_end_io_acct(struct gendisk *disk, unsigned int op, unsigned long start_time); -unsigned long part_start_io_acct(struct gendisk *disk, - struct block_device **part, struct bio *bio); -void part_end_io_acct(struct block_device *part, struct bio *bio, - unsigned long start_time); - -/** - * bio_start_io_acct - start I/O accounting for bio based drivers - * @bio: bio to start account for - * - * Returns the start time that should be passed back to bio_end_io_acct(). - */ -static inline unsigned long bio_start_io_acct(struct bio *bio) -{ - return disk_start_io_acct(bio->bi_bdev->bd_disk, bio_sectors(bio), - bio_op(bio)); -} +unsigned long bio_start_io_acct(struct bio *bio); +void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time, + struct block_device *orig_bdev); /** * bio_end_io_acct - end I/O accounting for bio based drivers @@ -1978,7 +1965,7 @@ static inline unsigned long bio_start_io_acct(struct bio *bio) */ static inline void bio_end_io_acct(struct bio *bio, unsigned long start_time) { - return disk_end_io_acct(bio->bi_bdev->bd_disk, bio_op(bio), start_time); + return bio_end_io_acct_remapped(bio, start_time, bio->bi_bdev); } int bdev_read_only(struct block_device *bdev); -- cgit v1.2.3-70-g09d2 From 0b6e522cdc4a76352e5f02fc2d92198f03254425 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 24 Jan 2021 11:02:38 +0100 Subject: blk-mq: use ->bi_bdev for I/O accounting Remove the reverse map from a sector to a partition for I/O accounting by simply using ->bi_bdev. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-core.c | 6 +++++- block/blk.h | 2 -- block/genhd.c | 48 ------------------------------------------------ 3 files changed, 5 insertions(+), 51 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 9315311c27a9..6dfbdde6b9ff 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1298,7 +1298,11 @@ void blk_account_io_start(struct request *rq) if (!blk_do_io_stat(rq)) return; - rq->part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); + /* passthrough requests can hold bios that do not have ->bi_bdev set */ + if (rq->bio && rq->bio->bi_bdev) + rq->part = rq->bio->bi_bdev; + else + rq->part = rq->rq_disk->part0; part_stat_lock(); update_io_ticks(rq->part, jiffies, false); diff --git a/block/blk.h b/block/blk.h index 10ab7c0d0766..d965cacc5bda 100644 --- a/block/blk.h +++ b/block/blk.h @@ -333,8 +333,6 @@ void blk_queue_free_zone_bitmaps(struct request_queue *q); static inline void blk_queue_free_zone_bitmaps(struct request_queue *q) {} #endif -struct block_device *disk_map_sector_rcu(struct gendisk *disk, sector_t sector); - int blk_alloc_devt(struct block_device *part, dev_t *devt); void blk_free_devt(dev_t devt); char *disk_name(struct gendisk *hd, int partno, char *buf); diff --git a/block/genhd.c b/block/genhd.c index e536d0b4bbae..e46de616a19e 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -285,54 +285,6 @@ void disk_part_iter_exit(struct disk_part_iter *piter) } EXPORT_SYMBOL_GPL(disk_part_iter_exit); -static inline int sector_in_part(struct block_device *part, sector_t sector) -{ - return part->bd_start_sect <= sector && - sector < part->bd_start_sect + bdev_nr_sectors(part); -} - -/** - * disk_map_sector_rcu - map sector to partition - * @disk: gendisk of interest - * @sector: sector to map - * - * Find out which partition @sector maps to on @disk. This is - * primarily used for stats accounting. - * - * CONTEXT: - * RCU read locked. - * - * RETURNS: - * Found partition on success, part0 is returned if no partition matches - * or the matched partition is being deleted. - */ -struct block_device *disk_map_sector_rcu(struct gendisk *disk, sector_t sector) -{ - struct disk_part_tbl *ptbl; - struct block_device *part; - int i; - - rcu_read_lock(); - ptbl = rcu_dereference(disk->part_tbl); - - part = rcu_dereference(ptbl->last_lookup); - if (part && sector_in_part(part, sector)) - goto out_unlock; - - for (i = 1; i < ptbl->len; i++) { - part = rcu_dereference(ptbl->part[i]); - if (part && sector_in_part(part, sector)) { - rcu_assign_pointer(ptbl->last_lookup, part); - goto out_unlock; - } - } - - part = disk->part0; -out_unlock: - rcu_read_unlock(); - return part; -} - /** * disk_has_partitions * @disk: gendisk of interest -- cgit v1.2.3-70-g09d2 From bc359d03c7ec1bf3b86d03bafaf6bbb21e6414fd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 24 Jan 2021 11:02:39 +0100 Subject: block: add a disk_uevent helper Add a helper to call kobject_uevent for the disk and all partitions, and unexport the disk_part_iter_* helpers that are now only used in the core block code. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/genhd.c | 27 ++++++++++++++------------- drivers/s390/block/dasd.c | 26 +++++--------------------- include/linux/genhd.h | 2 ++ 3 files changed, 21 insertions(+), 34 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index e46de616a19e..7094612c7510 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -203,7 +203,6 @@ void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk, rcu_read_unlock(); } -EXPORT_SYMBOL_GPL(disk_part_iter_init); /** * disk_part_iter_next - proceed iterator to the next partition and return it @@ -266,7 +265,6 @@ struct block_device *disk_part_iter_next(struct disk_part_iter *piter) return piter->part; } -EXPORT_SYMBOL_GPL(disk_part_iter_next); /** * disk_part_iter_exit - finish up partition iteration @@ -283,7 +281,6 @@ void disk_part_iter_exit(struct disk_part_iter *piter) bdput(piter->part); piter->part = NULL; } -EXPORT_SYMBOL_GPL(disk_part_iter_exit); /** * disk_has_partitions @@ -555,6 +552,18 @@ static char *bdevt_str(dev_t devt, char *buf) return buf; } +void disk_uevent(struct gendisk *disk, enum kobject_action action) +{ + struct disk_part_iter piter; + struct block_device *part; + + disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); + while ((part = disk_part_iter_next(&piter))) + kobject_uevent(bdev_kobj(part), action); + disk_part_iter_exit(&piter); +} +EXPORT_SYMBOL_GPL(disk_uevent); + static void disk_scan_partitions(struct gendisk *disk) { struct block_device *bdev; @@ -572,8 +581,6 @@ static void register_disk(struct device *parent, struct gendisk *disk, const struct attribute_group **groups) { struct device *ddev = disk_to_dev(disk); - struct disk_part_iter piter; - struct block_device *part; int err; ddev->parent = parent; @@ -616,15 +623,9 @@ static void register_disk(struct device *parent, struct gendisk *disk, disk_scan_partitions(disk); - /* announce disk after possible partitions are created */ + /* announce the disk and partitions after all partitions are created */ dev_set_uevent_suppress(ddev, 0); - kobject_uevent(&ddev->kobj, KOBJ_ADD); - - /* announce possible partitions */ - disk_part_iter_init(&piter, disk, 0); - while ((part = disk_part_iter_next(&piter))) - kobject_uevent(bdev_kobj(part), KOBJ_ADD); - disk_part_iter_exit(&piter); + disk_uevent(disk, KOBJ_ADD); if (disk->queue->backing_dev_info->dev) { err = sysfs_create_link(&ddev->kobj, diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index c7eb9a10c680..28c04a4efa66 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -428,23 +428,15 @@ static int dasd_state_unfmt_to_basic(struct dasd_device *device) static int dasd_state_ready_to_online(struct dasd_device * device) { - struct gendisk *disk; - struct disk_part_iter piter; - struct block_device *part; - device->state = DASD_STATE_ONLINE; if (device->block) { dasd_schedule_block_bh(device->block); if ((device->features & DASD_FEATURE_USERAW)) { - disk = device->block->gdp; - kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); + kobject_uevent(&disk_to_dev(device->block->gdp)->kobj, + KOBJ_CHANGE); return 0; } - disk = device->block->bdev->bd_disk; - disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); - while ((part = disk_part_iter_next(&piter))) - kobject_uevent(bdev_kobj(part), KOBJ_CHANGE); - disk_part_iter_exit(&piter); + disk_uevent(device->block->bdev->bd_disk, KOBJ_CHANGE); } return 0; } @@ -455,9 +447,6 @@ dasd_state_ready_to_online(struct dasd_device * device) static int dasd_state_online_to_ready(struct dasd_device *device) { int rc; - struct gendisk *disk; - struct disk_part_iter piter; - struct block_device *part; if (device->discipline->online_to_ready) { rc = device->discipline->online_to_ready(device); @@ -466,13 +455,8 @@ static int dasd_state_online_to_ready(struct dasd_device *device) } device->state = DASD_STATE_READY; - if (device->block && !(device->features & DASD_FEATURE_USERAW)) { - disk = device->block->bdev->bd_disk; - disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); - while ((part = disk_part_iter_next(&piter))) - kobject_uevent(bdev_kobj(part), KOBJ_CHANGE); - disk_part_iter_exit(&piter); - } + if (device->block && !(device->features & DASD_FEATURE_USERAW)) + disk_uevent(device->block->bdev->bd_disk, KOBJ_CHANGE); return 0; } diff --git a/include/linux/genhd.h b/include/linux/genhd.h index a62ccbfac54b..670eaef0e876 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -213,6 +213,8 @@ static inline dev_t disk_devt(struct gendisk *disk) return MKDEV(disk->major, disk->first_minor); } +void disk_uevent(struct gendisk *disk, enum kobject_action action); + /* * Smarter partition iterator without context limits. */ -- cgit v1.2.3-70-g09d2 From 0470dd9d5f103e7f1d5ba8f755f687c3106c7df1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 24 Jan 2021 11:02:40 +0100 Subject: block: remove DISK_PITER_REVERSE There is good reason to iterate backwards when deleting all partitions in del_gendisk, just like we don't in blk_drop_partitions. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/genhd.c | 37 +++++++------------------------------ include/linux/genhd.h | 1 - 2 files changed, 7 insertions(+), 31 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 7094612c7510..1832add5c738 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -184,24 +184,13 @@ static struct block_device *__disk_get_part(struct gendisk *disk, int partno) void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk, unsigned int flags) { - struct disk_part_tbl *ptbl; - - rcu_read_lock(); - ptbl = rcu_dereference(disk->part_tbl); - piter->disk = disk; piter->part = NULL; - - if (flags & DISK_PITER_REVERSE) - piter->idx = ptbl->len - 1; - else if (flags & (DISK_PITER_INCL_PART0 | DISK_PITER_INCL_EMPTY_PART0)) + if (flags & (DISK_PITER_INCL_PART0 | DISK_PITER_INCL_EMPTY_PART0)) piter->idx = 0; else piter->idx = 1; - piter->flags = flags; - - rcu_read_unlock(); } /** @@ -216,7 +205,6 @@ void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk, struct block_device *disk_part_iter_next(struct disk_part_iter *piter) { struct disk_part_tbl *ptbl; - int inc, end; /* put the last partition */ disk_part_iter_exit(piter); @@ -225,21 +213,8 @@ struct block_device *disk_part_iter_next(struct disk_part_iter *piter) rcu_read_lock(); ptbl = rcu_dereference(piter->disk->part_tbl); - /* determine iteration parameters */ - if (piter->flags & DISK_PITER_REVERSE) { - inc = -1; - if (piter->flags & (DISK_PITER_INCL_PART0 | - DISK_PITER_INCL_EMPTY_PART0)) - end = -1; - else - end = 0; - } else { - inc = 1; - end = ptbl->len; - } - /* iterate to the next partition */ - for (; piter->idx != end; piter->idx += inc) { + for (; piter->idx != ptbl->len; piter->idx += 1) { struct block_device *part; part = rcu_dereference(ptbl->part[piter->idx]); @@ -257,7 +232,10 @@ struct block_device *disk_part_iter_next(struct disk_part_iter *piter) continue; } - piter->idx += inc; + piter->part = bdgrab(part); + if (!piter->part) + continue; + piter->idx += 1; break; } @@ -781,8 +759,7 @@ void del_gendisk(struct gendisk *disk) down_write(&bdev_lookup_sem); /* invalidate stuff */ - disk_part_iter_init(&piter, disk, - DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); + disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) { invalidate_partition(part); delete_partition(part); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 670eaef0e876..51609133c9a3 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -218,7 +218,6 @@ void disk_uevent(struct gendisk *disk, enum kobject_action action); /* * Smarter partition iterator without context limits. */ -#define DISK_PITER_REVERSE (1 << 0) /* iterate in the reverse direction */ #define DISK_PITER_INCL_EMPTY (1 << 1) /* include 0-sized parts */ #define DISK_PITER_INCL_PART0 (1 << 2) /* include partition 0 */ #define DISK_PITER_INCL_EMPTY_PART0 (1 << 3) /* include empty partition 0 */ -- cgit v1.2.3-70-g09d2 From a33df75c6328bf40078b35f2040d8e54d574c357 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 24 Jan 2021 11:02:41 +0100 Subject: block: use an xarray for disk->part_tbl Now that no fast path lookups in the partition table are left, there is no point in micro-optimizing the data structure for it. Just use a bog standard xarray. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-settings.c | 2 +- block/blk.h | 1 - block/genhd.c | 163 ++++-------------------------------------------- block/partitions/core.c | 31 ++------- include/linux/genhd.h | 18 +----- 5 files changed, 22 insertions(+), 193 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index 43990b1d148b..4c974340f1a9 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -865,7 +865,7 @@ void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model) * we do nothing special as far as the block layer is concerned. */ if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || - disk_has_partitions(disk)) + !xa_empty(&disk->part_tbl)) model = BLK_ZONED_NONE; break; case BLK_ZONED_NONE: diff --git a/block/blk.h b/block/blk.h index d965cacc5bda..ab0aaf958553 100644 --- a/block/blk.h +++ b/block/blk.h @@ -345,7 +345,6 @@ int bdev_add_partition(struct block_device *bdev, int partno, int bdev_del_partition(struct block_device *bdev, int partno); int bdev_resize_partition(struct block_device *bdev, int partno, sector_t start, sector_t length); -int disk_expand_part_tbl(struct gendisk *disk, int target); int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, diff --git a/block/genhd.c b/block/genhd.c index 1832add5c738..d3ef29fbc536 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -161,15 +161,6 @@ static void part_in_flight_rw(struct block_device *part, inflight[1] = 0; } -static struct block_device *__disk_get_part(struct gendisk *disk, int partno) -{ - struct disk_part_tbl *ptbl = rcu_dereference(disk->part_tbl); - - if (unlikely(partno < 0 || partno >= ptbl->len)) - return NULL; - return rcu_dereference(ptbl->part[partno]); -} - /** * disk_part_iter_init - initialize partition iterator * @piter: iterator to initialize @@ -204,41 +195,26 @@ void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk, */ struct block_device *disk_part_iter_next(struct disk_part_iter *piter) { - struct disk_part_tbl *ptbl; + struct block_device *part; + unsigned long idx; /* put the last partition */ disk_part_iter_exit(piter); - /* get part_tbl */ rcu_read_lock(); - ptbl = rcu_dereference(piter->disk->part_tbl); - - /* iterate to the next partition */ - for (; piter->idx != ptbl->len; piter->idx += 1) { - struct block_device *part; - - part = rcu_dereference(ptbl->part[piter->idx]); - if (!part) - continue; - piter->part = bdgrab(part); - if (!piter->part) - continue; + xa_for_each_start(&piter->disk->part_tbl, idx, part, piter->idx) { if (!bdev_nr_sectors(part) && !(piter->flags & DISK_PITER_INCL_EMPTY) && !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 && - piter->idx == 0)) { - bdput(piter->part); - piter->part = NULL; + piter->idx == 0)) continue; - } piter->part = bdgrab(part); if (!piter->part) continue; - piter->idx += 1; + piter->idx = idx + 1; break; } - rcu_read_unlock(); return piter->part; @@ -260,42 +236,6 @@ void disk_part_iter_exit(struct disk_part_iter *piter) piter->part = NULL; } -/** - * disk_has_partitions - * @disk: gendisk of interest - * - * Walk through the partition table and check if valid partition exists. - * - * CONTEXT: - * Don't care. - * - * RETURNS: - * True if the gendisk has at least one valid non-zero size partition. - * Otherwise false. - */ -bool disk_has_partitions(struct gendisk *disk) -{ - struct disk_part_tbl *ptbl; - int i; - bool ret = false; - - rcu_read_lock(); - ptbl = rcu_dereference(disk->part_tbl); - - /* Iterate partitions skipping the whole device at index 0 */ - for (i = 1; i < ptbl->len; i++) { - if (rcu_dereference(ptbl->part[i])) { - ret = true; - break; - } - } - - rcu_read_unlock(); - - return ret; -} -EXPORT_SYMBOL_GPL(disk_has_partitions); - /* * Can be deleted altogether. Later. * @@ -858,7 +798,7 @@ struct block_device *bdget_disk(struct gendisk *disk, int partno) struct block_device *bdev = NULL; rcu_read_lock(); - bdev = __disk_get_part(disk, partno); + bdev = xa_load(&disk->part_tbl, partno); if (bdev && !bdgrab(bdev)) bdev = NULL; rcu_read_unlock(); @@ -1248,83 +1188,6 @@ static const struct attribute_group *disk_attr_groups[] = { NULL }; -/** - * disk_replace_part_tbl - replace disk->part_tbl in RCU-safe way - * @disk: disk to replace part_tbl for - * @new_ptbl: new part_tbl to install - * - * Replace disk->part_tbl with @new_ptbl in RCU-safe way. The - * original ptbl is freed using RCU callback. - * - * LOCKING: - * Matching bd_mutex locked or the caller is the only user of @disk. - */ -static void disk_replace_part_tbl(struct gendisk *disk, - struct disk_part_tbl *new_ptbl) -{ - struct disk_part_tbl *old_ptbl = - rcu_dereference_protected(disk->part_tbl, 1); - - rcu_assign_pointer(disk->part_tbl, new_ptbl); - - if (old_ptbl) { - rcu_assign_pointer(old_ptbl->last_lookup, NULL); - kfree_rcu(old_ptbl, rcu_head); - } -} - -/** - * disk_expand_part_tbl - expand disk->part_tbl - * @disk: disk to expand part_tbl for - * @partno: expand such that this partno can fit in - * - * Expand disk->part_tbl such that @partno can fit in. disk->part_tbl - * uses RCU to allow unlocked dereferencing for stats and other stuff. - * - * LOCKING: - * Matching bd_mutex locked or the caller is the only user of @disk. - * Might sleep. - * - * RETURNS: - * 0 on success, -errno on failure. - */ -int disk_expand_part_tbl(struct gendisk *disk, int partno) -{ - struct disk_part_tbl *old_ptbl = - rcu_dereference_protected(disk->part_tbl, 1); - struct disk_part_tbl *new_ptbl; - int len = old_ptbl ? old_ptbl->len : 0; - int i, target; - - /* - * check for int overflow, since we can get here from blkpg_ioctl() - * with a user passed 'partno'. - */ - target = partno + 1; - if (target < 0) - return -EINVAL; - - /* disk_max_parts() is zero during initialization, ignore if so */ - if (disk_max_parts(disk) && target > disk_max_parts(disk)) - return -EINVAL; - - if (target <= len) - return 0; - - new_ptbl = kzalloc_node(struct_size(new_ptbl, part, target), GFP_KERNEL, - disk->node_id); - if (!new_ptbl) - return -ENOMEM; - - new_ptbl->len = target; - - for (i = 0; i < len; i++) - rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]); - - disk_replace_part_tbl(disk, new_ptbl); - return 0; -} - /** * disk_release - releases all allocated resources of the gendisk * @dev: the device representing this disk @@ -1348,7 +1211,7 @@ static void disk_release(struct device *dev) blk_free_devt(dev->devt); disk_release_events(disk); kfree(disk->random); - disk_replace_part_tbl(disk, NULL); + xa_destroy(&disk->part_tbl); bdput(disk->part0); if (disk->queue) blk_put_queue(disk->queue); @@ -1501,7 +1364,6 @@ dev_t blk_lookup_devt(const char *name, int partno) struct gendisk *__alloc_disk_node(int minors, int node_id) { struct gendisk *disk; - struct disk_part_tbl *ptbl; if (minors > DISK_MAX_PARTS) { printk(KERN_ERR @@ -1519,11 +1381,9 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) goto out_free_disk; disk->node_id = node_id; - if (disk_expand_part_tbl(disk, 0)) - goto out_bdput; - - ptbl = rcu_dereference_protected(disk->part_tbl, 1); - rcu_assign_pointer(ptbl->part[0], disk->part0); + xa_init(&disk->part_tbl); + if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL)) + goto out_destroy_part_tbl; disk->minors = minors; rand_initialize_disk(disk); @@ -1532,7 +1392,8 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) device_initialize(disk_to_dev(disk)); return disk; -out_bdput: +out_destroy_part_tbl: + xa_destroy(&disk->part_tbl); bdput(disk->part0); out_free_disk: kfree(disk); diff --git a/block/partitions/core.c b/block/partitions/core.c index 168d5906077c..b1cdf88f96e2 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -287,13 +287,7 @@ struct device_type part_type = { */ void delete_partition(struct block_device *part) { - struct gendisk *disk = part->bd_disk; - struct disk_part_tbl *ptbl = - rcu_dereference_protected(disk->part_tbl, 1); - - rcu_assign_pointer(ptbl->part[part->bd_partno], NULL); - rcu_assign_pointer(ptbl->last_lookup, NULL); - + xa_erase(&part->bd_disk->part_tbl, part->bd_partno); kobject_put(part->bd_holder_dir); device_del(&part->bd_device); @@ -325,7 +319,6 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, struct device *ddev = disk_to_dev(disk); struct device *pdev; struct block_device *bdev; - struct disk_part_tbl *ptbl; const char *dname; int err; @@ -347,12 +340,7 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, break; } - err = disk_expand_part_tbl(disk, partno); - if (err) - return ERR_PTR(err); - ptbl = rcu_dereference_protected(disk->part_tbl, 1); - - if (ptbl->part[partno]) + if (xa_load(&disk->part_tbl, partno)) return ERR_PTR(-EBUSY); bdev = bdev_alloc(disk, partno); @@ -405,8 +393,10 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, } /* everything is up and running, commence */ + err = xa_insert(&disk->part_tbl, partno, bdev, GFP_KERNEL); + if (err) + goto out_del; bdev_add(bdev, devt); - rcu_assign_pointer(ptbl->part[partno], bdev); /* suppress uevent if the disk suppresses it */ if (!dev_get_uevent_suppress(ddev)) @@ -612,7 +602,7 @@ static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev, int blk_add_partitions(struct gendisk *disk, struct block_device *bdev) { struct parsed_partitions *state; - int ret = -EAGAIN, p, highest; + int ret = -EAGAIN, p; if (!disk_part_scan_enabled(disk)) return 0; @@ -660,15 +650,6 @@ int blk_add_partitions(struct gendisk *disk, struct block_device *bdev) /* tell userspace that the media / partition table may have changed */ kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); - /* - * Detect the highest partition number and preallocate disk->part_tbl. - * This is an optimization and not strictly necessary. - */ - for (p = 1, highest = 0; p < state->limit; p++) - if (state->parts[p].size) - highest = p; - disk_expand_part_tbl(disk, highest); - for (p = 1; p < state->limit; p++) if (!blk_add_partition(disk, bdev, state, p)) goto out_free_state; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 51609133c9a3..f364619092cc 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -32,6 +32,7 @@ extern struct class block_class; #include #include #include +#include #define PARTITION_META_INFO_VOLNAMELTH 64 /* @@ -116,13 +117,6 @@ enum { DISK_EVENT_FLAG_UEVENT = 1 << 1, }; -struct disk_part_tbl { - struct rcu_head rcu_head; - int len; - struct block_device __rcu *last_lookup; - struct block_device __rcu *part[]; -}; - struct disk_events; struct badblocks; @@ -148,12 +142,7 @@ struct gendisk { unsigned short events; /* supported events */ unsigned short event_flags; /* flags related to event processing */ - /* Array of pointers to partitions indexed by partno. - * Protected with matching bdev lock but stat and other - * non-critical accesses use RCU. Always access through - * helpers. - */ - struct disk_part_tbl __rcu *part_tbl; + struct xarray part_tbl; struct block_device *part0; const struct block_device_operations *fops; @@ -225,7 +214,7 @@ void disk_uevent(struct gendisk *disk, enum kobject_action action); struct disk_part_iter { struct gendisk *disk; struct block_device *part; - int idx; + unsigned long idx; unsigned int flags; }; @@ -233,7 +222,6 @@ extern void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk, unsigned int flags); struct block_device *disk_part_iter_next(struct disk_part_iter *piter); extern void disk_part_iter_exit(struct disk_part_iter *piter); -extern bool disk_has_partitions(struct gendisk *disk); /* block/genhd.c */ extern void device_add_disk(struct device *parent, struct gendisk *disk, -- cgit v1.2.3-70-g09d2 From b5f74ecacc3139ef873e69acc3aba28083ecc416 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Fri, 22 Jan 2021 19:19:43 +0100 Subject: block, bfq: use half slice_idle as a threshold to check short ttime The value of the I/O plugging (idling) timeout is used also as the think-time threshold to decide whether a process has a short think time. In this respect, a good value of this timeout for rotational drives is un the order of several ms. Yet, this is often too long a time interval to be effective as a think-time threshold. This commit mitigates this problem (by a lot, according to tests), by halving the threshold. Tested-by: Jan Kara Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 9e4eb0fc1c16..eb2ca32d5b63 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5238,12 +5238,13 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd, return; /* Think time is infinite if no process is linked to - * bfqq. Otherwise check average think time to - * decide whether to mark as has_short_ttime + * bfqq. Otherwise check average think time to decide whether + * to mark as has_short_ttime. To this goal, compare average + * think time with half the I/O-plugging timeout. */ if (atomic_read(&bic->icq.ioc->active_ref) == 0 || (bfq_sample_valid(bfqq->ttime.ttime_samples) && - bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle)) + bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle>>1)) has_short_ttime = false; state_changed = has_short_ttime != bfq_bfqq_has_short_ttime(bfqq); -- cgit v1.2.3-70-g09d2 From d4fc3640ff361a09e359867e0bca898abd2b7ecb Mon Sep 17 00:00:00 2001 From: Jia Cheng Hu Date: Fri, 22 Jan 2021 19:19:44 +0100 Subject: block, bfq: set next_rq to waker_bfqq->next_rq in waker injection Since commit c5089591c3ba ("block, bfq: detect wakers and unconditionally inject their I/O"), when the in-service bfq_queue, say Q, is temporarily empty, BFQ checks whether there are I/O requests to inject (also) from the waker bfq_queue for Q. To this goal, the value pointed by bfqq->waker_bfqq->next_rq must be controlled. However, the current implementation mistakenly looks at bfqq->next_rq, which instead points to the next request of the currently served queue. This mistake evidently causes losses of throughput in scenarios with waker bfq_queues. This commit corrects this mistake. Fixes: c5089591c3ba ("block, bfq: detect wakers and unconditionally inject their I/O") Signed-off-by: Jia Cheng Hu Signed-off-by: Jan Kara Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index eb2ca32d5b63..fdc5e163b2fe 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -4499,7 +4499,7 @@ check_queue: bfqq = bfqq->bic->bfqq[0]; else if (bfq_bfqq_has_waker(bfqq) && bfq_bfqq_busy(bfqq->waker_bfqq) && - bfqq->next_rq && + bfqq->waker_bfqq->next_rq && bfq_serv_to_charge(bfqq->waker_bfqq->next_rq, bfqq->waker_bfqq) <= bfq_bfqq_budget_left(bfqq->waker_bfqq) -- cgit v1.2.3-70-g09d2 From ab1fb47e33dc7754a7593181ffe0742c7105ea9a Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Fri, 22 Jan 2021 19:19:45 +0100 Subject: block, bfq: increase time window for waker detection Tests on slower machines showed current window to be way too small. This commit increases it. Tested-by: Jan Kara Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index fdc5e163b2fe..43e2c39cf7b5 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1931,7 +1931,7 @@ static void bfq_add_request(struct request *rq) if (bfqd->last_completed_rq_bfqq && !bfq_bfqq_has_short_ttime(bfqq) && ktime_get_ns() - bfqd->last_completion < - 200 * NSEC_PER_USEC) { + 4 * NSEC_PER_MSEC) { if (bfqd->last_completed_rq_bfqq != bfqq && bfqd->last_completed_rq_bfqq != bfqq->waker_bfqq) { -- cgit v1.2.3-70-g09d2 From 91b896f65d32610d6d58af02170b15f8d37a7702 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Fri, 22 Jan 2021 19:19:46 +0100 Subject: block, bfq: do not raise non-default weights BFQ heuristics try to detect interactive I/O, and raise the weight of the queues containing such an I/O. Yet, if also the user changes the weight of a queue (i.e., the user changes the ioprio of the process associated with that queue), then it is most likely better to prevent BFQ heuristics from silently changing the same weight. Tested-by: Jan Kara Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 43e2c39cf7b5..161badb744d6 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1671,15 +1671,19 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, * - it is sync, * - it does not belong to a large burst, * - it has been idle for enough time or is soft real-time, - * - is linked to a bfq_io_cq (it is not shared in any sense). + * - is linked to a bfq_io_cq (it is not shared in any sense), + * - has a default weight (otherwise we assume the user wanted + * to control its weight explicitly) */ in_burst = bfq_bfqq_in_large_burst(bfqq); soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && !BFQQ_TOTALLY_SEEKY(bfqq) && !in_burst && time_is_before_jiffies(bfqq->soft_rt_next_start) && - bfqq->dispatched == 0; - *interactive = !in_burst && idle_for_long_time; + bfqq->dispatched == 0 && + bfqq->entity.new_weight == 40; + *interactive = !in_burst && idle_for_long_time && + bfqq->entity.new_weight == 40; wr_or_deserves_wr = bfqd->low_latency && (bfqq->wr_coeff > 1 || (bfq_bfqq_sync(bfqq) && -- cgit v1.2.3-70-g09d2 From 3c337690d2ebb7a01fa13bfa59ce4911f358df42 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Fri, 22 Jan 2021 19:19:47 +0100 Subject: block, bfq: avoid spurious switches to soft_rt of interactive queues BFQ tags some bfq_queues as interactive or soft_rt if it deems that these bfq_queues contain the I/O of, respectively, interactive or soft real-time applications. BFQ privileges both these special types of bfq_queues over normal bfq_queues. To privilege a bfq_queue, BFQ mainly raises the weight of the bfq_queue. In particular, soft_rt bfq_queues get a higher weight than interactive bfq_queues. A bfq_queue may turn from interactive to soft_rt. And this leads to a tricky issue. Soft real-time applications usually start with an I/O-bound, interactive phase, in which they load themselves into main memory. BFQ correctly detects this phase, and keeps the bfq_queues associated with the application in interactive mode for a while. Problems arise when the I/O pattern of the application finally switches to soft real-time. One of the conditions for a bfq_queue to be deemed as soft_rt is that the bfq_queue does not consume too much bandwidth. But the bfq_queues associated with a soft real-time application consume as much bandwidth as they can in the loading phase of the application. So, after the application becomes truly soft real-time, a lot of time should pass before the average bandwidth consumed by its bfq_queues finally drops to a value acceptable for soft_rt bfq_queues. As a consequence, there might be a time gap during which the application is not privileged at all, because its bfq_queues are not interactive any longer, but cannot be deemed as soft_rt yet. To avoid this problem, BFQ pretends that an interactive bfq_queue consumes zero bandwidth, and allows an interactive bfq_queue to switch to soft_rt. Yet, this fake zero-bandwidth consumption easily causes the bfq_queue to often switch to soft_rt deceptively, during its loading phase. As in soft_rt mode, the bfq_queue gets its bandwidth correctly computed, and therefore soon switches back to interactive. Then it switches again to soft_rt, and so on. These spurious fluctuations usually cause losses of throughput, because they deceive BFQ's mechanisms for boosting throughput (injection, I/O-plugging avoidance, ...). This commit addresses this issue as follows: 1) It does compute actual bandwidth consumption also for interactive bfq_queues. This avoids the above false positives. 2) When a bfq_queue switches from interactive to normal mode, the consumed bandwidth is reset (forgotten). This allows the bfq_queue to enjoy soft_rt very quickly. In particular, two alternatives are possible in this switch: - the bfq_queue still has backlog, and therefore there is a budget already scheduled to serve the bfq_queue; in this case, the scheduling of the current budget of the bfq_queue is not hindered, because only the scheduling of the next budget will be affected by the weight drop. After that, if the bfq_queue is actually in a soft_rt phase, and becomes empty during the service of its current budget, which is the natural behavior of a soft_rt bfq_queue, then the bfq_queue will be considered as soft_rt when its next I/O arrives. If, in contrast, the bfq_queue remains constantly non-empty, then its next budget will be scheduled with a low weight, which is the natural treatment for an I/O-bound (non soft_rt) bfq_queue. - the bfq_queue is empty; in this case, the bfq_queue may be considered unjustly soft_rt when its new I/O arrives. Yet the problem is now much smaller than before, because it is unlikely that more than one spurious fluctuation occurs. Tested-by: Jan Kara Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 57 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 20 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 161badb744d6..003c96fa01ad 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2356,6 +2356,24 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, /* Must be called with bfqq != NULL */ static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) { + /* + * If bfqq has been enjoying interactive weight-raising, then + * reset soft_rt_next_start. We do it for the following + * reason. bfqq may have been conveying the I/O needed to load + * a soft real-time application. Such an application actually + * exhibits a soft real-time I/O pattern after it finishes + * loading, and finally starts doing its job. But, if bfqq has + * been receiving a lot of bandwidth so far (likely to happen + * on a fast device), then soft_rt_next_start now contains a + * high value that. So, without this reset, bfqq would be + * prevented from being possibly considered as soft_rt for a + * very long time. + */ + + if (bfqq->wr_cur_max_time != + bfqq->bfqd->bfq_wr_rt_max_time) + bfqq->soft_rt_next_start = jiffies; + if (bfq_bfqq_busy(bfqq)) bfqq->bfqd->wr_busy_queues--; bfqq->wr_coeff = 1; @@ -3956,30 +3974,15 @@ void bfq_bfqq_expire(struct bfq_data *bfqd, * If we get here, and there are no outstanding * requests, then the request pattern is isochronous * (see the comments on the function - * bfq_bfqq_softrt_next_start()). Thus we can compute - * soft_rt_next_start. And we do it, unless bfqq is in - * interactive weight raising. We do not do it in the - * latter subcase, for the following reason. bfqq may - * be conveying the I/O needed to load a soft - * real-time application. Such an application will - * actually exhibit a soft real-time I/O pattern after - * it finally starts doing its job. But, if - * soft_rt_next_start is computed here for an - * interactive bfqq, and bfqq had received a lot of - * service before remaining with no outstanding - * request (likely to happen on a fast device), then - * soft_rt_next_start would be assigned such a high - * value that, for a very long time, bfqq would be - * prevented from being possibly considered as soft - * real time. + * bfq_bfqq_softrt_next_start()). Therefore we can + * compute soft_rt_next_start. * * If, instead, the queue still has outstanding * requests, then we have to wait for the completion * of all the outstanding requests to discover whether * the request pattern is actually isochronous. */ - if (bfqq->dispatched == 0 && - bfqq->wr_coeff != bfqd->bfq_wr_coeff) + if (bfqq->dispatched == 0) bfqq->soft_rt_next_start = bfq_bfqq_softrt_next_start(bfqd, bfqq); else if (bfqq->dispatched > 0) { @@ -4563,9 +4566,21 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfqq->wr_cur_max_time)) { if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time || time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt + - bfq_wr_duration(bfqd))) + bfq_wr_duration(bfqd))) { + /* + * Either in interactive weight + * raising, or in soft_rt weight + * raising with the + * interactive-weight-raising period + * elapsed (so no switch back to + * interactive weight raising). + */ bfq_bfqq_end_wr(bfqq); - else { + } else { /* + * soft_rt finishing while still in + * interactive period, switch back to + * interactive weight raising + */ switch_back_to_interactive_wr(bfqq, bfqd); bfqq->entity.prio_changed = 1; } @@ -5016,6 +5031,8 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) } bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); + bfq_log_bfqq(bfqd, bfqq, "new_ioprio %d new_weight %d", + bfqq->new_ioprio, bfqq->entity.new_weight); bfqq->entity.prio_changed = 1; } -- cgit v1.2.3-70-g09d2 From 2391d13ed484df1515f0025458e1f82317823fab Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Fri, 22 Jan 2021 19:19:48 +0100 Subject: block, bfq: do not expire a queue when it is the only busy one This commits preserves I/O-dispatch plugging for a special symmetric case that may suddenly turn into asymmetric: the case where only one bfq_queue, say bfqq, is busy. In this case, not expiring bfqq does not cause any harm to any other queues in terms of service guarantees. In contrast, it avoids the following unlucky sequence of events: (1) bfqq is expired, (2) a new queue with a lower weight than bfqq becomes busy (or more queues), (3) the new queue is served until a new request arrives for bfqq, (4) when bfqq is finally served, there are so many requests of the new queue in the drive that the pending requests for bfqq take a lot of time to be served. In particular, event (2) may case even already dispatched requests of bfqq to be delayed, inside the drive. So, to avoid this series of events, the scenario is preventively declared as asymmetric also if bfqq is the only busy queues. By doing so, I/O-dispatch plugging is performed for bfqq. Tested-by: Jan Kara Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 003c96fa01ad..c045613ce927 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -3464,20 +3464,38 @@ static void bfq_dispatch_remove(struct request_queue *q, struct request *rq) * order until all the requests already queued in the device have been * served. The last sub-condition commented above somewhat mitigates * this problem for weight-raised queues. + * + * However, as an additional mitigation for this problem, we preserve + * plugging for a special symmetric case that may suddenly turn into + * asymmetric: the case where only bfqq is busy. In this case, not + * expiring bfqq does not cause any harm to any other queues in terms + * of service guarantees. In contrast, it avoids the following unlucky + * sequence of events: (1) bfqq is expired, (2) a new queue with a + * lower weight than bfqq becomes busy (or more queues), (3) the new + * queue is served until a new request arrives for bfqq, (4) when bfqq + * is finally served, there are so many requests of the new queue in + * the drive that the pending requests for bfqq take a lot of time to + * be served. In particular, event (2) may case even already + * dispatched requests of bfqq to be delayed, inside the drive. So, to + * avoid this series of events, the scenario is preventively declared + * as asymmetric also if bfqq is the only busy queues */ static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd, struct bfq_queue *bfqq) { + int tot_busy_queues = bfq_tot_busy_queues(bfqd); + /* No point in idling for bfqq if it won't get requests any longer */ if (unlikely(!bfqq_process_refs(bfqq))) return false; return (bfqq->wr_coeff > 1 && (bfqd->wr_busy_queues < - bfq_tot_busy_queues(bfqd) || + tot_busy_queues || bfqd->rq_in_driver >= bfqq->dispatched + 4)) || - bfq_asymmetric_scenario(bfqd, bfqq); + bfq_asymmetric_scenario(bfqd, bfqq) || + tot_busy_queues == 1; } static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, -- cgit v1.2.3-70-g09d2 From 5ac83c644f5fb924f0b2c09102ab82fc788f8411 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 11 Jan 2021 17:47:16 +0100 Subject: Revert "blk-mq, elevator: Count requests per hctx to improve performance" This reverts commit b445547ec1bbd3e7bf4b1c142550942f70527d95. Since both mq-deadline and BFQ completely ignore hctx they are passed to their dispatch function and dispatch whatever request they deem fit checking whether any request for a particular hctx is queued is just pointless since we'll very likely get a request from a different hctx anyway. In the following commit we'll deal with lock contention in these IO schedulers in presence of multiple HW queues in a different way. Signed-off-by: Jan Kara Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 5 ----- block/blk-mq.c | 1 - block/mq-deadline.c | 6 ------ include/linux/blk-mq.h | 4 ---- 4 files changed, 16 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index c045613ce927..b12a416b51d7 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -4677,9 +4677,6 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx) { struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; - if (!atomic_read(&hctx->elevator_queued)) - return false; - /* * Avoiding lock: a race on bfqd->busy_queues should cause at * most a call to dispatch for nothing @@ -5597,7 +5594,6 @@ static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); bfq_insert_request(hctx, rq, at_head); - atomic_inc(&hctx->elevator_queued); } } @@ -5965,7 +5961,6 @@ static void bfq_finish_requeue_request(struct request *rq) bfq_completed_request(bfqq, bfqd); bfq_finish_requeue_request_body(bfqq); - atomic_dec(&rq->mq_hctx->elevator_queued); spin_unlock_irqrestore(&bfqd->lock, flags); } else { diff --git a/block/blk-mq.c b/block/blk-mq.c index 74b17b396f4c..1af6b8a9da5a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2653,7 +2653,6 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, goto free_hctx; atomic_set(&hctx->nr_active, 0); - atomic_set(&hctx->elevator_queued, 0); if (node == NUMA_NO_NODE) node = set->numa_node; hctx->numa_node = node; diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 800ac902809b..b57470e154c8 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -386,8 +386,6 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) spin_lock(&dd->lock); rq = __dd_dispatch_request(dd); spin_unlock(&dd->lock); - if (rq) - atomic_dec(&rq->mq_hctx->elevator_queued); return rq; } @@ -535,7 +533,6 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); dd_insert_request(hctx, rq, at_head); - atomic_inc(&hctx->elevator_queued); } spin_unlock(&dd->lock); } @@ -582,9 +579,6 @@ static bool dd_has_work(struct blk_mq_hw_ctx *hctx) { struct deadline_data *dd = hctx->queue->elevator->elevator_data; - if (!atomic_read(&hctx->elevator_queued)) - return false; - return !list_empty_careful(&dd->dispatch) || !list_empty_careful(&dd->fifo_list[0]) || !list_empty_careful(&dd->fifo_list[1]); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 6b410dab48ee..aabbf6830ffc 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -140,10 +140,6 @@ struct blk_mq_hw_ctx { * shared across request queues. */ atomic_t nr_active; - /** - * @elevator_queued: Number of queued requests on hctx. - */ - atomic_t elevator_queued; /** @cpuhp_online: List to store request if CPU is going to die */ struct hlist_node cpuhp_online; -- cgit v1.2.3-70-g09d2 From b6e68ee82585f2ee890b0a897a6aacbf49a467bb Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 11 Jan 2021 17:47:17 +0100 Subject: blk-mq: Improve performance of non-mq IO schedulers with multiple HW queues Currently when non-mq aware IO scheduler (BFQ, mq-deadline) is used for a queue with multiple HW queues, the performance it rather bad. The problem is that these IO schedulers use queue-wide locking and their dispatch function does not respect the hctx it is passed in and returns any request it finds appropriate. Thus locality of request access is broken and dispatch from multiple CPUs just contends on IO scheduler locks. For these IO schedulers there's little point in dispatching from multiple CPUs. Instead dispatch always only from a single CPU to limit contention. Below is a comparison of dbench runs on XFS filesystem where the storage is a raid card with 64 HW queues and to it attached a single rotating disk. BFQ is used as IO scheduler: clients MQ SQ MQ-Patched Amean 1 39.12 (0.00%) 43.29 * -10.67%* 36.09 * 7.74%* Amean 2 128.58 (0.00%) 101.30 * 21.22%* 96.14 * 25.23%* Amean 4 577.42 (0.00%) 494.47 * 14.37%* 508.49 * 11.94%* Amean 8 610.95 (0.00%) 363.86 * 40.44%* 362.12 * 40.73%* Amean 16 391.78 (0.00%) 261.49 * 33.25%* 282.94 * 27.78%* Amean 32 324.64 (0.00%) 267.71 * 17.54%* 233.00 * 28.23%* Amean 64 295.04 (0.00%) 253.02 * 14.24%* 242.37 * 17.85%* Amean 512 10281.61 (0.00%) 10211.16 * 0.69%* 10447.53 * -1.61%* Numbers are times so lower is better. MQ is stock 5.10-rc6 kernel. SQ is the same kernel with megaraid_sas.host_tagset_enable=0 so that the card advertises just a single HW queue. MQ-Patched is a kernel with this patch applied. You can see multiple hardware queues heavily hurt performance in combination with BFQ. The patch restores the performance. Signed-off-by: Jan Kara Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 66 +++++++++++++++++++++++++++++++++++++++++++----- block/kyber-iosched.c | 1 + include/linux/elevator.h | 2 ++ 3 files changed, 63 insertions(+), 6 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 1af6b8a9da5a..f21d922ecfaf 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1646,6 +1646,42 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) } EXPORT_SYMBOL(blk_mq_run_hw_queue); +/* + * Is the request queue handled by an IO scheduler that does not respect + * hardware queues when dispatching? + */ +static bool blk_mq_has_sqsched(struct request_queue *q) +{ + struct elevator_queue *e = q->elevator; + + if (e && e->type->ops.dispatch_request && + !(e->type->elevator_features & ELEVATOR_F_MQ_AWARE)) + return true; + return false; +} + +/* + * Return prefered queue to dispatch from (if any) for non-mq aware IO + * scheduler. + */ +static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + + /* + * If the IO scheduler does not respect hardware queues when + * dispatching, we just don't bother with multiple HW queues and + * dispatch from hctx for the current CPU since running multiple queues + * just causes lock contention inside the scheduler and pointless cache + * bouncing. + */ + hctx = blk_mq_map_queue_type(q, HCTX_TYPE_DEFAULT, + raw_smp_processor_id()); + if (!blk_mq_hctx_stopped(hctx)) + return hctx; + return NULL; +} + /** * blk_mq_run_hw_queues - Run all hardware queues in a request queue. * @q: Pointer to the request queue to run. @@ -1653,14 +1689,23 @@ EXPORT_SYMBOL(blk_mq_run_hw_queue); */ void blk_mq_run_hw_queues(struct request_queue *q, bool async) { - struct blk_mq_hw_ctx *hctx; + struct blk_mq_hw_ctx *hctx, *sq_hctx; int i; + sq_hctx = NULL; + if (blk_mq_has_sqsched(q)) + sq_hctx = blk_mq_get_sq_hctx(q); queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_hctx_stopped(hctx)) continue; - - blk_mq_run_hw_queue(hctx, async); + /* + * Dispatch from this hctx either if there's no hctx preferred + * by IO scheduler or if it has requests that bypass the + * scheduler. + */ + if (!sq_hctx || sq_hctx == hctx || + !list_empty_careful(&hctx->dispatch)) + blk_mq_run_hw_queue(hctx, async); } } EXPORT_SYMBOL(blk_mq_run_hw_queues); @@ -1672,14 +1717,23 @@ EXPORT_SYMBOL(blk_mq_run_hw_queues); */ void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs) { - struct blk_mq_hw_ctx *hctx; + struct blk_mq_hw_ctx *hctx, *sq_hctx; int i; + sq_hctx = NULL; + if (blk_mq_has_sqsched(q)) + sq_hctx = blk_mq_get_sq_hctx(q); queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_hctx_stopped(hctx)) continue; - - blk_mq_delay_run_hw_queue(hctx, msecs); + /* + * Dispatch from this hctx either if there's no hctx preferred + * by IO scheduler or if it has requests that bypass the + * scheduler. + */ + if (!sq_hctx || sq_hctx == hctx || + !list_empty_careful(&hctx->dispatch)) + blk_mq_delay_run_hw_queue(hctx, msecs); } } EXPORT_SYMBOL(blk_mq_delay_run_hw_queues); diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index dc89199bc8c6..c25c41d0d061 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -1029,6 +1029,7 @@ static struct elevator_type kyber_sched = { #endif .elevator_attrs = kyber_sched_attrs, .elevator_name = "kyber", + .elevator_features = ELEVATOR_F_MQ_AWARE, .elevator_owner = THIS_MODULE, }; diff --git a/include/linux/elevator.h b/include/linux/elevator.h index bacc40a0bdf3..1fe8e105b83b 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -172,6 +172,8 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t); /* Supports zoned block devices sequential write constraint */ #define ELEVATOR_F_ZBD_SEQ_WRITE (1U << 0) +/* Supports scheduling on multiple hardware queues */ +#define ELEVATOR_F_MQ_AWARE (1U << 1) #endif /* CONFIG_BLOCK */ #endif -- cgit v1.2.3-70-g09d2 From 1a23e06cdab2be07cbda460c6417d7de564c48e6 Mon Sep 17 00:00:00 2001 From: huhai Date: Fri, 25 Dec 2020 21:00:16 +0800 Subject: bfq: don't duplicate code for different paths As we can see, returns parent_sched_may_change whether sd->next_in_service changes or not, so remove this judgment. Signed-off-by: huhai Acked-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-wf2q.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 26776bdbdf36..070e34a7feb1 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -137,9 +137,6 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, sd->next_in_service = next_in_service; - if (!next_in_service) - return parent_sched_may_change; - return parent_sched_may_change; } -- cgit v1.2.3-70-g09d2 From 49d1ec8573f74ff1e23df1d5092211de46baa236 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 11 Jan 2021 11:05:52 +0800 Subject: block: manage bio slab cache by xarray Managing bio slab cache via xarray by using slab cache size as xarray index, and storing 'struct bio_slab' instance into xarray. So code is simplified a lot, meantime it becomes more readable than before. Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Reviewed-by: Pavel Begunkov Tested-by: Pavel Begunkov Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/bio.c | 116 +++++++++++++++++++++++++----------------------------------- 1 file changed, 49 insertions(+), 67 deletions(-) diff --git a/block/bio.c b/block/bio.c index 0b70ade17da6..87bf16460e0e 100644 --- a/block/bio.c +++ b/block/bio.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include "blk.h" @@ -58,89 +59,80 @@ struct bio_slab { char name[8]; }; static DEFINE_MUTEX(bio_slab_lock); -static struct bio_slab *bio_slabs; -static unsigned int bio_slab_nr, bio_slab_max; +static DEFINE_XARRAY(bio_slabs); -static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size) +static struct bio_slab *create_bio_slab(unsigned int size) { - unsigned int sz = sizeof(struct bio) + extra_size; - struct kmem_cache *slab = NULL; - struct bio_slab *bslab, *new_bio_slabs; - unsigned int new_bio_slab_max; - unsigned int i, entry = -1; + struct bio_slab *bslab = kzalloc(sizeof(*bslab), GFP_KERNEL); - mutex_lock(&bio_slab_lock); + if (!bslab) + return NULL; - i = 0; - while (i < bio_slab_nr) { - bslab = &bio_slabs[i]; + snprintf(bslab->name, sizeof(bslab->name), "bio-%d", size); + bslab->slab = kmem_cache_create(bslab->name, size, + ARCH_KMALLOC_MINALIGN, SLAB_HWCACHE_ALIGN, NULL); + if (!bslab->slab) + goto fail_alloc_slab; - if (!bslab->slab && entry == -1) - entry = i; - else if (bslab->slab_size == sz) { - slab = bslab->slab; - bslab->slab_ref++; - break; - } - i++; - } + bslab->slab_ref = 1; + bslab->slab_size = size; - if (slab) - goto out_unlock; - - if (bio_slab_nr == bio_slab_max && entry == -1) { - new_bio_slab_max = bio_slab_max << 1; - new_bio_slabs = krealloc(bio_slabs, - new_bio_slab_max * sizeof(struct bio_slab), - GFP_KERNEL); - if (!new_bio_slabs) - goto out_unlock; - bio_slab_max = new_bio_slab_max; - bio_slabs = new_bio_slabs; - } - if (entry == -1) - entry = bio_slab_nr++; + if (!xa_err(xa_store(&bio_slabs, size, bslab, GFP_KERNEL))) + return bslab; - bslab = &bio_slabs[entry]; + kmem_cache_destroy(bslab->slab); - snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry); - slab = kmem_cache_create(bslab->name, sz, ARCH_KMALLOC_MINALIGN, - SLAB_HWCACHE_ALIGN, NULL); - if (!slab) - goto out_unlock; +fail_alloc_slab: + kfree(bslab); + return NULL; +} - bslab->slab = slab; - bslab->slab_ref = 1; - bslab->slab_size = sz; -out_unlock: +static inline unsigned int bs_bio_slab_size(struct bio_set *bs) +{ + return bs->front_pad + sizeof(struct bio) + + BIO_INLINE_VECS * sizeof(struct bio_vec); +} + +static struct kmem_cache *bio_find_or_create_slab(struct bio_set *bs) +{ + unsigned int size = bs_bio_slab_size(bs); + struct bio_slab *bslab; + + mutex_lock(&bio_slab_lock); + bslab = xa_load(&bio_slabs, size); + if (bslab) + bslab->slab_ref++; + else + bslab = create_bio_slab(size); mutex_unlock(&bio_slab_lock); - return slab; + + if (bslab) + return bslab->slab; + return NULL; } static void bio_put_slab(struct bio_set *bs) { struct bio_slab *bslab = NULL; - unsigned int i; + unsigned int slab_size = bs_bio_slab_size(bs); mutex_lock(&bio_slab_lock); - for (i = 0; i < bio_slab_nr; i++) { - if (bs->bio_slab == bio_slabs[i].slab) { - bslab = &bio_slabs[i]; - break; - } - } - + bslab = xa_load(&bio_slabs, slab_size); if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n")) goto out; + WARN_ON_ONCE(bslab->slab != bs->bio_slab); + WARN_ON(!bslab->slab_ref); if (--bslab->slab_ref) goto out; + xa_erase(&bio_slabs, slab_size); + kmem_cache_destroy(bslab->slab); - bslab->slab = NULL; + kfree(bslab); out: mutex_unlock(&bio_slab_lock); @@ -1570,15 +1562,13 @@ int bioset_init(struct bio_set *bs, unsigned int front_pad, int flags) { - unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec); - bs->front_pad = front_pad; spin_lock_init(&bs->rescue_lock); bio_list_init(&bs->rescue_list); INIT_WORK(&bs->rescue_work, bio_alloc_rescue); - bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad); + bs->bio_slab = bio_find_or_create_slab(bs); if (!bs->bio_slab) return -ENOMEM; @@ -1642,16 +1632,8 @@ static void __init biovec_init_slabs(void) static int __init init_bio(void) { - bio_slab_max = 2; - bio_slab_nr = 0; - bio_slabs = kcalloc(bio_slab_max, sizeof(struct bio_slab), - GFP_KERNEL); - BUILD_BUG_ON(BIO_FLAG_LAST > BVEC_POOL_OFFSET); - if (!bio_slabs) - panic("bio: can't allocate bios\n"); - bio_integrity_init(); biovec_init_slabs(); -- cgit v1.2.3-70-g09d2 From c495a17679523c95f77f13697a71921dd5c224cd Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 11 Jan 2021 11:05:53 +0800 Subject: block: don't pass BIOSET_NEED_BVECS for q->bio_split q->bio_split is only used by bio_split() for fast cloning bio, and no need to allocate bvecs, so remove this flag. Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Reviewed-by: Pavel Begunkov Tested-by: Pavel Begunkov Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-core.c b/block/blk-core.c index 6dfbdde6b9ff..88f608904432 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -531,7 +531,7 @@ struct request_queue *blk_alloc_queue(int node_id) if (q->id < 0) goto fail_q; - ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); + ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, 0); if (ret) goto fail_id; -- cgit v1.2.3-70-g09d2 From 9f180e315a93cde559ac1c9c4c5ce980aa681c1c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 11 Jan 2021 11:05:54 +0800 Subject: block: don't allocate inline bvecs if this bioset needn't bvecs The inline bvecs won't be used if user needn't bvecs by not passing BIOSET_NEED_BVECS, so don't allocate bvecs in this situation. Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Reviewed-by: Pavel Begunkov Tested-by: Pavel Begunkov Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/bio.c | 7 +++++-- include/linux/bio.h | 1 + 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/block/bio.c b/block/bio.c index 87bf16460e0e..8ccda51dd831 100644 --- a/block/bio.c +++ b/block/bio.c @@ -89,8 +89,7 @@ fail_alloc_slab: static inline unsigned int bs_bio_slab_size(struct bio_set *bs) { - return bs->front_pad + sizeof(struct bio) + - BIO_INLINE_VECS * sizeof(struct bio_vec); + return bs->front_pad + sizeof(struct bio) + bs->back_pad; } static struct kmem_cache *bio_find_or_create_slab(struct bio_set *bs) @@ -1563,6 +1562,10 @@ int bioset_init(struct bio_set *bs, int flags) { bs->front_pad = front_pad; + if (flags & BIOSET_NEED_BVECS) + bs->back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec); + else + bs->back_pad = 0; spin_lock_init(&bs->rescue_lock); bio_list_init(&bs->rescue_list); diff --git a/include/linux/bio.h b/include/linux/bio.h index 2f1155eabaff..5d8977aafa19 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -703,6 +703,7 @@ struct bio_set { mempool_t bvec_integrity_pool; #endif + unsigned int back_pad; /* * Deadlock avoidance for stacking block drivers: see comments in * bio_alloc_bioset() for details -- cgit v1.2.3-70-g09d2 From baa2c7c97153b8064dbeeb99f2f72de3a75c90a7 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 11 Jan 2021 11:05:55 +0800 Subject: block: set .bi_max_vecs as actual allocated vector number bvec_alloc() may allocate more bio vectors than requested, so set .bi_max_vecs as actual allocated vector number, instead of the requested number. This way can help fs build bigger bio because new bio often won't be allocated until the current one becomes full. Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Reviewed-by: Pavel Begunkov Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/bio.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/block/bio.c b/block/bio.c index 8ccda51dd831..56a06f94fb63 100644 --- a/block/bio.c +++ b/block/bio.c @@ -505,12 +505,13 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs, goto err_free; bio->bi_flags |= idx << BVEC_POOL_OFFSET; + bio->bi_max_vecs = bvec_nr_vecs(idx); } else if (nr_iovecs) { bvl = bio->bi_inline_vecs; + bio->bi_max_vecs = inline_vecs; } bio->bi_pool = bs; - bio->bi_max_vecs = nr_iovecs; bio->bi_io_vec = bvl; return bio; -- cgit v1.2.3-70-g09d2 From eec716a1c18c796a69db0be5e2a6f282ba5bccd6 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 11 Jan 2021 11:05:56 +0800 Subject: block: move three bvec helpers declaration into private helper bvec_alloc(), bvec_free() and bvec_nr_vecs() are only used inside block layer core functions, no need to declare them in public header. Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Reviewed-by: Pavel Begunkov Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk.h | 4 ++++ include/linux/bio.h | 3 --- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/block/blk.h b/block/blk.h index ab0aaf958553..0198335c5838 100644 --- a/block/blk.h +++ b/block/blk.h @@ -55,6 +55,10 @@ void blk_free_flush_queue(struct blk_flush_queue *q); void blk_freeze_queue(struct request_queue *q); +struct bio_vec *bvec_alloc(gfp_t, int, unsigned long *, mempool_t *); +void bvec_free(mempool_t *, struct bio_vec *, unsigned int); +unsigned int bvec_nr_vecs(unsigned short idx); + static inline bool biovec_phys_mergeable(struct request_queue *q, struct bio_vec *vec1, struct bio_vec *vec2) { diff --git a/include/linux/bio.h b/include/linux/bio.h index 5d8977aafa19..e135b500df5d 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -478,9 +478,6 @@ static inline void zero_fill_bio(struct bio *bio) zero_fill_bio_iter(bio, bio->bi_iter); } -extern struct bio_vec *bvec_alloc(gfp_t, int, unsigned long *, mempool_t *); -extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int); -extern unsigned int bvec_nr_vecs(unsigned short idx); extern const char *bio_devname(struct bio *bio, char *buffer); #define bio_set_dev(bio, bdev) \ -- cgit v1.2.3-70-g09d2 From faa8e2c4fb30f336a289e3cbaa1e9a9dfd92ac8c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 11 Jan 2021 11:05:57 +0800 Subject: bcache: don't pass BIOSET_NEED_BVECS for the 'bio_set' embedded in 'cache_set' This bioset is just for allocating bio only from bio_next_split, and it needn't bvecs, so remove the flag. Cc: linux-bcache@vger.kernel.org Cc: Coly Li Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Acked-by: Coly Li Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 2047a9cccdb5..193fe7652329 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1939,7 +1939,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) goto err; if (bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio), - BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER)) + BIOSET_NEED_RESCUER)) goto err; c->uuids = alloc_meta_bucket_pages(GFP_KERNEL, sb); -- cgit v1.2.3-70-g09d2 From 0f7b4bc6bb1e57c48ef14f1818df947c1612b206 Mon Sep 17 00:00:00 2001 From: Pan Bian Date: Tue, 19 Jan 2021 04:33:11 -0800 Subject: bsg: free the request before return error code Free the request rq before returning error code. Fixes: 972248e9111e ("scsi: bsg-lib: handle bidi requests without block layer help") Signed-off-by: Pan Bian Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bsg.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/block/bsg.c b/block/bsg.c index d7bae94b64d9..3d78e843a83f 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -157,8 +157,10 @@ static int bsg_sg_io(struct request_queue *q, fmode_t mode, void __user *uarg) return PTR_ERR(rq); ret = q->bsg_dev.ops->fill_hdr(rq, &hdr, mode); - if (ret) + if (ret) { + blk_put_request(rq); return ret; + } rq->timeout = msecs_to_jiffies(hdr.timeout); if (!rq->timeout) -- cgit v1.2.3-70-g09d2 From 8eeed0b554b9fda61be05b17cbb0b89ea2cbbb65 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 25 Jan 2021 05:49:57 +0100 Subject: block: remove unnecessary argument from blk_execute_rq_nowait The 'q' is not used since commit a1ce35fa4985 ("block: remove dead elevator code"), also update the comment of the function. And more importantly it never really was needed to start with given that we can trivial derive it from struct request. Cc: target-devel@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: virtualization@lists.linux-foundation.org Cc: linux-ide@vger.kernel.org Cc: linux-mmc@vger.kernel.org Cc: linux-nvme@lists.infradead.org Cc: linux-nfs@vger.kernel.org Signed-off-by: Guoqing Jiang Signed-off-by: Jens Axboe --- block/blk-exec.c | 10 ++++------ drivers/block/sx8.c | 4 ++-- drivers/nvme/host/core.c | 4 ++-- drivers/nvme/host/lightnvm.c | 2 +- drivers/nvme/host/pci.c | 4 ++-- drivers/nvme/target/passthru.c | 2 +- drivers/scsi/scsi_error.c | 2 +- drivers/scsi/sg.c | 3 +-- drivers/scsi/st.c | 2 +- drivers/target/target_core_pscsi.c | 3 +-- include/linux/blkdev.h | 2 +- 11 files changed, 17 insertions(+), 21 deletions(-) diff --git a/block/blk-exec.c b/block/blk-exec.c index 85324d53d072..2e37e85456fb 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -31,8 +31,7 @@ static void blk_end_sync_rq(struct request *rq, blk_status_t error) } /** - * blk_execute_rq_nowait - insert a request into queue for execution - * @q: queue to insert the request in + * blk_execute_rq_nowait - insert a request to I/O scheduler for execution * @bd_disk: matching gendisk * @rq: request to insert * @at_head: insert request at head or tail of queue @@ -45,9 +44,8 @@ static void blk_end_sync_rq(struct request *rq, blk_status_t error) * Note: * This function will invoke @done directly if the queue is dead. */ -void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, - struct request *rq, int at_head, - rq_end_io_fn *done) +void blk_execute_rq_nowait(struct gendisk *bd_disk, struct request *rq, + int at_head, rq_end_io_fn *done) { WARN_ON(irqs_disabled()); WARN_ON(!blk_rq_is_passthrough(rq)); @@ -83,7 +81,7 @@ void blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, unsigned long hang_check; rq->end_io_data = &wait; - blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq); + blk_execute_rq_nowait(bd_disk, rq, at_head, blk_end_sync_rq); /* Prevent hang_check timer from firing at us during very long I/O */ hang_check = sysctl_hung_task_timeout_secs; diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index 4478eb7efee0..2cdf2771f8e8 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c @@ -539,7 +539,7 @@ static int carm_array_info (struct carm_host *host, unsigned int array_idx) spin_unlock_irq(&host->lock); DPRINTK("blk_execute_rq_nowait, tag == %u\n", rq->tag); - blk_execute_rq_nowait(host->oob_q, NULL, rq, true, NULL); + blk_execute_rq_nowait(NULL, rq, true, NULL); return 0; @@ -578,7 +578,7 @@ static int carm_send_special (struct carm_host *host, carm_sspc_t func) crq->msg_bucket = (u32) rc; DPRINTK("blk_execute_rq_nowait, tag == %u\n", rq->tag); - blk_execute_rq_nowait(host->oob_q, NULL, rq, true, NULL); + blk_execute_rq_nowait(NULL, rq, true, NULL); return 0; } diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index a39befb4deba..0bea9ae03092 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -925,7 +925,7 @@ static void nvme_execute_rq_polled(struct request_queue *q, rq->cmd_flags |= REQ_HIPRI; rq->end_io_data = &wait; - blk_execute_rq_nowait(q, bd_disk, rq, at_head, nvme_end_sync_rq); + blk_execute_rq_nowait(bd_disk, rq, at_head, nvme_end_sync_rq); while (!completion_done(&wait)) { blk_poll(q, request_to_qc_t(rq->mq_hctx, rq), true); @@ -1202,7 +1202,7 @@ static int nvme_keep_alive(struct nvme_ctrl *ctrl) rq->timeout = ctrl->kato * HZ; rq->end_io_data = ctrl; - blk_execute_rq_nowait(rq->q, NULL, rq, 0, nvme_keep_alive_end_io); + blk_execute_rq_nowait(NULL, rq, 0, nvme_keep_alive_end_io); return 0; } diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 6c8eab8de288..0e5a55075e35 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -695,7 +695,7 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd, rq->end_io_data = rqd; - blk_execute_rq_nowait(q, NULL, rq, 0, nvme_nvm_end_io); + blk_execute_rq_nowait(NULL, rq, 0, nvme_nvm_end_io); return 0; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 856aa31931c1..5b78e68be9a1 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1357,7 +1357,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) } abort_req->end_io_data = NULL; - blk_execute_rq_nowait(abort_req->q, NULL, abort_req, 0, abort_endio); + blk_execute_rq_nowait(NULL, abort_req, 0, abort_endio); /* * The aborted req will be completed on receiving the abort req. @@ -2281,7 +2281,7 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) req->end_io_data = nvmeq; init_completion(&nvmeq->delete_done); - blk_execute_rq_nowait(q, NULL, req, false, + blk_execute_rq_nowait(NULL, req, false, opcode == nvme_admin_delete_cq ? nvme_del_cq_end : nvme_del_queue_end); return 0; diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index b9776fc8f08f..cbc88acdd233 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -275,7 +275,7 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req) schedule_work(&req->p.work); } else { rq->end_io_data = req; - blk_execute_rq_nowait(rq->q, ns ? ns->disk : NULL, rq, 0, + blk_execute_rq_nowait(ns ? ns->disk : NULL, rq, 0, nvmet_passthru_req_done); } diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index f11f51e2465f..c00f06e9ecb0 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -2007,7 +2007,7 @@ static void scsi_eh_lock_door(struct scsi_device *sdev) req->timeout = 10 * HZ; rq->retries = 5; - blk_execute_rq_nowait(req->q, NULL, req, 1, eh_lock_door_done); + blk_execute_rq_nowait(NULL, req, 1, eh_lock_door_done); } /** diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index bfa8d77322d7..4383d93110f8 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -829,8 +829,7 @@ sg_common_write(Sg_fd * sfp, Sg_request * srp, srp->rq->timeout = timeout; kref_get(&sfp->f_ref); /* sg_rq_end_io() does kref_put(). */ - blk_execute_rq_nowait(sdp->device->request_queue, sdp->disk, - srp->rq, at_head, sg_rq_end_io); + blk_execute_rq_nowait(sdp->disk, srp->rq, at_head, sg_rq_end_io); return 0; } diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index 43f7624508a9..841ad2fc369a 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -585,7 +585,7 @@ static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd, rq->retries = retries; req->end_io_data = SRpnt; - blk_execute_rq_nowait(req->q, NULL, req, 1, st_scsi_execute_end); + blk_execute_rq_nowait(NULL, req, 1, st_scsi_execute_end); return 0; } diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index 7994f27e4527..33770e5808ce 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -1000,8 +1000,7 @@ pscsi_execute_cmd(struct se_cmd *cmd) req->timeout = PS_TIMEOUT_OTHER; scsi_req(req)->retries = PS_RETRY; - blk_execute_rq_nowait(pdv->pdv_sd->request_queue, NULL, req, - (cmd->sam_task_attr == TCM_HEAD_TAG), + blk_execute_rq_nowait(NULL, req, (cmd->sam_task_attr == TCM_HEAD_TAG), pscsi_req_done); return 0; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4526b9ef8edb..623a61239429 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -950,7 +950,7 @@ extern int blk_rq_map_user_iov(struct request_queue *, struct request *, gfp_t); extern void blk_execute_rq(struct request_queue *, struct gendisk *, struct request *, int); -extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, +extern void blk_execute_rq_nowait(struct gendisk *, struct request *, int, rq_end_io_fn *); /* Helper to convert REQ_OP_XXX to its string format XXX */ -- cgit v1.2.3-70-g09d2 From 684da7628d93bbdcfba9081b917d99f29ad04c23 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 25 Jan 2021 05:49:58 +0100 Subject: block: remove unnecessary argument from blk_execute_rq We can remove 'q' from blk_execute_rq as well after the previous change in blk_execute_rq_nowait. And more importantly it never really was needed to start with given that we can trivial derive it from struct request. Cc: linux-scsi@vger.kernel.org Cc: virtualization@lists.linux-foundation.org Cc: linux-ide@vger.kernel.org Cc: linux-mmc@vger.kernel.org Cc: linux-nvme@lists.infradead.org Cc: linux-nfs@vger.kernel.org Acked-by: Ulf Hansson # for mmc Signed-off-by: Guoqing Jiang Signed-off-by: Jens Axboe --- block/blk-exec.c | 3 +-- block/bsg.c | 2 +- block/scsi_ioctl.c | 6 +++--- drivers/block/mtip32xx/mtip32xx.c | 2 +- drivers/block/paride/pd.c | 2 +- drivers/block/pktcdvd.c | 2 +- drivers/block/virtio_blk.c | 2 +- drivers/cdrom/cdrom.c | 2 +- drivers/ide/ide-atapi.c | 2 +- drivers/ide/ide-cd.c | 2 +- drivers/ide/ide-cd_ioctl.c | 2 +- drivers/ide/ide-devsets.c | 2 +- drivers/ide/ide-disk.c | 2 +- drivers/ide/ide-ioctls.c | 4 ++-- drivers/ide/ide-park.c | 2 +- drivers/ide/ide-pm.c | 4 ++-- drivers/ide/ide-tape.c | 2 +- drivers/ide/ide-taskfile.c | 2 +- drivers/mmc/core/block.c | 10 +++++----- drivers/nvme/host/core.c | 4 ++-- drivers/nvme/host/lightnvm.c | 2 +- drivers/scsi/scsi_lib.c | 2 +- fs/nfsd/blocklayout.c | 2 +- include/linux/blkdev.h | 3 +-- 24 files changed, 33 insertions(+), 35 deletions(-) diff --git a/block/blk-exec.c b/block/blk-exec.c index 2e37e85456fb..0ab873f10133 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -74,8 +74,7 @@ EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); * Insert a fully prepared request at the back of the I/O scheduler queue * for execution and wait for completion. */ -void blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, - struct request *rq, int at_head) +void blk_execute_rq(struct gendisk *bd_disk, struct request *rq, int at_head) { DECLARE_COMPLETION_ONSTACK(wait); unsigned long hang_check; diff --git a/block/bsg.c b/block/bsg.c index 3d78e843a83f..bd10922d5cbb 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -183,7 +183,7 @@ static int bsg_sg_io(struct request_queue *q, fmode_t mode, void __user *uarg) bio = rq->bio; - blk_execute_rq(q, NULL, rq, !(hdr.flags & BSG_FLAG_Q_AT_TAIL)); + blk_execute_rq(NULL, rq, !(hdr.flags & BSG_FLAG_Q_AT_TAIL)); ret = rq->q->bsg_dev.ops->complete_rq(rq, &hdr); blk_rq_unmap_user(bio); diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index c9f009cc0446..6599bac0a78c 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -357,7 +357,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, * (if he doesn't check that is his problem). * N.B. a non-zero SCSI status is _not_ necessarily an error. */ - blk_execute_rq(q, bd_disk, rq, at_head); + blk_execute_rq(bd_disk, rq, at_head); hdr->duration = jiffies_to_msecs(jiffies - start_time); @@ -493,7 +493,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, goto error; } - blk_execute_rq(q, disk, rq, 0); + blk_execute_rq(disk, rq, 0); err = req->result & 0xff; /* only 8 bit SCSI status */ if (err) { @@ -532,7 +532,7 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk, scsi_req(rq)->cmd[0] = cmd; scsi_req(rq)->cmd[4] = data; scsi_req(rq)->cmd_len = 6; - blk_execute_rq(q, bd_disk, rq, 0); + blk_execute_rq(bd_disk, rq, 0); err = scsi_req(rq)->result ? -EIO : 0; blk_put_request(rq); diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 53ac59d19ae5..3fd99836bb1c 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -1015,7 +1015,7 @@ static int mtip_exec_internal_command(struct mtip_port *port, rq->timeout = timeout; /* insert request and run queue */ - blk_execute_rq(rq->q, NULL, rq, true); + blk_execute_rq(NULL, rq, true); if (int_cmd->status) { dev_err(&dd->pdev->dev, "Internal command [%02X] failed %d\n", diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index a7af4f27b7c3..897acda20ac8 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -781,7 +781,7 @@ static int pd_special_command(struct pd_unit *disk, req = blk_mq_rq_to_pdu(rq); req->func = func; - blk_execute_rq(disk->gd->queue, disk->gd, rq, 0); + blk_execute_rq(disk->gd, rq, 0); blk_put_request(rq); return 0; } diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 658a0981cb54..fc4b0f1aa86d 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -722,7 +722,7 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command * if (cgc->quiet) rq->rq_flags |= RQF_QUIET; - blk_execute_rq(rq->q, pd->bdev->bd_disk, rq, 0); + blk_execute_rq(pd->bdev->bd_disk, rq, 0); if (scsi_req(rq)->result) ret = -EIO; out: diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 145606dc52db..b0285db7cf4f 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -320,7 +320,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str) if (err) goto out; - blk_execute_rq(vblk->disk->queue, vblk->disk, req, false); + blk_execute_rq(vblk->disk, req, false); err = blk_status_to_errno(virtblk_result(blk_mq_rq_to_pdu(req))); out: blk_put_request(req); diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index 8f0e52a71493..90ad34c6ef8e 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -2214,7 +2214,7 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf, rq->timeout = 60 * HZ; bio = rq->bio; - blk_execute_rq(q, cdi->disk, rq, 0); + blk_execute_rq(cdi->disk, rq, 0); if (scsi_req(rq)->result) { struct scsi_sense_hdr sshdr; diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 013ad33fbbc8..a1ce9f5ac3aa 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -107,7 +107,7 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk, memcpy(scsi_req(rq)->cmd, pc->c, 12); if (drive->media == ide_tape) scsi_req(rq)->cmd[13] = REQ_IDETAPE_PC1; - blk_execute_rq(drive->queue, disk, rq, 0); + blk_execute_rq(disk, rq, 0); error = scsi_req(rq)->result ? -EIO : 0; put_req: blk_put_request(rq); diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 25d2d88e82ad..cffbcc27a34c 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -467,7 +467,7 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd, } } - blk_execute_rq(drive->queue, info->disk, rq, 0); + blk_execute_rq(info->disk, rq, 0); error = scsi_req(rq)->result ? -EIO : 0; if (buffer) diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c index 46f2df288c6a..011eab9c69b7 100644 --- a/drivers/ide/ide-cd_ioctl.c +++ b/drivers/ide/ide-cd_ioctl.c @@ -299,7 +299,7 @@ int ide_cdrom_reset(struct cdrom_device_info *cdi) rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0); ide_req(rq)->type = ATA_PRIV_MISC; rq->rq_flags = RQF_QUIET; - blk_execute_rq(drive->queue, cd->disk, rq, 0); + blk_execute_rq(cd->disk, rq, 0); ret = scsi_req(rq)->result ? -EIO : 0; blk_put_request(rq); /* diff --git a/drivers/ide/ide-devsets.c b/drivers/ide/ide-devsets.c index f2f93ed40356..ca1d4b3d3878 100644 --- a/drivers/ide/ide-devsets.c +++ b/drivers/ide/ide-devsets.c @@ -173,7 +173,7 @@ int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting, *(int *)&scsi_req(rq)->cmd[1] = arg; ide_req(rq)->special = setting->set; - blk_execute_rq(q, NULL, rq, 0); + blk_execute_rq(NULL, rq, 0); ret = scsi_req(rq)->result; blk_put_request(rq); diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index 34b9441084f8..8413731c6259 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -482,7 +482,7 @@ static int set_multcount(ide_drive_t *drive, int arg) drive->mult_req = arg; drive->special_flags |= IDE_SFLAG_SET_MULTMODE; - blk_execute_rq(drive->queue, NULL, rq, 0); + blk_execute_rq(NULL, rq, 0); blk_put_request(rq); return (drive->mult_count == arg) ? 0 : -EIO; diff --git a/drivers/ide/ide-ioctls.c b/drivers/ide/ide-ioctls.c index 58994da10c06..43fbc37d85c3 100644 --- a/drivers/ide/ide-ioctls.c +++ b/drivers/ide/ide-ioctls.c @@ -137,7 +137,7 @@ static int ide_cmd_ioctl(ide_drive_t *drive, void __user *argp) rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0); ide_req(rq)->type = ATA_PRIV_TASKFILE; - blk_execute_rq(drive->queue, NULL, rq, 0); + blk_execute_rq(NULL, rq, 0); err = scsi_req(rq)->result ? -EIO : 0; blk_put_request(rq); @@ -235,7 +235,7 @@ static int generic_drive_reset(ide_drive_t *drive) ide_req(rq)->type = ATA_PRIV_MISC; scsi_req(rq)->cmd_len = 1; scsi_req(rq)->cmd[0] = REQ_DRIVE_RESET; - blk_execute_rq(drive->queue, NULL, rq, 1); + blk_execute_rq(NULL, rq, 1); ret = scsi_req(rq)->result; blk_put_request(rq); return ret; diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c index 8af7af6001eb..a80a0f28f7b9 100644 --- a/drivers/ide/ide-park.c +++ b/drivers/ide/ide-park.c @@ -37,7 +37,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout) scsi_req(rq)->cmd_len = 1; ide_req(rq)->type = ATA_PRIV_MISC; ide_req(rq)->special = &timeout; - blk_execute_rq(q, NULL, rq, 1); + blk_execute_rq(NULL, rq, 1); rc = scsi_req(rq)->result ? -EIO : 0; blk_put_request(rq); if (rc) diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c index 82ab308f1aaf..d680b3e3295f 100644 --- a/drivers/ide/ide-pm.c +++ b/drivers/ide/ide-pm.c @@ -27,7 +27,7 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg) mesg.event = PM_EVENT_FREEZE; rqpm.pm_state = mesg.event; - blk_execute_rq(drive->queue, NULL, rq, 0); + blk_execute_rq(NULL, rq, 0); ret = scsi_req(rq)->result ? -EIO : 0; blk_put_request(rq); @@ -50,7 +50,7 @@ static int ide_pm_execute_rq(struct request *rq) blk_mq_end_request(rq, BLK_STS_OK); return -ENXIO; } - blk_execute_rq(q, NULL, rq, true); + blk_execute_rq(NULL, rq, true); return scsi_req(rq)->result ? -EIO : 0; } diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 88b96437b22e..fa05e7e7d609 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -868,7 +868,7 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size) goto out_put; } - blk_execute_rq(drive->queue, tape->disk, rq, 0); + blk_execute_rq(tape->disk, rq, 0); /* calculate the number of transferred bytes and update buffer state */ size -= scsi_req(rq)->resid_len; diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index d016cbe68cba..6665fc4724b9 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -443,7 +443,7 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf, ide_req(rq)->special = cmd; cmd->rq = rq; - blk_execute_rq(drive->queue, NULL, rq, 0); + blk_execute_rq(NULL, rq, 0); error = scsi_req(rq)->result ? -EIO : 0; put_req: blk_put_request(rq); diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 42e27a298218..a1d6b68320ae 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -253,7 +253,7 @@ static ssize_t power_ro_lock_store(struct device *dev, goto out_put; } req_to_mmc_queue_req(req)->drv_op = MMC_DRV_OP_BOOT_WP; - blk_execute_rq(mq->queue, NULL, req, 0); + blk_execute_rq(NULL, req, 0); ret = req_to_mmc_queue_req(req)->drv_op_result; blk_put_request(req); @@ -629,7 +629,7 @@ static int mmc_blk_ioctl_cmd(struct mmc_blk_data *md, rpmb ? MMC_DRV_OP_IOCTL_RPMB : MMC_DRV_OP_IOCTL; req_to_mmc_queue_req(req)->drv_op_data = idatas; req_to_mmc_queue_req(req)->ioc_count = 1; - blk_execute_rq(mq->queue, NULL, req, 0); + blk_execute_rq(NULL, req, 0); ioc_err = req_to_mmc_queue_req(req)->drv_op_result; err = mmc_blk_ioctl_copy_to_user(ic_ptr, idata); blk_put_request(req); @@ -698,7 +698,7 @@ static int mmc_blk_ioctl_multi_cmd(struct mmc_blk_data *md, rpmb ? MMC_DRV_OP_IOCTL_RPMB : MMC_DRV_OP_IOCTL; req_to_mmc_queue_req(req)->drv_op_data = idata; req_to_mmc_queue_req(req)->ioc_count = num_of_cmds; - blk_execute_rq(mq->queue, NULL, req, 0); + blk_execute_rq(NULL, req, 0); ioc_err = req_to_mmc_queue_req(req)->drv_op_result; /* copy to user if data and response */ @@ -2722,7 +2722,7 @@ static int mmc_dbg_card_status_get(void *data, u64 *val) if (IS_ERR(req)) return PTR_ERR(req); req_to_mmc_queue_req(req)->drv_op = MMC_DRV_OP_GET_CARD_STATUS; - blk_execute_rq(mq->queue, NULL, req, 0); + blk_execute_rq(NULL, req, 0); ret = req_to_mmc_queue_req(req)->drv_op_result; if (ret >= 0) { *val = ret; @@ -2761,7 +2761,7 @@ static int mmc_ext_csd_open(struct inode *inode, struct file *filp) } req_to_mmc_queue_req(req)->drv_op = MMC_DRV_OP_GET_EXT_CSD; req_to_mmc_queue_req(req)->drv_op_data = &ext_csd; - blk_execute_rq(mq->queue, NULL, req, 0); + blk_execute_rq(NULL, req, 0); err = req_to_mmc_queue_req(req)->drv_op_result; blk_put_request(req); if (err) { diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 0bea9ae03092..eb7963fb167b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -964,7 +964,7 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, if (poll) nvme_execute_rq_polled(req->q, NULL, req, at_head); else - blk_execute_rq(req->q, NULL, req, at_head); + blk_execute_rq(NULL, req, at_head); if (result) *result = nvme_req(req)->result; if (nvme_req(req)->flags & NVME_REQ_CANCELLED) @@ -1101,7 +1101,7 @@ void nvme_execute_passthru_rq(struct request *rq) u32 effects; effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode); - blk_execute_rq(rq->q, disk, rq, 0); + blk_execute_rq(disk, rq, 0); nvme_passthru_end(ctrl, effects); } EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU); diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 0e5a55075e35..ec38128f51e9 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -819,7 +819,7 @@ static int nvme_nvm_submit_user_cmd(struct request_queue *q, bio->bi_bdev = ns->disk->part0; } - blk_execute_rq(q, NULL, rq, 0); + blk_execute_rq(NULL, rq, 0); if (nvme_req(rq)->flags & NVME_REQ_CANCELLED) ret = -EINTR; diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index b3f14f05340a..4d2280658559 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -269,7 +269,7 @@ int __scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, /* * head injection *required* here otherwise quiesce won't work */ - blk_execute_rq(req->q, NULL, req, 1); + blk_execute_rq(NULL, req, 1); /* * Some devices (USB mass-storage in particular) may transfer diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index a07c39c94bbd..1058659a8d31 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -254,7 +254,7 @@ again: req->cmd[4] = bufflen & 0xff; req->cmd_len = COMMAND_SIZE(INQUIRY); - blk_execute_rq(rq->q, NULL, rq, 1); + blk_execute_rq(NULL, rq, 1); if (req->result) { pr_err("pNFS: INQUIRY 0x83 failed with: %x\n", req->result); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 623a61239429..20f3706b6b2e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -948,8 +948,7 @@ extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, uns extern int blk_rq_map_user_iov(struct request_queue *, struct request *, struct rq_map_data *, const struct iov_iter *, gfp_t); -extern void blk_execute_rq(struct request_queue *, struct gendisk *, - struct request *, int); +extern void blk_execute_rq(struct gendisk *, struct request *, int); extern void blk_execute_rq_nowait(struct gendisk *, struct request *, int, rq_end_io_fn *); -- cgit v1.2.3-70-g09d2 From 0f1d344feb534555a0dcd0beafb7211a37c5355e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 9 Jan 2021 16:02:57 +0000 Subject: splice: don't generate zero-len segement bvecs iter_file_splice_write() may spawn bvec segments with zero-length. In preparation for prohibiting them, filter out by hand at splice level. Reviewed-by: Christoph Hellwig Signed-off-by: Pavel Begunkov Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- fs/splice.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/splice.c b/fs/splice.c index 866d5c2367b2..474fb8b5562a 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -662,12 +662,14 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, /* build the vector */ left = sd.total_len; - for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++, n++) { + for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++) { struct pipe_buffer *buf = &pipe->bufs[tail & mask]; size_t this_len = buf->len; - if (this_len > left) - this_len = left; + /* zero-length bvecs are not supported, skip them */ + if (!this_len) + continue; + this_len = min(this_len, left); ret = pipe_buf_confirm(pipe, buf); if (unlikely(ret)) { @@ -680,6 +682,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, array[n].bv_len = this_len; array[n].bv_offset = buf->offset; left -= this_len; + n++; } iov_iter_bvec(&from, WRITE, array, n, sd.total_len - left); -- cgit v1.2.3-70-g09d2 From 9b2e0016d04c6542ace0128eb82ecb3b10c97e43 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 9 Jan 2021 16:02:58 +0000 Subject: bvec/iter: disallow zero-length segment bvecs zero-length bvec segments are allowed in general, but not handled by bio and down the block layer so filtered out. This inconsistency may be confusing and prevent from optimisations. As zero-length segments are useless and places that were generating them are patched, declare them not allowed. Reviewed-by: Christoph Hellwig Signed-off-by: Pavel Begunkov Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- Documentation/block/biovecs.rst | 2 ++ Documentation/filesystems/porting.rst | 7 +++++++ lib/iov_iter.c | 2 -- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/Documentation/block/biovecs.rst b/Documentation/block/biovecs.rst index 36771a131b56..ddb867e0185b 100644 --- a/Documentation/block/biovecs.rst +++ b/Documentation/block/biovecs.rst @@ -40,6 +40,8 @@ normal code doesn't have to deal with bi_bvec_done. There is a lower level advance function - bvec_iter_advance() - which takes a pointer to a biovec, not a bio; this is used by the bio integrity code. +As of 5.12 bvec segments with zero bv_len are not supported. + What's all this get us? ======================= diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 867036aa90b8..c722d94f29ea 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -865,3 +865,10 @@ no matter what. Everything is handled by the caller. clone_private_mount() returns a longterm mount now, so the proper destructor of its result is kern_unmount() or kern_unmount_array(). + +--- + +**mandatory** + +zero-length bvec segments are disallowed, they must be filtered out before +passed on to an iterator. diff --git a/lib/iov_iter.c b/lib/iov_iter.c index a21e6a5792c5..6c597cdfcf5b 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -72,8 +72,6 @@ __start.bi_bvec_done = skip; \ __start.bi_idx = 0; \ for_each_bvec(__v, i->bvec, __bi, __start) { \ - if (!__v.bv_len) \ - continue; \ (void)(STEP); \ } \ } -- cgit v1.2.3-70-g09d2 From 0cf41e5e9bafc185490624c3e321c915885a91f3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 9 Jan 2021 16:02:59 +0000 Subject: block/psi: remove PSI annotations from direct IO Direct IO does not operate on the current working set of pages managed by the kernel, so it should not be accounted as memory stall to PSI infrastructure. The block layer and iomap direct IO use bio_iov_iter_get_pages() to build bios, and they are the only users of it, so to avoid PSI tracking for them clear out BIO_WORKINGSET flag. Do same for dio_bio_submit() because fs/direct_io constructs bios by hand directly calling bio_add_page(). Reported-by: Christoph Hellwig Suggested-by: Christoph Hellwig Suggested-by: Johannes Weiner Reviewed-by: Christoph Hellwig Signed-off-by: Pavel Begunkov Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/bio.c | 6 ++++++ fs/direct-io.c | 2 ++ 2 files changed, 8 insertions(+) diff --git a/block/bio.c b/block/bio.c index 56a06f94fb63..1cd8a2e79048 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1081,6 +1081,9 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) * fit into the bio, or are requested in @iter, whatever is smaller. If * MM encounters an error pinning the requested pages, it stops. Error * is returned only if 0 pages could be pinned. + * + * It's intended for direct IO, so doesn't do PSI tracking, the caller is + * responsible for setting BIO_WORKINGSET if necessary. */ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { @@ -1105,6 +1108,9 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) if (is_bvec) bio_set_flag(bio, BIO_NO_PAGE_REF); + + /* don't account direct I/O as memory stall */ + bio_clear_flag(bio, BIO_WORKINGSET); return bio->bi_vcnt ? 0 : ret; } EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages); diff --git a/fs/direct-io.c b/fs/direct-io.c index 2660e744da2d..aa1083ecd623 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -426,6 +426,8 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) unsigned long flags; bio->bi_private = dio; + /* don't account direct I/O as memory stall */ + bio_clear_flag(bio, BIO_WORKINGSET); spin_lock_irqsave(&dio->bio_lock, flags); dio->refcount++; -- cgit v1.2.3-70-g09d2 From ecd7fba0ade1d6d8d49d320df9caf96922a376b2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 9 Jan 2021 16:03:00 +0000 Subject: target/file: allocate the bvec array as part of struct target_core_file_cmd This saves one memory allocation, and ensures the bvecs aren't freed before the AIO completion. This will allow the lower level code to be optimized so that it can avoid allocating another bvec array. Signed-off-by: Christoph Hellwig Signed-off-by: Pavel Begunkov Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/target/target_core_file.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c index b0cb5b95e892..cce455929778 100644 --- a/drivers/target/target_core_file.c +++ b/drivers/target/target_core_file.c @@ -241,6 +241,7 @@ struct target_core_file_cmd { unsigned long len; struct se_cmd *cmd; struct kiocb iocb; + struct bio_vec bvecs[]; }; static void cmd_rw_aio_complete(struct kiocb *iocb, long ret, long ret2) @@ -268,29 +269,22 @@ fd_execute_rw_aio(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, struct target_core_file_cmd *aio_cmd; struct iov_iter iter = {}; struct scatterlist *sg; - struct bio_vec *bvec; ssize_t len = 0; int ret = 0, i; - aio_cmd = kmalloc(sizeof(struct target_core_file_cmd), GFP_KERNEL); + aio_cmd = kmalloc(struct_size(aio_cmd, bvecs, sgl_nents), GFP_KERNEL); if (!aio_cmd) return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; - bvec = kcalloc(sgl_nents, sizeof(struct bio_vec), GFP_KERNEL); - if (!bvec) { - kfree(aio_cmd); - return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; - } - for_each_sg(sgl, sg, sgl_nents, i) { - bvec[i].bv_page = sg_page(sg); - bvec[i].bv_len = sg->length; - bvec[i].bv_offset = sg->offset; + aio_cmd->bvecs[i].bv_page = sg_page(sg); + aio_cmd->bvecs[i].bv_len = sg->length; + aio_cmd->bvecs[i].bv_offset = sg->offset; len += sg->length; } - iov_iter_bvec(&iter, is_write, bvec, sgl_nents, len); + iov_iter_bvec(&iter, is_write, aio_cmd->bvecs, sgl_nents, len); aio_cmd->cmd = cmd; aio_cmd->len = len; @@ -307,8 +301,6 @@ fd_execute_rw_aio(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, else ret = call_read_iter(file, &aio_cmd->iocb, &iter); - kfree(bvec); - if (ret != -EIOCBQUEUED) cmd_rw_aio_complete(&aio_cmd->iocb, ret, 0); -- cgit v1.2.3-70-g09d2 From 54c8195b4ebe10af66b49ab9c809bc16939555fc Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 9 Jan 2021 16:03:01 +0000 Subject: iov_iter: optimise bvec iov_iter_advance() iov_iter_advance() is heavily used, but implemented through generic means. For bvecs there is a specifically crafted function for that, so use bvec_iter_advance() instead, it's faster and slimmer. Reviewed-by: Christoph Hellwig Signed-off-by: Pavel Begunkov Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- lib/iov_iter.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 6c597cdfcf5b..e55357f09f71 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1065,6 +1065,21 @@ static void pipe_advance(struct iov_iter *i, size_t size) pipe_truncate(i); } +static void iov_iter_bvec_advance(struct iov_iter *i, size_t size) +{ + struct bvec_iter bi; + + bi.bi_size = i->count; + bi.bi_bvec_done = i->iov_offset; + bi.bi_idx = 0; + bvec_iter_advance(i->bvec, &bi, size); + + i->bvec += bi.bi_idx; + i->nr_segs -= bi.bi_idx; + i->count = bi.bi_size; + i->iov_offset = bi.bi_bvec_done; +} + void iov_iter_advance(struct iov_iter *i, size_t size) { if (unlikely(iov_iter_is_pipe(i))) { @@ -1075,6 +1090,10 @@ void iov_iter_advance(struct iov_iter *i, size_t size) i->count -= size; return; } + if (iov_iter_is_bvec(i)) { + iov_iter_bvec_advance(i, size); + return; + } iterate_and_advance(i, size, v, 0, 0, 0) } EXPORT_SYMBOL(iov_iter_advance); -- cgit v1.2.3-70-g09d2 From 3e1a88ec96259282b9a8b45c3f1fda7a3ff4f6ea Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 9 Jan 2021 16:03:02 +0000 Subject: bio: add a helper calculating nr segments to alloc Add a helper function calculating the number of bvec segments we need to allocate to construct a bio. It doesn't change anything functionally, but will be used to not duplicate special cases in the future. Reviewed-by: Christoph Hellwig Signed-off-by: Pavel Begunkov Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- fs/block_dev.c | 7 ++++--- fs/iomap/direct-io.c | 9 ++++----- include/linux/bio.h | 10 ++++++++++ 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 3b8963e228a1..6f5bd9950baf 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -416,7 +416,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) dio->size += bio->bi_iter.bi_size; pos += bio->bi_iter.bi_size; - nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES); + nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_PAGES); if (!nr_pages) { bool polled = false; @@ -481,9 +481,10 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { int nr_pages; - nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES + 1); - if (!nr_pages) + if (!iov_iter_count(iter)) return 0; + + nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_PAGES + 1); if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_PAGES) return __blkdev_direct_IO_simple(iocb, iter, nr_pages); diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 933f234d5bec..ea1e8f696076 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -250,11 +250,8 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, orig_count = iov_iter_count(dio->submit.iter); iov_iter_truncate(dio->submit.iter, length); - nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES); - if (nr_pages <= 0) { - ret = nr_pages; + if (!iov_iter_count(dio->submit.iter)) goto out; - } if (need_zeroout) { /* zero out from the start of the block to the write offset */ @@ -263,6 +260,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, iomap_dio_zero(dio, iomap, pos - pad, pad); } + nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_PAGES); do { size_t n; if (dio->error) { @@ -308,7 +306,8 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, dio->size += n; copied += n; - nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES); + nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, + BIO_MAX_PAGES); iomap_dio_submit_bio(dio, iomap, bio, pos); pos += n; } while (nr_pages); diff --git a/include/linux/bio.h b/include/linux/bio.h index e135b500df5d..9ddb19801a03 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -10,6 +10,7 @@ #include /* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */ #include +#include #define BIO_DEBUG @@ -441,6 +442,15 @@ static inline void bio_wouldblock_error(struct bio *bio) bio_endio(bio); } +/* + * Calculate number of bvec segments that should be allocated to fit data + * pointed by @iter. + */ +static inline int bio_iov_vecs_to_alloc(struct iov_iter *iter, int max_segs) +{ + return iov_iter_npages(iter, max_segs); +} + struct request_queue; extern int submit_bio_wait(struct bio *bio); -- cgit v1.2.3-70-g09d2 From c42bca92be928ce7dece5fc04cf68d0e37ee6718 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 9 Jan 2021 16:03:03 +0000 Subject: bio: don't copy bvec for direct IO The block layer spends quite a while in blkdev_direct_IO() to copy and initialise bio's bvec. However, if we've already got a bvec in the input iterator it might be reused in some cases, i.e. when new ITER_BVEC_FLAG_FIXED flag is set. Simple tests show considerable performance boost, and it also reduces memory footprint. Suggested-by: Matthew Wilcox Reviewed-by: Christoph Hellwig Signed-off-by: Pavel Begunkov Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- Documentation/filesystems/porting.rst | 9 +++++ block/bio.c | 67 +++++++++++++++-------------------- include/linux/bio.h | 5 ++- 3 files changed, 42 insertions(+), 39 deletions(-) diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index c722d94f29ea..1f8cf8e10b34 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -872,3 +872,12 @@ its result is kern_unmount() or kern_unmount_array(). zero-length bvec segments are disallowed, they must be filtered out before passed on to an iterator. + +--- + +**mandatory** + +For bvec based itererators bio_iov_iter_get_pages() now doesn't copy bvecs but +uses the one provided. Anyone issuing kiocb-I/O should ensure that the bvec and +page references stay until I/O has completed, i.e. until ->ki_complete() has +been called or returned with non -EIOCBQUEUED code. diff --git a/block/bio.c b/block/bio.c index 1cd8a2e79048..99040a7e6656 100644 --- a/block/bio.c +++ b/block/bio.c @@ -942,21 +942,17 @@ void bio_release_pages(struct bio *bio, bool mark_dirty) } EXPORT_SYMBOL_GPL(bio_release_pages); -static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter) +static int bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) { - const struct bio_vec *bv = iter->bvec; - unsigned int len; - size_t size; - - if (WARN_ON_ONCE(iter->iov_offset > bv->bv_len)) - return -EINVAL; - - len = min_t(size_t, bv->bv_len - iter->iov_offset, iter->count); - size = bio_add_page(bio, bv->bv_page, len, - bv->bv_offset + iter->iov_offset); - if (unlikely(size != len)) - return -EINVAL; - iov_iter_advance(iter, size); + WARN_ON_ONCE(BVEC_POOL_IDX(bio) != 0); + + bio->bi_vcnt = iter->nr_segs; + bio->bi_max_vecs = iter->nr_segs; + bio->bi_io_vec = (struct bio_vec *)iter->bvec; + bio->bi_iter.bi_bvec_done = iter->iov_offset; + bio->bi_iter.bi_size = iter->count; + + iov_iter_advance(iter, iter->count); return 0; } @@ -1070,12 +1066,12 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) * This takes either an iterator pointing to user memory, or one pointing to * kernel pages (BVEC iterator). If we're adding user pages, we pin them and * map them into the kernel. On IO completion, the caller should put those - * pages. If we're adding kernel pages, and the caller told us it's safe to - * do so, we just have to add the pages to the bio directly. We don't grab an - * extra reference to those pages (the user should already have that), and we - * don't put the page on IO completion. The caller needs to check if the bio is - * flagged BIO_NO_PAGE_REF on IO completion. If it isn't, then pages should be - * released. + * pages. For bvec based iterators bio_iov_iter_get_pages() uses the provided + * bvecs rather than copying them. Hence anyone issuing kiocb based IO needs + * to ensure the bvecs and pages stay referenced until the submitted I/O is + * completed by a call to ->ki_complete() or returns with an error other than + * -EIOCBQUEUED. The caller needs to check if the bio is flagged BIO_NO_PAGE_REF + * on IO completion. If it isn't, then pages should be released. * * The function tries, but does not guarantee, to pin as many pages as * fit into the bio, or are requested in @iter, whatever is smaller. If @@ -1087,27 +1083,22 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) */ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { - const bool is_bvec = iov_iter_is_bvec(iter); - int ret; - - if (WARN_ON_ONCE(bio->bi_vcnt)) - return -EINVAL; + int ret = 0; - do { - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - if (WARN_ON_ONCE(is_bvec)) - return -EINVAL; - ret = __bio_iov_append_get_pages(bio, iter); - } else { - if (is_bvec) - ret = __bio_iov_bvec_add_pages(bio, iter); + if (iov_iter_is_bvec(iter)) { + if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND)) + return -EINVAL; + bio_iov_bvec_set(bio, iter); + bio_set_flag(bio, BIO_NO_PAGE_REF); + return 0; + } else { + do { + if (bio_op(bio) == REQ_OP_ZONE_APPEND) + ret = __bio_iov_append_get_pages(bio, iter); else ret = __bio_iov_iter_get_pages(bio, iter); - } - } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); - - if (is_bvec) - bio_set_flag(bio, BIO_NO_PAGE_REF); + } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); + } /* don't account direct I/O as memory stall */ bio_clear_flag(bio, BIO_WORKINGSET); diff --git a/include/linux/bio.h b/include/linux/bio.h index 9ddb19801a03..676870b2c88d 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -444,10 +444,13 @@ static inline void bio_wouldblock_error(struct bio *bio) /* * Calculate number of bvec segments that should be allocated to fit data - * pointed by @iter. + * pointed by @iter. If @iter is backed by bvec it's going to be reused + * instead of allocating a new one. */ static inline int bio_iov_vecs_to_alloc(struct iov_iter *iter, int max_segs) { + if (iov_iter_is_bvec(iter)) + return 0; return iov_iter_npages(iter, max_segs); } -- cgit v1.2.3-70-g09d2 From 3a905c37c3510ea6d7cfcdfd0f272ba731286560 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 25 Jan 2021 19:39:57 +0100 Subject: block: skip bio_check_eod for partition-remapped bios When an already remapped bio is resubmitted (e.g. by blk_queue_split), bio_check_eod will compare the remapped bi_sector against the size of the partition, leading to spurious I/O failures. Skip the EOD check in this case. Fixes: 309dca309fc3 ("block: store a block_device pointer in struct bio") Reported-by: Jens Axboe Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 88f608904432..5e752840b41a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -815,11 +815,12 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio) goto end_io; if (unlikely(bio_check_ro(bio))) goto end_io; - if (unlikely(bio_check_eod(bio))) - goto end_io; - if (bio->bi_bdev->bd_partno && !bio_flagged(bio, BIO_REMAPPED) && - unlikely(blk_partition_remap(bio))) - goto end_io; + if (!bio_flagged(bio, BIO_REMAPPED)) { + if (unlikely(bio_check_eod(bio))) + goto end_io; + if (bdev->bd_partno && unlikely(blk_partition_remap(bio))) + goto end_io; + } /* * Filter flush bio's early so that bio based drivers without flush -- cgit v1.2.3-70-g09d2 From eb2fd80f9d2c515a901599362e83bc3356fc5e97 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Mon, 25 Jan 2021 20:02:43 +0100 Subject: block, bfq: replace mechanism for evaluating I/O intensity Some BFQ mechanisms make their decisions on a bfq_queue basing also on whether the bfq_queue is I/O bound. In this respect, the current logic for evaluating whether a bfq_queue is I/O bound is rather rough. This commits replaces this logic with a more effective one. The new logic measures the percentage of time during which a bfq_queue is active, and marks the bfq_queue as I/O bound if the latter if this percentage is above a fixed threshold. Tested-by: Jan Kara Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 63 +++++++++++++++++++++++++++++++++++++---------------- block/bfq-iosched.h | 16 +++++++------- 2 files changed, 52 insertions(+), 27 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index b12a416b51d7..375e35c3d2fb 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1026,6 +1026,8 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, bfqq->entity.new_weight = bic->saved_weight; bfqq->ttime = bic->saved_ttime; + bfqq->io_start_time = bic->saved_io_start_time; + bfqq->tot_idle_time = bic->saved_tot_idle_time; bfqq->wr_coeff = bic->saved_wr_coeff; bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish; @@ -1721,17 +1723,6 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, bfq_clear_bfqq_just_created(bfqq); - - if (!bfq_bfqq_IO_bound(bfqq)) { - if (arrived_in_time) { - bfqq->requests_within_timer++; - if (bfqq->requests_within_timer >= - bfqd->bfq_requests_within_timer) - bfq_mark_bfqq_IO_bound(bfqq); - } else - bfqq->requests_within_timer = 0; - } - if (bfqd->low_latency) { if (unlikely(time_is_after_jiffies(bfqq->split_time))) /* wraparound */ @@ -1865,6 +1856,36 @@ static void bfq_reset_inject_limit(struct bfq_data *bfqd, bfqq->decrease_time_jif = jiffies; } +static void bfq_update_io_intensity(struct bfq_queue *bfqq, u64 now_ns) +{ + u64 tot_io_time = now_ns - bfqq->io_start_time; + + if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfqq->dispatched == 0) + bfqq->tot_idle_time += + now_ns - bfqq->ttime.last_end_request; + + if (unlikely(bfq_bfqq_just_created(bfqq))) + return; + + /* + * Must be busy for at least about 80% of the time to be + * considered I/O bound. + */ + if (bfqq->tot_idle_time * 5 > tot_io_time) + bfq_clear_bfqq_IO_bound(bfqq); + else + bfq_mark_bfqq_IO_bound(bfqq); + + /* + * Keep an observation window of at most 200 ms in the past + * from now. + */ + if (tot_io_time > 200 * NSEC_PER_MSEC) { + bfqq->io_start_time = now_ns - (tot_io_time>>1); + bfqq->tot_idle_time >>= 1; + } +} + static void bfq_add_request(struct request *rq) { struct bfq_queue *bfqq = RQ_BFQQ(rq); @@ -1872,6 +1893,7 @@ static void bfq_add_request(struct request *rq) struct request *next_rq, *prev; unsigned int old_wr_coeff = bfqq->wr_coeff; bool interactive = false; + u64 now_ns = ktime_get_ns(); bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq)); bfqq->queued[rq_is_sync(rq)]++; @@ -1934,7 +1956,7 @@ static void bfq_add_request(struct request *rq) */ if (bfqd->last_completed_rq_bfqq && !bfq_bfqq_has_short_ttime(bfqq) && - ktime_get_ns() - bfqd->last_completion < + now_ns - bfqd->last_completion < 4 * NSEC_PER_MSEC) { if (bfqd->last_completed_rq_bfqq != bfqq && bfqd->last_completed_rq_bfqq != @@ -2051,6 +2073,9 @@ static void bfq_add_request(struct request *rq) } } + if (bfq_bfqq_sync(bfqq)) + bfq_update_io_intensity(bfqq, now_ns); + elv_rb_add(&bfqq->sort_list, rq); /* @@ -2712,6 +2737,8 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) bic->saved_ttime = bfqq->ttime; bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq); bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); + bic->saved_io_start_time = bfqq->io_start_time; + bic->saved_tot_idle_time = bfqq->tot_idle_time; bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); if (unlikely(bfq_bfqq_just_created(bfqq) && @@ -3979,10 +4006,6 @@ void bfq_bfqq_expire(struct bfq_data *bfqd, bfq_bfqq_budget_left(bfqq) >= entity->budget / 3))) bfq_bfqq_charge_time(bfqd, bfqq, delta); - if (reason == BFQQE_TOO_IDLE && - entity->service <= 2 * entity->budget / 10) - bfq_clear_bfqq_IO_bound(bfqq); - if (bfqd->low_latency && bfqq->wr_coeff == 1) bfqq->last_wr_start_finish = jiffies; @@ -5085,6 +5108,8 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_io_cq *bic, pid_t pid, int is_sync) { + u64 now_ns = ktime_get_ns(); + RB_CLEAR_NODE(&bfqq->entity.rb_node); INIT_LIST_HEAD(&bfqq->fifo); INIT_HLIST_NODE(&bfqq->burst_list_node); @@ -5112,7 +5137,9 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfq_clear_bfqq_sync(bfqq); /* set end request to minus infinity from now */ - bfqq->ttime.last_end_request = ktime_get_ns() + 1; + bfqq->ttime.last_end_request = now_ns + 1; + + bfqq->io_start_time = now_ns; bfq_mark_bfqq_IO_bound(bfqq); @@ -6524,8 +6551,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfqd->bfq_slice_idle = bfq_slice_idle; bfqd->bfq_timeout = bfq_timeout; - bfqd->bfq_requests_within_timer = 120; - bfqd->bfq_large_burst_thresh = 8; bfqd->bfq_burst_interval = msecs_to_jiffies(180); diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 703895224562..c913b06016b3 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -291,6 +291,11 @@ struct bfq_queue { /* associated @bfq_ttime struct */ struct bfq_ttime ttime; + /* when bfqq started to do I/O within the last observation window */ + u64 io_start_time; + /* how long bfqq has remained empty during the last observ. window */ + u64 tot_idle_time; + /* bit vector: a 1 for each seeky requests in history */ u32 seek_history; @@ -407,6 +412,9 @@ struct bfq_io_cq { */ bool saved_IO_bound; + u64 saved_io_start_time; + u64 saved_tot_idle_time; + /* * Same purpose as the previous fields for the value of the * field keeping the queue's belonging to a large burst @@ -641,14 +649,6 @@ struct bfq_data { */ unsigned int bfq_timeout; - /* - * Number of consecutive requests that must be issued within - * the idle time slice to set again idling to a queue which - * was marked as non-I/O-bound (see the definition of the - * IO_bound flag for further details). - */ - unsigned int bfq_requests_within_timer; - /* * Force device idling whenever needed to provide accurate * service guarantees, without caring about throughput -- cgit v1.2.3-70-g09d2 From 7f1995c27b19060dbdff23442f375e3097c90707 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Mon, 25 Jan 2021 20:02:44 +0100 Subject: block, bfq: re-evaluate convenience of I/O plugging on rq arrivals Upon an I/O-dispatch attempt, BFQ may detect that it was better to plug I/O dispatch, and to wait for a new request to arrive for the currently in-service queue. But the arrival of a new request for an empty bfq_queue, and thus the switch from idle to busy of the bfq_queue, may cause the scenario to change, and make plugging no longer needed for service guarantees, or more convenient for throughput. In this case, keeping I/O-dispatch plugged would certainly lower throughput. To address this issue, this commit makes such a check, and stops plugging I/O if it is better to stop plugging I/O. Tested-by: Jan Kara Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 375e35c3d2fb..44c6433b5b25 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1649,6 +1649,8 @@ static bool bfq_bfqq_higher_class_or_weight(struct bfq_queue *bfqq, return bfqq_weight > in_serv_weight; } +static bool bfq_better_to_idle(struct bfq_queue *bfqq); + static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, struct bfq_queue *bfqq, int old_wr_coeff, @@ -1750,10 +1752,10 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, bfq_add_bfqq_busy(bfqd, bfqq); /* - * Expire in-service queue only if preemption may be needed - * for guarantees. In particular, we care only about two - * cases. The first is that bfqq has to recover a service - * hole, as explained in the comments on + * Expire in-service queue if preemption may be needed for + * guarantees or throughput. As for guarantees, we care + * explicitly about two cases. The first is that bfqq has to + * recover a service hole, as explained in the comments on * bfq_bfqq_update_budg_for_activation(), i.e., that * bfqq_wants_to_preempt is true. However, if bfqq does not * carry time-critical I/O, then bfqq's bandwidth is less @@ -1780,11 +1782,23 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, * timestamps of the in-service queue would need to be * updated, and this operation is quite costly (see the * comments on bfq_bfqq_update_budg_for_activation()). + * + * As for throughput, we ask bfq_better_to_idle() whether we + * still need to plug I/O dispatching. If bfq_better_to_idle() + * says no, then plugging is not needed any longer, either to + * boost throughput or to perserve service guarantees. Then + * the best option is to stop plugging I/O, as not doing so + * would certainly lower throughput. We may end up in this + * case if: (1) upon a dispatch attempt, we detected that it + * was better to plug I/O dispatch, and to wait for a new + * request to arrive for the currently in-service queue, but + * (2) this switch of bfqq to busy changes the scenario. */ if (bfqd->in_service_queue && ((bfqq_wants_to_preempt && bfqq->wr_coeff >= bfqd->in_service_queue->wr_coeff) || - bfq_bfqq_higher_class_or_weight(bfqq, bfqd->in_service_queue)) && + bfq_bfqq_higher_class_or_weight(bfqq, bfqd->in_service_queue) || + !bfq_better_to_idle(bfqd->in_service_queue)) && next_queue_may_preempt(bfqd)) bfq_bfqq_expire(bfqd, bfqd->in_service_queue, false, BFQQE_PREEMPTED); -- cgit v1.2.3-70-g09d2 From d1f600fa4732dac36c71a03b790f0c829a076475 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Mon, 25 Jan 2021 20:02:45 +0100 Subject: block, bfq: fix switch back from soft-rt weitgh-raising A bfq_queue may happen to be deemed as soft real-time while it is still enjoying interactive weight-raising. If this happens because of a false positive, then the bfq_queue is likely to loose its soft real-time status soon. Upon losing such a status, the bfq_queue must get back its interactive weight-raising, if its interactive period is not over yet. But this case is not handled. This commit corrects this error. Tested-by: Jan Kara Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 44c6433b5b25..170aa0ccc121 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5290,8 +5290,26 @@ bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (bfqq->wr_coeff > 1 && bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && - BFQQ_TOTALLY_SEEKY(bfqq)) - bfq_bfqq_end_wr(bfqq); + BFQQ_TOTALLY_SEEKY(bfqq)) { + if (time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt + + bfq_wr_duration(bfqd))) { + /* + * In soft_rt weight raising with the + * interactive-weight-raising period + * elapsed (so no switch back to + * interactive weight raising). + */ + bfq_bfqq_end_wr(bfqq); + } else { /* + * stopping soft_rt weight raising + * while still in interactive period, + * switch back to interactive weight + * raising + */ + switch_back_to_interactive_wr(bfqq, bfqd); + bfqq->entity.prio_changed = 1; + } + } } static void bfq_update_has_short_ttime(struct bfq_data *bfqd, -- cgit v1.2.3-70-g09d2 From e673914d52f913584cc4c454dfcff2e8eb04533f Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Mon, 25 Jan 2021 20:02:46 +0100 Subject: block, bfq: save also weight-raised service on queue merging To prevent weight-raising information from being lost on bfq_queue merging, also the amount of service that a bfq_queue receives must be saved and restored when the bfq_queue is merged and split, respectively. Tested-by: Jan Kara Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 ++ block/bfq-iosched.h | 1 + 2 files changed, 3 insertions(+) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 170aa0ccc121..5d48cba07cb5 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1029,6 +1029,7 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, bfqq->io_start_time = bic->saved_io_start_time; bfqq->tot_idle_time = bic->saved_tot_idle_time; bfqq->wr_coeff = bic->saved_wr_coeff; + bfqq->service_from_wr = bic->saved_service_from_wr; bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish; bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; @@ -2775,6 +2776,7 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) bic->saved_wr_coeff = bfqq->wr_coeff; bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; + bic->saved_service_from_wr = bfqq->service_from_wr; bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; } diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index c913b06016b3..d15299d59f89 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -440,6 +440,7 @@ struct bfq_io_cq { */ unsigned long saved_wr_coeff; unsigned long saved_last_wr_start_finish; + unsigned long saved_service_from_wr; unsigned long saved_wr_start_at_switch_to_srt; unsigned int saved_wr_cur_max_time; struct bfq_ttime saved_ttime; -- cgit v1.2.3-70-g09d2 From 5a5436b98d5cd2714feaaa579cec49dd7f7057bb Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Mon, 25 Jan 2021 20:02:47 +0100 Subject: block, bfq: save also injection state on queue merging To prevent injection information from being lost on bfq_queue merging, also the amount of service that a bfq_queue receives must be saved and restored when the bfq_queue is merged and split, respectively. Tested-by: Jan Kara Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 8 ++++++++ block/bfq-iosched.h | 5 +++++ 2 files changed, 13 insertions(+) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 5d48cba07cb5..79d232d41027 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1024,6 +1024,10 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, else bfq_clear_bfqq_IO_bound(bfqq); + bfqq->last_serv_time_ns = bic->saved_last_serv_time_ns; + bfqq->inject_limit = bic->saved_inject_limit; + bfqq->decrease_time_jif = bic->saved_decrease_time_jif; + bfqq->entity.new_weight = bic->saved_weight; bfqq->ttime = bic->saved_ttime; bfqq->io_start_time = bic->saved_io_start_time; @@ -2748,6 +2752,10 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) if (!bic) return; + bic->saved_last_serv_time_ns = bfqq->last_serv_time_ns; + bic->saved_inject_limit = bfqq->inject_limit; + bic->saved_decrease_time_jif = bfqq->decrease_time_jif; + bic->saved_weight = bfqq->entity.orig_weight; bic->saved_ttime = bfqq->ttime; bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq); diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index d15299d59f89..3f350fa3c5fd 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -444,6 +444,11 @@ struct bfq_io_cq { unsigned long saved_wr_start_at_switch_to_srt; unsigned int saved_wr_cur_max_time; struct bfq_ttime saved_ttime; + + /* Save also injection state */ + u64 saved_last_serv_time_ns; + unsigned int saved_inject_limit; + unsigned long saved_decrease_time_jif; }; /** -- cgit v1.2.3-70-g09d2 From 71217df39dc67a0aeed83352b0d712b7892036a2 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Mon, 25 Jan 2021 20:02:48 +0100 Subject: block, bfq: make waker-queue detection more robust In the presence of many parallel I/O flows, the detection of waker bfq_queues suffers from false positives. This commits addresses this issue by making the filtering of actual wakers more selective. In more detail, a candidate waker must be found to meet waker requirements three times before being promoted to actual waker. Tested-by: Jan Kara Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 211 +++++++++++++++++++++++++--------------------------- block/bfq-iosched.h | 7 +- 2 files changed, 108 insertions(+), 110 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 79d232d41027..eaeda18cb8c8 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -158,7 +158,6 @@ BFQ_BFQQ_FNS(in_large_burst); BFQ_BFQQ_FNS(coop); BFQ_BFQQ_FNS(split_coop); BFQ_BFQQ_FNS(softrt_update); -BFQ_BFQQ_FNS(has_waker); #undef BFQ_BFQQ_FNS \ /* Expiration time of sync (0) and async (1) requests, in ns. */ @@ -1905,6 +1904,107 @@ static void bfq_update_io_intensity(struct bfq_queue *bfqq, u64 now_ns) } } +/* + * Detect whether bfqq's I/O seems synchronized with that of some + * other queue, i.e., whether bfqq, after remaining empty, happens to + * receive new I/O only right after some I/O request of the other + * queue has been completed. We call waker queue the other queue, and + * we assume, for simplicity, that bfqq may have at most one waker + * queue. + * + * A remarkable throughput boost can be reached by unconditionally + * injecting the I/O of the waker queue, every time a new + * bfq_dispatch_request happens to be invoked while I/O is being + * plugged for bfqq. In addition to boosting throughput, this + * unblocks bfqq's I/O, thereby improving bandwidth and latency for + * bfqq. Note that these same results may be achieved with the general + * injection mechanism, but less effectively. For details on this + * aspect, see the comments on the choice of the queue for injection + * in bfq_select_queue(). + * + * Turning back to the detection of a waker queue, a queue Q is deemed + * as a waker queue for bfqq if, for three consecutive times, bfqq + * happens to become non empty right after a request of Q has been + * completed. In particular, on the first time, Q is tentatively set + * as a candidate waker queue, while on the third consecutive time + * that Q is detected, the field waker_bfqq is set to Q, to confirm + * that Q is a waker queue for bfqq. These detection steps are + * performed only if bfqq has a long think time, so as to make it more + * likely that bfqq's I/O is actually being blocked by a + * synchronization. This last filter, plus the above three-times + * requirement, make false positives less likely. + * + * NOTE + * + * The sooner a waker queue is detected, the sooner throughput can be + * boosted by injecting I/O from the waker queue. Fortunately, + * detection is likely to be actually fast, for the following + * reasons. While blocked by synchronization, bfqq has a long think + * time. This implies that bfqq's inject limit is at least equal to 1 + * (see the comments in bfq_update_inject_limit()). So, thanks to + * injection, the waker queue is likely to be served during the very + * first I/O-plugging time interval for bfqq. This triggers the first + * step of the detection mechanism. Thanks again to injection, the + * candidate waker queue is then likely to be confirmed no later than + * during the next I/O-plugging interval for bfqq. + * + * ISSUE + * + * On queue merging all waker information is lost. + */ +void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, u64 now_ns) +{ + if (!bfqd->last_completed_rq_bfqq || + bfqd->last_completed_rq_bfqq == bfqq || + bfq_bfqq_has_short_ttime(bfqq) || + now_ns - bfqd->last_completion >= 4 * NSEC_PER_MSEC || + bfqd->last_completed_rq_bfqq == bfqq->waker_bfqq) + return; + + if (bfqd->last_completed_rq_bfqq != + bfqq->tentative_waker_bfqq) { + /* + * First synchronization detected with a + * candidate waker queue, or with a different + * candidate waker queue from the current one. + */ + bfqq->tentative_waker_bfqq = + bfqd->last_completed_rq_bfqq; + bfqq->num_waker_detections = 1; + } else /* Same tentative waker queue detected again */ + bfqq->num_waker_detections++; + + if (bfqq->num_waker_detections == 3) { + bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq; + bfqq->tentative_waker_bfqq = NULL; + + /* + * If the waker queue disappears, then + * bfqq->waker_bfqq must be reset. To + * this goal, we maintain in each + * waker queue a list, woken_list, of + * all the queues that reference the + * waker queue through their + * waker_bfqq pointer. When the waker + * queue exits, the waker_bfqq pointer + * of all the queues in the woken_list + * is reset. + * + * In addition, if bfqq is already in + * the woken_list of a waker queue, + * then, before being inserted into + * the woken_list of a new waker + * queue, bfqq must be removed from + * the woken_list of the old waker + * queue. + */ + if (!hlist_unhashed(&bfqq->woken_list_node)) + hlist_del_init(&bfqq->woken_list_node); + hlist_add_head(&bfqq->woken_list_node, + &bfqd->last_completed_rq_bfqq->woken_list); + } +} + static void bfq_add_request(struct request *rq) { struct bfq_queue *bfqq = RQ_BFQQ(rq); @@ -1919,111 +2019,7 @@ static void bfq_add_request(struct request *rq) bfqd->queued++; if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_sync(bfqq)) { - /* - * Detect whether bfqq's I/O seems synchronized with - * that of some other queue, i.e., whether bfqq, after - * remaining empty, happens to receive new I/O only - * right after some I/O request of the other queue has - * been completed. We call waker queue the other - * queue, and we assume, for simplicity, that bfqq may - * have at most one waker queue. - * - * A remarkable throughput boost can be reached by - * unconditionally injecting the I/O of the waker - * queue, every time a new bfq_dispatch_request - * happens to be invoked while I/O is being plugged - * for bfqq. In addition to boosting throughput, this - * unblocks bfqq's I/O, thereby improving bandwidth - * and latency for bfqq. Note that these same results - * may be achieved with the general injection - * mechanism, but less effectively. For details on - * this aspect, see the comments on the choice of the - * queue for injection in bfq_select_queue(). - * - * Turning back to the detection of a waker queue, a - * queue Q is deemed as a waker queue for bfqq if, for - * two consecutive times, bfqq happens to become non - * empty right after a request of Q has been - * completed. In particular, on the first time, Q is - * tentatively set as a candidate waker queue, while - * on the second time, the flag - * bfq_bfqq_has_waker(bfqq) is set to confirm that Q - * is a waker queue for bfqq. These detection steps - * are performed only if bfqq has a long think time, - * so as to make it more likely that bfqq's I/O is - * actually being blocked by a synchronization. This - * last filter, plus the above two-times requirement, - * make false positives less likely. - * - * NOTE - * - * The sooner a waker queue is detected, the sooner - * throughput can be boosted by injecting I/O from the - * waker queue. Fortunately, detection is likely to be - * actually fast, for the following reasons. While - * blocked by synchronization, bfqq has a long think - * time. This implies that bfqq's inject limit is at - * least equal to 1 (see the comments in - * bfq_update_inject_limit()). So, thanks to - * injection, the waker queue is likely to be served - * during the very first I/O-plugging time interval - * for bfqq. This triggers the first step of the - * detection mechanism. Thanks again to injection, the - * candidate waker queue is then likely to be - * confirmed no later than during the next - * I/O-plugging interval for bfqq. - */ - if (bfqd->last_completed_rq_bfqq && - !bfq_bfqq_has_short_ttime(bfqq) && - now_ns - bfqd->last_completion < - 4 * NSEC_PER_MSEC) { - if (bfqd->last_completed_rq_bfqq != bfqq && - bfqd->last_completed_rq_bfqq != - bfqq->waker_bfqq) { - /* - * First synchronization detected with - * a candidate waker queue, or with a - * different candidate waker queue - * from the current one. - */ - bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq; - - /* - * If the waker queue disappears, then - * bfqq->waker_bfqq must be reset. To - * this goal, we maintain in each - * waker queue a list, woken_list, of - * all the queues that reference the - * waker queue through their - * waker_bfqq pointer. When the waker - * queue exits, the waker_bfqq pointer - * of all the queues in the woken_list - * is reset. - * - * In addition, if bfqq is already in - * the woken_list of a waker queue, - * then, before being inserted into - * the woken_list of a new waker - * queue, bfqq must be removed from - * the woken_list of the old waker - * queue. - */ - if (!hlist_unhashed(&bfqq->woken_list_node)) - hlist_del_init(&bfqq->woken_list_node); - hlist_add_head(&bfqq->woken_list_node, - &bfqd->last_completed_rq_bfqq->woken_list); - - bfq_clear_bfqq_has_waker(bfqq); - } else if (bfqd->last_completed_rq_bfqq == - bfqq->waker_bfqq && - !bfq_bfqq_has_waker(bfqq)) { - /* - * synchronization with waker_bfqq - * seen for the second time - */ - bfq_mark_bfqq_has_waker(bfqq); - } - } + bfq_check_waker(bfqd, bfqq, now_ns); /* * Periodically reset inject limit, to make sure that @@ -4569,7 +4565,7 @@ check_queue: bfq_serv_to_charge(async_bfqq->next_rq, async_bfqq) <= bfq_bfqq_budget_left(async_bfqq)) bfqq = bfqq->bic->bfqq[0]; - else if (bfq_bfqq_has_waker(bfqq) && + else if (bfqq->waker_bfqq && bfq_bfqq_busy(bfqq->waker_bfqq) && bfqq->waker_bfqq->next_rq && bfq_serv_to_charge(bfqq->waker_bfqq->next_rq, @@ -4973,7 +4969,6 @@ void bfq_put_queue(struct bfq_queue *bfqq) hlist_for_each_entry_safe(item, n, &bfqq->woken_list, woken_list_node) { item->waker_bfqq = NULL; - bfq_clear_bfqq_has_waker(item); hlist_del_init(&item->woken_list_node); } diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 3f350fa3c5fd..b8e793c34ff1 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -376,6 +376,11 @@ struct bfq_queue { * bfq_select_queue(). */ struct bfq_queue *waker_bfqq; + /* pointer to the curr. tentative waker queue, see bfq_check_waker() */ + struct bfq_queue *tentative_waker_bfqq; + /* number of times the same tentative waker has been detected */ + unsigned int num_waker_detections; + /* node for woken_list, see below */ struct hlist_node woken_list_node; /* @@ -776,7 +781,6 @@ enum bfqq_state_flags { */ BFQQF_coop, /* bfqq is shared */ BFQQF_split_coop, /* shared bfqq will be split */ - BFQQF_has_waker /* bfqq has a waker queue */ }; #define BFQ_BFQQ_FNS(name) \ @@ -796,7 +800,6 @@ BFQ_BFQQ_FNS(in_large_burst); BFQ_BFQQ_FNS(coop); BFQ_BFQQ_FNS(split_coop); BFQ_BFQQ_FNS(softrt_update); -BFQ_BFQQ_FNS(has_waker); #undef BFQ_BFQQ_FNS /* Expiration reasons. */ -- cgit v1.2.3-70-g09d2 From a5bf0a92e1b8282c93018383b2526ca59602dd08 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 25 Jan 2021 21:15:01 -0700 Subject: bfq: bfq_check_waker() should be static It's only used in the same file, mark is appropriately static. Fixes: 71217df39dc6 ("block, bfq: make waker-queue detection more robust") Reported-by: kernel test robot Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index eaeda18cb8c8..23e293d2943c 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1952,7 +1952,8 @@ static void bfq_update_io_intensity(struct bfq_queue *bfqq, u64 now_ns) * * On queue merging all waker information is lost. */ -void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, u64 now_ns) +static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, + u64 now_ns) { if (!bfqd->last_completed_rq_bfqq || bfqd->last_completed_rq_bfqq == bfqq || -- cgit v1.2.3-70-g09d2 From a7c7f7b2b641bef52212fbe8be4a66ede043d3c7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Jan 2021 15:33:06 +0100 Subject: nvme: use bio_set_dev to assign ->bi_bdev Always use the bio_set_dev helper to assign ->bi_bdev to make sure other state related to the device is uptodate. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 2 +- drivers/nvme/host/lightnvm.c | 2 +- drivers/nvme/host/multipath.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index eb7963fb167b..ba5df80881ea 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1133,7 +1133,7 @@ static int nvme_submit_user_cmd(struct request_queue *q, if (ret) goto out; bio = req->bio; - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); if (bdev && meta_buffer && meta_len) { meta = nvme_add_user_metadata(bio, meta_buffer, meta_len, meta_seed, write); diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index ec38128f51e9..b705988629f2 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -816,7 +816,7 @@ static int nvme_nvm_submit_user_cmd(struct request_queue *q, vcmd->ph_rw.metadata = cpu_to_le64(metadata_dma); } - bio->bi_bdev = ns->disk->part0; + bio_set_dev(bio, ns->disk->part0); } blk_execute_rq(NULL, rq, 0); diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index a6d44e7a775f..65bd6efa5e1c 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -312,7 +312,7 @@ blk_qc_t nvme_ns_head_submit_bio(struct bio *bio) srcu_idx = srcu_read_lock(&head->srcu); ns = nvme_find_path(head); if (likely(ns)) { - bio->bi_bdev = ns->disk->part0; + bio_set_dev(bio, ns->disk->part0); bio->bi_opf |= REQ_NVME_MPATH; trace_block_bio_remap(bio, disk_devt(ns->head->disk), bio->bi_iter.bi_sector); @@ -352,7 +352,7 @@ static void nvme_requeue_work(struct work_struct *work) * Reset disk to the mpath node and resubmit to select a new * path. */ - bio->bi_bdev = head->disk->part0; + bio_set_dev(bio, head->disk->part0); submit_bio_noacct(bio); } } -- cgit v1.2.3-70-g09d2 From f65b95fe0cedc1be2ec33a2892ee43fae0408719 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Jan 2021 15:33:07 +0100 Subject: bcache: use bio_set_dev to assign ->bi_bdev Always use the bio_set_dev helper to assign ->bi_bdev to make sure other state related to the device is uptodate. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/md/bcache/debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 058dd8014428..63e809f38e3f 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -114,7 +114,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) check = bio_kmalloc(GFP_NOIO, bio_segments(bio)); if (!check) return; - check->bi_bdev = bio->bi_bdev; + bio_set_dev(check, bio->bi_bdev); check->bi_opf = REQ_OP_READ; check->bi_iter.bi_sector = bio->bi_iter.bi_sector; check->bi_iter.bi_size = bio->bi_iter.bi_size; -- cgit v1.2.3-70-g09d2 From 46bbf653a67a36989a55dbb894c8b94c5ecb2858 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Jan 2021 15:33:08 +0100 Subject: block: inherit BIO_REMAPPED when cloning bios Cloned bios are can be used to on the same device, in which case we need to inherit the BIO_REMAPPED flag to avoid a double partition remap. When the cloned bios are used on another device, bio_set_dev will clear the flag. Fixes: 309dca309fc3 ("block: store a block_device pointer in struct bio") Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 2 ++ block/blk-crypto-fallback.c | 2 ++ block/bounce.c | 2 ++ 3 files changed, 6 insertions(+) diff --git a/block/bio.c b/block/bio.c index 99040a7e6656..dfd7740a3230 100644 --- a/block/bio.c +++ b/block/bio.c @@ -666,6 +666,8 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src) bio_set_flag(bio, BIO_CLONED); if (bio_flagged(bio_src, BIO_THROTTLED)) bio_set_flag(bio, BIO_THROTTLED); + if (bio_flagged(bio_src, BIO_REMAPPED)) + bio_set_flag(bio, BIO_REMAPPED); bio->bi_opf = bio_src->bi_opf; bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_write_hint = bio_src->bi_write_hint; diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index 8f1e18176731..50c225398e4d 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -168,6 +168,8 @@ static struct bio *blk_crypto_clone_bio(struct bio *bio_src) if (!bio) return NULL; bio->bi_bdev = bio_src->bi_bdev; + if (bio_flagged(bio_src, BIO_REMAPPED)) + bio_set_flag(bio, BIO_REMAPPED); bio->bi_opf = bio_src->bi_opf; bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_write_hint = bio_src->bi_write_hint; diff --git a/block/bounce.c b/block/bounce.c index a22a8a1942b2..fc55314aa426 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -247,6 +247,8 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask, if (!bio) return NULL; bio->bi_bdev = bio_src->bi_bdev; + if (bio_flagged(bio_src, BIO_REMAPPED)) + bio_set_flag(bio, BIO_REMAPPED); bio->bi_opf = bio_src->bi_opf; bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_write_hint = bio_src->bi_write_hint; -- cgit v1.2.3-70-g09d2 From 767630c63bb23acf022adb265574996ca39a4645 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 7 Jan 2021 16:40:34 +0100 Subject: bdev: Do not return EBUSY if bdev discard races with write blkdev_fallocate() tries to detect whether a discard raced with an overlapping write by calling invalidate_inode_pages2_range(). However this check can give both false negatives (when writing using direct IO or when writeback already writes out the written pagecache range) and false positives (when write is not actually overlapping but ends in the same page when blocksize < pagesize). This actually causes issues for qemu which is getting confused by EBUSY errors. Fix the problem by removing this conflicting write detection since it is inherently racy and thus of little use anyway. Reported-by: Maxim Levitsky CC: "Darrick J. Wong" Link: https://lore.kernel.org/qemu-devel/20201111153913.41840-1-mlevitsk@redhat.com Signed-off-by: Jan Kara Reviewed-by: Maxim Levitsky Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/block_dev.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 6f5bd9950baf..289c3dd923a4 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1801,13 +1801,11 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, return error; /* - * Invalidate again; if someone wandered in and dirtied a page, - * the caller will be given -EBUSY. The third argument is - * inclusive, so the rounding here is safe. + * Invalidate the page cache again; if someone wandered in and dirtied + * a page, we just discard it - userspace has no way of knowing whether + * the write happened before or after discard completing... */ - return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping, - start >> PAGE_SHIFT, - end >> PAGE_SHIFT); + return truncate_bdev_range(bdev, file->f_mode, start, end); } const struct file_operations def_blk_fops = { -- cgit v1.2.3-70-g09d2 From 482e302a61f1fc62b0e13be20bc7a11a91b5832d Mon Sep 17 00:00:00 2001 From: Lei Chen Date: Mon, 25 Jan 2021 19:27:04 +0800 Subject: blk: wbt: remove unused parameter from wbt_should_throttle The first parameter rwb is not used for this function. So just remove it. Signed-off-by: Lei Chen Signed-off-by: Jens Axboe --- block/blk-wbt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 0321ca83e73f..42aed0160f86 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -518,7 +518,7 @@ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct, rq_qos_wait(rqw, &data, wbt_inflight_cb, wbt_cleanup_cb); } -static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio) +static inline bool wbt_should_throttle(struct bio *bio) { switch (bio_op(bio)) { case REQ_OP_WRITE: @@ -545,7 +545,7 @@ static enum wbt_flags bio_to_wbt_flags(struct rq_wb *rwb, struct bio *bio) if (bio_op(bio) == REQ_OP_READ) { flags = WBT_READ; - } else if (wbt_should_throttle(rwb, bio)) { + } else if (wbt_should_throttle(bio)) { if (current_is_kswapd()) flags |= WBT_KSWAPD; if (bio_op(bio) == REQ_OP_DISCARD) -- cgit v1.2.3-70-g09d2 From 2c2b9fd6b496b3616e9b9537ea0258b3040914f3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 9 Jan 2021 12:13:32 +0100 Subject: block: unexport truncate_bdev_range truncate_bdev_range is only used in always built-in block layer code, so remove the export and the !CONFIG_BLOCK stub. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/block_dev.c | 1 - include/linux/blkdev.h | 9 ++------- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 289c3dd923a4..c1fe29dac485 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -126,7 +126,6 @@ int truncate_bdev_range(struct block_device *bdev, fmode_t mode, bd_abort_claiming(bdev, truncate_bdev_range); return 0; } -EXPORT_SYMBOL(truncate_bdev_range); static void set_init_blocksize(struct block_device *bdev) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 20f3706b6b2e..2491e17b61c4 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1999,21 +1999,16 @@ void bdev_add(struct block_device *bdev, dev_t dev); struct block_device *I_BDEV(struct inode *inode); struct block_device *bdgrab(struct block_device *bdev); void bdput(struct block_device *); +int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart, + loff_t lend); #ifdef CONFIG_BLOCK void invalidate_bdev(struct block_device *bdev); -int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart, - loff_t lend); int sync_blockdev(struct block_device *bdev); #else static inline void invalidate_bdev(struct block_device *bdev) { } -static inline int truncate_bdev_range(struct block_device *bdev, fmode_t mode, - loff_t lstart, loff_t lend) -{ - return 0; -} static inline int sync_blockdev(struct block_device *bdev) { return 0; -- cgit v1.2.3-70-g09d2 From 49d1822bc05e702be1665ffc2092ec5711e77491 Mon Sep 17 00:00:00 2001 From: Chunguang Xu Date: Mon, 25 Jan 2021 13:05:28 +0800 Subject: blkcg: delete redundant get/put operations for queue When calling blkcg_schedule_throttle(), for the same queue, redundant get/put operations can be removed. Signed-off-by: Chunguang Xu Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 3465d6ee708e..02ce2058c14b 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1757,12 +1757,15 @@ void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) if (unlikely(current->flags & PF_KTHREAD)) return; - if (!blk_get_queue(q)) - return; + if (current->throttle_queue != q) { + if (!blk_get_queue(q)) + return; + + if (current->throttle_queue) + blk_put_queue(current->throttle_queue); + current->throttle_queue = q; + } - if (current->throttle_queue) - blk_put_queue(current->throttle_queue); - current->throttle_queue = q; if (use_memdelay) current->use_memdelay = use_memdelay; set_notify_resume(current); -- cgit v1.2.3-70-g09d2 From 41e76c85660c022c6bf5713bfb6c21e64a487cec Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 5 Jun 2020 16:16:16 +0200 Subject: bfq: Avoid false bfq queue merging bfq_setup_cooperator() uses bfqd->in_serv_last_pos so detect whether it makes sense to merge current bfq queue with the in-service queue. However if the in-service queue is freshly scheduled and didn't dispatch any requests yet, bfqd->in_serv_last_pos is stale and contains value from the previously scheduled bfq queue which can thus result in a bogus decision that the two queues should be merged. This bug can be observed for example with the following fio jobfile: [global] direct=0 ioengine=sync invalidate=1 size=1g rw=read [reader] numjobs=4 directory=/mnt where the 4 processes will end up in the one shared bfq queue although they do IO to physically very distant files (for some reason I was able to observe this only with slice_idle=1ms setting). Fix the problem by invalidating bfqd->in_serv_last_pos when switching in-service queue. Fixes: 058fdecc6de7 ("block, bfq: fix in-service-queue check for queue merging") CC: stable@vger.kernel.org Signed-off-by: Jan Kara Acked-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 23e293d2943c..4157cfe99ae2 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -3007,6 +3007,7 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd, } bfqd->in_service_queue = bfqq; + bfqd->in_serv_last_pos = 0; } /* -- cgit v1.2.3-70-g09d2 From 28c6def009192b673f92ea357dfb535ba15e00a4 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 5 Jun 2020 16:16:17 +0200 Subject: bfq: Use 'ttime' local variable Use local variable 'ttime' instead of dereferencing bfqq. Signed-off-by: Jan Kara Acked-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 4157cfe99ae2..a0471ff97120 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5282,7 +5282,7 @@ static void bfq_update_io_thinktime(struct bfq_data *bfqd, elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle); - ttime->ttime_samples = (7*bfqq->ttime.ttime_samples + 256) / 8; + ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8; ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); ttime->ttime_mean = div64_ul(ttime->ttime_total + 128, ttime->ttime_samples); -- cgit v1.2.3-70-g09d2 From 7684fbde45169e6de15c180b1c084d2005e99961 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 5 Jun 2020 16:16:18 +0200 Subject: bfq: Use only idle IO periods for think time calculations Currently whenever bfq queue has a request queued we add now - last_completion_time to the think time statistics. This is however misleading in case the process is able to submit several requests in parallel because e.g. if the queue has request completed at time T0 and then queues new requests at times T1, T2, then we will add T1-T0 and T2-T0 to think time statistics which just doesn't make any sence (the queue's think time is penalized by the queue being able to submit more IO). So add to think time statistics only time intervals when the queue had no IO pending. Signed-off-by: Jan Kara Acked-by: Paolo Valente [axboe: fix whitespace on empty line] Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index a0471ff97120..dfa87e360d71 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5278,8 +5278,16 @@ static void bfq_update_io_thinktime(struct bfq_data *bfqd, struct bfq_queue *bfqq) { struct bfq_ttime *ttime = &bfqq->ttime; - u64 elapsed = ktime_get_ns() - bfqq->ttime.last_end_request; + u64 elapsed; + /* + * We are really interested in how long it takes for the queue to + * become busy when there is no outstanding IO for this queue. So + * ignore cases when the bfq queue has already IO queued. + */ + if (bfqq->dispatched || bfq_bfqq_busy(bfqq)) + return; + elapsed = ktime_get_ns() - bfqq->ttime.last_end_request; elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle); ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8; -- cgit v1.2.3-70-g09d2 From f91ca2a370bec58eb3d54315b5cfa3a2a9288acc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Jan 2021 15:52:31 +0100 Subject: zonefs: use bio_alloc in zonefs_file_dio_append Use bio_alloc instead of open coding it. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Acked-by: Damien Le Moal Signed-off-by: Jens Axboe --- fs/zonefs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index bec47f2d074b..faea2ed34b4a 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -678,7 +678,7 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) if (!nr_pages) return 0; - bio = bio_alloc_bioset(GFP_NOFS, nr_pages, &fs_bio_set); + bio = bio_alloc(GFP_NOFS, nr_pages); if (!bio) return -ENOMEM; -- cgit v1.2.3-70-g09d2 From 616c6a6884e273349cda19483dfd7f5b7fd3da52 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Jan 2021 15:52:32 +0100 Subject: btrfs: use bio_kmalloc in __alloc_device Use bio_kmalloc instead of open coding it. Signed-off-by: Christoph Hellwig Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Acked-by: Damien Le Moal Signed-off-by: Jens Axboe --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0a6de859eb22..584ba093cf49 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -421,7 +421,7 @@ static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info) * Preallocate a bio that's always going to be used for flushing device * barriers and matches the device lifespan */ - dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL); + dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0); if (!dev->flush_bio) { kfree(dev); return ERR_PTR(-ENOMEM); -- cgit v1.2.3-70-g09d2 From 4eb1d689045552eb966ebf25efbc3ce648797d96 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Jan 2021 15:52:33 +0100 Subject: blk-crypto: use bio_kmalloc in blk_crypto_clone_bio Use bio_kmalloc instead of open coding it. Signed-off-by: Christoph Hellwig Reviewed-by: Eric Biggers Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Acked-by: Damien Le Moal Signed-off-by: Jens Axboe --- block/blk-crypto-fallback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index 50c225398e4d..e8327c50d7c9 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -164,7 +164,7 @@ static struct bio *blk_crypto_clone_bio(struct bio *bio_src) struct bio_vec bv; struct bio *bio; - bio = bio_alloc_bioset(GFP_NOIO, bio_segments(bio_src), NULL); + bio = bio_kmalloc(GFP_NOIO, bio_segments(bio_src)); if (!bio) return NULL; bio->bi_bdev = bio_src->bi_bdev; -- cgit v1.2.3-70-g09d2 From 3175199ab0ac8c874ec25c6bf169f74888917435 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Jan 2021 15:52:34 +0100 Subject: block: split bio_kmalloc from bio_alloc_bioset bio_kmalloc shares almost no logic with the bio_set based fast path in bio_alloc_bioset. Split it into an entirely separate implementation. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Acked-by: Damien Le Moal Signed-off-by: Jens Axboe --- block/bio.c | 167 ++++++++++++++++++++++++++-------------------------- include/linux/bio.h | 6 +- 2 files changed, 86 insertions(+), 87 deletions(-) diff --git a/block/bio.c b/block/bio.c index dfd7740a3230..d4375619348c 100644 --- a/block/bio.c +++ b/block/bio.c @@ -396,123 +396,101 @@ static void punt_bios_to_rescuer(struct bio_set *bs) * @nr_iovecs: number of iovecs to pre-allocate * @bs: the bio_set to allocate from. * - * Description: - * If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is - * backed by the @bs's mempool. + * Allocate a bio from the mempools in @bs. * - * When @bs is not NULL, if %__GFP_DIRECT_RECLAIM is set then bio_alloc will - * always be able to allocate a bio. This is due to the mempool guarantees. - * To make this work, callers must never allocate more than 1 bio at a time - * from this pool. Callers that need to allocate more than 1 bio must always - * submit the previously allocated bio for IO before attempting to allocate - * a new one. Failure to do so can cause deadlocks under memory pressure. + * If %__GFP_DIRECT_RECLAIM is set then bio_alloc will always be able to + * allocate a bio. This is due to the mempool guarantees. To make this work, + * callers must never allocate more than 1 bio at a time from the general pool. + * Callers that need to allocate more than 1 bio must always submit the + * previously allocated bio for IO before attempting to allocate a new one. + * Failure to do so can cause deadlocks under memory pressure. * - * Note that when running under submit_bio_noacct() (i.e. any block - * driver), bios are not submitted until after you return - see the code in - * submit_bio_noacct() that converts recursion into iteration, to prevent - * stack overflows. + * Note that when running under submit_bio_noacct() (i.e. any block driver), + * bios are not submitted until after you return - see the code in + * submit_bio_noacct() that converts recursion into iteration, to prevent + * stack overflows. * - * This would normally mean allocating multiple bios under - * submit_bio_noacct() would be susceptible to deadlocks, but we have - * deadlock avoidance code that resubmits any blocked bios from a rescuer - * thread. + * This would normally mean allocating multiple bios under submit_bio_noacct() + * would be susceptible to deadlocks, but we have + * deadlock avoidance code that resubmits any blocked bios from a rescuer + * thread. * - * However, we do not guarantee forward progress for allocations from other - * mempools. Doing multiple allocations from the same mempool under - * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad - * for per bio allocations. + * However, we do not guarantee forward progress for allocations from other + * mempools. Doing multiple allocations from the same mempool under + * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad + * for per bio allocations. * - * RETURNS: - * Pointer to new bio on success, NULL on failure. + * Returns: Pointer to new bio on success, NULL on failure. */ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs, struct bio_set *bs) { gfp_t saved_gfp = gfp_mask; - unsigned front_pad; - unsigned inline_vecs; - struct bio_vec *bvl = NULL; struct bio *bio; void *p; - if (!bs) { - if (nr_iovecs > UIO_MAXIOV) - return NULL; - - p = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask); - front_pad = 0; - inline_vecs = nr_iovecs; - } else { - /* should not use nobvec bioset for nr_iovecs > 0 */ - if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && - nr_iovecs > 0)) - return NULL; - /* - * submit_bio_noacct() converts recursion to iteration; this - * means if we're running beneath it, any bios we allocate and - * submit will not be submitted (and thus freed) until after we - * return. - * - * This exposes us to a potential deadlock if we allocate - * multiple bios from the same bio_set() while running - * underneath submit_bio_noacct(). If we were to allocate - * multiple bios (say a stacking block driver that was splitting - * bios), we would deadlock if we exhausted the mempool's - * reserve. - * - * We solve this, and guarantee forward progress, with a rescuer - * workqueue per bio_set. If we go to allocate and there are - * bios on current->bio_list, we first try the allocation - * without __GFP_DIRECT_RECLAIM; if that fails, we punt those - * bios we would be blocking to the rescuer workqueue before - * we retry with the original gfp_flags. - */ - - if (current->bio_list && - (!bio_list_empty(¤t->bio_list[0]) || - !bio_list_empty(¤t->bio_list[1])) && - bs->rescue_workqueue) - gfp_mask &= ~__GFP_DIRECT_RECLAIM; + /* should not use nobvec bioset for nr_iovecs > 0 */ + if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_iovecs > 0)) + return NULL; + /* + * submit_bio_noacct() converts recursion to iteration; this means if + * we're running beneath it, any bios we allocate and submit will not be + * submitted (and thus freed) until after we return. + * + * This exposes us to a potential deadlock if we allocate multiple bios + * from the same bio_set() while running underneath submit_bio_noacct(). + * If we were to allocate multiple bios (say a stacking block driver + * that was splitting bios), we would deadlock if we exhausted the + * mempool's reserve. + * + * We solve this, and guarantee forward progress, with a rescuer + * workqueue per bio_set. If we go to allocate and there are bios on + * current->bio_list, we first try the allocation without + * __GFP_DIRECT_RECLAIM; if that fails, we punt those bios we would be + * blocking to the rescuer workqueue before we retry with the original + * gfp_flags. + */ + if (current->bio_list && + (!bio_list_empty(¤t->bio_list[0]) || + !bio_list_empty(¤t->bio_list[1])) && + bs->rescue_workqueue) + gfp_mask &= ~__GFP_DIRECT_RECLAIM; + + p = mempool_alloc(&bs->bio_pool, gfp_mask); + if (!p && gfp_mask != saved_gfp) { + punt_bios_to_rescuer(bs); + gfp_mask = saved_gfp; p = mempool_alloc(&bs->bio_pool, gfp_mask); - if (!p && gfp_mask != saved_gfp) { - punt_bios_to_rescuer(bs); - gfp_mask = saved_gfp; - p = mempool_alloc(&bs->bio_pool, gfp_mask); - } - - front_pad = bs->front_pad; - inline_vecs = BIO_INLINE_VECS; } - if (unlikely(!p)) return NULL; - bio = p + front_pad; - bio_init(bio, NULL, 0); - - if (nr_iovecs > inline_vecs) { + bio = p + bs->front_pad; + if (nr_iovecs > BIO_INLINE_VECS) { unsigned long idx = 0; + struct bio_vec *bvl = NULL; bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool); if (!bvl && gfp_mask != saved_gfp) { punt_bios_to_rescuer(bs); gfp_mask = saved_gfp; - bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool); + bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, + &bs->bvec_pool); } if (unlikely(!bvl)) goto err_free; bio->bi_flags |= idx << BVEC_POOL_OFFSET; - bio->bi_max_vecs = bvec_nr_vecs(idx); + bio_init(bio, bvl, bvec_nr_vecs(idx)); } else if (nr_iovecs) { - bvl = bio->bi_inline_vecs; - bio->bi_max_vecs = inline_vecs; + bio_init(bio, bio->bi_inline_vecs, BIO_INLINE_VECS); + } else { + bio_init(bio, NULL, 0); } bio->bi_pool = bs; - bio->bi_io_vec = bvl; return bio; err_free: @@ -521,6 +499,31 @@ err_free: } EXPORT_SYMBOL(bio_alloc_bioset); +/** + * bio_kmalloc - kmalloc a bio for I/O + * @gfp_mask: the GFP_* mask given to the slab allocator + * @nr_iovecs: number of iovecs to pre-allocate + * + * Use kmalloc to allocate and initialize a bio. + * + * Returns: Pointer to new bio on success, NULL on failure. + */ +struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs) +{ + struct bio *bio; + + if (nr_iovecs > UIO_MAXIOV) + return NULL; + + bio = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask); + if (unlikely(!bio)) + return NULL; + bio_init(bio, nr_iovecs ? bio->bi_inline_vecs : NULL, nr_iovecs); + bio->bi_pool = NULL; + return bio; +} +EXPORT_SYMBOL(bio_kmalloc); + void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start) { unsigned long flags; diff --git a/include/linux/bio.h b/include/linux/bio.h index 676870b2c88d..c74857cf1252 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -408,6 +408,7 @@ extern int biovec_init_pool(mempool_t *pool, int pool_entries); extern int bioset_init_from_src(struct bio_set *bs, struct bio_set *src); extern struct bio *bio_alloc_bioset(gfp_t, unsigned int, struct bio_set *); +struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs); extern void bio_put(struct bio *); extern void __bio_clone_fast(struct bio *, struct bio *); @@ -420,11 +421,6 @@ static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) return bio_alloc_bioset(gfp_mask, nr_iovecs, &fs_bio_set); } -static inline struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs) -{ - return bio_alloc_bioset(gfp_mask, nr_iovecs, NULL); -} - extern blk_qc_t submit_bio(struct bio *); extern void bio_endio(struct bio *); -- cgit v1.2.3-70-g09d2 From c6bf3f0e25f4c0f0ecce6cf8d1c589bd9d74d3cf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Jan 2021 15:52:35 +0100 Subject: block: use an on-stack bio in blkdev_issue_flush There is no point in allocating memory for a synchronous flush. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Acked-by: Damien Le Moal Signed-off-by: Jens Axboe --- block/blk-flush.c | 17 ++++++----------- drivers/md/dm-zoned-metadata.c | 6 +++--- drivers/md/raid5-ppl.c | 2 +- drivers/nvme/target/io-cmd-bdev.c | 2 +- fs/block_dev.c | 2 +- fs/exfat/file.c | 2 +- fs/ext4/fast_commit.c | 4 ++-- fs/ext4/fsync.c | 2 +- fs/ext4/ialloc.c | 2 +- fs/ext4/super.c | 2 +- fs/fat/file.c | 2 +- fs/hfsplus/inode.c | 2 +- fs/hfsplus/super.c | 2 +- fs/jbd2/checkpoint.c | 2 +- fs/jbd2/commit.c | 4 ++-- fs/jbd2/recovery.c | 2 +- fs/libfs.c | 2 +- fs/nilfs2/the_nilfs.h | 2 +- fs/ocfs2/file.c | 2 +- fs/reiserfs/file.c | 2 +- fs/xfs/xfs_super.c | 2 +- fs/zonefs/super.c | 2 +- include/linux/blkdev.h | 4 ++-- 23 files changed, 33 insertions(+), 38 deletions(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index 76c1624cb06c..7942ca6ed321 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -432,23 +432,18 @@ void blk_insert_flush(struct request *rq) /** * blkdev_issue_flush - queue a flush * @bdev: blockdev to issue flush for - * @gfp_mask: memory allocation flags (for bio_alloc) * * Description: * Issue a flush for the block device in question. */ -int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask) +int blkdev_issue_flush(struct block_device *bdev) { - struct bio *bio; - int ret = 0; + struct bio bio; - bio = bio_alloc(gfp_mask, 0); - bio_set_dev(bio, bdev); - bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; - - ret = submit_bio_wait(bio); - bio_put(bio); - return ret; + bio_init(&bio, NULL, 0); + bio_set_dev(&bio, bdev); + bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; + return submit_bio_wait(&bio); } EXPORT_SYMBOL(blkdev_issue_flush); diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index b298fefb022e..039d17b28938 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -819,7 +819,7 @@ static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set) ret = dmz_rdwr_block(dev, REQ_OP_WRITE, zmd->sb[set].block, mblk->page); if (ret == 0) - ret = blkdev_issue_flush(dev->bdev, GFP_NOIO); + ret = blkdev_issue_flush(dev->bdev); return ret; } @@ -862,7 +862,7 @@ static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd, /* Flush drive cache (this will also sync data) */ if (ret == 0) - ret = blkdev_issue_flush(dev->bdev, GFP_NOIO); + ret = blkdev_issue_flush(dev->bdev); return ret; } @@ -933,7 +933,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd) /* If there are no dirty metadata blocks, just flush the device cache */ if (list_empty(&write_list)) { - ret = blkdev_issue_flush(dev->bdev, GFP_NOIO); + ret = blkdev_issue_flush(dev->bdev); goto err; } diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index d0f540296fe9..e8c118e05dfd 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -1037,7 +1037,7 @@ static int ppl_recover(struct ppl_log *log, struct ppl_header *pplhdr, } /* flush the disk cache after recovery if necessary */ - ret = blkdev_issue_flush(rdev->bdev, GFP_KERNEL); + ret = blkdev_issue_flush(rdev->bdev); out: __free_page(page); return ret; diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index 125dde3f410e..bf6e0ac9ad28 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -333,7 +333,7 @@ static void nvmet_bdev_execute_flush(struct nvmet_req *req) u16 nvmet_bdev_flush(struct nvmet_req *req) { - if (blkdev_issue_flush(req->ns->bdev, GFP_KERNEL)) + if (blkdev_issue_flush(req->ns->bdev)) return NVME_SC_INTERNAL | NVME_SC_DNR; return 0; } diff --git a/fs/block_dev.c b/fs/block_dev.c index c1fe29dac485..9d4b1a884d76 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -680,7 +680,7 @@ int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) * i_mutex and doing so causes performance issues with concurrent * O_SYNC writers to a block device. */ - error = blkdev_issue_flush(bdev, GFP_KERNEL); + error = blkdev_issue_flush(bdev); if (error == -EOPNOTSUPP) error = 0; diff --git a/fs/exfat/file.c b/fs/exfat/file.c index a92478eabfa4..183ffdf4d43c 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -361,7 +361,7 @@ int exfat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) if (err) return err; - return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); + return blkdev_issue_flush(inode->i_sb->s_bdev); } const struct file_operations exfat_file_operations = { diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 0a14a7c87bf8..6e8208acfc62 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -1076,7 +1076,7 @@ static int ext4_fc_perform_commit(journal_t *journal) * flush before we start writing fast commit blocks. */ if (journal->j_fs_dev != journal->j_dev) - blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS); + blkdev_issue_flush(journal->j_fs_dev); blk_start_plug(&plug); if (sbi->s_fc_bytes == 0) { @@ -1535,7 +1535,7 @@ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl) out: iput(inode); if (!ret) - blkdev_issue_flush(sb->s_bdev, GFP_KERNEL); + blkdev_issue_flush(sb->s_bdev); return 0; } diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 113bfb023a4a..027a7d7037a0 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -174,7 +174,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = ext4_fsync_journal(inode, datasync, &needs_barrier); if (needs_barrier) { - err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); + err = blkdev_issue_flush(inode->i_sb->s_bdev); if (!ret) ret = err; } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index b215c564bc31..20f2fcb799f5 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1583,7 +1583,7 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, if (ret < 0) goto err_out; if (barrier) - blkdev_issue_flush(sb->s_bdev, GFP_NOFS); + blkdev_issue_flush(sb->s_bdev); skip_zeroout: ext4_lock_group(sb, group); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9a6f9875aa34..fb5985102c1d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5709,7 +5709,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait) needs_barrier = true; if (needs_barrier) { int err; - err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL); + err = blkdev_issue_flush(sb->s_bdev); if (!ret) ret = err; } diff --git a/fs/fat/file.c b/fs/fat/file.c index f9ee27cf4d7c..5fee74f1ad61 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -195,7 +195,7 @@ int fat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) if (err) return err; - return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); + return blkdev_issue_flush(inode->i_sb->s_bdev); } diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index e3da9e96b835..ca464328b79c 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -340,7 +340,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, } if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) - blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); + blkdev_issue_flush(inode->i_sb->s_bdev); inode_unlock(inode); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 807119ae5adf..b9e3db3f855f 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -239,7 +239,7 @@ out: mutex_unlock(&sbi->vh_mutex); if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) - blkdev_issue_flush(sb->s_bdev, GFP_KERNEL); + blkdev_issue_flush(sb->s_bdev); return error; } diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 472932b9e6bc..63b526d44886 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -416,7 +416,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal) * jbd2_cleanup_journal_tail() doesn't get called all that often. */ if (journal->j_flags & JBD2_BARRIER) - blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS); + blkdev_issue_flush(journal->j_fs_dev); return __jbd2_update_log_tail(journal, first_tid, blocknr); } diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index b121d7d434c6..3cc4ab2ba7f4 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -825,7 +825,7 @@ start_journal_io: if (commit_transaction->t_need_data_flush && (journal->j_fs_dev != journal->j_dev) && (journal->j_flags & JBD2_BARRIER)) - blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS); + blkdev_issue_flush(journal->j_fs_dev); /* Done it all: now write the commit record asynchronously. */ if (jbd2_has_feature_async_commit(journal)) { @@ -932,7 +932,7 @@ start_journal_io: stats.run.rs_blocks_logged++; if (jbd2_has_feature_async_commit(journal) && journal->j_flags & JBD2_BARRIER) { - blkdev_issue_flush(journal->j_dev, GFP_NOFS); + blkdev_issue_flush(journal->j_dev); } if (err) diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index dc0694fcfcd1..69f18fe20923 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -326,7 +326,7 @@ int jbd2_journal_recover(journal_t *journal) err = err2; /* Make sure all replayed data is on permanent storage */ if (journal->j_flags & JBD2_BARRIER) { - err2 = blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL); + err2 = blkdev_issue_flush(journal->j_fs_dev); if (!err) err = err2; } diff --git a/fs/libfs.c b/fs/libfs.c index d1c3bade9f30..8398a0efb401 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1117,7 +1117,7 @@ int generic_file_fsync(struct file *file, loff_t start, loff_t end, err = __generic_file_fsync(file, start, end, datasync); if (err) return err; - return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); + return blkdev_issue_flush(inode->i_sb->s_bdev); } EXPORT_SYMBOL(generic_file_fsync); diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index b55cdeb4d169..987c8ab02aee 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -375,7 +375,7 @@ static inline int nilfs_flush_device(struct the_nilfs *nilfs) */ smp_wmb(); - err = blkdev_issue_flush(nilfs->ns_bdev, GFP_KERNEL); + err = blkdev_issue_flush(nilfs->ns_bdev); if (err != -EIO) err = 0; return err; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 85979e2214b3..df6d709d2ae3 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -194,7 +194,7 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end, needs_barrier = true; err = jbd2_complete_transaction(journal, commit_tid); if (needs_barrier) { - ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); + ret = blkdev_issue_flush(inode->i_sb->s_bdev); if (!err) err = ret; } diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 0b641ae694f1..1db0254bc38b 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -159,7 +159,7 @@ static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end, barrier_done = reiserfs_commit_for_inode(inode); reiserfs_write_unlock(inode->i_sb); if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb)) - blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); + blkdev_issue_flush(inode->i_sb->s_bdev); inode_unlock(inode); if (barrier_done < 0) return barrier_done; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 813be879a5e5..c3e32789829f 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -342,7 +342,7 @@ void xfs_blkdev_issue_flush( xfs_buftarg_t *buftarg) { - blkdev_issue_flush(buftarg->bt_bdev, GFP_NOFS); + blkdev_issue_flush(buftarg->bt_bdev); } STATIC void diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index faea2ed34b4a..ab68e27bb322 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -541,7 +541,7 @@ static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end, if (ZONEFS_I(inode)->i_ztype == ZONEFS_ZTYPE_CNV) ret = file_write_and_wait_range(file, start, end); if (!ret) - ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); + ret = blkdev_issue_flush(inode->i_sb->s_bdev); if (ret) zonefs_io_error(inode, true); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2491e17b61c4..0dea268bd61b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1288,7 +1288,7 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk) !list_empty(&plug->cb_list)); } -int blkdev_issue_flush(struct block_device *, gfp_t); +int blkdev_issue_flush(struct block_device *bdev); long nr_blockdev_pages(void); #else /* CONFIG_BLOCK */ struct blk_plug { @@ -1316,7 +1316,7 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk) return false; } -static inline int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask) +static inline int blkdev_issue_flush(struct block_device *bdev) { return 0; } -- cgit v1.2.3-70-g09d2 From a587daa0643a3f9a0c83cc8ae38717d54e792604 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Jan 2021 15:52:36 +0100 Subject: dm-clone: use blkdev_issue_flush in commit_metadata Use blkdev_issue_flush instead of open coding it. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Acked-by: Damien Le Moal Signed-off-by: Jens Axboe --- drivers/md/dm-clone-target.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c index bdb255edc200..a90bdf9b2ca6 100644 --- a/drivers/md/dm-clone-target.c +++ b/drivers/md/dm-clone-target.c @@ -85,12 +85,6 @@ struct clone { struct dm_clone_metadata *cmd; - /* - * bio used to flush the destination device, before committing the - * metadata. - */ - struct bio flush_bio; - /* Region hydration hash table */ struct hash_table_bucket *ht; @@ -1155,11 +1149,7 @@ static int commit_metadata(struct clone *clone, bool *dest_dev_flushed) goto out; } - bio_reset(&clone->flush_bio); - bio_set_dev(&clone->flush_bio, clone->dest_dev->bdev); - clone->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; - - r = submit_bio_wait(&clone->flush_bio); + r = blkdev_issue_flush(clone->dest_dev->bdev); if (unlikely(r)) { __metadata_operation_failed(clone, "flush destination device", r); goto out; @@ -1886,7 +1876,6 @@ static int clone_ctr(struct dm_target *ti, unsigned int argc, char **argv) bio_list_init(&clone->deferred_flush_completions); clone->hydration_offset = 0; atomic_set(&clone->hydrations_in_flight, 0); - bio_init(&clone->flush_bio, NULL, 0); clone->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0); if (!clone->wq) { @@ -1958,7 +1947,6 @@ static void clone_dtr(struct dm_target *ti) struct clone *clone = ti->private; mutex_destroy(&clone->commit_lock); - bio_uninit(&clone->flush_bio); for (i = 0; i < clone->nr_ctr_args; i++) kfree(clone->ctr_args[i]); -- cgit v1.2.3-70-g09d2 From 25ac84262cb5d5031f2769988ae5977a633b3f45 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Jan 2021 15:52:37 +0100 Subject: f2fs: use blkdev_issue_flush in __submit_flush_wait Use the blkdev_issue_flush helper instead of duplicating it. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Acked-by: Damien Le Moal Signed-off-by: Jens Axboe --- fs/f2fs/data.c | 3 ++- fs/f2fs/f2fs.h | 1 - fs/f2fs/segment.c | 12 +----------- 3 files changed, 3 insertions(+), 13 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 8cbf03159752..0cf0c6059924 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -56,7 +56,8 @@ static inline struct bio *__f2fs_bio_alloc(gfp_t gfp_mask, return bio_alloc_bioset(gfp_mask, nr_iovecs, &f2fs_bioset); } -struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool noio) +static struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, + bool noio) { if (noio) { /* No failure on bio allocation */ diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index bb11759191dc..902bd3267c03 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3424,7 +3424,6 @@ void f2fs_destroy_checkpoint_caches(void); */ int __init f2fs_init_bioset(void); void f2fs_destroy_bioset(void); -struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool noio); int f2fs_init_bio_entry_cache(void); void f2fs_destroy_bio_entry_cache(void); void f2fs_submit_bio(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index deca74cb17df..c495f170ee40 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -566,17 +566,7 @@ do_sync: static int __submit_flush_wait(struct f2fs_sb_info *sbi, struct block_device *bdev) { - struct bio *bio; - int ret; - - bio = f2fs_bio_alloc(sbi, 0, false); - if (!bio) - return -ENOMEM; - - bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH; - bio_set_dev(bio, bdev); - ret = submit_bio_wait(bio); - bio_put(bio); + int ret = blkdev_issue_flush(bdev); trace_f2fs_issue_flush(bdev, test_opt(sbi, NOBARRIER), test_opt(sbi, FLUSH_MERGE), ret); -- cgit v1.2.3-70-g09d2 From 67883ade7a98a7589ca50e97b1c7b7893886d30e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Jan 2021 15:52:38 +0100 Subject: f2fs: remove FAULT_ALLOC_BIO Sleeping bio allocations do not fail, which means that injecting an error into sleeping bio allocations is a little silly. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Acked-by: Damien Le Moal Signed-off-by: Jens Axboe --- Documentation/filesystems/f2fs.rst | 1 - fs/f2fs/data.c | 29 ++++------------------------- fs/f2fs/f2fs.h | 1 - fs/f2fs/super.c | 1 - 4 files changed, 4 insertions(+), 28 deletions(-) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index dae15c96e659..624f5f3ed93e 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -179,7 +179,6 @@ fault_type=%d Support configuring fault injection type, should be FAULT_KVMALLOC 0x000000002 FAULT_PAGE_ALLOC 0x000000004 FAULT_PAGE_GET 0x000000008 - FAULT_ALLOC_BIO 0x000000010 FAULT_ALLOC_NID 0x000000020 FAULT_ORPHAN 0x000000040 FAULT_BLOCK 0x000000080 diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 0cf0c6059924..9fb6be65592b 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -50,28 +50,6 @@ void f2fs_destroy_bioset(void) bioset_exit(&f2fs_bioset); } -static inline struct bio *__f2fs_bio_alloc(gfp_t gfp_mask, - unsigned int nr_iovecs) -{ - return bio_alloc_bioset(gfp_mask, nr_iovecs, &f2fs_bioset); -} - -static struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, - bool noio) -{ - if (noio) { - /* No failure on bio allocation */ - return __f2fs_bio_alloc(GFP_NOIO, npages); - } - - if (time_to_inject(sbi, FAULT_ALLOC_BIO)) { - f2fs_show_injection_info(sbi, FAULT_ALLOC_BIO); - return NULL; - } - - return __f2fs_bio_alloc(GFP_KERNEL, npages); -} - static bool __is_cp_guaranteed(struct page *page) { struct address_space *mapping = page->mapping; @@ -433,7 +411,7 @@ static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages) struct f2fs_sb_info *sbi = fio->sbi; struct bio *bio; - bio = f2fs_bio_alloc(sbi, npages, true); + bio = bio_alloc_bioset(GFP_NOIO, npages, &f2fs_bioset); f2fs_target_device(sbi, fio->new_blkaddr, bio); if (is_read_io(fio->op)) { @@ -1029,8 +1007,9 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, struct bio_post_read_ctx *ctx; unsigned int post_read_steps = 0; - bio = f2fs_bio_alloc(sbi, min_t(int, nr_pages, BIO_MAX_PAGES), - for_write); + bio = bio_alloc_bioset(for_write ? GFP_NOIO : GFP_KERNEL, + min_t(int, nr_pages, BIO_MAX_PAGES), + &f2fs_bioset); if (!bio) return ERR_PTR(-ENOMEM); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 902bd3267c03..6c78365d80ce 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -43,7 +43,6 @@ enum { FAULT_KVMALLOC, FAULT_PAGE_ALLOC, FAULT_PAGE_GET, - FAULT_ALLOC_BIO, FAULT_ALLOC_NID, FAULT_ORPHAN, FAULT_BLOCK, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b4a07fe62d1a..3a312642907e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -45,7 +45,6 @@ const char *f2fs_fault_name[FAULT_MAX] = { [FAULT_KVMALLOC] = "kvmalloc", [FAULT_PAGE_ALLOC] = "page alloc", [FAULT_PAGE_GET] = "page get", - [FAULT_ALLOC_BIO] = "alloc bio", [FAULT_ALLOC_NID] = "alloc nid", [FAULT_ORPHAN] = "orphan", [FAULT_BLOCK] = "no more block", -- cgit v1.2.3-70-g09d2 From 19304f959ffd413359160969ad65b9829658840b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Jan 2021 15:52:39 +0100 Subject: drbd: remove bio_alloc_drbd Given that drbd_md_io_bio_set is initialized during module initialization and the module fails to load if the initialization fails there is no need to fall back to plain bio_alloc. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Acked-by: Damien Le Moal Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_actlog.c | 2 +- drivers/block/drbd/drbd_bitmap.c | 2 +- drivers/block/drbd/drbd_int.h | 2 -- drivers/block/drbd/drbd_main.c | 13 ------------- 4 files changed, 2 insertions(+), 17 deletions(-) diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 7227fc7ab8ed..72cf7603d51f 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -138,7 +138,7 @@ static int _drbd_md_sync_page_io(struct drbd_device *device, op_flags |= REQ_FUA | REQ_PREFLUSH; op_flags |= REQ_SYNC; - bio = bio_alloc_drbd(GFP_NOIO); + bio = bio_alloc_bioset(GFP_NOIO, 1, &drbd_md_io_bio_set); bio_set_dev(bio, bdev->md_bdev); bio->bi_iter.bi_sector = sector; err = -EIO; diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index df53dca5d02c..c1f816f896a8 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -976,7 +976,7 @@ static void drbd_bm_endio(struct bio *bio) static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_hold(local) { - struct bio *bio = bio_alloc_drbd(GFP_NOIO); + struct bio *bio = bio_alloc_bioset(GFP_NOIO, 1, &drbd_md_io_bio_set); struct drbd_device *device = ctx->device; struct drbd_bitmap *b = device->bitmap; struct page *page; diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index b2c93a29c251..02db50d7e4c6 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1422,8 +1422,6 @@ extern mempool_t drbd_md_io_page_pool; /* We also need to make sure we get a bio * when we need it for housekeeping purposes */ extern struct bio_set drbd_md_io_bio_set; -/* to allocate from that set */ -extern struct bio *bio_alloc_drbd(gfp_t gfp_mask); /* And a bio_set for cloning */ extern struct bio_set drbd_io_bio_set; diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 1c8c18b2a25f..788dd97e6026 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -138,19 +138,6 @@ static const struct block_device_operations drbd_ops = { .release = drbd_release, }; -struct bio *bio_alloc_drbd(gfp_t gfp_mask) -{ - struct bio *bio; - - if (!bioset_initialized(&drbd_md_io_bio_set)) - return bio_alloc(gfp_mask, 1); - - bio = bio_alloc_bioset(gfp_mask, 1, &drbd_md_io_bio_set); - if (!bio) - return NULL; - return bio; -} - #ifdef __CHECKER__ /* When checking with sparse, and this is an inline function, sparse will give tons of false positives. When this is a real functions sparse works. -- cgit v1.2.3-70-g09d2 From ae7153f1a7b05acd574d612ed9bdc0fe0a7e0451 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Jan 2021 15:52:40 +0100 Subject: drbd: remove drbd_req_make_private_bio Open code drbd_req_make_private_bio in the two callers to prepare for further changes. Also don't bother to initialize bi_next as the bio code already does that that. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Acked-by: Damien Le Moal Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_req.c | 5 ++++- drivers/block/drbd/drbd_req.h | 12 ------------ drivers/block/drbd/drbd_worker.c | 5 ++++- 3 files changed, 8 insertions(+), 14 deletions(-) diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index ea0f31ab3343..9dbb660a7d7c 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -30,7 +30,10 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device, struct bio return NULL; memset(req, 0, sizeof(*req)); - drbd_req_make_private_bio(req, bio_src); + req->private_bio = bio_clone_fast(bio_src, GFP_NOIO, &drbd_io_bio_set); + req->private_bio->bi_private = req; + req->private_bio->bi_end_io = drbd_request_endio; + req->rq_state = (bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0) | (bio_op(bio_src) == REQ_OP_WRITE_SAME ? RQ_WSAME : 0) | (bio_op(bio_src) == REQ_OP_WRITE_ZEROES ? RQ_ZEROES : 0) diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index 55bb0f8721fa..511f39a08de4 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -256,18 +256,6 @@ enum drbd_req_state_bits { #define MR_WRITE 1 #define MR_READ 2 -static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bio *bio_src) -{ - struct bio *bio; - bio = bio_clone_fast(bio_src, GFP_NOIO, &drbd_io_bio_set); - - req->private_bio = bio; - - bio->bi_private = req; - bio->bi_end_io = drbd_request_endio; - bio->bi_next = NULL; -} - /* Short lived temporary struct on the stack. * We could squirrel the error to be returned into * bio->bi_iter.bi_size, or similar. But that would be too ugly. */ diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 02044ab7f767..64563bfdf0da 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -1523,8 +1523,11 @@ int w_restart_disk_io(struct drbd_work *w, int cancel) if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG) drbd_al_begin_io(device, &req->i); - drbd_req_make_private_bio(req, req->master_bio); + req->private_bio = bio_clone_fast(req->master_bio, GFP_NOIO, + &drbd_io_bio_set); bio_set_dev(req->private_bio, device->ldev->backing_bdev); + req->private_bio->bi_private = req; + req->private_bio->bi_end_io = drbd_request_endio; submit_bio_noacct(req->private_bio); return 0; -- cgit v1.2.3-70-g09d2 From a78f18da669242ad57237070f298212e342bf602 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Jan 2021 15:52:41 +0100 Subject: md: remove bio_alloc_mddev bio_alloc_mddev is never called with a NULL mddev, and ->bio_set is initialized in md_run, so it always must be initialized as well. Just open code the remaining call to bio_alloc_bioset. Signed-off-by: Christoph Hellwig Acked-by: Song Liu Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Acked-by: Damien Le Moal Signed-off-by: Jens Axboe --- drivers/md/md.c | 12 +----------- drivers/md/md.h | 2 -- drivers/md/raid1.c | 2 +- drivers/md/raid10.c | 2 +- 4 files changed, 3 insertions(+), 15 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 7d1bb24add31..e2b9dbb6e888 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -340,16 +340,6 @@ static int start_readonly; */ static bool create_on_open = true; -struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, - struct mddev *mddev) -{ - if (!mddev || !bioset_initialized(&mddev->bio_set)) - return bio_alloc(gfp_mask, nr_iovecs); - - return bio_alloc_bioset(gfp_mask, nr_iovecs, &mddev->bio_set); -} -EXPORT_SYMBOL_GPL(bio_alloc_mddev); - static struct bio *md_bio_alloc_sync(struct mddev *mddev) { if (!mddev || !bioset_initialized(&mddev->sync_set)) @@ -613,7 +603,7 @@ static void submit_flushes(struct work_struct *ws) atomic_inc(&rdev->nr_pending); atomic_inc(&rdev->nr_pending); rcu_read_unlock(); - bi = bio_alloc_mddev(GFP_NOIO, 0, mddev); + bi = bio_alloc_bioset(GFP_NOIO, 0, &mddev->bio_set); bi->bi_end_io = md_end_flush; bi->bi_private = rdev; bio_set_dev(bi, rdev->bdev); diff --git a/drivers/md/md.h b/drivers/md/md.h index f13290ccc1c2..bcbba1b5ec4a 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -742,8 +742,6 @@ extern void md_rdev_clear(struct md_rdev *rdev); extern void md_handle_request(struct mddev *mddev, struct bio *bio); extern void mddev_suspend(struct mddev *mddev); extern void mddev_resume(struct mddev *mddev); -extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, - struct mddev *mddev); extern void md_reload_sb(struct mddev *mddev, int raid_disk); extern void md_update_sb(struct mddev *mddev, int force); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 3b19141cdb4b..d2378765dc15 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1104,7 +1104,7 @@ static void alloc_behind_master_bio(struct r1bio *r1_bio, int i = 0; struct bio *behind_bio = NULL; - behind_bio = bio_alloc_mddev(GFP_NOIO, vcnt, r1_bio->mddev); + behind_bio = bio_alloc_bioset(GFP_NOIO, vcnt, &r1_bio->mddev->bio_set); if (!behind_bio) return; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index be8f14afb6d1..e1eefbec15d4 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4531,7 +4531,7 @@ read_more: return sectors_done; } - read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev); + read_bio = bio_alloc_bioset(GFP_KERNEL, RESYNC_PAGES, &mddev->bio_set); bio_set_dev(read_bio, rdev->bdev); read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr -- cgit v1.2.3-70-g09d2 From 32637385b834062d785a261841980ae7cffea007 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Jan 2021 15:52:42 +0100 Subject: md: simplify sync_page_io Use an on-stack bio and biovec for the single page synchronous I/O. Signed-off-by: Christoph Hellwig Acked-by: Song Liu Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Acked-by: Damien Le Moal Signed-off-by: Jens Axboe --- drivers/md/md.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index e2b9dbb6e888..6a27f52007c8 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1021,29 +1021,29 @@ int md_super_wait(struct mddev *mddev) int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, struct page *page, int op, int op_flags, bool metadata_op) { - struct bio *bio = md_bio_alloc_sync(rdev->mddev); - int ret; + struct bio bio; + struct bio_vec bvec; + + bio_init(&bio, &bvec, 1); if (metadata_op && rdev->meta_bdev) - bio_set_dev(bio, rdev->meta_bdev); + bio_set_dev(&bio, rdev->meta_bdev); else - bio_set_dev(bio, rdev->bdev); - bio_set_op_attrs(bio, op, op_flags); + bio_set_dev(&bio, rdev->bdev); + bio.bi_opf = op | op_flags; if (metadata_op) - bio->bi_iter.bi_sector = sector + rdev->sb_start; + bio.bi_iter.bi_sector = sector + rdev->sb_start; else if (rdev->mddev->reshape_position != MaxSector && (rdev->mddev->reshape_backwards == (sector >= rdev->mddev->reshape_position))) - bio->bi_iter.bi_sector = sector + rdev->new_data_offset; + bio.bi_iter.bi_sector = sector + rdev->new_data_offset; else - bio->bi_iter.bi_sector = sector + rdev->data_offset; - bio_add_page(bio, page, size, 0); + bio.bi_iter.bi_sector = sector + rdev->data_offset; + bio_add_page(&bio, page, size, 0); - submit_bio_wait(bio); + submit_bio_wait(&bio); - ret = !bio->bi_status; - bio_put(bio); - return ret; + return !bio.bi_status; } EXPORT_SYMBOL_GPL(sync_page_io); -- cgit v1.2.3-70-g09d2 From 6a5965696856f5dc6834f351f093cc99bf2f03c8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Jan 2021 15:52:43 +0100 Subject: md: remove md_bio_alloc_sync md_bio_alloc_sync is never called with a NULL mddev, and ->sync_set is initialized in md_run, so it always must be initialized as well. Just open code the remaining call to bio_alloc_bioset. Signed-off-by: Christoph Hellwig Acked-by: Song Liu Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Acked-by: Damien Le Moal Signed-off-by: Jens Axboe --- drivers/md/md.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 6a27f52007c8..399c81bddc1a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -340,14 +340,6 @@ static int start_readonly; */ static bool create_on_open = true; -static struct bio *md_bio_alloc_sync(struct mddev *mddev) -{ - if (!mddev || !bioset_initialized(&mddev->sync_set)) - return bio_alloc(GFP_NOIO, 1); - - return bio_alloc_bioset(GFP_NOIO, 1, &mddev->sync_set); -} - /* * We have a system wide 'event count' that is incremented * on any 'interesting' event, and readers of /proc/mdstat @@ -989,7 +981,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev, if (test_bit(Faulty, &rdev->flags)) return; - bio = md_bio_alloc_sync(mddev); + bio = bio_alloc_bioset(GFP_NOIO, 1, &mddev->sync_set); atomic_inc(&rdev->nr_pending); -- cgit v1.2.3-70-g09d2 From e82ed3a4fbb54b2d7dcb2a7733520f3e10b97abf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Jan 2021 15:52:44 +0100 Subject: md/raid6: refactor raid5_read_one_chunk Refactor raid5_read_one_chunk so that all simple checks are done before allocating the bio. Signed-off-by: Christoph Hellwig Acked-by: Song Liu Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Acked-by: Damien Le Moal Signed-off-by: Jens Axboe --- drivers/md/raid5.c | 108 ++++++++++++++++++++++------------------------------- 1 file changed, 45 insertions(+), 63 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f411b9e5c332..a348b2adf2a9 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5393,90 +5393,72 @@ static void raid5_align_endio(struct bio *bi) static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) { struct r5conf *conf = mddev->private; - int dd_idx; - struct bio* align_bi; + struct bio *align_bio; struct md_rdev *rdev; - sector_t end_sector; + sector_t sector, end_sector, first_bad; + int bad_sectors, dd_idx; if (!in_chunk_boundary(mddev, raid_bio)) { pr_debug("%s: non aligned\n", __func__); return 0; } - /* - * use bio_clone_fast to make a copy of the bio - */ - align_bi = bio_clone_fast(raid_bio, GFP_NOIO, &mddev->bio_set); - if (!align_bi) - return 0; - /* - * set bi_end_io to a new function, and set bi_private to the - * original bio. - */ - align_bi->bi_end_io = raid5_align_endio; - align_bi->bi_private = raid_bio; - /* - * compute position - */ - align_bi->bi_iter.bi_sector = - raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector, - 0, &dd_idx, NULL); - end_sector = bio_end_sector(align_bi); + sector = raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector, 0, + &dd_idx, NULL); + end_sector = bio_end_sector(raid_bio); + rcu_read_lock(); + if (r5c_big_stripe_cached(conf, sector)) + goto out_rcu_unlock; + rdev = rcu_dereference(conf->disks[dd_idx].replacement); if (!rdev || test_bit(Faulty, &rdev->flags) || rdev->recovery_offset < end_sector) { rdev = rcu_dereference(conf->disks[dd_idx].rdev); - if (rdev && - (test_bit(Faulty, &rdev->flags) || + if (!rdev) + goto out_rcu_unlock; + if (test_bit(Faulty, &rdev->flags) || !(test_bit(In_sync, &rdev->flags) || - rdev->recovery_offset >= end_sector))) - rdev = NULL; + rdev->recovery_offset >= end_sector)) + goto out_rcu_unlock; } - if (r5c_big_stripe_cached(conf, align_bi->bi_iter.bi_sector)) { - rcu_read_unlock(); - bio_put(align_bi); + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + + align_bio = bio_clone_fast(raid_bio, GFP_NOIO, &mddev->bio_set); + bio_set_dev(align_bio, rdev->bdev); + align_bio->bi_end_io = raid5_align_endio; + align_bio->bi_private = raid_bio; + align_bio->bi_iter.bi_sector = sector; + + raid_bio->bi_next = (void *)rdev; + + if (is_badblock(rdev, sector, bio_sectors(align_bio), &first_bad, + &bad_sectors)) { + bio_put(align_bio); + rdev_dec_pending(rdev, mddev); return 0; } - if (rdev) { - sector_t first_bad; - int bad_sectors; - - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - raid_bio->bi_next = (void*)rdev; - bio_set_dev(align_bi, rdev->bdev); - - if (is_badblock(rdev, align_bi->bi_iter.bi_sector, - bio_sectors(align_bi), - &first_bad, &bad_sectors)) { - bio_put(align_bi); - rdev_dec_pending(rdev, mddev); - return 0; - } + /* No reshape active, so we can trust rdev->data_offset */ + align_bio->bi_iter.bi_sector += rdev->data_offset; - /* No reshape active, so we can trust rdev->data_offset */ - align_bi->bi_iter.bi_sector += rdev->data_offset; + spin_lock_irq(&conf->device_lock); + wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == 0, + conf->device_lock); + atomic_inc(&conf->active_aligned_reads); + spin_unlock_irq(&conf->device_lock); - spin_lock_irq(&conf->device_lock); - wait_event_lock_irq(conf->wait_for_quiescent, - conf->quiesce == 0, - conf->device_lock); - atomic_inc(&conf->active_aligned_reads); - spin_unlock_irq(&conf->device_lock); + if (mddev->gendisk) + trace_block_bio_remap(align_bio, disk_devt(mddev->gendisk), + raid_bio->bi_iter.bi_sector); + submit_bio_noacct(align_bio); + return 1; - if (mddev->gendisk) - trace_block_bio_remap(align_bi, disk_devt(mddev->gendisk), - raid_bio->bi_iter.bi_sector); - submit_bio_noacct(align_bi); - return 1; - } else { - rcu_read_unlock(); - bio_put(align_bi); - return 0; - } +out_rcu_unlock: + rcu_read_unlock(); + return 0; } static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) -- cgit v1.2.3-70-g09d2 From 6808f7af964be4ed6b04d2aa4ba884a2e47c6214 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Jan 2021 15:52:45 +0100 Subject: nfs/blocklayout: remove cruft in bl_alloc_init_bio bio_alloc never returns NULL when it can sleep. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Acked-by: Damien Le Moal Signed-off-by: Jens Axboe --- fs/nfs/blocklayout/blocklayout.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 3be6836074ae..1a96ce28efb0 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -123,11 +123,6 @@ bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector, npg = min(npg, BIO_MAX_PAGES); bio = bio_alloc(GFP_NOIO, npg); - if (!bio && (current->flags & PF_MEMALLOC)) { - while (!bio && (npg /= 2)) - bio = bio_alloc(GFP_NOIO, npg); - } - if (bio) { bio->bi_iter.bi_sector = disk_sector; bio_set_dev(bio, bdev); -- cgit v1.2.3-70-g09d2 From 64820ac6c6962f76d164fa690deaa688d59278e2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Jan 2021 15:52:46 +0100 Subject: nilfs2: remove cruft in nilfs_alloc_seg_bio bio_alloc never returns NULL when it can sleep. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Acked-by: Damien Le Moal Signed-off-by: Jens Axboe --- fs/nilfs2/segbuf.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 1a8729eded8b..1e75417bfe6e 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -386,10 +386,6 @@ static struct bio *nilfs_alloc_seg_bio(struct the_nilfs *nilfs, sector_t start, struct bio *bio; bio = bio_alloc(GFP_NOIO, nr_vecs); - if (bio == NULL) { - while (!bio && (nr_vecs >>= 1)) - bio = bio_alloc(GFP_NOIO, nr_vecs); - } if (likely(bio)) { bio_set_dev(bio, nilfs->ns_bdev); bio->bi_iter.bi_sector = -- cgit v1.2.3-70-g09d2 From 48d15436fde6feebcded7bd0fdc8ea4a9181b8fa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Jan 2021 15:52:47 +0100 Subject: mm: remove get_swap_bio Just reuse the block_device and sector from the swap_info structure, just as used by the SWP_SYNCHRONOUS path. Also remove the checks for NULL returns from bio_alloc as that can't happen for sleeping allocations. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Acked-by: Damien Le Moal Signed-off-by: Jens Axboe --- include/linux/swap.h | 1 - mm/page_io.c | 45 +++++++++++++-------------------------------- mm/swapfile.c | 10 ---------- 3 files changed, 13 insertions(+), 43 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 596bc2f4d9b0..3f1f7ae0fbe9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -468,7 +468,6 @@ extern int free_swap_and_cache(swp_entry_t); int swap_type_of(dev_t device, sector_t offset); int find_first_swap(dev_t *device); extern unsigned int count_swap_pages(int, int); -extern sector_t map_swap_page(struct page *, struct block_device **); extern sector_t swapdev_block(int, pgoff_t); extern int page_swapcount(struct page *); extern int __swap_count(swp_entry_t entry); diff --git a/mm/page_io.c b/mm/page_io.c index a75f35464a4e..92f7941c6d01 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -26,25 +26,6 @@ #include #include -static struct bio *get_swap_bio(gfp_t gfp_flags, - struct page *page, bio_end_io_t end_io) -{ - struct bio *bio; - - bio = bio_alloc(gfp_flags, 1); - if (bio) { - struct block_device *bdev; - - bio->bi_iter.bi_sector = map_swap_page(page, &bdev); - bio_set_dev(bio, bdev); - bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9; - bio->bi_end_io = end_io; - - bio_add_page(bio, page, thp_size(page), 0); - } - return bio; -} - void end_swap_bio_write(struct bio *bio) { struct page *page = bio_first_page_all(bio); @@ -361,13 +342,13 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, return 0; } - bio = get_swap_bio(GFP_NOIO, page, end_write_func); - if (bio == NULL) { - set_page_dirty(page); - unlock_page(page); - return -ENOMEM; - } + bio = bio_alloc(GFP_NOIO, 1); + bio_set_dev(bio, sis->bdev); + bio->bi_iter.bi_sector = swap_page_sector(page); bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc); + bio->bi_end_io = end_write_func; + bio_add_page(bio, page, thp_size(page), 0); + bio_associate_blkg_from_page(bio, page); count_swpout_vm_event(page); set_page_writeback(page); @@ -427,18 +408,18 @@ int swap_readpage(struct page *page, bool synchronous) } ret = 0; - bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); - if (bio == NULL) { - unlock_page(page); - ret = -ENOMEM; - goto out; - } + bio = bio_alloc(GFP_KERNEL, 1); + bio_set_dev(bio, sis->bdev); + bio->bi_opf = REQ_OP_READ; + bio->bi_iter.bi_sector = swap_page_sector(page); + bio->bi_end_io = end_swap_bio_read; + bio_add_page(bio, page, thp_size(page), 0); + disk = bio->bi_bdev->bd_disk; /* * Keep this task valid during swap readpage because the oom killer may * attempt to access it in the page fault retry time check. */ - bio_set_op_attrs(bio, REQ_OP_READ, 0); if (synchronous) { bio->bi_opf |= REQ_HIPRI; get_task_struct(current); diff --git a/mm/swapfile.c b/mm/swapfile.c index 9fffc5af29d1..bfa9e8b0c2ef 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2301,16 +2301,6 @@ static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev) return se->start_block + (offset - se->start_page); } -/* - * Returns the page offset into bdev for the specified page's swap entry. - */ -sector_t map_swap_page(struct page *page, struct block_device **bdev) -{ - swp_entry_t entry; - entry.val = page_private(page); - return map_swap_entry(entry, bdev); -} - /* * Free all of a swapdev's extent information */ -- cgit v1.2.3-70-g09d2 From 3e3126cf2a6d0afa4c013574df621d08f08d3912 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 27 Jan 2021 10:04:49 -0700 Subject: mm: only make map_swap_entry available for CONFIG_HIBERNATION MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Current tree spews this on compile: mm/swapfile.c:2290:17: warning: ‘map_swap_entry’ defined but not used [-Wunused-function] 2290 | static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev) | ^~~~~~~~~~~~~~ if !CONFIG_HIBERNATION, as we don't use the function unless we have that config option set. Fixes: 48d15436fde6 ("mm: remove get_swap_bio") Signed-off-by: Jens Axboe --- mm/swapfile.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index bfa9e8b0c2ef..351999a84e6e 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -47,7 +47,6 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t, unsigned char); static void free_swap_count_continuations(struct swap_info_struct *); -static sector_t map_swap_entry(swp_entry_t, struct block_device**); DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; @@ -1791,6 +1790,9 @@ int free_swap_and_cache(swp_entry_t entry) } #ifdef CONFIG_HIBERNATION + +static sector_t map_swap_entry(swp_entry_t, struct block_device**); + /* * Find the swap type that corresponds to given device (if any). * @@ -2281,6 +2283,7 @@ static void drain_mmlist(void) spin_unlock(&mmlist_lock); } +#ifdef CONFIG_HIBERNATION /* * Use this swapdev's extent info to locate the (PAGE_SIZE) block which * corresponds to page offset for the specified swap entry. @@ -2300,6 +2303,7 @@ static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev) se = offset_to_swap_extent(sis, offset); return se->start_block + (offset - se->start_page); } +#endif /* * Free all of a swapdev's extent information -- cgit v1.2.3-70-g09d2 From 59c157433fbc6a7f63f3d708ca2966d0f56bcb7c Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 26 Jan 2021 21:37:38 -0800 Subject: nvme-core: check bdev value for NULL The nvme-core sets the bdev to NULL when admin comamnd is issued from IOCTL in the following path e.g. nvme list :- block_ioctl() blkdev_ioctl() nvme_ioctl() nvme_user_cmd() nvme_submit_user_cmd() The commit 309dca309fc3 ("block: store a block_device pointer in struct bio") now uses bdev unconditionally in the macro bio_set_dev() and assumes that bdev value is not NULL which results in the following crash in since thats where bdev is actually accessed :- void bio_associate_blkg_from_css(struct bio *bio, struct cgroup_subsys_state *css) { if (bio->bi_blkg) blkg_put(bio->bi_blkg); if (css && css->parent) { bio->bi_blkg = blkg_tryget_closest(bio, css); } else { --------------> blkg_get(bio->bi_bdev->bd_disk->queue->root_blkg); bio->bi_blkg = bio->bi_bdev->bd_disk->queue->root_blkg; } } EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); [ 345.385947] BUG: kernel NULL pointer dereference, address: 0000000000000690 [ 345.387103] #PF: supervisor read access in kernel mode [ 345.387894] #PF: error_code(0x0000) - not-present page [ 345.388756] PGD 162a2b067 P4D 162a2b067 PUD 1633eb067 PMD 0 [ 345.389625] Oops: 0000 [#1] SMP NOPTI [ 345.390206] CPU: 15 PID: 4100 Comm: nvme Tainted: G OE 5.11.0-rc5blk+ #141 [ 345.391377] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba52764 [ 345.393074] RIP: 0010:bio_associate_blkg_from_css.cold.47+0x58/0x21f [ 345.396362] RSP: 0018:ffffc90000dbbce8 EFLAGS: 00010246 [ 345.397078] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000027 [ 345.398114] RDX: 0000000000000000 RSI: ffff888813be91f0 RDI: ffff888813be91f8 [ 345.399039] RBP: ffffc90000dbbd30 R08: 0000000000000001 R09: 0000000000000001 [ 345.399950] R10: 0000000064c66670 R11: 00000000ef955201 R12: ffff888812d32800 [ 345.401031] R13: 0000000000000000 R14: ffff888113e51540 R15: ffff888113e51540 [ 345.401976] FS: 00007f3747f1d780(0000) GS:ffff888813a00000(0000) knlGS:0000000000000000 [ 345.402997] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 345.403737] CR2: 0000000000000690 CR3: 000000081a4bc000 CR4: 00000000003506e0 [ 345.404685] Call Trace: [ 345.405031] bio_associate_blkg+0x71/0x1c0 [ 345.405649] nvme_submit_user_cmd+0x1aa/0x38e [nvme_core] [ 345.406348] nvme_user_cmd.isra.73.cold.98+0x54/0x92 [nvme_core] [ 345.407117] nvme_ioctl+0x226/0x260 [nvme_core] [ 345.407707] blkdev_ioctl+0x1c8/0x2b0 [ 345.408183] block_ioctl+0x3f/0x50 [ 345.408627] __x64_sys_ioctl+0x84/0xc0 [ 345.409117] do_syscall_64+0x33/0x40 [ 345.409592] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 345.410233] RIP: 0033:0x7f3747632107 [ 345.413125] RSP: 002b:00007ffe461b6648 EFLAGS: 00000206 ORIG_RAX: 0000000000000010 [ 345.414086] RAX: ffffffffffffffda RBX: 00000000007b7fd0 RCX: 00007f3747632107 [ 345.414998] RDX: 00007ffe461b6650 RSI: 00000000c0484e41 RDI: 0000000000000004 [ 345.415966] RBP: 0000000000000004 R08: 00000000007b7fe8 R09: 00000000007b9080 [ 345.416883] R10: 00007ffe461b62c0 R11: 0000000000000206 R12: 00000000007b7fd0 [ 345.417808] R13: 0000000000000000 R14: 0000000000000003 R15: 0000000000000000 Add a NULL check before we set the bdev for bio. This issue is found on block/for-next tree. Fixes: 309dca309fc3 ("block: store a block_device pointer in struct bio") Signed-off-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index ba5df80881ea..1a3cdc6b1036 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1133,7 +1133,8 @@ static int nvme_submit_user_cmd(struct request_queue *q, if (ret) goto out; bio = req->bio; - bio_set_dev(bio, bdev); + if (bdev) + bio_set_dev(bio, bdev); if (bdev && meta_buffer && meta_len) { meta = nvme_add_user_metadata(bio, meta_buffer, meta_len, meta_seed, write); -- cgit v1.2.3-70-g09d2 From 6b4eeba331cd857701bcc28f4b688510b5d7a3e7 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 28 Jan 2021 15:18:50 +0800 Subject: blk-cgroup: Remove obsolete macro Remove the obsolete 'MAX_KEY_LEN' macro. Signed-off-by: Baolin Wang Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 02ce2058c14b..f26a8415172a 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -32,8 +32,6 @@ #include #include "blk.h" -#define MAX_KEY_LEN 100 - /* * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation. * blkcg_pol_register_mutex nests outside of it and synchronizes entire -- cgit v1.2.3-70-g09d2 From 7f31bee3601986b66446acc83d9db57f21d764fd Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Fri, 29 Jan 2021 05:55:05 +0100 Subject: block: remove typo in kernel-doc of set_disk_ro() Commit 52f019d43c22 ("block: add a hard-readonly flag to struct gendisk") provides some kernel-doc for set_disk_ro(), but introduces a small typo. Hence, make htmldocs warns on ./block/genhd.c:1441: warning: Function parameter or member 'read_only' not described in 'set_disk_ro' warning: Excess function parameter 'ready_only' description in 'set_disk_ro' Remove that typo in the kernel-doc for set_disk_ro(). Signed-off-by: Lukas Bulwahn Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/genhd.c b/block/genhd.c index d3ef29fbc536..304f8dcc9a9b 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1431,7 +1431,7 @@ static void set_disk_ro_uevent(struct gendisk *gd, int ro) /** * set_disk_ro - set a gendisk read-only * @disk: gendisk to operate on - * @ready_only: %true to set the disk read-only, %false set the disk read/write + * @read_only: %true to set the disk read-only, %false set the disk read/write * * This function is used to indicate whether a given disk device should have its * read-only flag set. set_disk_ro() is typically used by device drivers to -- cgit v1.2.3-70-g09d2 From f7bf5e24e0b40fdb2321d9cf2b41043425fb4f9d Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Fri, 29 Jan 2021 06:01:51 +0100 Subject: block: drop removed argument from kernel-doc of blk_execute_rq() Commit 684da7628d93 ("block: remove unnecessary argument from blk_execute_rq") changes the signature of blk_execute_rq(), but misses to adjust its kernel-doc. Hence, make htmldocs warns on ./block/blk-exec.c:78: warning: Excess function parameter 'q' description in 'blk_execute_rq' Drop removed argument from kernel-doc of blk_execute_rq() as well. Signed-off-by: Lukas Bulwahn Acked-by: Guoqing Jiang Signed-off-by: Jens Axboe --- block/blk-exec.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/blk-exec.c b/block/blk-exec.c index 0ab873f10133..beae70a0e5e5 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -65,7 +65,6 @@ EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); /** * blk_execute_rq - insert a request into queue for execution - * @q: queue to insert the request in * @bd_disk: matching gendisk * @rq: request to insert * @at_head: insert request at head or tail of queue -- cgit v1.2.3-70-g09d2 From d7a4783883d350e33308bf7c9ef0fe4e38f9c8e2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Feb 2021 14:17:20 +0100 Subject: md: check for NULL ->meta_bdev before calling bdev_read_only ->meta_bdev is optional and not set for most arrays. Add a rdev_read_only helper that calls bdev_read_only for both devices in a safe way. Fixes: 6f0d9689b670 ("block: remove the NULL bdev check in bdev_read_only") Reported-by: Guoqing Jiang Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/md/md.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 399c81bddc1a..7c0f61078653 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2399,6 +2399,12 @@ int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) } EXPORT_SYMBOL(md_integrity_add_rdev); +static bool rdev_read_only(struct md_rdev *rdev) +{ + return bdev_read_only(rdev->bdev) || + (rdev->meta_bdev && bdev_read_only(rdev->meta_bdev)); +} + static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) { char b[BDEVNAME_SIZE]; @@ -2408,8 +2414,7 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) if (find_rdev(mddev, rdev->bdev->bd_dev)) return -EEXIST; - if ((bdev_read_only(rdev->bdev) || bdev_read_only(rdev->meta_bdev)) && - mddev->pers) + if (rdev_read_only(rdev) && mddev->pers) return -EROFS; /* make sure rdev->sectors exceeds mddev->dev_sectors */ @@ -5843,9 +5848,7 @@ int md_run(struct mddev *mddev) continue; sync_blockdev(rdev->bdev); invalidate_bdev(rdev->bdev); - if (mddev->ro != 1 && - (bdev_read_only(rdev->bdev) || - bdev_read_only(rdev->meta_bdev))) { + if (mddev->ro != 1 && rdev_read_only(rdev)) { mddev->ro = 1; if (mddev->gendisk) set_disk_ro(mddev->gendisk, 1); -- cgit v1.2.3-70-g09d2 From a42e0d70c517c88c52154bf74ec39092d897aaca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Feb 2021 14:17:21 +0100 Subject: md: use rdev_read_only in restart_array Make the read-only check in restart_array identical to the other two read-only checks. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 7c0f61078653..21da0c48f6c2 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6143,7 +6143,7 @@ static int restart_array(struct mddev *mddev) if (test_bit(Journal, &rdev->flags) && !test_bit(Faulty, &rdev->flags)) has_journal = true; - if (bdev_read_only(rdev->bdev)) + if (rdev_read_only(rdev)) has_readonly = true; } rcu_read_unlock(); -- cgit v1.2.3-70-g09d2 From 8358c28a5d44bf0223a55a2334086c3707bb4185 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 2 Feb 2021 23:54:10 +0800 Subject: block: fix memory leak of bvec bio_init() clears bio instance, so the bvec index has to be set after bio_init(), otherwise bio->bi_io_vec may be leaked. Fixes: 3175199ab0ac ("block: split bio_kmalloc from bio_alloc_bioset") Cc: Johannes Thumshirn Cc: Chaitanya Kulkarni Cc: Damien Le Moal Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bio.c b/block/bio.c index d4375619348c..757fee46cefc 100644 --- a/block/bio.c +++ b/block/bio.c @@ -482,8 +482,8 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs, if (unlikely(!bvl)) goto err_free; - bio->bi_flags |= idx << BVEC_POOL_OFFSET; bio_init(bio, bvl, bvec_nr_vecs(idx)); + bio->bi_flags |= idx << BVEC_POOL_OFFSET; } else if (nr_iovecs) { bio_init(bio, bio->bi_inline_vecs, BIO_INLINE_VECS); } else { -- cgit v1.2.3-70-g09d2 From dc0b8a57ad7b05036fcb19a5bf0319467597e67a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Feb 2021 18:19:19 +0100 Subject: block: reuse BIO_INLINE_VECS for integrity bvecs bvec_alloc always uses biovec_slabs, and thus always needs to use the same number of inline vecs. Share a single definition for the data and integrity bvecs. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio-integrity.c | 6 ++---- block/bio.c | 6 ------ block/blk.h | 1 + 3 files changed, 3 insertions(+), 10 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index c3e5abcfdc98..19617fa326c3 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -14,8 +14,6 @@ #include #include "blk.h" -#define BIP_INLINE_VECS 4 - static struct kmem_cache *bip_slab; static struct workqueue_struct *kintegrityd_wq; @@ -63,7 +61,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, inline_vecs = nr_vecs; } else { bip = mempool_alloc(&bs->bio_integrity_pool, gfp_mask); - inline_vecs = BIP_INLINE_VECS; + inline_vecs = BIO_INLINE_VECS; } if (unlikely(!bip)) @@ -470,6 +468,6 @@ void __init bio_integrity_init(void) bip_slab = kmem_cache_create("bio_integrity_payload", sizeof(struct bio_integrity_payload) + - sizeof(struct bio_vec) * BIP_INLINE_VECS, + sizeof(struct bio_vec) * BIO_INLINE_VECS, 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); } diff --git a/block/bio.c b/block/bio.c index 757fee46cefc..cee2d310f02e 100644 --- a/block/bio.c +++ b/block/bio.c @@ -25,12 +25,6 @@ #include "blk.h" #include "blk-rq-qos.h" -/* - * Test patch to inline a certain number of bi_io_vec's inside the bio - * itself, to shrink a bio data allocation from two mempool calls to one - */ -#define BIO_INLINE_VECS 4 - /* * if you change this list, also change bvec_alloc or things will * break badly! cannot be bigger than what you can fit into an diff --git a/block/blk.h b/block/blk.h index 0198335c5838..e022a0d0f2ce 100644 --- a/block/blk.h +++ b/block/blk.h @@ -55,6 +55,7 @@ void blk_free_flush_queue(struct blk_flush_queue *q); void blk_freeze_queue(struct request_queue *q); +#define BIO_INLINE_VECS 4 struct bio_vec *bvec_alloc(gfp_t, int, unsigned long *, mempool_t *); void bvec_free(mempool_t *, struct bio_vec *, unsigned int); unsigned int bvec_nr_vecs(unsigned short idx); -- cgit v1.2.3-70-g09d2 From 6ac0b71537e1c14e7532408fe4aae553aa314237 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Feb 2021 18:19:20 +0100 Subject: block: move struct biovec_slab to bio.c struct biovec_slab is only used inside of bio.c, so move it there. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 6 ++++++ include/linux/bio.h | 6 ------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/block/bio.c b/block/bio.c index cee2d310f02e..2c359dadfdf6 100644 --- a/block/bio.c +++ b/block/bio.c @@ -25,6 +25,12 @@ #include "blk.h" #include "blk-rq-qos.h" +struct biovec_slab { + int nr_vecs; + char *name; + struct kmem_cache *slab; +}; + /* * if you change this list, also change bvec_alloc or things will * break badly! cannot be bigger than what you can fit into an diff --git a/include/linux/bio.h b/include/linux/bio.h index c74857cf1252..4a84207dd996 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -720,12 +720,6 @@ struct bio_set { struct workqueue_struct *rescue_workqueue; }; -struct biovec_slab { - int nr_vecs; - char *name; - struct kmem_cache *slab; -}; - static inline bool bioset_initialized(struct bio_set *bs) { return bs->bio_slab != NULL; -- cgit v1.2.3-70-g09d2 From f2c3eb9bb0ef77517976f8be926a77a574da8fe3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Feb 2021 18:19:21 +0100 Subject: block: factor out a bvec_alloc_gfp helper Clean up bvec_alloc a little by factoring out a helper for the gfp_t manipulations. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/block/bio.c b/block/bio.c index 2c359dadfdf6..c2152c4bf8a3 100644 --- a/block/bio.c +++ b/block/bio.c @@ -159,6 +159,16 @@ void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx) } } +/* + * Make the first allocation restricted and don't dump info on allocation + * failures, since we'll fall back to the mempool in case of failure. + */ +static inline gfp_t bvec_alloc_gfp(gfp_t gfp) +{ + return (gfp & ~(__GFP_DIRECT_RECLAIM | __GFP_IO)) | + __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; +} + struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx, mempool_t *pool) { @@ -199,20 +209,12 @@ fallback: bvl = mempool_alloc(pool, gfp_mask); } else { struct biovec_slab *bvs = bvec_slabs + *idx; - gfp_t __gfp_mask = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_IO); - - /* - * Make this allocation restricted and don't dump info on - * allocation failures, since we'll fallback to the mempool - * in case of failure. - */ - __gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; /* * Try a slab allocation. If this fails and __GFP_DIRECT_RECLAIM * is set, retry with the 1-entry mempool */ - bvl = kmem_cache_alloc(bvs->slab, __gfp_mask); + bvl = kmem_cache_alloc(bvs->slab, bvec_alloc_gfp(gfp_mask)); if (unlikely(!bvl && (gfp_mask & __GFP_DIRECT_RECLAIM))) { *idx = BVEC_POOL_MAX; goto fallback; -- cgit v1.2.3-70-g09d2 From f007a3d66c5480c8dae3fa20a89a06861ef1f5db Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Feb 2021 18:19:22 +0100 Subject: block: streamline bvec_alloc Avoid the pointless goto by trying the slab allocation first and falling through to the mempool. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/block/bio.c b/block/bio.c index c2152c4bf8a3..321b3479a154 100644 --- a/block/bio.c +++ b/block/bio.c @@ -172,8 +172,6 @@ static inline gfp_t bvec_alloc_gfp(gfp_t gfp) struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx, mempool_t *pool) { - struct bio_vec *bvl; - /* * see comment near bvec_array define! */ @@ -201,28 +199,24 @@ struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx, } /* - * idx now points to the pool we want to allocate from. only the - * 1-vec entry pool is mempool backed. + * Try a slab allocation first for all smaller allocations. If that + * fails and __GFP_DIRECT_RECLAIM is set retry with the mempool. + * The mempool is sized to handle up to BIO_MAX_PAGES entries. */ - if (*idx == BVEC_POOL_MAX) { -fallback: - bvl = mempool_alloc(pool, gfp_mask); - } else { + if (*idx < BVEC_POOL_MAX) { struct biovec_slab *bvs = bvec_slabs + *idx; + struct bio_vec *bvl; - /* - * Try a slab allocation. If this fails and __GFP_DIRECT_RECLAIM - * is set, retry with the 1-entry mempool - */ bvl = kmem_cache_alloc(bvs->slab, bvec_alloc_gfp(gfp_mask)); - if (unlikely(!bvl && (gfp_mask & __GFP_DIRECT_RECLAIM))) { - *idx = BVEC_POOL_MAX; - goto fallback; + if (likely(bvl) || !(gfp_mask & __GFP_DIRECT_RECLAIM)) { + (*idx)++; + return bvl; } + *idx = BVEC_POOL_MAX; } (*idx)++; - return bvl; + return mempool_alloc(pool, gfp_mask); } void bio_uninit(struct bio *bio) -- cgit v1.2.3-70-g09d2 From de76fd893074ab2cea132c28ac9efd9d0434215e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Feb 2021 18:19:23 +0100 Subject: block: remove the 1 and 4 vec bvec_slabs entries All bios with up to 4 bvecs use the inline bvecs in the bio itself, so don't bother to define bvec_slabs entries for them. Also decruftify the bvec_slabs definition and initialization while we're at it. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 53 ++++++++++++++++------------------------------------- 1 file changed, 16 insertions(+), 37 deletions(-) diff --git a/block/bio.c b/block/bio.c index 321b3479a154..ae241252ea14 100644 --- a/block/bio.c +++ b/block/bio.c @@ -25,23 +25,17 @@ #include "blk.h" #include "blk-rq-qos.h" -struct biovec_slab { +static struct biovec_slab { int nr_vecs; char *name; struct kmem_cache *slab; +} bvec_slabs[] __read_mostly = { + { .nr_vecs = 16, .name = "biovec-16" }, + { .nr_vecs = 64, .name = "biovec-64" }, + { .nr_vecs = 128, .name = "biovec-128" }, + { .nr_vecs = BIO_MAX_PAGES, .name = "biovec-max" }, }; -/* - * if you change this list, also change bvec_alloc or things will - * break badly! cannot be bigger than what you can fit into an - * unsigned short - */ -#define BV(x, n) { .nr_vecs = x, .name = "biovec-"#n } -static struct biovec_slab bvec_slabs[BVEC_POOL_NR] __read_mostly = { - BV(1, 1), BV(4, 4), BV(16, 16), BV(64, 64), BV(128, 128), BV(BIO_MAX_PAGES, max), -}; -#undef BV - /* * fs_bio_set is the bio_set containing bio and iovec memory pools used by * IO code that does not need private memory pools. @@ -176,12 +170,7 @@ struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx, * see comment near bvec_array define! */ switch (nr) { - case 1: - *idx = 0; - break; - case 2 ... 4: - *idx = 1; - break; + /* smaller bios use inline vecs */ case 5 ... 16: *idx = 2; break; @@ -1613,31 +1602,21 @@ int bioset_init_from_src(struct bio_set *bs, struct bio_set *src) } EXPORT_SYMBOL(bioset_init_from_src); -static void __init biovec_init_slabs(void) +static int __init init_bio(void) { int i; - for (i = 0; i < BVEC_POOL_NR; i++) { - int size; - struct biovec_slab *bvs = bvec_slabs + i; - - if (bvs->nr_vecs <= BIO_INLINE_VECS) { - bvs->slab = NULL; - continue; - } - - size = bvs->nr_vecs * sizeof(struct bio_vec); - bvs->slab = kmem_cache_create(bvs->name, size, 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); - } -} - -static int __init init_bio(void) -{ BUILD_BUG_ON(BIO_FLAG_LAST > BVEC_POOL_OFFSET); bio_integrity_init(); - biovec_init_slabs(); + + for (i = 0; i < ARRAY_SIZE(bvec_slabs); i++) { + struct biovec_slab *bvs = bvec_slabs + i; + + bvs->slab = kmem_cache_create(bvs->name, + bvs->nr_vecs * sizeof(struct bio_vec), 0, + SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + } if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS)) panic("bio: can't allocate bios\n"); -- cgit v1.2.3-70-g09d2 From 0f2e6ab851ae146c468bc5151c302c6e2473f70a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Feb 2021 18:19:24 +0100 Subject: block: turn the nr_iovecs argument to bio_alloc* into an unsigned short The bi_max_vecs and bi_vcnt fields are defined as unsigned short, so don't allow passing larger values in. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 4 ++-- include/linux/bio.h | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/block/bio.c b/block/bio.c index ae241252ea14..3d28d4723f6f 100644 --- a/block/bio.c +++ b/block/bio.c @@ -407,7 +407,7 @@ static void punt_bios_to_rescuer(struct bio_set *bs) * * Returns: Pointer to new bio on success, NULL on failure. */ -struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs, +struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned short nr_iovecs, struct bio_set *bs) { gfp_t saved_gfp = gfp_mask; @@ -493,7 +493,7 @@ EXPORT_SYMBOL(bio_alloc_bioset); * * Returns: Pointer to new bio on success, NULL on failure. */ -struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs) +struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs) { struct bio *bio; diff --git a/include/linux/bio.h b/include/linux/bio.h index 4a84207dd996..9ceeb8ecdb7f 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -407,8 +407,9 @@ extern void bioset_exit(struct bio_set *); extern int biovec_init_pool(mempool_t *pool, int pool_entries); extern int bioset_init_from_src(struct bio_set *bs, struct bio_set *src); -extern struct bio *bio_alloc_bioset(gfp_t, unsigned int, struct bio_set *); -struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs); +struct bio *bio_alloc_bioset(gfp_t gfp, unsigned short nr_iovecs, + struct bio_set *bs); +struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs); extern void bio_put(struct bio *); extern void __bio_clone_fast(struct bio *, struct bio *); @@ -416,7 +417,7 @@ extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *); extern struct bio_set fs_bio_set; -static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) +static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned short nr_iovecs) { return bio_alloc_bioset(gfp_mask, nr_iovecs, &fs_bio_set); } -- cgit v1.2.3-70-g09d2 From 86004515ed80c01d59ab54b5d048164750af3c4b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Feb 2021 18:19:25 +0100 Subject: block: remove a layer of indentation in bio_iov_iter_get_pages Remove a pointless layer of indentation after a return statement. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/block/bio.c b/block/bio.c index 3d28d4723f6f..dd3b2a01c9bf 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1081,15 +1081,15 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) bio_iov_bvec_set(bio, iter); bio_set_flag(bio, BIO_NO_PAGE_REF); return 0; - } else { - do { - if (bio_op(bio) == REQ_OP_ZONE_APPEND) - ret = __bio_iov_append_get_pages(bio, iter); - else - ret = __bio_iov_iter_get_pages(bio, iter); - } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); } + do { + if (bio_op(bio) == REQ_OP_ZONE_APPEND) + ret = __bio_iov_append_get_pages(bio, iter); + else + ret = __bio_iov_iter_get_pages(bio, iter); + } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); + /* don't account direct I/O as memory stall */ bio_clear_flag(bio, BIO_WORKINGSET); return bio->bi_vcnt ? 0 : ret; -- cgit v1.2.3-70-g09d2 From ed97ce5e1daf26d456760443fc89dc14d2b677e5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Feb 2021 18:19:26 +0100 Subject: block: set BIO_NO_PAGE_REF in bio_iov_bvec_set bio_iov_bvec_set assigns the foreign bvec, so setting the NO_PAGE_REF directly there seems like the best fit. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/block/bio.c b/block/bio.c index dd3b2a01c9bf..f75320123827 100644 --- a/block/bio.c +++ b/block/bio.c @@ -941,6 +941,7 @@ static int bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) bio->bi_io_vec = (struct bio_vec *)iter->bvec; bio->bi_iter.bi_bvec_done = iter->iov_offset; bio->bi_iter.bi_size = iter->count; + bio_set_flag(bio, BIO_NO_PAGE_REF); iov_iter_advance(iter, iter->count); return 0; @@ -1078,9 +1079,7 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) if (iov_iter_is_bvec(iter)) { if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND)) return -EINVAL; - bio_iov_bvec_set(bio, iter); - bio_set_flag(bio, BIO_NO_PAGE_REF); - return 0; + return bio_iov_bvec_set(bio, iter); } do { -- cgit v1.2.3-70-g09d2 From 977be01273844626ddeef4a464b42b99418d76e6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Feb 2021 18:19:27 +0100 Subject: block: mark the bio as cloned in bio_iov_bvec_set bio_iov_bvec_set clones the bio_vecs from the iter, and thus should be treated like a cloned bio in every respect. That also includes not touching bi_max_vecs as that is a property of the bio allocation and not its current payload. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bio.c b/block/bio.c index f75320123827..a36f955cd120 100644 --- a/block/bio.c +++ b/block/bio.c @@ -937,11 +937,11 @@ static int bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) WARN_ON_ONCE(BVEC_POOL_IDX(bio) != 0); bio->bi_vcnt = iter->nr_segs; - bio->bi_max_vecs = iter->nr_segs; bio->bi_io_vec = (struct bio_vec *)iter->bvec; bio->bi_iter.bi_bvec_done = iter->iov_offset; bio->bi_iter.bi_size = iter->count; bio_set_flag(bio, BIO_NO_PAGE_REF); + bio_set_flag(bio, BIO_CLONED); iov_iter_advance(iter, iter->count); return 0; -- cgit v1.2.3-70-g09d2 From 72b043654ba8b8ce2e0cf3da49247b2db3acb2c1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Feb 2021 18:19:28 +0100 Subject: md/raid10: remove dead code in reshape_request A bio allocated by bio_alloc_bioset comes pre-zeroed, no need to clear random fields. Signed-off-by: Christoph Hellwig Acked-by: Song Liu Signed-off-by: Jens Axboe --- drivers/md/raid10.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index e1eefbec15d4..a9ae7d113492 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4539,10 +4539,6 @@ read_more: read_bio->bi_private = r10_bio; read_bio->bi_end_io = end_reshape_read; bio_set_op_attrs(read_bio, REQ_OP_READ, 0); - read_bio->bi_flags &= (~0UL << BIO_RESET_BITS); - read_bio->bi_status = 0; - read_bio->bi_vcnt = 0; - read_bio->bi_iter.bi_size = 0; r10_bio->master_bio = read_bio; r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum; -- cgit v1.2.3-70-g09d2 From 7a800a20ae6329e803c5c646b20811a6ae9ca136 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Feb 2021 18:19:29 +0100 Subject: block: use bi_max_vecs to find the bvec pool Instead of encoding of the bvec pool using magic bio flags, just use a helper to find the pool based on the max_vecs value. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio-integrity.c | 11 ++--- block/bio.c | 104 +++++++++++++++++++--------------------------- block/blk.h | 6 +-- include/linux/bio.h | 1 - include/linux/blk_types.h | 29 +------------ 5 files changed, 51 insertions(+), 100 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 19617fa326c3..dfa652122a2d 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -28,7 +28,7 @@ static void __bio_integrity_free(struct bio_set *bs, if (bs && mempool_initialized(&bs->bio_integrity_pool)) { if (bip->bip_vec) bvec_free(&bs->bvec_integrity_pool, bip->bip_vec, - bip->bip_slab); + bip->bip_max_vcnt); mempool_free(bip, &bs->bio_integrity_pool); } else { kfree(bip); @@ -70,14 +70,11 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, memset(bip, 0, sizeof(*bip)); if (nr_vecs > inline_vecs) { - unsigned long idx = 0; - - bip->bip_vec = bvec_alloc(gfp_mask, nr_vecs, &idx, - &bs->bvec_integrity_pool); + bip->bip_max_vcnt = nr_vecs; + bip->bip_vec = bvec_alloc(&bs->bvec_integrity_pool, + &bip->bip_max_vcnt, gfp_mask); if (!bip->bip_vec) goto err; - bip->bip_max_vcnt = bvec_nr_vecs(idx); - bip->bip_slab = idx; } else { bip->bip_vec = bip->bip_inline_vecs; bip->bip_max_vcnt = inline_vecs; diff --git a/block/bio.c b/block/bio.c index a36f955cd120..a0eabe2f8b07 100644 --- a/block/bio.c +++ b/block/bio.c @@ -36,6 +36,24 @@ static struct biovec_slab { { .nr_vecs = BIO_MAX_PAGES, .name = "biovec-max" }, }; +static struct biovec_slab *biovec_slab(unsigned short nr_vecs) +{ + switch (nr_vecs) { + /* smaller bios use inline vecs */ + case 5 ... 16: + return &bvec_slabs[0]; + case 17 ... 64: + return &bvec_slabs[1]; + case 65 ... 128: + return &bvec_slabs[2]; + case 129 ... BIO_MAX_PAGES: + return &bvec_slabs[3]; + default: + BUG(); + return NULL; + } +} + /* * fs_bio_set is the bio_set containing bio and iovec memory pools used by * IO code that does not need private memory pools. @@ -131,26 +149,14 @@ out: mutex_unlock(&bio_slab_lock); } -unsigned int bvec_nr_vecs(unsigned short idx) +void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs) { - return bvec_slabs[--idx].nr_vecs; -} + BIO_BUG_ON(nr_vecs > BIO_MAX_PAGES); -void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx) -{ - if (!idx) - return; - idx--; - - BIO_BUG_ON(idx >= BVEC_POOL_NR); - - if (idx == BVEC_POOL_MAX) { + if (nr_vecs == BIO_MAX_PAGES) mempool_free(bv, pool); - } else { - struct biovec_slab *bvs = bvec_slabs + idx; - - kmem_cache_free(bvs->slab, bv); - } + else if (nr_vecs > BIO_INLINE_VECS) + kmem_cache_free(biovec_slab(nr_vecs)->slab, bv); } /* @@ -163,48 +169,34 @@ static inline gfp_t bvec_alloc_gfp(gfp_t gfp) __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; } -struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx, - mempool_t *pool) +struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, + gfp_t gfp_mask) { + struct biovec_slab *bvs = biovec_slab(*nr_vecs); + + if (WARN_ON_ONCE(!bvs)) + return NULL; + /* - * see comment near bvec_array define! + * Upgrade the nr_vecs request to take full advantage of the allocation. + * We also rely on this in the bvec_free path. */ - switch (nr) { - /* smaller bios use inline vecs */ - case 5 ... 16: - *idx = 2; - break; - case 17 ... 64: - *idx = 3; - break; - case 65 ... 128: - *idx = 4; - break; - case 129 ... BIO_MAX_PAGES: - *idx = 5; - break; - default: - return NULL; - } + *nr_vecs = bvs->nr_vecs; /* * Try a slab allocation first for all smaller allocations. If that * fails and __GFP_DIRECT_RECLAIM is set retry with the mempool. * The mempool is sized to handle up to BIO_MAX_PAGES entries. */ - if (*idx < BVEC_POOL_MAX) { - struct biovec_slab *bvs = bvec_slabs + *idx; + if (*nr_vecs < BIO_MAX_PAGES) { struct bio_vec *bvl; bvl = kmem_cache_alloc(bvs->slab, bvec_alloc_gfp(gfp_mask)); - if (likely(bvl) || !(gfp_mask & __GFP_DIRECT_RECLAIM)) { - (*idx)++; + if (likely(bvl) || !(gfp_mask & __GFP_DIRECT_RECLAIM)) return bvl; - } - *idx = BVEC_POOL_MAX; + *nr_vecs = BIO_MAX_PAGES; } - (*idx)++; return mempool_alloc(pool, gfp_mask); } @@ -231,7 +223,7 @@ static void bio_free(struct bio *bio) bio_uninit(bio); if (bs) { - bvec_free(&bs->bvec_pool, bio->bi_io_vec, BVEC_POOL_IDX(bio)); + bvec_free(&bs->bvec_pool, bio->bi_io_vec, bio->bi_max_vecs); /* * If we have front padding, adjust the bio pointer before freeing @@ -275,12 +267,8 @@ EXPORT_SYMBOL(bio_init); */ void bio_reset(struct bio *bio) { - unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS); - bio_uninit(bio); - memset(bio, 0, BIO_RESET_BYTES); - bio->bi_flags = flags; atomic_set(&bio->__bi_remaining, 1); } EXPORT_SYMBOL(bio_reset); @@ -453,22 +441,18 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned short nr_iovecs, bio = p + bs->front_pad; if (nr_iovecs > BIO_INLINE_VECS) { - unsigned long idx = 0; struct bio_vec *bvl = NULL; - bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool); + bvl = bvec_alloc(&bs->bvec_pool, &nr_iovecs, gfp_mask); if (!bvl && gfp_mask != saved_gfp) { punt_bios_to_rescuer(bs); gfp_mask = saved_gfp; - bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, - &bs->bvec_pool); + bvl = bvec_alloc(&bs->bvec_pool, &nr_iovecs, gfp_mask); } - if (unlikely(!bvl)) goto err_free; - bio_init(bio, bvl, bvec_nr_vecs(idx)); - bio->bi_flags |= idx << BVEC_POOL_OFFSET; + bio_init(bio, bvl, nr_iovecs); } else if (nr_iovecs) { bio_init(bio, bio->bi_inline_vecs, BIO_INLINE_VECS); } else { @@ -644,7 +628,7 @@ EXPORT_SYMBOL(bio_put); */ void __bio_clone_fast(struct bio *bio, struct bio *bio_src) { - BUG_ON(bio->bi_pool && BVEC_POOL_IDX(bio)); + WARN_ON_ONCE(bio->bi_pool && bio->bi_max_vecs); /* * most users will be overriding ->bi_bdev with a new target, @@ -934,7 +918,7 @@ EXPORT_SYMBOL_GPL(bio_release_pages); static int bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) { - WARN_ON_ONCE(BVEC_POOL_IDX(bio) != 0); + WARN_ON_ONCE(bio->bi_max_vecs); bio->bi_vcnt = iter->nr_segs; bio->bi_io_vec = (struct bio_vec *)iter->bvec; @@ -1495,7 +1479,7 @@ EXPORT_SYMBOL_GPL(bio_trim); */ int biovec_init_pool(mempool_t *pool, int pool_entries) { - struct biovec_slab *bp = bvec_slabs + BVEC_POOL_MAX; + struct biovec_slab *bp = bvec_slabs + ARRAY_SIZE(bvec_slabs) - 1; return mempool_init_slab_pool(pool, pool_entries, bp->slab); } @@ -1605,8 +1589,6 @@ static int __init init_bio(void) { int i; - BUILD_BUG_ON(BIO_FLAG_LAST > BVEC_POOL_OFFSET); - bio_integrity_init(); for (i = 0; i < ARRAY_SIZE(bvec_slabs); i++) { diff --git a/block/blk.h b/block/blk.h index e022a0d0f2ce..bfc4d526f626 100644 --- a/block/blk.h +++ b/block/blk.h @@ -56,9 +56,9 @@ void blk_free_flush_queue(struct blk_flush_queue *q); void blk_freeze_queue(struct request_queue *q); #define BIO_INLINE_VECS 4 -struct bio_vec *bvec_alloc(gfp_t, int, unsigned long *, mempool_t *); -void bvec_free(mempool_t *, struct bio_vec *, unsigned int); -unsigned int bvec_nr_vecs(unsigned short idx); +struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, + gfp_t gfp_mask); +void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs); static inline bool biovec_phys_mergeable(struct request_queue *q, struct bio_vec *vec1, struct bio_vec *vec2) diff --git a/include/linux/bio.h b/include/linux/bio.h index 9ceeb8ecdb7f..3cbbaf76906e 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -329,7 +329,6 @@ struct bio_integrity_payload { struct bvec_iter bip_iter; - unsigned short bip_slab; /* slab the bip came from */ unsigned short bip_vcnt; /* # of integrity bio_vecs */ unsigned short bip_max_vcnt; /* integrity bio_vec slots */ unsigned short bip_flags; /* control flags */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 1bc6f6a01070..db026b6ec15a 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -227,7 +227,7 @@ struct bio { * top bits REQ_OP. Use * accessors. */ - unsigned short bi_flags; /* status, etc and bvec pool number */ + unsigned short bi_flags; /* BIO_* below */ unsigned short bi_ioprio; unsigned short bi_write_hint; blk_status_t bi_status; @@ -307,33 +307,6 @@ enum { BIO_FLAG_LAST }; -/* See BVEC_POOL_OFFSET below before adding new flags */ - -/* - * We support 6 different bvec pools, the last one is magic in that it - * is backed by a mempool. - */ -#define BVEC_POOL_NR 6 -#define BVEC_POOL_MAX (BVEC_POOL_NR - 1) - -/* - * Top 3 bits of bio flags indicate the pool the bvecs came from. We add - * 1 to the actual index so that 0 indicates that there are no bvecs to be - * freed. - */ -#define BVEC_POOL_BITS (3) -#define BVEC_POOL_OFFSET (16 - BVEC_POOL_BITS) -#define BVEC_POOL_IDX(bio) ((bio)->bi_flags >> BVEC_POOL_OFFSET) -#if (1<< BVEC_POOL_BITS) < (BVEC_POOL_NR+1) -# error "BVEC_POOL_BITS is too small" -#endif - -/* - * Flags starting here get preserved by bio_reset() - this includes - * only BVEC_POOL_IDX() - */ -#define BIO_RESET_BITS BVEC_POOL_OFFSET - typedef __u32 __bitwise blk_mq_req_flags_t; /* -- cgit v1.2.3-70-g09d2 From f1836426cea77fad342aa74bec8bf489a5d64b27 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 28 Jan 2021 13:47:26 +0900 Subject: block: document zone_append_max_bytes attribute The description of the zone_append_max_bytes sysfs queue attribute is missing from Documentation/block/queue-sysfs.rst. Add it. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- Documentation/block/queue-sysfs.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/block/queue-sysfs.rst b/Documentation/block/queue-sysfs.rst index 2638d3446b79..edc6e6960b96 100644 --- a/Documentation/block/queue-sysfs.rst +++ b/Documentation/block/queue-sysfs.rst @@ -261,6 +261,12 @@ For block drivers that support REQ_OP_WRITE_ZEROES, the maximum number of bytes that can be zeroed at once. The value 0 means that REQ_OP_WRITE_ZEROES is not supported. +zone_append_max_bytes (RO) +-------------------------- +This is the maximum number of bytes that can be written to a sequential +zone of a zoned block device using a zone append write operation +(REQ_OP_ZONE_APPEND). This value is always 0 for regular block devices. + zoned (RO) ---------- This indicates if the device is a zoned block device and the zone model of the -- cgit v1.2.3-70-g09d2 From 73d90386b559d6f4c3c5db5e6bb1b68aae8fd3e7 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 28 Jan 2021 13:47:27 +0900 Subject: nvme: cleanup zone information initialization For a zoned namespace, in nvme_update_ns_info(), call nvme_update_zone_info() after executing nvme_update_disk_info() so that the namespace queue logical and physical block size limits are set. This allows setting the namespace queue max_zone_append_sectors limit in nvme_update_zone_info() instead of nvme_revalidate_zones(), simplifying this function. Also use blk_queue_set_zoned() to set the namespace zoned model. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 11 ++++++----- drivers/nvme/host/zns.c | 11 +++-------- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1a3cdc6b1036..81a1c7f6223f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2176,17 +2176,18 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id) ns->lba_shift = id->lbaf[lbaf].ds; nvme_set_queue_limits(ns->ctrl, ns->queue); + ret = nvme_configure_metadata(ns, id); + if (ret) + goto out_unfreeze; + nvme_set_chunk_sectors(ns, id); + nvme_update_disk_info(ns->disk, ns, id); + if (ns->head->ids.csi == NVME_CSI_ZNS) { ret = nvme_update_zone_info(ns, lbaf); if (ret) goto out_unfreeze; } - ret = nvme_configure_metadata(ns, id); - if (ret) - goto out_unfreeze; - nvme_set_chunk_sectors(ns, id); - nvme_update_disk_info(ns->disk, ns, id); blk_mq_unfreeze_queue(ns->disk->queue); if (blk_queue_is_zoned(ns->queue)) { diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c index 1dfe9a3500e3..c7e3ec561ba0 100644 --- a/drivers/nvme/host/zns.c +++ b/drivers/nvme/host/zns.c @@ -9,13 +9,7 @@ int nvme_revalidate_zones(struct nvme_ns *ns) { - struct request_queue *q = ns->queue; - int ret; - - ret = blk_revalidate_disk_zones(ns->disk, NULL); - if (!ret) - blk_queue_max_zone_append_sectors(q, ns->ctrl->max_zone_append); - return ret; + return blk_revalidate_disk_zones(ns->disk, NULL); } static int nvme_set_max_append(struct nvme_ctrl *ctrl) @@ -109,10 +103,11 @@ int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf) goto free_data; } - q->limits.zoned = BLK_ZONED_HM; + blk_queue_set_zoned(ns->disk, BLK_ZONED_HM); blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); blk_queue_max_open_zones(q, le32_to_cpu(id->mor) + 1); blk_queue_max_active_zones(q, le32_to_cpu(id->mar) + 1); + blk_queue_max_zone_append_sectors(q, ns->ctrl->max_zone_append); free_data: kfree(id); return status; -- cgit v1.2.3-70-g09d2 From 5752dc78a18118ae143962e10e5c28344d8ab731 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 28 Jan 2021 13:47:28 +0900 Subject: nullb: use blk_queue_set_zoned() to setup zoned devices Use blk_queue_set_zoned() to set a nullb device zone model instead of directly assigning the device queue zoned limit. This initialization of the devicve zoned model as well as the setup of the queue flag QUEUE_FLAG_ZONE_RESETALL and of the device queue elevator feature are moved from null_init_zoned_dev() to null_register_zoned_dev() so that the initialization of the queue limits is done when the gendisk of the nullb device is available. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/null_blk/zoned.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c index 148b871f263b..78cae8703dcf 100644 --- a/drivers/block/null_blk/zoned.c +++ b/drivers/block/null_blk/zoned.c @@ -146,10 +146,6 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) sector += dev->zone_size_sects; } - q->limits.zoned = BLK_ZONED_HM; - blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); - blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE); - return 0; } @@ -158,6 +154,10 @@ int null_register_zoned_dev(struct nullb *nullb) struct nullb_device *dev = nullb->dev; struct request_queue *q = nullb->q; + blk_queue_set_zoned(nullb->disk, BLK_ZONED_HM); + blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); + blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE); + if (queue_is_mq(q)) { int ret = blk_revalidate_disk_zones(nullb->disk, NULL); -- cgit v1.2.3-70-g09d2 From eafc63a9f78e315e7a93c455859b776713da8b69 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 28 Jan 2021 13:47:29 +0900 Subject: block: use blk_queue_set_zoned in add_partition() When changing the zoned model of host-aware zoned block devices, use blk_queue_set_zoned() instead of directly assigning the gendisk queue zoned limit. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/partitions/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/partitions/core.c b/block/partitions/core.c index b1cdf88f96e2..d6094203116a 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -334,7 +334,7 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, case BLK_ZONED_HA: pr_info("%s: disabling host aware zoned block device support due to partitions\n", disk->disk_name); - disk->queue->limits.zoned = BLK_ZONED_NONE; + blk_queue_set_zoned(disk, BLK_ZONED_NONE); break; case BLK_ZONED_NONE: break; -- cgit v1.2.3-70-g09d2 From a805a4fa4fa376bbc145762bb8b09caa2fa8af48 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 28 Jan 2021 13:47:30 +0900 Subject: block: introduce zone_write_granularity limit Per ZBC and ZAC specifications, host-managed SMR hard-disks mandate that all writes into sequential write required zones be aligned to the device physical block size. However, NVMe ZNS does not have this constraint and allows write operations into sequential zones to be aligned to the device logical block size. This inconsistency does not help with software portability across device types. To solve this, introduce the zone_write_granularity queue limit to indicate the alignment constraint, in bytes, of write operations into zones of a zoned block device. This new limit is exported as a read-only sysfs queue attribute and the helper blk_queue_zone_write_granularity() introduced for drivers to set this limit. The function blk_queue_set_zoned() is modified to set this new limit to the device logical block size by default. NVMe ZNS devices as well as zoned nullb devices use this default value as is. The scsi disk driver is modified to execute the blk_queue_zone_write_granularity() helper to set the zone write granularity of host-managed SMR disks to the disk physical block size. The accessor functions queue_zone_write_granularity() and bdev_zone_write_granularity() are also introduced. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- Documentation/block/queue-sysfs.rst | 7 +++++++ block/blk-settings.c | 37 ++++++++++++++++++++++++++++++++++++- block/blk-sysfs.c | 8 ++++++++ drivers/scsi/sd_zbc.c | 8 ++++++++ include/linux/blkdev.h | 15 +++++++++++++++ 5 files changed, 74 insertions(+), 1 deletion(-) diff --git a/Documentation/block/queue-sysfs.rst b/Documentation/block/queue-sysfs.rst index edc6e6960b96..4dc7f0d499a8 100644 --- a/Documentation/block/queue-sysfs.rst +++ b/Documentation/block/queue-sysfs.rst @@ -279,4 +279,11 @@ devices are described in the ZBC (Zoned Block Commands) and ZAC do not support zone commands, they will be treated as regular block devices and zoned will report "none". +zone_write_granularity (RO) +--------------------------- +This indicates the alignment constraint, in bytes, for write operations in +sequential zones of zoned block devices (devices with a zoned attributed +that reports "host-managed" or "host-aware"). This value is always 0 for +regular block devices. + Jens Axboe , February 2009 diff --git a/block/blk-settings.c b/block/blk-settings.c index 4c974340f1a9..a1e66165adcf 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -60,6 +60,7 @@ void blk_set_default_limits(struct queue_limits *lim) lim->io_opt = 0; lim->misaligned = 0; lim->zoned = BLK_ZONED_NONE; + lim->zone_write_granularity = 0; } EXPORT_SYMBOL(blk_set_default_limits); @@ -366,6 +367,28 @@ void blk_queue_physical_block_size(struct request_queue *q, unsigned int size) } EXPORT_SYMBOL(blk_queue_physical_block_size); +/** + * blk_queue_zone_write_granularity - set zone write granularity for the queue + * @q: the request queue for the zoned device + * @size: the zone write granularity size, in bytes + * + * Description: + * This should be set to the lowest possible size allowing to write in + * sequential zones of a zoned block device. + */ +void blk_queue_zone_write_granularity(struct request_queue *q, + unsigned int size) +{ + if (WARN_ON_ONCE(!blk_queue_is_zoned(q))) + return; + + q->limits.zone_write_granularity = size; + + if (q->limits.zone_write_granularity < q->limits.logical_block_size) + q->limits.zone_write_granularity = q->limits.logical_block_size; +} +EXPORT_SYMBOL_GPL(blk_queue_zone_write_granularity); + /** * blk_queue_alignment_offset - set physical block alignment offset * @q: the request queue for the device @@ -631,6 +654,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->discard_granularity; } + t->zone_write_granularity = max(t->zone_write_granularity, + b->zone_write_granularity); t->zoned = max(t->zoned, b->zoned); return ret; } @@ -847,6 +872,8 @@ EXPORT_SYMBOL_GPL(blk_queue_can_use_dma_map_merging); */ void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model) { + struct request_queue *q = disk->queue; + switch (model) { case BLK_ZONED_HM: /* @@ -875,7 +902,15 @@ void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model) break; } - disk->queue->limits.zoned = model; + q->limits.zoned = model; + if (model != BLK_ZONED_NONE) { + /* + * Set the zone write granularity to the device logical block + * size by default. The driver can change this value if needed. + */ + blk_queue_zone_write_granularity(q, + queue_logical_block_size(q)); + } } EXPORT_SYMBOL_GPL(blk_queue_set_zoned); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index b513f1683af0..ae39c7f3d83d 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -219,6 +219,12 @@ static ssize_t queue_write_zeroes_max_show(struct request_queue *q, char *page) (unsigned long long)q->limits.max_write_zeroes_sectors << 9); } +static ssize_t queue_zone_write_granularity_show(struct request_queue *q, + char *page) +{ + return queue_var_show(queue_zone_write_granularity(q), page); +} + static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page) { unsigned long long max_sectors = q->limits.max_zone_append_sectors; @@ -585,6 +591,7 @@ QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data"); QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes"); QUEUE_RO_ENTRY(queue_write_zeroes_max, "write_zeroes_max_bytes"); QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes"); +QUEUE_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity"); QUEUE_RO_ENTRY(queue_zoned, "zoned"); QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones"); @@ -639,6 +646,7 @@ static struct attribute *queue_attrs[] = { &queue_write_same_max_entry.attr, &queue_write_zeroes_max_entry.attr, &queue_zone_append_max_entry.attr, + &queue_zone_write_granularity_entry.attr, &queue_nonrot_entry.attr, &queue_zoned_entry.attr, &queue_nr_zones_entry.attr, diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index cf07b7f93579..8293b29584b3 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -789,6 +789,14 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf) blk_queue_max_active_zones(q, 0); nr_zones = round_up(sdkp->capacity, zone_blocks) >> ilog2(zone_blocks); + /* + * Per ZBC and ZAC specifications, writes in sequential write required + * zones of host-managed devices must be aligned to the device physical + * block size. + */ + if (blk_queue_zoned_model(q) == BLK_ZONED_HM) + blk_queue_zone_write_granularity(q, sdkp->physical_block_size); + /* READ16/WRITE16 is mandatory for ZBC disks */ sdkp->device->use_16_for_rw = 1; sdkp->device->use_10_for_rw = 0; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0dea268bd61b..9149f4a5adb3 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -337,6 +337,7 @@ struct queue_limits { unsigned int max_zone_append_sectors; unsigned int discard_granularity; unsigned int discard_alignment; + unsigned int zone_write_granularity; unsigned short max_segments; unsigned short max_integrity_segments; @@ -1160,6 +1161,8 @@ extern void blk_queue_logical_block_size(struct request_queue *, unsigned int); extern void blk_queue_max_zone_append_sectors(struct request_queue *q, unsigned int max_zone_append_sectors); extern void blk_queue_physical_block_size(struct request_queue *, unsigned int); +void blk_queue_zone_write_granularity(struct request_queue *q, + unsigned int size); extern void blk_queue_alignment_offset(struct request_queue *q, unsigned int alignment); void blk_queue_update_readahead(struct request_queue *q); @@ -1473,6 +1476,18 @@ static inline int bdev_io_opt(struct block_device *bdev) return queue_io_opt(bdev_get_queue(bdev)); } +static inline unsigned int +queue_zone_write_granularity(const struct request_queue *q) +{ + return q->limits.zone_write_granularity; +} + +static inline unsigned int +bdev_zone_write_granularity(struct block_device *bdev) +{ + return queue_zone_write_granularity(bdev_get_queue(bdev)); +} + static inline int queue_alignment_offset(const struct request_queue *q) { if (q->limits.misaligned) -- cgit v1.2.3-70-g09d2 From 0f1ba5f5d80f84b605e70cf4661cb1a44a3c02a6 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 28 Jan 2021 13:47:31 +0900 Subject: zonefs: use zone write granularity as block size Zoned block devices have different granularity constraints for write operations into sequential zones. E.g. ZBC and ZAC devices require that writes be aligned to the device physical block size while NVMe ZNS devices allow logical block size aligned write operations. To correctly handle such difference, use the device zone write granularity limit to set the block size of a zonefs volume, thus allowing the smallest possible write unit for all zoned device types. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- fs/zonefs/super.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index ab68e27bb322..b9fb55b250ae 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -1581,12 +1581,11 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent) sb->s_time_gran = 1; /* - * The block size is set to the device physical sector size to ensure - * that write operations on 512e devices (512B logical block and 4KB - * physical block) are always aligned to the device physical blocks, - * as mandated by the ZBC/ZAC specifications. + * The block size is set to the device zone write granularity to ensure + * that write operations are always aligned according to the device + * interface constraints. */ - sb_set_blocksize(sb, bdev_physical_block_size(sb->s_bdev)); + sb_set_blocksize(sb, bdev_zone_write_granularity(sb->s_bdev)); sbi->s_zone_sectors_shift = ilog2(bdev_zone_sectors(sb->s_bdev)); sbi->s_uid = GLOBAL_ROOT_UID; sbi->s_gid = GLOBAL_ROOT_GID; -- cgit v1.2.3-70-g09d2 From 508aebb805277c541e94ee14daba4191ff02347e Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 28 Jan 2021 13:47:32 +0900 Subject: block: introduce blk_queue_clear_zone_settings() Introduce the internal function blk_queue_clear_zone_settings() to cleanup all limits and resources related to zoned block devices. This new function is called from blk_queue_set_zoned() when a disk zoned model is set to BLK_ZONED_NONE. This particular case can happens when a partition is created on a host-aware scsi disk. Signed-off-by: Damien Le Moal Reviewed-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-settings.c | 2 ++ block/blk-zoned.c | 17 +++++++++++++++++ block/blk.h | 2 ++ 3 files changed, 21 insertions(+) diff --git a/block/blk-settings.c b/block/blk-settings.c index a1e66165adcf..7dd8be314ac6 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -910,6 +910,8 @@ void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model) */ blk_queue_zone_write_granularity(q, queue_logical_block_size(q)); + } else { + blk_queue_clear_zone_settings(q); } } EXPORT_SYMBOL_GPL(blk_queue_set_zoned); diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 7a68b6e4300c..833978c02e60 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -549,3 +549,20 @@ int blk_revalidate_disk_zones(struct gendisk *disk, return ret; } EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones); + +void blk_queue_clear_zone_settings(struct request_queue *q) +{ + blk_mq_freeze_queue(q); + + blk_queue_free_zone_bitmaps(q); + blk_queue_flag_clear(QUEUE_FLAG_ZONE_RESETALL, q); + q->required_elevator_features &= ~ELEVATOR_F_ZBD_SEQ_WRITE; + q->nr_zones = 0; + q->max_open_zones = 0; + q->max_active_zones = 0; + q->limits.chunk_sectors = 0; + q->limits.zone_write_granularity = 0; + q->limits.max_zone_append_sectors = 0; + + blk_mq_unfreeze_queue(q); +} diff --git a/block/blk.h b/block/blk.h index bfc4d526f626..3b53e44b967e 100644 --- a/block/blk.h +++ b/block/blk.h @@ -334,8 +334,10 @@ struct bio *blk_next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp); #ifdef CONFIG_BLK_DEV_ZONED void blk_queue_free_zone_bitmaps(struct request_queue *q); +void blk_queue_clear_zone_settings(struct request_queue *q); #else static inline void blk_queue_free_zone_bitmaps(struct request_queue *q) {} +static inline void blk_queue_clear_zone_settings(struct request_queue *q) {} #endif int blk_alloc_devt(struct block_device *part, dev_t *devt); -- cgit v1.2.3-70-g09d2 From 78e1663f19b5c34579cf186e776df3bf1ed326a5 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 28 Jan 2021 13:47:33 +0900 Subject: sd_zbc: clear zone resources for non-zoned case For host-aware ZBC disk, setting the device zoned model to BLK_ZONED_HA using blk_queue_set_zoned() in sd_read_block_characteristics() may result in the block device effective zoned model to be "none" (BLK_ZONED_NONE) if partitions are present on the device. In this case, sd_zbc_read_zones() should not setup the zone related queue limits for the disk so that the device limits and configuration is consistent with a regular disk and resources not uselessly allocated (e.g. the zone write pointer tracking array for zone append emulation). Furthermore, if the disk zoned model changes at run time due to the creation of a partition by the user, the zone related resources can be released. Fix both problems by introducing the function sd_zbc_clear_zone_info() to reset the scsi disk zone information and free resources and by returning early in sd_zbc_read_zones() for a block device that has a zoned model equal to BLK_ZONED_NONE. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/scsi/sd_zbc.c | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index 8293b29584b3..03adb39293c2 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -665,12 +665,28 @@ static int sd_zbc_init_disk(struct scsi_disk *sdkp) return 0; } -void sd_zbc_release_disk(struct scsi_disk *sdkp) +static void sd_zbc_clear_zone_info(struct scsi_disk *sdkp) { + /* Serialize against revalidate zones */ + mutex_lock(&sdkp->rev_mutex); + kvfree(sdkp->zones_wp_offset); sdkp->zones_wp_offset = NULL; kfree(sdkp->zone_wp_update_buf); sdkp->zone_wp_update_buf = NULL; + + sdkp->nr_zones = 0; + sdkp->rev_nr_zones = 0; + sdkp->zone_blocks = 0; + sdkp->rev_zone_blocks = 0; + + mutex_unlock(&sdkp->rev_mutex); +} + +void sd_zbc_release_disk(struct scsi_disk *sdkp) +{ + if (sd_is_zoned(sdkp)) + sd_zbc_clear_zone_info(sdkp); } static void sd_zbc_revalidate_zones_cb(struct gendisk *disk) @@ -769,6 +785,21 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf) */ return 0; + /* READ16/WRITE16 is mandatory for ZBC disks */ + sdkp->device->use_16_for_rw = 1; + sdkp->device->use_10_for_rw = 0; + + if (!blk_queue_is_zoned(q)) { + /* + * This can happen for a host aware disk with partitions. + * The block device zone information was already cleared + * by blk_queue_set_zoned(). Only clear the scsi disk zone + * information and exit early. + */ + sd_zbc_clear_zone_info(sdkp); + return 0; + } + /* Check zoned block device characteristics (unconstrained reads) */ ret = sd_zbc_check_zoned_characteristics(sdkp, buf); if (ret) @@ -797,10 +828,6 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf) if (blk_queue_zoned_model(q) == BLK_ZONED_HM) blk_queue_zone_write_granularity(q, sdkp->physical_block_size); - /* READ16/WRITE16 is mandatory for ZBC disks */ - sdkp->device->use_16_for_rw = 1; - sdkp->device->use_10_for_rw = 0; - sdkp->rev_nr_zones = nr_zones; sdkp->rev_zone_blocks = zone_blocks; -- cgit v1.2.3-70-g09d2 From f885056a48ccf4ad4332def91e973f3993fa8695 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 9 Feb 2021 18:14:19 +0100 Subject: mm: simplify swapdev_block Open code the parts of map_swap_entry that was actually used by swapdev_block, and remove the now unused map_swap_entry function. Signed-off-by: Christoph Hellwig Reviewed-by: Rafael J. Wysocki Signed-off-by: Jens Axboe --- mm/swapfile.c | 30 +++--------------------------- 1 file changed, 3 insertions(+), 27 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 351999a84e6e..21a98cb8d646 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1790,9 +1790,6 @@ int free_swap_and_cache(swp_entry_t entry) } #ifdef CONFIG_HIBERNATION - -static sector_t map_swap_entry(swp_entry_t, struct block_device**); - /* * Find the swap type that corresponds to given device (if any). * @@ -1852,12 +1849,13 @@ int find_first_swap(dev_t *device) */ sector_t swapdev_block(int type, pgoff_t offset) { - struct block_device *bdev; struct swap_info_struct *si = swap_type_to_swap_info(type); + struct swap_extent *se; if (!si || !(si->flags & SWP_WRITEOK)) return 0; - return map_swap_entry(swp_entry(type, offset), &bdev); + se = offset_to_swap_extent(si, offset); + return se->start_block + (offset - se->start_page); } /* @@ -2283,28 +2281,6 @@ static void drain_mmlist(void) spin_unlock(&mmlist_lock); } -#ifdef CONFIG_HIBERNATION -/* - * Use this swapdev's extent info to locate the (PAGE_SIZE) block which - * corresponds to page offset for the specified swap entry. - * Note that the type of this function is sector_t, but it returns page offset - * into the bdev, not sector offset. - */ -static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev) -{ - struct swap_info_struct *sis; - struct swap_extent *se; - pgoff_t offset; - - sis = swp_swap_info(entry); - *bdev = sis->bdev; - - offset = swp_offset(entry); - se = offset_to_swap_extent(sis, offset); - return se->start_block + (offset - se->start_page); -} -#endif - /* * Free all of a swapdev's extent information */ -- cgit v1.2.3-70-g09d2