diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-08-14 10:23:25 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-08-14 10:23:25 -0700 |
commit | 73ba2fb33c492916853dfe63e3b3163da0be661d (patch) | |
tree | c2fda8ca1273744d2e884d24189a15ac1a7d63c2 /block/bio.c | |
parent | 958f338e96f874a0d29442396d6adf9c1e17aa2d (diff) | |
parent | b86d865cb1cae1e61527ea0b8977078bbf694328 (diff) |
Merge tag 'for-4.19/block-20180812' of git://git.kernel.dk/linux-block
Pull block updates from Jens Axboe:
"First pull request for this merge window, there will also be a
followup request with some stragglers.
This pull request contains:
- Fix for a thundering heard issue in the wbt block code (Anchal
Agarwal)
- A few NVMe pull requests:
* Improved tracepoints (Keith)
* Larger inline data support for RDMA (Steve Wise)
* RDMA setup/teardown fixes (Sagi)
* Effects log suppor for NVMe target (Chaitanya Kulkarni)
* Buffered IO suppor for NVMe target (Chaitanya Kulkarni)
* TP4004 (ANA) support (Christoph)
* Various NVMe fixes
- Block io-latency controller support. Much needed support for
properly containing block devices. (Josef)
- Series improving how we handle sense information on the stack
(Kees)
- Lightnvm fixes and updates/improvements (Mathias/Javier et al)
- Zoned device support for null_blk (Matias)
- AIX partition fixes (Mauricio Faria de Oliveira)
- DIF checksum code made generic (Max Gurtovoy)
- Add support for discard in iostats (Michael Callahan / Tejun)
- Set of updates for BFQ (Paolo)
- Removal of async write support for bsg (Christoph)
- Bio page dirtying and clone fixups (Christoph)
- Set of bcache fix/changes (via Coly)
- Series improving blk-mq queue setup/teardown speed (Ming)
- Series improving merging performance on blk-mq (Ming)
- Lots of other fixes and cleanups from a slew of folks"
* tag 'for-4.19/block-20180812' of git://git.kernel.dk/linux-block: (190 commits)
blkcg: Make blkg_root_lookup() work for queues in bypass mode
bcache: fix error setting writeback_rate through sysfs interface
null_blk: add lock drop/acquire annotation
Blk-throttle: reduce tail io latency when iops limit is enforced
block: paride: pd: mark expected switch fall-throughs
block: Ensure that a request queue is dissociated from the cgroup controller
block: Introduce blk_exit_queue()
blkcg: Introduce blkg_root_lookup()
block: Remove two superfluous #include directives
blk-mq: count the hctx as active before allocating tag
block: bvec_nr_vecs() returns value for wrong slab
bcache: trivial - remove tailing backslash in macro BTREE_FLAG
bcache: make the pr_err statement used for ENOENT only in sysfs_attatch section
bcache: set max writeback rate when I/O request is idle
bcache: add code comments for bset.c
bcache: fix mistaken comments in request.c
bcache: fix mistaken code comments in bcache.h
bcache: add a comment in super.c
bcache: avoid unncessary cache prefetch bch_btree_node_get()
bcache: display rate debug parameters to 0 when writeback is not running
...
Diffstat (limited to 'block/bio.c')
-rw-r--r-- | block/bio.c | 208 |
1 files changed, 84 insertions, 124 deletions
diff --git a/block/bio.c b/block/bio.c index 047c5dca6d90..b12966e415d3 100644 --- a/block/bio.c +++ b/block/bio.c @@ -28,9 +28,11 @@ #include <linux/mempool.h> #include <linux/workqueue.h> #include <linux/cgroup.h> +#include <linux/blk-cgroup.h> #include <trace/events/block.h> #include "blk.h" +#include "blk-rq-qos.h" /* * Test patch to inline a certain number of bi_io_vec's inside the bio @@ -156,7 +158,7 @@ out: unsigned int bvec_nr_vecs(unsigned short idx) { - return bvec_slabs[idx].nr_vecs; + return bvec_slabs[--idx].nr_vecs; } void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx) @@ -645,83 +647,6 @@ struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs) EXPORT_SYMBOL(bio_clone_fast); /** - * bio_clone_bioset - clone a bio - * @bio_src: bio to clone - * @gfp_mask: allocation priority - * @bs: bio_set to allocate from - * - * Clone bio. Caller will own the returned bio, but not the actual data it - * points to. Reference count of returned bio will be one. - */ -struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, - struct bio_set *bs) -{ - struct bvec_iter iter; - struct bio_vec bv; - struct bio *bio; - - /* - * Pre immutable biovecs, __bio_clone() used to just do a memcpy from - * bio_src->bi_io_vec to bio->bi_io_vec. - * - * We can't do that anymore, because: - * - * - The point of cloning the biovec is to produce a bio with a biovec - * the caller can modify: bi_idx and bi_bvec_done should be 0. - * - * - The original bio could've had more than BIO_MAX_PAGES biovecs; if - * we tried to clone the whole thing bio_alloc_bioset() would fail. - * But the clone should succeed as long as the number of biovecs we - * actually need to allocate is fewer than BIO_MAX_PAGES. - * - * - Lastly, bi_vcnt should not be looked at or relied upon by code - * that does not own the bio - reason being drivers don't use it for - * iterating over the biovec anymore, so expecting it to be kept up - * to date (i.e. for clones that share the parent biovec) is just - * asking for trouble and would force extra work on - * __bio_clone_fast() anyways. - */ - - bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs); - if (!bio) - return NULL; - bio->bi_disk = bio_src->bi_disk; - bio->bi_opf = bio_src->bi_opf; - bio->bi_write_hint = bio_src->bi_write_hint; - bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; - bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; - - switch (bio_op(bio)) { - case REQ_OP_DISCARD: - case REQ_OP_SECURE_ERASE: - case REQ_OP_WRITE_ZEROES: - break; - case REQ_OP_WRITE_SAME: - bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0]; - break; - default: - bio_for_each_segment(bv, bio_src, iter) - bio->bi_io_vec[bio->bi_vcnt++] = bv; - break; - } - - if (bio_integrity(bio_src)) { - int ret; - - ret = bio_integrity_clone(bio, bio_src, gfp_mask); - if (ret < 0) { - bio_put(bio); - return NULL; - } - } - - bio_clone_blkcg_association(bio, bio_src); - - return bio; -} -EXPORT_SYMBOL(bio_clone_bioset); - -/** * bio_add_pc_page - attempt to add page to bio * @q: the target queue * @bio: destination bio @@ -1661,10 +1586,8 @@ void bio_set_pages_dirty(struct bio *bio) int i; bio_for_each_segment_all(bvec, bio, i) { - struct page *page = bvec->bv_page; - - if (page && !PageCompound(page)) - set_page_dirty_lock(page); + if (!PageCompound(bvec->bv_page)) + set_page_dirty_lock(bvec->bv_page); } } EXPORT_SYMBOL_GPL(bio_set_pages_dirty); @@ -1674,19 +1597,15 @@ static void bio_release_pages(struct bio *bio) struct bio_vec *bvec; int i; - bio_for_each_segment_all(bvec, bio, i) { - struct page *page = bvec->bv_page; - - if (page) - put_page(page); - } + bio_for_each_segment_all(bvec, bio, i) + put_page(bvec->bv_page); } /* * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. * If they are, then fine. If, however, some pages are clean then they must * have been written out during the direct-IO read. So we take another ref on - * the BIO and the offending pages and re-dirty the pages in process context. + * the BIO and re-dirty the pages in process context. * * It is expected that bio_check_pages_dirty() will wholly own the BIO from * here on. It will run one put_page() against each page and will run one @@ -1704,78 +1623,70 @@ static struct bio *bio_dirty_list; */ static void bio_dirty_fn(struct work_struct *work) { - unsigned long flags; - struct bio *bio; + struct bio *bio, *next; - spin_lock_irqsave(&bio_dirty_lock, flags); - bio = bio_dirty_list; + spin_lock_irq(&bio_dirty_lock); + next = bio_dirty_list; bio_dirty_list = NULL; - spin_unlock_irqrestore(&bio_dirty_lock, flags); + spin_unlock_irq(&bio_dirty_lock); - while (bio) { - struct bio *next = bio->bi_private; + while ((bio = next) != NULL) { + next = bio->bi_private; bio_set_pages_dirty(bio); bio_release_pages(bio); bio_put(bio); - bio = next; } } void bio_check_pages_dirty(struct bio *bio) { struct bio_vec *bvec; - int nr_clean_pages = 0; + unsigned long flags; int i; bio_for_each_segment_all(bvec, bio, i) { - struct page *page = bvec->bv_page; - - if (PageDirty(page) || PageCompound(page)) { - put_page(page); - bvec->bv_page = NULL; - } else { - nr_clean_pages++; - } + if (!PageDirty(bvec->bv_page) && !PageCompound(bvec->bv_page)) + goto defer; } - if (nr_clean_pages) { - unsigned long flags; - - spin_lock_irqsave(&bio_dirty_lock, flags); - bio->bi_private = bio_dirty_list; - bio_dirty_list = bio; - spin_unlock_irqrestore(&bio_dirty_lock, flags); - schedule_work(&bio_dirty_work); - } else { - bio_put(bio); - } + bio_release_pages(bio); + bio_put(bio); + return; +defer: + spin_lock_irqsave(&bio_dirty_lock, flags); + bio->bi_private = bio_dirty_list; + bio_dirty_list = bio; + spin_unlock_irqrestore(&bio_dirty_lock, flags); + schedule_work(&bio_dirty_work); } EXPORT_SYMBOL_GPL(bio_check_pages_dirty); -void generic_start_io_acct(struct request_queue *q, int rw, +void generic_start_io_acct(struct request_queue *q, int op, unsigned long sectors, struct hd_struct *part) { + const int sgrp = op_stat_group(op); int cpu = part_stat_lock(); part_round_stats(q, cpu, part); - part_stat_inc(cpu, part, ios[rw]); - part_stat_add(cpu, part, sectors[rw], sectors); - part_inc_in_flight(q, part, rw); + part_stat_inc(cpu, part, ios[sgrp]); + part_stat_add(cpu, part, sectors[sgrp], sectors); + part_inc_in_flight(q, part, op_is_write(op)); part_stat_unlock(); } EXPORT_SYMBOL(generic_start_io_acct); -void generic_end_io_acct(struct request_queue *q, int rw, +void generic_end_io_acct(struct request_queue *q, int req_op, struct hd_struct *part, unsigned long start_time) { unsigned long duration = jiffies - start_time; + const int sgrp = op_stat_group(req_op); int cpu = part_stat_lock(); - part_stat_add(cpu, part, ticks[rw], duration); + part_stat_add(cpu, part, ticks[sgrp], duration); part_round_stats(q, cpu, part); - part_dec_in_flight(q, part, rw); + part_dec_in_flight(q, part, op_is_write(req_op)); part_stat_unlock(); } @@ -1834,6 +1745,9 @@ again: if (!bio_integrity_endio(bio)) return; + if (bio->bi_disk) + rq_qos_done_bio(bio->bi_disk->queue, bio); + /* * Need to have a real endio function for chained bios, otherwise * various corner cases will break (like stacking block devices that @@ -2042,6 +1956,30 @@ EXPORT_SYMBOL(bioset_init_from_src); #ifdef CONFIG_BLK_CGROUP +#ifdef CONFIG_MEMCG +/** + * bio_associate_blkcg_from_page - associate a bio with the page's blkcg + * @bio: target bio + * @page: the page to lookup the blkcg from + * + * Associate @bio with the blkcg from @page's owning memcg. This works like + * every other associate function wrt references. + */ +int bio_associate_blkcg_from_page(struct bio *bio, struct page *page) +{ + struct cgroup_subsys_state *blkcg_css; + + if (unlikely(bio->bi_css)) + return -EBUSY; + if (!page->mem_cgroup) + return 0; + blkcg_css = cgroup_get_e_css(page->mem_cgroup->css.cgroup, + &io_cgrp_subsys); + bio->bi_css = blkcg_css; + return 0; +} +#endif /* CONFIG_MEMCG */ + /** * bio_associate_blkcg - associate a bio with the specified blkcg * @bio: target bio @@ -2065,6 +2003,24 @@ int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css) EXPORT_SYMBOL_GPL(bio_associate_blkcg); /** + * bio_associate_blkg - associate a bio with the specified blkg + * @bio: target bio + * @blkg: the blkg to associate + * + * Associate @bio with the blkg specified by @blkg. This is the queue specific + * blkcg information associated with the @bio, a reference will be taken on the + * @blkg and will be freed when the bio is freed. + */ +int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg) +{ + if (unlikely(bio->bi_blkg)) + return -EBUSY; + blkg_get(blkg); + bio->bi_blkg = blkg; + return 0; +} + +/** * bio_disassociate_task - undo bio_associate_current() * @bio: target bio */ @@ -2078,6 +2034,10 @@ void bio_disassociate_task(struct bio *bio) css_put(bio->bi_css); bio->bi_css = NULL; } + if (bio->bi_blkg) { + blkg_put(bio->bi_blkg); + bio->bi_blkg = NULL; + } } /** |