author     Michal Marek <mmarek@suse.cz>    2010-10-28 00:15:57 +0200
committer  Michal Marek <mmarek@suse.cz>    2010-10-28 00:15:57 +0200
commit     b74b953b998bcc2db91b694446f3a2619ec32de6 (patch)
tree       6ce24caabd730f6ae9287ed0676ec32e6ff31e9d /block
parent     abb438526201c6a79949ad45375c051b6681c253 (diff)
parent     f6f94e2ab1b33f0082ac22d71f66385a60d8157f (diff)
Merge commit 'v2.6.36' into kbuild/misc
Update to be able to fix a recent change to scripts/basic/docproc.c
(commit eda603f).
Diffstat (limited to 'block')
-rw-r--r--  block/Kconfig          |  22
-rw-r--r--  block/Kconfig.iosched  |  16
-rw-r--r--  block/Makefile         |   2
-rw-r--r--  block/blk-barrier.c    | 183
-rw-r--r--  block/blk-cgroup.c     | 861
-rw-r--r--  block/blk-cgroup.h     | 189
-rw-r--r--  block/blk-core.c       | 213
-rw-r--r--  block/blk-exec.c       |   2
-rw-r--r--  block/blk-integrity.c  |   3
-rw-r--r--  block/blk-ioc.c        |   3
-rw-r--r--  block/blk-lib.c        | 231
-rw-r--r--  block/blk-map.c        |   2
-rw-r--r--  block/blk-merge.c      |  29
-rw-r--r--  block/blk-settings.c   | 160
-rw-r--r--  block/blk-sysfs.c      | 122
-rw-r--r--  block/blk-tag.c        |   1
-rw-r--r--  block/blk-timeout.c    |  12
-rw-r--r--  block/blk.h            |  14
-rw-r--r--  block/bsg.c            |   5
-rw-r--r--  block/cfq-iosched.c    | 571
-rw-r--r--  block/cfq.h            | 115
-rw-r--r--  block/compat_ioctl.c   |  58
-rw-r--r--  block/elevator.c       | 111
-rw-r--r--  block/genhd.c          |   2
-rw-r--r--  block/ioctl.c          |  37
-rw-r--r--  block/noop-iosched.c   |   1
26 files changed, 2129 insertions, 836 deletions
diff --git a/block/Kconfig b/block/Kconfig index e20fbde0875c..9be0b56eaee1 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -77,28 +77,6 @@ config BLK_DEV_INTEGRITY T10/SCSI Data Integrity Field or the T13/ATA External Path Protection. If in doubt, say N. -config BLK_CGROUP - bool - depends on CGROUPS - default n - ---help--- - Generic block IO controller cgroup interface. This is the common - cgroup interface which should be used by various IO controlling - policies. - - Currently, CFQ IO scheduler uses it to recognize task groups and - control disk bandwidth allocation (proportional time slice allocation) - to such task groups. - -config DEBUG_BLK_CGROUP - bool - depends on BLK_CGROUP - default n - ---help--- - Enable some debugging help. Currently it stores the cgroup path - in the blk group which can be used by cfq for tracing various - group related activity. - endif # BLOCK config BLOCK_COMPAT diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index b71abfb0d726..3199b76f795d 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -23,6 +23,8 @@ config IOSCHED_DEADLINE config IOSCHED_CFQ tristate "CFQ I/O scheduler" + # If BLK_CGROUP is a module, CFQ has to be built as module. + depends on (BLK_CGROUP=m && m) || !BLK_CGROUP || BLK_CGROUP=y default y ---help--- The CFQ I/O scheduler tries to distribute bandwidth equally @@ -32,23 +34,15 @@ config IOSCHED_CFQ This is the default I/O scheduler. + Note: If BLK_CGROUP=m, then CFQ can be built only as module. + config CFQ_GROUP_IOSCHED bool "CFQ Group Scheduling support" - depends on IOSCHED_CFQ && CGROUPS - select BLK_CGROUP + depends on IOSCHED_CFQ && BLK_CGROUP default n ---help--- Enable group IO scheduling in CFQ. -config DEBUG_CFQ_IOSCHED - bool "Debug CFQ Scheduling" - depends on CFQ_GROUP_IOSCHED - select DEBUG_BLK_CGROUP - default n - ---help--- - Enable CFQ IO scheduling debugging in CFQ. Currently it makes - blktrace output more verbose. - choice prompt "Default I/O scheduler" default DEFAULT_CFQ diff --git a/block/Makefile b/block/Makefile index cb2d515ebd6e..0bb499a739cd 100644 --- a/block/Makefile +++ b/block/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ - blk-iopoll.o ioctl.o genhd.o scsi_ioctl.o + blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o obj-$(CONFIG_BLK_DEV_BSG) += bsg.o obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o diff --git a/block/blk-barrier.c b/block/blk-barrier.c index 8618d8996fea..f0faefca032f 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -5,6 +5,7 @@ #include <linux/module.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/gfp.h> #include "blk.h" @@ -12,7 +13,6 @@ * blk_queue_ordered - does this queue support ordered writes * @q: the request queue * @ordered: one of QUEUE_ORDERED_* - * @prepare_flush_fn: rq setup helper for cache flush ordered writes * * Description: * For journalled file systems, doing ordered writes on a commit @@ -21,15 +21,8 @@ * feature should call this function and indicate so. 
* **/ -int blk_queue_ordered(struct request_queue *q, unsigned ordered, - prepare_flush_fn *prepare_flush_fn) +int blk_queue_ordered(struct request_queue *q, unsigned ordered) { - if (!prepare_flush_fn && (ordered & (QUEUE_ORDERED_DO_PREFLUSH | - QUEUE_ORDERED_DO_POSTFLUSH))) { - printk(KERN_ERR "%s: prepare_flush_fn required\n", __func__); - return -EINVAL; - } - if (ordered != QUEUE_ORDERED_NONE && ordered != QUEUE_ORDERED_DRAIN && ordered != QUEUE_ORDERED_DRAIN_FLUSH && @@ -43,7 +36,6 @@ int blk_queue_ordered(struct request_queue *q, unsigned ordered, q->ordered = ordered; q->next_ordered = ordered; - q->prepare_flush_fn = prepare_flush_fn; return 0; } @@ -78,7 +70,7 @@ unsigned blk_ordered_req_seq(struct request *rq) * * http://thread.gmane.org/gmane.linux.kernel/537473 */ - if (!blk_fs_request(rq)) + if (rq->cmd_type != REQ_TYPE_FS) return QUEUE_ORDSEQ_DRAIN; if ((rq->cmd_flags & REQ_ORDERED_COLOR) == @@ -142,10 +134,10 @@ static void queue_flush(struct request_queue *q, unsigned which) } blk_rq_init(q, rq); - rq->cmd_flags = REQ_HARDBARRIER; - rq->rq_disk = q->bar_rq.rq_disk; + rq->cmd_type = REQ_TYPE_FS; + rq->cmd_flags = REQ_HARDBARRIER | REQ_FLUSH; + rq->rq_disk = q->orig_bar_rq->rq_disk; rq->end_io = end_io; - q->prepare_flush_fn(q, rq); elv_insert(q, rq, ELEVATOR_INSERT_FRONT); } @@ -202,7 +194,7 @@ static inline bool start_ordered(struct request_queue *q, struct request **rqp) /* initialize proxy request and queue it */ blk_rq_init(q, rq); if (bio_data_dir(q->orig_bar_rq->bio) == WRITE) - rq->cmd_flags |= REQ_RW; + rq->cmd_flags |= REQ_WRITE; if (q->ordered & QUEUE_ORDERED_DO_FUA) rq->cmd_flags |= REQ_FUA; init_request_from_bio(rq, q->orig_bar_rq->bio); @@ -235,7 +227,8 @@ static inline bool start_ordered(struct request_queue *q, struct request **rqp) bool blk_do_ordered(struct request_queue *q, struct request **rqp) { struct request *rq = *rqp; - const int is_barrier = blk_fs_request(rq) && blk_barrier_rq(rq); + const int is_barrier = rq->cmd_type == REQ_TYPE_FS && + (rq->cmd_flags & REQ_HARDBARRIER); if (!q->ordseq) { if (!is_barrier) @@ -260,7 +253,7 @@ bool blk_do_ordered(struct request_queue *q, struct request **rqp) */ /* Special requests are not subject to ordering rules. */ - if (!blk_fs_request(rq) && + if (rq->cmd_type != REQ_TYPE_FS && rq != &q->pre_flush_rq && rq != &q->post_flush_rq) return true; @@ -285,26 +278,31 @@ static void bio_end_empty_barrier(struct bio *bio, int err) set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); clear_bit(BIO_UPTODATE, &bio->bi_flags); } - - complete(bio->bi_private); + if (bio->bi_private) + complete(bio->bi_private); + bio_put(bio); } /** * blkdev_issue_flush - queue a flush * @bdev: blockdev to issue flush for + * @gfp_mask: memory allocation flags (for bio_alloc) * @error_sector: error sector + * @flags: BLKDEV_IFL_* flags to control behaviour * * Description: * Issue a flush for the block device in question. Caller can supply * room for storing the error offset in case of a flush error, if they - * wish to. + * wish to. If WAIT flag is not passed then caller may check only what + * request was pushed in some internal queue for later handling. 
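The hunk above drops prepare_flush_fn from blk_queue_ordered(): the block layer now assembles the flush request itself in queue_flush() (a REQ_TYPE_FS request carrying REQ_FLUSH). A minimal sketch of a driver adopting the two-argument form; the function name is hypothetical:

	#include <linux/blkdev.h>

	/* hypothetical driver init path using the reworked API */
	static int example_init_ordered(struct request_queue *q)
	{
		/*
		 * Before this patch:
		 *   blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH,
		 *                     example_prepare_flush);
		 * The flush request is now built by queue_flush() above,
		 * so only the ordered mode is passed.
		 */
		return blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH);
	}
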
*/ -int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) +int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, + sector_t *error_sector, unsigned long flags) { DECLARE_COMPLETION_ONSTACK(wait); struct request_queue *q; struct bio *bio; - int ret; + int ret = 0; if (bdev->bd_disk == NULL) return -ENXIO; @@ -313,23 +311,34 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) if (!q) return -ENXIO; - bio = bio_alloc(GFP_KERNEL, 0); + /* + * some block devices may not have their queue correctly set up here + * (e.g. loop device without a backing file) and so issuing a flush + * here will panic. Ensure there is a request function before issuing + * the barrier. + */ + if (!q->make_request_fn) + return -ENXIO; + + bio = bio_alloc(gfp_mask, 0); bio->bi_end_io = bio_end_empty_barrier; - bio->bi_private = &wait; bio->bi_bdev = bdev; - submit_bio(WRITE_BARRIER, bio); + if (test_bit(BLKDEV_WAIT, &flags)) + bio->bi_private = &wait; - wait_for_completion(&wait); - - /* - * The driver must store the error location in ->bi_sector, if - * it supports it. For non-stacked drivers, this should be copied - * from blk_rq_pos(rq). - */ - if (error_sector) - *error_sector = bio->bi_sector; + bio_get(bio); + submit_bio(WRITE_BARRIER, bio); + if (test_bit(BLKDEV_WAIT, &flags)) { + wait_for_completion(&wait); + /* + * The driver must store the error location in ->bi_sector, if + * it supports it. For non-stacked drivers, this should be + * copied from blk_rq_pos(rq). + */ + if (error_sector) + *error_sector = bio->bi_sector; + } - ret = 0; if (bio_flagged(bio, BIO_EOPNOTSUPP)) ret = -EOPNOTSUPP; else if (!bio_flagged(bio, BIO_UPTODATE)) @@ -339,107 +348,3 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) return ret; } EXPORT_SYMBOL(blkdev_issue_flush); - -static void blkdev_discard_end_io(struct bio *bio, int err) -{ - if (err) { - if (err == -EOPNOTSUPP) - set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); - clear_bit(BIO_UPTODATE, &bio->bi_flags); - } - - if (bio->bi_private) - complete(bio->bi_private); - __free_page(bio_page(bio)); - - bio_put(bio); -} - -/** - * blkdev_issue_discard - queue a discard - * @bdev: blockdev to issue discard for - * @sector: start sector - * @nr_sects: number of sectors to discard - * @gfp_mask: memory allocation flags (for bio_alloc) - * @flags: DISCARD_FL_* flags to control behaviour - * - * Description: - * Issue a discard request for the sectors in question. - */ -int blkdev_issue_discard(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, int flags) -{ - DECLARE_COMPLETION_ONSTACK(wait); - struct request_queue *q = bdev_get_queue(bdev); - int type = flags & DISCARD_FL_BARRIER ? - DISCARD_BARRIER : DISCARD_NOBARRIER; - struct bio *bio; - struct page *page; - int ret = 0; - - if (!q) - return -ENXIO; - - if (!blk_queue_discard(q)) - return -EOPNOTSUPP; - - while (nr_sects && !ret) { - unsigned int sector_size = q->limits.logical_block_size; - unsigned int max_discard_sectors = - min(q->limits.max_discard_sectors, UINT_MAX >> 9); - - bio = bio_alloc(gfp_mask, 1); - if (!bio) - goto out; - bio->bi_sector = sector; - bio->bi_end_io = blkdev_discard_end_io; - bio->bi_bdev = bdev; - if (flags & DISCARD_FL_WAIT) - bio->bi_private = &wait; - - /* - * Add a zeroed one-sector payload as that's what - * our current implementations need. If we'll ever need - * more the interface will need revisiting. 
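With the reworked blkdev_issue_flush() above, callers now pick a GFP mask and opt in to waiting through the flags argument. A minimal sketch of a waiting caller, assuming BLKDEV_IFL_WAIT is the flag word for the BLKDEV_WAIT bit tested above:

	#include <linux/blkdev.h>

	/* hypothetical fsync-style caller that waits for the flush */
	static int example_flush_and_wait(struct block_device *bdev)
	{
		sector_t error_sector;	/* error offset, if the driver reports one */

		return blkdev_issue_flush(bdev, GFP_KERNEL, &error_sector,
					  BLKDEV_IFL_WAIT);
	}
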
- */ - page = alloc_page(gfp_mask | __GFP_ZERO); - if (!page) - goto out_free_bio; - if (bio_add_pc_page(q, bio, page, sector_size, 0) < sector_size) - goto out_free_page; - - /* - * And override the bio size - the way discard works we - * touch many more blocks on disk than the actual payload - * length. - */ - if (nr_sects > max_discard_sectors) { - bio->bi_size = max_discard_sectors << 9; - nr_sects -= max_discard_sectors; - sector += max_discard_sectors; - } else { - bio->bi_size = nr_sects << 9; - nr_sects = 0; - } - - bio_get(bio); - submit_bio(type, bio); - - if (flags & DISCARD_FL_WAIT) - wait_for_completion(&wait); - - if (bio_flagged(bio, BIO_EOPNOTSUPP)) - ret = -EOPNOTSUPP; - else if (!bio_flagged(bio, BIO_UPTODATE)) - ret = -EIO; - bio_put(bio); - } - return ret; -out_free_page: - __free_page(page); -out_free_bio: - bio_put(bio); -out: - return -ENOMEM; -} -EXPORT_SYMBOL(blkdev_issue_discard); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 1fa2654db0a6..2fef1ef931a0 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -15,7 +15,12 @@ #include <linux/kdev_t.h> #include <linux/module.h> #include <linux/err.h> +#include <linux/blkdev.h> +#include <linux/slab.h> #include "blk-cgroup.h" +#include <linux/genhd.h> + +#define MAX_KEY_LEN 100 static DEFINE_SPINLOCK(blkio_list_lock); static LIST_HEAD(blkio_list); @@ -23,19 +28,56 @@ static LIST_HEAD(blkio_list); struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT }; EXPORT_SYMBOL_GPL(blkio_root_cgroup); -bool blkiocg_css_tryget(struct blkio_cgroup *blkcg) +static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *, + struct cgroup *); +static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *, + struct task_struct *, bool); +static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *, + struct cgroup *, struct task_struct *, bool); +static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *); +static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *); + +struct cgroup_subsys blkio_subsys = { + .name = "blkio", + .create = blkiocg_create, + .can_attach = blkiocg_can_attach, + .attach = blkiocg_attach, + .destroy = blkiocg_destroy, + .populate = blkiocg_populate, +#ifdef CONFIG_BLK_CGROUP + /* note: blkio_subsys_id is otherwise defined in blk-cgroup.h */ + .subsys_id = blkio_subsys_id, +#endif + .use_id = 1, + .module = THIS_MODULE, +}; +EXPORT_SYMBOL_GPL(blkio_subsys); + +static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg, + struct blkio_policy_node *pn) { - if (!css_tryget(&blkcg->css)) - return false; - return true; + list_add(&pn->node, &blkcg->policy_list); } -EXPORT_SYMBOL_GPL(blkiocg_css_tryget); -void blkiocg_css_put(struct blkio_cgroup *blkcg) +/* Must be called with blkcg->lock held */ +static inline void blkio_policy_delete_node(struct blkio_policy_node *pn) { - css_put(&blkcg->css); + list_del(&pn->node); +} + +/* Must be called with blkcg->lock held */ +static struct blkio_policy_node * +blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev) +{ + struct blkio_policy_node *pn; + + list_for_each_entry(pn, &blkcg->policy_list, node) { + if (pn->dev == dev) + return pn; + } + + return NULL; } -EXPORT_SYMBOL_GPL(blkiocg_css_put); struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) { @@ -44,13 +86,259 @@ struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) } EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); -void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, - unsigned 
long time, unsigned long sectors) +/* + * Add to the appropriate stat variable depending on the request type. + * This should be called with the blkg->stats_lock held. + */ +static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction, + bool sync) +{ + if (direction) + stat[BLKIO_STAT_WRITE] += add; + else + stat[BLKIO_STAT_READ] += add; + if (sync) + stat[BLKIO_STAT_SYNC] += add; + else + stat[BLKIO_STAT_ASYNC] += add; +} + +/* + * Decrements the appropriate stat variable if non-zero depending on the + * request type. Panics on value being zero. + * This should be called with the blkg->stats_lock held. + */ +static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync) +{ + if (direction) { + BUG_ON(stat[BLKIO_STAT_WRITE] == 0); + stat[BLKIO_STAT_WRITE]--; + } else { + BUG_ON(stat[BLKIO_STAT_READ] == 0); + stat[BLKIO_STAT_READ]--; + } + if (sync) { + BUG_ON(stat[BLKIO_STAT_SYNC] == 0); + stat[BLKIO_STAT_SYNC]--; + } else { + BUG_ON(stat[BLKIO_STAT_ASYNC] == 0); + stat[BLKIO_STAT_ASYNC]--; + } +} + +#ifdef CONFIG_DEBUG_BLK_CGROUP +/* This should be called with the blkg->stats_lock held. */ +static void blkio_set_start_group_wait_time(struct blkio_group *blkg, + struct blkio_group *curr_blkg) +{ + if (blkio_blkg_waiting(&blkg->stats)) + return; + if (blkg == curr_blkg) + return; + blkg->stats.start_group_wait_time = sched_clock(); + blkio_mark_blkg_waiting(&blkg->stats); +} + +/* This should be called with the blkg->stats_lock held. */ +static void blkio_update_group_wait_time(struct blkio_group_stats *stats) +{ + unsigned long long now; + + if (!blkio_blkg_waiting(stats)) + return; + + now = sched_clock(); + if (time_after64(now, stats->start_group_wait_time)) + stats->group_wait_time += now - stats->start_group_wait_time; + blkio_clear_blkg_waiting(stats); +} + +/* This should be called with the blkg->stats_lock held. 
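blkio_add_stat() and blkio_check_and_dec_stat() above account every sample twice: once by direction (READ/WRITE) and once by synchrony (SYNC/ASYNC). A small illustration, written as if it lived in blk-cgroup.c next to those helpers (they are static), accounting one queued synchronous write:

	/* hypothetical caller: account one queued synchronous write */
	static void example_account_queued_write(struct blkio_group *blkg)
	{
		unsigned long flags;

		spin_lock_irqsave(&blkg->stats_lock, flags);
		/* bumps both BLKIO_STAT_WRITE and BLKIO_STAT_SYNC by one */
		blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1,
			       true /* direction == write */, true /* sync */);
		spin_unlock_irqrestore(&blkg->stats_lock, flags);
	}
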
*/ +static void blkio_end_empty_time(struct blkio_group_stats *stats) +{ + unsigned long long now; + + if (!blkio_blkg_empty(stats)) + return; + + now = sched_clock(); + if (time_after64(now, stats->start_empty_time)) + stats->empty_time += now - stats->start_empty_time; + blkio_clear_blkg_empty(stats); +} + +void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) +{ + unsigned long flags; + + spin_lock_irqsave(&blkg->stats_lock, flags); + BUG_ON(blkio_blkg_idling(&blkg->stats)); + blkg->stats.start_idle_time = sched_clock(); + blkio_mark_blkg_idling(&blkg->stats); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats); + +void blkiocg_update_idle_time_stats(struct blkio_group *blkg) +{ + unsigned long flags; + unsigned long long now; + struct blkio_group_stats *stats; + + spin_lock_irqsave(&blkg->stats_lock, flags); + stats = &blkg->stats; + if (blkio_blkg_idling(stats)) { + now = sched_clock(); + if (time_after64(now, stats->start_idle_time)) + stats->idle_time += now - stats->start_idle_time; + blkio_clear_blkg_idling(stats); + } + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats); + +void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) +{ + unsigned long flags; + struct blkio_group_stats *stats; + + spin_lock_irqsave(&blkg->stats_lock, flags); + stats = &blkg->stats; + stats->avg_queue_size_sum += + stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] + + stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]; + stats->avg_queue_size_samples++; + blkio_update_group_wait_time(stats); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats); + +void blkiocg_set_start_empty_time(struct blkio_group *blkg) +{ + unsigned long flags; + struct blkio_group_stats *stats; + + spin_lock_irqsave(&blkg->stats_lock, flags); + stats = &blkg->stats; + + if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] || + stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) { + spin_unlock_irqrestore(&blkg->stats_lock, flags); + return; + } + + /* + * group is already marked empty. This can happen if cfqq got new + * request in parent group and moved to this group while being added + * to service tree. Just ignore the event and move on. 
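The idle-time accounting above works as a mark/measure pair: the scheduler stamps the start of an idle window, and the elapsed sched_clock() delta is folded in when the window closes. Hypothetical call sites in an IO scheduler (CFQ is the in-tree user of these exports):

	/* when the scheduler starts idling, waiting for more IO from a group */
	static void example_idle_begin(struct blkio_group *blkg)
	{
		blkiocg_update_set_idle_time_stats(blkg);	/* stamp start, mark idling */
	}

	/* when a request arrives or the idle timer expires */
	static void example_idle_end(struct blkio_group *blkg)
	{
		blkiocg_update_idle_time_stats(blkg);	/* add the elapsed delta */
	}
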
+ */ + if(blkio_blkg_empty(stats)) { + spin_unlock_irqrestore(&blkg->stats_lock, flags); + return; + } + + stats->start_empty_time = sched_clock(); + blkio_mark_blkg_empty(stats); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time); + +void blkiocg_update_dequeue_stats(struct blkio_group *blkg, + unsigned long dequeue) +{ + blkg->stats.dequeue += dequeue; +} +EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats); +#else +static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg, + struct blkio_group *curr_blkg) {} +static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {} +#endif + +void blkiocg_update_io_add_stats(struct blkio_group *blkg, + struct blkio_group *curr_blkg, bool direction, + bool sync) +{ + unsigned long flags; + + spin_lock_irqsave(&blkg->stats_lock, flags); + blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction, + sync); + blkio_end_empty_time(&blkg->stats); + blkio_set_start_group_wait_time(blkg, curr_blkg); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats); + +void blkiocg_update_io_remove_stats(struct blkio_group *blkg, + bool direction, bool sync) +{ + unsigned long flags; + + spin_lock_irqsave(&blkg->stats_lock, flags); + blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], + direction, sync); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats); + +void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time) +{ + unsigned long flags; + + spin_lock_irqsave(&blkg->stats_lock, flags); + blkg->stats.time += time; + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used); + +void blkiocg_update_dispatch_stats(struct blkio_group *blkg, + uint64_t bytes, bool direction, bool sync) +{ + struct blkio_group_stats *stats; + unsigned long flags; + + spin_lock_irqsave(&blkg->stats_lock, flags); + stats = &blkg->stats; + stats->sectors += bytes >> 9; + blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, direction, + sync); + blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_BYTES], bytes, + direction, sync); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats); + +void blkiocg_update_completion_stats(struct blkio_group *blkg, + uint64_t start_time, uint64_t io_start_time, bool direction, bool sync) { - blkg->time += time; - blkg->sectors += sectors; + struct blkio_group_stats *stats; + unsigned long flags; + unsigned long long now = sched_clock(); + + spin_lock_irqsave(&blkg->stats_lock, flags); + stats = &blkg->stats; + if (time_after64(now, io_start_time)) + blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME], + now - io_start_time, direction, sync); + if (time_after64(io_start_time, start_time)) + blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME], + io_start_time - start_time, direction, sync); + spin_unlock_irqrestore(&blkg->stats_lock, flags); } -EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_stats); +EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats); + +void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, + bool sync) +{ + unsigned long flags; + + spin_lock_irqsave(&blkg->stats_lock, flags); + blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_MERGED], 1, direction, + sync); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats); void 
blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, struct blkio_group *blkg, void *key, dev_t dev) @@ -58,14 +346,13 @@ void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, unsigned long flags; spin_lock_irqsave(&blkcg->lock, flags); + spin_lock_init(&blkg->stats_lock); rcu_assign_pointer(blkg->key, key); blkg->blkcg_id = css_id(&blkcg->css); hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); spin_unlock_irqrestore(&blkcg->lock, flags); -#ifdef CONFIG_DEBUG_BLK_CGROUP /* Need to take css reference ? */ cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); -#endif blkg->dev = dev; } EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group); @@ -89,17 +376,16 @@ int blkiocg_del_blkio_group(struct blkio_group *blkg) rcu_read_lock(); css = css_lookup(&blkio_subsys, blkg->blkcg_id); - if (!css) - goto out; - - blkcg = container_of(css, struct blkio_cgroup, css); - spin_lock_irqsave(&blkcg->lock, flags); - if (!hlist_unhashed(&blkg->blkcg_node)) { - __blkiocg_del_blkio_group(blkg); - ret = 0; + if (css) { + blkcg = container_of(css, struct blkio_cgroup, css); + spin_lock_irqsave(&blkcg->lock, flags); + if (!hlist_unhashed(&blkg->blkcg_node)) { + __blkiocg_del_blkio_group(blkg); + ret = 0; + } + spin_unlock_irqrestore(&blkcg->lock, flags); } - spin_unlock_irqrestore(&blkcg->lock, flags); -out: + rcu_read_unlock(); return ret; } @@ -142,31 +428,179 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val) struct blkio_group *blkg; struct hlist_node *n; struct blkio_policy_type *blkiop; + struct blkio_policy_node *pn; if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) return -EINVAL; blkcg = cgroup_to_blkio_cgroup(cgroup); + spin_lock(&blkio_list_lock); spin_lock_irq(&blkcg->lock); blkcg->weight = (unsigned int)val; + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { - spin_lock(&blkio_list_lock); + pn = blkio_policy_search_node(blkcg, blkg->dev); + + if (pn) + continue; + list_for_each_entry(blkiop, &blkio_list, list) blkiop->ops.blkio_update_group_weight_fn(blkg, blkcg->weight); - spin_unlock(&blkio_list_lock); } spin_unlock_irq(&blkcg->lock); + spin_unlock(&blkio_list_lock); return 0; } -#define SHOW_FUNCTION_PER_GROUP(__VAR) \ +static int +blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) +{ + struct blkio_cgroup *blkcg; + struct blkio_group *blkg; + struct blkio_group_stats *stats; + struct hlist_node *n; + uint64_t queued[BLKIO_STAT_TOTAL]; + int i; +#ifdef CONFIG_DEBUG_BLK_CGROUP + bool idling, waiting, empty; + unsigned long long now = sched_clock(); +#endif + + blkcg = cgroup_to_blkio_cgroup(cgroup); + spin_lock_irq(&blkcg->lock); + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { + spin_lock(&blkg->stats_lock); + stats = &blkg->stats; +#ifdef CONFIG_DEBUG_BLK_CGROUP + idling = blkio_blkg_idling(stats); + waiting = blkio_blkg_waiting(stats); + empty = blkio_blkg_empty(stats); +#endif + for (i = 0; i < BLKIO_STAT_TOTAL; i++) + queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i]; + memset(stats, 0, sizeof(struct blkio_group_stats)); + for (i = 0; i < BLKIO_STAT_TOTAL; i++) + stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i]; +#ifdef CONFIG_DEBUG_BLK_CGROUP + if (idling) { + blkio_mark_blkg_idling(stats); + stats->start_idle_time = now; + } + if (waiting) { + blkio_mark_blkg_waiting(stats); + stats->start_group_wait_time = now; + } + if (empty) { + blkio_mark_blkg_empty(stats); + stats->start_empty_time = now; + } +#endif + spin_unlock(&blkg->stats_lock); + } + spin_unlock_irq(&blkcg->lock); + return 0; +} + 
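Note how blkiocg_weight_write() above skips any group that has a per-device policy node: a weight_device rule takes precedence over the cgroup-wide default. A minimal sketch of that lookup order, mirroring the blkcg_get_weight() helper added later in this patch (blkio_policy_search_node() requires blkcg->lock, as noted above):

	/* hypothetical helper: resolve the weight that applies to one device */
	static unsigned int example_effective_weight(struct blkio_cgroup *blkcg,
						     dev_t dev)
	{
		struct blkio_policy_node *pn;

		/* caller must hold blkcg->lock */
		pn = blkio_policy_search_node(blkcg, dev);
		return pn ? pn->weight : blkcg->weight;
	}
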
+static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str, + int chars_left, bool diskname_only) +{ + snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev)); + chars_left -= strlen(str); + if (chars_left <= 0) { + printk(KERN_WARNING + "Possibly incorrect cgroup stat display format"); + return; + } + if (diskname_only) + return; + switch (type) { + case BLKIO_STAT_READ: + strlcat(str, " Read", chars_left); + break; + case BLKIO_STAT_WRITE: + strlcat(str, " Write", chars_left); + break; + case BLKIO_STAT_SYNC: + strlcat(str, " Sync", chars_left); + break; + case BLKIO_STAT_ASYNC: + strlcat(str, " Async", chars_left); + break; + case BLKIO_STAT_TOTAL: + strlcat(str, " Total", chars_left); + break; + default: + strlcat(str, " Invalid", chars_left); + } +} + +static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val, + struct cgroup_map_cb *cb, dev_t dev) +{ + blkio_get_key_name(0, dev, str, chars_left, true); + cb->fill(cb, str, val); + return val; +} + +/* This should be called with blkg->stats_lock held */ +static uint64_t blkio_get_stat(struct blkio_group *blkg, + struct cgroup_map_cb *cb, dev_t dev, enum stat_type type) +{ + uint64_t disk_total; + char key_str[MAX_KEY_LEN]; + enum stat_sub_type sub_type; + + if (type == BLKIO_STAT_TIME) + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, + blkg->stats.time, cb, dev); + if (type == BLKIO_STAT_SECTORS) + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, + blkg->stats.sectors, cb, dev); +#ifdef CONFIG_DEBUG_BLK_CGROUP + if (type == BLKIO_STAT_AVG_QUEUE_SIZE) { + uint64_t sum = blkg->stats.avg_queue_size_sum; + uint64_t samples = blkg->stats.avg_queue_size_samples; + if (samples) + do_div(sum, samples); + else + sum = 0; + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev); + } + if (type == BLKIO_STAT_GROUP_WAIT_TIME) + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, + blkg->stats.group_wait_time, cb, dev); + if (type == BLKIO_STAT_IDLE_TIME) + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, + blkg->stats.idle_time, cb, dev); + if (type == BLKIO_STAT_EMPTY_TIME) + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, + blkg->stats.empty_time, cb, dev); + if (type == BLKIO_STAT_DEQUEUE) + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, + blkg->stats.dequeue, cb, dev); +#endif + + for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL; + sub_type++) { + blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false); + cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]); + } + disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] + + blkg->stats.stat_arr[type][BLKIO_STAT_WRITE]; + blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false); + cb->fill(cb, key_str, disk_total); + return disk_total; +} + +#define SHOW_FUNCTION_PER_GROUP(__VAR, type, show_total) \ static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \ - struct cftype *cftype, struct seq_file *m) \ + struct cftype *cftype, struct cgroup_map_cb *cb) \ { \ struct blkio_cgroup *blkcg; \ struct blkio_group *blkg; \ struct hlist_node *n; \ + uint64_t cgroup_total = 0; \ \ if (!cgroup_lock_live_group(cgroup)) \ return -ENODEV; \ @@ -174,50 +608,293 @@ static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \ blkcg = cgroup_to_blkio_cgroup(cgroup); \ rcu_read_lock(); \ hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\ - if (blkg->dev) \ - seq_printf(m, "%u:%u %lu\n", MAJOR(blkg->dev), \ - MINOR(blkg->dev), blkg->__VAR); \ + if (blkg->dev) { \ + spin_lock_irq(&blkg->stats_lock); \ + 
cgroup_total += blkio_get_stat(blkg, cb, \ + blkg->dev, type); \ + spin_unlock_irq(&blkg->stats_lock); \ + } \ } \ + if (show_total) \ + cb->fill(cb, "Total", cgroup_total); \ rcu_read_unlock(); \ cgroup_unlock(); \ return 0; \ } -SHOW_FUNCTION_PER_GROUP(time); -SHOW_FUNCTION_PER_GROUP(sectors); +SHOW_FUNCTION_PER_GROUP(time, BLKIO_STAT_TIME, 0); +SHOW_FUNCTION_PER_GROUP(sectors, BLKIO_STAT_SECTORS, 0); +SHOW_FUNCTION_PER_GROUP(io_service_bytes, BLKIO_STAT_SERVICE_BYTES, 1); +SHOW_FUNCTION_PER_GROUP(io_serviced, BLKIO_STAT_SERVICED, 1); +SHOW_FUNCTION_PER_GROUP(io_service_time, BLKIO_STAT_SERVICE_TIME, 1); +SHOW_FUNCTION_PER_GROUP(io_wait_time, BLKIO_STAT_WAIT_TIME, 1); +SHOW_FUNCTION_PER_GROUP(io_merged, BLKIO_STAT_MERGED, 1); +SHOW_FUNCTION_PER_GROUP(io_queued, BLKIO_STAT_QUEUED, 1); #ifdef CONFIG_DEBUG_BLK_CGROUP -SHOW_FUNCTION_PER_GROUP(dequeue); +SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0); +SHOW_FUNCTION_PER_GROUP(avg_queue_size, BLKIO_STAT_AVG_QUEUE_SIZE, 0); +SHOW_FUNCTION_PER_GROUP(group_wait_time, BLKIO_STAT_GROUP_WAIT_TIME, 0); +SHOW_FUNCTION_PER_GROUP(idle_time, BLKIO_STAT_IDLE_TIME, 0); +SHOW_FUNCTION_PER_GROUP(empty_time, BLKIO_STAT_EMPTY_TIME, 0); #endif #undef SHOW_FUNCTION_PER_GROUP -#ifdef CONFIG_DEBUG_BLK_CGROUP -void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg, - unsigned long dequeue) +static int blkio_check_dev_num(dev_t dev) { - blkg->dequeue += dequeue; + int part = 0; + struct gendisk *disk; + + disk = get_gendisk(dev, &part); + if (!disk || part) + return -ENODEV; + + return 0; +} + +static int blkio_policy_parse_and_set(char *buf, + struct blkio_policy_node *newpn) +{ + char *s[4], *p, *major_s = NULL, *minor_s = NULL; + int ret; + unsigned long major, minor, temp; + int i = 0; + dev_t dev; + + memset(s, 0, sizeof(s)); + + while ((p = strsep(&buf, " ")) != NULL) { + if (!*p) + continue; + + s[i++] = p; + + /* Prevent from inputing too many things */ + if (i == 3) + break; + } + + if (i != 2) + return -EINVAL; + + p = strsep(&s[0], ":"); + if (p != NULL) + major_s = p; + else + return -EINVAL; + + minor_s = s[0]; + if (!minor_s) + return -EINVAL; + + ret = strict_strtoul(major_s, 10, &major); + if (ret) + return -EINVAL; + + ret = strict_strtoul(minor_s, 10, &minor); + if (ret) + return -EINVAL; + + dev = MKDEV(major, minor); + + ret = blkio_check_dev_num(dev); + if (ret) + return ret; + + newpn->dev = dev; + + if (s[1] == NULL) + return -EINVAL; + + ret = strict_strtoul(s[1], 10, &temp); + if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) || + temp > BLKIO_WEIGHT_MAX) + return -EINVAL; + + newpn->weight = temp; + + return 0; +} + +unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, + dev_t dev) +{ + struct blkio_policy_node *pn; + + pn = blkio_policy_search_node(blkcg, dev); + if (pn) + return pn->weight; + else + return blkcg->weight; +} +EXPORT_SYMBOL_GPL(blkcg_get_weight); + + +static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft, + const char *buffer) +{ + int ret = 0; + char *buf; + struct blkio_policy_node *newpn, *pn; + struct blkio_cgroup *blkcg; + struct blkio_group *blkg; + int keep_newpn = 0; + struct hlist_node *n; + struct blkio_policy_type *blkiop; + + buf = kstrdup(buffer, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + newpn = kzalloc(sizeof(*newpn), GFP_KERNEL); + if (!newpn) { + ret = -ENOMEM; + goto free_buf; + } + + ret = blkio_policy_parse_and_set(buf, newpn); + if (ret) + goto free_newpn; + + blkcg = cgroup_to_blkio_cgroup(cgrp); + + spin_lock_irq(&blkcg->lock); + + pn = 
blkio_policy_search_node(blkcg, newpn->dev); + if (!pn) { + if (newpn->weight != 0) { + blkio_policy_insert_node(blkcg, newpn); + keep_newpn = 1; + } + spin_unlock_irq(&blkcg->lock); + goto update_io_group; + } + + if (newpn->weight == 0) { + /* weight == 0 means deleteing a specific weight */ + blkio_policy_delete_node(pn); + spin_unlock_irq(&blkcg->lock); + goto update_io_group; + } + spin_unlock_irq(&blkcg->lock); + + pn->weight = newpn->weight; + +update_io_group: + /* update weight for each cfqg */ + spin_lock(&blkio_list_lock); + spin_lock_irq(&blkcg->lock); + + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { + if (newpn->dev == blkg->dev) { + list_for_each_entry(blkiop, &blkio_list, list) + blkiop->ops.blkio_update_group_weight_fn(blkg, + newpn->weight ? + newpn->weight : + blkcg->weight); + } + } + + spin_unlock_irq(&blkcg->lock); + spin_unlock(&blkio_list_lock); + +free_newpn: + if (!keep_newpn) + kfree(newpn); +free_buf: + kfree(buf); + return ret; +} + +static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft, + struct seq_file *m) +{ + struct blkio_cgroup *blkcg; + struct blkio_policy_node *pn; + + seq_printf(m, "dev\tweight\n"); + + blkcg = cgroup_to_blkio_cgroup(cgrp); + if (!list_empty(&blkcg->policy_list)) { + spin_lock_irq(&blkcg->lock); + list_for_each_entry(pn, &blkcg->policy_list, node) { + seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), + MINOR(pn->dev), pn->weight); + } + spin_unlock_irq(&blkcg->lock); + } + + return 0; } -EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_dequeue_stats); -#endif struct cftype blkio_files[] = { { + .name = "weight_device", + .read_seq_string = blkiocg_weight_device_read, + .write_string = blkiocg_weight_device_write, + .max_write_len = 256, + }, + { .name = "weight", .read_u64 = blkiocg_weight_read, .write_u64 = blkiocg_weight_write, }, { .name = "time", - .read_seq_string = blkiocg_time_read, + .read_map = blkiocg_time_read, }, { .name = "sectors", - .read_seq_string = blkiocg_sectors_read, + .read_map = blkiocg_sectors_read, + }, + { + .name = "io_service_bytes", + .read_map = blkiocg_io_service_bytes_read, + }, + { + .name = "io_serviced", + .read_map = blkiocg_io_serviced_read, + }, + { + .name = "io_service_time", + .read_map = blkiocg_io_service_time_read, + }, + { + .name = "io_wait_time", + .read_map = blkiocg_io_wait_time_read, + }, + { + .name = "io_merged", + .read_map = blkiocg_io_merged_read, + }, + { + .name = "io_queued", + .read_map = blkiocg_io_queued_read, + }, + { + .name = "reset_stats", + .write_u64 = blkiocg_reset_stats, }, #ifdef CONFIG_DEBUG_BLK_CGROUP - { + { + .name = "avg_queue_size", + .read_map = blkiocg_avg_queue_size_read, + }, + { + .name = "group_wait_time", + .read_map = blkiocg_group_wait_time_read, + }, + { + .name = "idle_time", + .read_map = blkiocg_idle_time_read, + }, + { + .name = "empty_time", + .read_map = blkiocg_empty_time_read, + }, + { .name = "dequeue", - .read_seq_string = blkiocg_dequeue_read, - }, + .read_map = blkiocg_dequeue_read, + }, #endif }; @@ -234,56 +911,62 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup) struct blkio_group *blkg; void *key; struct blkio_policy_type *blkiop; + struct blkio_policy_node *pn, *pntmp; rcu_read_lock(); -remove_entry: - spin_lock_irqsave(&blkcg->lock, flags); + do { + spin_lock_irqsave(&blkcg->lock, flags); + + if (hlist_empty(&blkcg->blkg_list)) { + spin_unlock_irqrestore(&blkcg->lock, flags); + break; + } + + blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group, + blkcg_node); 
+ key = rcu_dereference(blkg->key); + __blkiocg_del_blkio_group(blkg); - if (hlist_empty(&blkcg->blkg_list)) { spin_unlock_irqrestore(&blkcg->lock, flags); - goto done; - } - blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group, - blkcg_node); - key = rcu_dereference(blkg->key); - __blkiocg_del_blkio_group(blkg); + /* + * This blkio_group is being unlinked as associated cgroup is + * going away. Let all the IO controlling policies know about + * this event. Currently this is static call to one io + * controlling policy. Once we have more policies in place, we + * need some dynamic registration of callback function. + */ + spin_lock(&blkio_list_lock); + list_for_each_entry(blkiop, &blkio_list, list) + blkiop->ops.blkio_unlink_group_fn(key, blkg); + spin_unlock(&blkio_list_lock); + } while (1); - spin_unlock_irqrestore(&blkcg->lock, flags); + list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) { + blkio_policy_delete_node(pn); + kfree(pn); + } - /* - * This blkio_group is being unlinked as associated cgroup is going - * away. Let all the IO controlling policies know about this event. - * - * Currently this is static call to one io controlling policy. Once - * we have more policies in place, we need some dynamic registration - * of callback function. - */ - spin_lock(&blkio_list_lock); - list_for_each_entry(blkiop, &blkio_list, list) - blkiop->ops.blkio_unlink_group_fn(key, blkg); - spin_unlock(&blkio_list_lock); - goto remove_entry; -done: free_css_id(&blkio_subsys, &blkcg->css); rcu_read_unlock(); - kfree(blkcg); + if (blkcg != &blkio_root_cgroup) + kfree(blkcg); } static struct cgroup_subsys_state * blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup) { - struct blkio_cgroup *blkcg, *parent_blkcg; + struct blkio_cgroup *blkcg; + struct cgroup *parent = cgroup->parent; - if (!cgroup->parent) { + if (!parent) { blkcg = &blkio_root_cgroup; goto done; } /* Currently we do not support hierarchy deeper than two level (0,1) */ - parent_blkcg = cgroup_to_blkio_cgroup(cgroup->parent); - if (css_depth(&parent_blkcg->css) > 0) - return ERR_PTR(-EINVAL); + if (parent != cgroup->top_cgroup) + return ERR_PTR(-EPERM); blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); if (!blkcg) @@ -294,6 +977,7 @@ done: spin_lock_init(&blkcg->lock); INIT_HLIST_HEAD(&blkcg->blkg_list); + INIT_LIST_HEAD(&blkcg->policy_list); return &blkcg->css; } @@ -333,17 +1017,6 @@ static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup, task_unlock(tsk); } -struct cgroup_subsys blkio_subsys = { - .name = "blkio", - .create = blkiocg_create, - .can_attach = blkiocg_can_attach, - .attach = blkiocg_attach, - .destroy = blkiocg_destroy, - .populate = blkiocg_populate, - .subsys_id = blkio_subsys_id, - .use_id = 1, -}; - void blkio_policy_register(struct blkio_policy_type *blkiop) { spin_lock(&blkio_list_lock); @@ -359,3 +1032,17 @@ void blkio_policy_unregister(struct blkio_policy_type *blkiop) spin_unlock(&blkio_list_lock); } EXPORT_SYMBOL_GPL(blkio_policy_unregister); + +static int __init init_cgroup_blkio(void) +{ + return cgroup_load_subsys(&blkio_subsys); +} + +static void __exit exit_cgroup_blkio(void) +{ + cgroup_unload_subsys(&blkio_subsys); +} + +module_init(init_cgroup_blkio); +module_exit(exit_cgroup_blkio); +MODULE_LICENSE("GPL"); diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 4d316df863b4..2b866ec1dcea 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -15,13 +15,92 @@ #include <linux/cgroup.h> -#ifdef CONFIG_BLK_CGROUP +#if defined(CONFIG_BLK_CGROUP) || 
defined(CONFIG_BLK_CGROUP_MODULE) + +#ifndef CONFIG_BLK_CGROUP +/* When blk-cgroup is a module, its subsys_id isn't a compile-time constant */ +extern struct cgroup_subsys blkio_subsys; +#define blkio_subsys_id blkio_subsys.subsys_id +#endif + +enum stat_type { + /* Total time spent (in ns) between request dispatch to the driver and + * request completion for IOs doen by this cgroup. This may not be + * accurate when NCQ is turned on. */ + BLKIO_STAT_SERVICE_TIME = 0, + /* Total bytes transferred */ + BLKIO_STAT_SERVICE_BYTES, + /* Total IOs serviced, post merge */ + BLKIO_STAT_SERVICED, + /* Total time spent waiting in scheduler queue in ns */ + BLKIO_STAT_WAIT_TIME, + /* Number of IOs merged */ + BLKIO_STAT_MERGED, + /* Number of IOs queued up */ + BLKIO_STAT_QUEUED, + /* All the single valued stats go below this */ + BLKIO_STAT_TIME, + BLKIO_STAT_SECTORS, +#ifdef CONFIG_DEBUG_BLK_CGROUP + BLKIO_STAT_AVG_QUEUE_SIZE, + BLKIO_STAT_IDLE_TIME, + BLKIO_STAT_EMPTY_TIME, + BLKIO_STAT_GROUP_WAIT_TIME, + BLKIO_STAT_DEQUEUE +#endif +}; + +enum stat_sub_type { + BLKIO_STAT_READ = 0, + BLKIO_STAT_WRITE, + BLKIO_STAT_SYNC, + BLKIO_STAT_ASYNC, + BLKIO_STAT_TOTAL +}; + +/* blkg state flags */ +enum blkg_state_flags { + BLKG_waiting = 0, + BLKG_idling, + BLKG_empty, +}; struct blkio_cgroup { struct cgroup_subsys_state css; unsigned int weight; spinlock_t lock; struct hlist_head blkg_list; + struct list_head policy_list; /* list of blkio_policy_node */ +}; + +struct blkio_group_stats { + /* total disk time and nr sectors dispatched by this group */ + uint64_t time; + uint64_t sectors; + uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL]; +#ifdef CONFIG_DEBUG_BLK_CGROUP + /* Sum of number of IOs queued across all samples */ + uint64_t avg_queue_size_sum; + /* Count of samples taken for average */ + uint64_t avg_queue_size_samples; + /* How many times this group has been removed from service tree */ + unsigned long dequeue; + + /* Total time spent waiting for it to be assigned a timeslice. */ + uint64_t group_wait_time; + uint64_t start_group_wait_time; + + /* Time spent idling for this blkio_group */ + uint64_t idle_time; + uint64_t start_idle_time; + /* + * Total time when we have requests queued and do not contain the + * current active queue. 
+ */ + uint64_t empty_time; + uint64_t start_empty_time; + uint16_t flags; +#endif }; struct blkio_group { @@ -29,22 +108,24 @@ struct blkio_group { void *key; struct hlist_node blkcg_node; unsigned short blkcg_id; -#ifdef CONFIG_DEBUG_BLK_CGROUP /* Store cgroup path */ char path[128]; - /* How many times this group has been removed from service tree */ - unsigned long dequeue; -#endif /* The device MKDEV(major, minor), this group has been created for */ - dev_t dev; + dev_t dev; - /* total disk time and nr sectors dispatched by this group */ - unsigned long time; - unsigned long sectors; + /* Need to serialize the stats in the case of reset/update */ + spinlock_t stats_lock; + struct blkio_group_stats stats; +}; + +struct blkio_policy_node { + struct list_head node; + dev_t dev; + unsigned int weight; }; -extern bool blkiocg_css_tryget(struct blkio_cgroup *blkcg); -extern void blkiocg_css_put(struct blkio_cgroup *blkcg); +extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, + dev_t dev); typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg, @@ -64,6 +145,11 @@ struct blkio_policy_type { extern void blkio_policy_register(struct blkio_policy_type *); extern void blkio_policy_unregister(struct blkio_policy_type *); +static inline char *blkg_path(struct blkio_group *blkg) +{ + return blkg->path; +} + #else struct blkio_group { @@ -75,6 +161,8 @@ struct blkio_policy_type { static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { } static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { } +static inline char *blkg_path(struct blkio_group *blkg) { return NULL; } + #endif #define BLKIO_WEIGHT_MIN 100 @@ -82,19 +170,45 @@ static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { } #define BLKIO_WEIGHT_DEFAULT 500 #ifdef CONFIG_DEBUG_BLK_CGROUP -static inline char *blkg_path(struct blkio_group *blkg) -{ - return blkg->path; -} -void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg, +void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg); +void blkiocg_update_dequeue_stats(struct blkio_group *blkg, unsigned long dequeue); +void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg); +void blkiocg_update_idle_time_stats(struct blkio_group *blkg); +void blkiocg_set_start_empty_time(struct blkio_group *blkg); + +#define BLKG_FLAG_FNS(name) \ +static inline void blkio_mark_blkg_##name( \ + struct blkio_group_stats *stats) \ +{ \ + stats->flags |= (1 << BLKG_##name); \ +} \ +static inline void blkio_clear_blkg_##name( \ + struct blkio_group_stats *stats) \ +{ \ + stats->flags &= ~(1 << BLKG_##name); \ +} \ +static inline int blkio_blkg_##name(struct blkio_group_stats *stats) \ +{ \ + return (stats->flags & (1 << BLKG_##name)) != 0; \ +} \ + +BLKG_FLAG_FNS(waiting) +BLKG_FLAG_FNS(idling) +BLKG_FLAG_FNS(empty) +#undef BLKG_FLAG_FNS #else -static inline char *blkg_path(struct blkio_group *blkg) { return NULL; } -static inline void blkiocg_update_blkio_group_dequeue_stats( - struct blkio_group *blkg, unsigned long dequeue) {} +static inline void blkiocg_update_avg_queue_size_stats( + struct blkio_group *blkg) {} +static inline void blkiocg_update_dequeue_stats(struct blkio_group *blkg, + unsigned long dequeue) {} +static inline void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) +{} +static inline void blkiocg_update_idle_time_stats(struct blkio_group *blkg) {} +static inline void 
blkiocg_set_start_empty_time(struct blkio_group *blkg) {} #endif -#ifdef CONFIG_BLK_CGROUP +#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) extern struct blkio_cgroup blkio_root_cgroup; extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup); extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, @@ -102,26 +216,43 @@ extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, extern int blkiocg_del_blkio_group(struct blkio_group *blkg); extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key); -void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, - unsigned long time, unsigned long sectors); +void blkiocg_update_timeslice_used(struct blkio_group *blkg, + unsigned long time); +void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes, + bool direction, bool sync); +void blkiocg_update_completion_stats(struct blkio_group *blkg, + uint64_t start_time, uint64_t io_start_time, bool direction, bool sync); +void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, + bool sync); +void blkiocg_update_io_add_stats(struct blkio_group *blkg, + struct blkio_group *curr_blkg, bool direction, bool sync); +void blkiocg_update_io_remove_stats(struct blkio_group *blkg, + bool direction, bool sync); #else struct cgroup; static inline struct blkio_cgroup * cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, - struct blkio_group *blkg, void *key, dev_t dev) -{ -} + struct blkio_group *blkg, void *key, dev_t dev) {} static inline int blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } static inline struct blkio_group * blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; } -static inline void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, - unsigned long time, unsigned long sectors) -{ -} +static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg, + unsigned long time) {} +static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg, + uint64_t bytes, bool direction, bool sync) {} +static inline void blkiocg_update_completion_stats(struct blkio_group *blkg, + uint64_t start_time, uint64_t io_start_time, bool direction, + bool sync) {} +static inline void blkiocg_update_io_merged_stats(struct blkio_group *blkg, + bool direction, bool sync) {} +static inline void blkiocg_update_io_add_stats(struct blkio_group *blkg, + struct blkio_group *curr_blkg, bool direction, bool sync) {} +static inline void blkiocg_update_io_remove_stats(struct blkio_group *blkg, + bool direction, bool sync) {} #endif #endif /* _BLK_CGROUP_H */ diff --git a/block/blk-core.c b/block/blk-core.c index 718897e6d37f..32a1c123dfb3 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -127,6 +127,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->tag = -1; rq->ref_count = 1; rq->start_time = jiffies; + set_start_time_ns(rq); } EXPORT_SYMBOL(blk_rq_init); @@ -183,7 +184,7 @@ void blk_dump_rq_flags(struct request *rq, char *msg) printk(KERN_INFO " bio %p, biotail %p, buffer %p, len %u\n", rq->bio, rq->biotail, rq->buffer, blk_rq_bytes(rq)); - if (blk_pc_request(rq)) { + if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { printk(KERN_INFO " cdb: "); for (bit = 0; bit < BLK_MAX_CDB; bit++) printk("%02x ", rq->cmd[bit]); @@ -450,6 +451,7 @@ void blk_cleanup_queue(struct request_queue *q) */ blk_sync_queue(q); + 
del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer); mutex_lock(&q->sysfs_lock); queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q); mutex_unlock(&q->sysfs_lock); @@ -465,6 +467,9 @@ static int blk_init_free_list(struct request_queue *q) { struct request_list *rl = &q->rq; + if (unlikely(rl->rq_pool)) + return 0; + rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0; rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0; rl->elvpriv = 0; @@ -510,6 +515,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) return NULL; } + setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, + laptop_mode_timer_fn, (unsigned long) q); init_timer(&q->unplug_timer); setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); INIT_LIST_HEAD(&q->timeout_list); @@ -566,19 +573,42 @@ EXPORT_SYMBOL(blk_init_queue); struct request_queue * blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) { - struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id); + struct request_queue *uninit_q, *q; + + uninit_q = blk_alloc_queue_node(GFP_KERNEL, node_id); + if (!uninit_q) + return NULL; + + q = blk_init_allocated_queue_node(uninit_q, rfn, lock, node_id); + if (!q) + blk_cleanup_queue(uninit_q); + + return q; +} +EXPORT_SYMBOL(blk_init_queue_node); + +struct request_queue * +blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, + spinlock_t *lock) +{ + return blk_init_allocated_queue_node(q, rfn, lock, -1); +} +EXPORT_SYMBOL(blk_init_allocated_queue); +struct request_queue * +blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn, + spinlock_t *lock, int node_id) +{ if (!q) return NULL; q->node = node_id; - if (blk_init_free_list(q)) { - kmem_cache_free(blk_requestq_cachep, q); + if (blk_init_free_list(q)) return NULL; - } q->request_fn = rfn; q->prep_rq_fn = NULL; + q->unprep_rq_fn = NULL; q->unplug_fn = generic_unplug_device; q->queue_flags = QUEUE_FLAG_DEFAULT; q->queue_lock = lock; @@ -598,10 +628,9 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) return q; } - blk_put_queue(q); return NULL; } -EXPORT_SYMBOL(blk_init_queue_node); +EXPORT_SYMBOL(blk_init_allocated_queue_node); int blk_get_queue(struct request_queue *q) { @@ -1107,33 +1136,46 @@ void blk_put_request(struct request *req) } EXPORT_SYMBOL(blk_put_request); +/** + * blk_add_request_payload - add a payload to a request + * @rq: request to update + * @page: page backing the payload + * @len: length of the payload. + * + * This allows to later add a payload to an already submitted request by + * a block driver. The driver needs to take care of freeing the payload + * itself. + * + * Note that this is a quite horrible hack and nothing but handling of + * discard requests should ever use it. + */ +void blk_add_request_payload(struct request *rq, struct page *page, + unsigned int len) +{ + struct bio *bio = rq->bio; + + bio->bi_io_vec->bv_page = page; + bio->bi_io_vec->bv_offset = 0; + bio->bi_io_vec->bv_len = len; + + bio->bi_size = len; + bio->bi_vcnt = 1; + bio->bi_phys_segments = 1; + + rq->__data_len = rq->resid_len = len; + rq->nr_phys_segments = 1; + rq->buffer = bio_data(bio); +} +EXPORT_SYMBOL_GPL(blk_add_request_payload); + void init_request_from_bio(struct request *req, struct bio *bio) { req->cpu = bio->bi_comp_cpu; req->cmd_type = REQ_TYPE_FS; - /* - * Inherit FAILFAST from bio (for read-ahead, and explicit - * FAILFAST). FAILFAST flags are identical for req and bio. 
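The split of blk_init_queue_node() above into blk_alloc_queue_node() plus blk_init_allocated_queue_node() lets a driver that already owns a bare queue attach its request_fn later. A minimal sketch of a probe path using the new entry point; the names are hypothetical:

	#include <linux/blkdev.h>

	static void example_request_fn(struct request_queue *q);	/* driver strategy */

	/* hypothetical probe path: allocate first, wire up the request_fn later */
	static struct request_queue *example_setup_queue(spinlock_t *lock)
	{
		struct request_queue *q = blk_alloc_queue(GFP_KERNEL);

		if (!q)
			return NULL;

		/* ... setup that only needs the bare queue ... */

		if (!blk_init_allocated_queue(q, example_request_fn, lock)) {
			blk_cleanup_queue(q);
			return NULL;
		}
		return q;
	}
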
- */ - if (bio_rw_flagged(bio, BIO_RW_AHEAD)) + req->cmd_flags |= bio->bi_rw & REQ_COMMON_MASK; + if (bio->bi_rw & REQ_RAHEAD) req->cmd_flags |= REQ_FAILFAST_MASK; - else - req->cmd_flags |= bio->bi_rw & REQ_FAILFAST_MASK; - - if (unlikely(bio_rw_flagged(bio, BIO_RW_DISCARD))) { - req->cmd_flags |= REQ_DISCARD; - if (bio_rw_flagged(bio, BIO_RW_BARRIER)) - req->cmd_flags |= REQ_SOFTBARRIER; - } else if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) - req->cmd_flags |= REQ_HARDBARRIER; - - if (bio_rw_flagged(bio, BIO_RW_SYNCIO)) - req->cmd_flags |= REQ_RW_SYNC; - if (bio_rw_flagged(bio, BIO_RW_META)) - req->cmd_flags |= REQ_RW_META; - if (bio_rw_flagged(bio, BIO_RW_NOIDLE)) - req->cmd_flags |= REQ_NOIDLE; req->errors = 0; req->__sector = bio->bi_sector; @@ -1147,7 +1189,7 @@ void init_request_from_bio(struct request *req, struct bio *bio) */ static inline bool queue_should_plug(struct request_queue *q) { - return !(blk_queue_nonrot(q) && blk_queue_queuing(q)); + return !(blk_queue_nonrot(q) && blk_queue_tagged(q)); } static int __make_request(struct request_queue *q, struct bio *bio) @@ -1156,12 +1198,12 @@ static int __make_request(struct request_queue *q, struct bio *bio) int el_ret; unsigned int bytes = bio->bi_size; const unsigned short prio = bio_prio(bio); - const bool sync = bio_rw_flagged(bio, BIO_RW_SYNCIO); - const bool unplug = bio_rw_flagged(bio, BIO_RW_UNPLUG); - const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK; + const bool sync = !!(bio->bi_rw & REQ_SYNC); + const bool unplug = !!(bio->bi_rw & REQ_UNPLUG); + const unsigned long ff = bio->bi_rw & REQ_FAILFAST_MASK; int rw_flags; - if (bio_rw_flagged(bio, BIO_RW_BARRIER) && + if ((bio->bi_rw & REQ_HARDBARRIER) && (q->next_ordered == QUEUE_ORDERED_NONE)) { bio_endio(bio, -EOPNOTSUPP); return 0; @@ -1175,7 +1217,7 @@ static int __make_request(struct request_queue *q, struct bio *bio) spin_lock_irq(q->queue_lock); - if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER)) || elv_queue_empty(q)) + if (unlikely((bio->bi_rw & REQ_HARDBARRIER)) || elv_queue_empty(q)) goto get_rq; el_ret = elv_merge(q, &req, bio); @@ -1198,6 +1240,7 @@ static int __make_request(struct request_queue *q, struct bio *bio) if (!blk_rq_cpu_valid(req)) req->cpu = bio->bi_comp_cpu; drive_stat_acct(req, 0); + elv_bio_merged(q, req, bio); if (!attempt_back_merge(q, req)) elv_merged_request(q, req, el_ret); goto out; @@ -1231,6 +1274,7 @@ static int __make_request(struct request_queue *q, struct bio *bio) if (!blk_rq_cpu_valid(req)) req->cpu = bio->bi_comp_cpu; drive_stat_acct(req, 0); + elv_bio_merged(q, req, bio); if (!attempt_front_merge(q, req)) elv_merged_request(q, req, el_ret); goto out; @@ -1248,7 +1292,7 @@ get_rq: */ rw_flags = bio_data_dir(bio); if (sync) - rw_flags |= REQ_RW_SYNC; + rw_flags |= REQ_SYNC; /* * Grab a free request. This is might sleep but can not fail. 
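After the flag unification above, bio->bi_rw and rq->cmd_flags share one REQ_* namespace, so per-request tests reduce to plain mask checks instead of bio_rw_flagged() translation. For illustration, with the flag names used in these hunks:

	/* post-unification: the same REQ_* masks work on bios and requests */
	static bool example_is_sync_barrier(struct bio *bio)
	{
		return (bio->bi_rw & (REQ_SYNC | REQ_HARDBARRIER)) ==
		       (REQ_SYNC | REQ_HARDBARRIER);
	}
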
@@ -1437,7 +1481,7 @@ static inline void __generic_make_request(struct bio *bio) goto end_io; } - if (unlikely(!bio_rw_flagged(bio, BIO_RW_DISCARD) && + if (unlikely(!(bio->bi_rw & REQ_DISCARD) && nr_sectors > queue_max_hw_sectors(q))) { printk(KERN_ERR "bio too big device %s (%u > %u)\n", bdevname(bio->bi_bdev, b), @@ -1470,8 +1514,10 @@ static inline void __generic_make_request(struct bio *bio) if (bio_check_eod(bio, nr_sectors)) goto end_io; - if (bio_rw_flagged(bio, BIO_RW_DISCARD) && - !blk_queue_discard(q)) { + if ((bio->bi_rw & REQ_DISCARD) && + (!blk_queue_discard(q) || + ((bio->bi_rw & REQ_SECURE) && + !blk_queue_secdiscard(q)))) { err = -EOPNOTSUPP; goto end_io; } @@ -1490,9 +1536,9 @@ end_io: /* * We only want one ->make_request_fn to be active at a time, * else stack usage with stacked devices could be a problem. - * So use current->bio_{list,tail} to keep a list of requests + * So use current->bio_list to keep a list of requests * submited by a make_request_fn function. - * current->bio_tail is also used as a flag to say if + * current->bio_list is also used as a flag to say if * generic_make_request is currently active in this task or not. * If it is NULL, then no make_request is active. If it is non-NULL, * then a make_request is active, and new requests should be added @@ -1500,11 +1546,11 @@ end_io: */ void generic_make_request(struct bio *bio) { - if (current->bio_tail) { + struct bio_list bio_list_on_stack; + + if (current->bio_list) { /* make_request is active */ - *(current->bio_tail) = bio; - bio->bi_next = NULL; - current->bio_tail = &bio->bi_next; + bio_list_add(current->bio_list, bio); return; } /* following loop may be a bit non-obvious, and so deserves some @@ -1512,30 +1558,27 @@ void generic_make_request(struct bio *bio) * Before entering the loop, bio->bi_next is NULL (as all callers * ensure that) so we have a list with a single bio. * We pretend that we have just taken it off a longer list, so - * we assign bio_list to the next (which is NULL) and bio_tail - * to &bio_list, thus initialising the bio_list of new bios to be + * we assign bio_list to a pointer to the bio_list_on_stack, + * thus initialising the bio_list of new bios to be * added. __generic_make_request may indeed add some more bios * through a recursive call to generic_make_request. If it * did, we find a non-NULL value in bio_list and re-enter the loop * from the top. In this case we really did just take the bio - * of the top of the list (no pretending) and so fixup bio_list and - * bio_tail or bi_next, and call into __generic_make_request again. + * of the top of the list (no pretending) and so remove it from + * bio_list, and call into __generic_make_request again. * * The loop was structured like this to make only one call to * __generic_make_request (which is important as it is large and * inlined) and to keep the structure simple. */ BUG_ON(bio->bi_next); + bio_list_init(&bio_list_on_stack); + current->bio_list = &bio_list_on_stack; do { - current->bio_list = bio->bi_next; - if (bio->bi_next == NULL) - current->bio_tail = ¤t->bio_list; - else - bio->bi_next = NULL; __generic_make_request(bio); - bio = current->bio_list; + bio = bio_list_pop(current->bio_list); } while (bio); - current->bio_tail = NULL; /* deactivate */ + current->bio_list = NULL; /* deactivate */ } EXPORT_SYMBOL(generic_make_request); @@ -1559,7 +1602,7 @@ void submit_bio(int rw, struct bio *bio) * If it's a regular read/write or a barrier with data attached, * go through the normal accounting stuff before submission. 
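The generic_make_request() rework above replaces the open-coded bio_tail chaining with an on-stack bio_list, turning recursive submission into an iterative drain. A reduced sketch of what this buys a stacking driver: resubmitting from inside a make_request_fn no longer nests, it just appends to current->bio_list. The lower-device variable is hypothetical:

	static struct block_device *example_lower_bdev;

	/* hypothetical stacking driver: redirect and resubmit without recursion */
	static int example_make_request(struct request_queue *q, struct bio *bio)
	{
		bio->bi_bdev = example_lower_bdev;
		generic_make_request(bio);	/* queued on current->bio_list */
		return 0;
	}
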
*/ - if (bio_has_data(bio)) { + if (bio_has_data(bio) && !(rw & REQ_DISCARD)) { if (rw & WRITE) { count_vm_events(PGPGOUT, count); } else { @@ -1604,6 +1647,9 @@ EXPORT_SYMBOL(submit_bio); */ int blk_rq_check_limits(struct request_queue *q, struct request *rq) { + if (rq->cmd_flags & REQ_DISCARD) + return 0; + if (blk_rq_sectors(rq) > queue_max_sectors(q) || blk_rq_bytes(rq) > queue_max_hw_sectors(q) << 9) { printk(KERN_ERR "%s: over max size limit.\n", __func__); @@ -1617,8 +1663,7 @@ int blk_rq_check_limits(struct request_queue *q, struct request *rq) * limitation. */ blk_recalc_rq_segments(rq); - if (rq->nr_phys_segments > queue_max_phys_segments(q) || - rq->nr_phys_segments > queue_max_hw_segments(q)) { + if (rq->nr_phys_segments > queue_max_segments(q)) { printk(KERN_ERR "%s: over max segments limit.\n", __func__); return -EIO; } @@ -1773,7 +1818,7 @@ struct request *blk_peek_request(struct request_queue *q) * sees this request (possibly after * requeueing). Notify IO scheduler. */ - if (blk_sorted_rq(rq)) + if (rq->cmd_flags & REQ_SORTED) elv_activate_rq(q, rq); /* @@ -1861,12 +1906,7 @@ void blk_dequeue_request(struct request *rq) */ if (blk_account_rq(rq)) { q->in_flight[rq_is_sync(rq)]++; - /* - * Mark this device as supporting hardware queuing, if - * we have more IOs in flight than 4. - */ - if (!blk_queue_queuing(q) && queue_in_flight(q) > 4) - set_bit(QUEUE_FLAG_CQ, &q->queue_flags); + set_io_start_time_ns(rq); } } @@ -1966,10 +2006,11 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) * TODO: tj: This is too subtle. It would be better to let * low level drivers do what they see fit. */ - if (blk_fs_request(req)) + if (req->cmd_type == REQ_TYPE_FS) req->errors = 0; - if (error && (blk_fs_request(req) && !(req->cmd_flags & REQ_QUIET))) { + if (error && req->cmd_type == REQ_TYPE_FS && + !(req->cmd_flags & REQ_QUIET)) { printk(KERN_ERR "end_request: I/O error, dev %s, sector %llu\n", req->rq_disk ? req->rq_disk->disk_name : "?", (unsigned long long)blk_rq_pos(req)); @@ -2056,7 +2097,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) req->buffer = bio_data(req->bio); /* update sector only for requests with clear definition of sector */ - if (blk_fs_request(req) || blk_discard_rq(req)) + if (req->cmd_type == REQ_TYPE_FS || (req->cmd_flags & REQ_DISCARD)) req->__sector += total_bytes >> 9; /* mixed attributes always follow the first bio */ @@ -2093,11 +2134,32 @@ static bool blk_update_bidi_request(struct request *rq, int error, blk_update_request(rq->next_rq, error, bidi_bytes)) return true; - add_disk_randomness(rq->rq_disk); + if (blk_queue_add_random(rq->q)) + add_disk_randomness(rq->rq_disk); return false; } +/** + * blk_unprep_request - unprepare a request + * @req: the request + * + * This function makes a request ready for complete resubmission (or + * completion). It happens only after all error handling is complete, + * so represents the appropriate moment to deallocate any resources + * that were allocated to the request in the prep_rq_fn. The queue + * lock is held when calling this. 
+ */ +void blk_unprep_request(struct request *req) +{ + struct request_queue *q = req->q; + + req->cmd_flags &= ~REQ_DONTPREP; + if (q->unprep_rq_fn) + q->unprep_rq_fn(q, req); +} +EXPORT_SYMBOL_GPL(blk_unprep_request); + /* * queue lock must be held */ @@ -2108,11 +2170,15 @@ static void blk_finish_request(struct request *req, int error) BUG_ON(blk_queued_rq(req)); - if (unlikely(laptop_mode) && blk_fs_request(req)) - laptop_io_completion(); + if (unlikely(laptop_mode) && req->cmd_type == REQ_TYPE_FS) + laptop_io_completion(&req->q->backing_dev_info); blk_delete_timer(req); + if (req->cmd_flags & REQ_DONTPREP) + blk_unprep_request(req); + + blk_account_io_done(req); if (req->end_io) @@ -2345,7 +2411,7 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio) { /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw */ - rq->cmd_flags |= bio->bi_rw & REQ_RW; + rq->cmd_flags |= bio->bi_rw & REQ_WRITE; if (bio_has_data(bio)) { rq->nr_phys_segments = bio_phys_segments(q, bio); @@ -2432,6 +2498,8 @@ static void __blk_rq_prep_clone(struct request *dst, struct request *src) { dst->cpu = src->cpu; dst->cmd_flags = (rq_data_dir(src) | REQ_NOMERGE); + if (src->cmd_flags & REQ_DISCARD) + dst->cmd_flags |= REQ_DISCARD; dst->cmd_type = src->cmd_type; dst->__sector = blk_rq_pos(src); dst->__data_len = blk_rq_bytes(src); @@ -2528,4 +2596,3 @@ int __init blk_dev_init(void) return 0; } - diff --git a/block/blk-exec.c b/block/blk-exec.c index 49557e91f0da..e1672f14840e 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -57,7 +57,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, __elv_add_request(q, rq, where, 1); __generic_unplug_device(q); /* the queue is stopped so it won't be plugged+unplugged */ - if (blk_pm_resume_request(rq)) + if (rq->cmd_type == REQ_TYPE_PM_RESUME) q->request_fn(q); spin_unlock_irq(q->queue_lock); } diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 15c630813b1c..edce1ef7933d 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -24,6 +24,7 @@ #include <linux/mempool.h> #include <linux/bio.h> #include <linux/scatterlist.h> +#include <linux/slab.h> #include "blk.h" @@ -278,7 +279,7 @@ static struct attribute *integrity_attrs[] = { NULL, }; -static struct sysfs_ops integrity_ops = { +static const struct sysfs_ops integrity_ops = { .show = &integrity_attr_show, .store = &integrity_attr_store, }; diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 98e6bf61b0ac..d22c4c55c406 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -7,6 +7,7 @@ #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/bootmem.h> /* for max_pfn/max_low_pfn */ +#include <linux/slab.h> #include "blk.h" @@ -91,7 +92,7 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node) spin_lock_init(&ret->lock); ret->ioprio_changed = 0; ret->ioprio = 0; - ret->last_waited = jiffies; /* doesn't matter... */ + ret->last_waited = 0; /* doesn't matter... 
*/ ret->nr_batch_requests = 0; /* because this is 0 */ INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH); INIT_HLIST_HEAD(&ret->cic_list); diff --git a/block/blk-lib.c b/block/blk-lib.c new file mode 100644 index 000000000000..c392029a104e --- /dev/null +++ b/block/blk-lib.c @@ -0,0 +1,231 @@ +/* + * Functions related to generic helper functions + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/scatterlist.h> + +#include "blk.h" + +static void blkdev_discard_end_io(struct bio *bio, int err) +{ + if (err) { + if (err == -EOPNOTSUPP) + set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); + clear_bit(BIO_UPTODATE, &bio->bi_flags); + } + + if (bio->bi_private) + complete(bio->bi_private); + + bio_put(bio); +} + +/** + * blkdev_issue_discard - queue a discard + * @bdev: blockdev to issue discard for + * @sector: start sector + * @nr_sects: number of sectors to discard + * @gfp_mask: memory allocation flags (for bio_alloc) + * @flags: BLKDEV_IFL_* flags to control behaviour + * + * Description: + * Issue a discard request for the sectors in question. + */ +int blkdev_issue_discard(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, unsigned long flags) +{ + DECLARE_COMPLETION_ONSTACK(wait); + struct request_queue *q = bdev_get_queue(bdev); + int type = flags & BLKDEV_IFL_BARRIER ? + DISCARD_BARRIER : DISCARD_NOBARRIER; + unsigned int max_discard_sectors; + struct bio *bio; + int ret = 0; + + if (!q) + return -ENXIO; + + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + + /* + * Ensure that max_discard_sectors is of the proper + * granularity + */ + max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9); + if (q->limits.discard_granularity) { + unsigned int disc_sects = q->limits.discard_granularity >> 9; + + max_discard_sectors &= ~(disc_sects - 1); + } + + if (flags & BLKDEV_IFL_SECURE) { + if (!blk_queue_secdiscard(q)) + return -EOPNOTSUPP; + type |= DISCARD_SECURE; + } + + while (nr_sects && !ret) { + bio = bio_alloc(gfp_mask, 1); + if (!bio) { + ret = -ENOMEM; + break; + } + + bio->bi_sector = sector; + bio->bi_end_io = blkdev_discard_end_io; + bio->bi_bdev = bdev; + if (flags & BLKDEV_IFL_WAIT) + bio->bi_private = &wait; + + if (nr_sects > max_discard_sectors) { + bio->bi_size = max_discard_sectors << 9; + nr_sects -= max_discard_sectors; + sector += max_discard_sectors; + } else { + bio->bi_size = nr_sects << 9; + nr_sects = 0; + } + + bio_get(bio); + submit_bio(type, bio); + + if (flags & BLKDEV_IFL_WAIT) + wait_for_completion(&wait); + + if (bio_flagged(bio, BIO_EOPNOTSUPP)) + ret = -EOPNOTSUPP; + else if (!bio_flagged(bio, BIO_UPTODATE)) + ret = -EIO; + bio_put(bio); + } + + return ret; +} +EXPORT_SYMBOL(blkdev_issue_discard); + +struct bio_batch +{ + atomic_t done; + unsigned long flags; + struct completion *wait; + bio_end_io_t *end_io; +}; + +static void bio_batch_end_io(struct bio *bio, int err) +{ + struct bio_batch *bb = bio->bi_private; + + if (err) { + if (err == -EOPNOTSUPP) + set_bit(BIO_EOPNOTSUPP, &bb->flags); + else + clear_bit(BIO_UPTODATE, &bb->flags); + } + if (bb) { + if (bb->end_io) + bb->end_io(bio, err); + atomic_inc(&bb->done); + complete(bb->wait); + } + bio_put(bio); } + +/** + * blkdev_issue_zeroout - generate a number of zero-filled write bios + * @bdev: blockdev to issue + * @sector: start sector + * @nr_sects: number of sectors to write + * @gfp_mask: memory allocation flags (for bio_alloc) + * @flags: BLKDEV_IFL_* flags to control behaviour +
* + * Description: + * Generate and issue a number of bios with zero-filled pages. + * Send a barrier at the beginning and at the end if requested. This guarantees + * correct request ordering. An empty barrier allows us to avoid a post-queue flush. + */ + +int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, unsigned long flags) +{ + int ret; + struct bio *bio; + struct bio_batch bb; + unsigned int sz, issued = 0; + DECLARE_COMPLETION_ONSTACK(wait); + + atomic_set(&bb.done, 0); + bb.flags = 1 << BIO_UPTODATE; + bb.wait = &wait; + bb.end_io = NULL; + + if (flags & BLKDEV_IFL_BARRIER) { + /* issue async barrier before the data */ + ret = blkdev_issue_flush(bdev, gfp_mask, NULL, 0); + if (ret) + return ret; + } +submit: + ret = 0; + while (nr_sects != 0) { + bio = bio_alloc(gfp_mask, + min(nr_sects, (sector_t)BIO_MAX_PAGES)); + if (!bio) { + ret = -ENOMEM; + break; + } + + bio->bi_sector = sector; + bio->bi_bdev = bdev; + bio->bi_end_io = bio_batch_end_io; + if (flags & BLKDEV_IFL_WAIT) + bio->bi_private = &bb; + + while (nr_sects != 0) { + sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects); + if (sz == 0) + /* bio has maximum size possible */ + break; + ret = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0); + nr_sects -= ret >> 9; + sector += ret >> 9; + if (ret < (sz << 9)) + break; + } + ret = 0; + issued++; + submit_bio(WRITE, bio); + } + /* + * When all data bios are in flight, send the final barrier if requested. + */ + if (nr_sects == 0 && flags & BLKDEV_IFL_BARRIER) + ret = blkdev_issue_flush(bdev, gfp_mask, NULL, + flags & BLKDEV_IFL_WAIT); + + + if (flags & BLKDEV_IFL_WAIT) + /* Wait for bios in-flight */ + while ( issued != atomic_read(&bb.done)) + wait_for_completion(&wait); + + if (!test_bit(BIO_UPTODATE, &bb.flags)) + /* One of the bios in the batch completed with an error. */ + ret = -EIO; + + if (ret) + goto out; + + if (test_bit(BIO_EOPNOTSUPP, &bb.flags)) { + ret = -EOPNOTSUPP; + goto out; + } + if (nr_sects != 0) + goto submit; +out: + return ret; +} +EXPORT_SYMBOL(blkdev_issue_zeroout); diff --git a/block/blk-map.c b/block/blk-map.c index 9083cf0180cc..ade0a08c9099 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -307,7 +307,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, return PTR_ERR(bio); if (rq_data_dir(rq) == WRITE) - bio->bi_rw |= (1 << BIO_RW); + bio->bi_rw |= REQ_WRITE; if (do_copy) rq->cmd_flags |= REQ_COPY_USER; diff --git a/block/blk-merge.c b/block/blk-merge.c index 99cb5cf1f447..eafc94f68d79 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -12,7 +12,6 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, struct bio *bio) { - unsigned int phys_size; struct bio_vec *bv, *bvprv = NULL; int cluster, i, high, highprv = 1; unsigned int seg_size, nr_phys_segs; @@ -24,7 +23,7 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, fbio = bio; cluster = test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags); seg_size = 0; - phys_size = nr_phys_segs = 0; + nr_phys_segs = 0; for_each_bio(bio) { bio_for_each_segment(bv, bio, i) { /* @@ -180,7 +179,7 @@ new_segment: } if (q->dma_drain_size && q->dma_drain_needed(rq)) { - if (rq->cmd_flags & REQ_RW) + if (rq->cmd_flags & REQ_WRITE) memset(q->dma_drain_buffer, 0, q->dma_drain_size); sg->page_link &= ~0x02; @@ -206,8 +205,7 @@ static inline int ll_new_hw_segment(struct request_queue *q, { int nr_phys_segs = bio_phys_segments(q, bio); - if (req->nr_phys_segments + nr_phys_segs > queue_max_hw_segments(q) || -
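Stepping back to the two blk-lib.c helpers exported above: both behave synchronously when BLKDEV_IFL_WAIT is passed. A hedged sketch of how a filesystem caller might combine them (the function name is invented), discarding when the device supports it and falling back to explicit zeroes otherwise:

#include <linux/blkdev.h>
#include <linux/gfp.h>

/* Sketch: release a sector range, preferring discard and falling back
 * to zero-filled writes. 'bdev' is an already-opened block device; add
 * BLKDEV_IFL_SECURE only when blk_queue_secdiscard(q) is true. */
static int example_punch_range(struct block_device *bdev,
			       sector_t sector, sector_t nr_sects)
{
	int ret;

	ret = blkdev_issue_discard(bdev, sector, nr_sects, GFP_KERNEL,
				   BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
	if (ret == -EOPNOTSUPP)	/* no discard support: write zeroes */
		ret = blkdev_issue_zeroout(bdev, sector, nr_sects,
					   GFP_KERNEL, BLKDEV_IFL_WAIT);
	return ret;
}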
req->nr_phys_segments + nr_phys_segs > queue_max_phys_segments(q)) { + if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q)) { req->cmd_flags |= REQ_NOMERGE; if (req == q->last_merge) q->last_merge = NULL; @@ -227,7 +225,7 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req, { unsigned short max_sectors; - if (unlikely(blk_pc_request(req))) + if (unlikely(req->cmd_type == REQ_TYPE_BLOCK_PC)) max_sectors = queue_max_hw_sectors(q); else max_sectors = queue_max_sectors(q); @@ -251,7 +249,7 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req, { unsigned short max_sectors; - if (unlikely(blk_pc_request(req))) + if (unlikely(req->cmd_type == REQ_TYPE_BLOCK_PC)) max_sectors = queue_max_hw_sectors(q); else max_sectors = queue_max_sectors(q); @@ -300,10 +298,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, total_phys_segments--; } - if (total_phys_segments > queue_max_phys_segments(q)) - return 0; - - if (total_phys_segments > queue_max_hw_segments(q)) + if (total_phys_segments > queue_max_segments(q)) return 0; /* Merge is OK... */ @@ -367,6 +362,18 @@ static int attempt_merge(struct request_queue *q, struct request *req, return 0; /* + * Don't merge file system requests and discard requests + */ + if ((req->cmd_flags & REQ_DISCARD) != (next->cmd_flags & REQ_DISCARD)) + return 0; + + /* + * Don't merge discard requests and secure discard requests + */ + if ((req->cmd_flags & REQ_SECURE) != (next->cmd_flags & REQ_SECURE)) + return 0; + + /* * not contiguous */ if (blk_rq_pos(req) + blk_rq_sectors(req) != blk_rq_pos(next)) diff --git a/block/blk-settings.c b/block/blk-settings.c index 5eeb9e0d256e..a234f4bf1d6f 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -8,7 +8,9 @@ #include <linux/blkdev.h> #include <linux/bootmem.h> /* for max_pfn/max_low_pfn */ #include <linux/gcd.h> +#include <linux/lcm.h> #include <linux/jiffies.h> +#include <linux/gfp.h> #include "blk.h" @@ -35,6 +37,23 @@ void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn) EXPORT_SYMBOL(blk_queue_prep_rq); /** + * blk_queue_unprep_rq - set an unprepare_request function for queue + * @q: queue + * @ufn: unprepare_request function + * + * It's possible for a queue to register an unprepare_request callback + * which is invoked before the request is finally completed. The goal + * of the function is to deallocate any data that was allocated in the + * prepare_request callback. 
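The unprep hook described above mirrors blk_queue_prep_rq(). A sketch of a hypothetical driver (all mydrv_* names invented) that allocates per-request state in prep and frees it in the matching unprep, registered at init time with blk_queue_prep_rq()/blk_queue_unprep_rq() (the latter defined just below); setting REQ_DONTPREP is what makes blk_finish_request() invoke the unprep function exactly once per prepared request:

#include <linux/blkdev.h>
#include <linux/slab.h>

/* Hypothetical per-request state the driver attaches during prep. */
struct mydrv_cmd {
	void *bounce;
};

static int mydrv_prep_rq(struct request_queue *q, struct request *rq)
{
	struct mydrv_cmd *cmd = kzalloc(sizeof(*cmd), GFP_ATOMIC);

	if (!cmd)
		return BLKPREP_DEFER;	/* retry the request later */
	rq->special = cmd;
	rq->cmd_flags |= REQ_DONTPREP;	/* core will call unprep on finish */
	return BLKPREP_OK;
}

static void mydrv_unprep_rq(struct request_queue *q, struct request *rq)
{
	kfree(rq->special);		/* release what prep allocated */
	rq->special = NULL;
}

/* At queue-setup time:
 *	blk_queue_prep_rq(q, mydrv_prep_rq);
 *	blk_queue_unprep_rq(q, mydrv_unprep_rq);
 */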
+ * + */ +void blk_queue_unprep_rq(struct request_queue *q, unprep_rq_fn *ufn) +{ + q->unprep_rq_fn = ufn; +} +EXPORT_SYMBOL(blk_queue_unprep_rq); + +/** * blk_queue_merge_bvec - set a merge_bvec function for queue * @q: queue * @mbfn: merge_bvec_fn @@ -91,10 +110,9 @@ EXPORT_SYMBOL_GPL(blk_queue_lld_busy); */ void blk_set_default_limits(struct queue_limits *lim) { - lim->max_phys_segments = MAX_PHYS_SEGMENTS; - lim->max_hw_segments = MAX_HW_SEGMENTS; + lim->max_segments = BLK_MAX_SEGMENTS; lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; - lim->max_segment_size = MAX_SEGMENT_SIZE; + lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; lim->max_sectors = BLK_DEF_MAX_SECTORS; lim->max_hw_sectors = INT_MAX; lim->max_discard_sectors = 0; @@ -154,7 +172,7 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) q->unplug_timer.data = (unsigned long)q; blk_set_default_limits(&q->limits); - blk_queue_max_sectors(q, SAFE_MAX_SECTORS); + blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS); /* * If the caller didn't supply a lock, fall back to our embedded @@ -210,37 +228,32 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask) EXPORT_SYMBOL(blk_queue_bounce_limit); /** - * blk_queue_max_sectors - set max sectors for a request for this queue + * blk_queue_max_hw_sectors - set max sectors for a request for this queue * @q: the request queue for the device - * @max_sectors: max sectors in the usual 512b unit + * @max_hw_sectors: max hardware sectors in the usual 512b unit * * Description: - * Enables a low level driver to set an upper limit on the size of - * received requests. + * Enables a low level driver to set a hard upper limit, + * max_hw_sectors, on the size of requests. max_hw_sectors is set by + * the device driver based upon the combined capabilities of I/O + * controller and storage device. + * + * max_sectors is a soft limit imposed by the block layer for + * filesystem type requests. This value can be overridden on a + * per-device basis in /sys/block/<device>/queue/max_sectors_kb. + * The soft limit can not exceed max_hw_sectors. 
**/ -void blk_queue_max_sectors(struct request_queue *q, unsigned int max_sectors) +void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors) { - if ((max_sectors << 9) < PAGE_CACHE_SIZE) { - max_sectors = 1 << (PAGE_CACHE_SHIFT - 9); + if ((max_hw_sectors << 9) < PAGE_CACHE_SIZE) { + max_hw_sectors = 1 << (PAGE_CACHE_SHIFT - 9); printk(KERN_INFO "%s: set to minimum %d\n", - __func__, max_sectors); + __func__, max_hw_sectors); } - if (BLK_DEF_MAX_SECTORS > max_sectors) - q->limits.max_hw_sectors = q->limits.max_sectors = max_sectors; - else { - q->limits.max_sectors = BLK_DEF_MAX_SECTORS; - q->limits.max_hw_sectors = max_sectors; - } -} -EXPORT_SYMBOL(blk_queue_max_sectors); - -void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_sectors) -{ - if (BLK_DEF_MAX_SECTORS > max_sectors) - q->limits.max_hw_sectors = BLK_DEF_MAX_SECTORS; - else - q->limits.max_hw_sectors = max_sectors; + q->limits.max_hw_sectors = max_hw_sectors; + q->limits.max_sectors = min_t(unsigned int, max_hw_sectors, + BLK_DEF_MAX_SECTORS); } EXPORT_SYMBOL(blk_queue_max_hw_sectors); @@ -257,17 +270,15 @@ void blk_queue_max_discard_sectors(struct request_queue *q, EXPORT_SYMBOL(blk_queue_max_discard_sectors); /** - * blk_queue_max_phys_segments - set max phys segments for a request for this queue + * blk_queue_max_segments - set max hw segments for a request for this queue * @q: the request queue for the device * @max_segments: max number of segments * * Description: * Enables a low level driver to set an upper limit on the number of - * physical data segments in a request. This would be the largest sized - * scatter list the driver could handle. + * hw data segments in a request. **/ -void blk_queue_max_phys_segments(struct request_queue *q, - unsigned short max_segments) +void blk_queue_max_segments(struct request_queue *q, unsigned short max_segments) { if (!max_segments) { max_segments = 1; @@ -275,33 +286,9 @@ void blk_queue_max_phys_segments(struct request_queue *q, __func__, max_segments); } - q->limits.max_phys_segments = max_segments; + q->limits.max_segments = max_segments; } -EXPORT_SYMBOL(blk_queue_max_phys_segments); - -/** - * blk_queue_max_hw_segments - set max hw segments for a request for this queue - * @q: the request queue for the device - * @max_segments: max number of segments - * - * Description: - * Enables a low level driver to set an upper limit on the number of - * hw data segments in a request. This would be the largest number of - * address/length pairs the host adapter can actually give at once - * to the device. 
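With blk_queue_max_sectors() and the separate phys/hw segment setters gone, a driver states only its hardware ceilings and the core derives the soft limit. A sketch of post-patch limit setup (driver name and the particular values are invented):

#include <linux/blkdev.h>

/* Sketch: the driver declares its hardware ceiling; the core clamps the
 * filesystem soft limit to min(max_hw_sectors, BLK_DEF_MAX_SECTORS), and
 * userspace may lower it via /sys/block/<dev>/queue/max_sectors_kb. */
static void mydrv_set_limits(struct request_queue *q)
{
	blk_queue_max_hw_sectors(q, 2048);	/* 1 MiB per request */
	blk_queue_max_segments(q, 128);		/* one combined segment limit */
	blk_queue_max_segment_size(q, 65536);
}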
- **/ -void blk_queue_max_hw_segments(struct request_queue *q, - unsigned short max_segments) -{ - if (!max_segments) { - max_segments = 1; - printk(KERN_INFO "%s: set to minimum %d\n", - __func__, max_segments); - } - - q->limits.max_hw_segments = max_segments; -} -EXPORT_SYMBOL(blk_queue_max_hw_segments); +EXPORT_SYMBOL(blk_queue_max_segments); /** * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg @@ -493,21 +480,11 @@ void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b) } EXPORT_SYMBOL(blk_queue_stack_limits); -static unsigned int lcm(unsigned int a, unsigned int b) -{ - if (a && b) - return (a * b) / gcd(a, b); - else if (b) - return b; - - return a; -} - /** * blk_stack_limits - adjust queue_limits for stacked devices * @t: the stacking driver limits (top device) * @b: the underlying queue limits (bottom, component device) - * @offset: offset to beginning of data within component device + * @start: first data sector within component device * * Description: * This function is used by stacking drivers like MD and DM to ensure @@ -525,10 +502,9 @@ static unsigned int lcm(unsigned int a, unsigned int b) * the alignment_offset is undefined. */ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, - sector_t offset) + sector_t start) { - sector_t alignment; - unsigned int top, bottom, ret = 0; + unsigned int top, bottom, alignment, ret = 0; t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); @@ -537,18 +513,14 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, b->seg_boundary_mask); - t->max_phys_segments = min_not_zero(t->max_phys_segments, - b->max_phys_segments); - - t->max_hw_segments = min_not_zero(t->max_hw_segments, - b->max_hw_segments); + t->max_segments = min_not_zero(t->max_segments, b->max_segments); t->max_segment_size = min_not_zero(t->max_segment_size, b->max_segment_size); t->misaligned |= b->misaligned; - alignment = queue_limit_alignment_offset(b, offset); + alignment = queue_limit_alignment_offset(b, start); /* Bottom device has different alignment. Check that it is * compatible with the current top alignment. @@ -611,11 +583,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, /* Discard alignment and granularity */ if (b->discard_granularity) { - unsigned int granularity = b->discard_granularity; - offset &= granularity - 1; - - alignment = (granularity + b->discard_alignment - offset) - & (granularity - 1); + alignment = queue_limit_discard_alignment(b, start); if (t->discard_granularity != 0 && t->discard_alignment != alignment) { @@ -657,7 +625,7 @@ int bdev_stack_limits(struct queue_limits *t, struct block_device *bdev, start += get_start_sect(bdev); - return blk_stack_limits(t, &bq->limits, start << 9); + return blk_stack_limits(t, &bq->limits, start); } EXPORT_SYMBOL(bdev_stack_limits); @@ -668,9 +636,8 @@ EXPORT_SYMBOL(bdev_stack_limits); * @offset: offset to beginning of data within component device * * Description: - * Merges the limits for two queues. Returns 0 if alignment - * didn't change. Returns -1 if adding the bottom device caused - * misalignment. + * Merges the limits for a top level gendisk and a bottom level + * block_device. 
*/ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, sector_t offset) @@ -678,9 +645,7 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, struct request_queue *t = disk->queue; struct request_queue *b = bdev_get_queue(bdev); - offset += get_start_sect(bdev) << 9; - - if (blk_stack_limits(&t->limits, &b->limits, offset) < 0) { + if (bdev_stack_limits(&t->limits, bdev, offset >> 9) < 0) { char top[BDEVNAME_SIZE], bottom[BDEVNAME_SIZE]; disk_name(disk, 0, top); @@ -752,22 +717,19 @@ EXPORT_SYMBOL(blk_queue_update_dma_pad); * does is adjust the queue so that the buf is always appended * silently to the scatterlist. * - * Note: This routine adjusts max_hw_segments to make room for - * appending the drain buffer. If you call - * blk_queue_max_hw_segments() or blk_queue_max_phys_segments() after - * calling this routine, you must set the limit to one fewer than your - * device can support otherwise there won't be room for the drain - * buffer. + * Note: This routine adjusts max_hw_segments to make room for appending + * the drain buffer. If you call blk_queue_max_segments() after calling + * this routine, you must set the limit to one fewer than your device + * can support otherwise there won't be room for the drain buffer. */ int blk_queue_dma_drain(struct request_queue *q, dma_drain_needed_fn *dma_drain_needed, void *buf, unsigned int size) { - if (queue_max_hw_segments(q) < 2 || queue_max_phys_segments(q) < 2) + if (queue_max_segments(q) < 2) return -EINVAL; /* make room for appending the drain */ - blk_queue_max_hw_segments(q, queue_max_hw_segments(q) - 1); - blk_queue_max_phys_segments(q, queue_max_phys_segments(q) - 1); + blk_queue_max_segments(q, queue_max_segments(q) - 1); q->dma_drain_needed = dma_drain_needed; q->dma_drain_buffer = buf; q->dma_drain_size = size; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 8606c9543fdd..0749b89c6885 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -2,6 +2,7 @@ * Functions related to sysfs handling */ #include <linux/kernel.h> +#include <linux/slab.h> #include <linux/module.h> #include <linux/bio.h> #include <linux/blkdev.h> @@ -106,6 +107,19 @@ static ssize_t queue_max_sectors_show(struct request_queue *q, char *page) return queue_var_show(max_sectors_kb, (page)); } +static ssize_t queue_max_segments_show(struct request_queue *q, char *page) +{ + return queue_var_show(queue_max_segments(q), (page)); +} + +static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page) +{ + if (test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags)) + return queue_var_show(queue_max_segment_size(q), (page)); + + return queue_var_show(PAGE_CACHE_SIZE, (page)); +} + static ssize_t queue_logical_block_size_show(struct request_queue *q, char *page) { return queue_var_show(queue_logical_block_size(q), page); @@ -166,30 +180,41 @@ static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page) return queue_var_show(max_hw_sectors_kb, (page)); } -static ssize_t queue_nonrot_show(struct request_queue *q, char *page) -{ - return queue_var_show(!blk_queue_nonrot(q), page); +#define QUEUE_SYSFS_BIT_FNS(name, flag, neg) \ +static ssize_t \ +queue_show_##name(struct request_queue *q, char *page) \ +{ \ + int bit; \ + bit = test_bit(QUEUE_FLAG_##flag, &q->queue_flags); \ + return queue_var_show(neg ? 
!bit : bit, page); \ +} \ +static ssize_t \ +queue_store_##name(struct request_queue *q, const char *page, size_t count) \ +{ \ + unsigned long val; \ + ssize_t ret; \ + ret = queue_var_store(&val, page, count); \ + if (neg) \ + val = !val; \ + \ + spin_lock_irq(q->queue_lock); \ + if (val) \ + queue_flag_set(QUEUE_FLAG_##flag, q); \ + else \ + queue_flag_clear(QUEUE_FLAG_##flag, q); \ + spin_unlock_irq(q->queue_lock); \ + return ret; \ } -static ssize_t queue_nonrot_store(struct request_queue *q, const char *page, - size_t count) -{ - unsigned long nm; - ssize_t ret = queue_var_store(&nm, page, count); - - spin_lock_irq(q->queue_lock); - if (nm) - queue_flag_clear(QUEUE_FLAG_NONROT, q); - else - queue_flag_set(QUEUE_FLAG_NONROT, q); - spin_unlock_irq(q->queue_lock); - - return ret; -} +QUEUE_SYSFS_BIT_FNS(nonrot, NONROT, 1); +QUEUE_SYSFS_BIT_FNS(random, ADD_RANDOM, 0); +QUEUE_SYSFS_BIT_FNS(iostats, IO_STAT, 0); +#undef QUEUE_SYSFS_BIT_FNS static ssize_t queue_nomerges_show(struct request_queue *q, char *page) { - return queue_var_show(blk_queue_nomerges(q), page); + return queue_var_show((blk_queue_nomerges(q) << 1) | + blk_queue_noxmerges(q), page); } static ssize_t queue_nomerges_store(struct request_queue *q, const char *page, @@ -199,10 +224,12 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page, ssize_t ret = queue_var_store(&nm, page, count); spin_lock_irq(q->queue_lock); - if (nm) + queue_flag_clear(QUEUE_FLAG_NOMERGES, q); + queue_flag_clear(QUEUE_FLAG_NOXMERGES, q); + if (nm == 2) queue_flag_set(QUEUE_FLAG_NOMERGES, q); - else - queue_flag_clear(QUEUE_FLAG_NOMERGES, q); + else if (nm) + queue_flag_set(QUEUE_FLAG_NOXMERGES, q); spin_unlock_irq(q->queue_lock); return ret; @@ -233,27 +260,6 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count) return ret; } -static ssize_t queue_iostats_show(struct request_queue *q, char *page) -{ - return queue_var_show(blk_queue_io_stat(q), page); -} - -static ssize_t queue_iostats_store(struct request_queue *q, const char *page, - size_t count) -{ - unsigned long stats; - ssize_t ret = queue_var_store(&stats, page, count); - - spin_lock_irq(q->queue_lock); - if (stats) - queue_flag_set(QUEUE_FLAG_IO_STAT, q); - else - queue_flag_clear(QUEUE_FLAG_IO_STAT, q); - spin_unlock_irq(q->queue_lock); - - return ret; -} - static struct queue_sysfs_entry queue_requests_entry = { .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, .show = queue_requests_show, @@ -277,6 +283,16 @@ static struct queue_sysfs_entry queue_max_hw_sectors_entry = { .show = queue_max_hw_sectors_show, }; +static struct queue_sysfs_entry queue_max_segments_entry = { + .attr = {.name = "max_segments", .mode = S_IRUGO }, + .show = queue_max_segments_show, +}; + +static struct queue_sysfs_entry queue_max_segment_size_entry = { + .attr = {.name = "max_segment_size", .mode = S_IRUGO }, + .show = queue_max_segment_size_show, +}; + static struct queue_sysfs_entry queue_iosched_entry = { .attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR }, .show = elv_iosched_show, @@ -325,8 +341,8 @@ static struct queue_sysfs_entry queue_discard_zeroes_data_entry = { static struct queue_sysfs_entry queue_nonrot_entry = { .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR }, - .show = queue_nonrot_show, - .store = queue_nonrot_store, + .show = queue_show_nonrot, + .store = queue_store_nonrot, }; static struct queue_sysfs_entry queue_nomerges_entry = { @@ -343,8 +359,14 @@ static struct queue_sysfs_entry queue_rq_affinity_entry = { 
static struct queue_sysfs_entry queue_iostats_entry = { .attr = {.name = "iostats", .mode = S_IRUGO | S_IWUSR }, - .show = queue_iostats_show, - .store = queue_iostats_store, + .show = queue_show_iostats, + .store = queue_store_iostats, +}; + +static struct queue_sysfs_entry queue_random_entry = { + .attr = {.name = "add_random", .mode = S_IRUGO | S_IWUSR }, + .show = queue_show_random, + .store = queue_store_random, }; static struct attribute *default_attrs[] = { @@ -352,6 +374,8 @@ static struct attribute *default_attrs[] = { &queue_ra_entry.attr, &queue_max_hw_sectors_entry.attr, &queue_max_sectors_entry.attr, + &queue_max_segments_entry.attr, + &queue_max_segment_size_entry.attr, &queue_iosched_entry.attr, &queue_hw_sector_size_entry.attr, &queue_logical_block_size_entry.attr, @@ -365,6 +389,7 @@ static struct attribute *default_attrs[] = { &queue_nomerges_entry.attr, &queue_rq_affinity_entry.attr, &queue_iostats_entry.attr, + &queue_random_entry.attr, NULL, }; @@ -447,7 +472,7 @@ static void blk_release_queue(struct kobject *kobj) kmem_cache_free(blk_requestq_cachep, q); } -static struct sysfs_ops queue_sysfs_ops = { +static const struct sysfs_ops queue_sysfs_ops = { .show = queue_attr_show, .store = queue_attr_store, }; @@ -486,6 +511,7 @@ int blk_register_queue(struct gendisk *disk) kobject_uevent(&q->kobj, KOBJ_REMOVE); kobject_del(&q->kobj); blk_trace_remove_sysfs(disk_to_dev(disk)); + kobject_put(&dev->kobj); return ret; } diff --git a/block/blk-tag.c b/block/blk-tag.c index 6b0f52c20964..ece65fc4c79b 100644 --- a/block/blk-tag.c +++ b/block/blk-tag.c @@ -5,6 +5,7 @@ #include <linux/module.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/slab.h> #include "blk.h" diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 1ba7e0aca878..4f0c06c7a338 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -109,6 +109,7 @@ void blk_rq_timed_out_timer(unsigned long data) struct request_queue *q = (struct request_queue *) data; unsigned long flags, next = 0; struct request *rq, *tmp; + int next_set = 0; spin_lock_irqsave(q->queue_lock, flags); @@ -122,16 +123,13 @@ void blk_rq_timed_out_timer(unsigned long data) if (blk_mark_rq_complete(rq)) continue; blk_rq_timed_out(rq); - } else if (!next || time_after(next, rq->deadline)) + } else if (!next_set || time_after(next, rq->deadline)) { next = rq->deadline; + next_set = 1; + } } - /* - * next can never be 0 here with the list non-empty, since we always - * bump ->deadline to 1 so we can detect if the timer was ever added - * or not. 
See comment in blk_add_timer() - */ - if (next) + if (next_set) mod_timer(&q->timeout, round_jiffies_up(next)); spin_unlock_irqrestore(q->queue_lock, flags); diff --git a/block/blk.h b/block/blk.h index 5ee3d7e72feb..d6b911ac002c 100644 --- a/block/blk.h +++ b/block/blk.h @@ -142,14 +142,18 @@ static inline int queue_congestion_off_threshold(struct request_queue *q) static inline int blk_cpu_to_group(int cpu) { + int group = NR_CPUS; #ifdef CONFIG_SCHED_MC const struct cpumask *mask = cpu_coregroup_mask(cpu); - return cpumask_first(mask); + group = cpumask_first(mask); #elif defined(CONFIG_SCHED_SMT) - return cpumask_first(topology_thread_cpumask(cpu)); + group = cpumask_first(topology_thread_cpumask(cpu)); #else return cpu; #endif + if (likely(group < NR_CPUS)) + return group; + return cpu; } /* @@ -161,8 +165,10 @@ static inline int blk_cpu_to_group(int cpu) */ static inline int blk_do_io_stat(struct request *rq) { - return rq->rq_disk && blk_rq_io_stat(rq) && - (blk_fs_request(rq) || blk_discard_rq(rq)); + return rq->rq_disk && + (rq->cmd_flags & REQ_IO_STAT) && + (rq->cmd_type == REQ_TYPE_FS || + (rq->cmd_flags & REQ_DISCARD)); } #endif diff --git a/block/bsg.c b/block/bsg.c index a9fd2d84b53a..0c00870553a3 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -21,6 +21,7 @@ #include <linux/idr.h> #include <linux/bsg.h> #include <linux/smp_lock.h> +#include <linux/slab.h> #include <scsi/scsi.h> #include <scsi/scsi_ioctl.h> @@ -260,7 +261,7 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm, return ERR_PTR(ret); /* - * map scatter-gather elements seperately and string them to request + * map scatter-gather elements separately and string them to request */ rq = blk_get_request(q, rw, GFP_KERNEL); if (!rq) @@ -425,7 +426,7 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr, /* * fill in all the output members */ - hdr->device_status = status_byte(rq->errors); + hdr->device_status = rq->errors & 0xff; hdr->transport_status = host_byte(rq->errors); hdr->driver_status = driver_byte(rq->errors); hdr->info = 0; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index ee130f14d1fc..9eba291eb6fd 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -7,19 +7,20 @@ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> */ #include <linux/module.h> +#include <linux/slab.h> #include <linux/blkdev.h> #include <linux/elevator.h> #include <linux/jiffies.h> #include <linux/rbtree.h> #include <linux/ioprio.h> #include <linux/blktrace_api.h> -#include "blk-cgroup.h" +#include "cfq.h" /* * tunables */ /* max queue in one round of service */ -static const int cfq_quantum = 4; +static const int cfq_quantum = 8; static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; /* maximum backwards seek, in KiB */ static const int cfq_back_max = 16 * 1024; @@ -29,6 +30,7 @@ static const int cfq_slice_sync = HZ / 10; static int cfq_slice_async = HZ / 25; static const int cfq_slice_async_rq = 2; static int cfq_slice_idle = HZ / 125; +static int cfq_group_idle = HZ / 125; static const int cfq_target_latency = HZ * 3/10; /* 300 ms */ static const int cfq_hist_divisor = 4; @@ -42,19 +44,19 @@ static const int cfq_hist_divisor = 4; */ #define CFQ_MIN_TT (2) -/* - * Allow merged cfqqs to perform this amount of seeky I/O before - * deciding to break the queues up again. 
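The blk-timeout rework above drops the old "deadline can never be 0" sentinel in favour of an explicit next_set flag. The same min-tracking pattern in a standalone sketch (plain C; the kernel version compares jiffies with time_after() to survive wraparound):

#include <stdbool.h>

/* Sketch: find the earliest deadline without reserving a sentinel
 * value, mirroring the next/next_set pair in blk_rq_timed_out_timer(). */
static bool earliest_deadline(const unsigned long *deadline, int n,
			      unsigned long *out)
{
	unsigned long next = 0;
	bool next_set = false;
	int i;

	for (i = 0; i < n; i++) {
		if (!next_set || deadline[i] < next) {
			next = deadline[i];
			next_set = true;
		}
	}
	if (next_set)
		*out = next;
	return next_set;	/* arm the timer only when something is pending */
}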
- */ -#define CFQQ_COOP_TOUT (HZ) - #define CFQ_SLICE_SCALE (5) #define CFQ_HW_QUEUE_MIN (5) #define CFQ_SERVICE_SHIFT 12 +#define CFQQ_SEEK_THR (sector_t)(8 * 100) +#define CFQQ_CLOSE_THR (sector_t)(8 * 1024) +#define CFQQ_SECT_THR_NONROT (sector_t)(2 * 32) +#define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8) + #define RQ_CIC(rq) \ ((struct cfq_io_context *) (rq)->elevator_private) #define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2) +#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private3) static struct kmem_cache *cfq_pool; static struct kmem_cache *cfq_ioc_pool; @@ -63,6 +65,9 @@ static DEFINE_PER_CPU(unsigned long, cfq_ioc_count); static struct completion *ioc_gone; static DEFINE_SPINLOCK(ioc_gone_lock); +static DEFINE_SPINLOCK(cic_index_lock); +static DEFINE_IDA(cic_index_ida); + #define CFQ_PRIO_LISTS IOPRIO_BE_NR #define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) #define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) @@ -80,11 +85,12 @@ struct cfq_rb_root { struct rb_root rb; struct rb_node *left; unsigned count; + unsigned total_weight; u64 min_vdisktime; struct rb_node *active; - unsigned total_weight; }; -#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, 0, 0, } +#define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \ + .count = 0, .min_vdisktime = 0, } /* * Per process-grouping structure @@ -118,11 +124,11 @@ struct cfq_queue { /* time when queue got scheduled in to dispatch first request. */ unsigned long dispatch_start; unsigned int allocated_slice; + unsigned int slice_dispatch; /* time when first request from queue completed and slice started. */ unsigned long slice_start; unsigned long slice_end; long slice_resid; - unsigned int slice_dispatch; /* pending metadata requests */ int meta_pending; @@ -133,19 +139,16 @@ struct cfq_queue { unsigned short ioprio, org_ioprio; unsigned short ioprio_class, org_ioprio_class; - unsigned int seek_samples; - u64 seek_total; - sector_t seek_mean; - sector_t last_request_pos; - unsigned long seeky_start; - pid_t pid; + u32 seek_history; + sector_t last_request_pos; + struct cfq_rb_root *service_tree; struct cfq_queue *new_cfqq; struct cfq_group *cfqg; struct cfq_group *orig_cfqg; - /* Sectors dispatched in current dispatch round */ + /* Number of sectors dispatched from queue in single dispatch round */ unsigned long nr_sectors; }; @@ -198,6 +201,8 @@ struct cfq_group { struct hlist_node cfqd_node; atomic_t ref; #endif + /* number of requests that are on the dispatch list or inside driver */ + int dispatched; }; /* @@ -227,8 +232,8 @@ struct cfq_data { unsigned int busy_queues; - int rq_in_driver[2]; - int sync_flight; + int rq_in_driver; + int rq_in_flight[2]; /* * queue-depth detection @@ -271,9 +276,11 @@ struct cfq_data { unsigned int cfq_slice[2]; unsigned int cfq_slice_async_rq; unsigned int cfq_slice_idle; + unsigned int cfq_group_idle; unsigned int cfq_latency; unsigned int cfq_group_isolation; + unsigned int cic_index; struct list_head cic_list; /* @@ -314,6 +321,7 @@ enum cfqq_state_flags { CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */ CFQ_CFQQ_FLAG_sync, /* synchronous queue */ CFQ_CFQQ_FLAG_coop, /* cfqq is shared */ + CFQ_CFQQ_FLAG_split_coop, /* shared cfqq will be splitted */ CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */ CFQ_CFQQ_FLAG_wait_busy, /* Waiting for next request */ }; @@ -342,11 +350,12 @@ CFQ_CFQQ_FNS(prio_changed); CFQ_CFQQ_FNS(slice_new); CFQ_CFQQ_FNS(sync); CFQ_CFQQ_FNS(coop); 
+CFQ_CFQQ_FNS(split_coop); CFQ_CFQQ_FNS(deep); CFQ_CFQQ_FNS(wait_busy); #undef CFQ_CFQQ_FNS -#ifdef CONFIG_DEBUG_CFQ_IOSCHED +#ifdef CONFIG_CFQ_GROUP_IOSCHED #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ @@ -375,6 +384,21 @@ CFQ_CFQQ_FNS(wait_busy); &cfqg->service_trees[i][j]: NULL) \ +static inline bool iops_mode(struct cfq_data *cfqd) +{ + /* + * If we are not idling on queues and it is a NCQ drive, parallel + * execution of requests is on and measuring time is not possible + * in most of the cases until and unless we drive shallower queue + * depths and that becomes a performance bottleneck. In such cases + * switch to start providing fairness in terms of number of IOs. + */ + if (!cfqd->cfq_slice_idle && cfqd->hw_tag) + return true; + else + return false; +} + static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq) { if (cfq_class_idle(cfqq)) @@ -419,11 +443,6 @@ static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool, static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *, struct io_context *); -static inline int rq_in_driver(struct cfq_data *cfqd) -{ - return cfqd->rq_in_driver[0] + cfqd->rq_in_driver[1]; -} - static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic, bool is_sync) { @@ -436,13 +455,31 @@ static inline void cic_set_cfqq(struct cfq_io_context *cic, cic->cfqq[is_sync] = cfqq; } +#define CIC_DEAD_KEY 1ul +#define CIC_DEAD_INDEX_SHIFT 1 + +static inline void *cfqd_dead_key(struct cfq_data *cfqd) +{ + return (void *)(cfqd->cic_index << CIC_DEAD_INDEX_SHIFT | CIC_DEAD_KEY); +} + +static inline struct cfq_data *cic_to_cfqd(struct cfq_io_context *cic) +{ + struct cfq_data *cfqd = cic->key; + + if (unlikely((unsigned long) cfqd & CIC_DEAD_KEY)) + return NULL; + + return cfqd; +} + /* * We regard a request as SYNC, if it's either a read or has the SYNC bit * set (in which case it could also be direct WRITE). 
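A note on the cic->key change in this hunk: rather than keeping a separate dead_key field, the key pointer itself now carries a liveness bit in bit 0 plus the cic_index needed for radix-tree cleanup. The pointer-tagging idiom in a standalone sketch (names invented; it works because real cfqd pointers are at least word-aligned, leaving the low bit free):

#include <stdio.h>

/* Sketch of the CIC_DEAD_KEY idiom: bit 0 marks the key "dead" while
 * the remaining bits keep an index recovered later for cleanup. */
#define DEAD_KEY	 1UL
#define DEAD_INDEX_SHIFT 1

static void *make_dead_key(unsigned long index)
{
	return (void *)(index << DEAD_INDEX_SHIFT | DEAD_KEY);
}

static int key_is_dead(void *key)
{
	return (unsigned long)key & DEAD_KEY;
}

static unsigned long dead_key_index(void *key)
{
	return (unsigned long)key >> DEAD_INDEX_SHIFT;
}

int main(void)
{
	void *key = make_dead_key(42);

	if (key_is_dead(key))
		printf("cleanup index %lu\n", dead_key_index(key)); /* 42 */
	return 0;
}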
*/ static inline bool cfq_bio_sync(struct bio *bio) { - return bio_data_dir(bio) == READ || bio_rw_flagged(bio, BIO_RW_SYNCIO); + return bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC); } /* @@ -630,9 +667,10 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, return rq1; else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) return rq2; - if (rq_is_meta(rq1) && !rq_is_meta(rq2)) + if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) return rq1; - else if (rq_is_meta(rq2) && !rq_is_meta(rq1)) + else if ((rq2->cmd_flags & REQ_META) && + !(rq1->cmd_flags & REQ_META)) return rq2; s1 = blk_rq_pos(rq1); @@ -863,7 +901,7 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) if (!RB_EMPTY_NODE(&cfqg->rb_node)) cfq_rb_erase(&cfqg->rb_node, st); cfqg->saved_workload_slice = 0; - blkiocg_update_blkio_group_dequeue_stats(&cfqg->blkg, 1); + cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1); } static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) @@ -889,8 +927,6 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) slice_used = cfqq->allocated_slice; } - cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u sect=%lu", slice_used, - cfqq->nr_sectors); return slice_used; } @@ -898,19 +934,21 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, struct cfq_queue *cfqq) { struct cfq_rb_root *st = &cfqd->grp_service_tree; - unsigned int used_sl, charge_sl; + unsigned int used_sl, charge; int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) - cfqg->service_tree_idle.count; BUG_ON(nr_sync < 0); - used_sl = charge_sl = cfq_cfqq_slice_usage(cfqq); + used_sl = charge = cfq_cfqq_slice_usage(cfqq); - if (!cfq_cfqq_sync(cfqq) && !nr_sync) - charge_sl = cfqq->allocated_slice; + if (iops_mode(cfqd)) + charge = cfqq->slice_dispatch; + else if (!cfq_cfqq_sync(cfqq) && !nr_sync) + charge = cfqq->allocated_slice; /* Can't update vdisktime while group is on service tree */ cfq_rb_erase(&cfqg->rb_node, st); - cfqg->vdisktime += cfq_scale_slice(charge_sl, cfqg); + cfqg->vdisktime += cfq_scale_slice(charge, cfqg); __cfq_group_service_tree_add(st, cfqg); /* This group is being expired. 
Save the context */ @@ -924,8 +962,11 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, st->min_vdisktime); - blkiocg_update_blkio_group_stats(&cfqg->blkg, used_sl, - cfqq->nr_sectors); + cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u" + " sect=%u", used_sl, cfqq->slice_dispatch, charge, + iops_mode(cfqd), cfqq->nr_sectors); + cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl); + cfq_blkiocg_set_start_empty_time(&cfqg->blkg); } #ifdef CONFIG_CFQ_GROUP_IOSCHED @@ -953,11 +994,12 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; unsigned int major, minor; - /* Do we need to take this reference */ - if (!blkiocg_css_tryget(blkcg)) - return NULL;; - cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); + if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); + cfqg->blkg.dev = MKDEV(major, minor); + goto done; + } if (cfqg || !create) goto done; @@ -965,7 +1007,6 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) if (!cfqg) goto done; - cfqg->weight = blkcg->weight; for_each_cfqg_st(cfqg, i, j, st) *st = CFQ_RB_ROOT; RB_CLEAR_NODE(&cfqg->rb_node); @@ -978,16 +1019,26 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) */ atomic_set(&cfqg->ref, 1); - /* Add group onto cgroup list */ - sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); - blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, + /* + * Add group onto cgroup list. It might happen that bdi->dev is + * not initialized yet. Initialize this new group without major + * and minor info and this info will be filled in once a new thread + * comes for IO. See code above. + */ + if (bdi->dev) { + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); + cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, MKDEV(major, minor)); + } else + cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, + 0); + + cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); /* Add group on cfqd list */ hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); done: - blkiocg_css_put(blkcg); return cfqg; } @@ -1009,6 +1060,12 @@ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) return cfqg; } +static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg) +{ + atomic_inc(&cfqg->ref); + return cfqg; +} + static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { /* Currently, all async queues are mapped to root group */ @@ -1058,7 +1115,7 @@ static void cfq_release_cfq_groups(struct cfq_data *cfqd) * it from cgroup list, then it will take care of destroying * cfqg also.
*/ - if (!blkiocg_del_blkio_group(&cfqg->blkg)) + if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg)) cfq_destroy_cfqg(cfqd, cfqg); } } @@ -1092,6 +1149,12 @@ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) { return &cfqd->root_group; } + +static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg) +{ + return cfqg; +} + static inline void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { cfqq->cfqg = cfqg; @@ -1394,7 +1457,12 @@ static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq) { elv_rb_del(&cfqq->sort_list, rq); cfqq->queued[rq_is_sync(rq)]--; + cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, + rq_data_dir(rq), rq_is_sync(rq)); cfq_add_rq_rb(rq); + cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg, + &cfqq->cfqd->serving_group->blkg, rq_data_dir(rq), + rq_is_sync(rq)); } static struct request * @@ -1422,9 +1490,9 @@ static void cfq_activate_request(struct request_queue *q, struct request *rq) { struct cfq_data *cfqd = q->elevator->elevator_data; - cfqd->rq_in_driver[rq_is_sync(rq)]++; + cfqd->rq_in_driver++; cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d", - rq_in_driver(cfqd)); + cfqd->rq_in_driver); cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); } @@ -1432,12 +1500,11 @@ static void cfq_activate_request(struct request_queue *q, struct request *rq) static void cfq_deactivate_request(struct request_queue *q, struct request *rq) { struct cfq_data *cfqd = q->elevator->elevator_data; - const int sync = rq_is_sync(rq); - WARN_ON(!cfqd->rq_in_driver[sync]); - cfqd->rq_in_driver[sync]--; + WARN_ON(!cfqd->rq_in_driver); + cfqd->rq_in_driver--; cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d", - rq_in_driver(cfqd)); + cfqd->rq_in_driver); } static void cfq_remove_request(struct request *rq) @@ -1451,7 +1518,9 @@ static void cfq_remove_request(struct request *rq) cfq_del_rq_rb(rq); cfqq->cfqd->rq_queued--; - if (rq_is_meta(rq)) { + cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, + rq_data_dir(rq), rq_is_sync(rq)); + if (rq->cmd_flags & REQ_META) { WARN_ON(!cfqq->meta_pending); cfqq->meta_pending--; } @@ -1482,6 +1551,13 @@ static void cfq_merged_request(struct request_queue *q, struct request *req, } } +static void cfq_bio_merged(struct request_queue *q, struct request *req, + struct bio *bio) +{ + cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(req))->blkg, + bio_data_dir(bio), cfq_bio_sync(bio)); +} + static void cfq_merged_requests(struct request_queue *q, struct request *rq, struct request *next) @@ -1499,6 +1575,8 @@ cfq_merged_requests(struct request_queue *q, struct request *rq, if (cfqq->next_rq == next) cfqq->next_rq = rq; cfq_remove_request(next); + cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg, + rq_data_dir(next), rq_is_sync(next)); } static int cfq_allow_merge(struct request_queue *q, struct request *rq, @@ -1526,11 +1604,19 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq, return cfqq == RQ_CFQQ(rq); } +static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) +{ + del_timer(&cfqd->idle_slice_timer); + cfq_blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg); +} + static void __cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) { if (cfqq) { - cfq_log_cfqq(cfqd, cfqq, "set_active"); + cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d", + cfqd->serving_prio, cfqd->serving_type); + cfq_blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg); cfqq->slice_start = 0; cfqq->dispatch_start = 
jiffies; cfqq->allocated_slice = 0; @@ -1544,7 +1630,7 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd, cfq_clear_cfqq_fifo_expire(cfqq); cfq_mark_cfqq_slice_new(cfqq); - del_timer(&cfqd->idle_slice_timer); + cfq_del_timer(cfqd, cfqq); } cfqd->active_queue = cfqq; @@ -1560,12 +1646,21 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out); if (cfq_cfqq_wait_request(cfqq)) - del_timer(&cfqd->idle_slice_timer); + cfq_del_timer(cfqd, cfqq); cfq_clear_cfqq_wait_request(cfqq); cfq_clear_cfqq_wait_busy(cfqq); /* + * If this cfqq is shared between multiple processes, check to + * make sure that those processes are still issuing I/Os within + * the mean seek distance. If not, it may be time to break the + * queues apart again. + */ + if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq)) + cfq_mark_cfqq_split_coop(cfqq); + + /* * store what was left of this slice, if the queue idled/timed out */ if (timed_out && !cfq_cfqq_slice_new(cfqq)) { @@ -1663,22 +1758,10 @@ static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd, return cfqd->last_position - blk_rq_pos(rq); } -#define CFQQ_SEEK_THR 8 * 1024 -#define CFQQ_SEEKY(cfqq) ((cfqq)->seek_mean > CFQQ_SEEK_THR) - static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq, - struct request *rq, bool for_preempt) + struct request *rq) { - sector_t sdist = cfqq->seek_mean; - - if (!sample_valid(cfqq->seek_samples)) - sdist = CFQQ_SEEK_THR; - - /* if seek_mean is big, using it as close criteria is meaningless */ - if (sdist > CFQQ_SEEK_THR && !for_preempt) - sdist = CFQQ_SEEK_THR; - - return cfq_dist_from_last(cfqd, rq) <= sdist; + return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR; } static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, @@ -1705,7 +1788,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, * will contain the closest sector. */ __cfqq = rb_entry(parent, struct cfq_queue, p_node); - if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq, false)) + if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) return __cfqq; if (blk_rq_pos(__cfqq->next_rq) < sector) @@ -1716,7 +1799,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, return NULL; __cfqq = rb_entry(node, struct cfq_queue, p_node); - if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq, false)) + if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) return __cfqq; return NULL; @@ -1737,6 +1820,8 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, { struct cfq_queue *cfqq; + if (cfq_class_idle(cur_cfqq)) + return NULL; if (!cfq_cfqq_sync(cur_cfqq)) return NULL; if (CFQQ_SEEKY(cur_cfqq)) @@ -1790,6 +1875,9 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) BUG_ON(!service_tree); BUG_ON(!service_tree->count); + if (!cfqd->cfq_slice_idle) + return false; + /* We never do for idle class queues. */ if (prio == IDLE_WORKLOAD) return false; @@ -1803,14 +1891,18 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) * Otherwise, we do only if they are the last ones * in their service tree. */ - return service_tree->count == 1; + if (service_tree->count == 1 && cfq_cfqq_sync(cfqq)) + return 1; + cfq_log_cfqq(cfqd, cfqq, "Not idling. 
st->count:%d", + service_tree->count); + return 0; } static void cfq_arm_slice_timer(struct cfq_data *cfqd) { struct cfq_queue *cfqq = cfqd->active_queue; struct cfq_io_context *cic; - unsigned long sl; + unsigned long sl, group_idle = 0; /* * SSD device without seek penalty, disable idling. But only do so @@ -1826,8 +1918,13 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) /* * idle is disabled, either manually or by past process history */ - if (!cfqd->cfq_slice_idle || !cfq_should_idle(cfqd, cfqq)) - return; + if (!cfq_should_idle(cfqd, cfqq)) { + /* no queue idling. Check for group idling */ + if (cfqd->cfq_group_idle) + group_idle = cfqd->cfq_group_idle; + else + return; + } /* * still active requests from this queue, don't idle @@ -1848,15 +1945,27 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) * time slice. */ if (sample_valid(cic->ttime_samples) && - (cfqq->slice_end - jiffies < cic->ttime_mean)) + (cfqq->slice_end - jiffies < cic->ttime_mean)) { + cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%d", + cic->ttime_mean); + return; + } + + /* There are other queues in the group, don't do group idle */ + if (group_idle && cfqq->cfqg->nr_cfqq > 1) return; cfq_mark_cfqq_wait_request(cfqq); - sl = cfqd->cfq_slice_idle; + if (group_idle) + sl = cfqd->cfq_group_idle; + else + sl = cfqd->cfq_slice_idle; mod_timer(&cfqd->idle_slice_timer, jiffies + sl); - cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl); + cfq_blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg); + cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl, + group_idle ? 1 : 0); } /* @@ -1872,11 +1981,13 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq); cfq_remove_request(rq); cfqq->dispatched++; + (RQ_CFQG(rq))->dispatched++; elv_dispatch_sort(q, rq); - if (cfq_cfqq_sync(cfqq)) - cfqd->sync_flight++; + cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; cfqq->nr_sectors += blk_rq_sectors(rq); + cfq_blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq), + rq_data_dir(rq), rq_is_sync(rq)); } /* @@ -1930,6 +2041,15 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq) int process_refs, new_process_refs; struct cfq_queue *__cfqq; + /* + * If there are no process references on the new_cfqq, then it is + * unsafe to follow the ->new_cfqq chain as other cfqq's in the + * chain may have dropped their last reference (not just their + * last process reference). + */ + if (!cfqq_process_refs(new_cfqq)) + return; + /* Avoid a circular list and skip interim queue merges */ while ((__cfqq = new_cfqq->new_cfqq)) { if (__cfqq == cfqq) @@ -1938,17 +2058,17 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq) } process_refs = cfqq_process_refs(cfqq); + new_process_refs = cfqq_process_refs(new_cfqq); /* * If the process for the cfqq has gone away, there is no * sense in merging the queues. */ - if (process_refs == 0) + if (process_refs == 0 || new_process_refs == 0) return; /* * Merge in the direction of the lesser amount of work. 
*/ - new_process_refs = cfqq_process_refs(new_cfqq); if (new_process_refs >= process_refs) { cfqq->new_cfqq = new_cfqq; atomic_add(process_refs, &new_cfqq->ref); @@ -2058,6 +2178,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) slice = max(slice, 2 * cfqd->cfq_slice_idle); slice = max_t(unsigned, slice, CFQ_MIN_TT); + cfq_log(cfqd, "workload slice:%d", slice); cfqd->workload_expires = jiffies + slice; cfqd->noidle_tree_requires_idle = false; } @@ -2131,7 +2252,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) cfqq = NULL; goto keep_queue; } else - goto expire; + goto check_group_idle; } /* @@ -2159,8 +2280,23 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) * flight or is idling for a new request, allow either of these * conditions to happen (or time out) before selecting a new queue. */ - if (timer_pending(&cfqd->idle_slice_timer) || - (cfqq->dispatched && cfq_should_idle(cfqd, cfqq))) { + if (timer_pending(&cfqd->idle_slice_timer)) { + cfqq = NULL; + goto keep_queue; + } + + if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) { + cfqq = NULL; + goto keep_queue; + } + + /* + * If group idle is enabled and there are requests dispatched from + * this group, wait for requests to complete. + */ +check_group_idle: + if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 + && cfqq->cfqg->dispatched) { cfqq = NULL; goto keep_queue; } @@ -2205,16 +2341,32 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd) struct cfq_queue *cfqq; int dispatched = 0; - while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) + /* Expire the timeslice of the current active queue first */ + cfq_slice_expired(cfqd, 0); + while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) { + __cfq_set_active_queue(cfqd, cfqq); dispatched += __cfq_forced_dispatch_cfqq(cfqq); + } - cfq_slice_expired(cfqd, 0); BUG_ON(cfqd->busy_queues); cfq_log(cfqd, "forced_dispatch=%d", dispatched); return dispatched; } +static inline bool cfq_slice_used_soon(struct cfq_data *cfqd, + struct cfq_queue *cfqq) +{ + /* the queue hasn't finished any request, can't estimate */ + if (cfq_cfqq_slice_new(cfqq)) + return 1; + if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched, + cfqq->slice_end)) + return 1; + + return 0; +} + static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) { unsigned int max_dispatch; @@ -2222,16 +2374,16 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) /* * Drain async requests before we start sync IO */ - if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_driver[BLK_RW_ASYNC]) + if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC]) return false; /* * If this is an async queue and we have sync IO in flight, let it wait */ - if (cfqd->sync_flight && !cfq_cfqq_sync(cfqq)) + if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq)) return false; - max_dispatch = cfqd->cfq_quantum; + max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1); if (cfq_class_idle(cfqq)) max_dispatch = 1; @@ -2248,13 +2400,22 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) /* * We have other queues, don't allow more IO from this one */ - if (cfqd->busy_queues > 1) + if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq)) return false; /* * Sole queue user, no limit */ - max_dispatch = -1; + if (cfqd->busy_queues == 1) + max_dispatch = -1; + else + /* + * Normally we start throttling cfqq when cfq_quantum/2 + * requests have been dispatched. 
But we can drive + * deeper queue depths at the beginning of slice + * subjected to upper limit of cfq_quantum. + * */ + max_dispatch = cfqd->cfq_quantum; } /* @@ -2450,11 +2611,12 @@ static void cfq_cic_free(struct cfq_io_context *cic) static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic) { unsigned long flags; + unsigned long dead_key = (unsigned long) cic->key; - BUG_ON(!cic->dead_key); + BUG_ON(!(dead_key & CIC_DEAD_KEY)); spin_lock_irqsave(&ioc->lock, flags); - radix_tree_delete(&ioc->radix_root, cic->dead_key); + radix_tree_delete(&ioc->radix_root, dead_key >> CIC_DEAD_INDEX_SHIFT); hlist_del_rcu(&cic->cic_list); spin_unlock_irqrestore(&ioc->lock, flags); @@ -2477,15 +2639,10 @@ static void cfq_free_io_context(struct io_context *ioc) __call_for_each_cic(ioc, cic_free_func); } -static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) +static void cfq_put_cooperator(struct cfq_queue *cfqq) { struct cfq_queue *__cfqq, *next; - if (unlikely(cfqq == cfqd->active_queue)) { - __cfq_slice_expired(cfqd, cfqq, 0); - cfq_schedule_dispatch(cfqd); - } - /* * If this queue was scheduled to merge with another queue, be * sure to drop the reference taken on that queue (and others in @@ -2501,6 +2658,16 @@ static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) cfq_put_queue(__cfqq); __cfqq = next; } +} + +static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) +{ + if (unlikely(cfqq == cfqd->active_queue)) { + __cfq_slice_expired(cfqd, cfqq, 0); + cfq_schedule_dispatch(cfqd); + } + + cfq_put_cooperator(cfqq); cfq_put_queue(cfqq); } @@ -2513,11 +2680,10 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd, list_del_init(&cic->queue_list); /* - * Make sure key == NULL is seen for dead queues + * Make sure dead mark is seen for dead queues */ smp_wmb(); - cic->dead_key = (unsigned long) cic->key; - cic->key = NULL; + cic->key = cfqd_dead_key(cfqd); if (ioc->ioc_data == cic) rcu_assign_pointer(ioc->ioc_data, NULL); @@ -2536,7 +2702,7 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd, static void cfq_exit_single_io_context(struct io_context *ioc, struct cfq_io_context *cic) { - struct cfq_data *cfqd = cic->key; + struct cfq_data *cfqd = cic_to_cfqd(cic); if (cfqd) { struct request_queue *q = cfqd->queue; @@ -2549,7 +2715,7 @@ static void cfq_exit_single_io_context(struct io_context *ioc, * race between exiting task and queue */ smp_read_barrier_depends(); - if (cic->key) + if (cic->key == cfqd) __cfq_exit_single_io_context(cfqd, cic); spin_unlock_irqrestore(q->queue_lock, flags); @@ -2629,7 +2795,7 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic) { - struct cfq_data *cfqd = cic->key; + struct cfq_data *cfqd = cic_to_cfqd(cic); struct cfq_queue *cfqq; unsigned long flags; @@ -2686,7 +2852,7 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic) { struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1); - struct cfq_data *cfqd = cic->key; + struct cfq_data *cfqd = cic_to_cfqd(cic); unsigned long flags; struct request_queue *q; @@ -2823,12 +2989,13 @@ cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc, unsigned long flags; WARN_ON(!list_empty(&cic->queue_list)); + BUG_ON(cic->key != cfqd_dead_key(cfqd)); spin_lock_irqsave(&ioc->lock, flags); BUG_ON(ioc->ioc_data == cic); - 
radix_tree_delete(&ioc->radix_root, (unsigned long) cfqd); + radix_tree_delete(&ioc->radix_root, cfqd->cic_index); hlist_del_rcu(&cic->cic_list); spin_unlock_irqrestore(&ioc->lock, flags); @@ -2840,7 +3007,6 @@ cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc) { struct cfq_io_context *cic; unsigned long flags; - void *k; if (unlikely(!ioc)) return NULL; @@ -2857,13 +3023,11 @@ cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc) } do { - cic = radix_tree_lookup(&ioc->radix_root, (unsigned long) cfqd); + cic = radix_tree_lookup(&ioc->radix_root, cfqd->cic_index); rcu_read_unlock(); if (!cic) break; - /* ->key must be copied to avoid race with cfq_exit_queue() */ - k = cic->key; - if (unlikely(!k)) { + if (unlikely(cic->key != cfqd)) { cfq_drop_dead_cic(cfqd, ioc, cic); rcu_read_lock(); continue; @@ -2896,7 +3060,7 @@ static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc, spin_lock_irqsave(&ioc->lock, flags); ret = radix_tree_insert(&ioc->radix_root, - (unsigned long) cfqd, cic); + cfqd->cic_index, cic); if (!ret) hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list); spin_unlock_irqrestore(&ioc->lock, flags); @@ -2976,43 +3140,20 @@ static void cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct request *rq) { - sector_t sdist; - u64 total; - - if (!cfqq->last_request_pos) - sdist = 0; - else if (cfqq->last_request_pos < blk_rq_pos(rq)) - sdist = blk_rq_pos(rq) - cfqq->last_request_pos; - else - sdist = cfqq->last_request_pos - blk_rq_pos(rq); + sector_t sdist = 0; + sector_t n_sec = blk_rq_sectors(rq); + if (cfqq->last_request_pos) { + if (cfqq->last_request_pos < blk_rq_pos(rq)) + sdist = blk_rq_pos(rq) - cfqq->last_request_pos; + else + sdist = cfqq->last_request_pos - blk_rq_pos(rq); + } - /* - * Don't allow the seek distance to get too large from the - * odd fragment, pagein, etc - */ - if (cfqq->seek_samples <= 60) /* second&third seek */ - sdist = min(sdist, (cfqq->seek_mean * 4) + 2*1024*1024); + cfqq->seek_history <<= 1; + if (blk_queue_nonrot(cfqd->queue)) + cfqq->seek_history |= (n_sec < CFQQ_SECT_THR_NONROT); else - sdist = min(sdist, (cfqq->seek_mean * 4) + 2*1024*64); - - cfqq->seek_samples = (7*cfqq->seek_samples + 256) / 8; - cfqq->seek_total = (7*cfqq->seek_total + (u64)256*sdist) / 8; - total = cfqq->seek_total + (cfqq->seek_samples/2); - do_div(total, cfqq->seek_samples); - cfqq->seek_mean = (sector_t)total; - - /* - * If this cfqq is shared between multiple processes, check to - * make sure that those processes are still issuing I/Os within - * the mean seek distance. If not, it may be time to break the - * queues apart again. - */ - if (cfq_cfqq_coop(cfqq)) { - if (CFQQ_SEEKY(cfqq) && !cfqq->seeky_start) - cfqq->seeky_start = jiffies; - else if (!CFQQ_SEEKY(cfqq)) - cfqq->seeky_start = 0; - } + cfqq->seek_history |= (sdist > CFQQ_SEEK_THR); } /* @@ -3037,8 +3178,7 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfq_mark_cfqq_deep(cfqq); if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || - (!cfq_cfqq_deep(cfqq) && sample_valid(cfqq->seek_samples) - && CFQQ_SEEKY(cfqq))) + (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) enable_idle = 0; else if (sample_valid(cic->ttime_samples)) { if (cic->ttime_mean > cfqd->cfq_slice_idle) @@ -3106,7 +3246,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, * So both queues are sync. Let the new request get disk time if * it's a metadata request and the current queue is doing regular IO. 
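[Editor's note] The rewritten cfq_update_io_seektime() above replaces the running seek_mean with a 32-bit shift register: one bit per request, set when that request seeked past a threshold, and CFQQ_SEEKY() becomes a popcount test. A reduced standalone sketch (the 8*1024-sector threshold is an assumption based on the old CFQQ_SEEK_THR, and the separate non-rotational rule shown in the hunk is omitted):

#include <stdint.h>

#define SEEK_THR	(8 * 1024)	/* sectors; assumed value */

static uint32_t seek_history;		/* 1 bit per recent request */

static void update_seek(uint64_t last_pos, uint64_t pos)
{
	uint64_t sdist = pos > last_pos ? pos - last_pos : last_pos - pos;

	/* Shift in one bit: was this request a long seek? */
	seek_history = (seek_history << 1) | (sdist > SEEK_THR);
}

/* "Seeky" when more than 1/8 of the tracked requests were seeks. */
static int seeky(void)
{
	return __builtin_popcount(seek_history) > 32 / 8;
}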
*/ - if (rq_is_meta(rq) && !cfqq->meta_pending) + if ((rq->cmd_flags & REQ_META) && !cfqq->meta_pending) return true; /* @@ -3122,7 +3262,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, * if this request is as-good as one we would expect from the * current cfqq, let it preempt */ - if (cfq_rq_close(cfqd, cfqq, rq, true)) + if (cfq_rq_close(cfqd, cfqq, rq)) return true; return false; @@ -3160,7 +3300,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct cfq_io_context *cic = RQ_CIC(rq); cfqd->rq_queued++; - if (rq_is_meta(rq)) + if (rq->cmd_flags & REQ_META) cfqq->meta_pending++; cfq_update_io_thinktime(cfqd, cic); @@ -3183,11 +3323,14 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, if (cfq_cfqq_wait_request(cfqq)) { if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE || cfqd->busy_queues > 1) { - del_timer(&cfqd->idle_slice_timer); + cfq_del_timer(cfqd, cfqq); cfq_clear_cfqq_wait_request(cfqq); __blk_run_queue(cfqd->queue); - } else + } else { + cfq_blkiocg_update_idle_time_stats( + &cfqq->cfqg->blkg); cfq_mark_cfqq_must_dispatch(cfqq); + } } } else if (cfq_should_preempt(cfqd, cfqq, rq)) { /* @@ -3212,7 +3355,9 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq) rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]); list_add_tail(&rq->queuelist, &cfqq->fifo); cfq_add_rq_rb(rq); - + cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg, + &cfqd->serving_group->blkg, rq_data_dir(rq), + rq_is_sync(rq)); cfq_rq_enqueued(cfqd, cfqq, rq); } @@ -3224,14 +3369,14 @@ static void cfq_update_hw_tag(struct cfq_data *cfqd) { struct cfq_queue *cfqq = cfqd->active_queue; - if (rq_in_driver(cfqd) > cfqd->hw_tag_est_depth) - cfqd->hw_tag_est_depth = rq_in_driver(cfqd); + if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth) + cfqd->hw_tag_est_depth = cfqd->rq_in_driver; if (cfqd->hw_tag == 1) return; if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN && - rq_in_driver(cfqd) <= CFQ_HW_QUEUE_MIN) + cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN) return; /* @@ -3241,7 +3386,7 @@ static void cfq_update_hw_tag(struct cfq_data *cfqd) */ if (cfqq && cfq_cfqq_idle_window(cfqq) && cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] < - CFQ_HW_QUEUE_MIN && rq_in_driver(cfqd) < CFQ_HW_QUEUE_MIN) + CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN) return; if (cfqd->hw_tag_samples++ < 50) @@ -3290,17 +3435,21 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) unsigned long now; now = jiffies; - cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", !!rq_noidle(rq)); + cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", + !!(rq->cmd_flags & REQ_NOIDLE)); cfq_update_hw_tag(cfqd); - WARN_ON(!cfqd->rq_in_driver[sync]); + WARN_ON(!cfqd->rq_in_driver); WARN_ON(!cfqq->dispatched); - cfqd->rq_in_driver[sync]--; + cfqd->rq_in_driver--; cfqq->dispatched--; + (RQ_CFQG(rq))->dispatched--; + cfq_blkiocg_update_completion_stats(&cfqq->cfqg->blkg, + rq_start_time_ns(rq), rq_io_start_time_ns(rq), + rq_data_dir(rq), rq_is_sync(rq)); - if (cfq_cfqq_sync(cfqq)) - cfqd->sync_flight--; + cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; if (sync) { RQ_CIC(rq)->last_end_request = now; @@ -3325,8 +3474,12 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) * the queue. 
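[Editor's note] The consolidated rq_in_driver counter feeds cfq_update_hw_tag() above; the heuristic itself is unchanged: watch the deepest in-driver depth across a window of samples, then decide whether the device is doing its own command queueing. A reduced sketch (the depth and sample thresholds mirror CFQ_HW_QUEUE_MIN and the 50-sample window in the hunk; the extra active-queue checks are omitted):

#define HW_QUEUE_MIN	5	/* depth that implies hardware queueing */
#define HW_TAG_SAMPLES	50	/* how long to observe before deciding */

static int hw_tag = -1;		/* -1 unknown, 0 no, 1 yes */
static int hw_tag_samples;
static int hw_tag_est_depth;

static void update_hw_tag(int rq_in_driver)
{
	if (rq_in_driver > hw_tag_est_depth)
		hw_tag_est_depth = rq_in_driver;

	if (hw_tag == 1)
		return;				/* already confirmed */

	if (hw_tag_samples++ < HW_TAG_SAMPLES)
		return;				/* keep sampling */

	hw_tag = hw_tag_est_depth > HW_QUEUE_MIN;
}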
*/ if (cfq_should_wait_busy(cfqd, cfqq)) { - cfqq->slice_end = jiffies + cfqd->cfq_slice_idle; + unsigned long extend_sl = cfqd->cfq_slice_idle; + if (!cfqd->cfq_slice_idle) + extend_sl = cfqd->cfq_group_idle; + cfqq->slice_end = jiffies + extend_sl; cfq_mark_cfqq_wait_busy(cfqq); + cfq_log_cfqq(cfqd, cfqq, "will busy wait"); } /* @@ -3341,11 +3494,12 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfq_slice_expired(cfqd, 1); else if (sync && cfqq_empty && !cfq_close_cooperator(cfqd, cfqq)) { - cfqd->noidle_tree_requires_idle |= !rq_noidle(rq); + cfqd->noidle_tree_requires_idle |= + !(rq->cmd_flags & REQ_NOIDLE); /* * Idling is enabled for SYNC_WORKLOAD. * SYNC_NOIDLE_WORKLOAD idles at the end of the tree - * only if we processed at least one !rq_noidle request + * only if we processed at least one !REQ_NOIDLE request */ if (cfqd->serving_type == SYNC_WORKLOAD || cfqd->noidle_tree_requires_idle @@ -3354,7 +3508,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) } } - if (!rq_in_driver(cfqd)) + if (!cfqd->rq_in_driver) cfq_schedule_dispatch(cfqd); } @@ -3438,6 +3592,10 @@ static void cfq_put_request(struct request *rq) rq->elevator_private = NULL; rq->elevator_private2 = NULL; + /* Put down rq reference on cfqg */ + cfq_put_cfqg(RQ_CFQG(rq)); + rq->elevator_private3 = NULL; + cfq_put_queue(cfqq); } } @@ -3453,14 +3611,6 @@ cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic, return cic_to_cfqq(cic, 1); } -static int should_split_cfqq(struct cfq_queue *cfqq) -{ - if (cfqq->seeky_start && - time_after(jiffies, cfqq->seeky_start + CFQQ_COOP_TOUT)) - return 1; - return 0; -} - /* * Returns NULL if a new cfqq should be allocated, or the old cfqq if this * was the last process referring to said cfqq. @@ -3469,13 +3619,16 @@ static struct cfq_queue * split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq) { if (cfqq_process_refs(cfqq) == 1) { - cfqq->seeky_start = 0; cfqq->pid = current->pid; cfq_clear_cfqq_coop(cfqq); + cfq_clear_cfqq_split_coop(cfqq); return cfqq; } cic_set_cfqq(cic, NULL, 1); + + cfq_put_cooperator(cfqq); + cfq_put_queue(cfqq); return NULL; } @@ -3510,7 +3663,7 @@ new_queue: /* * If the queue was seeky for too long, break it apart. */ - if (cfq_cfqq_coop(cfqq) && should_split_cfqq(cfqq)) { + if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) { cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq"); cfqq = split_cfqq(cic, cfqq); if (!cfqq) @@ -3534,6 +3687,7 @@ new_queue: rq->elevator_private = cic; rq->elevator_private2 = cfqq; + rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg); return 0; queue_fail: @@ -3661,16 +3815,38 @@ static void cfq_exit_queue(struct elevator_queue *e) cfq_put_async_queues(cfqd); cfq_release_cfq_groups(cfqd); - blkiocg_del_blkio_group(&cfqd->root_group.blkg); + cfq_blkiocg_del_blkio_group(&cfqd->root_group.blkg); spin_unlock_irq(q->queue_lock); cfq_shutdown_timer_wq(cfqd); + spin_lock(&cic_index_lock); + ida_remove(&cic_index_ida, cfqd->cic_index); + spin_unlock(&cic_index_lock); + /* Wait for cfqg->blkg->key accessors to exit their grace periods. 
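[Editor's note] The split path above relies on the new cfq_put_cooperator() helper: if the caller holds the last process reference, the queue is reset and reused; otherwise the reference and any chained cooperators are dropped so a fresh queue gets allocated. The shape of that decision as a standalone sketch (reference counting reduced to a plain int; drop_cooperators() stands in for cfq_put_cooperator() plus cfq_put_queue()):

struct queue {
	int process_refs;
	int coop;		/* queue is shared with cooperating tasks */
};

static void drop_cooperators(struct queue *q)
{
	q->process_refs--;	/* stand-in for the real put path */
}

/* Return the queue if the caller may keep it, NULL to force a new one. */
static struct queue *split(struct queue *q)
{
	if (q->process_refs == 1) {
		q->coop = 0;		/* sole user: reset and reuse */
		return q;
	}

	drop_cooperators(q);		/* shared: detach and start over */
	return NULL;
}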
*/ call_rcu(&cfqd->rcu, cfq_cfqd_free); } +static int cfq_alloc_cic_index(void) +{ + int index, error; + + do { + if (!ida_pre_get(&cic_index_ida, GFP_KERNEL)) + return -ENOMEM; + + spin_lock(&cic_index_lock); + error = ida_get_new(&cic_index_ida, &index); + spin_unlock(&cic_index_lock); + if (error && error != -EAGAIN) + return error; + } while (error); + + return index; +} + static void *cfq_init_queue(struct request_queue *q) { struct cfq_data *cfqd; @@ -3678,10 +3854,16 @@ static void *cfq_init_queue(struct request_queue *q) struct cfq_group *cfqg; struct cfq_rb_root *st; + i = cfq_alloc_cic_index(); + if (i < 0) + return NULL; + cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); if (!cfqd) return NULL; + cfqd->cic_index = i; + /* Init root service tree */ cfqd->grp_service_tree = CFQ_RB_ROOT; @@ -3700,8 +3882,10 @@ static void *cfq_init_queue(struct request_queue *q) * to make sure that cfq_put_cfqg() does not try to kfree root group */ atomic_set(&cfqg->ref, 1); - blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, (void *)cfqd, - 0); + rcu_read_lock(); + cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, + (void *)cfqd, 0); + rcu_read_unlock(); #endif /* * Not strictly needed (since RB_ROOT just clears the node and we @@ -3739,6 +3923,7 @@ static void *cfq_init_queue(struct request_queue *q) cfqd->cfq_slice[1] = cfq_slice_sync; cfqd->cfq_slice_async_rq = cfq_slice_async_rq; cfqd->cfq_slice_idle = cfq_slice_idle; + cfqd->cfq_group_idle = cfq_group_idle; cfqd->cfq_latency = 1; cfqd->cfq_group_isolation = 0; cfqd->hw_tag = -1; @@ -3747,7 +3932,6 @@ static void *cfq_init_queue(struct request_queue *q) * second, in order to have larger depth for async operations. */ cfqd->last_delayed_sync = jiffies - HZ; - INIT_RCU_HEAD(&cfqd->rcu); return cfqd; } @@ -3812,6 +3996,7 @@ SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1); SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0); SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0); SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1); +SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1); SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1); SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); @@ -3844,6 +4029,7 @@ STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0); STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0); STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1); +STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1); STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1); STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1); STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, @@ -3865,6 +4051,7 @@ static struct elv_fs_entry cfq_attrs[] = { CFQ_ATTR(slice_async), CFQ_ATTR(slice_async_rq), CFQ_ATTR(slice_idle), + CFQ_ATTR(group_idle), CFQ_ATTR(low_latency), CFQ_ATTR(group_isolation), __ATTR_NULL @@ -3876,6 +4063,7 @@ static struct elevator_type iosched_cfq = { .elevator_merged_fn = cfq_merged_request, .elevator_merge_req_fn = cfq_merged_requests, .elevator_allow_merge_fn = cfq_allow_merge, + .elevator_bio_merged_fn = cfq_bio_merged, .elevator_dispatch_fn = cfq_dispatch_requests, .elevator_add_req_fn = cfq_insert_request, .elevator_activate_req_fn = cfq_activate_request, @@ -3917,6 +4105,12 @@ static int 
__init cfq_init(void) if (!cfq_slice_idle) cfq_slice_idle = 1; +#ifdef CONFIG_CFQ_GROUP_IOSCHED + if (!cfq_group_idle) + cfq_group_idle = 1; +#else + cfq_group_idle = 0; +#endif if (cfq_slab_setup()) return -ENOMEM; @@ -3941,6 +4135,7 @@ static void __exit cfq_exit(void) */ if (elv_ioc_count_read(cfq_ioc_count)) wait_for_completion(&all_gone); + ida_destroy(&cic_index_ida); cfq_slab_kill(); } diff --git a/block/cfq.h b/block/cfq.h new file mode 100644 index 000000000000..93448e5a2e41 --- /dev/null +++ b/block/cfq.h @@ -0,0 +1,115 @@ +#ifndef _CFQ_H +#define _CFQ_H +#include "blk-cgroup.h" + +#ifdef CONFIG_CFQ_GROUP_IOSCHED +static inline void cfq_blkiocg_update_io_add_stats(struct blkio_group *blkg, + struct blkio_group *curr_blkg, bool direction, bool sync) +{ + blkiocg_update_io_add_stats(blkg, curr_blkg, direction, sync); +} + +static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg, + unsigned long dequeue) +{ + blkiocg_update_dequeue_stats(blkg, dequeue); +} + +static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg, + unsigned long time) +{ + blkiocg_update_timeslice_used(blkg, time); +} + +static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) +{ + blkiocg_set_start_empty_time(blkg); +} + +static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg, + bool direction, bool sync) +{ + blkiocg_update_io_remove_stats(blkg, direction, sync); +} + +static inline void cfq_blkiocg_update_io_merged_stats(struct blkio_group *blkg, + bool direction, bool sync) +{ + blkiocg_update_io_merged_stats(blkg, direction, sync); +} + +static inline void cfq_blkiocg_update_idle_time_stats(struct blkio_group *blkg) +{ + blkiocg_update_idle_time_stats(blkg); +} + +static inline void +cfq_blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) +{ + blkiocg_update_avg_queue_size_stats(blkg); +} + +static inline void +cfq_blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) +{ + blkiocg_update_set_idle_time_stats(blkg); +} + +static inline void cfq_blkiocg_update_dispatch_stats(struct blkio_group *blkg, + uint64_t bytes, bool direction, bool sync) +{ + blkiocg_update_dispatch_stats(blkg, bytes, direction, sync); +} + +static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg, uint64_t start_time, uint64_t io_start_time, bool direction, bool sync) +{ + blkiocg_update_completion_stats(blkg, start_time, io_start_time, + direction, sync); +} + +static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, + struct blkio_group *blkg, void *key, dev_t dev) { + blkiocg_add_blkio_group(blkcg, blkg, key, dev); +} + +static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg) +{ + return blkiocg_del_blkio_group(blkg); +} + +#else /* CFQ_GROUP_IOSCHED */ +static inline void cfq_blkiocg_update_io_add_stats(struct blkio_group *blkg, + struct blkio_group *curr_blkg, bool direction, bool sync) {} + +static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg, + unsigned long dequeue) {} + +static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg, + unsigned long time) {} +static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) {} +static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg, + bool direction, bool sync) {} +static inline void cfq_blkiocg_update_io_merged_stats(struct blkio_group *blkg, + bool direction, bool sync) {} +static inline void cfq_blkiocg_update_idle_time_stats(struct 
blkio_group *blkg) +{ +} +static inline void +cfq_blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) {} + +static inline void +cfq_blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) {} + +static inline void cfq_blkiocg_update_dispatch_stats(struct blkio_group *blkg, + uint64_t bytes, bool direction, bool sync) {} +static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg, uint64_t start_time, uint64_t io_start_time, bool direction, bool sync) {} + +static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, + struct blkio_group *blkg, void *key, dev_t dev) {} +static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg) +{ + return 0; +} + +#endif /* CFQ_GROUP_IOSCHED */ +#endif diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index 4eb8e9ea4af5..119f07b74dc0 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -6,6 +6,7 @@ #include <linux/elevator.h> #include <linux/fd.h> #include <linux/hdreg.h> +#include <linux/slab.h> #include <linux/syscalls.h> #include <linux/smp_lock.h> #include <linux/types.h> @@ -534,56 +535,6 @@ out: return err; } -struct compat_blk_user_trace_setup { - char name[32]; - u16 act_mask; - u32 buf_size; - u32 buf_nr; - compat_u64 start_lba; - compat_u64 end_lba; - u32 pid; -}; -#define BLKTRACESETUP32 _IOWR(0x12, 115, struct compat_blk_user_trace_setup) - -static int compat_blk_trace_setup(struct block_device *bdev, char __user *arg) -{ - struct blk_user_trace_setup buts; - struct compat_blk_user_trace_setup cbuts; - struct request_queue *q; - char b[BDEVNAME_SIZE]; - int ret; - - q = bdev_get_queue(bdev); - if (!q) - return -ENXIO; - - if (copy_from_user(&cbuts, arg, sizeof(cbuts))) - return -EFAULT; - - bdevname(bdev, b); - - buts = (struct blk_user_trace_setup) { - .act_mask = cbuts.act_mask, - .buf_size = cbuts.buf_size, - .buf_nr = cbuts.buf_nr, - .start_lba = cbuts.start_lba, - .end_lba = cbuts.end_lba, - .pid = cbuts.pid, - }; - memcpy(&buts.name, &cbuts.name, 32); - - mutex_lock(&bdev->bd_mutex); - ret = do_blk_trace_setup(q, b, bdev->bd_dev, bdev, &buts); - mutex_unlock(&bdev->bd_mutex); - if (ret) - return ret; - - if (copy_to_user(arg, &buts.name, 32)) - return -EFAULT; - - return 0; -} - static int compat_blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, unsigned long arg) { @@ -752,6 +703,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) case BLKFLSBUF: case BLKROSET: case BLKDISCARD: + case BLKSECDISCARD: /* * the ones below are implemented in blkdev_locked_ioctl, * but we call blkdev_ioctl, which gets the lock for us @@ -801,16 +753,10 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) return compat_put_u64(arg, bdev->bd_inode->i_size); case BLKTRACESETUP32: - lock_kernel(); - ret = compat_blk_trace_setup(bdev, compat_ptr(arg)); - unlock_kernel(); - return ret; case BLKTRACESTART: /* compatible */ case BLKTRACESTOP: /* compatible */ case BLKTRACETEARDOWN: /* compatible */ - lock_kernel(); ret = blk_trace_ioctl(bdev, cmd, compat_ptr(arg)); - unlock_kernel(); return ret; default: if (disk->fops->compat_ioctl) diff --git a/block/elevator.c b/block/elevator.c index 9ad5ccc4c5ee..4e11559aa2b0 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -79,8 +79,13 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio) /* * Don't merge file system requests and discard requests */ - if (bio_rw_flagged(bio, BIO_RW_DISCARD) != - bio_rw_flagged(rq->bio, BIO_RW_DISCARD)) + if ((bio->bi_rw & 
REQ_DISCARD) != (rq->bio->bi_rw & REQ_DISCARD)) + return 0; + + /* + * Don't merge discard requests and secure discard requests + */ + if ((bio->bi_rw & REQ_SECURE) != (rq->bio->bi_rw & REQ_SECURE)) return 0; /* @@ -154,7 +159,7 @@ static struct elevator_type *elevator_get(const char *name) spin_unlock(&elv_list_lock); - sprintf(elv, "%s-iosched", name); + snprintf(elv, sizeof(elv), "%s-iosched", name); request_module("%s", elv); spin_lock(&elv_list_lock); @@ -242,9 +247,11 @@ int elevator_init(struct request_queue *q, char *name) { struct elevator_type *e = NULL; struct elevator_queue *eq; - int ret = 0; void *data; + if (unlikely(q->elevator)) + return 0; + INIT_LIST_HEAD(&q->queue_head); q->last_merge = NULL; q->end_sector = 0; @@ -284,7 +291,7 @@ int elevator_init(struct request_queue *q, char *name) } elevator_attach(q, eq, data); - return ret; + return 0; } EXPORT_SYMBOL(elevator_init); @@ -426,7 +433,8 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq) list_for_each_prev(entry, &q->queue_head) { struct request *pos = list_entry_rq(entry); - if (blk_discard_rq(rq) != blk_discard_rq(pos)) + if ((rq->cmd_flags & REQ_DISCARD) != + (pos->cmd_flags & REQ_DISCARD)) break; if (rq_data_dir(rq) != rq_data_dir(pos)) break; @@ -474,6 +482,15 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) int ret; /* + * Levels of merges: + * nomerges: No merges at all attempted + * noxmerges: Only simple one-hit cache try + * merges: All merge tries attempted + */ + if (blk_queue_nomerges(q)) + return ELEVATOR_NO_MERGE; + + /* * First try one-hit cache. */ if (q->last_merge) { @@ -484,7 +501,7 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) } } - if (blk_queue_nomerges(q)) + if (blk_queue_noxmerges(q)) return ELEVATOR_NO_MERGE; /* @@ -530,6 +547,15 @@ void elv_merge_requests(struct request_queue *q, struct request *rq, q->last_merge = rq; } +void elv_bio_merged(struct request_queue *q, struct request *rq, + struct bio *bio) +{ + struct elevator_queue *e = q->elevator; + + if (e->ops->elevator_bio_merged_fn) + e->ops->elevator_bio_merged_fn(q, rq, bio); +} + void elv_requeue_request(struct request_queue *q, struct request *rq) { /* @@ -538,7 +564,7 @@ void elv_requeue_request(struct request_queue *q, struct request *rq) */ if (blk_account_rq(rq)) { q->in_flight[rq_is_sync(rq)]--; - if (blk_sorted_rq(rq)) + if (rq->cmd_flags & REQ_SORTED) elv_deactivate_rq(q, rq); } @@ -624,7 +650,8 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) break; case ELEVATOR_INSERT_SORT: - BUG_ON(!blk_fs_request(rq) && !blk_discard_rq(rq)); + BUG_ON(rq->cmd_type != REQ_TYPE_FS && + !(rq->cmd_flags & REQ_DISCARD)); rq->cmd_flags |= REQ_SORTED; q->nr_sorted++; if (rq_mergeable(rq)) { @@ -696,7 +723,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where, /* * toggle ordered color */ - if (blk_barrier_rq(rq)) + if (rq->cmd_flags & REQ_HARDBARRIER) q->ordcolor ^= 1; /* @@ -709,7 +736,8 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where, * this request is scheduling boundary, update * end_sector */ - if (blk_fs_request(rq) || blk_discard_rq(rq)) { + if (rq->cmd_type == REQ_TYPE_FS || + (rq->cmd_flags & REQ_DISCARD)) { q->end_sector = rq_end_sector(rq); q->boundary_rq = rq; } @@ -823,7 +851,8 @@ void elv_completed_request(struct request_queue *q, struct request *rq) */ if (blk_account_rq(rq)) { q->in_flight[rq_is_sync(rq)]--; - if (blk_sorted_rq(rq) && 
e->ops->elevator_completed_req_fn) + if ((rq->cmd_flags & REQ_SORTED) && + e->ops->elevator_completed_req_fn) e->ops->elevator_completed_req_fn(q, rq); } @@ -883,7 +912,7 @@ elv_attr_store(struct kobject *kobj, struct attribute *attr, return error; } -static struct sysfs_ops elv_sysfs_ops = { +static const struct sysfs_ops elv_sysfs_ops = { .show = elv_attr_show, .store = elv_attr_store, }; @@ -909,14 +938,17 @@ int elv_register_queue(struct request_queue *q) } } kobject_uevent(&e->kobj, KOBJ_ADD); + e->registered = 1; } return error; } +EXPORT_SYMBOL(elv_register_queue); static void __elv_unregister_queue(struct elevator_queue *e) { kobject_uevent(&e->kobj, KOBJ_REMOVE); kobject_del(&e->kobj); + e->registered = 0; } void elv_unregister_queue(struct request_queue *q) @@ -924,6 +956,7 @@ void elv_unregister_queue(struct request_queue *q) if (q) __elv_unregister_queue(q->elevator); } +EXPORT_SYMBOL(elv_unregister_queue); void elv_register(struct elevator_type *e) { @@ -978,18 +1011,19 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) { struct elevator_queue *old_elevator, *e; void *data; + int err; /* * Allocate new elevator */ e = elevator_alloc(q, new_e); if (!e) - return 0; + return -ENOMEM; data = elevator_init_queue(q, e); if (!data) { kobject_put(&e->kobj); - return 0; + return -ENOMEM; } /* @@ -1010,10 +1044,13 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) spin_unlock_irq(q->queue_lock); - __elv_unregister_queue(old_elevator); + if (old_elevator->registered) { + __elv_unregister_queue(old_elevator); - if (elv_register_queue(q)) - goto fail_register; + err = elv_register_queue(q); + if (err) + goto fail_register; + } /* * finally exit old elevator and turn off BYPASS. @@ -1025,7 +1062,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) blk_add_trace_msg(q, "elv switch: %s", e->elevator_type->elevator_name); - return 1; + return 0; fail_register: /* @@ -1040,17 +1077,19 @@ fail_register: queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q); spin_unlock_irq(q->queue_lock); - return 0; + return err; } -ssize_t elv_iosched_store(struct request_queue *q, const char *name, - size_t count) +/* + * Switch this queue to the given IO scheduler. 
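[Editor's note] elevator_switch() above now reports failure as a negative errno instead of the old 1/0 boolean, and the new registered flag lets it skip the sysfs dance entirely when the old elevator was never registered. A compilable sketch of just that guard (the stubs stand in for elv_register_queue() and __elv_unregister_queue(); real callers roll back to the old elevator on error):

struct elv { int registered; };

static int register_sysfs(void)    { return 0; }	/* stub: may fail */
static void unregister_sysfs(void) {}

/* Failure comes back as 0 or -errno; callers no longer test a 0/1 flag. */
static int reregister(struct elv *old)
{
	int err = 0;

	if (old->registered) {		/* only redo sysfs if it was there */
		unregister_sysfs();
		err = register_sysfs();
	}
	return err;
}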
+ */ +int elevator_change(struct request_queue *q, const char *name) { char elevator_name[ELV_NAME_MAX]; struct elevator_type *e; if (!q->elevator) - return count; + return -ENXIO; strlcpy(elevator_name, name, sizeof(elevator_name)); e = elevator_get(strstrip(elevator_name)); @@ -1061,13 +1100,27 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name, if (!strcmp(elevator_name, q->elevator->elevator_type->elevator_name)) { elevator_put(e); - return count; + return 0; } - if (!elevator_switch(q, e)) - printk(KERN_ERR "elevator: switch to %s failed\n", - elevator_name); - return count; + return elevator_switch(q, e); +} +EXPORT_SYMBOL(elevator_change); + +ssize_t elv_iosched_store(struct request_queue *q, const char *name, + size_t count) +{ + int ret; + + if (!q->elevator) + return count; + + ret = elevator_change(q, name); + if (!ret) + return count; + + printk(KERN_ERR "elevator: switch to %s failed\n", name); + return ret; } ssize_t elv_iosched_show(struct request_queue *q, char *name) @@ -1077,7 +1130,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) struct elevator_type *__e; int len = 0; - if (!q->elevator) + if (!q->elevator || !blk_queue_stackable(q)) return sprintf(name, "none\n"); elv = e->elevator_type; diff --git a/block/genhd.c b/block/genhd.c index d13ba76a169c..59a2db6fecef 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -596,6 +596,7 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) return disk; } +EXPORT_SYMBOL(get_gendisk); /** * bdget_disk - do bdget() by gendisk and partition number @@ -987,7 +988,6 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno) if (!new_ptbl) return -ENOMEM; - INIT_RCU_HEAD(&new_ptbl->rcu_head); new_ptbl->len = target; for (i = 0; i < len; i++) diff --git a/block/ioctl.c b/block/ioctl.c index be48ea51faee..d8052f0dabd3 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -1,5 +1,6 @@ #include <linux/capability.h> #include <linux/blkdev.h> +#include <linux/gfp.h> #include <linux/blkpg.h> #include <linux/hdreg.h> #include <linux/backing-dev.h> @@ -113,8 +114,10 @@ static int blkdev_reread_part(struct block_device *bdev) } static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, - uint64_t len) + uint64_t len, int secure) { + unsigned long flags = BLKDEV_IFL_WAIT; + if (start & 511) return -EINVAL; if (len & 511) @@ -124,8 +127,9 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, if (start + len > (bdev->bd_inode->i_size >> 9)) return -EINVAL; - return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, - DISCARD_FL_WAIT); + if (secure) + flags |= BLKDEV_IFL_SECURE; + return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags); } static int put_ushort(unsigned long arg, unsigned short val) @@ -162,18 +166,10 @@ int __blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, unsigned long arg) { struct gendisk *disk = bdev->bd_disk; - int ret; if (disk->fops->ioctl) return disk->fops->ioctl(bdev, mode, cmd, arg); - if (disk->fops->locked_ioctl) { - lock_kernel(); - ret = disk->fops->locked_ioctl(bdev, mode, cmd, arg); - unlock_kernel(); - return ret; - } - return -ENOTTY; } /* @@ -184,8 +180,7 @@ int __blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode, EXPORT_SYMBOL_GPL(__blkdev_driver_ioctl); /* - * always keep this in sync with compat_blkdev_ioctl() and - * compat_blkdev_locked_ioctl() + * always keep this in sync with compat_blkdev_ioctl() */ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, unsigned long arg) 
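[Editor's note] The exported elevator_change() above gives in-kernel callers the same switching path the sysfs scheduler attribute uses. A hypothetical caller, say a driver that wants a specific scheduler at probe time, might use it like this (the function, its name, and the "noop" choice are illustrative only, assuming the usual block/elevator headers):

static int use_noop(struct request_queue *q)
{
	int ret = elevator_change(q, "noop");

	if (ret)
		printk(KERN_WARNING "elevator: switch to noop failed: %d\n",
		       ret);
	return ret;
}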
@@ -205,10 +200,8 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, if (ret != -EINVAL && ret != -ENOTTY) return ret; - lock_kernel(); fsync_bdev(bdev); invalidate_bdev(bdev); - unlock_kernel(); return 0; case BLKROSET: @@ -220,12 +213,11 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, return -EACCES; if (get_user(n, (int __user *)(arg))) return -EFAULT; - lock_kernel(); set_device_ro(bdev, n); - unlock_kernel(); return 0; - case BLKDISCARD: { + case BLKDISCARD: + case BLKSECDISCARD: { uint64_t range[2]; if (!(mode & FMODE_WRITE)) @@ -234,7 +226,8 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, if (copy_from_user(range, (void __user *)arg, sizeof(range))) return -EFAULT; - return blk_ioctl_discard(bdev, range[0], range[1]); + return blk_ioctl_discard(bdev, range[0], range[1], + cmd == BLKSECDISCARD); } case HDIO_GETGEO: { @@ -308,14 +301,10 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, bd_release(bdev); return ret; case BLKPG: - lock_kernel(); ret = blkpg_ioctl(bdev, (struct blkpg_ioctl_arg __user *) arg); - unlock_kernel(); break; case BLKRRPART: - lock_kernel(); ret = blkdev_reread_part(bdev); - unlock_kernel(); break; case BLKGETSIZE: size = bdev->bd_inode->i_size; @@ -328,9 +317,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKTRACESTART: case BLKTRACESTOP: case BLKTRACESETUP: case BLKTRACETEARDOWN: - lock_kernel(); ret = blk_trace_ioctl(bdev, cmd, (char __user *) arg); - unlock_kernel(); break; default: ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); diff --git a/block/noop-iosched.c b/block/noop-iosched.c index 3a0d369d08c7..232c4b38cd37 100644 --- a/block/noop-iosched.c +++ b/block/noop-iosched.c @@ -5,6 +5,7 @@ #include <linux/elevator.h> #include <linux/bio.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/init.h> struct noop_data {
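[Editor's note] Seen from userspace, the BLKSECDISCARD case added to the ioctl switch above takes the same argument as BLKDISCARD: two u64 values, start and length in bytes, each a multiple of 512, per the alignment checks in blk_ioctl_discard(). A hedged userspace sketch (assumes a kernel and linux/fs.h new enough to define BLKSECDISCARD):

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

/* Discard (or securely discard) a byte range of an open block device. */
static int discard_range(int fd, uint64_t start, uint64_t len, int secure)
{
	uint64_t range[2] = { start, len };

	return ioctl(fd, secure ? BLKSECDISCARD : BLKDISCARD, &range);
}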