summaryrefslogtreecommitdiff
path: root/block
diff options
context:
space:
mode:
authorMichal Marek <mmarek@suse.cz>2010-10-28 00:15:57 +0200
committerMichal Marek <mmarek@suse.cz>2010-10-28 00:15:57 +0200
commitb74b953b998bcc2db91b694446f3a2619ec32de6 (patch)
tree6ce24caabd730f6ae9287ed0676ec32e6ff31e9d /block
parentabb438526201c6a79949ad45375c051b6681c253 (diff)
parentf6f94e2ab1b33f0082ac22d71f66385a60d8157f (diff)
Merge commit 'v2.6.36' into kbuild/misc
Update to be able to fix a recent change to scripts/basic/docproc.c (commit eda603f).
Diffstat (limited to 'block')
-rw-r--r--block/Kconfig22
-rw-r--r--block/Kconfig.iosched16
-rw-r--r--block/Makefile2
-rw-r--r--block/blk-barrier.c183
-rw-r--r--block/blk-cgroup.c861
-rw-r--r--block/blk-cgroup.h189
-rw-r--r--block/blk-core.c213
-rw-r--r--block/blk-exec.c2
-rw-r--r--block/blk-integrity.c3
-rw-r--r--block/blk-ioc.c3
-rw-r--r--block/blk-lib.c231
-rw-r--r--block/blk-map.c2
-rw-r--r--block/blk-merge.c29
-rw-r--r--block/blk-settings.c160
-rw-r--r--block/blk-sysfs.c122
-rw-r--r--block/blk-tag.c1
-rw-r--r--block/blk-timeout.c12
-rw-r--r--block/blk.h14
-rw-r--r--block/bsg.c5
-rw-r--r--block/cfq-iosched.c571
-rw-r--r--block/cfq.h115
-rw-r--r--block/compat_ioctl.c58
-rw-r--r--block/elevator.c111
-rw-r--r--block/genhd.c2
-rw-r--r--block/ioctl.c37
-rw-r--r--block/noop-iosched.c1
26 files changed, 2129 insertions, 836 deletions
diff --git a/block/Kconfig b/block/Kconfig
index e20fbde0875c..9be0b56eaee1 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -77,28 +77,6 @@ config BLK_DEV_INTEGRITY
T10/SCSI Data Integrity Field or the T13/ATA External Path
Protection. If in doubt, say N.
-config BLK_CGROUP
- bool
- depends on CGROUPS
- default n
- ---help---
- Generic block IO controller cgroup interface. This is the common
- cgroup interface which should be used by various IO controlling
- policies.
-
- Currently, CFQ IO scheduler uses it to recognize task groups and
- control disk bandwidth allocation (proportional time slice allocation)
- to such task groups.
-
-config DEBUG_BLK_CGROUP
- bool
- depends on BLK_CGROUP
- default n
- ---help---
- Enable some debugging help. Currently it stores the cgroup path
- in the blk group which can be used by cfq for tracing various
- group related activity.
-
endif # BLOCK
config BLOCK_COMPAT
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index b71abfb0d726..3199b76f795d 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -23,6 +23,8 @@ config IOSCHED_DEADLINE
config IOSCHED_CFQ
tristate "CFQ I/O scheduler"
+ # If BLK_CGROUP is a module, CFQ has to be built as module.
+ depends on (BLK_CGROUP=m && m) || !BLK_CGROUP || BLK_CGROUP=y
default y
---help---
The CFQ I/O scheduler tries to distribute bandwidth equally
@@ -32,23 +34,15 @@ config IOSCHED_CFQ
This is the default I/O scheduler.
+ Note: If BLK_CGROUP=m, then CFQ can be built only as module.
+
config CFQ_GROUP_IOSCHED
bool "CFQ Group Scheduling support"
- depends on IOSCHED_CFQ && CGROUPS
- select BLK_CGROUP
+ depends on IOSCHED_CFQ && BLK_CGROUP
default n
---help---
Enable group IO scheduling in CFQ.
-config DEBUG_CFQ_IOSCHED
- bool "Debug CFQ Scheduling"
- depends on CFQ_GROUP_IOSCHED
- select DEBUG_BLK_CGROUP
- default n
- ---help---
- Enable CFQ IO scheduling debugging in CFQ. Currently it makes
- blktrace output more verbose.
-
choice
prompt "Default I/O scheduler"
default DEFAULT_CFQ
diff --git a/block/Makefile b/block/Makefile
index cb2d515ebd6e..0bb499a739cd 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,7 +5,7 @@
obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \
blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
- blk-iopoll.o ioctl.o genhd.o scsi_ioctl.o
+ blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o
obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o
diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 8618d8996fea..f0faefca032f 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -5,6 +5,7 @@
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
+#include <linux/gfp.h>
#include "blk.h"
@@ -12,7 +13,6 @@
* blk_queue_ordered - does this queue support ordered writes
* @q: the request queue
* @ordered: one of QUEUE_ORDERED_*
- * @prepare_flush_fn: rq setup helper for cache flush ordered writes
*
* Description:
* For journalled file systems, doing ordered writes on a commit
@@ -21,15 +21,8 @@
* feature should call this function and indicate so.
*
**/
-int blk_queue_ordered(struct request_queue *q, unsigned ordered,
- prepare_flush_fn *prepare_flush_fn)
+int blk_queue_ordered(struct request_queue *q, unsigned ordered)
{
- if (!prepare_flush_fn && (ordered & (QUEUE_ORDERED_DO_PREFLUSH |
- QUEUE_ORDERED_DO_POSTFLUSH))) {
- printk(KERN_ERR "%s: prepare_flush_fn required\n", __func__);
- return -EINVAL;
- }
-
if (ordered != QUEUE_ORDERED_NONE &&
ordered != QUEUE_ORDERED_DRAIN &&
ordered != QUEUE_ORDERED_DRAIN_FLUSH &&
@@ -43,7 +36,6 @@ int blk_queue_ordered(struct request_queue *q, unsigned ordered,
q->ordered = ordered;
q->next_ordered = ordered;
- q->prepare_flush_fn = prepare_flush_fn;
return 0;
}
@@ -78,7 +70,7 @@ unsigned blk_ordered_req_seq(struct request *rq)
*
* http://thread.gmane.org/gmane.linux.kernel/537473
*/
- if (!blk_fs_request(rq))
+ if (rq->cmd_type != REQ_TYPE_FS)
return QUEUE_ORDSEQ_DRAIN;
if ((rq->cmd_flags & REQ_ORDERED_COLOR) ==
@@ -142,10 +134,10 @@ static void queue_flush(struct request_queue *q, unsigned which)
}
blk_rq_init(q, rq);
- rq->cmd_flags = REQ_HARDBARRIER;
- rq->rq_disk = q->bar_rq.rq_disk;
+ rq->cmd_type = REQ_TYPE_FS;
+ rq->cmd_flags = REQ_HARDBARRIER | REQ_FLUSH;
+ rq->rq_disk = q->orig_bar_rq->rq_disk;
rq->end_io = end_io;
- q->prepare_flush_fn(q, rq);
elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
}
@@ -202,7 +194,7 @@ static inline bool start_ordered(struct request_queue *q, struct request **rqp)
/* initialize proxy request and queue it */
blk_rq_init(q, rq);
if (bio_data_dir(q->orig_bar_rq->bio) == WRITE)
- rq->cmd_flags |= REQ_RW;
+ rq->cmd_flags |= REQ_WRITE;
if (q->ordered & QUEUE_ORDERED_DO_FUA)
rq->cmd_flags |= REQ_FUA;
init_request_from_bio(rq, q->orig_bar_rq->bio);
@@ -235,7 +227,8 @@ static inline bool start_ordered(struct request_queue *q, struct request **rqp)
bool blk_do_ordered(struct request_queue *q, struct request **rqp)
{
struct request *rq = *rqp;
- const int is_barrier = blk_fs_request(rq) && blk_barrier_rq(rq);
+ const int is_barrier = rq->cmd_type == REQ_TYPE_FS &&
+ (rq->cmd_flags & REQ_HARDBARRIER);
if (!q->ordseq) {
if (!is_barrier)
@@ -260,7 +253,7 @@ bool blk_do_ordered(struct request_queue *q, struct request **rqp)
*/
/* Special requests are not subject to ordering rules. */
- if (!blk_fs_request(rq) &&
+ if (rq->cmd_type != REQ_TYPE_FS &&
rq != &q->pre_flush_rq && rq != &q->post_flush_rq)
return true;
@@ -285,26 +278,31 @@ static void bio_end_empty_barrier(struct bio *bio, int err)
set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
clear_bit(BIO_UPTODATE, &bio->bi_flags);
}
-
- complete(bio->bi_private);
+ if (bio->bi_private)
+ complete(bio->bi_private);
+ bio_put(bio);
}
/**
* blkdev_issue_flush - queue a flush
* @bdev: blockdev to issue flush for
+ * @gfp_mask: memory allocation flags (for bio_alloc)
* @error_sector: error sector
+ * @flags: BLKDEV_IFL_* flags to control behaviour
*
* Description:
* Issue a flush for the block device in question. Caller can supply
* room for storing the error offset in case of a flush error, if they
- * wish to.
+ * wish to. If WAIT flag is not passed then caller may check only what
+ * request was pushed in some internal queue for later handling.
*/
-int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
+int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
+ sector_t *error_sector, unsigned long flags)
{
DECLARE_COMPLETION_ONSTACK(wait);
struct request_queue *q;
struct bio *bio;
- int ret;
+ int ret = 0;
if (bdev->bd_disk == NULL)
return -ENXIO;
@@ -313,23 +311,34 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
if (!q)
return -ENXIO;
- bio = bio_alloc(GFP_KERNEL, 0);
+ /*
+ * some block devices may not have their queue correctly set up here
+ * (e.g. loop device without a backing file) and so issuing a flush
+ * here will panic. Ensure there is a request function before issuing
+ * the barrier.
+ */
+ if (!q->make_request_fn)
+ return -ENXIO;
+
+ bio = bio_alloc(gfp_mask, 0);
bio->bi_end_io = bio_end_empty_barrier;
- bio->bi_private = &wait;
bio->bi_bdev = bdev;
- submit_bio(WRITE_BARRIER, bio);
+ if (test_bit(BLKDEV_WAIT, &flags))
+ bio->bi_private = &wait;
- wait_for_completion(&wait);
-
- /*
- * The driver must store the error location in ->bi_sector, if
- * it supports it. For non-stacked drivers, this should be copied
- * from blk_rq_pos(rq).
- */
- if (error_sector)
- *error_sector = bio->bi_sector;
+ bio_get(bio);
+ submit_bio(WRITE_BARRIER, bio);
+ if (test_bit(BLKDEV_WAIT, &flags)) {
+ wait_for_completion(&wait);
+ /*
+ * The driver must store the error location in ->bi_sector, if
+ * it supports it. For non-stacked drivers, this should be
+ * copied from blk_rq_pos(rq).
+ */
+ if (error_sector)
+ *error_sector = bio->bi_sector;
+ }
- ret = 0;
if (bio_flagged(bio, BIO_EOPNOTSUPP))
ret = -EOPNOTSUPP;
else if (!bio_flagged(bio, BIO_UPTODATE))
@@ -339,107 +348,3 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
return ret;
}
EXPORT_SYMBOL(blkdev_issue_flush);
-
-static void blkdev_discard_end_io(struct bio *bio, int err)
-{
- if (err) {
- if (err == -EOPNOTSUPP)
- set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
- clear_bit(BIO_UPTODATE, &bio->bi_flags);
- }
-
- if (bio->bi_private)
- complete(bio->bi_private);
- __free_page(bio_page(bio));
-
- bio_put(bio);
-}
-
-/**
- * blkdev_issue_discard - queue a discard
- * @bdev: blockdev to issue discard for
- * @sector: start sector
- * @nr_sects: number of sectors to discard
- * @gfp_mask: memory allocation flags (for bio_alloc)
- * @flags: DISCARD_FL_* flags to control behaviour
- *
- * Description:
- * Issue a discard request for the sectors in question.
- */
-int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
- sector_t nr_sects, gfp_t gfp_mask, int flags)
-{
- DECLARE_COMPLETION_ONSTACK(wait);
- struct request_queue *q = bdev_get_queue(bdev);
- int type = flags & DISCARD_FL_BARRIER ?
- DISCARD_BARRIER : DISCARD_NOBARRIER;
- struct bio *bio;
- struct page *page;
- int ret = 0;
-
- if (!q)
- return -ENXIO;
-
- if (!blk_queue_discard(q))
- return -EOPNOTSUPP;
-
- while (nr_sects && !ret) {
- unsigned int sector_size = q->limits.logical_block_size;
- unsigned int max_discard_sectors =
- min(q->limits.max_discard_sectors, UINT_MAX >> 9);
-
- bio = bio_alloc(gfp_mask, 1);
- if (!bio)
- goto out;
- bio->bi_sector = sector;
- bio->bi_end_io = blkdev_discard_end_io;
- bio->bi_bdev = bdev;
- if (flags & DISCARD_FL_WAIT)
- bio->bi_private = &wait;
-
- /*
- * Add a zeroed one-sector payload as that's what
- * our current implementations need. If we'll ever need
- * more the interface will need revisiting.
- */
- page = alloc_page(gfp_mask | __GFP_ZERO);
- if (!page)
- goto out_free_bio;
- if (bio_add_pc_page(q, bio, page, sector_size, 0) < sector_size)
- goto out_free_page;
-
- /*
- * And override the bio size - the way discard works we
- * touch many more blocks on disk than the actual payload
- * length.
- */
- if (nr_sects > max_discard_sectors) {
- bio->bi_size = max_discard_sectors << 9;
- nr_sects -= max_discard_sectors;
- sector += max_discard_sectors;
- } else {
- bio->bi_size = nr_sects << 9;
- nr_sects = 0;
- }
-
- bio_get(bio);
- submit_bio(type, bio);
-
- if (flags & DISCARD_FL_WAIT)
- wait_for_completion(&wait);
-
- if (bio_flagged(bio, BIO_EOPNOTSUPP))
- ret = -EOPNOTSUPP;
- else if (!bio_flagged(bio, BIO_UPTODATE))
- ret = -EIO;
- bio_put(bio);
- }
- return ret;
-out_free_page:
- __free_page(page);
-out_free_bio:
- bio_put(bio);
-out:
- return -ENOMEM;
-}
-EXPORT_SYMBOL(blkdev_issue_discard);
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 1fa2654db0a6..2fef1ef931a0 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -15,7 +15,12 @@
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/err.h>
+#include <linux/blkdev.h>
+#include <linux/slab.h>
#include "blk-cgroup.h"
+#include <linux/genhd.h>
+
+#define MAX_KEY_LEN 100
static DEFINE_SPINLOCK(blkio_list_lock);
static LIST_HEAD(blkio_list);
@@ -23,19 +28,56 @@ static LIST_HEAD(blkio_list);
struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
EXPORT_SYMBOL_GPL(blkio_root_cgroup);
-bool blkiocg_css_tryget(struct blkio_cgroup *blkcg)
+static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
+ struct cgroup *);
+static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *,
+ struct task_struct *, bool);
+static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
+ struct cgroup *, struct task_struct *, bool);
+static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
+static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
+
+struct cgroup_subsys blkio_subsys = {
+ .name = "blkio",
+ .create = blkiocg_create,
+ .can_attach = blkiocg_can_attach,
+ .attach = blkiocg_attach,
+ .destroy = blkiocg_destroy,
+ .populate = blkiocg_populate,
+#ifdef CONFIG_BLK_CGROUP
+ /* note: blkio_subsys_id is otherwise defined in blk-cgroup.h */
+ .subsys_id = blkio_subsys_id,
+#endif
+ .use_id = 1,
+ .module = THIS_MODULE,
+};
+EXPORT_SYMBOL_GPL(blkio_subsys);
+
+static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg,
+ struct blkio_policy_node *pn)
{
- if (!css_tryget(&blkcg->css))
- return false;
- return true;
+ list_add(&pn->node, &blkcg->policy_list);
}
-EXPORT_SYMBOL_GPL(blkiocg_css_tryget);
-void blkiocg_css_put(struct blkio_cgroup *blkcg)
+/* Must be called with blkcg->lock held */
+static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
{
- css_put(&blkcg->css);
+ list_del(&pn->node);
+}
+
+/* Must be called with blkcg->lock held */
+static struct blkio_policy_node *
+blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev)
+{
+ struct blkio_policy_node *pn;
+
+ list_for_each_entry(pn, &blkcg->policy_list, node) {
+ if (pn->dev == dev)
+ return pn;
+ }
+
+ return NULL;
}
-EXPORT_SYMBOL_GPL(blkiocg_css_put);
struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
{
@@ -44,13 +86,259 @@ struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
}
EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
-void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
- unsigned long time, unsigned long sectors)
+/*
+ * Add to the appropriate stat variable depending on the request type.
+ * This should be called with the blkg->stats_lock held.
+ */
+static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
+ bool sync)
+{
+ if (direction)
+ stat[BLKIO_STAT_WRITE] += add;
+ else
+ stat[BLKIO_STAT_READ] += add;
+ if (sync)
+ stat[BLKIO_STAT_SYNC] += add;
+ else
+ stat[BLKIO_STAT_ASYNC] += add;
+}
+
+/*
+ * Decrements the appropriate stat variable if non-zero depending on the
+ * request type. Panics on value being zero.
+ * This should be called with the blkg->stats_lock held.
+ */
+static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
+{
+ if (direction) {
+ BUG_ON(stat[BLKIO_STAT_WRITE] == 0);
+ stat[BLKIO_STAT_WRITE]--;
+ } else {
+ BUG_ON(stat[BLKIO_STAT_READ] == 0);
+ stat[BLKIO_STAT_READ]--;
+ }
+ if (sync) {
+ BUG_ON(stat[BLKIO_STAT_SYNC] == 0);
+ stat[BLKIO_STAT_SYNC]--;
+ } else {
+ BUG_ON(stat[BLKIO_STAT_ASYNC] == 0);
+ stat[BLKIO_STAT_ASYNC]--;
+ }
+}
+
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+/* This should be called with the blkg->stats_lock held. */
+static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
+ struct blkio_group *curr_blkg)
+{
+ if (blkio_blkg_waiting(&blkg->stats))
+ return;
+ if (blkg == curr_blkg)
+ return;
+ blkg->stats.start_group_wait_time = sched_clock();
+ blkio_mark_blkg_waiting(&blkg->stats);
+}
+
+/* This should be called with the blkg->stats_lock held. */
+static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
+{
+ unsigned long long now;
+
+ if (!blkio_blkg_waiting(stats))
+ return;
+
+ now = sched_clock();
+ if (time_after64(now, stats->start_group_wait_time))
+ stats->group_wait_time += now - stats->start_group_wait_time;
+ blkio_clear_blkg_waiting(stats);
+}
+
+/* This should be called with the blkg->stats_lock held. */
+static void blkio_end_empty_time(struct blkio_group_stats *stats)
+{
+ unsigned long long now;
+
+ if (!blkio_blkg_empty(stats))
+ return;
+
+ now = sched_clock();
+ if (time_after64(now, stats->start_empty_time))
+ stats->empty_time += now - stats->start_empty_time;
+ blkio_clear_blkg_empty(stats);
+}
+
+void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&blkg->stats_lock, flags);
+ BUG_ON(blkio_blkg_idling(&blkg->stats));
+ blkg->stats.start_idle_time = sched_clock();
+ blkio_mark_blkg_idling(&blkg->stats);
+ spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);
+
+void blkiocg_update_idle_time_stats(struct blkio_group *blkg)
+{
+ unsigned long flags;
+ unsigned long long now;
+ struct blkio_group_stats *stats;
+
+ spin_lock_irqsave(&blkg->stats_lock, flags);
+ stats = &blkg->stats;
+ if (blkio_blkg_idling(stats)) {
+ now = sched_clock();
+ if (time_after64(now, stats->start_idle_time))
+ stats->idle_time += now - stats->start_idle_time;
+ blkio_clear_blkg_idling(stats);
+ }
+ spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);
+
+void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg)
+{
+ unsigned long flags;
+ struct blkio_group_stats *stats;
+
+ spin_lock_irqsave(&blkg->stats_lock, flags);
+ stats = &blkg->stats;
+ stats->avg_queue_size_sum +=
+ stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
+ stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
+ stats->avg_queue_size_samples++;
+ blkio_update_group_wait_time(stats);
+ spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);
+
+void blkiocg_set_start_empty_time(struct blkio_group *blkg)
+{
+ unsigned long flags;
+ struct blkio_group_stats *stats;
+
+ spin_lock_irqsave(&blkg->stats_lock, flags);
+ stats = &blkg->stats;
+
+ if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
+ stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) {
+ spin_unlock_irqrestore(&blkg->stats_lock, flags);
+ return;
+ }
+
+ /*
+ * group is already marked empty. This can happen if cfqq got new
+ * request in parent group and moved to this group while being added
+ * to service tree. Just ignore the event and move on.
+ */
+ if(blkio_blkg_empty(stats)) {
+ spin_unlock_irqrestore(&blkg->stats_lock, flags);
+ return;
+ }
+
+ stats->start_empty_time = sched_clock();
+ blkio_mark_blkg_empty(stats);
+ spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);
+
+void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
+ unsigned long dequeue)
+{
+ blkg->stats.dequeue += dequeue;
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
+#else
+static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
+ struct blkio_group *curr_blkg) {}
+static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {}
+#endif
+
+void blkiocg_update_io_add_stats(struct blkio_group *blkg,
+ struct blkio_group *curr_blkg, bool direction,
+ bool sync)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&blkg->stats_lock, flags);
+ blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction,
+ sync);
+ blkio_end_empty_time(&blkg->stats);
+ blkio_set_start_group_wait_time(blkg, curr_blkg);
+ spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);
+
+void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
+ bool direction, bool sync)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&blkg->stats_lock, flags);
+ blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED],
+ direction, sync);
+ spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);
+
+void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&blkg->stats_lock, flags);
+ blkg->stats.time += time;
+ spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
+
+void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
+ uint64_t bytes, bool direction, bool sync)
+{
+ struct blkio_group_stats *stats;
+ unsigned long flags;
+
+ spin_lock_irqsave(&blkg->stats_lock, flags);
+ stats = &blkg->stats;
+ stats->sectors += bytes >> 9;
+ blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, direction,
+ sync);
+ blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_BYTES], bytes,
+ direction, sync);
+ spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);
+
+void blkiocg_update_completion_stats(struct blkio_group *blkg,
+ uint64_t start_time, uint64_t io_start_time, bool direction, bool sync)
{
- blkg->time += time;
- blkg->sectors += sectors;
+ struct blkio_group_stats *stats;
+ unsigned long flags;
+ unsigned long long now = sched_clock();
+
+ spin_lock_irqsave(&blkg->stats_lock, flags);
+ stats = &blkg->stats;
+ if (time_after64(now, io_start_time))
+ blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
+ now - io_start_time, direction, sync);
+ if (time_after64(io_start_time, start_time))
+ blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
+ io_start_time - start_time, direction, sync);
+ spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
-EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_stats);
+EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);
+
+void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
+ bool sync)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&blkg->stats_lock, flags);
+ blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_MERGED], 1, direction,
+ sync);
+ spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
struct blkio_group *blkg, void *key, dev_t dev)
@@ -58,14 +346,13 @@ void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
unsigned long flags;
spin_lock_irqsave(&blkcg->lock, flags);
+ spin_lock_init(&blkg->stats_lock);
rcu_assign_pointer(blkg->key, key);
blkg->blkcg_id = css_id(&blkcg->css);
hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
spin_unlock_irqrestore(&blkcg->lock, flags);
-#ifdef CONFIG_DEBUG_BLK_CGROUP
/* Need to take css reference ? */
cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
-#endif
blkg->dev = dev;
}
EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group);
@@ -89,17 +376,16 @@ int blkiocg_del_blkio_group(struct blkio_group *blkg)
rcu_read_lock();
css = css_lookup(&blkio_subsys, blkg->blkcg_id);
- if (!css)
- goto out;
-
- blkcg = container_of(css, struct blkio_cgroup, css);
- spin_lock_irqsave(&blkcg->lock, flags);
- if (!hlist_unhashed(&blkg->blkcg_node)) {
- __blkiocg_del_blkio_group(blkg);
- ret = 0;
+ if (css) {
+ blkcg = container_of(css, struct blkio_cgroup, css);
+ spin_lock_irqsave(&blkcg->lock, flags);
+ if (!hlist_unhashed(&blkg->blkcg_node)) {
+ __blkiocg_del_blkio_group(blkg);
+ ret = 0;
+ }
+ spin_unlock_irqrestore(&blkcg->lock, flags);
}
- spin_unlock_irqrestore(&blkcg->lock, flags);
-out:
+
rcu_read_unlock();
return ret;
}
@@ -142,31 +428,179 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
struct blkio_group *blkg;
struct hlist_node *n;
struct blkio_policy_type *blkiop;
+ struct blkio_policy_node *pn;
if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
return -EINVAL;
blkcg = cgroup_to_blkio_cgroup(cgroup);
+ spin_lock(&blkio_list_lock);
spin_lock_irq(&blkcg->lock);
blkcg->weight = (unsigned int)val;
+
hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
- spin_lock(&blkio_list_lock);
+ pn = blkio_policy_search_node(blkcg, blkg->dev);
+
+ if (pn)
+ continue;
+
list_for_each_entry(blkiop, &blkio_list, list)
blkiop->ops.blkio_update_group_weight_fn(blkg,
blkcg->weight);
- spin_unlock(&blkio_list_lock);
}
spin_unlock_irq(&blkcg->lock);
+ spin_unlock(&blkio_list_lock);
return 0;
}
-#define SHOW_FUNCTION_PER_GROUP(__VAR) \
+static int
+blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
+{
+ struct blkio_cgroup *blkcg;
+ struct blkio_group *blkg;
+ struct blkio_group_stats *stats;
+ struct hlist_node *n;
+ uint64_t queued[BLKIO_STAT_TOTAL];
+ int i;
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+ bool idling, waiting, empty;
+ unsigned long long now = sched_clock();
+#endif
+
+ blkcg = cgroup_to_blkio_cgroup(cgroup);
+ spin_lock_irq(&blkcg->lock);
+ hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
+ spin_lock(&blkg->stats_lock);
+ stats = &blkg->stats;
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+ idling = blkio_blkg_idling(stats);
+ waiting = blkio_blkg_waiting(stats);
+ empty = blkio_blkg_empty(stats);
+#endif
+ for (i = 0; i < BLKIO_STAT_TOTAL; i++)
+ queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i];
+ memset(stats, 0, sizeof(struct blkio_group_stats));
+ for (i = 0; i < BLKIO_STAT_TOTAL; i++)
+ stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i];
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+ if (idling) {
+ blkio_mark_blkg_idling(stats);
+ stats->start_idle_time = now;
+ }
+ if (waiting) {
+ blkio_mark_blkg_waiting(stats);
+ stats->start_group_wait_time = now;
+ }
+ if (empty) {
+ blkio_mark_blkg_empty(stats);
+ stats->start_empty_time = now;
+ }
+#endif
+ spin_unlock(&blkg->stats_lock);
+ }
+ spin_unlock_irq(&blkcg->lock);
+ return 0;
+}
+
+static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str,
+ int chars_left, bool diskname_only)
+{
+ snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev));
+ chars_left -= strlen(str);
+ if (chars_left <= 0) {
+ printk(KERN_WARNING
+ "Possibly incorrect cgroup stat display format");
+ return;
+ }
+ if (diskname_only)
+ return;
+ switch (type) {
+ case BLKIO_STAT_READ:
+ strlcat(str, " Read", chars_left);
+ break;
+ case BLKIO_STAT_WRITE:
+ strlcat(str, " Write", chars_left);
+ break;
+ case BLKIO_STAT_SYNC:
+ strlcat(str, " Sync", chars_left);
+ break;
+ case BLKIO_STAT_ASYNC:
+ strlcat(str, " Async", chars_left);
+ break;
+ case BLKIO_STAT_TOTAL:
+ strlcat(str, " Total", chars_left);
+ break;
+ default:
+ strlcat(str, " Invalid", chars_left);
+ }
+}
+
+static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
+ struct cgroup_map_cb *cb, dev_t dev)
+{
+ blkio_get_key_name(0, dev, str, chars_left, true);
+ cb->fill(cb, str, val);
+ return val;
+}
+
+/* This should be called with blkg->stats_lock held */
+static uint64_t blkio_get_stat(struct blkio_group *blkg,
+ struct cgroup_map_cb *cb, dev_t dev, enum stat_type type)
+{
+ uint64_t disk_total;
+ char key_str[MAX_KEY_LEN];
+ enum stat_sub_type sub_type;
+
+ if (type == BLKIO_STAT_TIME)
+ return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
+ blkg->stats.time, cb, dev);
+ if (type == BLKIO_STAT_SECTORS)
+ return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
+ blkg->stats.sectors, cb, dev);
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+ if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
+ uint64_t sum = blkg->stats.avg_queue_size_sum;
+ uint64_t samples = blkg->stats.avg_queue_size_samples;
+ if (samples)
+ do_div(sum, samples);
+ else
+ sum = 0;
+ return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev);
+ }
+ if (type == BLKIO_STAT_GROUP_WAIT_TIME)
+ return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
+ blkg->stats.group_wait_time, cb, dev);
+ if (type == BLKIO_STAT_IDLE_TIME)
+ return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
+ blkg->stats.idle_time, cb, dev);
+ if (type == BLKIO_STAT_EMPTY_TIME)
+ return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
+ blkg->stats.empty_time, cb, dev);
+ if (type == BLKIO_STAT_DEQUEUE)
+ return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
+ blkg->stats.dequeue, cb, dev);
+#endif
+
+ for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
+ sub_type++) {
+ blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
+ cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]);
+ }
+ disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] +
+ blkg->stats.stat_arr[type][BLKIO_STAT_WRITE];
+ blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
+ cb->fill(cb, key_str, disk_total);
+ return disk_total;
+}
+
+#define SHOW_FUNCTION_PER_GROUP(__VAR, type, show_total) \
static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \
- struct cftype *cftype, struct seq_file *m) \
+ struct cftype *cftype, struct cgroup_map_cb *cb) \
{ \
struct blkio_cgroup *blkcg; \
struct blkio_group *blkg; \
struct hlist_node *n; \
+ uint64_t cgroup_total = 0; \
\
if (!cgroup_lock_live_group(cgroup)) \
return -ENODEV; \
@@ -174,50 +608,293 @@ static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \
blkcg = cgroup_to_blkio_cgroup(cgroup); \
rcu_read_lock(); \
hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\
- if (blkg->dev) \
- seq_printf(m, "%u:%u %lu\n", MAJOR(blkg->dev), \
- MINOR(blkg->dev), blkg->__VAR); \
+ if (blkg->dev) { \
+ spin_lock_irq(&blkg->stats_lock); \
+ cgroup_total += blkio_get_stat(blkg, cb, \
+ blkg->dev, type); \
+ spin_unlock_irq(&blkg->stats_lock); \
+ } \
} \
+ if (show_total) \
+ cb->fill(cb, "Total", cgroup_total); \
rcu_read_unlock(); \
cgroup_unlock(); \
return 0; \
}
-SHOW_FUNCTION_PER_GROUP(time);
-SHOW_FUNCTION_PER_GROUP(sectors);
+SHOW_FUNCTION_PER_GROUP(time, BLKIO_STAT_TIME, 0);
+SHOW_FUNCTION_PER_GROUP(sectors, BLKIO_STAT_SECTORS, 0);
+SHOW_FUNCTION_PER_GROUP(io_service_bytes, BLKIO_STAT_SERVICE_BYTES, 1);
+SHOW_FUNCTION_PER_GROUP(io_serviced, BLKIO_STAT_SERVICED, 1);
+SHOW_FUNCTION_PER_GROUP(io_service_time, BLKIO_STAT_SERVICE_TIME, 1);
+SHOW_FUNCTION_PER_GROUP(io_wait_time, BLKIO_STAT_WAIT_TIME, 1);
+SHOW_FUNCTION_PER_GROUP(io_merged, BLKIO_STAT_MERGED, 1);
+SHOW_FUNCTION_PER_GROUP(io_queued, BLKIO_STAT_QUEUED, 1);
#ifdef CONFIG_DEBUG_BLK_CGROUP
-SHOW_FUNCTION_PER_GROUP(dequeue);
+SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0);
+SHOW_FUNCTION_PER_GROUP(avg_queue_size, BLKIO_STAT_AVG_QUEUE_SIZE, 0);
+SHOW_FUNCTION_PER_GROUP(group_wait_time, BLKIO_STAT_GROUP_WAIT_TIME, 0);
+SHOW_FUNCTION_PER_GROUP(idle_time, BLKIO_STAT_IDLE_TIME, 0);
+SHOW_FUNCTION_PER_GROUP(empty_time, BLKIO_STAT_EMPTY_TIME, 0);
#endif
#undef SHOW_FUNCTION_PER_GROUP
-#ifdef CONFIG_DEBUG_BLK_CGROUP
-void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg,
- unsigned long dequeue)
+static int blkio_check_dev_num(dev_t dev)
{
- blkg->dequeue += dequeue;
+ int part = 0;
+ struct gendisk *disk;
+
+ disk = get_gendisk(dev, &part);
+ if (!disk || part)
+ return -ENODEV;
+
+ return 0;
+}
+
+static int blkio_policy_parse_and_set(char *buf,
+ struct blkio_policy_node *newpn)
+{
+ char *s[4], *p, *major_s = NULL, *minor_s = NULL;
+ int ret;
+ unsigned long major, minor, temp;
+ int i = 0;
+ dev_t dev;
+
+ memset(s, 0, sizeof(s));
+
+ while ((p = strsep(&buf, " ")) != NULL) {
+ if (!*p)
+ continue;
+
+ s[i++] = p;
+
+ /* Prevent from inputing too many things */
+ if (i == 3)
+ break;
+ }
+
+ if (i != 2)
+ return -EINVAL;
+
+ p = strsep(&s[0], ":");
+ if (p != NULL)
+ major_s = p;
+ else
+ return -EINVAL;
+
+ minor_s = s[0];
+ if (!minor_s)
+ return -EINVAL;
+
+ ret = strict_strtoul(major_s, 10, &major);
+ if (ret)
+ return -EINVAL;
+
+ ret = strict_strtoul(minor_s, 10, &minor);
+ if (ret)
+ return -EINVAL;
+
+ dev = MKDEV(major, minor);
+
+ ret = blkio_check_dev_num(dev);
+ if (ret)
+ return ret;
+
+ newpn->dev = dev;
+
+ if (s[1] == NULL)
+ return -EINVAL;
+
+ ret = strict_strtoul(s[1], 10, &temp);
+ if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) ||
+ temp > BLKIO_WEIGHT_MAX)
+ return -EINVAL;
+
+ newpn->weight = temp;
+
+ return 0;
+}
+
+unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
+ dev_t dev)
+{
+ struct blkio_policy_node *pn;
+
+ pn = blkio_policy_search_node(blkcg, dev);
+ if (pn)
+ return pn->weight;
+ else
+ return blkcg->weight;
+}
+EXPORT_SYMBOL_GPL(blkcg_get_weight);
+
+
+static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft,
+ const char *buffer)
+{
+ int ret = 0;
+ char *buf;
+ struct blkio_policy_node *newpn, *pn;
+ struct blkio_cgroup *blkcg;
+ struct blkio_group *blkg;
+ int keep_newpn = 0;
+ struct hlist_node *n;
+ struct blkio_policy_type *blkiop;
+
+ buf = kstrdup(buffer, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ newpn = kzalloc(sizeof(*newpn), GFP_KERNEL);
+ if (!newpn) {
+ ret = -ENOMEM;
+ goto free_buf;
+ }
+
+ ret = blkio_policy_parse_and_set(buf, newpn);
+ if (ret)
+ goto free_newpn;
+
+ blkcg = cgroup_to_blkio_cgroup(cgrp);
+
+ spin_lock_irq(&blkcg->lock);
+
+ pn = blkio_policy_search_node(blkcg, newpn->dev);
+ if (!pn) {
+ if (newpn->weight != 0) {
+ blkio_policy_insert_node(blkcg, newpn);
+ keep_newpn = 1;
+ }
+ spin_unlock_irq(&blkcg->lock);
+ goto update_io_group;
+ }
+
+ if (newpn->weight == 0) {
+ /* weight == 0 means deleteing a specific weight */
+ blkio_policy_delete_node(pn);
+ spin_unlock_irq(&blkcg->lock);
+ goto update_io_group;
+ }
+ spin_unlock_irq(&blkcg->lock);
+
+ pn->weight = newpn->weight;
+
+update_io_group:
+ /* update weight for each cfqg */
+ spin_lock(&blkio_list_lock);
+ spin_lock_irq(&blkcg->lock);
+
+ hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
+ if (newpn->dev == blkg->dev) {
+ list_for_each_entry(blkiop, &blkio_list, list)
+ blkiop->ops.blkio_update_group_weight_fn(blkg,
+ newpn->weight ?
+ newpn->weight :
+ blkcg->weight);
+ }
+ }
+
+ spin_unlock_irq(&blkcg->lock);
+ spin_unlock(&blkio_list_lock);
+
+free_newpn:
+ if (!keep_newpn)
+ kfree(newpn);
+free_buf:
+ kfree(buf);
+ return ret;
+}
+
+static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft,
+ struct seq_file *m)
+{
+ struct blkio_cgroup *blkcg;
+ struct blkio_policy_node *pn;
+
+ seq_printf(m, "dev\tweight\n");
+
+ blkcg = cgroup_to_blkio_cgroup(cgrp);
+ if (!list_empty(&blkcg->policy_list)) {
+ spin_lock_irq(&blkcg->lock);
+ list_for_each_entry(pn, &blkcg->policy_list, node) {
+ seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
+ MINOR(pn->dev), pn->weight);
+ }
+ spin_unlock_irq(&blkcg->lock);
+ }
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_dequeue_stats);
-#endif
struct cftype blkio_files[] = {
{
+ .name = "weight_device",
+ .read_seq_string = blkiocg_weight_device_read,
+ .write_string = blkiocg_weight_device_write,
+ .max_write_len = 256,
+ },
+ {
.name = "weight",
.read_u64 = blkiocg_weight_read,
.write_u64 = blkiocg_weight_write,
},
{
.name = "time",
- .read_seq_string = blkiocg_time_read,
+ .read_map = blkiocg_time_read,
},
{
.name = "sectors",
- .read_seq_string = blkiocg_sectors_read,
+ .read_map = blkiocg_sectors_read,
+ },
+ {
+ .name = "io_service_bytes",
+ .read_map = blkiocg_io_service_bytes_read,
+ },
+ {
+ .name = "io_serviced",
+ .read_map = blkiocg_io_serviced_read,
+ },
+ {
+ .name = "io_service_time",
+ .read_map = blkiocg_io_service_time_read,
+ },
+ {
+ .name = "io_wait_time",
+ .read_map = blkiocg_io_wait_time_read,
+ },
+ {
+ .name = "io_merged",
+ .read_map = blkiocg_io_merged_read,
+ },
+ {
+ .name = "io_queued",
+ .read_map = blkiocg_io_queued_read,
+ },
+ {
+ .name = "reset_stats",
+ .write_u64 = blkiocg_reset_stats,
},
#ifdef CONFIG_DEBUG_BLK_CGROUP
- {
+ {
+ .name = "avg_queue_size",
+ .read_map = blkiocg_avg_queue_size_read,
+ },
+ {
+ .name = "group_wait_time",
+ .read_map = blkiocg_group_wait_time_read,
+ },
+ {
+ .name = "idle_time",
+ .read_map = blkiocg_idle_time_read,
+ },
+ {
+ .name = "empty_time",
+ .read_map = blkiocg_empty_time_read,
+ },
+ {
.name = "dequeue",
- .read_seq_string = blkiocg_dequeue_read,
- },
+ .read_map = blkiocg_dequeue_read,
+ },
#endif
};
@@ -234,56 +911,62 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
struct blkio_group *blkg;
void *key;
struct blkio_policy_type *blkiop;
+ struct blkio_policy_node *pn, *pntmp;
rcu_read_lock();
-remove_entry:
- spin_lock_irqsave(&blkcg->lock, flags);
+ do {
+ spin_lock_irqsave(&blkcg->lock, flags);
+
+ if (hlist_empty(&blkcg->blkg_list)) {
+ spin_unlock_irqrestore(&blkcg->lock, flags);
+ break;
+ }
+
+ blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
+ blkcg_node);
+ key = rcu_dereference(blkg->key);
+ __blkiocg_del_blkio_group(blkg);
- if (hlist_empty(&blkcg->blkg_list)) {
spin_unlock_irqrestore(&blkcg->lock, flags);
- goto done;
- }
- blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
- blkcg_node);
- key = rcu_dereference(blkg->key);
- __blkiocg_del_blkio_group(blkg);
+ /*
+ * This blkio_group is being unlinked as associated cgroup is
+ * going away. Let all the IO controlling policies know about
+ * this event. Currently this is static call to one io
+ * controlling policy. Once we have more policies in place, we
+ * need some dynamic registration of callback function.
+ */
+ spin_lock(&blkio_list_lock);
+ list_for_each_entry(blkiop, &blkio_list, list)
+ blkiop->ops.blkio_unlink_group_fn(key, blkg);
+ spin_unlock(&blkio_list_lock);
+ } while (1);
- spin_unlock_irqrestore(&blkcg->lock, flags);
+ list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) {
+ blkio_policy_delete_node(pn);
+ kfree(pn);
+ }
- /*
- * This blkio_group is being unlinked as associated cgroup is going
- * away. Let all the IO controlling policies know about this event.
- *
- * Currently this is static call to one io controlling policy. Once
- * we have more policies in place, we need some dynamic registration
- * of callback function.
- */
- spin_lock(&blkio_list_lock);
- list_for_each_entry(blkiop, &blkio_list, list)
- blkiop->ops.blkio_unlink_group_fn(key, blkg);
- spin_unlock(&blkio_list_lock);
- goto remove_entry;
-done:
free_css_id(&blkio_subsys, &blkcg->css);
rcu_read_unlock();
- kfree(blkcg);
+ if (blkcg != &blkio_root_cgroup)
+ kfree(blkcg);
}
static struct cgroup_subsys_state *
blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
- struct blkio_cgroup *blkcg, *parent_blkcg;
+ struct blkio_cgroup *blkcg;
+ struct cgroup *parent = cgroup->parent;
- if (!cgroup->parent) {
+ if (!parent) {
blkcg = &blkio_root_cgroup;
goto done;
}
/* Currently we do not support hierarchy deeper than two level (0,1) */
- parent_blkcg = cgroup_to_blkio_cgroup(cgroup->parent);
- if (css_depth(&parent_blkcg->css) > 0)
- return ERR_PTR(-EINVAL);
+ if (parent != cgroup->top_cgroup)
+ return ERR_PTR(-EPERM);
blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
if (!blkcg)
@@ -294,6 +977,7 @@ done:
spin_lock_init(&blkcg->lock);
INIT_HLIST_HEAD(&blkcg->blkg_list);
+ INIT_LIST_HEAD(&blkcg->policy_list);
return &blkcg->css;
}
@@ -333,17 +1017,6 @@ static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup,
task_unlock(tsk);
}
-struct cgroup_subsys blkio_subsys = {
- .name = "blkio",
- .create = blkiocg_create,
- .can_attach = blkiocg_can_attach,
- .attach = blkiocg_attach,
- .destroy = blkiocg_destroy,
- .populate = blkiocg_populate,
- .subsys_id = blkio_subsys_id,
- .use_id = 1,
-};
-
void blkio_policy_register(struct blkio_policy_type *blkiop)
{
spin_lock(&blkio_list_lock);
@@ -359,3 +1032,17 @@ void blkio_policy_unregister(struct blkio_policy_type *blkiop)
spin_unlock(&blkio_list_lock);
}
EXPORT_SYMBOL_GPL(blkio_policy_unregister);
+
+static int __init init_cgroup_blkio(void)
+{
+ return cgroup_load_subsys(&blkio_subsys);
+}
+
+static void __exit exit_cgroup_blkio(void)
+{
+ cgroup_unload_subsys(&blkio_subsys);
+}
+
+module_init(init_cgroup_blkio);
+module_exit(exit_cgroup_blkio);
+MODULE_LICENSE("GPL");
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 4d316df863b4..2b866ec1dcea 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -15,13 +15,92 @@
#include <linux/cgroup.h>
-#ifdef CONFIG_BLK_CGROUP
+#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
+
+#ifndef CONFIG_BLK_CGROUP
+/* When blk-cgroup is a module, its subsys_id isn't a compile-time constant */
+extern struct cgroup_subsys blkio_subsys;
+#define blkio_subsys_id blkio_subsys.subsys_id
+#endif
+
+enum stat_type {
+ /* Total time spent (in ns) between request dispatch to the driver and
+ * request completion for IOs doen by this cgroup. This may not be
+ * accurate when NCQ is turned on. */
+ BLKIO_STAT_SERVICE_TIME = 0,
+ /* Total bytes transferred */
+ BLKIO_STAT_SERVICE_BYTES,
+ /* Total IOs serviced, post merge */
+ BLKIO_STAT_SERVICED,
+ /* Total time spent waiting in scheduler queue in ns */
+ BLKIO_STAT_WAIT_TIME,
+ /* Number of IOs merged */
+ BLKIO_STAT_MERGED,
+ /* Number of IOs queued up */
+ BLKIO_STAT_QUEUED,
+ /* All the single valued stats go below this */
+ BLKIO_STAT_TIME,
+ BLKIO_STAT_SECTORS,
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+ BLKIO_STAT_AVG_QUEUE_SIZE,
+ BLKIO_STAT_IDLE_TIME,
+ BLKIO_STAT_EMPTY_TIME,
+ BLKIO_STAT_GROUP_WAIT_TIME,
+ BLKIO_STAT_DEQUEUE
+#endif
+};
+
+enum stat_sub_type {
+ BLKIO_STAT_READ = 0,
+ BLKIO_STAT_WRITE,
+ BLKIO_STAT_SYNC,
+ BLKIO_STAT_ASYNC,
+ BLKIO_STAT_TOTAL
+};
+
+/* blkg state flags */
+enum blkg_state_flags {
+ BLKG_waiting = 0,
+ BLKG_idling,
+ BLKG_empty,
+};
struct blkio_cgroup {
struct cgroup_subsys_state css;
unsigned int weight;
spinlock_t lock;
struct hlist_head blkg_list;
+ struct list_head policy_list; /* list of blkio_policy_node */
+};
+
+struct blkio_group_stats {
+ /* total disk time and nr sectors dispatched by this group */
+ uint64_t time;
+ uint64_t sectors;
+ uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL];
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+ /* Sum of number of IOs queued across all samples */
+ uint64_t avg_queue_size_sum;
+ /* Count of samples taken for average */
+ uint64_t avg_queue_size_samples;
+ /* How many times this group has been removed from service tree */
+ unsigned long dequeue;
+
+ /* Total time spent waiting for it to be assigned a timeslice. */
+ uint64_t group_wait_time;
+ uint64_t start_group_wait_time;
+
+ /* Time spent idling for this blkio_group */
+ uint64_t idle_time;
+ uint64_t start_idle_time;
+ /*
+ * Total time when we have requests queued and do not contain the
+ * current active queue.
+ */
+ uint64_t empty_time;
+ uint64_t start_empty_time;
+ uint16_t flags;
+#endif
};
struct blkio_group {
@@ -29,22 +108,24 @@ struct blkio_group {
void *key;
struct hlist_node blkcg_node;
unsigned short blkcg_id;
-#ifdef CONFIG_DEBUG_BLK_CGROUP
/* Store cgroup path */
char path[128];
- /* How many times this group has been removed from service tree */
- unsigned long dequeue;
-#endif
/* The device MKDEV(major, minor), this group has been created for */
- dev_t dev;
+ dev_t dev;
- /* total disk time and nr sectors dispatched by this group */
- unsigned long time;
- unsigned long sectors;
+ /* Need to serialize the stats in the case of reset/update */
+ spinlock_t stats_lock;
+ struct blkio_group_stats stats;
+};
+
+struct blkio_policy_node {
+ struct list_head node;
+ dev_t dev;
+ unsigned int weight;
};
-extern bool blkiocg_css_tryget(struct blkio_cgroup *blkcg);
-extern void blkiocg_css_put(struct blkio_cgroup *blkcg);
+extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
+ dev_t dev);
typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg);
typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg,
@@ -64,6 +145,11 @@ struct blkio_policy_type {
extern void blkio_policy_register(struct blkio_policy_type *);
extern void blkio_policy_unregister(struct blkio_policy_type *);
+static inline char *blkg_path(struct blkio_group *blkg)
+{
+ return blkg->path;
+}
+
#else
struct blkio_group {
@@ -75,6 +161,8 @@ struct blkio_policy_type {
static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { }
static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { }
+static inline char *blkg_path(struct blkio_group *blkg) { return NULL; }
+
#endif
#define BLKIO_WEIGHT_MIN 100
@@ -82,19 +170,45 @@ static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { }
#define BLKIO_WEIGHT_DEFAULT 500
#ifdef CONFIG_DEBUG_BLK_CGROUP
-static inline char *blkg_path(struct blkio_group *blkg)
-{
- return blkg->path;
-}
-void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg,
+void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg);
+void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
unsigned long dequeue);
+void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg);
+void blkiocg_update_idle_time_stats(struct blkio_group *blkg);
+void blkiocg_set_start_empty_time(struct blkio_group *blkg);
+
+#define BLKG_FLAG_FNS(name) \
+static inline void blkio_mark_blkg_##name( \
+ struct blkio_group_stats *stats) \
+{ \
+ stats->flags |= (1 << BLKG_##name); \
+} \
+static inline void blkio_clear_blkg_##name( \
+ struct blkio_group_stats *stats) \
+{ \
+ stats->flags &= ~(1 << BLKG_##name); \
+} \
+static inline int blkio_blkg_##name(struct blkio_group_stats *stats) \
+{ \
+ return (stats->flags & (1 << BLKG_##name)) != 0; \
+} \
+
+BLKG_FLAG_FNS(waiting)
+BLKG_FLAG_FNS(idling)
+BLKG_FLAG_FNS(empty)
+#undef BLKG_FLAG_FNS
#else
-static inline char *blkg_path(struct blkio_group *blkg) { return NULL; }
-static inline void blkiocg_update_blkio_group_dequeue_stats(
- struct blkio_group *blkg, unsigned long dequeue) {}
+static inline void blkiocg_update_avg_queue_size_stats(
+ struct blkio_group *blkg) {}
+static inline void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
+ unsigned long dequeue) {}
+static inline void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
+{}
+static inline void blkiocg_update_idle_time_stats(struct blkio_group *blkg) {}
+static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {}
#endif
-#ifdef CONFIG_BLK_CGROUP
+#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
extern struct blkio_cgroup blkio_root_cgroup;
extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup);
extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
@@ -102,26 +216,43 @@ extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
void *key);
-void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
- unsigned long time, unsigned long sectors);
+void blkiocg_update_timeslice_used(struct blkio_group *blkg,
+ unsigned long time);
+void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes,
+ bool direction, bool sync);
+void blkiocg_update_completion_stats(struct blkio_group *blkg,
+ uint64_t start_time, uint64_t io_start_time, bool direction, bool sync);
+void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
+ bool sync);
+void blkiocg_update_io_add_stats(struct blkio_group *blkg,
+ struct blkio_group *curr_blkg, bool direction, bool sync);
+void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
+ bool direction, bool sync);
#else
struct cgroup;
static inline struct blkio_cgroup *
cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; }
static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
- struct blkio_group *blkg, void *key, dev_t dev)
-{
-}
+ struct blkio_group *blkg, void *key, dev_t dev) {}
static inline int
blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
static inline struct blkio_group *
blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; }
-static inline void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
- unsigned long time, unsigned long sectors)
-{
-}
+static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg,
+ unsigned long time) {}
+static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
+ uint64_t bytes, bool direction, bool sync) {}
+static inline void blkiocg_update_completion_stats(struct blkio_group *blkg,
+ uint64_t start_time, uint64_t io_start_time, bool direction,
+ bool sync) {}
+static inline void blkiocg_update_io_merged_stats(struct blkio_group *blkg,
+ bool direction, bool sync) {}
+static inline void blkiocg_update_io_add_stats(struct blkio_group *blkg,
+ struct blkio_group *curr_blkg, bool direction, bool sync) {}
+static inline void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
+ bool direction, bool sync) {}
#endif
#endif /* _BLK_CGROUP_H */
diff --git a/block/blk-core.c b/block/blk-core.c
index 718897e6d37f..32a1c123dfb3 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -127,6 +127,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
rq->tag = -1;
rq->ref_count = 1;
rq->start_time = jiffies;
+ set_start_time_ns(rq);
}
EXPORT_SYMBOL(blk_rq_init);
@@ -183,7 +184,7 @@ void blk_dump_rq_flags(struct request *rq, char *msg)
printk(KERN_INFO " bio %p, biotail %p, buffer %p, len %u\n",
rq->bio, rq->biotail, rq->buffer, blk_rq_bytes(rq));
- if (blk_pc_request(rq)) {
+ if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
printk(KERN_INFO " cdb: ");
for (bit = 0; bit < BLK_MAX_CDB; bit++)
printk("%02x ", rq->cmd[bit]);
@@ -450,6 +451,7 @@ void blk_cleanup_queue(struct request_queue *q)
*/
blk_sync_queue(q);
+ del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer);
mutex_lock(&q->sysfs_lock);
queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q);
mutex_unlock(&q->sysfs_lock);
@@ -465,6 +467,9 @@ static int blk_init_free_list(struct request_queue *q)
{
struct request_list *rl = &q->rq;
+ if (unlikely(rl->rq_pool))
+ return 0;
+
rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0;
rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0;
rl->elvpriv = 0;
@@ -510,6 +515,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
return NULL;
}
+ setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
+ laptop_mode_timer_fn, (unsigned long) q);
init_timer(&q->unplug_timer);
setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
INIT_LIST_HEAD(&q->timeout_list);
@@ -566,19 +573,42 @@ EXPORT_SYMBOL(blk_init_queue);
struct request_queue *
blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
{
- struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id);
+ struct request_queue *uninit_q, *q;
+
+ uninit_q = blk_alloc_queue_node(GFP_KERNEL, node_id);
+ if (!uninit_q)
+ return NULL;
+
+ q = blk_init_allocated_queue_node(uninit_q, rfn, lock, node_id);
+ if (!q)
+ blk_cleanup_queue(uninit_q);
+
+ return q;
+}
+EXPORT_SYMBOL(blk_init_queue_node);
+
+struct request_queue *
+blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
+ spinlock_t *lock)
+{
+ return blk_init_allocated_queue_node(q, rfn, lock, -1);
+}
+EXPORT_SYMBOL(blk_init_allocated_queue);
+struct request_queue *
+blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn,
+ spinlock_t *lock, int node_id)
+{
if (!q)
return NULL;
q->node = node_id;
- if (blk_init_free_list(q)) {
- kmem_cache_free(blk_requestq_cachep, q);
+ if (blk_init_free_list(q))
return NULL;
- }
q->request_fn = rfn;
q->prep_rq_fn = NULL;
+ q->unprep_rq_fn = NULL;
q->unplug_fn = generic_unplug_device;
q->queue_flags = QUEUE_FLAG_DEFAULT;
q->queue_lock = lock;
@@ -598,10 +628,9 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
return q;
}
- blk_put_queue(q);
return NULL;
}
-EXPORT_SYMBOL(blk_init_queue_node);
+EXPORT_SYMBOL(blk_init_allocated_queue_node);
int blk_get_queue(struct request_queue *q)
{
@@ -1107,33 +1136,46 @@ void blk_put_request(struct request *req)
}
EXPORT_SYMBOL(blk_put_request);
+/**
+ * blk_add_request_payload - add a payload to a request
+ * @rq: request to update
+ * @page: page backing the payload
+ * @len: length of the payload.
+ *
+ * This allows to later add a payload to an already submitted request by
+ * a block driver. The driver needs to take care of freeing the payload
+ * itself.
+ *
+ * Note that this is a quite horrible hack and nothing but handling of
+ * discard requests should ever use it.
+ */
+void blk_add_request_payload(struct request *rq, struct page *page,
+ unsigned int len)
+{
+ struct bio *bio = rq->bio;
+
+ bio->bi_io_vec->bv_page = page;
+ bio->bi_io_vec->bv_offset = 0;
+ bio->bi_io_vec->bv_len = len;
+
+ bio->bi_size = len;
+ bio->bi_vcnt = 1;
+ bio->bi_phys_segments = 1;
+
+ rq->__data_len = rq->resid_len = len;
+ rq->nr_phys_segments = 1;
+ rq->buffer = bio_data(bio);
+}
+EXPORT_SYMBOL_GPL(blk_add_request_payload);
+
void init_request_from_bio(struct request *req, struct bio *bio)
{
req->cpu = bio->bi_comp_cpu;
req->cmd_type = REQ_TYPE_FS;
- /*
- * Inherit FAILFAST from bio (for read-ahead, and explicit
- * FAILFAST). FAILFAST flags are identical for req and bio.
- */
- if (bio_rw_flagged(bio, BIO_RW_AHEAD))
+ req->cmd_flags |= bio->bi_rw & REQ_COMMON_MASK;
+ if (bio->bi_rw & REQ_RAHEAD)
req->cmd_flags |= REQ_FAILFAST_MASK;
- else
- req->cmd_flags |= bio->bi_rw & REQ_FAILFAST_MASK;
-
- if (unlikely(bio_rw_flagged(bio, BIO_RW_DISCARD))) {
- req->cmd_flags |= REQ_DISCARD;
- if (bio_rw_flagged(bio, BIO_RW_BARRIER))
- req->cmd_flags |= REQ_SOFTBARRIER;
- } else if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER)))
- req->cmd_flags |= REQ_HARDBARRIER;
-
- if (bio_rw_flagged(bio, BIO_RW_SYNCIO))
- req->cmd_flags |= REQ_RW_SYNC;
- if (bio_rw_flagged(bio, BIO_RW_META))
- req->cmd_flags |= REQ_RW_META;
- if (bio_rw_flagged(bio, BIO_RW_NOIDLE))
- req->cmd_flags |= REQ_NOIDLE;
req->errors = 0;
req->__sector = bio->bi_sector;
@@ -1147,7 +1189,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
*/
static inline bool queue_should_plug(struct request_queue *q)
{
- return !(blk_queue_nonrot(q) && blk_queue_queuing(q));
+ return !(blk_queue_nonrot(q) && blk_queue_tagged(q));
}
static int __make_request(struct request_queue *q, struct bio *bio)
@@ -1156,12 +1198,12 @@ static int __make_request(struct request_queue *q, struct bio *bio)
int el_ret;
unsigned int bytes = bio->bi_size;
const unsigned short prio = bio_prio(bio);
- const bool sync = bio_rw_flagged(bio, BIO_RW_SYNCIO);
- const bool unplug = bio_rw_flagged(bio, BIO_RW_UNPLUG);
- const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK;
+ const bool sync = !!(bio->bi_rw & REQ_SYNC);
+ const bool unplug = !!(bio->bi_rw & REQ_UNPLUG);
+ const unsigned long ff = bio->bi_rw & REQ_FAILFAST_MASK;
int rw_flags;
- if (bio_rw_flagged(bio, BIO_RW_BARRIER) &&
+ if ((bio->bi_rw & REQ_HARDBARRIER) &&
(q->next_ordered == QUEUE_ORDERED_NONE)) {
bio_endio(bio, -EOPNOTSUPP);
return 0;
@@ -1175,7 +1217,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
spin_lock_irq(q->queue_lock);
- if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER)) || elv_queue_empty(q))
+ if (unlikely((bio->bi_rw & REQ_HARDBARRIER)) || elv_queue_empty(q))
goto get_rq;
el_ret = elv_merge(q, &req, bio);
@@ -1198,6 +1240,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
if (!blk_rq_cpu_valid(req))
req->cpu = bio->bi_comp_cpu;
drive_stat_acct(req, 0);
+ elv_bio_merged(q, req, bio);
if (!attempt_back_merge(q, req))
elv_merged_request(q, req, el_ret);
goto out;
@@ -1231,6 +1274,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
if (!blk_rq_cpu_valid(req))
req->cpu = bio->bi_comp_cpu;
drive_stat_acct(req, 0);
+ elv_bio_merged(q, req, bio);
if (!attempt_front_merge(q, req))
elv_merged_request(q, req, el_ret);
goto out;
@@ -1248,7 +1292,7 @@ get_rq:
*/
rw_flags = bio_data_dir(bio);
if (sync)
- rw_flags |= REQ_RW_SYNC;
+ rw_flags |= REQ_SYNC;
/*
* Grab a free request. This is might sleep but can not fail.
@@ -1437,7 +1481,7 @@ static inline void __generic_make_request(struct bio *bio)
goto end_io;
}
- if (unlikely(!bio_rw_flagged(bio, BIO_RW_DISCARD) &&
+ if (unlikely(!(bio->bi_rw & REQ_DISCARD) &&
nr_sectors > queue_max_hw_sectors(q))) {
printk(KERN_ERR "bio too big device %s (%u > %u)\n",
bdevname(bio->bi_bdev, b),
@@ -1470,8 +1514,10 @@ static inline void __generic_make_request(struct bio *bio)
if (bio_check_eod(bio, nr_sectors))
goto end_io;
- if (bio_rw_flagged(bio, BIO_RW_DISCARD) &&
- !blk_queue_discard(q)) {
+ if ((bio->bi_rw & REQ_DISCARD) &&
+ (!blk_queue_discard(q) ||
+ ((bio->bi_rw & REQ_SECURE) &&
+ !blk_queue_secdiscard(q)))) {
err = -EOPNOTSUPP;
goto end_io;
}
@@ -1490,9 +1536,9 @@ end_io:
/*
* We only want one ->make_request_fn to be active at a time,
* else stack usage with stacked devices could be a problem.
- * So use current->bio_{list,tail} to keep a list of requests
+ * So use current->bio_list to keep a list of requests
* submited by a make_request_fn function.
- * current->bio_tail is also used as a flag to say if
+ * current->bio_list is also used as a flag to say if
* generic_make_request is currently active in this task or not.
* If it is NULL, then no make_request is active. If it is non-NULL,
* then a make_request is active, and new requests should be added
@@ -1500,11 +1546,11 @@ end_io:
*/
void generic_make_request(struct bio *bio)
{
- if (current->bio_tail) {
+ struct bio_list bio_list_on_stack;
+
+ if (current->bio_list) {
/* make_request is active */
- *(current->bio_tail) = bio;
- bio->bi_next = NULL;
- current->bio_tail = &bio->bi_next;
+ bio_list_add(current->bio_list, bio);
return;
}
/* following loop may be a bit non-obvious, and so deserves some
@@ -1512,30 +1558,27 @@ void generic_make_request(struct bio *bio)
* Before entering the loop, bio->bi_next is NULL (as all callers
* ensure that) so we have a list with a single bio.
* We pretend that we have just taken it off a longer list, so
- * we assign bio_list to the next (which is NULL) and bio_tail
- * to &bio_list, thus initialising the bio_list of new bios to be
+ * we assign bio_list to a pointer to the bio_list_on_stack,
+ * thus initialising the bio_list of new bios to be
* added. __generic_make_request may indeed add some more bios
* through a recursive call to generic_make_request. If it
* did, we find a non-NULL value in bio_list and re-enter the loop
* from the top. In this case we really did just take the bio
- * of the top of the list (no pretending) and so fixup bio_list and
- * bio_tail or bi_next, and call into __generic_make_request again.
+ * of the top of the list (no pretending) and so remove it from
+ * bio_list, and call into __generic_make_request again.
*
* The loop was structured like this to make only one call to
* __generic_make_request (which is important as it is large and
* inlined) and to keep the structure simple.
*/
BUG_ON(bio->bi_next);
+ bio_list_init(&bio_list_on_stack);
+ current->bio_list = &bio_list_on_stack;
do {
- current->bio_list = bio->bi_next;
- if (bio->bi_next == NULL)
- current->bio_tail = &current->bio_list;
- else
- bio->bi_next = NULL;
__generic_make_request(bio);
- bio = current->bio_list;
+ bio = bio_list_pop(current->bio_list);
} while (bio);
- current->bio_tail = NULL; /* deactivate */
+ current->bio_list = NULL; /* deactivate */
}
EXPORT_SYMBOL(generic_make_request);
@@ -1559,7 +1602,7 @@ void submit_bio(int rw, struct bio *bio)
* If it's a regular read/write or a barrier with data attached,
* go through the normal accounting stuff before submission.
*/
- if (bio_has_data(bio)) {
+ if (bio_has_data(bio) && !(rw & REQ_DISCARD)) {
if (rw & WRITE) {
count_vm_events(PGPGOUT, count);
} else {
@@ -1604,6 +1647,9 @@ EXPORT_SYMBOL(submit_bio);
*/
int blk_rq_check_limits(struct request_queue *q, struct request *rq)
{
+ if (rq->cmd_flags & REQ_DISCARD)
+ return 0;
+
if (blk_rq_sectors(rq) > queue_max_sectors(q) ||
blk_rq_bytes(rq) > queue_max_hw_sectors(q) << 9) {
printk(KERN_ERR "%s: over max size limit.\n", __func__);
@@ -1617,8 +1663,7 @@ int blk_rq_check_limits(struct request_queue *q, struct request *rq)
* limitation.
*/
blk_recalc_rq_segments(rq);
- if (rq->nr_phys_segments > queue_max_phys_segments(q) ||
- rq->nr_phys_segments > queue_max_hw_segments(q)) {
+ if (rq->nr_phys_segments > queue_max_segments(q)) {
printk(KERN_ERR "%s: over max segments limit.\n", __func__);
return -EIO;
}
@@ -1773,7 +1818,7 @@ struct request *blk_peek_request(struct request_queue *q)
* sees this request (possibly after
* requeueing). Notify IO scheduler.
*/
- if (blk_sorted_rq(rq))
+ if (rq->cmd_flags & REQ_SORTED)
elv_activate_rq(q, rq);
/*
@@ -1861,12 +1906,7 @@ void blk_dequeue_request(struct request *rq)
*/
if (blk_account_rq(rq)) {
q->in_flight[rq_is_sync(rq)]++;
- /*
- * Mark this device as supporting hardware queuing, if
- * we have more IOs in flight than 4.
- */
- if (!blk_queue_queuing(q) && queue_in_flight(q) > 4)
- set_bit(QUEUE_FLAG_CQ, &q->queue_flags);
+ set_io_start_time_ns(rq);
}
}
@@ -1966,10 +2006,11 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
* TODO: tj: This is too subtle. It would be better to let
* low level drivers do what they see fit.
*/
- if (blk_fs_request(req))
+ if (req->cmd_type == REQ_TYPE_FS)
req->errors = 0;
- if (error && (blk_fs_request(req) && !(req->cmd_flags & REQ_QUIET))) {
+ if (error && req->cmd_type == REQ_TYPE_FS &&
+ !(req->cmd_flags & REQ_QUIET)) {
printk(KERN_ERR "end_request: I/O error, dev %s, sector %llu\n",
req->rq_disk ? req->rq_disk->disk_name : "?",
(unsigned long long)blk_rq_pos(req));
@@ -2056,7 +2097,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
req->buffer = bio_data(req->bio);
/* update sector only for requests with clear definition of sector */
- if (blk_fs_request(req) || blk_discard_rq(req))
+ if (req->cmd_type == REQ_TYPE_FS || (req->cmd_flags & REQ_DISCARD))
req->__sector += total_bytes >> 9;
/* mixed attributes always follow the first bio */
@@ -2093,11 +2134,32 @@ static bool blk_update_bidi_request(struct request *rq, int error,
blk_update_request(rq->next_rq, error, bidi_bytes))
return true;
- add_disk_randomness(rq->rq_disk);
+ if (blk_queue_add_random(rq->q))
+ add_disk_randomness(rq->rq_disk);
return false;
}
+/**
+ * blk_unprep_request - unprepare a request
+ * @req: the request
+ *
+ * This function makes a request ready for complete resubmission (or
+ * completion). It happens only after all error handling is complete,
+ * so represents the appropriate moment to deallocate any resources
+ * that were allocated to the request in the prep_rq_fn. The queue
+ * lock is held when calling this.
+ */
+void blk_unprep_request(struct request *req)
+{
+ struct request_queue *q = req->q;
+
+ req->cmd_flags &= ~REQ_DONTPREP;
+ if (q->unprep_rq_fn)
+ q->unprep_rq_fn(q, req);
+}
+EXPORT_SYMBOL_GPL(blk_unprep_request);
+
/*
* queue lock must be held
*/
@@ -2108,11 +2170,15 @@ static void blk_finish_request(struct request *req, int error)
BUG_ON(blk_queued_rq(req));
- if (unlikely(laptop_mode) && blk_fs_request(req))
- laptop_io_completion();
+ if (unlikely(laptop_mode) && req->cmd_type == REQ_TYPE_FS)
+ laptop_io_completion(&req->q->backing_dev_info);
blk_delete_timer(req);
+ if (req->cmd_flags & REQ_DONTPREP)
+ blk_unprep_request(req);
+
+
blk_account_io_done(req);
if (req->end_io)
@@ -2345,7 +2411,7 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
struct bio *bio)
{
/* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw */
- rq->cmd_flags |= bio->bi_rw & REQ_RW;
+ rq->cmd_flags |= bio->bi_rw & REQ_WRITE;
if (bio_has_data(bio)) {
rq->nr_phys_segments = bio_phys_segments(q, bio);
@@ -2432,6 +2498,8 @@ static void __blk_rq_prep_clone(struct request *dst, struct request *src)
{
dst->cpu = src->cpu;
dst->cmd_flags = (rq_data_dir(src) | REQ_NOMERGE);
+ if (src->cmd_flags & REQ_DISCARD)
+ dst->cmd_flags |= REQ_DISCARD;
dst->cmd_type = src->cmd_type;
dst->__sector = blk_rq_pos(src);
dst->__data_len = blk_rq_bytes(src);
@@ -2528,4 +2596,3 @@ int __init blk_dev_init(void)
return 0;
}
-
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 49557e91f0da..e1672f14840e 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -57,7 +57,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
__elv_add_request(q, rq, where, 1);
__generic_unplug_device(q);
/* the queue is stopped so it won't be plugged+unplugged */
- if (blk_pm_resume_request(rq))
+ if (rq->cmd_type == REQ_TYPE_PM_RESUME)
q->request_fn(q);
spin_unlock_irq(q->queue_lock);
}
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 15c630813b1c..edce1ef7933d 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -24,6 +24,7 @@
#include <linux/mempool.h>
#include <linux/bio.h>
#include <linux/scatterlist.h>
+#include <linux/slab.h>
#include "blk.h"
@@ -278,7 +279,7 @@ static struct attribute *integrity_attrs[] = {
NULL,
};
-static struct sysfs_ops integrity_ops = {
+static const struct sysfs_ops integrity_ops = {
.show = &integrity_attr_show,
.store = &integrity_attr_store,
};
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 98e6bf61b0ac..d22c4c55c406 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -7,6 +7,7 @@
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/bootmem.h> /* for max_pfn/max_low_pfn */
+#include <linux/slab.h>
#include "blk.h"
@@ -91,7 +92,7 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
spin_lock_init(&ret->lock);
ret->ioprio_changed = 0;
ret->ioprio = 0;
- ret->last_waited = jiffies; /* doesn't matter... */
+ ret->last_waited = 0; /* doesn't matter... */
ret->nr_batch_requests = 0; /* because this is 0 */
INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH);
INIT_HLIST_HEAD(&ret->cic_list);
diff --git a/block/blk-lib.c b/block/blk-lib.c
new file mode 100644
index 000000000000..c392029a104e
--- /dev/null
+++ b/block/blk-lib.c
@@ -0,0 +1,231 @@
+/*
+ * Functions related to generic helpers functions
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/scatterlist.h>
+
+#include "blk.h"
+
+static void blkdev_discard_end_io(struct bio *bio, int err)
+{
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
+ clear_bit(BIO_UPTODATE, &bio->bi_flags);
+ }
+
+ if (bio->bi_private)
+ complete(bio->bi_private);
+
+ bio_put(bio);
+}
+
+/**
+ * blkdev_issue_discard - queue a discard
+ * @bdev: blockdev to issue discard for
+ * @sector: start sector
+ * @nr_sects: number of sectors to discard
+ * @gfp_mask: memory allocation flags (for bio_alloc)
+ * @flags: BLKDEV_IFL_* flags to control behaviour
+ *
+ * Description:
+ * Issue a discard request for the sectors in question.
+ */
+int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
+ sector_t nr_sects, gfp_t gfp_mask, unsigned long flags)
+{
+ DECLARE_COMPLETION_ONSTACK(wait);
+ struct request_queue *q = bdev_get_queue(bdev);
+ int type = flags & BLKDEV_IFL_BARRIER ?
+ DISCARD_BARRIER : DISCARD_NOBARRIER;
+ unsigned int max_discard_sectors;
+ struct bio *bio;
+ int ret = 0;
+
+ if (!q)
+ return -ENXIO;
+
+ if (!blk_queue_discard(q))
+ return -EOPNOTSUPP;
+
+ /*
+ * Ensure that max_discard_sectors is of the proper
+ * granularity
+ */
+ max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9);
+ if (q->limits.discard_granularity) {
+ unsigned int disc_sects = q->limits.discard_granularity >> 9;
+
+ max_discard_sectors &= ~(disc_sects - 1);
+ }
+
+ if (flags & BLKDEV_IFL_SECURE) {
+ if (!blk_queue_secdiscard(q))
+ return -EOPNOTSUPP;
+ type |= DISCARD_SECURE;
+ }
+
+ while (nr_sects && !ret) {
+ bio = bio_alloc(gfp_mask, 1);
+ if (!bio) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ bio->bi_sector = sector;
+ bio->bi_end_io = blkdev_discard_end_io;
+ bio->bi_bdev = bdev;
+ if (flags & BLKDEV_IFL_WAIT)
+ bio->bi_private = &wait;
+
+ if (nr_sects > max_discard_sectors) {
+ bio->bi_size = max_discard_sectors << 9;
+ nr_sects -= max_discard_sectors;
+ sector += max_discard_sectors;
+ } else {
+ bio->bi_size = nr_sects << 9;
+ nr_sects = 0;
+ }
+
+ bio_get(bio);
+ submit_bio(type, bio);
+
+ if (flags & BLKDEV_IFL_WAIT)
+ wait_for_completion(&wait);
+
+ if (bio_flagged(bio, BIO_EOPNOTSUPP))
+ ret = -EOPNOTSUPP;
+ else if (!bio_flagged(bio, BIO_UPTODATE))
+ ret = -EIO;
+ bio_put(bio);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(blkdev_issue_discard);
+
+struct bio_batch
+{
+ atomic_t done;
+ unsigned long flags;
+ struct completion *wait;
+ bio_end_io_t *end_io;
+};
+
+static void bio_batch_end_io(struct bio *bio, int err)
+{
+ struct bio_batch *bb = bio->bi_private;
+
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ set_bit(BIO_EOPNOTSUPP, &bb->flags);
+ else
+ clear_bit(BIO_UPTODATE, &bb->flags);
+ }
+ if (bb) {
+ if (bb->end_io)
+ bb->end_io(bio, err);
+ atomic_inc(&bb->done);
+ complete(bb->wait);
+ }
+ bio_put(bio);
+}
+
+/**
+ * blkdev_issue_zeroout generate number of zero filed write bios
+ * @bdev: blockdev to issue
+ * @sector: start sector
+ * @nr_sects: number of sectors to write
+ * @gfp_mask: memory allocation flags (for bio_alloc)
+ * @flags: BLKDEV_IFL_* flags to control behaviour
+ *
+ * Description:
+ * Generate and issue number of bios with zerofiled pages.
+ * Send barrier at the beginning and at the end if requested. This guarantie
+ * correct request ordering. Empty barrier allow us to avoid post queue flush.
+ */
+
+int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
+ sector_t nr_sects, gfp_t gfp_mask, unsigned long flags)
+{
+ int ret;
+ struct bio *bio;
+ struct bio_batch bb;
+ unsigned int sz, issued = 0;
+ DECLARE_COMPLETION_ONSTACK(wait);
+
+ atomic_set(&bb.done, 0);
+ bb.flags = 1 << BIO_UPTODATE;
+ bb.wait = &wait;
+ bb.end_io = NULL;
+
+ if (flags & BLKDEV_IFL_BARRIER) {
+ /* issue async barrier before the data */
+ ret = blkdev_issue_flush(bdev, gfp_mask, NULL, 0);
+ if (ret)
+ return ret;
+ }
+submit:
+ ret = 0;
+ while (nr_sects != 0) {
+ bio = bio_alloc(gfp_mask,
+ min(nr_sects, (sector_t)BIO_MAX_PAGES));
+ if (!bio) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ bio->bi_sector = sector;
+ bio->bi_bdev = bdev;
+ bio->bi_end_io = bio_batch_end_io;
+ if (flags & BLKDEV_IFL_WAIT)
+ bio->bi_private = &bb;
+
+ while (nr_sects != 0) {
+ sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects);
+ if (sz == 0)
+ /* bio has maximum size possible */
+ break;
+ ret = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0);
+ nr_sects -= ret >> 9;
+ sector += ret >> 9;
+ if (ret < (sz << 9))
+ break;
+ }
+ ret = 0;
+ issued++;
+ submit_bio(WRITE, bio);
+ }
+ /*
+ * When all data bios are in flight. Send final barrier if requeted.
+ */
+ if (nr_sects == 0 && flags & BLKDEV_IFL_BARRIER)
+ ret = blkdev_issue_flush(bdev, gfp_mask, NULL,
+ flags & BLKDEV_IFL_WAIT);
+
+
+ if (flags & BLKDEV_IFL_WAIT)
+ /* Wait for bios in-flight */
+ while ( issued != atomic_read(&bb.done))
+ wait_for_completion(&wait);
+
+ if (!test_bit(BIO_UPTODATE, &bb.flags))
+ /* One of bios in the batch was completed with error.*/
+ ret = -EIO;
+
+ if (ret)
+ goto out;
+
+ if (test_bit(BIO_EOPNOTSUPP, &bb.flags)) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+ if (nr_sects != 0)
+ goto submit;
+out:
+ return ret;
+}
+EXPORT_SYMBOL(blkdev_issue_zeroout);
diff --git a/block/blk-map.c b/block/blk-map.c
index 9083cf0180cc..ade0a08c9099 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -307,7 +307,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
return PTR_ERR(bio);
if (rq_data_dir(rq) == WRITE)
- bio->bi_rw |= (1 << BIO_RW);
+ bio->bi_rw |= REQ_WRITE;
if (do_copy)
rq->cmd_flags |= REQ_COPY_USER;
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 99cb5cf1f447..eafc94f68d79 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -12,7 +12,6 @@
static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
struct bio *bio)
{
- unsigned int phys_size;
struct bio_vec *bv, *bvprv = NULL;
int cluster, i, high, highprv = 1;
unsigned int seg_size, nr_phys_segs;
@@ -24,7 +23,7 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
fbio = bio;
cluster = test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags);
seg_size = 0;
- phys_size = nr_phys_segs = 0;
+ nr_phys_segs = 0;
for_each_bio(bio) {
bio_for_each_segment(bv, bio, i) {
/*
@@ -180,7 +179,7 @@ new_segment:
}
if (q->dma_drain_size && q->dma_drain_needed(rq)) {
- if (rq->cmd_flags & REQ_RW)
+ if (rq->cmd_flags & REQ_WRITE)
memset(q->dma_drain_buffer, 0, q->dma_drain_size);
sg->page_link &= ~0x02;
@@ -206,8 +205,7 @@ static inline int ll_new_hw_segment(struct request_queue *q,
{
int nr_phys_segs = bio_phys_segments(q, bio);
- if (req->nr_phys_segments + nr_phys_segs > queue_max_hw_segments(q) ||
- req->nr_phys_segments + nr_phys_segs > queue_max_phys_segments(q)) {
+ if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q)) {
req->cmd_flags |= REQ_NOMERGE;
if (req == q->last_merge)
q->last_merge = NULL;
@@ -227,7 +225,7 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req,
{
unsigned short max_sectors;
- if (unlikely(blk_pc_request(req)))
+ if (unlikely(req->cmd_type == REQ_TYPE_BLOCK_PC))
max_sectors = queue_max_hw_sectors(q);
else
max_sectors = queue_max_sectors(q);
@@ -251,7 +249,7 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
{
unsigned short max_sectors;
- if (unlikely(blk_pc_request(req)))
+ if (unlikely(req->cmd_type == REQ_TYPE_BLOCK_PC))
max_sectors = queue_max_hw_sectors(q);
else
max_sectors = queue_max_sectors(q);
@@ -300,10 +298,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
total_phys_segments--;
}
- if (total_phys_segments > queue_max_phys_segments(q))
- return 0;
-
- if (total_phys_segments > queue_max_hw_segments(q))
+ if (total_phys_segments > queue_max_segments(q))
return 0;
/* Merge is OK... */
@@ -367,6 +362,18 @@ static int attempt_merge(struct request_queue *q, struct request *req,
return 0;
/*
+ * Don't merge file system requests and discard requests
+ */
+ if ((req->cmd_flags & REQ_DISCARD) != (next->cmd_flags & REQ_DISCARD))
+ return 0;
+
+ /*
+ * Don't merge discard requests and secure discard requests
+ */
+ if ((req->cmd_flags & REQ_SECURE) != (next->cmd_flags & REQ_SECURE))
+ return 0;
+
+ /*
* not contiguous
*/
if (blk_rq_pos(req) + blk_rq_sectors(req) != blk_rq_pos(next))
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 5eeb9e0d256e..a234f4bf1d6f 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -8,7 +8,9 @@
#include <linux/blkdev.h>
#include <linux/bootmem.h> /* for max_pfn/max_low_pfn */
#include <linux/gcd.h>
+#include <linux/lcm.h>
#include <linux/jiffies.h>
+#include <linux/gfp.h>
#include "blk.h"
@@ -35,6 +37,23 @@ void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn)
EXPORT_SYMBOL(blk_queue_prep_rq);
/**
+ * blk_queue_unprep_rq - set an unprepare_request function for queue
+ * @q: queue
+ * @ufn: unprepare_request function
+ *
+ * It's possible for a queue to register an unprepare_request callback
+ * which is invoked before the request is finally completed. The goal
+ * of the function is to deallocate any data that was allocated in the
+ * prepare_request callback.
+ *
+ */
+void blk_queue_unprep_rq(struct request_queue *q, unprep_rq_fn *ufn)
+{
+ q->unprep_rq_fn = ufn;
+}
+EXPORT_SYMBOL(blk_queue_unprep_rq);
+
+/**
* blk_queue_merge_bvec - set a merge_bvec function for queue
* @q: queue
* @mbfn: merge_bvec_fn
@@ -91,10 +110,9 @@ EXPORT_SYMBOL_GPL(blk_queue_lld_busy);
*/
void blk_set_default_limits(struct queue_limits *lim)
{
- lim->max_phys_segments = MAX_PHYS_SEGMENTS;
- lim->max_hw_segments = MAX_HW_SEGMENTS;
+ lim->max_segments = BLK_MAX_SEGMENTS;
lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
- lim->max_segment_size = MAX_SEGMENT_SIZE;
+ lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
lim->max_sectors = BLK_DEF_MAX_SECTORS;
lim->max_hw_sectors = INT_MAX;
lim->max_discard_sectors = 0;
@@ -154,7 +172,7 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
q->unplug_timer.data = (unsigned long)q;
blk_set_default_limits(&q->limits);
- blk_queue_max_sectors(q, SAFE_MAX_SECTORS);
+ blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS);
/*
* If the caller didn't supply a lock, fall back to our embedded
@@ -210,37 +228,32 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask)
EXPORT_SYMBOL(blk_queue_bounce_limit);
/**
- * blk_queue_max_sectors - set max sectors for a request for this queue
+ * blk_queue_max_hw_sectors - set max sectors for a request for this queue
* @q: the request queue for the device
- * @max_sectors: max sectors in the usual 512b unit
+ * @max_hw_sectors: max hardware sectors in the usual 512b unit
*
* Description:
- * Enables a low level driver to set an upper limit on the size of
- * received requests.
+ * Enables a low level driver to set a hard upper limit,
+ * max_hw_sectors, on the size of requests. max_hw_sectors is set by
+ * the device driver based upon the combined capabilities of I/O
+ * controller and storage device.
+ *
+ * max_sectors is a soft limit imposed by the block layer for
+ * filesystem type requests. This value can be overridden on a
+ * per-device basis in /sys/block/<device>/queue/max_sectors_kb.
+ * The soft limit can not exceed max_hw_sectors.
**/
-void blk_queue_max_sectors(struct request_queue *q, unsigned int max_sectors)
+void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors)
{
- if ((max_sectors << 9) < PAGE_CACHE_SIZE) {
- max_sectors = 1 << (PAGE_CACHE_SHIFT - 9);
+ if ((max_hw_sectors << 9) < PAGE_CACHE_SIZE) {
+ max_hw_sectors = 1 << (PAGE_CACHE_SHIFT - 9);
printk(KERN_INFO "%s: set to minimum %d\n",
- __func__, max_sectors);
+ __func__, max_hw_sectors);
}
- if (BLK_DEF_MAX_SECTORS > max_sectors)
- q->limits.max_hw_sectors = q->limits.max_sectors = max_sectors;
- else {
- q->limits.max_sectors = BLK_DEF_MAX_SECTORS;
- q->limits.max_hw_sectors = max_sectors;
- }
-}
-EXPORT_SYMBOL(blk_queue_max_sectors);
-
-void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_sectors)
-{
- if (BLK_DEF_MAX_SECTORS > max_sectors)
- q->limits.max_hw_sectors = BLK_DEF_MAX_SECTORS;
- else
- q->limits.max_hw_sectors = max_sectors;
+ q->limits.max_hw_sectors = max_hw_sectors;
+ q->limits.max_sectors = min_t(unsigned int, max_hw_sectors,
+ BLK_DEF_MAX_SECTORS);
}
EXPORT_SYMBOL(blk_queue_max_hw_sectors);
@@ -257,17 +270,15 @@ void blk_queue_max_discard_sectors(struct request_queue *q,
EXPORT_SYMBOL(blk_queue_max_discard_sectors);
/**
- * blk_queue_max_phys_segments - set max phys segments for a request for this queue
+ * blk_queue_max_segments - set max hw segments for a request for this queue
* @q: the request queue for the device
* @max_segments: max number of segments
*
* Description:
* Enables a low level driver to set an upper limit on the number of
- * physical data segments in a request. This would be the largest sized
- * scatter list the driver could handle.
+ * hw data segments in a request.
**/
-void blk_queue_max_phys_segments(struct request_queue *q,
- unsigned short max_segments)
+void blk_queue_max_segments(struct request_queue *q, unsigned short max_segments)
{
if (!max_segments) {
max_segments = 1;
@@ -275,33 +286,9 @@ void blk_queue_max_phys_segments(struct request_queue *q,
__func__, max_segments);
}
- q->limits.max_phys_segments = max_segments;
+ q->limits.max_segments = max_segments;
}
-EXPORT_SYMBOL(blk_queue_max_phys_segments);
-
-/**
- * blk_queue_max_hw_segments - set max hw segments for a request for this queue
- * @q: the request queue for the device
- * @max_segments: max number of segments
- *
- * Description:
- * Enables a low level driver to set an upper limit on the number of
- * hw data segments in a request. This would be the largest number of
- * address/length pairs the host adapter can actually give at once
- * to the device.
- **/
-void blk_queue_max_hw_segments(struct request_queue *q,
- unsigned short max_segments)
-{
- if (!max_segments) {
- max_segments = 1;
- printk(KERN_INFO "%s: set to minimum %d\n",
- __func__, max_segments);
- }
-
- q->limits.max_hw_segments = max_segments;
-}
-EXPORT_SYMBOL(blk_queue_max_hw_segments);
+EXPORT_SYMBOL(blk_queue_max_segments);
/**
* blk_queue_max_segment_size - set max segment size for blk_rq_map_sg
@@ -493,21 +480,11 @@ void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b)
}
EXPORT_SYMBOL(blk_queue_stack_limits);
-static unsigned int lcm(unsigned int a, unsigned int b)
-{
- if (a && b)
- return (a * b) / gcd(a, b);
- else if (b)
- return b;
-
- return a;
-}
-
/**
* blk_stack_limits - adjust queue_limits for stacked devices
* @t: the stacking driver limits (top device)
* @b: the underlying queue limits (bottom, component device)
- * @offset: offset to beginning of data within component device
+ * @start: first data sector within component device
*
* Description:
* This function is used by stacking drivers like MD and DM to ensure
@@ -525,10 +502,9 @@ static unsigned int lcm(unsigned int a, unsigned int b)
* the alignment_offset is undefined.
*/
int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
- sector_t offset)
+ sector_t start)
{
- sector_t alignment;
- unsigned int top, bottom, ret = 0;
+ unsigned int top, bottom, alignment, ret = 0;
t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors);
t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
@@ -537,18 +513,14 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
b->seg_boundary_mask);
- t->max_phys_segments = min_not_zero(t->max_phys_segments,
- b->max_phys_segments);
-
- t->max_hw_segments = min_not_zero(t->max_hw_segments,
- b->max_hw_segments);
+ t->max_segments = min_not_zero(t->max_segments, b->max_segments);
t->max_segment_size = min_not_zero(t->max_segment_size,
b->max_segment_size);
t->misaligned |= b->misaligned;
- alignment = queue_limit_alignment_offset(b, offset);
+ alignment = queue_limit_alignment_offset(b, start);
/* Bottom device has different alignment. Check that it is
* compatible with the current top alignment.
@@ -611,11 +583,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
/* Discard alignment and granularity */
if (b->discard_granularity) {
- unsigned int granularity = b->discard_granularity;
- offset &= granularity - 1;
-
- alignment = (granularity + b->discard_alignment - offset)
- & (granularity - 1);
+ alignment = queue_limit_discard_alignment(b, start);
if (t->discard_granularity != 0 &&
t->discard_alignment != alignment) {
@@ -657,7 +625,7 @@ int bdev_stack_limits(struct queue_limits *t, struct block_device *bdev,
start += get_start_sect(bdev);
- return blk_stack_limits(t, &bq->limits, start << 9);
+ return blk_stack_limits(t, &bq->limits, start);
}
EXPORT_SYMBOL(bdev_stack_limits);
@@ -668,9 +636,8 @@ EXPORT_SYMBOL(bdev_stack_limits);
* @offset: offset to beginning of data within component device
*
* Description:
- * Merges the limits for two queues. Returns 0 if alignment
- * didn't change. Returns -1 if adding the bottom device caused
- * misalignment.
+ * Merges the limits for a top level gendisk and a bottom level
+ * block_device.
*/
void disk_stack_limits(struct gendisk *disk, struct block_device *bdev,
sector_t offset)
@@ -678,9 +645,7 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev,
struct request_queue *t = disk->queue;
struct request_queue *b = bdev_get_queue(bdev);
- offset += get_start_sect(bdev) << 9;
-
- if (blk_stack_limits(&t->limits, &b->limits, offset) < 0) {
+ if (bdev_stack_limits(&t->limits, bdev, offset >> 9) < 0) {
char top[BDEVNAME_SIZE], bottom[BDEVNAME_SIZE];
disk_name(disk, 0, top);
@@ -752,22 +717,19 @@ EXPORT_SYMBOL(blk_queue_update_dma_pad);
* does is adjust the queue so that the buf is always appended
* silently to the scatterlist.
*
- * Note: This routine adjusts max_hw_segments to make room for
- * appending the drain buffer. If you call
- * blk_queue_max_hw_segments() or blk_queue_max_phys_segments() after
- * calling this routine, you must set the limit to one fewer than your
- * device can support otherwise there won't be room for the drain
- * buffer.
+ * Note: This routine adjusts max_hw_segments to make room for appending
+ * the drain buffer. If you call blk_queue_max_segments() after calling
+ * this routine, you must set the limit to one fewer than your device
+ * can support otherwise there won't be room for the drain buffer.
*/
int blk_queue_dma_drain(struct request_queue *q,
dma_drain_needed_fn *dma_drain_needed,
void *buf, unsigned int size)
{
- if (queue_max_hw_segments(q) < 2 || queue_max_phys_segments(q) < 2)
+ if (queue_max_segments(q) < 2)
return -EINVAL;
/* make room for appending the drain */
- blk_queue_max_hw_segments(q, queue_max_hw_segments(q) - 1);
- blk_queue_max_phys_segments(q, queue_max_phys_segments(q) - 1);
+ blk_queue_max_segments(q, queue_max_segments(q) - 1);
q->dma_drain_needed = dma_drain_needed;
q->dma_drain_buffer = buf;
q->dma_drain_size = size;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 8606c9543fdd..0749b89c6885 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -2,6 +2,7 @@
* Functions related to sysfs handling
*/
#include <linux/kernel.h>
+#include <linux/slab.h>
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
@@ -106,6 +107,19 @@ static ssize_t queue_max_sectors_show(struct request_queue *q, char *page)
return queue_var_show(max_sectors_kb, (page));
}
+static ssize_t queue_max_segments_show(struct request_queue *q, char *page)
+{
+ return queue_var_show(queue_max_segments(q), (page));
+}
+
+static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page)
+{
+ if (test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags))
+ return queue_var_show(queue_max_segment_size(q), (page));
+
+ return queue_var_show(PAGE_CACHE_SIZE, (page));
+}
+
static ssize_t queue_logical_block_size_show(struct request_queue *q, char *page)
{
return queue_var_show(queue_logical_block_size(q), page);
@@ -166,30 +180,41 @@ static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page)
return queue_var_show(max_hw_sectors_kb, (page));
}
-static ssize_t queue_nonrot_show(struct request_queue *q, char *page)
-{
- return queue_var_show(!blk_queue_nonrot(q), page);
+#define QUEUE_SYSFS_BIT_FNS(name, flag, neg) \
+static ssize_t \
+queue_show_##name(struct request_queue *q, char *page) \
+{ \
+ int bit; \
+ bit = test_bit(QUEUE_FLAG_##flag, &q->queue_flags); \
+ return queue_var_show(neg ? !bit : bit, page); \
+} \
+static ssize_t \
+queue_store_##name(struct request_queue *q, const char *page, size_t count) \
+{ \
+ unsigned long val; \
+ ssize_t ret; \
+ ret = queue_var_store(&val, page, count); \
+ if (neg) \
+ val = !val; \
+ \
+ spin_lock_irq(q->queue_lock); \
+ if (val) \
+ queue_flag_set(QUEUE_FLAG_##flag, q); \
+ else \
+ queue_flag_clear(QUEUE_FLAG_##flag, q); \
+ spin_unlock_irq(q->queue_lock); \
+ return ret; \
}
-static ssize_t queue_nonrot_store(struct request_queue *q, const char *page,
- size_t count)
-{
- unsigned long nm;
- ssize_t ret = queue_var_store(&nm, page, count);
-
- spin_lock_irq(q->queue_lock);
- if (nm)
- queue_flag_clear(QUEUE_FLAG_NONROT, q);
- else
- queue_flag_set(QUEUE_FLAG_NONROT, q);
- spin_unlock_irq(q->queue_lock);
-
- return ret;
-}
+QUEUE_SYSFS_BIT_FNS(nonrot, NONROT, 1);
+QUEUE_SYSFS_BIT_FNS(random, ADD_RANDOM, 0);
+QUEUE_SYSFS_BIT_FNS(iostats, IO_STAT, 0);
+#undef QUEUE_SYSFS_BIT_FNS
static ssize_t queue_nomerges_show(struct request_queue *q, char *page)
{
- return queue_var_show(blk_queue_nomerges(q), page);
+ return queue_var_show((blk_queue_nomerges(q) << 1) |
+ blk_queue_noxmerges(q), page);
}
static ssize_t queue_nomerges_store(struct request_queue *q, const char *page,
@@ -199,10 +224,12 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page,
ssize_t ret = queue_var_store(&nm, page, count);
spin_lock_irq(q->queue_lock);
- if (nm)
+ queue_flag_clear(QUEUE_FLAG_NOMERGES, q);
+ queue_flag_clear(QUEUE_FLAG_NOXMERGES, q);
+ if (nm == 2)
queue_flag_set(QUEUE_FLAG_NOMERGES, q);
- else
- queue_flag_clear(QUEUE_FLAG_NOMERGES, q);
+ else if (nm)
+ queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
spin_unlock_irq(q->queue_lock);
return ret;
@@ -233,27 +260,6 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
return ret;
}
-static ssize_t queue_iostats_show(struct request_queue *q, char *page)
-{
- return queue_var_show(blk_queue_io_stat(q), page);
-}
-
-static ssize_t queue_iostats_store(struct request_queue *q, const char *page,
- size_t count)
-{
- unsigned long stats;
- ssize_t ret = queue_var_store(&stats, page, count);
-
- spin_lock_irq(q->queue_lock);
- if (stats)
- queue_flag_set(QUEUE_FLAG_IO_STAT, q);
- else
- queue_flag_clear(QUEUE_FLAG_IO_STAT, q);
- spin_unlock_irq(q->queue_lock);
-
- return ret;
-}
-
static struct queue_sysfs_entry queue_requests_entry = {
.attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
.show = queue_requests_show,
@@ -277,6 +283,16 @@ static struct queue_sysfs_entry queue_max_hw_sectors_entry = {
.show = queue_max_hw_sectors_show,
};
+static struct queue_sysfs_entry queue_max_segments_entry = {
+ .attr = {.name = "max_segments", .mode = S_IRUGO },
+ .show = queue_max_segments_show,
+};
+
+static struct queue_sysfs_entry queue_max_segment_size_entry = {
+ .attr = {.name = "max_segment_size", .mode = S_IRUGO },
+ .show = queue_max_segment_size_show,
+};
+
static struct queue_sysfs_entry queue_iosched_entry = {
.attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR },
.show = elv_iosched_show,
@@ -325,8 +341,8 @@ static struct queue_sysfs_entry queue_discard_zeroes_data_entry = {
static struct queue_sysfs_entry queue_nonrot_entry = {
.attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR },
- .show = queue_nonrot_show,
- .store = queue_nonrot_store,
+ .show = queue_show_nonrot,
+ .store = queue_store_nonrot,
};
static struct queue_sysfs_entry queue_nomerges_entry = {
@@ -343,8 +359,14 @@ static struct queue_sysfs_entry queue_rq_affinity_entry = {
static struct queue_sysfs_entry queue_iostats_entry = {
.attr = {.name = "iostats", .mode = S_IRUGO | S_IWUSR },
- .show = queue_iostats_show,
- .store = queue_iostats_store,
+ .show = queue_show_iostats,
+ .store = queue_store_iostats,
+};
+
+static struct queue_sysfs_entry queue_random_entry = {
+ .attr = {.name = "add_random", .mode = S_IRUGO | S_IWUSR },
+ .show = queue_show_random,
+ .store = queue_store_random,
};
static struct attribute *default_attrs[] = {
@@ -352,6 +374,8 @@ static struct attribute *default_attrs[] = {
&queue_ra_entry.attr,
&queue_max_hw_sectors_entry.attr,
&queue_max_sectors_entry.attr,
+ &queue_max_segments_entry.attr,
+ &queue_max_segment_size_entry.attr,
&queue_iosched_entry.attr,
&queue_hw_sector_size_entry.attr,
&queue_logical_block_size_entry.attr,
@@ -365,6 +389,7 @@ static struct attribute *default_attrs[] = {
&queue_nomerges_entry.attr,
&queue_rq_affinity_entry.attr,
&queue_iostats_entry.attr,
+ &queue_random_entry.attr,
NULL,
};
@@ -447,7 +472,7 @@ static void blk_release_queue(struct kobject *kobj)
kmem_cache_free(blk_requestq_cachep, q);
}
-static struct sysfs_ops queue_sysfs_ops = {
+static const struct sysfs_ops queue_sysfs_ops = {
.show = queue_attr_show,
.store = queue_attr_store,
};
@@ -486,6 +511,7 @@ int blk_register_queue(struct gendisk *disk)
kobject_uevent(&q->kobj, KOBJ_REMOVE);
kobject_del(&q->kobj);
blk_trace_remove_sysfs(disk_to_dev(disk));
+ kobject_put(&dev->kobj);
return ret;
}
diff --git a/block/blk-tag.c b/block/blk-tag.c
index 6b0f52c20964..ece65fc4c79b 100644
--- a/block/blk-tag.c
+++ b/block/blk-tag.c
@@ -5,6 +5,7 @@
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
+#include <linux/slab.h>
#include "blk.h"
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 1ba7e0aca878..4f0c06c7a338 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -109,6 +109,7 @@ void blk_rq_timed_out_timer(unsigned long data)
struct request_queue *q = (struct request_queue *) data;
unsigned long flags, next = 0;
struct request *rq, *tmp;
+ int next_set = 0;
spin_lock_irqsave(q->queue_lock, flags);
@@ -122,16 +123,13 @@ void blk_rq_timed_out_timer(unsigned long data)
if (blk_mark_rq_complete(rq))
continue;
blk_rq_timed_out(rq);
- } else if (!next || time_after(next, rq->deadline))
+ } else if (!next_set || time_after(next, rq->deadline)) {
next = rq->deadline;
+ next_set = 1;
+ }
}
- /*
- * next can never be 0 here with the list non-empty, since we always
- * bump ->deadline to 1 so we can detect if the timer was ever added
- * or not. See comment in blk_add_timer()
- */
- if (next)
+ if (next_set)
mod_timer(&q->timeout, round_jiffies_up(next));
spin_unlock_irqrestore(q->queue_lock, flags);
diff --git a/block/blk.h b/block/blk.h
index 5ee3d7e72feb..d6b911ac002c 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -142,14 +142,18 @@ static inline int queue_congestion_off_threshold(struct request_queue *q)
static inline int blk_cpu_to_group(int cpu)
{
+ int group = NR_CPUS;
#ifdef CONFIG_SCHED_MC
const struct cpumask *mask = cpu_coregroup_mask(cpu);
- return cpumask_first(mask);
+ group = cpumask_first(mask);
#elif defined(CONFIG_SCHED_SMT)
- return cpumask_first(topology_thread_cpumask(cpu));
+ group = cpumask_first(topology_thread_cpumask(cpu));
#else
return cpu;
#endif
+ if (likely(group < NR_CPUS))
+ return group;
+ return cpu;
}
/*
@@ -161,8 +165,10 @@ static inline int blk_cpu_to_group(int cpu)
*/
static inline int blk_do_io_stat(struct request *rq)
{
- return rq->rq_disk && blk_rq_io_stat(rq) &&
- (blk_fs_request(rq) || blk_discard_rq(rq));
+ return rq->rq_disk &&
+ (rq->cmd_flags & REQ_IO_STAT) &&
+ (rq->cmd_type == REQ_TYPE_FS ||
+ (rq->cmd_flags & REQ_DISCARD));
}
#endif
diff --git a/block/bsg.c b/block/bsg.c
index a9fd2d84b53a..0c00870553a3 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -21,6 +21,7 @@
#include <linux/idr.h>
#include <linux/bsg.h>
#include <linux/smp_lock.h>
+#include <linux/slab.h>
#include <scsi/scsi.h>
#include <scsi/scsi_ioctl.h>
@@ -260,7 +261,7 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm,
return ERR_PTR(ret);
/*
- * map scatter-gather elements seperately and string them to request
+ * map scatter-gather elements separately and string them to request
*/
rq = blk_get_request(q, rw, GFP_KERNEL);
if (!rq)
@@ -425,7 +426,7 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
/*
* fill in all the output members
*/
- hdr->device_status = status_byte(rq->errors);
+ hdr->device_status = rq->errors & 0xff;
hdr->transport_status = host_byte(rq->errors);
hdr->driver_status = driver_byte(rq->errors);
hdr->info = 0;
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index ee130f14d1fc..9eba291eb6fd 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -7,19 +7,20 @@
* Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
*/
#include <linux/module.h>
+#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/jiffies.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>
#include <linux/blktrace_api.h>
-#include "blk-cgroup.h"
+#include "cfq.h"
/*
* tunables
*/
/* max queue in one round of service */
-static const int cfq_quantum = 4;
+static const int cfq_quantum = 8;
static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
/* maximum backwards seek, in KiB */
static const int cfq_back_max = 16 * 1024;
@@ -29,6 +30,7 @@ static const int cfq_slice_sync = HZ / 10;
static int cfq_slice_async = HZ / 25;
static const int cfq_slice_async_rq = 2;
static int cfq_slice_idle = HZ / 125;
+static int cfq_group_idle = HZ / 125;
static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
static const int cfq_hist_divisor = 4;
@@ -42,19 +44,19 @@ static const int cfq_hist_divisor = 4;
*/
#define CFQ_MIN_TT (2)
-/*
- * Allow merged cfqqs to perform this amount of seeky I/O before
- * deciding to break the queues up again.
- */
-#define CFQQ_COOP_TOUT (HZ)
-
#define CFQ_SLICE_SCALE (5)
#define CFQ_HW_QUEUE_MIN (5)
#define CFQ_SERVICE_SHIFT 12
+#define CFQQ_SEEK_THR (sector_t)(8 * 100)
+#define CFQQ_CLOSE_THR (sector_t)(8 * 1024)
+#define CFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
+#define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8)
+
#define RQ_CIC(rq) \
((struct cfq_io_context *) (rq)->elevator_private)
#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2)
+#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private3)
static struct kmem_cache *cfq_pool;
static struct kmem_cache *cfq_ioc_pool;
@@ -63,6 +65,9 @@ static DEFINE_PER_CPU(unsigned long, cfq_ioc_count);
static struct completion *ioc_gone;
static DEFINE_SPINLOCK(ioc_gone_lock);
+static DEFINE_SPINLOCK(cic_index_lock);
+static DEFINE_IDA(cic_index_ida);
+
#define CFQ_PRIO_LISTS IOPRIO_BE_NR
#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
@@ -80,11 +85,12 @@ struct cfq_rb_root {
struct rb_root rb;
struct rb_node *left;
unsigned count;
+ unsigned total_weight;
u64 min_vdisktime;
struct rb_node *active;
- unsigned total_weight;
};
-#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, 0, 0, }
+#define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \
+ .count = 0, .min_vdisktime = 0, }
/*
* Per process-grouping structure
@@ -118,11 +124,11 @@ struct cfq_queue {
/* time when queue got scheduled in to dispatch first request. */
unsigned long dispatch_start;
unsigned int allocated_slice;
+ unsigned int slice_dispatch;
/* time when first request from queue completed and slice started. */
unsigned long slice_start;
unsigned long slice_end;
long slice_resid;
- unsigned int slice_dispatch;
/* pending metadata requests */
int meta_pending;
@@ -133,19 +139,16 @@ struct cfq_queue {
unsigned short ioprio, org_ioprio;
unsigned short ioprio_class, org_ioprio_class;
- unsigned int seek_samples;
- u64 seek_total;
- sector_t seek_mean;
- sector_t last_request_pos;
- unsigned long seeky_start;
-
pid_t pid;
+ u32 seek_history;
+ sector_t last_request_pos;
+
struct cfq_rb_root *service_tree;
struct cfq_queue *new_cfqq;
struct cfq_group *cfqg;
struct cfq_group *orig_cfqg;
- /* Sectors dispatched in current dispatch round */
+ /* Number of sectors dispatched from queue in single dispatch round */
unsigned long nr_sectors;
};
@@ -198,6 +201,8 @@ struct cfq_group {
struct hlist_node cfqd_node;
atomic_t ref;
#endif
+ /* number of requests that are on the dispatch list or inside driver */
+ int dispatched;
};
/*
@@ -227,8 +232,8 @@ struct cfq_data {
unsigned int busy_queues;
- int rq_in_driver[2];
- int sync_flight;
+ int rq_in_driver;
+ int rq_in_flight[2];
/*
* queue-depth detection
@@ -271,9 +276,11 @@ struct cfq_data {
unsigned int cfq_slice[2];
unsigned int cfq_slice_async_rq;
unsigned int cfq_slice_idle;
+ unsigned int cfq_group_idle;
unsigned int cfq_latency;
unsigned int cfq_group_isolation;
+ unsigned int cic_index;
struct list_head cic_list;
/*
@@ -314,6 +321,7 @@ enum cfqq_state_flags {
CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */
CFQ_CFQQ_FLAG_sync, /* synchronous queue */
CFQ_CFQQ_FLAG_coop, /* cfqq is shared */
+ CFQ_CFQQ_FLAG_split_coop, /* shared cfqq will be splitted */
CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */
CFQ_CFQQ_FLAG_wait_busy, /* Waiting for next request */
};
@@ -342,11 +350,12 @@ CFQ_CFQQ_FNS(prio_changed);
CFQ_CFQQ_FNS(slice_new);
CFQ_CFQQ_FNS(sync);
CFQ_CFQQ_FNS(coop);
+CFQ_CFQQ_FNS(split_coop);
CFQ_CFQQ_FNS(deep);
CFQ_CFQQ_FNS(wait_busy);
#undef CFQ_CFQQ_FNS
-#ifdef CONFIG_DEBUG_CFQ_IOSCHED
+#ifdef CONFIG_CFQ_GROUP_IOSCHED
#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
@@ -375,6 +384,21 @@ CFQ_CFQQ_FNS(wait_busy);
&cfqg->service_trees[i][j]: NULL) \
+static inline bool iops_mode(struct cfq_data *cfqd)
+{
+ /*
+ * If we are not idling on queues and it is a NCQ drive, parallel
+ * execution of requests is on and measuring time is not possible
+ * in most of the cases until and unless we drive shallower queue
+ * depths and that becomes a performance bottleneck. In such cases
+ * switch to start providing fairness in terms of number of IOs.
+ */
+ if (!cfqd->cfq_slice_idle && cfqd->hw_tag)
+ return true;
+ else
+ return false;
+}
+
static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq)
{
if (cfq_class_idle(cfqq))
@@ -419,11 +443,6 @@ static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool,
static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *,
struct io_context *);
-static inline int rq_in_driver(struct cfq_data *cfqd)
-{
- return cfqd->rq_in_driver[0] + cfqd->rq_in_driver[1];
-}
-
static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic,
bool is_sync)
{
@@ -436,13 +455,31 @@ static inline void cic_set_cfqq(struct cfq_io_context *cic,
cic->cfqq[is_sync] = cfqq;
}
+#define CIC_DEAD_KEY 1ul
+#define CIC_DEAD_INDEX_SHIFT 1
+
+static inline void *cfqd_dead_key(struct cfq_data *cfqd)
+{
+ return (void *)(cfqd->cic_index << CIC_DEAD_INDEX_SHIFT | CIC_DEAD_KEY);
+}
+
+static inline struct cfq_data *cic_to_cfqd(struct cfq_io_context *cic)
+{
+ struct cfq_data *cfqd = cic->key;
+
+ if (unlikely((unsigned long) cfqd & CIC_DEAD_KEY))
+ return NULL;
+
+ return cfqd;
+}
+
/*
* We regard a request as SYNC, if it's either a read or has the SYNC bit
* set (in which case it could also be direct WRITE).
*/
static inline bool cfq_bio_sync(struct bio *bio)
{
- return bio_data_dir(bio) == READ || bio_rw_flagged(bio, BIO_RW_SYNCIO);
+ return bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC);
}
/*
@@ -630,9 +667,10 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2,
return rq1;
else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
return rq2;
- if (rq_is_meta(rq1) && !rq_is_meta(rq2))
+ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
return rq1;
- else if (rq_is_meta(rq2) && !rq_is_meta(rq1))
+ else if ((rq2->cmd_flags & REQ_META) &&
+ !(rq1->cmd_flags & REQ_META))
return rq2;
s1 = blk_rq_pos(rq1);
@@ -863,7 +901,7 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
if (!RB_EMPTY_NODE(&cfqg->rb_node))
cfq_rb_erase(&cfqg->rb_node, st);
cfqg->saved_workload_slice = 0;
- blkiocg_update_blkio_group_dequeue_stats(&cfqg->blkg, 1);
+ cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
}
static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
@@ -889,8 +927,6 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
slice_used = cfqq->allocated_slice;
}
- cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u sect=%lu", slice_used,
- cfqq->nr_sectors);
return slice_used;
}
@@ -898,19 +934,21 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
struct cfq_queue *cfqq)
{
struct cfq_rb_root *st = &cfqd->grp_service_tree;
- unsigned int used_sl, charge_sl;
+ unsigned int used_sl, charge;
int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
- cfqg->service_tree_idle.count;
BUG_ON(nr_sync < 0);
- used_sl = charge_sl = cfq_cfqq_slice_usage(cfqq);
+ used_sl = charge = cfq_cfqq_slice_usage(cfqq);
- if (!cfq_cfqq_sync(cfqq) && !nr_sync)
- charge_sl = cfqq->allocated_slice;
+ if (iops_mode(cfqd))
+ charge = cfqq->slice_dispatch;
+ else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
+ charge = cfqq->allocated_slice;
/* Can't update vdisktime while group is on service tree */
cfq_rb_erase(&cfqg->rb_node, st);
- cfqg->vdisktime += cfq_scale_slice(charge_sl, cfqg);
+ cfqg->vdisktime += cfq_scale_slice(charge, cfqg);
__cfq_group_service_tree_add(st, cfqg);
/* This group is being expired. Save the context */
@@ -924,8 +962,11 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
st->min_vdisktime);
- blkiocg_update_blkio_group_stats(&cfqg->blkg, used_sl,
- cfqq->nr_sectors);
+ cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
+ " sect=%u", used_sl, cfqq->slice_dispatch, charge,
+ iops_mode(cfqd), cfqq->nr_sectors);
+ cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl);
+ cfq_blkiocg_set_start_empty_time(&cfqg->blkg);
}
#ifdef CONFIG_CFQ_GROUP_IOSCHED
@@ -953,11 +994,12 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
unsigned int major, minor;
- /* Do we need to take this reference */
- if (!blkiocg_css_tryget(blkcg))
- return NULL;;
-
cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
+ if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
+ sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+ cfqg->blkg.dev = MKDEV(major, minor);
+ goto done;
+ }
if (cfqg || !create)
goto done;
@@ -965,7 +1007,6 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
if (!cfqg)
goto done;
- cfqg->weight = blkcg->weight;
for_each_cfqg_st(cfqg, i, j, st)
*st = CFQ_RB_ROOT;
RB_CLEAR_NODE(&cfqg->rb_node);
@@ -978,16 +1019,26 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
*/
atomic_set(&cfqg->ref, 1);
- /* Add group onto cgroup list */
- sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
- blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
+ /*
+ * Add group onto cgroup list. It might happen that bdi->dev is
+ * not initiliazed yet. Initialize this new group without major
+ * and minor info and this info will be filled in once a new thread
+ * comes for IO. See code above.
+ */
+ if (bdi->dev) {
+ sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+ cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
MKDEV(major, minor));
+ } else
+ cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
+ 0);
+
+ cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
/* Add group on cfqd list */
hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
done:
- blkiocg_css_put(blkcg);
return cfqg;
}
@@ -1009,6 +1060,12 @@ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
return cfqg;
}
+static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
+{
+ atomic_inc(&cfqg->ref);
+ return cfqg;
+}
+
static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
{
/* Currently, all async queues are mapped to root group */
@@ -1058,7 +1115,7 @@ static void cfq_release_cfq_groups(struct cfq_data *cfqd)
* it from cgroup list, then it will take care of destroying
* cfqg also.
*/
- if (!blkiocg_del_blkio_group(&cfqg->blkg))
+ if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg))
cfq_destroy_cfqg(cfqd, cfqg);
}
}
@@ -1092,6 +1149,12 @@ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
{
return &cfqd->root_group;
}
+
+static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
+{
+ return cfqg;
+}
+
static inline void
cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
cfqq->cfqg = cfqg;
@@ -1394,7 +1457,12 @@ static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
{
elv_rb_del(&cfqq->sort_list, rq);
cfqq->queued[rq_is_sync(rq)]--;
+ cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg,
+ rq_data_dir(rq), rq_is_sync(rq));
cfq_add_rq_rb(rq);
+ cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
+ &cfqq->cfqd->serving_group->blkg, rq_data_dir(rq),
+ rq_is_sync(rq));
}
static struct request *
@@ -1422,9 +1490,9 @@ static void cfq_activate_request(struct request_queue *q, struct request *rq)
{
struct cfq_data *cfqd = q->elevator->elevator_data;
- cfqd->rq_in_driver[rq_is_sync(rq)]++;
+ cfqd->rq_in_driver++;
cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
- rq_in_driver(cfqd));
+ cfqd->rq_in_driver);
cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
}
@@ -1432,12 +1500,11 @@ static void cfq_activate_request(struct request_queue *q, struct request *rq)
static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
{
struct cfq_data *cfqd = q->elevator->elevator_data;
- const int sync = rq_is_sync(rq);
- WARN_ON(!cfqd->rq_in_driver[sync]);
- cfqd->rq_in_driver[sync]--;
+ WARN_ON(!cfqd->rq_in_driver);
+ cfqd->rq_in_driver--;
cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
- rq_in_driver(cfqd));
+ cfqd->rq_in_driver);
}
static void cfq_remove_request(struct request *rq)
@@ -1451,7 +1518,9 @@ static void cfq_remove_request(struct request *rq)
cfq_del_rq_rb(rq);
cfqq->cfqd->rq_queued--;
- if (rq_is_meta(rq)) {
+ cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg,
+ rq_data_dir(rq), rq_is_sync(rq));
+ if (rq->cmd_flags & REQ_META) {
WARN_ON(!cfqq->meta_pending);
cfqq->meta_pending--;
}
@@ -1482,6 +1551,13 @@ static void cfq_merged_request(struct request_queue *q, struct request *req,
}
}
+static void cfq_bio_merged(struct request_queue *q, struct request *req,
+ struct bio *bio)
+{
+ cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(req))->blkg,
+ bio_data_dir(bio), cfq_bio_sync(bio));
+}
+
static void
cfq_merged_requests(struct request_queue *q, struct request *rq,
struct request *next)
@@ -1499,6 +1575,8 @@ cfq_merged_requests(struct request_queue *q, struct request *rq,
if (cfqq->next_rq == next)
cfqq->next_rq = rq;
cfq_remove_request(next);
+ cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg,
+ rq_data_dir(next), rq_is_sync(next));
}
static int cfq_allow_merge(struct request_queue *q, struct request *rq,
@@ -1526,11 +1604,19 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq,
return cfqq == RQ_CFQQ(rq);
}
+static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+{
+ del_timer(&cfqd->idle_slice_timer);
+ cfq_blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg);
+}
+
static void __cfq_set_active_queue(struct cfq_data *cfqd,
struct cfq_queue *cfqq)
{
if (cfqq) {
- cfq_log_cfqq(cfqd, cfqq, "set_active");
+ cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d",
+ cfqd->serving_prio, cfqd->serving_type);
+ cfq_blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg);
cfqq->slice_start = 0;
cfqq->dispatch_start = jiffies;
cfqq->allocated_slice = 0;
@@ -1544,7 +1630,7 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd,
cfq_clear_cfqq_fifo_expire(cfqq);
cfq_mark_cfqq_slice_new(cfqq);
- del_timer(&cfqd->idle_slice_timer);
+ cfq_del_timer(cfqd, cfqq);
}
cfqd->active_queue = cfqq;
@@ -1560,12 +1646,21 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);
if (cfq_cfqq_wait_request(cfqq))
- del_timer(&cfqd->idle_slice_timer);
+ cfq_del_timer(cfqd, cfqq);
cfq_clear_cfqq_wait_request(cfqq);
cfq_clear_cfqq_wait_busy(cfqq);
/*
+ * If this cfqq is shared between multiple processes, check to
+ * make sure that those processes are still issuing I/Os within
+ * the mean seek distance. If not, it may be time to break the
+ * queues apart again.
+ */
+ if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq))
+ cfq_mark_cfqq_split_coop(cfqq);
+
+ /*
* store what was left of this slice, if the queue idled/timed out
*/
if (timed_out && !cfq_cfqq_slice_new(cfqq)) {
@@ -1663,22 +1758,10 @@ static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
return cfqd->last_position - blk_rq_pos(rq);
}
-#define CFQQ_SEEK_THR 8 * 1024
-#define CFQQ_SEEKY(cfqq) ((cfqq)->seek_mean > CFQQ_SEEK_THR)
-
static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq,
- struct request *rq, bool for_preempt)
+ struct request *rq)
{
- sector_t sdist = cfqq->seek_mean;
-
- if (!sample_valid(cfqq->seek_samples))
- sdist = CFQQ_SEEK_THR;
-
- /* if seek_mean is big, using it as close criteria is meaningless */
- if (sdist > CFQQ_SEEK_THR && !for_preempt)
- sdist = CFQQ_SEEK_THR;
-
- return cfq_dist_from_last(cfqd, rq) <= sdist;
+ return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR;
}
static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
@@ -1705,7 +1788,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
* will contain the closest sector.
*/
__cfqq = rb_entry(parent, struct cfq_queue, p_node);
- if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq, false))
+ if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
return __cfqq;
if (blk_rq_pos(__cfqq->next_rq) < sector)
@@ -1716,7 +1799,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
return NULL;
__cfqq = rb_entry(node, struct cfq_queue, p_node);
- if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq, false))
+ if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
return __cfqq;
return NULL;
@@ -1737,6 +1820,8 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
{
struct cfq_queue *cfqq;
+ if (cfq_class_idle(cur_cfqq))
+ return NULL;
if (!cfq_cfqq_sync(cur_cfqq))
return NULL;
if (CFQQ_SEEKY(cur_cfqq))
@@ -1790,6 +1875,9 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
BUG_ON(!service_tree);
BUG_ON(!service_tree->count);
+ if (!cfqd->cfq_slice_idle)
+ return false;
+
/* We never do for idle class queues. */
if (prio == IDLE_WORKLOAD)
return false;
@@ -1803,14 +1891,18 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
* Otherwise, we do only if they are the last ones
* in their service tree.
*/
- return service_tree->count == 1;
+ if (service_tree->count == 1 && cfq_cfqq_sync(cfqq))
+ return 1;
+ cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d",
+ service_tree->count);
+ return 0;
}
static void cfq_arm_slice_timer(struct cfq_data *cfqd)
{
struct cfq_queue *cfqq = cfqd->active_queue;
struct cfq_io_context *cic;
- unsigned long sl;
+ unsigned long sl, group_idle = 0;
/*
* SSD device without seek penalty, disable idling. But only do so
@@ -1826,8 +1918,13 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
/*
* idle is disabled, either manually or by past process history
*/
- if (!cfqd->cfq_slice_idle || !cfq_should_idle(cfqd, cfqq))
- return;
+ if (!cfq_should_idle(cfqd, cfqq)) {
+ /* no queue idling. Check for group idling */
+ if (cfqd->cfq_group_idle)
+ group_idle = cfqd->cfq_group_idle;
+ else
+ return;
+ }
/*
* still active requests from this queue, don't idle
@@ -1848,15 +1945,27 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
* time slice.
*/
if (sample_valid(cic->ttime_samples) &&
- (cfqq->slice_end - jiffies < cic->ttime_mean))
+ (cfqq->slice_end - jiffies < cic->ttime_mean)) {
+ cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%d",
+ cic->ttime_mean);
+ return;
+ }
+
+ /* There are other queues in the group, don't do group idle */
+ if (group_idle && cfqq->cfqg->nr_cfqq > 1)
return;
cfq_mark_cfqq_wait_request(cfqq);
- sl = cfqd->cfq_slice_idle;
+ if (group_idle)
+ sl = cfqd->cfq_group_idle;
+ else
+ sl = cfqd->cfq_slice_idle;
mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
- cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl);
+ cfq_blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg);
+ cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl,
+ group_idle ? 1 : 0);
}
/*
@@ -1872,11 +1981,13 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
cfq_remove_request(rq);
cfqq->dispatched++;
+ (RQ_CFQG(rq))->dispatched++;
elv_dispatch_sort(q, rq);
- if (cfq_cfqq_sync(cfqq))
- cfqd->sync_flight++;
+ cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
cfqq->nr_sectors += blk_rq_sectors(rq);
+ cfq_blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq),
+ rq_data_dir(rq), rq_is_sync(rq));
}
/*
@@ -1930,6 +2041,15 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
int process_refs, new_process_refs;
struct cfq_queue *__cfqq;
+ /*
+ * If there are no process references on the new_cfqq, then it is
+ * unsafe to follow the ->new_cfqq chain as other cfqq's in the
+ * chain may have dropped their last reference (not just their
+ * last process reference).
+ */
+ if (!cfqq_process_refs(new_cfqq))
+ return;
+
/* Avoid a circular list and skip interim queue merges */
while ((__cfqq = new_cfqq->new_cfqq)) {
if (__cfqq == cfqq)
@@ -1938,17 +2058,17 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
}
process_refs = cfqq_process_refs(cfqq);
+ new_process_refs = cfqq_process_refs(new_cfqq);
/*
* If the process for the cfqq has gone away, there is no
* sense in merging the queues.
*/
- if (process_refs == 0)
+ if (process_refs == 0 || new_process_refs == 0)
return;
/*
* Merge in the direction of the lesser amount of work.
*/
- new_process_refs = cfqq_process_refs(new_cfqq);
if (new_process_refs >= process_refs) {
cfqq->new_cfqq = new_cfqq;
atomic_add(process_refs, &new_cfqq->ref);
@@ -2058,6 +2178,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
slice = max(slice, 2 * cfqd->cfq_slice_idle);
slice = max_t(unsigned, slice, CFQ_MIN_TT);
+ cfq_log(cfqd, "workload slice:%d", slice);
cfqd->workload_expires = jiffies + slice;
cfqd->noidle_tree_requires_idle = false;
}
@@ -2131,7 +2252,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
cfqq = NULL;
goto keep_queue;
} else
- goto expire;
+ goto check_group_idle;
}
/*
@@ -2159,8 +2280,23 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
* flight or is idling for a new request, allow either of these
* conditions to happen (or time out) before selecting a new queue.
*/
- if (timer_pending(&cfqd->idle_slice_timer) ||
- (cfqq->dispatched && cfq_should_idle(cfqd, cfqq))) {
+ if (timer_pending(&cfqd->idle_slice_timer)) {
+ cfqq = NULL;
+ goto keep_queue;
+ }
+
+ if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
+ cfqq = NULL;
+ goto keep_queue;
+ }
+
+ /*
+ * If group idle is enabled and there are requests dispatched from
+ * this group, wait for requests to complete.
+ */
+check_group_idle:
+ if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1
+ && cfqq->cfqg->dispatched) {
cfqq = NULL;
goto keep_queue;
}
@@ -2205,16 +2341,32 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
struct cfq_queue *cfqq;
int dispatched = 0;
- while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL)
+ /* Expire the timeslice of the current active queue first */
+ cfq_slice_expired(cfqd, 0);
+ while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {
+ __cfq_set_active_queue(cfqd, cfqq);
dispatched += __cfq_forced_dispatch_cfqq(cfqq);
+ }
- cfq_slice_expired(cfqd, 0);
BUG_ON(cfqd->busy_queues);
cfq_log(cfqd, "forced_dispatch=%d", dispatched);
return dispatched;
}
+static inline bool cfq_slice_used_soon(struct cfq_data *cfqd,
+ struct cfq_queue *cfqq)
+{
+ /* the queue hasn't finished any request, can't estimate */
+ if (cfq_cfqq_slice_new(cfqq))
+ return 1;
+ if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched,
+ cfqq->slice_end))
+ return 1;
+
+ return 0;
+}
+
static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
unsigned int max_dispatch;
@@ -2222,16 +2374,16 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
/*
* Drain async requests before we start sync IO
*/
- if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_driver[BLK_RW_ASYNC])
+ if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC])
return false;
/*
* If this is an async queue and we have sync IO in flight, let it wait
*/
- if (cfqd->sync_flight && !cfq_cfqq_sync(cfqq))
+ if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq))
return false;
- max_dispatch = cfqd->cfq_quantum;
+ max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1);
if (cfq_class_idle(cfqq))
max_dispatch = 1;
@@ -2248,13 +2400,22 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
/*
* We have other queues, don't allow more IO from this one
*/
- if (cfqd->busy_queues > 1)
+ if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq))
return false;
/*
* Sole queue user, no limit
*/
- max_dispatch = -1;
+ if (cfqd->busy_queues == 1)
+ max_dispatch = -1;
+ else
+ /*
+ * Normally we start throttling cfqq when cfq_quantum/2
+ * requests have been dispatched. But we can drive
+ * deeper queue depths at the beginning of slice
+ * subjected to upper limit of cfq_quantum.
+ * */
+ max_dispatch = cfqd->cfq_quantum;
}
/*
@@ -2450,11 +2611,12 @@ static void cfq_cic_free(struct cfq_io_context *cic)
static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic)
{
unsigned long flags;
+ unsigned long dead_key = (unsigned long) cic->key;
- BUG_ON(!cic->dead_key);
+ BUG_ON(!(dead_key & CIC_DEAD_KEY));
spin_lock_irqsave(&ioc->lock, flags);
- radix_tree_delete(&ioc->radix_root, cic->dead_key);
+ radix_tree_delete(&ioc->radix_root, dead_key >> CIC_DEAD_INDEX_SHIFT);
hlist_del_rcu(&cic->cic_list);
spin_unlock_irqrestore(&ioc->lock, flags);
@@ -2477,15 +2639,10 @@ static void cfq_free_io_context(struct io_context *ioc)
__call_for_each_cic(ioc, cic_free_func);
}
-static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+static void cfq_put_cooperator(struct cfq_queue *cfqq)
{
struct cfq_queue *__cfqq, *next;
- if (unlikely(cfqq == cfqd->active_queue)) {
- __cfq_slice_expired(cfqd, cfqq, 0);
- cfq_schedule_dispatch(cfqd);
- }
-
/*
* If this queue was scheduled to merge with another queue, be
* sure to drop the reference taken on that queue (and others in
@@ -2501,6 +2658,16 @@ static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
cfq_put_queue(__cfqq);
__cfqq = next;
}
+}
+
+static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+{
+ if (unlikely(cfqq == cfqd->active_queue)) {
+ __cfq_slice_expired(cfqd, cfqq, 0);
+ cfq_schedule_dispatch(cfqd);
+ }
+
+ cfq_put_cooperator(cfqq);
cfq_put_queue(cfqq);
}
@@ -2513,11 +2680,10 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
list_del_init(&cic->queue_list);
/*
- * Make sure key == NULL is seen for dead queues
+ * Make sure dead mark is seen for dead queues
*/
smp_wmb();
- cic->dead_key = (unsigned long) cic->key;
- cic->key = NULL;
+ cic->key = cfqd_dead_key(cfqd);
if (ioc->ioc_data == cic)
rcu_assign_pointer(ioc->ioc_data, NULL);
@@ -2536,7 +2702,7 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
static void cfq_exit_single_io_context(struct io_context *ioc,
struct cfq_io_context *cic)
{
- struct cfq_data *cfqd = cic->key;
+ struct cfq_data *cfqd = cic_to_cfqd(cic);
if (cfqd) {
struct request_queue *q = cfqd->queue;
@@ -2549,7 +2715,7 @@ static void cfq_exit_single_io_context(struct io_context *ioc,
* race between exiting task and queue
*/
smp_read_barrier_depends();
- if (cic->key)
+ if (cic->key == cfqd)
__cfq_exit_single_io_context(cfqd, cic);
spin_unlock_irqrestore(q->queue_lock, flags);
@@ -2629,7 +2795,7 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic)
{
- struct cfq_data *cfqd = cic->key;
+ struct cfq_data *cfqd = cic_to_cfqd(cic);
struct cfq_queue *cfqq;
unsigned long flags;
@@ -2686,7 +2852,7 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic)
{
struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1);
- struct cfq_data *cfqd = cic->key;
+ struct cfq_data *cfqd = cic_to_cfqd(cic);
unsigned long flags;
struct request_queue *q;
@@ -2823,12 +2989,13 @@ cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc,
unsigned long flags;
WARN_ON(!list_empty(&cic->queue_list));
+ BUG_ON(cic->key != cfqd_dead_key(cfqd));
spin_lock_irqsave(&ioc->lock, flags);
BUG_ON(ioc->ioc_data == cic);
- radix_tree_delete(&ioc->radix_root, (unsigned long) cfqd);
+ radix_tree_delete(&ioc->radix_root, cfqd->cic_index);
hlist_del_rcu(&cic->cic_list);
spin_unlock_irqrestore(&ioc->lock, flags);
@@ -2840,7 +3007,6 @@ cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
{
struct cfq_io_context *cic;
unsigned long flags;
- void *k;
if (unlikely(!ioc))
return NULL;
@@ -2857,13 +3023,11 @@ cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
}
do {
- cic = radix_tree_lookup(&ioc->radix_root, (unsigned long) cfqd);
+ cic = radix_tree_lookup(&ioc->radix_root, cfqd->cic_index);
rcu_read_unlock();
if (!cic)
break;
- /* ->key must be copied to avoid race with cfq_exit_queue() */
- k = cic->key;
- if (unlikely(!k)) {
+ if (unlikely(cic->key != cfqd)) {
cfq_drop_dead_cic(cfqd, ioc, cic);
rcu_read_lock();
continue;
@@ -2896,7 +3060,7 @@ static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
spin_lock_irqsave(&ioc->lock, flags);
ret = radix_tree_insert(&ioc->radix_root,
- (unsigned long) cfqd, cic);
+ cfqd->cic_index, cic);
if (!ret)
hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list);
spin_unlock_irqrestore(&ioc->lock, flags);
@@ -2976,43 +3140,20 @@ static void
cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
struct request *rq)
{
- sector_t sdist;
- u64 total;
-
- if (!cfqq->last_request_pos)
- sdist = 0;
- else if (cfqq->last_request_pos < blk_rq_pos(rq))
- sdist = blk_rq_pos(rq) - cfqq->last_request_pos;
- else
- sdist = cfqq->last_request_pos - blk_rq_pos(rq);
+ sector_t sdist = 0;
+ sector_t n_sec = blk_rq_sectors(rq);
+ if (cfqq->last_request_pos) {
+ if (cfqq->last_request_pos < blk_rq_pos(rq))
+ sdist = blk_rq_pos(rq) - cfqq->last_request_pos;
+ else
+ sdist = cfqq->last_request_pos - blk_rq_pos(rq);
+ }
- /*
- * Don't allow the seek distance to get too large from the
- * odd fragment, pagein, etc
- */
- if (cfqq->seek_samples <= 60) /* second&third seek */
- sdist = min(sdist, (cfqq->seek_mean * 4) + 2*1024*1024);
+ cfqq->seek_history <<= 1;
+ if (blk_queue_nonrot(cfqd->queue))
+ cfqq->seek_history |= (n_sec < CFQQ_SECT_THR_NONROT);
else
- sdist = min(sdist, (cfqq->seek_mean * 4) + 2*1024*64);
-
- cfqq->seek_samples = (7*cfqq->seek_samples + 256) / 8;
- cfqq->seek_total = (7*cfqq->seek_total + (u64)256*sdist) / 8;
- total = cfqq->seek_total + (cfqq->seek_samples/2);
- do_div(total, cfqq->seek_samples);
- cfqq->seek_mean = (sector_t)total;
-
- /*
- * If this cfqq is shared between multiple processes, check to
- * make sure that those processes are still issuing I/Os within
- * the mean seek distance. If not, it may be time to break the
- * queues apart again.
- */
- if (cfq_cfqq_coop(cfqq)) {
- if (CFQQ_SEEKY(cfqq) && !cfqq->seeky_start)
- cfqq->seeky_start = jiffies;
- else if (!CFQQ_SEEKY(cfqq))
- cfqq->seeky_start = 0;
- }
+ cfqq->seek_history |= (sdist > CFQQ_SEEK_THR);
}
/*
@@ -3037,8 +3178,7 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
cfq_mark_cfqq_deep(cfqq);
if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
- (!cfq_cfqq_deep(cfqq) && sample_valid(cfqq->seek_samples)
- && CFQQ_SEEKY(cfqq)))
+ (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
enable_idle = 0;
else if (sample_valid(cic->ttime_samples)) {
if (cic->ttime_mean > cfqd->cfq_slice_idle)
@@ -3106,7 +3246,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
* So both queues are sync. Let the new request get disk time if
* it's a metadata request and the current queue is doing regular IO.
*/
- if (rq_is_meta(rq) && !cfqq->meta_pending)
+ if ((rq->cmd_flags & REQ_META) && !cfqq->meta_pending)
return true;
/*
@@ -3122,7 +3262,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
* if this request is as-good as one we would expect from the
* current cfqq, let it preempt
*/
- if (cfq_rq_close(cfqd, cfqq, rq, true))
+ if (cfq_rq_close(cfqd, cfqq, rq))
return true;
return false;
@@ -3160,7 +3300,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
struct cfq_io_context *cic = RQ_CIC(rq);
cfqd->rq_queued++;
- if (rq_is_meta(rq))
+ if (rq->cmd_flags & REQ_META)
cfqq->meta_pending++;
cfq_update_io_thinktime(cfqd, cic);
@@ -3183,11 +3323,14 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
if (cfq_cfqq_wait_request(cfqq)) {
if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
cfqd->busy_queues > 1) {
- del_timer(&cfqd->idle_slice_timer);
+ cfq_del_timer(cfqd, cfqq);
cfq_clear_cfqq_wait_request(cfqq);
__blk_run_queue(cfqd->queue);
- } else
+ } else {
+ cfq_blkiocg_update_idle_time_stats(
+ &cfqq->cfqg->blkg);
cfq_mark_cfqq_must_dispatch(cfqq);
+ }
}
} else if (cfq_should_preempt(cfqd, cfqq, rq)) {
/*
@@ -3212,7 +3355,9 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
list_add_tail(&rq->queuelist, &cfqq->fifo);
cfq_add_rq_rb(rq);
-
+ cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
+ &cfqd->serving_group->blkg, rq_data_dir(rq),
+ rq_is_sync(rq));
cfq_rq_enqueued(cfqd, cfqq, rq);
}
@@ -3224,14 +3369,14 @@ static void cfq_update_hw_tag(struct cfq_data *cfqd)
{
struct cfq_queue *cfqq = cfqd->active_queue;
- if (rq_in_driver(cfqd) > cfqd->hw_tag_est_depth)
- cfqd->hw_tag_est_depth = rq_in_driver(cfqd);
+ if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth)
+ cfqd->hw_tag_est_depth = cfqd->rq_in_driver;
if (cfqd->hw_tag == 1)
return;
if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
- rq_in_driver(cfqd) <= CFQ_HW_QUEUE_MIN)
+ cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
return;
/*
@@ -3241,7 +3386,7 @@ static void cfq_update_hw_tag(struct cfq_data *cfqd)
*/
if (cfqq && cfq_cfqq_idle_window(cfqq) &&
cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] <
- CFQ_HW_QUEUE_MIN && rq_in_driver(cfqd) < CFQ_HW_QUEUE_MIN)
+ CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN)
return;
if (cfqd->hw_tag_samples++ < 50)
@@ -3290,17 +3435,21 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
unsigned long now;
now = jiffies;
- cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", !!rq_noidle(rq));
+ cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d",
+ !!(rq->cmd_flags & REQ_NOIDLE));
cfq_update_hw_tag(cfqd);
- WARN_ON(!cfqd->rq_in_driver[sync]);
+ WARN_ON(!cfqd->rq_in_driver);
WARN_ON(!cfqq->dispatched);
- cfqd->rq_in_driver[sync]--;
+ cfqd->rq_in_driver--;
cfqq->dispatched--;
+ (RQ_CFQG(rq))->dispatched--;
+ cfq_blkiocg_update_completion_stats(&cfqq->cfqg->blkg,
+ rq_start_time_ns(rq), rq_io_start_time_ns(rq),
+ rq_data_dir(rq), rq_is_sync(rq));
- if (cfq_cfqq_sync(cfqq))
- cfqd->sync_flight--;
+ cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
if (sync) {
RQ_CIC(rq)->last_end_request = now;
@@ -3325,8 +3474,12 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
* the queue.
*/
if (cfq_should_wait_busy(cfqd, cfqq)) {
- cfqq->slice_end = jiffies + cfqd->cfq_slice_idle;
+ unsigned long extend_sl = cfqd->cfq_slice_idle;
+ if (!cfqd->cfq_slice_idle)
+ extend_sl = cfqd->cfq_group_idle;
+ cfqq->slice_end = jiffies + extend_sl;
cfq_mark_cfqq_wait_busy(cfqq);
+ cfq_log_cfqq(cfqd, cfqq, "will busy wait");
}
/*
@@ -3341,11 +3494,12 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
cfq_slice_expired(cfqd, 1);
else if (sync && cfqq_empty &&
!cfq_close_cooperator(cfqd, cfqq)) {
- cfqd->noidle_tree_requires_idle |= !rq_noidle(rq);
+ cfqd->noidle_tree_requires_idle |=
+ !(rq->cmd_flags & REQ_NOIDLE);
/*
* Idling is enabled for SYNC_WORKLOAD.
* SYNC_NOIDLE_WORKLOAD idles at the end of the tree
- * only if we processed at least one !rq_noidle request
+ * only if we processed at least one !REQ_NOIDLE request
*/
if (cfqd->serving_type == SYNC_WORKLOAD
|| cfqd->noidle_tree_requires_idle
@@ -3354,7 +3508,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
}
}
- if (!rq_in_driver(cfqd))
+ if (!cfqd->rq_in_driver)
cfq_schedule_dispatch(cfqd);
}
@@ -3438,6 +3592,10 @@ static void cfq_put_request(struct request *rq)
rq->elevator_private = NULL;
rq->elevator_private2 = NULL;
+ /* Put down rq reference on cfqg */
+ cfq_put_cfqg(RQ_CFQG(rq));
+ rq->elevator_private3 = NULL;
+
cfq_put_queue(cfqq);
}
}
@@ -3453,14 +3611,6 @@ cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic,
return cic_to_cfqq(cic, 1);
}
-static int should_split_cfqq(struct cfq_queue *cfqq)
-{
- if (cfqq->seeky_start &&
- time_after(jiffies, cfqq->seeky_start + CFQQ_COOP_TOUT))
- return 1;
- return 0;
-}
-
/*
* Returns NULL if a new cfqq should be allocated, or the old cfqq if this
* was the last process referring to said cfqq.
@@ -3469,13 +3619,16 @@ static struct cfq_queue *
split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq)
{
if (cfqq_process_refs(cfqq) == 1) {
- cfqq->seeky_start = 0;
cfqq->pid = current->pid;
cfq_clear_cfqq_coop(cfqq);
+ cfq_clear_cfqq_split_coop(cfqq);
return cfqq;
}
cic_set_cfqq(cic, NULL, 1);
+
+ cfq_put_cooperator(cfqq);
+
cfq_put_queue(cfqq);
return NULL;
}
@@ -3510,7 +3663,7 @@ new_queue:
/*
* If the queue was seeky for too long, break it apart.
*/
- if (cfq_cfqq_coop(cfqq) && should_split_cfqq(cfqq)) {
+ if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) {
cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq");
cfqq = split_cfqq(cic, cfqq);
if (!cfqq)
@@ -3534,6 +3687,7 @@ new_queue:
rq->elevator_private = cic;
rq->elevator_private2 = cfqq;
+ rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg);
return 0;
queue_fail:
@@ -3661,16 +3815,38 @@ static void cfq_exit_queue(struct elevator_queue *e)
cfq_put_async_queues(cfqd);
cfq_release_cfq_groups(cfqd);
- blkiocg_del_blkio_group(&cfqd->root_group.blkg);
+ cfq_blkiocg_del_blkio_group(&cfqd->root_group.blkg);
spin_unlock_irq(q->queue_lock);
cfq_shutdown_timer_wq(cfqd);
+ spin_lock(&cic_index_lock);
+ ida_remove(&cic_index_ida, cfqd->cic_index);
+ spin_unlock(&cic_index_lock);
+
/* Wait for cfqg->blkg->key accessors to exit their grace periods. */
call_rcu(&cfqd->rcu, cfq_cfqd_free);
}
+static int cfq_alloc_cic_index(void)
+{
+ int index, error;
+
+ do {
+ if (!ida_pre_get(&cic_index_ida, GFP_KERNEL))
+ return -ENOMEM;
+
+ spin_lock(&cic_index_lock);
+ error = ida_get_new(&cic_index_ida, &index);
+ spin_unlock(&cic_index_lock);
+ if (error && error != -EAGAIN)
+ return error;
+ } while (error);
+
+ return index;
+}
+
static void *cfq_init_queue(struct request_queue *q)
{
struct cfq_data *cfqd;
@@ -3678,10 +3854,16 @@ static void *cfq_init_queue(struct request_queue *q)
struct cfq_group *cfqg;
struct cfq_rb_root *st;
+ i = cfq_alloc_cic_index();
+ if (i < 0)
+ return NULL;
+
cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
if (!cfqd)
return NULL;
+ cfqd->cic_index = i;
+
/* Init root service tree */
cfqd->grp_service_tree = CFQ_RB_ROOT;
@@ -3700,8 +3882,10 @@ static void *cfq_init_queue(struct request_queue *q)
* to make sure that cfq_put_cfqg() does not try to kfree root group
*/
atomic_set(&cfqg->ref, 1);
- blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, (void *)cfqd,
- 0);
+ rcu_read_lock();
+ cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg,
+ (void *)cfqd, 0);
+ rcu_read_unlock();
#endif
/*
* Not strictly needed (since RB_ROOT just clears the node and we
@@ -3739,6 +3923,7 @@ static void *cfq_init_queue(struct request_queue *q)
cfqd->cfq_slice[1] = cfq_slice_sync;
cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
cfqd->cfq_slice_idle = cfq_slice_idle;
+ cfqd->cfq_group_idle = cfq_group_idle;
cfqd->cfq_latency = 1;
cfqd->cfq_group_isolation = 0;
cfqd->hw_tag = -1;
@@ -3747,7 +3932,6 @@ static void *cfq_init_queue(struct request_queue *q)
* second, in order to have larger depth for async operations.
*/
cfqd->last_delayed_sync = jiffies - HZ;
- INIT_RCU_HEAD(&cfqd->rcu);
return cfqd;
}
@@ -3812,6 +3996,7 @@ SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
+SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1);
SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
@@ -3844,6 +4029,7 @@ STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,
UINT_MAX, 0);
STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
+STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
@@ -3865,6 +4051,7 @@ static struct elv_fs_entry cfq_attrs[] = {
CFQ_ATTR(slice_async),
CFQ_ATTR(slice_async_rq),
CFQ_ATTR(slice_idle),
+ CFQ_ATTR(group_idle),
CFQ_ATTR(low_latency),
CFQ_ATTR(group_isolation),
__ATTR_NULL
@@ -3876,6 +4063,7 @@ static struct elevator_type iosched_cfq = {
.elevator_merged_fn = cfq_merged_request,
.elevator_merge_req_fn = cfq_merged_requests,
.elevator_allow_merge_fn = cfq_allow_merge,
+ .elevator_bio_merged_fn = cfq_bio_merged,
.elevator_dispatch_fn = cfq_dispatch_requests,
.elevator_add_req_fn = cfq_insert_request,
.elevator_activate_req_fn = cfq_activate_request,
@@ -3917,6 +4105,12 @@ static int __init cfq_init(void)
if (!cfq_slice_idle)
cfq_slice_idle = 1;
+#ifdef CONFIG_CFQ_GROUP_IOSCHED
+ if (!cfq_group_idle)
+ cfq_group_idle = 1;
+#else
+ cfq_group_idle = 0;
+#endif
if (cfq_slab_setup())
return -ENOMEM;
@@ -3941,6 +4135,7 @@ static void __exit cfq_exit(void)
*/
if (elv_ioc_count_read(cfq_ioc_count))
wait_for_completion(&all_gone);
+ ida_destroy(&cic_index_ida);
cfq_slab_kill();
}
diff --git a/block/cfq.h b/block/cfq.h
new file mode 100644
index 000000000000..93448e5a2e41
--- /dev/null
+++ b/block/cfq.h
@@ -0,0 +1,115 @@
+#ifndef _CFQ_H
+#define _CFQ_H
+#include "blk-cgroup.h"
+
+#ifdef CONFIG_CFQ_GROUP_IOSCHED
+static inline void cfq_blkiocg_update_io_add_stats(struct blkio_group *blkg,
+ struct blkio_group *curr_blkg, bool direction, bool sync)
+{
+ blkiocg_update_io_add_stats(blkg, curr_blkg, direction, sync);
+}
+
+static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg,
+ unsigned long dequeue)
+{
+ blkiocg_update_dequeue_stats(blkg, dequeue);
+}
+
+static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg,
+ unsigned long time)
+{
+ blkiocg_update_timeslice_used(blkg, time);
+}
+
+static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg)
+{
+ blkiocg_set_start_empty_time(blkg);
+}
+
+static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg,
+ bool direction, bool sync)
+{
+ blkiocg_update_io_remove_stats(blkg, direction, sync);
+}
+
+static inline void cfq_blkiocg_update_io_merged_stats(struct blkio_group *blkg,
+ bool direction, bool sync)
+{
+ blkiocg_update_io_merged_stats(blkg, direction, sync);
+}
+
+static inline void cfq_blkiocg_update_idle_time_stats(struct blkio_group *blkg)
+{
+ blkiocg_update_idle_time_stats(blkg);
+}
+
+static inline void
+cfq_blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg)
+{
+ blkiocg_update_avg_queue_size_stats(blkg);
+}
+
+static inline void
+cfq_blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
+{
+ blkiocg_update_set_idle_time_stats(blkg);
+}
+
+static inline void cfq_blkiocg_update_dispatch_stats(struct blkio_group *blkg,
+ uint64_t bytes, bool direction, bool sync)
+{
+ blkiocg_update_dispatch_stats(blkg, bytes, direction, sync);
+}
+
+static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg, uint64_t start_time, uint64_t io_start_time, bool direction, bool sync)
+{
+ blkiocg_update_completion_stats(blkg, start_time, io_start_time,
+ direction, sync);
+}
+
+static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
+ struct blkio_group *blkg, void *key, dev_t dev) {
+ blkiocg_add_blkio_group(blkcg, blkg, key, dev);
+}
+
+static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg)
+{
+ return blkiocg_del_blkio_group(blkg);
+}
+
+#else /* CFQ_GROUP_IOSCHED */
+static inline void cfq_blkiocg_update_io_add_stats(struct blkio_group *blkg,
+ struct blkio_group *curr_blkg, bool direction, bool sync) {}
+
+static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg,
+ unsigned long dequeue) {}
+
+static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg,
+ unsigned long time) {}
+static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) {}
+static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg,
+ bool direction, bool sync) {}
+static inline void cfq_blkiocg_update_io_merged_stats(struct blkio_group *blkg,
+ bool direction, bool sync) {}
+static inline void cfq_blkiocg_update_idle_time_stats(struct blkio_group *blkg)
+{
+}
+static inline void
+cfq_blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) {}
+
+static inline void
+cfq_blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) {}
+
+static inline void cfq_blkiocg_update_dispatch_stats(struct blkio_group *blkg,
+ uint64_t bytes, bool direction, bool sync) {}
+static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg, uint64_t start_time, uint64_t io_start_time, bool direction, bool sync) {}
+
+static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
+ struct blkio_group *blkg, void *key, dev_t dev) {}
+static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg)
+{
+ return 0;
+}
+
+#endif /* CFQ_GROUP_IOSCHED */
+#endif
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index 4eb8e9ea4af5..119f07b74dc0 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -6,6 +6,7 @@
#include <linux/elevator.h>
#include <linux/fd.h>
#include <linux/hdreg.h>
+#include <linux/slab.h>
#include <linux/syscalls.h>
#include <linux/smp_lock.h>
#include <linux/types.h>
@@ -534,56 +535,6 @@ out:
return err;
}
-struct compat_blk_user_trace_setup {
- char name[32];
- u16 act_mask;
- u32 buf_size;
- u32 buf_nr;
- compat_u64 start_lba;
- compat_u64 end_lba;
- u32 pid;
-};
-#define BLKTRACESETUP32 _IOWR(0x12, 115, struct compat_blk_user_trace_setup)
-
-static int compat_blk_trace_setup(struct block_device *bdev, char __user *arg)
-{
- struct blk_user_trace_setup buts;
- struct compat_blk_user_trace_setup cbuts;
- struct request_queue *q;
- char b[BDEVNAME_SIZE];
- int ret;
-
- q = bdev_get_queue(bdev);
- if (!q)
- return -ENXIO;
-
- if (copy_from_user(&cbuts, arg, sizeof(cbuts)))
- return -EFAULT;
-
- bdevname(bdev, b);
-
- buts = (struct blk_user_trace_setup) {
- .act_mask = cbuts.act_mask,
- .buf_size = cbuts.buf_size,
- .buf_nr = cbuts.buf_nr,
- .start_lba = cbuts.start_lba,
- .end_lba = cbuts.end_lba,
- .pid = cbuts.pid,
- };
- memcpy(&buts.name, &cbuts.name, 32);
-
- mutex_lock(&bdev->bd_mutex);
- ret = do_blk_trace_setup(q, b, bdev->bd_dev, bdev, &buts);
- mutex_unlock(&bdev->bd_mutex);
- if (ret)
- return ret;
-
- if (copy_to_user(arg, &buts.name, 32))
- return -EFAULT;
-
- return 0;
-}
-
static int compat_blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode,
unsigned cmd, unsigned long arg)
{
@@ -752,6 +703,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
case BLKFLSBUF:
case BLKROSET:
case BLKDISCARD:
+ case BLKSECDISCARD:
/*
* the ones below are implemented in blkdev_locked_ioctl,
* but we call blkdev_ioctl, which gets the lock for us
@@ -801,16 +753,10 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
return compat_put_u64(arg, bdev->bd_inode->i_size);
case BLKTRACESETUP32:
- lock_kernel();
- ret = compat_blk_trace_setup(bdev, compat_ptr(arg));
- unlock_kernel();
- return ret;
case BLKTRACESTART: /* compatible */
case BLKTRACESTOP: /* compatible */
case BLKTRACETEARDOWN: /* compatible */
- lock_kernel();
ret = blk_trace_ioctl(bdev, cmd, compat_ptr(arg));
- unlock_kernel();
return ret;
default:
if (disk->fops->compat_ioctl)
diff --git a/block/elevator.c b/block/elevator.c
index 9ad5ccc4c5ee..4e11559aa2b0 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -79,8 +79,13 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio)
/*
* Don't merge file system requests and discard requests
*/
- if (bio_rw_flagged(bio, BIO_RW_DISCARD) !=
- bio_rw_flagged(rq->bio, BIO_RW_DISCARD))
+ if ((bio->bi_rw & REQ_DISCARD) != (rq->bio->bi_rw & REQ_DISCARD))
+ return 0;
+
+ /*
+ * Don't merge discard requests and secure discard requests
+ */
+ if ((bio->bi_rw & REQ_SECURE) != (rq->bio->bi_rw & REQ_SECURE))
return 0;
/*
@@ -154,7 +159,7 @@ static struct elevator_type *elevator_get(const char *name)
spin_unlock(&elv_list_lock);
- sprintf(elv, "%s-iosched", name);
+ snprintf(elv, sizeof(elv), "%s-iosched", name);
request_module("%s", elv);
spin_lock(&elv_list_lock);
@@ -242,9 +247,11 @@ int elevator_init(struct request_queue *q, char *name)
{
struct elevator_type *e = NULL;
struct elevator_queue *eq;
- int ret = 0;
void *data;
+ if (unlikely(q->elevator))
+ return 0;
+
INIT_LIST_HEAD(&q->queue_head);
q->last_merge = NULL;
q->end_sector = 0;
@@ -284,7 +291,7 @@ int elevator_init(struct request_queue *q, char *name)
}
elevator_attach(q, eq, data);
- return ret;
+ return 0;
}
EXPORT_SYMBOL(elevator_init);
@@ -426,7 +433,8 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq)
list_for_each_prev(entry, &q->queue_head) {
struct request *pos = list_entry_rq(entry);
- if (blk_discard_rq(rq) != blk_discard_rq(pos))
+ if ((rq->cmd_flags & REQ_DISCARD) !=
+ (pos->cmd_flags & REQ_DISCARD))
break;
if (rq_data_dir(rq) != rq_data_dir(pos))
break;
@@ -474,6 +482,15 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
int ret;
/*
+ * Levels of merges:
+ * nomerges: No merges at all attempted
+ * noxmerges: Only simple one-hit cache try
+ * merges: All merge tries attempted
+ */
+ if (blk_queue_nomerges(q))
+ return ELEVATOR_NO_MERGE;
+
+ /*
* First try one-hit cache.
*/
if (q->last_merge) {
@@ -484,7 +501,7 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
}
}
- if (blk_queue_nomerges(q))
+ if (blk_queue_noxmerges(q))
return ELEVATOR_NO_MERGE;
/*
@@ -530,6 +547,15 @@ void elv_merge_requests(struct request_queue *q, struct request *rq,
q->last_merge = rq;
}
+void elv_bio_merged(struct request_queue *q, struct request *rq,
+ struct bio *bio)
+{
+ struct elevator_queue *e = q->elevator;
+
+ if (e->ops->elevator_bio_merged_fn)
+ e->ops->elevator_bio_merged_fn(q, rq, bio);
+}
+
void elv_requeue_request(struct request_queue *q, struct request *rq)
{
/*
@@ -538,7 +564,7 @@ void elv_requeue_request(struct request_queue *q, struct request *rq)
*/
if (blk_account_rq(rq)) {
q->in_flight[rq_is_sync(rq)]--;
- if (blk_sorted_rq(rq))
+ if (rq->cmd_flags & REQ_SORTED)
elv_deactivate_rq(q, rq);
}
@@ -624,7 +650,8 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
break;
case ELEVATOR_INSERT_SORT:
- BUG_ON(!blk_fs_request(rq) && !blk_discard_rq(rq));
+ BUG_ON(rq->cmd_type != REQ_TYPE_FS &&
+ !(rq->cmd_flags & REQ_DISCARD));
rq->cmd_flags |= REQ_SORTED;
q->nr_sorted++;
if (rq_mergeable(rq)) {
@@ -696,7 +723,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where,
/*
* toggle ordered color
*/
- if (blk_barrier_rq(rq))
+ if (rq->cmd_flags & REQ_HARDBARRIER)
q->ordcolor ^= 1;
/*
@@ -709,7 +736,8 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where,
* this request is scheduling boundary, update
* end_sector
*/
- if (blk_fs_request(rq) || blk_discard_rq(rq)) {
+ if (rq->cmd_type == REQ_TYPE_FS ||
+ (rq->cmd_flags & REQ_DISCARD)) {
q->end_sector = rq_end_sector(rq);
q->boundary_rq = rq;
}
@@ -823,7 +851,8 @@ void elv_completed_request(struct request_queue *q, struct request *rq)
*/
if (blk_account_rq(rq)) {
q->in_flight[rq_is_sync(rq)]--;
- if (blk_sorted_rq(rq) && e->ops->elevator_completed_req_fn)
+ if ((rq->cmd_flags & REQ_SORTED) &&
+ e->ops->elevator_completed_req_fn)
e->ops->elevator_completed_req_fn(q, rq);
}
@@ -883,7 +912,7 @@ elv_attr_store(struct kobject *kobj, struct attribute *attr,
return error;
}
-static struct sysfs_ops elv_sysfs_ops = {
+static const struct sysfs_ops elv_sysfs_ops = {
.show = elv_attr_show,
.store = elv_attr_store,
};
@@ -909,14 +938,17 @@ int elv_register_queue(struct request_queue *q)
}
}
kobject_uevent(&e->kobj, KOBJ_ADD);
+ e->registered = 1;
}
return error;
}
+EXPORT_SYMBOL(elv_register_queue);
static void __elv_unregister_queue(struct elevator_queue *e)
{
kobject_uevent(&e->kobj, KOBJ_REMOVE);
kobject_del(&e->kobj);
+ e->registered = 0;
}
void elv_unregister_queue(struct request_queue *q)
@@ -924,6 +956,7 @@ void elv_unregister_queue(struct request_queue *q)
if (q)
__elv_unregister_queue(q->elevator);
}
+EXPORT_SYMBOL(elv_unregister_queue);
void elv_register(struct elevator_type *e)
{
@@ -978,18 +1011,19 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
{
struct elevator_queue *old_elevator, *e;
void *data;
+ int err;
/*
* Allocate new elevator
*/
e = elevator_alloc(q, new_e);
if (!e)
- return 0;
+ return -ENOMEM;
data = elevator_init_queue(q, e);
if (!data) {
kobject_put(&e->kobj);
- return 0;
+ return -ENOMEM;
}
/*
@@ -1010,10 +1044,13 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
spin_unlock_irq(q->queue_lock);
- __elv_unregister_queue(old_elevator);
+ if (old_elevator->registered) {
+ __elv_unregister_queue(old_elevator);
- if (elv_register_queue(q))
- goto fail_register;
+ err = elv_register_queue(q);
+ if (err)
+ goto fail_register;
+ }
/*
* finally exit old elevator and turn off BYPASS.
@@ -1025,7 +1062,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
blk_add_trace_msg(q, "elv switch: %s", e->elevator_type->elevator_name);
- return 1;
+ return 0;
fail_register:
/*
@@ -1040,17 +1077,19 @@ fail_register:
queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q);
spin_unlock_irq(q->queue_lock);
- return 0;
+ return err;
}
-ssize_t elv_iosched_store(struct request_queue *q, const char *name,
- size_t count)
+/*
+ * Switch this queue to the given IO scheduler.
+ */
+int elevator_change(struct request_queue *q, const char *name)
{
char elevator_name[ELV_NAME_MAX];
struct elevator_type *e;
if (!q->elevator)
- return count;
+ return -ENXIO;
strlcpy(elevator_name, name, sizeof(elevator_name));
e = elevator_get(strstrip(elevator_name));
@@ -1061,13 +1100,27 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name,
if (!strcmp(elevator_name, q->elevator->elevator_type->elevator_name)) {
elevator_put(e);
- return count;
+ return 0;
}
- if (!elevator_switch(q, e))
- printk(KERN_ERR "elevator: switch to %s failed\n",
- elevator_name);
- return count;
+ return elevator_switch(q, e);
+}
+EXPORT_SYMBOL(elevator_change);
+
+ssize_t elv_iosched_store(struct request_queue *q, const char *name,
+ size_t count)
+{
+ int ret;
+
+ if (!q->elevator)
+ return count;
+
+ ret = elevator_change(q, name);
+ if (!ret)
+ return count;
+
+ printk(KERN_ERR "elevator: switch to %s failed\n", name);
+ return ret;
}
ssize_t elv_iosched_show(struct request_queue *q, char *name)
@@ -1077,7 +1130,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name)
struct elevator_type *__e;
int len = 0;
- if (!q->elevator)
+ if (!q->elevator || !blk_queue_stackable(q))
return sprintf(name, "none\n");
elv = e->elevator_type;
diff --git a/block/genhd.c b/block/genhd.c
index d13ba76a169c..59a2db6fecef 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -596,6 +596,7 @@ struct gendisk *get_gendisk(dev_t devt, int *partno)
return disk;
}
+EXPORT_SYMBOL(get_gendisk);
/**
* bdget_disk - do bdget() by gendisk and partition number
@@ -987,7 +988,6 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno)
if (!new_ptbl)
return -ENOMEM;
- INIT_RCU_HEAD(&new_ptbl->rcu_head);
new_ptbl->len = target;
for (i = 0; i < len; i++)
diff --git a/block/ioctl.c b/block/ioctl.c
index be48ea51faee..d8052f0dabd3 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -1,5 +1,6 @@
#include <linux/capability.h>
#include <linux/blkdev.h>
+#include <linux/gfp.h>
#include <linux/blkpg.h>
#include <linux/hdreg.h>
#include <linux/backing-dev.h>
@@ -113,8 +114,10 @@ static int blkdev_reread_part(struct block_device *bdev)
}
static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
- uint64_t len)
+ uint64_t len, int secure)
{
+ unsigned long flags = BLKDEV_IFL_WAIT;
+
if (start & 511)
return -EINVAL;
if (len & 511)
@@ -124,8 +127,9 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
if (start + len > (bdev->bd_inode->i_size >> 9))
return -EINVAL;
- return blkdev_issue_discard(bdev, start, len, GFP_KERNEL,
- DISCARD_FL_WAIT);
+ if (secure)
+ flags |= BLKDEV_IFL_SECURE;
+ return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags);
}
static int put_ushort(unsigned long arg, unsigned short val)
@@ -162,18 +166,10 @@ int __blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode,
unsigned cmd, unsigned long arg)
{
struct gendisk *disk = bdev->bd_disk;
- int ret;
if (disk->fops->ioctl)
return disk->fops->ioctl(bdev, mode, cmd, arg);
- if (disk->fops->locked_ioctl) {
- lock_kernel();
- ret = disk->fops->locked_ioctl(bdev, mode, cmd, arg);
- unlock_kernel();
- return ret;
- }
-
return -ENOTTY;
}
/*
@@ -184,8 +180,7 @@ int __blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode,
EXPORT_SYMBOL_GPL(__blkdev_driver_ioctl);
/*
- * always keep this in sync with compat_blkdev_ioctl() and
- * compat_blkdev_locked_ioctl()
+ * always keep this in sync with compat_blkdev_ioctl()
*/
int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
unsigned long arg)
@@ -205,10 +200,8 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
if (ret != -EINVAL && ret != -ENOTTY)
return ret;
- lock_kernel();
fsync_bdev(bdev);
invalidate_bdev(bdev);
- unlock_kernel();
return 0;
case BLKROSET:
@@ -220,12 +213,11 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
return -EACCES;
if (get_user(n, (int __user *)(arg)))
return -EFAULT;
- lock_kernel();
set_device_ro(bdev, n);
- unlock_kernel();
return 0;
- case BLKDISCARD: {
+ case BLKDISCARD:
+ case BLKSECDISCARD: {
uint64_t range[2];
if (!(mode & FMODE_WRITE))
@@ -234,7 +226,8 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
if (copy_from_user(range, (void __user *)arg, sizeof(range)))
return -EFAULT;
- return blk_ioctl_discard(bdev, range[0], range[1]);
+ return blk_ioctl_discard(bdev, range[0], range[1],
+ cmd == BLKSECDISCARD);
}
case HDIO_GETGEO: {
@@ -308,14 +301,10 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
bd_release(bdev);
return ret;
case BLKPG:
- lock_kernel();
ret = blkpg_ioctl(bdev, (struct blkpg_ioctl_arg __user *) arg);
- unlock_kernel();
break;
case BLKRRPART:
- lock_kernel();
ret = blkdev_reread_part(bdev);
- unlock_kernel();
break;
case BLKGETSIZE:
size = bdev->bd_inode->i_size;
@@ -328,9 +317,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
case BLKTRACESTOP:
case BLKTRACESETUP:
case BLKTRACETEARDOWN:
- lock_kernel();
ret = blk_trace_ioctl(bdev, cmd, (char __user *) arg);
- unlock_kernel();
break;
default:
ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
diff --git a/block/noop-iosched.c b/block/noop-iosched.c
index 3a0d369d08c7..232c4b38cd37 100644
--- a/block/noop-iosched.c
+++ b/block/noop-iosched.c
@@ -5,6 +5,7 @@
#include <linux/elevator.h>
#include <linux/bio.h>
#include <linux/module.h>
+#include <linux/slab.h>
#include <linux/init.h>
struct noop_data {