From 762380ad9322951cea4ce9d24864265f9c66a916 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 5 Jun 2014 13:38:39 -0600 Subject: block: add notion of a chunk size for request merging Some drivers have different limits on what size a request should optimally be, depending on the offset of the request. Similar to dividing a device into chunks. Add a setting that allows the driver to inform the block layer of such a chunk size. The block layer will then prevent merging across the chunks. This is needed to optimally support NVMe with a non-zero stripe size. Signed-off-by: Jens Axboe --- block/bio.c | 3 ++- block/blk-settings.c | 18 ++++++++++++++++++ include/linux/blkdev.h | 22 +++++++++++++++++++++- 3 files changed, 41 insertions(+), 2 deletions(-) diff --git a/block/bio.c b/block/bio.c index 96d28eee8a1e..97e832cc9b9c 100644 --- a/block/bio.c +++ b/block/bio.c @@ -849,7 +849,8 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); - return __bio_add_page(q, bio, page, len, offset, queue_max_sectors(q)); + + return __bio_add_page(q, bio, page, len, offset, blk_max_size_offset(q, bio->bi_iter.bi_sector)); } EXPORT_SYMBOL(bio_add_page); diff --git a/block/blk-settings.c b/block/blk-settings.c index 5d21239bc859..a2b9cb195e70 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -113,6 +113,7 @@ void blk_set_default_limits(struct queue_limits *lim) lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS; + lim->chunk_sectors = 0; lim->max_write_same_sectors = 0; lim->max_discard_sectors = 0; lim->discard_granularity = 0; @@ -276,6 +277,23 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto } EXPORT_SYMBOL(blk_queue_max_hw_sectors); +/** + * blk_queue_chunk_sectors - set size of the chunk for this queue + * @q: the request queue for the device + * @chunk_sectors: chunk sectors in the usual 512b unit + * + * Description: + * If a driver doesn't want IOs to cross a given chunk size, it can set + * this limit and prevent merging across chunks. Note that the chunk size + * must currently be a power-of-2 in sectors. + **/ +void blk_queue_chunk_sectors(struct request_queue *q, unsigned int chunk_sectors) +{ + BUG_ON(!is_power_of_2(chunk_sectors)); + q->limits.chunk_sectors = chunk_sectors; +} +EXPORT_SYMBOL(blk_queue_chunk_sectors); + /** * blk_queue_max_discard_sectors - set max sectors for a single discard * @q: the request queue for the device diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3cd426e971db..dc2c703f05fd 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -280,6 +280,7 @@ struct queue_limits { unsigned long seg_boundary_mask; unsigned int max_hw_sectors; + unsigned int chunk_sectors; unsigned int max_sectors; unsigned int max_segment_size; unsigned int physical_block_size; @@ -910,6 +911,20 @@ static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q, return q->limits.max_sectors; } +/* + * Return maximum size of a request at given offset. Only valid for + * file system requests. + */ +static inline unsigned int blk_max_size_offset(struct request_queue *q, + sector_t offset) +{ + if (!q->limits.chunk_sectors) + return q->limits.max_hw_sectors; + + return q->limits.chunk_sectors - + (offset & (q->limits.chunk_sectors - 1)); +} + static inline unsigned int blk_rq_get_max_sectors(struct request *rq) { struct request_queue *q = rq->q; @@ -917,7 +932,11 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq) if (unlikely(rq->cmd_type == REQ_TYPE_BLOCK_PC)) return q->limits.max_hw_sectors; - return blk_queue_get_max_sectors(q, rq->cmd_flags); + if (!q->limits.chunk_sectors) + return blk_queue_get_max_sectors(q, rq->cmd_flags); + + return min(blk_max_size_offset(q, blk_rq_pos(rq)), + blk_queue_get_max_sectors(q, rq->cmd_flags)); } static inline unsigned int blk_rq_count_bios(struct request *rq) @@ -983,6 +1002,7 @@ extern void blk_queue_make_request(struct request_queue *, make_request_fn *); extern void blk_queue_bounce_limit(struct request_queue *, u64); extern void blk_limits_max_hw_sectors(struct queue_limits *, unsigned int); extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int); +extern void blk_queue_chunk_sectors(struct request_queue *, unsigned int); extern void blk_queue_max_segments(struct request_queue *, unsigned short); extern void blk_queue_max_segment_size(struct request_queue *, unsigned int); extern void blk_queue_max_discard_sectors(struct request_queue *q, -- cgit v1.2.3-70-g09d2 From f27b087b81b70513b8c61ec20596c868f7b93474 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 6 Jun 2014 07:57:37 -0600 Subject: block: add blk_rq_set_block_pc() With the optimizations around not clearing the full request at alloc time, we are leaving some of the needed init for REQ_TYPE_BLOCK_PC up to the user allocating the request. Add a blk_rq_set_block_pc() that sets the command type to REQ_TYPE_BLOCK_PC, and properly initializes the members associated with this type of request. Update callers to use this function instead of manipulating rq->cmd_type directly. Includes fixes from Christoph Hellwig for my half-assed attempt. Signed-off-by: Jens Axboe --- block/blk-core.c | 18 ++++++++++++++++++ block/bsg.c | 3 ++- block/scsi_ioctl.c | 6 +++--- drivers/block/pktcdvd.c | 2 +- drivers/cdrom/cdrom.c | 2 +- drivers/scsi/device_handler/scsi_dh_alua.c | 2 +- drivers/scsi/device_handler/scsi_dh_emc.c | 2 +- drivers/scsi/device_handler/scsi_dh_hp_sw.c | 4 ++-- drivers/scsi/device_handler/scsi_dh_rdac.c | 2 +- drivers/scsi/osd/osd_initiator.c | 4 ++-- drivers/scsi/osst.c | 2 +- drivers/scsi/scsi_error.c | 3 ++- drivers/scsi/scsi_lib.c | 2 +- drivers/scsi/sg.c | 3 +-- drivers/scsi/st.c | 2 +- drivers/target/target_core_pscsi.c | 3 ++- include/linux/blkdev.h | 1 + 17 files changed, 41 insertions(+), 20 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 40d654861c33..9aca8c71e70b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1218,6 +1218,8 @@ struct request *blk_make_request(struct request_queue *q, struct bio *bio, if (unlikely(!rq)) return ERR_PTR(-ENOMEM); + blk_rq_set_block_pc(rq); + for_each_bio(bio) { struct bio *bounce_bio = bio; int ret; @@ -1234,6 +1236,22 @@ struct request *blk_make_request(struct request_queue *q, struct bio *bio, } EXPORT_SYMBOL(blk_make_request); +/** + * blk_rq_set_block_pc - initialize a requeest to type BLOCK_PC + * @rq: request to be initialized + * + */ +void blk_rq_set_block_pc(struct request *rq) +{ + rq->cmd_type = REQ_TYPE_BLOCK_PC; + rq->__data_len = 0; + rq->__sector = (sector_t) -1; + rq->bio = rq->biotail = NULL; + memset(rq->__cmd, 0, sizeof(rq->__cmd)); + rq->cmd = rq->__cmd; +} +EXPORT_SYMBOL(blk_rq_set_block_pc); + /** * blk_requeue_request - put a request back on queue * @q: request queue where request should be inserted diff --git a/block/bsg.c b/block/bsg.c index e5214c148096..ff46addde5d8 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -196,7 +196,6 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, * fill in request structure */ rq->cmd_len = hdr->request_len; - rq->cmd_type = REQ_TYPE_BLOCK_PC; rq->timeout = msecs_to_jiffies(hdr->timeout); if (!rq->timeout) @@ -273,6 +272,8 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm, rq = blk_get_request(q, rw, GFP_KERNEL); if (!rq) return ERR_PTR(-ENOMEM); + blk_rq_set_block_pc(rq); + ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd, has_write_perm); if (ret) goto out; diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 9c28a5b38042..14695c6221c8 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -229,7 +229,6 @@ static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq, * fill in request structure */ rq->cmd_len = hdr->cmd_len; - rq->cmd_type = REQ_TYPE_BLOCK_PC; rq->timeout = msecs_to_jiffies(hdr->timeout); if (!rq->timeout) @@ -311,6 +310,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, rq = blk_get_request(q, writing ? WRITE : READ, GFP_KERNEL); if (!rq) return -ENOMEM; + blk_rq_set_block_pc(rq); if (blk_fill_sghdr_rq(q, rq, hdr, mode)) { blk_put_request(rq); @@ -491,7 +491,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, memset(sense, 0, sizeof(sense)); rq->sense = sense; rq->sense_len = 0; - rq->cmd_type = REQ_TYPE_BLOCK_PC; + blk_rq_set_block_pc(rq); blk_execute_rq(q, disk, rq, 0); @@ -524,7 +524,7 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk, int err; rq = blk_get_request(q, WRITE, __GFP_WAIT); - rq->cmd_type = REQ_TYPE_BLOCK_PC; + blk_rq_set_block_pc(rq); rq->timeout = BLK_DEFAULT_SG_TIMEOUT; rq->cmd[0] = cmd; rq->cmd[4] = data; diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index ef166ad2dbad..758ac442c5b5 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -704,6 +704,7 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command * rq = blk_get_request(q, (cgc->data_direction == CGC_DATA_WRITE) ? WRITE : READ, __GFP_WAIT); + blk_rq_set_block_pc(rq); if (cgc->buflen) { ret = blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen, @@ -716,7 +717,6 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command * memcpy(rq->cmd, cgc->cmd, CDROM_PACKET_SIZE); rq->timeout = 60*HZ; - rq->cmd_type = REQ_TYPE_BLOCK_PC; if (cgc->quiet) rq->cmd_flags |= REQ_QUIET; diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index 49ac5662585b..0f40c95049c0 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -2184,6 +2184,7 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf, ret = -ENOMEM; break; } + blk_rq_set_block_pc(rq); ret = blk_rq_map_user(q, rq, NULL, ubuf, len, GFP_KERNEL); if (ret) { @@ -2203,7 +2204,6 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf, rq->cmd[9] = 0xf8; rq->cmd_len = 12; - rq->cmd_type = REQ_TYPE_BLOCK_PC; rq->timeout = 60 * HZ; bio = rq->bio; diff --git a/drivers/scsi/device_handler/scsi_dh_alua.c b/drivers/scsi/device_handler/scsi_dh_alua.c index 5248c888552b..7bcf67eec921 100644 --- a/drivers/scsi/device_handler/scsi_dh_alua.c +++ b/drivers/scsi/device_handler/scsi_dh_alua.c @@ -120,6 +120,7 @@ static struct request *get_alua_req(struct scsi_device *sdev, "%s: blk_get_request failed\n", __func__); return NULL; } + blk_rq_set_block_pc(rq); if (buflen && blk_rq_map_kern(q, rq, buffer, buflen, GFP_NOIO)) { blk_put_request(rq); @@ -128,7 +129,6 @@ static struct request *get_alua_req(struct scsi_device *sdev, return NULL; } - rq->cmd_type = REQ_TYPE_BLOCK_PC; rq->cmd_flags |= REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER; rq->retries = ALUA_FAILOVER_RETRIES; diff --git a/drivers/scsi/device_handler/scsi_dh_emc.c b/drivers/scsi/device_handler/scsi_dh_emc.c index e1c8be06de9d..6f07f7fe3aa1 100644 --- a/drivers/scsi/device_handler/scsi_dh_emc.c +++ b/drivers/scsi/device_handler/scsi_dh_emc.c @@ -280,6 +280,7 @@ static struct request *get_req(struct scsi_device *sdev, int cmd, return NULL; } + blk_rq_set_block_pc(rq); rq->cmd_len = COMMAND_SIZE(cmd); rq->cmd[0] = cmd; @@ -304,7 +305,6 @@ static struct request *get_req(struct scsi_device *sdev, int cmd, break; } - rq->cmd_type = REQ_TYPE_BLOCK_PC; rq->cmd_flags |= REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER; rq->timeout = CLARIION_TIMEOUT; diff --git a/drivers/scsi/device_handler/scsi_dh_hp_sw.c b/drivers/scsi/device_handler/scsi_dh_hp_sw.c index 084062bb8ee9..e9d9fea9e272 100644 --- a/drivers/scsi/device_handler/scsi_dh_hp_sw.c +++ b/drivers/scsi/device_handler/scsi_dh_hp_sw.c @@ -120,7 +120,7 @@ retry: if (!req) return SCSI_DH_RES_TEMP_UNAVAIL; - req->cmd_type = REQ_TYPE_BLOCK_PC; + blk_rq_set_block_pc(req); req->cmd_flags |= REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER; req->cmd_len = COMMAND_SIZE(TEST_UNIT_READY); @@ -250,7 +250,7 @@ static int hp_sw_start_stop(struct hp_sw_dh_data *h) if (!req) return SCSI_DH_RES_TEMP_UNAVAIL; - req->cmd_type = REQ_TYPE_BLOCK_PC; + blk_rq_set_block_pc(req); req->cmd_flags |= REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER; req->cmd_len = COMMAND_SIZE(START_STOP); diff --git a/drivers/scsi/device_handler/scsi_dh_rdac.c b/drivers/scsi/device_handler/scsi_dh_rdac.c index 4b9cf93f3fb6..826069db9848 100644 --- a/drivers/scsi/device_handler/scsi_dh_rdac.c +++ b/drivers/scsi/device_handler/scsi_dh_rdac.c @@ -279,6 +279,7 @@ static struct request *get_rdac_req(struct scsi_device *sdev, "get_rdac_req: blk_get_request failed.\n"); return NULL; } + blk_rq_set_block_pc(rq); if (buflen && blk_rq_map_kern(q, rq, buffer, buflen, GFP_NOIO)) { blk_put_request(rq); @@ -287,7 +288,6 @@ static struct request *get_rdac_req(struct scsi_device *sdev, return NULL; } - rq->cmd_type = REQ_TYPE_BLOCK_PC; rq->cmd_flags |= REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER; rq->retries = RDAC_RETRIES; diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c index bac04c2335aa..5f4cbf0c4759 100644 --- a/drivers/scsi/osd/osd_initiator.c +++ b/drivers/scsi/osd/osd_initiator.c @@ -1570,6 +1570,7 @@ static struct request *_make_request(struct request_queue *q, bool has_write, if (unlikely(!req)) return ERR_PTR(-ENOMEM); + blk_rq_set_block_pc(req); return req; } } @@ -1590,7 +1591,6 @@ static int _init_blk_request(struct osd_request *or, } or->request = req; - req->cmd_type = REQ_TYPE_BLOCK_PC; req->cmd_flags |= REQ_QUIET; req->timeout = or->timeout; @@ -1608,7 +1608,7 @@ static int _init_blk_request(struct osd_request *or, ret = PTR_ERR(req); goto out; } - req->cmd_type = REQ_TYPE_BLOCK_PC; + blk_rq_set_block_pc(req); or->in.req = or->request->next_rq = req; } } else if (has_in) diff --git a/drivers/scsi/osst.c b/drivers/scsi/osst.c index 21883a2d6324..0727ea7cc387 100644 --- a/drivers/scsi/osst.c +++ b/drivers/scsi/osst.c @@ -365,7 +365,7 @@ static int osst_execute(struct osst_request *SRpnt, const unsigned char *cmd, if (!req) return DRIVER_ERROR << 24; - req->cmd_type = REQ_TYPE_BLOCK_PC; + blk_rq_set_block_pc(req); req->cmd_flags |= REQ_QUIET; SRpnt->bio = NULL; diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index f17aa7aa7879..af624619d547 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -1951,6 +1951,8 @@ static void scsi_eh_lock_door(struct scsi_device *sdev) */ req = blk_get_request(sdev->request_queue, READ, GFP_KERNEL); + blk_rq_set_block_pc(req); + req->cmd[0] = ALLOW_MEDIUM_REMOVAL; req->cmd[1] = 0; req->cmd[2] = 0; @@ -1960,7 +1962,6 @@ static void scsi_eh_lock_door(struct scsi_device *sdev) req->cmd_len = COMMAND_SIZE(req->cmd[0]); - req->cmd_type = REQ_TYPE_BLOCK_PC; req->cmd_flags |= REQ_QUIET; req->timeout = 10 * HZ; req->retries = 5; diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index a0c95cac91f0..c3c1697b143e 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -195,6 +195,7 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, req = blk_get_request(sdev->request_queue, write, __GFP_WAIT); if (!req) return ret; + blk_rq_set_block_pc(req); if (bufflen && blk_rq_map_kern(sdev->request_queue, req, buffer, bufflen, __GFP_WAIT)) @@ -206,7 +207,6 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, req->sense_len = 0; req->retries = retries; req->timeout = timeout; - req->cmd_type = REQ_TYPE_BLOCK_PC; req->cmd_flags |= flags | REQ_QUIET | REQ_PREEMPT; /* diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index df5e961484e1..53268aaba559 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1653,10 +1653,9 @@ static int sg_start_req(Sg_request *srp, unsigned char *cmd) if (!rq) return -ENOMEM; + blk_rq_set_block_pc(rq); memcpy(rq->cmd, cmd, hp->cmd_len); - rq->cmd_len = hp->cmd_len; - rq->cmd_type = REQ_TYPE_BLOCK_PC; srp->rq = rq; rq->end_io_data = srp; diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index afc834e172c6..14eb4b256a03 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -484,7 +484,7 @@ static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd, if (!req) return DRIVER_ERROR << 24; - req->cmd_type = REQ_TYPE_BLOCK_PC; + blk_rq_set_block_pc(req); req->cmd_flags |= REQ_QUIET; mdata->null_mapped = 1; diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index 0f199f6a0738..94d00df28f39 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -1055,6 +1055,8 @@ pscsi_execute_cmd(struct se_cmd *cmd) ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; goto fail; } + + blk_rq_set_block_pc(req); } else { BUG_ON(!cmd->data_length); @@ -1071,7 +1073,6 @@ pscsi_execute_cmd(struct se_cmd *cmd) } } - req->cmd_type = REQ_TYPE_BLOCK_PC; req->end_io = pscsi_req_done; req->end_io_data = cmd; req->cmd_len = scsi_command_size(pt->pscsi_cdb); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index dc2c703f05fd..31e11051f1ba 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -796,6 +796,7 @@ extern void __blk_put_request(struct request_queue *, struct request *); extern struct request *blk_get_request(struct request_queue *, int, gfp_t); extern struct request *blk_make_request(struct request_queue *, struct bio *, gfp_t); +extern void blk_rq_set_block_pc(struct request *); extern void blk_requeue_request(struct request_queue *, struct request *); extern void blk_add_request_payload(struct request *rq, struct page *page, unsigned int len); -- cgit v1.2.3-70-g09d2 From a4391c6465d9c978fd4bded12e34bdde3f5458f0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 5 Jun 2014 15:21:56 -0600 Subject: blk-mq: bump max tag depth to 10K tags For some scsi-mq cases, the tag map can be huge. So increase the max number of tags we support. Additionally, don't fail with EINVAL if a user requests too many tags. Warn that the tag depth has been adjusted down, and store the new value inside the tag_set passed in. Signed-off-by: Jens Axboe --- block/blk-mq.c | 13 ++++++++++++- include/linux/blk-mq.h | 2 +- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 4e4cd6208052..a6ee74e27957 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1967,13 +1967,19 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb, return NOTIFY_OK; } +/* + * Alloc a tag set to be associated with one or more request queues. + * May fail with EINVAL for various error conditions. May adjust the + * requested depth down, if if it too large. In that case, the set + * value will be stored in set->queue_depth. + */ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) { int i; if (!set->nr_hw_queues) return -EINVAL; - if (!set->queue_depth || set->queue_depth > BLK_MQ_MAX_DEPTH) + if (!set->queue_depth) return -EINVAL; if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) return -EINVAL; @@ -1981,6 +1987,11 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (!set->nr_hw_queues || !set->ops->queue_rq || !set->ops->map_queue) return -EINVAL; + if (set->queue_depth > BLK_MQ_MAX_DEPTH) { + pr_info("blk-mq: reduced tag depth to %u\n", + BLK_MQ_MAX_DEPTH); + set->queue_depth = BLK_MQ_MAX_DEPTH; + } set->tags = kmalloc_node(set->nr_hw_queues * sizeof(struct blk_mq_tags *), diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 0feedebfde48..a002cf191427 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -135,7 +135,7 @@ enum { BLK_MQ_S_STOPPED = 0, BLK_MQ_S_TAG_ACTIVE = 1, - BLK_MQ_MAX_DEPTH = 2048, + BLK_MQ_MAX_DEPTH = 10240, BLK_MQ_CPU_WORK_BATCH = 8, }; -- cgit v1.2.3-70-g09d2 From 3b632cf0eaa2e89a12c18f043e6e7c5bcc003645 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 6 Jun 2014 10:22:07 -0600 Subject: blk-mq: don't allow queue entering for a dying queue If the queue is going away, don't let new allocs or queueing happen on it. Go through the normal wait process, and exit with ENODEV in that case. Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index a6ee74e27957..75fc33f34251 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -82,8 +82,10 @@ static int blk_mq_queue_enter(struct request_queue *q) __percpu_counter_add(&q->mq_usage_counter, 1, 1000000); smp_wmb(); - /* we have problems to freeze the queue if it's initializing */ - if (!blk_queue_bypass(q) || !blk_queue_init_done(q)) + + /* we have problems freezing the queue if it's initializing */ + if (!blk_queue_dying(q) && + (!blk_queue_bypass(q) || !blk_queue_init_done(q))) return 0; __percpu_counter_add(&q->mq_usage_counter, -1, 1000000); -- cgit v1.2.3-70-g09d2 From f6be4fb4bcb396fc3b1c134b7863351972de081f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 6 Jun 2014 11:03:48 -0600 Subject: blk-mq: ->timeout should be cleared in blk_mq_rq_ctx_init() It'll be used in blk_mq_start_request() to set a potential timeout for the request, so clear it to zero at alloc time to ensure that we know if someone has set it or not. Fixes random early timeouts on NVMe testing. Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index 75fc33f34251..fafea52281ac 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -204,6 +204,8 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, rq->sense = NULL; INIT_LIST_HEAD(&rq->timeout_list); + rq->timeout = 0; + rq->end_io = NULL; rq->end_io_data = NULL; rq->next_rq = NULL; -- cgit v1.2.3-70-g09d2 From f45c40a92d2c6915a0e88ff8a947095be2ba1c8e Mon Sep 17 00:00:00 2001 From: Sam Bradshaw Date: Fri, 6 Jun 2014 13:28:48 -0600 Subject: mtip32xx: minor performance enhancements This patch adds the following: 1) Compiler hinting in the fast path. 2) A prefetch of port->flags to eliminate moderate cpu stalling later in mtip_hw_submit_io(). 3) Eliminate a redundant rq_data_dir(). 4) Reorder members of driver_data to eliminate false cacheline sharing between irq_workers_active and unal_qdepth. With some workload and topology configurations, I'm seeing ~1.5% throughput improvement in small block random read benchmarks as well as improved latency std. dev. Signed-off-by: Sam Bradshaw Add include of Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 15 +++++++++------ drivers/block/mtip32xx/mtip32xx.h | 8 ++++---- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 74abd49fabdc..295f3afbbef5 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -39,6 +39,7 @@ #include <../drivers/ata/ahci.h> #include #include +#include #include "mtip32xx.h" #define HW_CMD_SLOT_SZ (MTIP_MAX_COMMAND_SLOTS * 32) @@ -2380,6 +2381,8 @@ static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq, /* Map the scatter list for DMA access */ nents = dma_map_sg(&dd->pdev->dev, command->sg, nents, dma_dir); + prefetch(&port->flags); + command->scatter_ents = nents; /* @@ -2392,7 +2395,7 @@ static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq, fis = command->command; fis->type = 0x27; fis->opts = 1 << 7; - if (rq_data_dir(rq) == READ) + if (dma_dir == DMA_FROM_DEVICE) fis->command = ATA_CMD_FPDMA_READ; else fis->command = ATA_CMD_FPDMA_WRITE; @@ -2412,7 +2415,7 @@ static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq, fis->res3 = 0; fill_command_sg(dd, command, nents); - if (command->unaligned) + if (unlikely(command->unaligned)) fis->device |= 1 << 7; /* Populate the command header */ @@ -2433,7 +2436,7 @@ static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq, * To prevent this command from being issued * if an internal command is in progress or error handling is active. */ - if (port->flags & MTIP_PF_PAUSE_IO) { + if (unlikely(port->flags & MTIP_PF_PAUSE_IO)) { set_bit(rq->tag, port->cmds_to_issue); set_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags); return; @@ -3754,7 +3757,7 @@ static bool mtip_check_unal_depth(struct blk_mq_hw_ctx *hctx, struct driver_data *dd = hctx->queue->queuedata; struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); - if (!dd->unal_qdepth || rq_data_dir(rq) == READ) + if (rq_data_dir(rq) == READ || !dd->unal_qdepth) return false; /* @@ -3776,11 +3779,11 @@ static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq) { int ret; - if (mtip_check_unal_depth(hctx, rq)) + if (unlikely(mtip_check_unal_depth(hctx, rq))) return BLK_MQ_RQ_QUEUE_BUSY; ret = mtip_submit_request(hctx, rq); - if (!ret) + if (likely(!ret)) return BLK_MQ_RQ_QUEUE_OK; rq->errors = ret; diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h index 4b9b554234bc..ba1b31ee22ec 100644 --- a/drivers/block/mtip32xx/mtip32xx.h +++ b/drivers/block/mtip32xx/mtip32xx.h @@ -493,19 +493,19 @@ struct driver_data { struct workqueue_struct *isr_workq; - struct mtip_work work[MTIP_MAX_SLOT_GROUPS]; - atomic_t irq_workers_active; + struct mtip_work work[MTIP_MAX_SLOT_GROUPS]; + int isr_binding; struct block_device *bdev; - int unal_qdepth; /* qdepth of unaligned IO queue */ - struct list_head online_list; /* linkage for online list */ struct list_head remove_list; /* linkage for removing list */ + + int unal_qdepth; /* qdepth of unaligned IO queue */ }; #endif -- cgit v1.2.3-70-g09d2 From de83953f9d710f84c4a162a1d498a73475c07d98 Mon Sep 17 00:00:00 2001 From: Rickard Strandqvist Date: Sat, 7 Jun 2014 00:37:26 +0200 Subject: block: blk-exec.c: Cleaning up local variable address returnd Address of local variable assigned to a function parameter This was partly found using a static code analysis program called cppcheck. Signed-off-by: Rickard Strandqvist Signed-off-by: Jens Axboe --- block/blk-exec.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/block/blk-exec.c b/block/blk-exec.c index dbf4502b1d67..f4d27b12c90b 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -132,6 +132,11 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, if (rq->errors) err = -EIO; + if (rq->sense == sense) { + rq->sense = NULL; + rq->sense_len = 0; + } + return err; } EXPORT_SYMBOL(blk_execute_rq); -- cgit v1.2.3-70-g09d2 From 3ee3237239583a6555db4f297d00eebdbb6d76ad Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 9 Jun 2014 09:36:53 -0600 Subject: blk-mq: always initialize request->start_time The blk-mq core only initializes this if io stats are enabled, since blk-mq only reads the field in that case. But drivers could potentially use it internally, so ensure that we always set it to the current time when the request is allocated. Reported-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index fafea52281ac..a5ea37d7e820 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -185,6 +185,7 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, RB_CLEAR_NODE(&rq->rb_node); rq->rq_disk = NULL; rq->part = NULL; + rq->start_time = jiffies; #ifdef CONFIG_BLK_CGROUP rq->rl = NULL; set_start_time_ns(rq); @@ -1104,10 +1105,8 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) { init_request_from_bio(rq, bio); - if (blk_do_io_stat(rq)) { - rq->start_time = jiffies; + if (blk_do_io_stat(rq)) blk_account_io_start(rq, 1); - } } static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx, -- cgit v1.2.3-70-g09d2 From 2b8393b43ec672bb263009cd74c056ab01d6ac17 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 10 Jun 2014 00:16:41 +0800 Subject: blk-mq: add timer in blk_mq_start_request This way will become consistent with non-mq case, also avoid to update rq->deadline twice for mq. The comment said: "We do this early, to ensure we are on the right CPU.", but no percpu stuff is used in blk_add_timer(), so it isn't necessary. Even when inserting from plug list, there is no such guarantee at all. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index a5ea37d7e820..e11f5f8e0313 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -411,16 +411,7 @@ static void blk_mq_start_request(struct request *rq, bool last) if (unlikely(blk_bidi_rq(rq))) rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq); - /* - * Just mark start time and set the started bit. Due to memory - * ordering, we know we'll see the correct deadline as long as - * REQ_ATOMIC_STARTED is seen. Use the default queue timeout, - * unless one has been set in the request. - */ - if (!rq->timeout) - rq->deadline = jiffies + q->rq_timeout; - else - rq->deadline = jiffies + rq->timeout; + blk_add_timer(rq); /* * Mark us as started and clear complete. Complete might have been @@ -972,11 +963,6 @@ static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, list_add_tail(&rq->queuelist, &ctx->rq_list); blk_mq_hctx_mark_pending(hctx, ctx); - - /* - * We do this early, to ensure we are on the right CPU. - */ - blk_add_timer(rq); } void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue, @@ -1219,7 +1205,6 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_mq_bio_to_request(rq, bio); blk_mq_start_request(rq, true); - blk_add_timer(rq); /* * For OK queue, we are done. For error, kill it. Any other -- cgit v1.2.3-70-g09d2 From 58a4915ad2f8a87f4456aac260396df7e300e6f2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 10 Jun 2014 12:53:56 -0600 Subject: block: ensure that bio_add_page() always accepts a page for an empty bio With commit 762380ad9322 added support for chunk sizes and no merging across them, it broke the rule of always allowing adding of a single page to an empty bio. So relax the restriction a bit to allow for that, similarly to what we have always done. This fixes a crash with mkfs.xfs and 512b sector sizes on NVMe. Reported-by: Keith Busch Signed-off-by: Jens Axboe --- block/bio.c | 7 ++++++- block/blk-settings.c | 5 ++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/block/bio.c b/block/bio.c index 97e832cc9b9c..2d64488e51c6 100644 --- a/block/bio.c +++ b/block/bio.c @@ -849,8 +849,13 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); + unsigned int max_sectors; - return __bio_add_page(q, bio, page, len, offset, blk_max_size_offset(q, bio->bi_iter.bi_sector)); + max_sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector); + if ((max_sectors < (len >> 9)) && !bio->bi_iter.bi_size) + max_sectors = len >> 9; + + return __bio_add_page(q, bio, page, len, offset, max_sectors); } EXPORT_SYMBOL(bio_add_page); diff --git a/block/blk-settings.c b/block/blk-settings.c index a2b9cb195e70..f1a1795a5683 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -285,7 +285,10 @@ EXPORT_SYMBOL(blk_queue_max_hw_sectors); * Description: * If a driver doesn't want IOs to cross a given chunk size, it can set * this limit and prevent merging across chunks. Note that the chunk size - * must currently be a power-of-2 in sectors. + * must currently be a power-of-2 in sectors. Also note that the block + * layer must accept a page worth of data at any offset. So if the + * crossing of chunks is a hard limitation in the driver, it must still be + * prepared to split single page bios. **/ void blk_queue_chunk_sectors(struct request_queue *q, unsigned int chunk_sectors) { -- cgit v1.2.3-70-g09d2 From b5097e956a4d2919ee248d6481e4204c5568ed5c Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Tue, 10 Jun 2014 20:04:50 +0200 Subject: block: add __init to elv_register elv_register is only called by elevator init functions: __init cfq_init __init deadline_init __init noop_init Cc: Andrew Morton Signed-off-by: Fabian Frederick Signed-off-by: Jens Axboe --- block/elevator.c | 2 +- include/linux/elevator.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/block/elevator.c b/block/elevator.c index 1e01b66a0b92..f35edddfe9b5 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -845,7 +845,7 @@ void elv_unregister_queue(struct request_queue *q) } EXPORT_SYMBOL(elv_unregister_queue); -int elv_register(struct elevator_type *e) +int __init elv_register(struct elevator_type *e) { char *def = ""; diff --git a/include/linux/elevator.h b/include/linux/elevator.h index df63bd3a8cf1..4ff262e2bf37 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -144,7 +144,7 @@ extern void elv_drain_elevator(struct request_queue *); * io scheduler registration */ extern void __init load_default_elevator_module(void); -extern int elv_register(struct elevator_type *); +extern int __init elv_register(struct elevator_type *); extern void elv_unregister(struct elevator_type *); /* -- cgit v1.2.3-70-g09d2 From a2d445d440003f2d70ee4cd4970ea82ace616fee Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Tue, 10 Jun 2014 20:18:36 +0200 Subject: block: add __init to blkcg_policy_register blkcg_policy_register is only called by __init functions: __init cfq_init __init throtl_init Cc: Andrew Morton Signed-off-by: Fabian Frederick Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 2 +- block/blk-cgroup.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 1039fb9ff5f5..5aa9b8b8a362 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1093,7 +1093,7 @@ EXPORT_SYMBOL_GPL(blkcg_deactivate_policy); * Register @pol with blkcg core. Might sleep and @pol may be modified on * successful registration. Returns 0 on success and -errno on failure. */ -int blkcg_policy_register(struct blkcg_policy *pol) +int __init blkcg_policy_register(struct blkcg_policy *pol) { int i, ret; diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 371fe8e92ab5..5480adedd387 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -145,7 +145,7 @@ void blkcg_drain_queue(struct request_queue *q); void blkcg_exit_queue(struct request_queue *q); /* Blkio controller policy registration */ -int blkcg_policy_register(struct blkcg_policy *pol); +int __init blkcg_policy_register(struct blkcg_policy *pol); void blkcg_policy_unregister(struct blkcg_policy *pol); int blkcg_activate_policy(struct request_queue *q, const struct blkcg_policy *pol); @@ -580,7 +580,7 @@ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { ret static inline int blkcg_init_queue(struct request_queue *q) { return 0; } static inline void blkcg_drain_queue(struct request_queue *q) { } static inline void blkcg_exit_queue(struct request_queue *q) { } -static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } +static inline int __init blkcg_policy_register(struct blkcg_policy *pol) { return 0; } static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { } static inline int blkcg_activate_policy(struct request_queue *q, const struct blkcg_policy *pol) { return 0; } -- cgit v1.2.3-70-g09d2