From adce7e9856798d4883f42c3d8429123707fa34e8 Mon Sep 17 00:00:00 2001 From: Edmund Nadolski Date: Wed, 27 Nov 2019 10:17:43 -0700 Subject: nvme: remove unused return code from nvme_alloc_ns The return code of nvme_alloc_ns is never used, so change it to void. Reviewed-by: Christoph Hellwig Signed-off-by: Edmund Nadolski Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index a4d8c90ee7cc..414076aaf52b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3480,7 +3480,7 @@ static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns) return 0; } -static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) +static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) { struct nvme_ns *ns; struct gendisk *disk; @@ -3490,13 +3490,11 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); if (!ns) - return -ENOMEM; + return; ns->queue = blk_mq_init_queue(ctrl->tagset); - if (IS_ERR(ns->queue)) { - ret = PTR_ERR(ns->queue); + if (IS_ERR(ns->queue)) goto out_free_ns; - } if (ctrl->opts && ctrl->opts->data_digest) ns->queue->backing_dev_info->capabilities @@ -3519,10 +3517,8 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) if (ret) goto out_free_queue; - if (id->ncap == 0) { - ret = -EINVAL; + if (id->ncap == 0) /* no namespace (legacy quirk) */ goto out_free_id; - } ret = nvme_init_ns_head(ns, nsid, id); if (ret) @@ -3531,10 +3527,8 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) nvme_set_disk_name(disk_name, ns, ctrl, &flags); disk = alloc_disk_node(0, node); - if (!disk) { - ret = -ENOMEM; + if (!disk) goto out_unlink_ns; - } disk->fops = &nvme_fops; disk->private_data = ns; @@ -3565,7 +3559,7 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) nvme_fault_inject_init(&ns->fault_inject, ns->disk->disk_name); kfree(id); - return 0; + return; out_put_disk: put_disk(ns->disk); out_unlink_ns: @@ -3579,9 +3573,6 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) blk_cleanup_queue(ns->queue); out_free_ns: kfree(ns); - if (ret > 0) - ret = blk_status_to_errno(nvme_error_status(ret)); - return ret; } static void nvme_ns_remove(struct nvme_ns *ns) -- cgit v1.2.3-70-g09d2 From 527123c7deafd5aa921773f739887d610d59b437 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Sun, 26 Jan 2020 12:35:44 -0800 Subject: nvmet: configfs code cleanup This is a pure code cleanup patch which does not change any functionality. It removes extra blank lines and gets rid of an else branch that is redundant after a return.
Reviewed-by: Christoph Hellwig Signed-off-by: Chaitanya Kulkarni Signed-off-by: Keith Busch --- drivers/nvme/target/configfs.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 98613a45bd3b..403508a52e17 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -395,14 +395,12 @@ static ssize_t nvmet_ns_device_uuid_store(struct config_item *item, struct nvmet_subsys *subsys = ns->subsys; int ret = 0; - mutex_lock(&subsys->lock); if (ns->enabled) { ret = -EBUSY; goto out_unlock; } - if (uuid_parse(page, &ns->uuid)) ret = -EINVAL; @@ -815,10 +813,10 @@ static ssize_t nvmet_subsys_attr_version_show(struct config_item *item, (int)NVME_MAJOR(subsys->ver), (int)NVME_MINOR(subsys->ver), (int)NVME_TERTIARY(subsys->ver)); - else - return snprintf(page, PAGE_SIZE, "%d.%d\n", - (int)NVME_MAJOR(subsys->ver), - (int)NVME_MINOR(subsys->ver)); + + return snprintf(page, PAGE_SIZE, "%d.%d\n", + (int)NVME_MAJOR(subsys->ver), + (int)NVME_MINOR(subsys->ver)); } static ssize_t nvmet_subsys_attr_version_store(struct config_item *item, @@ -828,7 +826,6 @@ static ssize_t nvmet_subsys_attr_version_store(struct config_item *item, int major, minor, tertiary = 0; int ret; - ret = sscanf(page, "%d.%d.%d\n", &major, &minor, &tertiary); if (ret != 2 && ret != 3) return -EINVAL; -- cgit v1.2.3-70-g09d2 From 94a39d61f80fcd679debda11e1ca02b88d90e67e Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Thu, 30 Jan 2020 10:29:31 -0800 Subject: nvmet: make ctrl-id configurable This patch adds new target subsys attributes which allow the user to optionally specify the range of target controller IDs, which are then used in nvmet_execute_identify_ctrl() to fill out the nvme_id_ctrl structure. For example, in a cluster setup with two nodes, with a dual-ported NVMe drive exported from both nodes, the connection to the host fails because both exports use the same controller ID, resulting in the following error message: "nvme nvmeX: Duplicate cntlid XXX with nvmeX, rejecting" With this patch the user can now partition the controller IDs for each subsystem by setting cntlid_min and cntlid_max. These values are used at the time of controller ID creation. Partitioning the ctrl-ids per subsystem results in disjoint ctrl-id spaces, which avoids the collision. When the new attributes are not specified, the target falls back to the original cntlid calculation method.
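As an illustration of the intended workflow, here is a minimal userspace sketch; the configfs mount point and the subsystem name "testnqn" are assumptions for the example, not part of the patch:

    /* Hedged example: node A claims cntlid 1..100; node B would use a
     * disjoint range such as 101..200. */
    #include <stdio.h>

    static int write_attr(const char *path, const char *val)
    {
        FILE *f = fopen(path, "w");

        if (!f)
            return -1;
        fprintf(f, "%s\n", val);
        return fclose(f);
    }

    int main(void)
    {
        const char *base = "/sys/kernel/config/nvmet/subsystems/testnqn";
        char path[256];

        snprintf(path, sizeof(path), "%s/attr_cntlid_min", base);
        if (write_attr(path, "1"))
            return 1;
        snprintf(path, sizeof(path), "%s/attr_cntlid_max", base);
        return write_attr(path, "100") ? 1 : 0;
    }

With disjoint [cntlid_min, cntlid_max] ranges on each node, ida_simple_get() can never hand out the same controller ID on both exports, which is what avoids the duplicate-cntlid rejection on the host.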
Reviewed-by: Christoph Hellwig Signed-off-by: Chaitanya Kulkarni Signed-off-by: Keith Busch --- drivers/nvme/target/configfs.c | 62 ++++++++++++++++++++++++++++++++++++++++++ drivers/nvme/target/core.c | 8 ++++-- drivers/nvme/target/nvmet.h | 2 ++ 3 files changed, 70 insertions(+), 2 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 403508a52e17..71c50751b5a6 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -859,10 +859,72 @@ static ssize_t nvmet_subsys_attr_serial_store(struct config_item *item, } CONFIGFS_ATTR(nvmet_subsys_, attr_serial); +static ssize_t nvmet_subsys_attr_cntlid_min_show(struct config_item *item, + char *page) +{ + return snprintf(page, PAGE_SIZE, "%u\n", to_subsys(item)->cntlid_min); +} + +static ssize_t nvmet_subsys_attr_cntlid_min_store(struct config_item *item, + const char *page, size_t cnt) +{ + u16 cntlid_min; + + if (sscanf(page, "%hu\n", &cntlid_min) != 1) + return -EINVAL; + + if (cntlid_min == 0) + return -EINVAL; + + down_write(&nvmet_config_sem); + if (cntlid_min >= to_subsys(item)->cntlid_max) + goto out_unlock; + to_subsys(item)->cntlid_min = cntlid_min; + up_write(&nvmet_config_sem); + return cnt; + +out_unlock: + up_write(&nvmet_config_sem); + return -EINVAL; +} +CONFIGFS_ATTR(nvmet_subsys_, attr_cntlid_min); + +static ssize_t nvmet_subsys_attr_cntlid_max_show(struct config_item *item, + char *page) +{ + return snprintf(page, PAGE_SIZE, "%u\n", to_subsys(item)->cntlid_max); +} + +static ssize_t nvmet_subsys_attr_cntlid_max_store(struct config_item *item, + const char *page, size_t cnt) +{ + u16 cntlid_max; + + if (sscanf(page, "%hu\n", &cntlid_max) != 1) + return -EINVAL; + + if (cntlid_max == 0) + return -EINVAL; + + down_write(&nvmet_config_sem); + if (cntlid_max <= to_subsys(item)->cntlid_min) + goto out_unlock; + to_subsys(item)->cntlid_max = cntlid_max; + up_write(&nvmet_config_sem); + return cnt; + +out_unlock: + up_write(&nvmet_config_sem); + return -EINVAL; +} +CONFIGFS_ATTR(nvmet_subsys_, attr_cntlid_max); + static struct configfs_attribute *nvmet_subsys_attrs[] = { &nvmet_subsys_attr_attr_allow_any_host, &nvmet_subsys_attr_attr_version, &nvmet_subsys_attr_attr_serial, + &nvmet_subsys_attr_attr_cntlid_min, + &nvmet_subsys_attr_attr_cntlid_max, NULL, }; diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 576de773b4db..48080c948692 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -1289,8 +1289,11 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, if (!ctrl->sqs) goto out_free_cqs; + if (subsys->cntlid_min > subsys->cntlid_max) + goto out_free_cqs; + ret = ida_simple_get(&cntlid_ida, - NVME_CNTLID_MIN, NVME_CNTLID_MAX, + subsys->cntlid_min, subsys->cntlid_max, GFP_KERNEL); if (ret < 0) { status = NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR; @@ -1438,7 +1441,8 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn, kfree(subsys); return ERR_PTR(-ENOMEM); } - + subsys->cntlid_min = NVME_CNTLID_MIN; + subsys->cntlid_max = NVME_CNTLID_MAX; kref_init(&subsys->ref); mutex_init(&subsys->lock); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index eda28b22a2c8..c2d518fb1789 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -211,6 +211,8 @@ struct nvmet_subsys { struct list_head namespaces; unsigned int nr_namespaces; unsigned int max_nsid; + u16 cntlid_min; + u16 cntlid_max; struct list_head ctrls; -- cgit v1.2.3-70-g09d2 From 
013b7ebe5a0d70e2a02fd225174595e79c591b3e Mon Sep 17 00:00:00 2001 From: Mark Ruijter Date: Thu, 30 Jan 2020 10:29:32 -0800 Subject: nvmet: make ctrl model configurable This patch adds a new target subsys attribute which allows the user to optionally specify a model name, which is then used in nvmet_execute_identify_ctrl() to fill out the nvme_id_ctrl structure. The default value for the model is set to "Linux" for backward compatibility. Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Mark Ruijter [chaitanya.kulkarni@wdc.com *Use macro for default model, coding style fixes. *Use RCU for accessing the model in configfs and in nvmet_execute_identify_ctrl(). ] Signed-off-by: Chaitanya Kulkarni Signed-off-by: Keith Busch --- drivers/nvme/target/admin-cmd.c | 17 +++++++++-- drivers/nvme/target/configfs.c | 66 +++++++++++++++++++++++++++++++++++++++++ drivers/nvme/target/core.c | 1 + drivers/nvme/target/nvmet.h | 8 +++++ 4 files changed, 90 insertions(+), 2 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 72a7e41f3018..19f949570625 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -322,12 +322,25 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req) nvmet_req_complete(req, NVME_SC_INVALID_FIELD | NVME_SC_DNR); } +static void nvmet_id_set_model_number(struct nvme_id_ctrl *id, + struct nvmet_subsys *subsys) +{ + const char *model = NVMET_DEFAULT_CTRL_MODEL; + struct nvmet_subsys_model *subsys_model; + + rcu_read_lock(); + subsys_model = rcu_dereference(subsys->model); + if (subsys_model) + model = subsys_model->number; + memcpy_and_pad(id->mn, sizeof(id->mn), model, strlen(model), ' '); + rcu_read_unlock(); +} + static void nvmet_execute_identify_ctrl(struct nvmet_req *req) { struct nvmet_ctrl *ctrl = req->sq->ctrl; struct nvme_id_ctrl *id; u16 status = 0; - const char model[] = "Linux"; id = kzalloc(sizeof(*id), GFP_KERNEL); if (!id) { @@ -342,7 +355,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) memset(id->sn, ' ', sizeof(id->sn)); bin2hex(id->sn, &ctrl->subsys->serial, min(sizeof(ctrl->subsys->serial), sizeof(id->sn) / 2)); - memcpy_and_pad(id->mn, sizeof(id->mn), model, sizeof(model) - 1, ' '); + nvmet_id_set_model_number(id, ctrl->subsys); memcpy_and_pad(id->fr, sizeof(id->fr), UTS_RELEASE, strlen(UTS_RELEASE), ' '); diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 71c50751b5a6..1654064deea5 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -919,12 +919,78 @@ out_unlock: } CONFIGFS_ATTR(nvmet_subsys_, attr_cntlid_max); +static ssize_t nvmet_subsys_attr_model_show(struct config_item *item, + char *page) +{ + struct nvmet_subsys *subsys = to_subsys(item); + struct nvmet_subsys_model *subsys_model; + char *model = NVMET_DEFAULT_CTRL_MODEL; + int ret; + + rcu_read_lock(); + subsys_model = rcu_dereference(subsys->model); + if (subsys_model) + model = subsys_model->number; + ret = snprintf(page, PAGE_SIZE, "%s\n", model); + rcu_read_unlock(); + + return ret; +} + +/* See Section 1.5 of NVMe 1.4 */ +static bool nvmet_is_ascii(const char c) +{ + return c >= 0x20 && c <= 0x7e; +} + +static ssize_t nvmet_subsys_attr_model_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_subsys *subsys = to_subsys(item); + struct nvmet_subsys_model *new_model; + char *new_model_number; + int pos = 0, len; + + len = strcspn(page, "\n"); + if (!len) + return
-EINVAL; + + for (pos = 0; pos < len; pos++) { + if (!nvmet_is_ascii(page[pos])) + return -EINVAL; + } + + new_model_number = kstrndup(page, len, GFP_KERNEL); + if (!new_model_number) + return -ENOMEM; + + new_model = kzalloc(sizeof(*new_model) + len + 1, GFP_KERNEL); + if (!new_model) { + kfree(new_model_number); + return -ENOMEM; + } + memcpy(new_model->number, new_model_number, len); + + down_write(&nvmet_config_sem); + mutex_lock(&subsys->lock); + new_model = rcu_replace_pointer(subsys->model, new_model, + mutex_is_locked(&subsys->lock)); + mutex_unlock(&subsys->lock); + up_write(&nvmet_config_sem); + + kfree_rcu(new_model, rcuhead); + + return count; +} +CONFIGFS_ATTR(nvmet_subsys_, attr_model); + static struct configfs_attribute *nvmet_subsys_attrs[] = { &nvmet_subsys_attr_attr_allow_any_host, &nvmet_subsys_attr_attr_version, &nvmet_subsys_attr_attr_serial, &nvmet_subsys_attr_attr_cntlid_min, &nvmet_subsys_attr_attr_cntlid_max, + &nvmet_subsys_attr_attr_model, NULL, }; diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 48080c948692..b685f99d56a1 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -1461,6 +1461,7 @@ static void nvmet_subsys_free(struct kref *ref) WARN_ON_ONCE(!list_empty(&subsys->namespaces)); kfree(subsys->subsysnqn); + kfree_rcu(subsys->model, rcuhead); kfree(subsys); } diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index c2d518fb1789..42ba2ddd9e96 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -23,6 +23,7 @@ #define NVMET_ASYNC_EVENTS 4 #define NVMET_ERROR_LOG_SLOTS 128 #define NVMET_NO_ERROR_LOC ((u16)-1) +#define NVMET_DEFAULT_CTRL_MODEL "Linux" /* * Supported optional AENs: @@ -202,6 +203,11 @@ struct nvmet_ctrl { struct nvme_error_slot slots[NVMET_ERROR_LOG_SLOTS]; }; +struct nvmet_subsys_model { + struct rcu_head rcuhead; + char number[]; +}; + struct nvmet_subsys { enum nvme_subsys_type type; @@ -229,6 +235,8 @@ struct nvmet_subsys { struct config_group namespaces_group; struct config_group allowed_hosts_group; + + struct nvmet_subsys_model __rcu *model; }; static inline struct nvmet_subsys *to_subsys(struct config_item *item) -- cgit v1.2.3-70-g09d2 From d3a9b0cadf8cea1746a6bf525d049198e705836a Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Thu, 30 Jan 2020 10:29:33 -0800 Subject: nvmet: check sscanf value for subsys serial attr For nvmet in configfs.c we check return values for all the sscanf() calls. Add similar check into the nvmet_subsys_attr_serial_store(). 
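The fix follows the standard parse-validate-commit idiom; the sketch below shows that idiom in standalone userspace C (an illustration, not the patch itself):

    #include <stdio.h>

    /* sscanf() returns the number of successful conversions, so any
     * result other than 1 here means the input was malformed. */
    static int parse_serial(const char *page, unsigned long long *serial)
    {
        unsigned long long val;

        if (sscanf(page, "%llx", &val) != 1)
            return -1;
        *serial = val;
        return 0;
    }

Parsing into a local first lets the store handler return -EINVAL on malformed input instead of silently leaving the attribute unchanged, and the shared field is written under the configuration lock only once the value is known to be good.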
Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Chaitanya Kulkarni Signed-off-by: Keith Busch --- drivers/nvme/target/configfs.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 1654064deea5..7aa10788b7c8 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -849,10 +849,13 @@ static ssize_t nvmet_subsys_attr_serial_show(struct config_item *item, static ssize_t nvmet_subsys_attr_serial_store(struct config_item *item, const char *page, size_t count) { - struct nvmet_subsys *subsys = to_subsys(item); + u64 serial; + + if (sscanf(page, "%llx\n", &serial) != 1) + return -EINVAL; down_write(&nvmet_config_sem); - sscanf(page, "%llx\n", &subsys->serial); + to_subsys(item)->serial = serial; up_write(&nvmet_config_sem); return count; -- cgit v1.2.3-70-g09d2 From 9912ade355902adb9dacbec640fac23c4e73019d Mon Sep 17 00:00:00 2001 From: "Wunderlich, Mark" Date: Thu, 16 Jan 2020 00:46:12 +0000 Subject: nvme-tcp: Set SO_PRIORITY for all host sockets Enable ability to associate all sockets related to NVMf TCP traffic to a priority group that will perform optimized network processing for this traffic class. Maintain initial default behavior of using priority of zero. Signed-off-by: Kiran Patil Signed-off-by: Mark Wunderlich Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/tcp.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 49d4373b84eb..e384239af880 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -20,6 +20,16 @@ struct nvme_tcp_queue; +/* Define the socket priority to use for connections were it is desirable + * that the NIC consider performing optimized packet processing or filtering. + * A non-zero value being sufficient to indicate general consideration of any + * possible optimization. Making it a module param allows for alternative + * values that may be unique for some NIC implementations. + */ +static int so_priority; +module_param(so_priority, int, 0644); +MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority"); + enum nvme_tcp_send_state { NVME_TCP_SEND_CMD_PDU = 0, NVME_TCP_SEND_H2C_PDU, @@ -1309,6 +1319,17 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, goto err_sock; } + if (so_priority > 0) { + ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_PRIORITY, + (char *)&so_priority, sizeof(so_priority)); + if (ret) { + dev_err(ctrl->ctrl.device, + "failed to set SO_PRIORITY sock opt, ret %d\n", + ret); + goto err_sock; + } + } + /* Set socket type of service */ if (nctrl->opts->tos >= 0) { opt = nctrl->opts->tos; -- cgit v1.2.3-70-g09d2 From 43cc66892e81bb05283159e489a19cec177e6f9d Mon Sep 17 00:00:00 2001 From: "Wunderlich, Mark" Date: Thu, 16 Jan 2020 00:46:16 +0000 Subject: nvmet-tcp: set SO_PRIORITY for accepted sockets Enable ability to associate all sockets related to NVMf TCP traffic to a priority group that will perform optimized network processing for this traffic class. Maintain initial default behavior of using priority of zero. 
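kernel_setsockopt() as used in these two patches is the in-kernel counterpart of the ordinary setsockopt() system call; this hedged userspace sketch shows the semantics of the option on a plain TCP socket:

    #include <stdio.h>
    #include <sys/socket.h>

    /* Tag a socket with a non-zero priority so qdiscs and NICs that
     * understand traffic classes can treat its packets preferentially.
     * The value 6 is only an example; 0 keeps the default behavior. */
    static int set_sock_priority(int fd)
    {
        int prio = 6;

        if (setsockopt(fd, SOL_SOCKET, SO_PRIORITY,
                       &prio, sizeof(prio)) < 0) {
            perror("setsockopt(SO_PRIORITY)");
            return -1;
        }
        return 0;
    }

The so_priority module parameter is this same knob applied uniformly to the target's listening socket and to every accepted queue socket.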
Signed-off-by: Kiran Patil Signed-off-by: Mark Wunderlich Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/tcp.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index af674fc0bb1e..cbff1038bdb3 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -19,6 +19,16 @@ #define NVMET_TCP_DEF_INLINE_DATA_SIZE (4 * PAGE_SIZE) +/* Define the socket priority to use for connections were it is desirable + * that the NIC consider performing optimized packet processing or filtering. + * A non-zero value being sufficient to indicate general consideration of any + * possible optimization. Making it a module param allows for alternative + * values that may be unique for some NIC implementations. + */ +static int so_priority; +module_param(so_priority, int, 0644); +MODULE_PARM_DESC(so_priority, "nvmet tcp socket optimize priority"); + #define NVMET_TCP_RECV_BUDGET 8 #define NVMET_TCP_SEND_BUDGET 8 #define NVMET_TCP_IO_WORK_BUDGET 64 @@ -1433,6 +1443,13 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue) if (ret) return ret; + if (so_priority > 0) { + ret = kernel_setsockopt(sock, SOL_SOCKET, SO_PRIORITY, + (char *)&so_priority, sizeof(so_priority)); + if (ret) + return ret; + } + /* Set socket type of service */ if (inet->rcv_tos > 0) { int tos = inet->rcv_tos; @@ -1622,6 +1639,15 @@ static int nvmet_tcp_add_port(struct nvmet_port *nport) goto err_sock; } + if (so_priority > 0) { + ret = kernel_setsockopt(port->sock, SOL_SOCKET, SO_PRIORITY, + (char *)&so_priority, sizeof(so_priority)); + if (ret) { + pr_err("failed to set SO_PRIORITY sock opt %d\n", ret); + goto err_sock; + } + } + ret = kernel_bind(port->sock, (struct sockaddr *)&port->addr, sizeof(port->addr)); if (ret) { -- cgit v1.2.3-70-g09d2 From a7afff31d56db22647251d76d6af030cd47bd97e Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 13 Mar 2020 13:31:00 -0700 Subject: scsi: treewide: Consolidate {get,put}_unaligned_[bl]e24() definitions Move the get_unaligned_be24(), get_unaligned_le24() and put_unaligned_le24() definitions from various drivers into include/linux/unaligned/generic.h. Add a put_unaligned_be24() implementation. Link: https://lore.kernel.org/r/20200313203102.16613-4-bvanassche@acm.org Cc: Keith Busch Cc: Sagi Grimberg Cc: Jens Axboe Cc: Harvey Harrison Cc: Martin K. Petersen Cc: Ingo Molnar Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Andrew Morton Reviewed-by: Christoph Hellwig Reviewed-by: Andy Shevchenko Reviewed-by: Greg Kroah-Hartman # For drivers/usb Reviewed-by: Felipe Balbi # For drivers/usb/gadget Signed-off-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/nvme/host/rdma.c | 8 ----- drivers/nvme/target/rdma.c | 6 ---- drivers/usb/gadget/function/f_mass_storage.c | 1 + drivers/usb/gadget/function/storage_common.h | 5 --- include/linux/unaligned/generic.h | 46 ++++++++++++++++++++++++++++ include/target/target_core_backend.h | 6 ---- 6 files changed, 47 insertions(+), 25 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 2a47c6c5007e..85e9bcbd44b4 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -142,14 +142,6 @@ static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc); static const struct blk_mq_ops nvme_rdma_mq_ops; static const struct blk_mq_ops nvme_rdma_admin_mq_ops; -/* XXX: really should move to a generic header sooner or later.. 
*/ -static inline void put_unaligned_le24(u32 val, u8 *p) -{ - *p++ = val; - *p++ = val >> 8; - *p++ = val >> 16; -} - static inline int nvme_rdma_queue_idx(struct nvme_rdma_queue *queue) { return queue - queue->ctrl->queues; diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 37d262a65877..8fcede75e02a 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -143,12 +143,6 @@ static int num_pages(int len) return 1 + (((len - 1) & PAGE_MASK) >> PAGE_SHIFT); } -/* XXX: really should move to a generic header sooner or later.. */ -static inline u32 get_unaligned_le24(const u8 *p) -{ - return (u32)p[0] | (u32)p[1] << 8 | (u32)p[2] << 16; -} - static inline bool nvmet_rdma_need_data_in(struct nvmet_rdma_rsp *rsp) { return nvme_is_write(rsp->req.cmd) && diff --git a/drivers/usb/gadget/function/f_mass_storage.c b/drivers/usb/gadget/function/f_mass_storage.c index 7c96c4665178..950d2a85f098 100644 --- a/drivers/usb/gadget/function/f_mass_storage.c +++ b/drivers/usb/gadget/function/f_mass_storage.c @@ -216,6 +216,7 @@ #include #include #include +#include #include #include diff --git a/drivers/usb/gadget/function/storage_common.h b/drivers/usb/gadget/function/storage_common.h index e5e3a2553aaa..bdeb1e233fc9 100644 --- a/drivers/usb/gadget/function/storage_common.h +++ b/drivers/usb/gadget/function/storage_common.h @@ -172,11 +172,6 @@ enum data_direction { DATA_DIR_NONE }; -static inline u32 get_unaligned_be24(u8 *buf) -{ - return 0xffffff & (u32) get_unaligned_be32(buf - 1); -} - static inline struct fsg_lun *fsg_lun_from_dev(struct device *dev) { return container_of(dev, struct fsg_lun, dev); diff --git a/include/linux/unaligned/generic.h b/include/linux/unaligned/generic.h index 57d3114656e5..303289492859 100644 --- a/include/linux/unaligned/generic.h +++ b/include/linux/unaligned/generic.h @@ -2,6 +2,8 @@ #ifndef _LINUX_UNALIGNED_GENERIC_H #define _LINUX_UNALIGNED_GENERIC_H +#include + /* * Cause a link-time error if we try an unaligned access other than * 1,2,4 or 8 bytes long @@ -66,4 +68,48 @@ extern void __bad_unaligned_access_size(void); } \ (void)0; }) +static inline u32 __get_unaligned_be24(const u8 *p) +{ + return p[0] << 16 | p[1] << 8 | p[2]; +} + +static inline u32 get_unaligned_be24(const void *p) +{ + return __get_unaligned_be24(p); +} + +static inline u32 __get_unaligned_le24(const u8 *p) +{ + return p[0] | p[1] << 8 | p[2] << 16; +} + +static inline u32 get_unaligned_le24(const void *p) +{ + return __get_unaligned_le24(p); +} + +static inline void __put_unaligned_be24(const u32 val, u8 *p) +{ + *p++ = val >> 16; + *p++ = val >> 8; + *p++ = val; +} + +static inline void put_unaligned_be24(const u32 val, void *p) +{ + __put_unaligned_be24(val, p); +} + +static inline void __put_unaligned_le24(const u32 val, u8 *p) +{ + *p++ = val; + *p++ = val >> 8; + *p++ = val >> 16; +} + +static inline void put_unaligned_le24(const u32 val, void *p) +{ + __put_unaligned_le24(val, p); +} + #endif /* _LINUX_UNALIGNED_GENERIC_H */ diff --git a/include/target/target_core_backend.h b/include/target/target_core_backend.h index 51b6f50eabee..1b752d8ea529 100644 --- a/include/target/target_core_backend.h +++ b/include/target/target_core_backend.h @@ -116,10 +116,4 @@ static inline bool target_dev_configured(struct se_device *se_dev) return !!(se_dev->dev_flags & DF_CONFIGURED); } -/* Only use get_unaligned_be24() if reading p - 1 is allowed. 
*/ -static inline uint32_t get_unaligned_be24(const uint8_t *const p) -{ - return get_unaligned_be32(p - 1) & 0xffffffU; -} - #endif /* TARGET_CORE_BACKEND_H */ -- cgit v1.2.3-70-g09d2 From cb224c3af4df5996246eb779a23d968a8aabad28 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Fri, 13 Mar 2020 05:30:08 +0000 Subject: nvme: Convert to use set_capacity_revalidate_and_notify block/genhd provides set_capacity_revalidate_and_notify() for sending RESIZE notifications via uevents. This notification is newly added to NVME devices Signed-off-by: Balbir Singh Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Acked-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index a4d8c90ee7cc..41ad07f6a564 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1810,7 +1810,7 @@ static void nvme_update_disk_info(struct gendisk *disk, ns->lba_shift > PAGE_SHIFT) capacity = 0; - set_capacity(disk, capacity); + set_capacity_revalidate_and_notify(disk, capacity, false); nvme_config_discard(disk, ns); nvme_config_write_zeroes(disk, ns); -- cgit v1.2.3-70-g09d2 From c6a564ffadc9105880329710164ee493f0de103c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Mar 2020 16:48:42 +0100 Subject: block: move the part_stat* helpers from genhd.h to a new header These macros are just used by a few files. Move them out of genhd.h, which is included everywhere into a new standalone header. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk.h | 1 + drivers/block/drbd/drbd_receiver.c | 1 + drivers/block/drbd/drbd_worker.c | 1 + drivers/block/zram/zram_drv.c | 1 + drivers/md/dm.c | 1 + drivers/md/md.c | 1 + drivers/nvme/target/admin-cmd.c | 1 + fs/ext4/super.c | 2 +- fs/ext4/sysfs.c | 1 + fs/f2fs/f2fs.h | 1 + fs/f2fs/super.c | 1 + include/linux/genhd.h | 108 ---------------------------------- include/linux/part_stat.h | 115 +++++++++++++++++++++++++++++++++++++ 13 files changed, 126 insertions(+), 109 deletions(-) create mode 100644 include/linux/part_stat.h (limited to 'drivers/nvme') diff --git a/block/blk.h b/block/blk.h index ac20f972842e..d9673164a145 100644 --- a/block/blk.h +++ b/block/blk.h @@ -4,6 +4,7 @@ #include #include +#include #include #include "blk-mq.h" #include "blk-mq-sched.h" diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 79e216446030..c15e7083b13a 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -33,6 +33,7 @@ #include #include #include +#include #include "drbd_int.h" #include "drbd_protocol.h" #include "drbd_req.h" diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index b7f605c6e231..0dc019da1f8d 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "drbd_int.h" #include "drbd_protocol.h" diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 1bdb5793842b..76e75097e9ab 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -33,6 +33,7 @@ #include #include #include +#include #include "zram_drv.h" diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 0413018c8305..0d881cfa160b 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -25,6 +25,7 @@ #include #include #include +#include #define DM_MSG_PREFIX "core" diff --git 
a/drivers/md/md.c b/drivers/md/md.c index 1c7193715f07..f6cf3b53f6c1 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -61,6 +61,7 @@ #include #include #include +#include #include #include "md.h" diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 72a7e41f3018..cca759c918a4 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -6,6 +6,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include +#include #include #include diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 4cbd1cc34dfa..c8dff4c68141 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -43,7 +43,7 @@ #include #include #include - +#include #include #include diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index d218ebdafa4a..04bfaf63752c 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "ext4.h" #include "ext4_jbd2.h" diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 5355be6b6755..088c3e7a1080 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -22,6 +22,7 @@ #include #include #include +#include #include #include diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 65a7a432dfee..d398b2d90c6c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "f2fs.h" #include "node.h" diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 14354a6e89c2..927eed0be179 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -298,114 +298,6 @@ extern void disk_part_iter_init(struct disk_part_iter *piter, extern struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter); extern void disk_part_iter_exit(struct disk_part_iter *piter); -/* - * Macros to operate on percpu disk statistics: - * - * {disk|part|all}_stat_{add|sub|inc|dec}() modify the stat counters - * and should be called between disk_stat_lock() and - * disk_stat_unlock(). - * - * part_stat_read() can be called at any time. - * - * part_stat_{add|set_all}() and {init|free}_part_stats are for - * internal use only. 
- */ -#ifdef CONFIG_SMP -#define part_stat_lock() ({ rcu_read_lock(); get_cpu(); }) -#define part_stat_unlock() do { put_cpu(); rcu_read_unlock(); } while (0) - -#define part_stat_get_cpu(part, field, cpu) \ - (per_cpu_ptr((part)->dkstats, (cpu))->field) - -#define part_stat_get(part, field) \ - part_stat_get_cpu(part, field, smp_processor_id()) - -#define part_stat_read(part, field) \ -({ \ - typeof((part)->dkstats->field) res = 0; \ - unsigned int _cpu; \ - for_each_possible_cpu(_cpu) \ - res += per_cpu_ptr((part)->dkstats, _cpu)->field; \ - res; \ -}) - -static inline void part_stat_set_all(struct hd_struct *part, int value) -{ - int i; - - for_each_possible_cpu(i) - memset(per_cpu_ptr(part->dkstats, i), value, - sizeof(struct disk_stats)); -} - -static inline int init_part_stats(struct hd_struct *part) -{ - part->dkstats = alloc_percpu(struct disk_stats); - if (!part->dkstats) - return 0; - return 1; -} - -static inline void free_part_stats(struct hd_struct *part) -{ - free_percpu(part->dkstats); -} - -#else /* !CONFIG_SMP */ -#define part_stat_lock() ({ rcu_read_lock(); 0; }) -#define part_stat_unlock() rcu_read_unlock() - -#define part_stat_get(part, field) ((part)->dkstats.field) -#define part_stat_get_cpu(part, field, cpu) part_stat_get(part, field) -#define part_stat_read(part, field) part_stat_get(part, field) - -static inline void part_stat_set_all(struct hd_struct *part, int value) -{ - memset(&part->dkstats, value, sizeof(struct disk_stats)); -} - -static inline int init_part_stats(struct hd_struct *part) -{ - return 1; -} - -static inline void free_part_stats(struct hd_struct *part) -{ -} - -#endif /* CONFIG_SMP */ - -#define part_stat_read_accum(part, field) \ - (part_stat_read(part, field[STAT_READ]) + \ - part_stat_read(part, field[STAT_WRITE]) + \ - part_stat_read(part, field[STAT_DISCARD])) - -#define __part_stat_add(part, field, addnd) \ - (part_stat_get(part, field) += (addnd)) - -#define part_stat_add(part, field, addnd) do { \ - __part_stat_add((part), field, addnd); \ - if ((part)->partno) \ - __part_stat_add(&part_to_disk((part))->part0, \ - field, addnd); \ -} while (0) - -#define part_stat_dec(gendiskp, field) \ - part_stat_add(gendiskp, field, -1) -#define part_stat_inc(gendiskp, field) \ - part_stat_add(gendiskp, field, 1) -#define part_stat_sub(gendiskp, field, subnd) \ - part_stat_add(gendiskp, field, -subnd) - -#define part_stat_local_dec(gendiskp, field) \ - local_dec(&(part_stat_get(gendiskp, field))) -#define part_stat_local_inc(gendiskp, field) \ - local_inc(&(part_stat_get(gendiskp, field))) -#define part_stat_local_read(gendiskp, field) \ - local_read(&(part_stat_get(gendiskp, field))) -#define part_stat_local_read_cpu(gendiskp, field, cpu) \ - local_read(&(part_stat_get_cpu(gendiskp, field, cpu))) - /* block/genhd.c */ extern void device_add_disk(struct device *parent, struct gendisk *disk, const struct attribute_group **groups); diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h new file mode 100644 index 000000000000..ece607607a86 --- /dev/null +++ b/include/linux/part_stat.h @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_PART_STAT_H +#define _LINUX_PART_STAT_H + +#include + +/* + * Macros to operate on percpu disk statistics: + * + * {disk|part|all}_stat_{add|sub|inc|dec}() modify the stat counters + * and should be called between disk_stat_lock() and + * disk_stat_unlock(). + * + * part_stat_read() can be called at any time. 
+ * + * part_stat_{add|set_all}() and {init|free}_part_stats are for + * internal use only. + */ +#ifdef CONFIG_SMP +#define part_stat_lock() ({ rcu_read_lock(); get_cpu(); }) +#define part_stat_unlock() do { put_cpu(); rcu_read_unlock(); } while (0) + +#define part_stat_get_cpu(part, field, cpu) \ + (per_cpu_ptr((part)->dkstats, (cpu))->field) + +#define part_stat_get(part, field) \ + part_stat_get_cpu(part, field, smp_processor_id()) + +#define part_stat_read(part, field) \ +({ \ + typeof((part)->dkstats->field) res = 0; \ + unsigned int _cpu; \ + for_each_possible_cpu(_cpu) \ + res += per_cpu_ptr((part)->dkstats, _cpu)->field; \ + res; \ +}) + +static inline void part_stat_set_all(struct hd_struct *part, int value) +{ + int i; + + for_each_possible_cpu(i) + memset(per_cpu_ptr(part->dkstats, i), value, + sizeof(struct disk_stats)); +} + +static inline int init_part_stats(struct hd_struct *part) +{ + part->dkstats = alloc_percpu(struct disk_stats); + if (!part->dkstats) + return 0; + return 1; +} + +static inline void free_part_stats(struct hd_struct *part) +{ + free_percpu(part->dkstats); +} + +#else /* !CONFIG_SMP */ +#define part_stat_lock() ({ rcu_read_lock(); 0; }) +#define part_stat_unlock() rcu_read_unlock() + +#define part_stat_get(part, field) ((part)->dkstats.field) +#define part_stat_get_cpu(part, field, cpu) part_stat_get(part, field) +#define part_stat_read(part, field) part_stat_get(part, field) + +static inline void part_stat_set_all(struct hd_struct *part, int value) +{ + memset(&part->dkstats, value, sizeof(struct disk_stats)); +} + +static inline int init_part_stats(struct hd_struct *part) +{ + return 1; +} + +static inline void free_part_stats(struct hd_struct *part) +{ +} + +#endif /* CONFIG_SMP */ + +#define part_stat_read_accum(part, field) \ + (part_stat_read(part, field[STAT_READ]) + \ + part_stat_read(part, field[STAT_WRITE]) + \ + part_stat_read(part, field[STAT_DISCARD])) + +#define __part_stat_add(part, field, addnd) \ + (part_stat_get(part, field) += (addnd)) + +#define part_stat_add(part, field, addnd) do { \ + __part_stat_add((part), field, addnd); \ + if ((part)->partno) \ + __part_stat_add(&part_to_disk((part))->part0, \ + field, addnd); \ +} while (0) + +#define part_stat_dec(gendiskp, field) \ + part_stat_add(gendiskp, field, -1) +#define part_stat_inc(gendiskp, field) \ + part_stat_add(gendiskp, field, 1) +#define part_stat_sub(gendiskp, field, subnd) \ + part_stat_add(gendiskp, field, -subnd) + +#define part_stat_local_dec(gendiskp, field) \ + local_dec(&(part_stat_get(gendiskp, field))) +#define part_stat_local_inc(gendiskp, field) \ + local_inc(&(part_stat_get(gendiskp, field))) +#define part_stat_local_read(gendiskp, field) \ + local_read(&(part_stat_get(gendiskp, field))) +#define part_stat_local_read_cpu(gendiskp, field, cpu) \ + local_read(&(part_stat_get_cpu(gendiskp, field, cpu))) + +#endif /* _LINUX_PART_STAT_H */ -- cgit v1.2.3-70-g09d2 From 76171c6cdf832bc18b6d8207c9be94d78e54ed09 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Fri, 7 Feb 2020 17:13:53 -0800 Subject: nvme: expose hostnqn via sysfs for fabrics controllers We allow userspace to connect with a custom hostnqn which is useful for certain use-cases. However there is no way to tell what is the hostnqn used to connect to a given controller. Expose this so userspace can correlate controllers based on hostnqn. 
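Once the attribute exists, correlating controllers from userspace is a one-line read; a minimal sketch, assuming a fabrics controller named "nvme0" (the name is an example only, and the attribute is visible only when the controller has connection options):

    #include <stdio.h>

    int main(void)
    {
        char nqn[256];
        FILE *f = fopen("/sys/class/nvme/nvme0/hostnqn", "r");

        if (!f)
            return 1;
        if (fgets(nqn, sizeof(nqn), f))
            printf("nvme0 hostnqn: %s", nqn); /* sysfs value ends in '\n' */
        fclose(f);
        return 0;
    }

Controllers that report the same hostnqn (and, after the next patch, the same hostid) were created by the same host instance.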
Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 414076aaf52b..4633acc0e68f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3242,6 +3242,16 @@ static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev, } static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL); +static ssize_t nvme_sysfs_show_hostnqn(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->opts->host->nqn); +} +static DEVICE_ATTR(hostnqn, S_IRUGO, nvme_sysfs_show_hostnqn, NULL); + static ssize_t nvme_sysfs_show_address(struct device *dev, struct device_attribute *attr, char *buf) @@ -3267,6 +3277,7 @@ static struct attribute *nvme_dev_attrs[] = { &dev_attr_numa_node.attr, &dev_attr_queue_count.attr, &dev_attr_sqsize.attr, + &dev_attr_hostnqn.attr, NULL }; @@ -3280,6 +3291,8 @@ static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj, return 0; if (a == &dev_attr_address.attr && !ctrl->ops->get_address) return 0; + if (a == &dev_attr_hostnqn.attr && !ctrl->opts) + return 0; return a->mode; } -- cgit v1.2.3-70-g09d2 From 45fb19f766d94a642cd820fe523ac29f502eece2 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Fri, 7 Feb 2020 17:13:54 -0800 Subject: nvme: expose hostid via sysfs for fabrics controllers We allow userspace to connect with a custom hostid which is useful for certain use-cases. However there is no way to tell what is the hostid used to connect to a given controller. Expose this so userspace can correlate controllers based on hostid. Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 4633acc0e68f..720840ca875c 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3252,6 +3252,16 @@ static ssize_t nvme_sysfs_show_hostnqn(struct device *dev, } static DEVICE_ATTR(hostnqn, S_IRUGO, nvme_sysfs_show_hostnqn, NULL); +static ssize_t nvme_sysfs_show_hostid(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + return snprintf(buf, PAGE_SIZE, "%pU\n", &ctrl->opts->host->id); +} +static DEVICE_ATTR(hostid, S_IRUGO, nvme_sysfs_show_hostid, NULL); + static ssize_t nvme_sysfs_show_address(struct device *dev, struct device_attribute *attr, char *buf) @@ -3278,6 +3288,7 @@ static struct attribute *nvme_dev_attrs[] = { &dev_attr_queue_count.attr, &dev_attr_sqsize.attr, &dev_attr_hostnqn.attr, + &dev_attr_hostid.attr, NULL }; @@ -3293,6 +3304,8 @@ static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj, return 0; if (a == &dev_attr_hostnqn.attr && !ctrl->opts) return 0; + if (a == &dev_attr_hostid.attr && !ctrl->opts) + return 0; return a->mode; } -- cgit v1.2.3-70-g09d2 From 228914504cecd1c7d1279b5b884ab55d474cc87e Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Tue, 11 Feb 2020 13:41:36 +0100 Subject: nvme: Don't deter users from enabling hwmon support I see no good reason for the "If unsure, say N" advice in the description of the NVME_HWMON configuration option.
It is not dangerous, it does not select any other option, and has a fairly low overhead. As the option is already not enabled by default, further suggesting that hesitant users not enable it is not useful anyway. Unlike some other options where the description alone may not be sufficient for users to make a decision, NVME_HWMON is pretty simple to grasp in my opinion, so just let the user do what they want. Signed-off-by: Jean Delvare Reviewed-by: Chaitanya Kulkarni Reviewed-by: Guenter Roeck Cc: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/Kconfig | 2 -- 1 file changed, 2 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index b9358db83e96..9c17ed32be64 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -32,8 +32,6 @@ config NVME_HWMON a hardware monitoring device will be created for each NVMe drive in the system. - If unsure, say N. - config NVME_FABRICS tristate -- cgit v1.2.3-70-g09d2 From ad95a613ea447e2404e343ab3636c4d960fa9580 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 19 Feb 2020 08:14:31 -0800 Subject: nvme: code cleanup nvme_identify_ns_desc() The function nvme_identify_ns_desc() has 3 levels of nesting, which makes the error messages exceed 80 characters per line; this is not aligned with the kernel coding standards or the rest of the NVMe subsystem code. Add a helper function that handles the processing of the log when the command is successful, reducing the nesting and keeping the code under 80 characters per line. Reviewed-by: Christoph Hellwig Signed-off-by: Chaitanya Kulkarni Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 76 +++++++++++++++++++++++++----------------------- 1 file changed, 40 insertions(+), 36 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 720840ca875c..c4dbc852b5e9 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1055,6 +1055,43 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) return error; } +static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids, + struct nvme_ns_id_desc *cur) +{ + const char *warn_str = "ctrl returned bogus length:"; + void *data = cur; + + switch (cur->nidt) { + case NVME_NIDT_EUI64: + if (cur->nidl != NVME_NIDT_EUI64_LEN) { + dev_warn(ctrl->device, "%s %d for NVME_NIDT_EUI64\n", + warn_str, cur->nidl); + return -1; + } + memcpy(ids->eui64, data + sizeof(*cur), NVME_NIDT_EUI64_LEN); + return NVME_NIDT_EUI64_LEN; + case NVME_NIDT_NGUID: + if (cur->nidl != NVME_NIDT_NGUID_LEN) { + dev_warn(ctrl->device, "%s %d for NVME_NIDT_NGUID\n", + warn_str, cur->nidl); + return -1; + } + memcpy(ids->nguid, data + sizeof(*cur), NVME_NIDT_NGUID_LEN); + return NVME_NIDT_NGUID_LEN; + case NVME_NIDT_UUID: + if (cur->nidl != NVME_NIDT_UUID_LEN) { + dev_warn(ctrl->device, "%s %d for NVME_NIDT_UUID\n", + warn_str, cur->nidl); + return -1; + } + uuid_copy(&ids->uuid, data + sizeof(*cur)); + return NVME_NIDT_UUID_LEN; + default: + /* Skip unknown types */ + return cur->nidl; + } +} + static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, struct nvme_ns_ids *ids) { @@ -1083,42 +1120,9 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, if (cur->nidl == 0) break; - switch (cur->nidt) { - case NVME_NIDT_EUI64: - if (cur->nidl != NVME_NIDT_EUI64_LEN) { - dev_warn(ctrl->device, - "ctrl returned bogus length: %d for NVME_NIDT_EUI64\n", - cur->nidl); - goto free_data; - } - len
= NVME_NIDT_EUI64_LEN; - memcpy(ids->eui64, data + pos + sizeof(*cur), len); - break; - case NVME_NIDT_NGUID: - if (cur->nidl != NVME_NIDT_NGUID_LEN) { - dev_warn(ctrl->device, - "ctrl returned bogus length: %d for NVME_NIDT_NGUID\n", - cur->nidl); - goto free_data; - } - len = NVME_NIDT_NGUID_LEN; - memcpy(ids->nguid, data + pos + sizeof(*cur), len); - break; - case NVME_NIDT_UUID: - if (cur->nidl != NVME_NIDT_UUID_LEN) { - dev_warn(ctrl->device, - "ctrl returned bogus length: %d for NVME_NIDT_UUID\n", - cur->nidl); - goto free_data; - } - len = NVME_NIDT_UUID_LEN; - uuid_copy(&ids->uuid, data + pos + sizeof(*cur)); - break; - default: - /* Skip unknown types */ - len = cur->nidl; - break; - } + len = nvme_process_ns_desc(ctrl, ids, cur); + if (len < 0) + goto free_data; len += sizeof(*cur); } -- cgit v1.2.3-70-g09d2 From 94d2e705b6a6fe9c56a990c0cd31a7298cfcee9a Mon Sep 17 00:00:00 2001 From: Rupesh Girase Date: Thu, 27 Feb 2020 22:15:26 +0530 Subject: nvme: log additional message for controller status Log the controller status to know more about issue if it lies within kernel nvme subsytem or controller is unhealthy. Signed-off-by: Rupesh Girase Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c4dbc852b5e9..c9988942d0aa 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2083,8 +2083,8 @@ static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled) return -EINTR; if (time_after(jiffies, timeout)) { dev_err(ctrl->device, - "Device not ready; aborting %s\n", enabled ? - "initialisation" : "reset"); + "Device not ready; aborting %s, CSTS=0x%x\n", + enabled ? "initialisation" : "reset", csts); return -ENODEV; } } -- cgit v1.2.3-70-g09d2 From 3e98c2443f5c7f127b5b7492a3089e92a1c85112 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Fri, 28 Feb 2020 18:52:28 -0800 Subject: nvme: Check for readiness more quickly, to speed up boot time After initialization, nvme_wait_ready checks for readiness every 100ms, even though the drive may be ready far sooner than that. This delays system boot by hundreds of milliseconds. Reduce the delay, checking for readiness every millisecond instead. Boot-time tests on an AWS c5.12xlarge: Before: [ 0.546936] initcall nvme_init+0x0/0x5b returned 0 after 37 usecs ... [ 0.764178] nvme nvme0: 2/0/0 default/read/poll queues [ 0.768424] nvme0n1: p1 [ 0.774132] EXT4-fs (nvme0n1p1): mounted filesystem with ordered data mode. Opts: (null) [ 0.774146] VFS: Mounted root (ext4 filesystem) on device 259:1. ... [ 0.788141] Run /sbin/init as init process After: [ 0.537088] initcall nvme_init+0x0/0x5b returned 0 after 37 usecs ... [ 0.543457] nvme nvme0: 2/0/0 default/read/poll queues [ 0.548473] nvme0n1: p1 [ 0.554339] EXT4-fs (nvme0n1p1): mounted filesystem with ordered data mode. Opts: (null) [ 0.554344] VFS: Mounted root (ext4 filesystem) on device 259:1. ... 
[ 0.567931] Run /sbin/init as init process Signed-off-by: Josh Triplett Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c9988942d0aa..0e38e07a302f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2078,7 +2078,7 @@ static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled) if ((csts & NVME_CSTS_RDY) == bit) break; - msleep(100); + usleep_range(1000, 2000); if (fatal_signal_pending(current)) return -EINTR; if (time_after(jiffies, timeout)) { -- cgit v1.2.3-70-g09d2 From 6d525f9755c2ce444de2f3d604d41fbe4df91a8c Mon Sep 17 00:00:00 2001 From: Amit Engel Date: Sat, 29 Feb 2020 16:28:41 -0800 Subject: nvmet: check ncqr & nsqr for set-features cmd For set feature command when setting up NVME_FEAT_NUM_QUEUES, check Number of I/O Completion Queues Requested (NCQR) and Number of I/O Submission Queues Requested (NSQR) before we proceed, for invalid values (i.e. 65535) return an appropriate NVMe invalid field status. Signed-off-by: Amit Engel Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/admin-cmd.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 19f949570625..c0aa9c34c699 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -733,13 +733,22 @@ static void nvmet_execute_set_features(struct nvmet_req *req) { struct nvmet_subsys *subsys = req->sq->ctrl->subsys; u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10); + u32 cdw11 = le32_to_cpu(req->cmd->common.cdw11); u16 status = 0; + u16 nsqr; + u16 ncqr; if (!nvmet_check_data_len(req, 0)) return; switch (cdw10 & 0xff) { case NVME_FEAT_NUM_QUEUES: + ncqr = (cdw11 >> 16) & 0xffff; + nsqr = cdw11 & 0xffff; + if (ncqr == 0xffff || nsqr == 0xffff) { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } nvmet_set_result(req, (subsys->max_qid - 1) | ((subsys->max_qid - 1) << 16)); break; -- cgit v1.2.3-70-g09d2 From e2a366a4b0feaeba8f0bf6091ddd2ac27507a9d3 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 28 Feb 2020 21:45:19 +0300 Subject: nvme-pci: slimmer CQ head update Update CQ head with pre-increment operator. This saves subtraction of 1 and a few registers. Also update phase with "^= 1". This generates only one RMW instruction. 
ffffffff815ba150 : ffffffff815ba150: 0f b7 47 70 movzx eax,WORD PTR [rdi+0x70] ffffffff815ba154: 83 c0 01 add eax,0x1 ffffffff815ba157: 66 89 47 70 mov WORD PTR [rdi+0x70],ax ffffffff815ba15b: 66 3b 47 68 cmp ax,WORD PTR [rdi+0x68] ffffffff815ba15f: 74 01 je ffffffff815ba162 ffffffff815ba161: c3 ret ffffffff815ba162: 31 c0 xor eax,eax ffffffff815ba164: 80 77 74 01 ===> xor BYTE PTR [rdi+0x74],0x1 ffffffff815ba168: 66 89 47 70 mov WORD PTR [rdi+0x70],ax ffffffff815ba16c: c3 ret add/remove: 0/0 grow/shrink: 0/3 up/down: 0/-119 (-119) Function old new delta nvme_poll 690 678 -12 nvme_dev_disable 1230 1177 -53 nvme_irq 613 559 -54 Signed-off-by: Alexey Dobriyan --- drivers/nvme/host/pci.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index d3f23d6254e4..cdc9b6149d38 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -982,11 +982,9 @@ static void nvme_complete_cqes(struct nvme_queue *nvmeq, u16 start, u16 end) static inline void nvme_update_cq_head(struct nvme_queue *nvmeq) { - if (nvmeq->cq_head == nvmeq->q_depth - 1) { + if (++nvmeq->cq_head == nvmeq->q_depth) { nvmeq->cq_head = 0; - nvmeq->cq_phase = !nvmeq->cq_phase; - } else { - nvmeq->cq_head++; + nvmeq->cq_phase ^= 1; } } -- cgit v1.2.3-70-g09d2 From bf392a5dc02a9b796f3da89fc5bb42856aca64cb Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 2 Mar 2020 08:45:04 -0800 Subject: nvme-pci: Remove tag from process cq The only user for tagged completion was for timeout handling. That user, though, really only cares if the timed out command is completed, which we can safely check within the timeout handler. Remove the tag check to simplify completion handling. Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index cdc9b6149d38..98d8ddd7aa0f 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -989,14 +989,13 @@ static inline void nvme_update_cq_head(struct nvme_queue *nvmeq) } static inline int nvme_process_cq(struct nvme_queue *nvmeq, u16 *start, - u16 *end, unsigned int tag) + u16 *end) { int found = 0; *start = nvmeq->cq_head; while (nvme_cqe_pending(nvmeq)) { - if (tag == -1U || nvmeq->cqes[nvmeq->cq_head].command_id == tag) - found++; + found++; nvme_update_cq_head(nvmeq); } *end = nvmeq->cq_head; @@ -1017,7 +1016,7 @@ static irqreturn_t nvme_irq(int irq, void *data) * the irq handler, even if that was on another CPU. */ rmb(); - nvme_process_cq(nvmeq, &start, &end, -1); + nvme_process_cq(nvmeq, &start, &end); wmb(); if (start != end) { @@ -1040,7 +1039,7 @@ static irqreturn_t nvme_irq_check(int irq, void *data) * Poll for completions any queue, including those not dedicated to polling. * Can be called from any context. 
*/ -static int nvme_poll_irqdisable(struct nvme_queue *nvmeq, unsigned int tag) +static int nvme_poll_irqdisable(struct nvme_queue *nvmeq) { struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev); u16 start, end; @@ -1053,11 +1052,11 @@ static int nvme_poll_irqdisable(struct nvme_queue *nvmeq, unsigned int tag) */ if (test_bit(NVMEQ_POLLED, &nvmeq->flags)) { spin_lock(&nvmeq->cq_poll_lock); - found = nvme_process_cq(nvmeq, &start, &end, tag); + found = nvme_process_cq(nvmeq, &start, &end); spin_unlock(&nvmeq->cq_poll_lock); } else { disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); - found = nvme_process_cq(nvmeq, &start, &end, tag); + found = nvme_process_cq(nvmeq, &start, &end); enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); } @@ -1075,8 +1074,7 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx) return 0; spin_lock(&nvmeq->cq_poll_lock); - found = nvme_process_cq(nvmeq, &start, &end, -1); - nvme_complete_cqes(nvmeq, start, end); + found = nvme_process_cq(nvmeq, &start, &end); spin_unlock(&nvmeq->cq_poll_lock); return found; @@ -1253,7 +1251,8 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) /* * Did we miss an interrupt? */ - if (nvme_poll_irqdisable(nvmeq, req->tag)) { + nvme_poll_irqdisable(nvmeq); + if (blk_mq_request_completed(req)) { dev_warn(dev->ctrl.device, "I/O %d QID %d timeout, completion polled\n", req->tag, nvmeq->qid); @@ -1396,7 +1395,7 @@ static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown) else nvme_disable_ctrl(&dev->ctrl); - nvme_poll_irqdisable(nvmeq, -1); + nvme_poll_irqdisable(nvmeq); } /* @@ -1411,7 +1410,7 @@ static void nvme_reap_pending_cqes(struct nvme_dev *dev) int i; for (i = dev->ctrl.queue_count - 1; i > 0; i--) { - nvme_process_cq(&dev->queues[i], &start, &end, -1); + nvme_process_cq(&dev->queues[i], &start, &end); nvme_complete_cqes(&dev->queues[i], start, end); } } -- cgit v1.2.3-70-g09d2 From 324b494c286298d51bc5ed5107644ebe23f9dad6 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 2 Mar 2020 08:56:53 -0800 Subject: nvme-pci: Remove two-pass completions Completion handling had been done in two steps: find all new completions under a lock, then handle those completions outside the lock. This was done to make the locked section as short as possible so that other threads using the same lock wait less time. The driver no longer shares locks during completion, and is in fact lockless for interrupt driven queues, so the optimization no longer serves its original purpose. Replace the two-pass completion queue handler with a single pass that completes entries immediately. 
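The result has the shape of a classic single-pass ring consumer. The sketch below mirrors that shape with placeholder types and helpers (struct cq, cqe_is_new(), handle_cqe() and ring_doorbell() are stand-ins for illustration, not the driver's symbols):

    #include <stdint.h>

    struct cqe { uint16_t status; };        /* bit 0 carries the phase */
    struct cq {
        struct cqe *entries;
        uint16_t head, depth, phase;
    };

    /* An entry is new when its phase bit matches the queue's phase. */
    static int cqe_is_new(struct cq *q)
    {
        return (q->entries[q->head].status & 1) == q->phase;
    }

    static void handle_cqe(struct cq *q, uint16_t idx)
    {
        (void)q; (void)idx;                 /* complete the request here */
    }

    static void ring_doorbell(struct cq *q)
    {
        (void)q;                            /* one MMIO write per sweep */
    }

    static int process_cq(struct cq *q)
    {
        int found = 0;

        while (cqe_is_new(q)) {
            handle_cqe(q, q->head);         /* completed immediately */
            if (++q->head == q->depth) {
                q->head = 0;
                q->phase ^= 1;              /* wrapped: flip the phase */
            }
            found++;
        }
        if (found)
            ring_doorbell(q);
        return found;
    }

Completing entries as the head advances removes the need to remember a [start, end) window, which is exactly what lets the start/end plumbing disappear from nvme_process_cq() and its callers.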
Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 42 ++++++++++-------------------------------- 1 file changed, 10 insertions(+), 32 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 98d8ddd7aa0f..02f22c63adcf 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -971,15 +971,6 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) nvme_end_request(req, cqe->status, cqe->result); } -static void nvme_complete_cqes(struct nvme_queue *nvmeq, u16 start, u16 end) -{ - while (start != end) { - nvme_handle_cqe(nvmeq, start); - if (++start == nvmeq->q_depth) - start = 0; - } -} - static inline void nvme_update_cq_head(struct nvme_queue *nvmeq) { if (++nvmeq->cq_head == nvmeq->q_depth) { @@ -988,19 +979,17 @@ static inline void nvme_update_cq_head(struct nvme_queue *nvmeq) } } -static inline int nvme_process_cq(struct nvme_queue *nvmeq, u16 *start, - u16 *end) +static inline int nvme_process_cq(struct nvme_queue *nvmeq) { int found = 0; - *start = nvmeq->cq_head; while (nvme_cqe_pending(nvmeq)) { found++; + nvme_handle_cqe(nvmeq, nvmeq->cq_head); nvme_update_cq_head(nvmeq); } - *end = nvmeq->cq_head; - if (*start != *end) + if (found) nvme_ring_cq_doorbell(nvmeq); return found; } @@ -1009,21 +998,16 @@ static irqreturn_t nvme_irq(int irq, void *data) { struct nvme_queue *nvmeq = data; irqreturn_t ret = IRQ_NONE; - u16 start, end; /* * The rmb/wmb pair ensures we see all updates from a previous run of * the irq handler, even if that was on another CPU. */ rmb(); - nvme_process_cq(nvmeq, &start, &end); + if (nvme_process_cq(nvmeq)) + ret = IRQ_HANDLED; wmb(); - if (start != end) { - nvme_complete_cqes(nvmeq, start, end); - return IRQ_HANDLED; - } - return ret; } @@ -1042,7 +1026,6 @@ static irqreturn_t nvme_irq_check(int irq, void *data) static int nvme_poll_irqdisable(struct nvme_queue *nvmeq) { struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev); - u16 start, end; int found; /* @@ -1052,29 +1035,27 @@ static int nvme_poll_irqdisable(struct nvme_queue *nvmeq) */ if (test_bit(NVMEQ_POLLED, &nvmeq->flags)) { spin_lock(&nvmeq->cq_poll_lock); - found = nvme_process_cq(nvmeq, &start, &end); + found = nvme_process_cq(nvmeq); spin_unlock(&nvmeq->cq_poll_lock); } else { disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); - found = nvme_process_cq(nvmeq, &start, &end); + found = nvme_process_cq(nvmeq); enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); } - nvme_complete_cqes(nvmeq, start, end); return found; } static int nvme_poll(struct blk_mq_hw_ctx *hctx) { struct nvme_queue *nvmeq = hctx->driver_data; - u16 start, end; bool found; if (!nvme_cqe_pending(nvmeq)) return 0; spin_lock(&nvmeq->cq_poll_lock); - found = nvme_process_cq(nvmeq, &start, &end); + found = nvme_process_cq(nvmeq); spin_unlock(&nvmeq->cq_poll_lock); return found; @@ -1406,13 +1387,10 @@ static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown) */ static void nvme_reap_pending_cqes(struct nvme_dev *dev) { - u16 start, end; int i; - for (i = dev->ctrl.queue_count - 1; i > 0; i--) { - nvme_process_cq(&dev->queues[i], &start, &end); - nvme_complete_cqes(&dev->queues[i], start, end); - } + for (i = dev->ctrl.queue_count - 1; i > 0; i--) + nvme_process_cq(&dev->queues[i]); } static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues, -- cgit v1.2.3-70-g09d2 From fa059b856a593a7bddd4d3779ae8ab1380e05d91 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 4 Mar 
2020 09:17:01 -0800 Subject: nvme-pci: Simplify nvme_poll_irqdisable The timeout handler can use the existing nvme_poll() if it needs to check a polled queue, allowing nvme_poll_irqdisable() to handle only irq driven queues for the remaining callers. Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 02f22c63adcf..f45e26e6af7e 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1020,30 +1020,18 @@ static irqreturn_t nvme_irq_check(int irq, void *data) } /* - * Poll for completions any queue, including those not dedicated to polling. + * Poll for completions for any interrupt driven queue * Can be called from any context. */ -static int nvme_poll_irqdisable(struct nvme_queue *nvmeq) +static void nvme_poll_irqdisable(struct nvme_queue *nvmeq) { struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev); - int found; - /* - * For a poll queue we need to protect against the polling thread - * using the CQ lock. For normal interrupt driven threads we have - * to disable the interrupt to avoid racing with it. - */ - if (test_bit(NVMEQ_POLLED, &nvmeq->flags)) { - spin_lock(&nvmeq->cq_poll_lock); - found = nvme_process_cq(nvmeq); - spin_unlock(&nvmeq->cq_poll_lock); - } else { - disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); - found = nvme_process_cq(nvmeq); - enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); - } + WARN_ON_ONCE(test_bit(NVMEQ_POLLED, &nvmeq->flags)); - return found; + disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); + nvme_process_cq(nvmeq); + enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); } static int nvme_poll(struct blk_mq_hw_ctx *hctx) @@ -1232,7 +1220,11 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) /* * Did we miss an interrupt? */ - nvme_poll_irqdisable(nvmeq); + if (test_bit(NVMEQ_POLLED, &nvmeq->flags)) + nvme_poll(req->mq_hctx); + else + nvme_poll_irqdisable(nvmeq); + if (blk_mq_request_completed(req)) { dev_warn(dev->ctrl.device, "I/O %d QID %d timeout, completion polled\n", -- cgit v1.2.3-70-g09d2 From 40510a639ec08db81d5ff9c79856baf9dda94748 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 25 Feb 2020 15:53:09 -0800 Subject: nvme-tcp: optimize queue io_cpu assignment for multiple queue maps Currently, queue io_cpu assignment is done sequentially for default, read and poll queues based on queue id. This causes a misalignment between the context of the CPU initiating I/O and the I/O worker thread processing queued requests or completions. Modify the queue io_cpu assignment to take the queue map offsets into account. Each queue io_cpu will start at zero for each queue map. This essentially aligns read/poll queues to start over the same range as default queues. Testing performed by Mark with:
- ram device (nvmet)
- single CPU core (pinned)
- 100% 4k reads
- engine io_uring (not using sq_thread option)
- hipri flag set

Micro-benchmark results show a net gain of:
- increase of 18%-29% in IOPs
- reduction of 16%-22% in average latency
- reduction of 7%-23% in 99.99% latency

Baseline:
========
QDepth/Batch | IOPs [k] | Avg. Lat [us] | 99.99% Lat [us]
-----------------------------------------------------------------
1/1          | 32.4     | 30.11         | 50.94
32/8         | 179      | 168.20        | 371

CPU alignment:
=============
QDepth/Batch | IOPs [k] | Avg. Lat [us] | 99.99% Lat [us]
-----------------------------------------------------------------
1/1          | 38.5     | 25.18         | 39.16
32/8         | 231      | 130.75        | 343

Reported-by: Mark Wunderlich Signed-off-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/tcp.c | 62 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 56 insertions(+), 6 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index e384239af880..11a7c26f8573 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1258,13 +1258,67 @@ free_icreq: return ret; } +static bool nvme_tcp_admin_queue(struct nvme_tcp_queue *queue) +{ + return nvme_tcp_queue_id(queue) == 0; +} + +static bool nvme_tcp_default_queue(struct nvme_tcp_queue *queue) +{ + struct nvme_tcp_ctrl *ctrl = queue->ctrl; + int qid = nvme_tcp_queue_id(queue); + + return !nvme_tcp_admin_queue(queue) && + qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT]; +} + +static bool nvme_tcp_read_queue(struct nvme_tcp_queue *queue) +{ + struct nvme_tcp_ctrl *ctrl = queue->ctrl; + int qid = nvme_tcp_queue_id(queue); + + return !nvme_tcp_admin_queue(queue) && + !nvme_tcp_default_queue(queue) && + qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] + + ctrl->io_queues[HCTX_TYPE_READ]; +} + +static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue) +{ + struct nvme_tcp_ctrl *ctrl = queue->ctrl; + int qid = nvme_tcp_queue_id(queue); + + return !nvme_tcp_admin_queue(queue) && + !nvme_tcp_default_queue(queue) && + !nvme_tcp_read_queue(queue) && + qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] + + ctrl->io_queues[HCTX_TYPE_READ] + + ctrl->io_queues[HCTX_TYPE_POLL]; +} + +static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue) +{ + struct nvme_tcp_ctrl *ctrl = queue->ctrl; + int qid = nvme_tcp_queue_id(queue); + int n = 0; + + if (nvme_tcp_default_queue(queue)) + n = qid - 1; + else if (nvme_tcp_read_queue(queue)) + n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1; + else if (nvme_tcp_poll_queue(queue)) + n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - + ctrl->io_queues[HCTX_TYPE_READ] - 1; + queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false); +} + static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid, size_t queue_size) { struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); struct nvme_tcp_queue *queue = &ctrl->queues[qid]; struct linger sol = { .l_onoff = 1, .l_linger = 0 }; - int ret, opt, rcv_pdu_size, n; + int ret, opt, rcv_pdu_size; queue->ctrl = ctrl; INIT_LIST_HEAD(&queue->send_list); @@ -1343,11 +1397,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, } queue->sock->sk->sk_allocation = GFP_ATOMIC; - if (!qid) - n = 0; - else - n = (qid - 1) % num_online_cpus(); - queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false); + nvme_tcp_set_queue_io_cpu(queue); queue->request = NULL; queue->data_remaining = 0; queue->ddgst_remaining = 0; -- cgit v1.2.3-70-g09d2 From 9cda34e37489244a8c8628617e24b2dbc8a8edad Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 25 Feb 2020 16:42:27 -0800 Subject: nvmet-tcp: fix maxh2cdata icresp parameter MAXH2CDATA is not zero based. There is also no reason to limit ourselves to 1M transfers, as we can easily do more. Make this an arbitrary limit of 16M.
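A hedged aside on the "not zero based" point (the helper name here is illustrative, not driver code): MAXH2CDATA in the ICResp PDU is an absolute byte budget per H2CData transfer, so a sender clamps against the advertised value directly rather than adding one to it:

	/* Sketch: chunk host-to-controller data by the advertised MAXH2CDATA. */
	static inline u32 next_h2c_chunk(u32 remaining, u32 maxh2cdata)
	{
		/* absolute count, not (n - 1) */
		return remaining < maxh2cdata ? remaining : maxh2cdata;
	}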
Reported-by: Wenhua Liu Cc: stable@vger.kernel.org # v5.0+ Signed-off-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index cbff1038bdb3..1942c941c31c 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -798,7 +798,7 @@ static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue) icresp->hdr.pdo = 0; icresp->hdr.plen = cpu_to_le32(icresp->hdr.hlen); icresp->pfv = cpu_to_le16(NVME_TCP_PFV_1_0); - icresp->maxdata = cpu_to_le32(0xffff); /* FIXME: support r2t */ + icresp->maxdata = cpu_to_le32(0x400000); /* 16M arbitrary limit */ icresp->cpda = 0; if (queue->hdr_digest) icresp->digest |= NVME_TCP_HDR_DIGEST_ENABLE; -- cgit v1.2.3-70-g09d2 From 5ff4e11264780ce49a9acb66e0b4c5a30a569be4 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 25 Feb 2020 16:43:23 -0800 Subject: nvme-tcp: move send failure to nvme_tcp_try_send Consolidate the request failure handling code to where it is being fetched (nvme_tcp_try_send). Signed-off-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/tcp.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 11a7c26f8573..221a5a59aa06 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1027,8 +1027,15 @@ static int nvme_tcp_try_send(struct nvme_tcp_queue *queue) if (req->state == NVME_TCP_SEND_DDGST) ret = nvme_tcp_try_send_ddgst(req); done: - if (ret == -EAGAIN) + if (ret == -EAGAIN) { ret = 0; + } else if (ret < 0) { + dev_err(queue->ctrl->ctrl.device, + "failed to send request %d\n", ret); + if (ret != -EPIPE && ret != -ECONNRESET) + nvme_tcp_fail_request(queue->request); + nvme_tcp_done_send_req(queue); + } return ret; } @@ -1059,21 +1066,10 @@ static void nvme_tcp_io_work(struct work_struct *w) int result; result = nvme_tcp_try_send(queue); - if (result > 0) { + if (result > 0) pending = true; - } else if (unlikely(result < 0)) { - dev_err(queue->ctrl->ctrl.device, - "failed to send request %d\n", result); - - /* - * Fail the request unless peer closed the connection, - * in which case error recovery flow will complete all. - */ - if ((result != -EPIPE) && (result != -ECONNRESET)) - nvme_tcp_fail_request(queue->request); - nvme_tcp_done_send_req(queue); - return; - } + else if (unlikely(result < 0)) + break; result = nvme_tcp_try_recv(queue); if (result > 0) -- cgit v1.2.3-70-g09d2 From 761ad26c45b0260a8516bc1fc9d25bb66ca4e25c Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 25 Feb 2020 16:43:24 -0800 Subject: nvme-tcp: break from io_work loop if recv failed If we failed to receive data from the socket, don't try to process it further; we will certainly be handling a queue error at this point. While no issue has been seen with the current behavior thus far, it's safer to cease socket processing once we have detected an error.
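With this and the previous patch applied, the io_work loop treats send and recv errors symmetrically; a condensed sketch of the resulting loop body (simplified from the driver code, not the exact function):

	do {
		bool pending = false;
		int result;

		result = nvme_tcp_try_send(queue);
		if (result > 0)
			pending = true;
		else if (unlikely(result < 0))
			break;	/* send failure already failed/completed the request */

		result = nvme_tcp_try_recv(queue);
		if (result > 0)
			pending = true;
		else if (unlikely(result < 0))
			break;	/* recv failure: error recovery owns the queue now */

		if (!pending)
			return;
	} while (!time_after(jiffies, deadline));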
Signed-off-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/tcp.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 221a5a59aa06..4b20301e517c 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1074,6 +1074,8 @@ static void nvme_tcp_io_work(struct work_struct *w) result = nvme_tcp_try_recv(queue); if (result > 0) pending = true; + else if (unlikely(result < 0)) + break; if (!pending) return; -- cgit v1.2.3-70-g09d2 From 2db24e4a22bc97c713261a81fc75e2a36db65715 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Mon, 9 Mar 2020 17:04:12 +0200 Subject: nvme-pci: properly print controller address Align the PCI address print with the fabrics address, which is printed with a trailing newline character.

Before:
[root@server40 linux]# cat /sys/class/nvme/nvme2/address
0000:0b:00.0[root@server40 linux]#

After:
[root@server40 linux]# cat /sys/class/nvme/nvme2/address
0000:0b:00.0
[root@server40 linux]#

Reviewed-by: Christoph Hellwig Signed-off-by: Max Gurtovoy --- drivers/nvme/host/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index f45e26e6af7e..e6fa0c7bb96c 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2656,7 +2656,7 @@ static int nvme_pci_get_address(struct nvme_ctrl *ctrl, char *buf, int size) { struct pci_dev *pdev = to_pci_dev(to_nvme_dev(ctrl)->dev); - return snprintf(buf, size, "%s", dev_name(&pdev->dev)); + return snprintf(buf, size, "%s\n", dev_name(&pdev->dev)); } static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { -- cgit v1.2.3-70-g09d2 From 02cb00e233ade7c050e0f476902e63847e78114e Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Sun, 8 Mar 2020 12:55:03 +0200 Subject: nvmet: Add get_mdts op for controllers Some transports, such as RDMA, would like to set the Maximum Data Transfer Size (MDTS) according to device/port/ctrl characteristics. This will enable the transport to set the optimal MDTS according to controller needs and device capabilities. Add a new nvmet transport op that is called during ctrl identification. This will not affect transports that don't implement this option. The return value of the new op is according to the NVMe spec definition for MDTS.
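As a quick worked example of the spec encoding (a sketch; the 4096-byte minimum page size is an assumption, taken from CAP.MPSMIN == 0): MDTS is a power-of-two exponent on the controller's minimum page size, with 0 meaning no limit, so:

	/* Sketch: translate an MDTS value to a byte limit. */
	static unsigned long mdts_to_bytes(u8 mdts, unsigned long mpsmin_bytes)
	{
		if (!mdts)
			return 0;	/* 0 == no transfer size limit reported */
		return mpsmin_bytes << mdts;	/* e.g. 4096 << 8 == 1MB */
	}

With 4KB pages, a get_mdts op returning 8 therefore advertises a 1MB cap, which is exactly the value the RDMA implementation below picks.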
Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Max Gurtovoy Signed-off-by: Israel Rukshin --- drivers/nvme/target/admin-cmd.c | 8 ++++++-- drivers/nvme/target/nvmet.h | 1 + 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index c0aa9c34c699..b9ec489dc748 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -369,8 +369,12 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) /* we support multiple ports, multiples hosts and ANA: */ id->cmic = (1 << 0) | (1 << 1) | (1 << 3); - /* no limit on data transfer sizes for now */ - id->mdts = 0; + /* Limit MDTS according to transport capability */ + if (ctrl->ops->get_mdts) + id->mdts = ctrl->ops->get_mdts(ctrl); + else + id->mdts = 0; + id->cntlid = cpu_to_le16(ctrl->cntlid); id->ver = cpu_to_le32(ctrl->subsys->ver); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 42ba2ddd9e96..421dff3ea143 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -289,6 +289,7 @@ struct nvmet_fabrics_ops { struct nvmet_port *port, char *traddr); u16 (*install_queue)(struct nvmet_sq *nvme_sq); void (*discovery_chg)(struct nvmet_port *port); + u8 (*get_mdts)(const struct nvmet_ctrl *ctrl); }; #define NVMET_MAX_INLINE_BIOVEC 8 -- cgit v1.2.3-70-g09d2 From ec6d20e16c2d2bef8df2d82d63dcee51caa4ac27 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Sun, 8 Mar 2020 12:55:04 +0200 Subject: nvmet-rdma: Implement get_mdts controller op Set the maximal data transfer size to be 1MB (currently mdts is unlimited). This will allow calculating the amount of MR's that one ctrl should allocate to fulfill it's capabilities. Reviewed-by: Christoph Hellwig Signed-off-by: Max Gurtovoy --- drivers/nvme/target/rdma.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 37d262a65877..f47a79b9fc6c 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -31,6 +31,9 @@ #define NVMET_RDMA_MAX_INLINE_SGE 4 #define NVMET_RDMA_MAX_INLINE_DATA_SIZE max_t(int, SZ_16K, PAGE_SIZE) +/* Assume mpsmin == device_page_size == 4KB */ +#define NVMET_RDMA_MAX_MDTS 8 + struct nvmet_rdma_cmd { struct ib_sge sge[NVMET_RDMA_MAX_INLINE_SGE + 1]; struct ib_cqe cqe; @@ -1602,6 +1605,11 @@ static void nvmet_rdma_disc_port_addr(struct nvmet_req *req, } } +static u8 nvmet_rdma_get_mdts(const struct nvmet_ctrl *ctrl) +{ + return NVMET_RDMA_MAX_MDTS; +} + static const struct nvmet_fabrics_ops nvmet_rdma_ops = { .owner = THIS_MODULE, .type = NVMF_TRTYPE_RDMA, @@ -1612,6 +1620,7 @@ static const struct nvmet_fabrics_ops nvmet_rdma_ops = { .queue_response = nvmet_rdma_queue_response, .delete_ctrl = nvmet_rdma_delete_ctrl, .disc_traddr = nvmet_rdma_disc_port_addr, + .get_mdts = nvmet_rdma_get_mdts, }; static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data) -- cgit v1.2.3-70-g09d2 From c363f249e7e6576587d8982d9087406fe98beb99 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Sun, 8 Mar 2020 12:55:05 +0200 Subject: nvmet-rdma: allocate RW ctxs according to mdts Current nvmet-rdma code allocates MR pool budget based on queue size, assuming both host and target use the same "max_pages_per_mr" count. After limiting the mdts value for RDMA controllers, we know the factor of maximum MR's per IO operation. 
Thus, make sure MR pool will be sufficient for the required IO depth and IO size. That is, say host's SQ size is 100, then the MR pool budget allocated currently at target will also be 100 MRs. But 100 IO WRITE Requests with 256 sg_count(IO size above 1MB) require 200 MRs when target's "max_pages_per_mr" is 128. Reported-by: Krishnamraju Eraparaju Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Max Gurtovoy --- drivers/nvme/target/rdma.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index f47a79b9fc6c..9e1b8c61f54e 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -978,7 +978,7 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue) { struct ib_qp_init_attr qp_attr; struct nvmet_rdma_device *ndev = queue->dev; - int comp_vector, nr_cqe, ret, i; + int comp_vector, nr_cqe, ret, i, factor; /* * Spread the io queues across completion vectors, @@ -1011,7 +1011,9 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue) qp_attr.qp_type = IB_QPT_RC; /* +1 for drain */ qp_attr.cap.max_send_wr = queue->send_queue_size + 1; - qp_attr.cap.max_rdma_ctxs = queue->send_queue_size; + factor = rdma_rw_mr_factor(ndev->device, queue->cm_id->port_num, + 1 << NVMET_RDMA_MAX_MDTS); + qp_attr.cap.max_rdma_ctxs = queue->send_queue_size * factor; qp_attr.cap.max_send_sge = max(ndev->device->attrs.max_sge_rd, ndev->device->attrs.max_send_sge); -- cgit v1.2.3-70-g09d2 From 764e9332098c0e60251386a507fe46ac91276120 Mon Sep 17 00:00:00 2001 From: John Meneghini Date: Thu, 20 Feb 2020 10:05:38 +0900 Subject: nvme-multipath: do not reset on unknown status The nvme multipath error handling defaults to controller reset if the error is unknown. There are, however, no existing nvme status codes that indicate a reset should be used, and resetting causes unnecessary disruption to the rest of IO. Change nvme's error handling to first check if failover should happen. If not, let the normal error handling take over rather than reset the controller. 
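The decision boils down to whether the NVMe status is a path/ANA error; a hedged sketch of the predicate (the authoritative case list lives in nvme_failover_req() in the diff below; NVME_SC_ANA_PERSISTENT_LOSS and NVME_SC_HOST_PATH_ERROR are assumed from the multipath code):

	/* Sketch: only path-related status codes justify failing over. */
	static bool nvme_status_is_path_error(u16 status)
	{
		switch (status & 0x7ff) {
		case NVME_SC_ANA_TRANSITION:
		case NVME_SC_ANA_INACCESSIBLE:
		case NVME_SC_ANA_PERSISTENT_LOSS:
		case NVME_SC_HOST_PATH_ERROR:
			return true;
		default:
			return false;	/* unknown: normal error handling, no reset */
		}
	}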
Based-on-a-patch-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: John Meneghini Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 5 +---- drivers/nvme/host/multipath.c | 21 +++++++++------------ drivers/nvme/host/nvme.h | 5 +++-- 3 files changed, 13 insertions(+), 18 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 0e38e07a302f..fde4b3a526ad 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -291,11 +291,8 @@ void nvme_complete_rq(struct request *req) nvme_req(req)->ctrl->comp_seen = true; if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) { - if ((req->cmd_flags & REQ_NVME_MPATH) && - blk_path_error(status)) { - nvme_failover_req(req); + if ((req->cmd_flags & REQ_NVME_MPATH) && nvme_failover_req(req)) return; - } if (!blk_queue_dying(req->q)) { nvme_retry_req(req); diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index a11900cf3a36..90dd1d641b7b 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -64,17 +64,12 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, } } -void nvme_failover_req(struct request *req) +bool nvme_failover_req(struct request *req) { struct nvme_ns *ns = req->q->queuedata; u16 status = nvme_req(req)->status; unsigned long flags; - spin_lock_irqsave(&ns->head->requeue_lock, flags); - blk_steal_bios(&ns->head->requeue_list, req); - spin_unlock_irqrestore(&ns->head->requeue_lock, flags); - blk_mq_end_request(req, 0); - switch (status & 0x7ff) { case NVME_SC_ANA_TRANSITION: case NVME_SC_ANA_INACCESSIBLE: @@ -103,15 +98,17 @@ void nvme_failover_req(struct request *req) nvme_mpath_clear_current_path(ns); break; default: - /* - * Reset the controller for any non-ANA error as we don't know - * what caused the error. - */ - nvme_reset_ctrl(ns->ctrl); - break; + /* This was a non-ANA error so follow the normal error path. 
*/ + return false; } + spin_lock_irqsave(&ns->head->requeue_lock, flags); + blk_steal_bios(&ns->head->requeue_list, req); + spin_unlock_irqrestore(&ns->head->requeue_lock, flags); + blk_mq_end_request(req, 0); + kblockd_schedule_work(&ns->head->requeue_work); + return true; } void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 1024fec7914c..d800b9a51c2c 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -550,7 +550,7 @@ void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys); void nvme_mpath_start_freeze(struct nvme_subsystem *subsys); void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, struct nvme_ctrl *ctrl, int *flags); -void nvme_failover_req(struct request *req); +bool nvme_failover_req(struct request *req); void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id); @@ -599,8 +599,9 @@ static inline void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance); } -static inline void nvme_failover_req(struct request *req) +static inline bool nvme_failover_req(struct request *req) { + return false; } static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) { -- cgit v1.2.3-70-g09d2 From 8d8a50e20dc2dc41cb788085968b9024dc36f7a5 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 11 Mar 2020 09:50:37 +0100 Subject: nvme-fabrics: Use scnprintf() for avoiding potential buffer overflow Since snprintf() returns the would-be-output size instead of the actual output size, the succeeding calls may go beyond the given buffer limit. Fix it by replacing with scnprintf(). Reviewed-by: Christoph Hellwig Signed-off-by: Takashi Iwai Signed-off-by: Keith Busch --- drivers/nvme/host/fabrics.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 74b8818ac9a1..2a6c8190eeb7 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -105,14 +105,14 @@ int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size) int len = 0; if (ctrl->opts->mask & NVMF_OPT_TRADDR) - len += snprintf(buf, size, "traddr=%s", ctrl->opts->traddr); + len += scnprintf(buf, size, "traddr=%s", ctrl->opts->traddr); if (ctrl->opts->mask & NVMF_OPT_TRSVCID) - len += snprintf(buf + len, size - len, "%strsvcid=%s", + len += scnprintf(buf + len, size - len, "%strsvcid=%s", (len) ? "," : "", ctrl->opts->trsvcid); if (ctrl->opts->mask & NVMF_OPT_HOST_TRADDR) - len += snprintf(buf + len, size - len, "%shost_traddr=%s", + len += scnprintf(buf + len, size - len, "%shost_traddr=%s", (len) ? "," : "", ctrl->opts->host_traddr); - len += snprintf(buf + len, size - len, "\n"); + len += scnprintf(buf + len, size - len, "\n"); return len; } -- cgit v1.2.3-70-g09d2 From e90d172b11b845b0f2caa9422c2f9d3ef59af575 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 12 Mar 2020 16:06:39 -0700 Subject: nvmet-tcp: optimize tcp stack TX when data digest is used If we have a 4-byte data digest to send to the wire, but we have more data to send, set MSG_MORE to tell the stack that more is coming. 
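The general pattern, condensed from the patch below (not a standalone function): when the 4-byte digest is not the last element queued for this send batch, hint the stack to hold off pushing the segment:

	struct msghdr msg = { .msg_flags = MSG_DONTWAIT };

	if (!last_in_batch && cmd->queue->send_list_len)
		msg.msg_flags |= MSG_MORE;	/* more PDUs follow; let TCP coalesce */
	ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);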
Reviewed-by: Mark Wunderlich Signed-off-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/tcp.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 1942c941c31c..dcee4995e22d 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -626,7 +626,7 @@ static int nvmet_try_send_r2t(struct nvmet_tcp_cmd *cmd, bool last_in_batch) return 1; } -static int nvmet_try_send_ddgst(struct nvmet_tcp_cmd *cmd) +static int nvmet_try_send_ddgst(struct nvmet_tcp_cmd *cmd, bool last_in_batch) { struct nvmet_tcp_queue *queue = cmd->queue; struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; @@ -636,6 +636,9 @@ static int nvmet_try_send_ddgst(struct nvmet_tcp_cmd *cmd) }; int ret; + if (!last_in_batch && cmd->queue->send_list_len) + msg.msg_flags |= MSG_MORE; + ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len); if (unlikely(ret <= 0)) return ret; @@ -676,7 +679,7 @@ static int nvmet_tcp_try_send_one(struct nvmet_tcp_queue *queue, } if (cmd->state == NVMET_TCP_SEND_DDGST) { - ret = nvmet_try_send_ddgst(cmd); + ret = nvmet_try_send_ddgst(cmd, last_in_batch); if (ret <= 0) goto done_send; } -- cgit v1.2.3-70-g09d2 From c225b610311bc5695d952cd3590136f26199a227 Mon Sep 17 00:00:00 2001 From: "masahiro31.yamada@kioxia.com" Date: Thu, 5 Mar 2020 11:13:29 +0000 Subject: nvme: Add compat_ioctl handler for NVME_IOCTL_SUBMIT_IO Currently a 32-bit application gets ENOTTY when it calls compat_ioctl with NVME_IOCTL_SUBMIT_IO on a 64-bit kernel. The cause is that the results of sizeof(struct nvme_user_io), which is used to define NVME_IOCTL_SUBMIT_IO, are not the same between a 32-bit compiler and a 64-bit compiler.

* 32 bit: the result of sizeof nvme_user_io is 44.
* 64 bit: the result of sizeof nvme_user_io is 48.

The 64-bit compiler adds 32 bits of padding to round the structure size up to a multiple of 8 bytes. This patch adds a compat_ioctl handler. The handler replaces NVME_IOCTL_SUBMIT_IO32 with NVME_IOCTL_SUBMIT_IO in case a 32-bit application calls compat_ioctl for submit on a 64-bit kernel. Then, it calls nvme_ioctl as usual. Reviewed-by: Christoph Hellwig Signed-off-by: Masahiro Yamada (KIOXIA) Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 45 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index fde4b3a526ad..3c1c826ea491 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1585,6 +1585,47 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, return ret; } +#ifdef CONFIG_COMPAT +struct nvme_user_io32 { + __u8 opcode; + __u8 flags; + __u16 control; + __u16 nblocks; + __u16 rsvd; + __u64 metadata; + __u64 addr; + __u64 slba; + __u32 dsmgmt; + __u32 reftag; + __u16 apptag; + __u16 appmask; +} __attribute__((__packed__)); + +#define NVME_IOCTL_SUBMIT_IO32 _IOW('N', 0x42, struct nvme_user_io32) + +static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + /* + * Corresponds to the difference of NVME_IOCTL_SUBMIT_IO + * between 32 bit programs and 64 bit kernel. + * The cause is that the results of sizeof(struct nvme_user_io), + * which is used to define NVME_IOCTL_SUBMIT_IO, + * are not same between 32 bit compiler and 64 bit compiler. + * NVME_IOCTL_SUBMIT_IO32 is for 64 bit kernel handling + * NVME_IOCTL_SUBMIT_IO issued from 32 bit programs.
+ * Other IOCTL numbers are same between 32 bit and 64 bit. + * So there is nothing to do regarding to other IOCTL numbers. + */ + if (cmd == NVME_IOCTL_SUBMIT_IO32) + return nvme_ioctl(bdev, mode, NVME_IOCTL_SUBMIT_IO, arg); + + return nvme_ioctl(bdev, mode, cmd, arg); +} +#else +#define nvme_compat_ioctl NULL +#endif /* CONFIG_COMPAT */ + static int nvme_open(struct block_device *bdev, fmode_t mode) { struct nvme_ns *ns = bdev->bd_disk->private_data; @@ -2028,7 +2069,7 @@ EXPORT_SYMBOL_GPL(nvme_sec_submit); static const struct block_device_operations nvme_fops = { .owner = THIS_MODULE, .ioctl = nvme_ioctl, - .compat_ioctl = nvme_ioctl, + .compat_ioctl = nvme_compat_ioctl, .open = nvme_open, .release = nvme_release, .getgeo = nvme_getgeo, @@ -2056,7 +2097,7 @@ const struct block_device_operations nvme_ns_head_ops = { .open = nvme_ns_head_open, .release = nvme_ns_head_release, .ioctl = nvme_ioctl, - .compat_ioctl = nvme_ioctl, + .compat_ioctl = nvme_compat_ioctl, .getgeo = nvme_getgeo, .pr_ops = &nvme_pr_ops, }; -- cgit v1.2.3-70-g09d2 From f41cfd5d0a04b12a5dae753cd01163661432ebbb Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Wed, 18 Mar 2020 17:27:59 +0200 Subject: nvme: release ida resources ida instances allocate some internal memory in addition to the base 'struct ida'. Use ida_destroy() to release that memory at module_exit(). Reviewed-by: Christoph Hellwig Signed-off-by: Max Gurtovoy Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 3c1c826ea491..ad0847b6c769 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4358,6 +4358,7 @@ static void __exit nvme_core_exit(void) destroy_workqueue(nvme_delete_wq); destroy_workqueue(nvme_reset_wq); destroy_workqueue(nvme_wq); + ida_destroy(&nvme_instance_ida); } MODULE_LICENSE("GPL"); -- cgit v1.2.3-70-g09d2 From e7c43feae2ab8744c3112b5a714959c8ea71ca19 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Tue, 10 Mar 2020 16:39:10 +0200 Subject: nvme: Use nvme_state_terminal helper Improve code readability. Reviewed-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Signed-off-by: Israel Rukshin Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index ad0847b6c769..392af3cf0bf9 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2633,8 +2633,7 @@ static bool nvme_validate_cntlid(struct nvme_subsystem *subsys, lockdep_assert_held(&nvme_subsystems_lock); list_for_each_entry(tmp, &subsys->ctrls, subsys_entry) { - if (tmp->state == NVME_CTRL_DELETING || - tmp->state == NVME_CTRL_DEAD) + if (nvme_state_terminal(tmp)) continue; if (tmp->cntlid == ctrl->cntlid) { -- cgit v1.2.3-70-g09d2 From 6721c18a0610db39cf0110b9be07946bbc208ed7 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Tue, 24 Mar 2020 17:29:39 +0200 Subject: nvme: Remove unused return code from nvme_delete_ctrl_sync The return code of nvme_delete_ctrl_sync is never used, so change it to void. 
Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 392af3cf0bf9..8a7761c3086e 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -192,21 +192,16 @@ int nvme_delete_ctrl(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_delete_ctrl); -static int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl) +static void nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl) { - int ret = 0; - /* * Keep a reference until nvme_do_delete_ctrl() complete, * since ->delete_ctrl can free the controller. */ nvme_get_ctrl(ctrl); - if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING)) - ret = -EBUSY; - if (!ret) + if (nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING)) nvme_do_delete_ctrl(ctrl); nvme_put_ctrl(ctrl); - return ret; } static inline bool nvme_ns_has_pi(struct nvme_ns *ns) -- cgit v1.2.3-70-g09d2 From 253fd4ac806896293c9b9d12c794195447bad164 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Tue, 24 Mar 2020 17:29:40 +0200 Subject: nvme-pci: Re-order nvme_pci_free_ctrl Destroy the resources in the same order like in nvme_probe error flow to improve code readability. Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index e6fa0c7bb96c..ff0bd2d84f3e 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2470,13 +2470,13 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) struct nvme_dev *dev = to_nvme_dev(ctrl); nvme_dbbuf_dma_free(dev); - put_device(dev->dev); nvme_free_tagset(dev); if (dev->ctrl.admin_q) blk_put_queue(dev->ctrl.admin_q); - kfree(dev->queues); free_opal_dev(dev->ctrl.opal_dev); mempool_destroy(dev->iod_mempool); + put_device(dev->dev); + kfree(dev->queues); kfree(dev); } -- cgit v1.2.3-70-g09d2 From b780d7415aacec855e2f2370cbf98f918b224903 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Tue, 24 Mar 2020 17:29:41 +0200 Subject: nvme: Fix ctrl use-after-free during sysfs deletion In case nvme_sysfs_delete() is called by the user before taking the ctrl reference count, the ctrl may be freed during the creation and cause the bug. Take the reference as soon as the controller is externally visible, which is done by cdev_device_add() in nvme_init_ctrl(). Also take the reference count at the core layer instead of taking it on each transport separately. 
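The ordering this establishes in nvme_init_ctrl() boils down to (condensed from the diff below):

	nvme_get_ctrl(ctrl);			/* ref taken before we become visible */
	cdev_init(&ctrl->cdev, &nvme_dev_fops);
	ret = cdev_device_add(&ctrl->cdev, ctrl->device);
	if (ret)
		goto out_free_name;		/* error path must drop that ref */

so a user-triggered nvme_sysfs_delete() can never observe a controller whose last reference is still owned by the half-finished creation path.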
Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 2 ++ drivers/nvme/host/fc.c | 4 +--- drivers/nvme/host/pci.c | 1 - drivers/nvme/host/rdma.c | 3 +-- drivers/nvme/host/tcp.c | 3 +-- drivers/nvme/target/loop.c | 3 +-- 6 files changed, 6 insertions(+), 10 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 8a7761c3086e..51f80be0fe90 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4130,6 +4130,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, if (ret) goto out_release_instance; + nvme_get_ctrl(ctrl); cdev_init(&ctrl->cdev, &nvme_dev_fops); ctrl->cdev.owner = ops->module; ret = cdev_device_add(&ctrl->cdev, ctrl->device); @@ -4148,6 +4149,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, return 0; out_free_name: + nvme_put_ctrl(ctrl); kfree_const(ctrl->device->kobj.name); out_release_instance: ida_simple_remove(&nvme_instance_ida, ctrl->instance); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 5a70ac395d53..59d2e2bec179 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -3181,10 +3181,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, goto fail_ctrl; } - nvme_get_ctrl(&ctrl->ctrl); - if (!queue_delayed_work(nvme_wq, &ctrl->connect_work, 0)) { - nvme_put_ctrl(&ctrl->ctrl); dev_err(ctrl->ctrl.device, "NVME-FC{%d}: failed to schedule initial connect\n", ctrl->cnum); @@ -3209,6 +3206,7 @@ fail_ctrl: /* initiate nvme ctrl ref counting teardown */ nvme_uninit_ctrl(&ctrl->ctrl); + nvme_put_ctrl(&ctrl->ctrl); /* Remove core ctrl ref. */ nvme_put_ctrl(&ctrl->ctrl); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index ff0bd2d84f3e..4e062c3a84bc 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2802,7 +2802,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev)); nvme_reset_ctrl(&dev->ctrl); - nvme_get_ctrl(&dev->ctrl); async_schedule(nvme_async_probe, dev); return 0; diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 3e85c5cacefd..ca782deea72d 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -2043,8 +2043,6 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n", ctrl->ctrl.opts->subsysnqn, &ctrl->addr); - nvme_get_ctrl(&ctrl->ctrl); - mutex_lock(&nvme_rdma_ctrl_mutex); list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list); mutex_unlock(&nvme_rdma_ctrl_mutex); @@ -2054,6 +2052,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, out_uninit_ctrl: nvme_uninit_ctrl(&ctrl->ctrl); nvme_put_ctrl(&ctrl->ctrl); + nvme_put_ctrl(&ctrl->ctrl); if (ret > 0) ret = -EIO; return ERR_PTR(ret); diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 4b20301e517c..dd569b122a0d 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2428,8 +2428,6 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n", ctrl->ctrl.opts->subsysnqn, &ctrl->addr); - nvme_get_ctrl(&ctrl->ctrl); - mutex_lock(&nvme_tcp_ctrl_mutex); list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list); mutex_unlock(&nvme_tcp_ctrl_mutex); @@ -2439,6 +2437,7 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, 
out_uninit_ctrl: nvme_uninit_ctrl(&ctrl->ctrl); nvme_put_ctrl(&ctrl->ctrl); + nvme_put_ctrl(&ctrl->ctrl); if (ret > 0) ret = -EIO; return ERR_PTR(ret); diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 4df4ebde208a..a425e2858829 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -618,8 +618,6 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev, dev_info(ctrl->ctrl.device, "new ctrl: \"%s\"\n", ctrl->ctrl.opts->subsysnqn); - nvme_get_ctrl(&ctrl->ctrl); - changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); WARN_ON_ONCE(!changed); @@ -637,6 +635,7 @@ out_free_queues: kfree(ctrl->queues); out_uninit_ctrl: nvme_uninit_ctrl(&ctrl->ctrl); + nvme_put_ctrl(&ctrl->ctrl); out_put_ctrl: nvme_put_ctrl(&ctrl->ctrl); if (ret > 0) -- cgit v1.2.3-70-g09d2 From 726612b6b8259afa41d265a2722991c87f059223 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Tue, 24 Mar 2020 17:29:42 +0200 Subject: nvme: Make nvme_uninit_ctrl symmetric to nvme_init_ctrl Put the ctrl reference count at nvme_uninit_ctrl as opposed to nvme_init_ctrl which takes it. This decrease the reference count at the core layer instead of decreasing it on each transport separately. Also move the call of nvme_uninit_ctrl at PCI driver after calling to nvme_release_prp_pools and nvme_dev_unmap, in order to put the reference count after using the dev. This is safe because those functions use nvme_dev which is freed only later at nvme_pci_free_ctrl. Signed-off-by: Israel Rukshin Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 2 +- drivers/nvme/host/fc.c | 1 - drivers/nvme/host/pci.c | 3 +-- drivers/nvme/host/rdma.c | 1 - drivers/nvme/host/tcp.c | 1 - drivers/nvme/target/loop.c | 2 -- 6 files changed, 2 insertions(+), 8 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 51f80be0fe90..8e6a3ada9d44 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -171,7 +171,6 @@ static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl) nvme_remove_namespaces(ctrl); ctrl->ops->delete_ctrl(ctrl); nvme_uninit_ctrl(ctrl); - nvme_put_ctrl(ctrl); } static void nvme_delete_ctrl_work(struct work_struct *work) @@ -4048,6 +4047,7 @@ void nvme_uninit_ctrl(struct nvme_ctrl *ctrl) nvme_fault_inject_fini(&ctrl->fault_inject); dev_pm_qos_hide_latency_tolerance(ctrl->device); cdev_device_del(&ctrl->cdev, ctrl->device); + nvme_put_ctrl(ctrl); } EXPORT_SYMBOL_GPL(nvme_uninit_ctrl); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 59d2e2bec179..a8bf2fb1287b 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -3206,7 +3206,6 @@ fail_ctrl: /* initiate nvme ctrl ref counting teardown */ nvme_uninit_ctrl(&ctrl->ctrl); - nvme_put_ctrl(&ctrl->ctrl); /* Remove core ctrl ref. 
*/ nvme_put_ctrl(&ctrl->ctrl); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 4e062c3a84bc..4e79e412b276 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2873,10 +2873,9 @@ static void nvme_remove(struct pci_dev *pdev) nvme_free_host_mem(dev); nvme_dev_remove_admin(dev); nvme_free_queues(dev, 0); - nvme_uninit_ctrl(&dev->ctrl); nvme_release_prp_pools(dev); nvme_dev_unmap(dev); - nvme_put_ctrl(&dev->ctrl); + nvme_uninit_ctrl(&dev->ctrl); } #ifdef CONFIG_PM_SLEEP diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index ca782deea72d..c99a88247660 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -2052,7 +2052,6 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, out_uninit_ctrl: nvme_uninit_ctrl(&ctrl->ctrl); nvme_put_ctrl(&ctrl->ctrl); - nvme_put_ctrl(&ctrl->ctrl); if (ret > 0) ret = -EIO; return ERR_PTR(ret); diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index dd569b122a0d..f111430bb617 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2437,7 +2437,6 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, out_uninit_ctrl: nvme_uninit_ctrl(&ctrl->ctrl); nvme_put_ctrl(&ctrl->ctrl); - nvme_put_ctrl(&ctrl->ctrl); if (ret > 0) ret = -EIO; return ERR_PTR(ret); diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index a425e2858829..0d54e730cbf2 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -485,7 +485,6 @@ out_destroy_admin: out_disable: dev_warn(ctrl->ctrl.device, "Removing after reset failure\n"); nvme_uninit_ctrl(&ctrl->ctrl); - nvme_put_ctrl(&ctrl->ctrl); } static const struct nvme_ctrl_ops nvme_loop_ctrl_ops = { @@ -635,7 +634,6 @@ out_free_queues: kfree(ctrl->queues); out_uninit_ctrl: nvme_uninit_ctrl(&ctrl->ctrl); - nvme_put_ctrl(&ctrl->ctrl); out_put_ctrl: nvme_put_ctrl(&ctrl->ctrl); if (ret > 0) -- cgit v1.2.3-70-g09d2 From ce1518139e6976cf19c133b555083354fdb629b8 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Tue, 24 Mar 2020 17:29:43 +0200 Subject: nvme: Fix controller creation races with teardown flow Calling nvme_sysfs_delete() when the controller is in the middle of creation may cause several bugs. If the controller is in NEW state we remove delete_controller file and don't delete the controller. The user will not be able to use nvme disconnect command on that controller again, although the controller may be active. Other bugs may happen if the controller is in the middle of create_ctrl callback and nvme_do_delete_ctrl() starts. For example, freeing I/O tagset at nvme_do_delete_ctrl() before it was allocated at create_ctrl callback. To fix all those races don't allow the user to delete the controller before it was fully created. 
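The guard itself is small (condensed from the diff below): nvme_start_ctrl() sets a created flag once initialization is complete, and the sysfs delete handler refuses to act before then:

	if (!ctrl->created)	/* set at the end of nvme_start_ctrl() */
		return -EBUSY;

Returning -EBUSY keeps the delete_controller attribute in place, so the same controller can still be deleted normally once creation finishes.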
Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 5 +++++ drivers/nvme/host/nvme.h | 1 + 2 files changed, 6 insertions(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 8e6a3ada9d44..66fe301d9abb 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3228,6 +3228,10 @@ static ssize_t nvme_sysfs_delete(struct device *dev, { struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + /* Can't delete non-created controllers */ + if (!ctrl->created) + return -EBUSY; + if (device_remove_file_self(dev, attr)) nvme_delete_ctrl_sync(ctrl); return count; @@ -4039,6 +4043,7 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl) nvme_queue_scan(ctrl); nvme_start_queues(ctrl); } + ctrl->created = true; } EXPORT_SYMBOL_GPL(nvme_start_ctrl); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index d800b9a51c2c..2e04a36296d9 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -259,6 +259,7 @@ struct nvme_ctrl { struct nvme_command ka_cmd; struct work_struct fw_act_work; unsigned long events; + bool created; #ifdef CONFIG_NVME_MULTIPATH /* asymmetric namespace access: */ -- cgit v1.2.3-70-g09d2 From 96135862dfcce38b98beff7d1009188263b7e6f7 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Tue, 24 Mar 2020 17:29:44 +0200 Subject: nvme-rdma: Add warning on state change failure at nvme_rdma_setup_ctrl The transition to LIVE state should not fail in case of a new controller. Moving to DELETING state before nvme_rdma_create_ctrl() allocates all the resources may lead to a NULL dereference in the teardown flow (e.g., IO tagset, admin_q, connect_q). Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/rdma.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index c99a88247660..3ae3011a95ea 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1022,8 +1022,13 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new) changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); if (!changed) { - /* state change failure is ok if we're in DELETING state */ + /* + * state change failure is ok if we're in DELETING state, + * unless we're during creation of a new controller to + * avoid races with teardown flow. + */ WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING); + WARN_ON_ONCE(new); ret = -EINVAL; goto destroy_io; } -- cgit v1.2.3-70-g09d2 From bea54ef53fce57c8b2f11315c9384e43b2ecb321 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Tue, 24 Mar 2020 17:29:45 +0200 Subject: nvme-tcp: Add warning on state change failure at nvme_tcp_setup_ctrl The transition to LIVE state should not fail in case of a new controller. Moving to DELETING state before nvme_tcp_create_ctrl() allocates all the resources may lead to a NULL dereference in the teardown flow (e.g., IO tagset, admin_q, connect_q).
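The added assertion reads naturally in context (condensed from the change below): a LIVE transition may only lose to a concurrent deletion, and never while the controller is still being created:

	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) {
		WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
		WARN_ON_ONCE(new);	/* a brand-new ctrl has no deleter yet */
		ret = -EINVAL;
		goto destroy_io;
	}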
Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/tcp.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index f111430bb617..0ef14f0fad86 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1930,8 +1930,13 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new) } if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) { - /* state change failure is ok if we're in DELETING state */ + /* + * state change failure is ok if we're in DELETING state, + * unless we're during creation of a new controller to + * avoid races with teardown flow. + */ WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING); + WARN_ON_ONCE(new); ret = -EINVAL; goto destroy_io; } -- cgit v1.2.3-70-g09d2 From fb314eb0cbb2e11540d1ae1a7b28346397f621ef Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Mar 2020 14:19:35 +0100 Subject: nvme: refactor nvme_identify_ns_descs error handling Move the handling of an error into the function from the caller, and only do it for an actual error on the admin command itself, not the command parsing, as that should be enough to deal with devices claiming a bogus version compliance. Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 66fe301d9abb..6bd1c6dfac6b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1102,8 +1102,17 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, status = nvme_submit_sync_cmd(ctrl->admin_q, &c, data, NVME_IDENTIFY_DATA_SIZE); - if (status) + if (status) { + dev_warn(ctrl->device, + "Identify Descriptors failed (%d)\n", status); + /* + * Don't treat an error as fatal, as we potentially already + * have a NGUID or EUI-64. + */ + if (status > 0) + status = 0; goto free_data; + } for (pos = 0; pos < NVME_IDENTIFY_DATA_SIZE; pos += len) { struct nvme_ns_id_desc *cur = data + pos; @@ -1757,26 +1766,15 @@ static void nvme_config_write_zeroes(struct gendisk *disk, struct nvme_ns *ns) static int nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, struct nvme_id_ns *id, struct nvme_ns_ids *ids) { - int ret = 0; - memset(ids, 0, sizeof(*ids)); if (ctrl->vs >= NVME_VS(1, 1, 0)) memcpy(ids->eui64, id->eui64, sizeof(id->eui64)); if (ctrl->vs >= NVME_VS(1, 2, 0)) memcpy(ids->nguid, id->nguid, sizeof(id->nguid)); - if (ctrl->vs >= NVME_VS(1, 3, 0)) { - /* Don't treat error as fatal we potentially - * already have a NGUID or EUI-64 - */ - ret = nvme_identify_ns_descs(ctrl, nsid, ids); - if (ret) - dev_warn(ctrl->device, - "Identify Descriptors failed (%d)\n", ret); - if (ret > 0) - ret = 0; - } - return ret; + if (ctrl->vs >= NVME_VS(1, 3, 0)) + return nvme_identify_ns_descs(ctrl, nsid, ids); + return 0; } static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids) -- cgit v1.2.3-70-g09d2 From 026d2ef752f47f33efd92244b9cf6be65d2a1621 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Mar 2020 14:19:36 +0100 Subject: nvme: rename __nvme_find_ns_head to nvme_find_ns_head There is no non __-prefixed version, so make the name a little more readable. 
Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 6bd1c6dfac6b..56a0dc18ed2d 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3357,7 +3357,7 @@ static const struct attribute_group *nvme_dev_attr_groups[] = { NULL, }; -static struct nvme_ns_head *__nvme_find_ns_head(struct nvme_subsystem *subsys, +static struct nvme_ns_head *nvme_find_ns_head(struct nvme_subsystem *subsys, unsigned nsid) { struct nvme_ns_head *h; @@ -3457,7 +3457,7 @@ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, mutex_lock(&ctrl->subsys->lock); if (is_shared) - head = __nvme_find_ns_head(ctrl->subsys, nsid); + head = nvme_find_ns_head(ctrl->subsys, nsid); if (!head) { head = nvme_alloc_ns_head(ctrl, nsid, id); if (IS_ERR(head)) { -- cgit v1.2.3-70-g09d2 From 43fcd9e1eae87c3235b8077f97bc6a286c3ae59b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Mar 2020 14:19:37 +0100 Subject: nvme: cleanup namespace identifier reporting in nvme_init_ns_head Lift the common namespace identifier reporting between the shared namespace and new nshead cases into common code. This also means one less lock is held while doing I/O. Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 56a0dc18ed2d..2b0f693437a8 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3390,7 +3390,8 @@ static int __nvme_check_ids(struct nvme_subsystem *subsys, } static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, - unsigned nsid, struct nvme_id_ns *id) + unsigned nsid, struct nvme_id_ns *id, + struct nvme_ns_ids *ids) { struct nvme_ns_head *head; size_t size = sizeof(*head); @@ -3413,12 +3414,9 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, goto out_ida_remove; head->subsys = ctrl->subsys; head->ns_id = nsid; + head->ids = *ids; kref_init(&head->ref); - ret = nvme_report_ns_ids(ctrl, nsid, id, &head->ids); - if (ret) - goto out_cleanup_srcu; - ret = __nvme_check_ids(ctrl->subsys, head); if (ret) { dev_err(ctrl->device, @@ -3453,24 +3451,23 @@ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, struct nvme_ctrl *ctrl = ns->ctrl; bool is_shared = id->nmic & (1 << 0); struct nvme_ns_head *head = NULL; + struct nvme_ns_ids ids; int ret = 0; + ret = nvme_report_ns_ids(ctrl, nsid, id, &ids); + if (ret) + goto out; + mutex_lock(&ctrl->subsys->lock); if (is_shared) head = nvme_find_ns_head(ctrl->subsys, nsid); if (!head) { - head = nvme_alloc_ns_head(ctrl, nsid, id); + head = nvme_alloc_ns_head(ctrl, nsid, id, &ids); if (IS_ERR(head)) { ret = PTR_ERR(head); goto out_unlock; } } else { - struct nvme_ns_ids ids; - - ret = nvme_report_ns_ids(ctrl, nsid, id, &ids); - if (ret) - goto out_unlock; - if (!nvme_ns_ids_equal(&head->ids, &ids)) { dev_err(ctrl->device, "IDs don't match for shared namespace %d\n", @@ -3485,6 +3482,7 @@ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, out_unlock: mutex_unlock(&ctrl->subsys->lock); +out: if (ret > 0) ret = blk_status_to_errno(nvme_error_status(ret)); return ret; -- cgit v1.2.3-70-g09d2 From 3d745ea5b095a3985129e162900b7e6c22518a9d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 
Mar 2020 09:30:11 +0100 Subject: block: simplify queue allocation Current make_request based drivers use either blk_alloc_queue_node or blk_alloc_queue to allocate a queue, and then set up the make_request_fn function pointer and a few parameters using the blk_queue_make_request helper. Simplify this by passing the make_request pointer to blk_alloc_queue, and while at it merge the _node variant into the main helper by always passing a node_id, and remove the superfluous gfp_mask parameter. A lower-level __blk_alloc_queue is kept for the blk-mq case. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- arch/m68k/emu/nfblock.c | 3 +-- arch/xtensa/platforms/iss/simdisk.c | 3 +-- block/blk-cgroup.c | 2 +- block/blk-core.c | 39 ++++++++++++++++++++++--------------- block/blk-mq.c | 8 ++------ block/blk-settings.c | 36 ---------------------------------- block/blk.h | 2 ++ drivers/block/brd.c | 4 +--- drivers/block/drbd/drbd_main.c | 3 +-- drivers/block/null_blk_main.c | 3 +-- drivers/block/pktcdvd.c | 3 +-- drivers/block/ps3vram.c | 3 +-- drivers/block/rsxx/dev.c | 3 +-- drivers/block/umem.c | 4 +--- drivers/block/zram/zram_drv.c | 4 +--- drivers/lightnvm/core.c | 3 +-- drivers/md/bcache/super.c | 3 +-- drivers/md/dm.c | 9 ++++----- drivers/md/md.c | 3 +-- drivers/nvdimm/blk.c | 3 +-- drivers/nvdimm/btt.c | 3 +-- drivers/nvdimm/pmem.c | 3 +-- drivers/nvme/host/multipath.c | 3 +-- drivers/s390/block/dcssblk.c | 4 ++-- drivers/s390/block/xpram.c | 4 ++-- include/linux/blkdev.h | 4 +--- 26 files changed, 54 insertions(+), 108 deletions(-) (limited to 'drivers/nvme') diff --git a/arch/m68k/emu/nfblock.c b/arch/m68k/emu/nfblock.c index 40712e49381b..c3a630440512 100644 --- a/arch/m68k/emu/nfblock.c +++ b/arch/m68k/emu/nfblock.c @@ -118,12 +118,11 @@ static int __init nfhd_init_one(int id, u32 blocks, u32 bsize) dev->bsize = bsize; dev->bshift = ffs(bsize) - 10; - dev->queue = blk_alloc_queue(GFP_KERNEL); + dev->queue = blk_alloc_queue(nfhd_make_request, NUMA_NO_NODE); if (dev->queue == NULL) goto free_dev; dev->queue->queuedata = dev; - blk_queue_make_request(dev->queue, nfhd_make_request); blk_queue_logical_block_size(dev->queue, bsize); dev->disk = alloc_disk(16); diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c index 833109880165..49322b66cda9 100644 --- a/arch/xtensa/platforms/iss/simdisk.c +++ b/arch/xtensa/platforms/iss/simdisk.c @@ -267,13 +267,12 @@ static int __init simdisk_setup(struct simdisk *dev, int which, spin_lock_init(&dev->lock); dev->users = 0; - dev->queue = blk_alloc_queue(GFP_KERNEL); + dev->queue = blk_alloc_queue(simdisk_make_request, NUMA_NO_NODE); if (dev->queue == NULL) { pr_err("blk_alloc_queue failed\n"); goto out_alloc_queue; } - blk_queue_make_request(dev->queue, simdisk_make_request); dev->queue->queuedata = dev; dev->gd = alloc_disk(SIMDISK_MINORS); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index a229b94d5390..c15a26096038 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1010,7 +1010,7 @@ unlock: * blkcg_init_queue - initialize blkcg part of request queue * @q: request_queue to initialize * - * Called from blk_alloc_queue_node(). Responsible for initializing blkcg + * Called from __blk_alloc_queue(). Responsible for initializing blkcg * part of new request_queue @q. 
* * RETURNS: diff --git a/block/blk-core.c b/block/blk-core.c index eaf6cb3887e6..18b8c09d093e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -388,12 +388,6 @@ void blk_cleanup_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_cleanup_queue); -struct request_queue *blk_alloc_queue(gfp_t gfp_mask) -{ - return blk_alloc_queue_node(gfp_mask, NUMA_NO_NODE); -} -EXPORT_SYMBOL(blk_alloc_queue); - /** * blk_queue_enter() - try to increase q->q_usage_counter * @q: request queue pointer @@ -470,24 +464,19 @@ static void blk_timeout_work(struct work_struct *work) { } -/** - * blk_alloc_queue_node - allocate a request queue - * @gfp_mask: memory allocation flags - * @node_id: NUMA node to allocate memory from - */ -struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) +struct request_queue *__blk_alloc_queue(int node_id) { struct request_queue *q; int ret; q = kmem_cache_alloc_node(blk_requestq_cachep, - gfp_mask | __GFP_ZERO, node_id); + GFP_KERNEL | __GFP_ZERO, node_id); if (!q) return NULL; q->last_merge = NULL; - q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); + q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL); if (q->id < 0) goto fail_q; @@ -495,7 +484,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) if (ret) goto fail_id; - q->backing_dev_info = bdi_alloc_node(gfp_mask, node_id); + q->backing_dev_info = bdi_alloc_node(GFP_KERNEL, node_id); if (!q->backing_dev_info) goto fail_split; @@ -541,6 +530,9 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) if (blkcg_init_queue(q)) goto fail_ref; + blk_queue_dma_alignment(q, 511); + blk_set_default_limits(&q->limits); + return q; fail_ref: @@ -557,7 +549,22 @@ fail_q: kmem_cache_free(blk_requestq_cachep, q); return NULL; } -EXPORT_SYMBOL(blk_alloc_queue_node); + +struct request_queue *blk_alloc_queue(make_request_fn make_request, int node_id) +{ + struct request_queue *q; + + if (WARN_ON_ONCE(!make_request)) + return -EINVAL; + + q = __blk_alloc_queue(node_id); + if (!q) + return NULL; + q->make_request_fn = make_request; + q->nr_requests = BLKDEV_MAX_RQ; + return q; +} +EXPORT_SYMBOL(blk_alloc_queue); bool blk_get_queue(struct request_queue *q) { diff --git a/block/blk-mq.c b/block/blk-mq.c index 216bf62e88b6..f6291ceedee4 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2729,7 +2729,7 @@ struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, { struct request_queue *uninit_q, *q; - uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node); + uninit_q = __blk_alloc_queue(set->numa_node); if (!uninit_q) return ERR_PTR(-ENOMEM); uninit_q->queuedata = queuedata; @@ -2939,11 +2939,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, INIT_LIST_HEAD(&q->requeue_list); spin_lock_init(&q->requeue_lock); - blk_queue_make_request(q, blk_mq_make_request); - - /* - * Do this after blk_queue_make_request() overrides it... 
- */ + q->make_request_fn = blk_mq_make_request; q->nr_requests = set->queue_depth; /* diff --git a/block/blk-settings.c b/block/blk-settings.c index c8eda2e7b91e..126d216a2db6 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -86,42 +86,6 @@ void blk_set_stacking_limits(struct queue_limits *lim) } EXPORT_SYMBOL(blk_set_stacking_limits); -/** - * blk_queue_make_request - define an alternate make_request function for a device - * @q: the request queue for the device to be affected - * @mfn: the alternate make_request function - * - * Description: - * The normal way for &struct bios to be passed to a device - * driver is for them to be collected into requests on a request - * queue, and then to allow the device driver to select requests - * off that queue when it is ready. This works well for many block - * devices. However some block devices (typically virtual devices - * such as md or lvm) do not benefit from the processing on the - * request queue, and are served best by having the requests passed - * directly to them. This can be achieved by providing a function - * to blk_queue_make_request(). - * - * Caveat: - * The driver that does this *must* be able to deal appropriately - * with buffers in "highmemory". This can be accomplished by either calling - * kmap_atomic() to get a temporary kernel mapping, or by calling - * blk_queue_bounce() to create a buffer in normal memory. - **/ -void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) -{ - /* - * set defaults - */ - q->nr_requests = BLKDEV_MAX_RQ; - - q->make_request_fn = mfn; - blk_queue_dma_alignment(q, 511); - - blk_set_default_limits(&q->limits); -} -EXPORT_SYMBOL(blk_queue_make_request); - /** * blk_queue_bounce_limit - set bounce buffer limit for queue * @q: the request queue for the device diff --git a/block/blk.h b/block/blk.h index d9673164a145..491e52fc0aa6 100644 --- a/block/blk.h +++ b/block/blk.h @@ -482,4 +482,6 @@ static inline void part_nr_sects_write(struct hd_struct *part, sector_t size) #endif } +struct request_queue *__blk_alloc_queue(int node_id); + #endif /* BLK_INTERNAL_H */ diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 220c5e18aba0..2fb25c348d53 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -381,12 +381,10 @@ static struct brd_device *brd_alloc(int i) spin_lock_init(&brd->brd_lock); INIT_RADIX_TREE(&brd->brd_pages, GFP_ATOMIC); - brd->brd_queue = blk_alloc_queue(GFP_KERNEL); + brd->brd_queue = blk_alloc_queue(brd_make_request, NUMA_NO_NODE); if (!brd->brd_queue) goto out_free_dev; - blk_queue_make_request(brd->brd_queue, brd_make_request); - /* This is so fdisk will align partitions on 4k, because of * direct_access API needing 4k alignment, returning a PFN * (This is only a problem on very small devices <= 4M, diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index a18155cdce41..72a7c3ea2ce3 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2801,7 +2801,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig drbd_init_set_defaults(device); - q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE); + q = blk_alloc_queue(drbd_make_request, NUMA_NO_NODE); if (!q) goto out_no_q; device->rq_queue = q; @@ -2828,7 +2828,6 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig q->backing_dev_info->congested_fn = drbd_congested; q->backing_dev_info->congested_data = device; - blk_queue_make_request(q, drbd_make_request); 
blk_queue_write_cache(q, true, true); /* Setting the max_hw_sectors to an odd value of 8kibyte here This triggers a max_bio_size message upon first attach or connect */ diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c index 2f864782550d..5f13793d35ee 100644 --- a/drivers/block/null_blk_main.c +++ b/drivers/block/null_blk_main.c @@ -1743,12 +1743,11 @@ static int null_add_dev(struct nullb_device *dev) goto out_cleanup_tags; } } else if (dev->queue_mode == NULL_Q_BIO) { - nullb->q = blk_alloc_queue_node(GFP_KERNEL, dev->home_node); + nullb->q = blk_alloc_queue(null_queue_bio, dev->home_node); if (!nullb->q) { rv = -ENOMEM; goto out_cleanup_queues; } - blk_queue_make_request(nullb->q, null_queue_bio); rv = init_driver_queues(nullb); if (rv) goto out_cleanup_blk_queue; diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 0d286a87e647..0b944ac96d6b 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2493,7 +2493,6 @@ static void pkt_init_queue(struct pktcdvd_device *pd) { struct request_queue *q = pd->disk->queue; - blk_queue_make_request(q, pkt_make_request); blk_queue_logical_block_size(q, CD_FRAMESIZE); blk_queue_max_hw_sectors(q, PACKET_MAX_SECTORS); q->queuedata = pd; @@ -2750,7 +2749,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) disk->flags = GENHD_FL_REMOVABLE; strcpy(disk->disk_name, pd->name); disk->private_data = pd; - disk->queue = blk_alloc_queue(GFP_KERNEL); + disk->queue = blk_alloc_queue(pkt_make_request, NUMA_NO_NODE); if (!disk->queue) goto out_mem2; diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index 4628e1a27a2b..821d4d8b1d76 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -737,7 +737,7 @@ static int ps3vram_probe(struct ps3_system_bus_device *dev) ps3vram_proc_init(dev); - queue = blk_alloc_queue(GFP_KERNEL); + queue = blk_alloc_queue(ps3vram_make_request, NUMA_NO_NODE); if (!queue) { dev_err(&dev->core, "blk_alloc_queue failed\n"); error = -ENOMEM; @@ -746,7 +746,6 @@ static int ps3vram_probe(struct ps3_system_bus_device *dev) priv->queue = queue; queue->queuedata = dev; - blk_queue_make_request(queue, ps3vram_make_request); blk_queue_max_segments(queue, BLK_MAX_SEGMENTS); blk_queue_max_segment_size(queue, BLK_MAX_SEGMENT_SIZE); blk_queue_max_hw_sectors(queue, BLK_SAFE_MAX_SECTORS); diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c index c47d28b2ce44..8ffa8260dcaf 100644 --- a/drivers/block/rsxx/dev.c +++ b/drivers/block/rsxx/dev.c @@ -248,7 +248,7 @@ int rsxx_setup_dev(struct rsxx_cardinfo *card) return -ENOMEM; } - card->queue = blk_alloc_queue(GFP_KERNEL); + card->queue = blk_alloc_queue(rsxx_make_request, NUMA_NO_NODE); if (!card->queue) { dev_err(CARD_TO_DEV(card), "Failed queue alloc\n"); unregister_blkdev(card->major, DRIVER_NAME); @@ -269,7 +269,6 @@ int rsxx_setup_dev(struct rsxx_cardinfo *card) blk_queue_logical_block_size(card->queue, blk_size); } - blk_queue_make_request(card->queue, rsxx_make_request); blk_queue_max_hw_sectors(card->queue, blkdev_max_hw_sectors); blk_queue_physical_block_size(card->queue, RSXX_HW_BLK_SIZE); diff --git a/drivers/block/umem.c b/drivers/block/umem.c index 4eaf97d7a170..d84e8a878df2 100644 --- a/drivers/block/umem.c +++ b/drivers/block/umem.c @@ -885,11 +885,9 @@ static int mm_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) card->biotail = &card->bio; spin_lock_init(&card->lock); - card->queue = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE); + card->queue = blk_alloc_queue(mm_make_request, 
NUMA_NO_NODE); if (!card->queue) goto failed_alloc; - - blk_queue_make_request(card->queue, mm_make_request); card->queue->queuedata = card; tasklet_init(&card->tasklet, process_page, (unsigned long)card); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 76e75097e9ab..ebb234f36909 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1895,7 +1895,7 @@ static int zram_add(void) #ifdef CONFIG_ZRAM_WRITEBACK spin_lock_init(&zram->wb_limit_lock); #endif - queue = blk_alloc_queue(GFP_KERNEL); + queue = blk_alloc_queue(zram_make_request, NUMA_NO_NODE); if (!queue) { pr_err("Error allocating disk queue for device %d\n", device_id); @@ -1903,8 +1903,6 @@ static int zram_add(void) goto out_free_idr; } - blk_queue_make_request(queue, zram_make_request); - /* gendisk structure */ zram->disk = alloc_disk(1); if (!zram->disk) { diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 7543e395a2c6..db38a68abb6c 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -380,12 +380,11 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) goto err_dev; } - tqueue = blk_alloc_queue_node(GFP_KERNEL, dev->q->node); + tqueue = blk_alloc_queue(tt->make_rq, dev->q->node); if (!tqueue) { ret = -ENOMEM; goto err_disk; } - blk_queue_make_request(tqueue, tt->make_rq); strlcpy(tdisk->disk_name, create->tgtname, sizeof(tdisk->disk_name)); tdisk->flags = GENHD_FL_EXT_DEVT; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 5e38a167c85e..d98354fa28e3 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -866,11 +866,10 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, d->disk->fops = &bcache_ops; d->disk->private_data = d; - q = blk_alloc_queue(GFP_KERNEL); + q = blk_alloc_queue(make_request_fn, NUMA_NO_NODE); if (!q) return -ENOMEM; - blk_queue_make_request(q, make_request_fn); d->disk->queue = q; q->queuedata = d; q->backing_dev_info->congested_data = d; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 0d881cfa160b..753302e83910 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1939,16 +1939,15 @@ static struct mapped_device *alloc_dev(int minor) INIT_LIST_HEAD(&md->table_devices); spin_lock_init(&md->uevent_lock); - md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id); - if (!md->queue) - goto bad; - md->queue->queuedata = md; /* * default to bio-based required ->make_request_fn until DM * table is loaded and md->type established. If request-based * table is loaded: blk-mq will override accordingly. 
*/ - blk_queue_make_request(md->queue, dm_make_request); + md->queue = blk_alloc_queue(dm_make_request, numa_node_id); + if (!md->queue) + goto bad; + md->queue->queuedata = md; md->disk = alloc_disk_node(1, md->numa_node_id); if (!md->disk) diff --git a/drivers/md/md.c b/drivers/md/md.c index f6cf3b53f6c1..cd1210a0d957 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5623,12 +5623,11 @@ static int md_alloc(dev_t dev, char *name) mddev->hold_active = UNTIL_STOP; error = -ENOMEM; - mddev->queue = blk_alloc_queue(GFP_KERNEL); + mddev->queue = blk_alloc_queue(md_make_request, NUMA_NO_NODE); if (!mddev->queue) goto abort; mddev->queue->queuedata = mddev; - blk_queue_make_request(mddev->queue, md_make_request); blk_set_stacking_limits(&mddev->queue->limits); disk = alloc_disk(1 << shift); diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c index 677d6f45b5c4..43751fab9d36 100644 --- a/drivers/nvdimm/blk.c +++ b/drivers/nvdimm/blk.c @@ -249,13 +249,12 @@ static int nsblk_attach_disk(struct nd_namespace_blk *nsblk) internal_nlba = div_u64(nsblk->size, nsblk_internal_lbasize(nsblk)); available_disk_size = internal_nlba * nsblk_sector_size(nsblk); - q = blk_alloc_queue(GFP_KERNEL); + q = blk_alloc_queue(nd_blk_make_request, NUMA_NO_NODE); if (!q) return -ENOMEM; if (devm_add_action_or_reset(dev, nd_blk_release_queue, q)) return -ENOMEM; - blk_queue_make_request(q, nd_blk_make_request); blk_queue_max_hw_sectors(q, UINT_MAX); blk_queue_logical_block_size(q, nsblk_sector_size(nsblk)); blk_queue_flag_set(QUEUE_FLAG_NONROT, q); diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index 0d04ea3d9fd7..3b09419218d6 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -1521,7 +1521,7 @@ static int btt_blk_init(struct btt *btt) struct nd_namespace_common *ndns = nd_btt->ndns; /* create a new disk and request queue for btt */ - btt->btt_queue = blk_alloc_queue(GFP_KERNEL); + btt->btt_queue = blk_alloc_queue(btt_make_request, NUMA_NO_NODE); if (!btt->btt_queue) return -ENOMEM; @@ -1540,7 +1540,6 @@ static int btt_blk_init(struct btt *btt) btt->btt_disk->queue->backing_dev_info->capabilities |= BDI_CAP_SYNCHRONOUS_IO; - blk_queue_make_request(btt->btt_queue, btt_make_request); blk_queue_logical_block_size(btt->btt_queue, btt->sector_size); blk_queue_max_hw_sectors(btt->btt_queue, UINT_MAX); blk_queue_flag_set(QUEUE_FLAG_NONROT, btt->btt_queue); diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 4eae441f86c9..4ffc6f7ca131 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -395,7 +395,7 @@ static int pmem_attach_disk(struct device *dev, return -EBUSY; } - q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev)); + q = blk_alloc_queue(pmem_make_request, dev_to_node(dev)); if (!q) return -ENOMEM; @@ -433,7 +433,6 @@ static int pmem_attach_disk(struct device *dev, pmem->virt_addr = addr; blk_queue_write_cache(q, true, fua); - blk_queue_make_request(q, pmem_make_request); blk_queue_physical_block_size(q, PAGE_SIZE); blk_queue_logical_block_size(q, pmem_sector_size(ndns)); blk_queue_max_hw_sectors(q, UINT_MAX); diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index a11900cf3a36..a38d7f196aba 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -377,11 +377,10 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath) return 0; - q = blk_alloc_queue_node(GFP_KERNEL, ctrl->numa_node); + q = blk_alloc_queue(nvme_ns_head_make_request, 
ctrl->numa_node); if (!q) goto out; q->queuedata = head; - blk_queue_make_request(q, nvme_ns_head_make_request); blk_queue_flag_set(QUEUE_FLAG_NONROT, q); /* set to a default value for 512 until disk is validated */ blk_queue_logical_block_size(q, 512); diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 63502ca537eb..80d22290f268 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -636,10 +636,10 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char } dev_info->gd->major = dcssblk_major; dev_info->gd->fops = &dcssblk_devops; - dev_info->dcssblk_queue = blk_alloc_queue(GFP_KERNEL); + dev_info->dcssblk_queue = + blk_alloc_queue(dcssblk_make_request, NUMA_NO_NODE); dev_info->gd->queue = dev_info->dcssblk_queue; dev_info->gd->private_data = dev_info; - blk_queue_make_request(dev_info->dcssblk_queue, dcssblk_make_request); blk_queue_logical_block_size(dev_info->dcssblk_queue, 4096); blk_queue_flag_set(QUEUE_FLAG_DAX, dev_info->dcssblk_queue); diff --git a/drivers/s390/block/xpram.c b/drivers/s390/block/xpram.c index 3df5d68d09f0..45a04daec89e 100644 --- a/drivers/s390/block/xpram.c +++ b/drivers/s390/block/xpram.c @@ -343,14 +343,14 @@ static int __init xpram_setup_blkdev(void) xpram_disks[i] = alloc_disk(1); if (!xpram_disks[i]) goto out; - xpram_queues[i] = blk_alloc_queue(GFP_KERNEL); + xpram_queues[i] = blk_alloc_queue(xpram_make_request, + NUMA_NO_NODE); if (!xpram_queues[i]) { put_disk(xpram_disks[i]); goto out; } blk_queue_flag_set(QUEUE_FLAG_NONROT, xpram_queues[i]); blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, xpram_queues[i]); - blk_queue_make_request(xpram_queues[i], xpram_make_request); blk_queue_logical_block_size(xpram_queues[i], 4096); } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e8defd718d62..3f27ff08483e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1063,7 +1063,6 @@ extern void blk_abort_request(struct request *); * Access functions for manipulating queue properties */ extern void blk_cleanup_queue(struct request_queue *); -extern void blk_queue_make_request(struct request_queue *, make_request_fn *); extern void blk_queue_bounce_limit(struct request_queue *, u64); extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int); extern void blk_queue_chunk_sectors(struct request_queue *, unsigned int); @@ -1140,8 +1139,7 @@ extern void blk_dump_rq_flags(struct request *, char *); extern long nr_blockdev_pages(void); bool __must_check blk_get_queue(struct request_queue *); -struct request_queue *blk_alloc_queue(gfp_t); -struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id); +struct request_queue *blk_alloc_queue(make_request_fn make_request, int node_id); extern void blk_put_queue(struct request_queue *); extern void blk_set_queue_dying(struct request_queue *); -- cgit v1.2.3-70-g09d2 From c95b708d5fa65b4e51f088ee077d127fd5a57b70 Mon Sep 17 00:00:00 2001 From: Nick Bowler Date: Sat, 28 Mar 2020 01:09:09 -0400 Subject: nvme: fix compat address handling in several ioctls On a 32-bit kernel, the upper bits of userspace addresses passed via various ioctls are silently ignored by the nvme driver. However on a 64-bit kernel running a compat task, these upper bits are not ignored and are in fact required to be zero for the ioctls to work. Unfortunately, this difference matters. 
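To see why the upper bits end up set at all, here is a minimal userspace sketch (an illustration, not part of the patch): storing a 32-bit pointer into a 64-bit field through a signed intermediate sign-extends any address above 2G.

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
    	/* a 32-bit userspace address above 2G, as smartctl would use */
    	uint32_t addr32 = 0x80001000u;

    	/* sign extension fills the upper 32 bits with ones... */
    	uint64_t sign_extended = (uint64_t)(int64_t)(int32_t)addr32;
    	/* ...zero extension (what casting through compat_uptr_t achieves)
    	 * keeps them clear */
    	uint64_t zero_extended = (uint64_t)addr32;

    	printf("sign-extended: 0x%016llx\n", (unsigned long long)sign_extended);
    	printf("zero-extended: 0x%016llx\n", (unsigned long long)zero_extended);
    	return 0;
    }

A 32-bit kernel truncates the value back down when dereferencing it, which is why the same binary works there.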
32-bit smartctl submits the NVME_IOCTL_ADMIN_CMD ioctl with garbage in these upper bits because it seems the pointer value it puts into the nvme_passthru_cmd structure is sign extended. This works fine on 32-bit kernels but fails on a 64-bit one because (at least on my setup) the addresses smartctl uses are consistently above 2G. For example: # smartctl -x /dev/nvme0n1 smartctl 7.1 2019-12-30 r5022 [x86_64-linux-5.5.11] (local build) Copyright (C) 2002-19, Bruce Allen, Christian Franke, www.smartmontools.org Read NVMe Identify Controller failed: NVME_IOCTL_ADMIN_CMD: Bad address Since changing 32-bit kernels to actually check all of the submitted address bits now would break existing userspace, this patch fixes the compat problem by explicitly zeroing the upper bits in the compat case. This enables 32-bit smartctl to work on a 64-bit kernel. Signed-off-by: Nick Bowler Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 4f907e3beda1..2db8563aeb2d 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -6,6 +6,7 @@ #include <linux/blkdev.h> #include <linux/blk-mq.h> +#include <linux/compat.h> #include <linux/delay.h> #include <linux/errno.h> #include <linux/hdparm.h> @@ -1252,6 +1253,18 @@ static void nvme_enable_aen(struct nvme_ctrl *ctrl) queue_work(nvme_wq, &ctrl->async_event_work); } +/* + * Convert integer values from ioctl structures to user pointers, silently + * ignoring the upper bits in the compat case to match behaviour of 32-bit + * kernels. + */ +static void __user *nvme_to_user_ptr(uintptr_t ptrval) +{ + if (in_compat_syscall()) + ptrval = (compat_uptr_t)ptrval; + return (void __user *)ptrval; +} + static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) { struct nvme_user_io io; @@ -1275,7 +1288,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) length = (io.nblocks + 1) << ns->lba_shift; meta_len = (io.nblocks + 1) * ns->ms; - metadata = (void __user *)(uintptr_t)io.metadata; + metadata = nvme_to_user_ptr(io.metadata); if (ns->ext) { length += meta_len; @@ -1298,7 +1311,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) c.rw.appmask = cpu_to_le16(io.appmask); return nvme_submit_user_cmd(ns->queue, &c, - (void __user *)(uintptr_t)io.addr, length, + nvme_to_user_ptr(io.addr), length, metadata, meta_len, lower_32_bits(io.slba), NULL, 0); } @@ -1418,9 +1431,9 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, effects = nvme_passthru_start(ctrl, ns, cmd.opcode); status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, - (void __user *)(uintptr_t)cmd.addr, cmd.data_len, - (void __user *)(uintptr_t)cmd.metadata, - cmd.metadata_len, 0, &result, timeout); + nvme_to_user_ptr(cmd.addr), cmd.data_len, + nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, + 0, &result, timeout); nvme_passthru_end(ctrl, effects); if (status >= 0) { @@ -1465,8 +1478,8 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, effects = nvme_passthru_start(ctrl, ns, cmd.opcode); status = nvme_submit_user_cmd(ns ?
ns->queue : ctrl->admin_q, &c, - (void __user *)(uintptr_t)cmd.addr, cmd.data_len, - (void __user *)(uintptr_t)cmd.metadata, cmd.metadata_len, + nvme_to_user_ptr(cmd.addr), cmd.data_len, + nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, 0, &cmd.result, timeout); nvme_passthru_end(ctrl, effects); -- cgit v1.2.3-70-g09d2 From 38803fcffb5baf40cd403c1bd980f22308aefee8 Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 18 Mar 2020 14:41:12 -0700 Subject: nvme-fcloop: fix deallocation of working context There's been a longstanding bug where LS completions freed ls ops, particularly the disconnect LS, while executing on a work context that lives in the memory being freed. Not a good thing to do. Rework LS handling to make callbacks in the rport context rather than the ls_request context. Signed-off-by: James Smart Reviewed-by: Himanshu Madhani Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fcloop.c | 76 ++++++++++++++++++++++++++++++-------------- 1 file changed, 52 insertions(+), 24 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index 1c50af6219f3..9861fcea39f6 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -198,10 +198,13 @@ struct fcloop_lport_priv { }; struct fcloop_rport { - struct nvme_fc_remote_port *remoteport; - struct nvmet_fc_target_port *targetport; - struct fcloop_nport *nport; - struct fcloop_lport *lport; + struct nvme_fc_remote_port *remoteport; + struct nvmet_fc_target_port *targetport; + struct fcloop_nport *nport; + struct fcloop_lport *lport; + spinlock_t lock; + struct list_head ls_list; + struct work_struct ls_work; }; struct fcloop_tport { @@ -224,11 +227,10 @@ struct fcloop_nport { }; struct fcloop_lsreq { - struct fcloop_tport *tport; struct nvmefc_ls_req *lsreq; - struct work_struct work; struct nvmefc_tgt_ls_req tgt_ls_req; int status; + struct list_head ls_list; /* fcloop_rport->ls_list */ }; struct fcloop_rscn { @@ -292,21 +294,32 @@ fcloop_delete_queue(struct nvme_fc_local_port *localport, { } - -/* - * Transmit of LS RSP done (e.g. buffers all set). call back up - * initiator "done" flows. - */ static void -fcloop_tgt_lsrqst_done_work(struct work_struct *work) +fcloop_rport_lsrqst_work(struct work_struct *work) { - struct fcloop_lsreq *tls_req = - container_of(work, struct fcloop_lsreq, work); - struct fcloop_tport *tport = tls_req->tport; - struct nvmefc_ls_req *lsreq = tls_req->lsreq; + struct fcloop_rport *rport = + container_of(work, struct fcloop_rport, ls_work); + struct fcloop_lsreq *tls_req; - if (!tport || tport->remoteport) - lsreq->done(lsreq, tls_req->status); + spin_lock(&rport->lock); + for (;;) { + tls_req = list_first_entry_or_null(&rport->ls_list, + struct fcloop_lsreq, ls_list); + if (!tls_req) + break; + + list_del(&tls_req->ls_list); + spin_unlock(&rport->lock); + + tls_req->lsreq->done(tls_req->lsreq, tls_req->status); + /* + * callee may free memory containing tls_req. + * do not reference lsreq after this.
+ */ + + spin_lock(&rport->lock); + } + spin_unlock(&rport->lock); } static int @@ -319,17 +332,18 @@ fcloop_ls_req(struct nvme_fc_local_port *localport, int ret = 0; tls_req->lsreq = lsreq; - INIT_WORK(&tls_req->work, fcloop_tgt_lsrqst_done_work); + INIT_LIST_HEAD(&tls_req->ls_list); if (!rport->targetport) { tls_req->status = -ECONNREFUSED; - tls_req->tport = NULL; - schedule_work(&tls_req->work); + spin_lock(&rport->lock); + list_add_tail(&rport->ls_list, &tls_req->ls_list); + spin_unlock(&rport->lock); + schedule_work(&rport->ls_work); return ret; } tls_req->status = 0; - tls_req->tport = rport->targetport->private; ret = nvmet_fc_rcv_ls_req(rport->targetport, &tls_req->tgt_ls_req, lsreq->rqstaddr, lsreq->rqstlen); @@ -337,18 +351,28 @@ fcloop_ls_req(struct nvme_fc_local_port *localport, } static int -fcloop_xmt_ls_rsp(struct nvmet_fc_target_port *tport, +fcloop_xmt_ls_rsp(struct nvmet_fc_target_port *targetport, struct nvmefc_tgt_ls_req *tgt_lsreq) { struct fcloop_lsreq *tls_req = tgt_ls_req_to_lsreq(tgt_lsreq); struct nvmefc_ls_req *lsreq = tls_req->lsreq; + struct fcloop_tport *tport = targetport->private; + struct nvme_fc_remote_port *remoteport = tport->remoteport; + struct fcloop_rport *rport; memcpy(lsreq->rspaddr, tgt_lsreq->rspbuf, ((lsreq->rsplen < tgt_lsreq->rsplen) ? lsreq->rsplen : tgt_lsreq->rsplen)); + tgt_lsreq->done(tgt_lsreq); - schedule_work(&tls_req->work); + if (remoteport) { + rport = remoteport->private; + spin_lock(&rport->lock); + list_add_tail(&rport->ls_list, &tls_req->ls_list); + spin_unlock(&rport->lock); + schedule_work(&rport->ls_work); + } return 0; } @@ -834,6 +858,7 @@ fcloop_remoteport_delete(struct nvme_fc_remote_port *remoteport) { struct fcloop_rport *rport = remoteport->private; + flush_work(&rport->ls_work); fcloop_nport_put(rport->nport); } @@ -1136,6 +1161,9 @@ fcloop_create_remote_port(struct device *dev, struct device_attribute *attr, rport->nport = nport; rport->lport = nport->lport; nport->rport = rport; + spin_lock_init(&rport->lock); + INIT_WORK(&rport->ls_work, fcloop_rport_lsrqst_work); + INIT_LIST_HEAD(&rport->ls_list); return count; } -- cgit v1.2.3-70-g09d2 From a62315b83664dc9a7a6c6170ba2d174bb9fbed3c Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Tue, 31 Mar 2020 15:46:33 +0300 Subject: nvme-rdma: Replace comma with a semicolon Use a semicolon at the end of an assignment expression. 
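The pattern being fixed is easy to miss in review because the comma operator makes it legal C; a standalone sketch (illustration only, not from the patch) shows that the typo compiles and, in this position, even behaves the same:

    #include <stdio.h>

    struct sge { unsigned long addr; unsigned int length; unsigned int lkey; };

    int main(void)
    {
    	struct sge sge;

    	sge.addr = 0x1000,	/* comma instead of semicolon: compiles fine */
    	sge.length = 64;
    	sge.lkey = 7;

    	/* behaviour is identical here, which is why it went unnoticed */
    	printf("%#lx %u %u\n", sge.addr, sge.length, sge.lkey);
    	return 0;
    }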
Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 86603d9b0cef..f704e7227d05 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1350,7 +1350,7 @@ static int nvme_rdma_post_send(struct nvme_rdma_queue *queue, int ret; sge->addr = qe->dma; - sge->length = sizeof(struct nvme_command), + sge->length = sizeof(struct nvme_command); sge->lkey = queue->device->pd->local_dma_lkey; wr.next = NULL; -- cgit v1.2.3-70-g09d2 From d038dd815fc56cd77ae8a51bb6d1d11e3aab9609 Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 18 Mar 2020 14:40:43 -0700 Subject: nvmet-fc: fix typo in comment Fix typo in comment: about should be abort Signed-off-by: James Smart Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Reviewed-by: Himanshu Madhani Reviewed-by: Hannes Reinecke --- drivers/nvme/target/fc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index a0db6371b43e..a8ceb7721640 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -684,7 +684,7 @@ nvmet_fc_delete_target_queue(struct nvmet_fc_tgt_queue *queue) disconnect = atomic_xchg(&queue->connected, 0); spin_lock_irqsave(&queue->qlock, flags); - /* about outstanding io's */ + /* abort outstanding io's */ for (i = 0; i < queue->sqsize; fod++, i++) { if (fod->active) { spin_lock(&fod->flock); -- cgit v1.2.3-70-g09d2 From 25e5cb780e62bde432b401f312bb847edc78b432 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 23 Mar 2020 15:06:30 -0700 Subject: nvme-tcp: fix possible crash in write_zeroes processing We cannot look at blk_rq_payload_bytes without first checking that the request has mappable physical segments (i.e. blk_rq_nr_phys_segments(rq) != 0), and only then take the request payload bytes. This caused us to send a wrong sgl to the target, or even dereference a non-existent buffer in case we actually got to the data send sequence (if it was in-capsule).
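A hedged sketch of the invariant the patch enforces (the helper name is invented for illustration): a command such as Write Zeroes can report payload bytes while carrying no mappable segments, so the segment count must gate any use of the payload size.

    #include <linux/blkdev.h>

    /* not the patch itself; a minimal statement of the rule */
    static unsigned int req_mappable_bytes(struct request *rq)
    {
    	if (!blk_rq_nr_phys_segments(rq))
    		return 0;	/* e.g. Write Zeroes: a length, but no data */
    	return blk_rq_payload_bytes(rq);
    }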
Reported-by: Tony Asleson Suggested-by: Chaitanya Kulkarni Signed-off-by: Sagi Grimberg Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 0ef14f0fad86..aa754ae3ca08 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -174,16 +174,14 @@ static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req) static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req) { struct request *rq; - unsigned int bytes; if (unlikely(nvme_tcp_async_req(req))) return false; /* async events don't have a request */ rq = blk_mq_rq_from_pdu(req); - bytes = blk_rq_payload_bytes(rq); - return rq_data_dir(rq) == WRITE && bytes && - bytes <= nvme_tcp_inline_data_size(req->queue); + return rq_data_dir(rq) == WRITE && req->data_len && + req->data_len <= nvme_tcp_inline_data_size(req->queue); } static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req) @@ -2164,7 +2162,9 @@ static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue, c->common.flags |= NVME_CMD_SGL_METABUF; - if (rq_data_dir(rq) == WRITE && req->data_len && + if (!blk_rq_nr_phys_segments(rq)) + nvme_tcp_set_sg_null(c); + else if (rq_data_dir(rq) == WRITE && req->data_len <= nvme_tcp_inline_data_size(queue)) nvme_tcp_set_sg_inline(queue, c, req->data_len); else @@ -2191,7 +2191,8 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns, req->data_sent = 0; req->pdu_len = 0; req->pdu_sent = 0; - req->data_len = blk_rq_payload_bytes(rq); + req->data_len = blk_rq_nr_phys_segments(rq) ? + blk_rq_payload_bytes(rq) : 0; req->curr_bio = rq->bio; if (rq_data_dir(rq) == WRITE && -- cgit v1.2.3-70-g09d2 From f86e5bf817a57c7e6538dafee2fc65a525bb9935 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 23 Mar 2020 16:43:52 -0700 Subject: nvme-tcp: don't poll a non-live queue In error recovery we might be removing the queue, so check that we can actually poll before we do. Reported-by: Mark Wunderlich Tested-by: Mark Wunderlich Signed-off-by: Sagi Grimberg Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index aa754ae3ca08..eb31c689d2cf 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2299,6 +2299,9 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx) struct nvme_tcp_queue *queue = hctx->driver_data; struct sock *sk = queue->sock->sk; + if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags)) + return 0; + if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue)) sk_busy_loop(sk, true); nvme_tcp_try_recv(queue); -- cgit v1.2.3-70-g09d2 From 39d06079a50fe2a651091b38e311e605de0788cb Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 31 Mar 2020 22:44:23 -0700 Subject: nvme-tcp: fix possible crash in recv error flow If the target misbehaves and sends us unexpected payload we need to make sure to fail the controller and stop processing the input stream. We clear the rd_enabled flag and stop the io_work, but we may still requeue it if we still have pending sends, and then in the next invocation we will process the input stream, as the check is only in the .data_ready upcall. To fix this we need to make sure not to self-requeue io_work upon a recv flow error.
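The shape of the fix, as a hedged sketch with invented names (queue_ctx, try_recv): a work handler that has seen a receive error must return without re-arming itself.

    #include <linux/workqueue.h>

    static void io_work(struct work_struct *w)
    {
    	struct queue_ctx *q = container_of(w, struct queue_ctx, io_work);
    	bool pending = false;
    	int result;

    	result = try_recv(q);		/* assumed recv helper */
    	if (result > 0)
    		pending = true;
    	else if (result < 0)
    		return;			/* error: never self-requeue */

    	if (pending)
    		queue_work(q->wq, &q->io_work);
    }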
This fixes the crash: nvme nvme2: receive failed: -22 BUG: unable to handle page fault for address: ffffbeb5816c3b48 nvme_ns_head_make_request: 29 callbacks suppressed block nvme0n5: no usable path - requeuing I/O block nvme0n5: no usable path - requeuing I/O block nvme0n7: no usable path - requeuing I/O block nvme0n7: no usable path - requeuing I/O block nvme0n3: no usable path - requeuing I/O block nvme0n3: no usable path - requeuing I/O block nvme0n3: no usable path - requeuing I/O block nvme0n7: no usable path - requeuing I/O block nvme0n3: no usable path - requeuing I/O block nvme0n3: no usable path - requeuing I/O #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 1039157067 P4D 1039157067 PUD 103915a067 PMD 102719f067 PTE 0 Oops: 0000 [#1] SMP PTI CPU: 8 PID: 411 Comm: kworker/8:1H Not tainted 5.3.0-40-generic #32~18.04.1-Ubuntu Hardware name: Supermicro Super Server/X10SRi-F, BIOS 2.0 12/17/2015 Workqueue: nvme_tcp_wq nvme_tcp_io_work [nvme_tcp] RIP: 0010:nvme_tcp_recv_skb+0x2ae/0xb50 [nvme_tcp] RSP: 0018:ffffbeb5806cfd10 EFLAGS: 00010246 RAX: ffffbeb5816c3b48 RBX: 00000000000003d0 RCX: 0000000000000008 RDX: 00000000000003d0 RSI: 0000000000000001 RDI: ffff9a3040684b40 RBP: ffffbeb5806cfd90 R08: 0000000000000000 R09: ffffffff946e6900 R10: ffffbeb5806cfce0 R11: 0000000000000001 R12: 0000000000000000 R13: ffff9a2ff86501c0 R14: 00000000000003d0 R15: ffff9a30b85f2798 FS: 0000000000000000(0000) GS:ffff9a30bf800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffbeb5816c3b48 CR3: 000000088400a006 CR4: 00000000003626e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: tcp_read_sock+0x8c/0x290 ? __release_sock+0x9d/0xe0 ? nvme_tcp_write_space+0xb0/0xb0 [nvme_tcp] nvme_tcp_io_work+0x4b4/0x830 [nvme_tcp] ? finish_task_switch+0x163/0x270 process_one_work+0x1fd/0x3f0 worker_thread+0x34/0x410 kthread+0x121/0x140 ? process_one_work+0x3f0/0x3f0 ? kthread_park+0xb0/0xb0 ret_from_fork+0x35/0x40 Reported-by: Roy Shterman Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index eb31c689d2cf..c15a92163c1f 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1073,7 +1073,7 @@ static void nvme_tcp_io_work(struct work_struct *w) if (result > 0) pending = true; else if (unlikely(result < 0)) - break; + return; if (!pending) return; -- cgit v1.2.3-70-g09d2 From 74e4d20e2f43cf09a35543d960ac8f7a1ffcbbb5 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 1 Apr 2020 22:44:43 -0700 Subject: nvme: inherit stable pages constraint in the mpath stack device If the backing device requires stable pages, we need to set it on the stack mpath device as well. This applies to rdma/fc transports when doing data integrity and the tcp transport when calculating digests.
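Why digests need stable pages can be shown with a toy userspace model (illustration only, not driver code): if the page changes after the digest is computed but before transmission, the receiver sees a mismatch.

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* toy checksum standing in for a TCP data digest */
    static uint32_t toy_sum(const uint8_t *p, size_t len)
    {
    	uint32_t s = 0;
    	while (len--)
    		s = s * 31 + *p++;
    	return s;
    }

    int main(void)
    {
    	uint8_t page[16] = "nvme payload...";
    	uint32_t digest = toy_sum(page, sizeof(page));

    	page[0] = 'X';	/* writer dirties the page while it is in flight */

    	printf("digest still valid: %s\n",
    	       digest == toy_sum(page, sizeof(page)) ? "yes" : "no");
    	return 0;
    }

Marking the stacked queue BDI_CAP_STABLE_WRITES, as the hunk below does, prevents exactly this rewrite-in-flight.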
Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 2db8563aeb2d..91c1bd659947 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1897,6 +1897,13 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) if (ns->head->disk) { nvme_update_disk_info(ns->head->disk, ns, id); blk_queue_stack_limits(ns->head->disk->queue, ns->queue); + if (bdi_cap_stable_pages_required(ns->queue->backing_dev_info)) { + struct backing_dev_info *info = + ns->head->disk->queue->backing_dev_info; + + info->capabilities |= BDI_CAP_STABLE_WRITES; + } + revalidate_disk(ns->head->disk); } #endif -- cgit v1.2.3-70-g09d2 From f0e656e4f253120eb871a53ffab7664530c1d9f4 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 1 Apr 2020 16:16:27 -0700 Subject: nvmet: fix NULL dereference when removing a referral When item release is called, the parent is already NULL. We need the parent to pass to nvmet_referral_disable, so hook it up to ->disconnect_notify. Reported-by: Tony Asleson Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/configfs.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 7aa10788b7c8..58cabd7b6fc5 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -1098,12 +1098,19 @@ static struct configfs_attribute *nvmet_referral_attrs[] = { NULL, }; -static void nvmet_referral_release(struct config_item *item) +static void nvmet_referral_notify(struct config_group *group, + struct config_item *item) { struct nvmet_port *parent = to_nvmet_port(item->ci_parent->ci_parent); struct nvmet_port *port = to_nvmet_port(item); nvmet_referral_disable(parent, port); +} + +static void nvmet_referral_release(struct config_item *item) +{ + struct nvmet_port *port = to_nvmet_port(item); + kfree(port); } @@ -1134,6 +1141,7 @@ static struct config_group *nvmet_referral_make( static struct configfs_group_operations nvmet_referral_group_ops = { .make_group = nvmet_referral_make, + .disconnect_notify = nvmet_referral_notify, }; static const struct config_item_type nvmet_referrals_type = { -- cgit v1.2.3-70-g09d2 From a032e4f6d60d0aca4f6570d2ad33105a2b9ba385 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 2 Apr 2020 08:48:53 -0700 Subject: nvmet-rdma: fix bonding failover possible NULL deref RDMA_CM_EVENT_ADDR_CHANGE occurs in the case of bonding failover on normal as well as on listening cm_ids. Hence this event will immediately trigger a NULL dereference trying to disconnect a queue for a cm_id that actually belongs to the port.
To fix this we provide a different handler for the listener cm_ids that will defer work to disable+(re)enable the port, which essentially destroys and sets up another listener cm_id. Reported-by: Alex Lyakas Signed-off-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Tested-by: Alex Lyakas Signed-off-by: Christoph Hellwig --- drivers/nvme/target/rdma.c | 175 ++++++++++++++++++++++++++++++--------------- 1 file changed, 119 insertions(+), 56 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 9e1b8c61f54e..f78201421978 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -105,6 +105,13 @@ struct nvmet_rdma_queue { struct list_head queue_list; }; +struct nvmet_rdma_port { + struct nvmet_port *nport; + struct sockaddr_storage addr; + struct rdma_cm_id *cm_id; + struct delayed_work repair_work; +}; + struct nvmet_rdma_device { struct ib_device *device; struct ib_pd *pd; @@ -917,7 +924,8 @@ static void nvmet_rdma_free_dev(struct kref *ref) static struct nvmet_rdma_device * nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id) { - struct nvmet_port *port = cm_id->context; + struct nvmet_rdma_port *port = cm_id->context; + struct nvmet_port *nport = port->nport; struct nvmet_rdma_device *ndev; int inline_page_count; int inline_sge_count; @@ -934,17 +942,17 @@ nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id) if (!ndev) goto out_err; - inline_page_count = num_pages(port->inline_data_size); + inline_page_count = num_pages(nport->inline_data_size); inline_sge_count = max(cm_id->device->attrs.max_sge_rd, cm_id->device->attrs.max_recv_sge) - 1; if (inline_page_count > inline_sge_count) { pr_warn("inline_data_size %d cannot be supported by device %s. Reducing to %lu.\n", - port->inline_data_size, cm_id->device->name, + nport->inline_data_size, cm_id->device->name, inline_sge_count * PAGE_SIZE); - port->inline_data_size = inline_sge_count * PAGE_SIZE; + nport->inline_data_size = inline_sge_count * PAGE_SIZE; inline_page_count = inline_sge_count; } - ndev->inline_data_size = port->inline_data_size; + ndev->inline_data_size = nport->inline_data_size; ndev->inline_page_count = inline_page_count; ndev->device = cm_id->device; kref_init(&ndev->ref); @@ -1272,6 +1280,7 @@ static int nvmet_rdma_cm_accept(struct rdma_cm_id *cm_id, static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id, struct rdma_cm_event *event) { + struct nvmet_rdma_port *port = cm_id->context; struct nvmet_rdma_device *ndev; struct nvmet_rdma_queue *queue; int ret = -EINVAL; @@ -1287,7 +1296,7 @@ static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id, ret = -ENOMEM; goto put_device; } - queue->port = cm_id->context; + queue->port = port->nport; if (queue->host_qid == 0) { /* Let inflight controller teardown complete */ @@ -1412,7 +1421,7 @@ static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id, static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id, struct nvmet_rdma_queue *queue) { - struct nvmet_port *port; + struct nvmet_rdma_port *port; if (queue) { /* @@ -1431,7 +1440,7 @@ static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id, * cm_id destroy. use atomic xchg to make sure * we don't compete with remove_port.
*/ - if (xchg(&port->priv, NULL) != cm_id) + if (xchg(&port->cm_id, NULL) != cm_id) return 0; /* @@ -1462,6 +1471,13 @@ static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id, nvmet_rdma_queue_established(queue); break; case RDMA_CM_EVENT_ADDR_CHANGE: + if (!queue) { + struct nvmet_rdma_port *port = cm_id->context; + + schedule_delayed_work(&port->repair_work, 0); + break; + } + /* FALLTHROUGH */ case RDMA_CM_EVENT_DISCONNECTED: case RDMA_CM_EVENT_TIMEWAIT_EXIT: nvmet_rdma_queue_disconnect(queue); @@ -1504,42 +1520,19 @@ restart: mutex_unlock(&nvmet_rdma_queue_mutex); } -static int nvmet_rdma_add_port(struct nvmet_port *port) +static void nvmet_rdma_disable_port(struct nvmet_rdma_port *port) { - struct rdma_cm_id *cm_id; - struct sockaddr_storage addr = { }; - __kernel_sa_family_t af; - int ret; - - switch (port->disc_addr.adrfam) { - case NVMF_ADDR_FAMILY_IP4: - af = AF_INET; - break; - case NVMF_ADDR_FAMILY_IP6: - af = AF_INET6; - break; - default: - pr_err("address family %d not supported\n", - port->disc_addr.adrfam); - return -EINVAL; - } + struct rdma_cm_id *cm_id = xchg(&port->cm_id, NULL); - if (port->inline_data_size < 0) { - port->inline_data_size = NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE; - } else if (port->inline_data_size > NVMET_RDMA_MAX_INLINE_DATA_SIZE) { - pr_warn("inline_data_size %u is too large, reducing to %u\n", - port->inline_data_size, - NVMET_RDMA_MAX_INLINE_DATA_SIZE); - port->inline_data_size = NVMET_RDMA_MAX_INLINE_DATA_SIZE; - } + if (cm_id) + rdma_destroy_id(cm_id); +} - ret = inet_pton_with_scope(&init_net, af, port->disc_addr.traddr, - port->disc_addr.trsvcid, &addr); - if (ret) { - pr_err("malformed ip/port passed: %s:%s\n", - port->disc_addr.traddr, port->disc_addr.trsvcid); - return ret; - } +static int nvmet_rdma_enable_port(struct nvmet_rdma_port *port) +{ + struct sockaddr *addr = (struct sockaddr *)&port->addr; + struct rdma_cm_id *cm_id; + int ret; cm_id = rdma_create_id(&init_net, nvmet_rdma_cm_handler, port, RDMA_PS_TCP, IB_QPT_RC); @@ -1558,23 +1551,19 @@ static int nvmet_rdma_add_port(struct nvmet_port *port) goto out_destroy_id; } - ret = rdma_bind_addr(cm_id, (struct sockaddr *)&addr); + ret = rdma_bind_addr(cm_id, addr); if (ret) { - pr_err("binding CM ID to %pISpcs failed (%d)\n", - (struct sockaddr *)&addr, ret); + pr_err("binding CM ID to %pISpcs failed (%d)\n", addr, ret); goto out_destroy_id; } ret = rdma_listen(cm_id, 128); if (ret) { - pr_err("listening to %pISpcs failed (%d)\n", - (struct sockaddr *)&addr, ret); + pr_err("listening to %pISpcs failed (%d)\n", addr, ret); goto out_destroy_id; } - pr_info("enabling port %d (%pISpcs)\n", - le16_to_cpu(port->disc_addr.portid), (struct sockaddr *)&addr); - port->priv = cm_id; + port->cm_id = cm_id; return 0; out_destroy_id: @@ -1582,18 +1571,92 @@ out_destroy_id: return ret; } -static void nvmet_rdma_remove_port(struct nvmet_port *port) +static void nvmet_rdma_repair_port_work(struct work_struct *w) { - struct rdma_cm_id *cm_id = xchg(&port->priv, NULL); + struct nvmet_rdma_port *port = container_of(to_delayed_work(w), + struct nvmet_rdma_port, repair_work); + int ret; - if (cm_id) - rdma_destroy_id(cm_id); + nvmet_rdma_disable_port(port); + ret = nvmet_rdma_enable_port(port); + if (ret) + schedule_delayed_work(&port->repair_work, 5 * HZ); +} + +static int nvmet_rdma_add_port(struct nvmet_port *nport) +{ + struct nvmet_rdma_port *port; + __kernel_sa_family_t af; + int ret; + + port = kzalloc(sizeof(*port), GFP_KERNEL); + if (!port) + return -ENOMEM; + + nport->priv = port; + port->nport = 
nport; + INIT_DELAYED_WORK(&port->repair_work, nvmet_rdma_repair_port_work); + + switch (nport->disc_addr.adrfam) { + case NVMF_ADDR_FAMILY_IP4: + af = AF_INET; + break; + case NVMF_ADDR_FAMILY_IP6: + af = AF_INET6; + break; + default: + pr_err("address family %d not supported\n", + nport->disc_addr.adrfam); + ret = -EINVAL; + goto out_free_port; + } + + if (nport->inline_data_size < 0) { + nport->inline_data_size = NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE; + } else if (nport->inline_data_size > NVMET_RDMA_MAX_INLINE_DATA_SIZE) { + pr_warn("inline_data_size %u is too large, reducing to %u\n", + nport->inline_data_size, + NVMET_RDMA_MAX_INLINE_DATA_SIZE); + nport->inline_data_size = NVMET_RDMA_MAX_INLINE_DATA_SIZE; + } + + ret = inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr, + nport->disc_addr.trsvcid, &port->addr); + if (ret) { + pr_err("malformed ip/port passed: %s:%s\n", + nport->disc_addr.traddr, nport->disc_addr.trsvcid); + goto out_free_port; + } + + ret = nvmet_rdma_enable_port(port); + if (ret) + goto out_free_port; + + pr_info("enabling port %d (%pISpcs)\n", + le16_to_cpu(nport->disc_addr.portid), + (struct sockaddr *)&port->addr); + + return 0; + +out_free_port: + kfree(port); + return ret; +} + +static void nvmet_rdma_remove_port(struct nvmet_port *nport) +{ + struct nvmet_rdma_port *port = nport->priv; + + cancel_delayed_work_sync(&port->repair_work); + nvmet_rdma_disable_port(port); + kfree(port); } static void nvmet_rdma_disc_port_addr(struct nvmet_req *req, - struct nvmet_port *port, char *traddr) + struct nvmet_port *nport, char *traddr) { - struct rdma_cm_id *cm_id = port->priv; + struct nvmet_rdma_port *port = nport->priv; + struct rdma_cm_id *cm_id = port->cm_id; if (inet_addr_is_any((struct sockaddr *)&cm_id->route.addr.src_addr)) { struct nvmet_rdma_rsp *rsp = @@ -1603,7 +1666,7 @@ static void nvmet_rdma_disc_port_addr(struct nvmet_req *req, sprintf(traddr, "%pISc", addr); } else { - memcpy(traddr, port->disc_addr.traddr, NVMF_TRADDR_SIZE); + memcpy(traddr, nport->disc_addr.traddr, NVMF_TRADDR_SIZE); } } -- cgit v1.2.3-70-g09d2 From 657f1975e9d9c880fa13030e88ba6cc84964f1db Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 2 Apr 2020 09:34:54 -0700 Subject: nvme: fix deadlock caused by ANA update wrong locking The deadlock combines 4 flows in parallel: - ns scanning (triggered from reconnect) - request timeout - ANA update (triggered from reconnect) - I/O coming into the mpath device (1) ns scanning triggers disk revalidation -> update disk info -> freeze queue -> but blocked, due to (2) (2) timeout handler references the q_usage_counter -> but blocks in the transport .timeout() handler, due to (3) (3) the transport timeout handler (indirectly) calls nvme_stop_queue() -> which takes the (down_read) namespaces_rwsem -> but blocks, due to (4) (4) ANA update takes the (down_write) namespaces_rwsem -> calls nvme_mpath_set_live() -> which synchronizes the ns_head srcu (see commit 504db087aacc) -> but blocks, due to (5) (5) I/O came into nvme_mpath_make_request -> took srcu_read_lock -> direct_make_request -> blk_queue_enter -> but blocked, due to (1) ==> the request queue is under freeze -> deadlock. The fix is making ANA update take a read lock as the namespaces list is not manipulated, it is just the ns and ns->head that are being updated (which is protected with the ns->head lock).
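The resulting locking rule, as a minimal sketch (update_one_ns is an assumed helper): walking the namespaces list only needs the rwsem taken shared, because the per-namespace state is protected by the ns->head lock.

    down_read(&ctrl->namespaces_rwsem);
    list_for_each_entry(ns, &ctrl->namespaces, list)
    	update_one_ns(ns);	/* takes ns->head's lock internally */
    up_read(&ctrl->namespaces_rwsem);

    /* down_write() is reserved for paths that add or remove list entries */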
Fixes: 0d0b660f214dc ("nvme: add ANA support") Signed-off-by: Sagi Grimberg Reviewed-by: Keith Busch Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/multipath.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 61bf87592570..54603bd3e02d 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -510,7 +510,7 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl, if (!nr_nsids) return 0; - down_write(&ctrl->namespaces_rwsem); + down_read(&ctrl->namespaces_rwsem); list_for_each_entry(ns, &ctrl->namespaces, list) { unsigned nsid = le32_to_cpu(desc->nsids[n]); @@ -521,7 +521,7 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl, if (++n == nr_nsids) break; } - up_write(&ctrl->namespaces_rwsem); + up_read(&ctrl->namespaces_rwsem); return 0; } -- cgit v1.2.3-70-g09d2 From 8c5c660529209a0e324c1c1a35ce3f83d67a2aa5 Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 3 Apr 2020 07:33:20 -0700 Subject: nvme-fc: Revert "add module to ops template to allow module references" The original patch was to resolve the lldd being able to be unloaded while being used to talk to the boot device of the system. However, the end result of the original patch is that any driver unload while an nvme controller is live via the lldd is now being prohibited. Given the module reference, the module teardown routine can't be called, thus there's no way, other than manual actions, to terminate the controllers. Fixes: 863fbae929c7 ("nvme_fc: add module to ops template to allow module references") Cc: # v5.4+ Signed-off-by: James Smart Reviewed-by: Himanshu Madhani Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 14 ++------------ drivers/nvme/target/fcloop.c | 1 - drivers/scsi/lpfc/lpfc_nvme.c | 2 -- drivers/scsi/qla2xxx/qla_nvme.c | 1 - include/linux/nvme-fc-driver.h | 4 ---- 5 files changed, 2 insertions(+), 20 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index a8bf2fb1287b..7dfc4a2ecf1e 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -342,8 +342,7 @@ nvme_fc_register_localport(struct nvme_fc_port_info *pinfo, !template->ls_req || !template->fcp_io || !template->ls_abort || !template->fcp_abort || !template->max_hw_queues || !template->max_sgl_segments || - !template->max_dif_sgl_segments || !template->dma_boundary || - !template->module) { + !template->max_dif_sgl_segments || !template->dma_boundary) { ret = -EINVAL; goto out_reghost_failed; } @@ -2016,7 +2015,6 @@ nvme_fc_ctrl_free(struct kref *ref) { struct nvme_fc_ctrl *ctrl = container_of(ref, struct nvme_fc_ctrl, ref); - struct nvme_fc_lport *lport = ctrl->lport; unsigned long flags; if (ctrl->ctrl.tagset) { @@ -2043,7 +2041,6 @@ nvme_fc_ctrl_free(struct kref *ref) if (ctrl->ctrl.opts) nvmf_free_options(ctrl->ctrl.opts); kfree(ctrl); - module_put(lport->ops->module); } static void @@ -3074,15 +3071,10 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, goto out_fail; } - if (!try_module_get(lport->ops->module)) { - ret = -EUNATCH; - goto out_free_ctrl; - } - idx = ida_simple_get(&nvme_fc_ctrl_cnt, 0, 0, GFP_KERNEL); if (idx < 0) { ret = -ENOSPC; - goto out_mod_put; + goto out_free_ctrl; } ctrl->ctrl.opts = opts; @@ -3232,8 +3224,6 @@ out_free_queues: out_free_ida: put_device(ctrl->dev); ida_simple_remove(&nvme_fc_ctrl_cnt, ctrl->cnum); -out_mod_put: -
module_put(lport->ops->module); out_free_ctrl: kfree(ctrl); out_fail: diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index 9861fcea39f6..f69ce66e2d44 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -875,7 +875,6 @@ fcloop_targetport_delete(struct nvmet_fc_target_port *targetport) #define FCLOOP_DMABOUND_4G 0xFFFFFFFF static struct nvme_fc_port_template fctemplate = { - .module = THIS_MODULE, .localport_delete = fcloop_localport_delete, .remoteport_delete = fcloop_remoteport_delete, .create_queue = fcloop_create_queue, diff --git a/drivers/scsi/lpfc/lpfc_nvme.c b/drivers/scsi/lpfc/lpfc_nvme.c index f6c8963c915d..db4a04a207ec 100644 --- a/drivers/scsi/lpfc/lpfc_nvme.c +++ b/drivers/scsi/lpfc/lpfc_nvme.c @@ -1985,8 +1985,6 @@ out_unlock: /* Declare and initialization an instance of the FC NVME template. */ static struct nvme_fc_port_template lpfc_nvme_template = { - .module = THIS_MODULE, - /* initiator-based functions */ .localport_delete = lpfc_nvme_localport_delete, .remoteport_delete = lpfc_nvme_remoteport_delete, diff --git a/drivers/scsi/qla2xxx/qla_nvme.c b/drivers/scsi/qla2xxx/qla_nvme.c index bfcd02fdf2b8..941aa53363f5 100644 --- a/drivers/scsi/qla2xxx/qla_nvme.c +++ b/drivers/scsi/qla2xxx/qla_nvme.c @@ -610,7 +610,6 @@ static void qla_nvme_remoteport_delete(struct nvme_fc_remote_port *rport) } static struct nvme_fc_port_template qla_nvme_fc_transport = { - .module = THIS_MODULE, .localport_delete = qla_nvme_localport_delete, .remoteport_delete = qla_nvme_remoteport_delete, .create_queue = qla_nvme_alloc_queue, diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index 6d0d70f3219c..10f81629b9ce 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -270,8 +270,6 @@ struct nvme_fc_remote_port { * * Host/Initiator Transport Entrypoints/Parameters: * - * @module: The LLDD module using the interface - * * @localport_delete: The LLDD initiates deletion of a localport via * nvme_fc_deregister_localport(). However, the teardown is * asynchronous. This routine is called upon the completion of the @@ -385,8 +383,6 @@ struct nvme_fc_remote_port { * Value is Mandatory. Allowed to be zero. */ struct nvme_fc_port_template { - struct module *module; - /* initiator-based functions */ void (*localport_delete)(struct nvme_fc_local_port *); void (*remoteport_delete)(struct nvme_fc_remote_port *); -- cgit v1.2.3-70-g09d2 From 21f9024355e58772ec5d7fc3534aa5e29d72a8b6 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Tue, 7 Apr 2020 11:02:28 +0000 Subject: nvmet-rdma: fix double free of rdma queue In case rdma accept fails at nvmet_rdma_queue_connect(), release work is scheduled. Later on, a new RDMA CM event may arrive since we didn't destroy the cm-id and call nvmet_rdma_queue_connect_fail(), which schedules another release work. This will cause nvmet_rdma_free_queue to be called twice. To fix this we implicitly destroy the cm_id with non-zero ret code, which guarantees that new rdma_cm events will not arrive afterwards. Also add a qp pointer to nvmet_rdma_queue structure, so we can use it when the cm_id pointer is NULL or was destroyed.
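A hedged sketch of the resulting teardown contract (invented struct and function names, not the patch verbatim): the failed-accept path hands cm_id ownership back to the CM layer by NULLing the pointer, and the free path must tolerate that.

    static void queue_teardown(struct queue *q)
    {
    	ib_drain_qp(q->qp);
    	if (q->cm_id)		/* NULL after a failed rdma accept */
    		rdma_destroy_id(q->cm_id);
    	ib_destroy_qp(q->qp);
    	ib_free_cq(q->cq);
    }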
Signed-off-by: Israel Rukshin Suggested-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/rdma.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index f78201421978..ab867f32fb0d 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -78,6 +78,7 @@ enum nvmet_rdma_queue_state { struct nvmet_rdma_queue { struct rdma_cm_id *cm_id; + struct ib_qp *qp; struct nvmet_port *port; struct ib_cq *cq; atomic_t sq_wr_avail; @@ -474,7 +475,7 @@ static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev, if (ndev->srq) ret = ib_post_srq_recv(ndev->srq, &cmd->wr, NULL); else - ret = ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, NULL); + ret = ib_post_recv(cmd->queue->qp, &cmd->wr, NULL); if (unlikely(ret)) pr_err("post_recv cmd failed\n"); @@ -513,7 +514,7 @@ static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp) atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail); if (rsp->n_rdma) { - rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp, + rdma_rw_ctx_destroy(&rsp->rw, queue->qp, queue->cm_id->port_num, rsp->req.sg, rsp->req.sg_cnt, nvmet_data_dir(&rsp->req)); } @@ -597,7 +598,7 @@ static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc) WARN_ON(rsp->n_rdma <= 0); atomic_add(rsp->n_rdma, &queue->sq_wr_avail); - rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp, + rdma_rw_ctx_destroy(&rsp->rw, queue->qp, queue->cm_id->port_num, rsp->req.sg, rsp->req.sg_cnt, nvmet_data_dir(&rsp->req)); rsp->n_rdma = 0; @@ -752,7 +753,7 @@ static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp) } if (nvmet_rdma_need_data_in(rsp)) { - if (rdma_rw_ctx_post(&rsp->rw, queue->cm_id->qp, + if (rdma_rw_ctx_post(&rsp->rw, queue->qp, queue->cm_id->port_num, &rsp->read_cqe, NULL)) nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR); } else { @@ -1038,6 +1039,7 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue) pr_err("failed to create_qp ret= %d\n", ret); goto err_destroy_cq; } + queue->qp = queue->cm_id->qp; atomic_set(&queue->sq_wr_avail, qp_attr.cap.max_send_wr); @@ -1066,11 +1068,10 @@ err_destroy_cq: static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue) { - struct ib_qp *qp = queue->cm_id->qp; - - ib_drain_qp(qp); - rdma_destroy_id(queue->cm_id); - ib_destroy_qp(qp); + ib_drain_qp(queue->qp); + if (queue->cm_id) + rdma_destroy_id(queue->cm_id); + ib_destroy_qp(queue->qp); ib_free_cq(queue->cq); } @@ -1305,9 +1306,12 @@ static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id, ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn); if (ret) { - schedule_work(&queue->release_work); - /* Destroying rdma_cm id is not needed here */ - return 0; + /* + * Don't destroy the cm_id in free path, as we implicitly + * destroy the cm_id here with non-zero ret code. 
+ */ + queue->cm_id = NULL; + goto free_queue; } mutex_lock(&nvmet_rdma_queue_mutex); @@ -1316,6 +1320,8 @@ static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id, return 0; +free_queue: + nvmet_rdma_free_queue(queue); put_device: kref_put(&ndev->ref, nvmet_rdma_free_dev); -- cgit v1.2.3-70-g09d2 From 132be62387c7a72a38872676c18b0dfae264adb8 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Mon, 27 Apr 2020 14:34:41 +0200 Subject: nvme: prevent double free in nvme_alloc_ns() error handling When jumping to the out_put_disk label, we will call put_disk(), which will trigger a call to disk_release(), which calls blk_put_queue(). Later in the cleanup code, we do blk_cleanup_queue(), which will also call blk_put_queue(). Putting the queue twice is incorrect, and will generate a KASAN splat. Set the disk->queue pointer to NULL, before calling put_disk(), so that the first call to blk_put_queue() will not free the queue. The second call to blk_put_queue() uses another pointer to the same queue, so this call will still free the queue. Fixes: 85136c010285 ("lightnvm: simplify geometry enumeration") Signed-off-by: Niklas Cassel Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 91c1bd659947..f2adea96b04c 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3642,6 +3642,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) return; out_put_disk: + /* prevent double queue cleanup */ + ns->disk->queue = NULL; put_disk(ns->disk); out_unlink_ns: mutex_lock(&ctrl->subsys->lock); -- cgit v1.2.3-70-g09d2 From a8de6639169b90e3dc4f27e752a3c5abac5e90da Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 7 May 2020 23:07:04 +0300 Subject: nvme-pci: fix "slimmer CQ head update" Pre-incrementing ->cq_head can't be done in memory because OOB value can be observed by another context. This devalues space savings compared to original code :-\ $ ./scripts/bloat-o-meter ../vmlinux-000 ../obj/vmlinux add/remove: 0/0 grow/shrink: 0/4 up/down: 0/-32 (-32) Function old new delta nvme_poll_irqdisable 464 456 -8 nvme_poll 455 447 -8 nvme_irq 388 380 -8 nvme_dev_disable 955 947 -8 But the code is minimal now: one read for head, one read for q_depth, one increment, one comparison, single instruction phase bit update and one write for new head. 
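The constraint can be restated with a small userspace analogue (illustration only, not the driver code): compute the wrapped index in a local and publish it with one store, so a concurrent reader can never observe head == q_depth.

    #include <stdatomic.h>
    #include <stdint.h>

    struct cq {
    	_Atomic uint16_t head;	/* read locklessly by other contexts */
    	uint16_t depth;
    	uint8_t phase;
    };

    static void cq_advance(struct cq *cq)
    {
    	uint16_t tmp = atomic_load_explicit(&cq->head,
    					    memory_order_relaxed) + 1;

    	if (tmp == cq->depth) {
    		cq->phase ^= 1;
    		tmp = 0;
    	}
    	/* single store: no out-of-bounds value is ever visible */
    	atomic_store_explicit(&cq->head, tmp, memory_order_relaxed);
    }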
From a8de6639169b90e3dc4f27e752a3c5abac5e90da Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan
Date: Thu, 7 May 2020 23:07:04 +0300
Subject: nvme-pci: fix "slimmer CQ head update"

Pre-incrementing ->cq_head can't be done in memory because an
out-of-bounds value can be observed by another context.

This devalues the space savings compared to the original code :-\

	$ ./scripts/bloat-o-meter ../vmlinux-000 ../obj/vmlinux
	add/remove: 0/0 grow/shrink: 0/4 up/down: 0/-32 (-32)
	Function                 old     new   delta
	nvme_poll_irqdisable     464     456      -8
	nvme_poll                455     447      -8
	nvme_irq                 388     380      -8
	nvme_dev_disable         955     947      -8

But the code is minimal now: one read for the head, one read for
q_depth, one increment, one comparison, a single-instruction phase bit
update and one write for the new head.

Signed-off-by: Alexey Dobriyan
Reported-by: John Garry
Tested-by: John Garry
Fixes: e2a366a4b0feaeb ("nvme-pci: slimmer CQ head update")
Signed-off-by: Christoph Hellwig
Signed-off-by: Jens Axboe
---
 drivers/nvme/host/pci.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'drivers/nvme')

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 4e79e412b276..e13c370de830 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -973,9 +973,13 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
 
 static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
 {
-	if (++nvmeq->cq_head == nvmeq->q_depth) {
+	u16 tmp = nvmeq->cq_head + 1;
+
+	if (tmp == nvmeq->q_depth) {
 		nvmeq->cq_head = 0;
 		nvmeq->cq_phase ^= 1;
+	} else {
+		nvmeq->cq_head = tmp;
 	}
 }
-- cgit v1.2.3-70-g09d2
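To see why the local temporary matters, note what the old
++nvmeq->cq_head did: the incremented value was stored to memory before
the wrap test, so another context could momentarily observe
cq_head == q_depth, an index one slot past the end of the ring. The
following single-threaded userspace sketch of the fixed update
(hypothetical struct and names, not the driver's types) shows that the
published head always stays in range:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct cq_state {
	uint16_t cq_head;
	uint16_t q_depth;
	uint8_t  cq_phase;
};

static void update_cq_head(struct cq_state *cq)
{
	uint16_t tmp = cq->cq_head + 1;	/* tentative value, local only */

	if (tmp == cq->q_depth) {
		cq->cq_head = 0;	/* wrap to the start of the ring */
		cq->cq_phase ^= 1;	/* expected phase flips each lap */
	} else {
		cq->cq_head = tmp;	/* single store of an in-range value */
	}
}

int main(void)
{
	struct cq_state cq = { .cq_head = 0, .q_depth = 4, .cq_phase = 1 };

	/* Walk two full laps; the published head never reaches q_depth. */
	for (int i = 0; i < 8; i++) {
		update_cq_head(&cq);
		assert(cq.cq_head < cq.q_depth);
		printf("head=%u phase=%u\n", cq.cq_head, cq.cq_phase);
	}
	return 0;
}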
From 59c7c3caaaf8750df4ec3255082f15eb4e371514 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg
Date: Wed, 6 May 2020 15:44:02 -0700
Subject: nvme: fix possible hang when ns scanning fails during error recovery

When the controller is reconnecting, the host fails I/O and admin
commands as the host cannot reach the controller. ns scanning may
revalidate namespaces during that period and it is wrong to remove
namespaces due to these failures as we may hang (see 205da2434301).

One command that may fail is nvme_identify_ns_descs. Since the ns
identify descriptor list is optional, we return success, continue to
compare ns identifiers in nvme_revalidate_disk, obviously fail, and
return -ENODEV to nvme_validate_ns, which will remove the namespace.
That is exactly what we don't want to happen.

Fixes: 22802bf742c2 ("nvme: Namepace identification descriptor list is optional")
Tested-by: Anton Eidelman
Signed-off-by: Sagi Grimberg
Reviewed-by: Keith Busch
Signed-off-by: Christoph Hellwig
Signed-off-by: Jens Axboe
---
 drivers/nvme/host/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/nvme')

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index f2adea96b04c..f3c037f5a9ba 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1110,7 +1110,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
 		 * Don't treat an error as fatal, as we potentially already
 		 * have a NGUID or EUI-64.
 		 */
-		if (status > 0)
+		if (status > 0 && !(status & NVME_SC_DNR))
 			status = 0;
 		goto free_data;
 	}
-- cgit v1.2.3-70-g09d2

From b69e2ef24b7b4867f80f47e2781e95d0bacd15cb Mon Sep 17 00:00:00 2001
From: Keith Busch
Date: Fri, 8 May 2020 13:04:06 -0700
Subject: nvme-pci: dma read memory barrier for completions

Control dependencies do not guarantee load order across the condition,
allowing a CPU to predict and speculate memory reads.

Commit 324b494c2862 inlined verifying a new completion with its
handling. At least one architecture was observed to access the contents
out of order, resulting in the driver using stale data for the
completion.

Add a dma read barrier before reading the completion queue entry and
after the condition its contents depend on, to ensure the read order is
deterministic.

Reported-by: John Garry
Suggested-by: Will Deacon
Signed-off-by: Keith Busch
Tested-by: John Garry
Acked-by: Will Deacon
Reviewed-by: Sagi Grimberg
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/host/pci.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'drivers/nvme')

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index e13c370de830..3726dc780d15 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -989,6 +989,11 @@ static inline int nvme_process_cq(struct nvme_queue *nvmeq)
 
 	while (nvme_cqe_pending(nvmeq)) {
 		found++;
+		/*
+		 * load-load control dependency between phase and the rest of
+		 * the cqe requires a full read memory barrier
+		 */
+		dma_rmb();
 		nvme_handle_cqe(nvmeq, nvmeq->cq_head);
 		nvme_update_cq_head(nvmeq);
 	}
-- cgit v1.2.3-70-g09d2

From 9210c075cef29c1f764b4252f93105103bdfb292 Mon Sep 17 00:00:00 2001
From: Dongli Zhang
Date: Wed, 27 May 2020 09:13:52 -0700
Subject: nvme-pci: avoid race between nvme_reap_pending_cqes() and nvme_poll()

There may be a race between nvme_reap_pending_cqes() and nvme_poll(),
e.g., when doing live reset while polling the nvme device.

    CPU X                               CPU Y
                                        nvme_poll()
    nvme_dev_disable()
    -> nvme_stop_queues()
    -> nvme_suspend_io_queues()
    -> nvme_suspend_queue()
                                        -> spin_lock(&nvmeq->cq_poll_lock);
    -> nvme_reap_pending_cqes()
    -> nvme_process_cq()                -> nvme_process_cq()

In the above scenario, nvme_process_cq() for the same queue may run on
both CPU X and CPU Y concurrently.

It is much easier to reproduce the issue when CONFIG_PREEMPT is enabled
in the kernel. When CONFIG_PREEMPT is disabled,
nvme_stop_queues()-->blk_mq_quiesce_queue() takes longer to wait out the
grace period.

This patch protects nvme_process_cq() with nvmeq->cq_poll_lock in
nvme_reap_pending_cqes().

Fixes: fa46c6fb5d61 ("nvme/pci: move cqe check after device shutdown")
Signed-off-by: Dongli Zhang
Reviewed-by: Ming Lei
Reviewed-by: Keith Busch
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/host/pci.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'drivers/nvme')

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 3726dc780d15..cc46e250fcac 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1382,16 +1382,19 @@ static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
 
 /*
  * Called only on a device that has been disabled and after all other threads
- * that can check this device's completion queues have synced. This is the
- * last chance for the driver to see a natural completion before
- * nvme_cancel_request() terminates all incomplete requests.
+ * that can check this device's completion queues have synced, except
+ * nvme_poll(). This is the last chance for the driver to see a natural
+ * completion before nvme_cancel_request() terminates all incomplete requests.
  */
 static void nvme_reap_pending_cqes(struct nvme_dev *dev)
 {
 	int i;
 
-	for (i = dev->ctrl.queue_count - 1; i > 0; i--)
+	for (i = dev->ctrl.queue_count - 1; i > 0; i--) {
+		spin_lock(&dev->queues[i].cq_poll_lock);
 		nvme_process_cq(&dev->queues[i]);
+		spin_unlock(&dev->queues[i].cq_poll_lock);
+	}
 }
-- cgit v1.2.3-70-g09d2
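The essence of the fix above is that every path that processes a
completion queue must take the same per-queue lock. The sketch below
models that in userspace with a pthread mutex standing in for the
spinlock; the queue structure and the process_cq() body are
hypothetical, and only the locking discipline mirrors the patch:

#include <pthread.h>
#include <stdio.h>

struct fake_queue {
	pthread_mutex_t cq_poll_lock;
	int cq_head;		/* shared state both paths touch */
	int pending;
};

static void process_cq(struct fake_queue *q)
{
	while (q->pending > 0) {
		q->pending--;
		q->cq_head++;	/* not atomic: needs the lock for safety */
	}
}

static void *poll_path(void *arg)	/* stands in for nvme_poll() */
{
	struct fake_queue *q = arg;

	pthread_mutex_lock(&q->cq_poll_lock);
	process_cq(q);
	pthread_mutex_unlock(&q->cq_poll_lock);
	return NULL;
}

static void *reap_path(void *arg)	/* nvme_reap_pending_cqes() */
{
	struct fake_queue *q = arg;

	pthread_mutex_lock(&q->cq_poll_lock);	/* the fix: same lock here */
	process_cq(q);
	pthread_mutex_unlock(&q->cq_poll_lock);
	return NULL;
}

int main(void)
{
	struct fake_queue q = {
		.cq_poll_lock = PTHREAD_MUTEX_INITIALIZER,
		.pending = 1000,
	};
	pthread_t a, b;

	pthread_create(&a, NULL, poll_path, &q);
	pthread_create(&b, NULL, reap_path, &q);
	pthread_join(a, NULL);
	pthread_join(b, NULL);

	/* With both paths locked, exactly 1000 completions are consumed. */
	printf("cq_head=%d pending=%d\n", q.cq_head, q.pending);
	return 0;
}

Without the lock in reap_path(), both threads could run process_cq()
concurrently and corrupt cq_head and pending, which is the userspace
analogue of the race diagrammed in the changelog. Build with
"cc -pthread" if you want to try it.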
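Looping back to the dma_rmb() patch earlier in this series: the pattern
it enforces (one load decides whether an entry is valid, then a read
barrier, then the loads of the entry body) can be sketched with C11
atomics, where an acquire fence plays the role dma_rmb() plays against
device writes. The slot layout and helper names below are hypothetical,
not the driver's:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct cqe_slot {
	uint32_t result;		/* payload the "device" writes */
	_Atomic uint16_t status;	/* low bit acts as the phase */
};

/* One load of the phase decides whether the entry is new this lap. */
static int cqe_pending(struct cqe_slot *slot, uint16_t phase)
{
	return (atomic_load_explicit(&slot->status,
				     memory_order_relaxed) & 1) == phase;
}

static void consume(struct cqe_slot *slot, uint16_t phase)
{
	if (!cqe_pending(slot, phase))
		return;
	/*
	 * Acquire fence in the dma_rmb() position: without it, a weakly
	 * ordered CPU may speculate the read of slot->result past the
	 * phase check and hand back stale data, as in the changelog.
	 */
	atomic_thread_fence(memory_order_acquire);
	printf("result=%u\n", slot->result);
}

int main(void)
{
	struct cqe_slot slot = { .result = 42 };

	/* "Device" publishes the entry: payload first, then the phase. */
	atomic_store_explicit(&slot.status, 1, memory_order_release);
	consume(&slot, 1);	/* phase 1 marks valid entries on this lap */
	return 0;
}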