author    | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2020-07-21 19:02:33 -0700
committer | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2020-07-21 19:02:33 -0700
commit    | 9b031c86506cef9acae45e61339fcf9deaabb793 (patch)
tree      | 9095d638ba9384f86df8d61dcf1f129c082481e1 /drivers/nvme
parent    | 53aab92dec447f93489e07924e310d605a389dea (diff)
parent    | 04d5ce620f794f1df69b5f1b9ad62910fea547f1 (diff)
Merge branch 'elan-i2c' into next
Bring in an update to the Elan touchpad driver to support newer touchpads
with higher resolution.
Diffstat (limited to 'drivers/nvme')
-rw-r--r-- | drivers/nvme/host/Kconfig      |   2
-rw-r--r-- | drivers/nvme/host/core.c       | 293
-rw-r--r-- | drivers/nvme/host/fabrics.c    |   8
-rw-r--r-- | drivers/nvme/host/fc.c         |  17
-rw-r--r-- | drivers/nvme/host/multipath.c  |  28
-rw-r--r-- | drivers/nvme/host/nvme.h       |   6
-rw-r--r-- | drivers/nvme/host/pci.c        | 103
-rw-r--r-- | drivers/nvme/host/rdma.c       |  19
-rw-r--r-- | drivers/nvme/host/tcp.c        | 136
-rw-r--r-- | drivers/nvme/target/admin-cmd.c|  35
-rw-r--r-- | drivers/nvme/target/configfs.c | 156
-rw-r--r-- | drivers/nvme/target/core.c     |   9
-rw-r--r-- | drivers/nvme/target/fc.c       |   2
-rw-r--r-- | drivers/nvme/target/fcloop.c   |  77
-rw-r--r-- | drivers/nvme/target/loop.c     |   3
-rw-r--r-- | drivers/nvme/target/nvmet.h    |  11
-rw-r--r-- | drivers/nvme/target/rdma.c     | 226
-rw-r--r-- | drivers/nvme/target/tcp.c      |  35
18 files changed, 791 insertions(+), 375 deletions(-)
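For context on the core.c hunks below (nvme_to_user_ptr() and nvme_compat_ioctl()): they exist so that NVME_IOCTL_SUBMIT_IO issued by a 32-bit program works on a 64-bit kernel, since the ioctl command number encodes sizeof(struct nvme_user_io), which differs between the two ABIs, and the addr/metadata fields carry user pointers as integers. The following is an illustrative user-space sketch, not part of this commit; the device path /dev/nvme0n1 and the single-block read are assumptions for the example.

```c
/* Sketch: issuing NVME_IOCTL_SUBMIT_IO against an NVMe namespace device. */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/nvme_ioctl.h>

int main(void)
{
	struct nvme_user_io io;
	void *buf;
	int fd, ret;

	fd = open("/dev/nvme0n1", O_RDONLY);	/* assumed namespace device */
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* One 4 KiB buffer; assumes the logical block size is <= 4096. */
	if (posix_memalign(&buf, 4096, 4096))
		return 1;

	memset(&io, 0, sizeof(io));
	io.opcode  = 0x02;		/* NVMe read opcode */
	io.nblocks = 0;			/* zero-based: one logical block */
	io.slba    = 0;			/* starting LBA */
	io.addr    = (uintptr_t)buf;	/* user pointer passed as an integer,
					 * which is what nvme_to_user_ptr()
					 * converts back in the kernel */

	ret = ioctl(fd, NVME_IOCTL_SUBMIT_IO, &io);
	printf("NVME_IOCTL_SUBMIT_IO returned %d\n", ret);

	free(buf);
	close(fd);
	return ret < 0;
}
```

Built as a 32-bit binary, the same source produces a different command number for NVME_IOCTL_SUBMIT_IO (the structure has no trailing padding on 32-bit), which is exactly the case the new NVME_IOCTL_SUBMIT_IO32 handling in the diff covers.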
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index b9358db83e96..9c17ed32be64 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -32,8 +32,6 @@ config NVME_HWMON a hardware monitoring device will be created for each NVMe drive in the system. - If unsure, say N. - config NVME_FABRICS tristate diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index a4d8c90ee7cc..f3c037f5a9ba 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -6,6 +6,7 @@ #include <linux/blkdev.h> #include <linux/blk-mq.h> +#include <linux/compat.h> #include <linux/delay.h> #include <linux/errno.h> #include <linux/hdreg.h> @@ -171,7 +172,6 @@ static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl) nvme_remove_namespaces(ctrl); ctrl->ops->delete_ctrl(ctrl); nvme_uninit_ctrl(ctrl); - nvme_put_ctrl(ctrl); } static void nvme_delete_ctrl_work(struct work_struct *work) @@ -192,21 +192,16 @@ int nvme_delete_ctrl(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_delete_ctrl); -static int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl) +static void nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl) { - int ret = 0; - /* * Keep a reference until nvme_do_delete_ctrl() complete, * since ->delete_ctrl can free the controller. */ nvme_get_ctrl(ctrl); - if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING)) - ret = -EBUSY; - if (!ret) + if (nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING)) nvme_do_delete_ctrl(ctrl); nvme_put_ctrl(ctrl); - return ret; } static inline bool nvme_ns_has_pi(struct nvme_ns *ns) @@ -291,11 +286,8 @@ void nvme_complete_rq(struct request *req) nvme_req(req)->ctrl->comp_seen = true; if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) { - if ((req->cmd_flags & REQ_NVME_MPATH) && - blk_path_error(status)) { - nvme_failover_req(req); + if ((req->cmd_flags & REQ_NVME_MPATH) && nvme_failover_req(req)) return; - } if (!blk_queue_dying(req->q)) { nvme_retry_req(req); @@ -1055,6 +1047,43 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) return error; } +static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids, + struct nvme_ns_id_desc *cur) +{ + const char *warn_str = "ctrl returned bogus length:"; + void *data = cur; + + switch (cur->nidt) { + case NVME_NIDT_EUI64: + if (cur->nidl != NVME_NIDT_EUI64_LEN) { + dev_warn(ctrl->device, "%s %d for NVME_NIDT_EUI64\n", + warn_str, cur->nidl); + return -1; + } + memcpy(ids->eui64, data + sizeof(*cur), NVME_NIDT_EUI64_LEN); + return NVME_NIDT_EUI64_LEN; + case NVME_NIDT_NGUID: + if (cur->nidl != NVME_NIDT_NGUID_LEN) { + dev_warn(ctrl->device, "%s %d for NVME_NIDT_NGUID\n", + warn_str, cur->nidl); + return -1; + } + memcpy(ids->nguid, data + sizeof(*cur), NVME_NIDT_NGUID_LEN); + return NVME_NIDT_NGUID_LEN; + case NVME_NIDT_UUID: + if (cur->nidl != NVME_NIDT_UUID_LEN) { + dev_warn(ctrl->device, "%s %d for NVME_NIDT_UUID\n", + warn_str, cur->nidl); + return -1; + } + uuid_copy(&ids->uuid, data + sizeof(*cur)); + return NVME_NIDT_UUID_LEN; + default: + /* Skip unknown types */ + return cur->nidl; + } +} + static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, struct nvme_ns_ids *ids) { @@ -1074,8 +1103,17 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, status = nvme_submit_sync_cmd(ctrl->admin_q, &c, data, NVME_IDENTIFY_DATA_SIZE); - if (status) + if (status) { + dev_warn(ctrl->device, + "Identify Descriptors failed (%d)\n", status); + /* + * Don't treat an error as fatal, as we potentially already + * 
have a NGUID or EUI-64. + */ + if (status > 0 && !(status & NVME_SC_DNR)) + status = 0; goto free_data; + } for (pos = 0; pos < NVME_IDENTIFY_DATA_SIZE; pos += len) { struct nvme_ns_id_desc *cur = data + pos; @@ -1083,42 +1121,9 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, if (cur->nidl == 0) break; - switch (cur->nidt) { - case NVME_NIDT_EUI64: - if (cur->nidl != NVME_NIDT_EUI64_LEN) { - dev_warn(ctrl->device, - "ctrl returned bogus length: %d for NVME_NIDT_EUI64\n", - cur->nidl); - goto free_data; - } - len = NVME_NIDT_EUI64_LEN; - memcpy(ids->eui64, data + pos + sizeof(*cur), len); - break; - case NVME_NIDT_NGUID: - if (cur->nidl != NVME_NIDT_NGUID_LEN) { - dev_warn(ctrl->device, - "ctrl returned bogus length: %d for NVME_NIDT_NGUID\n", - cur->nidl); - goto free_data; - } - len = NVME_NIDT_NGUID_LEN; - memcpy(ids->nguid, data + pos + sizeof(*cur), len); - break; - case NVME_NIDT_UUID: - if (cur->nidl != NVME_NIDT_UUID_LEN) { - dev_warn(ctrl->device, - "ctrl returned bogus length: %d for NVME_NIDT_UUID\n", - cur->nidl); - goto free_data; - } - len = NVME_NIDT_UUID_LEN; - uuid_copy(&ids->uuid, data + pos + sizeof(*cur)); - break; - default: - /* Skip unknown types */ - len = cur->nidl; - break; - } + len = nvme_process_ns_desc(ctrl, ids, cur); + if (len < 0) + goto free_data; len += sizeof(*cur); } @@ -1248,6 +1253,18 @@ static void nvme_enable_aen(struct nvme_ctrl *ctrl) queue_work(nvme_wq, &ctrl->async_event_work); } +/* + * Convert integer values from ioctl structures to user pointers, silently + * ignoring the upper bits in the compat case to match behaviour of 32-bit + * kernels. + */ +static void __user *nvme_to_user_ptr(uintptr_t ptrval) +{ + if (in_compat_syscall()) + ptrval = (compat_uptr_t)ptrval; + return (void __user *)ptrval; +} + static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) { struct nvme_user_io io; @@ -1271,7 +1288,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) length = (io.nblocks + 1) << ns->lba_shift; meta_len = (io.nblocks + 1) * ns->ms; - metadata = (void __user *)(uintptr_t)io.metadata; + metadata = nvme_to_user_ptr(io.metadata); if (ns->ext) { length += meta_len; @@ -1294,7 +1311,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) c.rw.appmask = cpu_to_le16(io.appmask); return nvme_submit_user_cmd(ns->queue, &c, - (void __user *)(uintptr_t)io.addr, length, + nvme_to_user_ptr(io.addr), length, metadata, meta_len, lower_32_bits(io.slba), NULL, 0); } @@ -1414,9 +1431,9 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, effects = nvme_passthru_start(ctrl, ns, cmd.opcode); status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, - (void __user *)(uintptr_t)cmd.addr, cmd.data_len, - (void __user *)(uintptr_t)cmd.metadata, - cmd.metadata_len, 0, &result, timeout); + nvme_to_user_ptr(cmd.addr), cmd.data_len, + nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, + 0, &result, timeout); nvme_passthru_end(ctrl, effects); if (status >= 0) { @@ -1461,8 +1478,8 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, effects = nvme_passthru_start(ctrl, ns, cmd.opcode); status = nvme_submit_user_cmd(ns ? 
ns->queue : ctrl->admin_q, &c, - (void __user *)(uintptr_t)cmd.addr, cmd.data_len, - (void __user *)(uintptr_t)cmd.metadata, cmd.metadata_len, + nvme_to_user_ptr(cmd.addr), cmd.data_len, + nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, 0, &cmd.result, timeout); nvme_passthru_end(ctrl, effects); @@ -1584,6 +1601,47 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, return ret; } +#ifdef CONFIG_COMPAT +struct nvme_user_io32 { + __u8 opcode; + __u8 flags; + __u16 control; + __u16 nblocks; + __u16 rsvd; + __u64 metadata; + __u64 addr; + __u64 slba; + __u32 dsmgmt; + __u32 reftag; + __u16 apptag; + __u16 appmask; +} __attribute__((__packed__)); + +#define NVME_IOCTL_SUBMIT_IO32 _IOW('N', 0x42, struct nvme_user_io32) + +static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + /* + * Corresponds to the difference of NVME_IOCTL_SUBMIT_IO + * between 32 bit programs and 64 bit kernel. + * The cause is that the results of sizeof(struct nvme_user_io), + * which is used to define NVME_IOCTL_SUBMIT_IO, + * are not same between 32 bit compiler and 64 bit compiler. + * NVME_IOCTL_SUBMIT_IO32 is for 64 bit kernel handling + * NVME_IOCTL_SUBMIT_IO issued from 32 bit programs. + * Other IOCTL numbers are same between 32 bit and 64 bit. + * So there is nothing to do regarding to other IOCTL numbers. + */ + if (cmd == NVME_IOCTL_SUBMIT_IO32) + return nvme_ioctl(bdev, mode, NVME_IOCTL_SUBMIT_IO, arg); + + return nvme_ioctl(bdev, mode, cmd, arg); +} +#else +#define nvme_compat_ioctl NULL +#endif /* CONFIG_COMPAT */ + static int nvme_open(struct block_device *bdev, fmode_t mode) { struct nvme_ns *ns = bdev->bd_disk->private_data; @@ -1721,26 +1779,15 @@ static void nvme_config_write_zeroes(struct gendisk *disk, struct nvme_ns *ns) static int nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, struct nvme_id_ns *id, struct nvme_ns_ids *ids) { - int ret = 0; - memset(ids, 0, sizeof(*ids)); if (ctrl->vs >= NVME_VS(1, 1, 0)) memcpy(ids->eui64, id->eui64, sizeof(id->eui64)); if (ctrl->vs >= NVME_VS(1, 2, 0)) memcpy(ids->nguid, id->nguid, sizeof(id->nguid)); - if (ctrl->vs >= NVME_VS(1, 3, 0)) { - /* Don't treat error as fatal we potentially - * already have a NGUID or EUI-64 - */ - ret = nvme_identify_ns_descs(ctrl, nsid, ids); - if (ret) - dev_warn(ctrl->device, - "Identify Descriptors failed (%d)\n", ret); - if (ret > 0) - ret = 0; - } - return ret; + if (ctrl->vs >= NVME_VS(1, 3, 0)) + return nvme_identify_ns_descs(ctrl, nsid, ids); + return 0; } static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids) @@ -1810,7 +1857,7 @@ static void nvme_update_disk_info(struct gendisk *disk, ns->lba_shift > PAGE_SHIFT) capacity = 0; - set_capacity(disk, capacity); + set_capacity_revalidate_and_notify(disk, capacity, false); nvme_config_discard(disk, ns); nvme_config_write_zeroes(disk, ns); @@ -1850,6 +1897,13 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) if (ns->head->disk) { nvme_update_disk_info(ns->head->disk, ns, id); blk_queue_stack_limits(ns->head->disk->queue, ns->queue); + if (bdi_cap_stable_pages_required(ns->queue->backing_dev_info)) { + struct backing_dev_info *info = + ns->head->disk->queue->backing_dev_info; + + info->capabilities |= BDI_CAP_STABLE_WRITES; + } + revalidate_disk(ns->head->disk); } #endif @@ -2027,7 +2081,7 @@ EXPORT_SYMBOL_GPL(nvme_sec_submit); static const struct block_device_operations nvme_fops = { .owner = THIS_MODULE, .ioctl = nvme_ioctl, - .compat_ioctl = nvme_ioctl, + 
.compat_ioctl = nvme_compat_ioctl, .open = nvme_open, .release = nvme_release, .getgeo = nvme_getgeo, @@ -2055,7 +2109,7 @@ const struct block_device_operations nvme_ns_head_ops = { .open = nvme_ns_head_open, .release = nvme_ns_head_release, .ioctl = nvme_ioctl, - .compat_ioctl = nvme_ioctl, + .compat_ioctl = nvme_compat_ioctl, .getgeo = nvme_getgeo, .pr_ops = &nvme_pr_ops, }; @@ -2074,13 +2128,13 @@ static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled) if ((csts & NVME_CSTS_RDY) == bit) break; - msleep(100); + usleep_range(1000, 2000); if (fatal_signal_pending(current)) return -EINTR; if (time_after(jiffies, timeout)) { dev_err(ctrl->device, - "Device not ready; aborting %s\n", enabled ? - "initialisation" : "reset"); + "Device not ready; aborting %s, CSTS=0x%x\n", + enabled ? "initialisation" : "reset", csts); return -ENODEV; } } @@ -2591,8 +2645,7 @@ static bool nvme_validate_cntlid(struct nvme_subsystem *subsys, lockdep_assert_held(&nvme_subsystems_lock); list_for_each_entry(tmp, &subsys->ctrls, subsys_entry) { - if (tmp->state == NVME_CTRL_DELETING || - tmp->state == NVME_CTRL_DEAD) + if (nvme_state_terminal(tmp)) continue; if (tmp->cntlid == ctrl->cntlid) { @@ -3193,6 +3246,10 @@ static ssize_t nvme_sysfs_delete(struct device *dev, { struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + /* Can't delete non-created controllers */ + if (!ctrl->created) + return -EBUSY; + if (device_remove_file_self(dev, attr)) nvme_delete_ctrl_sync(ctrl); return count; @@ -3242,6 +3299,26 @@ static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev, } static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL); +static ssize_t nvme_sysfs_show_hostnqn(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->opts->host->nqn); +} +static DEVICE_ATTR(hostnqn, S_IRUGO, nvme_sysfs_show_hostnqn, NULL); + +static ssize_t nvme_sysfs_show_hostid(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + return snprintf(buf, PAGE_SIZE, "%pU\n", &ctrl->opts->host->id); +} +static DEVICE_ATTR(hostid, S_IRUGO, nvme_sysfs_show_hostid, NULL); + static ssize_t nvme_sysfs_show_address(struct device *dev, struct device_attribute *attr, char *buf) @@ -3267,6 +3344,8 @@ static struct attribute *nvme_dev_attrs[] = { &dev_attr_numa_node.attr, &dev_attr_queue_count.attr, &dev_attr_sqsize.attr, + &dev_attr_hostnqn.attr, + &dev_attr_hostid.attr, NULL }; @@ -3280,6 +3359,10 @@ static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj, return 0; if (a == &dev_attr_address.attr && !ctrl->ops->get_address) return 0; + if (a == &dev_attr_hostnqn.attr && !ctrl->opts) + return 0; + if (a == &dev_attr_hostid.attr && !ctrl->opts) + return 0; return a->mode; } @@ -3294,7 +3377,7 @@ static const struct attribute_group *nvme_dev_attr_groups[] = { NULL, }; -static struct nvme_ns_head *__nvme_find_ns_head(struct nvme_subsystem *subsys, +static struct nvme_ns_head *nvme_find_ns_head(struct nvme_subsystem *subsys, unsigned nsid) { struct nvme_ns_head *h; @@ -3327,7 +3410,8 @@ static int __nvme_check_ids(struct nvme_subsystem *subsys, } static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, - unsigned nsid, struct nvme_id_ns *id) + unsigned nsid, struct nvme_id_ns *id, + struct nvme_ns_ids *ids) { struct nvme_ns_head *head; size_t size = sizeof(*head); @@ -3350,12 +3434,9 @@ static struct nvme_ns_head 
*nvme_alloc_ns_head(struct nvme_ctrl *ctrl, goto out_ida_remove; head->subsys = ctrl->subsys; head->ns_id = nsid; + head->ids = *ids; kref_init(&head->ref); - ret = nvme_report_ns_ids(ctrl, nsid, id, &head->ids); - if (ret) - goto out_cleanup_srcu; - ret = __nvme_check_ids(ctrl->subsys, head); if (ret) { dev_err(ctrl->device, @@ -3390,24 +3471,23 @@ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, struct nvme_ctrl *ctrl = ns->ctrl; bool is_shared = id->nmic & (1 << 0); struct nvme_ns_head *head = NULL; + struct nvme_ns_ids ids; int ret = 0; + ret = nvme_report_ns_ids(ctrl, nsid, id, &ids); + if (ret) + goto out; + mutex_lock(&ctrl->subsys->lock); if (is_shared) - head = __nvme_find_ns_head(ctrl->subsys, nsid); + head = nvme_find_ns_head(ctrl->subsys, nsid); if (!head) { - head = nvme_alloc_ns_head(ctrl, nsid, id); + head = nvme_alloc_ns_head(ctrl, nsid, id, &ids); if (IS_ERR(head)) { ret = PTR_ERR(head); goto out_unlock; } } else { - struct nvme_ns_ids ids; - - ret = nvme_report_ns_ids(ctrl, nsid, id, &ids); - if (ret) - goto out_unlock; - if (!nvme_ns_ids_equal(&head->ids, &ids)) { dev_err(ctrl->device, "IDs don't match for shared namespace %d\n", @@ -3422,6 +3502,7 @@ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, out_unlock: mutex_unlock(&ctrl->subsys->lock); +out: if (ret > 0) ret = blk_status_to_errno(nvme_error_status(ret)); return ret; @@ -3480,7 +3561,7 @@ static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns) return 0; } -static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) +static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) { struct nvme_ns *ns; struct gendisk *disk; @@ -3490,13 +3571,11 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); if (!ns) - return -ENOMEM; + return; ns->queue = blk_mq_init_queue(ctrl->tagset); - if (IS_ERR(ns->queue)) { - ret = PTR_ERR(ns->queue); + if (IS_ERR(ns->queue)) goto out_free_ns; - } if (ctrl->opts && ctrl->opts->data_digest) ns->queue->backing_dev_info->capabilities @@ -3519,10 +3598,8 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) if (ret) goto out_free_queue; - if (id->ncap == 0) { - ret = -EINVAL; + if (id->ncap == 0) /* no namespace (legacy quirk) */ goto out_free_id; - } ret = nvme_init_ns_head(ns, nsid, id); if (ret) @@ -3531,10 +3608,8 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) nvme_set_disk_name(disk_name, ns, ctrl, &flags); disk = alloc_disk_node(0, node); - if (!disk) { - ret = -ENOMEM; + if (!disk) goto out_unlink_ns; - } disk->fops = &nvme_fops; disk->private_data = ns; @@ -3565,8 +3640,10 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) nvme_fault_inject_init(&ns->fault_inject, ns->disk->disk_name); kfree(id); - return 0; + return; out_put_disk: + /* prevent double queue cleanup */ + ns->disk->queue = NULL; put_disk(ns->disk); out_unlink_ns: mutex_lock(&ctrl->subsys->lock); @@ -3579,9 +3656,6 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) blk_cleanup_queue(ns->queue); out_free_ns: kfree(ns); - if (ret > 0) - ret = blk_status_to_errno(nvme_error_status(ret)); - return ret; } static void nvme_ns_remove(struct nvme_ns *ns) @@ -3987,6 +4061,7 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl) nvme_queue_scan(ctrl); nvme_start_queues(ctrl); } + ctrl->created = true; } EXPORT_SYMBOL_GPL(nvme_start_ctrl); @@ -3995,6 +4070,7 @@ void nvme_uninit_ctrl(struct nvme_ctrl *ctrl) nvme_fault_inject_fini(&ctrl->fault_inject); 
dev_pm_qos_hide_latency_tolerance(ctrl->device); cdev_device_del(&ctrl->cdev, ctrl->device); + nvme_put_ctrl(ctrl); } EXPORT_SYMBOL_GPL(nvme_uninit_ctrl); @@ -4077,6 +4153,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, if (ret) goto out_release_instance; + nvme_get_ctrl(ctrl); cdev_init(&ctrl->cdev, &nvme_dev_fops); ctrl->cdev.owner = ops->module; ret = cdev_device_add(&ctrl->cdev, ctrl->device); @@ -4095,6 +4172,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, return 0; out_free_name: + nvme_put_ctrl(ctrl); kfree_const(ctrl->device->kobj.name); out_release_instance: ida_simple_remove(&nvme_instance_ida, ctrl->instance); @@ -4299,6 +4377,7 @@ static void __exit nvme_core_exit(void) destroy_workqueue(nvme_delete_wq); destroy_workqueue(nvme_reset_wq); destroy_workqueue(nvme_wq); + ida_destroy(&nvme_instance_ida); } MODULE_LICENSE("GPL"); diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 74b8818ac9a1..2a6c8190eeb7 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -105,14 +105,14 @@ int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size) int len = 0; if (ctrl->opts->mask & NVMF_OPT_TRADDR) - len += snprintf(buf, size, "traddr=%s", ctrl->opts->traddr); + len += scnprintf(buf, size, "traddr=%s", ctrl->opts->traddr); if (ctrl->opts->mask & NVMF_OPT_TRSVCID) - len += snprintf(buf + len, size - len, "%strsvcid=%s", + len += scnprintf(buf + len, size - len, "%strsvcid=%s", (len) ? "," : "", ctrl->opts->trsvcid); if (ctrl->opts->mask & NVMF_OPT_HOST_TRADDR) - len += snprintf(buf + len, size - len, "%shost_traddr=%s", + len += scnprintf(buf + len, size - len, "%shost_traddr=%s", (len) ? "," : "", ctrl->opts->host_traddr); - len += snprintf(buf + len, size - len, "\n"); + len += scnprintf(buf + len, size - len, "\n"); return len; } diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 5a70ac395d53..7dfc4a2ecf1e 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -342,8 +342,7 @@ nvme_fc_register_localport(struct nvme_fc_port_info *pinfo, !template->ls_req || !template->fcp_io || !template->ls_abort || !template->fcp_abort || !template->max_hw_queues || !template->max_sgl_segments || - !template->max_dif_sgl_segments || !template->dma_boundary || - !template->module) { + !template->max_dif_sgl_segments || !template->dma_boundary) { ret = -EINVAL; goto out_reghost_failed; } @@ -2016,7 +2015,6 @@ nvme_fc_ctrl_free(struct kref *ref) { struct nvme_fc_ctrl *ctrl = container_of(ref, struct nvme_fc_ctrl, ref); - struct nvme_fc_lport *lport = ctrl->lport; unsigned long flags; if (ctrl->ctrl.tagset) { @@ -2043,7 +2041,6 @@ nvme_fc_ctrl_free(struct kref *ref) if (ctrl->ctrl.opts) nvmf_free_options(ctrl->ctrl.opts); kfree(ctrl); - module_put(lport->ops->module); } static void @@ -3074,15 +3071,10 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, goto out_fail; } - if (!try_module_get(lport->ops->module)) { - ret = -EUNATCH; - goto out_free_ctrl; - } - idx = ida_simple_get(&nvme_fc_ctrl_cnt, 0, 0, GFP_KERNEL); if (idx < 0) { ret = -ENOSPC; - goto out_mod_put; + goto out_free_ctrl; } ctrl->ctrl.opts = opts; @@ -3181,10 +3173,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, goto fail_ctrl; } - nvme_get_ctrl(&ctrl->ctrl); - if (!queue_delayed_work(nvme_wq, &ctrl->connect_work, 0)) { - nvme_put_ctrl(&ctrl->ctrl); dev_err(ctrl->ctrl.device, "NVME-FC{%d}: failed to schedule initial connect\n", ctrl->cnum); @@ -3235,8 +3224,6 @@ 
out_free_queues: out_free_ida: put_device(ctrl->dev); ida_simple_remove(&nvme_fc_ctrl_cnt, ctrl->cnum); -out_mod_put: - module_put(lport->ops->module); out_free_ctrl: kfree(ctrl); out_fail: diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index a11900cf3a36..54603bd3e02d 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -64,17 +64,12 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, } } -void nvme_failover_req(struct request *req) +bool nvme_failover_req(struct request *req) { struct nvme_ns *ns = req->q->queuedata; u16 status = nvme_req(req)->status; unsigned long flags; - spin_lock_irqsave(&ns->head->requeue_lock, flags); - blk_steal_bios(&ns->head->requeue_list, req); - spin_unlock_irqrestore(&ns->head->requeue_lock, flags); - blk_mq_end_request(req, 0); - switch (status & 0x7ff) { case NVME_SC_ANA_TRANSITION: case NVME_SC_ANA_INACCESSIBLE: @@ -103,15 +98,17 @@ void nvme_failover_req(struct request *req) nvme_mpath_clear_current_path(ns); break; default: - /* - * Reset the controller for any non-ANA error as we don't know - * what caused the error. - */ - nvme_reset_ctrl(ns->ctrl); - break; + /* This was a non-ANA error so follow the normal error path. */ + return false; } + spin_lock_irqsave(&ns->head->requeue_lock, flags); + blk_steal_bios(&ns->head->requeue_list, req); + spin_unlock_irqrestore(&ns->head->requeue_lock, flags); + blk_mq_end_request(req, 0); + kblockd_schedule_work(&ns->head->requeue_work); + return true; } void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) @@ -377,11 +374,10 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath) return 0; - q = blk_alloc_queue_node(GFP_KERNEL, ctrl->numa_node); + q = blk_alloc_queue(nvme_ns_head_make_request, ctrl->numa_node); if (!q) goto out; q->queuedata = head; - blk_queue_make_request(q, nvme_ns_head_make_request); blk_queue_flag_set(QUEUE_FLAG_NONROT, q); /* set to a default value for 512 until disk is validated */ blk_queue_logical_block_size(q, 512); @@ -514,7 +510,7 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl, if (!nr_nsids) return 0; - down_write(&ctrl->namespaces_rwsem); + down_read(&ctrl->namespaces_rwsem); list_for_each_entry(ns, &ctrl->namespaces, list) { unsigned nsid = le32_to_cpu(desc->nsids[n]); @@ -525,7 +521,7 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl, if (++n == nr_nsids) break; } - up_write(&ctrl->namespaces_rwsem); + up_read(&ctrl->namespaces_rwsem); return 0; } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 1024fec7914c..2e04a36296d9 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -259,6 +259,7 @@ struct nvme_ctrl { struct nvme_command ka_cmd; struct work_struct fw_act_work; unsigned long events; + bool created; #ifdef CONFIG_NVME_MULTIPATH /* asymmetric namespace access: */ @@ -550,7 +551,7 @@ void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys); void nvme_mpath_start_freeze(struct nvme_subsystem *subsys); void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, struct nvme_ctrl *ctrl, int *flags); -void nvme_failover_req(struct request *req); +bool nvme_failover_req(struct request *req); void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id); @@ -599,8 +600,9 @@ static inline void nvme_set_disk_name(char *disk_name, struct nvme_ns 
*ns, sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance); } -static inline void nvme_failover_req(struct request *req) +static inline bool nvme_failover_req(struct request *req) { + return false; } static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) { diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index d3f23d6254e4..cc46e250fcac 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -971,39 +971,34 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) nvme_end_request(req, cqe->status, cqe->result); } -static void nvme_complete_cqes(struct nvme_queue *nvmeq, u16 start, u16 end) -{ - while (start != end) { - nvme_handle_cqe(nvmeq, start); - if (++start == nvmeq->q_depth) - start = 0; - } -} - static inline void nvme_update_cq_head(struct nvme_queue *nvmeq) { - if (nvmeq->cq_head == nvmeq->q_depth - 1) { + u16 tmp = nvmeq->cq_head + 1; + + if (tmp == nvmeq->q_depth) { nvmeq->cq_head = 0; - nvmeq->cq_phase = !nvmeq->cq_phase; + nvmeq->cq_phase ^= 1; } else { - nvmeq->cq_head++; + nvmeq->cq_head = tmp; } } -static inline int nvme_process_cq(struct nvme_queue *nvmeq, u16 *start, - u16 *end, unsigned int tag) +static inline int nvme_process_cq(struct nvme_queue *nvmeq) { int found = 0; - *start = nvmeq->cq_head; while (nvme_cqe_pending(nvmeq)) { - if (tag == -1U || nvmeq->cqes[nvmeq->cq_head].command_id == tag) - found++; + found++; + /* + * load-load control dependency between phase and the rest of + * the cqe requires a full read memory barrier + */ + dma_rmb(); + nvme_handle_cqe(nvmeq, nvmeq->cq_head); nvme_update_cq_head(nvmeq); } - *end = nvmeq->cq_head; - if (*start != *end) + if (found) nvme_ring_cq_doorbell(nvmeq); return found; } @@ -1012,21 +1007,16 @@ static irqreturn_t nvme_irq(int irq, void *data) { struct nvme_queue *nvmeq = data; irqreturn_t ret = IRQ_NONE; - u16 start, end; /* * The rmb/wmb pair ensures we see all updates from a previous run of * the irq handler, even if that was on another CPU. */ rmb(); - nvme_process_cq(nvmeq, &start, &end, -1); + if (nvme_process_cq(nvmeq)) + ret = IRQ_HANDLED; wmb(); - if (start != end) { - nvme_complete_cqes(nvmeq, start, end); - return IRQ_HANDLED; - } - return ret; } @@ -1039,46 +1029,30 @@ static irqreturn_t nvme_irq_check(int irq, void *data) } /* - * Poll for completions any queue, including those not dedicated to polling. + * Poll for completions for any interrupt driven queue * Can be called from any context. */ -static int nvme_poll_irqdisable(struct nvme_queue *nvmeq, unsigned int tag) +static void nvme_poll_irqdisable(struct nvme_queue *nvmeq) { struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev); - u16 start, end; - int found; - /* - * For a poll queue we need to protect against the polling thread - * using the CQ lock. For normal interrupt driven threads we have - * to disable the interrupt to avoid racing with it. 
- */ - if (test_bit(NVMEQ_POLLED, &nvmeq->flags)) { - spin_lock(&nvmeq->cq_poll_lock); - found = nvme_process_cq(nvmeq, &start, &end, tag); - spin_unlock(&nvmeq->cq_poll_lock); - } else { - disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); - found = nvme_process_cq(nvmeq, &start, &end, tag); - enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); - } + WARN_ON_ONCE(test_bit(NVMEQ_POLLED, &nvmeq->flags)); - nvme_complete_cqes(nvmeq, start, end); - return found; + disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); + nvme_process_cq(nvmeq); + enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); } static int nvme_poll(struct blk_mq_hw_ctx *hctx) { struct nvme_queue *nvmeq = hctx->driver_data; - u16 start, end; bool found; if (!nvme_cqe_pending(nvmeq)) return 0; spin_lock(&nvmeq->cq_poll_lock); - found = nvme_process_cq(nvmeq, &start, &end, -1); - nvme_complete_cqes(nvmeq, start, end); + found = nvme_process_cq(nvmeq); spin_unlock(&nvmeq->cq_poll_lock); return found; @@ -1255,7 +1229,12 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) /* * Did we miss an interrupt? */ - if (nvme_poll_irqdisable(nvmeq, req->tag)) { + if (test_bit(NVMEQ_POLLED, &nvmeq->flags)) + nvme_poll(req->mq_hctx); + else + nvme_poll_irqdisable(nvmeq); + + if (blk_mq_request_completed(req)) { dev_warn(dev->ctrl.device, "I/O %d QID %d timeout, completion polled\n", req->tag, nvmeq->qid); @@ -1398,23 +1377,23 @@ static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown) else nvme_disable_ctrl(&dev->ctrl); - nvme_poll_irqdisable(nvmeq, -1); + nvme_poll_irqdisable(nvmeq); } /* * Called only on a device that has been disabled and after all other threads - * that can check this device's completion queues have synced. This is the - * last chance for the driver to see a natural completion before - * nvme_cancel_request() terminates all incomplete requests. + * that can check this device's completion queues have synced, except + * nvme_poll(). This is the last chance for the driver to see a natural + * completion before nvme_cancel_request() terminates all incomplete requests. 
*/ static void nvme_reap_pending_cqes(struct nvme_dev *dev) { - u16 start, end; int i; for (i = dev->ctrl.queue_count - 1; i > 0; i--) { - nvme_process_cq(&dev->queues[i], &start, &end, -1); - nvme_complete_cqes(&dev->queues[i], start, end); + spin_lock(&dev->queues[i].cq_poll_lock); + nvme_process_cq(&dev->queues[i]); + spin_unlock(&dev->queues[i].cq_poll_lock); } } @@ -2503,13 +2482,13 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) struct nvme_dev *dev = to_nvme_dev(ctrl); nvme_dbbuf_dma_free(dev); - put_device(dev->dev); nvme_free_tagset(dev); if (dev->ctrl.admin_q) blk_put_queue(dev->ctrl.admin_q); - kfree(dev->queues); free_opal_dev(dev->ctrl.opal_dev); mempool_destroy(dev->iod_mempool); + put_device(dev->dev); + kfree(dev->queues); kfree(dev); } @@ -2689,7 +2668,7 @@ static int nvme_pci_get_address(struct nvme_ctrl *ctrl, char *buf, int size) { struct pci_dev *pdev = to_pci_dev(to_nvme_dev(ctrl)->dev); - return snprintf(buf, size, "%s", dev_name(&pdev->dev)); + return snprintf(buf, size, "%s\n", dev_name(&pdev->dev)); } static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { @@ -2835,7 +2814,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev)); nvme_reset_ctrl(&dev->ctrl); - nvme_get_ctrl(&dev->ctrl); async_schedule(nvme_async_probe, dev); return 0; @@ -2907,10 +2885,9 @@ static void nvme_remove(struct pci_dev *pdev) nvme_free_host_mem(dev); nvme_dev_remove_admin(dev); nvme_free_queues(dev, 0); - nvme_uninit_ctrl(&dev->ctrl); nvme_release_prp_pools(dev); nvme_dev_unmap(dev); - nvme_put_ctrl(&dev->ctrl); + nvme_uninit_ctrl(&dev->ctrl); } #ifdef CONFIG_PM_SLEEP diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 0fe08c4dfd2f..cac8a930396a 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -142,14 +142,6 @@ static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc); static const struct blk_mq_ops nvme_rdma_mq_ops; static const struct blk_mq_ops nvme_rdma_admin_mq_ops; -/* XXX: really should move to a generic header sooner or later.. */ -static inline void put_unaligned_le24(u32 val, u8 *p) -{ - *p++ = val; - *p++ = val >> 8; - *p++ = val >> 16; -} - static inline int nvme_rdma_queue_idx(struct nvme_rdma_queue *queue) { return queue - queue->ctrl->queues; @@ -1024,8 +1016,13 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new) changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); if (!changed) { - /* state change failure is ok if we're in DELETING state */ + /* + * state change failure is ok if we're in DELETING state, + * unless we're during creation of a new controller to + * avoid races with teardown flow. 
+ */ WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING); + WARN_ON_ONCE(new); ret = -EINVAL; goto destroy_io; } @@ -1345,7 +1342,7 @@ static int nvme_rdma_post_send(struct nvme_rdma_queue *queue, int ret; sge->addr = qe->dma; - sge->length = sizeof(struct nvme_command), + sge->length = sizeof(struct nvme_command); sge->lkey = queue->device->pd->local_dma_lkey; wr.next = NULL; @@ -2045,8 +2042,6 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n", ctrl->ctrl.opts->subsysnqn, &ctrl->addr); - nvme_get_ctrl(&ctrl->ctrl); - mutex_lock(&nvme_rdma_ctrl_mutex); list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list); mutex_unlock(&nvme_rdma_ctrl_mutex); diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 49d4373b84eb..c15a92163c1f 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -20,6 +20,16 @@ struct nvme_tcp_queue; +/* Define the socket priority to use for connections were it is desirable + * that the NIC consider performing optimized packet processing or filtering. + * A non-zero value being sufficient to indicate general consideration of any + * possible optimization. Making it a module param allows for alternative + * values that may be unique for some NIC implementations. + */ +static int so_priority; +module_param(so_priority, int, 0644); +MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority"); + enum nvme_tcp_send_state { NVME_TCP_SEND_CMD_PDU = 0, NVME_TCP_SEND_H2C_PDU, @@ -164,16 +174,14 @@ static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req) static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req) { struct request *rq; - unsigned int bytes; if (unlikely(nvme_tcp_async_req(req))) return false; /* async events don't have a request */ rq = blk_mq_rq_from_pdu(req); - bytes = blk_rq_payload_bytes(rq); - return rq_data_dir(rq) == WRITE && bytes && - bytes <= nvme_tcp_inline_data_size(req->queue); + return rq_data_dir(rq) == WRITE && req->data_len && + req->data_len <= nvme_tcp_inline_data_size(req->queue); } static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req) @@ -1017,8 +1025,15 @@ static int nvme_tcp_try_send(struct nvme_tcp_queue *queue) if (req->state == NVME_TCP_SEND_DDGST) ret = nvme_tcp_try_send_ddgst(req); done: - if (ret == -EAGAIN) + if (ret == -EAGAIN) { ret = 0; + } else if (ret < 0) { + dev_err(queue->ctrl->ctrl.device, + "failed to send request %d\n", ret); + if (ret != -EPIPE && ret != -ECONNRESET) + nvme_tcp_fail_request(queue->request); + nvme_tcp_done_send_req(queue); + } return ret; } @@ -1049,25 +1064,16 @@ static void nvme_tcp_io_work(struct work_struct *w) int result; result = nvme_tcp_try_send(queue); - if (result > 0) { + if (result > 0) pending = true; - } else if (unlikely(result < 0)) { - dev_err(queue->ctrl->ctrl.device, - "failed to send request %d\n", result); - - /* - * Fail the request unless peer closed the connection, - * in which case error recovery flow will complete all. 
- */ - if ((result != -EPIPE) && (result != -ECONNRESET)) - nvme_tcp_fail_request(queue->request); - nvme_tcp_done_send_req(queue); - return; - } + else if (unlikely(result < 0)) + break; result = nvme_tcp_try_recv(queue); if (result > 0) pending = true; + else if (unlikely(result < 0)) + return; if (!pending) return; @@ -1248,13 +1254,67 @@ free_icreq: return ret; } +static bool nvme_tcp_admin_queue(struct nvme_tcp_queue *queue) +{ + return nvme_tcp_queue_id(queue) == 0; +} + +static bool nvme_tcp_default_queue(struct nvme_tcp_queue *queue) +{ + struct nvme_tcp_ctrl *ctrl = queue->ctrl; + int qid = nvme_tcp_queue_id(queue); + + return !nvme_tcp_admin_queue(queue) && + qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT]; +} + +static bool nvme_tcp_read_queue(struct nvme_tcp_queue *queue) +{ + struct nvme_tcp_ctrl *ctrl = queue->ctrl; + int qid = nvme_tcp_queue_id(queue); + + return !nvme_tcp_admin_queue(queue) && + !nvme_tcp_default_queue(queue) && + qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] + + ctrl->io_queues[HCTX_TYPE_READ]; +} + +static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue) +{ + struct nvme_tcp_ctrl *ctrl = queue->ctrl; + int qid = nvme_tcp_queue_id(queue); + + return !nvme_tcp_admin_queue(queue) && + !nvme_tcp_default_queue(queue) && + !nvme_tcp_read_queue(queue) && + qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] + + ctrl->io_queues[HCTX_TYPE_READ] + + ctrl->io_queues[HCTX_TYPE_POLL]; +} + +static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue) +{ + struct nvme_tcp_ctrl *ctrl = queue->ctrl; + int qid = nvme_tcp_queue_id(queue); + int n = 0; + + if (nvme_tcp_default_queue(queue)) + n = qid - 1; + else if (nvme_tcp_read_queue(queue)) + n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1; + else if (nvme_tcp_poll_queue(queue)) + n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - + ctrl->io_queues[HCTX_TYPE_READ] - 1; + queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false); +} + static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid, size_t queue_size) { struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); struct nvme_tcp_queue *queue = &ctrl->queues[qid]; struct linger sol = { .l_onoff = 1, .l_linger = 0 }; - int ret, opt, rcv_pdu_size, n; + int ret, opt, rcv_pdu_size; queue->ctrl = ctrl; INIT_LIST_HEAD(&queue->send_list); @@ -1309,6 +1369,17 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, goto err_sock; } + if (so_priority > 0) { + ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_PRIORITY, + (char *)&so_priority, sizeof(so_priority)); + if (ret) { + dev_err(ctrl->ctrl.device, + "failed to set SO_PRIORITY sock opt, ret %d\n", + ret); + goto err_sock; + } + } + /* Set socket type of service */ if (nctrl->opts->tos >= 0) { opt = nctrl->opts->tos; @@ -1322,11 +1393,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, } queue->sock->sk->sk_allocation = GFP_ATOMIC; - if (!qid) - n = 0; - else - n = (qid - 1) % num_online_cpus(); - queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false); + nvme_tcp_set_queue_io_cpu(queue); queue->request = NULL; queue->data_remaining = 0; queue->ddgst_remaining = 0; @@ -1861,8 +1928,13 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new) } if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) { - /* state change failure is ok if we're in DELETING state */ + /* + * state change failure is ok if we're in DELETING state, + * unless we're during creation of a new controller to + * avoid races with teardown flow. 
+ */ WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING); + WARN_ON_ONCE(new); ret = -EINVAL; goto destroy_io; } @@ -2090,7 +2162,9 @@ static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue, c->common.flags |= NVME_CMD_SGL_METABUF; - if (rq_data_dir(rq) == WRITE && req->data_len && + if (!blk_rq_nr_phys_segments(rq)) + nvme_tcp_set_sg_null(c); + else if (rq_data_dir(rq) == WRITE && req->data_len <= nvme_tcp_inline_data_size(queue)) nvme_tcp_set_sg_inline(queue, c, req->data_len); else @@ -2117,7 +2191,8 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns, req->data_sent = 0; req->pdu_len = 0; req->pdu_sent = 0; - req->data_len = blk_rq_payload_bytes(rq); + req->data_len = blk_rq_nr_phys_segments(rq) ? + blk_rq_payload_bytes(rq) : 0; req->curr_bio = rq->bio; if (rq_data_dir(rq) == WRITE && @@ -2224,6 +2299,9 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx) struct nvme_tcp_queue *queue = hctx->driver_data; struct sock *sk = queue->sock->sk; + if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags)) + return 0; + if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue)) sk_busy_loop(sk, true); nvme_tcp_try_recv(queue); @@ -2359,8 +2437,6 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n", ctrl->ctrl.opts->subsysnqn, &ctrl->addr); - nvme_get_ctrl(&ctrl->ctrl); - mutex_lock(&nvme_tcp_ctrl_mutex); list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list); mutex_unlock(&nvme_tcp_ctrl_mutex); diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 72a7e41f3018..9d6f75cfa77c 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -6,6 +6,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/rculist.h> +#include <linux/part_stat.h> #include <generated/utsrelease.h> #include <asm/unaligned.h> @@ -322,12 +323,25 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req) nvmet_req_complete(req, NVME_SC_INVALID_FIELD | NVME_SC_DNR); } +static void nvmet_id_set_model_number(struct nvme_id_ctrl *id, + struct nvmet_subsys *subsys) +{ + const char *model = NVMET_DEFAULT_CTRL_MODEL; + struct nvmet_subsys_model *subsys_model; + + rcu_read_lock(); + subsys_model = rcu_dereference(subsys->model); + if (subsys_model) + model = subsys_model->number; + memcpy_and_pad(id->mn, sizeof(id->mn), model, strlen(model), ' '); + rcu_read_unlock(); +} + static void nvmet_execute_identify_ctrl(struct nvmet_req *req) { struct nvmet_ctrl *ctrl = req->sq->ctrl; struct nvme_id_ctrl *id; u16 status = 0; - const char model[] = "Linux"; id = kzalloc(sizeof(*id), GFP_KERNEL); if (!id) { @@ -342,7 +356,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) memset(id->sn, ' ', sizeof(id->sn)); bin2hex(id->sn, &ctrl->subsys->serial, min(sizeof(ctrl->subsys->serial), sizeof(id->sn) / 2)); - memcpy_and_pad(id->mn, sizeof(id->mn), model, sizeof(model) - 1, ' '); + nvmet_id_set_model_number(id, ctrl->subsys); memcpy_and_pad(id->fr, sizeof(id->fr), UTS_RELEASE, strlen(UTS_RELEASE), ' '); @@ -356,8 +370,12 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) /* we support multiple ports, multiples hosts and ANA: */ id->cmic = (1 << 0) | (1 << 1) | (1 << 3); - /* no limit on data transfer sizes for now */ - id->mdts = 0; + /* Limit MDTS according to transport capability */ + if (ctrl->ops->get_mdts) + id->mdts = ctrl->ops->get_mdts(ctrl); + else + id->mdts = 0; + id->cntlid = cpu_to_le16(ctrl->cntlid); 
id->ver = cpu_to_le32(ctrl->subsys->ver); @@ -720,13 +738,22 @@ static void nvmet_execute_set_features(struct nvmet_req *req) { struct nvmet_subsys *subsys = req->sq->ctrl->subsys; u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10); + u32 cdw11 = le32_to_cpu(req->cmd->common.cdw11); u16 status = 0; + u16 nsqr; + u16 ncqr; if (!nvmet_check_data_len(req, 0)) return; switch (cdw10 & 0xff) { case NVME_FEAT_NUM_QUEUES: + ncqr = (cdw11 >> 16) & 0xffff; + nsqr = cdw11 & 0xffff; + if (ncqr == 0xffff || nsqr == 0xffff) { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } nvmet_set_result(req, (subsys->max_qid - 1) | ((subsys->max_qid - 1) << 16)); break; diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 98613a45bd3b..58cabd7b6fc5 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -395,14 +395,12 @@ static ssize_t nvmet_ns_device_uuid_store(struct config_item *item, struct nvmet_subsys *subsys = ns->subsys; int ret = 0; - mutex_lock(&subsys->lock); if (ns->enabled) { ret = -EBUSY; goto out_unlock; } - if (uuid_parse(page, &ns->uuid)) ret = -EINVAL; @@ -815,10 +813,10 @@ static ssize_t nvmet_subsys_attr_version_show(struct config_item *item, (int)NVME_MAJOR(subsys->ver), (int)NVME_MINOR(subsys->ver), (int)NVME_TERTIARY(subsys->ver)); - else - return snprintf(page, PAGE_SIZE, "%d.%d\n", - (int)NVME_MAJOR(subsys->ver), - (int)NVME_MINOR(subsys->ver)); + + return snprintf(page, PAGE_SIZE, "%d.%d\n", + (int)NVME_MAJOR(subsys->ver), + (int)NVME_MINOR(subsys->ver)); } static ssize_t nvmet_subsys_attr_version_store(struct config_item *item, @@ -828,7 +826,6 @@ static ssize_t nvmet_subsys_attr_version_store(struct config_item *item, int major, minor, tertiary = 0; int ret; - ret = sscanf(page, "%d.%d.%d\n", &major, &minor, &tertiary); if (ret != 2 && ret != 3) return -EINVAL; @@ -852,20 +849,151 @@ static ssize_t nvmet_subsys_attr_serial_show(struct config_item *item, static ssize_t nvmet_subsys_attr_serial_store(struct config_item *item, const char *page, size_t count) { - struct nvmet_subsys *subsys = to_subsys(item); + u64 serial; + + if (sscanf(page, "%llx\n", &serial) != 1) + return -EINVAL; down_write(&nvmet_config_sem); - sscanf(page, "%llx\n", &subsys->serial); + to_subsys(item)->serial = serial; up_write(&nvmet_config_sem); return count; } CONFIGFS_ATTR(nvmet_subsys_, attr_serial); +static ssize_t nvmet_subsys_attr_cntlid_min_show(struct config_item *item, + char *page) +{ + return snprintf(page, PAGE_SIZE, "%u\n", to_subsys(item)->cntlid_min); +} + +static ssize_t nvmet_subsys_attr_cntlid_min_store(struct config_item *item, + const char *page, size_t cnt) +{ + u16 cntlid_min; + + if (sscanf(page, "%hu\n", &cntlid_min) != 1) + return -EINVAL; + + if (cntlid_min == 0) + return -EINVAL; + + down_write(&nvmet_config_sem); + if (cntlid_min >= to_subsys(item)->cntlid_max) + goto out_unlock; + to_subsys(item)->cntlid_min = cntlid_min; + up_write(&nvmet_config_sem); + return cnt; + +out_unlock: + up_write(&nvmet_config_sem); + return -EINVAL; +} +CONFIGFS_ATTR(nvmet_subsys_, attr_cntlid_min); + +static ssize_t nvmet_subsys_attr_cntlid_max_show(struct config_item *item, + char *page) +{ + return snprintf(page, PAGE_SIZE, "%u\n", to_subsys(item)->cntlid_max); +} + +static ssize_t nvmet_subsys_attr_cntlid_max_store(struct config_item *item, + const char *page, size_t cnt) +{ + u16 cntlid_max; + + if (sscanf(page, "%hu\n", &cntlid_max) != 1) + return -EINVAL; + + if (cntlid_max == 0) + return -EINVAL; + + down_write(&nvmet_config_sem); + 
if (cntlid_max <= to_subsys(item)->cntlid_min) + goto out_unlock; + to_subsys(item)->cntlid_max = cntlid_max; + up_write(&nvmet_config_sem); + return cnt; + +out_unlock: + up_write(&nvmet_config_sem); + return -EINVAL; +} +CONFIGFS_ATTR(nvmet_subsys_, attr_cntlid_max); + +static ssize_t nvmet_subsys_attr_model_show(struct config_item *item, + char *page) +{ + struct nvmet_subsys *subsys = to_subsys(item); + struct nvmet_subsys_model *subsys_model; + char *model = NVMET_DEFAULT_CTRL_MODEL; + int ret; + + rcu_read_lock(); + subsys_model = rcu_dereference(subsys->model); + if (subsys_model) + model = subsys_model->number; + ret = snprintf(page, PAGE_SIZE, "%s\n", model); + rcu_read_unlock(); + + return ret; +} + +/* See Section 1.5 of NVMe 1.4 */ +static bool nvmet_is_ascii(const char c) +{ + return c >= 0x20 && c <= 0x7e; +} + +static ssize_t nvmet_subsys_attr_model_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_subsys *subsys = to_subsys(item); + struct nvmet_subsys_model *new_model; + char *new_model_number; + int pos = 0, len; + + len = strcspn(page, "\n"); + if (!len) + return -EINVAL; + + for (pos = 0; pos < len; pos++) { + if (!nvmet_is_ascii(page[pos])) + return -EINVAL; + } + + new_model_number = kstrndup(page, len, GFP_KERNEL); + if (!new_model_number) + return -ENOMEM; + + new_model = kzalloc(sizeof(*new_model) + len + 1, GFP_KERNEL); + if (!new_model) { + kfree(new_model_number); + return -ENOMEM; + } + memcpy(new_model->number, new_model_number, len); + + down_write(&nvmet_config_sem); + mutex_lock(&subsys->lock); + new_model = rcu_replace_pointer(subsys->model, new_model, + mutex_is_locked(&subsys->lock)); + mutex_unlock(&subsys->lock); + up_write(&nvmet_config_sem); + + kfree_rcu(new_model, rcuhead); + + return count; +} +CONFIGFS_ATTR(nvmet_subsys_, attr_model); + static struct configfs_attribute *nvmet_subsys_attrs[] = { &nvmet_subsys_attr_attr_allow_any_host, &nvmet_subsys_attr_attr_version, &nvmet_subsys_attr_attr_serial, + &nvmet_subsys_attr_attr_cntlid_min, + &nvmet_subsys_attr_attr_cntlid_max, + &nvmet_subsys_attr_attr_model, NULL, }; @@ -970,12 +1098,19 @@ static struct configfs_attribute *nvmet_referral_attrs[] = { NULL, }; -static void nvmet_referral_release(struct config_item *item) +static void nvmet_referral_notify(struct config_group *group, + struct config_item *item) { struct nvmet_port *parent = to_nvmet_port(item->ci_parent->ci_parent); struct nvmet_port *port = to_nvmet_port(item); nvmet_referral_disable(parent, port); +} + +static void nvmet_referral_release(struct config_item *item) +{ + struct nvmet_port *port = to_nvmet_port(item); + kfree(port); } @@ -1006,6 +1141,7 @@ static struct config_group *nvmet_referral_make( static struct configfs_group_operations nvmet_referral_group_ops = { .make_group = nvmet_referral_make, + .disconnect_notify = nvmet_referral_notify, }; static const struct config_item_type nvmet_referrals_type = { diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 576de773b4db..b685f99d56a1 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -1289,8 +1289,11 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, if (!ctrl->sqs) goto out_free_cqs; + if (subsys->cntlid_min > subsys->cntlid_max) + goto out_free_cqs; + ret = ida_simple_get(&cntlid_ida, - NVME_CNTLID_MIN, NVME_CNTLID_MAX, + subsys->cntlid_min, subsys->cntlid_max, GFP_KERNEL); if (ret < 0) { status = NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR; @@ -1438,7 +1441,8 @@ struct nvmet_subsys 
*nvmet_subsys_alloc(const char *subsysnqn, kfree(subsys); return ERR_PTR(-ENOMEM); } - + subsys->cntlid_min = NVME_CNTLID_MIN; + subsys->cntlid_max = NVME_CNTLID_MAX; kref_init(&subsys->ref); mutex_init(&subsys->lock); @@ -1457,6 +1461,7 @@ static void nvmet_subsys_free(struct kref *ref) WARN_ON_ONCE(!list_empty(&subsys->namespaces)); kfree(subsys->subsysnqn); + kfree_rcu(subsys->model, rcuhead); kfree(subsys); } diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index a0db6371b43e..a8ceb7721640 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -684,7 +684,7 @@ nvmet_fc_delete_target_queue(struct nvmet_fc_tgt_queue *queue) disconnect = atomic_xchg(&queue->connected, 0); spin_lock_irqsave(&queue->qlock, flags); - /* about outstanding io's */ + /* abort outstanding io's */ for (i = 0; i < queue->sqsize; fod++, i++) { if (fod->active) { spin_lock(&fod->flock); diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index 1c50af6219f3..f69ce66e2d44 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -198,10 +198,13 @@ struct fcloop_lport_priv { }; struct fcloop_rport { - struct nvme_fc_remote_port *remoteport; - struct nvmet_fc_target_port *targetport; - struct fcloop_nport *nport; - struct fcloop_lport *lport; + struct nvme_fc_remote_port *remoteport; + struct nvmet_fc_target_port *targetport; + struct fcloop_nport *nport; + struct fcloop_lport *lport; + spinlock_t lock; + struct list_head ls_list; + struct work_struct ls_work; }; struct fcloop_tport { @@ -224,11 +227,10 @@ struct fcloop_nport { }; struct fcloop_lsreq { - struct fcloop_tport *tport; struct nvmefc_ls_req *lsreq; - struct work_struct work; struct nvmefc_tgt_ls_req tgt_ls_req; int status; + struct list_head ls_list; /* fcloop_rport->ls_list */ }; struct fcloop_rscn { @@ -292,21 +294,32 @@ fcloop_delete_queue(struct nvme_fc_local_port *localport, { } - -/* - * Transmit of LS RSP done (e.g. buffers all set). call back up - * initiator "done" flows. - */ static void -fcloop_tgt_lsrqst_done_work(struct work_struct *work) +fcloop_rport_lsrqst_work(struct work_struct *work) { - struct fcloop_lsreq *tls_req = - container_of(work, struct fcloop_lsreq, work); - struct fcloop_tport *tport = tls_req->tport; - struct nvmefc_ls_req *lsreq = tls_req->lsreq; + struct fcloop_rport *rport = + container_of(work, struct fcloop_rport, ls_work); + struct fcloop_lsreq *tls_req; - if (!tport || tport->remoteport) - lsreq->done(lsreq, tls_req->status); + spin_lock(&rport->lock); + for (;;) { + tls_req = list_first_entry_or_null(&rport->ls_list, + struct fcloop_lsreq, ls_list); + if (!tls_req) + break; + + list_del(&tls_req->ls_list); + spin_unlock(&rport->lock); + + tls_req->lsreq->done(tls_req->lsreq, tls_req->status); + /* + * callee may free memory containing tls_req. + * do not reference lsreq after this. 
+ */ + + spin_lock(&rport->lock); + } + spin_unlock(&rport->lock); } static int @@ -319,17 +332,18 @@ fcloop_ls_req(struct nvme_fc_local_port *localport, int ret = 0; tls_req->lsreq = lsreq; - INIT_WORK(&tls_req->work, fcloop_tgt_lsrqst_done_work); + INIT_LIST_HEAD(&tls_req->ls_list); if (!rport->targetport) { tls_req->status = -ECONNREFUSED; - tls_req->tport = NULL; - schedule_work(&tls_req->work); + spin_lock(&rport->lock); + list_add_tail(&rport->ls_list, &tls_req->ls_list); + spin_unlock(&rport->lock); + schedule_work(&rport->ls_work); return ret; } tls_req->status = 0; - tls_req->tport = rport->targetport->private; ret = nvmet_fc_rcv_ls_req(rport->targetport, &tls_req->tgt_ls_req, lsreq->rqstaddr, lsreq->rqstlen); @@ -337,18 +351,28 @@ fcloop_ls_req(struct nvme_fc_local_port *localport, } static int -fcloop_xmt_ls_rsp(struct nvmet_fc_target_port *tport, +fcloop_xmt_ls_rsp(struct nvmet_fc_target_port *targetport, struct nvmefc_tgt_ls_req *tgt_lsreq) { struct fcloop_lsreq *tls_req = tgt_ls_req_to_lsreq(tgt_lsreq); struct nvmefc_ls_req *lsreq = tls_req->lsreq; + struct fcloop_tport *tport = targetport->private; + struct nvme_fc_remote_port *remoteport = tport->remoteport; + struct fcloop_rport *rport; memcpy(lsreq->rspaddr, tgt_lsreq->rspbuf, ((lsreq->rsplen < tgt_lsreq->rsplen) ? lsreq->rsplen : tgt_lsreq->rsplen)); + tgt_lsreq->done(tgt_lsreq); - schedule_work(&tls_req->work); + if (remoteport) { + rport = remoteport->private; + spin_lock(&rport->lock); + list_add_tail(&rport->ls_list, &tls_req->ls_list); + spin_unlock(&rport->lock); + schedule_work(&rport->ls_work); + } return 0; } @@ -834,6 +858,7 @@ fcloop_remoteport_delete(struct nvme_fc_remote_port *remoteport) { struct fcloop_rport *rport = remoteport->private; + flush_work(&rport->ls_work); fcloop_nport_put(rport->nport); } @@ -850,7 +875,6 @@ fcloop_targetport_delete(struct nvmet_fc_target_port *targetport) #define FCLOOP_DMABOUND_4G 0xFFFFFFFF static struct nvme_fc_port_template fctemplate = { - .module = THIS_MODULE, .localport_delete = fcloop_localport_delete, .remoteport_delete = fcloop_remoteport_delete, .create_queue = fcloop_create_queue, @@ -1136,6 +1160,9 @@ fcloop_create_remote_port(struct device *dev, struct device_attribute *attr, rport->nport = nport; rport->lport = nport->lport; nport->rport = rport; + spin_lock_init(&rport->lock); + INIT_WORK(&rport->ls_work, fcloop_rport_lsrqst_work); + INIT_LIST_HEAD(&rport->ls_list); return count; } diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 4df4ebde208a..0d54e730cbf2 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -485,7 +485,6 @@ out_destroy_admin: out_disable: dev_warn(ctrl->ctrl.device, "Removing after reset failure\n"); nvme_uninit_ctrl(&ctrl->ctrl); - nvme_put_ctrl(&ctrl->ctrl); } static const struct nvme_ctrl_ops nvme_loop_ctrl_ops = { @@ -618,8 +617,6 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev, dev_info(ctrl->ctrl.device, "new ctrl: \"%s\"\n", ctrl->ctrl.opts->subsysnqn); - nvme_get_ctrl(&ctrl->ctrl); - changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); WARN_ON_ONCE(!changed); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index eda28b22a2c8..421dff3ea143 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -23,6 +23,7 @@ #define NVMET_ASYNC_EVENTS 4 #define NVMET_ERROR_LOG_SLOTS 128 #define NVMET_NO_ERROR_LOC ((u16)-1) +#define NVMET_DEFAULT_CTRL_MODEL "Linux" /* * Supported optional AENs: @@ -202,6 +203,11 @@ struct 
nvmet_ctrl { struct nvme_error_slot slots[NVMET_ERROR_LOG_SLOTS]; }; +struct nvmet_subsys_model { + struct rcu_head rcuhead; + char number[]; +}; + struct nvmet_subsys { enum nvme_subsys_type type; @@ -211,6 +217,8 @@ struct nvmet_subsys { struct list_head namespaces; unsigned int nr_namespaces; unsigned int max_nsid; + u16 cntlid_min; + u16 cntlid_max; struct list_head ctrls; @@ -227,6 +235,8 @@ struct nvmet_subsys { struct config_group namespaces_group; struct config_group allowed_hosts_group; + + struct nvmet_subsys_model __rcu *model; }; static inline struct nvmet_subsys *to_subsys(struct config_item *item) @@ -279,6 +289,7 @@ struct nvmet_fabrics_ops { struct nvmet_port *port, char *traddr); u16 (*install_queue)(struct nvmet_sq *nvme_sq); void (*discovery_chg)(struct nvmet_port *port); + u8 (*get_mdts)(const struct nvmet_ctrl *ctrl); }; #define NVMET_MAX_INLINE_BIOVEC 8 diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 37d262a65877..fd47de0e4e4e 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -31,6 +31,9 @@ #define NVMET_RDMA_MAX_INLINE_SGE 4 #define NVMET_RDMA_MAX_INLINE_DATA_SIZE max_t(int, SZ_16K, PAGE_SIZE) +/* Assume mpsmin == device_page_size == 4KB */ +#define NVMET_RDMA_MAX_MDTS 8 + struct nvmet_rdma_cmd { struct ib_sge sge[NVMET_RDMA_MAX_INLINE_SGE + 1]; struct ib_cqe cqe; @@ -75,6 +78,7 @@ enum nvmet_rdma_queue_state { struct nvmet_rdma_queue { struct rdma_cm_id *cm_id; + struct ib_qp *qp; struct nvmet_port *port; struct ib_cq *cq; atomic_t sq_wr_avail; @@ -102,6 +106,13 @@ struct nvmet_rdma_queue { struct list_head queue_list; }; +struct nvmet_rdma_port { + struct nvmet_port *nport; + struct sockaddr_storage addr; + struct rdma_cm_id *cm_id; + struct delayed_work repair_work; +}; + struct nvmet_rdma_device { struct ib_device *device; struct ib_pd *pd; @@ -143,12 +154,6 @@ static int num_pages(int len) return 1 + (((len - 1) & PAGE_MASK) >> PAGE_SHIFT); } -/* XXX: really should move to a generic header sooner or later.. 
*/ -static inline u32 get_unaligned_le24(const u8 *p) -{ - return (u32)p[0] | (u32)p[1] << 8 | (u32)p[2] << 16; -} - static inline bool nvmet_rdma_need_data_in(struct nvmet_rdma_rsp *rsp) { return nvme_is_write(rsp->req.cmd) && @@ -464,7 +469,7 @@ static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev, if (ndev->srq) ret = ib_post_srq_recv(ndev->srq, &cmd->wr, NULL); else - ret = ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, NULL); + ret = ib_post_recv(cmd->queue->qp, &cmd->wr, NULL); if (unlikely(ret)) pr_err("post_recv cmd failed\n"); @@ -503,7 +508,7 @@ static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp) atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail); if (rsp->n_rdma) { - rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp, + rdma_rw_ctx_destroy(&rsp->rw, queue->qp, queue->cm_id->port_num, rsp->req.sg, rsp->req.sg_cnt, nvmet_data_dir(&rsp->req)); } @@ -587,7 +592,7 @@ static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc) WARN_ON(rsp->n_rdma <= 0); atomic_add(rsp->n_rdma, &queue->sq_wr_avail); - rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp, + rdma_rw_ctx_destroy(&rsp->rw, queue->qp, queue->cm_id->port_num, rsp->req.sg, rsp->req.sg_cnt, nvmet_data_dir(&rsp->req)); rsp->n_rdma = 0; @@ -742,7 +747,7 @@ static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp) } if (nvmet_rdma_need_data_in(rsp)) { - if (rdma_rw_ctx_post(&rsp->rw, queue->cm_id->qp, + if (rdma_rw_ctx_post(&rsp->rw, queue->qp, queue->cm_id->port_num, &rsp->read_cqe, NULL)) nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR); } else { @@ -914,7 +919,8 @@ static void nvmet_rdma_free_dev(struct kref *ref) static struct nvmet_rdma_device * nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id) { - struct nvmet_port *port = cm_id->context; + struct nvmet_rdma_port *port = cm_id->context; + struct nvmet_port *nport = port->nport; struct nvmet_rdma_device *ndev; int inline_page_count; int inline_sge_count; @@ -931,17 +937,17 @@ nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id) if (!ndev) goto out_err; - inline_page_count = num_pages(port->inline_data_size); + inline_page_count = num_pages(nport->inline_data_size); inline_sge_count = max(cm_id->device->attrs.max_sge_rd, cm_id->device->attrs.max_recv_sge) - 1; if (inline_page_count > inline_sge_count) { pr_warn("inline_data_size %d cannot be supported by device %s. 
Reducing to %lu.\n", - port->inline_data_size, cm_id->device->name, + nport->inline_data_size, cm_id->device->name, inline_sge_count * PAGE_SIZE); - port->inline_data_size = inline_sge_count * PAGE_SIZE; + nport->inline_data_size = inline_sge_count * PAGE_SIZE; inline_page_count = inline_sge_count; } - ndev->inline_data_size = port->inline_data_size; + ndev->inline_data_size = nport->inline_data_size; ndev->inline_page_count = inline_page_count; ndev->device = cm_id->device; kref_init(&ndev->ref); @@ -975,7 +981,7 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue) { struct ib_qp_init_attr qp_attr; struct nvmet_rdma_device *ndev = queue->dev; - int comp_vector, nr_cqe, ret, i; + int comp_vector, nr_cqe, ret, i, factor; /* * Spread the io queues across completion vectors, @@ -1008,7 +1014,9 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue) qp_attr.qp_type = IB_QPT_RC; /* +1 for drain */ qp_attr.cap.max_send_wr = queue->send_queue_size + 1; - qp_attr.cap.max_rdma_ctxs = queue->send_queue_size; + factor = rdma_rw_mr_factor(ndev->device, queue->cm_id->port_num, + 1 << NVMET_RDMA_MAX_MDTS); + qp_attr.cap.max_rdma_ctxs = queue->send_queue_size * factor; qp_attr.cap.max_send_sge = max(ndev->device->attrs.max_sge_rd, ndev->device->attrs.max_send_sge); @@ -1025,6 +1033,7 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue) pr_err("failed to create_qp ret= %d\n", ret); goto err_destroy_cq; } + queue->qp = queue->cm_id->qp; atomic_set(&queue->sq_wr_avail, qp_attr.cap.max_send_wr); @@ -1053,11 +1062,10 @@ err_destroy_cq: static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue) { - struct ib_qp *qp = queue->cm_id->qp; - - ib_drain_qp(qp); - rdma_destroy_id(queue->cm_id); - ib_destroy_qp(qp); + ib_drain_qp(queue->qp); + if (queue->cm_id) + rdma_destroy_id(queue->cm_id); + ib_destroy_qp(queue->qp); ib_free_cq(queue->cq); } @@ -1267,6 +1275,7 @@ static int nvmet_rdma_cm_accept(struct rdma_cm_id *cm_id, static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id, struct rdma_cm_event *event) { + struct nvmet_rdma_port *port = cm_id->context; struct nvmet_rdma_device *ndev; struct nvmet_rdma_queue *queue; int ret = -EINVAL; @@ -1282,7 +1291,7 @@ static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id, ret = -ENOMEM; goto put_device; } - queue->port = cm_id->context; + queue->port = port->nport; if (queue->host_qid == 0) { /* Let inflight controller teardown complete */ @@ -1291,9 +1300,12 @@ static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id, ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn); if (ret) { - schedule_work(&queue->release_work); - /* Destroying rdma_cm id is not needed here */ - return 0; + /* + * Don't destroy the cm_id in free path, as we implicitly + * destroy the cm_id here with non-zero ret code. + */ + queue->cm_id = NULL; + goto free_queue; } mutex_lock(&nvmet_rdma_queue_mutex); @@ -1302,6 +1314,8 @@ static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id, return 0; +free_queue: + nvmet_rdma_free_queue(queue); put_device: kref_put(&ndev->ref, nvmet_rdma_free_dev); @@ -1407,7 +1421,7 @@ static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id, static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id, struct nvmet_rdma_queue *queue) { - struct nvmet_port *port; + struct nvmet_rdma_port *port; if (queue) { /* @@ -1426,7 +1440,7 @@ static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id, * cm_id destroy. 
use atomic xchg to make sure * we don't compete with remove_port. */ - if (xchg(&port->priv, NULL) != cm_id) + if (xchg(&port->cm_id, NULL) != cm_id) return 0; /* @@ -1457,6 +1471,13 @@ static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id, nvmet_rdma_queue_established(queue); break; case RDMA_CM_EVENT_ADDR_CHANGE: + if (!queue) { + struct nvmet_rdma_port *port = cm_id->context; + + schedule_delayed_work(&port->repair_work, 0); + break; + } + /* FALLTHROUGH */ case RDMA_CM_EVENT_DISCONNECTED: case RDMA_CM_EVENT_TIMEWAIT_EXIT: nvmet_rdma_queue_disconnect(queue); @@ -1499,42 +1520,19 @@ restart: mutex_unlock(&nvmet_rdma_queue_mutex); } -static int nvmet_rdma_add_port(struct nvmet_port *port) +static void nvmet_rdma_disable_port(struct nvmet_rdma_port *port) { - struct rdma_cm_id *cm_id; - struct sockaddr_storage addr = { }; - __kernel_sa_family_t af; - int ret; - - switch (port->disc_addr.adrfam) { - case NVMF_ADDR_FAMILY_IP4: - af = AF_INET; - break; - case NVMF_ADDR_FAMILY_IP6: - af = AF_INET6; - break; - default: - pr_err("address family %d not supported\n", - port->disc_addr.adrfam); - return -EINVAL; - } + struct rdma_cm_id *cm_id = xchg(&port->cm_id, NULL); - if (port->inline_data_size < 0) { - port->inline_data_size = NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE; - } else if (port->inline_data_size > NVMET_RDMA_MAX_INLINE_DATA_SIZE) { - pr_warn("inline_data_size %u is too large, reducing to %u\n", - port->inline_data_size, - NVMET_RDMA_MAX_INLINE_DATA_SIZE); - port->inline_data_size = NVMET_RDMA_MAX_INLINE_DATA_SIZE; - } + if (cm_id) + rdma_destroy_id(cm_id); +} - ret = inet_pton_with_scope(&init_net, af, port->disc_addr.traddr, - port->disc_addr.trsvcid, &addr); - if (ret) { - pr_err("malformed ip/port passed: %s:%s\n", - port->disc_addr.traddr, port->disc_addr.trsvcid); - return ret; - } +static int nvmet_rdma_enable_port(struct nvmet_rdma_port *port) +{ + struct sockaddr *addr = (struct sockaddr *)&port->addr; + struct rdma_cm_id *cm_id; + int ret; cm_id = rdma_create_id(&init_net, nvmet_rdma_cm_handler, port, RDMA_PS_TCP, IB_QPT_RC); @@ -1553,23 +1551,19 @@ static int nvmet_rdma_add_port(struct nvmet_port *port) goto out_destroy_id; } - ret = rdma_bind_addr(cm_id, (struct sockaddr *)&addr); + ret = rdma_bind_addr(cm_id, addr); if (ret) { - pr_err("binding CM ID to %pISpcs failed (%d)\n", - (struct sockaddr *)&addr, ret); + pr_err("binding CM ID to %pISpcs failed (%d)\n", addr, ret); goto out_destroy_id; } ret = rdma_listen(cm_id, 128); if (ret) { - pr_err("listening to %pISpcs failed (%d)\n", - (struct sockaddr *)&addr, ret); + pr_err("listening to %pISpcs failed (%d)\n", addr, ret); goto out_destroy_id; } - pr_info("enabling port %d (%pISpcs)\n", - le16_to_cpu(port->disc_addr.portid), (struct sockaddr *)&addr); - port->priv = cm_id; + port->cm_id = cm_id; return 0; out_destroy_id: @@ -1577,18 +1571,92 @@ out_destroy_id: return ret; } -static void nvmet_rdma_remove_port(struct nvmet_port *port) +static void nvmet_rdma_repair_port_work(struct work_struct *w) { - struct rdma_cm_id *cm_id = xchg(&port->priv, NULL); + struct nvmet_rdma_port *port = container_of(to_delayed_work(w), + struct nvmet_rdma_port, repair_work); + int ret; - if (cm_id) - rdma_destroy_id(cm_id); + nvmet_rdma_disable_port(port); + ret = nvmet_rdma_enable_port(port); + if (ret) + schedule_delayed_work(&port->repair_work, 5 * HZ); +} + +static int nvmet_rdma_add_port(struct nvmet_port *nport) +{ + struct nvmet_rdma_port *port; + __kernel_sa_family_t af; + int ret; + + port = kzalloc(sizeof(*port), GFP_KERNEL); + if 
(!port) + return -ENOMEM; + + nport->priv = port; + port->nport = nport; + INIT_DELAYED_WORK(&port->repair_work, nvmet_rdma_repair_port_work); + + switch (nport->disc_addr.adrfam) { + case NVMF_ADDR_FAMILY_IP4: + af = AF_INET; + break; + case NVMF_ADDR_FAMILY_IP6: + af = AF_INET6; + break; + default: + pr_err("address family %d not supported\n", + nport->disc_addr.adrfam); + ret = -EINVAL; + goto out_free_port; + } + + if (nport->inline_data_size < 0) { + nport->inline_data_size = NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE; + } else if (nport->inline_data_size > NVMET_RDMA_MAX_INLINE_DATA_SIZE) { + pr_warn("inline_data_size %u is too large, reducing to %u\n", + nport->inline_data_size, + NVMET_RDMA_MAX_INLINE_DATA_SIZE); + nport->inline_data_size = NVMET_RDMA_MAX_INLINE_DATA_SIZE; + } + + ret = inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr, + nport->disc_addr.trsvcid, &port->addr); + if (ret) { + pr_err("malformed ip/port passed: %s:%s\n", + nport->disc_addr.traddr, nport->disc_addr.trsvcid); + goto out_free_port; + } + + ret = nvmet_rdma_enable_port(port); + if (ret) + goto out_free_port; + + pr_info("enabling port %d (%pISpcs)\n", + le16_to_cpu(nport->disc_addr.portid), + (struct sockaddr *)&port->addr); + + return 0; + +out_free_port: + kfree(port); + return ret; +} + +static void nvmet_rdma_remove_port(struct nvmet_port *nport) +{ + struct nvmet_rdma_port *port = nport->priv; + + cancel_delayed_work_sync(&port->repair_work); + nvmet_rdma_disable_port(port); + kfree(port); } static void nvmet_rdma_disc_port_addr(struct nvmet_req *req, - struct nvmet_port *port, char *traddr) + struct nvmet_port *nport, char *traddr) { - struct rdma_cm_id *cm_id = port->priv; + struct nvmet_rdma_port *port = nport->priv; + struct rdma_cm_id *cm_id = port->cm_id; if (inet_addr_is_any((struct sockaddr *)&cm_id->route.addr.src_addr)) { struct nvmet_rdma_rsp *rsp = @@ -1598,10 +1666,15 @@ static void nvmet_rdma_disc_port_addr(struct nvmet_req *req, sprintf(traddr, "%pISc", addr); } else { - memcpy(traddr, port->disc_addr.traddr, NVMF_TRADDR_SIZE); + memcpy(traddr, nport->disc_addr.traddr, NVMF_TRADDR_SIZE); } } +static u8 nvmet_rdma_get_mdts(const struct nvmet_ctrl *ctrl) +{ + return NVMET_RDMA_MAX_MDTS; +} + static const struct nvmet_fabrics_ops nvmet_rdma_ops = { .owner = THIS_MODULE, .type = NVMF_TRTYPE_RDMA, @@ -1612,6 +1685,7 @@ static const struct nvmet_fabrics_ops nvmet_rdma_ops = { .queue_response = nvmet_rdma_queue_response, .delete_ctrl = nvmet_rdma_delete_ctrl, .disc_traddr = nvmet_rdma_disc_port_addr, + .get_mdts = nvmet_rdma_get_mdts, }; static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 5bb5342b8d0c..f0da04e960f4 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -19,6 +19,16 @@ #define NVMET_TCP_DEF_INLINE_DATA_SIZE (4 * PAGE_SIZE) +/* Define the socket priority to use for connections were it is desirable + * that the NIC consider performing optimized packet processing or filtering. + * A non-zero value being sufficient to indicate general consideration of any + * possible optimization. Making it a module param allows for alternative + * values that may be unique for some NIC implementations. 
+ */ +static int so_priority; +module_param(so_priority, int, 0644); +MODULE_PARM_DESC(so_priority, "nvmet tcp socket optimize priority"); + #define NVMET_TCP_RECV_BUDGET 8 #define NVMET_TCP_SEND_BUDGET 8 #define NVMET_TCP_IO_WORK_BUDGET 64 @@ -622,7 +632,7 @@ static int nvmet_try_send_r2t(struct nvmet_tcp_cmd *cmd, bool last_in_batch) return 1; } -static int nvmet_try_send_ddgst(struct nvmet_tcp_cmd *cmd) +static int nvmet_try_send_ddgst(struct nvmet_tcp_cmd *cmd, bool last_in_batch) { struct nvmet_tcp_queue *queue = cmd->queue; struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; @@ -632,6 +642,9 @@ static int nvmet_try_send_ddgst(struct nvmet_tcp_cmd *cmd) }; int ret; + if (!last_in_batch && cmd->queue->send_list_len) + msg.msg_flags |= MSG_MORE; + ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len); if (unlikely(ret <= 0)) return ret; @@ -672,7 +685,7 @@ static int nvmet_tcp_try_send_one(struct nvmet_tcp_queue *queue, } if (cmd->state == NVMET_TCP_SEND_DDGST) { - ret = nvmet_try_send_ddgst(cmd); + ret = nvmet_try_send_ddgst(cmd, last_in_batch); if (ret <= 0) goto done_send; } @@ -794,7 +807,7 @@ static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue) icresp->hdr.pdo = 0; icresp->hdr.plen = cpu_to_le32(icresp->hdr.hlen); icresp->pfv = cpu_to_le16(NVME_TCP_PFV_1_0); - icresp->maxdata = cpu_to_le32(0xffff); /* FIXME: support r2t */ + icresp->maxdata = cpu_to_le32(0x400000); /* 16M arbitrary limit */ icresp->cpda = 0; if (queue->hdr_digest) icresp->digest |= NVME_TCP_HDR_DIGEST_ENABLE; @@ -1439,6 +1452,13 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue) if (ret) return ret; + if (so_priority > 0) { + ret = kernel_setsockopt(sock, SOL_SOCKET, SO_PRIORITY, + (char *)&so_priority, sizeof(so_priority)); + if (ret) + return ret; + } + /* Set socket type of service */ if (inet->rcv_tos > 0) { int tos = inet->rcv_tos; @@ -1628,6 +1648,15 @@ static int nvmet_tcp_add_port(struct nvmet_port *nport) goto err_sock; } + if (so_priority > 0) { + ret = kernel_setsockopt(port->sock, SOL_SOCKET, SO_PRIORITY, + (char *)&so_priority, sizeof(so_priority)); + if (ret) { + pr_err("failed to set SO_PRIORITY sock opt %d\n", ret); + goto err_sock; + } + } + ret = kernel_bind(port->sock, (struct sockaddr *)&port->addr, sizeof(port->addr)); if (ret) { |