diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-08-05 10:51:40 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-08-05 10:51:40 -0700 |
commit | e0fc99e21e6e299673f1640105ac0c1d829c2d93 (patch) | |
tree | 4637b171164c3e653e9002b2bc962b32f05b5a07 /drivers/nvme/host | |
parent | 4834ce9d8e074bb7ae197632e0708219b9f389b5 (diff) | |
parent | f59589fc89665102923725e80e12f782d5f74f67 (diff) |
Merge tag 'for-5.9/drivers-20200803' of git://git.kernel.dk/linux-block
Pull block driver updates from Jens Axboe:
- NVMe:
- ZNS support (Aravind, Keith, Matias, Niklas)
- Misc cleanups, optimizations, fixes (Baolin, Chaitanya, David,
Dongli, Max, Sagi)
- null_blk zone capacity support (Aravind)
- MD:
- raid5/6 fixes (ChangSyun)
- Warning fixes (Damien)
- raid5 stripe fixes (Guoqing, Song, Yufen)
- sysfs deadlock fix (Junxiao)
- raid10 deadlock fix (Vitaly)
- struct_size conversions (Gustavo)
- Set of bcache updates/fixes (Coly)
* tag 'for-5.9/drivers-20200803' of git://git.kernel.dk/linux-block: (117 commits)
md/raid5: Allow degraded raid6 to do rmw
md/raid5: Fix Force reconstruct-write io stuck in degraded raid5
raid5: don't duplicate code for different paths in handle_stripe
raid5-cache: hold spinlock instead of mutex in r5c_journal_mode_show
md: print errno in super_written
md/raid5: remove the redundant setting of STRIPE_HANDLE
md: register new md sysfs file 'uuid' read-only
md: fix max sectors calculation for super 1.0
nvme-loop: remove extra variable in create ctrl
nvme-loop: set ctrl state connecting after init
nvme-multipath: do not fall back to __nvme_find_path() for non-optimized paths
nvme-multipath: fix logic for non-optimized paths
nvme-rdma: fix controller reset hang during traffic
nvme-tcp: fix controller reset hang during traffic
nvmet: introduce the passthru Kconfig option
nvmet: introduce the passthru configfs interface
nvmet: Add passthru enable/disable helpers
nvmet: add passthru code to process commands
nvme: export nvme_find_get_ns() and nvme_put_ns()
nvme: introduce nvme_ctrl_get_by_path()
...
Diffstat (limited to 'drivers/nvme/host')
-rw-r--r-- | drivers/nvme/host/Makefile | 1 | ||||
-rw-r--r-- | drivers/nvme/host/core.c | 561 | ||||
-rw-r--r-- | drivers/nvme/host/fabrics.c | 2 | ||||
-rw-r--r-- | drivers/nvme/host/fabrics.h | 3 | ||||
-rw-r--r-- | drivers/nvme/host/fc.c | 6 | ||||
-rw-r--r-- | drivers/nvme/host/hwmon.c | 5 | ||||
-rw-r--r-- | drivers/nvme/host/lightnvm.c | 4 | ||||
-rw-r--r-- | drivers/nvme/host/multipath.c | 37 | ||||
-rw-r--r-- | drivers/nvme/host/nvme.h | 86 | ||||
-rw-r--r-- | drivers/nvme/host/pci.c | 190 | ||||
-rw-r--r-- | drivers/nvme/host/rdma.c | 99 | ||||
-rw-r--r-- | drivers/nvme/host/tcp.c | 100 | ||||
-rw-r--r-- | drivers/nvme/host/zns.c | 256 |
13 files changed, 1057 insertions, 293 deletions
diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile index fc7b26be692d..d7f6a87687b8 100644 --- a/drivers/nvme/host/Makefile +++ b/drivers/nvme/host/Makefile @@ -13,6 +13,7 @@ nvme-core-y := core.o nvme-core-$(CONFIG_TRACING) += trace.o nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o nvme-core-$(CONFIG_NVM) += lightnvm.o +nvme-core-$(CONFIG_BLK_DEV_ZONED) += zns.o nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS) += fault_inject.o nvme-core-$(CONFIG_NVME_HWMON) += hwmon.o diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 6bdcdd984394..767d62985bba 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -89,7 +89,7 @@ static dev_t nvme_chr_devt; static struct class *nvme_class; static struct class *nvme_subsys_class; -static int nvme_revalidate_disk(struct gendisk *disk); +static int _nvme_revalidate_disk(struct gendisk *disk); static void nvme_put_subsystem(struct nvme_subsystem *subsys); static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, unsigned nsid); @@ -100,7 +100,7 @@ static void nvme_set_queue_dying(struct nvme_ns *ns) * Revalidating a dead namespace sets capacity to 0. This will end * buffered writers dirtying pages that can't be synced. */ - if (!ns->disk || test_and_set_bit(NVME_NS_DEAD, &ns->flags)) + if (test_and_set_bit(NVME_NS_DEAD, &ns->flags)) return; blk_set_queue_dying(ns->queue); /* Forcibly unquiesce queues to avoid blocking dispatch */ @@ -287,6 +287,10 @@ void nvme_complete_rq(struct request *req) nvme_retry_req(req); return; } + } else if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && + req_op(req) == REQ_OP_ZONE_APPEND) { + req->__sector = nvme_lba_to_sect(req->q->queuedata, + le64_to_cpu(nvme_req(req)->result.u64)); } nvme_trace_bio_complete(req, status); @@ -362,6 +366,16 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, break; } break; + case NVME_CTRL_DELETING_NOIO: + switch (old_state) { + case NVME_CTRL_DELETING: + case NVME_CTRL_DEAD: + changed = true; + /* FALLTHRU */ + default: + break; + } + break; case NVME_CTRL_DEAD: switch (old_state) { case NVME_CTRL_DELETING: @@ -399,6 +413,7 @@ static bool nvme_state_terminal(struct nvme_ctrl *ctrl) case NVME_CTRL_CONNECTING: return false; case NVME_CTRL_DELETING: + case NVME_CTRL_DELETING_NOIO: case NVME_CTRL_DEAD: return true; default: @@ -450,10 +465,11 @@ static void nvme_free_ns(struct kref *kref) kfree(ns); } -static void nvme_put_ns(struct nvme_ns *ns) +void nvme_put_ns(struct nvme_ns *ns) { kref_put(&ns->kref, nvme_free_ns); } +EXPORT_SYMBOL_NS_GPL(nvme_put_ns, NVME_TARGET_PASSTHRU); static inline void nvme_clear_nvme_request(struct request *req) { @@ -555,7 +571,7 @@ static int nvme_configure_directives(struct nvme_ctrl *ctrl) goto out_disable_stream; } - ctrl->nr_streams = min_t(unsigned, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1); + ctrl->nr_streams = min_t(u16, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1); dev_info(ctrl->device, "Using %u streams\n", ctrl->nr_streams); return 0; @@ -589,6 +605,14 @@ static void nvme_assign_write_stream(struct nvme_ctrl *ctrl, req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9; } +static void nvme_setup_passthrough(struct request *req, + struct nvme_command *cmd) +{ + memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd)); + /* passthru commands should let the driver set the SGL flags */ + cmd->common.flags &= ~NVME_CMD_SGL_ALL; +} + static inline void nvme_setup_flush(struct nvme_ns *ns, struct nvme_command *cmnd) { @@ -673,7 +697,8 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns, } static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, - struct request *req, struct nvme_command *cmnd) + struct request *req, struct nvme_command *cmnd, + enum nvme_opcode op) { struct nvme_ctrl *ctrl = ns->ctrl; u16 control = 0; @@ -687,7 +712,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, if (req->cmd_flags & REQ_RAHEAD) dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; - cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read); + cmnd->rw.opcode = op; cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id); cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req))); cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); @@ -716,6 +741,8 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, case NVME_NS_DPS_PI_TYPE2: control |= NVME_RW_PRINFO_PRCHK_GUARD | NVME_RW_PRINFO_PRCHK_REF; + if (op == nvme_cmd_zone_append) + control |= NVME_RW_APPEND_PIREMAP; cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req)); break; } @@ -751,11 +778,24 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, switch (req_op(req)) { case REQ_OP_DRV_IN: case REQ_OP_DRV_OUT: - memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd)); + nvme_setup_passthrough(req, cmd); break; case REQ_OP_FLUSH: nvme_setup_flush(ns, cmd); break; + case REQ_OP_ZONE_RESET_ALL: + case REQ_OP_ZONE_RESET: + ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET); + break; + case REQ_OP_ZONE_OPEN: + ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN); + break; + case REQ_OP_ZONE_CLOSE: + ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE); + break; + case REQ_OP_ZONE_FINISH: + ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH); + break; case REQ_OP_WRITE_ZEROES: ret = nvme_setup_write_zeroes(ns, req, cmd); break; @@ -763,8 +803,13 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, ret = nvme_setup_discard(ns, req, cmd); break; case REQ_OP_READ: + ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read); + break; case REQ_OP_WRITE: - ret = nvme_setup_rw(ns, req, cmd); + ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write); + break; + case REQ_OP_ZONE_APPEND: + ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append); break; default: WARN_ON_ONCE(1); @@ -884,6 +929,120 @@ out: return ERR_PTR(ret); } +static u32 nvme_known_admin_effects(u8 opcode) +{ + switch (opcode) { + case nvme_admin_format_nvm: + return NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC | + NVME_CMD_EFFECTS_CSE_MASK; + case nvme_admin_sanitize_nvm: + return NVME_CMD_EFFECTS_CSE_MASK; + default: + break; + } + return 0; +} + +u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode) +{ + u32 effects = 0; + + if (ns) { + if (ns->head->effects) + effects = le32_to_cpu(ns->head->effects->iocs[opcode]); + if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC)) + dev_warn(ctrl->device, + "IO command:%02x has unhandled effects:%08x\n", + opcode, effects); + return 0; + } + + if (ctrl->effects) + effects = le32_to_cpu(ctrl->effects->acs[opcode]); + effects |= nvme_known_admin_effects(opcode); + + return effects; +} +EXPORT_SYMBOL_NS_GPL(nvme_command_effects, NVME_TARGET_PASSTHRU); + +static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, + u8 opcode) +{ + u32 effects = nvme_command_effects(ctrl, ns, opcode); + + /* + * For simplicity, IO to all namespaces is quiesced even if the command + * effects say only one namespace is affected. + */ + if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) { + mutex_lock(&ctrl->scan_lock); + mutex_lock(&ctrl->subsys->lock); + nvme_mpath_start_freeze(ctrl->subsys); + nvme_mpath_wait_freeze(ctrl->subsys); + nvme_start_freeze(ctrl); + nvme_wait_freeze(ctrl); + } + return effects; +} + +static void nvme_update_formats(struct nvme_ctrl *ctrl, u32 *effects) +{ + struct nvme_ns *ns; + + down_read(&ctrl->namespaces_rwsem); + list_for_each_entry(ns, &ctrl->namespaces, list) + if (_nvme_revalidate_disk(ns->disk)) + nvme_set_queue_dying(ns); + else if (blk_queue_is_zoned(ns->disk->queue)) { + /* + * IO commands are required to fully revalidate a zoned + * device. Force the command effects to trigger rescan + * work so report zones can run in a context with + * unfrozen IO queues. + */ + *effects |= NVME_CMD_EFFECTS_NCC; + } + up_read(&ctrl->namespaces_rwsem); +} + +static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects) +{ + /* + * Revalidate LBA changes prior to unfreezing. This is necessary to + * prevent memory corruption if a logical block size was changed by + * this command. + */ + if (effects & NVME_CMD_EFFECTS_LBCC) + nvme_update_formats(ctrl, &effects); + if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) { + nvme_unfreeze(ctrl); + nvme_mpath_unfreeze(ctrl->subsys); + mutex_unlock(&ctrl->subsys->lock); + nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL); + mutex_unlock(&ctrl->scan_lock); + } + if (effects & NVME_CMD_EFFECTS_CCC) + nvme_init_identify(ctrl); + if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) { + nvme_queue_scan(ctrl); + flush_work(&ctrl->scan_work); + } +} + +void nvme_execute_passthru_rq(struct request *rq) +{ + struct nvme_command *cmd = nvme_req(rq)->cmd; + struct nvme_ctrl *ctrl = nvme_req(rq)->ctrl; + struct nvme_ns *ns = rq->q->queuedata; + struct gendisk *disk = ns ? ns->disk : NULL; + u32 effects; + + effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode); + blk_execute_rq(rq->q, disk, rq, 0); + nvme_passthru_end(ctrl, effects); +} +EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU); + static int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, void __user *ubuffer, unsigned bufflen, void __user *meta_buffer, unsigned meta_len, @@ -922,7 +1081,7 @@ static int nvme_submit_user_cmd(struct request_queue *q, } } - blk_execute_rq(req->q, disk, req, 0); + nvme_execute_passthru_rq(req); if (nvme_req(req)->flags & NVME_REQ_CANCELLED) ret = -EINTR; else @@ -1056,8 +1215,13 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) return error; } +static bool nvme_multi_css(struct nvme_ctrl *ctrl) +{ + return (ctrl->ctrl_config & NVME_CC_CSS_MASK) == NVME_CC_CSS_CSI; +} + static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids, - struct nvme_ns_id_desc *cur) + struct nvme_ns_id_desc *cur, bool *csi_seen) { const char *warn_str = "ctrl returned bogus length:"; void *data = cur; @@ -1087,6 +1251,15 @@ static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids, } uuid_copy(&ids->uuid, data + sizeof(*cur)); return NVME_NIDT_UUID_LEN; + case NVME_NIDT_CSI: + if (cur->nidl != NVME_NIDT_CSI_LEN) { + dev_warn(ctrl->device, "%s %d for NVME_NIDT_CSI\n", + warn_str, cur->nidl); + return -1; + } + memcpy(&ids->csi, data + sizeof(*cur), NVME_NIDT_CSI_LEN); + *csi_seen = true; + return NVME_NIDT_CSI_LEN; default: /* Skip unknown types */ return cur->nidl; @@ -1097,10 +1270,9 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, struct nvme_ns_ids *ids) { struct nvme_command c = { }; - int status; + bool csi_seen = false; + int status, pos, len; void *data; - int pos; - int len; if (ctrl->quirks & NVME_QUIRK_NO_NS_DESC_LIST) return 0; @@ -1127,12 +1299,19 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, if (cur->nidl == 0) break; - len = nvme_process_ns_desc(ctrl, ids, cur); + len = nvme_process_ns_desc(ctrl, ids, cur, &csi_seen); if (len < 0) - goto free_data; + break; len += sizeof(*cur); } + + if (nvme_multi_css(ctrl) && !csi_seen) { + dev_warn(ctrl->device, "Command set not reported for nsid:%d\n", + nsid); + status = -EINVAL; + } + free_data: kfree(data); return status; @@ -1321,96 +1500,12 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) metadata, meta_len, lower_32_bits(io.slba), NULL, 0); } -static u32 nvme_known_admin_effects(u8 opcode) -{ - switch (opcode) { - case nvme_admin_format_nvm: - return NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC | - NVME_CMD_EFFECTS_CSE_MASK; - case nvme_admin_sanitize_nvm: - return NVME_CMD_EFFECTS_CSE_MASK; - default: - break; - } - return 0; -} - -static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - u8 opcode) -{ - u32 effects = 0; - - if (ns) { - if (ctrl->effects) - effects = le32_to_cpu(ctrl->effects->iocs[opcode]); - if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC)) - dev_warn(ctrl->device, - "IO command:%02x has unhandled effects:%08x\n", - opcode, effects); - return 0; - } - - if (ctrl->effects) - effects = le32_to_cpu(ctrl->effects->acs[opcode]); - effects |= nvme_known_admin_effects(opcode); - - /* - * For simplicity, IO to all namespaces is quiesced even if the command - * effects say only one namespace is affected. - */ - if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) { - mutex_lock(&ctrl->scan_lock); - mutex_lock(&ctrl->subsys->lock); - nvme_mpath_start_freeze(ctrl->subsys); - nvme_mpath_wait_freeze(ctrl->subsys); - nvme_start_freeze(ctrl); - nvme_wait_freeze(ctrl); - } - return effects; -} - -static void nvme_update_formats(struct nvme_ctrl *ctrl) -{ - struct nvme_ns *ns; - - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) - if (ns->disk && nvme_revalidate_disk(ns->disk)) - nvme_set_queue_dying(ns); - up_read(&ctrl->namespaces_rwsem); -} - -static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects) -{ - /* - * Revalidate LBA changes prior to unfreezing. This is necessary to - * prevent memory corruption if a logical block size was changed by - * this command. - */ - if (effects & NVME_CMD_EFFECTS_LBCC) - nvme_update_formats(ctrl); - if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) { - nvme_unfreeze(ctrl); - nvme_mpath_unfreeze(ctrl->subsys); - mutex_unlock(&ctrl->subsys->lock); - nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL); - mutex_unlock(&ctrl->scan_lock); - } - if (effects & NVME_CMD_EFFECTS_CCC) - nvme_init_identify(ctrl); - if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) { - nvme_queue_scan(ctrl); - flush_work(&ctrl->scan_work); - } -} - static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, struct nvme_passthru_cmd __user *ucmd) { struct nvme_passthru_cmd cmd; struct nvme_command c; unsigned timeout = 0; - u32 effects; u64 result; int status; @@ -1437,12 +1532,10 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, if (cmd.timeout_ms) timeout = msecs_to_jiffies(cmd.timeout_ms); - effects = nvme_passthru_start(ctrl, ns, cmd.opcode); status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, nvme_to_user_ptr(cmd.addr), cmd.data_len, nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, 0, &result, timeout); - nvme_passthru_end(ctrl, effects); if (status >= 0) { if (put_user(result, &ucmd->result)) @@ -1458,7 +1551,6 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, struct nvme_passthru_cmd64 cmd; struct nvme_command c; unsigned timeout = 0; - u32 effects; int status; if (!capable(CAP_SYS_ADMIN)) @@ -1484,12 +1576,10 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, if (cmd.timeout_ms) timeout = msecs_to_jiffies(cmd.timeout_ms); - effects = nvme_passthru_start(ctrl, ns, cmd.opcode); status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, nvme_to_user_ptr(cmd.addr), cmd.data_len, nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, 0, &cmd.result, timeout); - nvme_passthru_end(ctrl, effects); if (status >= 0) { if (put_user(cmd.result, &ucmd->result)) @@ -1503,7 +1593,7 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, * Issue ioctl requests on the first available path. Note that unlike normal * block layer requests we will not retry failed request on another controller. */ -static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk, +struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk, struct nvme_ns_head **head, int *srcu_idx) { #ifdef CONFIG_NVME_MULTIPATH @@ -1523,7 +1613,7 @@ static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk, return disk->private_data; } -static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx) +void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx) { if (head) srcu_read_unlock(&head->srcu, idx); @@ -1789,7 +1879,7 @@ static int nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, memcpy(ids->eui64, id->eui64, sizeof(id->eui64)); if (ctrl->vs >= NVME_VS(1, 2, 0)) memcpy(ids->nguid, id->nguid, sizeof(id->nguid)); - if (ctrl->vs >= NVME_VS(1, 3, 0)) + if (ctrl->vs >= NVME_VS(1, 3, 0) || nvme_multi_css(ctrl)) return nvme_identify_ns_descs(ctrl, nsid, ids); return 0; } @@ -1805,7 +1895,8 @@ static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b) { return uuid_equal(&a->uuid, &b->uuid) && memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 && - memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0; + memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0 && + a->csi == b->csi; } static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns, @@ -1915,18 +2006,38 @@ static void nvme_update_disk_info(struct gendisk *disk, static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) { + unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK; struct nvme_ns *ns = disk->private_data; struct nvme_ctrl *ctrl = ns->ctrl; + int ret; u32 iob; /* * If identify namespace failed, use default 512 byte block size so * block layer can use before failing read/write for 0 capacity. */ - ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds; + ns->lba_shift = id->lbaf[lbaf].ds; if (ns->lba_shift == 0) ns->lba_shift = 9; + switch (ns->head->ids.csi) { + case NVME_CSI_NVM: + break; + case NVME_CSI_ZNS: + ret = nvme_update_zone_info(disk, ns, lbaf); + if (ret) { + dev_warn(ctrl->device, + "failed to add zoned namespace:%u ret:%d\n", + ns->head->ns_id, ret); + return ret; + } + break; + default: + dev_warn(ctrl->device, "unknown csi:%u ns:%u\n", + ns->head->ids.csi, ns->head->ns_id); + return -ENODEV; + } + if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) && is_power_of_2(ctrl->max_hw_sectors)) iob = ctrl->max_hw_sectors; @@ -1934,7 +2045,7 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob)); ns->features = 0; - ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms); + ns->ms = le16_to_cpu(id->lbaf[lbaf].ms); /* the PI implementation requires metadata equal t10 pi tuple size */ if (ns->ms == sizeof(struct t10_pi_tuple)) ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK; @@ -1977,7 +2088,7 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) return 0; } -static int nvme_revalidate_disk(struct gendisk *disk) +static int _nvme_revalidate_disk(struct gendisk *disk) { struct nvme_ns *ns = disk->private_data; struct nvme_ctrl *ctrl = ns->ctrl; @@ -2025,6 +2136,28 @@ out: return ret; } +static int nvme_revalidate_disk(struct gendisk *disk) +{ + int ret; + + ret = _nvme_revalidate_disk(disk); + if (ret) + return ret; + +#ifdef CONFIG_BLK_DEV_ZONED + if (blk_queue_is_zoned(disk->queue)) { + struct nvme_ns *ns = disk->private_data; + struct nvme_ctrl *ctrl = ns->ctrl; + + ret = blk_revalidate_disk_zones(disk, NULL); + if (!ret) + blk_queue_max_zone_append_sectors(disk->queue, + ctrl->max_zone_append); + } +#endif + return ret; +} + static char nvme_pr_type(enum pr_type type) { switch (type) { @@ -2155,6 +2288,7 @@ static const struct block_device_operations nvme_fops = { .release = nvme_release, .getgeo = nvme_getgeo, .revalidate_disk= nvme_revalidate_disk, + .report_zones = nvme_report_zones, .pr_ops = &nvme_pr_ops, }; @@ -2181,6 +2315,7 @@ const struct block_device_operations nvme_ns_head_ops = { .ioctl = nvme_ioctl, .compat_ioctl = nvme_compat_ioctl, .getgeo = nvme_getgeo, + .report_zones = nvme_report_zones, .pr_ops = &nvme_pr_ops, }; #endif /* CONFIG_NVME_MULTIPATH */ @@ -2238,12 +2373,7 @@ EXPORT_SYMBOL_GPL(nvme_disable_ctrl); int nvme_enable_ctrl(struct nvme_ctrl *ctrl) { - /* - * Default to a 4K page size, with the intention to update this - * path in the future to accomodate architectures with differing - * kernel and IO page sizes. - */ - unsigned dev_page_min, page_shift = 12; + unsigned dev_page_min; int ret; ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap); @@ -2253,17 +2383,18 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl) } dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12; - if (page_shift < dev_page_min) { + if (NVME_CTRL_PAGE_SHIFT < dev_page_min) { dev_err(ctrl->device, "Minimum device page size %u too large for host (%u)\n", - 1 << dev_page_min, 1 << page_shift); + 1 << dev_page_min, 1 << NVME_CTRL_PAGE_SHIFT); return -ENODEV; } - ctrl->page_size = 1 << page_shift; - - ctrl->ctrl_config = NVME_CC_CSS_NVM; - ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT; + if (NVME_CAP_CSS(ctrl->cap) & NVME_CAP_CSS_CSI) + ctrl->ctrl_config = NVME_CC_CSS_CSI; + else + ctrl->ctrl_config = NVME_CC_CSS_NVM; + ctrl->ctrl_config |= (NVME_CTRL_PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT; ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE; ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; ctrl->ctrl_config |= NVME_CC_ENABLE; @@ -2313,13 +2444,13 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, if (ctrl->max_hw_sectors) { u32 max_segments = - (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1; + (ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1; max_segments = min_not_zero(max_segments, ctrl->max_segments); blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors); blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX)); } - blk_queue_virt_boundary(q, ctrl->page_size - 1); + blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1); blk_queue_dma_alignment(q, 7); if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) vwc = true; @@ -2810,7 +2941,7 @@ out_unlock: return ret; } -int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, +int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi, void *log, size_t size, u64 offset) { struct nvme_command c = { }; @@ -2824,27 +2955,55 @@ int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, c.get_log_page.numdu = cpu_to_le16(dwlen >> 16); c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset)); c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset)); + c.get_log_page.csi = csi; return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size); } -static int nvme_get_effects_log(struct nvme_ctrl *ctrl) +static struct nvme_cel *nvme_find_cel(struct nvme_ctrl *ctrl, u8 csi) +{ + struct nvme_cel *cel, *ret = NULL; + + spin_lock(&ctrl->lock); + list_for_each_entry(cel, &ctrl->cels, entry) { + if (cel->csi == csi) { + ret = cel; + break; + } + } + spin_unlock(&ctrl->lock); + + return ret; +} + +static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi, + struct nvme_effects_log **log) { + struct nvme_cel *cel = nvme_find_cel(ctrl, csi); int ret; - if (!ctrl->effects) - ctrl->effects = kzalloc(sizeof(*ctrl->effects), GFP_KERNEL); + if (cel) + goto out; - if (!ctrl->effects) - return 0; + cel = kzalloc(sizeof(*cel), GFP_KERNEL); + if (!cel) + return -ENOMEM; - ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CMD_EFFECTS, 0, - ctrl->effects, sizeof(*ctrl->effects), 0); + ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CMD_EFFECTS, 0, csi, + &cel->log, sizeof(cel->log), 0); if (ret) { - kfree(ctrl->effects); - ctrl->effects = NULL; + kfree(cel); + return ret; } - return ret; + + cel->csi = csi; + + spin_lock(&ctrl->lock); + list_add_tail(&cel->entry, &ctrl->cels); + spin_unlock(&ctrl->lock); +out: + *log = &cel->log; + return 0; } /* @@ -2865,7 +3024,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) return ret; } page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12; - ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize); + ctrl->sqsize = min_t(u16, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize); if (ctrl->vs >= NVME_VS(1, 1, 0)) ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap); @@ -2877,7 +3036,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) } if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) { - ret = nvme_get_effects_log(ctrl); + ret = nvme_get_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects); if (ret < 0) goto out_free; } @@ -2939,7 +3098,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) if (id->rtd3e) { /* us -> s */ - u32 transition_time = le32_to_cpu(id->rtd3e) / 1000000; + u32 transition_time = le32_to_cpu(id->rtd3e) / USEC_PER_SEC; ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time, shutdown_timeout, 60); @@ -3345,6 +3504,7 @@ static ssize_t nvme_sysfs_show_state(struct device *dev, [NVME_CTRL_RESETTING] = "resetting", [NVME_CTRL_CONNECTING] = "connecting", [NVME_CTRL_DELETING] = "deleting", + [NVME_CTRL_DELETING_NOIO]= "deleting (no IO)", [NVME_CTRL_DEAD] = "dead", }; @@ -3397,6 +3557,66 @@ static ssize_t nvme_sysfs_show_address(struct device *dev, } static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL); +static ssize_t nvme_ctrl_loss_tmo_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + struct nvmf_ctrl_options *opts = ctrl->opts; + + if (ctrl->opts->max_reconnects == -1) + return sprintf(buf, "off\n"); + return sprintf(buf, "%d\n", + opts->max_reconnects * opts->reconnect_delay); +} + +static ssize_t nvme_ctrl_loss_tmo_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + struct nvmf_ctrl_options *opts = ctrl->opts; + int ctrl_loss_tmo, err; + + err = kstrtoint(buf, 10, &ctrl_loss_tmo); + if (err) + return -EINVAL; + + else if (ctrl_loss_tmo < 0) + opts->max_reconnects = -1; + else + opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo, + opts->reconnect_delay); + return count; +} +static DEVICE_ATTR(ctrl_loss_tmo, S_IRUGO | S_IWUSR, + nvme_ctrl_loss_tmo_show, nvme_ctrl_loss_tmo_store); + +static ssize_t nvme_ctrl_reconnect_delay_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + if (ctrl->opts->reconnect_delay == -1) + return sprintf(buf, "off\n"); + return sprintf(buf, "%d\n", ctrl->opts->reconnect_delay); +} + +static ssize_t nvme_ctrl_reconnect_delay_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + unsigned int v; + int err; + + err = kstrtou32(buf, 10, &v); + if (err) + return err; + + ctrl->opts->reconnect_delay = v; + return count; +} +static DEVICE_ATTR(reconnect_delay, S_IRUGO | S_IWUSR, + nvme_ctrl_reconnect_delay_show, nvme_ctrl_reconnect_delay_store); + static struct attribute *nvme_dev_attrs[] = { &dev_attr_reset_controller.attr, &dev_attr_rescan_controller.attr, @@ -3414,6 +3634,8 @@ static struct attribute *nvme_dev_attrs[] = { &dev_attr_sqsize.attr, &dev_attr_hostnqn.attr, &dev_attr_hostid.attr, + &dev_attr_ctrl_loss_tmo.attr, + &dev_attr_reconnect_delay.attr, NULL }; @@ -3510,6 +3732,13 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, goto out_cleanup_srcu; } + if (head->ids.csi) { + ret = nvme_get_effects_log(ctrl, head->ids.csi, &head->effects); + if (ret) + goto out_cleanup_srcu; + } else + head->effects = ctrl->effects; + ret = nvme_mpath_alloc_disk(ctrl, head); if (ret) goto out_cleanup_srcu; @@ -3591,7 +3820,7 @@ static int ns_cmp(void *priv, struct list_head *a, struct list_head *b) return nsa->head->ns_id - nsb->head->ns_id; } -static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) +struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) { struct nvme_ns *ns, *ret = NULL; @@ -3609,6 +3838,7 @@ static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) up_read(&ctrl->namespaces_rwsem); return ret; } +EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU); static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) { @@ -3726,7 +3956,7 @@ static void nvme_ns_remove(struct nvme_ns *ns) nvme_mpath_clear_current_path(ns); synchronize_srcu(&ns->head->srcu); /* wait for concurrent submissions */ - if (ns->disk && ns->disk->flags & GENHD_FL_UP) { + if (ns->disk->flags & GENHD_FL_UP) { del_gendisk(ns->disk); blk_cleanup_queue(ns->queue); if (blk_get_integrity(ns->disk)) @@ -3757,7 +3987,7 @@ static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid) ns = nvme_find_get_ns(ctrl, nsid); if (ns) { - if (ns->disk && revalidate_disk(ns->disk)) + if (revalidate_disk(ns->disk)) nvme_ns_remove(ns); nvme_put_ns(ns); } else @@ -3850,8 +4080,8 @@ static void nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl) * raced with us in reading the log page, which could cause us to miss * updates. */ - error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0, log, - log_size, 0); + error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0, + NVME_CSI_NVM, log, log_size, 0); if (error) dev_warn(ctrl->device, "reading changed ns log failed: %d\n", error); @@ -3912,6 +4142,9 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl) if (ctrl->state == NVME_CTRL_DEAD) nvme_kill_queues(ctrl); + /* this is a no-op when called from the controller reset handler */ + nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO); + down_write(&ctrl->namespaces_rwsem); list_splice_init(&ctrl->namespaces, &ns_list); up_write(&ctrl->namespaces_rwsem); @@ -3995,8 +4228,8 @@ static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl) if (!log) return; - if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, log, - sizeof(*log), 0)) + if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, NVME_CSI_NVM, + log, sizeof(*log), 0)) dev_warn(ctrl->device, "Get FW SLOT INFO log error\n"); kfree(log); } @@ -4106,8 +4339,7 @@ EXPORT_SYMBOL_GPL(nvme_stop_ctrl); void nvme_start_ctrl(struct nvme_ctrl *ctrl) { - if (ctrl->kato) - nvme_start_keep_alive(ctrl); + nvme_start_keep_alive(ctrl); nvme_enable_aen(ctrl); @@ -4133,11 +4365,16 @@ static void nvme_free_ctrl(struct device *dev) struct nvme_ctrl *ctrl = container_of(dev, struct nvme_ctrl, ctrl_device); struct nvme_subsystem *subsys = ctrl->subsys; + struct nvme_cel *cel, *next; if (subsys && ctrl->instance != subsys->instance) ida_simple_remove(&nvme_instance_ida, ctrl->instance); - kfree(ctrl->effects); + list_for_each_entry_safe(cel, next, &ctrl->cels, entry) { + list_del(&cel->entry); + kfree(cel); + } + nvme_mpath_uninit(ctrl); __free_page(ctrl->discard_page); @@ -4168,6 +4405,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, spin_lock_init(&ctrl->lock); mutex_init(&ctrl->scan_lock); INIT_LIST_HEAD(&ctrl->namespaces); + INIT_LIST_HEAD(&ctrl->cels); init_rwsem(&ctrl->namespaces_rwsem); ctrl->dev = dev; ctrl->ops = ops; @@ -4346,6 +4584,29 @@ void nvme_sync_queues(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_sync_queues); +struct nvme_ctrl *nvme_ctrl_get_by_path(const char *path) +{ + struct nvme_ctrl *ctrl; + struct file *f; + + f = filp_open(path, O_RDWR, 0); + if (IS_ERR(f)) + return ERR_CAST(f); + + if (f->f_op != &nvme_dev_fops) { + ctrl = ERR_PTR(-EINVAL); + goto out_close; + } + + ctrl = f->private_data; + nvme_get_ctrl(ctrl); + +out_close: + filp_close(f, NULL); + return ctrl; +} +EXPORT_SYMBOL_NS_GPL(nvme_ctrl_get_by_path, NVME_TARGET_PASSTHRU); + /* * Check we didn't inadvertently grow the command structure sizes: */ @@ -4364,6 +4625,8 @@ static inline void _nvme_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_command) != 64); BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE); BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE); + BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE); + BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE); BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64); BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512); BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64); diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 2a6c8190eeb7..4ec4829d6233 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -547,7 +547,7 @@ static struct nvmf_transport_ops *nvmf_lookup_transport( blk_status_t nvmf_fail_nonready_command(struct nvme_ctrl *ctrl, struct request *rq) { - if (ctrl->state != NVME_CTRL_DELETING && + if (ctrl->state != NVME_CTRL_DELETING_NOIO && ctrl->state != NVME_CTRL_DEAD && !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH)) return BLK_STS_RESOURCE; diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index a0ec40ab62ee..a9c1e3b4585e 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -182,7 +182,8 @@ bool nvmf_ip_options_match(struct nvme_ctrl *ctrl, static inline bool nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq, bool queue_live) { - if (likely(ctrl->state == NVME_CTRL_LIVE)) + if (likely(ctrl->state == NVME_CTRL_LIVE || + ctrl->state == NVME_CTRL_DELETING)) return true; return __nvmf_check_ready(ctrl, rq, queue_live); } diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 6aa30bb5a762..eae43bb444e0 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -826,6 +826,7 @@ nvme_fc_ctrl_connectivity_loss(struct nvme_fc_ctrl *ctrl) break; case NVME_CTRL_DELETING: + case NVME_CTRL_DELETING_NOIO: default: /* no action to take - let it delete */ break; @@ -3001,8 +3002,9 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) if (ret) goto out_disconnect_admin_queue; - ctrl->ctrl.max_hw_sectors = - (ctrl->lport->ops->max_sgl_segments - 1) << (PAGE_SHIFT - 9); + ctrl->ctrl.max_segments = ctrl->lport->ops->max_sgl_segments; + ctrl->ctrl.max_hw_sectors = ctrl->ctrl.max_segments << + (ilog2(SZ_4K) - 9); blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); diff --git a/drivers/nvme/host/hwmon.c b/drivers/nvme/host/hwmon.c index 2e6477ed420f..412a6c97c0d8 100644 --- a/drivers/nvme/host/hwmon.c +++ b/drivers/nvme/host/hwmon.c @@ -62,7 +62,7 @@ static int nvme_hwmon_get_smart_log(struct nvme_hwmon_data *data) int ret; ret = nvme_get_log(data->ctrl, NVME_NSID_ALL, NVME_LOG_SMART, 0, - &data->log, sizeof(data->log), 0); + NVME_CSI_NVM, &data->log, sizeof(data->log), 0); return ret <= 0 ? ret : -EIO; } @@ -241,7 +241,8 @@ void nvme_hwmon_init(struct nvme_ctrl *ctrl) err = nvme_hwmon_get_smart_log(data); if (err) { - dev_warn(dev, "Failed to read smart log (error %d)\n", err); + dev_warn(ctrl->device, + "Failed to read smart log (error %d)\n", err); devm_kfree(dev, data); return; } diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 69608755d415..8e562d0f2c30 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -593,8 +593,8 @@ static int nvme_nvm_get_chk_meta(struct nvm_dev *ndev, dev_meta_off = dev_meta; ret = nvme_get_log(ctrl, ns->head->ns_id, - NVME_NVM_LOG_REPORT_CHUNK, 0, dev_meta, len, - offset); + NVME_NVM_LOG_REPORT_CHUNK, 0, NVME_CSI_NVM, + dev_meta, len, offset); if (ret) { dev_err(ctrl->device, "Get REPORT CHUNK log error\n"); break; diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 5a37a595411e..3ded54d2c9c6 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -167,9 +167,18 @@ void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl) static bool nvme_path_is_disabled(struct nvme_ns *ns) { - return ns->ctrl->state != NVME_CTRL_LIVE || - test_bit(NVME_NS_ANA_PENDING, &ns->flags) || - test_bit(NVME_NS_REMOVING, &ns->flags); + /* + * We don't treat NVME_CTRL_DELETING as a disabled path as I/O should + * still be able to complete assuming that the controller is connected. + * Otherwise it will fail immediately and return to the requeue list. + */ + if (ns->ctrl->state != NVME_CTRL_LIVE && + ns->ctrl->state != NVME_CTRL_DELETING) + return true; + if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) || + test_bit(NVME_NS_REMOVING, &ns->flags)) + return true; + return false; } static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node) @@ -246,6 +255,12 @@ static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head, fallback = ns; } + /* No optimized path found, re-check the current path */ + if (!nvme_path_is_disabled(old) && + old->ana_state == NVME_ANA_OPTIMIZED) { + found = old; + goto out; + } if (!fallback) return NULL; found = fallback; @@ -266,10 +281,13 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head) struct nvme_ns *ns; ns = srcu_dereference(head->current_path[node], &head->srcu); - if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR && ns) - ns = nvme_round_robin_path(head, node, ns); - if (unlikely(!ns || !nvme_path_is_optimized(ns))) - ns = __nvme_find_path(head, node); + if (unlikely(!ns)) + return __nvme_find_path(head, node); + + if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR) + return nvme_round_robin_path(head, node, ns); + if (unlikely(!nvme_path_is_optimized(ns))) + return __nvme_find_path(head, node); return ns; } @@ -527,7 +545,7 @@ static int nvme_read_ana_log(struct nvme_ctrl *ctrl) int error; mutex_lock(&ctrl->ana_lock); - error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, + error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, NVME_CSI_NVM, ctrl->ana_log_buf, ctrl->ana_log_size, 0); if (error) { dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error); @@ -563,6 +581,9 @@ static void nvme_ana_work(struct work_struct *work) { struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work); + if (ctrl->state != NVME_CTRL_LIVE) + return; + nvme_read_ana_log(ctrl); } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 9c5b82af7978..ebb8c3ed3885 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -37,6 +37,14 @@ extern unsigned int admin_timeout; #define NVME_INLINE_METADATA_SG_CNT 1 #endif +/* + * Default to a 4K page size, with the intention to update this + * path in the future to accommodate architectures with differing + * kernel and IO page sizes. + */ +#define NVME_CTRL_PAGE_SHIFT 12 +#define NVME_CTRL_PAGE_SIZE (1 << NVME_CTRL_PAGE_SHIFT) + extern struct workqueue_struct *nvme_wq; extern struct workqueue_struct *nvme_reset_wq; extern struct workqueue_struct *nvme_delete_wq; @@ -180,12 +188,32 @@ static inline u16 nvme_req_qid(struct request *req) */ #define NVME_QUIRK_DELAY_AMOUNT 2300 +/* + * enum nvme_ctrl_state: Controller state + * + * @NVME_CTRL_NEW: New controller just allocated, initial state + * @NVME_CTRL_LIVE: Controller is connected and I/O capable + * @NVME_CTRL_RESETTING: Controller is resetting (or scheduled reset) + * @NVME_CTRL_CONNECTING: Controller is disconnected, now connecting the + * transport + * @NVME_CTRL_DELETING: Controller is deleting (or scheduled deletion) + * @NVME_CTRL_DELETING_NOIO: Controller is deleting and I/O is not + * disabled/failed immediately. This state comes + * after all async event processing took place and + * before ns removal and the controller deletion + * progress + * @NVME_CTRL_DEAD: Controller is non-present/unresponsive during + * shutdown or removal. In this case we forcibly + * kill all inflight I/O as they have no chance to + * complete + */ enum nvme_ctrl_state { NVME_CTRL_NEW, NVME_CTRL_LIVE, NVME_CTRL_RESETTING, NVME_CTRL_CONNECTING, NVME_CTRL_DELETING, + NVME_CTRL_DELETING_NOIO, NVME_CTRL_DEAD, }; @@ -198,6 +226,12 @@ struct nvme_fault_inject { #endif }; +struct nvme_cel { + struct list_head entry; + struct nvme_effects_log log; + u8 csi; +}; + struct nvme_ctrl { bool comp_seen; enum nvme_ctrl_state state; @@ -235,10 +269,12 @@ struct nvme_ctrl { u32 queue_count; u64 cap; - u32 page_size; u32 max_hw_sectors; u32 max_segments; u32 max_integrity_segments; +#ifdef CONFIG_BLK_DEV_ZONED + u32 max_zone_append; +#endif u16 crdt[3]; u16 oncs; u16 oacs; @@ -264,6 +300,7 @@ struct nvme_ctrl { unsigned long quirks; struct nvme_id_power_state psd[32]; struct nvme_effects_log *effects; + struct list_head cels; struct work_struct scan_work; struct work_struct async_event_work; struct delayed_work ka_work; @@ -346,6 +383,7 @@ struct nvme_ns_ids { u8 eui64[8]; u8 nguid[16]; uuid_t uuid; + u8 csi; }; /* @@ -365,6 +403,7 @@ struct nvme_ns_head { struct kref ref; bool shared; int instance; + struct nvme_effects_log *effects; #ifdef CONFIG_NVME_MULTIPATH struct gendisk *disk; struct bio_list requeue_list; @@ -402,6 +441,9 @@ struct nvme_ns { u16 sgs; u32 sws; u8 pi_type; +#ifdef CONFIG_BLK_DEV_ZONED + u64 zsze; +#endif unsigned long features; unsigned long flags; #define NVME_NS_REMOVING 0 @@ -567,8 +609,11 @@ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl); int nvme_try_sched_reset(struct nvme_ctrl *ctrl); int nvme_delete_ctrl(struct nvme_ctrl *ctrl); -int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, +int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi, void *log, size_t size, u64 offset); +struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk, + struct nvme_ns_head **head, int *srcu_idx); +void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx); extern const struct attribute_group *nvme_ns_id_attr_groups[]; extern const struct block_device_operations nvme_ns_head_ops; @@ -704,6 +749,36 @@ static inline void nvme_mpath_update_disk_size(struct gendisk *disk) } #endif /* CONFIG_NVME_MULTIPATH */ +#ifdef CONFIG_BLK_DEV_ZONED +int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns, + unsigned lbaf); + +int nvme_report_zones(struct gendisk *disk, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data); + +blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req, + struct nvme_command *cmnd, + enum nvme_zone_mgmt_action action); +#else +#define nvme_report_zones NULL + +static inline blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, + struct request *req, struct nvme_command *cmnd, + enum nvme_zone_mgmt_action action) +{ + return BLK_STS_NOTSUPP; +} + +static inline int nvme_update_zone_info(struct gendisk *disk, + struct nvme_ns *ns, + unsigned lbaf) +{ + dev_warn(ns->ctrl->device, + "Please enable CONFIG_BLK_DEV_ZONED to support ZNS devices\n"); + return -EPROTONOSUPPORT; +} +#endif + #ifdef CONFIG_NVM int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node); void nvme_nvm_unregister(struct nvme_ns *ns); @@ -735,4 +810,11 @@ void nvme_hwmon_init(struct nvme_ctrl *ctrl); static inline void nvme_hwmon_init(struct nvme_ctrl *ctrl) { } #endif +u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, + u8 opcode); +void nvme_execute_passthru_rq(struct request *rq); +struct nvme_ctrl *nvme_ctrl_get_by_path(const char *path); +struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid); +void nvme_put_ns(struct nvme_ns *ns); + #endif /* _NVME_H */ diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 0c85680984c1..ba725ae47305 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -4,6 +4,7 @@ * Copyright (c) 2011-2014, Intel Corporation. */ +#include <linux/acpi.h> #include <linux/aer.h> #include <linux/async.h> #include <linux/blkdev.h> @@ -61,10 +62,10 @@ MODULE_PARM_DESC(sgl_threshold, static int io_queue_depth_set(const char *val, const struct kernel_param *kp); static const struct kernel_param_ops io_queue_depth_ops = { .set = io_queue_depth_set, - .get = param_get_int, + .get = param_get_uint, }; -static int io_queue_depth = 1024; +static unsigned int io_queue_depth = 1024; module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644); MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2"); @@ -94,6 +95,10 @@ static unsigned int poll_queues; module_param_cb(poll_queues, &io_queue_count_ops, &poll_queues, 0644); MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO."); +static bool noacpi; +module_param(noacpi, bool, 0444); +MODULE_PARM_DESC(noacpi, "disable acpi bios quirks"); + struct nvme_dev; struct nvme_queue; @@ -115,7 +120,7 @@ struct nvme_dev { unsigned max_qid; unsigned io_queues[HCTX_MAX_TYPES]; unsigned int num_vecs; - int q_depth; + u16 q_depth; int io_sqes; u32 db_stride; void __iomem *bar; @@ -151,13 +156,14 @@ struct nvme_dev { static int io_queue_depth_set(const char *val, const struct kernel_param *kp) { - int n = 0, ret; + int ret; + u16 n; - ret = kstrtoint(val, 10, &n); + ret = kstrtou16(val, 10, &n); if (ret != 0 || n < 2) return -EINVAL; - return param_set_int(val, kp); + return param_set_ushort(val, kp); } static inline unsigned int sq_idx(unsigned int qid, u32 stride) @@ -345,10 +351,10 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db, * as it only leads to a small amount of wasted memory for the lifetime of * the I/O. */ -static int nvme_npages(unsigned size, struct nvme_dev *dev) +static int nvme_pci_npages_prp(void) { - unsigned nprps = DIV_ROUND_UP(size + dev->ctrl.page_size, - dev->ctrl.page_size); + unsigned nprps = DIV_ROUND_UP(NVME_MAX_KB_SZ + NVME_CTRL_PAGE_SIZE, + NVME_CTRL_PAGE_SIZE); return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); } @@ -356,22 +362,18 @@ static int nvme_npages(unsigned size, struct nvme_dev *dev) * Calculates the number of pages needed for the SGL segments. For example a 4k * page can accommodate 256 SGL descriptors. */ -static int nvme_pci_npages_sgl(unsigned int num_seg) +static int nvme_pci_npages_sgl(void) { - return DIV_ROUND_UP(num_seg * sizeof(struct nvme_sgl_desc), PAGE_SIZE); + return DIV_ROUND_UP(NVME_MAX_SEGS * sizeof(struct nvme_sgl_desc), + PAGE_SIZE); } -static unsigned int nvme_pci_iod_alloc_size(struct nvme_dev *dev, - unsigned int size, unsigned int nseg, bool use_sgl) +static size_t nvme_pci_iod_alloc_size(void) { - size_t alloc_size; + size_t npages = max(nvme_pci_npages_prp(), nvme_pci_npages_sgl()); - if (use_sgl) - alloc_size = sizeof(__le64 *) * nvme_pci_npages_sgl(nseg); - else - alloc_size = sizeof(__le64 *) * nvme_npages(size, dev); - - return alloc_size + sizeof(struct scatterlist) * nseg; + return sizeof(__le64 *) * npages + + sizeof(struct scatterlist) * NVME_MAX_SEGS; } static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, @@ -500,9 +502,6 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req) int nseg = blk_rq_nr_phys_segments(req); unsigned int avg_seg_size; - if (nseg == 0) - return false; - avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg); if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1)))) @@ -517,7 +516,7 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req) static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - const int last_prp = dev->ctrl.page_size / sizeof(__le64) - 1; + const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1; dma_addr_t dma_addr = iod->first_dma, next_dma_addr; int i; @@ -584,34 +583,33 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, struct scatterlist *sg = iod->sg; int dma_len = sg_dma_len(sg); u64 dma_addr = sg_dma_address(sg); - u32 page_size = dev->ctrl.page_size; - int offset = dma_addr & (page_size - 1); + int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1); __le64 *prp_list; void **list = nvme_pci_iod_list(req); dma_addr_t prp_dma; int nprps, i; - length -= (page_size - offset); + length -= (NVME_CTRL_PAGE_SIZE - offset); if (length <= 0) { iod->first_dma = 0; goto done; } - dma_len -= (page_size - offset); + dma_len -= (NVME_CTRL_PAGE_SIZE - offset); if (dma_len) { - dma_addr += (page_size - offset); + dma_addr += (NVME_CTRL_PAGE_SIZE - offset); } else { sg = sg_next(sg); dma_addr = sg_dma_address(sg); dma_len = sg_dma_len(sg); } - if (length <= page_size) { + if (length <= NVME_CTRL_PAGE_SIZE) { iod->first_dma = dma_addr; goto done; } - nprps = DIV_ROUND_UP(length, page_size); + nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE); if (nprps <= (256 / 8)) { pool = dev->prp_small_pool; iod->npages = 0; @@ -630,7 +628,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, iod->first_dma = prp_dma; i = 0; for (;;) { - if (i == page_size >> 3) { + if (i == NVME_CTRL_PAGE_SIZE >> 3) { __le64 *old_prp_list = prp_list; prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); if (!prp_list) @@ -641,9 +639,9 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, i = 1; } prp_list[i++] = cpu_to_le64(dma_addr); - dma_len -= page_size; - dma_addr += page_size; - length -= page_size; + dma_len -= NVME_CTRL_PAGE_SIZE; + dma_addr += NVME_CTRL_PAGE_SIZE; + length -= NVME_CTRL_PAGE_SIZE; if (length <= 0) break; if (dma_len > 0) @@ -753,8 +751,8 @@ static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev, struct bio_vec *bv) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - unsigned int offset = bv->bv_offset & (dev->ctrl.page_size - 1); - unsigned int first_prp_len = dev->ctrl.page_size - offset; + unsigned int offset = bv->bv_offset & (NVME_CTRL_PAGE_SIZE - 1); + unsigned int first_prp_len = NVME_CTRL_PAGE_SIZE - offset; iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0); if (dma_mapping_error(dev->dev, iod->first_dma)) @@ -764,7 +762,7 @@ static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev, cmnd->dptr.prp1 = cpu_to_le64(iod->first_dma); if (bv->bv_len > first_prp_len) cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma + first_prp_len); - return 0; + return BLK_STS_OK; } static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev, @@ -782,7 +780,7 @@ static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev, cmnd->dptr.sgl.addr = cpu_to_le64(iod->first_dma); cmnd->dptr.sgl.length = cpu_to_le32(iod->dma_len); cmnd->dptr.sgl.type = NVME_SGL_FMT_DATA_DESC << 4; - return 0; + return BLK_STS_OK; } static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, @@ -796,7 +794,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, struct bio_vec bv = req_bvec(req); if (!is_pci_p2pdma_page(bv.bv_page)) { - if (bv.bv_offset + bv.bv_len <= dev->ctrl.page_size * 2) + if (bv.bv_offset + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2) return nvme_setup_prp_simple(dev, req, &cmnd->rw, &bv); @@ -846,7 +844,7 @@ static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req, if (dma_mapping_error(dev->dev, iod->meta_dma)) return BLK_STS_IOERR; cmnd->rw.metadata = cpu_to_le64(iod->meta_dma); - return 0; + return BLK_STS_OK; } /* @@ -1019,6 +1017,7 @@ static irqreturn_t nvme_irq(int irq, void *data) static irqreturn_t nvme_irq_check(int irq, void *data) { struct nvme_queue *nvmeq = data; + if (nvme_cqe_pending(nvmeq)) return IRQ_WAKE_THREAD; return IRQ_NONE; @@ -1154,7 +1153,6 @@ static void abort_endio(struct request *req, blk_status_t error) static bool nvme_should_reset(struct nvme_dev *dev, u32 csts) { - /* If true, indicates loss of adapter communication, possibly by a * NVMe Subsystem reset. */ @@ -1261,9 +1259,9 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) } /* - * Shutdown the controller immediately and schedule a reset if the - * command was already aborted once before and still hasn't been - * returned to the driver, or if this is the admin queue. + * Shutdown the controller immediately and schedule a reset if the + * command was already aborted once before and still hasn't been + * returned to the driver, or if this is the admin queue. */ if (!nvmeq->qid || iod->aborted) { dev_warn(dev->ctrl.device, @@ -1398,11 +1396,12 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues, { int q_depth = dev->q_depth; unsigned q_size_aligned = roundup(q_depth * entry_size, - dev->ctrl.page_size); + NVME_CTRL_PAGE_SIZE); if (q_size_aligned * nr_io_queues > dev->cmb_size) { u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues); - mem_per_q = round_down(mem_per_q, dev->ctrl.page_size); + + mem_per_q = round_down(mem_per_q, NVME_CTRL_PAGE_SIZE); q_depth = div_u64(mem_per_q, entry_size); /* @@ -1817,6 +1816,7 @@ static inline void nvme_release_cmb(struct nvme_dev *dev) static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits) { + u32 host_mem_size = dev->host_mem_size >> NVME_CTRL_PAGE_SHIFT; u64 dma_addr = dev->host_mem_descs_dma; struct nvme_command c; int ret; @@ -1825,8 +1825,7 @@ static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits) c.features.opcode = nvme_admin_set_features; c.features.fid = cpu_to_le32(NVME_FEAT_HOST_MEM_BUF); c.features.dword11 = cpu_to_le32(bits); - c.features.dword12 = cpu_to_le32(dev->host_mem_size >> - ilog2(dev->ctrl.page_size)); + c.features.dword12 = cpu_to_le32(host_mem_size); c.features.dword13 = cpu_to_le32(lower_32_bits(dma_addr)); c.features.dword14 = cpu_to_le32(upper_32_bits(dma_addr)); c.features.dword15 = cpu_to_le32(dev->nr_host_mem_descs); @@ -1846,7 +1845,7 @@ static void nvme_free_host_mem(struct nvme_dev *dev) for (i = 0; i < dev->nr_host_mem_descs; i++) { struct nvme_host_mem_buf_desc *desc = &dev->host_mem_descs[i]; - size_t size = le32_to_cpu(desc->size) * dev->ctrl.page_size; + size_t size = le32_to_cpu(desc->size) * NVME_CTRL_PAGE_SIZE; dma_free_attrs(dev->dev, size, dev->host_mem_desc_bufs[i], le64_to_cpu(desc->addr), @@ -1898,7 +1897,7 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred, break; descs[i].addr = cpu_to_le64(dma_addr); - descs[i].size = cpu_to_le32(len / dev->ctrl.page_size); + descs[i].size = cpu_to_le32(len / NVME_CTRL_PAGE_SIZE); i++; } @@ -1914,7 +1913,7 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred, out_free_bufs: while (--i >= 0) { - size_t size = le32_to_cpu(descs[i].size) * dev->ctrl.page_size; + size_t size = le32_to_cpu(descs[i].size) * NVME_CTRL_PAGE_SIZE; dma_free_attrs(dev->dev, size, bufs[i], le64_to_cpu(descs[i].addr), @@ -1932,12 +1931,12 @@ out: static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred) { - u32 chunk_size; + u64 min_chunk = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES); + u64 hmminds = max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2); + u64 chunk_size; /* start big and work our way down */ - for (chunk_size = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES); - chunk_size >= max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2); - chunk_size /= 2) { + for (chunk_size = min_chunk; chunk_size >= hmminds; chunk_size /= 2) { if (!__nvme_alloc_host_mem(dev, preferred, chunk_size)) { if (!min || dev->host_mem_size >= min) return 0; @@ -2003,7 +2002,7 @@ static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs) unsigned int nr_read_queues, nr_write_queues = dev->nr_write_queues; /* - * If there is no interupt available for queues, ensure that + * If there is no interrupt available for queues, ensure that * the default queue is set to 1. The affinity set size is * also set to one, but the irq core ignores it for this case. * @@ -2261,8 +2260,8 @@ static void nvme_dev_add(struct nvme_dev *dev) dev->tagset.nr_maps++; dev->tagset.timeout = NVME_IO_TIMEOUT; dev->tagset.numa_node = dev->ctrl.numa_node; - dev->tagset.queue_depth = - min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1; + dev->tagset.queue_depth = min_t(unsigned int, dev->q_depth, + BLK_MQ_MAX_DEPTH) - 1; dev->tagset.cmd_size = sizeof(struct nvme_iod); dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; dev->tagset.driver_data = dev; @@ -2321,7 +2320,7 @@ static int nvme_pci_enable(struct nvme_dev *dev) dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP); - dev->q_depth = min_t(int, NVME_CAP_MQES(dev->ctrl.cap) + 1, + dev->q_depth = min_t(u16, NVME_CAP_MQES(dev->ctrl.cap) + 1, io_queue_depth); dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */ dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap); @@ -2760,6 +2759,54 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev) return 0; } +#ifdef CONFIG_ACPI +static bool nvme_acpi_storage_d3(struct pci_dev *dev) +{ + struct acpi_device *adev; + struct pci_dev *root; + acpi_handle handle; + acpi_status status; + u8 val; + + /* + * Look for _DSD property specifying that the storage device on the port + * must use D3 to support deep platform power savings during + * suspend-to-idle. + */ + root = pcie_find_root_port(dev); + if (!root) + return false; + + adev = ACPI_COMPANION(&root->dev); + if (!adev) + return false; + + /* + * The property is defined in the PXSX device for South complex ports + * and in the PEGP device for North complex ports. + */ + status = acpi_get_handle(adev->handle, "PXSX", &handle); + if (ACPI_FAILURE(status)) { + status = acpi_get_handle(adev->handle, "PEGP", &handle); + if (ACPI_FAILURE(status)) + return false; + } + + if (acpi_bus_get_device(handle, &adev)) + return false; + + if (fwnode_property_read_u8(acpi_fwnode_handle(adev), "StorageD3Enable", + &val)) + return false; + return val == 1; +} +#else +static inline bool nvme_acpi_storage_d3(struct pci_dev *dev) +{ + return false; +} +#endif /* CONFIG_ACPI */ + static void nvme_async_probe(void *data, async_cookie_t cookie) { struct nvme_dev *dev = data; @@ -2809,12 +2856,21 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) quirks |= check_vendor_combination_bug(pdev); + if (!noacpi && nvme_acpi_storage_d3(pdev)) { + /* + * Some systems use a bios work around to ask for D3 on + * platforms that support kernel managed suspend. + */ + dev_info(&pdev->dev, + "platform quirk: setting simple suspend\n"); + quirks |= NVME_QUIRK_SIMPLE_SUSPEND; + } + /* * Double check that our mempool alloc size will cover the biggest * command we support. */ - alloc_size = nvme_pci_iod_alloc_size(dev, NVME_MAX_KB_SZ, - NVME_MAX_SEGS, true); + alloc_size = nvme_pci_iod_alloc_size(); WARN_ON_ONCE(alloc_size > PAGE_SIZE); dev->iod_mempool = mempool_create_node(1, mempool_kmalloc, @@ -2876,6 +2932,7 @@ static void nvme_reset_done(struct pci_dev *pdev) static void nvme_shutdown(struct pci_dev *pdev) { struct nvme_dev *dev = pci_get_drvdata(pdev); + nvme_disable_prepare_reset(dev, true); } @@ -3006,6 +3063,7 @@ unfreeze: static int nvme_simple_suspend(struct device *dev) { struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev)); + return nvme_disable_prepare_reset(ndev, true); } @@ -3079,16 +3137,16 @@ static const struct pci_error_handlers nvme_err_handler = { }; static const struct pci_device_id nvme_id_table[] = { - { PCI_VDEVICE(INTEL, 0x0953), + { PCI_VDEVICE(INTEL, 0x0953), /* Intel 750/P3500/P3600/P3700 */ .driver_data = NVME_QUIRK_STRIPE_SIZE | NVME_QUIRK_DEALLOCATE_ZEROES, }, - { PCI_VDEVICE(INTEL, 0x0a53), + { PCI_VDEVICE(INTEL, 0x0a53), /* Intel P3520 */ .driver_data = NVME_QUIRK_STRIPE_SIZE | NVME_QUIRK_DEALLOCATE_ZEROES, }, - { PCI_VDEVICE(INTEL, 0x0a54), + { PCI_VDEVICE(INTEL, 0x0a54), /* Intel P4500/P4600 */ .driver_data = NVME_QUIRK_STRIPE_SIZE | NVME_QUIRK_DEALLOCATE_ZEROES, }, - { PCI_VDEVICE(INTEL, 0x0a55), + { PCI_VDEVICE(INTEL, 0x0a55), /* Dell Express Flash P4600 */ .driver_data = NVME_QUIRK_STRIPE_SIZE | NVME_QUIRK_DEALLOCATE_ZEROES, }, { PCI_VDEVICE(INTEL, 0xf1a5), /* Intel 600P/P3100 */ diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index e881f879ac63..44c76ffbb264 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -96,6 +96,7 @@ struct nvme_rdma_queue { int cm_error; struct completion cm_done; bool pi_support; + int cq_size; }; struct nvme_rdma_ctrl { @@ -275,6 +276,7 @@ static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor) init_attr.recv_cq = queue->ib_cq; if (queue->pi_support) init_attr.create_flags |= IB_QP_CREATE_INTEGRITY_EN; + init_attr.qp_context = queue; ret = rdma_create_qp(queue->cm_id, dev->pd, &init_attr); @@ -409,6 +411,14 @@ out_err: return NULL; } +static void nvme_rdma_free_cq(struct nvme_rdma_queue *queue) +{ + if (nvme_rdma_poll_queue(queue)) + ib_free_cq(queue->ib_cq); + else + ib_cq_pool_put(queue->ib_cq, queue->cq_size); +} + static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue) { struct nvme_rdma_device *dev; @@ -430,7 +440,7 @@ static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue) * the destruction of the QP shouldn't use rdma_cm API. */ ib_destroy_qp(queue->qp); - ib_free_cq(queue->ib_cq); + nvme_rdma_free_cq(queue); nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size, sizeof(struct nvme_completion), DMA_FROM_DEVICE); @@ -450,13 +460,42 @@ static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev, bool pi_support) return min_t(u32, NVME_RDMA_MAX_SEGMENTS, max_page_list_len - 1); } +static int nvme_rdma_create_cq(struct ib_device *ibdev, + struct nvme_rdma_queue *queue) +{ + int ret, comp_vector, idx = nvme_rdma_queue_idx(queue); + enum ib_poll_context poll_ctx; + + /* + * Spread I/O queues completion vectors according their queue index. + * Admin queues can always go on completion vector 0. + */ + comp_vector = (idx == 0 ? idx : idx - 1) % ibdev->num_comp_vectors; + + /* Polling queues need direct cq polling context */ + if (nvme_rdma_poll_queue(queue)) { + poll_ctx = IB_POLL_DIRECT; + queue->ib_cq = ib_alloc_cq(ibdev, queue, queue->cq_size, + comp_vector, poll_ctx); + } else { + poll_ctx = IB_POLL_SOFTIRQ; + queue->ib_cq = ib_cq_pool_get(ibdev, queue->cq_size, + comp_vector, poll_ctx); + } + + if (IS_ERR(queue->ib_cq)) { + ret = PTR_ERR(queue->ib_cq); + return ret; + } + + return 0; +} + static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) { struct ib_device *ibdev; const int send_wr_factor = 3; /* MR, SEND, INV */ const int cq_factor = send_wr_factor + 1; /* + RECV */ - int comp_vector, idx = nvme_rdma_queue_idx(queue); - enum ib_poll_context poll_ctx; int ret, pages_per_mr; queue->device = nvme_rdma_find_get_device(queue->cm_id); @@ -467,26 +506,12 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) } ibdev = queue->device->dev; - /* - * Spread I/O queues completion vectors according their queue index. - * Admin queues can always go on completion vector 0. - */ - comp_vector = (idx == 0 ? idx : idx - 1) % ibdev->num_comp_vectors; - - /* Polling queues need direct cq polling context */ - if (nvme_rdma_poll_queue(queue)) - poll_ctx = IB_POLL_DIRECT; - else - poll_ctx = IB_POLL_SOFTIRQ; - /* +1 for ib_stop_cq */ - queue->ib_cq = ib_alloc_cq(ibdev, queue, - cq_factor * queue->queue_size + 1, - comp_vector, poll_ctx); - if (IS_ERR(queue->ib_cq)) { - ret = PTR_ERR(queue->ib_cq); + queue->cq_size = cq_factor * queue->queue_size + 1; + + ret = nvme_rdma_create_cq(ibdev, queue); + if (ret) goto out_put_dev; - } ret = nvme_rdma_create_qp(queue, send_wr_factor); if (ret) @@ -512,7 +537,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) if (ret) { dev_err(queue->ctrl->ctrl.device, "failed to initialize MR pool sized %d for QID %d\n", - queue->queue_size, idx); + queue->queue_size, nvme_rdma_queue_idx(queue)); goto out_destroy_ring; } @@ -523,7 +548,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) if (ret) { dev_err(queue->ctrl->ctrl.device, "failed to initialize PI MR pool sized %d for QID %d\n", - queue->queue_size, idx); + queue->queue_size, nvme_rdma_queue_idx(queue)); goto out_destroy_mr_pool; } } @@ -540,7 +565,7 @@ out_destroy_ring: out_destroy_qp: rdma_destroy_qp(queue->cm_id); out_destroy_ib_cq: - ib_free_cq(queue->ib_cq); + nvme_rdma_free_cq(queue); out_put_dev: nvme_rdma_dev_put(queue->device); return ret; @@ -942,15 +967,20 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) ret = PTR_ERR(ctrl->ctrl.connect_q); goto out_free_tag_set; } - } else { - blk_mq_update_nr_hw_queues(&ctrl->tag_set, - ctrl->ctrl.queue_count - 1); } ret = nvme_rdma_start_io_queues(ctrl); if (ret) goto out_cleanup_connect_q; + if (!new) { + nvme_start_queues(&ctrl->ctrl); + nvme_wait_freeze(&ctrl->ctrl); + blk_mq_update_nr_hw_queues(ctrl->ctrl.tagset, + ctrl->ctrl.queue_count - 1); + nvme_unfreeze(&ctrl->ctrl); + } + return 0; out_cleanup_connect_q: @@ -983,6 +1013,7 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl, bool remove) { if (ctrl->ctrl.queue_count > 1) { + nvme_start_freeze(&ctrl->ctrl); nvme_stop_queues(&ctrl->ctrl); nvme_rdma_stop_io_queues(ctrl); if (ctrl->ctrl.tagset) { @@ -1077,11 +1108,12 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new) changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); if (!changed) { /* - * state change failure is ok if we're in DELETING state, + * state change failure is ok if we started ctrl delete, * unless we're during creation of a new controller to * avoid races with teardown flow. */ - WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING); + WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING && + ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO); WARN_ON_ONCE(new); ret = -EINVAL; goto destroy_io; @@ -1134,8 +1166,9 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { - /* state change failure is ok if we're in DELETING state */ - WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING); + /* state change failure is ok if we started ctrl delete */ + WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING && + ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO); return; } @@ -1163,7 +1196,7 @@ static void nvme_rdma_end_request(struct nvme_rdma_request *req) static void nvme_rdma_wr_error(struct ib_cq *cq, struct ib_wc *wc, const char *op) { - struct nvme_rdma_queue *queue = cq->cq_context; + struct nvme_rdma_queue *queue = wc->qp->qp_context; struct nvme_rdma_ctrl *ctrl = queue->ctrl; if (ctrl->ctrl.state == NVME_CTRL_LIVE) @@ -1706,7 +1739,7 @@ static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc) { struct nvme_rdma_qe *qe = container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe); - struct nvme_rdma_queue *queue = cq->cq_context; + struct nvme_rdma_queue *queue = wc->qp->qp_context; struct ib_device *ibdev = queue->device->dev; struct nvme_completion *cqe = qe->data; const size_t len = sizeof(struct nvme_completion); diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 472f9001521d..62fbaecdc960 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -46,6 +46,7 @@ struct nvme_tcp_request { u32 pdu_sent; u16 ttag; struct list_head entry; + struct llist_node lentry; __le32 ddgst; struct bio *curr_bio; @@ -75,9 +76,10 @@ struct nvme_tcp_queue { struct work_struct io_work; int io_cpu; - spinlock_t lock; struct mutex send_mutex; + struct llist_head req_list; struct list_head send_list; + bool more_requests; /* recv state */ void *pdu; @@ -261,15 +263,13 @@ static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req, } static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req, - bool sync) + bool sync, bool last) { struct nvme_tcp_queue *queue = req->queue; bool empty; - spin_lock(&queue->lock); - empty = list_empty(&queue->send_list) && !queue->request; - list_add_tail(&req->entry, &queue->send_list); - spin_unlock(&queue->lock); + empty = llist_add(&req->lentry, &queue->req_list) && + list_empty(&queue->send_list) && !queue->request; /* * if we're the first on the send_list and we can try to send @@ -278,25 +278,42 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req, */ if (queue->io_cpu == smp_processor_id() && sync && empty && mutex_trylock(&queue->send_mutex)) { + queue->more_requests = !last; nvme_tcp_try_send(queue); + queue->more_requests = false; mutex_unlock(&queue->send_mutex); - } else { + } else if (last) { queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); } } +static void nvme_tcp_process_req_list(struct nvme_tcp_queue *queue) +{ + struct nvme_tcp_request *req; + struct llist_node *node; + + for (node = llist_del_all(&queue->req_list); node; node = node->next) { + req = llist_entry(node, struct nvme_tcp_request, lentry); + list_add(&req->entry, &queue->send_list); + } +} + static inline struct nvme_tcp_request * nvme_tcp_fetch_request(struct nvme_tcp_queue *queue) { struct nvme_tcp_request *req; - spin_lock(&queue->lock); req = list_first_entry_or_null(&queue->send_list, struct nvme_tcp_request, entry); - if (req) - list_del(&req->entry); - spin_unlock(&queue->lock); + if (!req) { + nvme_tcp_process_req_list(queue); + req = list_first_entry_or_null(&queue->send_list, + struct nvme_tcp_request, entry); + if (unlikely(!req)) + return NULL; + } + list_del(&req->entry); return req; } @@ -596,7 +613,7 @@ static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue, req->state = NVME_TCP_SEND_H2C_PDU; req->offset = 0; - nvme_tcp_queue_request(req, false); + nvme_tcp_queue_request(req, false, true); return 0; } @@ -863,6 +880,12 @@ done: read_unlock(&sk->sk_callback_lock); } +static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue) +{ + return !list_empty(&queue->send_list) || + !llist_empty(&queue->req_list) || queue->more_requests; +} + static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue) { queue->request = NULL; @@ -884,7 +907,7 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) bool last = nvme_tcp_pdu_last_send(req, len); int ret, flags = MSG_DONTWAIT; - if (last && !queue->data_digest) + if (last && !queue->data_digest && !nvme_tcp_queue_more(queue)) flags |= MSG_EOR; else flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST; @@ -931,7 +954,7 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req) int flags = MSG_DONTWAIT; int ret; - if (inline_data) + if (inline_data || nvme_tcp_queue_more(queue)) flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST; else flags |= MSG_EOR; @@ -996,12 +1019,17 @@ static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req) { struct nvme_tcp_queue *queue = req->queue; int ret; - struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR }; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; struct kvec iov = { .iov_base = &req->ddgst + req->offset, .iov_len = NVME_TCP_DIGEST_LENGTH - req->offset }; + if (nvme_tcp_queue_more(queue)) + msg.msg_flags |= MSG_MORE; + else + msg.msg_flags |= MSG_EOR; + ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len); if (unlikely(ret <= 0)) return ret; @@ -1344,8 +1372,8 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int ret, rcv_pdu_size; queue->ctrl = ctrl; + init_llist_head(&queue->req_list); INIT_LIST_HEAD(&queue->send_list); - spin_lock_init(&queue->lock); mutex_init(&queue->send_mutex); INIT_WORK(&queue->io_work, nvme_tcp_io_work); queue->queue_size = queue_size; @@ -1746,15 +1774,20 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) ret = PTR_ERR(ctrl->connect_q); goto out_free_tag_set; } - } else { - blk_mq_update_nr_hw_queues(ctrl->tagset, - ctrl->queue_count - 1); } ret = nvme_tcp_start_io_queues(ctrl); if (ret) goto out_cleanup_connect_q; + if (!new) { + nvme_start_queues(ctrl); + nvme_wait_freeze(ctrl); + blk_mq_update_nr_hw_queues(ctrl->tagset, + ctrl->queue_count - 1); + nvme_unfreeze(ctrl); + } + return 0; out_cleanup_connect_q: @@ -1859,6 +1892,7 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl, { if (ctrl->queue_count <= 1) return; + nvme_start_freeze(ctrl); nvme_stop_queues(ctrl); nvme_tcp_stop_io_queues(ctrl); if (ctrl->tagset) { @@ -1925,11 +1959,12 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new) if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) { /* - * state change failure is ok if we're in DELETING state, + * state change failure is ok if we started ctrl delete, * unless we're during creation of a new controller to * avoid races with teardown flow. */ - WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING); + WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING && + ctrl->state != NVME_CTRL_DELETING_NOIO); WARN_ON_ONCE(new); ret = -EINVAL; goto destroy_io; @@ -1985,8 +2020,9 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work) blk_mq_unquiesce_queue(ctrl->admin_q); if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) { - /* state change failure is ok if we're in DELETING state */ - WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING); + /* state change failure is ok if we started ctrl delete */ + WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING && + ctrl->state != NVME_CTRL_DELETING_NOIO); return; } @@ -2021,8 +2057,9 @@ static void nvme_reset_ctrl_work(struct work_struct *work) nvme_tcp_teardown_ctrl(ctrl, false); if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) { - /* state change failure is ok if we're in DELETING state */ - WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING); + /* state change failure is ok if we started ctrl delete */ + WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING && + ctrl->state != NVME_CTRL_DELETING_NOIO); return; } @@ -2109,7 +2146,7 @@ static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg) ctrl->async_req.curr_bio = NULL; ctrl->async_req.data_len = 0; - nvme_tcp_queue_request(&ctrl->async_req, true); + nvme_tcp_queue_request(&ctrl->async_req, true, true); } static enum blk_eh_timer_return @@ -2221,6 +2258,14 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns, return 0; } +static void nvme_tcp_commit_rqs(struct blk_mq_hw_ctx *hctx) +{ + struct nvme_tcp_queue *queue = hctx->driver_data; + + if (!llist_empty(&queue->req_list)) + queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); +} + static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { @@ -2240,7 +2285,7 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx, blk_mq_start_request(rq); - nvme_tcp_queue_request(req, true); + nvme_tcp_queue_request(req, true, bd->last); return BLK_STS_OK; } @@ -2308,6 +2353,7 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx) static const struct blk_mq_ops nvme_tcp_mq_ops = { .queue_rq = nvme_tcp_queue_rq, + .commit_rqs = nvme_tcp_commit_rqs, .complete = nvme_complete_rq, .init_request = nvme_tcp_init_request, .exit_request = nvme_tcp_exit_request, diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c new file mode 100644 index 000000000000..57cfd78731fb --- /dev/null +++ b/drivers/nvme/host/zns.c @@ -0,0 +1,256 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020 Western Digital Corporation or its affiliates. + */ + +#include <linux/blkdev.h> +#include <linux/vmalloc.h> +#include "nvme.h" + +static int nvme_set_max_append(struct nvme_ctrl *ctrl) +{ + struct nvme_command c = { }; + struct nvme_id_ctrl_zns *id; + int status; + + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) + return -ENOMEM; + + c.identify.opcode = nvme_admin_identify; + c.identify.cns = NVME_ID_CNS_CS_CTRL; + c.identify.csi = NVME_CSI_ZNS; + + status = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id)); + if (status) { + kfree(id); + return status; + } + + if (id->zasl) + ctrl->max_zone_append = 1 << (id->zasl + 3); + else + ctrl->max_zone_append = ctrl->max_hw_sectors; + kfree(id); + return 0; +} + +int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns, + unsigned lbaf) +{ + struct nvme_effects_log *log = ns->head->effects; + struct request_queue *q = disk->queue; + struct nvme_command c = { }; + struct nvme_id_ns_zns *id; + int status; + + /* Driver requires zone append support */ + if (!(le32_to_cpu(log->iocs[nvme_cmd_zone_append]) & + NVME_CMD_EFFECTS_CSUPP)) { + dev_warn(ns->ctrl->device, + "append not supported for zoned namespace:%d\n", + ns->head->ns_id); + return -EINVAL; + } + + /* Lazily query controller append limit for the first zoned namespace */ + if (!ns->ctrl->max_zone_append) { + status = nvme_set_max_append(ns->ctrl); + if (status) + return status; + } + + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) + return -ENOMEM; + + c.identify.opcode = nvme_admin_identify; + c.identify.nsid = cpu_to_le32(ns->head->ns_id); + c.identify.cns = NVME_ID_CNS_CS_NS; + c.identify.csi = NVME_CSI_ZNS; + + status = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, id, sizeof(*id)); + if (status) + goto free_data; + + /* + * We currently do not handle devices requiring any of the zoned + * operation characteristics. + */ + if (id->zoc) { + dev_warn(ns->ctrl->device, + "zone operations:%x not supported for namespace:%u\n", + le16_to_cpu(id->zoc), ns->head->ns_id); + status = -EINVAL; + goto free_data; + } + + ns->zsze = nvme_lba_to_sect(ns, le64_to_cpu(id->lbafe[lbaf].zsze)); + if (!is_power_of_2(ns->zsze)) { + dev_warn(ns->ctrl->device, + "invalid zone size:%llu for namespace:%u\n", + ns->zsze, ns->head->ns_id); + status = -EINVAL; + goto free_data; + } + + q->limits.zoned = BLK_ZONED_HM; + blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); + blk_queue_max_open_zones(q, le32_to_cpu(id->mor) + 1); + blk_queue_max_active_zones(q, le32_to_cpu(id->mar) + 1); +free_data: + kfree(id); + return status; +} + +static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns, + unsigned int nr_zones, size_t *buflen) +{ + struct request_queue *q = ns->disk->queue; + size_t bufsize; + void *buf; + + const size_t min_bufsize = sizeof(struct nvme_zone_report) + + sizeof(struct nvme_zone_descriptor); + + nr_zones = min_t(unsigned int, nr_zones, + get_capacity(ns->disk) >> ilog2(ns->zsze)); + + bufsize = sizeof(struct nvme_zone_report) + + nr_zones * sizeof(struct nvme_zone_descriptor); + bufsize = min_t(size_t, bufsize, + queue_max_hw_sectors(q) << SECTOR_SHIFT); + bufsize = min_t(size_t, bufsize, queue_max_segments(q) << PAGE_SHIFT); + + while (bufsize >= min_bufsize) { + buf = __vmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY); + if (buf) { + *buflen = bufsize; + return buf; + } + bufsize >>= 1; + } + return NULL; +} + +static int __nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector, + struct nvme_zone_report *report, + size_t buflen) +{ + struct nvme_command c = { }; + int ret; + + c.zmr.opcode = nvme_cmd_zone_mgmt_recv; + c.zmr.nsid = cpu_to_le32(ns->head->ns_id); + c.zmr.slba = cpu_to_le64(nvme_sect_to_lba(ns, sector)); + c.zmr.numd = cpu_to_le32(nvme_bytes_to_numd(buflen)); + c.zmr.zra = NVME_ZRA_ZONE_REPORT; + c.zmr.zrasf = NVME_ZRASF_ZONE_REPORT_ALL; + c.zmr.pr = NVME_REPORT_ZONE_PARTIAL; + + ret = nvme_submit_sync_cmd(ns->queue, &c, report, buflen); + if (ret) + return ret; + + return le64_to_cpu(report->nr_zones); +} + +static int nvme_zone_parse_entry(struct nvme_ns *ns, + struct nvme_zone_descriptor *entry, + unsigned int idx, report_zones_cb cb, + void *data) +{ + struct blk_zone zone = { }; + + if ((entry->zt & 0xf) != NVME_ZONE_TYPE_SEQWRITE_REQ) { + dev_err(ns->ctrl->device, "invalid zone type %#x\n", + entry->zt); + return -EINVAL; + } + + zone.type = BLK_ZONE_TYPE_SEQWRITE_REQ; + zone.cond = entry->zs >> 4; + zone.len = ns->zsze; + zone.capacity = nvme_lba_to_sect(ns, le64_to_cpu(entry->zcap)); + zone.start = nvme_lba_to_sect(ns, le64_to_cpu(entry->zslba)); + zone.wp = nvme_lba_to_sect(ns, le64_to_cpu(entry->wp)); + + return cb(&zone, idx, data); +} + +static int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data) +{ + struct nvme_zone_report *report; + int ret, zone_idx = 0; + unsigned int nz, i; + size_t buflen; + + report = nvme_zns_alloc_report_buffer(ns, nr_zones, &buflen); + if (!report) + return -ENOMEM; + + sector &= ~(ns->zsze - 1); + while (zone_idx < nr_zones && sector < get_capacity(ns->disk)) { + memset(report, 0, buflen); + ret = __nvme_ns_report_zones(ns, sector, report, buflen); + if (ret < 0) + goto out_free; + + nz = min_t(unsigned int, ret, nr_zones); + if (!nz) + break; + + for (i = 0; i < nz && zone_idx < nr_zones; i++) { + ret = nvme_zone_parse_entry(ns, &report->entries[i], + zone_idx, cb, data); + if (ret) + goto out_free; + zone_idx++; + } + + sector += ns->zsze * nz; + } + + if (zone_idx > 0) + ret = zone_idx; + else + ret = -EINVAL; +out_free: + kvfree(report); + return ret; +} + +int nvme_report_zones(struct gendisk *disk, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data) +{ + struct nvme_ns_head *head = NULL; + struct nvme_ns *ns; + int srcu_idx, ret; + + ns = nvme_get_ns_from_disk(disk, &head, &srcu_idx); + if (unlikely(!ns)) + return -EWOULDBLOCK; + + if (ns->head->ids.csi == NVME_CSI_ZNS) + ret = nvme_ns_report_zones(ns, sector, nr_zones, cb, data); + else + ret = -EINVAL; + nvme_put_ns_from_disk(head, srcu_idx); + + return ret; +} + +blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req, + struct nvme_command *c, enum nvme_zone_mgmt_action action) +{ + c->zms.opcode = nvme_cmd_zone_mgmt_send; + c->zms.nsid = cpu_to_le32(ns->head->ns_id); + c->zms.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req))); + c->zms.zsa = action; + + if (req_op(req) == REQ_OP_ZONE_RESET_ALL) + c->zms.select_all = 1; + + return BLK_STS_OK; +} |