Diffstat (limited to 'drivers/nvme/host/core.c')
-rw-r--r-- | drivers/nvme/host/core.c | 561
1 file changed, 412 insertions(+), 149 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 6bdcdd984394..767d62985bba 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -89,7 +89,7 @@ static dev_t nvme_chr_devt;
 static struct class *nvme_class;
 static struct class *nvme_subsys_class;
 
-static int nvme_revalidate_disk(struct gendisk *disk);
+static int _nvme_revalidate_disk(struct gendisk *disk);
 static void nvme_put_subsystem(struct nvme_subsystem *subsys);
 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
 					   unsigned nsid);
@@ -100,7 +100,7 @@ static void nvme_set_queue_dying(struct nvme_ns *ns)
 	 * Revalidating a dead namespace sets capacity to 0. This will end
 	 * buffered writers dirtying pages that can't be synced.
 	 */
-	if (!ns->disk || test_and_set_bit(NVME_NS_DEAD, &ns->flags))
+	if (test_and_set_bit(NVME_NS_DEAD, &ns->flags))
 		return;
 	blk_set_queue_dying(ns->queue);
 	/* Forcibly unquiesce queues to avoid blocking dispatch */
@@ -287,6 +287,10 @@ void nvme_complete_rq(struct request *req)
 			nvme_retry_req(req);
 			return;
 		}
+	} else if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
+		   req_op(req) == REQ_OP_ZONE_APPEND) {
+		req->__sector = nvme_lba_to_sect(req->q->queuedata,
+			le64_to_cpu(nvme_req(req)->result.u64));
 	}
 
 	nvme_trace_bio_complete(req, status);
@@ -362,6 +366,16 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
 			break;
 		}
 		break;
+	case NVME_CTRL_DELETING_NOIO:
+		switch (old_state) {
+		case NVME_CTRL_DELETING:
+		case NVME_CTRL_DEAD:
+			changed = true;
+			/* FALLTHRU */
+		default:
+			break;
+		}
+		break;
 	case NVME_CTRL_DEAD:
 		switch (old_state) {
 		case NVME_CTRL_DELETING:
@@ -399,6 +413,7 @@ static bool nvme_state_terminal(struct nvme_ctrl *ctrl)
 	case NVME_CTRL_CONNECTING:
 		return false;
 	case NVME_CTRL_DELETING:
+	case NVME_CTRL_DELETING_NOIO:
 	case NVME_CTRL_DEAD:
 		return true;
 	default:
@@ -450,10 +465,11 @@ static void nvme_free_ns(struct kref *kref)
 	kfree(ns);
 }
 
-static void nvme_put_ns(struct nvme_ns *ns)
+void nvme_put_ns(struct nvme_ns *ns)
 {
 	kref_put(&ns->kref, nvme_free_ns);
 }
+EXPORT_SYMBOL_NS_GPL(nvme_put_ns, NVME_TARGET_PASSTHRU);
 
 static inline void nvme_clear_nvme_request(struct request *req)
 {
@@ -555,7 +571,7 @@ static int nvme_configure_directives(struct nvme_ctrl *ctrl)
 		goto out_disable_stream;
 	}
 
-	ctrl->nr_streams = min_t(unsigned, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1);
+	ctrl->nr_streams = min_t(u16, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1);
 	dev_info(ctrl->device, "Using %u streams\n", ctrl->nr_streams);
 	return 0;
 
@@ -589,6 +605,14 @@ static void nvme_assign_write_stream(struct nvme_ctrl *ctrl,
 		req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9;
 }
 
+static void nvme_setup_passthrough(struct request *req,
+		struct nvme_command *cmd)
+{
+	memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd));
+	/* passthru commands should let the driver set the SGL flags */
+	cmd->common.flags &= ~NVME_CMD_SGL_ALL;
+}
+
 static inline void nvme_setup_flush(struct nvme_ns *ns,
 		struct nvme_command *cmnd)
 {
@@ -673,7 +697,8 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
 }
 
 static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
-		struct request *req, struct nvme_command *cmnd)
+		struct request *req, struct nvme_command *cmnd,
+		enum nvme_opcode op)
 {
 	struct nvme_ctrl *ctrl = ns->ctrl;
 	u16 control = 0;
@@ -687,7 +712,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
 	if (req->cmd_flags & REQ_RAHEAD)
 		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
 
-	cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
+	cmnd->rw.opcode = op;
 	cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
 	cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
 	cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
@@ -716,6 +741,8 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
 	case NVME_NS_DPS_PI_TYPE2:
 		control |= NVME_RW_PRINFO_PRCHK_GUARD |
 			NVME_RW_PRINFO_PRCHK_REF;
+		if (op == nvme_cmd_zone_append)
+			control |= NVME_RW_APPEND_PIREMAP;
 		cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
 		break;
 	}
@@ -751,11 +778,24 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
 	switch (req_op(req)) {
 	case REQ_OP_DRV_IN:
 	case REQ_OP_DRV_OUT:
-		memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd));
+		nvme_setup_passthrough(req, cmd);
 		break;
 	case REQ_OP_FLUSH:
 		nvme_setup_flush(ns, cmd);
 		break;
+	case REQ_OP_ZONE_RESET_ALL:
+	case REQ_OP_ZONE_RESET:
+		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET);
+		break;
+	case REQ_OP_ZONE_OPEN:
+		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN);
+		break;
+	case REQ_OP_ZONE_CLOSE:
+		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE);
+		break;
+	case REQ_OP_ZONE_FINISH:
+		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH);
+		break;
 	case REQ_OP_WRITE_ZEROES:
 		ret = nvme_setup_write_zeroes(ns, req, cmd);
 		break;
@@ -763,8 +803,13 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
 		ret = nvme_setup_discard(ns, req, cmd);
 		break;
 	case REQ_OP_READ:
+		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read);
+		break;
 	case REQ_OP_WRITE:
-		ret = nvme_setup_rw(ns, req, cmd);
+		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
+		break;
+	case REQ_OP_ZONE_APPEND:
+		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
 		break;
 	default:
 		WARN_ON_ONCE(1);
@@ -884,6 +929,120 @@ out:
 	return ERR_PTR(ret);
 }
 
+static u32 nvme_known_admin_effects(u8 opcode)
+{
+	switch (opcode) {
+	case nvme_admin_format_nvm:
+		return NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC |
+			NVME_CMD_EFFECTS_CSE_MASK;
+	case nvme_admin_sanitize_nvm:
+		return NVME_CMD_EFFECTS_CSE_MASK;
+	default:
+		break;
+	}
+	return 0;
+}
+
+u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode)
+{
+	u32 effects = 0;
+
+	if (ns) {
+		if (ns->head->effects)
+			effects = le32_to_cpu(ns->head->effects->iocs[opcode]);
+		if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC))
+			dev_warn(ctrl->device,
+				 "IO command:%02x has unhandled effects:%08x\n",
+				 opcode, effects);
+		return 0;
+	}
+
+	if (ctrl->effects)
+		effects = le32_to_cpu(ctrl->effects->acs[opcode]);
+	effects |= nvme_known_admin_effects(opcode);
+
+	return effects;
+}
+EXPORT_SYMBOL_NS_GPL(nvme_command_effects, NVME_TARGET_PASSTHRU);
+
+static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
+			       u8 opcode)
+{
+	u32 effects = nvme_command_effects(ctrl, ns, opcode);
+
+	/*
+	 * For simplicity, IO to all namespaces is quiesced even if the command
+	 * effects say only one namespace is affected.
+	 */
+	if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
+		mutex_lock(&ctrl->scan_lock);
+		mutex_lock(&ctrl->subsys->lock);
+		nvme_mpath_start_freeze(ctrl->subsys);
+		nvme_mpath_wait_freeze(ctrl->subsys);
+		nvme_start_freeze(ctrl);
+		nvme_wait_freeze(ctrl);
+	}
+	return effects;
+}
+
+static void nvme_update_formats(struct nvme_ctrl *ctrl, u32 *effects)
+{
+	struct nvme_ns *ns;
+
+	down_read(&ctrl->namespaces_rwsem);
+	list_for_each_entry(ns, &ctrl->namespaces, list)
+		if (_nvme_revalidate_disk(ns->disk))
+			nvme_set_queue_dying(ns);
+		else if (blk_queue_is_zoned(ns->disk->queue)) {
+			/*
+			 * IO commands are required to fully revalidate a zoned
+			 * device. Force the command effects to trigger rescan
+			 * work so report zones can run in a context with
+			 * unfrozen IO queues.
+			 */
+			*effects |= NVME_CMD_EFFECTS_NCC;
+		}
+	up_read(&ctrl->namespaces_rwsem);
+}
+
+static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
+{
+	/*
+	 * Revalidate LBA changes prior to unfreezing. This is necessary to
+	 * prevent memory corruption if a logical block size was changed by
+	 * this command.
+	 */
+	if (effects & NVME_CMD_EFFECTS_LBCC)
+		nvme_update_formats(ctrl, &effects);
+	if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
+		nvme_unfreeze(ctrl);
+		nvme_mpath_unfreeze(ctrl->subsys);
+		mutex_unlock(&ctrl->subsys->lock);
+		nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL);
+		mutex_unlock(&ctrl->scan_lock);
+	}
+	if (effects & NVME_CMD_EFFECTS_CCC)
+		nvme_init_identify(ctrl);
+	if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) {
+		nvme_queue_scan(ctrl);
+		flush_work(&ctrl->scan_work);
+	}
+}
+
+void nvme_execute_passthru_rq(struct request *rq)
+{
+	struct nvme_command *cmd = nvme_req(rq)->cmd;
+	struct nvme_ctrl *ctrl = nvme_req(rq)->ctrl;
+	struct nvme_ns *ns = rq->q->queuedata;
+	struct gendisk *disk = ns ? ns->disk : NULL;
+	u32 effects;
+
+	effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode);
+	blk_execute_rq(rq->q, disk, rq, 0);
+	nvme_passthru_end(ctrl, effects);
+}
+EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU);
+
 static int nvme_submit_user_cmd(struct request_queue *q,
 		struct nvme_command *cmd, void __user *ubuffer,
 		unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
@@ -922,7 +1081,7 @@ static int nvme_submit_user_cmd(struct request_queue *q,
 		}
 	}
 
-	blk_execute_rq(req->q, disk, req, 0);
+	nvme_execute_passthru_rq(req);
 	if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
 		ret = -EINTR;
 	else
@@ -1056,8 +1215,13 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
 	return error;
 }
 
+static bool nvme_multi_css(struct nvme_ctrl *ctrl)
+{
+	return (ctrl->ctrl_config & NVME_CC_CSS_MASK) == NVME_CC_CSS_CSI;
+}
+
 static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
-		struct nvme_ns_id_desc *cur)
+		struct nvme_ns_id_desc *cur, bool *csi_seen)
 {
 	const char *warn_str = "ctrl returned bogus length:";
 	void *data = cur;
@@ -1087,6 +1251,15 @@ static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
 		}
 		uuid_copy(&ids->uuid, data + sizeof(*cur));
 		return NVME_NIDT_UUID_LEN;
+	case NVME_NIDT_CSI:
+		if (cur->nidl != NVME_NIDT_CSI_LEN) {
+			dev_warn(ctrl->device, "%s %d for NVME_NIDT_CSI\n",
+				 warn_str, cur->nidl);
+			return -1;
+		}
+		memcpy(&ids->csi, data + sizeof(*cur), NVME_NIDT_CSI_LEN);
+		*csi_seen = true;
+		return NVME_NIDT_CSI_LEN;
 	default:
 		/* Skip unknown types */
 		return cur->nidl;
@@ -1097,10 +1270,9 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
 		struct nvme_ns_ids *ids)
 {
 	struct nvme_command c = { };
-	int status;
+	bool csi_seen = false;
+	int status, pos, len;
 	void *data;
-	int pos;
-	int len;
 
 	if (ctrl->quirks & NVME_QUIRK_NO_NS_DESC_LIST)
 		return 0;
@@ -1127,12 +1299,19 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
 		if (cur->nidl == 0)
 			break;
 
-		len = nvme_process_ns_desc(ctrl, ids, cur);
+		len = nvme_process_ns_desc(ctrl, ids, cur, &csi_seen);
 		if (len < 0)
-			goto free_data;
+			break;
 
 		len += sizeof(*cur);
 	}
+
+	if (nvme_multi_css(ctrl) && !csi_seen) {
+		dev_warn(ctrl->device, "Command set not reported for nsid:%d\n",
+			 nsid);
+		status = -EINVAL;
+	}
+
 free_data:
 	kfree(data);
 	return status;
@@ -1321,96 +1500,12 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
 			metadata, meta_len, lower_32_bits(io.slba), NULL, 0);
 }
 
-static u32 nvme_known_admin_effects(u8 opcode)
-{
-	switch (opcode) {
-	case nvme_admin_format_nvm:
-		return NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC |
-			NVME_CMD_EFFECTS_CSE_MASK;
-	case nvme_admin_sanitize_nvm:
-		return NVME_CMD_EFFECTS_CSE_MASK;
-	default:
-		break;
-	}
-	return 0;
-}
-
-static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
-								u8 opcode)
-{
-	u32 effects = 0;
-
-	if (ns) {
-		if (ctrl->effects)
-			effects = le32_to_cpu(ctrl->effects->iocs[opcode]);
-		if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC))
-			dev_warn(ctrl->device,
-				 "IO command:%02x has unhandled effects:%08x\n",
-				 opcode, effects);
-		return 0;
-	}
-
-	if (ctrl->effects)
-		effects = le32_to_cpu(ctrl->effects->acs[opcode]);
-	effects |= nvme_known_admin_effects(opcode);
-
-	/*
-	 * For simplicity, IO to all namespaces is quiesced even if the command
-	 * effects say only one namespace is affected.
-	 */
-	if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
-		mutex_lock(&ctrl->scan_lock);
-		mutex_lock(&ctrl->subsys->lock);
-		nvme_mpath_start_freeze(ctrl->subsys);
-		nvme_mpath_wait_freeze(ctrl->subsys);
-		nvme_start_freeze(ctrl);
-		nvme_wait_freeze(ctrl);
-	}
-	return effects;
-}
-
-static void nvme_update_formats(struct nvme_ctrl *ctrl)
-{
-	struct nvme_ns *ns;
-
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list)
-		if (ns->disk && nvme_revalidate_disk(ns->disk))
-			nvme_set_queue_dying(ns);
-	up_read(&ctrl->namespaces_rwsem);
-}
-
-static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
-{
-	/*
-	 * Revalidate LBA changes prior to unfreezing. This is necessary to
-	 * prevent memory corruption if a logical block size was changed by
-	 * this command.
-	 */
-	if (effects & NVME_CMD_EFFECTS_LBCC)
-		nvme_update_formats(ctrl);
-	if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
-		nvme_unfreeze(ctrl);
-		nvme_mpath_unfreeze(ctrl->subsys);
-		mutex_unlock(&ctrl->subsys->lock);
-		nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL);
-		mutex_unlock(&ctrl->scan_lock);
-	}
-	if (effects & NVME_CMD_EFFECTS_CCC)
-		nvme_init_identify(ctrl);
-	if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) {
-		nvme_queue_scan(ctrl);
-		flush_work(&ctrl->scan_work);
-	}
-}
-
 static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 			struct nvme_passthru_cmd __user *ucmd)
 {
 	struct nvme_passthru_cmd cmd;
 	struct nvme_command c;
 	unsigned timeout = 0;
-	u32 effects;
 	u64 result;
 	int status;
 
@@ -1437,12 +1532,10 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	if (cmd.timeout_ms)
 		timeout = msecs_to_jiffies(cmd.timeout_ms);
 
-	effects = nvme_passthru_start(ctrl, ns, cmd.opcode);
 	status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
 			nvme_to_user_ptr(cmd.addr), cmd.data_len,
 			nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
 			0, &result, timeout);
-	nvme_passthru_end(ctrl, effects);
 
 	if (status >= 0) {
 		if (put_user(result, &ucmd->result))
@@ -1458,7 +1551,6 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	struct nvme_passthru_cmd64 cmd;
 	struct nvme_command c;
 	unsigned timeout = 0;
-	u32 effects;
 	int status;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -1484,12 +1576,10 @@
 	if (cmd.timeout_ms)
 		timeout = msecs_to_jiffies(cmd.timeout_ms);
 
-	effects = nvme_passthru_start(ctrl, ns, cmd.opcode);
 	status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
 			nvme_to_user_ptr(cmd.addr), cmd.data_len,
 			nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
 			0, &cmd.result, timeout);
-	nvme_passthru_end(ctrl, effects);
 
 	if (status >= 0) {
 		if (put_user(cmd.result, &ucmd->result))
@@ -1503,7 +1593,7 @@
  * Issue ioctl requests on the first available path. Note that unlike normal
  * block layer requests we will not retry failed request on another controller.
  */
-static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
+struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
 		struct nvme_ns_head **head, int *srcu_idx)
 {
 #ifdef CONFIG_NVME_MULTIPATH
@@ -1523,7 +1613,7 @@
 	return disk->private_data;
 }
 
-static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
+void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
 {
 	if (head)
 		srcu_read_unlock(&head->srcu, idx);
@@ -1789,7 +1879,7 @@ static int nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
 		memcpy(ids->eui64, id->eui64, sizeof(id->eui64));
 	if (ctrl->vs >= NVME_VS(1, 2, 0))
 		memcpy(ids->nguid, id->nguid, sizeof(id->nguid));
-	if (ctrl->vs >= NVME_VS(1, 3, 0))
+	if (ctrl->vs >= NVME_VS(1, 3, 0) || nvme_multi_css(ctrl))
 		return nvme_identify_ns_descs(ctrl, nsid, ids);
 	return 0;
 }
@@ -1805,7 +1895,8 @@ static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
 {
 	return uuid_equal(&a->uuid, &b->uuid) &&
 		memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 &&
-		memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0;
+		memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0 &&
+		a->csi == b->csi;
 }
 
 static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
@@ -1915,18 +2006,38 @@ static void nvme_update_disk_info(struct gendisk *disk,
 
 static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
 {
+	unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
 	struct nvme_ns *ns = disk->private_data;
 	struct nvme_ctrl *ctrl = ns->ctrl;
+	int ret;
 	u32 iob;
 
 	/*
 	 * If identify namespace failed, use default 512 byte block size so
 	 * block layer can use before failing read/write for 0 capacity.
	 */
-	ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds;
+	ns->lba_shift = id->lbaf[lbaf].ds;
 	if (ns->lba_shift == 0)
 		ns->lba_shift = 9;
 
+	switch (ns->head->ids.csi) {
+	case NVME_CSI_NVM:
+		break;
+	case NVME_CSI_ZNS:
+		ret = nvme_update_zone_info(disk, ns, lbaf);
+		if (ret) {
+			dev_warn(ctrl->device,
+				"failed to add zoned namespace:%u ret:%d\n",
+				ns->head->ns_id, ret);
+			return ret;
+		}
+		break;
+	default:
+		dev_warn(ctrl->device, "unknown csi:%u ns:%u\n",
+			ns->head->ids.csi, ns->head->ns_id);
+		return -ENODEV;
+	}
+
 	if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
 	    is_power_of_2(ctrl->max_hw_sectors))
 		iob = ctrl->max_hw_sectors;
@@ -1934,7 +2045,7 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
 		iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));
 
 	ns->features = 0;
-	ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
+	ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
 	/* the PI implementation requires metadata equal t10 pi tuple size */
 	if (ns->ms == sizeof(struct t10_pi_tuple))
 		ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
@@ -1977,7 +2088,7 @@
 	return 0;
 }
 
-static int nvme_revalidate_disk(struct gendisk *disk)
+static int _nvme_revalidate_disk(struct gendisk *disk)
 {
 	struct nvme_ns *ns = disk->private_data;
 	struct nvme_ctrl *ctrl = ns->ctrl;
@@ -2025,6 +2136,28 @@ out:
 	return ret;
 }
 
+static int nvme_revalidate_disk(struct gendisk *disk)
+{
+	int ret;
+
+	ret = _nvme_revalidate_disk(disk);
+	if (ret)
+		return ret;
+
+#ifdef CONFIG_BLK_DEV_ZONED
+	if (blk_queue_is_zoned(disk->queue)) {
+		struct nvme_ns *ns = disk->private_data;
+		struct nvme_ctrl *ctrl = ns->ctrl;
+
+		ret = blk_revalidate_disk_zones(disk, NULL);
+		if (!ret)
+			blk_queue_max_zone_append_sectors(disk->queue,
+							  ctrl->max_zone_append);
+	}
+#endif
+	return ret;
+}
+
 static char nvme_pr_type(enum pr_type type)
 {
 	switch (type) {
@@ -2155,6 +2288,7 @@ static const struct block_device_operations nvme_fops = {
 	.release	= nvme_release,
 	.getgeo		= nvme_getgeo,
 	.revalidate_disk= nvme_revalidate_disk,
+	.report_zones	= nvme_report_zones,
 	.pr_ops		= &nvme_pr_ops,
 };
 
@@ -2181,6 +2315,7 @@ const struct block_device_operations nvme_ns_head_ops = {
 	.ioctl		= nvme_ioctl,
 	.compat_ioctl	= nvme_compat_ioctl,
 	.getgeo		= nvme_getgeo,
+	.report_zones	= nvme_report_zones,
 	.pr_ops		= &nvme_pr_ops,
 };
 #endif /* CONFIG_NVME_MULTIPATH */
@@ -2238,12 +2373,7 @@ EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
 
 int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
 {
-	/*
-	 * Default to a 4K page size, with the intention to update this
-	 * path in the future to accomodate architectures with differing
-	 * kernel and IO page sizes.
-	 */
-	unsigned dev_page_min, page_shift = 12;
+	unsigned dev_page_min;
 	int ret;
 
 	ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
@@ -2253,17 +2383,18 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
 	}
 
 	dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12;
-	if (page_shift < dev_page_min) {
+	if (NVME_CTRL_PAGE_SHIFT < dev_page_min) {
 		dev_err(ctrl->device,
 			"Minimum device page size %u too large for host (%u)\n",
-			1 << dev_page_min, 1 << page_shift);
+			1 << dev_page_min, 1 << NVME_CTRL_PAGE_SHIFT);
 		return -ENODEV;
 	}
 
-	ctrl->page_size = 1 << page_shift;
-
-	ctrl->ctrl_config = NVME_CC_CSS_NVM;
-	ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
+	if (NVME_CAP_CSS(ctrl->cap) & NVME_CAP_CSS_CSI)
+		ctrl->ctrl_config = NVME_CC_CSS_CSI;
+	else
+		ctrl->ctrl_config = NVME_CC_CSS_NVM;
+	ctrl->ctrl_config |= (NVME_CTRL_PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
 	ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
 	ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
 	ctrl->ctrl_config |= NVME_CC_ENABLE;
@@ -2313,13 +2444,13 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
 
 	if (ctrl->max_hw_sectors) {
 		u32 max_segments =
-			(ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1;
+			(ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1;
 
 		max_segments = min_not_zero(max_segments, ctrl->max_segments);
 		blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
 		blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
 	}
-	blk_queue_virt_boundary(q, ctrl->page_size - 1);
+	blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1);
 	blk_queue_dma_alignment(q, 7);
 	if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
 		vwc = true;
@@ -2810,7 +2941,7 @@ out_unlock:
 	return ret;
 }
 
-int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp,
+int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
 		void *log, size_t size, u64 offset)
 {
 	struct nvme_command c = { };
@@ -2824,27 +2955,55 @@
 	c.get_log_page.numdu = cpu_to_le16(dwlen >> 16);
 	c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset));
 	c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset));
+	c.get_log_page.csi = csi;
 
 	return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
 }
 
-static int nvme_get_effects_log(struct nvme_ctrl *ctrl)
+static struct nvme_cel *nvme_find_cel(struct nvme_ctrl *ctrl, u8 csi)
+{
+	struct nvme_cel *cel, *ret = NULL;
+
+	spin_lock(&ctrl->lock);
+	list_for_each_entry(cel, &ctrl->cels, entry) {
+		if (cel->csi == csi) {
+			ret = cel;
+			break;
+		}
+	}
+	spin_unlock(&ctrl->lock);
+
+	return ret;
+}
+
+static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
+				struct nvme_effects_log **log)
 {
+	struct nvme_cel *cel = nvme_find_cel(ctrl, csi);
 	int ret;
 
-	if (!ctrl->effects)
-		ctrl->effects = kzalloc(sizeof(*ctrl->effects), GFP_KERNEL);
+	if (cel)
+		goto out;
 
-	if (!ctrl->effects)
-		return 0;
+	cel = kzalloc(sizeof(*cel), GFP_KERNEL);
+	if (!cel)
+		return -ENOMEM;
 
-	ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CMD_EFFECTS, 0,
-			ctrl->effects, sizeof(*ctrl->effects), 0);
+	ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CMD_EFFECTS, 0, csi,
+			   &cel->log, sizeof(cel->log), 0);
 	if (ret) {
-		kfree(ctrl->effects);
-		ctrl->effects = NULL;
+		kfree(cel);
+		return ret;
 	}
-	return ret;
+
+	cel->csi = csi;
+
+	spin_lock(&ctrl->lock);
+	list_add_tail(&cel->entry, &ctrl->cels);
+	spin_unlock(&ctrl->lock);
+out:
+	*log = &cel->log;
+	return 0;
 }
 
 /*
@@ -2865,7 +3024,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
 		return ret;
 	}
 	page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12;
-	ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
+	ctrl->sqsize = min_t(u16, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
 
 	if (ctrl->vs >= NVME_VS(1, 1, 0))
 		ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap);
@@ -2877,7 +3036,7 @@
 	}
 
 	if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) {
-		ret = nvme_get_effects_log(ctrl);
+		ret = nvme_get_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects);
 		if (ret < 0)
 			goto out_free;
 	}
@@ -2939,7 +3098,7 @@
 
 	if (id->rtd3e) {
 		/* us -> s */
-		u32 transition_time = le32_to_cpu(id->rtd3e) / 1000000;
+		u32 transition_time = le32_to_cpu(id->rtd3e) / USEC_PER_SEC;
 
 		ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time,
 						 shutdown_timeout, 60);
@@ -3345,6 +3504,7 @@ static ssize_t nvme_sysfs_show_state(struct device *dev,
 		[NVME_CTRL_RESETTING]	= "resetting",
 		[NVME_CTRL_CONNECTING]	= "connecting",
 		[NVME_CTRL_DELETING]	= "deleting",
+		[NVME_CTRL_DELETING_NOIO]= "deleting (no IO)",
 		[NVME_CTRL_DEAD]	= "dead",
 	};
 
@@ -3397,6 +3557,66 @@ static ssize_t nvme_sysfs_show_address(struct device *dev,
 }
 static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL);
 
+static ssize_t nvme_ctrl_loss_tmo_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+	struct nvmf_ctrl_options *opts = ctrl->opts;
+
+	if (ctrl->opts->max_reconnects == -1)
+		return sprintf(buf, "off\n");
+	return sprintf(buf, "%d\n",
+			opts->max_reconnects * opts->reconnect_delay);
+}
+
+static ssize_t nvme_ctrl_loss_tmo_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+	struct nvmf_ctrl_options *opts = ctrl->opts;
+	int ctrl_loss_tmo, err;
+
+	err = kstrtoint(buf, 10, &ctrl_loss_tmo);
+	if (err)
+		return -EINVAL;
+
+	else if (ctrl_loss_tmo < 0)
+		opts->max_reconnects = -1;
+	else
+		opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo,
+					opts->reconnect_delay);
+	return count;
+}
+static DEVICE_ATTR(ctrl_loss_tmo, S_IRUGO | S_IWUSR,
+	nvme_ctrl_loss_tmo_show, nvme_ctrl_loss_tmo_store);
+
+static ssize_t nvme_ctrl_reconnect_delay_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+
+	if (ctrl->opts->reconnect_delay == -1)
+		return sprintf(buf, "off\n");
+	return sprintf(buf, "%d\n", ctrl->opts->reconnect_delay);
+}
+
+static ssize_t nvme_ctrl_reconnect_delay_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+	unsigned int v;
+	int err;
+
+	err = kstrtou32(buf, 10, &v);
+	if (err)
+		return err;
+
+	ctrl->opts->reconnect_delay = v;
+	return count;
+}
+static DEVICE_ATTR(reconnect_delay, S_IRUGO | S_IWUSR,
+	nvme_ctrl_reconnect_delay_show, nvme_ctrl_reconnect_delay_store);
+
 static struct attribute *nvme_dev_attrs[] = {
 	&dev_attr_reset_controller.attr,
 	&dev_attr_rescan_controller.attr,
@@ -3414,6 +3634,8 @@ static struct attribute *nvme_dev_attrs[] = {
 	&dev_attr_sqsize.attr,
 	&dev_attr_hostnqn.attr,
 	&dev_attr_hostid.attr,
+	&dev_attr_ctrl_loss_tmo.attr,
+	&dev_attr_reconnect_delay.attr,
 	NULL
 };
 
@@ -3510,6 +3732,13 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
 		goto out_cleanup_srcu;
 	}
 
+	if (head->ids.csi) {
+		ret = nvme_get_effects_log(ctrl, head->ids.csi, &head->effects);
+		if (ret)
+			goto out_cleanup_srcu;
+	} else
+		head->effects = ctrl->effects;
+
 	ret = nvme_mpath_alloc_disk(ctrl, head);
 	if (ret)
 		goto out_cleanup_srcu;
@@ -3591,7 +3820,7 @@ static int ns_cmp(void *priv, struct list_head *a, struct list_head *b)
 	return nsa->head->ns_id - nsb->head->ns_id;
 }
 
-static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
+struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 {
 	struct nvme_ns *ns, *ret = NULL;
 
@@ -3609,6 +3838,7 @@
 	up_read(&ctrl->namespaces_rwsem);
 	return ret;
 }
+EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU);
 
 static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 {
@@ -3726,7 +3956,7 @@ static void nvme_ns_remove(struct nvme_ns *ns)
 	nvme_mpath_clear_current_path(ns);
 	synchronize_srcu(&ns->head->srcu); /* wait for concurrent submissions */
 
-	if (ns->disk && ns->disk->flags & GENHD_FL_UP) {
+	if (ns->disk->flags & GENHD_FL_UP) {
 		del_gendisk(ns->disk);
 		blk_cleanup_queue(ns->queue);
 		if (blk_get_integrity(ns->disk))
@@ -3757,7 +3987,7 @@ static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 
 	ns = nvme_find_get_ns(ctrl, nsid);
 	if (ns) {
-		if (ns->disk && revalidate_disk(ns->disk))
+		if (revalidate_disk(ns->disk))
 			nvme_ns_remove(ns);
 		nvme_put_ns(ns);
 	} else
@@ -3850,8 +4080,8 @@ static void nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl)
 	 * raced with us in reading the log page, which could cause us to miss
 	 * updates.
	 */
-	error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0, log,
-			log_size, 0);
+	error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0,
+			NVME_CSI_NVM, log, log_size, 0);
 	if (error)
 		dev_warn(ctrl->device,
 			"reading changed ns log failed: %d\n", error);
@@ -3912,6 +4142,9 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
 	if (ctrl->state == NVME_CTRL_DEAD)
 		nvme_kill_queues(ctrl);
 
+	/* this is a no-op when called from the controller reset handler */
+	nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);
+
 	down_write(&ctrl->namespaces_rwsem);
 	list_splice_init(&ctrl->namespaces, &ns_list);
 	up_write(&ctrl->namespaces_rwsem);
@@ -3995,8 +4228,8 @@ static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl)
 	if (!log)
 		return;
 
-	if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, log,
-			sizeof(*log), 0))
+	if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, NVME_CSI_NVM,
+			log, sizeof(*log), 0))
 		dev_warn(ctrl->device, "Get FW SLOT INFO log error\n");
 	kfree(log);
 }
@@ -4106,8 +4339,7 @@ EXPORT_SYMBOL_GPL(nvme_stop_ctrl);
 
 void nvme_start_ctrl(struct nvme_ctrl *ctrl)
 {
-	if (ctrl->kato)
-		nvme_start_keep_alive(ctrl);
+	nvme_start_keep_alive(ctrl);
 
 	nvme_enable_aen(ctrl);
 
@@ -4133,11 +4365,16 @@ static void nvme_free_ctrl(struct device *dev)
 	struct nvme_ctrl *ctrl =
 		container_of(dev, struct nvme_ctrl, ctrl_device);
 	struct nvme_subsystem *subsys = ctrl->subsys;
+	struct nvme_cel *cel, *next;
 
 	if (subsys && ctrl->instance != subsys->instance)
 		ida_simple_remove(&nvme_instance_ida, ctrl->instance);
 
-	kfree(ctrl->effects);
+	list_for_each_entry_safe(cel, next, &ctrl->cels, entry) {
+		list_del(&cel->entry);
+		kfree(cel);
+	}
+
 	nvme_mpath_uninit(ctrl);
 	__free_page(ctrl->discard_page);
 
@@ -4168,6 +4405,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 	spin_lock_init(&ctrl->lock);
 	mutex_init(&ctrl->scan_lock);
 	INIT_LIST_HEAD(&ctrl->namespaces);
+	INIT_LIST_HEAD(&ctrl->cels);
 	init_rwsem(&ctrl->namespaces_rwsem);
 	ctrl->dev = dev;
 	ctrl->ops = ops;
@@ -4346,6 +4584,29 @@ void nvme_sync_queues(struct nvme_ctrl *ctrl)
 }
 EXPORT_SYMBOL_GPL(nvme_sync_queues);
 
+struct nvme_ctrl *nvme_ctrl_get_by_path(const char *path)
+{
+	struct nvme_ctrl *ctrl;
+	struct file *f;
+
+	f = filp_open(path, O_RDWR, 0);
+	if (IS_ERR(f))
+		return ERR_CAST(f);
+
+	if (f->f_op != &nvme_dev_fops) {
+		ctrl = ERR_PTR(-EINVAL);
+		goto out_close;
+	}
+
+	ctrl = f->private_data;
+	nvme_get_ctrl(ctrl);
+
+out_close:
+	filp_close(f, NULL);
+	return ctrl;
+}
+EXPORT_SYMBOL_NS_GPL(nvme_ctrl_get_by_path, NVME_TARGET_PASSTHRU);
+
 /*
  * Check we didn't inadvertently grow the command structure sizes:
 */
@@ -4364,6 +4625,8 @@ static inline void _nvme_check_size(void)
 	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
 	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE);
 	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
+	BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE);
+	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE);
	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
 	BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
 	BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);