Diffstat (limited to 'drivers/nvme')
-rw-r--r--   drivers/nvme/host/Kconfig      |   1
-rw-r--r--   drivers/nvme/host/core.c       | 101
-rw-r--r--   drivers/nvme/host/fabrics.c    |  13
-rw-r--r--   drivers/nvme/host/fc.c         |  15
-rw-r--r--   drivers/nvme/host/multipath.c  |  98
-rw-r--r--   drivers/nvme/host/nvme.h       |  46
-rw-r--r--   drivers/nvme/host/pci.c        |  89
-rw-r--r--   drivers/nvme/host/rdma.c       |  84
-rw-r--r--   drivers/nvme/host/tcp.c        | 108
-rw-r--r--   drivers/nvme/target/core.c     |   9
-rw-r--r--   drivers/nvme/target/fc.c       |   4
-rw-r--r--   drivers/nvme/target/loop.c     |   3
-rw-r--r--   drivers/nvme/target/rdma.c     |  30
-rw-r--r--   drivers/nvme/target/tcp.c      |  10
14 files changed, 416 insertions, 195 deletions
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index 2b36f052bfb9..7b3f6555e67b 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -64,6 +64,7 @@ config NVME_TCP
 	depends on INET
 	depends on BLK_DEV_NVME
 	select NVME_FABRICS
+	select CRYPTO
 	select CRYPTO_CRC32C
 	help
 	  This provides support for the NVMe over Fabrics protocol using
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index f0e0af3aa714..ce69aaea581a 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -288,11 +288,8 @@ void nvme_complete_rq(struct request *req)
 		nvme_req(req)->ctrl->comp_seen = true;
 
 	if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
-		if ((req->cmd_flags & REQ_NVME_MPATH) &&
-		    blk_path_error(status)) {
-			nvme_failover_req(req);
+		if ((req->cmd_flags & REQ_NVME_MPATH) && nvme_failover_req(req))
 			return;
-		}
 
 		if (!blk_queue_dying(req->q)) {
 			nvme_retry_req(req);
@@ -633,7 +630,7 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
 	}
 
 	__rq_for_each_bio(bio, req) {
-		u64 slba = nvme_block_nr(ns, bio->bi_iter.bi_sector);
+		u64 slba = nvme_sect_to_lba(ns, bio->bi_iter.bi_sector);
 		u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift;
 
 		if (n < segments) {
@@ -674,7 +671,7 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
 	cmnd->write_zeroes.opcode = nvme_cmd_write_zeroes;
 	cmnd->write_zeroes.nsid = cpu_to_le32(ns->head->ns_id);
 	cmnd->write_zeroes.slba =
-		cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
+		cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
 	cmnd->write_zeroes.length =
 		cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
 	cmnd->write_zeroes.control = 0;
@@ -698,7 +695,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
 	cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
 	cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
-	cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
+	cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
 	cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
 
 	if (req_op(req) == REQ_OP_WRITE && ctrl->nr_streams)
@@ -1032,6 +1029,19 @@ void nvme_stop_keep_alive(struct nvme_ctrl *ctrl)
 }
 EXPORT_SYMBOL_GPL(nvme_stop_keep_alive);
 
+/*
+ * In NVMe 1.0 the CNS field was just a binary controller or namespace
+ * flag, thus sending any new CNS opcodes has a big chance of not working.
+ * Qemu unfortunately had that bug after reporting a 1.1 version compliance
+ * (but not for any later version).
+ */
+static bool nvme_ctrl_limited_cns(struct nvme_ctrl *ctrl)
+{
+	if (ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)
+		return ctrl->vs < NVME_VS(1, 2, 0);
+	return ctrl->vs < NVME_VS(1, 1, 0);
+}
+
 static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
 {
 	struct nvme_command c = { };
@@ -1061,6 +1071,9 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
 	int pos;
 	int len;
 
+	if (ctrl->quirks & NVME_QUIRK_NO_NS_DESC_LIST)
+		return 0;
+
 	c.identify.opcode = nvme_admin_identify;
 	c.identify.nsid = cpu_to_le32(nsid);
 	c.identify.cns = NVME_ID_CNS_NS_DESC_LIST;
@@ -1074,12 +1087,6 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
 	if (status) {
 		dev_warn(ctrl->device,
 			"Identify Descriptors failed (%d)\n", status);
-		/*
-		 * Don't treat an error as fatal, as we potentially already
-		 * have a NGUID or EUI-64.
-		 */
-		if (status > 0 && !(status & NVME_SC_DNR))
-			status = 0;
 		goto free_data;
 	}
 
@@ -1673,12 +1680,6 @@ static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type)
 }
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
 
-static void nvme_set_chunk_size(struct nvme_ns *ns)
-{
-	u32 chunk_size = (((u32)ns->noiob) << (ns->lba_shift - 9));
-	blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(chunk_size));
-}
-
 static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns)
 {
 	struct nvme_ctrl *ctrl = ns->ctrl;
@@ -1712,8 +1713,7 @@ static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns)
 
 static void nvme_config_write_zeroes(struct gendisk *disk, struct nvme_ns *ns)
 {
-	u32 max_sectors;
-	unsigned short bs = 1 << ns->lba_shift;
+	u64 max_blocks;
 
 	if (!(ns->ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) ||
 	    (ns->ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES))
@@ -1729,11 +1729,12 @@ static void nvme_config_write_zeroes(struct gendisk *disk, struct nvme_ns *ns)
 	 * nvme_init_identify() if available.
 	 */
 	if (ns->ctrl->max_hw_sectors == UINT_MAX)
-		max_sectors = ((u32)(USHRT_MAX + 1) * bs) >> 9;
+		max_blocks = (u64)USHRT_MAX + 1;
 	else
-		max_sectors = ((u32)(ns->ctrl->max_hw_sectors + 1) * bs) >> 9;
+		max_blocks = ns->ctrl->max_hw_sectors + 1;
 
-	blk_queue_max_write_zeroes_sectors(disk->queue, max_sectors);
+	blk_queue_max_write_zeroes_sectors(disk->queue,
+					   nvme_lba_to_sect(ns, max_blocks));
 }
 
 static int nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
@@ -1767,7 +1768,7 @@ static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
 static void nvme_update_disk_info(struct gendisk *disk,
 		struct nvme_ns *ns, struct nvme_id_ns *id)
 {
-	sector_t capacity = le64_to_cpu(id->nsze) << (ns->lba_shift - 9);
+	sector_t capacity = nvme_lba_to_sect(ns, le64_to_cpu(id->nsze));
 	unsigned short bs = 1 << ns->lba_shift;
 	u32 atomic_bs, phys_bs, io_opt;
 
@@ -1833,6 +1834,7 @@ static void nvme_update_disk_info(struct gendisk *disk,
 static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
 {
 	struct nvme_ns *ns = disk->private_data;
+	u32 iob;
 
 	/*
 	 * If identify namespace failed, use default 512 byte block size so
@@ -1841,7 +1843,13 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
 	ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds;
 	if (ns->lba_shift == 0)
 		ns->lba_shift = 9;
-	ns->noiob = le16_to_cpu(id->noiob);
+
+	if ((ns->ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
+	    is_power_of_2(ns->ctrl->max_hw_sectors))
+		iob = ns->ctrl->max_hw_sectors;
+	else
+		iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));
+
 	ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
 	ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
 	/* the PI implementation requires metadata equal t10 pi tuple size */
@@ -1850,14 +1858,14 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
 	else
 		ns->pi_type = 0;
 
-	if (ns->noiob)
-		nvme_set_chunk_size(ns);
+	if (iob)
+		blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(iob));
 	nvme_update_disk_info(disk, ns, id);
 #ifdef CONFIG_NVME_MULTIPATH
 	if (ns->head->disk) {
 		nvme_update_disk_info(ns->head->disk, ns, id);
 		blk_queue_stack_limits(ns->head->disk->queue, ns->queue);
-		revalidate_disk(ns->head->disk);
+		nvme_mpath_update_disk_size(ns->head->disk);
 	}
 #endif
 }
@@ -2202,9 +2210,6 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
 		blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
 		blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
 	}
-	if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
-	    is_power_of_2(ctrl->max_hw_sectors))
-		blk_queue_chunk_sectors(q, ctrl->max_hw_sectors);
 	blk_queue_virt_boundary(q, ctrl->page_size - 1);
 	if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
 		vwc = true;
@@ -2926,10 +2931,26 @@ static int nvme_dev_open(struct inode *inode, struct file *file)
 		return -EWOULDBLOCK;
 	}
 
+	nvme_get_ctrl(ctrl);
+	if (!try_module_get(ctrl->ops->module)) {
+		nvme_put_ctrl(ctrl);
+		return -EINVAL;
+	}
+
 	file->private_data = ctrl;
 	return 0;
 }
 
+static int nvme_dev_release(struct inode *inode, struct file *file)
+{
+	struct nvme_ctrl *ctrl =
+		container_of(inode->i_cdev, struct nvme_ctrl, cdev);
+
+	module_put(ctrl->ops->module);
+	nvme_put_ctrl(ctrl);
+	return 0;
+}
+
 static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp)
 {
 	struct nvme_ns *ns;
@@ -2992,6 +3013,7 @@ static long nvme_dev_ioctl(struct file *file, unsigned int cmd,
 static const struct file_operations nvme_dev_fops = {
 	.owner		= THIS_MODULE,
 	.open		= nvme_dev_open,
+	.release	= nvme_dev_release,
 	.unlocked_ioctl	= nvme_dev_ioctl,
 	.compat_ioctl	= nvme_dev_ioctl,
 };
@@ -3190,6 +3212,10 @@ static ssize_t nvme_sysfs_delete(struct device *dev,
 {
 	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
 
+	/* Can't delete non-created controllers */
+	if (!ctrl->created)
+		return -EBUSY;
+
 	if (device_remove_file_self(dev, attr))
 		nvme_delete_ctrl_sync(ctrl);
 	return count;
@@ -3740,8 +3766,7 @@ static void nvme_scan_work(struct work_struct *work)
 
 	mutex_lock(&ctrl->scan_lock);
 	nn = le32_to_cpu(id->nn);
-	if (ctrl->vs >= NVME_VS(1, 1, 0) &&
-	    !(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)) {
+	if (!nvme_ctrl_limited_cns(ctrl)) {
 		if (!nvme_scan_ns_list(ctrl, nn))
 			goto out_free_id;
 	}
@@ -3986,6 +4011,7 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl)
 		nvme_queue_scan(ctrl);
 		nvme_start_queues(ctrl);
 	}
+	ctrl->created = true;
 }
 EXPORT_SYMBOL_GPL(nvme_start_ctrl);
 
@@ -4003,7 +4029,7 @@ static void nvme_free_ctrl(struct device *dev)
 		container_of(dev, struct nvme_ctrl, ctrl_device);
 	struct nvme_subsystem *subsys = ctrl->subsys;
 
-	if (subsys && ctrl->instance != subsys->instance)
+	if (!subsys || ctrl->instance != subsys->instance)
 		ida_simple_remove(&nvme_instance_ida, ctrl->instance);
 
 	kfree(ctrl->effects);
@@ -4076,6 +4102,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 	if (ret)
 		goto out_release_instance;
 
+	nvme_get_ctrl(ctrl);
 	cdev_init(&ctrl->cdev, &nvme_dev_fops);
 	ctrl->cdev.owner = ops->module;
 	ret = cdev_device_add(&ctrl->cdev, ctrl->device);
@@ -4094,6 +4121,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 	return 0;
 out_free_name:
+	nvme_put_ctrl(ctrl);
 	kfree_const(ctrl->device->kobj.name);
 out_release_instance:
 	ida_simple_remove(&nvme_instance_ida, ctrl->instance);
@@ -4139,7 +4167,7 @@ void nvme_unfreeze(struct nvme_ctrl *ctrl)
 }
 EXPORT_SYMBOL_GPL(nvme_unfreeze);
 
-void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
+int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
 {
 	struct nvme_ns *ns;
 
@@ -4150,6 +4178,7 @@ void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
 			break;
 	}
 	up_read(&ctrl->namespaces_rwsem);
+	return timeout;
 }
 EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
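The nvme_block_nr() call sites above move to the nvme_sect_to_lba()/nvme_lba_to_sect() helpers added in nvme.h further down. A standalone sketch of the underlying shift arithmetic, with illustrative values only (the helper names and SECTOR_SHIFT value mirror the kernel, the rest is not driver code):

#include <stdint.h>
#include <stdio.h>

#define SECTOR_SHIFT 9  /* the block layer counts in 512-byte sectors */

/* sector number -> device logical block number */
static uint64_t sect_to_lba(unsigned int lba_shift, uint64_t sector)
{
    return sector >> (lba_shift - SECTOR_SHIFT);
}

/* device logical block number -> sector number */
static uint64_t lba_to_sect(unsigned int lba_shift, uint64_t lba)
{
    return lba << (lba_shift - SECTOR_SHIFT);
}

int main(void)
{
    /* a namespace formatted with 4096-byte LBAs has lba_shift = 12,
     * so eight 512-byte sectors map onto one logical block */
    printf("sector 4096 -> LBA %llu\n",
           (unsigned long long)sect_to_lba(12, 4096)); /* prints 512 */
    printf("LBA 512 -> sector %llu\n",
           (unsigned long long)lba_to_sect(12, 512));  /* prints 4096 */
    return 0;
}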
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 74b8818ac9a1..3bb71f177dfd 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -565,10 +565,14 @@ bool __nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
 	struct nvme_request *req = nvme_req(rq);
 
 	/*
-	 * If we are in some state of setup or teardown only allow
-	 * internally generated commands.
+	 * currently we have a problem sending passthru commands
+	 * on the admin_q if the controller is not LIVE because we can't
+	 * make sure that they are going out after the admin connect,
+	 * controller enable and/or other commands in the initialization
+	 * sequence. until the controller will be LIVE, fail with
+	 * BLK_STS_RESOURCE so that they will be rescheduled.
 	 */
-	if (!blk_rq_is_passthrough(rq) || (req->flags & NVME_REQ_USERCMD))
+	if (rq->q == ctrl->admin_q && (req->flags & NVME_REQ_USERCMD))
 		return false;
 
 	/*
@@ -576,9 +580,8 @@ bool __nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
 	 * which is require to set the queue live in the appropinquate states.
 	 */
 	switch (ctrl->state) {
-	case NVME_CTRL_NEW:
 	case NVME_CTRL_CONNECTING:
-		if (nvme_is_fabrics(req->cmd) &&
+		if (blk_rq_is_passthrough(rq) && nvme_is_fabrics(req->cmd) &&
 		    req->cmd->fabrics.fctype == nvme_fabrics_type_connect)
 			return true;
 		break;
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 83ac88924f25..65b3dc9cd693 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -1740,7 +1740,7 @@ __nvme_fc_init_request(struct nvme_fc_ctrl *ctrl,
 	if (fc_dma_mapping_error(ctrl->lport->dev, op->fcp_req.cmddma)) {
 		dev_err(ctrl->dev,
 			"FCP Op failed - cmdiu dma mapping failed.\n");
-		ret = EFAULT;
+		ret = -EFAULT;
 		goto out_on_error;
 	}
 
@@ -1750,7 +1750,7 @@ __nvme_fc_init_request(struct nvme_fc_ctrl *ctrl,
 	if (fc_dma_mapping_error(ctrl->lport->dev, op->fcp_req.rspdma)) {
 		dev_err(ctrl->dev,
 			"FCP Op failed - rspiu dma mapping failed.\n");
-		ret = EFAULT;
+		ret = -EFAULT;
 	}
 
 	atomic_set(&op->state, FCPOP_STATE_IDLE);
@@ -1820,6 +1820,7 @@ nvme_fc_term_aen_ops(struct nvme_fc_ctrl *ctrl)
 	struct nvme_fc_fcp_op *aen_op;
 	int i;
 
+	cancel_work_sync(&ctrl->ctrl.async_event_work);
 	aen_op = ctrl->aen_ops;
 	for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) {
 		if (!aen_op->fcp_req.private)
@@ -3170,10 +3171,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
 		goto fail_ctrl;
 	}
 
-	nvme_get_ctrl(&ctrl->ctrl);
-
 	if (!queue_delayed_work(nvme_wq, &ctrl->connect_work, 0)) {
-		nvme_put_ctrl(&ctrl->ctrl);
 		dev_err(ctrl->ctrl.device,
 			"NVME-FC{%d}: failed to schedule initial connect\n",
 			ctrl->cnum);
@@ -3198,6 +3196,7 @@ fail_ctrl:
 
 	/* initiate nvme ctrl ref counting teardown */
 	nvme_uninit_ctrl(&ctrl->ctrl);
+	nvme_put_ctrl(&ctrl->ctrl);
 
 	/* Remove core ctrl ref. */
 	nvme_put_ctrl(&ctrl->ctrl);
@@ -3320,12 +3319,14 @@ nvme_fc_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts)
 	spin_lock_irqsave(&nvme_fc_lock, flags);
 	list_for_each_entry(lport, &nvme_fc_lport_list, port_list) {
 		if (lport->localport.node_name != laddr.nn ||
-		    lport->localport.port_name != laddr.pn)
+		    lport->localport.port_name != laddr.pn ||
+		    lport->localport.port_state != FC_OBJSTATE_ONLINE)
 			continue;
 
 		list_for_each_entry(rport, &lport->endp_list, endp_list) {
 			if (rport->remoteport.node_name != raddr.nn ||
-			    rport->remoteport.port_name != raddr.pn)
+			    rport->remoteport.port_name != raddr.pn ||
+			    rport->remoteport.port_state != FC_OBJSTATE_ONLINE)
				continue;
 
			/* if fail to get reference fall through. Will error */
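The fabrics.c hunk above changes which requests __nvmf_check_ready() lets through before the controller is LIVE. A simplified model of the resulting gating, using stand-in types rather than the kernel's (only the decision logic mirrors the patch):

#include <stdbool.h>

enum ctrl_state { CTRL_CONNECTING, CTRL_DELETING, CTRL_LIVE };

struct fake_req {
    bool on_admin_q;          /* queued on the admin queue */
    bool user_cmd;            /* came from an ioctl/passthru user */
    bool is_passthrough;      /* raw command, not FS I/O */
    bool is_fabrics_connect;  /* the fabrics connect command itself */
};

static bool check_ready(enum ctrl_state state, const struct fake_req *rq)
{
    /* defer user admin commands until the init sequence is done,
     * so they cannot overtake connect/enable */
    if (rq->on_admin_q && rq->user_cmd)
        return false;

    switch (state) {
    case CTRL_CONNECTING:
        /* while connecting, only the connect command may go out */
        return rq->is_passthrough && rq->is_fabrics_connect;
    case CTRL_LIVE:
        return true;
    default:
        return false;
    }
}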
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 56caddeabb5e..3968f89f7855 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -3,6 +3,7 @@
  * Copyright (c) 2017-2018 Christoph Hellwig.
  */
 
+#include <linux/backing-dev.h>
 #include <linux/moduleparam.h>
 #include <trace/events/block.h>
 #include "nvme.h"
@@ -64,17 +65,12 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
 	}
 }
 
-void nvme_failover_req(struct request *req)
+bool nvme_failover_req(struct request *req)
 {
 	struct nvme_ns *ns = req->q->queuedata;
 	u16 status = nvme_req(req)->status;
 	unsigned long flags;
 
-	spin_lock_irqsave(&ns->head->requeue_lock, flags);
-	blk_steal_bios(&ns->head->requeue_list, req);
-	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
-	blk_mq_end_request(req, 0);
-
 	switch (status & 0x7ff) {
 	case NVME_SC_ANA_TRANSITION:
 	case NVME_SC_ANA_INACCESSIBLE:
@@ -103,15 +99,17 @@ void nvme_failover_req(struct request *req)
 		nvme_mpath_clear_current_path(ns);
 		break;
 	default:
-		/*
-		 * Reset the controller for any non-ANA error as we don't know
-		 * what caused the error.
-		 */
-		nvme_reset_ctrl(ns->ctrl);
-		break;
+		/* This was a non-ANA error so follow the normal error path. */
+		return false;
 	}
 
+	spin_lock_irqsave(&ns->head->requeue_lock, flags);
+	blk_steal_bios(&ns->head->requeue_list, req);
+	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
+	blk_mq_end_request(req, 0);
+
 	kblockd_schedule_work(&ns->head->requeue_work);
+	return true;
 }
 
 void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
@@ -248,6 +246,17 @@ static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
 			fallback = ns;
 	}
 
+	/*
+	 * The loop above skips the current path for round-robin semantics.
+	 * Fall back to the current path if either:
+	 * - no other optimized path found and current is optimized,
+	 * - no other usable path found and current is usable.
	 */
+	if (!nvme_path_is_disabled(old) &&
+	    (old->ana_state == NVME_ANA_OPTIMIZED ||
+	     (!fallback && old->ana_state == NVME_ANA_NONOPTIMIZED)))
+		return old;
+
 	if (!fallback)
 		return NULL;
 	found = fallback;
@@ -268,10 +277,13 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
 	struct nvme_ns *ns;
 
 	ns = srcu_dereference(head->current_path[node], &head->srcu);
-	if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR && ns)
-		ns = nvme_round_robin_path(head, node, ns);
-	if (unlikely(!ns || !nvme_path_is_optimized(ns)))
-		ns = __nvme_find_path(head, node);
+	if (unlikely(!ns))
+		return __nvme_find_path(head, node);
+
+	if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
+		return nvme_round_robin_path(head, node, ns);
+	if (unlikely(!nvme_path_is_optimized(ns)))
+		return __nvme_find_path(head, node);
 	return ns;
 }
 
@@ -413,15 +425,14 @@ static void nvme_mpath_set_live(struct nvme_ns *ns)
 {
 	struct nvme_ns_head *head = ns->head;
 
-	lockdep_assert_held(&ns->head->lock);
-
 	if (!head->disk)
 		return;
 
-	if (!(head->disk->flags & GENHD_FL_UP))
+	if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags))
 		device_add_disk(&head->subsys->dev, head->disk,
 				nvme_ns_id_attr_groups);
 
+	mutex_lock(&head->lock);
 	if (nvme_path_is_optimized(ns)) {
 		int node, srcu_idx;
 
@@ -430,9 +441,10 @@ static void nvme_mpath_set_live(struct nvme_ns *ns)
 			__nvme_find_path(head, node);
 		srcu_read_unlock(&head->srcu, srcu_idx);
 	}
+	mutex_unlock(&head->lock);
 
-	synchronize_srcu(&ns->head->srcu);
-	kblockd_schedule_work(&ns->head->requeue_work);
+	synchronize_srcu(&head->srcu);
+	kblockd_schedule_work(&head->requeue_work);
 }
 
 static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
@@ -483,14 +495,12 @@ static inline bool nvme_state_is_live(enum nvme_ana_state state)
 static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
 		struct nvme_ns *ns)
 {
-	mutex_lock(&ns->head->lock);
 	ns->ana_grpid = le32_to_cpu(desc->grpid);
 	ns->ana_state = desc->state;
 	clear_bit(NVME_NS_ANA_PENDING, &ns->flags);
 
 	if (nvme_state_is_live(ns->ana_state))
 		nvme_mpath_set_live(ns);
-	mutex_unlock(&ns->head->lock);
 }
 
 static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
@@ -640,31 +650,45 @@ static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
 }
 DEVICE_ATTR_RO(ana_state);
 
-static int nvme_set_ns_ana_state(struct nvme_ctrl *ctrl,
+static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
 		struct nvme_ana_group_desc *desc, void *data)
 {
-	struct nvme_ns *ns = data;
+	struct nvme_ana_group_desc *dst = data;
 
-	if (ns->ana_grpid == le32_to_cpu(desc->grpid)) {
-		nvme_update_ns_ana_state(desc, ns);
-		return -ENXIO; /* just break out of the loop */
-	}
+	if (desc->grpid != dst->grpid)
+		return 0;
 
-	return 0;
+	*dst = *desc;
+	return -ENXIO; /* just break out of the loop */
 }
 
 void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id)
 {
 	if (nvme_ctrl_use_ana(ns->ctrl)) {
+		struct nvme_ana_group_desc desc = {
+			.grpid = id->anagrpid,
+			.state = 0,
+		};
+
 		mutex_lock(&ns->ctrl->ana_lock);
 		ns->ana_grpid = le32_to_cpu(id->anagrpid);
-		nvme_parse_ana_log(ns->ctrl, ns, nvme_set_ns_ana_state);
+		nvme_parse_ana_log(ns->ctrl, &desc, nvme_lookup_ana_group_desc);
 		mutex_unlock(&ns->ctrl->ana_lock);
+		if (desc.state) {
+			/* found the group desc: update */
+			nvme_update_ns_ana_state(&desc, ns);
+		}
 	} else {
-		mutex_lock(&ns->head->lock);
 		ns->ana_state = NVME_ANA_OPTIMIZED;
 		nvme_mpath_set_live(ns);
-		mutex_unlock(&ns->head->lock);
+	}
+
+	if (bdi_cap_stable_pages_required(ns->queue->backing_dev_info)) {
+		struct gendisk *disk = ns->head->disk;
+
+		if (disk)
+			disk->queue->backing_dev_info->capabilities |=
+					BDI_CAP_STABLE_WRITES;
 	}
 }
 
@@ -679,6 +703,14 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head)
 	kblockd_schedule_work(&head->requeue_work);
 	flush_work(&head->requeue_work);
 	blk_cleanup_queue(head->disk->queue);
+	if (!test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
+		/*
+		 * if device_add_disk wasn't called, prevent
+		 * disk release to put a bogus reference on the
+		 * request queue
+		 */
+		head->disk->queue = NULL;
+	}
 	put_disk(head->disk);
 }
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 22e8401352c2..d7132d8cb7c5 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -115,6 +115,13 @@ enum nvme_quirks {
 	 * Prevent tag overlap between queues
 	 */
 	NVME_QUIRK_SHARED_TAGS			= (1 << 13),
+
+	/*
+	 * The controller doesn't handle the Identify Namespace
+	 * Identification Descriptor list subcommand despite claiming
+	 * NVMe 1.3 compliance.
+	 */
+	NVME_QUIRK_NO_NS_DESC_LIST		= (1 << 15),
 };
 
 /*
@@ -246,6 +253,7 @@ struct nvme_ctrl {
 	struct nvme_command ka_cmd;
 	struct work_struct fw_act_work;
 	unsigned long events;
+	bool created;
 
 #ifdef CONFIG_NVME_MULTIPATH
 	/* asymmetric namespace access: */
@@ -345,6 +353,8 @@ struct nvme_ns_head {
 	spinlock_t		requeue_lock;
 	struct work_struct	requeue_work;
 	struct mutex		lock;
+	unsigned long		flags;
+#define NVME_NSHEAD_DISK_LIVE	0
 	struct nvme_ns __rcu	*current_path[];
 #endif
 };
@@ -374,7 +384,6 @@ struct nvme_ns {
 #define NVME_NS_REMOVING	0
 #define NVME_NS_DEAD		1
 #define NVME_NS_ANA_PENDING	2
-	u16 noiob;
 
 	struct nvme_fault_inject fault_inject;
 
@@ -419,9 +428,20 @@ static inline int nvme_reset_subsystem(struct nvme_ctrl *ctrl)
 	return ctrl->ops->reg_write32(ctrl, NVME_REG_NSSR, 0x4E564D65);
 }
 
-static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector)
+/*
+ * Convert a 512B sector number to a device logical block number.
+ */
+static inline u64 nvme_sect_to_lba(struct nvme_ns *ns, sector_t sector)
 {
-	return (sector >> (ns->lba_shift - 9));
+	return sector >> (ns->lba_shift - SECTOR_SHIFT);
+}
+
+/*
+ * Convert a device logical block number to a 512B sector number.
+ */
+static inline sector_t nvme_lba_to_sect(struct nvme_ns *ns, u64 lba)
+{
+	return lba << (ns->lba_shift - SECTOR_SHIFT);
 }
 
 static inline void nvme_end_request(struct request *req, __le16 status,
@@ -476,7 +496,7 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl);
 void nvme_sync_queues(struct nvme_ctrl *ctrl);
 void nvme_unfreeze(struct nvme_ctrl *ctrl);
 void nvme_wait_freeze(struct nvme_ctrl *ctrl);
-void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout);
+int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout);
 void nvme_start_freeze(struct nvme_ctrl *ctrl);
 
 #define NVME_QID_ANY -1
@@ -521,7 +541,7 @@ void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys);
 void nvme_mpath_start_freeze(struct nvme_subsystem *subsys);
 void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
			struct nvme_ctrl *ctrl, int *flags);
-void nvme_failover_req(struct request *req);
+bool nvme_failover_req(struct request *req);
 void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
 int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
 void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id);
@@ -551,6 +571,16 @@ static inline void nvme_trace_bio_complete(struct request *req,
			 req->bio, status);
 }
 
+static inline void nvme_mpath_update_disk_size(struct gendisk *disk)
+{
+	struct block_device *bdev = bdget_disk(disk, 0);
+
+	if (bdev) {
+		bd_set_size(bdev, get_capacity(disk) << SECTOR_SHIFT);
+		bdput(bdev);
+	}
+}
+
 extern struct device_attribute dev_attr_ana_grpid;
 extern struct device_attribute dev_attr_ana_state;
 extern struct device_attribute subsys_attr_iopolicy;
@@ -570,8 +600,9 @@ static inline void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
 	sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance);
 }
 
-static inline void nvme_failover_req(struct request *req)
+static inline bool nvme_failover_req(struct request *req)
 {
+	return false;
 }
 static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
 {
@@ -625,6 +656,9 @@ static inline void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys)
 static inline void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
 {
 }
+static inline void nvme_mpath_update_disk_size(struct gendisk *disk)
+{
+}
 #endif /* CONFIG_NVME_MULTIPATH */
 
 #ifdef CONFIG_NVM
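The round-robin fallback added to nvme_round_robin_path() above can be read in isolation: after the loop has scanned every path other than the current one, the current path is reused if it is still optimized, or if it is merely usable and nothing better was found. The sketch below models only that selection rule, with toy stand-ins for the kernel's types:

enum ana_state { ANA_OPTIMIZED, ANA_NONOPTIMIZED, ANA_INACCESSIBLE };

struct path {
    enum ana_state ana_state;
    int disabled;  /* path is being removed or is dead */
};

/* 'old' is the current path the loop skipped; 'fallback' is the best
 * non-optimized alternative it found, or NULL */
static const struct path *pick_fallback(const struct path *old,
                                        const struct path *fallback)
{
    if (!old->disabled &&
        (old->ana_state == ANA_OPTIMIZED ||
         (!fallback && old->ana_state == ANA_NONOPTIMIZED)))
        return old;
    return fallback;  /* may be NULL: no usable path at all */
}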
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index cd64ddb129e5..f5d12bf109c7 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -128,6 +128,9 @@ struct nvme_dev {
 	dma_addr_t host_mem_descs_dma;
 	struct nvme_host_mem_buf_desc *host_mem_descs;
 	void **host_mem_desc_bufs;
+	unsigned int nr_allocated_queues;
+	unsigned int nr_write_queues;
+	unsigned int nr_poll_queues;
 };
 
 static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
@@ -210,25 +213,14 @@ struct nvme_iod {
 	struct scatterlist *sg;
 };
 
-static unsigned int max_io_queues(void)
+static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev)
 {
-	return num_possible_cpus() + write_queues + poll_queues;
-}
-
-static unsigned int max_queue_count(void)
-{
-	/* IO queues + admin queue */
-	return 1 + max_io_queues();
-}
-
-static inline unsigned int nvme_dbbuf_size(u32 stride)
-{
-	return (max_queue_count() * 8 * stride);
+	return dev->nr_allocated_queues * 8 * dev->db_stride;
 }
 
 static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
 {
-	unsigned int mem_size = nvme_dbbuf_size(dev->db_stride);
+	unsigned int mem_size = nvme_dbbuf_size(dev);
 
 	if (dev->dbbuf_dbs)
 		return 0;
@@ -253,7 +245,7 @@ static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
 
 static void nvme_dbbuf_dma_free(struct nvme_dev *dev)
 {
-	unsigned int mem_size = nvme_dbbuf_size(dev->db_stride);
+	unsigned int mem_size = nvme_dbbuf_size(dev);
 
 	if (dev->dbbuf_dbs) {
 		dma_free_coherent(dev->dev, mem_size,
@@ -949,13 +941,6 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
 	volatile struct nvme_completion *cqe = &nvmeq->cqes[idx];
 	struct request *req;
 
-	if (unlikely(cqe->command_id >= nvmeq->q_depth)) {
-		dev_warn(nvmeq->dev->ctrl.device,
-			"invalid id %d completed on queue %d\n",
-			cqe->command_id, le16_to_cpu(cqe->sq_id));
-		return;
-	}
-
 	/*
 	 * AEN requests are special as they don't time out and can
 	 * survive any kind of queue freeze and often don't respond to
@@ -970,6 +955,13 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
 	}
 
 	req = blk_mq_tag_to_rq(nvme_queue_tagset(nvmeq), cqe->command_id);
+	if (unlikely(!req)) {
+		dev_warn(nvmeq->dev->ctrl.device,
+			"invalid id %d completed on queue %d\n",
+			cqe->command_id, le16_to_cpu(cqe->sq_id));
+		return;
+	}
+
 	trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
 	nvme_end_request(req, cqe->status, cqe->result);
 }
@@ -1282,8 +1274,8 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 		dev_warn_ratelimited(dev->ctrl.device,
			 "I/O %d QID %d timeout, disable controller\n",
			 req->tag, nvmeq->qid);
-		nvme_dev_disable(dev, true);
 		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
+		nvme_dev_disable(dev, true);
 		return BLK_EH_DONE;
 	case NVME_CTRL_RESETTING:
 		return BLK_EH_RESET_TIMER;
@@ -1300,10 +1292,10 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 		dev_warn(dev->ctrl.device,
			 "I/O %d QID %d timeout, reset controller\n",
			 req->tag, nvmeq->qid);
+		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
 		nvme_dev_disable(dev, false);
 		nvme_reset_ctrl(&dev->ctrl);
 
-		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
 		return BLK_EH_DONE;
 	}
 
@@ -2030,7 +2022,7 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
 static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs)
 {
 	struct nvme_dev *dev = affd->priv;
-	unsigned int nr_read_queues;
+	unsigned int nr_read_queues, nr_write_queues = dev->nr_write_queues;
 
 	/*
 	 * If there is no interupt available for queues, ensure that
@@ -2046,12 +2038,12 @@ static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs)
 	if (!nrirqs) {
 		nrirqs = 1;
 		nr_read_queues = 0;
-	} else if (nrirqs == 1 || !write_queues) {
+	} else if (nrirqs == 1 || !nr_write_queues) {
 		nr_read_queues = 0;
-	} else if (write_queues >= nrirqs) {
+	} else if (nr_write_queues >= nrirqs) {
 		nr_read_queues = 1;
 	} else {
-		nr_read_queues = nrirqs - write_queues;
+		nr_read_queues = nrirqs - nr_write_queues;
 	}
 
 	dev->io_queues[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
@@ -2075,7 +2067,7 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
 	 * Poll queues don't need interrupts, but we need at least one IO
	 * queue left over for non-polled IO.
	 */
-	this_p_queues = poll_queues;
+	this_p_queues = dev->nr_poll_queues;
 	if (this_p_queues >= nr_io_queues) {
 		this_p_queues = nr_io_queues - 1;
 		irq_queues = 1;
@@ -2105,14 +2097,25 @@ static void nvme_disable_io_queues(struct nvme_dev *dev)
 		__nvme_disable_io_queues(dev, nvme_admin_delete_cq);
 }
 
+static unsigned int nvme_max_io_queues(struct nvme_dev *dev)
+{
+	return num_possible_cpus() + dev->nr_write_queues + dev->nr_poll_queues;
+}
+
 static int nvme_setup_io_queues(struct nvme_dev *dev)
 {
 	struct nvme_queue *adminq = &dev->queues[0];
 	struct pci_dev *pdev = to_pci_dev(dev->dev);
-	int result, nr_io_queues;
+	unsigned int nr_io_queues;
 	unsigned long size;
+	int result;
 
-	nr_io_queues = max_io_queues();
+	/*
+	 * Sample the module parameters once at reset time so that we have
+	 * stable values to work with.
	 */
+	dev->nr_write_queues = write_queues;
+	dev->nr_poll_queues = poll_queues;
 
 	/*
	 * If tags are shared with admin queue (Apple bug), then
@@ -2120,6 +2123,9 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
	 */
 	if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
 		nr_io_queues = 1;
+	else
+		nr_io_queues = min(nvme_max_io_queues(dev),
+				   dev->nr_allocated_queues - 1);
 
 	result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
 	if (result < 0)
@@ -2794,8 +2800,11 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (!dev)
 		return -ENOMEM;
 
-	dev->queues = kcalloc_node(max_queue_count(), sizeof(struct nvme_queue),
-					GFP_KERNEL, node);
+	dev->nr_write_queues = write_queues;
+	dev->nr_poll_queues = poll_queues;
+	dev->nr_allocated_queues = nvme_max_io_queues(dev) + 1;
+	dev->queues = kcalloc_node(dev->nr_allocated_queues,
			sizeof(struct nvme_queue), GFP_KERNEL, node);
 	if (!dev->queues)
 		goto free;
 
@@ -2841,7 +2850,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
 
 	nvme_reset_ctrl(&dev->ctrl);
-	nvme_get_ctrl(&dev->ctrl);
 	async_schedule(nvme_async_probe, dev);
 
 	return 0;
@@ -2962,9 +2970,15 @@ static int nvme_suspend(struct device *dev)
	 * the PCI bus layer to put it into D3 in order to take the PCIe link
	 * down, so as to allow the platform to achieve its minimum low-power
	 * state (which may not be possible if the link is up).
+	 *
+	 * If a host memory buffer is enabled, shut down the device as the NVMe
+	 * specification allows the device to access the host memory buffer in
+	 * host DRAM from all power states, but hosts will fail access to DRAM
+	 * during S3.
	 */
 	if (pm_suspend_via_firmware() || !ctrl->npss ||
	    !pcie_aspm_enabled(pdev) ||
+	    ndev->nr_host_mem_descs ||
	    (ndev->ctrl.quirks & NVME_QUIRK_SIMPLE_SUSPEND))
 		return nvme_disable_prepare_reset(ndev, true);
 
@@ -3096,12 +3110,15 @@ static const struct pci_device_id nvme_id_table[] = {
				NVME_QUIRK_DEALLOCATE_ZEROES, },
 	{ PCI_VDEVICE(INTEL, 0xf1a5),	/* Intel 600P/P3100 */
 		.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
-				NVME_QUIRK_MEDIUM_PRIO_SQ },
+				NVME_QUIRK_MEDIUM_PRIO_SQ |
+				NVME_QUIRK_DISABLE_WRITE_ZEROES, },
 	{ PCI_VDEVICE(INTEL, 0xf1a6),	/* Intel 760p/Pro 7600p */
 		.driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, },
 	{ PCI_VDEVICE(INTEL, 0x5845),	/* Qemu emulated controller */
 		.driver_data = NVME_QUIRK_IDENTIFY_CNS |
				NVME_QUIRK_DISABLE_WRITE_ZEROES, },
+	{ PCI_DEVICE(0x126f, 0x2263),	/* Silicon Motion unidentified */
+		.driver_data = NVME_QUIRK_NO_NS_DESC_LIST, },
 	{ PCI_DEVICE(0x1bb1, 0x0100),	/* Seagate Nytro Flash Storage */
 		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
 	{ PCI_DEVICE(0x1c58, 0x0003),	/* HGST adapter */
@@ -3125,6 +3142,8 @@ static const struct pci_device_id nvme_id_table[] = {
 	{ PCI_DEVICE(0x1cc1, 0x8201),	/* ADATA SX8200PNP 512GB */
 		.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
				NVME_QUIRK_IGNORE_DEV_SUBNQN, },
+	{ PCI_DEVICE(0x1c5c, 0x1504),	/* SK Hynix PC400 */
+		.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
 	{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001),
 		.driver_data = NVME_QUIRK_SINGLE_VECTOR },
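The pci.c changes above snapshot the write_queues/poll_queues module parameters into the device, size the queue array once at probe, and make later resets clamp to that allocation instead of re-reading the (possibly changed) parameters. A rough userspace model of the accounting; all names here are illustrative stand-ins, not the driver's:

struct dev_queues {
    unsigned int nr_allocated_queues;
    unsigned int nr_write_queues;
    unsigned int nr_poll_queues;
};

static unsigned int max_io_queues(const struct dev_queues *d,
                                  unsigned int nr_cpus)
{
    return nr_cpus + d->nr_write_queues + d->nr_poll_queues;
}

/* at probe: snapshot the parameters and size the array (+1 admin queue) */
static void probe_snapshot(struct dev_queues *d, unsigned int nr_cpus,
                           unsigned int write_queues,
                           unsigned int poll_queues)
{
    d->nr_write_queues = write_queues;
    d->nr_poll_queues = poll_queues;
    d->nr_allocated_queues = max_io_queues(d, nr_cpus) + 1;
}

/* at reset: re-sample, but never exceed what was allocated at probe */
static unsigned int reset_nr_io_queues(struct dev_queues *d,
                                       unsigned int nr_cpus,
                                       unsigned int write_queues,
                                       unsigned int poll_queues)
{
    unsigned int want;

    d->nr_write_queues = write_queues;
    d->nr_poll_queues = poll_queues;
    want = max_io_queues(d, nr_cpus);
    return want < d->nr_allocated_queues - 1 ?
            want : d->nr_allocated_queues - 1;
}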
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 73e8475ddc8a..abe4fe496d05 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -110,6 +110,7 @@ struct nvme_rdma_ctrl {
 	struct sockaddr_storage src_addr;
 
 	struct nvme_ctrl	ctrl;
+	struct mutex		teardown_lock;
 	bool			use_inline_data;
 	u32			io_queues[HCTX_MAX_TYPES];
 };
@@ -451,7 +452,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
	 * Spread I/O queues completion vectors according their queue index.
	 * Admin queues can always go on completion vector 0.
	 */
-	comp_vector = idx == 0 ? idx : idx - 1;
+	comp_vector = (idx == 0 ? idx : idx - 1) % ibdev->num_comp_vectors;
 
	/* Polling queues need direct cq polling context */
 	if (nvme_rdma_poll_queue(queue))
@@ -768,6 +769,7 @@ static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl,
 		blk_mq_free_tag_set(ctrl->ctrl.admin_tagset);
 	}
 	if (ctrl->async_event_sqe.data) {
+		cancel_work_sync(&ctrl->ctrl.async_event_work);
 		nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe,
				sizeof(struct nvme_command), DMA_TO_DEVICE);
 		ctrl->async_event_sqe.data = NULL;
@@ -890,17 +892,33 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
 			ret = PTR_ERR(ctrl->ctrl.connect_q);
 			goto out_free_tag_set;
 		}
-	} else {
-		blk_mq_update_nr_hw_queues(&ctrl->tag_set,
-			ctrl->ctrl.queue_count - 1);
 	}
 
 	ret = nvme_rdma_start_io_queues(ctrl);
 	if (ret)
 		goto out_cleanup_connect_q;
 
+	if (!new) {
+		nvme_start_queues(&ctrl->ctrl);
+		if (!nvme_wait_freeze_timeout(&ctrl->ctrl, NVME_IO_TIMEOUT)) {
+			/*
+			 * If we timed out waiting for freeze we are likely to
+			 * be stuck.  Fail the controller initialization just
+			 * to be safe.
			 */
+			ret = -ENODEV;
+			goto out_wait_freeze_timed_out;
+		}
+		blk_mq_update_nr_hw_queues(ctrl->ctrl.tagset,
+			ctrl->ctrl.queue_count - 1);
+		nvme_unfreeze(&ctrl->ctrl);
+	}
+
 	return 0;
 
+out_wait_freeze_timed_out:
+	nvme_stop_queues(&ctrl->ctrl);
+	nvme_rdma_stop_io_queues(ctrl);
 out_cleanup_connect_q:
 	if (new)
 		blk_cleanup_queue(ctrl->ctrl.connect_q);
@@ -915,6 +933,7 @@ out_free_io_queues:
 static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl,
		bool remove)
 {
+	mutex_lock(&ctrl->teardown_lock);
 	blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
 	nvme_rdma_stop_queue(&ctrl->queues[0]);
 	if (ctrl->ctrl.admin_tagset) {
@@ -925,12 +944,15 @@ static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl,
 	if (remove)
 		blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
 	nvme_rdma_destroy_admin_queue(ctrl, remove);
+	mutex_unlock(&ctrl->teardown_lock);
 }
 
 static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
		bool remove)
 {
+	mutex_lock(&ctrl->teardown_lock);
 	if (ctrl->ctrl.queue_count > 1) {
+		nvme_start_freeze(&ctrl->ctrl);
 		nvme_stop_queues(&ctrl->ctrl);
 		nvme_rdma_stop_io_queues(ctrl);
 		if (ctrl->ctrl.tagset) {
@@ -942,6 +964,7 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
			nvme_start_queues(&ctrl->ctrl);
 		nvme_rdma_destroy_io_queues(ctrl, remove);
 	}
+	mutex_unlock(&ctrl->teardown_lock);
 }
 
 static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl)
@@ -1090,6 +1113,7 @@ static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl)
 	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING))
 		return;
 
+	dev_warn(ctrl->ctrl.device, "starting error recovery\n");
 	queue_work(nvme_reset_wq, &ctrl->err_work);
 }
 
@@ -1693,6 +1717,22 @@ static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
 	return 0;
 }
 
+static void nvme_rdma_complete_timed_out(struct request *rq)
+{
+	struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
+	struct nvme_rdma_queue *queue = req->queue;
+	struct nvme_rdma_ctrl *ctrl = queue->ctrl;
+
+	/* fence other contexts that may complete the command */
+	mutex_lock(&ctrl->teardown_lock);
+	nvme_rdma_stop_queue(queue);
+	if (!blk_mq_request_completed(rq)) {
+		nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD;
+		blk_mq_complete_request(rq);
+	}
+	mutex_unlock(&ctrl->teardown_lock);
+}
+
 static enum blk_eh_timer_return
 nvme_rdma_timeout(struct request *rq, bool reserved)
 {
@@ -1703,29 +1743,29 @@ nvme_rdma_timeout(struct request *rq, bool reserved)
 	dev_warn(ctrl->ctrl.device, "I/O %d QID %d timeout\n",
		 rq->tag, nvme_rdma_queue_idx(queue));
 
-	/*
-	 * Restart the timer if a controller reset is already scheduled. Any
-	 * timed out commands would be handled before entering the connecting
-	 * state.
-	 */
-	if (ctrl->ctrl.state == NVME_CTRL_RESETTING)
-		return BLK_EH_RESET_TIMER;
-
 	if (ctrl->ctrl.state != NVME_CTRL_LIVE) {
 		/*
-		 * Teardown immediately if controller times out while starting
-		 * or we are already started error recovery. all outstanding
-		 * requests are completed on shutdown, so we return BLK_EH_DONE.
+		 * If we are resetting, connecting or deleting we should
+		 * complete immediately because we may block controller
+		 * teardown or setup sequence
+		 * - ctrl disable/shutdown fabrics requests
+		 * - connect requests
+		 * - initialization admin requests
+		 * - I/O requests that entered after unquiescing and
+		 *   the controller stopped responding
+		 *
+		 * All other requests should be cancelled by the error
+		 * recovery work, so it's fine that we fail it here.
		 */
-		flush_work(&ctrl->err_work);
-		nvme_rdma_teardown_io_queues(ctrl, false);
-		nvme_rdma_teardown_admin_queue(ctrl, false);
+		nvme_rdma_complete_timed_out(rq);
 		return BLK_EH_DONE;
 	}
 
-	dev_warn(ctrl->ctrl.device, "starting error recovery\n");
+	/*
+	 * LIVE state should trigger the normal error recovery which will
+	 * handle completing this request.
	 */
 	nvme_rdma_error_recovery(ctrl);
-
 	return BLK_EH_RESET_TIMER;
 }
 
@@ -1982,6 +2022,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
 		return ERR_PTR(-ENOMEM);
 	ctrl->ctrl.opts = opts;
 	INIT_LIST_HEAD(&ctrl->list);
+	mutex_init(&ctrl->teardown_lock);
 
 	if (!(opts->mask & NVMF_OPT_TRSVCID)) {
 		opts->trsvcid =
@@ -2047,8 +2088,6 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
 	dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n",
		ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
 
-	nvme_get_ctrl(&ctrl->ctrl);
-
 	mutex_lock(&nvme_rdma_ctrl_mutex);
 	list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list);
 	mutex_unlock(&nvme_rdma_ctrl_mutex);
@@ -2058,6 +2097,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
 out_uninit_ctrl:
 	nvme_uninit_ctrl(&ctrl->ctrl);
 	nvme_put_ctrl(&ctrl->ctrl);
+	nvme_put_ctrl(&ctrl->ctrl);
 	if (ret > 0)
 		ret = -EIO;
 	return ERR_PTR(ret);
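nvme_rdma_complete_timed_out() above (and its tcp.c twin below) relies on teardown_lock to fence the timeout handler against concurrent queue teardown, so a command is completed exactly once. A minimal pthread sketch of that pattern, assuming stand-in types for the kernel primitives:

#include <pthread.h>
#include <stdbool.h>

struct fake_ctrl {
    pthread_mutex_t teardown_lock;  /* init with pthread_mutex_init() */
};

struct fake_req {
    bool completed;
};

/* stands in for nvme_rdma_stop_queue(): quiesce the transport so no
 * completion can race in after this point */
static void stop_queue(void) { }

static void complete_timed_out(struct fake_ctrl *ctrl, struct fake_req *rq)
{
    /* fence other contexts that may complete the command */
    pthread_mutex_lock(&ctrl->teardown_lock);
    stop_queue();
    if (!rq->completed) {
        /* complete with a host-aborted status, exactly once */
        rq->completed = true;
    }
    pthread_mutex_unlock(&ctrl->teardown_lock);
}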
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 11e84ed4de36..e159b78b5f3b 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -110,6 +110,7 @@ struct nvme_tcp_ctrl {
 	struct sockaddr_storage src_addr;
 	struct nvme_ctrl	ctrl;
 
+	struct mutex		teardown_lock;
 	struct work_struct	err_work;
 	struct delayed_work	connect_work;
 	struct nvme_tcp_request async_req;
@@ -420,6 +421,7 @@ static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
 	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
 		return;
 
+	dev_warn(ctrl->device, "starting error recovery\n");
 	queue_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work);
 }
 
@@ -784,11 +786,11 @@ static void nvme_tcp_data_ready(struct sock *sk)
 {
 	struct nvme_tcp_queue *queue;
 
-	read_lock(&sk->sk_callback_lock);
+	read_lock_bh(&sk->sk_callback_lock);
 	queue = sk->sk_user_data;
 	if (likely(queue && queue->rd_enabled))
 		queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
-	read_unlock(&sk->sk_callback_lock);
+	read_unlock_bh(&sk->sk_callback_lock);
 }
 
 static void nvme_tcp_write_space(struct sock *sk)
@@ -859,12 +861,11 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
 		else
 			flags |= MSG_MORE;
 
-		/* can't zcopy slab pages */
-		if (unlikely(PageSlab(page))) {
-			ret = sock_no_sendpage(queue->sock, page, offset, len,
+		if (sendpage_ok(page)) {
+			ret = kernel_sendpage(queue->sock, page, offset, len,
					flags);
 		} else {
-			ret = kernel_sendpage(queue->sock, page, offset, len,
+			ret = sock_no_sendpage(queue->sock, page, offset, len,
					flags);
 		}
 		if (ret <= 0)
@@ -1319,6 +1320,9 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
 		}
 	}
 
+	/* Set 10 seconds timeout for icresp recvmsg */
+	queue->sock->sk->sk_rcvtimeo = 10 * HZ;
+
 	queue->sock->sk->sk_allocation = GFP_ATOMIC;
 	if (!qid)
 		n = 0;
@@ -1435,7 +1439,6 @@ static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
 
 	if (!test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags))
 		return;
-
 	__nvme_tcp_stop_queue(queue);
 }
 
@@ -1503,6 +1506,7 @@ static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
 static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl)
 {
 	if (to_tcp_ctrl(ctrl)->async_req.pdu) {
+		cancel_work_sync(&ctrl->async_event_work);
 		nvme_tcp_free_async_req(to_tcp_ctrl(ctrl));
 		to_tcp_ctrl(ctrl)->async_req.pdu = NULL;
 	}
@@ -1681,17 +1685,33 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
 			ret = PTR_ERR(ctrl->connect_q);
 			goto out_free_tag_set;
 		}
-	} else {
-		blk_mq_update_nr_hw_queues(ctrl->tagset,
-			ctrl->queue_count - 1);
 	}
 
 	ret = nvme_tcp_start_io_queues(ctrl);
 	if (ret)
 		goto out_cleanup_connect_q;
 
+	if (!new) {
+		nvme_start_queues(ctrl);
+		if (!nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT)) {
+			/*
+			 * If we timed out waiting for freeze we are likely to
+			 * be stuck.  Fail the controller initialization just
+			 * to be safe.
			 */
+			ret = -ENODEV;
+			goto out_wait_freeze_timed_out;
+		}
+		blk_mq_update_nr_hw_queues(ctrl->tagset,
+			ctrl->queue_count - 1);
+		nvme_unfreeze(ctrl);
+	}
+
 	return 0;
 
+out_wait_freeze_timed_out:
+	nvme_stop_queues(ctrl);
+	nvme_tcp_stop_io_queues(ctrl);
 out_cleanup_connect_q:
 	if (new)
 		blk_cleanup_queue(ctrl->connect_q);
@@ -1777,6 +1797,7 @@ out_free_queue:
 static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
		bool remove)
 {
+	mutex_lock(&to_tcp_ctrl(ctrl)->teardown_lock);
 	blk_mq_quiesce_queue(ctrl->admin_q);
 	nvme_tcp_stop_queue(ctrl, 0);
 	if (ctrl->admin_tagset) {
@@ -1787,13 +1808,17 @@ static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
 	if (remove)
 		blk_mq_unquiesce_queue(ctrl->admin_q);
 	nvme_tcp_destroy_admin_queue(ctrl, remove);
+	mutex_unlock(&to_tcp_ctrl(ctrl)->teardown_lock);
 }
 
 static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
		bool remove)
 {
+	mutex_lock(&to_tcp_ctrl(ctrl)->teardown_lock);
 	if (ctrl->queue_count <= 1)
-		return;
+		goto out;
+	blk_mq_quiesce_queue(ctrl->admin_q);
+	nvme_start_freeze(ctrl);
 	nvme_stop_queues(ctrl);
 	nvme_tcp_stop_io_queues(ctrl);
 	if (ctrl->tagset) {
@@ -1804,6 +1829,8 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
 	if (remove)
 		nvme_start_queues(ctrl);
 	nvme_tcp_destroy_io_queues(ctrl, remove);
+out:
+	mutex_unlock(&to_tcp_ctrl(ctrl)->teardown_lock);
 }
 
 static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
@@ -2042,40 +2069,55 @@ static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
 	nvme_tcp_queue_request(&ctrl->async_req);
 }
 
+static void nvme_tcp_complete_timed_out(struct request *rq)
+{
+	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+	struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
+
+	/* fence other contexts that may complete the command */
+	mutex_lock(&to_tcp_ctrl(ctrl)->teardown_lock);
+	nvme_tcp_stop_queue(ctrl, nvme_tcp_queue_id(req->queue));
+	if (!blk_mq_request_completed(rq)) {
+		nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD;
+		blk_mq_complete_request(rq);
+	}
+	mutex_unlock(&to_tcp_ctrl(ctrl)->teardown_lock);
+}
+
 static enum blk_eh_timer_return
 nvme_tcp_timeout(struct request *rq, bool reserved)
 {
 	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
-	struct nvme_tcp_ctrl *ctrl = req->queue->ctrl;
+	struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
 	struct nvme_tcp_cmd_pdu *pdu = req->pdu;
 
-	/*
-	 * Restart the timer if a controller reset is already scheduled. Any
-	 * timed out commands would be handled before entering the connecting
-	 * state.
-	 */
-	if (ctrl->ctrl.state == NVME_CTRL_RESETTING)
-		return BLK_EH_RESET_TIMER;
-
-	dev_warn(ctrl->ctrl.device,
+	dev_warn(ctrl->device,
		"queue %d: timeout request %#x type %d\n",
		nvme_tcp_queue_id(req->queue), rq->tag, pdu->hdr.type);
 
-	if (ctrl->ctrl.state != NVME_CTRL_LIVE) {
+	if (ctrl->state != NVME_CTRL_LIVE) {
 		/*
-		 * Teardown immediately if controller times out while starting
-		 * or we are already started error recovery. all outstanding
-		 * requests are completed on shutdown, so we return BLK_EH_DONE.
+		 * If we are resetting, connecting or deleting we should
+		 * complete immediately because we may block controller
+		 * teardown or setup sequence
+		 * - ctrl disable/shutdown fabrics requests
+		 * - connect requests
+		 * - initialization admin requests
+		 * - I/O requests that entered after unquiescing and
+		 *   the controller stopped responding
+		 *
+		 * All other requests should be cancelled by the error
+		 * recovery work, so it's fine that we fail it here.
		 */
-		flush_work(&ctrl->err_work);
-		nvme_tcp_teardown_io_queues(&ctrl->ctrl, false);
-		nvme_tcp_teardown_admin_queue(&ctrl->ctrl, false);
+		nvme_tcp_complete_timed_out(rq);
 		return BLK_EH_DONE;
 	}
 
-	dev_warn(ctrl->ctrl.device, "starting error recovery\n");
-	nvme_tcp_error_recovery(&ctrl->ctrl);
-
+	/*
	 * LIVE state should trigger the normal error recovery which will
	 * handle completing this request.
	 */
+	nvme_tcp_error_recovery(ctrl);
 	return BLK_EH_RESET_TIMER;
 }
 
@@ -2302,6 +2344,7 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
			nvme_tcp_reconnect_ctrl_work);
 	INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
 	INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);
+	mutex_init(&ctrl->teardown_lock);
 
 	if (!(opts->mask & NVMF_OPT_TRSVCID)) {
 		opts->trsvcid =
@@ -2360,8 +2403,6 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
 	dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n",
		ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
 
-	nvme_get_ctrl(&ctrl->ctrl);
-
 	mutex_lock(&nvme_tcp_ctrl_mutex);
 	list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list);
 	mutex_unlock(&nvme_tcp_ctrl_mutex);
@@ -2371,6 +2412,7 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
 out_uninit_ctrl:
 	nvme_uninit_ctrl(&ctrl->ctrl);
 	nvme_put_ctrl(&ctrl->ctrl);
+	nvme_put_ctrl(&ctrl->ctrl);
 	if (ret > 0)
 		ret = -EIO;
 	return ERR_PTR(ret);
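The nvme_tcp_try_send_data() hunk above swaps the open-coded PageSlab() test for sendpage_ok(). A rough model of that predicate (the real helper operates on struct page in include/linux/net.h; the struct below is an illustrative stand-in):

#include <stdbool.h>

struct fake_page {
    bool is_slab;  /* kmalloc/slab memory must not be zero-copied */
    int refcount;  /* the page must stay referenced during the send */
};

static bool sendpage_ok_model(const struct fake_page *page)
{
    return !page->is_slab && page->refcount != 0;
}

/* callers then pick the path exactly as the hunk does: kernel_sendpage()
 * (zero-copy) when the predicate holds, sock_no_sendpage() (copying)
 * otherwise */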
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 57a4062cbb59..6b2f1e290fa7 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -369,6 +369,9 @@ static void nvmet_keep_alive_timer(struct work_struct *work)
 
 static void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl)
 {
+	if (unlikely(ctrl->kato == 0))
+		return;
+
 	pr_debug("ctrl %d start keep-alive timer for %d secs\n",
		ctrl->cntlid, ctrl->kato);
 
@@ -378,6 +381,9 @@ static void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl)
 
 static void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl)
 {
+	if (unlikely(ctrl->kato == 0))
+		return;
+
 	pr_debug("ctrl %d stop keep-alive\n", ctrl->cntlid);
 
 	cancel_delayed_work_sync(&ctrl->ka_work);
@@ -1042,7 +1048,8 @@ static void nvmet_start_ctrl(struct nvmet_ctrl *ctrl)
	 * in case a host died before it enabled the controller.  Hence, simply
	 * reset the keep alive timer when the controller is enabled.
	 */
-	mod_delayed_work(system_wq, &ctrl->ka_work, ctrl->kato * HZ);
+	if (ctrl->kato)
+		mod_delayed_work(system_wq, &ctrl->ka_work, ctrl->kato * HZ);
 }
 
 static void nvmet_clear_ctrl(struct nvmet_ctrl *ctrl)
diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c
index ce8d819f86cc..fc35f7ae67b0 100644
--- a/drivers/nvme/target/fc.c
+++ b/drivers/nvme/target/fc.c
@@ -1994,9 +1994,9 @@ nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod)
			return;
 		if (fcpreq->fcp_error ||
		    fcpreq->transferred_length != fcpreq->transfer_length) {
-			spin_lock(&fod->flock);
+			spin_lock_irqsave(&fod->flock, flags);
			fod->abort = true;
-			spin_unlock(&fod->flock);
+			spin_unlock_irqrestore(&fod->flock, flags);
 
			nvmet_req_complete(&fod->req, NVME_SC_INTERNAL);
			return;
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index 11f5aea97d1b..82b87a4c50f6 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -619,8 +619,6 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev,
 	dev_info(ctrl->ctrl.device,
		 "new ctrl: \"%s\"\n", ctrl->ctrl.opts->subsysnqn);
 
-	nvme_get_ctrl(&ctrl->ctrl);
-
 	changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
 	WARN_ON_ONCE(!changed);
 
@@ -638,6 +636,7 @@ out_free_queues:
 	kfree(ctrl->queues);
 out_uninit_ctrl:
 	nvme_uninit_ctrl(&ctrl->ctrl);
+	nvme_put_ctrl(&ctrl->ctrl);
 out_put_ctrl:
 	nvme_put_ctrl(&ctrl->ctrl);
 	if (ret > 0)
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index 36d906a7f70d..b5314164479e 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -75,6 +75,7 @@ enum nvmet_rdma_queue_state {
 
 struct nvmet_rdma_queue {
 	struct rdma_cm_id	*cm_id;
+	struct ib_qp		*qp;
 	struct nvmet_port	*port;
 	struct ib_cq		*cq;
 	atomic_t		sq_wr_avail;
@@ -464,7 +465,7 @@ static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev,
 	if (ndev->srq)
 		ret = ib_post_srq_recv(ndev->srq, &cmd->wr, NULL);
 	else
-		ret = ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, NULL);
+		ret = ib_post_recv(cmd->queue->qp, &cmd->wr, NULL);
 
 	if (unlikely(ret))
 		pr_err("post_recv cmd failed\n");
@@ -503,7 +504,7 @@ static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp)
 	atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);
 
 	if (rsp->n_rdma) {
-		rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
+		rdma_rw_ctx_destroy(&rsp->rw, queue->qp,
				queue->cm_id->port_num, rsp->req.sg,
				rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
 	}
@@ -587,7 +588,7 @@ static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc)
 
 	WARN_ON(rsp->n_rdma <= 0);
 	atomic_add(rsp->n_rdma, &queue->sq_wr_avail);
-	rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
+	rdma_rw_ctx_destroy(&rsp->rw, queue->qp,
			queue->cm_id->port_num, rsp->req.sg,
			rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
 	rsp->n_rdma = 0;
@@ -742,7 +743,7 @@ static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp)
 	}
 
 	if (nvmet_rdma_need_data_in(rsp)) {
-		if (rdma_rw_ctx_post(&rsp->rw, queue->cm_id->qp,
+		if (rdma_rw_ctx_post(&rsp->rw, queue->qp,
				queue->cm_id->port_num, &rsp->read_cqe, NULL))
			nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR);
 	} else {
@@ -1025,6 +1026,7 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
		pr_err("failed to create_qp ret= %d\n", ret);
		goto err_destroy_cq;
 	}
+	queue->qp = queue->cm_id->qp;
 
 	atomic_set(&queue->sq_wr_avail, qp_attr.cap.max_send_wr);
 
@@ -1053,11 +1055,10 @@ err_destroy_cq:
 
 static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue)
 {
-	struct ib_qp *qp = queue->cm_id->qp;
-
-	ib_drain_qp(qp);
-	rdma_destroy_id(queue->cm_id);
-	ib_destroy_qp(qp);
+	ib_drain_qp(queue->qp);
+	if (queue->cm_id)
+		rdma_destroy_id(queue->cm_id);
+	ib_destroy_qp(queue->qp);
 	ib_free_cq(queue->cq);
 }
 
@@ -1291,9 +1292,12 @@ static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
 
 	ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn);
 	if (ret) {
-		schedule_work(&queue->release_work);
-		/* Destroying rdma_cm id is not needed here */
-		return 0;
+		/*
+		 * Don't destroy the cm_id in free path, as we implicitly
+		 * destroy the cm_id here with non-zero ret code.
		 */
+		queue->cm_id = NULL;
+		goto free_queue;
 	}
 
 	mutex_lock(&nvmet_rdma_queue_mutex);
@@ -1302,6 +1306,8 @@ static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
 
 	return 0;
 
+free_queue:
+	nvmet_rdma_free_queue(queue);
 put_device:
 	kref_put(&ndev->ref, nvmet_rdma_free_dev);
 
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 22014e76d771..e31823f19a0f 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -150,6 +150,11 @@ static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd);
 static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue,
		struct nvmet_tcp_cmd *cmd)
 {
+	if (unlikely(!queue->nr_cmds)) {
+		/* We didn't allocate cmds yet, send 0xffff */
+		return USHRT_MAX;
+	}
+
 	return cmd - queue->cmds;
 }
 
@@ -847,7 +852,10 @@ static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue)
 	struct nvme_tcp_data_pdu *data = &queue->pdu.data;
 	struct nvmet_tcp_cmd *cmd;
 
-	cmd = &queue->cmds[data->ttag];
+	if (likely(queue->nr_cmds))
+		cmd = &queue->cmds[data->ttag];
+	else
+		cmd = &queue->connect;
 
 	if (le32_to_cpu(data->data_offset) != cmd->rbytes_done) {
		pr_err("ttag %u unexpected data offset %u (expected %u)\n",