From 5a92e700af2e5e0e6404988d6a7f2ed3dad3f46f Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 21 Feb 2014 14:13:44 -0700 Subject: NVMe: RCU protected access to io queues This adds rcu protected access to nvme_queue to fix a race between a surprise removal freeing the queue and a thread with open reference on an NVMe block device using that queue. The queues do not need to be rcu protected during the initialization or shutdown parts, so I've added a helper function for raw dereferencing to get around the sparse errors. There is still a hole in the IOCTL path for the same problem, which is fixed in a subsequent patch. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- include/linux/nvme.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 69ae03f6eb15..98d367b06f9c 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -73,7 +73,7 @@ enum { */ struct nvme_dev { struct list_head node; - struct nvme_queue **queues; + struct nvme_queue __rcu **queues; u32 __iomem *dbs; struct pci_dev *pci_dev; struct dma_pool *prp_page_pool; -- cgit v1.2.3 From 4f5099af4f3d5f999d8ab7784472d93e810e3912 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 3 Mar 2014 16:39:13 -0700 Subject: NVMe: IOCTL path RCU protect queue access This adds rcu protected access to a queue in the nvme IOCTL path to fix potential races between a surprise removal and queue usage in nvme_submit_sync_cmd. The fix holds the rcu_read_lock() here to prevent the nvme_queue from freeing while this path is executing so it can't sleep, and so this path will no longer wait for an available command id should they all be in use at the time a passthrough IOCTL request is received. 
Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- include/linux/nvme.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 98d367b06f9c..7c3f85bc10f1 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -151,10 +151,7 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, unsigned long addr, unsigned length); void nvme_unmap_user_pages(struct nvme_dev *dev, int write, struct nvme_iod *iod); -struct nvme_queue *get_nvmeq(struct nvme_dev *dev); -void put_nvmeq(struct nvme_queue *nvmeq); -int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd, - u32 *result, unsigned timeout); +int nvme_submit_io_cmd(struct nvme_dev *, struct nvme_command *, u32 *); int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns); int nvme_submit_admin_cmd(struct nvme_dev *, struct nvme_command *, u32 *result); -- cgit v1.2.3 From 42f614201e80ff4cfb8b285d7190149a8e1e6cec Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 24 Mar 2014 10:46:25 -0600 Subject: NVMe: per-cpu io queues The device's IO queues are associated with CPUs, so we can use a per-cpu variable to map a qid to a cpu. This provides a convenient way to optimally assign queues to multiple cpus when the device supports fewer queues than the host has cpus. The previous implementation may have assigned these poorly in these situations. This patch addresses this by sharing queues among cpus that are "close" together and should have a lower lock contention penalty. 
Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- include/linux/nvme.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 7c3f85bc10f1..f0f95c719685 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -74,12 +74,16 @@ enum { struct nvme_dev { struct list_head node; struct nvme_queue __rcu **queues; + unsigned short __percpu *io_queue; u32 __iomem *dbs; struct pci_dev *pci_dev; struct dma_pool *prp_page_pool; struct dma_pool *prp_small_pool; int instance; - int queue_count; + unsigned queue_count; + unsigned online_queues; + unsigned max_qid; + int q_depth; u32 db_stride; u32 ctrl_config; struct msix_entry *entry; -- cgit v1.2.3 From 33b1e95c90447ea73e37e837ea0268a894919f19 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 24 Mar 2014 10:46:26 -0600 Subject: NVMe: CPU hot plug notification Registers with hot cpu notification to rebalance, and potentially allocate additional, io queues. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- include/linux/nvme.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index f0f95c719685..15d071eba8b8 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -92,6 +92,7 @@ struct nvme_dev { struct kref kref; struct miscdevice miscdev; struct work_struct reset_work; + struct notifier_block nb; char name[12]; char serial[20]; char model[40]; -- cgit v1.2.3 From b355084a891985d4cd0ca23b1a83366af2c4232d Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 4 Apr 2014 11:43:36 -0600 Subject: NVMe: Make I/O timeout a module parameter Increase the default timeout to 30 seconds to match SCSI. 
Signed-off-by: Keith Busch [use byte instead of ushort] Signed-off-by: Matthew Wilcox --- include/linux/nvme.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 15d071eba8b8..1da0807c65bc 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -66,7 +66,8 @@ enum { #define NVME_VS(major, minor) (major << 16 | minor) -#define NVME_IO_TIMEOUT (5 * HZ) +extern unsigned char io_timeout; +#define NVME_IO_TIMEOUT (io_timeout * HZ) /* * Represents an NVM Express device. Each nvme_dev is a PCI function. -- cgit v1.2.3 From edd10d33283899fb15d99a290dcc9ceb3604ca78 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 3 Apr 2014 16:45:23 -0600 Subject: NVMe: Retry failed commands with non-fatal errors For commands returned with failed status, queue these for resubmission and continue retrying them until success or for a limited amount of time. The final timeout was arbitrarily chosen so requests can't be retried indefinitely. Since these are requeued on the nvmeq that submitted the command, the callbacks have to take an nvmeq instead of an nvme_dev as a parameter so that we can use the locked queue to append the iod to retry later. The nvme_iod conveniently can be used to track how long we've been trying to successfully complete an iod request. The nvme_iod also provides the nvme prp dma mappings, so I had to move a few things around so we can keep those mappings. 
Signed-off-by: Keith Busch [fixed checkpatch issue with long line] Signed-off-by: Matthew Wilcox --- include/linux/nvme.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 1da0807c65bc..b95431d0338b 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -136,6 +136,7 @@ struct nvme_iod { int length; /* Of data, in bytes */ unsigned long start_time; dma_addr_t first_dma; + struct list_head node; struct scatterlist sg[0]; }; @@ -151,8 +152,7 @@ static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector) */ void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod); -int nvme_setup_prps(struct nvme_dev *dev, struct nvme_common_command *cmd, - struct nvme_iod *iod, int total_len, gfp_t gfp); +int nvme_setup_prps(struct nvme_dev *, struct nvme_iod *, int , gfp_t); struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, unsigned long addr, unsigned length); void nvme_unmap_user_pages(struct nvme_dev *dev, int write, -- cgit v1.2.3