From b62c21b71f08b7a4bfd025616ff1da2913a82904 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 12 Mar 2015 23:56:02 -0400
Subject: blk-mq: add blk_mq_init_allocated_queue and export
 blk_mq_register_disk

Add a variant of blk_mq_init_queue that allows a previously allocated
queue to be initialized.  blk_mq_init_allocated_queue models
blk_init_allocated_queue -- which was also created for DM's use.

DM's approach to device creation requires a placeholder request_queue be
allocated for use with alloc_dev() but the decision about what type of
request_queue will be ultimately created is deferred until all component
devices referenced in the DM table are processed to determine the table
type (request-based, blk-mq request-based, or bio-based).

Also, because of DM's late finalization of the request_queue type
the call to blk_mq_register_disk() doesn't happen during alloc_dev().
Must export blk_mq_register_disk() so that DM can backfill the 'mq' dir
once the blk-mq queue is fully allocated.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Reviewed-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq-sysfs.c |  1 +
 block/blk-mq.c       | 30 ++++++++++++++++++++----------
 2 files changed, 21 insertions(+), 10 deletions(-)

(limited to 'block')

diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 1630a20d5dcf..b79685e06b70 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -436,6 +436,7 @@ int blk_mq_register_disk(struct gendisk *disk)
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(blk_mq_register_disk);
 
 void blk_mq_sysfs_unregister(struct request_queue *q)
 {
diff --git a/block/blk-mq.c b/block/blk-mq.c
index b7b8933ec241..3000121840bb 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1890,10 +1890,26 @@ void blk_mq_release(struct request_queue *q)
 }
 
 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
+{
+	struct request_queue *uninit_q, *q;
+
+	uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
+	if (!uninit_q)
+		return ERR_PTR(-ENOMEM);
+
+	q = blk_mq_init_allocated_queue(set, uninit_q);
+	if (IS_ERR(q))
+		blk_cleanup_queue(uninit_q);
+
+	return q;
+}
+EXPORT_SYMBOL(blk_mq_init_queue);
+
+struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
+						  struct request_queue *q)
 {
 	struct blk_mq_hw_ctx **hctxs;
 	struct blk_mq_ctx __percpu *ctx;
-	struct request_queue *q;
 	unsigned int *map;
 	int i;
 
@@ -1928,17 +1944,13 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 		hctxs[i]->queue_num = i;
 	}
 
-	q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
-	if (!q)
-		goto err_hctxs;
-
 	/*
 	 * Init percpu_ref in atomic mode so that it's faster to shutdown.
 	 * See blk_register_queue() for details.
 	 */
 	if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release,
 			    PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
-		goto err_mq_usage;
+		goto err_hctxs;
 
 	setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
 	blk_queue_rq_timeout(q, 30000);
@@ -1981,7 +1993,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 	blk_mq_init_cpu_queues(q, set->nr_hw_queues);
 
 	if (blk_mq_init_hw_queues(q, set))
-		goto err_mq_usage;
+		goto err_hctxs;
 
 	mutex_lock(&all_q_mutex);
 	list_add_tail(&q->all_q_node, &all_q_list);
@@ -1993,8 +2005,6 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 
 	return q;
 
-err_mq_usage:
-	blk_cleanup_queue(q);
 err_hctxs:
 	kfree(map);
 	for (i = 0; i < set->nr_hw_queues; i++) {
@@ -2009,7 +2019,7 @@ err_percpu:
 	free_percpu(ctx);
 	return ERR_PTR(-ENOMEM);
 }
-EXPORT_SYMBOL(blk_mq_init_queue);
+EXPORT_SYMBOL(blk_mq_init_allocated_queue);
 
 void blk_mq_free_queue(struct request_queue *q)
 {
-- 
cgit v1.2.3


From b94ec296403e99d5ac9a8c48332cec4118d44b94 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Wed, 11 Mar 2015 23:56:38 -0400
Subject: blk-mq: export blk_mq_run_hw_queues

Rename blk_mq_run_queues to blk_mq_run_hw_queues, add async argument,
and export it.

DM's suspend support must be able to run the queue without starting
stopped hw queues.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'block')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3000121840bb..06614ce0f475 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -33,7 +33,6 @@ static DEFINE_MUTEX(all_q_mutex);
 static LIST_HEAD(all_q_list);
 
 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx);
-static void blk_mq_run_queues(struct request_queue *q);
 
 /*
  * Check if any of the ctx's have pending work in this hardware queue
@@ -118,7 +117,7 @@ void blk_mq_freeze_queue_start(struct request_queue *q)
 
 	if (freeze) {
 		percpu_ref_kill(&q->mq_usage_counter);
-		blk_mq_run_queues(q);
+		blk_mq_run_hw_queues(q, false);
 	}
 }
 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start);
@@ -904,7 +903,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 			&hctx->run_work, 0);
 }
 
-static void blk_mq_run_queues(struct request_queue *q)
+void blk_mq_run_hw_queues(struct request_queue *q, bool async)
 {
 	struct blk_mq_hw_ctx *hctx;
 	int i;
@@ -915,9 +914,10 @@ static void blk_mq_run_queues(struct request_queue *q)
 		    test_bit(BLK_MQ_S_STOPPED, &hctx->state))
 			continue;
 
-		blk_mq_run_hw_queue(hctx, false);
+		blk_mq_run_hw_queue(hctx, async);
 	}
 }
+EXPORT_SYMBOL(blk_mq_run_hw_queues);
 
 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
-- 
cgit v1.2.3


From bfd343aa1718457d34b99ce6573085ac340da288 Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Wed, 11 Mar 2015 23:56:39 -0400
Subject: blk-mq: don't wait in blk_mq_queue_enter() if __GFP_WAIT isn't set

Return -EBUSY if we're unable to enter a queue immediately when
allocating a blk-mq request without __GFP_WAIT.

Signed-off-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'block')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 06614ce0f475..59fa23935a0f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -77,7 +77,7 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
 	clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
 }
 
-static int blk_mq_queue_enter(struct request_queue *q)
+static int blk_mq_queue_enter(struct request_queue *q, gfp_t gfp)
 {
 	while (true) {
 		int ret;
@@ -85,6 +85,9 @@ static int blk_mq_queue_enter(struct request_queue *q)
 		if (percpu_ref_tryget_live(&q->mq_usage_counter))
 			return 0;
 
+		if (!(gfp & __GFP_WAIT))
+			return -EBUSY;
+
 		ret = wait_event_interruptible(q->mq_freeze_wq,
 				!q->mq_freeze_depth || blk_queue_dying(q));
 		if (blk_queue_dying(q))
@@ -256,7 +259,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp,
 	struct blk_mq_alloc_data alloc_data;
 	int ret;
 
-	ret = blk_mq_queue_enter(q);
+	ret = blk_mq_queue_enter(q, gfp);
 	if (ret)
 		return ERR_PTR(ret);
 
@@ -1186,7 +1189,7 @@ static struct request *blk_mq_map_request(struct request_queue *q,
 	int rw = bio_data_dir(bio);
 	struct blk_mq_alloc_data alloc_data;
 
-	if (unlikely(blk_mq_queue_enter(q))) {
+	if (unlikely(blk_mq_queue_enter(q, GFP_KERNEL))) {
 		bio_endio(bio, -EIO);
 		return NULL;
 	}
-- 
cgit v1.2.3


From 271508dba2c3fc307e7c44e2731a2ece70a4025e Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Tue, 24 Mar 2015 16:21:16 -0700
Subject: block: allocate request memory local to request queue

blk_init_rl() allocates a mempool using mempool_create_node() with node
local memory.  This only allocates the mempool and element list locally
to the requeue queue node.

What we really want to do is allocate the request itself local to the
queue.  To do this, we need our own alloc and free functions that will
allocate from request_cachep and pass the request queue node in to prefer
node local memory.

Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index 794c3e7f01cf..fd154b94447a 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -557,6 +557,18 @@ void blk_cleanup_queue(struct request_queue *q)
 }
 EXPORT_SYMBOL(blk_cleanup_queue);
 
+/* Allocate memory local to the request queue */
+static void *alloc_request_struct(gfp_t gfp_mask, void *data)
+{
+	int nid = (int)(long)data;
+	return kmem_cache_alloc_node(request_cachep, gfp_mask, nid);
+}
+
+static void free_request_struct(void *element, void *unused)
+{
+	kmem_cache_free(request_cachep, element);
+}
+
 int blk_init_rl(struct request_list *rl, struct request_queue *q,
 		gfp_t gfp_mask)
 {
@@ -569,9 +581,10 @@ int blk_init_rl(struct request_list *rl, struct request_queue *q,
 	init_waitqueue_head(&rl->wait[BLK_RW_SYNC]);
 	init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]);
 
-	rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
-					  mempool_free_slab, request_cachep,
-					  gfp_mask, q->node);
+	rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, alloc_request_struct,
+					  free_request_struct,
+					  (void *)(long)q->node, gfp_mask,
+					  q->node);
 	if (!rl->rq_pool)
 		return -ENOMEM;
 
-- 
cgit v1.2.3


From f9018ac9308ea415e659cfbdda040504ef92597b Mon Sep 17 00:00:00 2001
From: Xiaoguang Wang <wangxg.fnst@cn.fujitsu.com>
Date: Mon, 30 Mar 2015 13:19:14 +0800
Subject: block: remove redundant check about 'set->nr_hw_queues' in
 blk_mq_alloc_tag_set()

At the beginning of blk_mq_alloc_tag_set(), we have already checked whether
'set->nr_hw_queues' is zero, so here remove this redundant check.

Signed-off-by: Xiaoguang Wang <wangxg.fnst@cn.fujitsu.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 59fa23935a0f..37f14362aa15 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2174,7 +2174,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 	if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
 		return -EINVAL;
 
-	if (!set->nr_hw_queues || !set->ops->queue_rq || !set->ops->map_queue)
+	if (!set->ops->queue_rq || !set->ops->map_queue)
 		return -EINVAL;
 
 	if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
-- 
cgit v1.2.3


From c76cbbcf404475f8885b2252049dac99b0614868 Mon Sep 17 00:00:00 2001
From: Wei Fang <fangwei1@huawei.com>
Date: Mon, 30 Mar 2015 09:07:00 -0600
Subject: blk-mq: put blk_queue_rq_timeout together in blk_mq_init_queue()

Don't assign ->rq_timeout twice.

Signed-off-by: Wei Fang <fangwei1@huawei.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'block')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 37f14362aa15..1192f85e5ff3 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1956,7 +1956,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 		goto err_hctxs;
 
 	setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
-	blk_queue_rq_timeout(q, 30000);
+	blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30000);
 
 	q->nr_queues = nr_cpu_ids;
 	q->nr_hw_queues = set->nr_hw_queues;
@@ -1982,9 +1982,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	else
 		blk_queue_make_request(q, blk_sq_make_request);
 
-	if (set->timeout)
-		blk_queue_rq_timeout(q, set->timeout);
-
 	/*
 	 * Do this after blk_queue_make_request() overrides it...
 	 */
-- 
cgit v1.2.3


From 889fa31f00b218a2cef96c32a6b3f57e6d3bf918 Mon Sep 17 00:00:00 2001
From: Chong Yuan <chong.yuan@memblaze.com>
Date: Wed, 15 Apr 2015 11:39:29 -0600
Subject: blk-mq: reduce unnecessary software queue looping

In flush_busy_ctxs() and blk_mq_hctx_has_pending(), regardless of how many
ctxs assigned to one hctx, they will all loop hctx->ctx_map.map_size
times. Here hctx->ctx_map.map_size is a const ALIGN(nr_cpu_ids, 8) / 8.
Especially, flush_busy_ctxs() is in hot code path. And it's unnecessary.
Change ->map_size to contain the actually mapped software queues, so we
only loop for as many iterations as we have to.

And remove cpumask setting and nr_ctx count in blk_mq_init_cpu_queues()
since they are all re-done in blk_mq_map_swqueue().
blk_mq_map_swqueue().

Signed-off-by: Chong Yuan <chong.yuan@memblaze.com>
Reviewed-by: Wenbo Wang <wenbo.wang@memblaze.com>

Updated by me for formatting and commenting.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'block')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 1192f85e5ff3..0b49e42e5310 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1522,8 +1522,6 @@ static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node)
 	if (!bitmap->map)
 		return -ENOMEM;
 
-	bitmap->map_size = num_maps;
-
 	total = nr_cpu_ids;
 	for (i = 0; i < num_maps; i++) {
 		bitmap->map[i].depth = min(total, bitmap->bits_per_word);
@@ -1764,8 +1762,6 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
 			continue;
 
 		hctx = q->mq_ops->map_queue(q, i);
-		cpumask_set_cpu(i, hctx->cpumask);
-		hctx->nr_ctx++;
 
 		/*
 		 * Set local node, IFF we have more than one hw queue. If
@@ -1802,6 +1798,8 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 	}
 
 	queue_for_each_hw_ctx(q, hctx, i) {
+		struct blk_mq_ctxmap *map = &hctx->ctx_map;
+
 		/*
 		 * If no software queues are mapped to this hardware queue,
 		 * disable it and free the request entries.
@@ -1817,6 +1815,13 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 			continue;
 		}
 
+		/*
+		 * Set the map size to the number of mapped software queues.
+		 * This is more accurate and more efficient than looping
+		 * over all possibly mapped software queues.
+		 */
+		map->map_size = hctx->nr_ctx / map->bits_per_word;
+
 		/*
 		 * Initialize batch roundrobin counts
 		 */
-- 
cgit v1.2.3