From 762380ad9322951cea4ce9d24864265f9c66a916 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 5 Jun 2014 13:38:39 -0600 Subject: block: add notion of a chunk size for request merging Some drivers have different limits on what size a request should optimally be, depending on the offset of the request. Similar to dividing a device into chunks. Add a setting that allows the driver to inform the block layer of such a chunk size. The block layer will then prevent merging across the chunks. This is needed to optimally support NVMe with a non-zero stripe size. Signed-off-by: Jens Axboe --- block/bio.c | 3 ++- block/blk-settings.c | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 96d28eee8a1e..97e832cc9b9c 100644 --- a/block/bio.c +++ b/block/bio.c @@ -849,7 +849,8 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); - return __bio_add_page(q, bio, page, len, offset, queue_max_sectors(q)); + + return __bio_add_page(q, bio, page, len, offset, blk_max_size_offset(q, bio->bi_iter.bi_sector)); } EXPORT_SYMBOL(bio_add_page); diff --git a/block/blk-settings.c b/block/blk-settings.c index 5d21239bc859..a2b9cb195e70 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -113,6 +113,7 @@ void blk_set_default_limits(struct queue_limits *lim) lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS; + lim->chunk_sectors = 0; lim->max_write_same_sectors = 0; lim->max_discard_sectors = 0; lim->discard_granularity = 0; @@ -276,6 +277,23 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto } EXPORT_SYMBOL(blk_queue_max_hw_sectors); +/** + * blk_queue_chunk_sectors - set size of the chunk for this queue + * @q: the request queue for the device + * @chunk_sectors: chunk sectors in the usual 512b unit + * + * Description: + * If a driver doesn't want IOs to cross a given chunk size, it can set + * this limit and prevent merging across chunks. Note that the chunk size + * must currently be a power-of-2 in sectors. + **/ +void blk_queue_chunk_sectors(struct request_queue *q, unsigned int chunk_sectors) +{ + BUG_ON(!is_power_of_2(chunk_sectors)); + q->limits.chunk_sectors = chunk_sectors; +} +EXPORT_SYMBOL(blk_queue_chunk_sectors); + /** * blk_queue_max_discard_sectors - set max sectors for a single discard * @q: the request queue for the device -- cgit v1.2.3 From f27b087b81b70513b8c61ec20596c868f7b93474 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 6 Jun 2014 07:57:37 -0600 Subject: block: add blk_rq_set_block_pc() With the optimizations around not clearing the full request at alloc time, we are leaving some of the needed init for REQ_TYPE_BLOCK_PC up to the user allocating the request. Add a blk_rq_set_block_pc() that sets the command type to REQ_TYPE_BLOCK_PC, and properly initializes the members associated with this type of request. Update callers to use this function instead of manipulating rq->cmd_type directly. Includes fixes from Christoph Hellwig for my half-assed attempt. Signed-off-by: Jens Axboe --- block/blk-core.c | 18 ++++++++++++++++++ block/bsg.c | 3 ++- block/scsi_ioctl.c | 6 +++--- 3 files changed, 23 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 40d654861c33..9aca8c71e70b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1218,6 +1218,8 @@ struct request *blk_make_request(struct request_queue *q, struct bio *bio, if (unlikely(!rq)) return ERR_PTR(-ENOMEM); + blk_rq_set_block_pc(rq); + for_each_bio(bio) { struct bio *bounce_bio = bio; int ret; @@ -1234,6 +1236,22 @@ struct request *blk_make_request(struct request_queue *q, struct bio *bio, } EXPORT_SYMBOL(blk_make_request); +/** + * blk_rq_set_block_pc - initialize a requeest to type BLOCK_PC + * @rq: request to be initialized + * + */ +void blk_rq_set_block_pc(struct request *rq) +{ + rq->cmd_type = REQ_TYPE_BLOCK_PC; + rq->__data_len = 0; + rq->__sector = (sector_t) -1; + rq->bio = rq->biotail = NULL; + memset(rq->__cmd, 0, sizeof(rq->__cmd)); + rq->cmd = rq->__cmd; +} +EXPORT_SYMBOL(blk_rq_set_block_pc); + /** * blk_requeue_request - put a request back on queue * @q: request queue where request should be inserted diff --git a/block/bsg.c b/block/bsg.c index e5214c148096..ff46addde5d8 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -196,7 +196,6 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, * fill in request structure */ rq->cmd_len = hdr->request_len; - rq->cmd_type = REQ_TYPE_BLOCK_PC; rq->timeout = msecs_to_jiffies(hdr->timeout); if (!rq->timeout) @@ -273,6 +272,8 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm, rq = blk_get_request(q, rw, GFP_KERNEL); if (!rq) return ERR_PTR(-ENOMEM); + blk_rq_set_block_pc(rq); + ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd, has_write_perm); if (ret) goto out; diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 9c28a5b38042..14695c6221c8 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -229,7 +229,6 @@ static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq, * fill in request structure */ rq->cmd_len = hdr->cmd_len; - rq->cmd_type = REQ_TYPE_BLOCK_PC; rq->timeout = msecs_to_jiffies(hdr->timeout); if (!rq->timeout) @@ -311,6 +310,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, rq = blk_get_request(q, writing ? WRITE : READ, GFP_KERNEL); if (!rq) return -ENOMEM; + blk_rq_set_block_pc(rq); if (blk_fill_sghdr_rq(q, rq, hdr, mode)) { blk_put_request(rq); @@ -491,7 +491,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, memset(sense, 0, sizeof(sense)); rq->sense = sense; rq->sense_len = 0; - rq->cmd_type = REQ_TYPE_BLOCK_PC; + blk_rq_set_block_pc(rq); blk_execute_rq(q, disk, rq, 0); @@ -524,7 +524,7 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk, int err; rq = blk_get_request(q, WRITE, __GFP_WAIT); - rq->cmd_type = REQ_TYPE_BLOCK_PC; + blk_rq_set_block_pc(rq); rq->timeout = BLK_DEFAULT_SG_TIMEOUT; rq->cmd[0] = cmd; rq->cmd[4] = data; -- cgit v1.2.3 From a4391c6465d9c978fd4bded12e34bdde3f5458f0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 5 Jun 2014 15:21:56 -0600 Subject: blk-mq: bump max tag depth to 10K tags For some scsi-mq cases, the tag map can be huge. So increase the max number of tags we support. Additionally, don't fail with EINVAL if a user requests too many tags. Warn that the tag depth has been adjusted down, and store the new value inside the tag_set passed in. Signed-off-by: Jens Axboe --- block/blk-mq.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 4e4cd6208052..a6ee74e27957 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1967,13 +1967,19 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb, return NOTIFY_OK; } +/* + * Alloc a tag set to be associated with one or more request queues. + * May fail with EINVAL for various error conditions. May adjust the + * requested depth down, if if it too large. In that case, the set + * value will be stored in set->queue_depth. + */ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) { int i; if (!set->nr_hw_queues) return -EINVAL; - if (!set->queue_depth || set->queue_depth > BLK_MQ_MAX_DEPTH) + if (!set->queue_depth) return -EINVAL; if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) return -EINVAL; @@ -1981,6 +1987,11 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (!set->nr_hw_queues || !set->ops->queue_rq || !set->ops->map_queue) return -EINVAL; + if (set->queue_depth > BLK_MQ_MAX_DEPTH) { + pr_info("blk-mq: reduced tag depth to %u\n", + BLK_MQ_MAX_DEPTH); + set->queue_depth = BLK_MQ_MAX_DEPTH; + } set->tags = kmalloc_node(set->nr_hw_queues * sizeof(struct blk_mq_tags *), -- cgit v1.2.3 From 3b632cf0eaa2e89a12c18f043e6e7c5bcc003645 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 6 Jun 2014 10:22:07 -0600 Subject: blk-mq: don't allow queue entering for a dying queue If the queue is going away, don't let new allocs or queueing happen on it. Go through the normal wait process, and exit with ENODEV in that case. Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index a6ee74e27957..75fc33f34251 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -82,8 +82,10 @@ static int blk_mq_queue_enter(struct request_queue *q) __percpu_counter_add(&q->mq_usage_counter, 1, 1000000); smp_wmb(); - /* we have problems to freeze the queue if it's initializing */ - if (!blk_queue_bypass(q) || !blk_queue_init_done(q)) + + /* we have problems freezing the queue if it's initializing */ + if (!blk_queue_dying(q) && + (!blk_queue_bypass(q) || !blk_queue_init_done(q))) return 0; __percpu_counter_add(&q->mq_usage_counter, -1, 1000000); -- cgit v1.2.3 From f6be4fb4bcb396fc3b1c134b7863351972de081f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 6 Jun 2014 11:03:48 -0600 Subject: blk-mq: ->timeout should be cleared in blk_mq_rq_ctx_init() It'll be used in blk_mq_start_request() to set a potential timeout for the request, so clear it to zero at alloc time to ensure that we know if someone has set it or not. Fixes random early timeouts on NVMe testing. Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 75fc33f34251..fafea52281ac 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -204,6 +204,8 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, rq->sense = NULL; INIT_LIST_HEAD(&rq->timeout_list); + rq->timeout = 0; + rq->end_io = NULL; rq->end_io_data = NULL; rq->next_rq = NULL; -- cgit v1.2.3 From de83953f9d710f84c4a162a1d498a73475c07d98 Mon Sep 17 00:00:00 2001 From: Rickard Strandqvist Date: Sat, 7 Jun 2014 00:37:26 +0200 Subject: block: blk-exec.c: Cleaning up local variable address returnd Address of local variable assigned to a function parameter This was partly found using a static code analysis program called cppcheck. Signed-off-by: Rickard Strandqvist Signed-off-by: Jens Axboe --- block/blk-exec.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'block') diff --git a/block/blk-exec.c b/block/blk-exec.c index dbf4502b1d67..f4d27b12c90b 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -132,6 +132,11 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, if (rq->errors) err = -EIO; + if (rq->sense == sense) { + rq->sense = NULL; + rq->sense_len = 0; + } + return err; } EXPORT_SYMBOL(blk_execute_rq); -- cgit v1.2.3 From 3ee3237239583a6555db4f297d00eebdbb6d76ad Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 9 Jun 2014 09:36:53 -0600 Subject: blk-mq: always initialize request->start_time The blk-mq core only initializes this if io stats are enabled, since blk-mq only reads the field in that case. But drivers could potentially use it internally, so ensure that we always set it to the current time when the request is allocated. Reported-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index fafea52281ac..a5ea37d7e820 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -185,6 +185,7 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, RB_CLEAR_NODE(&rq->rb_node); rq->rq_disk = NULL; rq->part = NULL; + rq->start_time = jiffies; #ifdef CONFIG_BLK_CGROUP rq->rl = NULL; set_start_time_ns(rq); @@ -1104,10 +1105,8 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) { init_request_from_bio(rq, bio); - if (blk_do_io_stat(rq)) { - rq->start_time = jiffies; + if (blk_do_io_stat(rq)) blk_account_io_start(rq, 1); - } } static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx, -- cgit v1.2.3 From 2b8393b43ec672bb263009cd74c056ab01d6ac17 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 10 Jun 2014 00:16:41 +0800 Subject: blk-mq: add timer in blk_mq_start_request This way will become consistent with non-mq case, also avoid to update rq->deadline twice for mq. The comment said: "We do this early, to ensure we are on the right CPU.", but no percpu stuff is used in blk_add_timer(), so it isn't necessary. Even when inserting from plug list, there is no such guarantee at all. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index a5ea37d7e820..e11f5f8e0313 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -411,16 +411,7 @@ static void blk_mq_start_request(struct request *rq, bool last) if (unlikely(blk_bidi_rq(rq))) rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq); - /* - * Just mark start time and set the started bit. Due to memory - * ordering, we know we'll see the correct deadline as long as - * REQ_ATOMIC_STARTED is seen. Use the default queue timeout, - * unless one has been set in the request. - */ - if (!rq->timeout) - rq->deadline = jiffies + q->rq_timeout; - else - rq->deadline = jiffies + rq->timeout; + blk_add_timer(rq); /* * Mark us as started and clear complete. Complete might have been @@ -972,11 +963,6 @@ static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, list_add_tail(&rq->queuelist, &ctx->rq_list); blk_mq_hctx_mark_pending(hctx, ctx); - - /* - * We do this early, to ensure we are on the right CPU. - */ - blk_add_timer(rq); } void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue, @@ -1219,7 +1205,6 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_mq_bio_to_request(rq, bio); blk_mq_start_request(rq, true); - blk_add_timer(rq); /* * For OK queue, we are done. For error, kill it. Any other -- cgit v1.2.3 From 58a4915ad2f8a87f4456aac260396df7e300e6f2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 10 Jun 2014 12:53:56 -0600 Subject: block: ensure that bio_add_page() always accepts a page for an empty bio With commit 762380ad9322 added support for chunk sizes and no merging across them, it broke the rule of always allowing adding of a single page to an empty bio. So relax the restriction a bit to allow for that, similarly to what we have always done. This fixes a crash with mkfs.xfs and 512b sector sizes on NVMe. Reported-by: Keith Busch Signed-off-by: Jens Axboe --- block/bio.c | 7 ++++++- block/blk-settings.c | 5 ++++- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 97e832cc9b9c..2d64488e51c6 100644 --- a/block/bio.c +++ b/block/bio.c @@ -849,8 +849,13 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); + unsigned int max_sectors; - return __bio_add_page(q, bio, page, len, offset, blk_max_size_offset(q, bio->bi_iter.bi_sector)); + max_sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector); + if ((max_sectors < (len >> 9)) && !bio->bi_iter.bi_size) + max_sectors = len >> 9; + + return __bio_add_page(q, bio, page, len, offset, max_sectors); } EXPORT_SYMBOL(bio_add_page); diff --git a/block/blk-settings.c b/block/blk-settings.c index a2b9cb195e70..f1a1795a5683 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -285,7 +285,10 @@ EXPORT_SYMBOL(blk_queue_max_hw_sectors); * Description: * If a driver doesn't want IOs to cross a given chunk size, it can set * this limit and prevent merging across chunks. Note that the chunk size - * must currently be a power-of-2 in sectors. + * must currently be a power-of-2 in sectors. Also note that the block + * layer must accept a page worth of data at any offset. So if the + * crossing of chunks is a hard limitation in the driver, it must still be + * prepared to split single page bios. **/ void blk_queue_chunk_sectors(struct request_queue *q, unsigned int chunk_sectors) { -- cgit v1.2.3 From b5097e956a4d2919ee248d6481e4204c5568ed5c Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Tue, 10 Jun 2014 20:04:50 +0200 Subject: block: add __init to elv_register elv_register is only called by elevator init functions: __init cfq_init __init deadline_init __init noop_init Cc: Andrew Morton Signed-off-by: Fabian Frederick Signed-off-by: Jens Axboe --- block/elevator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index 1e01b66a0b92..f35edddfe9b5 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -845,7 +845,7 @@ void elv_unregister_queue(struct request_queue *q) } EXPORT_SYMBOL(elv_unregister_queue); -int elv_register(struct elevator_type *e) +int __init elv_register(struct elevator_type *e) { char *def = ""; -- cgit v1.2.3 From a2d445d440003f2d70ee4cd4970ea82ace616fee Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Tue, 10 Jun 2014 20:18:36 +0200 Subject: block: add __init to blkcg_policy_register blkcg_policy_register is only called by __init functions: __init cfq_init __init throtl_init Cc: Andrew Morton Signed-off-by: Fabian Frederick Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 2 +- block/blk-cgroup.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 1039fb9ff5f5..5aa9b8b8a362 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1093,7 +1093,7 @@ EXPORT_SYMBOL_GPL(blkcg_deactivate_policy); * Register @pol with blkcg core. Might sleep and @pol may be modified on * successful registration. Returns 0 on success and -errno on failure. */ -int blkcg_policy_register(struct blkcg_policy *pol) +int __init blkcg_policy_register(struct blkcg_policy *pol) { int i, ret; diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 371fe8e92ab5..5480adedd387 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -145,7 +145,7 @@ void blkcg_drain_queue(struct request_queue *q); void blkcg_exit_queue(struct request_queue *q); /* Blkio controller policy registration */ -int blkcg_policy_register(struct blkcg_policy *pol); +int __init blkcg_policy_register(struct blkcg_policy *pol); void blkcg_policy_unregister(struct blkcg_policy *pol); int blkcg_activate_policy(struct request_queue *q, const struct blkcg_policy *pol); @@ -580,7 +580,7 @@ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { ret static inline int blkcg_init_queue(struct request_queue *q) { return 0; } static inline void blkcg_drain_queue(struct request_queue *q) { } static inline void blkcg_exit_queue(struct request_queue *q) { } -static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } +static inline int __init blkcg_policy_register(struct blkcg_policy *pol) { return 0; } static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { } static inline int blkcg_activate_policy(struct request_queue *q, const struct blkcg_policy *pol) { return 0; } -- cgit v1.2.3