From 4495a7d41dbda03841c2a1c2a5ce7135a45131ba Mon Sep 17 00:00:00 2001
From: Kyungmin Park
Date: Tue, 31 May 2011 10:04:09 +0200
Subject: CFQ: Fix typo and remove unnecessary semicolon

Fix comment typo and remove unnecessary semicolon at macro

Signed-off-by: Kyungmin Park
Signed-off-by: Jens Axboe
---
 block/cfq-iosched.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 7c52d6888924..8a02c955c610 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -185,7 +185,7 @@ struct cfq_group {
         int nr_cfqq;

         /*
-         * Per group busy queus average. Useful for workload slice calc. We
+         * Per group busy queues average. Useful for workload slice calc. We
         * create the array for each prio class but at run time it is used
         * only for RT and BE class and slot for IDLE class remains unused.
         * This is primarily done to avoid confusion and a gcc warning.
@@ -369,16 +369,16 @@ CFQ_CFQQ_FNS(wait_busy);
 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
         blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
                 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
-                blkg_path(&(cfqq)->cfqg->blkg), ##args);
+                blkg_path(&(cfqq)->cfqg->blkg), ##args)

 #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \
         blk_add_trace_msg((cfqd)->queue, "%s " fmt, \
-                blkg_path(&(cfqg)->blkg), ##args); \
+                blkg_path(&(cfqg)->blkg), ##args) \

 #else
 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
         blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
-#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0);
+#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0)
 #endif
 #define cfq_log(cfqd, fmt, args...) \
         blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)

-- cgit v1.2.3

From 28304f485c3627cc5e1665b92e26eb7fcfe98088 Mon Sep 17 00:00:00 2001
From: Paul Bolle
Date: Thu, 2 Jun 2011 13:05:02 +0200
Subject: cfq-iosched: Remove bogus check in queue_fail path

queue_fail can only be reached if cic is NULL, so its check for cic
must be bogus.

Signed-off-by: Paul Bolle
Signed-off-by: Jens Axboe
---
 block/cfq-iosched.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 8a02c955c610..3c7b537bf908 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -3786,9 +3786,6 @@ new_queue:
         return 0;

 queue_fail:
-        if (cic)
-                put_io_context(cic->ioc);
-
         cfq_schedule_dispatch(cfqd);
         spin_unlock_irqrestore(q->queue_lock, flags);
         cfq_log(cfqd, "set_request fail");

-- cgit v1.2.3

From e2bd9678fc0085acf540dc4cb48ff961cd4d88c0 Mon Sep 17 00:00:00 2001
From: Paul Bolle
Date: Thu, 2 Jun 2011 13:05:02 +0200
Subject: block: Use hlist_entry() for io_context.cic_list.first

list_entry() and hlist_entry() are both simply aliases for
container_of(), but since io_context.cic_list.first is an hlist_node
one should at least use the correct alias.
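Both helpers expand to container_of(), so the change is purely about using the accessor that matches the field's type. A stand-alone sketch of that relationship, using simplified stand-ins for the kernel's list types rather than the real <linux/list.h> definitions:

    #include <stddef.h>
    #include <stdio.h>

    /* Simplified stand-ins for the kernel's hlist types. */
    struct hlist_node { struct hlist_node *next, **pprev; };
    struct hlist_head { struct hlist_node *first; };

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    /* Both are thin wrappers around container_of(); hlist_entry() merely
     * documents that the pointer being converted is an hlist_node. */
    #define list_entry(ptr, type, member)   container_of(ptr, type, member)
    #define hlist_entry(ptr, type, member)  container_of(ptr, type, member)

    /* Toy structure standing in for cfq_io_context. */
    struct cfq_io_context {
        int data;
        struct hlist_node cic_list;
    };

    int main(void)
    {
        struct cfq_io_context cic = { .data = 42 };
        struct hlist_head head = { .first = &cic.cic_list };

        /* Identical results; only the spelling differs. */
        printf("%d %d\n",
               hlist_entry(head.first, struct cfq_io_context, cic_list)->data,
               list_entry(head.first, struct cfq_io_context, cic_list)->data);
        return 0;
    }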
Signed-off-by: Paul Bolle
Signed-off-by: Jens Axboe
---
 block/blk-ioc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index c898049dafd5..342eae9b0d3c 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -21,7 +21,7 @@ static void cfq_dtor(struct io_context *ioc)
         if (!hlist_empty(&ioc->cic_list)) {
                 struct cfq_io_context *cic;

-                cic = list_entry(ioc->cic_list.first, struct cfq_io_context,
+                cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context,
                                 cic_list);
                 cic->dtor(ioc);
         }
@@ -57,7 +57,7 @@ static void cfq_exit(struct io_context *ioc)
         if (!hlist_empty(&ioc->cic_list)) {
                 struct cfq_io_context *cic;

-                cic = list_entry(ioc->cic_list.first, struct cfq_io_context,
+                cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context,
                                 cic_list);
                 cic->exit(ioc);
         }

-- cgit v1.2.3

From ab4bd22d3cce6977dc039664cc2d052e3147d662 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Sun, 5 Jun 2011 06:01:13 +0200
Subject: cfq-iosched: fix locking around ioc->ioc_data assignment

Since we are modifying this RCU pointer, we need to hold the lock
protecting it around it. This fixes a potential reuse and double free
of a cfq io_context structure. The bug has been in CFQ for a long
time, it hit very few people but those it did hit seemed to see it a
lot.

Tracked in RH bugzilla here:

https://bugzilla.redhat.com/show_bug.cgi?id=577968

Credit goes to Paul Bolle for figuring out that the issue was around
the one-hit ioc->ioc_data cache. Thanks to his hard work the issue is
now fixed.

Cc: stable@kernel.org
Signed-off-by: Jens Axboe
---
 block/cfq-iosched.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 3c7b537bf908..545b8d4c829d 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -2772,8 +2772,11 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
         smp_wmb();
         cic->key = cfqd_dead_key(cfqd);

-        if (ioc->ioc_data == cic)
+        if (rcu_dereference(ioc->ioc_data) == cic) {
+                spin_lock(&ioc->lock);
                 rcu_assign_pointer(ioc->ioc_data, NULL);
+                spin_unlock(&ioc->lock);
+        }

         if (cic->cfqq[BLK_RW_ASYNC]) {
                 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);

-- cgit v1.2.3

From a9dce2a3b4f0686dd66cb44d4826a59508bce969 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Thu, 9 Jun 2011 20:43:54 +0200
Subject: block: don't use non-syncing event blocking in disk_check_events()

This patch is part of the fix for the triggering of WARN_ON_ONCE() in
disk_clear_events() reported in bug #34662.

https://bugzilla.kernel.org/show_bug.cgi?id=34662

disk_clear_events() blocks events, schedules and flushes the event
work. It expects the work to have started execution on schedule and
finished on return from flush. WARN_ON_ONCE() triggers if the event
work hasn't executed as expected. This problem happens because
__disk_block_events() fails to guarantee that the event work item is
not in flight on return from the function in a race-free manner.

The problem is two-fold and this patch addresses one of them.

When __disk_block_events() is called with @sync == %false, it bumps
the event block count, calls cancel_delayed_work() and returns. This
makes it impossible to guarantee that event polling is not in flight
on return from syncing __disk_block_events() - if the first blocker
was non-syncing, polling could still be in progress and later syncing
ones would assume that the first blocker already canceled it.
Making __disk_block_events() cancel_sync regardless of block count
isn't feasible either as it may race with forced event checking in
disk_clear_events().

As disk_check_events() is the only user of non-syncing
__disk_block_events(), updating it to directly cancel and schedule
event work is the easiest way to solve the issue.

Note that there's another bug in __disk_block_events() and this patch
doesn't fix the issue completely. A later patch will fix the other
bug.

Signed-off-by: Tejun Heo
Tested-by: Sitsofe Wheeler
Reported-by: Sitsofe Wheeler
Reported-by: Borislav Petkov
Reported-by: Meelis Roos
Reported-by: Linus Torvalds
Cc: Andrew Morton
Cc: Jens Axboe
Cc: Kay Sievers
Signed-off-by: Jens Axboe
---
 block/genhd.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'block')

diff --git a/block/genhd.c b/block/genhd.c
index 95822ae25cfe..3f0933077642 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1508,10 +1508,18 @@ void disk_unblock_events(struct gendisk *disk)
  */
 void disk_check_events(struct gendisk *disk)
 {
-        if (disk->ev) {
-                __disk_block_events(disk, false);
-                __disk_unblock_events(disk, true);
+        struct disk_events *ev = disk->ev;
+        unsigned long flags;
+
+        if (!ev)
+                return;
+
+        spin_lock_irqsave(&ev->lock, flags);
+        if (!ev->block) {
+                cancel_delayed_work(&ev->dwork);
+                queue_delayed_work(system_nrt_wq, &ev->dwork, 0);
         }
+        spin_unlock_irqrestore(&ev->lock, flags);
 }

 EXPORT_SYMBOL_GPL(disk_check_events);

-- cgit v1.2.3

From c3af54afbac3675337cedf326b7b127ffa7f7327 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Thu, 9 Jun 2011 20:43:55 +0200
Subject: block: remove non-syncing __disk_block_events() and fold it into disk_block_events()

After the previous update to disk_check_events(), nobody is using
non-syncing __disk_block_events(). Remove @sync and, as this makes
__disk_block_events() virtually identical to disk_block_events(),
remove the underscore prefixed version.

Signed-off-by: Tejun Heo
Cc: Jens Axboe
Signed-off-by: Jens Axboe
---
 block/genhd.c | 55 ++++++++++++++++++++++++-------------------------------
 1 file changed, 24 insertions(+), 31 deletions(-)

(limited to 'block')

diff --git a/block/genhd.c b/block/genhd.c
index 3f0933077642..ab0731d8976d 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1414,22 +1414,36 @@ static unsigned long disk_events_poll_jiffies(struct gendisk *disk)
         return msecs_to_jiffies(intv_msecs);
 }

-static void __disk_block_events(struct gendisk *disk, bool sync)
+/**
+ * disk_block_events - block and flush disk event checking
+ * @disk: disk to block events for
+ *
+ * On return from this function, it is guaranteed that event checking
+ * isn't in progress and won't happen until unblocked by
+ * disk_unblock_events(). Events blocking is counted and the actual
+ * unblocking happens after the matching number of unblocks are done.
+ *
+ * Note that this intentionally does not block event checking from
+ * disk_clear_events().
+ *
+ * CONTEXT:
+ * Might sleep.
+ */
+void disk_block_events(struct gendisk *disk)
 {
         struct disk_events *ev = disk->ev;
         unsigned long flags;
         bool cancel;

+        if (!ev)
+                return;
+
         spin_lock_irqsave(&ev->lock, flags);
         cancel = !ev->block++;
         spin_unlock_irqrestore(&ev->lock, flags);

-        if (cancel) {
-                if (sync)
-                        cancel_delayed_work_sync(&disk->ev->dwork);
-                else
-                        cancel_delayed_work(&disk->ev->dwork);
-        }
+        if (cancel)
+                cancel_delayed_work_sync(&disk->ev->dwork);
 }

 static void __disk_unblock_events(struct gendisk *disk, bool check_now)
@@ -1460,27 +1474,6 @@ out_unlock:
         spin_unlock_irqrestore(&ev->lock, flags);
 }

-/**
- * disk_block_events - block and flush disk event checking
- * @disk: disk to block events for
- *
- * On return from this function, it is guaranteed that event checking
- * isn't in progress and won't happen until unblocked by
- * disk_unblock_events(). Events blocking is counted and the actual
- * unblocking happens after the matching number of unblocks are done.
- *
- * Note that this intentionally does not block event checking from
- * disk_clear_events().
- *
- * CONTEXT:
- * Might sleep.
- */
-void disk_block_events(struct gendisk *disk)
-{
-        if (disk->ev)
-                __disk_block_events(disk, true);
-}
-
 /**
  * disk_unblock_events - unblock disk event checking
  * @disk: disk to unblock events for
@@ -1554,7 +1547,7 @@ unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask)
         spin_unlock_irq(&ev->lock);

         /* uncondtionally schedule event check and wait for it to finish */
-        __disk_block_events(disk, true);
+        disk_block_events(disk);
         queue_delayed_work(system_nrt_wq, &ev->dwork, 0);
         flush_delayed_work(&ev->dwork);
         __disk_unblock_events(disk, false);
@@ -1672,7 +1665,7 @@ static ssize_t disk_events_poll_msecs_store(struct device *dev,
         if (intv < 0 && intv != -1)
                 return -EINVAL;

-        __disk_block_events(disk, true);
+        disk_block_events(disk);
         disk->ev->poll_msecs = intv;
         __disk_unblock_events(disk, true);

@@ -1778,7 +1771,7 @@ static void disk_del_events(struct gendisk *disk)
         if (!disk->ev)
                 return;

-        __disk_block_events(disk, true);
+        disk_block_events(disk);
         mutex_lock(&disk_events_mutex);
         list_del_init(&disk->ev->node);

-- cgit v1.2.3

From fdd514e16bb2531c0c61ae8a1f87740ce217f630 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Thu, 9 Jun 2011 20:43:59 +0200
Subject: block: make disk_block_events() properly wait for work cancellation

disk_block_events() should guarantee that the event work is not in
flight on return and once blocked it shouldn't issue further
cancellations.

Because there was no synchronization between the first blocker doing
cancel_delayed_work_sync() and the following blockers, the following
blockers could finish before cancellation was complete, which broke
both guarantees - event work could be in flight and cancellation could
happen after return.

This bug triggered WARN_ON_ONCE() in disk_clear_events() reported in
bug#34662.

https://bugzilla.kernel.org/show_bug.cgi?id=34662

Fix it by adding an outer mutex which protects both block count
manipulation and work cancellation.

-v2: Use outer mutex instead of bit waitqueue per Linus.
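The shape of the fix is easy to model outside the kernel: taking an outer mutex around the whole block-and-cancel sequence means a later blocker cannot return while the first blocker is still waiting for the work to cancel. A rough user-space sketch with pthread mutexes standing in for the kernel mutex and spinlock, and the synchronous work cancellation stubbed out:

    #include <pthread.h>
    #include <stdbool.h>

    /* Loosely modelled on struct disk_events; not the kernel structure. */
    struct events {
        pthread_mutex_t block_mutex;  /* outer mutex, serializes blockers */
        pthread_mutex_t lock;         /* stands in for the ev->lock spinlock */
        int block;                    /* event blocking depth */
    };

    /* Stub for cancel_delayed_work_sync(): would wait for in-flight work. */
    static void cancel_work_sync(struct events *ev)
    {
        (void)ev;
    }

    static void block_events(struct events *ev)
    {
        bool cancel;

        /* Any later blocker queues here until the first blocker is done. */
        pthread_mutex_lock(&ev->block_mutex);

        pthread_mutex_lock(&ev->lock);
        cancel = (ev->block++ == 0);  /* only the 0 -> 1 transition cancels */
        pthread_mutex_unlock(&ev->lock);

        if (cancel)
            cancel_work_sync(ev);

        pthread_mutex_unlock(&ev->block_mutex);
    }

    int main(void)
    {
        struct events ev = {
            .block_mutex = PTHREAD_MUTEX_INITIALIZER,
            .lock = PTHREAD_MUTEX_INITIALIZER,
        };

        block_events(&ev);  /* first blocker: cancels under the mutex */
        block_events(&ev);  /* nested blocker: only bumps the count */
        return ev.block != 2;
    }

Without the outer mutex, the second call could return while the first was still inside the cancellation, which is exactly the window the patch closes.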
Signed-off-by: Tejun Heo
Tested-by: Sitsofe Wheeler
Reported-by: Sitsofe Wheeler
Reported-by: Borislav Petkov
Reported-by: Meelis Roos
Reported-by: Linus Torvalds
Cc: Andrew Morton
Cc: Jens Axboe
Cc: Kay Sievers
Signed-off-by: Jens Axboe
---
 block/genhd.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'block')

diff --git a/block/genhd.c b/block/genhd.c
index ab0731d8976d..3608289c8ecd 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1371,6 +1371,7 @@ struct disk_events {
         struct gendisk *disk;           /* the associated disk */
         spinlock_t lock;

+        struct mutex block_mutex;       /* protects blocking */
         int block;                      /* event blocking depth */
         unsigned int pending;           /* events already sent out */
         unsigned int clearing;          /* events being cleared */
@@ -1438,12 +1439,20 @@ void disk_block_events(struct gendisk *disk)
         if (!ev)
                 return;

+        /*
+         * Outer mutex ensures that the first blocker completes canceling
+         * the event work before further blockers are allowed to finish.
+         */
+        mutex_lock(&ev->block_mutex);
+
         spin_lock_irqsave(&ev->lock, flags);
         cancel = !ev->block++;
         spin_unlock_irqrestore(&ev->lock, flags);

         if (cancel)
                 cancel_delayed_work_sync(&disk->ev->dwork);
+
+        mutex_unlock(&ev->block_mutex);
 }

 static void __disk_unblock_events(struct gendisk *disk, bool check_now)
@@ -1751,6 +1760,7 @@ static void disk_add_events(struct gendisk *disk)
         INIT_LIST_HEAD(&ev->node);
         ev->disk = disk;
         spin_lock_init(&ev->lock);
+        mutex_init(&ev->block_mutex);
         ev->block = 1;
         ev->poll_msecs = -1;
         INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn);

-- cgit v1.2.3

From 08e8138adebdd511e0955e8d6c051904bb4082af Mon Sep 17 00:00:00 2001
From: Joe Perches
Date: Mon, 13 Jun 2011 10:42:49 +0200
Subject: block: Add __attribute__((format(printf...) and fix fallout

Use the compiler to verify format strings and arguments.

Fix fallout.

Signed-off-by: Joe Perches
Signed-off-by: Jens Axboe
---
 block/blk-throttle.c |  4 ++--
 block/cfq-iosched.c  | 11 ++++++-----
 2 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'block')

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index a62be8d0dc1b..3689f833afdc 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -927,7 +927,7 @@ static int throtl_dispatch(struct request_queue *q)

         bio_list_init(&bio_list_on_stack);

-        throtl_log(td, "dispatch nr_queued=%lu read=%u write=%u",
+        throtl_log(td, "dispatch nr_queued=%d read=%u write=%u",
                         total_nr_queued(td), td->nr_queued[READ],
                         td->nr_queued[WRITE]);

@@ -1204,7 +1204,7 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop)
         }

 queue_bio:
-        throtl_log_tg(td, tg, "[%c] bio. bdisp=%u sz=%u bps=%llu"
+        throtl_log_tg(td, tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu"
                         " iodisp=%u iops=%u queued=%d/%d",
                         rw == READ ? 'R' : 'W',
                         tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 545b8d4c829d..f3799432676d 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -988,9 +988,10 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,

         cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
                         st->min_vdisktime);
-        cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
-                        " sect=%u", used_sl, cfqq->slice_dispatch, charge,
-                        iops_mode(cfqd), cfqq->nr_sectors);
+        cfq_log_cfqq(cfqq->cfqd, cfqq,
+                "sl_used=%u disp=%u charge=%u iops=%u sect=%lu",
+                used_sl, cfqq->slice_dispatch, charge,
+                iops_mode(cfqd), cfqq->nr_sectors);
         cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl,
                         unaccounted_sl);
         cfq_blkiocg_set_start_empty_time(&cfqg->blkg);
@@ -2023,8 +2024,8 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
          */
         if (sample_valid(cic->ttime_samples) &&
             (cfqq->slice_end - jiffies < cic->ttime_mean)) {
-                cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%d",
-                                cic->ttime_mean);
+                cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%lu",
+                                cic->ttime_mean);
                 return;
         }

-- cgit v1.2.3

From 3181faa85bda3dc3f5e630a1846526c9caaa38e3 Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Mon, 27 Jun 2011 09:03:47 +0200
Subject: cfq-iosched: fix an RCU warning

I got an RCU warning at boot: ioc->ioc_data is rcu_dereference()d
without holding rcu_read_lock().

Signed-off-by: Shaohua Li
Cc: stable@kernel.org # after ab4bd22d
Signed-off-by: Jens Axboe
---
 block/cfq-iosched.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index f3799432676d..fd566c1e2617 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -2773,11 +2773,14 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
         smp_wmb();
         cic->key = cfqd_dead_key(cfqd);

+        rcu_read_lock();
         if (rcu_dereference(ioc->ioc_data) == cic) {
+                rcu_read_unlock();
                 spin_lock(&ioc->lock);
                 rcu_assign_pointer(ioc->ioc_data, NULL);
                 spin_unlock(&ioc->lock);
-        }
+        } else
+                rcu_read_unlock();

         if (cic->cfqq[BLK_RW_ASYNC]) {
                 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);

-- cgit v1.2.3

From 726e99ab88db059fe1422e15376ae404f8c66eb4 Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Mon, 27 Jun 2011 09:03:48 +0200
Subject: cfq-iosched: make code consistent

ioc->ioc_data is RCU protected, so use the correct API to access it.
This doesn't change any behavior, it just makes the code consistent.

Signed-off-by: Shaohua Li
Cc: stable@kernel.org # after ab4bd22d
Signed-off-by: Jens Axboe
---
 block/cfq-iosched.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index fd566c1e2617..ae21919f15e1 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -3087,7 +3087,8 @@ cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc,

         spin_lock_irqsave(&ioc->lock, flags);

-        BUG_ON(ioc->ioc_data == cic);
+        BUG_ON(rcu_dereference_check(ioc->ioc_data,
+                lockdep_is_held(&ioc->lock)) == cic);

         radix_tree_delete(&ioc->radix_root, cfqd->cic_index);
         hlist_del_rcu(&cic->cic_list);

-- cgit v1.2.3
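Taken together with the earlier ioc->ioc_data locking fix, these two patches leave the pointer accessed by the usual RCU rules: readers dereference it only under rcu_read_lock(), the writer clears it under ioc->lock, and rcu_dereference_check() documents that holding that lock is the other legitimate way to look at it. A compilable user-space sketch of the pattern, with no-op stand-ins for the RCU and locking primitives (the real ones in <linux/rcupdate.h> add barriers plus lockdep and sparse checking):

    #include <stdio.h>

    /* No-op stand-ins so the sketch builds outside the kernel. */
    #define rcu_read_lock()               do { } while (0)
    #define rcu_read_unlock()             do { } while (0)
    #define rcu_dereference(p)            (p)
    #define rcu_dereference_check(p, c)   ((void)(c), (p))
    #define rcu_assign_pointer(p, v)      ((p) = (v))
    #define lockdep_is_held(l)            ((void)(l), 1)
    #define spin_lock(l)                  ((void)(l))
    #define spin_unlock(l)                ((void)(l))

    struct cfq_io_context { int dummy; };

    struct io_context {
        int lock;                         /* models the ioc->lock spinlock */
        struct cfq_io_context *ioc_data;  /* RCU-protected one-hit cache */
    };

    /* Mirrors __cfq_exit_single_io_context() after the fixes above:
     * peek under rcu_read_lock(), then clear the cache under ioc->lock. */
    static void drop_cached_cic(struct io_context *ioc, struct cfq_io_context *cic)
    {
        rcu_read_lock();
        if (rcu_dereference(ioc->ioc_data) == cic) {
            rcu_read_unlock();
            spin_lock(&ioc->lock);
            rcu_assign_pointer(ioc->ioc_data, NULL);
            spin_unlock(&ioc->lock);
        } else
            rcu_read_unlock();
    }

    /* Mirrors the BUG_ON() in cfq_drop_dead_cic(): the caller holds
     * ioc->lock, so rcu_dereference_check() is the documented accessor. */
    static int cache_points_at(struct io_context *ioc, struct cfq_io_context *cic)
    {
        return rcu_dereference_check(ioc->ioc_data,
                                     lockdep_is_held(&ioc->lock)) == cic;
    }

    int main(void)
    {
        struct cfq_io_context cic = { 0 };
        struct io_context ioc = { .ioc_data = &cic };

        printf("cached before drop: %d\n", cache_points_at(&ioc, &cic));
        drop_cached_cic(&ioc, &cic);
        printf("cached after drop:  %d\n", cache_points_at(&ioc, &cic));
        return 0;
    }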