From f36f21ecca9ee688301174e5f2e0827827a7a7ff Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 12 May 2008 14:02:33 -0700 Subject: Fix misuses of bdevname() bdevname() fills the buffer that it is given as a parameter, so calling strcpy() or snprintf() on the returned value is redundant (and probably not guaranteed to work - I don't think strcpy and snprintf support overlapping buffers.) Signed-off-by: Jean Delvare Cc: Stephen Tweedie Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/blktrace.c | 2 +- block/compat_ioctl.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blktrace.c b/block/blktrace.c index 568588cd16b2..b2cbb4e5d767 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -476,7 +476,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) switch (cmd) { case BLKTRACESETUP: - strcpy(b, bdevname(bdev, b)); + bdevname(bdev, b); ret = blk_trace_setup(q, b, bdev->bd_dev, arg); break; case BLKTRACESTART: diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index c70d0b6f666f..c23177e4623f 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -555,7 +555,7 @@ static int compat_blk_trace_setup(struct block_device *bdev, char __user *arg) if (copy_from_user(&cbuts, arg, sizeof(cbuts))) return -EFAULT; - strcpy(b, bdevname(bdev, b)); + bdevname(bdev, b); buts = (struct blk_user_trace_setup) { .act_mask = cbuts.act_mask, -- cgit v1.2.3 From 30f2f0eb4bd2c43d10a8b0d872c6e5ad8f31c9a0 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Tue, 6 May 2008 22:31:33 +0200 Subject: block: do_mounts - accept root= Some devices, like md, may create partitions only at first access, so allow root= to be set to a valid non-existant partition of an existing disk. This applies only to non-initramfs root mounting. This fixes a regression from 2.6.24 which did allow this to happen and broke some users machines :( Acked-by: Neil Brown Tested-by: Joao Luis Meloni Assirati Cc: stable Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- block/genhd.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index fda9c7a63c29..129ad939f9dd 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -653,7 +653,7 @@ void genhd_media_change_notify(struct gendisk *disk) EXPORT_SYMBOL_GPL(genhd_media_change_notify); #endif /* 0 */ -dev_t blk_lookup_devt(const char *name) +dev_t blk_lookup_devt(const char *name, int part) { struct device *dev; dev_t devt = MKDEV(0, 0); @@ -661,7 +661,11 @@ dev_t blk_lookup_devt(const char *name) mutex_lock(&block_class_lock); list_for_each_entry(dev, &block_class.devices, node) { if (strcmp(dev->bus_id, name) == 0) { - devt = dev->devt; + struct gendisk *disk = dev_to_disk(dev); + + if (part < disk->minors) + devt = MKDEV(MAJOR(dev->devt), + MINOR(dev->devt) + part); break; } } @@ -669,7 +673,6 @@ dev_t blk_lookup_devt(const char *name) return devt; } - EXPORT_SYMBOL(blk_lookup_devt); struct gendisk *alloc_disk(int minors) -- cgit v1.2.3 From e7e72bf641b1fc7b9df6f40bd2c36dfccd8d647c Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Wed, 14 May 2008 16:05:54 -0700 Subject: Remove blkdev warning triggered by using md As setting and clearing queue flags now requires that we hold a spinlock on the queue, and as blk_queue_stack_limits is called without that lock, get the lock inside blk_queue_stack_limits. For blk_queue_stack_limits to be able to find the right lock, each md personality needs to set q->queue_lock to point to the appropriate lock. Those personalities which didn't previously use a spin_lock, us q->__queue_lock. So always initialise that lock when allocated. With this in place, setting/clearing of the QUEUE_FLAG_PLUGGED bit will no longer cause warnings as it will be clear that the proper lock is held. Thanks to Dan Williams for review and fixing the silly bugs. Signed-off-by: NeilBrown Cc: Dan Williams Cc: Jens Axboe Cc: Alistair John Strachan Cc: Nick Piggin Cc: "Rafael J. Wysocki" Cc: Jacek Luczak Cc: Prakash Punnoor Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/blk-core.c | 5 ++--- block/blk-settings.c | 8 +++++++- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 2987fe47b5ee..6a9cc0d22a61 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -482,6 +482,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) kobject_init(&q->kobj, &blk_queue_ktype); mutex_init(&q->sysfs_lock); + spin_lock_init(&q->__queue_lock); return q; } @@ -544,10 +545,8 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) * if caller didn't supply a lock, they get per-queue locking with * our embedded lock */ - if (!lock) { - spin_lock_init(&q->__queue_lock); + if (!lock) lock = &q->__queue_lock; - } q->request_fn = rfn; q->prep_rq_fn = NULL; diff --git a/block/blk-settings.c b/block/blk-settings.c index bb93d4c32775..8dd86418f35d 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -286,8 +286,14 @@ void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b) t->max_hw_segments = min(t->max_hw_segments, b->max_hw_segments); t->max_segment_size = min(t->max_segment_size, b->max_segment_size); t->hardsect_size = max(t->hardsect_size, b->hardsect_size); - if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) + if (!t->queue_lock) + WARN_ON_ONCE(1); + else if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) { + unsigned long flags; + spin_lock_irqsave(t->queue_lock, flags); queue_flag_clear(QUEUE_FLAG_CLUSTER, t); + spin_unlock_irqrestore(t->queue_lock, flags); + } } EXPORT_SYMBOL(blk_queue_stack_limits); -- cgit v1.2.3 From 05caf8dbc1880415df3378cfd114d832c9618b60 Mon Sep 17 00:00:00 2001 From: "Zhang, Yanmin" Date: Thu, 22 May 2008 15:13:29 +0200 Subject: block: Move the second call to get_request to the end of the loop In function get_request_wait, the second call to get_request could be moved to the end of the while loop, because if the first call to get_request fails, the second call will fail without sleep. Signed-off-by: Zhang Yanmin Signed-off-by: Jens Axboe --- block/blk-core.c | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 6a9cc0d22a61..1905aaba49fb 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -806,35 +806,32 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags, rq = get_request(q, rw_flags, bio, GFP_NOIO); while (!rq) { DEFINE_WAIT(wait); + struct io_context *ioc; struct request_list *rl = &q->rq; prepare_to_wait_exclusive(&rl->wait[rw], &wait, TASK_UNINTERRUPTIBLE); - rq = get_request(q, rw_flags, bio, GFP_NOIO); - - if (!rq) { - struct io_context *ioc; + blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ); - blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ); - - __generic_unplug_device(q); - spin_unlock_irq(q->queue_lock); - io_schedule(); + __generic_unplug_device(q); + spin_unlock_irq(q->queue_lock); + io_schedule(); - /* - * After sleeping, we become a "batching" process and - * will be able to allocate at least one request, and - * up to a big batch of them for a small period time. - * See ioc_batching, ioc_set_batching - */ - ioc = current_io_context(GFP_NOIO, q->node); - ioc_set_batching(q, ioc); + /* + * After sleeping, we become a "batching" process and + * will be able to allocate at least one request, and + * up to a big batch of them for a small period time. + * See ioc_batching, ioc_set_batching + */ + ioc = current_io_context(GFP_NOIO, q->node); + ioc_set_batching(q, ioc); - spin_lock_irq(q->queue_lock); - } + spin_lock_irq(q->queue_lock); finish_wait(&rl->wait[rw], &wait); - } + + rq = get_request(q, rw_flags, bio, GFP_NOIO); + }; return rq; } -- cgit v1.2.3 From be754d2c2161c0cce11d62727016985ecb76831b Mon Sep 17 00:00:00 2001 From: Richard Kennedy Date: Fri, 23 May 2008 06:52:00 +0200 Subject: block: reorder cfq_queue to save space on 64bit builds saves 8 bytes of padding & increases objects/slab from 30 to 32 on my AMD64 config Signed-off-by: Richard Kennedy Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index b399c62936e0..4df3f0522435 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -124,6 +124,8 @@ struct cfq_data { struct cfq_queue { /* reference count */ atomic_t ref; + /* various state flags, see below */ + unsigned int flags; /* parent cfq_data */ struct cfq_data *cfqd; /* service_tree member */ @@ -138,14 +140,14 @@ struct cfq_queue { int queued[2]; /* currently allocated requests */ int allocated[2]; - /* pending metadata requests */ - int meta_pending; /* fifo list of requests in sort_list */ struct list_head fifo; unsigned long slice_end; long slice_resid; + /* pending metadata requests */ + int meta_pending; /* number of requests that are on the dispatch list or inside driver */ int dispatched; @@ -153,8 +155,6 @@ struct cfq_queue { unsigned short ioprio, org_ioprio; unsigned short ioprio_class, org_ioprio_class; - /* various state flags, see below */ - unsigned int flags; }; enum cfqq_state_flags { -- cgit v1.2.3 From 9d5f09a424a67ddb959829894efb4c71cbf6d600 Mon Sep 17 00:00:00 2001 From: "Alan D. Brunelle" Date: Tue, 27 May 2008 14:54:41 +0200 Subject: Added in MESSAGE notes for blktraces Allows messages to be inserted into blktrace streams. Signed-off-by: Alan D. Brunelle Signed-off-by: Jens Axboe --- block/blktrace.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'block') diff --git a/block/blktrace.c b/block/blktrace.c index b2cbb4e5d767..20e11f354f11 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -75,6 +75,20 @@ static void trace_note_time(struct blk_trace *bt) local_irq_restore(flags); } +void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) +{ + int n; + va_list args; + static char bt_msg_buf[BLK_TN_MAX_MSG]; + + va_start(args, fmt); + n = vscnprintf(bt_msg_buf, BLK_TN_MAX_MSG, fmt, args); + va_end(args); + + trace_note(bt, 0, BLK_TN_MESSAGE, bt_msg_buf, n); +} +EXPORT_SYMBOL_GPL(__trace_note_message); + static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, pid_t pid) { -- cgit v1.2.3 From 4722dc52a891ab6cb2d637ddb87233e0ce277827 Mon Sep 17 00:00:00 2001 From: "Alan D. Brunelle" Date: Tue, 27 May 2008 14:55:00 +0200 Subject: Added in elevator switch message to blktrace stream Signed-off-by: Alan D. Brunelle Signed-off-by: Jens Axboe --- block/elevator.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index 980f8ae147b4..902dd1344d56 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -1110,6 +1110,8 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q); spin_unlock_irq(q->queue_lock); + blk_add_trace_msg(q, "elv switch: %s", e->elevator_type->elevator_name); + return 1; fail_register: -- cgit v1.2.3 From 64565911cdb57c2f512a9715b985b5617402cc67 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 28 May 2008 14:45:33 +0200 Subject: block: make blktrace use per-cpu buffers for message notes Currently it uses a single static char array, but that risks being corrupted when multiple users issue message notes at the same time. Make the buffers dynamically allocated when the trace is setup and make them per-cpu instead. The default max message size of 1k is also very large, the interface is mainly for small text notes. So shrink it to 128 bytes. Signed-off-by: Jens Axboe --- block/blktrace.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blktrace.c b/block/blktrace.c index 20e11f354f11..7ae87cc4a163 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -79,13 +79,16 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) { int n; va_list args; - static char bt_msg_buf[BLK_TN_MAX_MSG]; + char *buf; + preempt_disable(); + buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); va_start(args, fmt); - n = vscnprintf(bt_msg_buf, BLK_TN_MAX_MSG, fmt, args); + n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); va_end(args); - trace_note(bt, 0, BLK_TN_MESSAGE, bt_msg_buf, n); + trace_note(bt, 0, BLK_TN_MESSAGE, buf, n); + preempt_enable(); } EXPORT_SYMBOL_GPL(__trace_note_message); @@ -246,6 +249,7 @@ static void blk_trace_cleanup(struct blk_trace *bt) debugfs_remove(bt->dropped_file); blk_remove_tree(bt->dir); free_percpu(bt->sequence); + free_percpu(bt->msg_data); kfree(bt); } @@ -360,6 +364,10 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!bt->sequence) goto err; + bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG); + if (!bt->msg_data) + goto err; + ret = -ENOENT; dir = blk_create_tree(buts->name); if (!dir) @@ -406,6 +414,7 @@ err: if (bt->dropped_file) debugfs_remove(bt->dropped_file); free_percpu(bt->sequence); + free_percpu(bt->msg_data); if (bt->rchan) relay_close(bt->rchan); kfree(bt); -- cgit v1.2.3 From d6de8be711b28049a5cb93c954722c311c7d3f7f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 28 May 2008 14:46:59 +0200 Subject: cfq-iosched: fix RCU problem in cfq_cic_lookup() cfq_cic_lookup() needs to properly protect ioc->ioc_data before dereferencing it and also exclude updaters of ioc->ioc_data as well. Also add a number of comments documenting why the existing RCU usage is OK. Thanks a lot to "Paul E. McKenney" for review and comments! Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 4df3f0522435..d01b411c72f0 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1142,6 +1142,9 @@ static void cfq_put_queue(struct cfq_queue *cfqq) kmem_cache_free(cfq_pool, cfqq); } +/* + * Must always be called with the rcu_read_lock() held + */ static void __call_for_each_cic(struct io_context *ioc, void (*func)(struct io_context *, struct cfq_io_context *)) @@ -1197,6 +1200,11 @@ static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic) cfq_cic_free(cic); } +/* + * Must be called with rcu_read_lock() held or preemption otherwise disabled. + * Only two callers of this - ->dtor() which is called with the rcu_read_lock(), + * and ->trim() which is called with the task lock held + */ static void cfq_free_io_context(struct io_context *ioc) { /* @@ -1502,20 +1510,24 @@ static struct cfq_io_context * cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc) { struct cfq_io_context *cic; + unsigned long flags; void *k; if (unlikely(!ioc)) return NULL; + rcu_read_lock(); + /* * we maintain a last-hit cache, to avoid browsing over the tree */ cic = rcu_dereference(ioc->ioc_data); - if (cic && cic->key == cfqd) + if (cic && cic->key == cfqd) { + rcu_read_unlock(); return cic; + } do { - rcu_read_lock(); cic = radix_tree_lookup(&ioc->radix_root, (unsigned long) cfqd); rcu_read_unlock(); if (!cic) @@ -1524,10 +1536,13 @@ cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc) k = cic->key; if (unlikely(!k)) { cfq_drop_dead_cic(cfqd, ioc, cic); + rcu_read_lock(); continue; } + spin_lock_irqsave(&ioc->lock, flags); rcu_assign_pointer(ioc->ioc_data, cic); + spin_unlock_irqrestore(&ioc->lock, flags); break; } while (1); @@ -2134,6 +2149,10 @@ static void *cfq_init_queue(struct request_queue *q) static void cfq_slab_kill(void) { + /* + * Caller already ensured that pending RCU callbacks are completed, + * so we should have no busy allocations at this point. + */ if (cfq_pool) kmem_cache_destroy(cfq_pool); if (cfq_ioc_pool) @@ -2292,6 +2311,11 @@ static void __exit cfq_exit(void) ioc_gone = &all_gone; /* ioc_gone's update must be visible before reading ioc_count */ smp_wmb(); + + /* + * this also protects us from entering cfq_slab_kill() with + * pending RCU callbacks + */ if (elv_ioc_count_read(ioc_count)) wait_for_completion(ioc_gone); cfq_slab_kill(); -- cgit v1.2.3 From d5791d13b1d45542895104edf4b09476d5ad24b0 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 9 Jun 2008 10:06:24 -0700 Subject: Fix invalid access errors in blk_lookup_devt Commit 30f2f0eb4bd2c43d10a8b0d872c6e5ad8f31c9a0 ("block: do_mounts - accept root=") extended blk_lookup_devt() to be able to look up partitions that had not yet been registered, but in the process made the assumption that the '&block_class.devices' list only contains disk devices and that you can do 'dev_to_disk(dev)' on them. That isn't actually true. The block_class device list also contains the partitions we've discovered so far, and you can't just do a 'dev_to_disk()' on those. So make sure to only work on devices that block/genhd.c has registered itself, something we can test by checking the 'dev->type' member. This makes the loop in blk_lookup_devt() match the other such loops in this file. [ We may want to do an alternate version that knows to handle _either_ whole-disk devices or partitions, but for now this is the minimal fix for a series of crashes reported by Mariusz Kozlowski in http://lkml.org/lkml/2008/5/25/25 and Ingo in http://lkml.org/lkml/2008/6/9/39 ] Reported-by: Mariusz Kozlowski Reported-by: Ingo Molnar Cc: Neil Brown Cc: Joao Luis Meloni Assirati Acked-by: Kay Sievers Cc: Greg Kroah-Hartman Signed-off-by: Linus Torvalds --- block/genhd.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 129ad939f9dd..b922d4801c87 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -660,6 +660,8 @@ dev_t blk_lookup_devt(const char *name, int part) mutex_lock(&block_class_lock); list_for_each_entry(dev, &block_class.devices, node) { + if (dev->type != &disk_type) + continue; if (strcmp(dev->bus_id, name) == 0) { struct gendisk *disk = dev_to_disk(dev); -- cgit v1.2.3 From 14a73f54798f39854e521fb596da7d50b7566bbd Mon Sep 17 00:00:00 2001 From: Carl Henrik Lunde Date: Thu, 12 Jun 2008 20:13:58 +0200 Subject: block: disable IRQs until data is written to relay channel As we may run relay_reserve from interrupt context we must always disable IRQs. This is because a call to relay_reserve may expose previously written data to use space. Updated new message code and an old but related comment. Signed-off-by: Carl Henrik Lunde Signed-off-by: Jens Axboe Signed-off-by: Linus Torvalds --- block/blktrace.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blktrace.c b/block/blktrace.c index 7ae87cc4a163..8d3a27780260 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -79,16 +79,17 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) { int n; va_list args; + unsigned long flags; char *buf; - preempt_disable(); + local_irq_save(flags); buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); va_start(args, fmt); n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); va_end(args); trace_note(bt, 0, BLK_TN_MESSAGE, buf, n); - preempt_enable(); + local_irq_restore(flags); } EXPORT_SYMBOL_GPL(__trace_note_message); @@ -158,10 +159,7 @@ void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, /* * A word about the locking here - we disable interrupts to reserve * some space in the relay per-cpu buffer, to prevent an irq - * from coming in and stepping on our toes. Once reserved, it's - * enough to get preemption disabled to prevent read of this data - * before we are through filling it. get_cpu()/put_cpu() does this - * for us + * from coming in and stepping on our toes. */ local_irq_save(flags); -- cgit v1.2.3