diff options
Diffstat (limited to 'drivers/md')
38 files changed, 439 insertions, 205 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 02a5345a44a6..197e29d1c2e6 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -357,6 +357,7 @@ config DM_LOG_USERSPACE config DM_RAID tristate "RAID 1/4/5/6/10 target" depends on BLK_DEV_DM + select MD_RAID0 select MD_RAID1 select MD_RAID10 select MD_RAID456 diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index ca4abe1ccd8d..537903bf9add 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -404,7 +404,8 @@ long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait) finish_wait(&ca->set->bucket_wait, &w); out: - wake_up_process(ca->alloc_thread); + if (ca->alloc_thread) + wake_up_process(ca->alloc_thread); trace_bcache_alloc(ca, reserve); @@ -476,7 +477,7 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve, if (b == -1) goto err; - k->ptr[i] = PTR(ca->buckets[b].gen, + k->ptr[i] = MAKE_PTR(ca->buckets[b].gen, bucket_to_sector(c, b), ca->sb.nr_this_dev); diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 6b420a55c745..02619cabda8b 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -333,6 +333,7 @@ struct cached_dev { /* Limit number of writeback bios in flight */ struct semaphore in_flight; struct task_struct *writeback_thread; + struct workqueue_struct *writeback_write_wq; struct keybuf writeback_keys; @@ -425,7 +426,7 @@ struct cache { * until a gc finishes - otherwise we could pointlessly burn a ton of * cpu */ - unsigned invalidate_needs_gc:1; + unsigned invalidate_needs_gc; bool discard; /* Get rid of? */ @@ -593,8 +594,8 @@ struct cache_set { /* Counts how many sectors bio_insert has added to the cache */ atomic_t sectors_to_gc; + wait_queue_head_t gc_wait; - wait_queue_head_t moving_gc_wait; struct keybuf moving_gc_keys; /* Number of moving GC bios in flight */ struct semaphore moving_in_flight; diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 81d3db40cd7b..2efdce07247c 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1757,32 +1757,34 @@ static void bch_btree_gc(struct cache_set *c) bch_moving_gc(c); } -static int bch_gc_thread(void *arg) +static bool gc_should_run(struct cache_set *c) { - struct cache_set *c = arg; struct cache *ca; unsigned i; - while (1) { -again: - bch_btree_gc(c); + for_each_cache(ca, c, i) + if (ca->invalidate_needs_gc) + return true; - set_current_state(TASK_INTERRUPTIBLE); - if (kthread_should_stop()) - break; + if (atomic_read(&c->sectors_to_gc) < 0) + return true; - mutex_lock(&c->bucket_lock); + return false; +} - for_each_cache(ca, c, i) - if (ca->invalidate_needs_gc) { - mutex_unlock(&c->bucket_lock); - set_current_state(TASK_RUNNING); - goto again; - } +static int bch_gc_thread(void *arg) +{ + struct cache_set *c = arg; - mutex_unlock(&c->bucket_lock); + while (1) { + wait_event_interruptible(c->gc_wait, + kthread_should_stop() || gc_should_run(c)); - schedule(); + if (kthread_should_stop()) + break; + + set_gc_sectors(c); + bch_btree_gc(c); } return 0; @@ -1790,11 +1792,10 @@ again: int bch_gc_thread_start(struct cache_set *c) { - c->gc_thread = kthread_create(bch_gc_thread, c, "bcache_gc"); + c->gc_thread = kthread_run(bch_gc_thread, c, "bcache_gc"); if (IS_ERR(c->gc_thread)) return PTR_ERR(c->gc_thread); - set_task_state(c->gc_thread, TASK_INTERRUPTIBLE); return 0; } diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 5c391fa01bed..9b80417cd547 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -260,8 +260,7 @@ void bch_initial_mark_key(struct cache_set *, int, struct bkey *); static inline void wake_up_gc(struct cache_set *c) { - if (c->gc_thread) - wake_up_process(c->gc_thread); + wake_up(&c->gc_wait); } #define MAP_DONE 0 diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c index 243de0bf15cd..4bf15182c4da 100644 --- a/drivers/md/bcache/extents.c +++ b/drivers/md/bcache/extents.c @@ -584,7 +584,7 @@ static bool bch_extent_merge(struct btree_keys *bk, struct bkey *l, struct bkey return false; for (i = 0; i < KEY_PTRS(l); i++) - if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] || + if (l->ptr[i] + MAKE_PTR(0, KEY_SIZE(l), 0) != r->ptr[i] || PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i)) return false; diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 6925023e12d4..08f20b7cd199 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -508,7 +508,7 @@ static void journal_reclaim(struct cache_set *c) continue; ja->cur_idx = next; - k->ptr[n++] = PTR(0, + k->ptr[n++] = MAKE_PTR(0, bucket_to_sector(c, ca->sb.d[ja->cur_idx]), ca->sb.nr_this_dev); } diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 40ffe5e424b3..e0f1c6d534fe 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -196,14 +196,12 @@ static void bch_data_insert_start(struct closure *cl) struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); struct bio *bio = op->bio, *n; - if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) { - set_gc_sectors(op->c); - wake_up_gc(op->c); - } - if (op->bypass) return bch_data_invalidate(cl); + if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) + wake_up_gc(op->c); + /* * Journal writes are marked REQ_PREFLUSH; if the original write was a * flush, it'll wait on the journal write. diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 849ad441cd76..f4557f558b24 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1025,7 +1025,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) } if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { - bch_sectors_dirty_init(dc); + bch_sectors_dirty_init(&dc->disk); atomic_set(&dc->has_dirty, 1); atomic_inc(&dc->count); bch_writeback_queue(dc); @@ -1058,6 +1058,8 @@ static void cached_dev_free(struct closure *cl) cancel_delayed_work_sync(&dc->writeback_rate_update); if (!IS_ERR_OR_NULL(dc->writeback_thread)) kthread_stop(dc->writeback_thread); + if (dc->writeback_write_wq) + destroy_workqueue(dc->writeback_write_wq); mutex_lock(&bch_register_lock); @@ -1229,6 +1231,7 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u) goto err; bcache_device_attach(d, c, u - c->uuids); + bch_sectors_dirty_init(d); bch_flash_dev_request_init(d); add_disk(d->disk); @@ -1491,6 +1494,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) mutex_init(&c->bucket_lock); init_waitqueue_head(&c->btree_cache_wait); init_waitqueue_head(&c->bucket_wait); + init_waitqueue_head(&c->gc_wait); sema_init(&c->uuid_write_mutex, 1); spin_lock_init(&c->btree_gc_time.lock); @@ -1550,6 +1554,7 @@ static void run_cache_set(struct cache_set *c) for_each_cache(ca, c, i) c->nbuckets += ca->sb.nbuckets; + set_gc_sectors(c); if (CACHE_SYNC(&c->sb)) { LIST_HEAD(journal); @@ -1965,6 +1970,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, else err = "device busy"; mutex_unlock(&bch_register_lock); + if (!IS_ERR(bdev)) + bdput(bdev); if (attr == &ksysfs_register_quiet) goto out; } diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index b3ff57d61dde..4fbb5532f24c 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -191,7 +191,7 @@ STORE(__cached_dev) { struct cached_dev *dc = container_of(kobj, struct cached_dev, disk.kobj); - unsigned v = size; + ssize_t v = size; struct cache_set *c; struct kobj_uevent_env *env; @@ -226,7 +226,7 @@ STORE(__cached_dev) bch_cached_dev_run(dc); if (attr == &sysfs_cache_mode) { - ssize_t v = bch_read_string_list(buf, bch_cache_modes + 1); + v = bch_read_string_list(buf, bch_cache_modes + 1); if (v < 0) return v; diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index dde6172f3f10..eb70f6894f05 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -73,24 +73,44 @@ STRTO_H(strtouint, unsigned int) STRTO_H(strtoll, long long) STRTO_H(strtoull, unsigned long long) +/** + * bch_hprint() - formats @v to human readable string for sysfs. + * + * @v - signed 64 bit integer + * @buf - the (at least 8 byte) buffer to format the result into. + * + * Returns the number of bytes used by format. + */ ssize_t bch_hprint(char *buf, int64_t v) { static const char units[] = "?kMGTPEZY"; - char dec[4] = ""; - int u, t = 0; - - for (u = 0; v >= 1024 || v <= -1024; u++) { - t = v & ~(~0 << 10); - v >>= 10; - } - - if (!u) - return sprintf(buf, "%llu", v); - - if (v < 100 && v > -100) - snprintf(dec, sizeof(dec), ".%i", t / 100); - - return sprintf(buf, "%lli%s%c", v, dec, units[u]); + int u = 0, t; + + uint64_t q; + + if (v < 0) + q = -v; + else + q = v; + + /* For as long as the number is more than 3 digits, but at least + * once, shift right / divide by 1024. Keep the remainder for + * a digit after the decimal point. + */ + do { + u++; + + t = q & ~(~0 << 10); + q >>= 10; + } while (q >= 1000); + + if (v < 0) + /* '-', up to 3 digits, '.', 1 digit, 1 character, null; + * yields 8 bytes. + */ + return sprintf(buf, "-%llu.%i%c", q, t * 10 / 1024, units[u]); + else + return sprintf(buf, "%llu.%i%c", q, t * 10 / 1024, units[u]); } ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[], diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index e51644e503a5..4ce2b19fe120 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -20,7 +20,8 @@ static void __update_writeback_rate(struct cached_dev *dc) { struct cache_set *c = dc->disk.c; - uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size; + uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size - + bcache_flash_devs_sectors_dirty(c); uint64_t cache_dirty_target = div_u64(cache_sectors * dc->writeback_percent, 100); @@ -186,7 +187,7 @@ static void write_dirty(struct closure *cl) closure_bio_submit(&io->bio, cl); - continue_at(cl, write_dirty_finish, system_wq); + continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq); } static void read_dirty_endio(struct bio *bio) @@ -206,7 +207,7 @@ static void read_dirty_submit(struct closure *cl) closure_bio_submit(&io->bio, cl); - continue_at(cl, write_dirty, system_wq); + continue_at(cl, write_dirty, io->dc->writeback_write_wq); } static void read_dirty(struct cached_dev *dc) @@ -482,17 +483,17 @@ static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, return MAP_CONTINUE; } -void bch_sectors_dirty_init(struct cached_dev *dc) +void bch_sectors_dirty_init(struct bcache_device *d) { struct sectors_dirty_init op; bch_btree_op_init(&op.op, -1); - op.inode = dc->disk.id; + op.inode = d->id; - bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0), + bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0), sectors_dirty_init_fn, 0); - dc->disk.sectors_dirty_last = bcache_dev_sectors_dirty(&dc->disk); + d->sectors_dirty_last = bcache_dev_sectors_dirty(d); } void bch_cached_dev_writeback_init(struct cached_dev *dc) @@ -516,6 +517,11 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) int bch_cached_dev_writeback_start(struct cached_dev *dc) { + dc->writeback_write_wq = alloc_workqueue("bcache_writeback_wq", + WQ_MEM_RECLAIM, 0); + if (!dc->writeback_write_wq) + return -ENOMEM; + dc->writeback_thread = kthread_create(bch_writeback_thread, dc, "bcache_writeback"); if (IS_ERR(dc->writeback_thread)) diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 301eaf565167..cdf8d253209e 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -14,6 +14,25 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) return ret; } +static inline uint64_t bcache_flash_devs_sectors_dirty(struct cache_set *c) +{ + uint64_t i, ret = 0; + + mutex_lock(&bch_register_lock); + + for (i = 0; i < c->nr_uuids; i++) { + struct bcache_device *d = c->devices[i]; + + if (!d || !UUID_FLASH_ONLY(&c->uuids[i])) + continue; + ret += bcache_dev_sectors_dirty(d); + } + + mutex_unlock(&bch_register_lock); + + return ret; +} + static inline unsigned offset_to_stripe(struct bcache_device *d, uint64_t offset) { @@ -85,7 +104,7 @@ static inline void bch_writeback_add(struct cached_dev *dc) void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int); -void bch_sectors_dirty_init(struct cached_dev *dc); +void bch_sectors_dirty_init(struct bcache_device *); void bch_cached_dev_writeback_init(struct cached_dev *); int bch_cached_dev_writeback_start(struct cached_dev *); diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 2d826927a3bf..fb02c3979bf4 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1992,6 +1992,11 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, long pages; struct bitmap_page *new_bp; + if (bitmap->storage.file && !init) { + pr_info("md: cannot resize file-based bitmap\n"); + return -EINVAL; + } + if (chunksize == 0) { /* If there is enough space, leave the chunk size unchanged, * else increase by factor of two until there is enough space. diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 125aedc3875f..7643f72adb1c 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -215,7 +215,7 @@ static DEFINE_SPINLOCK(param_spinlock); * Buffers are freed after this timeout */ static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS; -static unsigned dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES; +static unsigned long dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES; static unsigned long dm_bufio_peak_allocated; static unsigned long dm_bufio_allocated_kmem_cache; @@ -923,10 +923,11 @@ static void __get_memory_limit(struct dm_bufio_client *c, { unsigned long buffers; - if (ACCESS_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch) { - mutex_lock(&dm_bufio_clients_lock); - __cache_size_refresh(); - mutex_unlock(&dm_bufio_clients_lock); + if (unlikely(ACCESS_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch)) { + if (mutex_trylock(&dm_bufio_clients_lock)) { + __cache_size_refresh(); + mutex_unlock(&dm_bufio_clients_lock); + } } buffers = dm_bufio_cache_size_per_client >> @@ -936,7 +937,8 @@ static void __get_memory_limit(struct dm_bufio_client *c, buffers = c->minimum_buffers; *limit_buffers = buffers; - *threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100; + *threshold_buffers = mult_frac(buffers, + DM_BUFIO_WRITEBACK_PERCENT, 100); } /* @@ -1540,10 +1542,10 @@ static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp) return true; } -static unsigned get_retain_buffers(struct dm_bufio_client *c) +static unsigned long get_retain_buffers(struct dm_bufio_client *c) { - unsigned retain_bytes = ACCESS_ONCE(dm_bufio_retain_bytes); - return retain_bytes / c->block_size; + unsigned long retain_bytes = ACCESS_ONCE(dm_bufio_retain_bytes); + return retain_bytes >> (c->sectors_per_block_bits + SECTOR_SHIFT); } static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan, @@ -1553,7 +1555,7 @@ static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan, struct dm_buffer *b, *tmp; unsigned long freed = 0; unsigned long count = nr_to_scan; - unsigned retain_target = get_retain_buffers(c); + unsigned long retain_target = get_retain_buffers(c); for (l = 0; l < LIST_SIZE; l++) { list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) { @@ -1779,11 +1781,19 @@ static bool older_than(struct dm_buffer *b, unsigned long age_hz) static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz) { struct dm_buffer *b, *tmp; - unsigned retain_target = get_retain_buffers(c); - unsigned count; + unsigned long retain_target = get_retain_buffers(c); + unsigned long count; + LIST_HEAD(write_list); dm_bufio_lock(c); + __check_watermark(c, &write_list); + if (unlikely(!list_empty(&write_list))) { + dm_bufio_unlock(c); + __flush_write_list(&write_list); + dm_bufio_lock(c); + } + count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY]; list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_CLEAN], lru_list) { if (count <= retain_target) @@ -1808,6 +1818,8 @@ static void cleanup_old_buffers(void) mutex_lock(&dm_bufio_clients_lock); + __cache_size_refresh(); + list_for_each_entry(c, &dm_bufio_all_clients, client_list) __evict_old_buffers(c, max_age_hz); @@ -1845,19 +1857,15 @@ static int __init dm_bufio_init(void) memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches); memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names); - mem = (__u64)((totalram_pages - totalhigh_pages) * - DM_BUFIO_MEMORY_PERCENT / 100) << PAGE_SHIFT; + mem = (__u64)mult_frac(totalram_pages - totalhigh_pages, + DM_BUFIO_MEMORY_PERCENT, 100) << PAGE_SHIFT; if (mem > ULONG_MAX) mem = ULONG_MAX; #ifdef CONFIG_MMU - /* - * Get the size of vmalloc space the same way as VMALLOC_TOTAL - * in fs/proc/internal.h - */ - if (mem > (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100) - mem = (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100; + if (mem > mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100)) + mem = mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100); #endif dm_bufio_default_cache_size = mem; @@ -1930,7 +1938,7 @@ MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache"); module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds"); -module_param_named(retain_bytes, dm_bufio_retain_bytes, uint, S_IRUGO | S_IWUSR); +module_param_named(retain_bytes, dm_bufio_retain_bytes, ulong, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory"); module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR); diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 695577812cf6..6937ca42be8c 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -1383,17 +1383,19 @@ void dm_cache_metadata_set_stats(struct dm_cache_metadata *cmd, int dm_cache_commit(struct dm_cache_metadata *cmd, bool clean_shutdown) { - int r; + int r = -EINVAL; flags_mutator mutator = (clean_shutdown ? set_clean_shutdown : clear_clean_shutdown); WRITE_LOCK(cmd); + if (cmd->fail_io) + goto out; + r = __commit_transaction(cmd, mutator); if (r) goto out; r = __begin_transaction(cmd); - out: WRITE_UNLOCK(cmd); return r; @@ -1405,7 +1407,8 @@ int dm_cache_get_free_metadata_block_count(struct dm_cache_metadata *cmd, int r = -EINVAL; READ_LOCK(cmd); - r = dm_sm_get_nr_free(cmd->metadata_sm, result); + if (!cmd->fail_io) + r = dm_sm_get_nr_free(cmd->metadata_sm, result); READ_UNLOCK(cmd); return r; @@ -1417,7 +1420,8 @@ int dm_cache_get_metadata_dev_size(struct dm_cache_metadata *cmd, int r = -EINVAL; READ_LOCK(cmd); - r = dm_sm_get_nr_blocks(cmd->metadata_sm, result); + if (!cmd->fail_io) + r = dm_sm_get_nr_blocks(cmd->metadata_sm, result); READ_UNLOCK(cmd); return r; diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 59b2c50562e4..c817627d09ca 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -248,7 +248,7 @@ struct cache { /* * Fields for converting from sectors to blocks. */ - uint32_t sectors_per_block; + sector_t sectors_per_block; int sectors_per_block_shift; spinlock_t lock; @@ -3546,11 +3546,11 @@ static void cache_status(struct dm_target *ti, status_type_t type, residency = policy_residency(cache->policy); - DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %lu ", + DMEMIT("%u %llu/%llu %llu %llu/%llu %u %u %u %u %u %u %lu ", (unsigned)DM_CACHE_METADATA_BLOCK_SIZE, (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), (unsigned long long)nr_blocks_metadata, - cache->sectors_per_block, + (unsigned long long)cache->sectors_per_block, (unsigned long long) from_cblock(residency), (unsigned long long) from_cblock(cache->cache_size), (unsigned) atomic_read(&cache->stats.read_hit), diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 40ceba1fe8be..1609d4971104 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -29,7 +29,6 @@ struct dm_kobject_holder { * DM targets must _not_ deference a mapped_device to directly access its members! */ struct mapped_device { - struct srcu_struct io_barrier; struct mutex suspend_lock; /* @@ -127,6 +126,8 @@ struct mapped_device { struct blk_mq_tag_set *tag_set; bool use_blk_mq:1; bool init_tio_pdu:1; + + struct srcu_struct io_barrier; }; void dm_init_md_queue(struct mapped_device *md); diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index bf2b2676cb8a..80e3df1f1f7d 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c @@ -961,15 +961,15 @@ static int metadata_commit(struct era_metadata *md) } } - r = save_sm_root(md); + r = dm_tm_pre_commit(md->tm); if (r) { - DMERR("%s: save_sm_root failed", __func__); + DMERR("%s: pre commit failed", __func__); return r; } - r = dm_tm_pre_commit(md->tm); + r = save_sm_root(md); if (r) { - DMERR("%s: pre commit failed", __func__); + DMERR("%s: save_sm_root failed", __func__); return r; } diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 966eb4b61aed..a68c650aad11 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1847,7 +1847,7 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user) if (r) goto out; - param->data_size = sizeof(*param); + param->data_size = offsetof(struct dm_ioctl, data); r = fn(param, input_param_size); if (unlikely(param->flags & DM_BUFFER_FULL_FLAG) && diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index e477af8596e2..0d437c98ab08 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -119,7 +119,8 @@ static struct kmem_cache *_mpio_cache; static struct workqueue_struct *kmultipathd, *kmpath_handlerd; static void trigger_event(struct work_struct *work); -static void activate_path(struct work_struct *work); +static void activate_or_offline_path(struct pgpath *pgpath); +static void activate_path_work(struct work_struct *work); static void process_queued_bios(struct work_struct *work); /*----------------------------------------------- @@ -144,7 +145,7 @@ static struct pgpath *alloc_pgpath(void) if (pgpath) { pgpath->is_active = true; - INIT_DELAYED_WORK(&pgpath->activate_path, activate_path); + INIT_DELAYED_WORK(&pgpath->activate_path, activate_path_work); } return pgpath; @@ -430,7 +431,7 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) unsigned long flags; struct priority_group *pg; struct pgpath *pgpath; - bool bypassed = true; + unsigned bypassed = 1; if (!atomic_read(&m->nr_valid_paths)) { clear_bit(MPATHF_QUEUE_IO, &m->flags); @@ -469,7 +470,7 @@ check_current_pg: */ do { list_for_each_entry(pg, &m->priority_groups, list) { - if (pg->bypassed == bypassed) + if (pg->bypassed == !!bypassed) continue; pgpath = choose_path_in_pg(m, pg, nr_bytes); if (!IS_ERR_OR_NULL(pgpath)) { @@ -1515,10 +1516,8 @@ out: spin_unlock_irqrestore(&m->lock, flags); } -static void activate_path(struct work_struct *work) +static void activate_or_offline_path(struct pgpath *pgpath) { - struct pgpath *pgpath = - container_of(work, struct pgpath, activate_path.work); struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev); if (pgpath->is_active && !blk_queue_dying(q)) @@ -1527,6 +1526,14 @@ static void activate_path(struct work_struct *work) pg_init_done(pgpath, SCSI_DH_DEV_OFFLINED); } +static void activate_path_work(struct work_struct *work) +{ + struct pgpath *pgpath = + container_of(work, struct pgpath, activate_path.work); + + activate_or_offline_path(pgpath); +} + static int noretry_error(int error) { switch (error) { diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index af2d79b52484..ee75e3510be6 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3589,7 +3589,7 @@ static int raid_preresume(struct dm_target *ti) return r; /* Resize bitmap to adjust to changed region size (aka MD bitmap chunksize) */ - if (test_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags) && + if (test_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags) && mddev->bitmap && mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)) { r = bitmap_resize(mddev->bitmap, mddev->dev_sectors, to_bytes(rs->requested_bitmap_chunk_sectors), 0); @@ -3621,6 +3621,8 @@ static int raid_preresume(struct dm_target *ti) return r; } +#define RESUME_STAY_FROZEN_FLAGS (CTR_FLAG_DELTA_DISKS | CTR_FLAG_DATA_OFFSET) + static void raid_resume(struct dm_target *ti) { struct raid_set *rs = ti->private; @@ -3638,7 +3640,15 @@ static void raid_resume(struct dm_target *ti) mddev->ro = 0; mddev->in_sync = 0; - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + /* + * Keep the RAID set frozen if reshape/rebuild flags are set. + * The RAID set is unfrozen once the next table load/resume, + * which clears the reshape/rebuild flags, occurs. + * This ensures that the constructor for the inactive table + * retrieves an up-to-date reshape_position. + */ + if (!(rs->ctr_flags & RESUME_STAY_FROZEN_FLAGS)) + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); if (mddev->suspended) mddev_resume(mddev); diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c index 6c25213ab38c..bdbb7e6e8212 100644 --- a/drivers/md/dm-round-robin.c +++ b/drivers/md/dm-round-robin.c @@ -17,8 +17,8 @@ #include <linux/module.h> #define DM_MSG_PREFIX "multipath round-robin" -#define RR_MIN_IO 1000 -#define RR_VERSION "1.1.0" +#define RR_MIN_IO 1 +#define RR_VERSION "1.2.0" /*----------------------------------------------------------------- * Path-handling code, paths are held in lists @@ -47,44 +47,19 @@ struct selector { struct list_head valid_paths; struct list_head invalid_paths; spinlock_t lock; - struct dm_path * __percpu *current_path; - struct percpu_counter repeat_count; }; -static void set_percpu_current_path(struct selector *s, struct dm_path *path) -{ - int cpu; - - for_each_possible_cpu(cpu) - *per_cpu_ptr(s->current_path, cpu) = path; -} - static struct selector *alloc_selector(void) { struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); - if (!s) - return NULL; - - INIT_LIST_HEAD(&s->valid_paths); - INIT_LIST_HEAD(&s->invalid_paths); - spin_lock_init(&s->lock); - - s->current_path = alloc_percpu(struct dm_path *); - if (!s->current_path) - goto out_current_path; - set_percpu_current_path(s, NULL); - - if (percpu_counter_init(&s->repeat_count, 0, GFP_KERNEL)) - goto out_repeat_count; + if (s) { + INIT_LIST_HEAD(&s->valid_paths); + INIT_LIST_HEAD(&s->invalid_paths); + spin_lock_init(&s->lock); + } return s; - -out_repeat_count: - free_percpu(s->current_path); -out_current_path: - kfree(s); - return NULL;; } static int rr_create(struct path_selector *ps, unsigned argc, char **argv) @@ -105,8 +80,6 @@ static void rr_destroy(struct path_selector *ps) free_paths(&s->valid_paths); free_paths(&s->invalid_paths); - free_percpu(s->current_path); - percpu_counter_destroy(&s->repeat_count); kfree(s); ps->context = NULL; } @@ -157,6 +130,11 @@ static int rr_add_path(struct path_selector *ps, struct dm_path *path, return -EINVAL; } + if (repeat_count > 1) { + DMWARN_LIMIT("repeat_count > 1 is deprecated, using 1 instead"); + repeat_count = 1; + } + /* allocate the path */ pi = kmalloc(sizeof(*pi), GFP_KERNEL); if (!pi) { @@ -183,9 +161,6 @@ static void rr_fail_path(struct path_selector *ps, struct dm_path *p) struct path_info *pi = p->pscontext; spin_lock_irqsave(&s->lock, flags); - if (p == *this_cpu_ptr(s->current_path)) - set_percpu_current_path(s, NULL); - list_move(&pi->list, &s->invalid_paths); spin_unlock_irqrestore(&s->lock, flags); } @@ -208,29 +183,15 @@ static struct dm_path *rr_select_path(struct path_selector *ps, size_t nr_bytes) unsigned long flags; struct selector *s = ps->context; struct path_info *pi = NULL; - struct dm_path *current_path = NULL; - - local_irq_save(flags); - current_path = *this_cpu_ptr(s->current_path); - if (current_path) { - percpu_counter_dec(&s->repeat_count); - if (percpu_counter_read_positive(&s->repeat_count) > 0) { - local_irq_restore(flags); - return current_path; - } - } - spin_lock(&s->lock); + spin_lock_irqsave(&s->lock, flags); if (!list_empty(&s->valid_paths)) { pi = list_entry(s->valid_paths.next, struct path_info, list); list_move_tail(&pi->list, &s->valid_paths); - percpu_counter_set(&s->repeat_count, pi->repeat_count); - set_percpu_current_path(s, pi->path); - current_path = pi->path; } spin_unlock_irqrestore(&s->lock, flags); - return current_path; + return pi ? pi->path : NULL; } static struct path_selector_type rr_ps = { diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 2c965424d383..ba7c4c685db3 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -997,10 +997,14 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t) dm_init_md_queue(md); /* backfill 'mq' sysfs registration normally done in blk_register_queue */ - blk_mq_register_dev(disk_to_dev(md->disk), q); + err = blk_mq_register_dev(disk_to_dev(md->disk), q); + if (err) + goto out_cleanup_queue; return 0; +out_cleanup_queue: + blk_cleanup_queue(q); out_tag_set: blk_mq_free_tag_set(md->tag_set); out_kfree_tag_set: diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c index 38b05f23b96c..0250e7e521ab 100644 --- a/drivers/md/dm-stats.c +++ b/drivers/md/dm-stats.c @@ -175,6 +175,7 @@ static void dm_stat_free(struct rcu_head *head) int cpu; struct dm_stat *s = container_of(head, struct dm_stat, rcu_head); + kfree(s->histogram_boundaries); kfree(s->program_id); kfree(s->aux_data); for_each_possible_cpu(cpu) { diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index a15091a0d40c..4477bf930cf4 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -485,11 +485,11 @@ static int __write_initial_superblock(struct dm_pool_metadata *pmd) if (r < 0) return r; - r = save_sm_roots(pmd); + r = dm_tm_pre_commit(pmd->tm); if (r < 0) return r; - r = dm_tm_pre_commit(pmd->tm); + r = save_sm_roots(pmd); if (r < 0) return r; diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index d1c05c12a9db..0b678b5da4c4 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -1070,6 +1070,7 @@ static void passdown_endio(struct bio *bio) * to unmap (we ignore err). */ queue_passdown_pt2(bio->bi_private); + bio_put(bio); } static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m) @@ -1094,6 +1095,19 @@ static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m) return; } + /* + * Increment the unmapped blocks. This prevents a race between the + * passdown io and reallocation of freed blocks. + */ + r = dm_pool_inc_data_range(pool->pmd, m->data_block, data_end); + if (r) { + metadata_operation_failed(pool, "dm_pool_inc_data_range", r); + bio_io_error(m->bio); + cell_defer_no_holder(tc, m->cell); + mempool_free(m, pool->mapping_pool); + return; + } + discard_parent = bio_alloc(GFP_NOIO, 1); if (!discard_parent) { DMWARN("%s: unable to allocate top level discard bio for passdown. Skipping passdown.", @@ -1114,19 +1128,6 @@ static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m) end_discard(&op, r); } } - - /* - * Increment the unmapped blocks. This prevents a race between the - * passdown io and reallocation of freed blocks. - */ - r = dm_pool_inc_data_range(pool->pmd, m->data_block, data_end); - if (r) { - metadata_operation_failed(pool, "dm_pool_inc_data_range", r); - bio_io_error(m->bio); - cell_defer_no_holder(tc, m->cell); - mempool_free(m, pool->mapping_pool); - return; - } } static void process_prepared_discard_passdown_pt2(struct dm_thin_new_mapping *m) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 0f0eb8a3d922..78f36012eaca 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -146,8 +146,6 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio, block = fec_buffer_rs_block(v, fio, n, i); res = fec_decode_rs8(v, fio, block, &par[offset], neras); if (res < 0) { - dm_bufio_release(buf); - r = res; goto error; } @@ -172,6 +170,8 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio, done: r = corrected; error: + dm_bufio_release(buf); + if (r < 0 && neras) DMERR_LIMIT("%s: FEC %llu: failed to correct: %d", v->data_dev->name, (unsigned long long)rsb, r); @@ -269,7 +269,7 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, &is_zero) == 0) { /* skip known zero blocks entirely */ if (is_zero) - continue; + goto done; /* * skip if we have already found the theoretical @@ -439,6 +439,13 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, if (!verity_fec_is_enabled(v)) return -EOPNOTSUPP; + if (fio->level >= DM_VERITY_FEC_MAX_RECURSION) { + DMWARN_LIMIT("%s: FEC: recursion too deep", v->data_dev->name); + return -EIO; + } + + fio->level++; + if (type == DM_VERITY_BLOCK_TYPE_METADATA) block += v->data_blocks; @@ -470,7 +477,7 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, if (r < 0) { r = fec_decode_rsb(v, io, fio, rsb, offset, true); if (r < 0) - return r; + goto done; } if (dest) @@ -480,6 +487,8 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, r = verity_for_bv_block(v, io, iter, fec_bv_copy); } +done: + fio->level--; return r; } @@ -520,6 +529,7 @@ void verity_fec_init_io(struct dm_verity_io *io) memset(fio->bufs, 0, sizeof(fio->bufs)); fio->nbufs = 0; fio->output = NULL; + fio->level = 0; } /* diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h index 7fa0298b995e..bb31ce87a933 100644 --- a/drivers/md/dm-verity-fec.h +++ b/drivers/md/dm-verity-fec.h @@ -27,6 +27,9 @@ #define DM_VERITY_FEC_BUF_MAX \ (1 << (PAGE_SHIFT - DM_VERITY_FEC_BUF_RS_BITS)) +/* maximum recursion level for verity_fec_decode */ +#define DM_VERITY_FEC_MAX_RECURSION 4 + #define DM_VERITY_OPT_FEC_DEV "use_fec_from_device" #define DM_VERITY_OPT_FEC_BLOCKS "fec_blocks" #define DM_VERITY_OPT_FEC_START "fec_start" @@ -58,6 +61,7 @@ struct dm_verity_fec_io { unsigned nbufs; /* number of buffers allocated */ u8 *output; /* buffer for corrected output */ size_t output_pos; + unsigned level; /* recursion level */ }; #ifdef CONFIG_DM_VERITY_FEC diff --git a/drivers/md/dm.c b/drivers/md/dm.c index ef7bf1dd6900..c5522551122f 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -21,6 +21,7 @@ #include <linux/delay.h> #include <linux/wait.h> #include <linux/pr.h> +#include <linux/vmalloc.h> #define DM_MSG_PREFIX "core" @@ -972,10 +973,64 @@ void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors) } EXPORT_SYMBOL_GPL(dm_accept_partial_bio); +/* + * Flush current->bio_list when the target map method blocks. + * This fixes deadlocks in snapshot and possibly in other targets. + */ +struct dm_offload { + struct blk_plug plug; + struct blk_plug_cb cb; +}; + +static void flush_current_bio_list(struct blk_plug_cb *cb, bool from_schedule) +{ + struct dm_offload *o = container_of(cb, struct dm_offload, cb); + struct bio_list list; + struct bio *bio; + int i; + + INIT_LIST_HEAD(&o->cb.list); + + if (unlikely(!current->bio_list)) + return; + + for (i = 0; i < 2; i++) { + list = current->bio_list[i]; + bio_list_init(¤t->bio_list[i]); + + while ((bio = bio_list_pop(&list))) { + struct bio_set *bs = bio->bi_pool; + if (unlikely(!bs) || bs == fs_bio_set) { + bio_list_add(¤t->bio_list[i], bio); + continue; + } + + spin_lock(&bs->rescue_lock); + bio_list_add(&bs->rescue_list, bio); + queue_work(bs->rescue_workqueue, &bs->rescue_work); + spin_unlock(&bs->rescue_lock); + } + } +} + +static void dm_offload_start(struct dm_offload *o) +{ + blk_start_plug(&o->plug); + o->cb.callback = flush_current_bio_list; + list_add(&o->cb.list, ¤t->plug->cb_list); +} + +static void dm_offload_end(struct dm_offload *o) +{ + list_del(&o->cb.list); + blk_finish_plug(&o->plug); +} + static void __map_bio(struct dm_target_io *tio) { int r; sector_t sector; + struct dm_offload o; struct bio *clone = &tio->clone; struct dm_target *ti = tio->ti; @@ -988,7 +1043,11 @@ static void __map_bio(struct dm_target_io *tio) */ atomic_inc(&tio->io->io_count); sector = clone->bi_iter.bi_sector; + + dm_offload_start(&o); r = ti->type->map(ti, clone); + dm_offload_end(&o); + if (r == DM_MAPIO_REMAPPED) { /* the bio has been remapped so dispatch it */ @@ -1453,7 +1512,7 @@ static struct mapped_device *alloc_dev(int minor) struct mapped_device *md; void *old_md; - md = kzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id); + md = vzalloc_node(sizeof(*md), numa_node_id); if (!md) { DMWARN("unable to allocate device, out of memory."); return NULL; @@ -1547,7 +1606,7 @@ bad_io_barrier: bad_minor: module_put(THIS_MODULE); bad_module_get: - kfree(md); + kvfree(md); return NULL; } @@ -1566,7 +1625,7 @@ static void free_dev(struct mapped_device *md) free_minor(minor); module_put(THIS_MODULE); - kfree(md); + kvfree(md); } static void __bind_mempools(struct mapped_device *md, struct dm_table *t) @@ -2456,11 +2515,15 @@ struct mapped_device *dm_get_from_kobject(struct kobject *kobj) md = container_of(kobj, struct mapped_device, kobj_holder.kobj); - if (test_bit(DMF_FREEING, &md->flags) || - dm_deleting_md(md)) - return NULL; - + spin_lock(&_minor_lock); + if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) { + md = NULL; + goto out; + } dm_get(md); +out: + spin_unlock(&_minor_lock); + return md; } diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 86f5d435901d..12abf69d568a 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -52,18 +52,26 @@ static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector) return conf->disks + lo; } +/* + * In linear_congested() conf->raid_disks is used as a copy of + * mddev->raid_disks to iterate conf->disks[], because conf->raid_disks + * and conf->disks[] are created in linear_conf(), they are always + * consitent with each other, but mddev->raid_disks does not. + */ static int linear_congested(struct mddev *mddev, int bits) { struct linear_conf *conf; int i, ret = 0; - conf = mddev->private; + rcu_read_lock(); + conf = rcu_dereference(mddev->private); - for (i = 0; i < mddev->raid_disks && !ret ; i++) { + for (i = 0; i < conf->raid_disks && !ret ; i++) { struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev); ret |= bdi_congested(&q->backing_dev_info, bits); } + rcu_read_unlock(); return ret; } @@ -143,6 +151,19 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) conf->disks[i-1].end_sector + conf->disks[i].rdev->sectors; + /* + * conf->raid_disks is copy of mddev->raid_disks. The reason to + * keep a copy of mddev->raid_disks in struct linear_conf is, + * mddev->raid_disks may not be consistent with pointers number of + * conf->disks[] when it is updated in linear_add() and used to + * iterate old conf->disks[] earray in linear_congested(). + * Here conf->raid_disks is always consitent with number of + * pointers in conf->disks[] array, and mddev->private is updated + * with rcu_assign_pointer() in linear_addr(), such race can be + * avoided. + */ + conf->raid_disks = raid_disks; + return conf; out: @@ -195,15 +216,24 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev) if (!newconf) return -ENOMEM; + /* newconf->raid_disks already keeps a copy of * the increased + * value of mddev->raid_disks, WARN_ONCE() is just used to make + * sure of this. It is possible that oldconf is still referenced + * in linear_congested(), therefore kfree_rcu() is used to free + * oldconf until no one uses it anymore. + */ mddev_suspend(mddev); - oldconf = mddev->private; + oldconf = rcu_dereference_protected(mddev->private, + lockdep_is_held(&mddev->reconfig_mutex)); mddev->raid_disks++; - mddev->private = newconf; + WARN_ONCE(mddev->raid_disks != newconf->raid_disks, + "copied raid_disks doesn't match mddev->raid_disks"); + rcu_assign_pointer(mddev->private, newconf); md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); set_capacity(mddev->gendisk, mddev->array_sectors); mddev_resume(mddev); revalidate_disk(mddev->gendisk); - kfree(oldconf); + kfree_rcu(oldconf, rcu); return 0; } diff --git a/drivers/md/linear.h b/drivers/md/linear.h index b685ddd7d7f7..8d392e6098b3 100644 --- a/drivers/md/linear.h +++ b/drivers/md/linear.h @@ -10,6 +10,7 @@ struct linear_conf { struct rcu_head rcu; sector_t array_sectors; + int raid_disks; /* a copy of mddev->raid_disks */ struct dev_info disks[0]; }; #endif diff --git a/drivers/md/md.c b/drivers/md/md.c index 24925f2aa235..8ebf1b97e1d2 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1861,7 +1861,7 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) } sb = page_address(rdev->sb_page); sb->data_size = cpu_to_le64(num_sectors); - sb->super_offset = rdev->sb_start; + sb->super_offset = cpu_to_le64(rdev->sb_start); sb->sb_csum = calc_sb_1_csum(sb); md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, rdev->sb_page); @@ -2270,7 +2270,7 @@ static bool does_sb_need_changing(struct mddev *mddev) /* Check if any mddev parameters have changed */ if ((mddev->dev_sectors != le64_to_cpu(sb->size)) || (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) || - (mddev->layout != le64_to_cpu(sb->layout)) || + (mddev->layout != le32_to_cpu(sb->layout)) || (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) || (mddev->chunk_sectors != le32_to_cpu(sb->chunksize))) return true; @@ -6752,6 +6752,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, void __user *argp = (void __user *)arg; struct mddev *mddev = NULL; int ro; + bool did_set_md_closing = false; if (!md_ioctl_valid(cmd)) return -ENOTTY; @@ -6841,7 +6842,9 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, err = -EBUSY; goto out; } + WARN_ON_ONCE(test_bit(MD_CLOSING, &mddev->flags)); set_bit(MD_CLOSING, &mddev->flags); + did_set_md_closing = true; mutex_unlock(&mddev->open_mutex); sync_blockdev(bdev); } @@ -7041,6 +7044,8 @@ unlock: mddev->hold_active = 0; mddev_unlock(mddev); out: + if(did_set_md_closing) + clear_bit(MD_CLOSING, &mddev->flags); return err; } #ifdef CONFIG_COMPAT diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c index 20a40329d84a..7a75b5010f73 100644 --- a/drivers/md/persistent-data/dm-btree.c +++ b/drivers/md/persistent-data/dm-btree.c @@ -897,8 +897,12 @@ static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest, else *result_key = le64_to_cpu(ro_node(s)->keys[0]); - if (next_block || flags & INTERNAL_NODE) - block = value64(ro_node(s), i); + if (next_block || flags & INTERNAL_NODE) { + if (find_highest) + block = value64(ro_node(s), i); + else + block = value64(ro_node(s), 0); + } } while (flags & INTERNAL_NODE); diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c index ebb280a14325..32adf6b4a9c7 100644 --- a/drivers/md/persistent-data/dm-space-map-disk.c +++ b/drivers/md/persistent-data/dm-space-map-disk.c @@ -142,10 +142,23 @@ static int sm_disk_inc_block(struct dm_space_map *sm, dm_block_t b) static int sm_disk_dec_block(struct dm_space_map *sm, dm_block_t b) { + int r; + uint32_t old_count; enum allocation_event ev; struct sm_disk *smd = container_of(sm, struct sm_disk, sm); - return sm_ll_dec(&smd->ll, b, &ev); + r = sm_ll_dec(&smd->ll, b, &ev); + if (!r && (ev == SM_FREE)) { + /* + * It's only free if it's also free in the last + * transaction. + */ + r = sm_ll_lookup(&smd->old_ll, b, &old_count); + if (!r && !old_count) + smd->nr_allocated_this_transaction--; + } + + return r; } static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 29e2df5cd77b..81a78757bc78 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1073,7 +1073,7 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio) */ DEFINE_WAIT(w); for (;;) { - flush_signals(current); + sigset_t full, old; prepare_to_wait(&conf->wait_barrier, &w, TASK_INTERRUPTIBLE); if (bio_end_sector(bio) <= mddev->suspend_lo || @@ -1082,7 +1082,10 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio) !md_cluster_ops->area_resyncing(mddev, WRITE, bio->bi_iter.bi_sector, bio_end_sector(bio)))) break; + sigfillset(&full); + sigprocmask(SIG_BLOCK, &full, &old); schedule(); + sigprocmask(SIG_SETMASK, &old, NULL); } finish_wait(&conf->wait_barrier, &w); } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 39fddda2fef2..b19b551bb34b 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -941,7 +941,8 @@ static void wait_barrier(struct r10conf *conf) !conf->barrier || (atomic_read(&conf->nr_pending) && current->bio_list && - !bio_list_empty(current->bio_list)), + (!bio_list_empty(¤t->bio_list[0]) || + !bio_list_empty(¤t->bio_list[1]))), conf->resync_lock); conf->nr_waiting--; if (!conf->nr_waiting) @@ -1406,11 +1407,24 @@ retry_write: mbio->bi_private = r10_bio; atomic_inc(&r10_bio->remaining); + + cb = blk_check_plugged(raid10_unplug, mddev, + sizeof(*plug)); + if (cb) + plug = container_of(cb, struct raid10_plug_cb, + cb); + else + plug = NULL; spin_lock_irqsave(&conf->device_lock, flags); - bio_list_add(&conf->pending_bio_list, mbio); - conf->pending_count++; + if (plug) { + bio_list_add(&plug->pending, mbio); + plug->pending_cnt++; + } else { + bio_list_add(&conf->pending_bio_list, mbio); + conf->pending_count++; + } spin_unlock_irqrestore(&conf->device_lock, flags); - if (!mddev_check_plugged(mddev)) + if (!plug) md_wakeup_thread(mddev->thread); } } @@ -1470,7 +1484,25 @@ static void raid10_make_request(struct mddev *mddev, struct bio *bio) split = bio; } + /* + * If a bio is splitted, the first part of bio will pass + * barrier but the bio is queued in current->bio_list (see + * generic_make_request). If there is a raise_barrier() called + * here, the second part of bio can't pass barrier. But since + * the first part bio isn't dispatched to underlaying disks + * yet, the barrier is never released, hence raise_barrier will + * alays wait. We have a deadlock. + * Note, this only happens in read path. For write path, the + * first part of bio is dispatched in a schedule() call + * (because of blk plug) or offloaded to raid10d. + * Quitting from the function immediately can change the bio + * order queued in bio_list and avoid the deadlock. + */ __make_request(mddev, split); + if (split != bio && bio_data_dir(bio) == READ) { + generic_make_request(bio); + break; + } } while (split != bio); /* In case raid10d snuck in to freeze_array */ diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index cce6057b9aca..7aea0221530c 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -829,6 +829,14 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh spin_unlock(&head->batch_head->batch_lock); goto unlock_out; } + /* + * We must assign batch_head of this stripe within the + * batch_lock, otherwise clear_batch_ready of batch head + * stripe could clear BATCH_READY bit of this stripe and + * this stripe->batch_head doesn't get assigned, which + * could confuse clear_batch_ready for this stripe + */ + sh->batch_head = head->batch_head; /* * at this point, head's BATCH_READY could be cleared, but we @@ -836,8 +844,6 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh */ list_add(&sh->batch_list, &head->batch_list); spin_unlock(&head->batch_head->batch_lock); - - sh->batch_head = head->batch_head; } else { head->batch_head = head; sh->batch_head = head->batch_head; @@ -2253,6 +2259,10 @@ static int resize_stripes(struct r5conf *conf, int newsize) err = -ENOMEM; mutex_unlock(&conf->cache_size_mutex); + + conf->slab_cache = sc; + conf->active_name = 1-conf->active_name; + /* Step 4, return new stripes to service */ while(!list_empty(&newstripes)) { nsh = list_entry(newstripes.next, struct stripe_head, lru); @@ -2270,8 +2280,6 @@ static int resize_stripes(struct r5conf *conf, int newsize) } /* critical section pass, GFP_NOIO no longer needed */ - conf->slab_cache = sc; - conf->active_name = 1-conf->active_name; if (!err) conf->pool_size = newsize; return err; @@ -4275,7 +4283,8 @@ static void break_stripe_batch_list(struct stripe_head *head_sh, set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS | (1 << STRIPE_PREREAD_ACTIVE) | - (1 << STRIPE_DEGRADED)), + (1 << STRIPE_DEGRADED) | + (1 << STRIPE_ON_UNPLUG_LIST)), head_sh->state & (1 << STRIPE_INSYNC)); sh->check_state = head_sh->check_state; @@ -5298,12 +5307,15 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) * userspace, we want an interruptible * wait. */ - flush_signals(current); prepare_to_wait(&conf->wait_for_overlap, &w, TASK_INTERRUPTIBLE); if (logical_sector >= mddev->suspend_lo && logical_sector < mddev->suspend_hi) { + sigset_t full, old; + sigfillset(&full); + sigprocmask(SIG_BLOCK, &full, &old); schedule(); + sigprocmask(SIG_SETMASK, &old, NULL); do_prepare = true; } goto retry; @@ -5838,6 +5850,10 @@ static void raid5_do_work(struct work_struct *work) pr_debug("%d stripes handled\n", handled); spin_unlock_irq(&conf->device_lock); + + r5l_flush_stripe_to_raid(conf->log); + + async_tx_issue_pending_all(); blk_finish_plug(&plug); pr_debug("--- raid5worker inactive\n"); @@ -7555,12 +7571,10 @@ static void end_reshape(struct r5conf *conf) { if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { - struct md_rdev *rdev; spin_lock_irq(&conf->device_lock); conf->previous_raid_disks = conf->raid_disks; - rdev_for_each(rdev, conf->mddev) - rdev->data_offset = rdev->new_data_offset; + md_finish_reshape(conf->mddev); smp_wmb(); conf->reshape_progress = MaxSector; conf->mddev->reshape_position = MaxSector; |