summaryrefslogtreecommitdiff
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/Kconfig1
-rw-r--r--drivers/md/bcache/alloc.c5
-rw-r--r--drivers/md/bcache/bcache.h5
-rw-r--r--drivers/md/bcache/btree.c39
-rw-r--r--drivers/md/bcache/btree.h3
-rw-r--r--drivers/md/bcache/extents.c2
-rw-r--r--drivers/md/bcache/journal.c2
-rw-r--r--drivers/md/bcache/request.c8
-rw-r--r--drivers/md/bcache/super.c9
-rw-r--r--drivers/md/bcache/sysfs.c4
-rw-r--r--drivers/md/bcache/util.c50
-rw-r--r--drivers/md/bcache/writeback.c20
-rw-r--r--drivers/md/bcache/writeback.h21
-rw-r--r--drivers/md/bitmap.c5
-rw-r--r--drivers/md/dm-bufio.c50
-rw-r--r--drivers/md/dm-cache-metadata.c12
-rw-r--r--drivers/md/dm-cache-target.c6
-rw-r--r--drivers/md/dm-core.h3
-rw-r--r--drivers/md/dm-era-target.c8
-rw-r--r--drivers/md/dm-ioctl.c2
-rw-r--r--drivers/md/dm-mpath.c21
-rw-r--r--drivers/md/dm-raid.c14
-rw-r--r--drivers/md/dm-round-robin.c67
-rw-r--r--drivers/md/dm-rq.c6
-rw-r--r--drivers/md/dm-stats.c1
-rw-r--r--drivers/md/dm-thin-metadata.c4
-rw-r--r--drivers/md/dm-thin.c27
-rw-r--r--drivers/md/dm-verity-fec.c18
-rw-r--r--drivers/md/dm-verity-fec.h4
-rw-r--r--drivers/md/dm.c77
-rw-r--r--drivers/md/linear.c40
-rw-r--r--drivers/md/linear.h1
-rw-r--r--drivers/md/md.c9
-rw-r--r--drivers/md/persistent-data/dm-btree.c8
-rw-r--r--drivers/md/persistent-data/dm-space-map-disk.c15
-rw-r--r--drivers/md/raid1.c5
-rw-r--r--drivers/md/raid10.c40
-rw-r--r--drivers/md/raid5.c32
38 files changed, 439 insertions, 205 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 02a5345a44a6..197e29d1c2e6 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -357,6 +357,7 @@ config DM_LOG_USERSPACE
config DM_RAID
tristate "RAID 1/4/5/6/10 target"
depends on BLK_DEV_DM
+ select MD_RAID0
select MD_RAID1
select MD_RAID10
select MD_RAID456
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index ca4abe1ccd8d..537903bf9add 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -404,7 +404,8 @@ long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait)
finish_wait(&ca->set->bucket_wait, &w);
out:
- wake_up_process(ca->alloc_thread);
+ if (ca->alloc_thread)
+ wake_up_process(ca->alloc_thread);
trace_bcache_alloc(ca, reserve);
@@ -476,7 +477,7 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve,
if (b == -1)
goto err;
- k->ptr[i] = PTR(ca->buckets[b].gen,
+ k->ptr[i] = MAKE_PTR(ca->buckets[b].gen,
bucket_to_sector(c, b),
ca->sb.nr_this_dev);
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 6b420a55c745..02619cabda8b 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -333,6 +333,7 @@ struct cached_dev {
/* Limit number of writeback bios in flight */
struct semaphore in_flight;
struct task_struct *writeback_thread;
+ struct workqueue_struct *writeback_write_wq;
struct keybuf writeback_keys;
@@ -425,7 +426,7 @@ struct cache {
* until a gc finishes - otherwise we could pointlessly burn a ton of
* cpu
*/
- unsigned invalidate_needs_gc:1;
+ unsigned invalidate_needs_gc;
bool discard; /* Get rid of? */
@@ -593,8 +594,8 @@ struct cache_set {
/* Counts how many sectors bio_insert has added to the cache */
atomic_t sectors_to_gc;
+ wait_queue_head_t gc_wait;
- wait_queue_head_t moving_gc_wait;
struct keybuf moving_gc_keys;
/* Number of moving GC bios in flight */
struct semaphore moving_in_flight;
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 81d3db40cd7b..2efdce07247c 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1757,32 +1757,34 @@ static void bch_btree_gc(struct cache_set *c)
bch_moving_gc(c);
}
-static int bch_gc_thread(void *arg)
+static bool gc_should_run(struct cache_set *c)
{
- struct cache_set *c = arg;
struct cache *ca;
unsigned i;
- while (1) {
-again:
- bch_btree_gc(c);
+ for_each_cache(ca, c, i)
+ if (ca->invalidate_needs_gc)
+ return true;
- set_current_state(TASK_INTERRUPTIBLE);
- if (kthread_should_stop())
- break;
+ if (atomic_read(&c->sectors_to_gc) < 0)
+ return true;
- mutex_lock(&c->bucket_lock);
+ return false;
+}
- for_each_cache(ca, c, i)
- if (ca->invalidate_needs_gc) {
- mutex_unlock(&c->bucket_lock);
- set_current_state(TASK_RUNNING);
- goto again;
- }
+static int bch_gc_thread(void *arg)
+{
+ struct cache_set *c = arg;
- mutex_unlock(&c->bucket_lock);
+ while (1) {
+ wait_event_interruptible(c->gc_wait,
+ kthread_should_stop() || gc_should_run(c));
- schedule();
+ if (kthread_should_stop())
+ break;
+
+ set_gc_sectors(c);
+ bch_btree_gc(c);
}
return 0;
@@ -1790,11 +1792,10 @@ again:
int bch_gc_thread_start(struct cache_set *c)
{
- c->gc_thread = kthread_create(bch_gc_thread, c, "bcache_gc");
+ c->gc_thread = kthread_run(bch_gc_thread, c, "bcache_gc");
if (IS_ERR(c->gc_thread))
return PTR_ERR(c->gc_thread);
- set_task_state(c->gc_thread, TASK_INTERRUPTIBLE);
return 0;
}
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 5c391fa01bed..9b80417cd547 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -260,8 +260,7 @@ void bch_initial_mark_key(struct cache_set *, int, struct bkey *);
static inline void wake_up_gc(struct cache_set *c)
{
- if (c->gc_thread)
- wake_up_process(c->gc_thread);
+ wake_up(&c->gc_wait);
}
#define MAP_DONE 0
diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c
index 243de0bf15cd..4bf15182c4da 100644
--- a/drivers/md/bcache/extents.c
+++ b/drivers/md/bcache/extents.c
@@ -584,7 +584,7 @@ static bool bch_extent_merge(struct btree_keys *bk, struct bkey *l, struct bkey
return false;
for (i = 0; i < KEY_PTRS(l); i++)
- if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] ||
+ if (l->ptr[i] + MAKE_PTR(0, KEY_SIZE(l), 0) != r->ptr[i] ||
PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i))
return false;
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 6925023e12d4..08f20b7cd199 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -508,7 +508,7 @@ static void journal_reclaim(struct cache_set *c)
continue;
ja->cur_idx = next;
- k->ptr[n++] = PTR(0,
+ k->ptr[n++] = MAKE_PTR(0,
bucket_to_sector(c, ca->sb.d[ja->cur_idx]),
ca->sb.nr_this_dev);
}
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 40ffe5e424b3..e0f1c6d534fe 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -196,14 +196,12 @@ static void bch_data_insert_start(struct closure *cl)
struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
struct bio *bio = op->bio, *n;
- if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) {
- set_gc_sectors(op->c);
- wake_up_gc(op->c);
- }
-
if (op->bypass)
return bch_data_invalidate(cl);
+ if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0)
+ wake_up_gc(op->c);
+
/*
* Journal writes are marked REQ_PREFLUSH; if the original write was a
* flush, it'll wait on the journal write.
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 849ad441cd76..f4557f558b24 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1025,7 +1025,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
}
if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
- bch_sectors_dirty_init(dc);
+ bch_sectors_dirty_init(&dc->disk);
atomic_set(&dc->has_dirty, 1);
atomic_inc(&dc->count);
bch_writeback_queue(dc);
@@ -1058,6 +1058,8 @@ static void cached_dev_free(struct closure *cl)
cancel_delayed_work_sync(&dc->writeback_rate_update);
if (!IS_ERR_OR_NULL(dc->writeback_thread))
kthread_stop(dc->writeback_thread);
+ if (dc->writeback_write_wq)
+ destroy_workqueue(dc->writeback_write_wq);
mutex_lock(&bch_register_lock);
@@ -1229,6 +1231,7 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
goto err;
bcache_device_attach(d, c, u - c->uuids);
+ bch_sectors_dirty_init(d);
bch_flash_dev_request_init(d);
add_disk(d->disk);
@@ -1491,6 +1494,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
mutex_init(&c->bucket_lock);
init_waitqueue_head(&c->btree_cache_wait);
init_waitqueue_head(&c->bucket_wait);
+ init_waitqueue_head(&c->gc_wait);
sema_init(&c->uuid_write_mutex, 1);
spin_lock_init(&c->btree_gc_time.lock);
@@ -1550,6 +1554,7 @@ static void run_cache_set(struct cache_set *c)
for_each_cache(ca, c, i)
c->nbuckets += ca->sb.nbuckets;
+ set_gc_sectors(c);
if (CACHE_SYNC(&c->sb)) {
LIST_HEAD(journal);
@@ -1965,6 +1970,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
else
err = "device busy";
mutex_unlock(&bch_register_lock);
+ if (!IS_ERR(bdev))
+ bdput(bdev);
if (attr == &ksysfs_register_quiet)
goto out;
}
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index b3ff57d61dde..4fbb5532f24c 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -191,7 +191,7 @@ STORE(__cached_dev)
{
struct cached_dev *dc = container_of(kobj, struct cached_dev,
disk.kobj);
- unsigned v = size;
+ ssize_t v = size;
struct cache_set *c;
struct kobj_uevent_env *env;
@@ -226,7 +226,7 @@ STORE(__cached_dev)
bch_cached_dev_run(dc);
if (attr == &sysfs_cache_mode) {
- ssize_t v = bch_read_string_list(buf, bch_cache_modes + 1);
+ v = bch_read_string_list(buf, bch_cache_modes + 1);
if (v < 0)
return v;
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index dde6172f3f10..eb70f6894f05 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -73,24 +73,44 @@ STRTO_H(strtouint, unsigned int)
STRTO_H(strtoll, long long)
STRTO_H(strtoull, unsigned long long)
+/**
+ * bch_hprint() - formats @v to human readable string for sysfs.
+ *
+ * @v - signed 64 bit integer
+ * @buf - the (at least 8 byte) buffer to format the result into.
+ *
+ * Returns the number of bytes used by format.
+ */
ssize_t bch_hprint(char *buf, int64_t v)
{
static const char units[] = "?kMGTPEZY";
- char dec[4] = "";
- int u, t = 0;
-
- for (u = 0; v >= 1024 || v <= -1024; u++) {
- t = v & ~(~0 << 10);
- v >>= 10;
- }
-
- if (!u)
- return sprintf(buf, "%llu", v);
-
- if (v < 100 && v > -100)
- snprintf(dec, sizeof(dec), ".%i", t / 100);
-
- return sprintf(buf, "%lli%s%c", v, dec, units[u]);
+ int u = 0, t;
+
+ uint64_t q;
+
+ if (v < 0)
+ q = -v;
+ else
+ q = v;
+
+ /* For as long as the number is more than 3 digits, but at least
+ * once, shift right / divide by 1024. Keep the remainder for
+ * a digit after the decimal point.
+ */
+ do {
+ u++;
+
+ t = q & ~(~0 << 10);
+ q >>= 10;
+ } while (q >= 1000);
+
+ if (v < 0)
+ /* '-', up to 3 digits, '.', 1 digit, 1 character, null;
+ * yields 8 bytes.
+ */
+ return sprintf(buf, "-%llu.%i%c", q, t * 10 / 1024, units[u]);
+ else
+ return sprintf(buf, "%llu.%i%c", q, t * 10 / 1024, units[u]);
}
ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[],
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index e51644e503a5..4ce2b19fe120 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -20,7 +20,8 @@
static void __update_writeback_rate(struct cached_dev *dc)
{
struct cache_set *c = dc->disk.c;
- uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size;
+ uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size -
+ bcache_flash_devs_sectors_dirty(c);
uint64_t cache_dirty_target =
div_u64(cache_sectors * dc->writeback_percent, 100);
@@ -186,7 +187,7 @@ static void write_dirty(struct closure *cl)
closure_bio_submit(&io->bio, cl);
- continue_at(cl, write_dirty_finish, system_wq);
+ continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq);
}
static void read_dirty_endio(struct bio *bio)
@@ -206,7 +207,7 @@ static void read_dirty_submit(struct closure *cl)
closure_bio_submit(&io->bio, cl);
- continue_at(cl, write_dirty, system_wq);
+ continue_at(cl, write_dirty, io->dc->writeback_write_wq);
}
static void read_dirty(struct cached_dev *dc)
@@ -482,17 +483,17 @@ static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
return MAP_CONTINUE;
}
-void bch_sectors_dirty_init(struct cached_dev *dc)
+void bch_sectors_dirty_init(struct bcache_device *d)
{
struct sectors_dirty_init op;
bch_btree_op_init(&op.op, -1);
- op.inode = dc->disk.id;
+ op.inode = d->id;
- bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0),
+ bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0),
sectors_dirty_init_fn, 0);
- dc->disk.sectors_dirty_last = bcache_dev_sectors_dirty(&dc->disk);
+ d->sectors_dirty_last = bcache_dev_sectors_dirty(d);
}
void bch_cached_dev_writeback_init(struct cached_dev *dc)
@@ -516,6 +517,11 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
int bch_cached_dev_writeback_start(struct cached_dev *dc)
{
+ dc->writeback_write_wq = alloc_workqueue("bcache_writeback_wq",
+ WQ_MEM_RECLAIM, 0);
+ if (!dc->writeback_write_wq)
+ return -ENOMEM;
+
dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
"bcache_writeback");
if (IS_ERR(dc->writeback_thread))
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 301eaf565167..cdf8d253209e 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -14,6 +14,25 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
return ret;
}
+static inline uint64_t bcache_flash_devs_sectors_dirty(struct cache_set *c)
+{
+ uint64_t i, ret = 0;
+
+ mutex_lock(&bch_register_lock);
+
+ for (i = 0; i < c->nr_uuids; i++) {
+ struct bcache_device *d = c->devices[i];
+
+ if (!d || !UUID_FLASH_ONLY(&c->uuids[i]))
+ continue;
+ ret += bcache_dev_sectors_dirty(d);
+ }
+
+ mutex_unlock(&bch_register_lock);
+
+ return ret;
+}
+
static inline unsigned offset_to_stripe(struct bcache_device *d,
uint64_t offset)
{
@@ -85,7 +104,7 @@ static inline void bch_writeback_add(struct cached_dev *dc)
void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int);
-void bch_sectors_dirty_init(struct cached_dev *dc);
+void bch_sectors_dirty_init(struct bcache_device *);
void bch_cached_dev_writeback_init(struct cached_dev *);
int bch_cached_dev_writeback_start(struct cached_dev *);
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 2d826927a3bf..fb02c3979bf4 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1992,6 +1992,11 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
long pages;
struct bitmap_page *new_bp;
+ if (bitmap->storage.file && !init) {
+ pr_info("md: cannot resize file-based bitmap\n");
+ return -EINVAL;
+ }
+
if (chunksize == 0) {
/* If there is enough space, leave the chunk size unchanged,
* else increase by factor of two until there is enough space.
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 125aedc3875f..7643f72adb1c 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -215,7 +215,7 @@ static DEFINE_SPINLOCK(param_spinlock);
* Buffers are freed after this timeout
*/
static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
-static unsigned dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;
+static unsigned long dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;
static unsigned long dm_bufio_peak_allocated;
static unsigned long dm_bufio_allocated_kmem_cache;
@@ -923,10 +923,11 @@ static void __get_memory_limit(struct dm_bufio_client *c,
{
unsigned long buffers;
- if (ACCESS_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch) {
- mutex_lock(&dm_bufio_clients_lock);
- __cache_size_refresh();
- mutex_unlock(&dm_bufio_clients_lock);
+ if (unlikely(ACCESS_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch)) {
+ if (mutex_trylock(&dm_bufio_clients_lock)) {
+ __cache_size_refresh();
+ mutex_unlock(&dm_bufio_clients_lock);
+ }
}
buffers = dm_bufio_cache_size_per_client >>
@@ -936,7 +937,8 @@ static void __get_memory_limit(struct dm_bufio_client *c,
buffers = c->minimum_buffers;
*limit_buffers = buffers;
- *threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100;
+ *threshold_buffers = mult_frac(buffers,
+ DM_BUFIO_WRITEBACK_PERCENT, 100);
}
/*
@@ -1540,10 +1542,10 @@ static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp)
return true;
}
-static unsigned get_retain_buffers(struct dm_bufio_client *c)
+static unsigned long get_retain_buffers(struct dm_bufio_client *c)
{
- unsigned retain_bytes = ACCESS_ONCE(dm_bufio_retain_bytes);
- return retain_bytes / c->block_size;
+ unsigned long retain_bytes = ACCESS_ONCE(dm_bufio_retain_bytes);
+ return retain_bytes >> (c->sectors_per_block_bits + SECTOR_SHIFT);
}
static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
@@ -1553,7 +1555,7 @@ static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
struct dm_buffer *b, *tmp;
unsigned long freed = 0;
unsigned long count = nr_to_scan;
- unsigned retain_target = get_retain_buffers(c);
+ unsigned long retain_target = get_retain_buffers(c);
for (l = 0; l < LIST_SIZE; l++) {
list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) {
@@ -1779,11 +1781,19 @@ static bool older_than(struct dm_buffer *b, unsigned long age_hz)
static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
{
struct dm_buffer *b, *tmp;
- unsigned retain_target = get_retain_buffers(c);
- unsigned count;
+ unsigned long retain_target = get_retain_buffers(c);
+ unsigned long count;
+ LIST_HEAD(write_list);
dm_bufio_lock(c);
+ __check_watermark(c, &write_list);
+ if (unlikely(!list_empty(&write_list))) {
+ dm_bufio_unlock(c);
+ __flush_write_list(&write_list);
+ dm_bufio_lock(c);
+ }
+
count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_CLEAN], lru_list) {
if (count <= retain_target)
@@ -1808,6 +1818,8 @@ static void cleanup_old_buffers(void)
mutex_lock(&dm_bufio_clients_lock);
+ __cache_size_refresh();
+
list_for_each_entry(c, &dm_bufio_all_clients, client_list)
__evict_old_buffers(c, max_age_hz);
@@ -1845,19 +1857,15 @@ static int __init dm_bufio_init(void)
memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches);
memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names);
- mem = (__u64)((totalram_pages - totalhigh_pages) *
- DM_BUFIO_MEMORY_PERCENT / 100) << PAGE_SHIFT;
+ mem = (__u64)mult_frac(totalram_pages - totalhigh_pages,
+ DM_BUFIO_MEMORY_PERCENT, 100) << PAGE_SHIFT;
if (mem > ULONG_MAX)
mem = ULONG_MAX;
#ifdef CONFIG_MMU
- /*
- * Get the size of vmalloc space the same way as VMALLOC_TOTAL
- * in fs/proc/internal.h
- */
- if (mem > (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100)
- mem = (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100;
+ if (mem > mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100))
+ mem = mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100);
#endif
dm_bufio_default_cache_size = mem;
@@ -1930,7 +1938,7 @@ MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");
module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");
-module_param_named(retain_bytes, dm_bufio_retain_bytes, uint, S_IRUGO | S_IWUSR);
+module_param_named(retain_bytes, dm_bufio_retain_bytes, ulong, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory");
module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR);
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index 695577812cf6..6937ca42be8c 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -1383,17 +1383,19 @@ void dm_cache_metadata_set_stats(struct dm_cache_metadata *cmd,
int dm_cache_commit(struct dm_cache_metadata *cmd, bool clean_shutdown)
{
- int r;
+ int r = -EINVAL;
flags_mutator mutator = (clean_shutdown ? set_clean_shutdown :
clear_clean_shutdown);
WRITE_LOCK(cmd);
+ if (cmd->fail_io)
+ goto out;
+
r = __commit_transaction(cmd, mutator);
if (r)
goto out;
r = __begin_transaction(cmd);
-
out:
WRITE_UNLOCK(cmd);
return r;
@@ -1405,7 +1407,8 @@ int dm_cache_get_free_metadata_block_count(struct dm_cache_metadata *cmd,
int r = -EINVAL;
READ_LOCK(cmd);
- r = dm_sm_get_nr_free(cmd->metadata_sm, result);
+ if (!cmd->fail_io)
+ r = dm_sm_get_nr_free(cmd->metadata_sm, result);
READ_UNLOCK(cmd);
return r;
@@ -1417,7 +1420,8 @@ int dm_cache_get_metadata_dev_size(struct dm_cache_metadata *cmd,
int r = -EINVAL;
READ_LOCK(cmd);
- r = dm_sm_get_nr_blocks(cmd->metadata_sm, result);
+ if (!cmd->fail_io)
+ r = dm_sm_get_nr_blocks(cmd->metadata_sm, result);
READ_UNLOCK(cmd);
return r;
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 59b2c50562e4..c817627d09ca 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -248,7 +248,7 @@ struct cache {
/*
* Fields for converting from sectors to blocks.
*/
- uint32_t sectors_per_block;
+ sector_t sectors_per_block;
int sectors_per_block_shift;
spinlock_t lock;
@@ -3546,11 +3546,11 @@ static void cache_status(struct dm_target *ti, status_type_t type,
residency = policy_residency(cache->policy);
- DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %lu ",
+ DMEMIT("%u %llu/%llu %llu %llu/%llu %u %u %u %u %u %u %lu ",
(unsigned)DM_CACHE_METADATA_BLOCK_SIZE,
(unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
(unsigned long long)nr_blocks_metadata,
- cache->sectors_per_block,
+ (unsigned long long)cache->sectors_per_block,
(unsigned long long) from_cblock(residency),
(unsigned long long) from_cblock(cache->cache_size),
(unsigned) atomic_read(&cache->stats.read_hit),
diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index 40ceba1fe8be..1609d4971104 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -29,7 +29,6 @@ struct dm_kobject_holder {
* DM targets must _not_ deference a mapped_device to directly access its members!
*/
struct mapped_device {
- struct srcu_struct io_barrier;
struct mutex suspend_lock;
/*
@@ -127,6 +126,8 @@ struct mapped_device {
struct blk_mq_tag_set *tag_set;
bool use_blk_mq:1;
bool init_tio_pdu:1;
+
+ struct srcu_struct io_barrier;
};
void dm_init_md_queue(struct mapped_device *md);
diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c
index bf2b2676cb8a..80e3df1f1f7d 100644
--- a/drivers/md/dm-era-target.c
+++ b/drivers/md/dm-era-target.c
@@ -961,15 +961,15 @@ static int metadata_commit(struct era_metadata *md)
}
}
- r = save_sm_root(md);
+ r = dm_tm_pre_commit(md->tm);
if (r) {
- DMERR("%s: save_sm_root failed", __func__);
+ DMERR("%s: pre commit failed", __func__);
return r;
}
- r = dm_tm_pre_commit(md->tm);
+ r = save_sm_root(md);
if (r) {
- DMERR("%s: pre commit failed", __func__);
+ DMERR("%s: save_sm_root failed", __func__);
return r;
}
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 966eb4b61aed..a68c650aad11 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1847,7 +1847,7 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
if (r)
goto out;
- param->data_size = sizeof(*param);
+ param->data_size = offsetof(struct dm_ioctl, data);
r = fn(param, input_param_size);
if (unlikely(param->flags & DM_BUFFER_FULL_FLAG) &&
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index e477af8596e2..0d437c98ab08 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -119,7 +119,8 @@ static struct kmem_cache *_mpio_cache;
static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
static void trigger_event(struct work_struct *work);
-static void activate_path(struct work_struct *work);
+static void activate_or_offline_path(struct pgpath *pgpath);
+static void activate_path_work(struct work_struct *work);
static void process_queued_bios(struct work_struct *work);
/*-----------------------------------------------
@@ -144,7 +145,7 @@ static struct pgpath *alloc_pgpath(void)
if (pgpath) {
pgpath->is_active = true;
- INIT_DELAYED_WORK(&pgpath->activate_path, activate_path);
+ INIT_DELAYED_WORK(&pgpath->activate_path, activate_path_work);
}
return pgpath;
@@ -430,7 +431,7 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes)
unsigned long flags;
struct priority_group *pg;
struct pgpath *pgpath;
- bool bypassed = true;
+ unsigned bypassed = 1;
if (!atomic_read(&m->nr_valid_paths)) {
clear_bit(MPATHF_QUEUE_IO, &m->flags);
@@ -469,7 +470,7 @@ check_current_pg:
*/
do {
list_for_each_entry(pg, &m->priority_groups, list) {
- if (pg->bypassed == bypassed)
+ if (pg->bypassed == !!bypassed)
continue;
pgpath = choose_path_in_pg(m, pg, nr_bytes);
if (!IS_ERR_OR_NULL(pgpath)) {
@@ -1515,10 +1516,8 @@ out:
spin_unlock_irqrestore(&m->lock, flags);
}
-static void activate_path(struct work_struct *work)
+static void activate_or_offline_path(struct pgpath *pgpath)
{
- struct pgpath *pgpath =
- container_of(work, struct pgpath, activate_path.work);
struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);
if (pgpath->is_active && !blk_queue_dying(q))
@@ -1527,6 +1526,14 @@ static void activate_path(struct work_struct *work)
pg_init_done(pgpath, SCSI_DH_DEV_OFFLINED);
}
+static void activate_path_work(struct work_struct *work)
+{
+ struct pgpath *pgpath =
+ container_of(work, struct pgpath, activate_path.work);
+
+ activate_or_offline_path(pgpath);
+}
+
static int noretry_error(int error)
{
switch (error) {
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index af2d79b52484..ee75e3510be6 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -3589,7 +3589,7 @@ static int raid_preresume(struct dm_target *ti)
return r;
/* Resize bitmap to adjust to changed region size (aka MD bitmap chunksize) */
- if (test_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags) &&
+ if (test_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags) && mddev->bitmap &&
mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)) {
r = bitmap_resize(mddev->bitmap, mddev->dev_sectors,
to_bytes(rs->requested_bitmap_chunk_sectors), 0);
@@ -3621,6 +3621,8 @@ static int raid_preresume(struct dm_target *ti)
return r;
}
+#define RESUME_STAY_FROZEN_FLAGS (CTR_FLAG_DELTA_DISKS | CTR_FLAG_DATA_OFFSET)
+
static void raid_resume(struct dm_target *ti)
{
struct raid_set *rs = ti->private;
@@ -3638,7 +3640,15 @@ static void raid_resume(struct dm_target *ti)
mddev->ro = 0;
mddev->in_sync = 0;
- clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ /*
+ * Keep the RAID set frozen if reshape/rebuild flags are set.
+ * The RAID set is unfrozen once the next table load/resume,
+ * which clears the reshape/rebuild flags, occurs.
+ * This ensures that the constructor for the inactive table
+ * retrieves an up-to-date reshape_position.
+ */
+ if (!(rs->ctr_flags & RESUME_STAY_FROZEN_FLAGS))
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
if (mddev->suspended)
mddev_resume(mddev);
diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c
index 6c25213ab38c..bdbb7e6e8212 100644
--- a/drivers/md/dm-round-robin.c
+++ b/drivers/md/dm-round-robin.c
@@ -17,8 +17,8 @@
#include <linux/module.h>
#define DM_MSG_PREFIX "multipath round-robin"
-#define RR_MIN_IO 1000
-#define RR_VERSION "1.1.0"
+#define RR_MIN_IO 1
+#define RR_VERSION "1.2.0"
/*-----------------------------------------------------------------
* Path-handling code, paths are held in lists
@@ -47,44 +47,19 @@ struct selector {
struct list_head valid_paths;
struct list_head invalid_paths;
spinlock_t lock;
- struct dm_path * __percpu *current_path;
- struct percpu_counter repeat_count;
};
-static void set_percpu_current_path(struct selector *s, struct dm_path *path)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- *per_cpu_ptr(s->current_path, cpu) = path;
-}
-
static struct selector *alloc_selector(void)
{
struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
- if (!s)
- return NULL;
-
- INIT_LIST_HEAD(&s->valid_paths);
- INIT_LIST_HEAD(&s->invalid_paths);
- spin_lock_init(&s->lock);
-
- s->current_path = alloc_percpu(struct dm_path *);
- if (!s->current_path)
- goto out_current_path;
- set_percpu_current_path(s, NULL);
-
- if (percpu_counter_init(&s->repeat_count, 0, GFP_KERNEL))
- goto out_repeat_count;
+ if (s) {
+ INIT_LIST_HEAD(&s->valid_paths);
+ INIT_LIST_HEAD(&s->invalid_paths);
+ spin_lock_init(&s->lock);
+ }
return s;
-
-out_repeat_count:
- free_percpu(s->current_path);
-out_current_path:
- kfree(s);
- return NULL;;
}
static int rr_create(struct path_selector *ps, unsigned argc, char **argv)
@@ -105,8 +80,6 @@ static void rr_destroy(struct path_selector *ps)
free_paths(&s->valid_paths);
free_paths(&s->invalid_paths);
- free_percpu(s->current_path);
- percpu_counter_destroy(&s->repeat_count);
kfree(s);
ps->context = NULL;
}
@@ -157,6 +130,11 @@ static int rr_add_path(struct path_selector *ps, struct dm_path *path,
return -EINVAL;
}
+ if (repeat_count > 1) {
+ DMWARN_LIMIT("repeat_count > 1 is deprecated, using 1 instead");
+ repeat_count = 1;
+ }
+
/* allocate the path */
pi = kmalloc(sizeof(*pi), GFP_KERNEL);
if (!pi) {
@@ -183,9 +161,6 @@ static void rr_fail_path(struct path_selector *ps, struct dm_path *p)
struct path_info *pi = p->pscontext;
spin_lock_irqsave(&s->lock, flags);
- if (p == *this_cpu_ptr(s->current_path))
- set_percpu_current_path(s, NULL);
-
list_move(&pi->list, &s->invalid_paths);
spin_unlock_irqrestore(&s->lock, flags);
}
@@ -208,29 +183,15 @@ static struct dm_path *rr_select_path(struct path_selector *ps, size_t nr_bytes)
unsigned long flags;
struct selector *s = ps->context;
struct path_info *pi = NULL;
- struct dm_path *current_path = NULL;
-
- local_irq_save(flags);
- current_path = *this_cpu_ptr(s->current_path);
- if (current_path) {
- percpu_counter_dec(&s->repeat_count);
- if (percpu_counter_read_positive(&s->repeat_count) > 0) {
- local_irq_restore(flags);
- return current_path;
- }
- }
- spin_lock(&s->lock);
+ spin_lock_irqsave(&s->lock, flags);
if (!list_empty(&s->valid_paths)) {
pi = list_entry(s->valid_paths.next, struct path_info, list);
list_move_tail(&pi->list, &s->valid_paths);
- percpu_counter_set(&s->repeat_count, pi->repeat_count);
- set_percpu_current_path(s, pi->path);
- current_path = pi->path;
}
spin_unlock_irqrestore(&s->lock, flags);
- return current_path;
+ return pi ? pi->path : NULL;
}
static struct path_selector_type rr_ps = {
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 2c965424d383..ba7c4c685db3 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -997,10 +997,14 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t)
dm_init_md_queue(md);
/* backfill 'mq' sysfs registration normally done in blk_register_queue */
- blk_mq_register_dev(disk_to_dev(md->disk), q);
+ err = blk_mq_register_dev(disk_to_dev(md->disk), q);
+ if (err)
+ goto out_cleanup_queue;
return 0;
+out_cleanup_queue:
+ blk_cleanup_queue(q);
out_tag_set:
blk_mq_free_tag_set(md->tag_set);
out_kfree_tag_set:
diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
index 38b05f23b96c..0250e7e521ab 100644
--- a/drivers/md/dm-stats.c
+++ b/drivers/md/dm-stats.c
@@ -175,6 +175,7 @@ static void dm_stat_free(struct rcu_head *head)
int cpu;
struct dm_stat *s = container_of(head, struct dm_stat, rcu_head);
+ kfree(s->histogram_boundaries);
kfree(s->program_id);
kfree(s->aux_data);
for_each_possible_cpu(cpu) {
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index a15091a0d40c..4477bf930cf4 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -485,11 +485,11 @@ static int __write_initial_superblock(struct dm_pool_metadata *pmd)
if (r < 0)
return r;
- r = save_sm_roots(pmd);
+ r = dm_tm_pre_commit(pmd->tm);
if (r < 0)
return r;
- r = dm_tm_pre_commit(pmd->tm);
+ r = save_sm_roots(pmd);
if (r < 0)
return r;
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index d1c05c12a9db..0b678b5da4c4 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -1070,6 +1070,7 @@ static void passdown_endio(struct bio *bio)
* to unmap (we ignore err).
*/
queue_passdown_pt2(bio->bi_private);
+ bio_put(bio);
}
static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m)
@@ -1094,6 +1095,19 @@ static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m)
return;
}
+ /*
+ * Increment the unmapped blocks. This prevents a race between the
+ * passdown io and reallocation of freed blocks.
+ */
+ r = dm_pool_inc_data_range(pool->pmd, m->data_block, data_end);
+ if (r) {
+ metadata_operation_failed(pool, "dm_pool_inc_data_range", r);
+ bio_io_error(m->bio);
+ cell_defer_no_holder(tc, m->cell);
+ mempool_free(m, pool->mapping_pool);
+ return;
+ }
+
discard_parent = bio_alloc(GFP_NOIO, 1);
if (!discard_parent) {
DMWARN("%s: unable to allocate top level discard bio for passdown. Skipping passdown.",
@@ -1114,19 +1128,6 @@ static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m)
end_discard(&op, r);
}
}
-
- /*
- * Increment the unmapped blocks. This prevents a race between the
- * passdown io and reallocation of freed blocks.
- */
- r = dm_pool_inc_data_range(pool->pmd, m->data_block, data_end);
- if (r) {
- metadata_operation_failed(pool, "dm_pool_inc_data_range", r);
- bio_io_error(m->bio);
- cell_defer_no_holder(tc, m->cell);
- mempool_free(m, pool->mapping_pool);
- return;
- }
}
static void process_prepared_discard_passdown_pt2(struct dm_thin_new_mapping *m)
diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c
index 0f0eb8a3d922..78f36012eaca 100644
--- a/drivers/md/dm-verity-fec.c
+++ b/drivers/md/dm-verity-fec.c
@@ -146,8 +146,6 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio,
block = fec_buffer_rs_block(v, fio, n, i);
res = fec_decode_rs8(v, fio, block, &par[offset], neras);
if (res < 0) {
- dm_bufio_release(buf);
-
r = res;
goto error;
}
@@ -172,6 +170,8 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio,
done:
r = corrected;
error:
+ dm_bufio_release(buf);
+
if (r < 0 && neras)
DMERR_LIMIT("%s: FEC %llu: failed to correct: %d",
v->data_dev->name, (unsigned long long)rsb, r);
@@ -269,7 +269,7 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io,
&is_zero) == 0) {
/* skip known zero blocks entirely */
if (is_zero)
- continue;
+ goto done;
/*
* skip if we have already found the theoretical
@@ -439,6 +439,13 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io,
if (!verity_fec_is_enabled(v))
return -EOPNOTSUPP;
+ if (fio->level >= DM_VERITY_FEC_MAX_RECURSION) {
+ DMWARN_LIMIT("%s: FEC: recursion too deep", v->data_dev->name);
+ return -EIO;
+ }
+
+ fio->level++;
+
if (type == DM_VERITY_BLOCK_TYPE_METADATA)
block += v->data_blocks;
@@ -470,7 +477,7 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io,
if (r < 0) {
r = fec_decode_rsb(v, io, fio, rsb, offset, true);
if (r < 0)
- return r;
+ goto done;
}
if (dest)
@@ -480,6 +487,8 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io,
r = verity_for_bv_block(v, io, iter, fec_bv_copy);
}
+done:
+ fio->level--;
return r;
}
@@ -520,6 +529,7 @@ void verity_fec_init_io(struct dm_verity_io *io)
memset(fio->bufs, 0, sizeof(fio->bufs));
fio->nbufs = 0;
fio->output = NULL;
+ fio->level = 0;
}
/*
diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h
index 7fa0298b995e..bb31ce87a933 100644
--- a/drivers/md/dm-verity-fec.h
+++ b/drivers/md/dm-verity-fec.h
@@ -27,6 +27,9 @@
#define DM_VERITY_FEC_BUF_MAX \
(1 << (PAGE_SHIFT - DM_VERITY_FEC_BUF_RS_BITS))
+/* maximum recursion level for verity_fec_decode */
+#define DM_VERITY_FEC_MAX_RECURSION 4
+
#define DM_VERITY_OPT_FEC_DEV "use_fec_from_device"
#define DM_VERITY_OPT_FEC_BLOCKS "fec_blocks"
#define DM_VERITY_OPT_FEC_START "fec_start"
@@ -58,6 +61,7 @@ struct dm_verity_fec_io {
unsigned nbufs; /* number of buffers allocated */
u8 *output; /* buffer for corrected output */
size_t output_pos;
+ unsigned level; /* recursion level */
};
#ifdef CONFIG_DM_VERITY_FEC
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index ef7bf1dd6900..c5522551122f 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -21,6 +21,7 @@
#include <linux/delay.h>
#include <linux/wait.h>
#include <linux/pr.h>
+#include <linux/vmalloc.h>
#define DM_MSG_PREFIX "core"
@@ -972,10 +973,64 @@ void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
}
EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
+/*
+ * Flush current->bio_list when the target map method blocks.
+ * This fixes deadlocks in snapshot and possibly in other targets.
+ */
+struct dm_offload {
+ struct blk_plug plug;
+ struct blk_plug_cb cb;
+};
+
+static void flush_current_bio_list(struct blk_plug_cb *cb, bool from_schedule)
+{
+ struct dm_offload *o = container_of(cb, struct dm_offload, cb);
+ struct bio_list list;
+ struct bio *bio;
+ int i;
+
+ INIT_LIST_HEAD(&o->cb.list);
+
+ if (unlikely(!current->bio_list))
+ return;
+
+ for (i = 0; i < 2; i++) {
+ list = current->bio_list[i];
+ bio_list_init(&current->bio_list[i]);
+
+ while ((bio = bio_list_pop(&list))) {
+ struct bio_set *bs = bio->bi_pool;
+ if (unlikely(!bs) || bs == fs_bio_set) {
+ bio_list_add(&current->bio_list[i], bio);
+ continue;
+ }
+
+ spin_lock(&bs->rescue_lock);
+ bio_list_add(&bs->rescue_list, bio);
+ queue_work(bs->rescue_workqueue, &bs->rescue_work);
+ spin_unlock(&bs->rescue_lock);
+ }
+ }
+}
+
+static void dm_offload_start(struct dm_offload *o)
+{
+ blk_start_plug(&o->plug);
+ o->cb.callback = flush_current_bio_list;
+ list_add(&o->cb.list, &current->plug->cb_list);
+}
+
+static void dm_offload_end(struct dm_offload *o)
+{
+ list_del(&o->cb.list);
+ blk_finish_plug(&o->plug);
+}
+
static void __map_bio(struct dm_target_io *tio)
{
int r;
sector_t sector;
+ struct dm_offload o;
struct bio *clone = &tio->clone;
struct dm_target *ti = tio->ti;
@@ -988,7 +1043,11 @@ static void __map_bio(struct dm_target_io *tio)
*/
atomic_inc(&tio->io->io_count);
sector = clone->bi_iter.bi_sector;
+
+ dm_offload_start(&o);
r = ti->type->map(ti, clone);
+ dm_offload_end(&o);
+
if (r == DM_MAPIO_REMAPPED) {
/* the bio has been remapped so dispatch it */
@@ -1453,7 +1512,7 @@ static struct mapped_device *alloc_dev(int minor)
struct mapped_device *md;
void *old_md;
- md = kzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id);
+ md = vzalloc_node(sizeof(*md), numa_node_id);
if (!md) {
DMWARN("unable to allocate device, out of memory.");
return NULL;
@@ -1547,7 +1606,7 @@ bad_io_barrier:
bad_minor:
module_put(THIS_MODULE);
bad_module_get:
- kfree(md);
+ kvfree(md);
return NULL;
}
@@ -1566,7 +1625,7 @@ static void free_dev(struct mapped_device *md)
free_minor(minor);
module_put(THIS_MODULE);
- kfree(md);
+ kvfree(md);
}
static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
@@ -2456,11 +2515,15 @@ struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
md = container_of(kobj, struct mapped_device, kobj_holder.kobj);
- if (test_bit(DMF_FREEING, &md->flags) ||
- dm_deleting_md(md))
- return NULL;
-
+ spin_lock(&_minor_lock);
+ if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
+ md = NULL;
+ goto out;
+ }
dm_get(md);
+out:
+ spin_unlock(&_minor_lock);
+
return md;
}
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 86f5d435901d..12abf69d568a 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -52,18 +52,26 @@ static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector)
return conf->disks + lo;
}
+/*
+ * In linear_congested() conf->raid_disks is used as a copy of
+ * mddev->raid_disks to iterate conf->disks[], because conf->raid_disks
+ * and conf->disks[] are created in linear_conf(), they are always
+ * consitent with each other, but mddev->raid_disks does not.
+ */
static int linear_congested(struct mddev *mddev, int bits)
{
struct linear_conf *conf;
int i, ret = 0;
- conf = mddev->private;
+ rcu_read_lock();
+ conf = rcu_dereference(mddev->private);
- for (i = 0; i < mddev->raid_disks && !ret ; i++) {
+ for (i = 0; i < conf->raid_disks && !ret ; i++) {
struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev);
ret |= bdi_congested(&q->backing_dev_info, bits);
}
+ rcu_read_unlock();
return ret;
}
@@ -143,6 +151,19 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
conf->disks[i-1].end_sector +
conf->disks[i].rdev->sectors;
+ /*
+ * conf->raid_disks is copy of mddev->raid_disks. The reason to
+ * keep a copy of mddev->raid_disks in struct linear_conf is,
+ * mddev->raid_disks may not be consistent with pointers number of
+ * conf->disks[] when it is updated in linear_add() and used to
+ * iterate old conf->disks[] earray in linear_congested().
+ * Here conf->raid_disks is always consitent with number of
+ * pointers in conf->disks[] array, and mddev->private is updated
+ * with rcu_assign_pointer() in linear_addr(), such race can be
+ * avoided.
+ */
+ conf->raid_disks = raid_disks;
+
return conf;
out:
@@ -195,15 +216,24 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
if (!newconf)
return -ENOMEM;
+ /* newconf->raid_disks already keeps a copy of * the increased
+ * value of mddev->raid_disks, WARN_ONCE() is just used to make
+ * sure of this. It is possible that oldconf is still referenced
+ * in linear_congested(), therefore kfree_rcu() is used to free
+ * oldconf until no one uses it anymore.
+ */
mddev_suspend(mddev);
- oldconf = mddev->private;
+ oldconf = rcu_dereference_protected(mddev->private,
+ lockdep_is_held(&mddev->reconfig_mutex));
mddev->raid_disks++;
- mddev->private = newconf;
+ WARN_ONCE(mddev->raid_disks != newconf->raid_disks,
+ "copied raid_disks doesn't match mddev->raid_disks");
+ rcu_assign_pointer(mddev->private, newconf);
md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
set_capacity(mddev->gendisk, mddev->array_sectors);
mddev_resume(mddev);
revalidate_disk(mddev->gendisk);
- kfree(oldconf);
+ kfree_rcu(oldconf, rcu);
return 0;
}
diff --git a/drivers/md/linear.h b/drivers/md/linear.h
index b685ddd7d7f7..8d392e6098b3 100644
--- a/drivers/md/linear.h
+++ b/drivers/md/linear.h
@@ -10,6 +10,7 @@ struct linear_conf
{
struct rcu_head rcu;
sector_t array_sectors;
+ int raid_disks; /* a copy of mddev->raid_disks */
struct dev_info disks[0];
};
#endif
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 24925f2aa235..8ebf1b97e1d2 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1861,7 +1861,7 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
}
sb = page_address(rdev->sb_page);
sb->data_size = cpu_to_le64(num_sectors);
- sb->super_offset = rdev->sb_start;
+ sb->super_offset = cpu_to_le64(rdev->sb_start);
sb->sb_csum = calc_sb_1_csum(sb);
md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
rdev->sb_page);
@@ -2270,7 +2270,7 @@ static bool does_sb_need_changing(struct mddev *mddev)
/* Check if any mddev parameters have changed */
if ((mddev->dev_sectors != le64_to_cpu(sb->size)) ||
(mddev->reshape_position != le64_to_cpu(sb->reshape_position)) ||
- (mddev->layout != le64_to_cpu(sb->layout)) ||
+ (mddev->layout != le32_to_cpu(sb->layout)) ||
(mddev->raid_disks != le32_to_cpu(sb->raid_disks)) ||
(mddev->chunk_sectors != le32_to_cpu(sb->chunksize)))
return true;
@@ -6752,6 +6752,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
void __user *argp = (void __user *)arg;
struct mddev *mddev = NULL;
int ro;
+ bool did_set_md_closing = false;
if (!md_ioctl_valid(cmd))
return -ENOTTY;
@@ -6841,7 +6842,9 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
err = -EBUSY;
goto out;
}
+ WARN_ON_ONCE(test_bit(MD_CLOSING, &mddev->flags));
set_bit(MD_CLOSING, &mddev->flags);
+ did_set_md_closing = true;
mutex_unlock(&mddev->open_mutex);
sync_blockdev(bdev);
}
@@ -7041,6 +7044,8 @@ unlock:
mddev->hold_active = 0;
mddev_unlock(mddev);
out:
+ if(did_set_md_closing)
+ clear_bit(MD_CLOSING, &mddev->flags);
return err;
}
#ifdef CONFIG_COMPAT
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index 20a40329d84a..7a75b5010f73 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -897,8 +897,12 @@ static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest,
else
*result_key = le64_to_cpu(ro_node(s)->keys[0]);
- if (next_block || flags & INTERNAL_NODE)
- block = value64(ro_node(s), i);
+ if (next_block || flags & INTERNAL_NODE) {
+ if (find_highest)
+ block = value64(ro_node(s), i);
+ else
+ block = value64(ro_node(s), 0);
+ }
} while (flags & INTERNAL_NODE);
diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c
index ebb280a14325..32adf6b4a9c7 100644
--- a/drivers/md/persistent-data/dm-space-map-disk.c
+++ b/drivers/md/persistent-data/dm-space-map-disk.c
@@ -142,10 +142,23 @@ static int sm_disk_inc_block(struct dm_space_map *sm, dm_block_t b)
static int sm_disk_dec_block(struct dm_space_map *sm, dm_block_t b)
{
+ int r;
+ uint32_t old_count;
enum allocation_event ev;
struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
- return sm_ll_dec(&smd->ll, b, &ev);
+ r = sm_ll_dec(&smd->ll, b, &ev);
+ if (!r && (ev == SM_FREE)) {
+ /*
+ * It's only free if it's also free in the last
+ * transaction.
+ */
+ r = sm_ll_lookup(&smd->old_ll, b, &old_count);
+ if (!r && !old_count)
+ smd->nr_allocated_this_transaction--;
+ }
+
+ return r;
}
static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 29e2df5cd77b..81a78757bc78 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1073,7 +1073,7 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio)
*/
DEFINE_WAIT(w);
for (;;) {
- flush_signals(current);
+ sigset_t full, old;
prepare_to_wait(&conf->wait_barrier,
&w, TASK_INTERRUPTIBLE);
if (bio_end_sector(bio) <= mddev->suspend_lo ||
@@ -1082,7 +1082,10 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio)
!md_cluster_ops->area_resyncing(mddev, WRITE,
bio->bi_iter.bi_sector, bio_end_sector(bio))))
break;
+ sigfillset(&full);
+ sigprocmask(SIG_BLOCK, &full, &old);
schedule();
+ sigprocmask(SIG_SETMASK, &old, NULL);
}
finish_wait(&conf->wait_barrier, &w);
}
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 39fddda2fef2..b19b551bb34b 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -941,7 +941,8 @@ static void wait_barrier(struct r10conf *conf)
!conf->barrier ||
(atomic_read(&conf->nr_pending) &&
current->bio_list &&
- !bio_list_empty(current->bio_list)),
+ (!bio_list_empty(&current->bio_list[0]) ||
+ !bio_list_empty(&current->bio_list[1]))),
conf->resync_lock);
conf->nr_waiting--;
if (!conf->nr_waiting)
@@ -1406,11 +1407,24 @@ retry_write:
mbio->bi_private = r10_bio;
atomic_inc(&r10_bio->remaining);
+
+ cb = blk_check_plugged(raid10_unplug, mddev,
+ sizeof(*plug));
+ if (cb)
+ plug = container_of(cb, struct raid10_plug_cb,
+ cb);
+ else
+ plug = NULL;
spin_lock_irqsave(&conf->device_lock, flags);
- bio_list_add(&conf->pending_bio_list, mbio);
- conf->pending_count++;
+ if (plug) {
+ bio_list_add(&plug->pending, mbio);
+ plug->pending_cnt++;
+ } else {
+ bio_list_add(&conf->pending_bio_list, mbio);
+ conf->pending_count++;
+ }
spin_unlock_irqrestore(&conf->device_lock, flags);
- if (!mddev_check_plugged(mddev))
+ if (!plug)
md_wakeup_thread(mddev->thread);
}
}
@@ -1470,7 +1484,25 @@ static void raid10_make_request(struct mddev *mddev, struct bio *bio)
split = bio;
}
+ /*
+ * If a bio is splitted, the first part of bio will pass
+ * barrier but the bio is queued in current->bio_list (see
+ * generic_make_request). If there is a raise_barrier() called
+ * here, the second part of bio can't pass barrier. But since
+ * the first part bio isn't dispatched to underlaying disks
+ * yet, the barrier is never released, hence raise_barrier will
+ * alays wait. We have a deadlock.
+ * Note, this only happens in read path. For write path, the
+ * first part of bio is dispatched in a schedule() call
+ * (because of blk plug) or offloaded to raid10d.
+ * Quitting from the function immediately can change the bio
+ * order queued in bio_list and avoid the deadlock.
+ */
__make_request(mddev, split);
+ if (split != bio && bio_data_dir(bio) == READ) {
+ generic_make_request(bio);
+ break;
+ }
} while (split != bio);
/* In case raid10d snuck in to freeze_array */
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index cce6057b9aca..7aea0221530c 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -829,6 +829,14 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh
spin_unlock(&head->batch_head->batch_lock);
goto unlock_out;
}
+ /*
+ * We must assign batch_head of this stripe within the
+ * batch_lock, otherwise clear_batch_ready of batch head
+ * stripe could clear BATCH_READY bit of this stripe and
+ * this stripe->batch_head doesn't get assigned, which
+ * could confuse clear_batch_ready for this stripe
+ */
+ sh->batch_head = head->batch_head;
/*
* at this point, head's BATCH_READY could be cleared, but we
@@ -836,8 +844,6 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh
*/
list_add(&sh->batch_list, &head->batch_list);
spin_unlock(&head->batch_head->batch_lock);
-
- sh->batch_head = head->batch_head;
} else {
head->batch_head = head;
sh->batch_head = head->batch_head;
@@ -2253,6 +2259,10 @@ static int resize_stripes(struct r5conf *conf, int newsize)
err = -ENOMEM;
mutex_unlock(&conf->cache_size_mutex);
+
+ conf->slab_cache = sc;
+ conf->active_name = 1-conf->active_name;
+
/* Step 4, return new stripes to service */
while(!list_empty(&newstripes)) {
nsh = list_entry(newstripes.next, struct stripe_head, lru);
@@ -2270,8 +2280,6 @@ static int resize_stripes(struct r5conf *conf, int newsize)
}
/* critical section pass, GFP_NOIO no longer needed */
- conf->slab_cache = sc;
- conf->active_name = 1-conf->active_name;
if (!err)
conf->pool_size = newsize;
return err;
@@ -4275,7 +4283,8 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS |
(1 << STRIPE_PREREAD_ACTIVE) |
- (1 << STRIPE_DEGRADED)),
+ (1 << STRIPE_DEGRADED) |
+ (1 << STRIPE_ON_UNPLUG_LIST)),
head_sh->state & (1 << STRIPE_INSYNC));
sh->check_state = head_sh->check_state;
@@ -5298,12 +5307,15 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
* userspace, we want an interruptible
* wait.
*/
- flush_signals(current);
prepare_to_wait(&conf->wait_for_overlap,
&w, TASK_INTERRUPTIBLE);
if (logical_sector >= mddev->suspend_lo &&
logical_sector < mddev->suspend_hi) {
+ sigset_t full, old;
+ sigfillset(&full);
+ sigprocmask(SIG_BLOCK, &full, &old);
schedule();
+ sigprocmask(SIG_SETMASK, &old, NULL);
do_prepare = true;
}
goto retry;
@@ -5838,6 +5850,10 @@ static void raid5_do_work(struct work_struct *work)
pr_debug("%d stripes handled\n", handled);
spin_unlock_irq(&conf->device_lock);
+
+ r5l_flush_stripe_to_raid(conf->log);
+
+ async_tx_issue_pending_all();
blk_finish_plug(&plug);
pr_debug("--- raid5worker inactive\n");
@@ -7555,12 +7571,10 @@ static void end_reshape(struct r5conf *conf)
{
if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
- struct md_rdev *rdev;
spin_lock_irq(&conf->device_lock);
conf->previous_raid_disks = conf->raid_disks;
- rdev_for_each(rdev, conf->mddev)
- rdev->data_offset = rdev->new_data_offset;
+ md_finish_reshape(conf->mddev);
smp_wmb();
conf->reshape_progress = MaxSector;
conf->mddev->reshape_position = MaxSector;