summaryrefslogtreecommitdiff
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/bcache/alloc.c5
-rw-r--r--drivers/md/bcache/bset.c16
-rw-r--r--drivers/md/bcache/bset.h34
-rw-r--r--drivers/md/bcache/btree.c2
-rw-r--r--drivers/md/bcache/journal.c37
-rw-r--r--drivers/md/bcache/super.c25
-rw-r--r--drivers/md/bcache/sysfs.c17
-rw-r--r--drivers/md/bcache/sysfs.h13
-rw-r--r--drivers/md/bcache/writeback.h3
-rw-r--r--drivers/md/bitmap.c10
-rw-r--r--drivers/md/bitmap.h2
-rw-r--r--drivers/md/dm-bio-prison-v1.c2
-rw-r--r--drivers/md/dm-bio-prison-v2.c2
-rw-r--r--drivers/md/dm-cache-metadata.c9
-rw-r--r--drivers/md/dm-crypt.c12
-rw-r--r--drivers/md/dm-delay.c3
-rw-r--r--drivers/md/dm-flakey.c33
-rw-r--r--drivers/md/dm-integrity.c18
-rw-r--r--drivers/md/dm-io.c2
-rw-r--r--drivers/md/dm-kcopyd.c7
-rw-r--r--drivers/md/dm-log-writes.c23
-rw-r--r--drivers/md/dm-raid.c2
-rw-r--r--drivers/md/dm-region-hash.c2
-rw-r--r--drivers/md/dm-snap-persistent.c2
-rw-r--r--drivers/md/dm-snap.c178
-rw-r--r--drivers/md/dm-table.c44
-rw-r--r--drivers/md/dm-thin.c70
-rw-r--r--drivers/md/dm-verity-target.c4
-rw-r--r--drivers/md/dm-zoned-metadata.c117
-rw-r--r--drivers/md/dm-zoned-reclaim.c40
-rw-r--r--drivers/md/dm-zoned-target.c87
-rw-r--r--drivers/md/dm-zoned.h40
-rw-r--r--drivers/md/md-cluster.c6
-rw-r--r--drivers/md/md.c92
-rw-r--r--drivers/md/md.h3
-rw-r--r--drivers/md/persistent-data/dm-btree-remove.c8
-rw-r--r--drivers/md/persistent-data/dm-btree.c31
-rw-r--r--drivers/md/persistent-data/dm-space-map-metadata.c2
-rw-r--r--drivers/md/raid0.c35
-rw-r--r--drivers/md/raid0.h14
-rw-r--r--drivers/md/raid1.c70
-rw-r--r--drivers/md/raid10.c9
-rw-r--r--drivers/md/raid5.c22
43 files changed, 844 insertions, 309 deletions
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 8c13a9036d07..ada94a01e142 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -325,10 +325,11 @@ static int bch_allocator_thread(void *arg)
* possibly issue discards to them, then we add the bucket to
* the free list:
*/
- while (!fifo_empty(&ca->free_inc)) {
+ while (1) {
long bucket;
- fifo_pop(&ca->free_inc, bucket);
+ if (!fifo_pop(&ca->free_inc, bucket))
+ break;
if (ca->discard) {
mutex_unlock(&ca->set->bucket_lock);
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index e56d3ecdbfcb..1edf9515345e 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -825,12 +825,22 @@ unsigned bch_btree_insert_key(struct btree_keys *b, struct bkey *k,
struct bset *i = bset_tree_last(b)->data;
struct bkey *m, *prev = NULL;
struct btree_iter iter;
+ struct bkey preceding_key_on_stack = ZERO_KEY;
+ struct bkey *preceding_key_p = &preceding_key_on_stack;
BUG_ON(b->ops->is_extents && !KEY_SIZE(k));
- m = bch_btree_iter_init(b, &iter, b->ops->is_extents
- ? PRECEDING_KEY(&START_KEY(k))
- : PRECEDING_KEY(k));
+ /*
+ * If k has preceding key, preceding_key_p will be set to address
+ * of k's preceding key; otherwise preceding_key_p will be set
+ * to NULL inside preceding_key().
+ */
+ if (b->ops->is_extents)
+ preceding_key(&START_KEY(k), &preceding_key_p);
+ else
+ preceding_key(k, &preceding_key_p);
+
+ m = bch_btree_iter_init(b, &iter, preceding_key_p);
if (b->ops->insert_fixup(b, k, &iter, replace_key))
return status;
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index fa506c1aa524..8d1964b472e7 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -418,20 +418,26 @@ static inline bool bch_cut_back(const struct bkey *where, struct bkey *k)
return __bch_cut_back(where, k);
}
-#define PRECEDING_KEY(_k) \
-({ \
- struct bkey *_ret = NULL; \
- \
- if (KEY_INODE(_k) || KEY_OFFSET(_k)) { \
- _ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0); \
- \
- if (!_ret->low) \
- _ret->high--; \
- _ret->low--; \
- } \
- \
- _ret; \
-})
+/*
+ * Pointer '*preceding_key_p' points to a memory object to store preceding
+ * key of k. If the preceding key does not exist, set '*preceding_key_p' to
+ * NULL. So the caller of preceding_key() needs to take care of memory
+ * which '*preceding_key_p' pointed to before calling preceding_key().
+ * Currently the only caller of preceding_key() is bch_btree_insert_key(),
+ * and it points to an on-stack variable, so the memory release is handled
+ * by stackframe itself.
+ */
+static inline void preceding_key(struct bkey *k, struct bkey **preceding_key_p)
+{
+ if (KEY_INODE(k) || KEY_OFFSET(k)) {
+ (**preceding_key_p) = KEY(KEY_INODE(k), KEY_OFFSET(k), 0);
+ if (!(*preceding_key_p)->low)
+ (*preceding_key_p)->high--;
+ (*preceding_key_p)->low--;
+ } else {
+ (*preceding_key_p) = NULL;
+ }
+}
static inline bool bch_ptr_invalid(struct btree_keys *b, const struct bkey *k)
{
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 9406326216f1..96a6583e7b52 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -685,6 +685,8 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
* IO can always make forward progress:
*/
nr /= c->btree_pages;
+ if (nr == 0)
+ nr = 1;
nr = min_t(unsigned long, nr, mca_can_free(c));
i = 0;
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 03cc0722ae48..6394be5ee9a8 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -310,6 +310,18 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list)
}
}
+bool is_discard_enabled(struct cache_set *s)
+{
+ struct cache *ca;
+ unsigned int i;
+
+ for_each_cache(ca, s, i)
+ if (ca->discard)
+ return true;
+
+ return false;
+}
+
int bch_journal_replay(struct cache_set *s, struct list_head *list)
{
int ret = 0, keys = 0, entries = 0;
@@ -323,9 +335,17 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
list_for_each_entry(i, list, list) {
BUG_ON(i->pin && atomic_read(i->pin) != 1);
- cache_set_err_on(n != i->j.seq, s,
-"bcache: journal entries %llu-%llu missing! (replaying %llu-%llu)",
- n, i->j.seq - 1, start, end);
+ if (n != i->j.seq) {
+ if (n == start && is_discard_enabled(s))
+ pr_info("bcache: journal entries %llu-%llu may be discarded! (replaying %llu-%llu)",
+ n, i->j.seq - 1, start, end);
+ else {
+ pr_err("bcache: journal entries %llu-%llu missing! (replaying %llu-%llu)",
+ n, i->j.seq - 1, start, end);
+ ret = -EIO;
+ goto err;
+ }
+ }
for (k = i->j.start;
k < bset_bkey_last(&i->j);
@@ -512,11 +532,11 @@ static void journal_reclaim(struct cache_set *c)
ca->sb.nr_this_dev);
}
- bkey_init(k);
- SET_KEY_PTRS(k, n);
-
- if (n)
+ if (n) {
+ bkey_init(k);
+ SET_KEY_PTRS(k, n);
c->journal.blocks_free = c->sb.bucket_size >> c->block_bits;
+ }
out:
if (!journal_full(&c->journal))
__closure_wake_up(&c->journal.wait);
@@ -641,6 +661,9 @@ static void journal_write_unlocked(struct closure *cl)
ca->journal.seq[ca->journal.cur_idx] = w->data->seq;
}
+ /* If KEY_PTRS(k) == 0, this jset gets lost in air */
+ BUG_ON(i == 0);
+
atomic_dec_bug(&fifo_back(&c->journal.pin));
bch_journal_next(&c->journal);
journal_reclaim(c);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index fe6e4c319b7c..690aeb09bbf5 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -905,6 +905,7 @@ static void cached_dev_detach_finish(struct work_struct *w)
bch_write_bdev_super(dc, &cl);
closure_sync(&cl);
+ calc_cached_dev_sectors(dc->disk.c);
bcache_device_detach(&dc->disk);
list_move(&dc->list, &uncached_devices);
@@ -1045,12 +1046,13 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
}
if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
- bch_sectors_dirty_init(&dc->disk);
atomic_set(&dc->has_dirty, 1);
atomic_inc(&dc->count);
bch_writeback_queue(dc);
}
+ bch_sectors_dirty_init(&dc->disk);
+
bch_cached_dev_run(dc);
bcache_device_link(&dc->disk, c, "bdev");
@@ -1356,6 +1358,7 @@ static void cache_set_free(struct closure *cl)
bch_btree_cache_free(c);
bch_journal_free(c);
+ mutex_lock(&bch_register_lock);
for_each_cache(ca, c, i)
if (ca) {
ca->set = NULL;
@@ -1378,7 +1381,6 @@ static void cache_set_free(struct closure *cl)
mempool_destroy(c->search);
kfree(c->devices);
- mutex_lock(&bch_register_lock);
list_del(&c->list);
mutex_unlock(&bch_register_lock);
@@ -1401,7 +1403,7 @@ static void cache_set_flush(struct closure *cl)
kobject_put(&c->internal);
kobject_del(&c->kobj);
- if (c->gc_thread)
+ if (!IS_ERR_OR_NULL(c->gc_thread))
kthread_stop(c->gc_thread);
if (!IS_ERR_OR_NULL(c->root))
@@ -1559,7 +1561,7 @@ err:
return NULL;
}
-static void run_cache_set(struct cache_set *c)
+static int run_cache_set(struct cache_set *c)
{
const char *err = "cannot allocate memory";
struct cached_dev *dc, *t;
@@ -1651,7 +1653,9 @@ static void run_cache_set(struct cache_set *c)
if (j->version < BCACHE_JSET_VERSION_UUID)
__uuid_write(c);
- bch_journal_replay(c, &journal);
+ err = "bcache: replay journal failed";
+ if (bch_journal_replay(c, &journal))
+ goto err;
} else {
pr_notice("invalidating existing data");
@@ -1719,11 +1723,13 @@ static void run_cache_set(struct cache_set *c)
flash_devs_run(c);
set_bit(CACHE_SET_RUNNING, &c->flags);
- return;
+ return 0;
err:
closure_sync(&cl);
/* XXX: test this, it's broken */
bch_cache_set_error(c, "%s", err);
+
+ return -EIO;
}
static bool can_attach_cache(struct cache *ca, struct cache_set *c)
@@ -1787,8 +1793,11 @@ found:
ca->set->cache[ca->sb.nr_this_dev] = ca;
c->cache_by_alloc[c->caches_loaded++] = ca;
- if (c->caches_loaded == c->sb.nr_in_set)
- run_cache_set(c);
+ if (c->caches_loaded == c->sb.nr_in_set) {
+ err = "failed to run cache set";
+ if (run_cache_set(c) < 0)
+ goto err;
+ }
return NULL;
err:
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 5d81cd06af00..def9c3478b89 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -217,7 +217,9 @@ STORE(__cached_dev)
d_strtoul(writeback_rate_d_term);
d_strtoul_nonzero(writeback_rate_p_term_inverse);
- d_strtoi_h(sequential_cutoff);
+ sysfs_strtoul_clamp(sequential_cutoff,
+ dc->sequential_cutoff,
+ 0, UINT_MAX);
d_strtoi_h(readahead);
if (attr == &sysfs_clear_stats)
@@ -660,8 +662,17 @@ STORE(__bch_cache_set)
c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT;
/* See count_io_errors() for why 88 */
- if (attr == &sysfs_io_error_halflife)
- c->error_decay = strtoul_or_return(buf) / 88;
+ if (attr == &sysfs_io_error_halflife) {
+ unsigned long v = 0;
+ ssize_t ret;
+
+ ret = strtoul_safe_clamp(buf, v, 0, UINT_MAX);
+ if (!ret) {
+ c->error_decay = v / 88;
+ return size;
+ }
+ return ret;
+ }
sysfs_strtoul(journal_delay_ms, c->journal_delay_ms);
sysfs_strtoul(verify, c->verify);
diff --git a/drivers/md/bcache/sysfs.h b/drivers/md/bcache/sysfs.h
index b54fe9602529..e6e258f897ca 100644
--- a/drivers/md/bcache/sysfs.h
+++ b/drivers/md/bcache/sysfs.h
@@ -81,9 +81,16 @@ do { \
#define sysfs_strtoul_clamp(file, var, min, max) \
do { \
- if (attr == &sysfs_ ## file) \
- return strtoul_safe_clamp(buf, var, min, max) \
- ?: (ssize_t) size; \
+ if (attr == &sysfs_ ## file) { \
+ unsigned long v = 0; \
+ ssize_t ret; \
+ ret = strtoul_safe_clamp(buf, v, min, max); \
+ if (!ret) { \
+ var = v; \
+ return size; \
+ } \
+ return ret; \
+ } \
} while (0)
#define strtoul_or_return(cp) \
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 151544740148..973847e027a8 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -69,6 +69,9 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
in_use > CUTOFF_WRITEBACK_SYNC)
return false;
+ if (bio_op(bio) == REQ_OP_DISCARD)
+ return false;
+
if (dc->partial_stripes_expensive &&
bcache_dev_stripe_dirty(dc, bio->bi_iter.bi_sector,
bio_sectors(bio)))
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 0cabf31fb163..7eb76a1a2505 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1729,7 +1729,7 @@ void bitmap_flush(struct mddev *mddev)
/*
* free memory that was allocated
*/
-void bitmap_free(struct bitmap *bitmap)
+void md_bitmap_free(struct bitmap *bitmap)
{
unsigned long k, pages;
struct bitmap_page *bp;
@@ -1763,7 +1763,7 @@ void bitmap_free(struct bitmap *bitmap)
kfree(bp);
kfree(bitmap);
}
-EXPORT_SYMBOL(bitmap_free);
+EXPORT_SYMBOL(md_bitmap_free);
void bitmap_wait_behind_writes(struct mddev *mddev)
{
@@ -1796,7 +1796,7 @@ void bitmap_destroy(struct mddev *mddev)
if (mddev->thread)
mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
- bitmap_free(bitmap);
+ md_bitmap_free(bitmap);
}
/*
@@ -1887,7 +1887,7 @@ struct bitmap *bitmap_create(struct mddev *mddev, int slot)
return bitmap;
error:
- bitmap_free(bitmap);
+ md_bitmap_free(bitmap);
return ERR_PTR(err);
}
@@ -1958,7 +1958,7 @@ struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot)
rv = bitmap_init_from_disk(bitmap, 0);
if (rv) {
- bitmap_free(bitmap);
+ md_bitmap_free(bitmap);
return ERR_PTR(rv);
}
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index 5df35ca90f58..dd53a978c5f2 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -271,7 +271,7 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot);
int bitmap_copy_from_slot(struct mddev *mddev, int slot,
sector_t *lo, sector_t *hi, bool clear_bits);
-void bitmap_free(struct bitmap *bitmap);
+void md_bitmap_free(struct bitmap *bitmap);
void bitmap_wait_behind_writes(struct mddev *mddev);
#endif
diff --git a/drivers/md/dm-bio-prison-v1.c b/drivers/md/dm-bio-prison-v1.c
index 874841f0fc83..10532a76688e 100644
--- a/drivers/md/dm-bio-prison-v1.c
+++ b/drivers/md/dm-bio-prison-v1.c
@@ -33,7 +33,7 @@ static struct kmem_cache *_cell_cache;
*/
struct dm_bio_prison *dm_bio_prison_create(void)
{
- struct dm_bio_prison *prison = kmalloc(sizeof(*prison), GFP_KERNEL);
+ struct dm_bio_prison *prison = kzalloc(sizeof(*prison), GFP_KERNEL);
if (!prison)
return NULL;
diff --git a/drivers/md/dm-bio-prison-v2.c b/drivers/md/dm-bio-prison-v2.c
index 8ce3a1a588cf..c34ec615420f 100644
--- a/drivers/md/dm-bio-prison-v2.c
+++ b/drivers/md/dm-bio-prison-v2.c
@@ -35,7 +35,7 @@ static struct kmem_cache *_cell_cache;
*/
struct dm_bio_prison_v2 *dm_bio_prison_create_v2(struct workqueue_struct *wq)
{
- struct dm_bio_prison_v2 *prison = kmalloc(sizeof(*prison), GFP_KERNEL);
+ struct dm_bio_prison_v2 *prison = kzalloc(sizeof(*prison), GFP_KERNEL);
if (!prison)
return NULL;
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index bc60db87e6f1..394e53afc259 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -1166,11 +1166,18 @@ static int __load_discards(struct dm_cache_metadata *cmd,
if (r)
return r;
- for (b = 0; b < from_dblock(cmd->discard_nr_blocks); b++) {
+ for (b = 0; ; b++) {
r = fn(context, cmd->discard_block_size, to_dblock(b),
dm_bitset_cursor_get_value(&c));
if (r)
break;
+
+ if (b >= (from_dblock(cmd->discard_nr_blocks) - 1))
+ break;
+
+ r = dm_bitset_cursor_next(&c);
+ if (r)
+ break;
}
dm_bitset_cursor_end(&c);
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 1f6d8b6be5c7..94b8d81f6020 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -334,7 +334,7 @@ static int crypt_iv_essiv_init(struct crypt_config *cc)
sg_init_one(&sg, cc->key, cc->key_size);
ahash_request_set_tfm(req, essiv->hash_tfm);
- ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL);
+ ahash_request_set_callback(req, 0, NULL, NULL);
ahash_request_set_crypt(req, &sg, essiv->salt, cc->key_size);
err = crypto_ahash_digest(req);
@@ -609,7 +609,7 @@ static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv,
int i, r;
desc->tfm = lmk->hash_tfm;
- desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+ desc->flags = 0;
r = crypto_shash_init(desc);
if (r)
@@ -771,7 +771,7 @@ static int crypt_iv_tcw_whitening(struct crypt_config *cc,
/* calculate crc32 for every 32bit part and xor it */
desc->tfm = tcw->crc32_tfm;
- desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+ desc->flags = 0;
for (i = 0; i < 4; i++) {
r = crypto_shash_init(desc);
if (r)
@@ -935,7 +935,7 @@ static int dm_crypt_integrity_io_alloc(struct dm_crypt_io *io, struct bio *bio)
if (IS_ERR(bip))
return PTR_ERR(bip);
- tag_len = io->cc->on_disk_tag_size * bio_sectors(bio);
+ tag_len = io->cc->on_disk_tag_size * (bio_sectors(bio) >> io->cc->sector_shift);
bip->bip_iter.bi_size = tag_len;
bip->bip_iter.bi_sector = io->cc->start + io->sector;
@@ -1254,7 +1254,7 @@ static void crypt_alloc_req_skcipher(struct crypt_config *cc,
* requests if driver request queue is full.
*/
skcipher_request_set_callback(ctx->r.req,
- CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+ CRYPTO_TFM_REQ_MAY_BACKLOG,
kcryptd_async_done, dmreq_of_req(cc, ctx->r.req));
}
@@ -1271,7 +1271,7 @@ static void crypt_alloc_req_aead(struct crypt_config *cc,
* requests if driver request queue is full.
*/
aead_request_set_callback(ctx->r.req_aead,
- CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+ CRYPTO_TFM_REQ_MAY_BACKLOG,
kcryptd_async_done, dmreq_of_req(cc, ctx->r.req_aead));
}
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 2209a9700acd..cb733a5afdc8 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -222,7 +222,8 @@ static void delay_dtr(struct dm_target *ti)
{
struct delay_c *dc = ti->private;
- destroy_workqueue(dc->kdelayd_wq);
+ if (dc->kdelayd_wq)
+ destroy_workqueue(dc->kdelayd_wq);
dm_put_device(ti, dc->dev_read);
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index 0c1ef63c3461..b1b68e01b889 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -282,20 +282,31 @@ static void flakey_map_bio(struct dm_target *ti, struct bio *bio)
static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc)
{
- unsigned bio_bytes = bio_cur_bytes(bio);
- char *data = bio_data(bio);
+ unsigned int corrupt_bio_byte = fc->corrupt_bio_byte - 1;
+
+ struct bvec_iter iter;
+ struct bio_vec bvec;
+
+ if (!bio_has_data(bio))
+ return;
/*
- * Overwrite the Nth byte of the data returned.
+ * Overwrite the Nth byte of the bio's data, on whichever page
+ * it falls.
*/
- if (data && bio_bytes >= fc->corrupt_bio_byte) {
- data[fc->corrupt_bio_byte - 1] = fc->corrupt_bio_value;
-
- DMDEBUG("Corrupting data bio=%p by writing %u to byte %u "
- "(rw=%c bi_opf=%u bi_sector=%llu cur_bytes=%u)\n",
- bio, fc->corrupt_bio_value, fc->corrupt_bio_byte,
- (bio_data_dir(bio) == WRITE) ? 'w' : 'r', bio->bi_opf,
- (unsigned long long)bio->bi_iter.bi_sector, bio_bytes);
+ bio_for_each_segment(bvec, bio, iter) {
+ if (bio_iter_len(bio, iter) > corrupt_bio_byte) {
+ char *segment = (page_address(bio_iter_page(bio, iter))
+ + bio_iter_offset(bio, iter));
+ segment[corrupt_bio_byte] = fc->corrupt_bio_value;
+ DMDEBUG("Corrupting data bio=%p by writing %u to byte %u "
+ "(rw=%c bi_opf=%u bi_sector=%llu size=%u)\n",
+ bio, fc->corrupt_bio_value, fc->corrupt_bio_byte,
+ (bio_data_dir(bio) == WRITE) ? 'w' : 'r', bio->bi_opf,
+ (unsigned long long)bio->bi_iter.bi_sector, bio->bi_iter.bi_size);
+ break;
+ }
+ corrupt_bio_byte -= bio_iter_len(bio, iter);
}
}
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index b10e4c5641ea..23f0f4eaaa2e 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -493,7 +493,7 @@ static void section_mac(struct dm_integrity_c *ic, unsigned section, __u8 result
unsigned j, size;
desc->tfm = ic->journal_mac;
- desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+ desc->flags = 0;
r = crypto_shash_init(desc);
if (unlikely(r)) {
@@ -637,7 +637,7 @@ static void complete_journal_encrypt(struct crypto_async_request *req, int err)
static bool do_crypt(bool encrypt, struct skcipher_request *req, struct journal_completion *comp)
{
int r;
- skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+ skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
complete_journal_encrypt, comp);
if (likely(encrypt))
r = crypto_skcipher_encrypt(req);
@@ -1276,8 +1276,8 @@ again:
checksums_ptr - checksums, !dio->write ? TAG_CMP : TAG_WRITE);
if (unlikely(r)) {
if (r > 0) {
- DMERR("Checksum failed at sector 0x%llx",
- (unsigned long long)(sector - ((r + ic->tag_size - 1) / ic->tag_size)));
+ DMERR_LIMIT("Checksum failed at sector 0x%llx",
+ (unsigned long long)(sector - ((r + ic->tag_size - 1) / ic->tag_size)));
r = -EILSEQ;
atomic64_inc(&ic->number_of_mismatches);
}
@@ -1469,8 +1469,8 @@ retry_kmap:
integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack);
if (unlikely(memcmp(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) {
- DMERR("Checksum failed when reading from journal, at sector 0x%llx",
- (unsigned long long)logical_sector);
+ DMERR_LIMIT("Checksum failed when reading from journal, at sector 0x%llx",
+ (unsigned long long)logical_sector);
}
}
#endif
@@ -2917,17 +2917,17 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto bad;
}
ic->sectors_per_block = val >> SECTOR_SHIFT;
- } else if (!memcmp(opt_string, "internal_hash:", strlen("internal_hash:"))) {
+ } else if (!strncmp(opt_string, "internal_hash:", strlen("internal_hash:"))) {
r = get_alg_and_key(opt_string, &ic->internal_hash_alg, &ti->error,
"Invalid internal_hash argument");
if (r)
goto bad;
- } else if (!memcmp(opt_string, "journal_crypt:", strlen("journal_crypt:"))) {
+ } else if (!strncmp(opt_string, "journal_crypt:", strlen("journal_crypt:"))) {
r = get_alg_and_key(opt_string, &ic->journal_crypt_alg, &ti->error,
"Invalid journal_crypt argument");
if (r)
goto bad;
- } else if (!memcmp(opt_string, "journal_mac:", strlen("journal_mac:"))) {
+ } else if (!strncmp(opt_string, "journal_mac:", strlen("journal_mac:"))) {
r = get_alg_and_key(opt_string, &ic->journal_mac_alg, &ti->error,
"Invalid journal_mac argument");
if (r)
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index b4357ed4d541..56e2c0e079d7 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -50,7 +50,7 @@ struct dm_io_client *dm_io_client_create(void)
struct dm_io_client *client;
unsigned min_ios = dm_get_reserved_bio_based_ios();
- client = kmalloc(sizeof(*client), GFP_KERNEL);
+ client = kzalloc(sizeof(*client), GFP_KERNEL);
if (!client)
return ERR_PTR(-ENOMEM);
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index b9d1897bcf5b..7ca2b1aaa79d 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -545,8 +545,10 @@ static int run_io_job(struct kcopyd_job *job)
* no point in continuing.
*/
if (test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags) &&
- job->master_job->write_err)
+ job->master_job->write_err) {
+ job->write_err = job->master_job->write_err;
return -EIO;
+ }
io_job_start(job->kc->throttle);
@@ -598,6 +600,7 @@ static int process_jobs(struct list_head *jobs, struct dm_kcopyd_client *kc,
else
job->read_err = 1;
push(&kc->complete_jobs, job);
+ wake(kc);
break;
}
@@ -889,7 +892,7 @@ struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *thro
int r = -ENOMEM;
struct dm_kcopyd_client *kc;
- kc = kmalloc(sizeof(*kc), GFP_KERNEL);
+ kc = kzalloc(sizeof(*kc), GFP_KERNEL);
if (!kc)
return ERR_PTR(-ENOMEM);
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index 8b80a9ce9ea9..dafedbc28bcc 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -57,6 +57,7 @@
#define WRITE_LOG_VERSION 1ULL
#define WRITE_LOG_MAGIC 0x6a736677736872ULL
+#define WRITE_LOG_SUPER_SECTOR 0
/*
* The disk format for this is braindead simple.
@@ -112,6 +113,7 @@ struct log_writes_c {
struct list_head logging_blocks;
wait_queue_head_t wait;
struct task_struct *log_kthread;
+ struct completion super_done;
};
struct pending_block {
@@ -177,6 +179,14 @@ static void log_end_io(struct bio *bio)
bio_put(bio);
}
+static void log_end_super(struct bio *bio)
+{
+ struct log_writes_c *lc = bio->bi_private;
+
+ complete(&lc->super_done);
+ log_end_io(bio);
+}
+
/*
* Meant to be called if there is an error, it will free all the pages
* associated with the block.
@@ -212,7 +222,8 @@ static int write_metadata(struct log_writes_c *lc, void *entry,
bio->bi_iter.bi_size = 0;
bio->bi_iter.bi_sector = sector;
bio_set_dev(bio, lc->logdev->bdev);
- bio->bi_end_io = log_end_io;
+ bio->bi_end_io = (sector == WRITE_LOG_SUPER_SECTOR) ?
+ log_end_super : log_end_io;
bio->bi_private = lc;
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
@@ -334,11 +345,18 @@ static int log_super(struct log_writes_c *lc)
super.nr_entries = cpu_to_le64(lc->logged_entries);
super.sectorsize = cpu_to_le32(lc->sectorsize);
- if (write_metadata(lc, &super, sizeof(super), NULL, 0, 0)) {
+ if (write_metadata(lc, &super, sizeof(super), NULL, 0,
+ WRITE_LOG_SUPER_SECTOR)) {
DMERR("Couldn't write super");
return -1;
}
+ /*
+ * Super sector should be writen in-order, otherwise the
+ * nr_entries could be rewritten incorrectly by an old bio.
+ */
+ wait_for_completion_io(&lc->super_done);
+
return 0;
}
@@ -447,6 +465,7 @@ static int log_writes_ctr(struct dm_target *ti, unsigned int argc, char **argv)
INIT_LIST_HEAD(&lc->unflushed_blocks);
INIT_LIST_HEAD(&lc->logging_blocks);
init_waitqueue_head(&lc->wait);
+ init_completion(&lc->super_done);
atomic_set(&lc->io_blocks, 0);
atomic_set(&lc->pending_blocks, 0);
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 151211b4cb1b..2c5912e75514 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -2441,7 +2441,7 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
}
/* Enable bitmap creation for RAID levels != 0 */
- mddev->bitmap_info.offset = rt_is_raid0(rs->raid_type) ? 0 : to_sector(4096);
+ mddev->bitmap_info.offset = (rt_is_raid0(rs->raid_type) || rs->journal_dev.dev) ? 0 : to_sector(4096);
mddev->bitmap_info.default_offset = mddev->bitmap_info.offset;
if (!test_and_clear_bit(FirstUse, &rdev->flags)) {
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index 85c32b22a420..91c6f6d72eee 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -179,7 +179,7 @@ struct dm_region_hash *dm_region_hash_create(
;
nr_buckets >>= 1;
- rh = kmalloc(sizeof(*rh), GFP_KERNEL);
+ rh = kzalloc(sizeof(*rh), GFP_KERNEL);
if (!rh) {
DMERR("unable to allocate region hash memory");
return ERR_PTR(-ENOMEM);
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index c5534d294773..00025569e807 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -17,7 +17,7 @@
#include "dm-bufio.h"
#define DM_MSG_PREFIX "persistent snapshot"
-#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32 /* 16KB */
+#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32U /* 16KB */
#define DM_PREFETCH_CHUNKS 12
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index b502debc6df3..2170f6c118b8 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -19,7 +19,6 @@
#include <linux/vmalloc.h>
#include <linux/log2.h>
#include <linux/dm-kcopyd.h>
-#include <linux/semaphore.h>
#include "dm.h"
@@ -48,7 +47,7 @@ struct dm_exception_table {
};
struct dm_snapshot {
- struct rw_semaphore lock;
+ struct mutex lock;
struct dm_dev *origin;
struct dm_dev *cow;
@@ -106,8 +105,8 @@ struct dm_snapshot {
/* The on disk metadata handler */
struct dm_exception_store *store;
- /* Maximum number of in-flight COW jobs. */
- struct semaphore cow_count;
+ unsigned in_progress;
+ struct wait_queue_head in_progress_wait;
struct dm_kcopyd_client *kcopyd_client;
@@ -158,8 +157,8 @@ struct dm_snapshot {
*/
#define DEFAULT_COW_THRESHOLD 2048
-static int cow_threshold = DEFAULT_COW_THRESHOLD;
-module_param_named(snapshot_cow_threshold, cow_threshold, int, 0644);
+static unsigned cow_threshold = DEFAULT_COW_THRESHOLD;
+module_param_named(snapshot_cow_threshold, cow_threshold, uint, 0644);
MODULE_PARM_DESC(snapshot_cow_threshold, "Maximum number of chunks being copied on write");
DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
@@ -456,9 +455,9 @@ static int __find_snapshots_sharing_cow(struct dm_snapshot *snap,
if (!bdev_equal(s->cow->bdev, snap->cow->bdev))
continue;
- down_read(&s->lock);
+ mutex_lock(&s->lock);
active = s->active;
- up_read(&s->lock);
+ mutex_unlock(&s->lock);
if (active) {
if (snap_src)
@@ -926,7 +925,7 @@ static int remove_single_exception_chunk(struct dm_snapshot *s)
int r;
chunk_t old_chunk = s->first_merging_chunk + s->num_merging_chunks - 1;
- down_write(&s->lock);
+ mutex_lock(&s->lock);
/*
* Process chunks (and associated exceptions) in reverse order
@@ -941,7 +940,7 @@ static int remove_single_exception_chunk(struct dm_snapshot *s)
b = __release_queued_bios_after_merge(s);
out:
- up_write(&s->lock);
+ mutex_unlock(&s->lock);
if (b)
flush_bios(b);
@@ -1000,9 +999,9 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s)
if (linear_chunks < 0) {
DMERR("Read error in exception store: "
"shutting down merge");
- down_write(&s->lock);
+ mutex_lock(&s->lock);
s->merge_failed = 1;
- up_write(&s->lock);
+ mutex_unlock(&s->lock);
}
goto shut;
}
@@ -1043,10 +1042,10 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s)
previous_count = read_pending_exceptions_done_count();
}
- down_write(&s->lock);
+ mutex_lock(&s->lock);
s->first_merging_chunk = old_chunk;
s->num_merging_chunks = linear_chunks;
- up_write(&s->lock);
+ mutex_unlock(&s->lock);
/* Wait until writes to all 'linear_chunks' drain */
for (i = 0; i < linear_chunks; i++)
@@ -1088,10 +1087,10 @@ static void merge_callback(int read_err, unsigned long write_err, void *context)
return;
shut:
- down_write(&s->lock);
+ mutex_lock(&s->lock);
s->merge_failed = 1;
b = __release_queued_bios_after_merge(s);
- up_write(&s->lock);
+ mutex_unlock(&s->lock);
error_bios(b);
merge_shutdown(s);
@@ -1137,7 +1136,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
origin_mode = FMODE_WRITE;
}
- s = kmalloc(sizeof(*s), GFP_KERNEL);
+ s = kzalloc(sizeof(*s), GFP_KERNEL);
if (!s) {
ti->error = "Cannot allocate private snapshot structure";
r = -ENOMEM;
@@ -1190,7 +1189,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
s->exception_start_sequence = 0;
s->exception_complete_sequence = 0;
INIT_LIST_HEAD(&s->out_of_order_list);
- init_rwsem(&s->lock);
+ mutex_init(&s->lock);
INIT_LIST_HEAD(&s->list);
spin_lock_init(&s->pe_lock);
s->state_bits = 0;
@@ -1206,7 +1205,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad_hash_tables;
}
- sema_init(&s->cow_count, (cow_threshold > 0) ? cow_threshold : INT_MAX);
+ init_waitqueue_head(&s->in_progress_wait);
s->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle);
if (IS_ERR(s->kcopyd_client)) {
@@ -1357,9 +1356,9 @@ static void snapshot_dtr(struct dm_target *ti)
/* Check whether exception handover must be cancelled */
(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
if (snap_src && snap_dest && (s == snap_src)) {
- down_write(&snap_dest->lock);
+ mutex_lock(&snap_dest->lock);
snap_dest->valid = 0;
- up_write(&snap_dest->lock);
+ mutex_unlock(&snap_dest->lock);
DMERR("Cancelling snapshot handover.");
}
up_read(&_origins_lock);
@@ -1390,13 +1389,62 @@ static void snapshot_dtr(struct dm_target *ti)
dm_exception_store_destroy(s->store);
+ mutex_destroy(&s->lock);
+
dm_put_device(ti, s->cow);
dm_put_device(ti, s->origin);
+ WARN_ON(s->in_progress);
+
kfree(s);
}
+static void account_start_copy(struct dm_snapshot *s)
+{
+ spin_lock(&s->in_progress_wait.lock);
+ s->in_progress++;
+ spin_unlock(&s->in_progress_wait.lock);
+}
+
+static void account_end_copy(struct dm_snapshot *s)
+{
+ spin_lock(&s->in_progress_wait.lock);
+ BUG_ON(!s->in_progress);
+ s->in_progress--;
+ if (likely(s->in_progress <= cow_threshold) &&
+ unlikely(waitqueue_active(&s->in_progress_wait)))
+ wake_up_locked(&s->in_progress_wait);
+ spin_unlock(&s->in_progress_wait.lock);
+}
+
+static bool wait_for_in_progress(struct dm_snapshot *s, bool unlock_origins)
+{
+ if (unlikely(s->in_progress > cow_threshold)) {
+ spin_lock(&s->in_progress_wait.lock);
+ if (likely(s->in_progress > cow_threshold)) {
+ /*
+ * NOTE: this throttle doesn't account for whether
+ * the caller is servicing an IO that will trigger a COW
+ * so excess throttling may result for chunks not required
+ * to be COW'd. But if cow_threshold was reached, extra
+ * throttling is unlikely to negatively impact performance.
+ */
+ DECLARE_WAITQUEUE(wait, current);
+ __add_wait_queue(&s->in_progress_wait, &wait);
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ spin_unlock(&s->in_progress_wait.lock);
+ if (unlock_origins)
+ up_read(&_origins_lock);
+ io_schedule();
+ remove_wait_queue(&s->in_progress_wait, &wait);
+ return false;
+ }
+ spin_unlock(&s->in_progress_wait.lock);
+ }
+ return true;
+}
+
/*
* Flush a list of buffers.
*/
@@ -1412,7 +1460,7 @@ static void flush_bios(struct bio *bio)
}
}
-static int do_origin(struct dm_dev *origin, struct bio *bio);
+static int do_origin(struct dm_dev *origin, struct bio *bio, bool limit);
/*
* Flush a list of buffers.
@@ -1425,7 +1473,7 @@ static void retry_origin_bios(struct dm_snapshot *s, struct bio *bio)
while (bio) {
n = bio->bi_next;
bio->bi_next = NULL;
- r = do_origin(s->origin, bio);
+ r = do_origin(s->origin, bio, false);
if (r == DM_MAPIO_REMAPPED)
generic_make_request(bio);
bio = n;
@@ -1477,7 +1525,7 @@ static void pending_complete(void *context, int success)
if (!success) {
/* Read/write error - snapshot is unusable */
- down_write(&s->lock);
+ mutex_lock(&s->lock);
__invalidate_snapshot(s, -EIO);
error = 1;
goto out;
@@ -1485,14 +1533,14 @@ static void pending_complete(void *context, int success)
e = alloc_completed_exception(GFP_NOIO);
if (!e) {
- down_write(&s->lock);
+ mutex_lock(&s->lock);
__invalidate_snapshot(s, -ENOMEM);
error = 1;
goto out;
}
*e = pe->e;
- down_write(&s->lock);
+ mutex_lock(&s->lock);
if (!s->valid) {
free_completed_exception(e);
error = 1;
@@ -1517,7 +1565,7 @@ out:
full_bio->bi_end_io = pe->full_bio_end_io;
increment_pending_exceptions_done_count();
- up_write(&s->lock);
+ mutex_unlock(&s->lock);
/* Submit any pending write bios */
if (error) {
@@ -1579,7 +1627,7 @@ static void copy_callback(int read_err, unsigned long write_err, void *context)
}
list_add(&pe->out_of_order_entry, lh);
}
- up(&s->cow_count);
+ account_end_copy(s);
}
/*
@@ -1603,7 +1651,7 @@ static void start_copy(struct dm_snap_pending_exception *pe)
dest.count = src.count;
/* Hand over to kcopyd */
- down(&s->cow_count);
+ account_start_copy(s);
dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe);
}
@@ -1623,7 +1671,7 @@ static void start_full_bio(struct dm_snap_pending_exception *pe,
pe->full_bio = bio;
pe->full_bio_end_io = bio->bi_end_io;
- down(&s->cow_count);
+ account_start_copy(s);
callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client,
copy_callback, pe);
@@ -1714,9 +1762,12 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
if (!s->valid)
return DM_MAPIO_KILL;
- /* FIXME: should only take write lock if we need
- * to copy an exception */
- down_write(&s->lock);
+ if (bio_data_dir(bio) == WRITE) {
+ while (unlikely(!wait_for_in_progress(s, false)))
+ ; /* wait_for_in_progress() has slept */
+ }
+
+ mutex_lock(&s->lock);
if (!s->valid || (unlikely(s->snapshot_overflowed) &&
bio_data_dir(bio) == WRITE)) {
@@ -1739,9 +1790,9 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
if (bio_data_dir(bio) == WRITE) {
pe = __lookup_pending_exception(s, chunk);
if (!pe) {
- up_write(&s->lock);
+ mutex_unlock(&s->lock);
pe = alloc_pending_exception(s);
- down_write(&s->lock);
+ mutex_lock(&s->lock);
if (!s->valid || s->snapshot_overflowed) {
free_pending_exception(pe);
@@ -1776,7 +1827,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
bio->bi_iter.bi_size ==
(s->store->chunk_size << SECTOR_SHIFT)) {
pe->started = 1;
- up_write(&s->lock);
+ mutex_unlock(&s->lock);
start_full_bio(pe, bio);
goto out;
}
@@ -1786,7 +1837,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
if (!pe->started) {
/* this is protected by snap->lock */
pe->started = 1;
- up_write(&s->lock);
+ mutex_unlock(&s->lock);
start_copy(pe);
goto out;
}
@@ -1796,7 +1847,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
}
out_unlock:
- up_write(&s->lock);
+ mutex_unlock(&s->lock);
out:
return r;
}
@@ -1832,7 +1883,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio)
chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
- down_write(&s->lock);
+ mutex_lock(&s->lock);
/* Full merging snapshots are redirected to the origin */
if (!s->valid)
@@ -1863,12 +1914,12 @@ redirect_to_origin:
bio_set_dev(bio, s->origin->bdev);
if (bio_data_dir(bio) == WRITE) {
- up_write(&s->lock);
- return do_origin(s->origin, bio);
+ mutex_unlock(&s->lock);
+ return do_origin(s->origin, bio, false);
}
out_unlock:
- up_write(&s->lock);
+ mutex_unlock(&s->lock);
return r;
}
@@ -1900,7 +1951,7 @@ static int snapshot_preresume(struct dm_target *ti)
down_read(&_origins_lock);
(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
if (snap_src && snap_dest) {
- down_read(&snap_src->lock);
+ mutex_lock(&snap_src->lock);
if (s == snap_src) {
DMERR("Unable to resume snapshot source until "
"handover completes.");
@@ -1910,7 +1961,7 @@ static int snapshot_preresume(struct dm_target *ti)
"source is suspended.");
r = -EINVAL;
}
- up_read(&snap_src->lock);
+ mutex_unlock(&snap_src->lock);
}
up_read(&_origins_lock);
@@ -1956,11 +2007,11 @@ static void snapshot_resume(struct dm_target *ti)
(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
if (snap_src && snap_dest) {
- down_write(&snap_src->lock);
- down_write_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING);
+ mutex_lock(&snap_src->lock);
+ mutex_lock_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING);
__handover_exceptions(snap_src, snap_dest);
- up_write(&snap_dest->lock);
- up_write(&snap_src->lock);
+ mutex_unlock(&snap_dest->lock);
+ mutex_unlock(&snap_src->lock);
}
up_read(&_origins_lock);
@@ -1975,9 +2026,9 @@ static void snapshot_resume(struct dm_target *ti)
/* Now we have correct chunk size, reregister */
reregister_snapshot(s);
- down_write(&s->lock);
+ mutex_lock(&s->lock);
s->active = 1;
- up_write(&s->lock);
+ mutex_unlock(&s->lock);
}
static uint32_t get_origin_minimum_chunksize(struct block_device *bdev)
@@ -2017,7 +2068,7 @@ static void snapshot_status(struct dm_target *ti, status_type_t type,
switch (type) {
case STATUSTYPE_INFO:
- down_write(&snap->lock);
+ mutex_lock(&snap->lock);
if (!snap->valid)
DMEMIT("Invalid");
@@ -2042,7 +2093,7 @@ static void snapshot_status(struct dm_target *ti, status_type_t type,
DMEMIT("Unknown");
}
- up_write(&snap->lock);
+ mutex_unlock(&snap->lock);
break;
@@ -2108,7 +2159,7 @@ static int __origin_write(struct list_head *snapshots, sector_t sector,
if (dm_target_is_snapshot_merge(snap->ti))
continue;
- down_write(&snap->lock);
+ mutex_lock(&snap->lock);
/* Only deal with valid and active snapshots */
if (!snap->valid || !snap->active)
@@ -2135,9 +2186,9 @@ static int __origin_write(struct list_head *snapshots, sector_t sector,
pe = __lookup_pending_exception(snap, chunk);
if (!pe) {
- up_write(&snap->lock);
+ mutex_unlock(&snap->lock);
pe = alloc_pending_exception(snap);
- down_write(&snap->lock);
+ mutex_lock(&snap->lock);
if (!snap->valid) {
free_pending_exception(pe);
@@ -2180,7 +2231,7 @@ static int __origin_write(struct list_head *snapshots, sector_t sector,
}
next_snapshot:
- up_write(&snap->lock);
+ mutex_unlock(&snap->lock);
if (pe_to_start_now) {
start_copy(pe_to_start_now);
@@ -2201,15 +2252,24 @@ next_snapshot:
/*
* Called on a write from the origin driver.
*/
-static int do_origin(struct dm_dev *origin, struct bio *bio)
+static int do_origin(struct dm_dev *origin, struct bio *bio, bool limit)
{
struct origin *o;
int r = DM_MAPIO_REMAPPED;
+again:
down_read(&_origins_lock);
o = __lookup_origin(origin->bdev);
- if (o)
+ if (o) {
+ if (limit) {
+ struct dm_snapshot *s;
+ list_for_each_entry(s, &o->snapshots, list)
+ if (unlikely(!wait_for_in_progress(s, true)))
+ goto again;
+ }
+
r = __origin_write(&o->snapshots, bio->bi_iter.bi_sector, bio);
+ }
up_read(&_origins_lock);
return r;
@@ -2322,7 +2382,7 @@ static int origin_map(struct dm_target *ti, struct bio *bio)
dm_accept_partial_bio(bio, available_sectors);
/* Only tell snapshots if this is a write */
- return do_origin(o->dev, bio);
+ return do_origin(o->dev, bio, true);
}
static long origin_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index f9cd81375f28..8f070debe498 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1308,7 +1308,7 @@ void dm_table_event(struct dm_table *t)
}
EXPORT_SYMBOL(dm_table_event);
-sector_t dm_table_get_size(struct dm_table *t)
+inline sector_t dm_table_get_size(struct dm_table *t)
{
return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0;
}
@@ -1333,6 +1333,9 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
unsigned int l, n = 0, k = 0;
sector_t *node;
+ if (unlikely(sector >= dm_table_get_size(t)))
+ return &t->targets[t->num_targets];
+
for (l = 0; l < t->depth; l++) {
n = get_child(n, k);
node = get_node(t, l, n);
@@ -1789,6 +1792,36 @@ static bool dm_table_supports_discards(struct dm_table *t)
return true;
}
+static int device_requires_stable_pages(struct dm_target *ti,
+ struct dm_dev *dev, sector_t start,
+ sector_t len, void *data)
+{
+ struct request_queue *q = bdev_get_queue(dev->bdev);
+
+ return q && bdi_cap_stable_pages_required(q->backing_dev_info);
+}
+
+/*
+ * If any underlying device requires stable pages, a table must require
+ * them as well. Only targets that support iterate_devices are considered:
+ * don't want error, zero, etc to require stable pages.
+ */
+static bool dm_table_requires_stable_pages(struct dm_table *t)
+{
+ struct dm_target *ti;
+ unsigned i;
+
+ for (i = 0; i < dm_table_get_num_targets(t); i++) {
+ ti = dm_table_get_target(t, i);
+
+ if (ti->type->iterate_devices &&
+ ti->type->iterate_devices(ti, device_requires_stable_pages, NULL))
+ return true;
+ }
+
+ return false;
+}
+
void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
struct queue_limits *limits)
{
@@ -1838,6 +1871,15 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
dm_table_verify_integrity(t);
/*
+ * Some devices don't use blk_integrity but still want stable pages
+ * because they do their own checksumming.
+ */
+ if (dm_table_requires_stable_pages(t))
+ q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
+ else
+ q->backing_dev_info->capabilities &= ~BDI_CAP_STABLE_WRITES;
+
+ /*
* Determine whether or not this queue's I/O timings contribute
* to the entropy pool, Only request-based targets use this.
* Clear QUEUE_FLAG_ADD_RANDOM if any underlying device does not
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 40b624d8255d..0ee5eae71690 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -257,6 +257,7 @@ struct pool {
spinlock_t lock;
struct bio_list deferred_flush_bios;
+ struct bio_list deferred_flush_completions;
struct list_head prepared_mappings;
struct list_head prepared_discards;
struct list_head prepared_discards_pt2;
@@ -950,6 +951,39 @@ static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
mempool_free(m, m->tc->pool->mapping_pool);
}
+static void complete_overwrite_bio(struct thin_c *tc, struct bio *bio)
+{
+ struct pool *pool = tc->pool;
+ unsigned long flags;
+
+ /*
+ * If the bio has the REQ_FUA flag set we must commit the metadata
+ * before signaling its completion.
+ */
+ if (!bio_triggers_commit(tc, bio)) {
+ bio_endio(bio);
+ return;
+ }
+
+ /*
+ * Complete bio with an error if earlier I/O caused changes to the
+ * metadata that can't be committed, e.g, due to I/O errors on the
+ * metadata device.
+ */
+ if (dm_thin_aborted_changes(tc->td)) {
+ bio_io_error(bio);
+ return;
+ }
+
+ /*
+ * Batch together any bios that trigger commits and then issue a
+ * single commit for them in process_deferred_bios().
+ */
+ spin_lock_irqsave(&pool->lock, flags);
+ bio_list_add(&pool->deferred_flush_completions, bio);
+ spin_unlock_irqrestore(&pool->lock, flags);
+}
+
static void process_prepared_mapping(struct dm_thin_new_mapping *m)
{
struct thin_c *tc = m->tc;
@@ -982,7 +1016,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
*/
if (bio) {
inc_remap_and_issue_cell(tc, m->cell, m->data_block);
- bio_endio(bio);
+ complete_overwrite_bio(tc, bio);
} else {
inc_all_io_entry(tc->pool, m->cell->holder);
remap_and_issue(tc, m->cell->holder, m->data_block);
@@ -2328,7 +2362,7 @@ static void process_deferred_bios(struct pool *pool)
{
unsigned long flags;
struct bio *bio;
- struct bio_list bios;
+ struct bio_list bios, bio_completions;
struct thin_c *tc;
tc = get_first_thin(pool);
@@ -2339,26 +2373,36 @@ static void process_deferred_bios(struct pool *pool)
}
/*
- * If there are any deferred flush bios, we must commit
- * the metadata before issuing them.
+ * If there are any deferred flush bios, we must commit the metadata
+ * before issuing them or signaling their completion.
*/
bio_list_init(&bios);
+ bio_list_init(&bio_completions);
+
spin_lock_irqsave(&pool->lock, flags);
bio_list_merge(&bios, &pool->deferred_flush_bios);
bio_list_init(&pool->deferred_flush_bios);
+
+ bio_list_merge(&bio_completions, &pool->deferred_flush_completions);
+ bio_list_init(&pool->deferred_flush_completions);
spin_unlock_irqrestore(&pool->lock, flags);
- if (bio_list_empty(&bios) &&
+ if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) &&
!(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool)))
return;
if (commit(pool)) {
+ bio_list_merge(&bios, &bio_completions);
+
while ((bio = bio_list_pop(&bios)))
bio_io_error(bio);
return;
}
pool->last_commit_jiffies = jiffies;
+ while ((bio = bio_list_pop(&bio_completions)))
+ bio_endio(bio);
+
while ((bio = bio_list_pop(&bios)))
generic_make_request(bio);
}
@@ -2918,7 +2962,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
return (struct pool *)pmd;
}
- pool = kmalloc(sizeof(*pool), GFP_KERNEL);
+ pool = kzalloc(sizeof(*pool), GFP_KERNEL);
if (!pool) {
*error = "Error allocating memory for pool";
err_p = ERR_PTR(-ENOMEM);
@@ -2965,6 +3009,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout);
spin_lock_init(&pool->lock);
bio_list_init(&pool->deferred_flush_bios);
+ bio_list_init(&pool->deferred_flush_completions);
INIT_LIST_HEAD(&pool->prepared_mappings);
INIT_LIST_HEAD(&pool->prepared_discards);
INIT_LIST_HEAD(&pool->prepared_discards_pt2);
@@ -3247,6 +3292,13 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
as.argc = argc;
as.argv = argv;
+ /* make sure metadata and data are different devices */
+ if (!strcmp(argv[0], argv[1])) {
+ ti->error = "Error setting metadata or data device";
+ r = -EINVAL;
+ goto out_unlock;
+ }
+
/*
* Set default pool features.
*/
@@ -4128,6 +4180,12 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
tc->sort_bio_list = RB_ROOT;
if (argc == 3) {
+ if (!strcmp(argv[0], argv[2])) {
+ ti->error = "Error setting origin device";
+ r = -EINVAL;
+ goto bad_origin_dev;
+ }
+
r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
if (r) {
ti->error = "Error opening origin device";
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index ae0ec0abb667..7eeb8dd119ca 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -235,8 +235,8 @@ static int verity_handle_err(struct dm_verity *v, enum verity_block_type type,
BUG();
}
- DMERR("%s: %s block %llu is corrupted", v->data_dev->name, type_str,
- block);
+ DMERR_LIMIT("%s: %s block %llu is corrupted", v->data_dev->name,
+ type_str, block);
if (v->corrupted_errs == DM_VERITY_MAX_CORRUPTED_ERRS)
DMERR("%s: reached maximum errors", v->data_dev->name);
diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
index 34968ca6b84a..9b78f4a74a12 100644
--- a/drivers/md/dm-zoned-metadata.c
+++ b/drivers/md/dm-zoned-metadata.c
@@ -401,15 +401,18 @@ static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd,
sector_t block = zmd->sb[zmd->mblk_primary].block + mblk_no;
struct bio *bio;
+ if (dmz_bdev_is_dying(zmd->dev))
+ return ERR_PTR(-EIO);
+
/* Get a new block and a BIO to read it */
mblk = dmz_alloc_mblock(zmd, mblk_no);
if (!mblk)
- return NULL;
+ return ERR_PTR(-ENOMEM);
bio = bio_alloc(GFP_NOIO, 1);
if (!bio) {
dmz_free_mblock(zmd, mblk);
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
spin_lock(&zmd->mblk_lock);
@@ -540,8 +543,8 @@ static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd,
if (!mblk) {
/* Cache miss: read the block from disk */
mblk = dmz_get_mblock_slow(zmd, mblk_no);
- if (!mblk)
- return ERR_PTR(-ENOMEM);
+ if (IS_ERR(mblk))
+ return mblk;
}
/* Wait for on-going read I/O and check for error */
@@ -549,6 +552,7 @@ static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd,
TASK_UNINTERRUPTIBLE);
if (test_bit(DMZ_META_ERROR, &mblk->state)) {
dmz_release_mblock(zmd, mblk);
+ dmz_check_bdev(zmd->dev);
return ERR_PTR(-EIO);
}
@@ -569,16 +573,19 @@ static void dmz_dirty_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk)
/*
* Issue a metadata block write BIO.
*/
-static void dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk,
- unsigned int set)
+static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk,
+ unsigned int set)
{
sector_t block = zmd->sb[set].block + mblk->no;
struct bio *bio;
+ if (dmz_bdev_is_dying(zmd->dev))
+ return -EIO;
+
bio = bio_alloc(GFP_NOIO, 1);
if (!bio) {
set_bit(DMZ_META_ERROR, &mblk->state);
- return;
+ return -ENOMEM;
}
set_bit(DMZ_META_WRITING, &mblk->state);
@@ -590,6 +597,8 @@ static void dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk,
bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META | REQ_PRIO);
bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0);
submit_bio(bio);
+
+ return 0;
}
/*
@@ -601,6 +610,9 @@ static int dmz_rdwr_block(struct dmz_metadata *zmd, int op, sector_t block,
struct bio *bio;
int ret;
+ if (dmz_bdev_is_dying(zmd->dev))
+ return -EIO;
+
bio = bio_alloc(GFP_NOIO, 1);
if (!bio)
return -ENOMEM;
@@ -612,6 +624,8 @@ static int dmz_rdwr_block(struct dmz_metadata *zmd, int op, sector_t block,
ret = submit_bio_wait(bio);
bio_put(bio);
+ if (ret)
+ dmz_check_bdev(zmd->dev);
return ret;
}
@@ -658,22 +672,30 @@ static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd,
{
struct dmz_mblock *mblk;
struct blk_plug plug;
- int ret = 0;
+ int ret = 0, nr_mblks_submitted = 0;
/* Issue writes */
blk_start_plug(&plug);
- list_for_each_entry(mblk, write_list, link)
- dmz_write_mblock(zmd, mblk, set);
+ list_for_each_entry(mblk, write_list, link) {
+ ret = dmz_write_mblock(zmd, mblk, set);
+ if (ret)
+ break;
+ nr_mblks_submitted++;
+ }
blk_finish_plug(&plug);
/* Wait for completion */
list_for_each_entry(mblk, write_list, link) {
+ if (!nr_mblks_submitted)
+ break;
wait_on_bit_io(&mblk->state, DMZ_META_WRITING,
TASK_UNINTERRUPTIBLE);
if (test_bit(DMZ_META_ERROR, &mblk->state)) {
clear_bit(DMZ_META_ERROR, &mblk->state);
+ dmz_check_bdev(zmd->dev);
ret = -EIO;
}
+ nr_mblks_submitted--;
}
/* Flush drive cache (this will also sync data) */
@@ -735,6 +757,11 @@ int dmz_flush_metadata(struct dmz_metadata *zmd)
*/
dmz_lock_flush(zmd);
+ if (dmz_bdev_is_dying(zmd->dev)) {
+ ret = -EIO;
+ goto out;
+ }
+
/* Get dirty blocks */
spin_lock(&zmd->mblk_lock);
list_splice_init(&zmd->mblk_dirty_list, &write_list);
@@ -743,7 +770,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd)
/* If there are no dirty metadata blocks, just flush the device cache */
if (list_empty(&write_list)) {
ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL);
- goto out;
+ goto err;
}
/*
@@ -753,7 +780,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd)
*/
ret = dmz_log_dirty_mblocks(zmd, &write_list);
if (ret)
- goto out;
+ goto err;
/*
* The log is on disk. It is now safe to update in place
@@ -761,11 +788,11 @@ int dmz_flush_metadata(struct dmz_metadata *zmd)
*/
ret = dmz_write_dirty_mblocks(zmd, &write_list, zmd->mblk_primary);
if (ret)
- goto out;
+ goto err;
ret = dmz_write_sb(zmd, zmd->mblk_primary);
if (ret)
- goto out;
+ goto err;
while (!list_empty(&write_list)) {
mblk = list_first_entry(&write_list, struct dmz_mblock, link);
@@ -780,16 +807,20 @@ int dmz_flush_metadata(struct dmz_metadata *zmd)
zmd->sb_gen++;
out:
- if (ret && !list_empty(&write_list)) {
- spin_lock(&zmd->mblk_lock);
- list_splice(&write_list, &zmd->mblk_dirty_list);
- spin_unlock(&zmd->mblk_lock);
- }
-
dmz_unlock_flush(zmd);
up_write(&zmd->mblk_sem);
return ret;
+
+err:
+ if (!list_empty(&write_list)) {
+ spin_lock(&zmd->mblk_lock);
+ list_splice(&write_list, &zmd->mblk_dirty_list);
+ spin_unlock(&zmd->mblk_lock);
+ }
+ if (!dmz_check_bdev(zmd->dev))
+ ret = -EIO;
+ goto out;
}
/*
@@ -1169,6 +1200,9 @@ static int dmz_init_zones(struct dmz_metadata *zmd)
goto out;
}
+ if (!nr_blkz)
+ break;
+
/* Process report */
for (i = 0; i < nr_blkz; i++) {
ret = dmz_init_zone(zmd, zone, &blkz[i]);
@@ -1204,9 +1238,12 @@ static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
/* Get zone information from disk */
ret = blkdev_report_zones(zmd->dev->bdev, dmz_start_sect(zmd, zone),
&blkz, &nr_blkz, GFP_NOIO);
+ if (!nr_blkz)
+ ret = -EIO;
if (ret) {
dmz_dev_err(zmd->dev, "Get zone %u report failed",
dmz_id(zmd, zone));
+ dmz_check_bdev(zmd->dev);
return ret;
}
@@ -1529,7 +1566,7 @@ static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd)
struct dm_zone *zone;
if (list_empty(&zmd->map_rnd_list))
- return NULL;
+ return ERR_PTR(-EBUSY);
list_for_each_entry(zone, &zmd->map_rnd_list, link) {
if (dmz_is_buf(zone))
@@ -1540,7 +1577,7 @@ static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd)
return dzone;
}
- return NULL;
+ return ERR_PTR(-EBUSY);
}
/*
@@ -1551,7 +1588,7 @@ static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd)
struct dm_zone *zone;
if (list_empty(&zmd->map_seq_list))
- return NULL;
+ return ERR_PTR(-EBUSY);
list_for_each_entry(zone, &zmd->map_seq_list, link) {
if (!zone->bzone)
@@ -1560,7 +1597,7 @@ static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd)
return zone;
}
- return NULL;
+ return ERR_PTR(-EBUSY);
}
/*
@@ -1589,30 +1626,6 @@ struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd)
}
/*
- * Activate a zone (increment its reference count).
- */
-void dmz_activate_zone(struct dm_zone *zone)
-{
- set_bit(DMZ_ACTIVE, &zone->flags);
- atomic_inc(&zone->refcount);
-}
-
-/*
- * Deactivate a zone. This decrement the zone reference counter
- * and clears the active state of the zone once the count reaches 0,
- * indicating that all BIOs to the zone have completed. Returns
- * true if the zone was deactivated.
- */
-void dmz_deactivate_zone(struct dm_zone *zone)
-{
- if (atomic_dec_and_test(&zone->refcount)) {
- WARN_ON(!test_bit(DMZ_ACTIVE, &zone->flags));
- clear_bit_unlock(DMZ_ACTIVE, &zone->flags);
- smp_mb__after_atomic();
- }
-}
-
-/*
* Get the zone mapping a chunk, if the chunk is mapped already.
* If no mapping exist and the operation is WRITE, a zone is
* allocated and used to map the chunk.
@@ -1642,6 +1655,10 @@ again:
/* Alloate a random zone */
dzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND);
if (!dzone) {
+ if (dmz_bdev_is_dying(zmd->dev)) {
+ dzone = ERR_PTR(-EIO);
+ goto out;
+ }
dmz_wait_for_free_zones(zmd);
goto again;
}
@@ -1739,6 +1756,10 @@ again:
/* Alloate a random zone */
bzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND);
if (!bzone) {
+ if (dmz_bdev_is_dying(zmd->dev)) {
+ bzone = ERR_PTR(-EIO);
+ goto out;
+ }
dmz_wait_for_free_zones(zmd);
goto again;
}
diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c
index 44a119e12f1a..2fad512dce98 100644
--- a/drivers/md/dm-zoned-reclaim.c
+++ b/drivers/md/dm-zoned-reclaim.c
@@ -37,7 +37,7 @@ enum {
/*
* Number of seconds of target BIO inactivity to consider the target idle.
*/
-#define DMZ_IDLE_PERIOD (10UL * HZ)
+#define DMZ_IDLE_PERIOD (10UL * HZ)
/*
* Percentage of unmapped (free) random zones below which reclaim starts
@@ -81,6 +81,7 @@ static int dmz_reclaim_align_wp(struct dmz_reclaim *zrc, struct dm_zone *zone,
"Align zone %u wp %llu to %llu (wp+%u) blocks failed %d",
dmz_id(zmd, zone), (unsigned long long)wp_block,
(unsigned long long)block, nr_blocks, ret);
+ dmz_check_bdev(zrc->dev);
return ret;
}
@@ -134,6 +135,9 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc,
set_bit(DM_KCOPYD_WRITE_SEQ, &flags);
while (block < end_block) {
+ if (dev->flags & DMZ_BDEV_DYING)
+ return -EIO;
+
/* Get a valid region from the source zone */
ret = dmz_first_valid_block(zmd, src_zone, &block);
if (ret <= 0)
@@ -217,7 +221,7 @@ static int dmz_reclaim_buf(struct dmz_reclaim *zrc, struct dm_zone *dzone)
dmz_unlock_flush(zmd);
- return 0;
+ return ret;
}
/*
@@ -261,7 +265,7 @@ static int dmz_reclaim_seq_data(struct dmz_reclaim *zrc, struct dm_zone *dzone)
dmz_unlock_flush(zmd);
- return 0;
+ return ret;
}
/*
@@ -314,7 +318,7 @@ static int dmz_reclaim_rnd_data(struct dmz_reclaim *zrc, struct dm_zone *dzone)
dmz_unlock_flush(zmd);
- return 0;
+ return ret;
}
/*
@@ -336,7 +340,7 @@ static void dmz_reclaim_empty(struct dmz_reclaim *zrc, struct dm_zone *dzone)
/*
* Find a candidate zone for reclaim and process it.
*/
-static void dmz_reclaim(struct dmz_reclaim *zrc)
+static int dmz_do_reclaim(struct dmz_reclaim *zrc)
{
struct dmz_metadata *zmd = zrc->metadata;
struct dm_zone *dzone;
@@ -346,8 +350,8 @@ static void dmz_reclaim(struct dmz_reclaim *zrc)
/* Get a data zone */
dzone = dmz_get_zone_for_reclaim(zmd);
- if (!dzone)
- return;
+ if (IS_ERR(dzone))
+ return PTR_ERR(dzone);
start = jiffies;
@@ -393,13 +397,20 @@ static void dmz_reclaim(struct dmz_reclaim *zrc)
out:
if (ret) {
dmz_unlock_zone_reclaim(dzone);
- return;
+ return ret;
}
- (void) dmz_flush_metadata(zrc->metadata);
+ ret = dmz_flush_metadata(zrc->metadata);
+ if (ret) {
+ dmz_dev_debug(zrc->dev,
+ "Metadata flush for zone %u failed, err %d\n",
+ dmz_id(zmd, rzone), ret);
+ return ret;
+ }
dmz_dev_debug(zrc->dev, "Reclaimed zone %u in %u ms",
dmz_id(zmd, rzone), jiffies_to_msecs(jiffies - start));
+ return 0;
}
/*
@@ -444,6 +455,10 @@ static void dmz_reclaim_work(struct work_struct *work)
struct dmz_metadata *zmd = zrc->metadata;
unsigned int nr_rnd, nr_unmap_rnd;
unsigned int p_unmap_rnd;
+ int ret;
+
+ if (dmz_bdev_is_dying(zrc->dev))
+ return;
if (!dmz_should_reclaim(zrc)) {
mod_delayed_work(zrc->wq, &zrc->work, DMZ_IDLE_PERIOD);
@@ -473,7 +488,12 @@ static void dmz_reclaim_work(struct work_struct *work)
(dmz_target_idle(zrc) ? "Idle" : "Busy"),
p_unmap_rnd, nr_unmap_rnd, nr_rnd);
- dmz_reclaim(zrc);
+ ret = dmz_do_reclaim(zrc);
+ if (ret) {
+ dmz_dev_debug(zrc->dev, "Reclaim error %d\n", ret);
+ if (!dmz_check_bdev(zrc->dev))
+ return;
+ }
dmz_schedule_reclaim(zrc);
}
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
index 532bfce7f072..497a2bc5da51 100644
--- a/drivers/md/dm-zoned-target.c
+++ b/drivers/md/dm-zoned-target.c
@@ -79,6 +79,8 @@ static inline void dmz_bio_endio(struct bio *bio, blk_status_t status)
if (status != BLK_STS_OK && bio->bi_status == BLK_STS_OK)
bio->bi_status = status;
+ if (bio->bi_status != BLK_STS_OK)
+ bioctx->target->dev->flags |= DMZ_CHECK_BDEV;
if (atomic_dec_and_test(&bioctx->ref)) {
struct dm_zone *zone = bioctx->zone;
@@ -277,8 +279,8 @@ static int dmz_handle_buffered_write(struct dmz_target *dmz,
/* Get the buffer zone. One will be allocated if needed */
bzone = dmz_get_chunk_buffer(zmd, zone);
- if (!bzone)
- return -ENOSPC;
+ if (IS_ERR(bzone))
+ return PTR_ERR(bzone);
if (dmz_is_readonly(bzone))
return -EROFS;
@@ -389,6 +391,11 @@ static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw,
dmz_lock_metadata(zmd);
+ if (dmz->dev->flags & DMZ_BDEV_DYING) {
+ ret = -EIO;
+ goto out;
+ }
+
/*
* Get the data zone mapping the chunk. There may be no
* mapping for read and discard. If a mapping is obtained,
@@ -493,6 +500,8 @@ static void dmz_flush_work(struct work_struct *work)
/* Flush dirty metadata blocks */
ret = dmz_flush_metadata(dmz->metadata);
+ if (ret)
+ dmz_dev_debug(dmz->dev, "Metadata flush failed, rc=%d\n", ret);
/* Process queued flush requests */
while (1) {
@@ -513,22 +522,24 @@ static void dmz_flush_work(struct work_struct *work)
* Get a chunk work and start it to process a new BIO.
* If the BIO chunk has no work yet, create one.
*/
-static void dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio)
+static int dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio)
{
unsigned int chunk = dmz_bio_chunk(dmz->dev, bio);
struct dm_chunk_work *cw;
+ int ret = 0;
mutex_lock(&dmz->chunk_lock);
/* Get the BIO chunk work. If one is not active yet, create one */
cw = radix_tree_lookup(&dmz->chunk_rxtree, chunk);
if (!cw) {
- int ret;
/* Create a new chunk work */
cw = kmalloc(sizeof(struct dm_chunk_work), GFP_NOIO);
- if (!cw)
+ if (unlikely(!cw)) {
+ ret = -ENOMEM;
goto out;
+ }
INIT_WORK(&cw->work, dmz_chunk_work);
atomic_set(&cw->refcount, 0);
@@ -539,7 +550,6 @@ static void dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio)
ret = radix_tree_insert(&dmz->chunk_rxtree, chunk, cw);
if (unlikely(ret)) {
kfree(cw);
- cw = NULL;
goto out;
}
}
@@ -547,10 +557,58 @@ static void dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio)
bio_list_add(&cw->bio_list, bio);
dmz_get_chunk_work(cw);
+ dmz_reclaim_bio_acc(dmz->reclaim);
if (queue_work(dmz->chunk_wq, &cw->work))
dmz_get_chunk_work(cw);
out:
mutex_unlock(&dmz->chunk_lock);
+ return ret;
+}
+
+/*
+ * Check if the backing device is being removed. If it's on the way out,
+ * start failing I/O. Reclaim and metadata components also call this
+ * function to cleanly abort operation in the event of such failure.
+ */
+bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev)
+{
+ if (dmz_dev->flags & DMZ_BDEV_DYING)
+ return true;
+
+ if (dmz_dev->flags & DMZ_CHECK_BDEV)
+ return !dmz_check_bdev(dmz_dev);
+
+ if (blk_queue_dying(bdev_get_queue(dmz_dev->bdev))) {
+ dmz_dev_warn(dmz_dev, "Backing device queue dying");
+ dmz_dev->flags |= DMZ_BDEV_DYING;
+ }
+
+ return dmz_dev->flags & DMZ_BDEV_DYING;
+}
+
+/*
+ * Check the backing device availability. This detects such events as
+ * backing device going offline due to errors, media removals, etc.
+ * This check is less efficient than dmz_bdev_is_dying() and should
+ * only be performed as a part of error handling.
+ */
+bool dmz_check_bdev(struct dmz_dev *dmz_dev)
+{
+ struct gendisk *disk;
+
+ dmz_dev->flags &= ~DMZ_CHECK_BDEV;
+
+ if (dmz_bdev_is_dying(dmz_dev))
+ return false;
+
+ disk = dmz_dev->bdev->bd_disk;
+ if (disk->fops->check_events &&
+ disk->fops->check_events(disk, 0) & DISK_EVENT_MEDIA_CHANGE) {
+ dmz_dev_warn(dmz_dev, "Backing device offline");
+ dmz_dev->flags |= DMZ_BDEV_DYING;
+ }
+
+ return !(dmz_dev->flags & DMZ_BDEV_DYING);
}
/*
@@ -564,6 +622,10 @@ static int dmz_map(struct dm_target *ti, struct bio *bio)
sector_t sector = bio->bi_iter.bi_sector;
unsigned int nr_sectors = bio_sectors(bio);
sector_t chunk_sector;
+ int ret;
+
+ if (dmz_bdev_is_dying(dmz->dev))
+ return DM_MAPIO_KILL;
dmz_dev_debug(dev, "BIO op %d sector %llu + %u => chunk %llu, block %llu, %u blocks",
bio_op(bio), (unsigned long long)sector, nr_sectors,
@@ -601,8 +663,14 @@ static int dmz_map(struct dm_target *ti, struct bio *bio)
dm_accept_partial_bio(bio, dev->zone_nr_sectors - chunk_sector);
/* Now ready to handle this BIO */
- dmz_reclaim_bio_acc(dmz->reclaim);
- dmz_queue_chunk_work(dmz, bio);
+ ret = dmz_queue_chunk_work(dmz, bio);
+ if (ret) {
+ dmz_dev_debug(dmz->dev,
+ "BIO op %d, can't process chunk %llu, err %i\n",
+ bio_op(bio), (u64)dmz_bio_chunk(dmz->dev, bio),
+ ret);
+ return DM_MAPIO_REQUEUE;
+ }
return DM_MAPIO_SUBMITTED;
}
@@ -855,6 +923,9 @@ static int dmz_prepare_ioctl(struct dm_target *ti,
{
struct dmz_target *dmz = ti->private;
+ if (!dmz_check_bdev(dmz->dev))
+ return -EIO;
+
*bdev = dmz->dev->bdev;
return 0;
diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h
index 12419f0bfe78..2662746ba8b9 100644
--- a/drivers/md/dm-zoned.h
+++ b/drivers/md/dm-zoned.h
@@ -56,6 +56,8 @@ struct dmz_dev {
unsigned int nr_zones;
+ unsigned int flags;
+
sector_t zone_nr_sectors;
unsigned int zone_nr_sectors_shift;
@@ -67,6 +69,10 @@ struct dmz_dev {
(dev)->zone_nr_sectors_shift)
#define dmz_chunk_block(dev, b) ((b) & ((dev)->zone_nr_blocks - 1))
+/* Device flags. */
+#define DMZ_BDEV_DYING (1 << 0)
+#define DMZ_CHECK_BDEV (2 << 0)
+
/*
* Zone descriptor.
*/
@@ -115,7 +121,6 @@ enum {
DMZ_BUF,
/* Zone internal state */
- DMZ_ACTIVE,
DMZ_RECLAIM,
DMZ_SEQ_WRITE_ERR,
};
@@ -128,7 +133,6 @@ enum {
#define dmz_is_empty(z) ((z)->wp_block == 0)
#define dmz_is_offline(z) test_bit(DMZ_OFFLINE, &(z)->flags)
#define dmz_is_readonly(z) test_bit(DMZ_READ_ONLY, &(z)->flags)
-#define dmz_is_active(z) test_bit(DMZ_ACTIVE, &(z)->flags)
#define dmz_in_reclaim(z) test_bit(DMZ_RECLAIM, &(z)->flags)
#define dmz_seq_write_err(z) test_bit(DMZ_SEQ_WRITE_ERR, &(z)->flags)
@@ -188,8 +192,30 @@ void dmz_unmap_zone(struct dmz_metadata *zmd, struct dm_zone *zone);
unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd);
unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd);
-void dmz_activate_zone(struct dm_zone *zone);
-void dmz_deactivate_zone(struct dm_zone *zone);
+/*
+ * Activate a zone (increment its reference count).
+ */
+static inline void dmz_activate_zone(struct dm_zone *zone)
+{
+ atomic_inc(&zone->refcount);
+}
+
+/*
+ * Deactivate a zone. This decrement the zone reference counter
+ * indicating that all BIOs to the zone have completed when the count is 0.
+ */
+static inline void dmz_deactivate_zone(struct dm_zone *zone)
+{
+ atomic_dec(&zone->refcount);
+}
+
+/*
+ * Test if a zone is active, that is, has a refcount > 0.
+ */
+static inline bool dmz_is_active(struct dm_zone *zone)
+{
+ return atomic_read(&zone->refcount);
+}
int dmz_lock_zone_reclaim(struct dm_zone *zone);
void dmz_unlock_zone_reclaim(struct dm_zone *zone);
@@ -225,4 +251,10 @@ void dmz_resume_reclaim(struct dmz_reclaim *zrc);
void dmz_reclaim_bio_acc(struct dmz_reclaim *zrc);
void dmz_schedule_reclaim(struct dmz_reclaim *zrc);
+/*
+ * Functions defined in dm-zoned-target.c
+ */
+bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev);
+bool dmz_check_bdev(struct dmz_dev *dmz_dev);
+
#endif /* DM_ZONED_H */
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 717aaffc227d..10057ac85476 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -1128,7 +1128,7 @@ int cluster_check_sync_size(struct mddev *mddev)
bm_lockres = lockres_init(mddev, str, NULL, 1);
if (!bm_lockres) {
pr_err("md-cluster: Cannot initialize %s\n", str);
- bitmap_free(bitmap);
+ md_bitmap_free(bitmap);
return -1;
}
bm_lockres->flags |= DLM_LKF_NOQUEUE;
@@ -1142,11 +1142,11 @@ int cluster_check_sync_size(struct mddev *mddev)
sync_size = sb->sync_size;
else if (sync_size != sb->sync_size) {
kunmap_atomic(sb);
- bitmap_free(bitmap);
+ md_bitmap_free(bitmap);
return -1;
}
kunmap_atomic(sb);
- bitmap_free(bitmap);
+ md_bitmap_free(bitmap);
}
return (my_sync_size == sync_size) ? 0 : -1;
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 5599712d478e..b942c74f1ce8 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1763,8 +1763,15 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
if (!(le32_to_cpu(sb->feature_map) &
MD_FEATURE_RECOVERY_BITMAP))
rdev->saved_raid_disk = -1;
- } else
- set_bit(In_sync, &rdev->flags);
+ } else {
+ /*
+ * If the array is FROZEN, then the device can't
+ * be in_sync with rest of array.
+ */
+ if (!test_bit(MD_RECOVERY_FROZEN,
+ &mddev->recovery))
+ set_bit(In_sync, &rdev->flags);
+ }
rdev->raid_disk = role;
break;
}
@@ -2845,8 +2852,10 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
err = 0;
}
} else if (cmd_match(buf, "re-add")) {
- if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) &&
- rdev->saved_raid_disk >= 0) {
+ if (!rdev->mddev->pers)
+ err = -EINVAL;
+ else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) &&
+ rdev->saved_raid_disk >= 0) {
/* clear_bit is performed _after_ all the devices
* have their local Faulty bit cleared. If any writes
* happen in the meantime in the local node, they
@@ -4099,7 +4108,7 @@ array_state_show(struct mddev *mddev, char *page)
{
enum array_state st = inactive;
- if (mddev->pers)
+ if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags))
switch(mddev->ro) {
case 1:
st = readonly;
@@ -5660,9 +5669,6 @@ int md_run(struct mddev *mddev)
md_update_sb(mddev, 0);
md_new_event(mddev);
- sysfs_notify_dirent_safe(mddev->sysfs_state);
- sysfs_notify_dirent_safe(mddev->sysfs_action);
- sysfs_notify(&mddev->kobj, NULL, "degraded");
return 0;
abort:
@@ -5683,6 +5689,7 @@ static int do_md_run(struct mddev *mddev)
{
int err;
+ set_bit(MD_NOT_READY, &mddev->flags);
err = md_run(mddev);
if (err)
goto out;
@@ -5700,9 +5707,14 @@ static int do_md_run(struct mddev *mddev)
set_capacity(mddev->gendisk, mddev->array_sectors);
revalidate_disk(mddev->gendisk);
+ clear_bit(MD_NOT_READY, &mddev->flags);
mddev->changed = 1;
kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
+ sysfs_notify_dirent_safe(mddev->sysfs_state);
+ sysfs_notify_dirent_safe(mddev->sysfs_action);
+ sysfs_notify(&mddev->kobj, NULL, "degraded");
out:
+ clear_bit(MD_NOT_READY, &mddev->flags);
return err;
}
@@ -7603,9 +7615,9 @@ static void status_unused(struct seq_file *seq)
static int status_resync(struct seq_file *seq, struct mddev *mddev)
{
sector_t max_sectors, resync, res;
- unsigned long dt, db;
- sector_t rt;
- int scale;
+ unsigned long dt, db = 0;
+ sector_t rt, curr_mark_cnt, resync_mark_cnt;
+ int scale, recovery_active;
unsigned int per_milli;
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
@@ -7675,22 +7687,30 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev)
* db: blocks written from mark until now
* rt: remaining time
*
- * rt is a sector_t, so could be 32bit or 64bit.
- * So we divide before multiply in case it is 32bit and close
- * to the limit.
- * We scale the divisor (db) by 32 to avoid losing precision
- * near the end of resync when the number of remaining sectors
- * is close to 'db'.
- * We then divide rt by 32 after multiplying by db to compensate.
- * The '+1' avoids division by zero if db is very small.
+ * rt is a sector_t, which is always 64bit now. We are keeping
+ * the original algorithm, but it is not really necessary.
+ *
+ * Original algorithm:
+ * So we divide before multiply in case it is 32bit and close
+ * to the limit.
+ * We scale the divisor (db) by 32 to avoid losing precision
+ * near the end of resync when the number of remaining sectors
+ * is close to 'db'.
+ * We then divide rt by 32 after multiplying by db to compensate.
+ * The '+1' avoids division by zero if db is very small.
*/
dt = ((jiffies - mddev->resync_mark) / HZ);
if (!dt) dt++;
- db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
- - mddev->resync_mark_cnt;
+
+ curr_mark_cnt = mddev->curr_mark_cnt;
+ recovery_active = atomic_read(&mddev->recovery_active);
+ resync_mark_cnt = mddev->resync_mark_cnt;
+
+ if (curr_mark_cnt >= (recovery_active + resync_mark_cnt))
+ db = curr_mark_cnt - (recovery_active + resync_mark_cnt);
rt = max_sectors - resync; /* number of remaining sectors */
- sector_div(rt, db/32+1);
+ rt = div64_u64(rt, db/32+1);
rt *= dt;
rt >>= 5;
@@ -8716,6 +8736,18 @@ static void md_start_sync(struct work_struct *ws)
*/
void md_check_recovery(struct mddev *mddev)
{
+ if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) {
+ /* Write superblock - thread that called mddev_suspend()
+ * holds reconfig_mutex for us.
+ */
+ set_bit(MD_UPDATING_SB, &mddev->flags);
+ smp_mb__after_atomic();
+ if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags))
+ md_update_sb(mddev, 0);
+ clear_bit_unlock(MD_UPDATING_SB, &mddev->flags);
+ wake_up(&mddev->sb_wait);
+ }
+
if (mddev->suspended)
return;
@@ -8745,6 +8777,7 @@ void md_check_recovery(struct mddev *mddev)
if (mddev_trylock(mddev)) {
int spares = 0;
+ bool try_set_sync = mddev->safemode != 0;
if (!mddev->external && mddev->safemode == 1)
mddev->safemode = 0;
@@ -8790,7 +8823,7 @@ void md_check_recovery(struct mddev *mddev)
}
}
- if (!mddev->external && !mddev->in_sync) {
+ if (try_set_sync && !mddev->external && !mddev->in_sync) {
spin_lock(&mddev->lock);
set_in_sync(mddev);
spin_unlock(&mddev->lock);
@@ -8875,16 +8908,6 @@ void md_check_recovery(struct mddev *mddev)
unlock:
wake_up(&mddev->sb_wait);
mddev_unlock(mddev);
- } else if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) {
- /* Write superblock - thread that called mddev_suspend()
- * holds reconfig_mutex for us.
- */
- set_bit(MD_UPDATING_SB, &mddev->flags);
- smp_mb__after_atomic();
- if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags))
- md_update_sb(mddev, 0);
- clear_bit_unlock(MD_UPDATING_SB, &mddev->flags);
- wake_up(&mddev->sb_wait);
}
}
EXPORT_SYMBOL(md_check_recovery);
@@ -8896,7 +8919,8 @@ void md_reap_sync_thread(struct mddev *mddev)
/* resync has finished, collect result */
md_unregister_thread(&mddev->sync_thread);
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
- !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+ !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
+ mddev->degraded != mddev->raid_disks) {
/* success...*/
/* activate any spares */
if (mddev->pers->spare_active(mddev)) {
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 11696aba94e3..69bc0d5550cd 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -243,6 +243,9 @@ enum mddev_flags {
MD_UPDATING_SB, /* md_check_recovery is updating the metadata
* without explicitly holding reconfig_mutex.
*/
+ MD_NOT_READY, /* do_md_run() is active, so 'array_state'
+ * must not report that array is ready yet
+ */
};
enum mddev_sb_flags {
diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c
index 21ea537bd55e..eff04fa23dfa 100644
--- a/drivers/md/persistent-data/dm-btree-remove.c
+++ b/drivers/md/persistent-data/dm-btree-remove.c
@@ -203,7 +203,13 @@ static void __rebalance2(struct dm_btree_info *info, struct btree_node *parent,
struct btree_node *right = r->n;
uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
- unsigned threshold = 2 * merge_threshold(left) + 1;
+ /*
+ * Ensure the number of entries in each child will be greater
+ * than or equal to (max_entries / 3 + 1), so no matter which
+ * child is used for removal, the number will still be not
+ * less than (max_entries / 3).
+ */
+ unsigned int threshold = 2 * (merge_threshold(left) + 1);
if (nr_left + nr_right < threshold) {
/*
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index 58b319757b1e..8aae0624a297 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -628,39 +628,40 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
new_parent = shadow_current(s);
+ pn = dm_block_data(new_parent);
+ size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
+ sizeof(__le64) : s->info->value_type.size;
+
+ /* create & init the left block */
r = new_block(s->info, &left);
if (r < 0)
return r;
+ ln = dm_block_data(left);
+ nr_left = le32_to_cpu(pn->header.nr_entries) / 2;
+
+ ln->header.flags = pn->header.flags;
+ ln->header.nr_entries = cpu_to_le32(nr_left);
+ ln->header.max_entries = pn->header.max_entries;
+ ln->header.value_size = pn->header.value_size;
+ memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
+ memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);
+
+ /* create & init the right block */
r = new_block(s->info, &right);
if (r < 0) {
unlock_block(s->info, left);
return r;
}
- pn = dm_block_data(new_parent);
- ln = dm_block_data(left);
rn = dm_block_data(right);
-
- nr_left = le32_to_cpu(pn->header.nr_entries) / 2;
nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;
- ln->header.flags = pn->header.flags;
- ln->header.nr_entries = cpu_to_le32(nr_left);
- ln->header.max_entries = pn->header.max_entries;
- ln->header.value_size = pn->header.value_size;
-
rn->header.flags = pn->header.flags;
rn->header.nr_entries = cpu_to_le32(nr_right);
rn->header.max_entries = pn->header.max_entries;
rn->header.value_size = pn->header.value_size;
-
- memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));
-
- size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
- sizeof(__le64) : s->info->value_type.size;
- memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);
memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
nr_right * size);
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index 4aed69d9dd17..b23cac2c4738 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -248,7 +248,7 @@ static int out(struct sm_metadata *smm)
}
if (smm->recursion_count == 1)
- apply_bops(smm);
+ r = apply_bops(smm);
smm->recursion_count--;
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 5ecba9eef441..cdafa5e0ea6d 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -26,6 +26,9 @@
#include "raid0.h"
#include "raid5.h"
+static int default_layout = 0;
+module_param(default_layout, int, 0644);
+
#define UNSUPPORTED_MDDEV_FLAGS \
((1L << MD_HAS_JOURNAL) | \
(1L << MD_JOURNAL_CLEAN) | \
@@ -91,7 +94,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
char b[BDEVNAME_SIZE];
char b2[BDEVNAME_SIZE];
struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL);
- unsigned short blksize = 512;
+ unsigned blksize = 512;
*private_conf = ERR_PTR(-ENOMEM);
if (!conf)
@@ -146,6 +149,19 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
}
pr_debug("md/raid0:%s: FINAL %d zones\n",
mdname(mddev), conf->nr_strip_zones);
+
+ if (conf->nr_strip_zones == 1) {
+ conf->layout = RAID0_ORIG_LAYOUT;
+ } else if (default_layout == RAID0_ORIG_LAYOUT ||
+ default_layout == RAID0_ALT_MULTIZONE_LAYOUT) {
+ conf->layout = default_layout;
+ } else {
+ pr_err("md/raid0:%s: cannot assemble multi-zone RAID0 with default_layout setting\n",
+ mdname(mddev));
+ pr_err("md/raid0: please set raid0.default_layout to 1 or 2\n");
+ err = -ENOTSUPP;
+ goto abort;
+ }
/*
* now since we have the hard sector sizes, we can make sure
* chunk size is a multiple of that sector size
@@ -552,10 +568,12 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
{
+ struct r0conf *conf = mddev->private;
struct strip_zone *zone;
struct md_rdev *tmp_dev;
sector_t bio_sector;
sector_t sector;
+ sector_t orig_sector;
unsigned chunk_sects;
unsigned sectors;
@@ -588,8 +606,21 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
bio = split;
}
+ orig_sector = sector;
zone = find_zone(mddev->private, &sector);
- tmp_dev = map_sector(mddev, zone, sector, &sector);
+ switch (conf->layout) {
+ case RAID0_ORIG_LAYOUT:
+ tmp_dev = map_sector(mddev, zone, orig_sector, &sector);
+ break;
+ case RAID0_ALT_MULTIZONE_LAYOUT:
+ tmp_dev = map_sector(mddev, zone, sector, &sector);
+ break;
+ default:
+ WARN(1, "md/raid0:%s: Invalid layout\n", mdname(mddev));
+ bio_io_error(bio);
+ return true;
+ }
+
bio_set_dev(bio, tmp_dev->bdev);
bio->bi_iter.bi_sector = sector + zone->dev_start +
tmp_dev->data_offset;
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h
index 540e65d92642..3816e5477db1 100644
--- a/drivers/md/raid0.h
+++ b/drivers/md/raid0.h
@@ -8,11 +8,25 @@ struct strip_zone {
int nb_dev; /* # of devices attached to the zone */
};
+/* Linux 3.14 (20d0189b101) made an unintended change to
+ * the RAID0 layout for multi-zone arrays (where devices aren't all
+ * the same size.
+ * RAID0_ORIG_LAYOUT restores the original layout
+ * RAID0_ALT_MULTIZONE_LAYOUT uses the altered layout
+ * The layouts are identical when there is only one zone (all
+ * devices the same size).
+ */
+
+enum r0layout {
+ RAID0_ORIG_LAYOUT = 1,
+ RAID0_ALT_MULTIZONE_LAYOUT = 2,
+};
struct r0conf {
struct strip_zone *strip_zone;
struct md_rdev **devlist; /* lists of rdevs, pointed to
* by strip_zone->dev */
int nr_strip_zones;
+ enum r0layout layout;
};
#endif
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 205f86f1a6cb..0a9d623b13c2 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -435,19 +435,21 @@ static void raid1_end_write_request(struct bio *bio)
/* We never try FailFast to WriteMostly devices */
!test_bit(WriteMostly, &rdev->flags)) {
md_error(r1_bio->mddev, rdev);
- if (!test_bit(Faulty, &rdev->flags))
- /* This is the only remaining device,
- * We need to retry the write without
- * FailFast
- */
- set_bit(R1BIO_WriteError, &r1_bio->state);
- else {
- /* Finished with this branch */
- r1_bio->bios[mirror] = NULL;
- to_put = bio;
- }
- } else
+ }
+
+ /*
+ * When the device is faulty, it is not necessary to
+ * handle write error.
+ * For failfast, this is the only remaining device,
+ * We need to retry the write without FailFast.
+ */
+ if (!test_bit(Faulty, &rdev->flags))
set_bit(R1BIO_WriteError, &r1_bio->state);
+ else {
+ /* Finished with this branch */
+ r1_bio->bios[mirror] = NULL;
+ to_put = bio;
+ }
} else {
/*
* Set R1BIO_Uptodate in our master bio, so that we
@@ -1854,6 +1856,20 @@ static void end_sync_read(struct bio *bio)
reschedule_retry(r1_bio);
}
+static void abort_sync_write(struct mddev *mddev, struct r1bio *r1_bio)
+{
+ sector_t sync_blocks = 0;
+ sector_t s = r1_bio->sector;
+ long sectors_to_go = r1_bio->sectors;
+
+ /* make sure these bits don't get cleared. */
+ do {
+ bitmap_end_sync(mddev->bitmap, s, &sync_blocks, 1);
+ s += sync_blocks;
+ sectors_to_go -= sync_blocks;
+ } while (sectors_to_go > 0);
+}
+
static void end_sync_write(struct bio *bio)
{
int uptodate = !bio->bi_status;
@@ -1865,16 +1881,7 @@ static void end_sync_write(struct bio *bio)
struct md_rdev *rdev = conf->mirrors[find_bio_disk(r1_bio, bio)].rdev;
if (!uptodate) {
- sector_t sync_blocks = 0;
- sector_t s = r1_bio->sector;
- long sectors_to_go = r1_bio->sectors;
- /* make sure these bits doesn't get cleared. */
- do {
- bitmap_end_sync(mddev->bitmap, s,
- &sync_blocks, 1);
- s += sync_blocks;
- sectors_to_go -= sync_blocks;
- } while (sectors_to_go > 0);
+ abort_sync_write(mddev, r1_bio);
set_bit(WriteErrorSeen, &rdev->flags);
if (!test_and_set_bit(WantReplacement, &rdev->flags))
set_bit(MD_RECOVERY_NEEDED, &
@@ -2164,8 +2171,10 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
(i == r1_bio->read_disk ||
!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))))
continue;
- if (test_bit(Faulty, &conf->mirrors[i].rdev->flags))
+ if (test_bit(Faulty, &conf->mirrors[i].rdev->flags)) {
+ abort_sync_write(mddev, r1_bio);
continue;
+ }
bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
if (test_bit(FailFast, &conf->mirrors[i].rdev->flags))
@@ -2740,7 +2749,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
write_targets++;
}
}
- if (bio->bi_end_io) {
+ if (rdev && bio->bi_end_io) {
atomic_inc(&rdev->nr_pending);
bio->bi_iter.bi_sector = sector_nr + rdev->data_offset;
bio_set_dev(bio, rdev->bdev);
@@ -3090,6 +3099,13 @@ static int raid1_run(struct mddev *mddev)
!test_bit(In_sync, &conf->mirrors[i].rdev->flags) ||
test_bit(Faulty, &conf->mirrors[i].rdev->flags))
mddev->degraded++;
+ /*
+ * RAID1 needs at least one disk in active
+ */
+ if (conf->raid_disks - mddev->degraded < 1) {
+ ret = -EINVAL;
+ goto abort;
+ }
if (conf->raid_disks - mddev->degraded == 1)
mddev->recovery_cp = MaxSector;
@@ -3123,8 +3139,12 @@ static int raid1_run(struct mddev *mddev)
ret = md_integrity_register(mddev);
if (ret) {
md_unregister_thread(&mddev->thread);
- raid1_free(mddev, conf);
+ goto abort;
}
+ return 0;
+
+abort:
+ raid1_free(mddev, conf);
return ret;
}
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 52ddfa0fca94..d08d77b9674f 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -226,7 +226,7 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
out_free_pages:
while (--j >= 0)
- resync_free_pages(&rps[j * 2]);
+ resync_free_pages(&rps[j]);
j = 0;
out_free_bio:
@@ -1190,7 +1190,9 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
struct bio *split = bio_split(bio, max_sectors,
gfp, conf->bio_split);
bio_chain(split, bio);
+ allow_barrier(conf);
generic_make_request(bio);
+ wait_barrier(conf);
bio = split;
r10_bio->master_bio = bio;
r10_bio->sectors = max_sectors;
@@ -1479,7 +1481,9 @@ retry_write:
struct bio *split = bio_split(bio, r10_bio->sectors,
GFP_NOIO, conf->bio_split);
bio_chain(split, bio);
+ allow_barrier(conf);
generic_make_request(bio);
+ wait_barrier(conf);
bio = split;
r10_bio->master_bio = bio;
}
@@ -3817,6 +3821,8 @@ static int raid10_run(struct mddev *mddev)
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
mddev->sync_thread = md_register_thread(md_do_sync, mddev,
"reshape");
+ if (!mddev->sync_thread)
+ goto out_free_conf;
}
return 0;
@@ -4491,7 +4497,6 @@ read_more:
atomic_inc(&r10_bio->remaining);
read_bio->bi_next = NULL;
generic_make_request(read_bio);
- sector_nr += nr_sectors;
sectors_done += nr_sectors;
if (sector_nr <= last)
goto read_more;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7dbb74cd506a..d5c14d56a714 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2538,7 +2538,8 @@ static void raid5_end_read_request(struct bio * bi)
int set_bad = 0;
clear_bit(R5_UPTODATE, &sh->dev[i].flags);
- atomic_inc(&rdev->read_errors);
+ if (!(bi->bi_status == BLK_STS_PROTECTION))
+ atomic_inc(&rdev->read_errors);
if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
pr_warn_ratelimited(
"md/raid:%s: read error on replacement device (sector %llu on %s).\n",
@@ -2570,7 +2571,9 @@ static void raid5_end_read_request(struct bio * bi)
&& !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
retry = 1;
if (retry)
- if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
+ if (sh->qd_idx >= 0 && sh->pd_idx == i)
+ set_bit(R5_ReadError, &sh->dev[i].flags);
+ else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
set_bit(R5_ReadError, &sh->dev[i].flags);
clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
} else
@@ -4182,7 +4185,7 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
/* now write out any block on a failed drive,
* or P or Q if they were recomputed
*/
- BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */
+ dev = NULL;
if (s->failed == 2) {
dev = &sh->dev[s->failed_num[1]];
s->locked++;
@@ -4207,6 +4210,14 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantwrite, &dev->flags);
}
+ if (WARN_ONCE(dev && !test_bit(R5_UPTODATE, &dev->flags),
+ "%s: disk%td not up to date\n",
+ mdname(conf->mddev),
+ dev - (struct r5dev *) &sh->dev)) {
+ clear_bit(R5_LOCKED, &dev->flags);
+ clear_bit(R5_Wantwrite, &dev->flags);
+ s->locked--;
+ }
clear_bit(STRIPE_DEGRADED, &sh->state);
set_bit(STRIPE_INSYNC, &sh->state);
@@ -5710,7 +5721,8 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
do_flush = false;
}
- set_bit(STRIPE_HANDLE, &sh->state);
+ if (!sh->batch_head || sh == sh->batch_head)
+ set_bit(STRIPE_HANDLE, &sh->state);
clear_bit(STRIPE_DELAYED, &sh->state);
if ((!sh->batch_head || sh == sh->batch_head) &&
(bi->bi_opf & REQ_SYNC) &&
@@ -7370,6 +7382,8 @@ static int raid5_run(struct mddev *mddev)
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
mddev->sync_thread = md_register_thread(md_do_sync, mddev,
"reshape");
+ if (!mddev->sync_thread)
+ goto abort;
}
/* Ok, everything is just fine now */