summaryrefslogtreecommitdiff
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/bcache/bset.c16
-rw-r--r--drivers/md/bcache/bset.h34
-rw-r--r--drivers/md/bcache/super.c3
-rw-r--r--drivers/md/dm-bio-prison-v1.c2
-rw-r--r--drivers/md/dm-bio-prison-v2.c2
-rw-r--r--drivers/md/dm-flakey.c33
-rw-r--r--drivers/md/dm-io.c2
-rw-r--r--drivers/md/dm-kcopyd.c7
-rw-r--r--drivers/md/dm-log-writes.c23
-rw-r--r--drivers/md/dm-raid.c2
-rw-r--r--drivers/md/dm-region-hash.c2
-rw-r--r--drivers/md/dm-snap.c178
-rw-r--r--drivers/md/dm-table.c5
-rw-r--r--drivers/md/dm-thin.c2
-rw-r--r--drivers/md/dm-verity-target.c4
-rw-r--r--drivers/md/dm-zoned-metadata.c112
-rw-r--r--drivers/md/dm-zoned-reclaim.c40
-rw-r--r--drivers/md/dm-zoned-target.c87
-rw-r--r--drivers/md/dm-zoned.h40
-rw-r--r--drivers/md/md.c86
-rw-r--r--drivers/md/md.h3
-rw-r--r--drivers/md/persistent-data/dm-btree.c31
-rw-r--r--drivers/md/persistent-data/dm-space-map-metadata.c2
-rw-r--r--drivers/md/raid0.c33
-rw-r--r--drivers/md/raid0.h14
-rw-r--r--drivers/md/raid1.c39
-rw-r--r--drivers/md/raid10.c2
-rw-r--r--drivers/md/raid5.c10
28 files changed, 577 insertions, 237 deletions
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index e56d3ecdbfcb..1edf9515345e 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -825,12 +825,22 @@ unsigned bch_btree_insert_key(struct btree_keys *b, struct bkey *k,
struct bset *i = bset_tree_last(b)->data;
struct bkey *m, *prev = NULL;
struct btree_iter iter;
+ struct bkey preceding_key_on_stack = ZERO_KEY;
+ struct bkey *preceding_key_p = &preceding_key_on_stack;
BUG_ON(b->ops->is_extents && !KEY_SIZE(k));
- m = bch_btree_iter_init(b, &iter, b->ops->is_extents
- ? PRECEDING_KEY(&START_KEY(k))
- : PRECEDING_KEY(k));
+ /*
+ * If k has preceding key, preceding_key_p will be set to address
+ * of k's preceding key; otherwise preceding_key_p will be set
+ * to NULL inside preceding_key().
+ */
+ if (b->ops->is_extents)
+ preceding_key(&START_KEY(k), &preceding_key_p);
+ else
+ preceding_key(k, &preceding_key_p);
+
+ m = bch_btree_iter_init(b, &iter, preceding_key_p);
if (b->ops->insert_fixup(b, k, &iter, replace_key))
return status;
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index fa506c1aa524..8d1964b472e7 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -418,20 +418,26 @@ static inline bool bch_cut_back(const struct bkey *where, struct bkey *k)
return __bch_cut_back(where, k);
}
-#define PRECEDING_KEY(_k) \
-({ \
- struct bkey *_ret = NULL; \
- \
- if (KEY_INODE(_k) || KEY_OFFSET(_k)) { \
- _ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0); \
- \
- if (!_ret->low) \
- _ret->high--; \
- _ret->low--; \
- } \
- \
- _ret; \
-})
+/*
+ * Pointer '*preceding_key_p' points to a memory object to store preceding
+ * key of k. If the preceding key does not exist, set '*preceding_key_p' to
+ * NULL. So the caller of preceding_key() needs to take care of memory
+ * which '*preceding_key_p' pointed to before calling preceding_key().
+ * Currently the only caller of preceding_key() is bch_btree_insert_key(),
+ * and it points to an on-stack variable, so the memory release is handled
+ * by stackframe itself.
+ */
+static inline void preceding_key(struct bkey *k, struct bkey **preceding_key_p)
+{
+ if (KEY_INODE(k) || KEY_OFFSET(k)) {
+ (**preceding_key_p) = KEY(KEY_INODE(k), KEY_OFFSET(k), 0);
+ if (!(*preceding_key_p)->low)
+ (*preceding_key_p)->high--;
+ (*preceding_key_p)->low--;
+ } else {
+ (*preceding_key_p) = NULL;
+ }
+}
static inline bool bch_ptr_invalid(struct btree_keys *b, const struct bkey *k)
{
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 85a5afe01d39..690aeb09bbf5 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -905,6 +905,7 @@ static void cached_dev_detach_finish(struct work_struct *w)
bch_write_bdev_super(dc, &cl);
closure_sync(&cl);
+ calc_cached_dev_sectors(dc->disk.c);
bcache_device_detach(&dc->disk);
list_move(&dc->list, &uncached_devices);
@@ -1402,7 +1403,7 @@ static void cache_set_flush(struct closure *cl)
kobject_put(&c->internal);
kobject_del(&c->kobj);
- if (c->gc_thread)
+ if (!IS_ERR_OR_NULL(c->gc_thread))
kthread_stop(c->gc_thread);
if (!IS_ERR_OR_NULL(c->root))
diff --git a/drivers/md/dm-bio-prison-v1.c b/drivers/md/dm-bio-prison-v1.c
index 874841f0fc83..10532a76688e 100644
--- a/drivers/md/dm-bio-prison-v1.c
+++ b/drivers/md/dm-bio-prison-v1.c
@@ -33,7 +33,7 @@ static struct kmem_cache *_cell_cache;
*/
struct dm_bio_prison *dm_bio_prison_create(void)
{
- struct dm_bio_prison *prison = kmalloc(sizeof(*prison), GFP_KERNEL);
+ struct dm_bio_prison *prison = kzalloc(sizeof(*prison), GFP_KERNEL);
if (!prison)
return NULL;
diff --git a/drivers/md/dm-bio-prison-v2.c b/drivers/md/dm-bio-prison-v2.c
index 8ce3a1a588cf..c34ec615420f 100644
--- a/drivers/md/dm-bio-prison-v2.c
+++ b/drivers/md/dm-bio-prison-v2.c
@@ -35,7 +35,7 @@ static struct kmem_cache *_cell_cache;
*/
struct dm_bio_prison_v2 *dm_bio_prison_create_v2(struct workqueue_struct *wq)
{
- struct dm_bio_prison_v2 *prison = kmalloc(sizeof(*prison), GFP_KERNEL);
+ struct dm_bio_prison_v2 *prison = kzalloc(sizeof(*prison), GFP_KERNEL);
if (!prison)
return NULL;
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index 0c1ef63c3461..b1b68e01b889 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -282,20 +282,31 @@ static void flakey_map_bio(struct dm_target *ti, struct bio *bio)
static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc)
{
- unsigned bio_bytes = bio_cur_bytes(bio);
- char *data = bio_data(bio);
+ unsigned int corrupt_bio_byte = fc->corrupt_bio_byte - 1;
+
+ struct bvec_iter iter;
+ struct bio_vec bvec;
+
+ if (!bio_has_data(bio))
+ return;
/*
- * Overwrite the Nth byte of the data returned.
+ * Overwrite the Nth byte of the bio's data, on whichever page
+ * it falls.
*/
- if (data && bio_bytes >= fc->corrupt_bio_byte) {
- data[fc->corrupt_bio_byte - 1] = fc->corrupt_bio_value;
-
- DMDEBUG("Corrupting data bio=%p by writing %u to byte %u "
- "(rw=%c bi_opf=%u bi_sector=%llu cur_bytes=%u)\n",
- bio, fc->corrupt_bio_value, fc->corrupt_bio_byte,
- (bio_data_dir(bio) == WRITE) ? 'w' : 'r', bio->bi_opf,
- (unsigned long long)bio->bi_iter.bi_sector, bio_bytes);
+ bio_for_each_segment(bvec, bio, iter) {
+ if (bio_iter_len(bio, iter) > corrupt_bio_byte) {
+ char *segment = (page_address(bio_iter_page(bio, iter))
+ + bio_iter_offset(bio, iter));
+ segment[corrupt_bio_byte] = fc->corrupt_bio_value;
+ DMDEBUG("Corrupting data bio=%p by writing %u to byte %u "
+ "(rw=%c bi_opf=%u bi_sector=%llu size=%u)\n",
+ bio, fc->corrupt_bio_value, fc->corrupt_bio_byte,
+ (bio_data_dir(bio) == WRITE) ? 'w' : 'r', bio->bi_opf,
+ (unsigned long long)bio->bi_iter.bi_sector, bio->bi_iter.bi_size);
+ break;
+ }
+ corrupt_bio_byte -= bio_iter_len(bio, iter);
}
}
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index b4357ed4d541..56e2c0e079d7 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -50,7 +50,7 @@ struct dm_io_client *dm_io_client_create(void)
struct dm_io_client *client;
unsigned min_ios = dm_get_reserved_bio_based_ios();
- client = kmalloc(sizeof(*client), GFP_KERNEL);
+ client = kzalloc(sizeof(*client), GFP_KERNEL);
if (!client)
return ERR_PTR(-ENOMEM);
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index b9d1897bcf5b..7ca2b1aaa79d 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -545,8 +545,10 @@ static int run_io_job(struct kcopyd_job *job)
* no point in continuing.
*/
if (test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags) &&
- job->master_job->write_err)
+ job->master_job->write_err) {
+ job->write_err = job->master_job->write_err;
return -EIO;
+ }
io_job_start(job->kc->throttle);
@@ -598,6 +600,7 @@ static int process_jobs(struct list_head *jobs, struct dm_kcopyd_client *kc,
else
job->read_err = 1;
push(&kc->complete_jobs, job);
+ wake(kc);
break;
}
@@ -889,7 +892,7 @@ struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *thro
int r = -ENOMEM;
struct dm_kcopyd_client *kc;
- kc = kmalloc(sizeof(*kc), GFP_KERNEL);
+ kc = kzalloc(sizeof(*kc), GFP_KERNEL);
if (!kc)
return ERR_PTR(-ENOMEM);
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index 8b80a9ce9ea9..dafedbc28bcc 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -57,6 +57,7 @@
#define WRITE_LOG_VERSION 1ULL
#define WRITE_LOG_MAGIC 0x6a736677736872ULL
+#define WRITE_LOG_SUPER_SECTOR 0
/*
* The disk format for this is braindead simple.
@@ -112,6 +113,7 @@ struct log_writes_c {
struct list_head logging_blocks;
wait_queue_head_t wait;
struct task_struct *log_kthread;
+ struct completion super_done;
};
struct pending_block {
@@ -177,6 +179,14 @@ static void log_end_io(struct bio *bio)
bio_put(bio);
}
+static void log_end_super(struct bio *bio)
+{
+ struct log_writes_c *lc = bio->bi_private;
+
+ complete(&lc->super_done);
+ log_end_io(bio);
+}
+
/*
* Meant to be called if there is an error, it will free all the pages
* associated with the block.
@@ -212,7 +222,8 @@ static int write_metadata(struct log_writes_c *lc, void *entry,
bio->bi_iter.bi_size = 0;
bio->bi_iter.bi_sector = sector;
bio_set_dev(bio, lc->logdev->bdev);
- bio->bi_end_io = log_end_io;
+ bio->bi_end_io = (sector == WRITE_LOG_SUPER_SECTOR) ?
+ log_end_super : log_end_io;
bio->bi_private = lc;
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
@@ -334,11 +345,18 @@ static int log_super(struct log_writes_c *lc)
super.nr_entries = cpu_to_le64(lc->logged_entries);
super.sectorsize = cpu_to_le32(lc->sectorsize);
- if (write_metadata(lc, &super, sizeof(super), NULL, 0, 0)) {
+ if (write_metadata(lc, &super, sizeof(super), NULL, 0,
+ WRITE_LOG_SUPER_SECTOR)) {
DMERR("Couldn't write super");
return -1;
}
+ /*
+ * Super sector should be writen in-order, otherwise the
+ * nr_entries could be rewritten incorrectly by an old bio.
+ */
+ wait_for_completion_io(&lc->super_done);
+
return 0;
}
@@ -447,6 +465,7 @@ static int log_writes_ctr(struct dm_target *ti, unsigned int argc, char **argv)
INIT_LIST_HEAD(&lc->unflushed_blocks);
INIT_LIST_HEAD(&lc->logging_blocks);
init_waitqueue_head(&lc->wait);
+ init_completion(&lc->super_done);
atomic_set(&lc->io_blocks, 0);
atomic_set(&lc->pending_blocks, 0);
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 151211b4cb1b..2c5912e75514 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -2441,7 +2441,7 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
}
/* Enable bitmap creation for RAID levels != 0 */
- mddev->bitmap_info.offset = rt_is_raid0(rs->raid_type) ? 0 : to_sector(4096);
+ mddev->bitmap_info.offset = (rt_is_raid0(rs->raid_type) || rs->journal_dev.dev) ? 0 : to_sector(4096);
mddev->bitmap_info.default_offset = mddev->bitmap_info.offset;
if (!test_and_clear_bit(FirstUse, &rdev->flags)) {
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index 85c32b22a420..91c6f6d72eee 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -179,7 +179,7 @@ struct dm_region_hash *dm_region_hash_create(
;
nr_buckets >>= 1;
- rh = kmalloc(sizeof(*rh), GFP_KERNEL);
+ rh = kzalloc(sizeof(*rh), GFP_KERNEL);
if (!rh) {
DMERR("unable to allocate region hash memory");
return ERR_PTR(-ENOMEM);
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index b502debc6df3..2170f6c118b8 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -19,7 +19,6 @@
#include <linux/vmalloc.h>
#include <linux/log2.h>
#include <linux/dm-kcopyd.h>
-#include <linux/semaphore.h>
#include "dm.h"
@@ -48,7 +47,7 @@ struct dm_exception_table {
};
struct dm_snapshot {
- struct rw_semaphore lock;
+ struct mutex lock;
struct dm_dev *origin;
struct dm_dev *cow;
@@ -106,8 +105,8 @@ struct dm_snapshot {
/* The on disk metadata handler */
struct dm_exception_store *store;
- /* Maximum number of in-flight COW jobs. */
- struct semaphore cow_count;
+ unsigned in_progress;
+ struct wait_queue_head in_progress_wait;
struct dm_kcopyd_client *kcopyd_client;
@@ -158,8 +157,8 @@ struct dm_snapshot {
*/
#define DEFAULT_COW_THRESHOLD 2048
-static int cow_threshold = DEFAULT_COW_THRESHOLD;
-module_param_named(snapshot_cow_threshold, cow_threshold, int, 0644);
+static unsigned cow_threshold = DEFAULT_COW_THRESHOLD;
+module_param_named(snapshot_cow_threshold, cow_threshold, uint, 0644);
MODULE_PARM_DESC(snapshot_cow_threshold, "Maximum number of chunks being copied on write");
DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
@@ -456,9 +455,9 @@ static int __find_snapshots_sharing_cow(struct dm_snapshot *snap,
if (!bdev_equal(s->cow->bdev, snap->cow->bdev))
continue;
- down_read(&s->lock);
+ mutex_lock(&s->lock);
active = s->active;
- up_read(&s->lock);
+ mutex_unlock(&s->lock);
if (active) {
if (snap_src)
@@ -926,7 +925,7 @@ static int remove_single_exception_chunk(struct dm_snapshot *s)
int r;
chunk_t old_chunk = s->first_merging_chunk + s->num_merging_chunks - 1;
- down_write(&s->lock);
+ mutex_lock(&s->lock);
/*
* Process chunks (and associated exceptions) in reverse order
@@ -941,7 +940,7 @@ static int remove_single_exception_chunk(struct dm_snapshot *s)
b = __release_queued_bios_after_merge(s);
out:
- up_write(&s->lock);
+ mutex_unlock(&s->lock);
if (b)
flush_bios(b);
@@ -1000,9 +999,9 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s)
if (linear_chunks < 0) {
DMERR("Read error in exception store: "
"shutting down merge");
- down_write(&s->lock);
+ mutex_lock(&s->lock);
s->merge_failed = 1;
- up_write(&s->lock);
+ mutex_unlock(&s->lock);
}
goto shut;
}
@@ -1043,10 +1042,10 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s)
previous_count = read_pending_exceptions_done_count();
}
- down_write(&s->lock);
+ mutex_lock(&s->lock);
s->first_merging_chunk = old_chunk;
s->num_merging_chunks = linear_chunks;
- up_write(&s->lock);
+ mutex_unlock(&s->lock);
/* Wait until writes to all 'linear_chunks' drain */
for (i = 0; i < linear_chunks; i++)
@@ -1088,10 +1087,10 @@ static void merge_callback(int read_err, unsigned long write_err, void *context)
return;
shut:
- down_write(&s->lock);
+ mutex_lock(&s->lock);
s->merge_failed = 1;
b = __release_queued_bios_after_merge(s);
- up_write(&s->lock);
+ mutex_unlock(&s->lock);
error_bios(b);
merge_shutdown(s);
@@ -1137,7 +1136,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
origin_mode = FMODE_WRITE;
}
- s = kmalloc(sizeof(*s), GFP_KERNEL);
+ s = kzalloc(sizeof(*s), GFP_KERNEL);
if (!s) {
ti->error = "Cannot allocate private snapshot structure";
r = -ENOMEM;
@@ -1190,7 +1189,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
s->exception_start_sequence = 0;
s->exception_complete_sequence = 0;
INIT_LIST_HEAD(&s->out_of_order_list);
- init_rwsem(&s->lock);
+ mutex_init(&s->lock);
INIT_LIST_HEAD(&s->list);
spin_lock_init(&s->pe_lock);
s->state_bits = 0;
@@ -1206,7 +1205,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad_hash_tables;
}
- sema_init(&s->cow_count, (cow_threshold > 0) ? cow_threshold : INT_MAX);
+ init_waitqueue_head(&s->in_progress_wait);
s->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle);
if (IS_ERR(s->kcopyd_client)) {
@@ -1357,9 +1356,9 @@ static void snapshot_dtr(struct dm_target *ti)
/* Check whether exception handover must be cancelled */
(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
if (snap_src && snap_dest && (s == snap_src)) {
- down_write(&snap_dest->lock);
+ mutex_lock(&snap_dest->lock);
snap_dest->valid = 0;
- up_write(&snap_dest->lock);
+ mutex_unlock(&snap_dest->lock);
DMERR("Cancelling snapshot handover.");
}
up_read(&_origins_lock);
@@ -1390,13 +1389,62 @@ static void snapshot_dtr(struct dm_target *ti)
dm_exception_store_destroy(s->store);
+ mutex_destroy(&s->lock);
+
dm_put_device(ti, s->cow);
dm_put_device(ti, s->origin);
+ WARN_ON(s->in_progress);
+
kfree(s);
}
+static void account_start_copy(struct dm_snapshot *s)
+{
+ spin_lock(&s->in_progress_wait.lock);
+ s->in_progress++;
+ spin_unlock(&s->in_progress_wait.lock);
+}
+
+static void account_end_copy(struct dm_snapshot *s)
+{
+ spin_lock(&s->in_progress_wait.lock);
+ BUG_ON(!s->in_progress);
+ s->in_progress--;
+ if (likely(s->in_progress <= cow_threshold) &&
+ unlikely(waitqueue_active(&s->in_progress_wait)))
+ wake_up_locked(&s->in_progress_wait);
+ spin_unlock(&s->in_progress_wait.lock);
+}
+
+static bool wait_for_in_progress(struct dm_snapshot *s, bool unlock_origins)
+{
+ if (unlikely(s->in_progress > cow_threshold)) {
+ spin_lock(&s->in_progress_wait.lock);
+ if (likely(s->in_progress > cow_threshold)) {
+ /*
+ * NOTE: this throttle doesn't account for whether
+ * the caller is servicing an IO that will trigger a COW
+ * so excess throttling may result for chunks not required
+ * to be COW'd. But if cow_threshold was reached, extra
+ * throttling is unlikely to negatively impact performance.
+ */
+ DECLARE_WAITQUEUE(wait, current);
+ __add_wait_queue(&s->in_progress_wait, &wait);
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ spin_unlock(&s->in_progress_wait.lock);
+ if (unlock_origins)
+ up_read(&_origins_lock);
+ io_schedule();
+ remove_wait_queue(&s->in_progress_wait, &wait);
+ return false;
+ }
+ spin_unlock(&s->in_progress_wait.lock);
+ }
+ return true;
+}
+
/*
* Flush a list of buffers.
*/
@@ -1412,7 +1460,7 @@ static void flush_bios(struct bio *bio)
}
}
-static int do_origin(struct dm_dev *origin, struct bio *bio);
+static int do_origin(struct dm_dev *origin, struct bio *bio, bool limit);
/*
* Flush a list of buffers.
@@ -1425,7 +1473,7 @@ static void retry_origin_bios(struct dm_snapshot *s, struct bio *bio)
while (bio) {
n = bio->bi_next;
bio->bi_next = NULL;
- r = do_origin(s->origin, bio);
+ r = do_origin(s->origin, bio, false);
if (r == DM_MAPIO_REMAPPED)
generic_make_request(bio);
bio = n;
@@ -1477,7 +1525,7 @@ static void pending_complete(void *context, int success)
if (!success) {
/* Read/write error - snapshot is unusable */
- down_write(&s->lock);
+ mutex_lock(&s->lock);
__invalidate_snapshot(s, -EIO);
error = 1;
goto out;
@@ -1485,14 +1533,14 @@ static void pending_complete(void *context, int success)
e = alloc_completed_exception(GFP_NOIO);
if (!e) {
- down_write(&s->lock);
+ mutex_lock(&s->lock);
__invalidate_snapshot(s, -ENOMEM);
error = 1;
goto out;
}
*e = pe->e;
- down_write(&s->lock);
+ mutex_lock(&s->lock);
if (!s->valid) {
free_completed_exception(e);
error = 1;
@@ -1517,7 +1565,7 @@ out:
full_bio->bi_end_io = pe->full_bio_end_io;
increment_pending_exceptions_done_count();
- up_write(&s->lock);
+ mutex_unlock(&s->lock);
/* Submit any pending write bios */
if (error) {
@@ -1579,7 +1627,7 @@ static void copy_callback(int read_err, unsigned long write_err, void *context)
}
list_add(&pe->out_of_order_entry, lh);
}
- up(&s->cow_count);
+ account_end_copy(s);
}
/*
@@ -1603,7 +1651,7 @@ static void start_copy(struct dm_snap_pending_exception *pe)
dest.count = src.count;
/* Hand over to kcopyd */
- down(&s->cow_count);
+ account_start_copy(s);
dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe);
}
@@ -1623,7 +1671,7 @@ static void start_full_bio(struct dm_snap_pending_exception *pe,
pe->full_bio = bio;
pe->full_bio_end_io = bio->bi_end_io;
- down(&s->cow_count);
+ account_start_copy(s);
callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client,
copy_callback, pe);
@@ -1714,9 +1762,12 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
if (!s->valid)
return DM_MAPIO_KILL;
- /* FIXME: should only take write lock if we need
- * to copy an exception */
- down_write(&s->lock);
+ if (bio_data_dir(bio) == WRITE) {
+ while (unlikely(!wait_for_in_progress(s, false)))
+ ; /* wait_for_in_progress() has slept */
+ }
+
+ mutex_lock(&s->lock);
if (!s->valid || (unlikely(s->snapshot_overflowed) &&
bio_data_dir(bio) == WRITE)) {
@@ -1739,9 +1790,9 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
if (bio_data_dir(bio) == WRITE) {
pe = __lookup_pending_exception(s, chunk);
if (!pe) {
- up_write(&s->lock);
+ mutex_unlock(&s->lock);
pe = alloc_pending_exception(s);
- down_write(&s->lock);
+ mutex_lock(&s->lock);
if (!s->valid || s->snapshot_overflowed) {
free_pending_exception(pe);
@@ -1776,7 +1827,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
bio->bi_iter.bi_size ==
(s->store->chunk_size << SECTOR_SHIFT)) {
pe->started = 1;
- up_write(&s->lock);
+ mutex_unlock(&s->lock);
start_full_bio(pe, bio);
goto out;
}
@@ -1786,7 +1837,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
if (!pe->started) {
/* this is protected by snap->lock */
pe->started = 1;
- up_write(&s->lock);
+ mutex_unlock(&s->lock);
start_copy(pe);
goto out;
}
@@ -1796,7 +1847,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
}
out_unlock:
- up_write(&s->lock);
+ mutex_unlock(&s->lock);
out:
return r;
}
@@ -1832,7 +1883,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio)
chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
- down_write(&s->lock);
+ mutex_lock(&s->lock);
/* Full merging snapshots are redirected to the origin */
if (!s->valid)
@@ -1863,12 +1914,12 @@ redirect_to_origin:
bio_set_dev(bio, s->origin->bdev);
if (bio_data_dir(bio) == WRITE) {
- up_write(&s->lock);
- return do_origin(s->origin, bio);
+ mutex_unlock(&s->lock);
+ return do_origin(s->origin, bio, false);
}
out_unlock:
- up_write(&s->lock);
+ mutex_unlock(&s->lock);
return r;
}
@@ -1900,7 +1951,7 @@ static int snapshot_preresume(struct dm_target *ti)
down_read(&_origins_lock);
(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
if (snap_src && snap_dest) {
- down_read(&snap_src->lock);
+ mutex_lock(&snap_src->lock);
if (s == snap_src) {
DMERR("Unable to resume snapshot source until "
"handover completes.");
@@ -1910,7 +1961,7 @@ static int snapshot_preresume(struct dm_target *ti)
"source is suspended.");
r = -EINVAL;
}
- up_read(&snap_src->lock);
+ mutex_unlock(&snap_src->lock);
}
up_read(&_origins_lock);
@@ -1956,11 +2007,11 @@ static void snapshot_resume(struct dm_target *ti)
(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
if (snap_src && snap_dest) {
- down_write(&snap_src->lock);
- down_write_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING);
+ mutex_lock(&snap_src->lock);
+ mutex_lock_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING);
__handover_exceptions(snap_src, snap_dest);
- up_write(&snap_dest->lock);
- up_write(&snap_src->lock);
+ mutex_unlock(&snap_dest->lock);
+ mutex_unlock(&snap_src->lock);
}
up_read(&_origins_lock);
@@ -1975,9 +2026,9 @@ static void snapshot_resume(struct dm_target *ti)
/* Now we have correct chunk size, reregister */
reregister_snapshot(s);
- down_write(&s->lock);
+ mutex_lock(&s->lock);
s->active = 1;
- up_write(&s->lock);
+ mutex_unlock(&s->lock);
}
static uint32_t get_origin_minimum_chunksize(struct block_device *bdev)
@@ -2017,7 +2068,7 @@ static void snapshot_status(struct dm_target *ti, status_type_t type,
switch (type) {
case STATUSTYPE_INFO:
- down_write(&snap->lock);
+ mutex_lock(&snap->lock);
if (!snap->valid)
DMEMIT("Invalid");
@@ -2042,7 +2093,7 @@ static void snapshot_status(struct dm_target *ti, status_type_t type,
DMEMIT("Unknown");
}
- up_write(&snap->lock);
+ mutex_unlock(&snap->lock);
break;
@@ -2108,7 +2159,7 @@ static int __origin_write(struct list_head *snapshots, sector_t sector,
if (dm_target_is_snapshot_merge(snap->ti))
continue;
- down_write(&snap->lock);
+ mutex_lock(&snap->lock);
/* Only deal with valid and active snapshots */
if (!snap->valid || !snap->active)
@@ -2135,9 +2186,9 @@ static int __origin_write(struct list_head *snapshots, sector_t sector,
pe = __lookup_pending_exception(snap, chunk);
if (!pe) {
- up_write(&snap->lock);
+ mutex_unlock(&snap->lock);
pe = alloc_pending_exception(snap);
- down_write(&snap->lock);
+ mutex_lock(&snap->lock);
if (!snap->valid) {
free_pending_exception(pe);
@@ -2180,7 +2231,7 @@ static int __origin_write(struct list_head *snapshots, sector_t sector,
}
next_snapshot:
- up_write(&snap->lock);
+ mutex_unlock(&snap->lock);
if (pe_to_start_now) {
start_copy(pe_to_start_now);
@@ -2201,15 +2252,24 @@ next_snapshot:
/*
* Called on a write from the origin driver.
*/
-static int do_origin(struct dm_dev *origin, struct bio *bio)
+static int do_origin(struct dm_dev *origin, struct bio *bio, bool limit)
{
struct origin *o;
int r = DM_MAPIO_REMAPPED;
+again:
down_read(&_origins_lock);
o = __lookup_origin(origin->bdev);
- if (o)
+ if (o) {
+ if (limit) {
+ struct dm_snapshot *s;
+ list_for_each_entry(s, &o->snapshots, list)
+ if (unlikely(!wait_for_in_progress(s, true)))
+ goto again;
+ }
+
r = __origin_write(&o->snapshots, bio->bi_iter.bi_sector, bio);
+ }
up_read(&_origins_lock);
return r;
@@ -2322,7 +2382,7 @@ static int origin_map(struct dm_target *ti, struct bio *bio)
dm_accept_partial_bio(bio, available_sectors);
/* Only tell snapshots if this is a write */
- return do_origin(o->dev, bio);
+ return do_origin(o->dev, bio, true);
}
static long origin_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index d76e685206b3..8f070debe498 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1308,7 +1308,7 @@ void dm_table_event(struct dm_table *t)
}
EXPORT_SYMBOL(dm_table_event);
-sector_t dm_table_get_size(struct dm_table *t)
+inline sector_t dm_table_get_size(struct dm_table *t)
{
return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0;
}
@@ -1333,6 +1333,9 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
unsigned int l, n = 0, k = 0;
sector_t *node;
+ if (unlikely(sector >= dm_table_get_size(t)))
+ return &t->targets[t->num_targets];
+
for (l = 0; l < t->depth; l++) {
n = get_child(n, k);
node = get_node(t, l, n);
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index aa7795990989..0ee5eae71690 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -2962,7 +2962,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
return (struct pool *)pmd;
}
- pool = kmalloc(sizeof(*pool), GFP_KERNEL);
+ pool = kzalloc(sizeof(*pool), GFP_KERNEL);
if (!pool) {
*error = "Error allocating memory for pool";
err_p = ERR_PTR(-ENOMEM);
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index 8573c70a1880..e705799976c2 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -276,8 +276,8 @@ static int verity_handle_err(struct dm_verity *v, enum verity_block_type type,
BUG();
}
- DMERR("%s: %s block %llu is corrupted", v->data_dev->name, type_str,
- block);
+ DMERR_LIMIT("%s: %s block %llu is corrupted", v->data_dev->name,
+ type_str, block);
if (v->corrupted_errs == DM_VERITY_MAX_CORRUPTED_ERRS)
DMERR("%s: reached maximum errors", v->data_dev->name);
diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
index 167686189fd2..9b78f4a74a12 100644
--- a/drivers/md/dm-zoned-metadata.c
+++ b/drivers/md/dm-zoned-metadata.c
@@ -401,15 +401,18 @@ static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd,
sector_t block = zmd->sb[zmd->mblk_primary].block + mblk_no;
struct bio *bio;
+ if (dmz_bdev_is_dying(zmd->dev))
+ return ERR_PTR(-EIO);
+
/* Get a new block and a BIO to read it */
mblk = dmz_alloc_mblock(zmd, mblk_no);
if (!mblk)
- return NULL;
+ return ERR_PTR(-ENOMEM);
bio = bio_alloc(GFP_NOIO, 1);
if (!bio) {
dmz_free_mblock(zmd, mblk);
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
spin_lock(&zmd->mblk_lock);
@@ -540,8 +543,8 @@ static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd,
if (!mblk) {
/* Cache miss: read the block from disk */
mblk = dmz_get_mblock_slow(zmd, mblk_no);
- if (!mblk)
- return ERR_PTR(-ENOMEM);
+ if (IS_ERR(mblk))
+ return mblk;
}
/* Wait for on-going read I/O and check for error */
@@ -549,6 +552,7 @@ static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd,
TASK_UNINTERRUPTIBLE);
if (test_bit(DMZ_META_ERROR, &mblk->state)) {
dmz_release_mblock(zmd, mblk);
+ dmz_check_bdev(zmd->dev);
return ERR_PTR(-EIO);
}
@@ -569,16 +573,19 @@ static void dmz_dirty_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk)
/*
* Issue a metadata block write BIO.
*/
-static void dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk,
- unsigned int set)
+static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk,
+ unsigned int set)
{
sector_t block = zmd->sb[set].block + mblk->no;
struct bio *bio;
+ if (dmz_bdev_is_dying(zmd->dev))
+ return -EIO;
+
bio = bio_alloc(GFP_NOIO, 1);
if (!bio) {
set_bit(DMZ_META_ERROR, &mblk->state);
- return;
+ return -ENOMEM;
}
set_bit(DMZ_META_WRITING, &mblk->state);
@@ -590,6 +597,8 @@ static void dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk,
bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META | REQ_PRIO);
bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0);
submit_bio(bio);
+
+ return 0;
}
/*
@@ -601,6 +610,9 @@ static int dmz_rdwr_block(struct dmz_metadata *zmd, int op, sector_t block,
struct bio *bio;
int ret;
+ if (dmz_bdev_is_dying(zmd->dev))
+ return -EIO;
+
bio = bio_alloc(GFP_NOIO, 1);
if (!bio)
return -ENOMEM;
@@ -612,6 +624,8 @@ static int dmz_rdwr_block(struct dmz_metadata *zmd, int op, sector_t block,
ret = submit_bio_wait(bio);
bio_put(bio);
+ if (ret)
+ dmz_check_bdev(zmd->dev);
return ret;
}
@@ -658,22 +672,30 @@ static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd,
{
struct dmz_mblock *mblk;
struct blk_plug plug;
- int ret = 0;
+ int ret = 0, nr_mblks_submitted = 0;
/* Issue writes */
blk_start_plug(&plug);
- list_for_each_entry(mblk, write_list, link)
- dmz_write_mblock(zmd, mblk, set);
+ list_for_each_entry(mblk, write_list, link) {
+ ret = dmz_write_mblock(zmd, mblk, set);
+ if (ret)
+ break;
+ nr_mblks_submitted++;
+ }
blk_finish_plug(&plug);
/* Wait for completion */
list_for_each_entry(mblk, write_list, link) {
+ if (!nr_mblks_submitted)
+ break;
wait_on_bit_io(&mblk->state, DMZ_META_WRITING,
TASK_UNINTERRUPTIBLE);
if (test_bit(DMZ_META_ERROR, &mblk->state)) {
clear_bit(DMZ_META_ERROR, &mblk->state);
+ dmz_check_bdev(zmd->dev);
ret = -EIO;
}
+ nr_mblks_submitted--;
}
/* Flush drive cache (this will also sync data) */
@@ -735,6 +757,11 @@ int dmz_flush_metadata(struct dmz_metadata *zmd)
*/
dmz_lock_flush(zmd);
+ if (dmz_bdev_is_dying(zmd->dev)) {
+ ret = -EIO;
+ goto out;
+ }
+
/* Get dirty blocks */
spin_lock(&zmd->mblk_lock);
list_splice_init(&zmd->mblk_dirty_list, &write_list);
@@ -743,7 +770,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd)
/* If there are no dirty metadata blocks, just flush the device cache */
if (list_empty(&write_list)) {
ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL);
- goto out;
+ goto err;
}
/*
@@ -753,7 +780,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd)
*/
ret = dmz_log_dirty_mblocks(zmd, &write_list);
if (ret)
- goto out;
+ goto err;
/*
* The log is on disk. It is now safe to update in place
@@ -761,11 +788,11 @@ int dmz_flush_metadata(struct dmz_metadata *zmd)
*/
ret = dmz_write_dirty_mblocks(zmd, &write_list, zmd->mblk_primary);
if (ret)
- goto out;
+ goto err;
ret = dmz_write_sb(zmd, zmd->mblk_primary);
if (ret)
- goto out;
+ goto err;
while (!list_empty(&write_list)) {
mblk = list_first_entry(&write_list, struct dmz_mblock, link);
@@ -780,16 +807,20 @@ int dmz_flush_metadata(struct dmz_metadata *zmd)
zmd->sb_gen++;
out:
- if (ret && !list_empty(&write_list)) {
- spin_lock(&zmd->mblk_lock);
- list_splice(&write_list, &zmd->mblk_dirty_list);
- spin_unlock(&zmd->mblk_lock);
- }
-
dmz_unlock_flush(zmd);
up_write(&zmd->mblk_sem);
return ret;
+
+err:
+ if (!list_empty(&write_list)) {
+ spin_lock(&zmd->mblk_lock);
+ list_splice(&write_list, &zmd->mblk_dirty_list);
+ spin_unlock(&zmd->mblk_lock);
+ }
+ if (!dmz_check_bdev(zmd->dev))
+ ret = -EIO;
+ goto out;
}
/*
@@ -1212,6 +1243,7 @@ static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
if (ret) {
dmz_dev_err(zmd->dev, "Get zone %u report failed",
dmz_id(zmd, zone));
+ dmz_check_bdev(zmd->dev);
return ret;
}
@@ -1534,7 +1566,7 @@ static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd)
struct dm_zone *zone;
if (list_empty(&zmd->map_rnd_list))
- return NULL;
+ return ERR_PTR(-EBUSY);
list_for_each_entry(zone, &zmd->map_rnd_list, link) {
if (dmz_is_buf(zone))
@@ -1545,7 +1577,7 @@ static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd)
return dzone;
}
- return NULL;
+ return ERR_PTR(-EBUSY);
}
/*
@@ -1556,7 +1588,7 @@ static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd)
struct dm_zone *zone;
if (list_empty(&zmd->map_seq_list))
- return NULL;
+ return ERR_PTR(-EBUSY);
list_for_each_entry(zone, &zmd->map_seq_list, link) {
if (!zone->bzone)
@@ -1565,7 +1597,7 @@ static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd)
return zone;
}
- return NULL;
+ return ERR_PTR(-EBUSY);
}
/*
@@ -1594,30 +1626,6 @@ struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd)
}
/*
- * Activate a zone (increment its reference count).
- */
-void dmz_activate_zone(struct dm_zone *zone)
-{
- set_bit(DMZ_ACTIVE, &zone->flags);
- atomic_inc(&zone->refcount);
-}
-
-/*
- * Deactivate a zone. This decrement the zone reference counter
- * and clears the active state of the zone once the count reaches 0,
- * indicating that all BIOs to the zone have completed. Returns
- * true if the zone was deactivated.
- */
-void dmz_deactivate_zone(struct dm_zone *zone)
-{
- if (atomic_dec_and_test(&zone->refcount)) {
- WARN_ON(!test_bit(DMZ_ACTIVE, &zone->flags));
- clear_bit_unlock(DMZ_ACTIVE, &zone->flags);
- smp_mb__after_atomic();
- }
-}
-
-/*
* Get the zone mapping a chunk, if the chunk is mapped already.
* If no mapping exist and the operation is WRITE, a zone is
* allocated and used to map the chunk.
@@ -1647,6 +1655,10 @@ again:
/* Alloate a random zone */
dzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND);
if (!dzone) {
+ if (dmz_bdev_is_dying(zmd->dev)) {
+ dzone = ERR_PTR(-EIO);
+ goto out;
+ }
dmz_wait_for_free_zones(zmd);
goto again;
}
@@ -1744,6 +1756,10 @@ again:
/* Alloate a random zone */
bzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND);
if (!bzone) {
+ if (dmz_bdev_is_dying(zmd->dev)) {
+ bzone = ERR_PTR(-EIO);
+ goto out;
+ }
dmz_wait_for_free_zones(zmd);
goto again;
}
diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c
index 44a119e12f1a..2fad512dce98 100644
--- a/drivers/md/dm-zoned-reclaim.c
+++ b/drivers/md/dm-zoned-reclaim.c
@@ -37,7 +37,7 @@ enum {
/*
* Number of seconds of target BIO inactivity to consider the target idle.
*/
-#define DMZ_IDLE_PERIOD (10UL * HZ)
+#define DMZ_IDLE_PERIOD (10UL * HZ)
/*
* Percentage of unmapped (free) random zones below which reclaim starts
@@ -81,6 +81,7 @@ static int dmz_reclaim_align_wp(struct dmz_reclaim *zrc, struct dm_zone *zone,
"Align zone %u wp %llu to %llu (wp+%u) blocks failed %d",
dmz_id(zmd, zone), (unsigned long long)wp_block,
(unsigned long long)block, nr_blocks, ret);
+ dmz_check_bdev(zrc->dev);
return ret;
}
@@ -134,6 +135,9 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc,
set_bit(DM_KCOPYD_WRITE_SEQ, &flags);
while (block < end_block) {
+ if (dev->flags & DMZ_BDEV_DYING)
+ return -EIO;
+
/* Get a valid region from the source zone */
ret = dmz_first_valid_block(zmd, src_zone, &block);
if (ret <= 0)
@@ -217,7 +221,7 @@ static int dmz_reclaim_buf(struct dmz_reclaim *zrc, struct dm_zone *dzone)
dmz_unlock_flush(zmd);
- return 0;
+ return ret;
}
/*
@@ -261,7 +265,7 @@ static int dmz_reclaim_seq_data(struct dmz_reclaim *zrc, struct dm_zone *dzone)
dmz_unlock_flush(zmd);
- return 0;
+ return ret;
}
/*
@@ -314,7 +318,7 @@ static int dmz_reclaim_rnd_data(struct dmz_reclaim *zrc, struct dm_zone *dzone)
dmz_unlock_flush(zmd);
- return 0;
+ return ret;
}
/*
@@ -336,7 +340,7 @@ static void dmz_reclaim_empty(struct dmz_reclaim *zrc, struct dm_zone *dzone)
/*
* Find a candidate zone for reclaim and process it.
*/
-static void dmz_reclaim(struct dmz_reclaim *zrc)
+static int dmz_do_reclaim(struct dmz_reclaim *zrc)
{
struct dmz_metadata *zmd = zrc->metadata;
struct dm_zone *dzone;
@@ -346,8 +350,8 @@ static void dmz_reclaim(struct dmz_reclaim *zrc)
/* Get a data zone */
dzone = dmz_get_zone_for_reclaim(zmd);
- if (!dzone)
- return;
+ if (IS_ERR(dzone))
+ return PTR_ERR(dzone);
start = jiffies;
@@ -393,13 +397,20 @@ static void dmz_reclaim(struct dmz_reclaim *zrc)
out:
if (ret) {
dmz_unlock_zone_reclaim(dzone);
- return;
+ return ret;
}
- (void) dmz_flush_metadata(zrc->metadata);
+ ret = dmz_flush_metadata(zrc->metadata);
+ if (ret) {
+ dmz_dev_debug(zrc->dev,
+ "Metadata flush for zone %u failed, err %d\n",
+ dmz_id(zmd, rzone), ret);
+ return ret;
+ }
dmz_dev_debug(zrc->dev, "Reclaimed zone %u in %u ms",
dmz_id(zmd, rzone), jiffies_to_msecs(jiffies - start));
+ return 0;
}
/*
@@ -444,6 +455,10 @@ static void dmz_reclaim_work(struct work_struct *work)
struct dmz_metadata *zmd = zrc->metadata;
unsigned int nr_rnd, nr_unmap_rnd;
unsigned int p_unmap_rnd;
+ int ret;
+
+ if (dmz_bdev_is_dying(zrc->dev))
+ return;
if (!dmz_should_reclaim(zrc)) {
mod_delayed_work(zrc->wq, &zrc->work, DMZ_IDLE_PERIOD);
@@ -473,7 +488,12 @@ static void dmz_reclaim_work(struct work_struct *work)
(dmz_target_idle(zrc) ? "Idle" : "Busy"),
p_unmap_rnd, nr_unmap_rnd, nr_rnd);
- dmz_reclaim(zrc);
+ ret = dmz_do_reclaim(zrc);
+ if (ret) {
+ dmz_dev_debug(zrc->dev, "Reclaim error %d\n", ret);
+ if (!dmz_check_bdev(zrc->dev))
+ return;
+ }
dmz_schedule_reclaim(zrc);
}
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
index 532bfce7f072..497a2bc5da51 100644
--- a/drivers/md/dm-zoned-target.c
+++ b/drivers/md/dm-zoned-target.c
@@ -79,6 +79,8 @@ static inline void dmz_bio_endio(struct bio *bio, blk_status_t status)
if (status != BLK_STS_OK && bio->bi_status == BLK_STS_OK)
bio->bi_status = status;
+ if (bio->bi_status != BLK_STS_OK)
+ bioctx->target->dev->flags |= DMZ_CHECK_BDEV;
if (atomic_dec_and_test(&bioctx->ref)) {
struct dm_zone *zone = bioctx->zone;
@@ -277,8 +279,8 @@ static int dmz_handle_buffered_write(struct dmz_target *dmz,
/* Get the buffer zone. One will be allocated if needed */
bzone = dmz_get_chunk_buffer(zmd, zone);
- if (!bzone)
- return -ENOSPC;
+ if (IS_ERR(bzone))
+ return PTR_ERR(bzone);
if (dmz_is_readonly(bzone))
return -EROFS;
@@ -389,6 +391,11 @@ static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw,
dmz_lock_metadata(zmd);
+ if (dmz->dev->flags & DMZ_BDEV_DYING) {
+ ret = -EIO;
+ goto out;
+ }
+
/*
* Get the data zone mapping the chunk. There may be no
* mapping for read and discard. If a mapping is obtained,
@@ -493,6 +500,8 @@ static void dmz_flush_work(struct work_struct *work)
/* Flush dirty metadata blocks */
ret = dmz_flush_metadata(dmz->metadata);
+ if (ret)
+ dmz_dev_debug(dmz->dev, "Metadata flush failed, rc=%d\n", ret);
/* Process queued flush requests */
while (1) {
@@ -513,22 +522,24 @@ static void dmz_flush_work(struct work_struct *work)
* Get a chunk work and start it to process a new BIO.
* If the BIO chunk has no work yet, create one.
*/
-static void dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio)
+static int dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio)
{
unsigned int chunk = dmz_bio_chunk(dmz->dev, bio);
struct dm_chunk_work *cw;
+ int ret = 0;
mutex_lock(&dmz->chunk_lock);
/* Get the BIO chunk work. If one is not active yet, create one */
cw = radix_tree_lookup(&dmz->chunk_rxtree, chunk);
if (!cw) {
- int ret;
/* Create a new chunk work */
cw = kmalloc(sizeof(struct dm_chunk_work), GFP_NOIO);
- if (!cw)
+ if (unlikely(!cw)) {
+ ret = -ENOMEM;
goto out;
+ }
INIT_WORK(&cw->work, dmz_chunk_work);
atomic_set(&cw->refcount, 0);
@@ -539,7 +550,6 @@ static void dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio)
ret = radix_tree_insert(&dmz->chunk_rxtree, chunk, cw);
if (unlikely(ret)) {
kfree(cw);
- cw = NULL;
goto out;
}
}
@@ -547,10 +557,58 @@ static void dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio)
bio_list_add(&cw->bio_list, bio);
dmz_get_chunk_work(cw);
+ dmz_reclaim_bio_acc(dmz->reclaim);
if (queue_work(dmz->chunk_wq, &cw->work))
dmz_get_chunk_work(cw);
out:
mutex_unlock(&dmz->chunk_lock);
+ return ret;
+}
+
+/*
+ * Check if the backing device is being removed. If it's on the way out,
+ * start failing I/O. Reclaim and metadata components also call this
+ * function to cleanly abort operation in the event of such failure.
+ */
+bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev)
+{
+ if (dmz_dev->flags & DMZ_BDEV_DYING)
+ return true;
+
+ if (dmz_dev->flags & DMZ_CHECK_BDEV)
+ return !dmz_check_bdev(dmz_dev);
+
+ if (blk_queue_dying(bdev_get_queue(dmz_dev->bdev))) {
+ dmz_dev_warn(dmz_dev, "Backing device queue dying");
+ dmz_dev->flags |= DMZ_BDEV_DYING;
+ }
+
+ return dmz_dev->flags & DMZ_BDEV_DYING;
+}
+
+/*
+ * Check the backing device availability. This detects such events as
+ * backing device going offline due to errors, media removals, etc.
+ * This check is less efficient than dmz_bdev_is_dying() and should
+ * only be performed as a part of error handling.
+ */
+bool dmz_check_bdev(struct dmz_dev *dmz_dev)
+{
+ struct gendisk *disk;
+
+ dmz_dev->flags &= ~DMZ_CHECK_BDEV;
+
+ if (dmz_bdev_is_dying(dmz_dev))
+ return false;
+
+ disk = dmz_dev->bdev->bd_disk;
+ if (disk->fops->check_events &&
+ disk->fops->check_events(disk, 0) & DISK_EVENT_MEDIA_CHANGE) {
+ dmz_dev_warn(dmz_dev, "Backing device offline");
+ dmz_dev->flags |= DMZ_BDEV_DYING;
+ }
+
+ return !(dmz_dev->flags & DMZ_BDEV_DYING);
}
/*
@@ -564,6 +622,10 @@ static int dmz_map(struct dm_target *ti, struct bio *bio)
sector_t sector = bio->bi_iter.bi_sector;
unsigned int nr_sectors = bio_sectors(bio);
sector_t chunk_sector;
+ int ret;
+
+ if (dmz_bdev_is_dying(dmz->dev))
+ return DM_MAPIO_KILL;
dmz_dev_debug(dev, "BIO op %d sector %llu + %u => chunk %llu, block %llu, %u blocks",
bio_op(bio), (unsigned long long)sector, nr_sectors,
@@ -601,8 +663,14 @@ static int dmz_map(struct dm_target *ti, struct bio *bio)
dm_accept_partial_bio(bio, dev->zone_nr_sectors - chunk_sector);
/* Now ready to handle this BIO */
- dmz_reclaim_bio_acc(dmz->reclaim);
- dmz_queue_chunk_work(dmz, bio);
+ ret = dmz_queue_chunk_work(dmz, bio);
+ if (ret) {
+ dmz_dev_debug(dmz->dev,
+ "BIO op %d, can't process chunk %llu, err %i\n",
+ bio_op(bio), (u64)dmz_bio_chunk(dmz->dev, bio),
+ ret);
+ return DM_MAPIO_REQUEUE;
+ }
return DM_MAPIO_SUBMITTED;
}
@@ -855,6 +923,9 @@ static int dmz_prepare_ioctl(struct dm_target *ti,
{
struct dmz_target *dmz = ti->private;
+ if (!dmz_check_bdev(dmz->dev))
+ return -EIO;
+
*bdev = dmz->dev->bdev;
return 0;
diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h
index 12419f0bfe78..2662746ba8b9 100644
--- a/drivers/md/dm-zoned.h
+++ b/drivers/md/dm-zoned.h
@@ -56,6 +56,8 @@ struct dmz_dev {
unsigned int nr_zones;
+ unsigned int flags;
+
sector_t zone_nr_sectors;
unsigned int zone_nr_sectors_shift;
@@ -67,6 +69,10 @@ struct dmz_dev {
(dev)->zone_nr_sectors_shift)
#define dmz_chunk_block(dev, b) ((b) & ((dev)->zone_nr_blocks - 1))
+/* Device flags. */
+#define DMZ_BDEV_DYING (1 << 0)
+#define DMZ_CHECK_BDEV (2 << 0)
+
/*
* Zone descriptor.
*/
@@ -115,7 +121,6 @@ enum {
DMZ_BUF,
/* Zone internal state */
- DMZ_ACTIVE,
DMZ_RECLAIM,
DMZ_SEQ_WRITE_ERR,
};
@@ -128,7 +133,6 @@ enum {
#define dmz_is_empty(z) ((z)->wp_block == 0)
#define dmz_is_offline(z) test_bit(DMZ_OFFLINE, &(z)->flags)
#define dmz_is_readonly(z) test_bit(DMZ_READ_ONLY, &(z)->flags)
-#define dmz_is_active(z) test_bit(DMZ_ACTIVE, &(z)->flags)
#define dmz_in_reclaim(z) test_bit(DMZ_RECLAIM, &(z)->flags)
#define dmz_seq_write_err(z) test_bit(DMZ_SEQ_WRITE_ERR, &(z)->flags)
@@ -188,8 +192,30 @@ void dmz_unmap_zone(struct dmz_metadata *zmd, struct dm_zone *zone);
unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd);
unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd);
-void dmz_activate_zone(struct dm_zone *zone);
-void dmz_deactivate_zone(struct dm_zone *zone);
+/*
+ * Activate a zone (increment its reference count).
+ */
+static inline void dmz_activate_zone(struct dm_zone *zone)
+{
+ atomic_inc(&zone->refcount);
+}
+
+/*
+ * Deactivate a zone. This decrement the zone reference counter
+ * indicating that all BIOs to the zone have completed when the count is 0.
+ */
+static inline void dmz_deactivate_zone(struct dm_zone *zone)
+{
+ atomic_dec(&zone->refcount);
+}
+
+/*
+ * Test if a zone is active, that is, has a refcount > 0.
+ */
+static inline bool dmz_is_active(struct dm_zone *zone)
+{
+ return atomic_read(&zone->refcount);
+}
int dmz_lock_zone_reclaim(struct dm_zone *zone);
void dmz_unlock_zone_reclaim(struct dm_zone *zone);
@@ -225,4 +251,10 @@ void dmz_resume_reclaim(struct dmz_reclaim *zrc);
void dmz_reclaim_bio_acc(struct dmz_reclaim *zrc);
void dmz_schedule_reclaim(struct dmz_reclaim *zrc);
+/*
+ * Functions defined in dm-zoned-target.c
+ */
+bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev);
+bool dmz_check_bdev(struct dmz_dev *dmz_dev);
+
#endif /* DM_ZONED_H */
diff --git a/drivers/md/md.c b/drivers/md/md.c
index b27a69388dcd..b942c74f1ce8 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1763,8 +1763,15 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
if (!(le32_to_cpu(sb->feature_map) &
MD_FEATURE_RECOVERY_BITMAP))
rdev->saved_raid_disk = -1;
- } else
- set_bit(In_sync, &rdev->flags);
+ } else {
+ /*
+ * If the array is FROZEN, then the device can't
+ * be in_sync with rest of array.
+ */
+ if (!test_bit(MD_RECOVERY_FROZEN,
+ &mddev->recovery))
+ set_bit(In_sync, &rdev->flags);
+ }
rdev->raid_disk = role;
break;
}
@@ -4101,7 +4108,7 @@ array_state_show(struct mddev *mddev, char *page)
{
enum array_state st = inactive;
- if (mddev->pers)
+ if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags))
switch(mddev->ro) {
case 1:
st = readonly;
@@ -5662,9 +5669,6 @@ int md_run(struct mddev *mddev)
md_update_sb(mddev, 0);
md_new_event(mddev);
- sysfs_notify_dirent_safe(mddev->sysfs_state);
- sysfs_notify_dirent_safe(mddev->sysfs_action);
- sysfs_notify(&mddev->kobj, NULL, "degraded");
return 0;
abort:
@@ -5685,6 +5689,7 @@ static int do_md_run(struct mddev *mddev)
{
int err;
+ set_bit(MD_NOT_READY, &mddev->flags);
err = md_run(mddev);
if (err)
goto out;
@@ -5702,9 +5707,14 @@ static int do_md_run(struct mddev *mddev)
set_capacity(mddev->gendisk, mddev->array_sectors);
revalidate_disk(mddev->gendisk);
+ clear_bit(MD_NOT_READY, &mddev->flags);
mddev->changed = 1;
kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
+ sysfs_notify_dirent_safe(mddev->sysfs_state);
+ sysfs_notify_dirent_safe(mddev->sysfs_action);
+ sysfs_notify(&mddev->kobj, NULL, "degraded");
out:
+ clear_bit(MD_NOT_READY, &mddev->flags);
return err;
}
@@ -7605,9 +7615,9 @@ static void status_unused(struct seq_file *seq)
static int status_resync(struct seq_file *seq, struct mddev *mddev)
{
sector_t max_sectors, resync, res;
- unsigned long dt, db;
- sector_t rt;
- int scale;
+ unsigned long dt, db = 0;
+ sector_t rt, curr_mark_cnt, resync_mark_cnt;
+ int scale, recovery_active;
unsigned int per_milli;
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
@@ -7677,22 +7687,30 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev)
* db: blocks written from mark until now
* rt: remaining time
*
- * rt is a sector_t, so could be 32bit or 64bit.
- * So we divide before multiply in case it is 32bit and close
- * to the limit.
- * We scale the divisor (db) by 32 to avoid losing precision
- * near the end of resync when the number of remaining sectors
- * is close to 'db'.
- * We then divide rt by 32 after multiplying by db to compensate.
- * The '+1' avoids division by zero if db is very small.
+ * rt is a sector_t, which is always 64bit now. We are keeping
+ * the original algorithm, but it is not really necessary.
+ *
+ * Original algorithm:
+ * So we divide before multiply in case it is 32bit and close
+ * to the limit.
+ * We scale the divisor (db) by 32 to avoid losing precision
+ * near the end of resync when the number of remaining sectors
+ * is close to 'db'.
+ * We then divide rt by 32 after multiplying by db to compensate.
+ * The '+1' avoids division by zero if db is very small.
*/
dt = ((jiffies - mddev->resync_mark) / HZ);
if (!dt) dt++;
- db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
- - mddev->resync_mark_cnt;
+
+ curr_mark_cnt = mddev->curr_mark_cnt;
+ recovery_active = atomic_read(&mddev->recovery_active);
+ resync_mark_cnt = mddev->resync_mark_cnt;
+
+ if (curr_mark_cnt >= (recovery_active + resync_mark_cnt))
+ db = curr_mark_cnt - (recovery_active + resync_mark_cnt);
rt = max_sectors - resync; /* number of remaining sectors */
- sector_div(rt, db/32+1);
+ rt = div64_u64(rt, db/32+1);
rt *= dt;
rt >>= 5;
@@ -8718,6 +8736,18 @@ static void md_start_sync(struct work_struct *ws)
*/
void md_check_recovery(struct mddev *mddev)
{
+ if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) {
+ /* Write superblock - thread that called mddev_suspend()
+ * holds reconfig_mutex for us.
+ */
+ set_bit(MD_UPDATING_SB, &mddev->flags);
+ smp_mb__after_atomic();
+ if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags))
+ md_update_sb(mddev, 0);
+ clear_bit_unlock(MD_UPDATING_SB, &mddev->flags);
+ wake_up(&mddev->sb_wait);
+ }
+
if (mddev->suspended)
return;
@@ -8747,6 +8777,7 @@ void md_check_recovery(struct mddev *mddev)
if (mddev_trylock(mddev)) {
int spares = 0;
+ bool try_set_sync = mddev->safemode != 0;
if (!mddev->external && mddev->safemode == 1)
mddev->safemode = 0;
@@ -8792,7 +8823,7 @@ void md_check_recovery(struct mddev *mddev)
}
}
- if (!mddev->external && !mddev->in_sync) {
+ if (try_set_sync && !mddev->external && !mddev->in_sync) {
spin_lock(&mddev->lock);
set_in_sync(mddev);
spin_unlock(&mddev->lock);
@@ -8877,16 +8908,6 @@ void md_check_recovery(struct mddev *mddev)
unlock:
wake_up(&mddev->sb_wait);
mddev_unlock(mddev);
- } else if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) {
- /* Write superblock - thread that called mddev_suspend()
- * holds reconfig_mutex for us.
- */
- set_bit(MD_UPDATING_SB, &mddev->flags);
- smp_mb__after_atomic();
- if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags))
- md_update_sb(mddev, 0);
- clear_bit_unlock(MD_UPDATING_SB, &mddev->flags);
- wake_up(&mddev->sb_wait);
}
}
EXPORT_SYMBOL(md_check_recovery);
@@ -8898,7 +8919,8 @@ void md_reap_sync_thread(struct mddev *mddev)
/* resync has finished, collect result */
md_unregister_thread(&mddev->sync_thread);
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
- !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+ !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
+ mddev->degraded != mddev->raid_disks) {
/* success...*/
/* activate any spares */
if (mddev->pers->spare_active(mddev)) {
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 11696aba94e3..69bc0d5550cd 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -243,6 +243,9 @@ enum mddev_flags {
MD_UPDATING_SB, /* md_check_recovery is updating the metadata
* without explicitly holding reconfig_mutex.
*/
+ MD_NOT_READY, /* do_md_run() is active, so 'array_state'
+ * must not report that array is ready yet
+ */
};
enum mddev_sb_flags {
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index 58b319757b1e..8aae0624a297 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -628,39 +628,40 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
new_parent = shadow_current(s);
+ pn = dm_block_data(new_parent);
+ size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
+ sizeof(__le64) : s->info->value_type.size;
+
+ /* create & init the left block */
r = new_block(s->info, &left);
if (r < 0)
return r;
+ ln = dm_block_data(left);
+ nr_left = le32_to_cpu(pn->header.nr_entries) / 2;
+
+ ln->header.flags = pn->header.flags;
+ ln->header.nr_entries = cpu_to_le32(nr_left);
+ ln->header.max_entries = pn->header.max_entries;
+ ln->header.value_size = pn->header.value_size;
+ memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
+ memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);
+
+ /* create & init the right block */
r = new_block(s->info, &right);
if (r < 0) {
unlock_block(s->info, left);
return r;
}
- pn = dm_block_data(new_parent);
- ln = dm_block_data(left);
rn = dm_block_data(right);
-
- nr_left = le32_to_cpu(pn->header.nr_entries) / 2;
nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;
- ln->header.flags = pn->header.flags;
- ln->header.nr_entries = cpu_to_le32(nr_left);
- ln->header.max_entries = pn->header.max_entries;
- ln->header.value_size = pn->header.value_size;
-
rn->header.flags = pn->header.flags;
rn->header.nr_entries = cpu_to_le32(nr_right);
rn->header.max_entries = pn->header.max_entries;
rn->header.value_size = pn->header.value_size;
-
- memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));
-
- size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
- sizeof(__le64) : s->info->value_type.size;
- memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);
memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
nr_right * size);
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index 4aed69d9dd17..b23cac2c4738 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -248,7 +248,7 @@ static int out(struct sm_metadata *smm)
}
if (smm->recursion_count == 1)
- apply_bops(smm);
+ r = apply_bops(smm);
smm->recursion_count--;
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 5ecba9eef441..204adde004a3 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -26,6 +26,9 @@
#include "raid0.h"
#include "raid5.h"
+static int default_layout = 0;
+module_param(default_layout, int, 0644);
+
#define UNSUPPORTED_MDDEV_FLAGS \
((1L << MD_HAS_JOURNAL) | \
(1L << MD_JOURNAL_CLEAN) | \
@@ -146,6 +149,19 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
}
pr_debug("md/raid0:%s: FINAL %d zones\n",
mdname(mddev), conf->nr_strip_zones);
+
+ if (conf->nr_strip_zones == 1) {
+ conf->layout = RAID0_ORIG_LAYOUT;
+ } else if (default_layout == RAID0_ORIG_LAYOUT ||
+ default_layout == RAID0_ALT_MULTIZONE_LAYOUT) {
+ conf->layout = default_layout;
+ } else {
+ pr_err("md/raid0:%s: cannot assemble multi-zone RAID0 with default_layout setting\n",
+ mdname(mddev));
+ pr_err("md/raid0: please set raid0.default_layout to 1 or 2\n");
+ err = -ENOTSUPP;
+ goto abort;
+ }
/*
* now since we have the hard sector sizes, we can make sure
* chunk size is a multiple of that sector size
@@ -552,10 +568,12 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
{
+ struct r0conf *conf = mddev->private;
struct strip_zone *zone;
struct md_rdev *tmp_dev;
sector_t bio_sector;
sector_t sector;
+ sector_t orig_sector;
unsigned chunk_sects;
unsigned sectors;
@@ -588,8 +606,21 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
bio = split;
}
+ orig_sector = sector;
zone = find_zone(mddev->private, &sector);
- tmp_dev = map_sector(mddev, zone, sector, &sector);
+ switch (conf->layout) {
+ case RAID0_ORIG_LAYOUT:
+ tmp_dev = map_sector(mddev, zone, orig_sector, &sector);
+ break;
+ case RAID0_ALT_MULTIZONE_LAYOUT:
+ tmp_dev = map_sector(mddev, zone, sector, &sector);
+ break;
+ default:
+ WARN(1, "md/raid0:%s: Invalid layout\n", mdname(mddev));
+ bio_io_error(bio);
+ return true;
+ }
+
bio_set_dev(bio, tmp_dev->bdev);
bio->bi_iter.bi_sector = sector + zone->dev_start +
tmp_dev->data_offset;
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h
index 540e65d92642..3816e5477db1 100644
--- a/drivers/md/raid0.h
+++ b/drivers/md/raid0.h
@@ -8,11 +8,25 @@ struct strip_zone {
int nb_dev; /* # of devices attached to the zone */
};
+/* Linux 3.14 (20d0189b101) made an unintended change to
+ * the RAID0 layout for multi-zone arrays (where devices aren't all
+ * the same size.
+ * RAID0_ORIG_LAYOUT restores the original layout
+ * RAID0_ALT_MULTIZONE_LAYOUT uses the altered layout
+ * The layouts are identical when there is only one zone (all
+ * devices the same size).
+ */
+
+enum r0layout {
+ RAID0_ORIG_LAYOUT = 1,
+ RAID0_ALT_MULTIZONE_LAYOUT = 2,
+};
struct r0conf {
struct strip_zone *strip_zone;
struct md_rdev **devlist; /* lists of rdevs, pointed to
* by strip_zone->dev */
int nr_strip_zones;
+ enum r0layout layout;
};
#endif
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 31c4391f6a62..f46ac9db9edb 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -435,19 +435,21 @@ static void raid1_end_write_request(struct bio *bio)
/* We never try FailFast to WriteMostly devices */
!test_bit(WriteMostly, &rdev->flags)) {
md_error(r1_bio->mddev, rdev);
- if (!test_bit(Faulty, &rdev->flags))
- /* This is the only remaining device,
- * We need to retry the write without
- * FailFast
- */
- set_bit(R1BIO_WriteError, &r1_bio->state);
- else {
- /* Finished with this branch */
- r1_bio->bios[mirror] = NULL;
- to_put = bio;
- }
- } else
+ }
+
+ /*
+ * When the device is faulty, it is not necessary to
+ * handle write error.
+ * For failfast, this is the only remaining device,
+ * We need to retry the write without FailFast.
+ */
+ if (!test_bit(Faulty, &rdev->flags))
set_bit(R1BIO_WriteError, &r1_bio->state);
+ else {
+ /* Finished with this branch */
+ r1_bio->bios[mirror] = NULL;
+ to_put = bio;
+ }
} else {
/*
* Set R1BIO_Uptodate in our master bio, so that we
@@ -3097,6 +3099,13 @@ static int raid1_run(struct mddev *mddev)
!test_bit(In_sync, &conf->mirrors[i].rdev->flags) ||
test_bit(Faulty, &conf->mirrors[i].rdev->flags))
mddev->degraded++;
+ /*
+ * RAID1 needs at least one disk in active
+ */
+ if (conf->raid_disks - mddev->degraded < 1) {
+ ret = -EINVAL;
+ goto abort;
+ }
if (conf->raid_disks - mddev->degraded == 1)
mddev->recovery_cp = MaxSector;
@@ -3130,8 +3139,12 @@ static int raid1_run(struct mddev *mddev)
ret = md_integrity_register(mddev);
if (ret) {
md_unregister_thread(&mddev->thread);
- raid1_free(mddev, conf);
+ goto abort;
}
+ return 0;
+
+abort:
+ raid1_free(mddev, conf);
return ret;
}
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 433e78f453da..d08d77b9674f 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -226,7 +226,7 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
out_free_pages:
while (--j >= 0)
- resync_free_pages(&rps[j * 2]);
+ resync_free_pages(&rps[j]);
j = 0;
out_free_bio:
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 65608c6b6836..d5c14d56a714 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2538,7 +2538,8 @@ static void raid5_end_read_request(struct bio * bi)
int set_bad = 0;
clear_bit(R5_UPTODATE, &sh->dev[i].flags);
- atomic_inc(&rdev->read_errors);
+ if (!(bi->bi_status == BLK_STS_PROTECTION))
+ atomic_inc(&rdev->read_errors);
if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
pr_warn_ratelimited(
"md/raid:%s: read error on replacement device (sector %llu on %s).\n",
@@ -2570,7 +2571,9 @@ static void raid5_end_read_request(struct bio * bi)
&& !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
retry = 1;
if (retry)
- if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
+ if (sh->qd_idx >= 0 && sh->pd_idx == i)
+ set_bit(R5_ReadError, &sh->dev[i].flags);
+ else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
set_bit(R5_ReadError, &sh->dev[i].flags);
clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
} else
@@ -5718,7 +5721,8 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
do_flush = false;
}
- set_bit(STRIPE_HANDLE, &sh->state);
+ if (!sh->batch_head || sh == sh->batch_head)
+ set_bit(STRIPE_HANDLE, &sh->state);
clear_bit(STRIPE_DELAYED, &sh->state);
if ((!sh->batch_head || sh == sh->batch_head) &&
(bi->bi_opf & REQ_SYNC) &&