summaryrefslogtreecommitdiff
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/dm-kcopyd.c19
-rw-r--r--drivers/md/dm-snap.c22
-rw-r--r--drivers/md/dm-thin.c55
-rw-r--r--drivers/md/raid10.c3
-rw-r--r--drivers/md/raid5.c2
5 files changed, 90 insertions, 11 deletions
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 54c308e6704f..04248394843e 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -55,15 +55,17 @@ struct dm_kcopyd_client {
struct dm_kcopyd_throttle *throttle;
/*
- * We maintain three lists of jobs:
+ * We maintain four lists of jobs:
*
* i) jobs waiting for pages
* ii) jobs that have pages, and are waiting for the io to be issued.
- * iii) jobs that have completed.
+ * iii) jobs that don't need to do any IO and just run a callback
+ * iv) jobs that have completed.
*
- * All three of these are protected by job_lock.
+ * All four of these are protected by job_lock.
*/
spinlock_t job_lock;
+ struct list_head callback_jobs;
struct list_head complete_jobs;
struct list_head io_jobs;
struct list_head pages_jobs;
@@ -583,6 +585,7 @@ static void do_work(struct work_struct *work)
struct dm_kcopyd_client *kc = container_of(work,
struct dm_kcopyd_client, kcopyd_work);
struct blk_plug plug;
+ unsigned long flags;
/*
* The order that these are called is *very* important.
@@ -591,6 +594,10 @@ static void do_work(struct work_struct *work)
* list. io jobs call wake when they complete and it all
* starts again.
*/
+ spin_lock_irqsave(&kc->job_lock, flags);
+ list_splice_tail_init(&kc->callback_jobs, &kc->complete_jobs);
+ spin_unlock_irqrestore(&kc->job_lock, flags);
+
blk_start_plug(&plug);
process_jobs(&kc->complete_jobs, kc, run_complete_job);
process_jobs(&kc->pages_jobs, kc, run_pages_job);
@@ -608,7 +615,7 @@ static void dispatch_job(struct kcopyd_job *job)
struct dm_kcopyd_client *kc = job->kc;
atomic_inc(&kc->nr_jobs);
if (unlikely(!job->source.count))
- push(&kc->complete_jobs, job);
+ push(&kc->callback_jobs, job);
else if (job->pages == &zero_page_list)
push(&kc->io_jobs, job);
else
@@ -795,7 +802,7 @@ void dm_kcopyd_do_callback(void *j, int read_err, unsigned long write_err)
job->read_err = read_err;
job->write_err = write_err;
- push(&kc->complete_jobs, job);
+ push(&kc->callback_jobs, job);
wake(kc);
}
EXPORT_SYMBOL(dm_kcopyd_do_callback);
@@ -825,6 +832,7 @@ struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *thro
return ERR_PTR(-ENOMEM);
spin_lock_init(&kc->job_lock);
+ INIT_LIST_HEAD(&kc->callback_jobs);
INIT_LIST_HEAD(&kc->complete_jobs);
INIT_LIST_HEAD(&kc->io_jobs);
INIT_LIST_HEAD(&kc->pages_jobs);
@@ -874,6 +882,7 @@ void dm_kcopyd_client_destroy(struct dm_kcopyd_client *kc)
/* Wait for completion of all jobs submitted by this client. */
wait_event(kc->destroyq, !atomic_read(&kc->nr_jobs));
+ BUG_ON(!list_empty(&kc->callback_jobs));
BUG_ON(!list_empty(&kc->complete_jobs));
BUG_ON(!list_empty(&kc->io_jobs));
BUG_ON(!list_empty(&kc->pages_jobs));
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index e108deebbaaa..5d3797728b9c 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -19,6 +19,7 @@
#include <linux/vmalloc.h>
#include <linux/log2.h>
#include <linux/dm-kcopyd.h>
+#include <linux/semaphore.h>
#include "dm.h"
@@ -105,6 +106,9 @@ struct dm_snapshot {
/* The on disk metadata handler */
struct dm_exception_store *store;
+ /* Maximum number of in-flight COW jobs. */
+ struct semaphore cow_count;
+
struct dm_kcopyd_client *kcopyd_client;
/* Wait for events based on state_bits */
@@ -145,6 +149,19 @@ struct dm_snapshot {
#define RUNNING_MERGE 0
#define SHUTDOWN_MERGE 1
+/*
+ * Maximum number of chunks being copied on write.
+ *
+ * The value was decided experimentally as a trade-off between memory
+ * consumption, stalling the kernel's workqueues and maintaining a high enough
+ * throughput.
+ */
+#define DEFAULT_COW_THRESHOLD 2048
+
+static int cow_threshold = DEFAULT_COW_THRESHOLD;
+module_param_named(snapshot_cow_threshold, cow_threshold, int, 0644);
+MODULE_PARM_DESC(snapshot_cow_threshold, "Maximum number of chunks being copied on write");
+
DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
"A percentage of time allocated for copy on write");
@@ -1190,6 +1207,8 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad_hash_tables;
}
+ sema_init(&s->cow_count, (cow_threshold > 0) ? cow_threshold : INT_MAX);
+
s->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle);
if (IS_ERR(s->kcopyd_client)) {
r = PTR_ERR(s->kcopyd_client);
@@ -1563,6 +1582,7 @@ static void copy_callback(int read_err, unsigned long write_err, void *context)
}
list_add(&pe->out_of_order_entry, lh);
}
+ up(&s->cow_count);
}
/*
@@ -1586,6 +1606,7 @@ static void start_copy(struct dm_snap_pending_exception *pe)
dest.count = src.count;
/* Hand over to kcopyd */
+ down(&s->cow_count);
dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe);
}
@@ -1606,6 +1627,7 @@ static void start_full_bio(struct dm_snap_pending_exception *pe,
pe->full_bio_end_io = bio->bi_end_io;
pe->full_bio_private = bio->bi_private;
+ down(&s->cow_count);
callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client,
copy_callback, pe);
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index bc4e6825ff62..07eaa9f90712 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -256,6 +256,7 @@ struct pool {
spinlock_t lock;
struct bio_list deferred_flush_bios;
+ struct bio_list deferred_flush_completions;
struct list_head prepared_mappings;
struct list_head prepared_discards;
struct list_head active_thins;
@@ -920,6 +921,39 @@ static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
mempool_free(m, m->tc->pool->mapping_pool);
}
+static void complete_overwrite_bio(struct thin_c *tc, struct bio *bio)
+{
+ struct pool *pool = tc->pool;
+ unsigned long flags;
+
+ /*
+ * If the bio has the REQ_FUA flag set we must commit the metadata
+ * before signaling its completion.
+ */
+ if (!bio_triggers_commit(tc, bio)) {
+ bio_endio(bio);
+ return;
+ }
+
+ /*
+ * Complete bio with an error if earlier I/O caused changes to the
+ * metadata that can't be committed, e.g, due to I/O errors on the
+ * metadata device.
+ */
+ if (dm_thin_aborted_changes(tc->td)) {
+ bio_io_error(bio);
+ return;
+ }
+
+ /*
+ * Batch together any bios that trigger commits and then issue a
+ * single commit for them in process_deferred_bios().
+ */
+ spin_lock_irqsave(&pool->lock, flags);
+ bio_list_add(&pool->deferred_flush_completions, bio);
+ spin_unlock_irqrestore(&pool->lock, flags);
+}
+
static void process_prepared_mapping(struct dm_thin_new_mapping *m)
{
struct thin_c *tc = m->tc;
@@ -952,7 +986,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
*/
if (bio) {
inc_remap_and_issue_cell(tc, m->cell, m->data_block);
- bio_endio(bio);
+ complete_overwrite_bio(tc, bio);
} else {
inc_all_io_entry(tc->pool, m->cell->holder);
remap_and_issue(tc, m->cell->holder, m->data_block);
@@ -2228,7 +2262,7 @@ static void process_deferred_bios(struct pool *pool)
{
unsigned long flags;
struct bio *bio;
- struct bio_list bios;
+ struct bio_list bios, bio_completions;
struct thin_c *tc;
tc = get_first_thin(pool);
@@ -2239,26 +2273,36 @@ static void process_deferred_bios(struct pool *pool)
}
/*
- * If there are any deferred flush bios, we must commit
- * the metadata before issuing them.
+ * If there are any deferred flush bios, we must commit the metadata
+ * before issuing them or signaling their completion.
*/
bio_list_init(&bios);
+ bio_list_init(&bio_completions);
+
spin_lock_irqsave(&pool->lock, flags);
bio_list_merge(&bios, &pool->deferred_flush_bios);
bio_list_init(&pool->deferred_flush_bios);
+
+ bio_list_merge(&bio_completions, &pool->deferred_flush_completions);
+ bio_list_init(&pool->deferred_flush_completions);
spin_unlock_irqrestore(&pool->lock, flags);
- if (bio_list_empty(&bios) &&
+ if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) &&
!(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool)))
return;
if (commit(pool)) {
+ bio_list_merge(&bios, &bio_completions);
+
while ((bio = bio_list_pop(&bios)))
bio_io_error(bio);
return;
}
pool->last_commit_jiffies = jiffies;
+ while ((bio = bio_list_pop(&bio_completions)))
+ bio_endio(bio);
+
while ((bio = bio_list_pop(&bios)))
generic_make_request(bio);
}
@@ -2885,6 +2929,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout);
spin_lock_init(&pool->lock);
bio_list_init(&pool->deferred_flush_bios);
+ bio_list_init(&pool->deferred_flush_completions);
INIT_LIST_HEAD(&pool->prepared_mappings);
INIT_LIST_HEAD(&pool->prepared_discards);
INIT_LIST_HEAD(&pool->active_thins);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 8d613652d0e2..69e9abf00c74 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -3755,6 +3755,8 @@ static int run(struct mddev *mddev)
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
mddev->sync_thread = md_register_thread(md_do_sync, mddev,
"reshape");
+ if (!mddev->sync_thread)
+ goto out_free_conf;
}
return 0;
@@ -4442,7 +4444,6 @@ bio_full:
atomic_inc(&r10_bio->remaining);
read_bio->bi_next = NULL;
generic_make_request(read_bio);
- sector_nr += nr_sectors;
sectors_done += nr_sectors;
if (sector_nr <= last)
goto read_more;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 0841d8f10a58..5e65dc6def7e 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -6973,6 +6973,8 @@ static int run(struct mddev *mddev)
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
mddev->sync_thread = md_register_thread(md_do_sync, mddev,
"reshape");
+ if (!mddev->sync_thread)
+ goto abort;
}
/* Ok, everything is just fine now */