From 7e5c1e830b2310359a4cfbbf89895dde4abd996a Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Fri, 8 Feb 2008 02:09:49 +0000
Subject: dm: add missing memory barrier to dm_suspend

Add memory barrier to fix atomic_read of pending value.

Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'drivers')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index f2d24eb3208c..466a6bf0742f 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1410,6 +1410,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 	while (1) {
 		set_current_state(TASK_INTERRUPTIBLE);
 
+		smp_mb();
 		if (!atomic_read(&md->pending) || signal_pending(current))
 			break;
 
-- 
cgit v1.2.3


From b9249e556877643b940e4543824a3de5c85bce49 Mon Sep 17 00:00:00 2001
From: Alasdair G Kergon <agk@redhat.com>
Date: Fri, 8 Feb 2008 02:09:51 +0000
Subject: dm: mark function lists static

Mark the _inits and _exits function-pointer arrays static.

Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 466a6bf0742f..5f0f559d3b92 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -181,7 +181,7 @@ static void local_exit(void)
 	DMINFO("cleaned up");
 }
 
-int (*_inits[])(void) __initdata = {
+static int (*_inits[])(void) __initdata = {
 	local_init,
 	dm_target_init,
 	dm_linear_init,
@@ -189,7 +189,7 @@ int (*_inits[])(void) __initdata = {
 	dm_interface_init,
 };
 
-void (*_exits[])(void) = {
+static void (*_exits[])(void) = {
 	local_exit,
 	dm_target_exit,
 	dm_linear_exit,
-- 
cgit v1.2.3


From 27238b2bea89b1808b570bece6777ab2abc52fe2 Mon Sep 17 00:00:00 2001
From: Alasdair G Kergon <agk@redhat.com>
Date: Fri, 8 Feb 2008 02:09:53 +0000
Subject: dm ioctl: remove lock_kernel

Remove lock_kernel() from the device-mapper ioctls - there should
be sufficient internal locking already where required.

Also remove some superfluous casts.

Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-ioctl.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 9627fa0f9470..4aa1f78b78f0 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -702,7 +702,7 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size)
 	int r;
 	char *new_name = (char *) param + param->data_start;
 
-	if (new_name < (char *) param->data ||
+	if (new_name < param->data ||
 	    invalid_str(new_name, (void *) param + param_size)) {
 		DMWARN("Invalid new logical volume name supplied.");
 		return -EINVAL;
@@ -728,7 +728,7 @@ static int dev_set_geometry(struct dm_ioctl *param, size_t param_size)
 	if (!md)
 		return -ENXIO;
 
-	if (geostr < (char *) param->data ||
+	if (geostr < param->data ||
 	    invalid_str(geostr, (void *) param + param_size)) {
 		DMWARN("Invalid geometry supplied.");
 		goto out;
@@ -1397,13 +1397,11 @@ static int validate_params(uint cmd, struct dm_ioctl *param)
 	return 0;
 }
 
-static int ctl_ioctl(struct inode *inode, struct file *file,
-		     uint command, ulong u)
+static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
 {
 	int r = 0;
 	unsigned int cmd;
 	struct dm_ioctl *param;
-	struct dm_ioctl __user *user = (struct dm_ioctl __user *) u;
 	ioctl_fn fn = NULL;
 	size_t param_size;
 
@@ -1471,8 +1469,13 @@ static int ctl_ioctl(struct inode *inode, struct file *file,
 	return r;
 }
 
+static long dm_ctl_ioctl(struct file *file, uint command, ulong u)
+{
+	return (long)ctl_ioctl(command, (struct dm_ioctl __user *)u);
+}
+
 static const struct file_operations _ctl_fops = {
-	.ioctl	 = ctl_ioctl,
+	.unlocked_ioctl	 = dm_ctl_ioctl,
 	.owner	 = THIS_MODULE,
 };
 
-- 
cgit v1.2.3


From 76c072b48e39e9291fbf02d6c912cf27d65e093d Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Fri, 8 Feb 2008 02:09:56 +0000
Subject: dm ioctl: move compat code

Move compat_ioctl handling into dm-ioctl.c.

Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-ioctl.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 4aa1f78b78f0..9c491397a51d 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -15,6 +15,7 @@
 #include <linux/slab.h>
 #include <linux/dm-ioctl.h>
 #include <linux/hdreg.h>
+#include <linux/compat.h>
 
 #include <asm/uaccess.h>
 
@@ -1350,10 +1351,10 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param)
 {
 	struct dm_ioctl tmp, *dmi;
 
-	if (copy_from_user(&tmp, user, sizeof(tmp)))
+	if (copy_from_user(&tmp, user, sizeof(tmp) - sizeof(tmp.data)))
 		return -EFAULT;
 
-	if (tmp.data_size < sizeof(tmp))
+	if (tmp.data_size < (sizeof(tmp) - sizeof(tmp.data)))
 		return -EINVAL;
 
 	dmi = vmalloc(tmp.data_size);
@@ -1474,8 +1475,18 @@ static long dm_ctl_ioctl(struct file *file, uint command, ulong u)
 	return (long)ctl_ioctl(command, (struct dm_ioctl __user *)u);
 }
 
+#ifdef CONFIG_COMPAT
+static long dm_compat_ctl_ioctl(struct file *file, uint command, ulong u)
+{
+	return (long)dm_ctl_ioctl(file, command, (ulong) compat_ptr(u));
+}
+#else
+#define dm_compat_ctl_ioctl NULL
+#endif
+
 static const struct file_operations _ctl_fops = {
 	.unlocked_ioctl	 = dm_ctl_ioctl,
+	.compat_ioctl = dm_compat_ctl_ioctl,
 	.owner	 = THIS_MODULE,
 };
 
-- 
cgit v1.2.3


From afb24528f9012e5c6361ca9a9128c7c089c1cc7c Mon Sep 17 00:00:00 2001
From: Paul Jimenez <pj@place.org>
Date: Fri, 8 Feb 2008 02:09:59 +0000
Subject: dm: table use list_for_each

This patch is some minor janitorish cleanup, using some macros
from linux/list.h (already #included via dm.h) to improve
readability.

Signed-off-by: Paul Jimenez <pj@place.org>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-table.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 47818d8249cb..d2eb3c593682 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -287,9 +287,8 @@ static void free_devices(struct list_head *devices)
 {
 	struct list_head *tmp, *next;
 
-	for (tmp = devices->next; tmp != devices; tmp = next) {
+	list_for_each_safe(tmp, next, devices) {
 		struct dm_dev *dd = list_entry(tmp, struct dm_dev, list);
-		next = tmp->next;
 		kfree(dd);
 	}
 }
@@ -993,12 +992,11 @@ int dm_table_resume_targets(struct dm_table *t)
 
 int dm_table_any_congested(struct dm_table *t, int bdi_bits)
 {
-	struct list_head *d, *devices;
+	struct dm_dev *dd;
+	struct list_head *devices = dm_table_get_devices(t);
 	int r = 0;
 
-	devices = dm_table_get_devices(t);
-	for (d = devices->next; d != devices; d = d->next) {
-		struct dm_dev *dd = list_entry(d, struct dm_dev, list);
+	list_for_each_entry(dd, devices, list) {
 		struct request_queue *q = bdev_get_queue(dd->bdev);
 		r |= bdi_congested(&q->backing_dev_info, bdi_bits);
 	}
@@ -1008,10 +1006,10 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits)
 
 void dm_table_unplug_all(struct dm_table *t)
 {
-	struct list_head *d, *devices = dm_table_get_devices(t);
+	struct dm_dev *dd;
+	struct list_head *devices = dm_table_get_devices(t);
 
-	for (d = devices->next; d != devices; d = d->next) {
-		struct dm_dev *dd = list_entry(d, struct dm_dev, list);
+	list_for_each_entry(dd, devices, list) {
 		struct request_queue *q = bdev_get_queue(dd->bdev);
 
 		blk_unplug(q);
-- 
cgit v1.2.3


From 82d601dc076deb5f348cc3a70f57248bc976ae0c Mon Sep 17 00:00:00 2001
From: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Date: Fri, 8 Feb 2008 02:10:04 +0000
Subject: dm: table remove unused total

"total = 0" does nothing.

Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-table.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index d2eb3c593682..444a4fb64328 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -804,7 +804,7 @@ static int setup_indexes(struct dm_table *t)
 		return -ENOMEM;
 
 	/* set up internal nodes, bottom-up */
-	for (i = t->depth - 2, total = 0; i >= 0; i--) {
+	for (i = t->depth - 2; i >= 0; i--) {
 		t->index[i] = indexes;
 		indexes += (KEYS_PER_NODE * t->counts[i]);
 		setup_btree_index(i, t);
-- 
cgit v1.2.3


From 8defd83084c3ce46d314c038f7c0f0ed7156d6f8 Mon Sep 17 00:00:00 2001
From: "Robert P. J. Day" <rpjday@crashcourse.ca>
Date: Fri, 8 Feb 2008 02:10:06 +0000
Subject: dm snapshot: use rounddown_pow_of_two

Since the source file already includes the log2.h header file, it
seems pointless to re-invent the necessary routine.

Signed-off-by: Robert P. J. Day <rpjday@crashcourse.ca>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-snap.c | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index cee16fadd9ee..fad84654b045 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -333,16 +333,6 @@ static int calc_max_buckets(void)
 	return mem;
 }
 
-/*
- * Rounds a number down to a power of 2.
- */
-static uint32_t round_down(uint32_t n)
-{
-	while (n & (n - 1))
-		n &= (n - 1);
-	return n;
-}
-
 /*
  * Allocate room for a suitable hash table.
  */
@@ -361,8 +351,7 @@ static int init_hash_tables(struct dm_snapshot *s)
 	hash_size = min(origin_dev_size, cow_dev_size) >> s->chunk_shift;
 	hash_size = min(hash_size, max_buckets);
 
-	/* Round it down to a power of 2 */
-	hash_size = round_down(hash_size);
+	hash_size = rounddown_pow_of_two(hash_size);
 	if (init_exception_table(&s->complete, hash_size))
 		return -ENOMEM;
 
-- 
cgit v1.2.3


From e61290a4a23c3f85f883f0c8cc7c967501f82a57 Mon Sep 17 00:00:00 2001
From: Daniel Walker <dwalker@mvista.com>
Date: Fri, 8 Feb 2008 02:10:08 +0000
Subject: dm: convert suspend_lock semaphore to mutex

Replace semaphore with mutex.

Signed-off-by: Daniel Walker <dwalker@mvista.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 5f0f559d3b92..d16bb5b80789 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -73,7 +73,7 @@ union map_info *dm_get_mapinfo(struct bio *bio)
 
 struct mapped_device {
 	struct rw_semaphore io_lock;
-	struct semaphore suspend_lock;
+	struct mutex suspend_lock;
 	spinlock_t pushback_lock;
 	rwlock_t map_lock;
 	atomic_t holders;
@@ -994,7 +994,7 @@ static struct mapped_device *alloc_dev(int minor)
 
 	memset(md, 0, sizeof(*md));
 	init_rwsem(&md->io_lock);
-	init_MUTEX(&md->suspend_lock);
+	mutex_init(&md->suspend_lock);
 	spin_lock_init(&md->pushback_lock);
 	rwlock_init(&md->map_lock);
 	atomic_set(&md->holders, 1);
@@ -1282,7 +1282,7 @@ int dm_swap_table(struct mapped_device *md, struct dm_table *table)
 {
 	int r = -EINVAL;
 
-	down(&md->suspend_lock);
+	mutex_lock(&md->suspend_lock);
 
 	/* device must be suspended */
 	if (!dm_suspended(md))
@@ -1297,7 +1297,7 @@ int dm_swap_table(struct mapped_device *md, struct dm_table *table)
 	r = __bind(md, table);
 
 out:
-	up(&md->suspend_lock);
+	mutex_unlock(&md->suspend_lock);
 	return r;
 }
 
@@ -1353,7 +1353,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 	int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
 	int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;
 
-	down(&md->suspend_lock);
+	mutex_lock(&md->suspend_lock);
 
 	if (dm_suspended(md))
 		goto out_unlock;
@@ -1475,7 +1475,7 @@ out:
 	dm_table_put(map);
 
 out_unlock:
-	up(&md->suspend_lock);
+	mutex_unlock(&md->suspend_lock);
 	return r;
 }
 
@@ -1485,7 +1485,7 @@ int dm_resume(struct mapped_device *md)
 	struct bio *def;
 	struct dm_table *map = NULL;
 
-	down(&md->suspend_lock);
+	mutex_lock(&md->suspend_lock);
 	if (!dm_suspended(md))
 		goto out;
 
@@ -1521,7 +1521,7 @@ int dm_resume(struct mapped_device *md)
 
 out:
 	dm_table_put(map);
-	up(&md->suspend_lock);
+	mutex_unlock(&md->suspend_lock);
 
 	return r;
 }
-- 
cgit v1.2.3


From e48b9db251122b88783844b1d2d69c6780f898ff Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Fri, 8 Feb 2008 02:10:11 +0000
Subject: dm snapshot: use uninitialized_var

drivers/md/dm-exception-store.c: In function 'persistent_read_metadata':
drivers/md/dm-exception-store.c:452: warning: 'new_snapshot' may be used uninitialized in this function

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-exception-store.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index 8fe81e1807e0..5bbce29f143a 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -449,7 +449,7 @@ static void persistent_destroy(struct exception_store *store)
 
 static int persistent_read_metadata(struct exception_store *store)
 {
-	int r, new_snapshot;
+	int r, uninitialized_var(new_snapshot);
 	struct pstore *ps = get_info(store);
 
 	/*
-- 
cgit v1.2.3


From 69a2ce72a4efe0653479a5d69fc86b5726e83219 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Fri, 8 Feb 2008 02:10:14 +0000
Subject: dm: table use uninitialized_var

drivers/md/dm-table.c: In function 'dm_get_device':
drivers/md/dm-table.c:478: warning: 'dev' may be used uninitialized in this function

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-table.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 444a4fb64328..f16062982383 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -475,7 +475,7 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti,
 			      int mode, struct dm_dev **result)
 {
 	int r;
-	dev_t dev;
+	dev_t uninitialized_var(dev);
 	struct dm_dev *dd;
 	unsigned int major, minor;
 
-- 
cgit v1.2.3


From a26ffd4aa99d6ace82852930edf09e450cc7dc8d Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Fri, 8 Feb 2008 02:10:16 +0000
Subject: dm ioctl: use uninitialized_var

drivers/md/dm-ioctl.c:1405: warning: 'param' may be used uninitialized in this function

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 9c491397a51d..b262c0042de3 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1402,7 +1402,7 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
 {
 	int r = 0;
 	unsigned int cmd;
-	struct dm_ioctl *param;
+	struct dm_ioctl *uninitialized_var(param);
 	ioctl_fn fn = NULL;
 	size_t param_size;
 
-- 
cgit v1.2.3


From 6ed7ade89657e71da3afa7cb13ad25570a95dd9d Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Fri, 8 Feb 2008 02:10:19 +0000
Subject: dm: tidy alloc_dev labels

Tidy labels in alloc_dev to make later patches clearer.

No functional change in this patch.

Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index d16bb5b80789..52427e15189b 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -982,7 +982,7 @@ static struct mapped_device *alloc_dev(int minor)
 	}
 
 	if (!try_module_get(THIS_MODULE))
-		goto bad0;
+		goto bad_module_get;
 
 	/* get a minor number for the dev */
 	if (minor == DM_ANY_MINOR)
@@ -990,7 +990,7 @@ static struct mapped_device *alloc_dev(int minor)
 	else
 		r = specific_minor(md, minor);
 	if (r < 0)
-		goto bad1;
+		goto bad_minor;
 
 	memset(md, 0, sizeof(*md));
 	init_rwsem(&md->io_lock);
@@ -1006,7 +1006,7 @@ static struct mapped_device *alloc_dev(int minor)
 
 	md->queue = blk_alloc_queue(GFP_KERNEL);
 	if (!md->queue)
-		goto bad1_free_minor;
+		goto bad_queue;
 
 	md->queue->queuedata = md;
 	md->queue->backing_dev_info.congested_fn = dm_any_congested;
@@ -1017,11 +1017,11 @@ static struct mapped_device *alloc_dev(int minor)
 
 	md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache);
 	if (!md->io_pool)
-		goto bad2;
+		goto bad_io_pool;
 
 	md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache);
 	if (!md->tio_pool)
-		goto bad3;
+		goto bad_tio_pool;
 
 	md->bs = bioset_create(16, 16);
 	if (!md->bs)
@@ -1029,7 +1029,7 @@ static struct mapped_device *alloc_dev(int minor)
 
 	md->disk = alloc_disk(1);
 	if (!md->disk)
-		goto bad4;
+		goto bad_disk;
 
 	atomic_set(&md->pending, 0);
 	init_waitqueue_head(&md->wait);
@@ -1053,19 +1053,19 @@ static struct mapped_device *alloc_dev(int minor)
 
 	return md;
 
- bad4:
+bad_disk:
 	bioset_free(md->bs);
- bad_no_bioset:
+bad_no_bioset:
 	mempool_destroy(md->tio_pool);
- bad3:
+bad_tio_pool:
 	mempool_destroy(md->io_pool);
- bad2:
+bad_io_pool:
 	blk_cleanup_queue(md->queue);
- bad1_free_minor:
+bad_queue:
 	free_minor(minor);
- bad1:
+bad_minor:
 	module_put(THIS_MODULE);
- bad0:
+bad_module_get:
 	kfree(md);
 	return NULL;
 }
-- 
cgit v1.2.3


From 6d6f10df890df8be69edd4db32dc8ce09f311bb8 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Fri, 8 Feb 2008 02:10:22 +0000
Subject: dm: refactor deferred bio_list processing

Refactor deferred bio_list processing.

 - use separate _merge_pushback_list function
 - move deferred bio list pick up to flush function
 - use bio_list_pop instead of bio_list_get
 - simplify noflush flag use

No real functional change in this patch.

Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm.c | 67 +++++++++++++++++++++++++--------------------------------
 1 file changed, 29 insertions(+), 38 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 52427e15189b..c1ad7d77dbcd 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1262,19 +1262,27 @@ EXPORT_SYMBOL_GPL(dm_put);
 /*
  * Process the deferred bios
  */
-static void __flush_deferred_io(struct mapped_device *md, struct bio *c)
+static void __flush_deferred_io(struct mapped_device *md)
 {
-	struct bio *n;
+	struct bio *c;
 
-	while (c) {
-		n = c->bi_next;
-		c->bi_next = NULL;
+	while ((c = bio_list_pop(&md->deferred))) {
 		if (__split_bio(md, c))
 			bio_io_error(c);
-		c = n;
 	}
 }
 
+static void __merge_pushback_list(struct mapped_device *md)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&md->pushback_lock, flags);
+	clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
+	bio_list_merge_head(&md->deferred, &md->pushback);
+	bio_list_init(&md->pushback);
+	spin_unlock_irqrestore(&md->pushback_lock, flags);
+}
+
 /*
  * Swap in a new table (destroying old one).
  */
@@ -1346,9 +1354,7 @@ static void unlock_fs(struct mapped_device *md)
 int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 {
 	struct dm_table *map = NULL;
-	unsigned long flags;
 	DECLARE_WAITQUEUE(wait, current);
-	struct bio *def;
 	int r = -EINVAL;
 	int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
 	int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;
@@ -1378,16 +1384,16 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 			r = -ENOMEM;
 			goto flush_and_out;
 		}
-	}
 
-	/*
-	 * Flush I/O to the device.
-	 * noflush supersedes do_lockfs, because lock_fs() needs to flush I/Os.
-	 */
-	if (do_lockfs && !noflush) {
-		r = lock_fs(md);
-		if (r)
-			goto out;
+		/*
+		 * Flush I/O to the device. noflush supersedes do_lockfs,
+		 * because lock_fs() needs to flush I/Os.
+		 */
+		if (do_lockfs) {
+			r = lock_fs(md);
+			if (r)
+				goto out;
+		}
 	}
 
 	/*
@@ -1421,20 +1427,14 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 	down_write(&md->io_lock);
 	remove_wait_queue(&md->wait, &wait);
 
-	if (noflush) {
-		spin_lock_irqsave(&md->pushback_lock, flags);
-		clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
-		bio_list_merge_head(&md->deferred, &md->pushback);
-		bio_list_init(&md->pushback);
-		spin_unlock_irqrestore(&md->pushback_lock, flags);
-	}
+	if (noflush)
+		__merge_pushback_list(md);
 
 	/* were we interrupted ? */
 	r = -EINTR;
 	if (atomic_read(&md->pending)) {
 		clear_bit(DMF_BLOCK_IO, &md->flags);
-		def = bio_list_get(&md->deferred);
-		__flush_deferred_io(md, def);
+		__flush_deferred_io(md);
 		up_write(&md->io_lock);
 		unlock_fs(md);
 		goto out; /* pushback list is already flushed, so skip flush */
@@ -1454,15 +1454,8 @@ flush_and_out:
 		 * flush them before return.
 		 */
 		down_write(&md->io_lock);
-
-		spin_lock_irqsave(&md->pushback_lock, flags);
-		clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
-		bio_list_merge_head(&md->deferred, &md->pushback);
-		bio_list_init(&md->pushback);
-		spin_unlock_irqrestore(&md->pushback_lock, flags);
-
-		def = bio_list_get(&md->deferred);
-		__flush_deferred_io(md, def);
+		__merge_pushback_list(md);
+		__flush_deferred_io(md);
 		up_write(&md->io_lock);
 	}
 
@@ -1482,7 +1475,6 @@ out_unlock:
 int dm_resume(struct mapped_device *md)
 {
 	int r = -EINVAL;
-	struct bio *def;
 	struct dm_table *map = NULL;
 
 	mutex_lock(&md->suspend_lock);
@@ -1500,8 +1492,7 @@ int dm_resume(struct mapped_device *md)
 	down_write(&md->io_lock);
 	clear_bit(DMF_BLOCK_IO, &md->flags);
 
-	def = bio_list_get(&md->deferred);
-	__flush_deferred_io(md, def);
+	__flush_deferred_io(md);
 	up_write(&md->io_lock);
 
 	unlock_fs(md);
-- 
cgit v1.2.3


From 73d410c0137f63c6597e9763c81e5f4d015e9940 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Fri, 8 Feb 2008 02:10:25 +0000
Subject: dm: tidy dm_suspend

Tidy the dm_suspend function:

 - change return value logic in dm_suspend
 - use atomic_read only once
 - move DMF_BLOCK_IO clearing into one place (__flush_deferred_io)

Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index c1ad7d77dbcd..5191954a18b2 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1270,6 +1270,8 @@ static void __flush_deferred_io(struct mapped_device *md)
 		if (__split_bio(md, c))
 			bio_io_error(c);
 	}
+
+	clear_bit(DMF_BLOCK_IO, &md->flags);
 }
 
 static void __merge_pushback_list(struct mapped_device *md)
@@ -1355,14 +1357,16 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 {
 	struct dm_table *map = NULL;
 	DECLARE_WAITQUEUE(wait, current);
-	int r = -EINVAL;
+	int pending, r = 0;
 	int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
 	int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;
 
 	mutex_lock(&md->suspend_lock);
 
-	if (dm_suspended(md))
+	if (dm_suspended(md)) {
+		r = -EINVAL;
 		goto out_unlock;
+	}
 
 	map = dm_get_table(md);
 
@@ -1417,7 +1421,8 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 		set_current_state(TASK_INTERRUPTIBLE);
 
 		smp_mb();
-		if (!atomic_read(&md->pending) || signal_pending(current))
+		pending = atomic_read(&md->pending);
+		if (!pending || signal_pending(current))
 			break;
 
 		io_schedule();
@@ -1431,12 +1436,12 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 		__merge_pushback_list(md);
 
 	/* were we interrupted ? */
-	r = -EINTR;
-	if (atomic_read(&md->pending)) {
-		clear_bit(DMF_BLOCK_IO, &md->flags);
+	if (pending) {
 		__flush_deferred_io(md);
 		up_write(&md->io_lock);
+
 		unlock_fs(md);
+		r = -EINTR;
 		goto out; /* pushback list is already flushed, so skip flush */
 	}
 	up_write(&md->io_lock);
@@ -1445,8 +1450,6 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 
 	set_bit(DMF_SUSPENDED, &md->flags);
 
-	r = 0;
-
 flush_and_out:
 	if (r && noflush) {
 		/*
@@ -1490,8 +1493,6 @@ int dm_resume(struct mapped_device *md)
 		goto out;
 
 	down_write(&md->io_lock);
-	clear_bit(DMF_BLOCK_IO, &md->flags);
-
 	__flush_deferred_io(md);
 	up_write(&md->io_lock);
 
-- 
cgit v1.2.3


From 94d6351e147231b2c5a9512d69693ee8ac0c204d Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Fri, 8 Feb 2008 02:10:27 +0000
Subject: dm: split dm_suspend io_lock hold into two

Change io_lock handling to allow the flush to be processed in a separate thread.

Because we have DMF_BLOCK_IO already set, any possible
new ios are queued in dm_requests now.

If the preceding wait was interrupted, more ios may have been queued
(we released io_lock for a while), but this is safe.

Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 5191954a18b2..11f422ecfda0 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1434,9 +1434,11 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 
 	if (noflush)
 		__merge_pushback_list(md);
+	up_write(&md->io_lock);
 
 	/* were we interrupted ? */
 	if (pending) {
+		down_write(&md->io_lock);
 		__flush_deferred_io(md);
 		up_write(&md->io_lock);
 
@@ -1444,7 +1446,6 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 		r = -EINTR;
 		goto out; /* pushback list is already flushed, so skip flush */
 	}
-	up_write(&md->io_lock);
 
 	dm_table_postsuspend_targets(map);
 
-- 
cgit v1.2.3


From 46125c1c90882e17f856f1ba30440efea9135e80 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Fri, 8 Feb 2008 02:10:30 +0000
Subject: dm: refactor dm_suspend completion wait

Move the completion wait loop into a separate function, dm_wait_for_completion.

Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm.c | 43 +++++++++++++++++++++++++++----------------
 1 file changed, 27 insertions(+), 16 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 11f422ecfda0..9ca012e639a8 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1259,6 +1259,29 @@ void dm_put(struct mapped_device *md)
 }
 EXPORT_SYMBOL_GPL(dm_put);
 
+static int dm_wait_for_completion(struct mapped_device *md)
+{
+	int r = 0;
+
+	while (1) {
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		smp_mb();
+		if (!atomic_read(&md->pending))
+			break;
+
+		if (signal_pending(current)) {
+			r = -EINTR;
+			break;
+		}
+
+		io_schedule();
+	}
+	set_current_state(TASK_RUNNING);
+
+	return r;
+}
+
 /*
  * Process the deferred bios
  */
@@ -1357,7 +1380,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 {
 	struct dm_table *map = NULL;
 	DECLARE_WAITQUEUE(wait, current);
-	int pending, r = 0;
+	int r = 0;
 	int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
 	int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;
 
@@ -1414,20 +1437,9 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 		dm_table_unplug_all(map);
 
 	/*
-	 * Then we wait for the already mapped ios to
-	 * complete.
+	 * Wait for the already-mapped ios to complete.
 	 */
-	while (1) {
-		set_current_state(TASK_INTERRUPTIBLE);
-
-		smp_mb();
-		pending = atomic_read(&md->pending);
-		if (!pending || signal_pending(current))
-			break;
-
-		io_schedule();
-	}
-	set_current_state(TASK_RUNNING);
+	r = dm_wait_for_completion(md);
 
 	down_write(&md->io_lock);
 	remove_wait_queue(&md->wait, &wait);
@@ -1437,13 +1449,12 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 	up_write(&md->io_lock);
 
 	/* were we interrupted ? */
-	if (pending) {
+	if (r < 0) {
 		down_write(&md->io_lock);
 		__flush_deferred_io(md);
 		up_write(&md->io_lock);
 
 		unlock_fs(md);
-		r = -EINTR;
 		goto out; /* pushback list is already flushed, so skip flush */
 	}
 
-- 
cgit v1.2.3


From 0149e57fedcaca8905b6cca091fcb0915ff3e27d Mon Sep 17 00:00:00 2001
From: Alasdair G Kergon <agk@redhat.com>
Date: Fri, 8 Feb 2008 02:10:32 +0000
Subject: dm: targets no longer experimental

Drop the EXPERIMENTAL tag from well-established device-mapper targets, so
the newer ones stand out better.

Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/Kconfig | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 3fa7c77d9bd9..610af916891e 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -204,7 +204,7 @@ config BLK_DEV_DM
 
 config DM_DEBUG
 	boolean "Device mapper debugging support"
-	depends on BLK_DEV_DM && EXPERIMENTAL
+	depends on BLK_DEV_DM
 	---help---
 	  Enable this for messages that may help debug device-mapper problems.
 
@@ -212,7 +212,7 @@ config DM_DEBUG
 
 config DM_CRYPT
 	tristate "Crypt target support"
-	depends on BLK_DEV_DM && EXPERIMENTAL
+	depends on BLK_DEV_DM
 	select CRYPTO
 	select CRYPTO_CBC
 	---help---
@@ -230,34 +230,34 @@ config DM_CRYPT
 	  If unsure, say N.
 
 config DM_SNAPSHOT
-       tristate "Snapshot target (EXPERIMENTAL)"
-       depends on BLK_DEV_DM && EXPERIMENTAL
+       tristate "Snapshot target"
+       depends on BLK_DEV_DM
        ---help---
          Allow volume managers to take writable snapshots of a device.
 
 config DM_MIRROR
-       tristate "Mirror target (EXPERIMENTAL)"
-       depends on BLK_DEV_DM && EXPERIMENTAL
+       tristate "Mirror target"
+       depends on BLK_DEV_DM
        ---help---
          Allow volume managers to mirror logical volumes, also
          needed for live data migration tools such as 'pvmove'.
 
 config DM_ZERO
-	tristate "Zero target (EXPERIMENTAL)"
-	depends on BLK_DEV_DM && EXPERIMENTAL
+	tristate "Zero target"
+	depends on BLK_DEV_DM
 	---help---
 	  A target that discards writes, and returns all zeroes for
 	  reads.  Useful in some recovery situations.
 
 config DM_MULTIPATH
-	tristate "Multipath target (EXPERIMENTAL)"
-	depends on BLK_DEV_DM && EXPERIMENTAL
+	tristate "Multipath target"
+	depends on BLK_DEV_DM
 	---help---
 	  Allow volume managers to support multipath hardware.
 
 config DM_MULTIPATH_EMC
-	tristate "EMC CX/AX multipath support (EXPERIMENTAL)"
-	depends on DM_MULTIPATH && BLK_DEV_DM && EXPERIMENTAL
+	tristate "EMC CX/AX multipath support"
+	depends on DM_MULTIPATH && BLK_DEV_DM
 	---help---
 	  Multipath support for EMC CX/AX series hardware.
 
-- 
cgit v1.2.3


From 009cd09042fbd095e708b412ad7870fb421fa2f0 Mon Sep 17 00:00:00 2001
From: Alasdair G Kergon <agk@redhat.com>
Date: Fri, 8 Feb 2008 02:10:35 +0000
Subject: dm mpath: add missing static

Add the missing static qualifier to the kmultipathd workqueue pointer.

Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-mpath.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 24b2b1e32fae..e7ee59e655d5 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -106,7 +106,7 @@ typedef int (*action_fn) (struct pgpath *pgpath);
 
 static struct kmem_cache *_mpio_cache;
 
-struct workqueue_struct *kmultipathd;
+static struct workqueue_struct *kmultipathd;
 static void process_queued_ios(struct work_struct *work);
 static void trigger_event(struct work_struct *work);
 
-- 
cgit v1.2.3


From 53017030e2548dffbe481fb4ab6b587abbee6f8b Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Fri, 8 Feb 2008 02:10:38 +0000
Subject: dm crypt: move convert_context inside dm_crypt_io

Move convert_context inside dm_crypt_io.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-crypt.c | 41 +++++++++++++++++++++--------------------
 1 file changed, 21 insertions(+), 20 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 6b66ee46b87d..af8cd99daa5b 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -27,17 +27,6 @@
 #define DM_MSG_PREFIX "crypt"
 #define MESG_STR(x) x, sizeof(x)
 
-/*
- * per bio private data
- */
-struct dm_crypt_io {
-	struct dm_target *target;
-	struct bio *base_bio;
-	struct work_struct work;
-	atomic_t pending;
-	int error;
-};
-
 /*
  * context holding the current state of a multi-part conversion
  */
@@ -52,6 +41,20 @@ struct convert_context {
 	int write;
 };
 
+/*
+ * per bio private data
+ */
+struct dm_crypt_io {
+	struct dm_target *target;
+	struct bio *base_bio;
+	struct work_struct work;
+
+	struct convert_context ctx;
+
+	atomic_t pending;
+	int error;
+};
+
 struct crypt_config;
 
 struct crypt_iv_operations {
@@ -579,13 +582,12 @@ static void process_write(struct dm_crypt_io *io)
 	struct crypt_config *cc = io->target->private;
 	struct bio *base_bio = io->base_bio;
 	struct bio *clone;
-	struct convert_context ctx;
 	unsigned remaining = base_bio->bi_size;
 	sector_t sector = base_bio->bi_sector - io->target->begin;
 
 	atomic_inc(&io->pending);
 
-	crypt_convert_init(cc, &ctx, NULL, base_bio, sector, 1);
+	crypt_convert_init(cc, &io->ctx, NULL, base_bio, sector, 1);
 
 	/*
 	 * The allocated buffers can be smaller than the whole bio,
@@ -598,10 +600,10 @@ static void process_write(struct dm_crypt_io *io)
 			return;
 		}
 
-		ctx.bio_out = clone;
-		ctx.idx_out = 0;
+		io->ctx.bio_out = clone;
+		io->ctx.idx_out = 0;
 
-		if (unlikely(crypt_convert(cc, &ctx) < 0)) {
+		if (unlikely(crypt_convert(cc, &io->ctx) < 0)) {
 			crypt_free_buffer_pages(cc, clone);
 			bio_put(clone);
 			crypt_dec_pending(io, -EIO);
@@ -609,7 +611,7 @@ static void process_write(struct dm_crypt_io *io)
 		}
 
 		/* crypt_convert should have filled the clone bio */
-		BUG_ON(ctx.idx_out < clone->bi_vcnt);
+		BUG_ON(io->ctx.idx_out < clone->bi_vcnt);
 
 		clone->bi_sector = cc->start + sector;
 		remaining -= clone->bi_size;
@@ -634,12 +636,11 @@ static void process_write(struct dm_crypt_io *io)
 static void process_read_endio(struct dm_crypt_io *io)
 {
 	struct crypt_config *cc = io->target->private;
-	struct convert_context ctx;
 
-	crypt_convert_init(cc, &ctx, io->base_bio, io->base_bio,
+	crypt_convert_init(cc, &io->ctx, io->base_bio, io->base_bio,
 			   io->base_bio->bi_sector - io->target->begin, 0);
 
-	crypt_dec_pending(io, crypt_convert(cc, &ctx));
+	crypt_dec_pending(io, crypt_convert(cc, &io->ctx));
 }
 
 static void kcryptd_do_work(struct work_struct *work)
-- 
cgit v1.2.3


From fcd369daa36d547607dbedd0b41099d6dfc1d1c7 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Fri, 8 Feb 2008 02:10:41 +0000
Subject: dm crypt: remove unnecessary crypt_context write parm

Remove write attribute from convert_context and use bio flag instead.

Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-crypt.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index af8cd99daa5b..862ce9f6faac 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -38,7 +38,6 @@ struct convert_context {
 	unsigned int idx_in;
 	unsigned int idx_out;
 	sector_t sector;
-	int write;
 };
 
 /*
@@ -327,7 +326,7 @@ crypt_convert_scatterlist(struct crypt_config *cc, struct scatterlist *out,
 static void crypt_convert_init(struct crypt_config *cc,
 			       struct convert_context *ctx,
 			       struct bio *bio_out, struct bio *bio_in,
-			       sector_t sector, int write)
+			       sector_t sector)
 {
 	ctx->bio_in = bio_in;
 	ctx->bio_out = bio_out;
@@ -336,7 +335,6 @@ static void crypt_convert_init(struct crypt_config *cc,
 	ctx->idx_in = bio_in ? bio_in->bi_idx : 0;
 	ctx->idx_out = bio_out ? bio_out->bi_idx : 0;
 	ctx->sector = sector + cc->iv_offset;
-	ctx->write = write;
 }
 
 /*
@@ -372,7 +370,7 @@ static int crypt_convert(struct crypt_config *cc,
 		}
 
 		r = crypt_convert_scatterlist(cc, &sg_out, &sg_in, sg_in.length,
-					      ctx->write, ctx->sector);
+			bio_data_dir(ctx->bio_in) == WRITE, ctx->sector);
 		if (r < 0)
 			break;
 
@@ -587,7 +585,7 @@ static void process_write(struct dm_crypt_io *io)
 
 	atomic_inc(&io->pending);
 
-	crypt_convert_init(cc, &io->ctx, NULL, base_bio, sector, 1);
+	crypt_convert_init(cc, &io->ctx, NULL, base_bio, sector);
 
 	/*
 	 * The allocated buffers can be smaller than the whole bio,
@@ -638,7 +636,7 @@ static void process_read_endio(struct dm_crypt_io *io)
 	struct crypt_config *cc = io->target->private;
 
 	crypt_convert_init(cc, &io->ctx, io->base_bio, io->base_bio,
-			   io->base_bio->bi_sector - io->target->begin, 0);
+			   io->base_bio->bi_sector - io->target->begin);
 
 	crypt_dec_pending(io, crypt_convert(cc, &io->ctx));
 }
-- 
cgit v1.2.3


From 5742fd77757894ebb5e441afbdac1fb666e782f7 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Fri, 8 Feb 2008 02:10:43 +0000
Subject: dm crypt: move error setting outside crypt_dec_pending

Move error code setting outside of crypt_dec_pending function.
Use -EIO if crypt_convert_scatterlist() fails.

Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-crypt.c | 36 +++++++++++++++++++++++++-----------
 1 file changed, 25 insertions(+), 11 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 862ce9f6faac..cc189a2bc533 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -456,18 +456,14 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone)
  * One of the bios was finished. Check for completion of
  * the whole request and correctly clean up the buffer.
  */
-static void crypt_dec_pending(struct dm_crypt_io *io, int error)
+static void crypt_dec_pending(struct dm_crypt_io *io)
 {
-	struct crypt_config *cc = (struct crypt_config *) io->target->private;
-
-	if (error < 0)
-		io->error = error;
+	struct crypt_config *cc = io->target->private;
 
 	if (!atomic_dec_and_test(&io->pending))
 		return;
 
 	bio_endio(io->base_bio, io->error);
-
 	mempool_free(io, cc->io_pool);
 }
 
@@ -530,7 +526,11 @@ static void crypt_endio(struct bio *clone, int error)
 
 out:
 	bio_put(clone);
-	crypt_dec_pending(io, error);
+
+	if (unlikely(error))
+		io->error = error;
+
+	crypt_dec_pending(io);
 }
 
 static void clone_init(struct dm_crypt_io *io, struct bio *clone)
@@ -560,7 +560,8 @@ static void process_read(struct dm_crypt_io *io)
 	 */
 	clone = bio_alloc_bioset(GFP_NOIO, bio_segments(base_bio), cc->bs);
 	if (unlikely(!clone)) {
-		crypt_dec_pending(io, -ENOMEM);
+		io->error = -ENOMEM;
+		crypt_dec_pending(io);
 		return;
 	}
 
@@ -594,7 +595,8 @@ static void process_write(struct dm_crypt_io *io)
 	while (remaining) {
 		clone = crypt_alloc_buffer(io, remaining);
 		if (unlikely(!clone)) {
-			crypt_dec_pending(io, -ENOMEM);
+			io->error = -ENOMEM;
+			crypt_dec_pending(io);
 			return;
 		}
 
@@ -604,7 +606,8 @@ static void process_write(struct dm_crypt_io *io)
 		if (unlikely(crypt_convert(cc, &io->ctx) < 0)) {
 			crypt_free_buffer_pages(cc, clone);
 			bio_put(clone);
-			crypt_dec_pending(io, -EIO);
+			io->error = -EIO;
+			crypt_dec_pending(io);
 			return;
 		}
 
@@ -631,14 +634,25 @@ static void process_write(struct dm_crypt_io *io)
 	}
 }
 
+static void crypt_read_done(struct dm_crypt_io *io, int error)
+{
+	if (unlikely(error < 0))
+		io->error = -EIO;
+
+	crypt_dec_pending(io);
+}
+
 static void process_read_endio(struct dm_crypt_io *io)
 {
 	struct crypt_config *cc = io->target->private;
+	int r = 0;
 
 	crypt_convert_init(cc, &io->ctx, io->base_bio, io->base_bio,
 			   io->base_bio->bi_sector - io->target->begin);
 
-	crypt_dec_pending(io, crypt_convert(cc, &io->ctx));
+	r = crypt_convert(cc, &io->ctx);
+
+	crypt_read_done(io, r);
 }
 
 static void kcryptd_do_work(struct work_struct *work)
-- 
cgit v1.2.3


From ee7a491e62214bfd56c97c1fef3672c09e2a700d Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Fri, 8 Feb 2008 02:10:46 +0000
Subject: dm crypt: tidy crypt_endio

Simplify crypt_endio function.

Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-crypt.c | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index cc189a2bc533..278659975d72 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -504,7 +504,7 @@ static void crypt_endio(struct bio *clone, int error)
 {
 	struct dm_crypt_io *io = clone->bi_private;
 	struct crypt_config *cc = io->target->private;
-	unsigned read_io = bio_data_dir(clone) == READ;
+	unsigned rw = bio_data_dir(clone);
 
 	if (unlikely(!bio_flagged(clone, BIO_UPTODATE) && !error))
 		error = -EIO;
@@ -512,20 +512,15 @@ static void crypt_endio(struct bio *clone, int error)
 	/*
 	 * free the processed pages
 	 */
-	if (!read_io) {
+	if (rw == WRITE)
 		crypt_free_buffer_pages(cc, clone);
-		goto out;
-	}
-
-	if (unlikely(error))
-		goto out;
 
 	bio_put(clone);
-	kcryptd_queue_crypt(io);
-	return;
 
-out:
-	bio_put(clone);
+	if (rw == READ && !error) {
+		kcryptd_queue_crypt(io);
+		return;
+	}
 
 	if (unlikely(error))
 		io->error = error;
-- 
cgit v1.2.3


From 4e4eef64e246694a6302c3ee95ac9b60c40f877e Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Fri, 8 Feb 2008 02:10:49 +0000
Subject: dm crypt: adjust io processing functions

Rename functions to follow calling convention.
Prepare write io error processing function skeleton.

Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-crypt.c | 40 +++++++++++++++++++++++++---------------
 1 file changed, 25 insertions(+), 15 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 278659975d72..5b83204b6594 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1,7 +1,7 @@
 /*
  * Copyright (C) 2003 Christophe Saout <christophe@saout.de>
  * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org>
- * Copyright (C) 2006 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2006-2007 Red Hat, Inc. All rights reserved.
  *
  * This file is released under the GPL.
  */
@@ -481,14 +481,14 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
  * starved by new requests which can block in the first stages due
  * to memory allocation.
  */
-static void kcryptd_do_work(struct work_struct *work);
-static void kcryptd_do_crypt(struct work_struct *work);
+static void kcryptd_io(struct work_struct *work);
+static void kcryptd_crypt(struct work_struct *work);
 
 static void kcryptd_queue_io(struct dm_crypt_io *io)
 {
 	struct crypt_config *cc = io->target->private;
 
-	INIT_WORK(&io->work, kcryptd_do_work);
+	INIT_WORK(&io->work, kcryptd_io);
 	queue_work(cc->io_queue, &io->work);
 }
 
@@ -496,7 +496,7 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io)
 {
 	struct crypt_config *cc = io->target->private;
 
-	INIT_WORK(&io->work, kcryptd_do_crypt);
+	INIT_WORK(&io->work, kcryptd_crypt);
 	queue_work(cc->crypt_queue, &io->work);
 }
 
@@ -539,7 +539,7 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone)
 	clone->bi_destructor = dm_crypt_bio_destructor;
 }
 
-static void process_read(struct dm_crypt_io *io)
+static void kcryptd_io_read(struct dm_crypt_io *io)
 {
 	struct crypt_config *cc = io->target->private;
 	struct bio *base_bio = io->base_bio;
@@ -571,7 +571,15 @@ static void process_read(struct dm_crypt_io *io)
 	generic_make_request(clone);
 }
 
-static void process_write(struct dm_crypt_io *io)
+static void kcryptd_io_write(struct dm_crypt_io *io)
+{
+}
+
+static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int error)
+{
+}
+
+static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
 {
 	struct crypt_config *cc = io->target->private;
 	struct bio *base_bio = io->base_bio;
@@ -629,7 +637,7 @@ static void process_write(struct dm_crypt_io *io)
 	}
 }
 
-static void crypt_read_done(struct dm_crypt_io *io, int error)
+static void kcryptd_crypt_read_done(struct dm_crypt_io *io, int error)
 {
 	if (unlikely(error < 0))
 		io->error = -EIO;
@@ -637,7 +645,7 @@ static void crypt_read_done(struct dm_crypt_io *io, int error)
 	crypt_dec_pending(io);
 }
 
-static void process_read_endio(struct dm_crypt_io *io)
+static void kcryptd_crypt_read_convert(struct dm_crypt_io *io)
 {
 	struct crypt_config *cc = io->target->private;
 	int r = 0;
@@ -647,25 +655,27 @@ static void process_read_endio(struct dm_crypt_io *io)
 
 	r = crypt_convert(cc, &io->ctx);
 
-	crypt_read_done(io, r);
+	kcryptd_crypt_read_done(io, r);
 }
 
-static void kcryptd_do_work(struct work_struct *work)
+static void kcryptd_io(struct work_struct *work)
 {
 	struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work);
 
 	if (bio_data_dir(io->base_bio) == READ)
-		process_read(io);
+		kcryptd_io_read(io);
+	else
+		kcryptd_io_write(io);
 }
 
-static void kcryptd_do_crypt(struct work_struct *work)
+static void kcryptd_crypt(struct work_struct *work)
 {
 	struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work);
 
 	if (bio_data_dir(io->base_bio) == READ)
-		process_read_endio(io);
+		kcryptd_crypt_read_convert(io);
 	else
-		process_write(io);
+		kcryptd_crypt_write_convert(io);
 }
 
 /*
-- 
cgit v1.2.3


From 395b167ca0c559aa975d8bbc46a3d10edd6e17d0 Mon Sep 17 00:00:00 2001
From: Alasdair G Kergon <agk@redhat.com>
Date: Fri, 8 Feb 2008 02:10:52 +0000
Subject: dm crypt: move queue functions

Reorder kcryptd functions for clarity.

Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-crypt.c | 54 +++++++++++++++++++++++++--------------------------
 1 file changed, 26 insertions(+), 28 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 5b83204b6594..ccc2fe19db86 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -110,6 +110,7 @@ struct crypt_config {
 static struct kmem_cache *_crypt_io_pool;
 
 static void clone_init(struct dm_crypt_io *, struct bio *);
+static void kcryptd_queue_crypt(struct dm_crypt_io *io);
 
 /*
  * Different IV generation algorithms:
@@ -481,25 +482,6 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
  * starved by new requests which can block in the first stages due
  * to memory allocation.
  */
-static void kcryptd_io(struct work_struct *work);
-static void kcryptd_crypt(struct work_struct *work);
-
-static void kcryptd_queue_io(struct dm_crypt_io *io)
-{
-	struct crypt_config *cc = io->target->private;
-
-	INIT_WORK(&io->work, kcryptd_io);
-	queue_work(cc->io_queue, &io->work);
-}
-
-static void kcryptd_queue_crypt(struct dm_crypt_io *io)
-{
-	struct crypt_config *cc = io->target->private;
-
-	INIT_WORK(&io->work, kcryptd_crypt);
-	queue_work(cc->crypt_queue, &io->work);
-}
-
 static void crypt_endio(struct bio *clone, int error)
 {
 	struct dm_crypt_io *io = clone->bi_private;
@@ -575,6 +557,24 @@ static void kcryptd_io_write(struct dm_crypt_io *io)
 {
 }
 
+static void kcryptd_io(struct work_struct *work)
+{
+	struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work);
+
+	if (bio_data_dir(io->base_bio) == READ)
+		kcryptd_io_read(io);
+	else
+		kcryptd_io_write(io);
+}
+
+static void kcryptd_queue_io(struct dm_crypt_io *io)
+{
+	struct crypt_config *cc = io->target->private;
+
+	INIT_WORK(&io->work, kcryptd_io);
+	queue_work(cc->io_queue, &io->work);
+}
+
 static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int error)
 {
 }
@@ -658,24 +658,22 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io)
 	kcryptd_crypt_read_done(io, r);
 }
 
-static void kcryptd_io(struct work_struct *work)
+static void kcryptd_crypt(struct work_struct *work)
 {
 	struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work);
 
 	if (bio_data_dir(io->base_bio) == READ)
-		kcryptd_io_read(io);
+		kcryptd_crypt_read_convert(io);
 	else
-		kcryptd_io_write(io);
+		kcryptd_crypt_write_convert(io);
 }
 
-static void kcryptd_crypt(struct work_struct *work)
+static void kcryptd_queue_crypt(struct dm_crypt_io *io)
 {
-	struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work);
+	struct crypt_config *cc = io->target->private;
 
-	if (bio_data_dir(io->base_bio) == READ)
-		kcryptd_crypt_read_convert(io);
-	else
-		kcryptd_crypt_write_convert(io);
+	INIT_WORK(&io->work, kcryptd_crypt);
+	queue_work(cc->crypt_queue, &io->work);
 }
 
 /*
-- 
cgit v1.2.3


From 0c395b0f8dd7aee394df95b46963fc0f3401cf90 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Fri, 8 Feb 2008 02:10:54 +0000
Subject: dm crypt: store sector mapping in dm_crypt_io

Add sector into dm_crypt_io instead of using local variable.

Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-crypt.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index ccc2fe19db86..d3c48ad580d9 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -52,6 +52,7 @@ struct dm_crypt_io {
 
 	atomic_t pending;
 	int error;
+	sector_t sector;
 };
 
 struct crypt_config;
@@ -526,7 +527,6 @@ static void kcryptd_io_read(struct dm_crypt_io *io)
 	struct crypt_config *cc = io->target->private;
 	struct bio *base_bio = io->base_bio;
 	struct bio *clone;
-	sector_t sector = base_bio->bi_sector - io->target->begin;
 
 	atomic_inc(&io->pending);
 
@@ -546,7 +546,7 @@ static void kcryptd_io_read(struct dm_crypt_io *io)
 	clone->bi_idx = 0;
 	clone->bi_vcnt = bio_segments(base_bio);
 	clone->bi_size = base_bio->bi_size;
-	clone->bi_sector = cc->start + sector;
+	clone->bi_sector = cc->start + io->sector;
 	memcpy(clone->bi_io_vec, bio_iovec(base_bio),
 	       sizeof(struct bio_vec) * clone->bi_vcnt);
 
@@ -585,11 +585,10 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
 	struct bio *base_bio = io->base_bio;
 	struct bio *clone;
 	unsigned remaining = base_bio->bi_size;
-	sector_t sector = base_bio->bi_sector - io->target->begin;
 
 	atomic_inc(&io->pending);
 
-	crypt_convert_init(cc, &io->ctx, NULL, base_bio, sector);
+	crypt_convert_init(cc, &io->ctx, NULL, base_bio, io->sector);
 
 	/*
 	 * The allocated buffers can be smaller than the whole bio,
@@ -617,9 +616,9 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
 		/* crypt_convert should have filled the clone bio */
 		BUG_ON(io->ctx.idx_out < clone->bi_vcnt);
 
-		clone->bi_sector = cc->start + sector;
+		clone->bi_sector = cc->start + io->sector;
 		remaining -= clone->bi_size;
-		sector += bio_sectors(clone);
+		io->sector += bio_sectors(clone);
 
 		/* Grab another reference to the io struct
 		 * before we kick off the request */
@@ -651,7 +650,7 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io)
 	int r = 0;
 
 	crypt_convert_init(cc, &io->ctx, io->base_bio, io->base_bio,
-			   io->base_bio->bi_sector - io->target->begin);
+			   io->sector);
 
 	r = crypt_convert(cc, &io->ctx);
 
@@ -974,6 +973,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
 	io = mempool_alloc(cc->io_pool, GFP_NOIO);
 	io->target = ti;
 	io->base_bio = bio;
+	io->sector = bio->bi_sector - ti->begin;
 	io->error = 0;
 	atomic_set(&io->pending, 0);
 
-- 
cgit v1.2.3


From dec1cedf9d4eabe43f3c7d6af095eff40c139a89 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Fri, 8 Feb 2008 02:10:57 +0000
Subject: dm crypt: abstract crypt_write_done

Process write request in separate function and queue
final bio through io workqueue.

Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-crypt.c | 41 +++++++++++++++++++++++++----------------
 1 file changed, 25 insertions(+), 16 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index d3c48ad580d9..4df7d2f782d8 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -577,18 +577,34 @@ static void kcryptd_queue_io(struct dm_crypt_io *io)
 
 static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int error)
 {
+	struct bio *clone = io->ctx.bio_out;
+	struct crypt_config *cc = io->target->private;
+
+	if (unlikely(error < 0)) {
+		crypt_free_buffer_pages(cc, clone);
+		bio_put(clone);
+		io->error = -EIO;
+		crypt_dec_pending(io);
+		return;
+	}
+
+	/* crypt_convert should have filled the clone bio */
+	BUG_ON(io->ctx.idx_out < clone->bi_vcnt);
+
+	clone->bi_sector = cc->start + io->sector;
+	io->sector += bio_sectors(clone);
 }
 
 static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
 {
 	struct crypt_config *cc = io->target->private;
-	struct bio *base_bio = io->base_bio;
 	struct bio *clone;
-	unsigned remaining = base_bio->bi_size;
+	unsigned remaining = io->base_bio->bi_size;
+	int r;
 
 	atomic_inc(&io->pending);
 
-	crypt_convert_init(cc, &io->ctx, NULL, base_bio, io->sector);
+	crypt_convert_init(cc, &io->ctx, NULL, io->base_bio, io->sector);
 
 	/*
 	 * The allocated buffers can be smaller than the whole bio,
@@ -605,20 +621,13 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
 		io->ctx.bio_out = clone;
 		io->ctx.idx_out = 0;
 
-		if (unlikely(crypt_convert(cc, &io->ctx) < 0)) {
-			crypt_free_buffer_pages(cc, clone);
-			bio_put(clone);
-			io->error = -EIO;
-			crypt_dec_pending(io);
-			return;
-		}
+		remaining -= clone->bi_size;
 
-		/* crypt_convert should have filled the clone bio */
-		BUG_ON(io->ctx.idx_out < clone->bi_vcnt);
+		r = crypt_convert(cc, &io->ctx);
 
-		clone->bi_sector = cc->start + io->sector;
-		remaining -= clone->bi_size;
-		io->sector += bio_sectors(clone);
+		kcryptd_crypt_write_io_submit(io, r);
+		if (unlikely(r < 0))
+			return;
 
 		/* Grab another reference to the io struct
 		 * before we kick off the request */
@@ -631,7 +640,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
 		 * may be gone already. */
 
 		/* out of memory -> run queues */
-		if (remaining)
+		if (unlikely(remaining))
 			congestion_wait(WRITE, HZ/100);
 	}
 }
-- 
cgit v1.2.3


From 84131db689ab86409315c15a3ea5daf732cb04e1 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Fri, 8 Feb 2008 02:10:59 +0000
Subject: dm crypt: introduce crypt_write_io_loop

Introduce kcryptd_crypt_write_convert_loop(), splitting the write
conversion loop out of kcryptd_crypt_write_convert().

Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-crypt.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 4df7d2f782d8..986283c5332f 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -595,17 +595,13 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int error)
 	io->sector += bio_sectors(clone);
 }
 
-static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
+static void kcryptd_crypt_write_convert_loop(struct dm_crypt_io *io)
 {
 	struct crypt_config *cc = io->target->private;
 	struct bio *clone;
 	unsigned remaining = io->base_bio->bi_size;
 	int r;
 
-	atomic_inc(&io->pending);
-
-	crypt_convert_init(cc, &io->ctx, NULL, io->base_bio, io->sector);
-
 	/*
 	 * The allocated buffers can be smaller than the whole bio,
 	 * so repeat the whole process until all the data can be handled.
@@ -645,6 +641,16 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
 	}
 }
 
+static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
+{
+	struct crypt_config *cc = io->target->private;
+
+	atomic_inc(&io->pending);
+
+	crypt_convert_init(cc, &io->ctx, NULL, io->base_bio, io->sector);
+	kcryptd_crypt_write_convert_loop(io);
+}
+
 static void kcryptd_crypt_read_done(struct dm_crypt_io *io, int error)
 {
 	if (unlikely(error < 0))
-- 
cgit v1.2.3


From 899c95d36c896f9fb7bc5f4f03b4abd86bda292c Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Fri, 8 Feb 2008 02:11:02 +0000
Subject: dm crypt: tidy io ref counting

Make io reference counting more obvious.

Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-crypt.c | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 986283c5332f..44e1aa30e3f6 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -584,7 +584,6 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int error)
 		crypt_free_buffer_pages(cc, clone);
 		bio_put(clone);
 		io->error = -EIO;
-		crypt_dec_pending(io);
 		return;
 	}
 
@@ -593,6 +592,9 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int error)
 
 	clone->bi_sector = cc->start + io->sector;
 	io->sector += bio_sectors(clone);
+
+	atomic_inc(&io->pending);
+	generic_make_request(clone);
 }
 
 static void kcryptd_crypt_write_convert_loop(struct dm_crypt_io *io)
@@ -610,7 +612,6 @@ static void kcryptd_crypt_write_convert_loop(struct dm_crypt_io *io)
 		clone = crypt_alloc_buffer(io, remaining);
 		if (unlikely(!clone)) {
 			io->error = -ENOMEM;
-			crypt_dec_pending(io);
 			return;
 		}
 
@@ -625,16 +626,6 @@ static void kcryptd_crypt_write_convert_loop(struct dm_crypt_io *io)
 		if (unlikely(r < 0))
 			return;
 
-		/* Grab another reference to the io struct
-		 * before we kick off the request */
-		if (remaining)
-			atomic_inc(&io->pending);
-
-		generic_make_request(clone);
-
-		/* Do not reference clone after this - it
-		 * may be gone already. */
-
 		/* out of memory -> run queues */
 		if (unlikely(remaining))
 			congestion_wait(WRITE, HZ/100);
@@ -645,10 +636,15 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
 {
 	struct crypt_config *cc = io->target->private;
 
+	/*
+	 * Prevent io from disappearing until this function completes.
+	 */
 	atomic_inc(&io->pending);
 
 	crypt_convert_init(cc, &io->ctx, NULL, io->base_bio, io->sector);
 	kcryptd_crypt_write_convert_loop(io);
+
+	crypt_dec_pending(io);
 }
 
 static void kcryptd_crypt_read_done(struct dm_crypt_io *io, int error)
-- 
cgit v1.2.3


From 01482b7671d014aa44f2efbc1153f4e3f48d7fb3 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Fri, 8 Feb 2008 02:11:04 +0000
Subject: dm crypt: extract scatterlist processing

dm-crypt: Use crypto ablkcipher interface

Move scatterlists to a separate struct dm_crypt_request and
pick out block processing from crypt_convert.

Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-crypt.c | 63 +++++++++++++++++++++++++++++++--------------------
 1 file changed, 39 insertions(+), 24 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 44e1aa30e3f6..2da9b9536afb 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -55,6 +55,11 @@ struct dm_crypt_io {
 	sector_t sector;
 };
 
+struct dm_crypt_request {
+	struct scatterlist sg_in;
+	struct scatterlist sg_out;
+};
+
 struct crypt_config;
 
 struct crypt_iv_operations {
@@ -339,6 +344,39 @@ static void crypt_convert_init(struct crypt_config *cc,
 	ctx->sector = sector + cc->iv_offset;
 }
 
+static int crypt_convert_block(struct crypt_config *cc,
+			       struct convert_context *ctx)
+{
+	struct bio_vec *bv_in = bio_iovec_idx(ctx->bio_in, ctx->idx_in);
+	struct bio_vec *bv_out = bio_iovec_idx(ctx->bio_out, ctx->idx_out);
+	struct dm_crypt_request dmreq;
+
+	sg_init_table(&dmreq.sg_in, 1);
+	sg_set_page(&dmreq.sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT,
+		    bv_in->bv_offset + ctx->offset_in);
+
+	sg_init_table(&dmreq.sg_out, 1);
+	sg_set_page(&dmreq.sg_out, bv_out->bv_page, 1 << SECTOR_SHIFT,
+		    bv_out->bv_offset + ctx->offset_out);
+
+	ctx->offset_in += 1 << SECTOR_SHIFT;
+	if (ctx->offset_in >= bv_in->bv_len) {
+		ctx->offset_in = 0;
+		ctx->idx_in++;
+	}
+
+	ctx->offset_out += 1 << SECTOR_SHIFT;
+	if (ctx->offset_out >= bv_out->bv_len) {
+		ctx->offset_out = 0;
+		ctx->idx_out++;
+	}
+
+	return crypt_convert_scatterlist(cc, &dmreq.sg_out, &dmreq.sg_in,
+					 dmreq.sg_in.length,
+					 bio_data_dir(ctx->bio_in) == WRITE,
+					 ctx->sector);
+}
+
 /*
  * Encrypt / decrypt data from one bio to another one (can be the same one)
  */
@@ -349,30 +387,7 @@ static int crypt_convert(struct crypt_config *cc,
 
 	while(ctx->idx_in < ctx->bio_in->bi_vcnt &&
 	      ctx->idx_out < ctx->bio_out->bi_vcnt) {
-		struct bio_vec *bv_in = bio_iovec_idx(ctx->bio_in, ctx->idx_in);
-		struct bio_vec *bv_out = bio_iovec_idx(ctx->bio_out, ctx->idx_out);
-		struct scatterlist sg_in, sg_out;
-
-		sg_init_table(&sg_in, 1);
-		sg_set_page(&sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT, bv_in->bv_offset + ctx->offset_in);
-
-		sg_init_table(&sg_out, 1);
-		sg_set_page(&sg_out, bv_out->bv_page, 1 << SECTOR_SHIFT, bv_out->bv_offset + ctx->offset_out);
-
-		ctx->offset_in += sg_in.length;
-		if (ctx->offset_in >= bv_in->bv_len) {
-			ctx->offset_in = 0;
-			ctx->idx_in++;
-		}
-
-		ctx->offset_out += sg_out.length;
-		if (ctx->offset_out >= bv_out->bv_len) {
-			ctx->offset_out = 0;
-			ctx->idx_out++;
-		}
-
-		r = crypt_convert_scatterlist(cc, &sg_out, &sg_in, sg_in.length,
-			bio_data_dir(ctx->bio_in) == WRITE, ctx->sector);
+		r = crypt_convert_block(cc, ctx);
 		if (r < 0)
 			break;
 
-- 
cgit v1.2.3


From ddd42edfd8ec44595b1501318512bc29a36f015f Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Fri, 8 Feb 2008 02:11:07 +0000
Subject: dm crypt: add async request mempool

dm-crypt: Use crypto ablkcipher interface

Introduce mempool for async crypto requests.

cc->req is used mainly during synchronous operations
(to prevent allocation and deallocation of the same object).

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-crypt.c | 45 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 43 insertions(+), 2 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 2da9b9536afb..79316580c780 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -80,10 +80,11 @@ struct crypt_config {
 	sector_t start;
 
 	/*
-	 * pool for per bio private data and
-	 * for encryption buffer pages
+	 * pool for per bio private data, crypto requests and
+	 * encryption requests/buffer pages
 	 */
 	mempool_t *io_pool;
+	mempool_t *req_pool;
 	mempool_t *page_pool;
 	struct bio_set *bs;
 
@@ -101,6 +102,22 @@ struct crypt_config {
 	sector_t iv_offset;
 	unsigned int iv_size;
 
+	/*
+	 * Layout of each crypto request:
+	 *
+	 *   struct ablkcipher_request
+	 *      context
+	 *      padding
+	 *   struct dm_crypt_request
+	 *      padding
+	 *   IV
+	 *
+	 * The padding is added so that dm_crypt_request and the IV are
+	 * correctly aligned.
+	 */
+	unsigned int dmreq_start;
+	struct ablkcipher_request *req;
+
 	char cipher[CRYPTO_MAX_ALG_NAME];
 	char chainmode[CRYPTO_MAX_ALG_NAME];
 	struct crypto_blkcipher *tfm;
@@ -377,6 +394,13 @@ static int crypt_convert_block(struct crypt_config *cc,
 					 ctx->sector);
 }
 
+static void crypt_alloc_req(struct crypt_config *cc,
+			    struct convert_context *ctx)
+{
+	if (!cc->req)
+		cc->req = mempool_alloc(cc->req_pool, GFP_NOIO);
+}
+
 /*
  * Encrypt / decrypt data from one bio to another one (can be the same one)
  */
@@ -882,6 +906,17 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad_slab_pool;
 	}
 
+	cc->dmreq_start = sizeof(struct ablkcipher_request);
+	cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment());
+
+	cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start +
+			sizeof(struct dm_crypt_request) + cc->iv_size);
+	if (!cc->req_pool) {
+		ti->error = "Cannot allocate crypt request mempool";
+		goto bad_req_pool;
+	}
+	cc->req = NULL;
+
 	cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);
 	if (!cc->page_pool) {
 		ti->error = "Cannot allocate page mempool";
@@ -955,6 +990,8 @@ bad_device:
 bad_bs:
 	mempool_destroy(cc->page_pool);
 bad_page_pool:
+	mempool_destroy(cc->req_pool);
+bad_req_pool:
 	mempool_destroy(cc->io_pool);
 bad_slab_pool:
 	if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
@@ -975,8 +1012,12 @@ static void crypt_dtr(struct dm_target *ti)
 	destroy_workqueue(cc->io_queue);
 	destroy_workqueue(cc->crypt_queue);
 
+	if (cc->req)
+		mempool_free(cc->req, cc->req_pool);
+
 	bioset_free(cc->bs);
 	mempool_destroy(cc->page_pool);
+	mempool_destroy(cc->req_pool);
 	mempool_destroy(cc->io_pool);
 
 	kfree(cc->iv_mode);
-- 
cgit v1.2.3


From 43d6903482eec168b727bc4bf76a9f415257d862 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Fri, 8 Feb 2008 02:11:09 +0000
Subject: dm crypt: add completion for async

dm-crypt: Use crypto ablkcipher interface
Prepare completion for async crypto request.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-crypt.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'drivers')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 79316580c780..2ea3eb99c91f 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -6,6 +6,7 @@
  * This file is released under the GPL.
  */
 
+#include <linux/completion.h>
 #include <linux/err.h>
 #include <linux/module.h>
 #include <linux/init.h>
@@ -31,6 +32,7 @@
  * context holding the current state of a multi-part conversion
  */
 struct convert_context {
+	struct completion restart;
 	struct bio *bio_in;
 	struct bio *bio_out;
 	unsigned int offset_in;
@@ -38,6 +40,7 @@ struct convert_context {
 	unsigned int idx_in;
 	unsigned int idx_out;
 	sector_t sector;
+	atomic_t pending;
 };
 
 /*
@@ -359,6 +362,15 @@ static void crypt_convert_init(struct crypt_config *cc,
 	ctx->idx_in = bio_in ? bio_in->bi_idx : 0;
 	ctx->idx_out = bio_out ? bio_out->bi_idx : 0;
 	ctx->sector = sector + cc->iv_offset;
+	init_completion(&ctx->restart);
+	/*
+	 * Crypto operation can be asynchronous,
+	 * ctx->pending is increased after request submission.
+	 * We need to ensure that we don't call the crypt finish
+	 * operation before pending got incremented
+	 * (dependent on crypt submission return code).
+	 */
+	atomic_set(&ctx->pending, 2);
 }
 
 static int crypt_convert_block(struct crypt_config *cc,
@@ -418,6 +430,15 @@ static int crypt_convert(struct crypt_config *cc,
 		ctx->sector++;
 	}
 
+	/*
+	 * If there are pending crypto operations, run the async
+	 * code. Otherwise process the return code synchronously.
+	 * The step of 2 ensures that async finish doesn't
+	 * call crypto finish too early.
+	 */
+	if (atomic_sub_return(2, &ctx->pending))
+		return -EINPROGRESS;
+
 	return r;
 }
 
-- 
cgit v1.2.3


From 95497a960015c89c7c585d5fb953bc2816dba1e5 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Fri, 8 Feb 2008 02:11:12 +0000
Subject: dm crypt: prepare async callback fn

dm-crypt: Use crypto ablkcipher interface

Prepare callback function for async crypto operation.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-crypt.c | 45 +++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 41 insertions(+), 4 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 2ea3eb99c91f..c45bd0e59dcc 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -406,11 +406,17 @@ static int crypt_convert_block(struct crypt_config *cc,
 					 ctx->sector);
 }
 
+static void kcryptd_async_done(struct crypto_async_request *async_req,
+			       int error);
 static void crypt_alloc_req(struct crypt_config *cc,
 			    struct convert_context *ctx)
 {
 	if (!cc->req)
 		cc->req = mempool_alloc(cc->req_pool, GFP_NOIO);
+	ablkcipher_request_set_tfm(cc->req, cc->tfm);
+	ablkcipher_request_set_callback(cc->req, CRYPTO_TFM_REQ_MAY_BACKLOG |
+					     CRYPTO_TFM_REQ_MAY_SLEEP,
+					     kcryptd_async_done, ctx);
 }
 
 /*
@@ -615,6 +621,9 @@ static void kcryptd_io_read(struct dm_crypt_io *io)
 
 static void kcryptd_io_write(struct dm_crypt_io *io)
 {
+	struct bio *clone = io->ctx.bio_out;
+
+	generic_make_request(clone);
 }
 
 static void kcryptd_io(struct work_struct *work)
@@ -635,7 +644,8 @@ static void kcryptd_queue_io(struct dm_crypt_io *io)
 	queue_work(cc->io_queue, &io->work);
 }
 
-static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int error)
+static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io,
+					  int error, int async)
 {
 	struct bio *clone = io->ctx.bio_out;
 	struct crypt_config *cc = io->target->private;
@@ -653,8 +663,12 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int error)
 	clone->bi_sector = cc->start + io->sector;
 	io->sector += bio_sectors(clone);
 
-	atomic_inc(&io->pending);
-	generic_make_request(clone);
+	if (async)
+		kcryptd_queue_io(io);
+	else {
+		atomic_inc(&io->pending);
+		generic_make_request(clone);
+	}
 }
 
 static void kcryptd_crypt_write_convert_loop(struct dm_crypt_io *io)
@@ -682,7 +696,7 @@ static void kcryptd_crypt_write_convert_loop(struct dm_crypt_io *io)
 
 		r = crypt_convert(cc, &io->ctx);
 
-		kcryptd_crypt_write_io_submit(io, r);
+		kcryptd_crypt_write_io_submit(io, r, 0);
 		if (unlikely(r < 0))
 			return;
 
@@ -728,6 +742,29 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io)
 	kcryptd_crypt_read_done(io, r);
 }
 
+static void kcryptd_async_done(struct crypto_async_request *async_req,
+			       int error)
+{
+	struct convert_context *ctx = async_req->data;
+	struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx);
+	struct crypt_config *cc = io->target->private;
+
+	if (error == -EINPROGRESS) {
+		complete(&ctx->restart);
+		return;
+	}
+
+	mempool_free(ablkcipher_request_cast(async_req), cc->req_pool);
+
+	if (!atomic_dec_and_test(&ctx->pending))
+		return;
+
+	if (bio_data_dir(io->base_bio) == READ)
+		kcryptd_crypt_read_done(io, error);
+	else
+		kcryptd_crypt_write_io_submit(io, error, 1);
+}
+
 static void kcryptd_crypt(struct work_struct *work)
 {
 	struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work);
-- 
cgit v1.2.3


From 3a7f6c990ad04e6f576a159876c602d14d6f7fef Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Fri, 8 Feb 2008 02:11:14 +0000
Subject: dm crypt: use async crypto

dm-crypt: Use crypto ablkcipher interface

Move encrypt/decrypt core to async crypto call.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-crypt.c | 131 +++++++++++++++++++++++++++-----------------------
 1 file changed, 72 insertions(+), 59 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index c45bd0e59dcc..b04f98df94ea 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -123,7 +123,7 @@ struct crypt_config {
 
 	char cipher[CRYPTO_MAX_ALG_NAME];
 	char chainmode[CRYPTO_MAX_ALG_NAME];
-	struct crypto_blkcipher *tfm;
+	struct crypto_ablkcipher *tfm;
 	unsigned long flags;
 	unsigned int key_size;
 	u8 key[0];
@@ -217,7 +217,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
 		return PTR_ERR(essiv_tfm);
 	}
 	if (crypto_cipher_blocksize(essiv_tfm) !=
-	    crypto_blkcipher_ivsize(cc->tfm)) {
+	    crypto_ablkcipher_ivsize(cc->tfm)) {
 		ti->error = "Block size of ESSIV cipher does "
 			    "not match IV size of block cipher";
 		crypto_free_cipher(essiv_tfm);
@@ -254,7 +254,7 @@ static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
 static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti,
 			      const char *opts)
 {
-	unsigned int bs = crypto_blkcipher_blocksize(cc->tfm);
+	unsigned bs = crypto_ablkcipher_blocksize(cc->tfm);
 	int log = ilog2(bs);
 
 	/* we need to calculate how far we must shift the sector count
@@ -318,38 +318,6 @@ static struct crypt_iv_operations crypt_iv_null_ops = {
 	.generator = crypt_iv_null_gen
 };
 
-static int
-crypt_convert_scatterlist(struct crypt_config *cc, struct scatterlist *out,
-                          struct scatterlist *in, unsigned int length,
-                          int write, sector_t sector)
-{
-	u8 iv[cc->iv_size] __attribute__ ((aligned(__alignof__(u64))));
-	struct blkcipher_desc desc = {
-		.tfm = cc->tfm,
-		.info = iv,
-		.flags = CRYPTO_TFM_REQ_MAY_SLEEP,
-	};
-	int r;
-
-	if (cc->iv_gen_ops) {
-		r = cc->iv_gen_ops->generator(cc, iv, sector);
-		if (r < 0)
-			return r;
-
-		if (write)
-			r = crypto_blkcipher_encrypt_iv(&desc, out, in, length);
-		else
-			r = crypto_blkcipher_decrypt_iv(&desc, out, in, length);
-	} else {
-		if (write)
-			r = crypto_blkcipher_encrypt(&desc, out, in, length);
-		else
-			r = crypto_blkcipher_decrypt(&desc, out, in, length);
-	}
-
-	return r;
-}
-
 static void crypt_convert_init(struct crypt_config *cc,
 			       struct convert_context *ctx,
 			       struct bio *bio_out, struct bio *bio_in,
@@ -374,18 +342,25 @@ static void crypt_convert_init(struct crypt_config *cc,
 }
 
 static int crypt_convert_block(struct crypt_config *cc,
-			       struct convert_context *ctx)
+			       struct convert_context *ctx,
+			       struct ablkcipher_request *req)
 {
 	struct bio_vec *bv_in = bio_iovec_idx(ctx->bio_in, ctx->idx_in);
 	struct bio_vec *bv_out = bio_iovec_idx(ctx->bio_out, ctx->idx_out);
-	struct dm_crypt_request dmreq;
+	struct dm_crypt_request *dmreq;
+	u8 *iv;
+	int r = 0;
 
-	sg_init_table(&dmreq.sg_in, 1);
-	sg_set_page(&dmreq.sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT,
+	dmreq = (struct dm_crypt_request *)((char *)req + cc->dmreq_start);
+	iv = (u8 *)ALIGN((unsigned long)(dmreq + 1),
+			 crypto_ablkcipher_alignmask(cc->tfm) + 1);
+
+	sg_init_table(&dmreq->sg_in, 1);
+	sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT,
 		    bv_in->bv_offset + ctx->offset_in);
 
-	sg_init_table(&dmreq.sg_out, 1);
-	sg_set_page(&dmreq.sg_out, bv_out->bv_page, 1 << SECTOR_SHIFT,
+	sg_init_table(&dmreq->sg_out, 1);
+	sg_set_page(&dmreq->sg_out, bv_out->bv_page, 1 << SECTOR_SHIFT,
 		    bv_out->bv_offset + ctx->offset_out);
 
 	ctx->offset_in += 1 << SECTOR_SHIFT;
@@ -400,10 +375,21 @@ static int crypt_convert_block(struct crypt_config *cc,
 		ctx->idx_out++;
 	}
 
-	return crypt_convert_scatterlist(cc, &dmreq.sg_out, &dmreq.sg_in,
-					 dmreq.sg_in.length,
-					 bio_data_dir(ctx->bio_in) == WRITE,
-					 ctx->sector);
+	if (cc->iv_gen_ops) {
+		r = cc->iv_gen_ops->generator(cc, iv, ctx->sector);
+		if (r < 0)
+			return r;
+	}
+
+	ablkcipher_request_set_crypt(req, &dmreq->sg_in, &dmreq->sg_out,
+				     1 << SECTOR_SHIFT, iv);
+
+	if (bio_data_dir(ctx->bio_in) == WRITE)
+		r = crypto_ablkcipher_encrypt(req);
+	else
+		r = crypto_ablkcipher_decrypt(req);
+
+	return r;
 }
 
 static void kcryptd_async_done(struct crypto_async_request *async_req,
@@ -429,11 +415,27 @@ static int crypt_convert(struct crypt_config *cc,
 
 	while(ctx->idx_in < ctx->bio_in->bi_vcnt &&
 	      ctx->idx_out < ctx->bio_out->bi_vcnt) {
-		r = crypt_convert_block(cc, ctx);
-		if (r < 0)
-			break;
 
-		ctx->sector++;
+		crypt_alloc_req(cc, ctx);
+
+		r = crypt_convert_block(cc, ctx, cc->req);
+
+		switch (r) {
+		case -EBUSY:
+			wait_for_completion(&ctx->restart);
+			INIT_COMPLETION(ctx->restart);
+			/* fall through*/
+		case -EINPROGRESS:
+			atomic_inc(&ctx->pending);
+			cc->req = NULL;
+			r = 0;
+			/* fall through*/
+		case 0:
+			ctx->sector++;
+			continue;
+		}
+
+		break;
 	}
 
 	/*
@@ -696,9 +698,12 @@ static void kcryptd_crypt_write_convert_loop(struct dm_crypt_io *io)
 
 		r = crypt_convert(cc, &io->ctx);
 
-		kcryptd_crypt_write_io_submit(io, r, 0);
-		if (unlikely(r < 0))
-			return;
+		if (r != -EINPROGRESS) {
+			kcryptd_crypt_write_io_submit(io, r, 0);
+			if (unlikely(r < 0))
+				return;
+		} else
+			atomic_inc(&io->pending);
 
 		/* out of memory -> run queues */
 		if (unlikely(remaining))
@@ -734,12 +739,17 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io)
 	struct crypt_config *cc = io->target->private;
 	int r = 0;
 
+	atomic_inc(&io->pending);
+
 	crypt_convert_init(cc, &io->ctx, io->base_bio, io->base_bio,
 			   io->sector);
 
 	r = crypt_convert(cc, &io->ctx);
 
-	kcryptd_crypt_read_done(io, r);
+	if (r != -EINPROGRESS)
+		kcryptd_crypt_read_done(io, r);
+
+	crypt_dec_pending(io);
 }
 
 static void kcryptd_async_done(struct crypto_async_request *async_req,
@@ -856,7 +866,7 @@ static int crypt_wipe_key(struct crypt_config *cc)
 static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 {
 	struct crypt_config *cc;
-	struct crypto_blkcipher *tfm;
+	struct crypto_ablkcipher *tfm;
 	char *tmp;
 	char *cipher;
 	char *chainmode;
@@ -910,7 +920,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad_cipher;
 	}
 
-	tfm = crypto_alloc_blkcipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
+	tfm = crypto_alloc_ablkcipher(cc->cipher, 0, 0);
 	if (IS_ERR(tfm)) {
 		ti->error = "Error allocating crypto tfm";
 		goto bad_cipher;
@@ -944,7 +954,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	    cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0)
 		goto bad_ivmode;
 
-	cc->iv_size = crypto_blkcipher_ivsize(tfm);
+	cc->iv_size = crypto_ablkcipher_ivsize(tfm);
 	if (cc->iv_size)
 		/* at least a 64 bit sector number should fit in our buffer */
 		cc->iv_size = max(cc->iv_size,
@@ -965,7 +975,10 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	}
 
 	cc->dmreq_start = sizeof(struct ablkcipher_request);
+	cc->dmreq_start += crypto_ablkcipher_reqsize(tfm);
 	cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment());
+	cc->dmreq_start += crypto_ablkcipher_alignmask(tfm) &
+			   ~(crypto_tfm_ctx_alignment() - 1);
 
 	cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start +
 			sizeof(struct dm_crypt_request) + cc->iv_size);
@@ -987,7 +1000,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad_bs;
 	}
 
-	if (crypto_blkcipher_setkey(tfm, cc->key, key_size) < 0) {
+	if (crypto_ablkcipher_setkey(tfm, cc->key, key_size) < 0) {
 		ti->error = "Error setting key";
 		goto bad_device;
 	}
@@ -1055,7 +1068,7 @@ bad_slab_pool:
 	if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
 		cc->iv_gen_ops->dtr(cc);
 bad_ivmode:
-	crypto_free_blkcipher(tfm);
+	crypto_free_ablkcipher(tfm);
 bad_cipher:
 	/* Must zero key material before freeing */
 	memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8));
@@ -1081,7 +1094,7 @@ static void crypt_dtr(struct dm_target *ti)
 	kfree(cc->iv_mode);
 	if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
 		cc->iv_gen_ops->dtr(cc);
-	crypto_free_blkcipher(cc->tfm);
+	crypto_free_ablkcipher(cc->tfm);
 	dm_put_device(ti, cc->dev);
 
 	/* Must zero key material before freeing */
-- 
cgit v1.2.3


From 304f3f6a58301316da612d7bf21d9abe1369d456 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Fri, 8 Feb 2008 02:11:17 +0000
Subject: dm: move deferred bio flushing to workqueue

Add a single-thread workqueue for each mapped device
and move flushing of the lists of pushback and deferred bios
to this new workqueue.

Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 67 insertions(+), 12 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 9ca012e639a8..6617ce4af095 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -71,6 +71,19 @@ union map_info *dm_get_mapinfo(struct bio *bio)
 #define DMF_DELETING 4
 #define DMF_NOFLUSH_SUSPENDING 5
 
+/*
+ * Work processed by per-device workqueue.
+ */
+struct dm_wq_req {
+	enum {
+		DM_WQ_FLUSH_ALL,
+		DM_WQ_FLUSH_DEFERRED,
+	} type;
+	struct work_struct work;
+	struct mapped_device *md;
+	void *context;
+};
+
 struct mapped_device {
 	struct rw_semaphore io_lock;
 	struct mutex suspend_lock;
@@ -95,6 +108,11 @@ struct mapped_device {
 	struct bio_list deferred;
 	struct bio_list pushback;
 
+	/*
+	 * Processing queue (flush/barriers)
+	 */
+	struct workqueue_struct *wq;
+
 	/*
 	 * The current mapping.
 	 */
@@ -1044,6 +1062,10 @@ static struct mapped_device *alloc_dev(int minor)
 	add_disk(md->disk);
 	format_dev_t(md->name, MKDEV(_major, minor));
 
+	md->wq = create_singlethread_workqueue("kdmflush");
+	if (!md->wq)
+		goto bad_thread;
+
 	/* Populate the mapping, nobody knows we exist yet */
 	spin_lock(&_minor_lock);
 	old_md = idr_replace(&_minor_idr, md, minor);
@@ -1053,6 +1075,8 @@ static struct mapped_device *alloc_dev(int minor)
 
 	return md;
 
+bad_thread:
+	put_disk(md->disk);
 bad_disk:
 	bioset_free(md->bs);
 bad_no_bioset:
@@ -1080,6 +1104,7 @@ static void free_dev(struct mapped_device *md)
 		unlock_fs(md);
 		bdput(md->suspended_bdev);
 	}
+	destroy_workqueue(md->wq);
 	mempool_destroy(md->tio_pool);
 	mempool_destroy(md->io_pool);
 	bioset_free(md->bs);
@@ -1308,6 +1333,44 @@ static void __merge_pushback_list(struct mapped_device *md)
 	spin_unlock_irqrestore(&md->pushback_lock, flags);
 }
 
+static void dm_wq_work(struct work_struct *work)
+{
+	struct dm_wq_req *req = container_of(work, struct dm_wq_req, work);
+	struct mapped_device *md = req->md;
+
+	down_write(&md->io_lock);
+	switch (req->type) {
+	case DM_WQ_FLUSH_ALL:
+		__merge_pushback_list(md);
+		/* pass through */
+	case DM_WQ_FLUSH_DEFERRED:
+		__flush_deferred_io(md);
+		break;
+	default:
+		DMERR("dm_wq_work: unrecognised work type %d", req->type);
+		BUG();
+	}
+	up_write(&md->io_lock);
+}
+
+static void dm_wq_queue(struct mapped_device *md, int type, void *context,
+			struct dm_wq_req *req)
+{
+	req->type = type;
+	req->md = md;
+	req->context = context;
+	INIT_WORK(&req->work, dm_wq_work);
+	queue_work(md->wq, &req->work);
+}
+
+static void dm_queue_flush(struct mapped_device *md, int type, void *context)
+{
+	struct dm_wq_req req;
+
+	dm_wq_queue(md, type, context, &req);
+	flush_workqueue(md->wq);
+}
+
 /*
  * Swap in a new table (destroying old one).
  */
@@ -1450,9 +1513,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 
 	/* were we interrupted ? */
 	if (r < 0) {
-		down_write(&md->io_lock);
-		__flush_deferred_io(md);
-		up_write(&md->io_lock);
+		dm_queue_flush(md, DM_WQ_FLUSH_DEFERRED, NULL);
 
 		unlock_fs(md);
 		goto out; /* pushback list is already flushed, so skip flush */
@@ -1463,16 +1524,12 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 	set_bit(DMF_SUSPENDED, &md->flags);
 
 flush_and_out:
-	if (r && noflush) {
+	if (r && noflush)
 		/*
 		 * Because there may be already I/Os in the pushback list,
 		 * flush them before return.
 		 */
-		down_write(&md->io_lock);
-		__merge_pushback_list(md);
-		__flush_deferred_io(md);
-		up_write(&md->io_lock);
-	}
+		dm_queue_flush(md, DM_WQ_FLUSH_ALL, NULL);
 
 out:
 	if (r && md->suspended_bdev) {
@@ -1504,9 +1561,7 @@ int dm_resume(struct mapped_device *md)
 	if (r)
 		goto out;
 
-	down_write(&md->io_lock);
-	__flush_deferred_io(md);
-	up_write(&md->io_lock);
+	dm_queue_flush(md, DM_WQ_FLUSH_DEFERRED, NULL);
 
 	unlock_fs(md);
 
-- 
cgit v1.2.3


From fb8b284806124bef250196007d7373ea3fe26194 Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Fri, 8 Feb 2008 02:11:19 +0000
Subject: dm log: auto load modules

If the log type is not recognised, attempt to load the module
'dm-log-<type>.ko'.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-log.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 50 insertions(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 072ee4353eab..2a74b2142f50 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -41,7 +41,7 @@ int dm_unregister_dirty_log_type(struct dirty_log_type *type)
 	return 0;
 }
 
-static struct dirty_log_type *get_type(const char *type_name)
+static struct dirty_log_type *_get_type(const char *type_name)
 {
 	struct dirty_log_type *type;
 
@@ -61,6 +61,55 @@ static struct dirty_log_type *get_type(const char *type_name)
 	return NULL;
 }
 
+/*
+ * get_type
+ * @type_name
+ *
+ * Attempt to retrieve the dirty_log_type by name.  If not already
+ * available, attempt to load the appropriate module.
+ *
+ * Log modules are named "dm-log-" followed by the 'type_name'.
+ * Modules may contain multiple types.
+ * This function will first try the module "dm-log-<type_name>",
+ * then truncate 'type_name' on the last '-' and try again.
+ *
+ * For example, if type_name was "clustered-disk", it would search
+ * 'dm-log-clustered-disk' then 'dm-log-clustered'.
+ *
+ * Returns: dirty_log_type* on success, NULL on failure
+ */
+static struct dirty_log_type *get_type(const char *type_name)
+{
+	char *p, *type_name_dup;
+	struct dirty_log_type *type;
+
+	type = _get_type(type_name);
+	if (type)
+		return type;
+
+	type_name_dup = kstrdup(type_name, GFP_KERNEL);
+	if (!type_name_dup) {
+		DMWARN("No memory left to attempt log module load for \"%s\"",
+		       type_name);
+		return NULL;
+	}
+
+	while (request_module("dm-log-%s", type_name_dup) ||
+	       !(type = _get_type(type_name))) {
+		p = strrchr(type_name_dup, '-');
+		if (!p)
+			break;
+		p[0] = '\0';
+	}
+
+	if (!type)
+		DMWARN("Module for logging type \"%s\" not found.", type_name);
+
+	kfree(type_name_dup);
+
+	return type;
+}
+
 static void put_type(struct dirty_log_type *type)
 {
 	spin_lock(&_lock);
-- 
cgit v1.2.3


From a25eb9446ad50027bc2082386e5358bedad087ed Mon Sep 17 00:00:00 2001
From: Brian Wood <brian.j.wood@intel.com>
Date: Fri, 8 Feb 2008 02:11:22 +0000
Subject: dm: stripe trigger event on failure

This patch adds the stripe_end_io function to process errors that might
occur after an IO operation. As part of this there are a number of
enhancements made to record and trigger events:

- New atomic variable in struct stripe to record the number of
errors each stripe volume device has experienced (could be used
later with uevents to report back directly to userspace)

- New workqueue/work struct setup to process the trigger_event function

- New end_io function. It is here that testing for BIO error conditions
takes place. It determines the exact stripe that caused the error,
records this in the new atomic variable, and calls the queue_work() function

- New trigger_event function to process failure events. This
calls dm_table_event()

Signed-off-by: Brian Wood <brian.j.wood@intel.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-stripe.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 81 insertions(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 969944a8aba2..7c5e2a0c3f2d 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -14,10 +14,13 @@
 #include <linux/log2.h>
 
 #define DM_MSG_PREFIX "striped"
+#define DM_IO_ERROR_THRESHOLD 15
 
 struct stripe {
 	struct dm_dev *dev;
 	sector_t physical_start;
+
+	atomic_t error_count;
 };
 
 struct stripe_c {
@@ -30,9 +33,29 @@ struct stripe_c {
 	uint32_t chunk_shift;
 	sector_t chunk_mask;
 
+	/* Needed for handling events */
+	struct dm_target *ti;
+
+	/* Work struct used for triggering events*/
+	struct work_struct kstriped_ws;
+
 	struct stripe stripe[0];
 };
 
+static struct workqueue_struct *kstriped;
+
+/*
+ * An event is triggered whenever a drive
+ * drops out of a stripe volume.
+ */
+static void trigger_event(struct work_struct *work)
+{
+	struct stripe_c *sc = container_of(work, struct stripe_c, kstriped_ws);
+
+	dm_table_event(sc->ti->table);
+
+}
+
 static inline struct stripe_c *alloc_context(unsigned int stripes)
 {
 	size_t len;
@@ -63,6 +86,7 @@ static int get_stripe(struct dm_target *ti, struct stripe_c *sc,
 		return -ENXIO;
 
 	sc->stripe[stripe].physical_start = start;
+
 	return 0;
 }
 
@@ -135,6 +159,11 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		return -ENOMEM;
 	}
 
+	INIT_WORK(&sc->kstriped_ws, trigger_event);
+
+	/* Set pointer to dm target; used in trigger_event */
+	sc->ti = ti;
+
 	sc->stripes = stripes;
 	sc->stripe_width = width;
 	ti->split_io = chunk_size;
@@ -158,9 +187,11 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 			kfree(sc);
 			return r;
 		}
+		atomic_set(&(sc->stripe[i].error_count), 0);
 	}
 
 	ti->private = sc;
+
 	return 0;
 }
 
@@ -172,6 +203,7 @@ static void stripe_dtr(struct dm_target *ti)
 	for (i = 0; i < sc->stripes; i++)
 		dm_put_device(ti, sc->stripe[i].dev);
 
+	flush_workqueue(kstriped);
 	kfree(sc);
 }
 
@@ -213,13 +245,52 @@ static int stripe_status(struct dm_target *ti,
 	return 0;
 }
 
+static int stripe_end_io(struct dm_target *ti, struct bio *bio,
+			 int error, union map_info *map_context)
+{
+	unsigned i;
+	char major_minor[16];
+	struct stripe_c *sc = ti->private;
+
+	if (!error)
+		return 0; /* I/O complete */
+
+	if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
+		return error;
+
+	if (error == -EOPNOTSUPP)
+		return error;
+
+	memset(major_minor, 0, sizeof(major_minor));
+	sprintf(major_minor, "%d:%d",
+		bio->bi_bdev->bd_disk->major,
+		bio->bi_bdev->bd_disk->first_minor);
+
+	/*
+	 * Test to see which stripe drive triggered the event
+	 * and increment error count for all stripes on that device.
+	 * If the error count for a given device exceeds the threshold
+	 * value we will no longer trigger any further events.
+	 */
+	for (i = 0; i < sc->stripes; i++)
+		if (!strcmp(sc->stripe[i].dev->name, major_minor)) {
+			atomic_inc(&(sc->stripe[i].error_count));
+			if (atomic_read(&(sc->stripe[i].error_count)) <
+			    DM_IO_ERROR_THRESHOLD)
+				queue_work(kstriped, &sc->kstriped_ws);
+		}
+
+	return error;
+}
+
 static struct target_type stripe_target = {
 	.name   = "striped",
-	.version= {1, 0, 2},
+	.version = {1, 1, 0},
 	.module = THIS_MODULE,
 	.ctr    = stripe_ctr,
 	.dtr    = stripe_dtr,
 	.map    = stripe_map,
+	.end_io = stripe_end_io,
 	.status = stripe_status,
 };
 
@@ -231,6 +302,13 @@ int __init dm_stripe_init(void)
 	if (r < 0)
 		DMWARN("target registration failed");
 
+	kstriped = create_singlethread_workqueue("kstriped");
+	if (!kstriped) {
+		DMERR("failed to create workqueue kstriped");
+		dm_unregister_target(&stripe_target);
+		return -ENOMEM;
+	}
+
 	return r;
 }
 
@@ -239,5 +317,7 @@ void dm_stripe_exit(void)
 	if (dm_unregister_target(&stripe_target))
 		DMWARN("target unregistration failed");
 
+	destroy_workqueue(kstriped);
+
 	return;
 }
-- 
cgit v1.2.3


From 4f7f5c675fd6bacaae3c67be44de872dcff0e3b7 Mon Sep 17 00:00:00 2001
From: Brian Wood <brian.j.wood@intel.com>
Date: Fri, 8 Feb 2008 02:11:24 +0000
Subject: dm: stripe enhanced status return

This patch adds additional information to the status line. It is added at the
end of the returned text so it will not interfere with existing
implementations using this data. The addition of this information will allow
for a common return interface to match that returned with the dm-raid1.c
status line (with Jonathan Brassow's patches).

Here is a sample of what is returned with a mirror "status" call:
isw_eeaaabgfg_mirror: 0 488390920 mirror 2 8:16 8:32 3727/3727 1 AA 1 core

Here's what's returned with this patch for a stripe "status" call:
isw_dheeijjdej_stripe: 0 976783872 striped 2 8:16 8:32 1 AA

Signed-off-by: Brian Wood <brian.j.wood@intel.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-stripe.c | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 7c5e2a0c3f2d..4de90ab3968b 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -222,16 +222,37 @@ static int stripe_map(struct dm_target *ti, struct bio *bio,
 	return DM_MAPIO_REMAPPED;
 }
 
+/*
+ * Stripe status:
+ *
+ * INFO
+ * #stripes [stripe_name <stripe_name>] [group word count]
+ * [error count 'A|D' <error count 'A|D'>]
+ *
+ * TABLE
+ * #stripes [stripe chunk size]
+ * [stripe_name physical_start <stripe_name physical_start>]
+ *
+ */
+
 static int stripe_status(struct dm_target *ti,
 			 status_type_t type, char *result, unsigned int maxlen)
 {
 	struct stripe_c *sc = (struct stripe_c *) ti->private;
+	char buffer[sc->stripes + 1];
 	unsigned int sz = 0;
 	unsigned int i;
 
 	switch (type) {
 	case STATUSTYPE_INFO:
-		result[0] = '\0';
+		DMEMIT("%d ", sc->stripes);
+		for (i = 0; i < sc->stripes; i++)  {
+			DMEMIT("%s ", sc->stripe[i].dev->name);
+			buffer[i] = atomic_read(&(sc->stripe[i].error_count)) ?
+				'D' : 'A';
+		}
+		buffer[i] = '\0';
+		DMEMIT("1 %s", buffer);
 		break;
 
 	case STATUSTYPE_TABLE:
-- 
cgit v1.2.3


From d74f81f8adc504a23be3babf347b9f69e9389924 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Fri, 8 Feb 2008 02:11:27 +0000
Subject: dm snapshot: combine consecutive exceptions in memory

Provided sector_t is 64 bits, reduce the in-memory footprint of the
snapshot exception table by the simple method of using unused bits of
the chunk number to combine consecutive entries.

Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-snap.c | 82 +++++++++++++++++++++++++++++++++++++++++++---------
 drivers/md/dm-snap.h | 50 ++++++++++++++++++++++++++++++--
 2 files changed, 116 insertions(+), 16 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index fad84654b045..ae24eab8cd81 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -213,11 +213,15 @@ static void unregister_snapshot(struct dm_snapshot *s)
 
 /*
  * Implementation of the exception hash tables.
+ * The lowest hash_shift bits of the chunk number are ignored, allowing
+ * some consecutive chunks to be grouped together.
  */
-static int init_exception_table(struct exception_table *et, uint32_t size)
+static int init_exception_table(struct exception_table *et, uint32_t size,
+				unsigned hash_shift)
 {
 	unsigned int i;
 
+	et->hash_shift = hash_shift;
 	et->hash_mask = size - 1;
 	et->table = dm_vcalloc(size, sizeof(struct list_head));
 	if (!et->table)
@@ -248,7 +252,7 @@ static void exit_exception_table(struct exception_table *et, struct kmem_cache *
 
 static uint32_t exception_hash(struct exception_table *et, chunk_t chunk)
 {
-	return chunk & et->hash_mask;
+	return (chunk >> et->hash_shift) & et->hash_mask;
 }
 
 static void insert_exception(struct exception_table *eh,
@@ -275,7 +279,8 @@ static struct dm_snap_exception *lookup_exception(struct exception_table *et,
 
 	slot = &et->table[exception_hash(et, chunk)];
 	list_for_each_entry (e, slot, hash_list)
-		if (e->old_chunk == chunk)
+		if (chunk >= e->old_chunk &&
+		    chunk <= e->old_chunk + dm_consecutive_chunk_count(e))
 			return e;
 
 	return NULL;
@@ -307,6 +312,49 @@ static void free_pending_exception(struct dm_snap_pending_exception *pe)
 	mempool_free(pe, pending_pool);
 }
 
+static void insert_completed_exception(struct dm_snapshot *s,
+				       struct dm_snap_exception *new_e)
+{
+	struct exception_table *eh = &s->complete;
+	struct list_head *l;
+	struct dm_snap_exception *e = NULL;
+
+	l = &eh->table[exception_hash(eh, new_e->old_chunk)];
+
+	/* Add immediately if this table doesn't support consecutive chunks */
+	if (!eh->hash_shift)
+		goto out;
+
+	/* List is ordered by old_chunk */
+	list_for_each_entry_reverse(e, l, hash_list) {
+		/* Insert after an existing chunk? */
+		if (new_e->old_chunk == (e->old_chunk +
+					 dm_consecutive_chunk_count(e) + 1) &&
+		    new_e->new_chunk == (dm_chunk_number(e->new_chunk) +
+					 dm_consecutive_chunk_count(e) + 1)) {
+			dm_consecutive_chunk_count_inc(e);
+			free_exception(new_e);
+			return;
+		}
+
+		/* Insert before an existing chunk? */
+		if (new_e->old_chunk == (e->old_chunk - 1) &&
+		    new_e->new_chunk == (dm_chunk_number(e->new_chunk) - 1)) {
+			dm_consecutive_chunk_count_inc(e);
+			e->old_chunk--;
+			e->new_chunk--;
+			free_exception(new_e);
+			return;
+		}
+
+		if (new_e->old_chunk > e->old_chunk)
+			break;
+	}
+
+out:
+	list_add(&new_e->hash_list, e ? &e->hash_list : l);
+}
+
 int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new)
 {
 	struct dm_snap_exception *e;
@@ -316,8 +364,12 @@ int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new)
 		return -ENOMEM;
 
 	e->old_chunk = old;
+
+	/* Consecutive_count is implicitly initialised to zero */
 	e->new_chunk = new;
-	insert_exception(&s->complete, e);
+
+	insert_completed_exception(s, e);
+
 	return 0;
 }
 
@@ -352,7 +404,8 @@ static int init_hash_tables(struct dm_snapshot *s)
 	hash_size = min(hash_size, max_buckets);
 
 	hash_size = rounddown_pow_of_two(hash_size);
-	if (init_exception_table(&s->complete, hash_size))
+	if (init_exception_table(&s->complete, hash_size,
+				 DM_CHUNK_CONSECUTIVE_BITS))
 		return -ENOMEM;
 
 	/*
@@ -363,7 +416,7 @@ static int init_hash_tables(struct dm_snapshot *s)
 	if (hash_size < 64)
 		hash_size = 64;
 
-	if (init_exception_table(&s->pending, hash_size)) {
+	if (init_exception_table(&s->pending, hash_size, 0)) {
 		exit_exception_table(&s->complete, exception_cache);
 		return -ENOMEM;
 	}
@@ -722,7 +775,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
 	 * Add a proper exception, and remove the
 	 * in-flight exception from the list.
 	 */
-	insert_exception(&s->complete, e);
+	insert_completed_exception(s, e);
 
  out:
 	remove_exception(&pe->e);
@@ -856,11 +909,12 @@ __find_pending_exception(struct dm_snapshot *s, struct bio *bio)
 }
 
 static void remap_exception(struct dm_snapshot *s, struct dm_snap_exception *e,
-			    struct bio *bio)
+			    struct bio *bio, chunk_t chunk)
 {
 	bio->bi_bdev = s->cow->bdev;
-	bio->bi_sector = chunk_to_sector(s, e->new_chunk) +
-		(bio->bi_sector & s->chunk_mask);
+	bio->bi_sector = chunk_to_sector(s, dm_chunk_number(e->new_chunk) +
+			 (chunk - e->old_chunk)) +
+			 (bio->bi_sector & s->chunk_mask);
 }
 
 static int snapshot_map(struct dm_target *ti, struct bio *bio,
@@ -891,7 +945,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
 	/* If the block is already remapped - use that, else remap it */
 	e = lookup_exception(&s->complete, chunk);
 	if (e) {
-		remap_exception(s, e, bio);
+		remap_exception(s, e, bio, chunk);
 		goto out_unlock;
 	}
 
@@ -908,7 +962,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
 			goto out_unlock;
 		}
 
-		remap_exception(s, &pe->e, bio);
+		remap_exception(s, &pe->e, bio, chunk);
 		bio_list_add(&pe->snapshot_bios, bio);
 
 		r = DM_MAPIO_SUBMITTED;
@@ -1196,7 +1250,7 @@ static int origin_status(struct dm_target *ti, status_type_t type, char *result,
 
 static struct target_type origin_target = {
 	.name    = "snapshot-origin",
-	.version = {1, 5, 0},
+	.version = {1, 6, 0},
 	.module  = THIS_MODULE,
 	.ctr     = origin_ctr,
 	.dtr     = origin_dtr,
@@ -1207,7 +1261,7 @@ static struct target_type origin_target = {
 
 static struct target_type snapshot_target = {
 	.name    = "snapshot",
-	.version = {1, 5, 0},
+	.version = {1, 6, 0},
 	.module  = THIS_MODULE,
 	.ctr     = snapshot_ctr,
 	.dtr     = snapshot_dtr,
diff --git a/drivers/md/dm-snap.h b/drivers/md/dm-snap.h
index 650e0f1f51d8..93bce5d49742 100644
--- a/drivers/md/dm-snap.h
+++ b/drivers/md/dm-snap.h
@@ -16,19 +16,22 @@
 
 struct exception_table {
 	uint32_t hash_mask;
+	unsigned hash_shift;
 	struct list_head *table;
 };
 
 /*
  * The snapshot code deals with largish chunks of the disk at a
- * time. Typically 64k - 256k.
+ * time. Typically 32k - 512k.
  */
-/* FIXME: can we get away with limiting these to a uint32_t ? */
 typedef sector_t chunk_t;
 
 /*
  * An exception is used where an old chunk of data has been
  * replaced by a new one.
+ * If chunk_t is 64 bits in size, the top 8 bits of new_chunk hold the number
+ * of chunks that follow contiguously.  Remaining bits hold the number of the
+ * chunk within the device.
  */
 struct dm_snap_exception {
 	struct list_head hash_list;
@@ -37,6 +40,49 @@ struct dm_snap_exception {
 	chunk_t new_chunk;
 };
 
+/*
+ * Functions to manipulate consecutive chunks
+ */
+#  if defined(CONFIG_LBD) || (BITS_PER_LONG == 64)
+#    define DM_CHUNK_CONSECUTIVE_BITS 8
+#    define DM_CHUNK_NUMBER_BITS 56
+
+static inline chunk_t dm_chunk_number(chunk_t chunk)
+{
+	return chunk & (chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL);
+}
+
+static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e)
+{
+	return e->new_chunk >> DM_CHUNK_NUMBER_BITS;
+}
+
+static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
+{
+	e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS);
+
+	BUG_ON(!dm_consecutive_chunk_count(e));
+}
+
+#  else
+#    define DM_CHUNK_CONSECUTIVE_BITS 0
+
+static inline chunk_t dm_chunk_number(chunk_t chunk)
+{
+	return chunk;
+}
+
+static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e)
+{
+	return 0;
+}
+
+static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
+{
+}
+
+#  endif
+
 /*
  * Abstraction to handle the meta/layout of exception stores (the
  * COW device).
-- 
cgit v1.2.3


From 72f4b314100bae85c75d8e4c6fec621ab44e777d Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Fri, 8 Feb 2008 02:11:29 +0000
Subject: dm raid1: handle write failures

This patch gives mirror the ability to handle device failures
during normal write operations.

The 'write_callback' function is called when a write completes.
If all the writes failed or succeeded, we report failure or
success respectively.  If some of the writes failed, we call
fail_mirror, which increments the error count for the device, notes
the type of error encountered (DM_RAID1_WRITE_ERROR),  and
selects a new primary (if necessary).  Note that the primary
device can never change while the mirror is not in-sync (IOW,
while recovery is happening.)  This means that the scenario
where a failed write changes the primary and gives
recovery_complete a chance to misread the primary never happens.
The fact that the primary can change has necessitated the change
to the default_mirror field.  We need to protect against reading
garbage while the primary changes.  We then add the bio to a new
list in the mirror set, 'failures'.  For every bio in the 'failures'
list, we call a new function, '__bio_mark_nosync', where we mark
the region 'not-in-sync' in the log and properly set the region
state as RH_NOSYNC.  Userspace must also be notified of the
failure.  This is done by 'raising an event' (dm_table_event()).
If fail_mirror is called in process context the event can be raised
right away.  If in interrupt context, the event is deferred to the
kmirrord thread - which raises the event if 'event_waiting' is set.

Backwards compatibility is maintained by ignoring errors if
the DM_FEATURES_HANDLE_ERRORS flag is not present.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-raid1.c | 250 ++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 224 insertions(+), 26 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 31123d4a6b9c..4e1e04dbc4ab 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -20,6 +20,7 @@
 #include <linux/vmalloc.h>
 #include <linux/workqueue.h>
 #include <linux/log2.h>
+#include <linux/hardirq.h>
 
 #define DM_MSG_PREFIX "raid1"
 #define DM_IO_PAGES 64
@@ -113,9 +114,16 @@ struct region {
 /*-----------------------------------------------------------------
  * Mirror set structures.
  *---------------------------------------------------------------*/
+enum dm_raid1_error {
+	DM_RAID1_WRITE_ERROR,
+	DM_RAID1_SYNC_ERROR,
+	DM_RAID1_READ_ERROR
+};
+
 struct mirror {
 	struct mirror_set *ms;
 	atomic_t error_count;
+	uint32_t error_type;
 	struct dm_dev *dev;
 	sector_t offset;
 };
@@ -127,9 +135,10 @@ struct mirror_set {
 	struct kcopyd_client *kcopyd_client;
 	uint64_t features;
 
-	spinlock_t lock;	/* protects the next two lists */
+	spinlock_t lock;	/* protects the lists */
 	struct bio_list reads;
 	struct bio_list writes;
+	struct bio_list failures;
 
 	struct dm_io_client *io_client;
 
@@ -138,10 +147,11 @@ struct mirror_set {
 	int in_sync;
 	int log_failure;
 
-	struct mirror *default_mirror;	/* Default mirror */
+	atomic_t default_mirror;	/* Default mirror */
 
 	struct workqueue_struct *kmirrord_wq;
 	struct work_struct kmirrord_work;
+	struct work_struct trigger_event;
 
 	unsigned int nr_mirrors;
 	struct mirror mirror[0];
@@ -646,6 +656,77 @@ static void bio_set_ms(struct bio *bio, struct mirror_set *ms)
 	bio->bi_next = (struct bio *) ms;
 }
 
+static struct mirror *get_default_mirror(struct mirror_set *ms)
+{
+	return &ms->mirror[atomic_read(&ms->default_mirror)];
+}
+
+static void set_default_mirror(struct mirror *m)
+{
+	struct mirror_set *ms = m->ms;
+	struct mirror *m0 = &(ms->mirror[0]);
+
+	atomic_set(&ms->default_mirror, m - m0);
+}
+
+/* fail_mirror
+ * @m: mirror device to fail
+ * @error_type: one of the enum's, DM_RAID1_*_ERROR
+ *
+ * If errors are being handled, record the type of
+ * error encountered for this device.  If this type
+ * of error has already been recorded, we can return;
+ * otherwise, we must signal userspace by triggering
+ * an event.  Additionally, if the device is the
+ * primary device, we must choose a new primary, but
+ * only if the mirror is in-sync.
+ *
+ * This function must not block.
+ */
+static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
+{
+	struct mirror_set *ms = m->ms;
+	struct mirror *new;
+
+	if (!errors_handled(ms))
+		return;
+
+	/*
+	 * error_count is used for nothing more than a
+	 * simple way to tell if a device has encountered
+	 * errors.
+	 */
+	atomic_inc(&m->error_count);
+
+	if (test_and_set_bit(error_type, &m->error_type))
+		return;
+
+	if (m != get_default_mirror(ms))
+		goto out;
+
+	if (!ms->in_sync) {
+		/*
+		 * Better to issue requests to same failing device
+		 * than to risk returning corrupt data.
+		 */
+		DMERR("Primary mirror (%s) failed while out-of-sync: "
+		      "Reads may fail.", m->dev->name);
+		goto out;
+	}
+
+	for (new = ms->mirror; new < ms->mirror + ms->nr_mirrors; new++)
+		if (!atomic_read(&new->error_count)) {
+			set_default_mirror(new);
+			break;
+		}
+
+	if (unlikely(new == ms->mirror + ms->nr_mirrors))
+		DMWARN("All sides of mirror have failed.");
+
+out:
+	schedule_work(&ms->trigger_event);
+}
+
 /*-----------------------------------------------------------------
  * Recovery.
  *
@@ -678,7 +759,7 @@ static int recover(struct mirror_set *ms, struct region *reg)
 	unsigned long flags = 0;
 
 	/* fill in the source */
-	m = ms->default_mirror;
+	m = get_default_mirror(ms);
 	from.bdev = m->dev->bdev;
 	from.sector = m->offset + region_to_sector(reg->rh, reg->key);
 	if (reg->key == (ms->nr_regions - 1)) {
@@ -694,7 +775,7 @@ static int recover(struct mirror_set *ms, struct region *reg)
 
 	/* fill in the destinations */
 	for (i = 0, dest = to; i < ms->nr_mirrors; i++) {
-		if (&ms->mirror[i] == ms->default_mirror)
+		if (&ms->mirror[i] == get_default_mirror(ms))
 			continue;
 
 		m = ms->mirror + i;
@@ -749,7 +830,7 @@ static void do_recovery(struct mirror_set *ms)
 static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector)
 {
 	/* FIXME: add read balancing */
-	return ms->default_mirror;
+	return get_default_mirror(ms);
 }
 
 /*
@@ -776,7 +857,7 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads)
 		if (rh_in_sync(&ms->rh, region, 1))
 			m = choose_mirror(ms, bio->bi_sector);
 		else
-			m = ms->default_mirror;
+			m = get_default_mirror(ms);
 
 		map_bio(ms, m, bio);
 		generic_make_request(bio);
@@ -793,12 +874,67 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads)
  * RECOVERING:	delay the io until recovery completes
  * NOSYNC:	increment pending, just write to the default mirror
  *---------------------------------------------------------------*/
+
+/* __bio_mark_nosync
+ * @ms
+ * @bio
+ * @done
+ * @error
+ *
+ * The bio was written on some mirror(s) but failed on other mirror(s).
+ * We can successfully endio the bio but should avoid the region being
+ * marked clean by setting the state RH_NOSYNC.
+ *
+ * This function is _not_ safe in interrupt context!
+ */
+static void __bio_mark_nosync(struct mirror_set *ms,
+			      struct bio *bio, unsigned done, int error)
+{
+	unsigned long flags;
+	struct region_hash *rh = &ms->rh;
+	struct dirty_log *log = ms->rh.log;
+	struct region *reg;
+	region_t region = bio_to_region(rh, bio);
+	int recovering = 0;
+
+	/* We must inform the log that the sync count has changed. */
+	log->type->set_region_sync(log, region, 0);
+	ms->in_sync = 0;
+
+	read_lock(&rh->hash_lock);
+	reg = __rh_find(rh, region);
+	read_unlock(&rh->hash_lock);
+
+	/* region hash entry should exist because write was in-flight */
+	BUG_ON(!reg);
+	BUG_ON(!list_empty(&reg->list));
+
+	spin_lock_irqsave(&rh->region_lock, flags);
+	/*
+	 * Possible cases:
+	 *   1) RH_DIRTY
+	 *   2) RH_NOSYNC: was dirty, other preceding writes failed
+	 *   3) RH_RECOVERING: flushing pending writes
+	 * In every case, the region should not have been connected to a list.
+	 */
+	recovering = (reg->state == RH_RECOVERING);
+	reg->state = RH_NOSYNC;
+	BUG_ON(!list_empty(&reg->list));
+	spin_unlock_irqrestore(&rh->region_lock, flags);
+
+	bio_endio(bio, error);
+	if (recovering)
+		complete_resync_work(reg, 0);
+}
+
 static void write_callback(unsigned long error, void *context)
 {
-	unsigned int i;
-	int uptodate = 1;
+	unsigned i, ret = 0;
 	struct bio *bio = (struct bio *) context;
 	struct mirror_set *ms;
+	int uptodate = 0;
+	int should_wake = 0;
+	unsigned long flags;
 
 	ms = bio_get_ms(bio);
 	bio_set_ms(bio, NULL);
@@ -809,20 +945,36 @@ static void write_callback(unsigned long error, void *context)
 	 * This way we handle both writes to SYNC and NOSYNC
 	 * regions with the same code.
 	 */
+	if (likely(!error))
+		goto out;
 
-	if (error) {
+	for (i = 0; i < ms->nr_mirrors; i++)
+		if (test_bit(i, &error))
+			fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR);
+		else
+			uptodate = 1;
+
+	if (unlikely(!uptodate)) {
+		DMERR("All replicated volumes dead, failing I/O");
+		/* None of the writes succeeded, fail the I/O. */
+		ret = -EIO;
+	} else if (errors_handled(ms)) {
 		/*
-		 * only error the io if all mirrors failed.
-		 * FIXME: bogus
+		 * Need to raise event.  Since raising
+		 * events can block, we need to do it in
+		 * the main thread.
 		 */
-		uptodate = 0;
-		for (i = 0; i < ms->nr_mirrors; i++)
-			if (!test_bit(i, &error)) {
-				uptodate = 1;
-				break;
-			}
+		spin_lock_irqsave(&ms->lock, flags);
+		if (!ms->failures.head)
+			should_wake = 1;
+		bio_list_add(&ms->failures, bio);
+		spin_unlock_irqrestore(&ms->lock, flags);
+		if (should_wake)
+			wake(ms);
+		return;
 	}
-	bio_endio(bio, 0);
+out:
+	bio_endio(bio, ret);
 }
 
 static void do_write(struct mirror_set *ms, struct bio *bio)
@@ -910,33 +1062,75 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
 		rh_delay(&ms->rh, bio);
 
 	while ((bio = bio_list_pop(&nosync))) {
-		map_bio(ms, ms->default_mirror, bio);
+		map_bio(ms, get_default_mirror(ms), bio);
 		generic_make_request(bio);
 	}
 }
 
+static void do_failures(struct mirror_set *ms, struct bio_list *failures)
+{
+	struct bio *bio;
+
+	if (!failures->head)
+		return;
+
+	while ((bio = bio_list_pop(failures)))
+		__bio_mark_nosync(ms, bio, bio->bi_size, 0);
+}
+
+static void trigger_event(struct work_struct *work)
+{
+	struct mirror_set *ms =
+		container_of(work, struct mirror_set, trigger_event);
+
+	dm_table_event(ms->ti->table);
+}
+
 /*-----------------------------------------------------------------
  * kmirrord
  *---------------------------------------------------------------*/
-static void do_mirror(struct work_struct *work)
+static int _do_mirror(struct work_struct *work)
 {
 	struct mirror_set *ms =container_of(work, struct mirror_set,
 					    kmirrord_work);
-	struct bio_list reads, writes;
+	struct bio_list reads, writes, failures;
+	unsigned long flags;
 
-	spin_lock(&ms->lock);
+	spin_lock_irqsave(&ms->lock, flags);
 	reads = ms->reads;
 	writes = ms->writes;
+	failures = ms->failures;
 	bio_list_init(&ms->reads);
 	bio_list_init(&ms->writes);
-	spin_unlock(&ms->lock);
+	bio_list_init(&ms->failures);
+	spin_unlock_irqrestore(&ms->lock, flags);
 
 	rh_update_states(&ms->rh);
 	do_recovery(ms);
 	do_reads(ms, &reads);
 	do_writes(ms, &writes);
+	do_failures(ms, &failures);
+
+	return (ms->failures.head) ? 1 : 0;
+}
+
+static void do_mirror(struct work_struct *work)
+{
+	/*
+	 * If _do_mirror returns 1, we give it
+	 * another shot.  This helps for cases like
+	 * 'suspend' where we call flush_workqueue
+	 * and expect all work to be finished.  If
+	 * a failure happens during a suspend, we
+	 * couldn't issue a 'wake' because it would
+	 * not be honored.  Therefore, we return '1'
+	 * from _do_mirror, and retry here.
+	 */
+	while (_do_mirror(work))
+		schedule();
 }
 
+
 /*-----------------------------------------------------------------
  * Target functions
  *---------------------------------------------------------------*/
@@ -965,7 +1159,7 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
 	ms->nr_mirrors = nr_mirrors;
 	ms->nr_regions = dm_sector_div_up(ti->len, region_size);
 	ms->in_sync = 0;
-	ms->default_mirror = &ms->mirror[DEFAULT_MIRROR];
+	atomic_set(&ms->default_mirror, DEFAULT_MIRROR);
 
 	ms->io_client = dm_io_client_create(DM_IO_PAGES);
 	if (IS_ERR(ms->io_client)) {
@@ -1019,6 +1213,8 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
 	}
 
 	ms->mirror[mirror].ms = ms;
+	atomic_set(&(ms->mirror[mirror].error_count), 0);
+	ms->mirror[mirror].error_type = 0;
 	ms->mirror[mirror].offset = offset;
 
 	return 0;
@@ -1171,6 +1367,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto err_free_context;
 	}
 	INIT_WORK(&ms->kmirrord_work, do_mirror);
+	INIT_WORK(&ms->trigger_event, trigger_event);
 
 	r = parse_features(ms, argc, argv, &args_used);
 	if (r)
@@ -1220,14 +1417,15 @@ static void mirror_dtr(struct dm_target *ti)
 
 static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw)
 {
+	unsigned long flags;
 	int should_wake = 0;
 	struct bio_list *bl;
 
 	bl = (rw == WRITE) ? &ms->writes : &ms->reads;
-	spin_lock(&ms->lock);
+	spin_lock_irqsave(&ms->lock, flags);
 	should_wake = !(bl->head);
 	bio_list_add(bl, bio);
-	spin_unlock(&ms->lock);
+	spin_unlock_irqrestore(&ms->lock, flags);
 
 	if (should_wake)
 		wake(ms);
-- 
cgit v1.2.3


From 8f0205b798f926e2745de5fdebf0a8605c621de6 Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Fri, 8 Feb 2008 02:11:32 +0000
Subject: dm raid1: handle recovery failures

This patch adds the calls to 'fail_mirror' if an error occurs during
mirror recovery (aka resynchronization).  'fail_mirror' is responsible
for recording the type of error by mirror device and ensuring an event
gets raised for the purpose of notifying userspace.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-raid1.c | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 4e1e04dbc4ab..9978b9f07fe9 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -737,15 +737,32 @@ out:
 static void recovery_complete(int read_err, unsigned int write_err,
 			      void *context)
 {
-	struct region *reg = (struct region *) context;
+	struct region *reg = (struct region *)context;
+	struct mirror_set *ms = reg->rh->ms;
+	int m, bit = 0;
 
-	if (read_err)
+	if (read_err) {
 		/* Read error means the failure of default mirror. */
 		DMERR_LIMIT("Unable to read primary mirror during recovery");
+		fail_mirror(get_default_mirror(ms), DM_RAID1_SYNC_ERROR);
+	}
 
-	if (write_err)
+	if (write_err) {
 		DMERR_LIMIT("Write error during recovery (error = 0x%x)",
 			    write_err);
+		/*
+		 * Bits correspond to devices (excluding default mirror).
+		 * The default mirror cannot change during recovery.
+		 */
+		for (m = 0; m < ms->nr_mirrors; m++) {
+			if (&ms->mirror[m] == get_default_mirror(ms))
+				continue;
+			if (test_bit(bit, &write_err))
+				fail_mirror(ms->mirror + m,
+					    DM_RAID1_SYNC_ERROR);
+			bit++;
+		}
+	}
 
 	rh_recovery_end(reg, !(read_err || write_err));
 }
-- 
cgit v1.2.3


From b80aa7a0c268d3ae0c472f648af1e3e4a359765c Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Fri, 8 Feb 2008 02:11:35 +0000
Subject: dm raid1: fix EIO after log failure

This patch adds the ability to requeue write I/O to
core device-mapper when there is a log device failure.

If a write to the log produces an error, the pending writes are
put on the "failures" list.  Since the log is marked as failed,
they will stay on the failures list until a suspend happens.

Suspends come in two phases, presuspend and postsuspend.  We must
make sure that all the writes on the failures list are requeued
in the presuspend phase (a requirement of dm core).  This means
that recovery must be complete (because writes may be delayed
behind it) and the failures list must be requeued before we
return from presuspend.

The mechanisms to ensure recovery is complete (or stopped) were
already in place, but needed to be moved from postsuspend to
presuspend.  We rely on 'flush_workqueue' to ensure that the
mirror thread is complete and therefore, has requeued all writes
in the failures list.

Because we are using flush_workqueue, we must ensure that no
additional 'queue_work' calls will produce additional I/O
that we need to requeue (because once we return from
presuspend, we are unable to do anything about it).  'queue_work'
is called in response to the following functions:
- complete_resync_work = NA, recovery is stopped
- rh_dec (mirror_end_io) = NA, only calls 'queue_work' if it
                           is ready to recover the region
                           (recovery is stopped) or it needs
                           to clear the region in the log*
                           **this doesn't get called while
                           suspending**
- rh_recovery_end = NA, recovery is stopped
- rh_recovery_start = NA, recovery is stopped
- write_callback = 1) Writes w/o failures simply call
                   bio_endio -> mirror_end_io -> rh_dec
                   (see rh_dec above)
                   2) Writes with failures are put on
                   the failures list and queue_work is
                   called**
                   ** write_callbacks don't happen
                   during suspend **
- do_failures = NA, 'queue_work' not called if suspending
- add_mirror (initialization) = NA, only done on mirror creation
- queue_bio = NA, 1) delayed I/O scheduled before flush_workqueue
              is called.  2) No more I/Os are being issued.
              3) Re-attempted READs can still be handled.
              (Write completions are handled through rh_dec/
              write_callback - mentioned above - and do not
              use queue_bio.)

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-raid1.c | 101 ++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 90 insertions(+), 11 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 9978b9f07fe9..ec6d675bf766 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -146,6 +146,7 @@ struct mirror_set {
 	region_t nr_regions;
 	int in_sync;
 	int log_failure;
+	atomic_t suspend;
 
 	atomic_t default_mirror;	/* Default mirror */
 
@@ -372,6 +373,16 @@ static void complete_resync_work(struct region *reg, int success)
 	struct region_hash *rh = reg->rh;
 
 	rh->log->type->set_region_sync(rh->log, reg->key, success);
+
+	/*
+	 * Dispatch the bios before we call 'wake_up_all'.
+	 * This is important because if we are suspending,
+	 * we want to know that recovery is complete and
+	 * the work queue is flushed.  If we wake_up_all
+	 * before we dispatch_bios (queue bios and call wake()),
+	 * then we risk suspending before the work queue
+	 * has been properly flushed.
+	 */
 	dispatch_bios(rh->ms, &reg->delayed_bios);
 	if (atomic_dec_and_test(&rh->recovery_in_flight))
 		wake_up_all(&_kmirrord_recovery_stopped);
@@ -1069,11 +1080,13 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
 	/*
 	 * Dispatch io.
 	 */
-	if (unlikely(ms->log_failure))
+	if (unlikely(ms->log_failure)) {
+		spin_lock_irq(&ms->lock);
+		bio_list_merge(&ms->failures, &sync);
+		spin_unlock_irq(&ms->lock);
+	} else
 		while ((bio = bio_list_pop(&sync)))
-			bio_endio(bio, -EIO);
-	else while ((bio = bio_list_pop(&sync)))
-		do_write(ms, bio);
+			do_write(ms, bio);
 
 	while ((bio = bio_list_pop(&recover)))
 		rh_delay(&ms->rh, bio);
@@ -1091,8 +1104,46 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures)
 	if (!failures->head)
 		return;
 
-	while ((bio = bio_list_pop(failures)))
-		__bio_mark_nosync(ms, bio, bio->bi_size, 0);
+	if (!ms->log_failure) {
+		while ((bio = bio_list_pop(failures)))
+			__bio_mark_nosync(ms, bio, bio->bi_size, 0);
+		return;
+	}
+
+	/*
+	 * If the log has failed, unattempted writes are being
+	 * put on the failures list.  We can't issue those writes
+	 * until a log has been marked, so we must store them.
+	 *
+	 * If a 'noflush' suspend is in progress, we can requeue
+	 * the I/O's to the core.  This give userspace a chance
+	 * to reconfigure the mirror, at which point the core
+	 * will reissue the writes.  If the 'noflush' flag is
+	 * not set, we have no choice but to return errors.
+	 *
+	 * Some writes on the failures list may have been
+	 * submitted before the log failure and represent a
+	 * failure to write to one of the devices.  It is ok
+	 * for us to treat them the same and requeue them
+	 * as well.
+	 */
+	if (dm_noflush_suspending(ms->ti)) {
+		while ((bio = bio_list_pop(failures)))
+			bio_endio(bio, DM_ENDIO_REQUEUE);
+		return;
+	}
+
+	if (atomic_read(&ms->suspend)) {
+		while ((bio = bio_list_pop(failures)))
+			bio_endio(bio, -EIO);
+		return;
+	}
+
+	spin_lock_irq(&ms->lock);
+	bio_list_merge(&ms->failures, failures);
+	spin_unlock_irq(&ms->lock);
+
+	wake(ms);
 }
 
 static void trigger_event(struct work_struct *work)
@@ -1176,6 +1227,8 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
 	ms->nr_mirrors = nr_mirrors;
 	ms->nr_regions = dm_sector_div_up(ti->len, region_size);
 	ms->in_sync = 0;
+	ms->log_failure = 0;
+	atomic_set(&ms->suspend, 0);
 	atomic_set(&ms->default_mirror, DEFAULT_MIRROR);
 
 	ms->io_client = dm_io_client_create(DM_IO_PAGES);
@@ -1511,26 +1564,51 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
 	return 0;
 }
 
-static void mirror_postsuspend(struct dm_target *ti)
+static void mirror_presuspend(struct dm_target *ti)
 {
 	struct mirror_set *ms = (struct mirror_set *) ti->private;
 	struct dirty_log *log = ms->rh.log;
 
+	atomic_set(&ms->suspend, 1);
+
+	/*
+	 * We must finish up all the work that we've
+	 * generated (i.e. recovery work).
+	 */
 	rh_stop_recovery(&ms->rh);
 
-	/* Wait for all I/O we generated to complete */
 	wait_event(_kmirrord_recovery_stopped,
 		   !atomic_read(&ms->rh.recovery_in_flight));
 
+	if (log->type->presuspend && log->type->presuspend(log))
+		/* FIXME: need better error handling */
+		DMWARN("log presuspend failed");
+
+	/*
+	 * Now that recovery is complete/stopped and the
+	 * delayed bios are queued, we need to wait for
+	 * the worker thread to complete.  This way,
+	 * we know that all of our I/O has been pushed.
+	 */
+	flush_workqueue(ms->kmirrord_wq);
+}
+
+static void mirror_postsuspend(struct dm_target *ti)
+{
+	struct mirror_set *ms = ti->private;
+	struct dirty_log *log = ms->rh.log;
+
 	if (log->type->postsuspend && log->type->postsuspend(log))
 		/* FIXME: need better error handling */
-		DMWARN("log suspend failed");
+		DMWARN("log postsuspend failed");
 }
 
 static void mirror_resume(struct dm_target *ti)
 {
-	struct mirror_set *ms = (struct mirror_set *) ti->private;
+	struct mirror_set *ms = ti->private;
 	struct dirty_log *log = ms->rh.log;
+
+	atomic_set(&ms->suspend, 0);
 	if (log->type->resume && log->type->resume(log))
 		/* FIXME: need better error handling */
 		DMWARN("log resume failed");
@@ -1564,7 +1642,7 @@ static int mirror_status(struct dm_target *ti, status_type_t type,
 		DMEMIT("%d", ms->nr_mirrors);
 		for (m = 0; m < ms->nr_mirrors; m++)
 			DMEMIT(" %s %llu", ms->mirror[m].dev->name,
-				(unsigned long long)ms->mirror[m].offset);
+			       (unsigned long long)ms->mirror[m].offset);
 
 		if (ms->features & DM_RAID1_HANDLE_ERRORS)
 			DMEMIT(" 1 handle_errors");
@@ -1581,6 +1659,7 @@ static struct target_type mirror_target = {
 	.dtr	 = mirror_dtr,
 	.map	 = mirror_map,
 	.end_io	 = mirror_end_io,
+	.presuspend = mirror_presuspend,
 	.postsuspend = mirror_postsuspend,
 	.resume	 = mirror_resume,
 	.status	 = mirror_status,
-- 
cgit v1.2.3


From 06386bbfd2441416875d0403d405c56822f6ebac Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Fri, 8 Feb 2008 02:11:37 +0000
Subject: dm raid1: handle read failures

This patch gives the ability to respond-to/record device failures
that happen during read operations.  It also adds the ability to
read from mirror devices that are not the primary if they are
in-sync.

There are essentially two read paths in mirroring; the direct path
and the queued path.  When a read request is mapped, if the region
is 'in-sync' the direct path is taken; otherwise the queued path
is taken.

If the direct path is taken, we must record bio information so that
if the read fails we can retry it.  We then discover the status of
a direct read through mirror_end_io.  If the read has failed, we will
mark the device from which the read was attempted as failed (so we
don't try to read from it again), restore the bio and try again.

If the queued path is taken, we discover the results of the read
from 'read_callback'.  If the device failed, we will mark the device
as failed and attempt the read again if there is another device
where this region is known to be 'in-sync'.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-raid1.c | 256 +++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 211 insertions(+), 45 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index ec6d675bf766..38efa7071dd7 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -6,6 +6,7 @@
 
 #include "dm.h"
 #include "dm-bio-list.h"
+#include "dm-bio-record.h"
 #include "dm-io.h"
 #include "dm-log.h"
 #include "kcopyd.h"
@@ -141,6 +142,7 @@ struct mirror_set {
 	struct bio_list failures;
 
 	struct dm_io_client *io_client;
+	mempool_t *read_record_pool;
 
 	/* recovery */
 	region_t nr_regions;
@@ -647,24 +649,30 @@ static void rh_start_recovery(struct region_hash *rh)
 	wake(rh->ms);
 }
 
+#define MIN_READ_RECORDS 20
+struct dm_raid1_read_record {
+	struct mirror *m;
+	struct dm_bio_details details;
+};
+
 /*
  * Every mirror should look like this one.
  */
 #define DEFAULT_MIRROR 0
 
 /*
- * This is yucky.  We squirrel the mirror_set struct away inside
- * bi_next for write buffers.  This is safe since the bh
+ * This is yucky.  We squirrel the mirror struct away inside
+ * bi_next for read/write buffers.  This is safe since the bh
  * doesn't get submitted to the lower levels of block layer.
  */
-static struct mirror_set *bio_get_ms(struct bio *bio)
+static struct mirror *bio_get_m(struct bio *bio)
 {
-	return (struct mirror_set *) bio->bi_next;
+	return (struct mirror *) bio->bi_next;
 }
 
-static void bio_set_ms(struct bio *bio, struct mirror_set *ms)
+static void bio_set_m(struct bio *bio, struct mirror *m)
 {
-	bio->bi_next = (struct bio *) ms;
+	bio->bi_next = (struct bio *) m;
 }
 
 static struct mirror *get_default_mirror(struct mirror_set *ms)
@@ -857,17 +865,105 @@ static void do_recovery(struct mirror_set *ms)
  *---------------------------------------------------------------*/
 static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector)
 {
-	/* FIXME: add read balancing */
-	return get_default_mirror(ms);
+	struct mirror *m = get_default_mirror(ms);
+
+	do {
+		if (likely(!atomic_read(&m->error_count)))
+			return m;
+
+		if (m-- == ms->mirror)
+			m += ms->nr_mirrors;
+	} while (m != get_default_mirror(ms));
+
+	return NULL;
+}
+
+static int default_ok(struct mirror *m)
+{
+	struct mirror *default_mirror = get_default_mirror(m->ms);
+
+	return !atomic_read(&default_mirror->error_count);
+}
+
+static int mirror_available(struct mirror_set *ms, struct bio *bio)
+{
+	region_t region = bio_to_region(&ms->rh, bio);
+
+	if (ms->rh.log->type->in_sync(ms->rh.log, region, 0))
+		return choose_mirror(ms,  bio->bi_sector) ? 1 : 0;
+
+	return 0;
 }
 
 /*
  * remap a buffer to a particular mirror.
  */
-static void map_bio(struct mirror_set *ms, struct mirror *m, struct bio *bio)
+static sector_t map_sector(struct mirror *m, struct bio *bio)
+{
+	return m->offset + (bio->bi_sector - m->ms->ti->begin);
+}
+
+static void map_bio(struct mirror *m, struct bio *bio)
 {
 	bio->bi_bdev = m->dev->bdev;
-	bio->bi_sector = m->offset + (bio->bi_sector - ms->ti->begin);
+	bio->bi_sector = map_sector(m, bio);
+}
+
+static void map_region(struct io_region *io, struct mirror *m,
+		       struct bio *bio)
+{
+	io->bdev = m->dev->bdev;
+	io->sector = map_sector(m, bio);
+	io->count = bio->bi_size >> 9;
+}
+
+/*-----------------------------------------------------------------
+ * Reads
+ *---------------------------------------------------------------*/
+static void read_callback(unsigned long error, void *context)
+{
+	struct bio *bio = context;
+	struct mirror *m;
+
+	m = bio_get_m(bio);
+	bio_set_m(bio, NULL);
+
+	if (likely(!error)) {
+		bio_endio(bio, 0);
+		return;
+	}
+
+	fail_mirror(m, DM_RAID1_READ_ERROR);
+
+	if (likely(default_ok(m)) || mirror_available(m->ms, bio)) {
+		DMWARN_LIMIT("Read failure on mirror device %s.  "
+			     "Trying alternative device.",
+			     m->dev->name);
+		queue_bio(m->ms, bio, bio_rw(bio));
+		return;
+	}
+
+	DMERR_LIMIT("Read failure on mirror device %s.  Failing I/O.",
+		    m->dev->name);
+	bio_endio(bio, -EIO);
+}
+
+/* Asynchronous read. */
+static void read_async_bio(struct mirror *m, struct bio *bio)
+{
+	struct io_region io;
+	struct dm_io_request io_req = {
+		.bi_rw = READ,
+		.mem.type = DM_IO_BVEC,
+		.mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx,
+		.notify.fn = read_callback,
+		.notify.context = bio,
+		.client = m->ms->io_client,
+	};
+
+	map_region(&io, m, bio);
+	bio_set_m(bio, m);
+	(void) dm_io(&io_req, 1, &io, NULL);
 }
 
 static void do_reads(struct mirror_set *ms, struct bio_list *reads)
@@ -878,17 +974,20 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads)
 
 	while ((bio = bio_list_pop(reads))) {
 		region = bio_to_region(&ms->rh, bio);
+		m = get_default_mirror(ms);
 
 		/*
 		 * We can only read balance if the region is in sync.
 		 */
-		if (rh_in_sync(&ms->rh, region, 1))
+		if (likely(rh_in_sync(&ms->rh, region, 1)))
 			m = choose_mirror(ms, bio->bi_sector);
-		else
-			m = get_default_mirror(ms);
+		else if (m && atomic_read(&m->error_count))
+			m = NULL;
 
-		map_bio(ms, m, bio);
-		generic_make_request(bio);
+		if (likely(m))
+			read_async_bio(m, bio);
+		else
+			bio_endio(bio, -EIO);
 	}
 }
 
@@ -964,8 +1063,8 @@ static void write_callback(unsigned long error, void *context)
 	int should_wake = 0;
 	unsigned long flags;
 
-	ms = bio_get_ms(bio);
-	bio_set_ms(bio, NULL);
+	ms = bio_get_m(bio)->ms;
+	bio_set_m(bio, NULL);
 
 	/*
 	 * NOTE: We don't decrement the pending count here,
@@ -1008,7 +1107,7 @@ out:
 static void do_write(struct mirror_set *ms, struct bio *bio)
 {
 	unsigned int i;
-	struct io_region io[KCOPYD_MAX_REGIONS+1];
+	struct io_region io[ms->nr_mirrors], *dest = io;
 	struct mirror *m;
 	struct dm_io_request io_req = {
 		.bi_rw = WRITE,
@@ -1019,15 +1118,14 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
 		.client = ms->io_client,
 	};
 
-	for (i = 0; i < ms->nr_mirrors; i++) {
-		m = ms->mirror + i;
-
-		io[i].bdev = m->dev->bdev;
-		io[i].sector = m->offset + (bio->bi_sector - ms->ti->begin);
-		io[i].count = bio->bi_size >> 9;
-	}
+	for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++)
+		map_region(dest++, m, bio);
 
-	bio_set_ms(bio, ms);
+	/*
+	 * Use default mirror because we only need it to retrieve the reference
+	 * to the mirror set in write_callback().
+	 */
+	bio_set_m(bio, get_default_mirror(ms));
 
 	(void) dm_io(&io_req, ms->nr_mirrors, io, NULL);
 }
@@ -1092,7 +1190,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
 		rh_delay(&ms->rh, bio);
 
 	while ((bio = bio_list_pop(&nosync))) {
-		map_bio(ms, get_default_mirror(ms), bio);
+		map_bio(get_default_mirror(ms), bio);
 		generic_make_request(bio);
 	}
 }
@@ -1231,9 +1329,19 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
 	atomic_set(&ms->suspend, 0);
 	atomic_set(&ms->default_mirror, DEFAULT_MIRROR);
 
+	len = sizeof(struct dm_raid1_read_record);
+	ms->read_record_pool = mempool_create_kmalloc_pool(MIN_READ_RECORDS,
+							   len);
+	if (!ms->read_record_pool) {
+		ti->error = "Error creating mirror read_record_pool";
+		kfree(ms);
+		return NULL;
+	}
+
 	ms->io_client = dm_io_client_create(DM_IO_PAGES);
 	if (IS_ERR(ms->io_client)) {
 		ti->error = "Error creating dm_io client";
+		mempool_destroy(ms->read_record_pool);
 		kfree(ms);
  		return NULL;
 	}
@@ -1241,6 +1349,7 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
 	if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) {
 		ti->error = "Error creating dirty region hash";
 		dm_io_client_destroy(ms->io_client);
+		mempool_destroy(ms->read_record_pool);
 		kfree(ms);
 		return NULL;
 	}
@@ -1256,6 +1365,7 @@ static void free_context(struct mirror_set *ms, struct dm_target *ti,
 
 	dm_io_client_destroy(ms->io_client);
 	rh_exit(&ms->rh);
+	mempool_destroy(ms->read_record_pool);
 	kfree(ms);
 }
 
@@ -1510,10 +1620,11 @@ static int mirror_map(struct dm_target *ti, struct bio *bio,
 	int r, rw = bio_rw(bio);
 	struct mirror *m;
 	struct mirror_set *ms = ti->private;
-
-	map_context->ll = bio_to_region(&ms->rh, bio);
+	struct dm_raid1_read_record *read_record = NULL;
 
 	if (rw == WRITE) {
+		/* Save region for mirror_end_io() handler */
+		map_context->ll = bio_to_region(&ms->rh, bio);
 		queue_bio(ms, bio, rw);
 		return DM_MAPIO_SUBMITTED;
 	}
@@ -1523,28 +1634,34 @@ static int mirror_map(struct dm_target *ti, struct bio *bio,
 	if (r < 0 && r != -EWOULDBLOCK)
 		return r;
 
-	if (r == -EWOULDBLOCK)	/* FIXME: ugly */
-		r = DM_MAPIO_SUBMITTED;
-
 	/*
-	 * We don't want to fast track a recovery just for a read
-	 * ahead.  So we just let it silently fail.
-	 * FIXME: get rid of this.
+	 * If region is not in-sync queue the bio.
 	 */
-	if (!r && rw == READA)
-		return -EIO;
+	if (!r || (r == -EWOULDBLOCK)) {
+		if (rw == READA)
+			return -EWOULDBLOCK;
 
-	if (!r) {
-		/* Pass this io over to the daemon */
 		queue_bio(ms, bio, rw);
 		return DM_MAPIO_SUBMITTED;
 	}
 
+	/*
+	 * The region is in-sync and we can perform reads directly.
+	 * Store enough information so we can retry if it fails.
+	 */
 	m = choose_mirror(ms, bio->bi_sector);
-	if (!m)
+	if (unlikely(!m))
 		return -EIO;
 
-	map_bio(ms, m, bio);
+	read_record = mempool_alloc(ms->read_record_pool, GFP_NOIO);
+	if (likely(read_record)) {
+		dm_bio_record(&read_record->details, bio);
+		map_context->ptr = read_record;
+		read_record->m = m;
+	}
+
+	map_bio(m, bio);
+
 	return DM_MAPIO_REMAPPED;
 }
 
@@ -1553,15 +1670,64 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
 {
 	int rw = bio_rw(bio);
 	struct mirror_set *ms = (struct mirror_set *) ti->private;
-	region_t region = map_context->ll;
+	struct mirror *m = NULL;
+	struct dm_bio_details *bd = NULL;
+	struct dm_raid1_read_record *read_record = map_context->ptr;
 
 	/*
 	 * We need to dec pending if this was a write.
 	 */
-	if (rw == WRITE)
-		rh_dec(&ms->rh, region);
+	if (rw == WRITE) {
+		rh_dec(&ms->rh, map_context->ll);
+		return error;
+	}
 
-	return 0;
+	if (error == -EOPNOTSUPP)
+		goto out;
+
+	if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
+		goto out;
+
+	if (unlikely(error)) {
+		if (!read_record) {
+			/*
+			 * There wasn't enough memory to record necessary
+			 * information for a retry or there was no other
+			 * mirror in-sync.  'm' is still NULL: don't use it.
+			 */
+			DMERR_LIMIT("Mirror read failed.");
+
+			return -EIO;
+		}
+		/* Look up the failed mirror before naming it below. */
+		m = read_record->m;
+		DMERR("Mirror read failed from %s. Trying alternative device.",
+		      m->dev->name);
+		fail_mirror(m, DM_RAID1_READ_ERROR);
+
+		/*
+		 * A failed read is requeued for another attempt using an intact
+		 * mirror.
+		 */
+		if (default_ok(m) || mirror_available(ms, bio)) {
+			bd = &read_record->details;
+
+			dm_bio_restore(bd, bio);
+			mempool_free(read_record, ms->read_record_pool);
+			map_context->ptr = NULL;
+			queue_bio(ms, bio, rw);
+			return 1;
+		}
+		DMERR("All replicated volumes dead, failing I/O");
+	}
+
+out:
+	if (read_record) {
+		mempool_free(read_record, ms->read_record_pool);
+		map_context->ptr = NULL;
+	}
+
+	return error;
 }
 
 static void mirror_presuspend(struct dm_target *ti)
-- 
cgit v1.2.3


From af195ac82e38ba802fd86b5a014ed05ef6dd88bb Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Fri, 8 Feb 2008 02:11:39 +0000
Subject: dm raid1: report fault status

This patch adds extra information to the mirror status output, so that
it can be determined which device(s) have failed.  For each mirror device,
a character is printed indicating the most severe error encountered.  The
characters are:
 *    A => Alive - No failures
 *    D => Dead - A write failure occurred leaving mirror out-of-sync
 *    S => Sync - A synchronization failure occurred, mirror out-of-sync
 *    R => Read - A read failure occurred, mirror data unaffected
This allows userspace to properly reconfigure the mirror set.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-raid1.c | 44 ++++++++++++++++++++++++++++++++++++--------
 1 file changed, 36 insertions(+), 8 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 38efa7071dd7..edc057f5cdcc 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1781,29 +1781,57 @@ static void mirror_resume(struct dm_target *ti)
 	rh_start_recovery(&ms->rh);
 }
 
+/*
+ * device_status_char
+ * @m: mirror device/leg we want the status of
+ *
+ * We return one character representing the most severe error
+ * we have encountered.
+ *    A => Alive - No failures
+ *    D => Dead - A write failure occurred leaving mirror out-of-sync
+ *    S => Sync - A synchronization failure occurred, mirror out-of-sync
+ *    R => Read - A read failure occurred, mirror data unaffected
+ *
+ * Returns: <char>
+ */
+static char device_status_char(struct mirror *m)
+{
+	if (!atomic_read(&(m->error_count)))
+		return 'A';
+
+	return (test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' :
+		(test_bit(DM_RAID1_SYNC_ERROR, &(m->error_type))) ? 'S' :
+		(test_bit(DM_RAID1_READ_ERROR, &(m->error_type))) ? 'R' : 'U';
+}
+
+
 static int mirror_status(struct dm_target *ti, status_type_t type,
 			 char *result, unsigned int maxlen)
 {
 	unsigned int m, sz = 0;
 	struct mirror_set *ms = (struct mirror_set *) ti->private;
+	struct dirty_log *log = ms->rh.log;
+	char buffer[ms->nr_mirrors + 1];
 
 	switch (type) {
 	case STATUSTYPE_INFO:
 		DMEMIT("%d ", ms->nr_mirrors);
-		for (m = 0; m < ms->nr_mirrors; m++)
+		for (m = 0; m < ms->nr_mirrors; m++) {
 			DMEMIT("%s ", ms->mirror[m].dev->name);
+			buffer[m] = device_status_char(&(ms->mirror[m]));
+		}
+		buffer[m] = '\0';
 
-		DMEMIT("%llu/%llu 0 ",
-			(unsigned long long)ms->rh.log->type->
-				get_sync_count(ms->rh.log),
-			(unsigned long long)ms->nr_regions);
+		DMEMIT("%llu/%llu 1 %s ",
+		      (unsigned long long)log->type->get_sync_count(ms->rh.log),
+		      (unsigned long long)ms->nr_regions, buffer);
 
-		sz += ms->rh.log->type->status(ms->rh.log, type, result+sz, maxlen-sz);
+		sz += log->type->status(ms->rh.log, type, result+sz, maxlen-sz);
 
 		break;
 
 	case STATUSTYPE_TABLE:
-		sz = ms->rh.log->type->status(ms->rh.log, type, result, maxlen);
+		sz = log->type->status(ms->rh.log, type, result, maxlen);
 
 		DMEMIT("%d", ms->nr_mirrors);
 		for (m = 0; m < ms->nr_mirrors; m++)
@@ -1819,7 +1847,7 @@ static int mirror_status(struct dm_target *ti, status_type_t type,
 
 static struct target_type mirror_target = {
 	.name	 = "mirror",
-	.version = {1, 0, 3},
+	.version = {1, 0, 20},
 	.module	 = THIS_MODULE,
 	.ctr	 = mirror_ctr,
 	.dtr	 = mirror_dtr,
-- 
cgit v1.2.3