From 6bcfd601861cce45ca73ac1d714f1286b6b3f0d4 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 23 May 2008 13:04:34 -0700
Subject: md: kill file_path wrapper

Kill the trivial and rather pointless file_path wrapper around d_path.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/md/md.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers/md/md.c')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 83eb78b00137..d31aa6f33a6a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3987,8 +3987,8 @@ static int get_bitmap_file(mddev_t * mddev, void __user * arg)
 	if (!buf)
 		goto out;
 
-	ptr = file_path(mddev->bitmap->file, buf, sizeof(file->pathname));
-	if (!ptr)
+	ptr = d_path(&mddev->bitmap->file->f_path, buf, sizeof(file->pathname));
+	if (IS_ERR(ptr))
 		goto out;
 
 	strcpy(file->pathname, ptr);
-- 
cgit v1.2.3


From 09a44cc15079f80c1416cde1a1d5b2cdd8f2118a Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 23 May 2008 13:04:36 -0700
Subject: md: notify userspace on 'write-pending' changes to array_state

When an array enters write pending, 'array_state' changes, so we must be
sure to sysfs_notify.

Also, when waiting for user-space to acknowledge 'write-pending' by
marking the metadata as dirty, we don't want to wait for MD_CHANGE_DEVS to
be cleared as that might not happen.  So explicity test for the bits that
we are really interested in.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/md/md.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'drivers/md/md.c')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index d31aa6f33a6a..52f9865096c6 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5435,8 +5435,11 @@ void md_write_start(mddev_t *mddev, struct bio *bi)
 			md_wakeup_thread(mddev->thread);
 		}
 		spin_unlock_irq(&mddev->write_lock);
+		sysfs_notify(&mddev->kobj, NULL, "array_state");
 	}
-	wait_event(mddev->sb_wait, mddev->flags==0);
+	wait_event(mddev->sb_wait,
+		   !test_bit(MD_CHANGE_CLEAN, &mddev->flags) &&
+		   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
 }
 
 void md_write_end(mddev_t *mddev)
@@ -5471,6 +5474,12 @@ void md_allow_write(mddev_t *mddev)
 			mddev->safemode = 1;
 		spin_unlock_irq(&mddev->write_lock);
 		md_update_sb(mddev, 0);
+
+		sysfs_notify(&mddev->kobj, NULL, "array_state");
+		/* wait for the dirty state to be recorded in the metadata */
+		wait_event(mddev->sb_wait,
+			   !test_bit(MD_CHANGE_CLEAN, &mddev->flags) &&
+			   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
 	} else
 		spin_unlock_irq(&mddev->write_lock);
 }
-- 
cgit v1.2.3


From 4f54b0e9485644a3c5fca2ae43bcbe7376825747 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 23 May 2008 13:04:37 -0700
Subject: md: notify userspace on 'stop' events

This additional notification to 'array_state' is needed to allow the
monitor application to learn about stop events via sysfs.  The
sysfs_notify("sync_action") call that comes at the end of do_md_stop()
(via md_new_event) is insufficient since the 'sync_action' attribute has
been removed by this point.

(Seems like a sysfs-notify-on-removal patch is a better fix.  Currently
removal updates the event count but does not wake up waiters)

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/md/md.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'drivers/md/md.c')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 52f9865096c6..e57213dacd25 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3691,6 +3691,8 @@ static int do_md_stop(mddev_t * mddev, int mode)
 
 			module_put(mddev->pers->owner);
 			mddev->pers = NULL;
+			/* tell userspace to handle 'inactive' */
+			sysfs_notify(&mddev->kobj, NULL, "array_state");
 
 			set_capacity(disk, 0);
 			mddev->changed = 1;
-- 
cgit v1.2.3


From 90b08710e41a07d4ff0fb8940dcce3a552991a56 Mon Sep 17 00:00:00 2001
From: Bernd Schubert <bs@q-leap.de>
Date: Fri, 23 May 2008 13:04:38 -0700
Subject: md: allow parallel resync of md-devices.

In some configurations, a raid6 resync can be limited by CPU speed
(Calculating P and Q and moving data) rather than by device speed.  In
these cases there is nothing to be gained byt serialising resync of arrays
that share a device, and doing the resync in parallel can provide benefit.
 So add a sysfs tunable to flag an array as being allowed to resync in
parallel with other arrays that use (a different part of) the same device.

Signed-off-by: Bernd Schubert <bs@q-leap.de>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/md/md.c | 40 ++++++++++++++++++++++++++++++++++++----
 1 file changed, 36 insertions(+), 4 deletions(-)

(limited to 'drivers/md/md.c')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index e57213dacd25..295be1a68806 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -74,6 +74,8 @@ static DEFINE_SPINLOCK(pers_lock);
 
 static void md_print_devices(void);
 
+static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
+
 #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
 
 /*
@@ -3012,6 +3014,36 @@ degraded_show(mddev_t *mddev, char *page)
 }
 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
 
+static ssize_t
+sync_force_parallel_show(mddev_t *mddev, char *page)
+{
+	return sprintf(page, "%d\n", mddev->parallel_resync);
+}
+
+static ssize_t
+sync_force_parallel_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	long n;
+
+	if (strict_strtol(buf, 10, &n))
+		return -EINVAL;
+
+	if (n != 0 && n != 1)
+		return -EINVAL;
+
+	mddev->parallel_resync = n;
+
+	if (mddev->sync_thread)
+		wake_up(&resync_wait);
+
+	return len;
+}
+
+/* force parallel resync, even with shared block devices */
+static struct md_sysfs_entry md_sync_force_parallel =
+__ATTR(sync_force_parallel, S_IRUGO|S_IWUSR,
+       sync_force_parallel_show, sync_force_parallel_store);
+
 static ssize_t
 sync_speed_show(mddev_t *mddev, char *page)
 {
@@ -3187,6 +3219,7 @@ static struct attribute *md_redundancy_attrs[] = {
 	&md_sync_min.attr,
 	&md_sync_max.attr,
 	&md_sync_speed.attr,
+	&md_sync_force_parallel.attr,
 	&md_sync_completed.attr,
 	&md_max_sync.attr,
 	&md_suspend_lo.attr,
@@ -5487,8 +5520,6 @@ void md_allow_write(mddev_t *mddev)
 }
 EXPORT_SYMBOL_GPL(md_allow_write);
 
-static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
-
 #define SYNC_MARKS	10
 #define	SYNC_MARK_STEP	(3*HZ)
 void md_do_sync(mddev_t *mddev)
@@ -5552,8 +5583,9 @@ void md_do_sync(mddev_t *mddev)
 		for_each_mddev(mddev2, tmp) {
 			if (mddev2 == mddev)
 				continue;
-			if (mddev2->curr_resync && 
-			    match_mddev_units(mddev,mddev2)) {
+			if (!mddev->parallel_resync
+			&&  mddev2->curr_resync
+			&&  match_mddev_units(mddev, mddev2)) {
 				DEFINE_WAIT(wq);
 				if (mddev < mddev2 && mddev->curr_resync == 2) {
 					/* arbitrarily yield */
-- 
cgit v1.2.3


From dfc7064500061677720fa26352963c772d3ebe6b Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 23 May 2008 13:04:39 -0700
Subject: md: restart recovery cleanly after device failure.

When we get any IO error during a recovery (rebuilding a spare), we abort
the recovery and restart it.

For RAID6 (and multi-drive RAID1) it may not be best to restart at the
beginning: when multiple failures can be tolerated, the recovery may be
able to continue and re-doing all that has already been done doesn't make
sense.

We already have the infrastructure to record where a recovery is up to
and restart from there, but it is not being used properly.
This is because:
  - We sometimes abort with MD_RECOVERY_ERR rather than just MD_RECOVERY_INTR,
    which causes the recovery not be be checkpointed.
  - We remove spares and then re-added them which loses important state
    information.

The distinction between MD_RECOVERY_ERR and MD_RECOVERY_INTR really isn't
needed.  If there is an error, the relevant drive will be marked as
Faulty, and that is enough to ensure correct handling of the error.  So we
first remove MD_RECOVERY_ERR, changing some of the uses of it to
MD_RECOVERY_INTR.

Then we cause the attempt to remove a non-faulty device from an array to
fail (unless recovery is impossible as the array is too degraded).  Then
when remove_and_add_spares attempts to remove the devices on which
recovery can continue, it will fail, they will remain in place, and
recovery will continue on them as desired.

Issue:  If we are halfway through rebuilding a spare and another drive
fails, and a new spare is immediately available,  do we want to:
 1/ complete the current rebuild, then go back and rebuild the new spare or
 2/ restart the rebuild from the start and rebuild both devices in
    parallel.

Both options can be argued for.  The code currently takes option 2 as
  a/ this requires least code change
  b/ this results in a minimally-degraded array in minimal time.

Cc: "Eivind Sarto" <ivan@kasenna.com>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/md/md.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'drivers/md/md.c')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 295be1a68806..51c19f86ff99 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5434,7 +5434,7 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok)
 	atomic_sub(blocks, &mddev->recovery_active);
 	wake_up(&mddev->recovery_wait);
 	if (!ok) {
-		set_bit(MD_RECOVERY_ERR, &mddev->recovery);
+		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 		md_wakeup_thread(mddev->thread);
 		// stop recovery, signal do_sync ....
 	}
@@ -5690,7 +5690,7 @@ void md_do_sync(mddev_t *mddev)
 		sectors = mddev->pers->sync_request(mddev, j, &skipped,
 						  currspeed < speed_min(mddev));
 		if (sectors == 0) {
-			set_bit(MD_RECOVERY_ERR, &mddev->recovery);
+			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 			goto out;
 		}
 
@@ -5713,8 +5713,7 @@ void md_do_sync(mddev_t *mddev)
 
 		last_check = io_sectors;
 
-		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) ||
-		    test_bit(MD_RECOVERY_ERR, &mddev->recovery))
+		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
 			break;
 
 	repeat:
@@ -5768,8 +5767,7 @@ void md_do_sync(mddev_t *mddev)
 	/* tell personality that we are finished */
 	mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
 
-	if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
-	    !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
+	if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
 	    mddev->curr_resync > 2) {
 		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
 			if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
@@ -5838,7 +5836,10 @@ static int remove_and_add_spares(mddev_t *mddev)
 		}
 
 	if (mddev->degraded) {
-		rdev_for_each(rdev, rtmp, mddev)
+		rdev_for_each(rdev, rtmp, mddev) {
+			if (rdev->raid_disk >= 0 &&
+			    !test_bit(In_sync, &rdev->flags))
+				spares++;
 			if (rdev->raid_disk < 0
 			    && !test_bit(Faulty, &rdev->flags)) {
 				rdev->recovery_offset = 0;
@@ -5856,6 +5857,7 @@ static int remove_and_add_spares(mddev_t *mddev)
 				} else
 					break;
 			}
+		}
 	}
 	return spares;
 }
@@ -5869,7 +5871,7 @@ static int remove_and_add_spares(mddev_t *mddev)
  * to do that as needed.
  * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
  * "->recovery" and create a thread at ->sync_thread.
- * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR)
+ * When the thread finishes it sets MD_RECOVERY_DONE
  * and wakeups up this thread which will reap the thread and finish up.
  * This thread also removes any faulty devices (with nr_pending == 0).
  *
@@ -5944,8 +5946,7 @@ void md_check_recovery(mddev_t *mddev)
 			/* resync has finished, collect result */
 			md_unregister_thread(mddev->sync_thread);
 			mddev->sync_thread = NULL;
-			if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
-			    !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
+			if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
 				/* success...*/
 				/* activate any spares */
 				mddev->pers->spare_active(mddev);
@@ -5969,7 +5970,6 @@ void md_check_recovery(mddev_t *mddev)
 		 * might be left set
 		 */
 		clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
-		clear_bit(MD_RECOVERY_ERR, &mddev->recovery);
 		clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
 		clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
 
-- 
cgit v1.2.3


From a6d8113a986c66aeb379a26b6e0062488b3e59e1 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 5 Jun 2008 22:45:53 -0700
Subject: md: fix uninitialized use of mddev->recovery_wait

If an array was created with --assume-clean we will oops when trying to
set ->resync_max.

Fix this by initializing ->recovery_wait in mddev_find.

Cc: <stable@kernel.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/md/md.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md/md.c')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 51c19f86ff99..7cf512a34ccf 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -276,6 +276,7 @@ static mddev_t * mddev_find(dev_t unit)
 	atomic_set(&new->active, 1);
 	spin_lock_init(&new->write_lock);
 	init_waitqueue_head(&new->sb_wait);
+	init_waitqueue_head(&new->recovery_wait);
 	new->reshape_position = MaxSector;
 	new->resync_max = MaxSector;
 	new->level = LEVEL_NONE;
@@ -5665,7 +5666,6 @@ void md_do_sync(mddev_t *mddev)
 		window/2,(unsigned long long) max_sectors/2);
 
 	atomic_set(&mddev->recovery_active, 0);
-	init_waitqueue_head(&mddev->recovery_wait);
 	last_check = 0;
 
 	if (j>2) {
-- 
cgit v1.2.3