md: avoid a possibility that a read error can wrongly propagate through md/raid1 to a filesystem.

When a raid1 has only one working drive, we want read errors to propagate up to the filesystem, as there is no point failing the last drive in an array. Currently the code that performs this check is racy. If a write and a read are both submitted to a device on a 2-drive raid1, and the write fails followed by the read failing, the read will see that there is only one working drive and will pass the failure up, even though the one working drive is actually the *other* one. So, tighten up the locking. Signed-off-by: Neil Brown <neilb@suse.de> Cc: <stable@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: NeilBrown <neilb@suse.de> 2007-05-10 03:15:50 -0700
committer: Linus Torvalds <torvalds@woody.linux-foundation.org> 2007-05-10 09:26:53 -0700
commit: dd00a99e7a4b739bd41ef4093760efc7e447f963 (patch)
tree: 2bb5bfafc0de89bd00ef530540e91e8297fc5d57
parent: c5ddb547e899993be56dc7d0bf72bfd7a8d4ae1e (diff)
1 files changed, 19 insertions, 14 deletions
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 97ee870b265d..3a95cc5e029c 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -271,21 +271,25 @@ static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int
 	 */
 	update_head_pos(mirror, r1_bio);
 
-	if (uptodate || (conf->raid_disks - conf->mddev->degraded) <= 1) {
-		/*
-		 * Set R1BIO_Uptodate in our master bio, so that
-		 * we will return a good error code for to the higher
-		 * levels even if IO on some other mirrored buffer fails.
-		 *
-		 * The 'master' represents the composite IO operation to
-		 * user-side. So if something waits for IO, then it will
-		 * wait for the 'master' bio.
+	if (uptodate)
+		set_bit(R1BIO_Uptodate, &r1_bio->state);
+	else {
+		/* If all other devices have failed, we want to return
+		 * the error upwards rather than fail the last device.
+		 * Here we redefine "uptodate" to mean "Don't want to retry"
 		 */
-		if (uptodate)
-			set_bit(R1BIO_Uptodate, &r1_bio->state);
+		unsigned long flags;
+		spin_lock_irqsave(&conf->device_lock, flags);
+		if (r1_bio->mddev->degraded == conf->raid_disks ||
+		    (r1_bio->mddev->degraded == conf->raid_disks-1 &&
+		     !test_bit(Faulty, &conf->mirrors[mirror].rdev->flags)))
+			uptodate = 1;
+		spin_unlock_irqrestore(&conf->device_lock, flags);
+	}
 
+	if (uptodate)
 		raid_end_bio_io(r1_bio);
-	} else {
+	else {
 		/*
 		 * oops, read error:
 		 */
@@ -992,13 +996,14 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
 		unsigned long flags;
 		spin_lock_irqsave(&conf->device_lock, flags);
 		mddev->degraded++;
+		set_bit(Faulty, &rdev->flags);
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 		/*
 		 * if recovery is running, make sure it aborts.
 		 */
 		set_bit(MD_RECOVERY_ERR, &mddev->recovery);
-	}
-	set_bit(Faulty, &rdev->flags);
+	} else
+		set_bit(Faulty, &rdev->flags);
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
 	printk(KERN_ALERT "raid1: Disk failure on %s, disabling device. \n"
 		"	Operation continuing on %d devices\n",
author	NeilBrown <neilb@suse.de>	2007-05-10 03:15:50 -0700
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2007-05-10 09:26:53 -0700
commit	dd00a99e7a4b739bd41ef4093760efc7e447f963 (patch)
tree	2bb5bfafc0de89bd00ef530540e91e8297fc5d57
parent	c5ddb547e899993be56dc7d0bf72bfd7a8d4ae1e (diff)