md/raid1: simplify the splitting of requests.

raid1 currently splits requests in two different ways for two different reasons. First, bio_split() is used to ensure the bio fits within a resync accounting region. Second, multiple r1bios are allocated for each bio to handle the possiblity of known bad blocks on some devices. This can be simplified to just use bio_split() once, and not use multiple r1bios. We delay the split until we know a maximum bio size that can be handled with a single r1bio, and then split the bio and queue the remainder for later handling. This avoids all loops inside raid1.c request handling. Just a single read, or a single set of writes, is submitted to lower-level devices for each bio that comes from generic_make_request(). When the bio needs to be split, generic_make_request() will do the necessary looping and call md_make_request() multiple times. raid1_make_request() no longer queues request for raid1 to handle, so we can remove that branch from the 'if'. This patch also creates a new private bio_set (conf->bio_split) for splitting bios. Using fs_bio_set is wrong, as it is meant to be used by filesystems, not block devices. Using it inside md can lead to deadlocks under high memory pressure. Delete unused variable in raid1_write_request() (Shaohua) Signed-off-by: NeilBrown <neilb@suse.com> Signed-off-by: Shaohua Li <shli@fb.com>
author: NeilBrown <neilb@suse.com> 2017-04-05 14:05:50 +1000
committer: Shaohua Li <shli@fb.com> 2017-04-11 10:07:27 -0700
commit: c230e7e53526c223a3e1caf40747d6e37c0e4394 (patch)
tree: 9d7d0696a720d4b49f74e51d9571fb4ee47d7781 /drivers/md/raid1.c
parent: ae1713e296449caf820635d384a99936ce281a71 (diff)
1 files changed, 50 insertions, 91 deletions
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 70278b9a3c43..94f1d754aa09 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1202,7 +1202,8 @@ alloc_r1bio(struct mddev *mddev, struct bio *bio, sector_t sectors_handled)
 	return r1_bio;
 }
 
-static void raid1_read_request(struct mddev *mddev, struct bio *bio)
+static void raid1_read_request(struct mddev *mddev, struct bio *bio,
+			       int max_read_sectors)
 {
 	struct r1conf *conf = mddev->private;
 	struct raid1_info *mirror;
@@ -1211,7 +1212,6 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio)
 	struct bitmap *bitmap = mddev->bitmap;
 	const int op = bio_op(bio);
 	const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
-	int sectors_handled;
 	int max_sectors;
 	int rdisk;
 
@@ -1222,12 +1222,12 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio)
 	wait_read_barrier(conf, bio->bi_iter.bi_sector);
 
 	r1_bio = alloc_r1bio(mddev, bio, 0);
+	r1_bio->sectors = max_read_sectors;
 
 	/*
 	 * make_request() can abort the operation when read-ahead is being
 	 * used and no empty request is available.
 	 */
-read_again:
 	rdisk = read_balance(conf, r1_bio, &max_sectors);
 
 	if (rdisk < 0) {
@@ -1247,11 +1247,20 @@ read_again:
 		wait_event(bitmap->behind_wait,
 			   atomic_read(&bitmap->behind_writes) == 0);
 	}
+
+	if (max_sectors < bio_sectors(bio)) {
+		struct bio *split = bio_split(bio, max_sectors,
+					      GFP_NOIO, conf->bio_split);
+		bio_chain(split, bio);
+		generic_make_request(bio);
+		bio = split;
+		r1_bio->master_bio = bio;
+		r1_bio->sectors = max_sectors;
+	}
+
 	r1_bio->read_disk = rdisk;
 
 	read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
-	bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector,
-		 max_sectors);
 
 	r1_bio->bios[rdisk] = read_bio;
 
@@ -1270,30 +1279,11 @@ read_again:
 	                              read_bio, disk_devt(mddev->gendisk),
 	                              r1_bio->sector);
 
-	if (max_sectors < r1_bio->sectors) {
-		/*
-		 * could not read all from this device, so we will need another
-		 * r1_bio.
-		 */
-		sectors_handled = (r1_bio->sector + max_sectors
-				   - bio->bi_iter.bi_sector);
-		r1_bio->sectors = max_sectors;
-		bio_inc_remaining(bio);
-
-		/*
-		 * Cannot call generic_make_request directly as that will be
-		 * queued in __make_request and subsequent mempool_alloc might
-		 * block waiting for it.  So hand bio over to raid1d.
-		 */
-		reschedule_retry(r1_bio);
-
-		r1_bio = alloc_r1bio(mddev, bio, sectors_handled);
-		goto read_again;
-	} else
-		generic_make_request(read_bio);
+	generic_make_request(read_bio);
 }
 
-static void raid1_write_request(struct mddev *mddev, struct bio *bio)
+static void raid1_write_request(struct mddev *mddev, struct bio *bio,
+				int max_write_sectors)
 {
 	struct r1conf *conf = mddev->private;
 	struct r1bio *r1_bio;
@@ -1304,9 +1294,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio)
 	struct blk_plug_cb *cb;
 	struct raid1_plug_cb *plug = NULL;
 	int first_clone;
-	int sectors_handled;
 	int max_sectors;
-	sector_t offset;
 
 	/*
 	 * Register the new request and wait if the reconstruction
@@ -1345,6 +1333,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio)
 	wait_barrier(conf, bio->bi_iter.bi_sector);
 
 	r1_bio = alloc_r1bio(mddev, bio, 0);
+	r1_bio->sectors = max_write_sectors;
 
 	if (conf->pending_count >= max_queued_requests) {
 		md_wakeup_thread(mddev->thread);
@@ -1443,17 +1432,21 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio)
 		goto retry_write;
 	}
 
-	if (max_sectors < r1_bio->sectors)
+	if (max_sectors < bio_sectors(bio)) {
+		struct bio *split = bio_split(bio, max_sectors,
+					      GFP_NOIO, conf->bio_split);
+		bio_chain(split, bio);
+		generic_make_request(bio);
+		bio = split;
+		r1_bio->master_bio = bio;
 		r1_bio->sectors = max_sectors;
-
-	sectors_handled = r1_bio->sector + max_sectors - bio->bi_iter.bi_sector;
+	}
 
 	atomic_set(&r1_bio->remaining, 1);
 	atomic_set(&r1_bio->behind_remaining, 0);
 
 	first_clone = 1;
 
-	offset = r1_bio->sector - bio->bi_iter.bi_sector;
 	for (i = 0; i < disks; i++) {
 		struct bio *mbio = NULL;
 		if (!r1_bio->bios[i])
@@ -1470,7 +1463,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio)
 			     < mddev->bitmap_info.max_write_behind) &&
 			    !waitqueue_active(&bitmap->behind_wait)) {
 				mbio = alloc_behind_master_bio(r1_bio, bio,
-							       offset << 9,
+							       0,
 							       max_sectors << 9);
 			}
 
@@ -1486,10 +1479,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio)
 				mbio = bio_clone_fast(r1_bio->behind_master_bio,
 						      GFP_NOIO,
 						      mddev->bio_set);
-			else {
+			else
 				mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
-				bio_trim(mbio, offset, max_sectors);
-			}
 		}
 
 		if (r1_bio->behind_master_bio) {
@@ -1536,19 +1527,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio)
 		if (!plug)
 			md_wakeup_thread(mddev->thread);
 	}
-	/* Mustn't call r1_bio_write_done before this next test,
-	 * as it could result in the bio being freed.
-	 */
-	if (sectors_handled < bio_sectors(bio)) {
-		/* We need another r1_bio, which must be counted */
-		sector_t sect = bio->bi_iter.bi_sector + sectors_handled;
-
-		inc_pending(conf, sect);
-		bio_inc_remaining(bio);
-		r1_bio_write_done(r1_bio);
-		r1_bio = alloc_r1bio(mddev, bio, sectors_handled);
-		goto retry_write;
-	}
 
 	r1_bio_write_done(r1_bio);
 
@@ -1558,7 +1536,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio)
 
 static void raid1_make_request(struct mddev *mddev, struct bio *bio)
 {
-	struct bio *split;
 	sector_t sectors;
 
 	if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
@@ -1566,43 +1543,20 @@ static void raid1_make_request(struct mddev *mddev, struct bio *bio)
 		return;
 	}
 
-	/* if bio exceeds barrier unit boundary, split it */
-	do {
-		sectors = align_to_barrier_unit_end(
-				bio->bi_iter.bi_sector, bio_sectors(bio));
-		if (sectors < bio_sectors(bio)) {
-			split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set);
-			bio_chain(split, bio);
-		} else {
-			split = bio;
-		}
-
-		if (bio_data_dir(split) == READ) {
-			raid1_read_request(mddev, split);
+	/*
+	 * There is a limit to the maximum size, but
+	 * the read/write handler might find a lower limit
+	 * due to bad blocks.  To avoid multiple splits,
+	 * we pass the maximum number of sectors down
+	 * and let the lower level perform the split.
+	 */
+	sectors = align_to_barrier_unit_end(
+		bio->bi_iter.bi_sector, bio_sectors(bio));
 
-			/*
-			 * If a bio is splitted, the first part of bio will
-			 * pass barrier but the bio is queued in
-			 * current->bio_list (see generic_make_request). If
-			 * there is a raise_barrier() called here, the second
-			 * part of bio can't pass barrier. But since the first
-			 * part bio isn't dispatched to underlaying disks yet,
-			 * the barrier is never released, hence raise_barrier
-			 * will alays wait. We have a deadlock.
-			 * Note, this only happens in read path. For write
-			 * path, the first part of bio is dispatched in a
-			 * schedule() call (because of blk plug) or offloaded
-			 * to raid10d.
-			 * Quitting from the function immediately can change
-			 * the bio order queued in bio_list and avoid the deadlock.
-			 */
-			if (split != bio) {
-				generic_make_request(bio);
-				break;
-			}
-		} else
-			raid1_write_request(mddev, split);
-	} while (split != bio);
+	if (bio_data_dir(bio) == READ)
+		raid1_read_request(mddev, bio, sectors);
+	else
+		raid1_write_request(mddev, bio, sectors);
 }
 
 static void raid1_status(struct seq_file *seq, struct mddev *mddev)
@@ -2647,10 +2601,7 @@ static void raid1d(struct md_thread *thread)
 		else if (test_bit(R1BIO_ReadError, &r1_bio->state))
 			handle_read_error(conf, r1_bio);
 		else
-			/* just a partial read to be scheduled from separate
-			 * context
-			 */
-			generic_make_request(r1_bio->bios[r1_bio->read_disk]);
+			WARN_ON_ONCE(1);
 
 		cond_resched();
 		if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING))
@@ -3038,6 +2989,10 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 	if (!conf->r1bio_pool)
 		goto abort;
 
+	conf->bio_split = bioset_create(BIO_POOL_SIZE, 0);
+	if (!conf->bio_split)
+		goto abort;
+
 	conf->poolinfo->mddev = mddev;
 
 	err = -EINVAL;
@@ -3119,6 +3074,8 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 		kfree(conf->nr_waiting);
 		kfree(conf->nr_queued);
 		kfree(conf->barrier);
+		if (conf->bio_split)
+			bioset_free(conf->bio_split);
 		kfree(conf);
 	}
 	return ERR_PTR(err);
@@ -3224,6 +3181,8 @@ static void raid1_free(struct mddev *mddev, void *priv)
 	kfree(conf->nr_waiting);
 	kfree(conf->nr_queued);
 	kfree(conf->barrier);
+	if (conf->bio_split)
+		bioset_free(conf->bio_split);
 	kfree(conf);
 }
author	NeilBrown <neilb@suse.com>	2017-04-05 14:05:50 +1000
committer	Shaohua Li <shli@fb.com>	2017-04-11 10:07:27 -0700
commit	c230e7e53526c223a3e1caf40747d6e37c0e4394 (patch)
tree	9d7d0696a720d4b49f74e51d9571fb4ee47d7781 /drivers/md/raid1.c
parent	ae1713e296449caf820635d384a99936ce281a71 (diff)