summaryrefslogtreecommitdiff
path: root/drivers/md/raid5.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r--drivers/md/raid5.c350
1 files changed, 220 insertions, 130 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b98765f6f77f..cd2f96b2c572 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -296,12 +296,9 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
BUG_ON(atomic_read(&conf->active_stripes)==0);
if (test_bit(STRIPE_HANDLE, &sh->state)) {
if (test_bit(STRIPE_DELAYED, &sh->state) &&
- !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
+ !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
list_add_tail(&sh->lru, &conf->delayed_list);
- if (atomic_read(&conf->preread_active_stripes)
- < IO_THRESHOLD)
- md_wakeup_thread(conf->mddev->thread);
- } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
+ else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
sh->bm_seq - conf->seq_write > 0)
list_add_tail(&sh->lru, &conf->bitmap_list);
else {
@@ -2898,31 +2895,102 @@ static int want_replace(struct stripe_head *sh, int disk_idx)
* Returns 1 when no more member devices need to be checked, otherwise returns
* 0 to tell the loop in handle_stripe_fill to continue
*/
-static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
- int disk_idx, int disks)
+
+static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
+ int disk_idx, int disks)
{
struct r5dev *dev = &sh->dev[disk_idx];
struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
&sh->dev[s->failed_num[1]] };
+ int i;
+
+
+ if (test_bit(R5_LOCKED, &dev->flags) ||
+ test_bit(R5_UPTODATE, &dev->flags))
+ /* No point reading this as we already have it or have
+ * decided to get it.
+ */
+ return 0;
+
+ if (dev->toread ||
+ (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)))
+ /* We need this block to directly satisfy a request */
+ return 1;
+
+ if (s->syncing || s->expanding ||
+ (s->replacing && want_replace(sh, disk_idx)))
+ /* When syncing, or expanding we read everything.
+ * When replacing, we need the replaced block.
+ */
+ return 1;
+
+ if ((s->failed >= 1 && fdev[0]->toread) ||
+ (s->failed >= 2 && fdev[1]->toread))
+ /* If we want to read from a failed device, then
+ * we need to actually read every other device.
+ */
+ return 1;
+
+ /* Sometimes neither read-modify-write nor reconstruct-write
+ * cycles can work. In those cases we read every block we
+ * can. Then the parity-update is certain to have enough to
+ * work with.
+ * This can only be a problem when we need to write something,
+ * and some device has failed. If either of those tests
+ * fail we need look no further.
+ */
+ if (!s->failed || !s->to_write)
+ return 0;
+
+ if (test_bit(R5_Insync, &dev->flags) &&
+ !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+ /* Pre-reads at not permitted until after short delay
+ * to gather multiple requests. However if this
+ * device is no Insync, the block could only be be computed
+ * and there is no need to delay that.
+ */
+ return 0;
+
+ for (i = 0; i < s->failed; i++) {
+ if (fdev[i]->towrite &&
+ !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
+ !test_bit(R5_OVERWRITE, &fdev[i]->flags))
+ /* If we have a partial write to a failed
+ * device, then we will need to reconstruct
+ * the content of that device, so all other
+ * devices must be read.
+ */
+ return 1;
+ }
+
+ /* If we are forced to do a reconstruct-write, either because
+ * the current RAID6 implementation only supports that, or
+ * or because parity cannot be trusted and we are currently
+ * recovering it, there is extra need to be careful.
+ * If one of the devices that we would need to read, because
+ * it is not being overwritten (and maybe not written at all)
+ * is missing/faulty, then we need to read everything we can.
+ */
+ if (sh->raid_conf->level != 6 &&
+ sh->sector < sh->raid_conf->mddev->recovery_cp)
+ /* reconstruct-write isn't being forced */
+ return 0;
+ for (i = 0; i < s->failed; i++) {
+ if (!test_bit(R5_UPTODATE, &fdev[i]->flags) &&
+ !test_bit(R5_OVERWRITE, &fdev[i]->flags))
+ return 1;
+ }
+
+ return 0;
+}
+
+static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
+ int disk_idx, int disks)
+{
+ struct r5dev *dev = &sh->dev[disk_idx];
/* is the data in this block needed, and can we get it? */
- if (!test_bit(R5_LOCKED, &dev->flags) &&
- !test_bit(R5_UPTODATE, &dev->flags) &&
- (dev->toread ||
- (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
- s->syncing || s->expanding ||
- (s->replacing && want_replace(sh, disk_idx)) ||
- (s->failed >= 1 && fdev[0]->toread) ||
- (s->failed >= 2 && fdev[1]->toread) ||
- (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite &&
- (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) &&
- !test_bit(R5_OVERWRITE, &fdev[0]->flags)) ||
- ((sh->raid_conf->level == 6 ||
- sh->sector >= sh->raid_conf->mddev->recovery_cp)
- && s->failed && s->to_write &&
- (s->to_write - s->non_overwrite <
- sh->raid_conf->raid_disks - sh->raid_conf->max_degraded) &&
- (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))))) {
+ if (need_this_block(sh, s, disk_idx, disks)) {
/* we would like to get this block, possibly by computing it,
* otherwise read it if the backing disk is insync
*/
@@ -3102,7 +3170,8 @@ static void handle_stripe_dirtying(struct r5conf *conf,
* generate correct data from the parity.
*/
if (conf->max_degraded == 2 ||
- (recovery_cp < MaxSector && sh->sector >= recovery_cp)) {
+ (recovery_cp < MaxSector && sh->sector >= recovery_cp &&
+ s->failed == 0)) {
/* Calculate the real rcw later - for now make it
* look like rcw is cheaper
*/
@@ -4081,7 +4150,7 @@ static void activate_bit_delay(struct r5conf *conf,
}
}
-int md_raid5_congested(struct mddev *mddev, int bits)
+static int raid5_congested(struct mddev *mddev, int bits)
{
struct r5conf *conf = mddev->private;
@@ -4098,24 +4167,14 @@ int md_raid5_congested(struct mddev *mddev, int bits)
return 0;
}
-EXPORT_SYMBOL_GPL(md_raid5_congested);
-
-static int raid5_congested(void *data, int bits)
-{
- struct mddev *mddev = data;
-
- return mddev_congested(mddev, bits) ||
- md_raid5_congested(mddev, bits);
-}
/* We want read requests to align with chunks where possible,
* but write requests don't need to.
*/
-static int raid5_mergeable_bvec(struct request_queue *q,
+static int raid5_mergeable_bvec(struct mddev *mddev,
struct bvec_merge_data *bvm,
struct bio_vec *biovec)
{
- struct mddev *mddev = q->queuedata;
sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
int max;
unsigned int chunk_sectors = mddev->chunk_sectors;
@@ -5062,12 +5121,17 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int
schedule_timeout_uninterruptible(1);
}
/* Need to check if array will still be degraded after recovery/resync
- * We don't need to check the 'failed' flag as when that gets set,
- * recovery aborts.
+ * Note in case of > 1 drive failures it's possible we're rebuilding
+ * one drive while leaving another faulty drive in array.
*/
- for (i = 0; i < conf->raid_disks; i++)
- if (conf->disks[i].rdev == NULL)
+ rcu_read_lock();
+ for (i = 0; i < conf->raid_disks; i++) {
+ struct md_rdev *rdev = ACCESS_ONCE(conf->disks[i].rdev);
+
+ if (rdev == NULL || test_bit(Faulty, &rdev->flags))
still_degraded = 1;
+ }
+ rcu_read_unlock();
bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
@@ -5296,11 +5360,14 @@ static void raid5d(struct md_thread *thread)
static ssize_t
raid5_show_stripe_cache_size(struct mddev *mddev, char *page)
{
- struct r5conf *conf = mddev->private;
+ struct r5conf *conf;
+ int ret = 0;
+ spin_lock(&mddev->lock);
+ conf = mddev->private;
if (conf)
- return sprintf(page, "%d\n", conf->max_nr_stripes);
- else
- return 0;
+ ret = sprintf(page, "%d\n", conf->max_nr_stripes);
+ spin_unlock(&mddev->lock);
+ return ret;
}
int
@@ -5339,21 +5406,25 @@ EXPORT_SYMBOL(raid5_set_cache_size);
static ssize_t
raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len)
{
- struct r5conf *conf = mddev->private;
+ struct r5conf *conf;
unsigned long new;
int err;
if (len >= PAGE_SIZE)
return -EINVAL;
- if (!conf)
- return -ENODEV;
-
if (kstrtoul(page, 10, &new))
return -EINVAL;
- err = raid5_set_cache_size(mddev, new);
+ err = mddev_lock(mddev);
if (err)
return err;
- return len;
+ conf = mddev->private;
+ if (!conf)
+ err = -ENODEV;
+ else
+ err = raid5_set_cache_size(mddev, new);
+ mddev_unlock(mddev);
+
+ return err ?: len;
}
static struct md_sysfs_entry
@@ -5364,29 +5435,40 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
static ssize_t
raid5_show_preread_threshold(struct mddev *mddev, char *page)
{
- struct r5conf *conf = mddev->private;
+ struct r5conf *conf;
+ int ret = 0;
+ spin_lock(&mddev->lock);
+ conf = mddev->private;
if (conf)
- return sprintf(page, "%d\n", conf->bypass_threshold);
- else
- return 0;
+ ret = sprintf(page, "%d\n", conf->bypass_threshold);
+ spin_unlock(&mddev->lock);
+ return ret;
}
static ssize_t
raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len)
{
- struct r5conf *conf = mddev->private;
+ struct r5conf *conf;
unsigned long new;
+ int err;
+
if (len >= PAGE_SIZE)
return -EINVAL;
- if (!conf)
- return -ENODEV;
-
if (kstrtoul(page, 10, &new))
return -EINVAL;
- if (new > conf->max_nr_stripes)
- return -EINVAL;
- conf->bypass_threshold = new;
- return len;
+
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ conf = mddev->private;
+ if (!conf)
+ err = -ENODEV;
+ else if (new > conf->max_nr_stripes)
+ err = -EINVAL;
+ else
+ conf->bypass_threshold = new;
+ mddev_unlock(mddev);
+ return err ?: len;
}
static struct md_sysfs_entry
@@ -5398,39 +5480,48 @@ raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
static ssize_t
raid5_show_skip_copy(struct mddev *mddev, char *page)
{
- struct r5conf *conf = mddev->private;
+ struct r5conf *conf;
+ int ret = 0;
+ spin_lock(&mddev->lock);
+ conf = mddev->private;
if (conf)
- return sprintf(page, "%d\n", conf->skip_copy);
- else
- return 0;
+ ret = sprintf(page, "%d\n", conf->skip_copy);
+ spin_unlock(&mddev->lock);
+ return ret;
}
static ssize_t
raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
{
- struct r5conf *conf = mddev->private;
+ struct r5conf *conf;
unsigned long new;
+ int err;
+
if (len >= PAGE_SIZE)
return -EINVAL;
- if (!conf)
- return -ENODEV;
-
if (kstrtoul(page, 10, &new))
return -EINVAL;
new = !!new;
- if (new == conf->skip_copy)
- return len;
- mddev_suspend(mddev);
- conf->skip_copy = new;
- if (new)
- mddev->queue->backing_dev_info.capabilities |=
- BDI_CAP_STABLE_WRITES;
- else
- mddev->queue->backing_dev_info.capabilities &=
- ~BDI_CAP_STABLE_WRITES;
- mddev_resume(mddev);
- return len;
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ conf = mddev->private;
+ if (!conf)
+ err = -ENODEV;
+ else if (new != conf->skip_copy) {
+ mddev_suspend(mddev);
+ conf->skip_copy = new;
+ if (new)
+ mddev->queue->backing_dev_info.capabilities |=
+ BDI_CAP_STABLE_WRITES;
+ else
+ mddev->queue->backing_dev_info.capabilities &=
+ ~BDI_CAP_STABLE_WRITES;
+ mddev_resume(mddev);
+ }
+ mddev_unlock(mddev);
+ return err ?: len;
}
static struct md_sysfs_entry
@@ -5454,11 +5545,14 @@ raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
static ssize_t
raid5_show_group_thread_cnt(struct mddev *mddev, char *page)
{
- struct r5conf *conf = mddev->private;
+ struct r5conf *conf;
+ int ret = 0;
+ spin_lock(&mddev->lock);
+ conf = mddev->private;
if (conf)
- return sprintf(page, "%d\n", conf->worker_cnt_per_group);
- else
- return 0;
+ ret = sprintf(page, "%d\n", conf->worker_cnt_per_group);
+ spin_unlock(&mddev->lock);
+ return ret;
}
static int alloc_thread_groups(struct r5conf *conf, int cnt,
@@ -5468,7 +5562,7 @@ static int alloc_thread_groups(struct r5conf *conf, int cnt,
static ssize_t
raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
{
- struct r5conf *conf = mddev->private;
+ struct r5conf *conf;
unsigned long new;
int err;
struct r5worker_group *new_groups, *old_groups;
@@ -5476,41 +5570,41 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
if (len >= PAGE_SIZE)
return -EINVAL;
- if (!conf)
- return -ENODEV;
-
if (kstrtoul(page, 10, &new))
return -EINVAL;
- if (new == conf->worker_cnt_per_group)
- return len;
-
- mddev_suspend(mddev);
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ conf = mddev->private;
+ if (!conf)
+ err = -ENODEV;
+ else if (new != conf->worker_cnt_per_group) {
+ mddev_suspend(mddev);
- old_groups = conf->worker_groups;
- if (old_groups)
- flush_workqueue(raid5_wq);
+ old_groups = conf->worker_groups;
+ if (old_groups)
+ flush_workqueue(raid5_wq);
- err = alloc_thread_groups(conf, new,
- &group_cnt, &worker_cnt_per_group,
- &new_groups);
- if (!err) {
- spin_lock_irq(&conf->device_lock);
- conf->group_cnt = group_cnt;
- conf->worker_cnt_per_group = worker_cnt_per_group;
- conf->worker_groups = new_groups;
- spin_unlock_irq(&conf->device_lock);
+ err = alloc_thread_groups(conf, new,
+ &group_cnt, &worker_cnt_per_group,
+ &new_groups);
+ if (!err) {
+ spin_lock_irq(&conf->device_lock);
+ conf->group_cnt = group_cnt;
+ conf->worker_cnt_per_group = worker_cnt_per_group;
+ conf->worker_groups = new_groups;
+ spin_unlock_irq(&conf->device_lock);
- if (old_groups)
- kfree(old_groups[0].workers);
- kfree(old_groups);
+ if (old_groups)
+ kfree(old_groups[0].workers);
+ kfree(old_groups);
+ }
+ mddev_resume(mddev);
}
+ mddev_unlock(mddev);
- mddev_resume(mddev);
-
- if (err)
- return err;
- return len;
+ return err ?: len;
}
static struct md_sysfs_entry
@@ -6178,11 +6272,6 @@ static int run(struct mddev *mddev)
if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
- blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);
-
- mddev->queue->backing_dev_info.congested_data = mddev;
- mddev->queue->backing_dev_info.congested_fn = raid5_congested;
-
chunk_size = mddev->chunk_sectors << 9;
blk_queue_io_min(mddev->queue, chunk_size);
blk_queue_io_opt(mddev->queue, chunk_size *
@@ -6260,17 +6349,12 @@ abort:
return -EIO;
}
-static int stop(struct mddev *mddev)
+static void raid5_free(struct mddev *mddev, void *priv)
{
- struct r5conf *conf = mddev->private;
+ struct r5conf *conf = priv;
- md_unregister_thread(&mddev->thread);
- if (mddev->queue)
- mddev->queue->backing_dev_info.congested_fn = NULL;
free_conf(conf);
- mddev->private = NULL;
mddev->to_remove = &raid5_attrs_group;
- return 0;
}
static void status(struct seq_file *seq, struct mddev *mddev)
@@ -7044,7 +7128,7 @@ static struct md_personality raid6_personality =
.owner = THIS_MODULE,
.make_request = make_request,
.run = run,
- .stop = stop,
+ .free = raid5_free,
.status = status,
.error_handler = error,
.hot_add_disk = raid5_add_disk,
@@ -7058,6 +7142,8 @@ static struct md_personality raid6_personality =
.finish_reshape = raid5_finish_reshape,
.quiesce = raid5_quiesce,
.takeover = raid6_takeover,
+ .congested = raid5_congested,
+ .mergeable_bvec = raid5_mergeable_bvec,
};
static struct md_personality raid5_personality =
{
@@ -7066,7 +7152,7 @@ static struct md_personality raid5_personality =
.owner = THIS_MODULE,
.make_request = make_request,
.run = run,
- .stop = stop,
+ .free = raid5_free,
.status = status,
.error_handler = error,
.hot_add_disk = raid5_add_disk,
@@ -7080,6 +7166,8 @@ static struct md_personality raid5_personality =
.finish_reshape = raid5_finish_reshape,
.quiesce = raid5_quiesce,
.takeover = raid5_takeover,
+ .congested = raid5_congested,
+ .mergeable_bvec = raid5_mergeable_bvec,
};
static struct md_personality raid4_personality =
@@ -7089,7 +7177,7 @@ static struct md_personality raid4_personality =
.owner = THIS_MODULE,
.make_request = make_request,
.run = run,
- .stop = stop,
+ .free = raid5_free,
.status = status,
.error_handler = error,
.hot_add_disk = raid5_add_disk,
@@ -7103,6 +7191,8 @@ static struct md_personality raid4_personality =
.finish_reshape = raid5_finish_reshape,
.quiesce = raid5_quiesce,
.takeover = raid4_takeover,
+ .congested = raid5_congested,
+ .mergeable_bvec = raid5_mergeable_bvec,
};
static int __init raid5_init(void)