diff options
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r-- | drivers/md/raid5.c | 140 |
1 files changed, 84 insertions, 56 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b29e89cb815b..15ef2c641b2b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -223,18 +223,14 @@ static int raid6_idx_to_slot(int idx, struct stripe_head *sh, return slot; } -static void return_io(struct bio *return_bi) +static void return_io(struct bio_list *return_bi) { - struct bio *bi = return_bi; - while (bi) { - - return_bi = bi->bi_next; - bi->bi_next = NULL; + struct bio *bi; + while ((bi = bio_list_pop(return_bi)) != NULL) { bi->bi_iter.bi_size = 0; trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), bi, 0); bio_endio(bi); - bi = return_bi; } } @@ -1177,7 +1173,7 @@ async_copy_data(int frombio, struct bio *bio, struct page **page, static void ops_complete_biofill(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; - struct bio *return_bi = NULL; + struct bio_list return_bi = BIO_EMPTY_LIST; int i; pr_debug("%s: stripe %llu\n", __func__, @@ -1201,17 +1197,15 @@ static void ops_complete_biofill(void *stripe_head_ref) while (rbi && rbi->bi_iter.bi_sector < dev->sector + STRIPE_SECTORS) { rbi2 = r5_next_bio(rbi, dev->sector); - if (!raid5_dec_bi_active_stripes(rbi)) { - rbi->bi_next = return_bi; - return_bi = rbi; - } + if (!raid5_dec_bi_active_stripes(rbi)) + bio_list_add(&return_bi, rbi); rbi = rbi2; } } } clear_bit(STRIPE_BIOFILL_RUN, &sh->state); - return_io(return_bi); + return_io(&return_bi); set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); @@ -2517,6 +2511,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev) set_bit(Blocked, &rdev->flags); set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_CHANGE_PENDING, &mddev->flags); printk(KERN_ALERT "md/raid:%s: Disk failure on %s, disabling device.\n" "md/raid:%s: Operation continuing on %d devices.\n", @@ -3069,7 +3064,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, static void handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, struct stripe_head_state *s, int disks, - struct bio **return_bi) + struct bio_list *return_bi) { int i; BUG_ON(sh->batch_head); @@ -3114,8 +3109,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, bi->bi_error = -EIO; if (!raid5_dec_bi_active_stripes(bi)) { md_write_end(conf->mddev); - bi->bi_next = *return_bi; - *return_bi = bi; + bio_list_add(return_bi, bi); } bi = nextbi; } @@ -3139,8 +3133,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, bi->bi_error = -EIO; if (!raid5_dec_bi_active_stripes(bi)) { md_write_end(conf->mddev); - bi->bi_next = *return_bi; - *return_bi = bi; + bio_list_add(return_bi, bi); } bi = bi2; } @@ -3163,10 +3156,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, r5_next_bio(bi, sh->dev[i].sector); bi->bi_error = -EIO; - if (!raid5_dec_bi_active_stripes(bi)) { - bi->bi_next = *return_bi; - *return_bi = bi; - } + if (!raid5_dec_bi_active_stripes(bi)) + bio_list_add(return_bi, bi); bi = nextbi; } } @@ -3445,7 +3436,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh, * never LOCKED, so we don't need to test 'failed' directly. */ static void handle_stripe_clean_event(struct r5conf *conf, - struct stripe_head *sh, int disks, struct bio **return_bi) + struct stripe_head *sh, int disks, struct bio_list *return_bi) { int i; struct r5dev *dev; @@ -3479,8 +3470,7 @@ returnbi: wbi2 = r5_next_bio(wbi, dev->sector); if (!raid5_dec_bi_active_stripes(wbi)) { md_write_end(conf->mddev); - wbi->bi_next = *return_bi; - *return_bi = wbi; + bio_list_add(return_bi, wbi); } wbi = wbi2; } @@ -4613,7 +4603,15 @@ finish: md_wakeup_thread(conf->mddev->thread); } - return_io(s.return_bi); + if (!bio_list_empty(&s.return_bi)) { + if (test_bit(MD_CHANGE_PENDING, &conf->mddev->flags)) { + spin_lock_irq(&conf->device_lock); + bio_list_merge(&conf->return_bi, &s.return_bi); + spin_unlock_irq(&conf->device_lock); + md_wakeup_thread(conf->mddev->thread); + } else + return_io(&s.return_bi); + } clear_bit_unlock(STRIPE_ACTIVE, &sh->state); } @@ -4672,12 +4670,12 @@ static int raid5_congested(struct mddev *mddev, int bits) static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) { + struct r5conf *conf = mddev->private; sector_t sector = bio->bi_iter.bi_sector + get_start_sect(bio->bi_bdev); - unsigned int chunk_sectors = mddev->chunk_sectors; + unsigned int chunk_sectors; unsigned int bio_sectors = bio_sectors(bio); - if (mddev->new_chunk_sectors < mddev->chunk_sectors) - chunk_sectors = mddev->new_chunk_sectors; + chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors); return chunk_sectors >= ((sector & (chunk_sectors - 1)) + bio_sectors); } @@ -5325,6 +5323,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk sector_t stripe_addr; int reshape_sectors; struct list_head stripes; + sector_t retn; if (sector_nr == 0) { /* If restarting in the middle, skip the initial sectors */ @@ -5332,6 +5331,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk conf->reshape_progress < raid5_size(mddev, 0, 0)) { sector_nr = raid5_size(mddev, 0, 0) - conf->reshape_progress; + } else if (mddev->reshape_backwards && + conf->reshape_progress == MaxSector) { + /* shouldn't happen, but just in case, finish up.*/ + sector_nr = MaxSector; } else if (!mddev->reshape_backwards && conf->reshape_progress > 0) sector_nr = conf->reshape_progress; @@ -5340,7 +5343,8 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk mddev->curr_resync_completed = sector_nr; sysfs_notify(&mddev->kobj, NULL, "sync_completed"); *skipped = 1; - return sector_nr; + retn = sector_nr; + goto finish; } } @@ -5348,10 +5352,8 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk * If old and new chunk sizes differ, we need to process the * largest of these */ - if (mddev->new_chunk_sectors > mddev->chunk_sectors) - reshape_sectors = mddev->new_chunk_sectors; - else - reshape_sectors = mddev->chunk_sectors; + + reshape_sectors = max(conf->chunk_sectors, conf->prev_chunk_sectors); /* We update the metadata at least every 10 seconds, or when * the data about to be copied would over-write the source of @@ -5366,11 +5368,16 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk safepos = conf->reshape_safe; sector_div(safepos, data_disks); if (mddev->reshape_backwards) { - writepos -= min_t(sector_t, reshape_sectors, writepos); + BUG_ON(writepos < reshape_sectors); + writepos -= reshape_sectors; readpos += reshape_sectors; safepos += reshape_sectors; } else { writepos += reshape_sectors; + /* readpos and safepos are worst-case calculations. + * A negative number is overly pessimistic, and causes + * obvious problems for unsigned storage. So clip to 0. + */ readpos -= min_t(sector_t, reshape_sectors, readpos); safepos -= min_t(sector_t, reshape_sectors, safepos); } @@ -5513,7 +5520,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk * then we need to write out the superblock. */ sector_nr += reshape_sectors; - if ((sector_nr - mddev->curr_resync_completed) * 2 + retn = reshape_sectors; +finish: + if (mddev->curr_resync_completed > mddev->resync_max || + (sector_nr - mddev->curr_resync_completed) * 2 >= mddev->resync_max - mddev->curr_resync_completed) { /* Cannot proceed until we've updated the superblock... */ wait_event(conf->wait_for_overlap, @@ -5538,7 +5548,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk sysfs_notify(&mddev->kobj, NULL, "sync_completed"); } ret: - return reshape_sectors; + return retn; } static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped) @@ -5794,6 +5804,18 @@ static void raid5d(struct md_thread *thread) md_check_recovery(mddev); + if (!bio_list_empty(&conf->return_bi) && + !test_bit(MD_CHANGE_PENDING, &mddev->flags)) { + struct bio_list tmp = BIO_EMPTY_LIST; + spin_lock_irq(&conf->device_lock); + if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) { + bio_list_merge(&tmp, &conf->return_bi); + bio_list_init(&conf->return_bi); + } + spin_unlock_irq(&conf->device_lock); + return_io(&tmp); + } + blk_start_plug(&plug); handled = 0; spin_lock_irq(&conf->device_lock); @@ -6234,8 +6256,8 @@ raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) /* size is defined by the smallest of previous and new size */ raid_disks = min(conf->raid_disks, conf->previous_raid_disks); - sectors &= ~((sector_t)mddev->chunk_sectors - 1); - sectors &= ~((sector_t)mddev->new_chunk_sectors - 1); + sectors &= ~((sector_t)conf->chunk_sectors - 1); + sectors &= ~((sector_t)conf->prev_chunk_sectors - 1); return sectors * (raid_disks - conf->max_degraded); } @@ -6453,6 +6475,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) INIT_LIST_HEAD(&conf->hold_list); INIT_LIST_HEAD(&conf->delayed_list); INIT_LIST_HEAD(&conf->bitmap_list); + bio_list_init(&conf->return_bi); init_llist_head(&conf->released_stripes); atomic_set(&conf->active_stripes, 0); atomic_set(&conf->preread_active_stripes, 0); @@ -6542,6 +6565,9 @@ static struct r5conf *setup_conf(struct mddev *mddev) if (conf->reshape_progress != MaxSector) { conf->prev_chunk_sectors = mddev->chunk_sectors; conf->prev_algo = mddev->layout; + } else { + conf->prev_chunk_sectors = conf->chunk_sectors; + conf->prev_algo = conf->algorithm; } conf->min_nr_stripes = NR_STRIPES; @@ -6661,6 +6687,8 @@ static int run(struct mddev *mddev) sector_t here_new, here_old; int old_disks; int max_degraded = (mddev->level == 6 ? 2 : 1); + int chunk_sectors; + int new_data_disks; if (mddev->new_level != mddev->level) { printk(KERN_ERR "md/raid:%s: unsupported reshape " @@ -6672,28 +6700,25 @@ static int run(struct mddev *mddev) /* reshape_position must be on a new-stripe boundary, and one * further up in new geometry must map after here in old * geometry. + * If the chunk sizes are different, then as we perform reshape + * in units of the largest of the two, reshape_position needs + * be a multiple of the largest chunk size times new data disks. */ here_new = mddev->reshape_position; - if (sector_div(here_new, mddev->new_chunk_sectors * - (mddev->raid_disks - max_degraded))) { + chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors); + new_data_disks = mddev->raid_disks - max_degraded; + if (sector_div(here_new, chunk_sectors * new_data_disks)) { printk(KERN_ERR "md/raid:%s: reshape_position not " "on a stripe boundary\n", mdname(mddev)); return -EINVAL; } - reshape_offset = here_new * mddev->new_chunk_sectors; + reshape_offset = here_new * chunk_sectors; /* here_new is the stripe we will write to */ here_old = mddev->reshape_position; - sector_div(here_old, mddev->chunk_sectors * - (old_disks-max_degraded)); + sector_div(here_old, chunk_sectors * (old_disks-max_degraded)); /* here_old is the first stripe that we might need to read * from */ if (mddev->delta_disks == 0) { - if ((here_new * mddev->new_chunk_sectors != - here_old * mddev->chunk_sectors)) { - printk(KERN_ERR "md/raid:%s: reshape position is" - " confused - aborting\n", mdname(mddev)); - return -EINVAL; - } /* We cannot be sure it is safe to start an in-place * reshape. It is only safe if user-space is monitoring * and taking constant backups. @@ -6712,10 +6737,10 @@ static int run(struct mddev *mddev) return -EINVAL; } } else if (mddev->reshape_backwards - ? (here_new * mddev->new_chunk_sectors + min_offset_diff <= - here_old * mddev->chunk_sectors) - : (here_new * mddev->new_chunk_sectors >= - here_old * mddev->chunk_sectors + (-min_offset_diff))) { + ? (here_new * chunk_sectors + min_offset_diff <= + here_old * chunk_sectors) + : (here_new * chunk_sectors >= + here_old * chunk_sectors + (-min_offset_diff))) { /* Reading from the same stripe as writing to - bad */ printk(KERN_ERR "md/raid:%s: reshape_position too early for " "auto-recovery - aborting.\n", @@ -6967,7 +6992,7 @@ static void status(struct seq_file *seq, struct mddev *mddev) int i; seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, - mddev->chunk_sectors / 2, mddev->layout); + conf->chunk_sectors / 2, mddev->layout); seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); for (i = 0; i < conf->raid_disks; i++) seq_printf (seq, "%s", @@ -7173,7 +7198,9 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors) * worth it. */ sector_t newsize; - sectors &= ~((sector_t)mddev->chunk_sectors - 1); + struct r5conf *conf = mddev->private; + + sectors &= ~((sector_t)conf->chunk_sectors - 1); newsize = raid5_size(mddev, sectors, mddev->raid_disks); if (mddev->external_size && mddev->array_sectors > newsize) @@ -7412,6 +7439,7 @@ static void end_reshape(struct r5conf *conf) rdev->data_offset = rdev->new_data_offset; smp_wmb(); conf->reshape_progress = MaxSector; + conf->mddev->reshape_position = MaxSector; spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_for_overlap); |