Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/bcache/Kconfig | 2
-rw-r--r-- | drivers/md/bcache/btree.c | 2
-rw-r--r-- | drivers/md/dm-bufio.c | 45
-rw-r--r-- | drivers/md/dm-ebs-target.c | 3
-rw-r--r-- | drivers/md/dm-integrity.c | 2
-rw-r--r-- | drivers/md/dm-raid.c | 1
-rw-r--r-- | drivers/md/dm-snap-persistent.c | 2
-rw-r--r-- | drivers/md/dm-verity-fec.c | 4
-rw-r--r-- | drivers/md/dm-verity-target.c | 172
-rw-r--r-- | drivers/md/dm-verity.h | 6
-rw-r--r-- | drivers/md/dm-writecache.c | 3
-rw-r--r-- | drivers/md/dm-zoned-metadata.c | 4
-rw-r--r-- | drivers/md/dm.c | 10
-rw-r--r-- | drivers/md/md-autodetect.c | 21
-rw-r--r-- | drivers/md/md-cluster.c | 4
-rw-r--r-- | drivers/md/md.c | 424
-rw-r--r-- | drivers/md/md.h | 19
-rw-r--r-- | drivers/md/persistent-data/dm-block-manager.c | 3
-rw-r--r-- | drivers/md/raid10.c | 18
-rw-r--r-- | drivers/md/raid5-cache.c | 40
-rw-r--r-- | drivers/md/raid5-log.h | 77
-rw-r--r-- | drivers/md/raid5-ppl.c | 2
-rw-r--r-- | drivers/md/raid5.c | 729
-rw-r--r-- | drivers/md/raid5.h | 2
24 files changed, 1020 insertions, 575 deletions
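
The change running through dm-bufio and its callers below adds a flags argument to dm_bufio_client_create() and a DM_BUFIO_CLIENT_NO_SLEEP flag that switches the client from its mutex to spin_lock_bh(), which dm-verity uses for its new try_verify_in_tasklet mode. As a rough illustration only (the example_* target and helper are assumptions for this sketch, not part of this commit), a caller opting into the new flag would look like this:

/*
 * Illustrative sketch, not part of this commit: a hypothetical dm target
 * creating its bufio client with the new flags argument. Everything named
 * example_* is assumed; the dm_bufio_client_create() signature and
 * DM_BUFIO_CLIENT_NO_SLEEP come from the dm-bufio change below.
 */
#include <linux/device-mapper.h>
#include <linux/dm-bufio.h>
#include <linux/err.h>

struct example_target {
	struct dm_dev *dev;
	struct dm_bufio_client *bufio;
};

static int example_create_bufio(struct example_target *t,
				unsigned int block_size, bool no_sleep)
{
	/*
	 * With DM_BUFIO_CLIENT_NO_SLEEP the client takes spin_lock_bh()
	 * instead of its mutex, so lookups may run from atomic context
	 * (e.g. dm-verity's tasklet verification path).
	 */
	t->bufio = dm_bufio_client_create(t->dev->bdev, block_size,
					  1, 0, NULL, NULL,
					  no_sleep ? DM_BUFIO_CLIENT_NO_SLEEP : 0);
	if (IS_ERR(t->bufio))
		return PTR_ERR(t->bufio);
	return 0;
}
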
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig index cf3e8096942a..529c9d04e9a4 100644 --- a/drivers/md/bcache/Kconfig +++ b/drivers/md/bcache/Kconfig @@ -29,7 +29,7 @@ config BCACHE_CLOSURES_DEBUG operations that get stuck. config BCACHE_ASYNC_REGISTRATION - bool "Asynchronous device registration (EXPERIMENTAL)" + bool "Asynchronous device registration" depends on BCACHE help Add a sysfs file /sys/fs/bcache/register_async. Writing registering diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index e136d6edc1ed..147c493a989a 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -812,7 +812,7 @@ int bch_btree_cache_alloc(struct cache_set *c) c->shrink.seeks = 4; c->shrink.batch = c->btree_pages * 2; - if (register_shrinker(&c->shrink)) + if (register_shrinker(&c->shrink, "md-bcache:%pU", c->set_uuid)) pr_warn("bcache: %s: could not register shrinker\n", __func__); diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index dc01ce33265b..09c7ed2650ca 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -18,6 +18,7 @@ #include <linux/module.h> #include <linux/rbtree.h> #include <linux/stacktrace.h> +#include <linux/jump_label.h> #define DM_MSG_PREFIX "bufio" @@ -81,6 +82,8 @@ */ struct dm_bufio_client { struct mutex lock; + spinlock_t spinlock; + bool no_sleep; struct list_head lru[LIST_SIZE]; unsigned long n_buffers[LIST_SIZE]; @@ -90,7 +93,6 @@ struct dm_bufio_client { s8 sectors_per_block_bits; void (*alloc_callback)(struct dm_buffer *); void (*write_callback)(struct dm_buffer *); - struct kmem_cache *slab_buffer; struct kmem_cache *slab_cache; struct dm_io_client *dm_io; @@ -161,23 +163,34 @@ struct dm_buffer { #endif }; +static DEFINE_STATIC_KEY_FALSE(no_sleep_enabled); + /*----------------------------------------------------------------*/ #define dm_bufio_in_request() (!!current->bio_list) static void dm_bufio_lock(struct dm_bufio_client *c) { - mutex_lock_nested(&c->lock, dm_bufio_in_request()); + if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep) + spin_lock_bh(&c->spinlock); + else + mutex_lock_nested(&c->lock, dm_bufio_in_request()); } static int dm_bufio_trylock(struct dm_bufio_client *c) { - return mutex_trylock(&c->lock); + if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep) + return spin_trylock_bh(&c->spinlock); + else + return mutex_trylock(&c->lock); } static void dm_bufio_unlock(struct dm_bufio_client *c) { - mutex_unlock(&c->lock); + if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep) + spin_unlock_bh(&c->spinlock); + else + mutex_unlock(&c->lock); } /*----------------------------------------------------------------*/ @@ -802,6 +815,10 @@ static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c) BUG_ON(test_bit(B_WRITING, &b->state)); BUG_ON(test_bit(B_DIRTY, &b->state)); + if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep && + unlikely(test_bit(B_READING, &b->state))) + continue; + if (!b->hold_count) { __make_buffer_clean(b); __unlink_buffer(b); @@ -810,6 +827,9 @@ static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c) cond_resched(); } + if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep) + return NULL; + list_for_each_entry_reverse(b, &c->lru[LIST_DIRTY], lru_list) { BUG_ON(test_bit(B_READING, &b->state)); @@ -1617,7 +1637,8 @@ static void drop_buffers(struct dm_bufio_client *c) */ static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp) { - if (!(gfp & __GFP_FS)) { + if (!(gfp & __GFP_FS) || + 
(static_branch_unlikely(&no_sleep_enabled) && b->c->no_sleep)) { if (test_bit(B_READING, &b->state) || test_bit(B_WRITING, &b->state) || test_bit(B_DIRTY, &b->state)) @@ -1715,7 +1736,8 @@ static unsigned long dm_bufio_shrink_count(struct shrinker *shrink, struct shrin struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned block_size, unsigned reserved_buffers, unsigned aux_size, void (*alloc_callback)(struct dm_buffer *), - void (*write_callback)(struct dm_buffer *)) + void (*write_callback)(struct dm_buffer *), + unsigned int flags) { int r; struct dm_bufio_client *c; @@ -1745,12 +1767,18 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign c->alloc_callback = alloc_callback; c->write_callback = write_callback; + if (flags & DM_BUFIO_CLIENT_NO_SLEEP) { + c->no_sleep = true; + static_branch_inc(&no_sleep_enabled); + } + for (i = 0; i < LIST_SIZE; i++) { INIT_LIST_HEAD(&c->lru[i]); c->n_buffers[i] = 0; } mutex_init(&c->lock); + spin_lock_init(&c->spinlock); INIT_LIST_HEAD(&c->reserved_buffers); c->need_reserved_buffers = reserved_buffers; @@ -1804,7 +1832,8 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign c->shrinker.scan_objects = dm_bufio_shrink_scan; c->shrinker.seeks = 1; c->shrinker.batch = 0; - r = register_shrinker(&c->shrinker); + r = register_shrinker(&c->shrinker, "md-%s:(%u:%u)", slab_name, + MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev)); if (r) goto bad; @@ -1876,6 +1905,8 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c) kmem_cache_destroy(c->slab_buffer); dm_io_client_destroy(c->dm_io); mutex_destroy(&c->lock); + if (c->no_sleep) + static_branch_dec(&no_sleep_enabled); kfree(c); } EXPORT_SYMBOL_GPL(dm_bufio_client_destroy); diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c index 223e8e1a7a13..512cc6cea095 100644 --- a/drivers/md/dm-ebs-target.c +++ b/drivers/md/dm-ebs-target.c @@ -313,7 +313,8 @@ static int ebs_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - ec->bufio = dm_bufio_client_create(ec->dev->bdev, to_bytes(ec->u_bs), 1, 0, NULL, NULL); + ec->bufio = dm_bufio_client_create(ec->dev->bdev, to_bytes(ec->u_bs), 1, + 0, NULL, NULL, 0); if (IS_ERR(ec->bufio)) { ti->error = "Cannot create dm bufio client"; r = PTR_ERR(ec->bufio); diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index c60f9b2ece2d..aaf2472df6e5 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -4439,7 +4439,7 @@ try_smaller_buffer: } ic->bufio = dm_bufio_client_create(ic->meta_dev ? 
ic->meta_dev->bdev : ic->dev->bdev, - 1U << (SECTOR_SHIFT + ic->log2_buffer_sectors), 1, 0, NULL, NULL); + 1U << (SECTOR_SHIFT + ic->log2_buffer_sectors), 1, 0, NULL, NULL, 0); if (IS_ERR(ic->bufio)) { r = PTR_ERR(ic->bufio); ti->error = "Cannot initialize dm-bufio"; diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 1ec17c32867f..c640be453313 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3728,6 +3728,7 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv, if (!strcasecmp(argv[0], "idle") || !strcasecmp(argv[0], "frozen")) { if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_unregister_thread(&mddev->sync_thread); md_reap_sync_thread(mddev); } } else if (decipher_sync_action(mddev, mddev->recovery) != st_idle) diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index f46f930eedf9..680cc05ec654 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -493,7 +493,7 @@ static int read_exceptions(struct pstore *ps, client = dm_bufio_client_create(dm_snap_cow(ps->store->snap)->bdev, ps->store->chunk_size << SECTOR_SHIFT, - 1, 0, NULL, NULL); + 1, 0, NULL, NULL, 0); if (IS_ERR(client)) return PTR_ERR(client); diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index cea2b3789736..23cffce56403 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -749,7 +749,7 @@ int verity_fec_ctr(struct dm_verity *v) f->bufio = dm_bufio_client_create(f->dev->bdev, f->io_size, - 1, 0, NULL, NULL); + 1, 0, NULL, NULL, 0); if (IS_ERR(f->bufio)) { ti->error = "Cannot initialize FEC bufio client"; return PTR_ERR(f->bufio); @@ -765,7 +765,7 @@ int verity_fec_ctr(struct dm_verity *v) f->data_bufio = dm_bufio_client_create(v->data_dev->bdev, 1 << v->data_dev_block_bits, - 1, 0, NULL, NULL); + 1, 0, NULL, NULL, 0); if (IS_ERR(f->data_bufio)) { ti->error = "Cannot initialize FEC data bufio client"; return PTR_ERR(f->data_bufio); diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 4fd853a56b1a..94b6cb599db4 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -20,6 +20,7 @@ #include <linux/reboot.h> #include <linux/scatterlist.h> #include <linux/string.h> +#include <linux/jump_label.h> #define DM_MSG_PREFIX "verity" @@ -35,14 +36,17 @@ #define DM_VERITY_OPT_PANIC "panic_on_corruption" #define DM_VERITY_OPT_IGN_ZEROES "ignore_zero_blocks" #define DM_VERITY_OPT_AT_MOST_ONCE "check_at_most_once" +#define DM_VERITY_OPT_TASKLET_VERIFY "try_verify_in_tasklet" -#define DM_VERITY_OPTS_MAX (3 + DM_VERITY_OPTS_FEC + \ +#define DM_VERITY_OPTS_MAX (4 + DM_VERITY_OPTS_FEC + \ DM_VERITY_ROOT_HASH_VERIFICATION_OPTS) static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE; module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, S_IRUGO | S_IWUSR); +static DEFINE_STATIC_KEY_FALSE(use_tasklet_enabled); + struct dm_verity_prefetch_work { struct work_struct work; struct dm_verity *v; @@ -221,7 +225,7 @@ static int verity_handle_err(struct dm_verity *v, enum verity_block_type type, struct mapped_device *md = dm_table_get_md(v->ti->table); /* Corruption should be visible in device status in all modes */ - v->hash_failed = 1; + v->hash_failed = true; if (v->corrupted_errs >= DM_VERITY_MAX_CORRUPTED_ERRS) goto out; @@ -287,7 +291,19 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, verity_hash_at_level(v, block, level, &hash_block, &offset); - data 
= dm_bufio_read(v->bufio, hash_block, &buf); + if (static_branch_unlikely(&use_tasklet_enabled) && io->in_tasklet) { + data = dm_bufio_get(v->bufio, hash_block, &buf); + if (data == NULL) { + /* + * In tasklet and the hash was not in the bufio cache. + * Return early and resume execution from a work-queue + * to read the hash from disk. + */ + return -EAGAIN; + } + } else + data = dm_bufio_read(v->bufio, hash_block, &buf); + if (IS_ERR(data)) return PTR_ERR(data); @@ -308,6 +324,15 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, if (likely(memcmp(verity_io_real_digest(v, io), want_digest, v->digest_size) == 0)) aux->hash_verified = 1; + else if (static_branch_unlikely(&use_tasklet_enabled) && + io->in_tasklet) { + /* + * Error handling code (FEC included) cannot be run in a + * tasklet since it may sleep, so fallback to work-queue. + */ + r = -EAGAIN; + goto release_ret_r; + } else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_METADATA, hash_block, data, NULL) == 0) @@ -474,10 +499,24 @@ static int verity_verify_io(struct dm_verity_io *io) { bool is_zero; struct dm_verity *v = io->v; +#if defined(CONFIG_DM_VERITY_FEC) struct bvec_iter start; - unsigned b; +#endif + struct bvec_iter iter_copy; + struct bvec_iter *iter; struct crypto_wait wait; struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); + unsigned int b; + + if (static_branch_unlikely(&use_tasklet_enabled) && io->in_tasklet) { + /* + * Copy the iterator in case we need to restart + * verification in a work-queue. + */ + iter_copy = io->iter; + iter = &iter_copy; + } else + iter = &io->iter; for (b = 0; b < io->n_blocks; b++) { int r; @@ -486,7 +525,7 @@ static int verity_verify_io(struct dm_verity_io *io) if (v->validated_blocks && likely(test_bit(cur_block, v->validated_blocks))) { - verity_bv_skip_block(v, io, &io->iter); + verity_bv_skip_block(v, io, iter); continue; } @@ -501,7 +540,7 @@ static int verity_verify_io(struct dm_verity_io *io) * If we expect a zero block, don't validate, just * return zeros. */ - r = verity_for_bv_block(v, io, &io->iter, + r = verity_for_bv_block(v, io, iter, verity_bv_zero); if (unlikely(r < 0)) return r; @@ -513,8 +552,11 @@ static int verity_verify_io(struct dm_verity_io *io) if (unlikely(r < 0)) return r; - start = io->iter; - r = verity_for_io_block(v, io, &io->iter, &wait); +#if defined(CONFIG_DM_VERITY_FEC) + if (verity_fec_is_enabled(v)) + start = *iter; +#endif + r = verity_for_io_block(v, io, iter, &wait); if (unlikely(r < 0)) return r; @@ -528,9 +570,18 @@ static int verity_verify_io(struct dm_verity_io *io) if (v->validated_blocks) set_bit(cur_block, v->validated_blocks); continue; + } else if (static_branch_unlikely(&use_tasklet_enabled) && + io->in_tasklet) { + /* + * Error handling code (FEC included) cannot be run in a + * tasklet since it may sleep, so fallback to work-queue. 
+ */ + return -EAGAIN; +#if defined(CONFIG_DM_VERITY_FEC) } else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA, - cur_block, NULL, &start) == 0) { + cur_block, NULL, &start) == 0) { continue; +#endif } else { if (bio->bi_status) { /* @@ -567,7 +618,8 @@ static void verity_finish_io(struct dm_verity_io *io, blk_status_t status) bio->bi_end_io = io->orig_bi_end_io; bio->bi_status = status; - verity_fec_finish_io(io); + if (!static_branch_unlikely(&use_tasklet_enabled) || !io->in_tasklet) + verity_fec_finish_io(io); bio_endio(bio); } @@ -576,9 +628,29 @@ static void verity_work(struct work_struct *w) { struct dm_verity_io *io = container_of(w, struct dm_verity_io, work); + io->in_tasklet = false; + + verity_fec_init_io(io); verity_finish_io(io, errno_to_blk_status(verity_verify_io(io))); } +static void verity_tasklet(unsigned long data) +{ + struct dm_verity_io *io = (struct dm_verity_io *)data; + int err; + + io->in_tasklet = true; + err = verity_verify_io(io); + if (err == -EAGAIN) { + /* fallback to retrying with work-queue */ + INIT_WORK(&io->work, verity_work); + queue_work(io->v->verify_wq, &io->work); + return; + } + + verity_finish_io(io, errno_to_blk_status(err)); +} + static void verity_end_io(struct bio *bio) { struct dm_verity_io *io = bio->bi_private; @@ -589,8 +661,13 @@ static void verity_end_io(struct bio *bio) return; } - INIT_WORK(&io->work, verity_work); - queue_work(io->v->verify_wq, &io->work); + if (static_branch_unlikely(&use_tasklet_enabled) && io->v->use_tasklet) { + tasklet_init(&io->tasklet, verity_tasklet, (unsigned long)io); + tasklet_schedule(&io->tasklet); + } else { + INIT_WORK(&io->work, verity_work); + queue_work(io->v->verify_wq, &io->work); + } } /* @@ -701,8 +778,6 @@ static int verity_map(struct dm_target *ti, struct bio *bio) bio->bi_private = io; io->iter = bio->bi_iter; - verity_fec_init_io(io); - verity_submit_prefetch(v, io); submit_bio_noacct(bio); @@ -752,6 +827,8 @@ static void verity_status(struct dm_target *ti, status_type_t type, args++; if (v->validated_blocks) args++; + if (v->use_tasklet) + args++; if (v->signature_key_desc) args += DM_VERITY_ROOT_HASH_VERIFICATION_OPTS; if (!args) @@ -777,6 +854,8 @@ static void verity_status(struct dm_target *ti, status_type_t type, DMEMIT(" " DM_VERITY_OPT_IGN_ZEROES); if (v->validated_blocks) DMEMIT(" " DM_VERITY_OPT_AT_MOST_ONCE); + if (v->use_tasklet) + DMEMIT(" " DM_VERITY_OPT_TASKLET_VERIFY); sz = verity_fec_status_table(v, sz, result, maxlen); if (v->signature_key_desc) DMEMIT(" " DM_VERITY_ROOT_HASH_VERIFICATION_OPT_SIG_KEY @@ -890,6 +969,9 @@ static void verity_dtr(struct dm_target *ti) kfree(v->signature_key_desc); + if (v->use_tasklet) + static_branch_dec(&use_tasklet_enabled); + kfree(v); } @@ -968,9 +1050,10 @@ static int verity_parse_verity_mode(struct dm_verity *v, const char *arg_name) } static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, - struct dm_verity_sig_opts *verify_args) + struct dm_verity_sig_opts *verify_args, + bool only_modifier_opts) { - int r; + int r = 0; unsigned argc; struct dm_target *ti = v->ti; const char *arg_name; @@ -991,6 +1074,8 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, argc--; if (verity_is_verity_mode(arg_name)) { + if (only_modifier_opts) + continue; r = verity_parse_verity_mode(v, arg_name); if (r) { ti->error = "Conflicting error handling parameters"; @@ -999,6 +1084,8 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, continue; } else if (!strcasecmp(arg_name, 
DM_VERITY_OPT_IGN_ZEROES)) { + if (only_modifier_opts) + continue; r = verity_alloc_zero_digest(v); if (r) { ti->error = "Cannot allocate zero digest"; @@ -1007,17 +1094,29 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, continue; } else if (!strcasecmp(arg_name, DM_VERITY_OPT_AT_MOST_ONCE)) { + if (only_modifier_opts) + continue; r = verity_alloc_most_once(v); if (r) return r; continue; + } else if (!strcasecmp(arg_name, DM_VERITY_OPT_TASKLET_VERIFY)) { + v->use_tasklet = true; + static_branch_inc(&use_tasklet_enabled); + continue; + } else if (verity_is_fec_opt_arg(arg_name)) { + if (only_modifier_opts) + continue; r = verity_fec_parse_opt_args(as, v, &argc, arg_name); if (r) return r; continue; + } else if (verity_verify_is_sig_opt_arg(arg_name)) { + if (only_modifier_opts) + continue; r = verity_verify_sig_parse_opt_args(as, v, verify_args, &argc, arg_name); @@ -1025,8 +1124,17 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, return r; continue; + } else if (only_modifier_opts) { + /* + * Ignore unrecognized opt, could easily be an extra + * argument to an option whose parsing was skipped. + * Normal parsing (@only_modifier_opts=false) will + * properly parse all options (and their extra args). + */ + continue; } + DMERR("Unrecognized verity feature request: %s", arg_name); ti->error = "Unrecognized verity feature request"; return -EINVAL; } while (argc && !r); @@ -1054,6 +1162,7 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) struct dm_verity_sig_opts verify_args = {0}; struct dm_arg_set as; unsigned int num; + unsigned int wq_flags; unsigned long long num_ll; int r; int i; @@ -1085,6 +1194,15 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } + /* Parse optional parameters that modify primary args */ + if (argc > 10) { + as.argc = argc - 10; + as.argv = argv + 10; + r = verity_parse_opt_args(&as, v, &verify_args, true); + if (r < 0) + goto bad; + } + if (sscanf(argv[0], "%u%c", &num, &dummy) != 1 || num > 1) { ti->error = "Invalid version"; @@ -1156,7 +1274,8 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } - v->tfm = crypto_alloc_ahash(v->alg_name, 0, 0); + v->tfm = crypto_alloc_ahash(v->alg_name, 0, + v->use_tasklet ? CRYPTO_ALG_ASYNC : 0); if (IS_ERR(v->tfm)) { ti->error = "Cannot initialize hash function"; r = PTR_ERR(v->tfm); @@ -1218,8 +1337,7 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) if (argc) { as.argc = argc; as.argv = argv; - - r = verity_parse_opt_args(&as, v, &verify_args); + r = verity_parse_opt_args(&as, v, &verify_args, false); if (r < 0) goto bad; } @@ -1266,7 +1384,8 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) v->bufio = dm_bufio_client_create(v->hash_dev->bdev, 1 << v->hash_dev_block_bits, 1, sizeof(struct buffer_aux), - dm_bufio_alloc_callback, NULL); + dm_bufio_alloc_callback, NULL, + v->use_tasklet ? 
DM_BUFIO_CLIENT_NO_SLEEP : 0); if (IS_ERR(v->bufio)) { ti->error = "Cannot initialize dm-bufio"; r = PTR_ERR(v->bufio); @@ -1281,7 +1400,16 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) } /* WQ_UNBOUND greatly improves performance when running on ramdisk */ - v->verify_wq = alloc_workqueue("kverityd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, num_online_cpus()); + wq_flags = WQ_MEM_RECLAIM | WQ_UNBOUND; + if (v->use_tasklet) { + /* + * Allow verify_wq to preempt softirq since verification in + * tasklet will fall-back to using it for error handling + * (or if the bufio cache doesn't have required hashes). + */ + wq_flags |= WQ_HIGHPRI; + } + v->verify_wq = alloc_workqueue("kverityd", wq_flags, num_online_cpus()); if (!v->verify_wq) { ti->error = "Cannot allocate workqueue"; r = -ENOMEM; @@ -1343,7 +1471,7 @@ int dm_verity_get_root_digest(struct dm_target *ti, u8 **root_digest, unsigned i static struct target_type verity_target = { .name = "verity", .features = DM_TARGET_IMMUTABLE, - .version = {1, 8, 1}, + .version = {1, 9, 0}, .module = THIS_MODULE, .ctr = verity_ctr, .dtr = verity_dtr, diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index c832cc3e3d24..45455de1b4bc 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -13,6 +13,7 @@ #include <linux/dm-bufio.h> #include <linux/device-mapper.h> +#include <linux/interrupt.h> #include <crypto/hash.h> #define DM_VERITY_MAX_LEVELS 63 @@ -51,9 +52,10 @@ struct dm_verity { unsigned char hash_per_block_bits; /* log2(hashes in hash block) */ unsigned char levels; /* the number of tree levels */ unsigned char version; + bool hash_failed:1; /* set if hash of any block failed */ + bool use_tasklet:1; /* try to verify in tasklet before work-queue */ unsigned digest_size; /* digest size for the current hash algorithm */ unsigned int ahash_reqsize;/* the size of temporary space for crypto */ - int hash_failed; /* set to 1 if hash of any block failed */ enum verity_mode mode; /* mode for handling verification errors */ unsigned corrupted_errs;/* Number of errors for corrupted blocks */ @@ -76,10 +78,12 @@ struct dm_verity_io { sector_t block; unsigned n_blocks; + bool in_tasklet; struct bvec_iter iter; struct work_struct work; + struct tasklet_struct tasklet; /* * Three variably-size fields follow this struct: diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 1fc161d65673..96a003eb7323 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -1594,7 +1594,8 @@ done: default: BUG(); - return -1; + wc_unlock(wc); + return DM_MAPIO_KILL; } } diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 34db364c23a8..0278482fac94 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -2945,7 +2945,9 @@ int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev, zmd->mblk_shrinker.seeks = DEFAULT_SEEKS; /* Metadata cache shrinker */ - ret = register_shrinker(&zmd->mblk_shrinker); + ret = register_shrinker(&zmd->mblk_shrinker, "md-meta:(%u:%u)", + MAJOR(dev->bdev->bd_dev), + MINOR(dev->bdev->bd_dev)); if (ret) { dmz_zmd_err(zmd, "Register metadata cache shrinker failed"); goto err; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 99642f69bfa7..60549b65c799 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -752,7 +752,7 @@ static int open_table_device(struct table_device *td, dev_t dev, } td->dm_dev.bdev = bdev; - td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off); + td->dm_dev.dax_dev = 
fs_dax_get_by_bdev(bdev, &part_off, NULL, NULL); return 0; } @@ -1016,7 +1016,7 @@ static void dm_wq_requeue_work(struct work_struct *work) while (io) { struct dm_io *next = io->next; - dm_io_rewind(io, &md->queue->bio_split); + dm_io_rewind(io, &md->disk->bio_split); io->next = NULL; __dm_io_complete(io, false); @@ -1181,7 +1181,7 @@ static sector_t max_io_len(struct dm_target *ti, sector_t sector) * Does the target need to split IO even further? * - varied (per target) IO splitting is a tenet of DM; this * explains why stacked chunk_sectors based splitting via - * blk_queue_split() isn't possible here. + * bio_split_to_limits() isn't possible here. */ if (!ti->max_io_len) return len; @@ -1751,10 +1751,10 @@ static void dm_split_and_process_bio(struct mapped_device *md, is_abnormal = is_abnormal_io(bio); if (unlikely(is_abnormal)) { /* - * Use blk_queue_split() for abnormal IO (e.g. discard, etc) + * Use bio_split_to_limits() for abnormal IO (e.g. discard, etc) * otherwise associated queue_limits won't be imposed. */ - blk_queue_split(&bio); + bio = bio_split_to_limits(bio); } init_clone_info(&ci, md, map, bio, is_abnormal); diff --git a/drivers/md/md-autodetect.c b/drivers/md/md-autodetect.c index 2cf973722f59..91836e6de326 100644 --- a/drivers/md/md-autodetect.c +++ b/drivers/md/md-autodetect.c @@ -125,7 +125,6 @@ static void __init md_setup_drive(struct md_setup_args *args) char *devname = args->device_names; dev_t devices[MD_SB_DISKS + 1], mdev; struct mdu_array_info_s ainfo = { }; - struct block_device *bdev; struct mddev *mddev; int err = 0, i; char name[16]; @@ -169,24 +168,16 @@ static void __init md_setup_drive(struct md_setup_args *args) pr_info("md: Loading %s: %s\n", name, args->device_names); - bdev = blkdev_get_by_dev(mdev, FMODE_READ, NULL); - if (IS_ERR(bdev)) { - pr_err("md: open failed - cannot start array %s\n", name); + mddev = md_alloc(mdev, name); + if (IS_ERR(mddev)) { + pr_err("md: md_alloc failed - cannot start array %s\n", name); return; } - err = -EIO; - if (WARN(bdev->bd_disk->fops != &md_fops, - "Opening block device %x resulted in non-md device\n", - mdev)) - goto out_blkdev_put; - - mddev = bdev->bd_disk->private_data; - err = mddev_lock(mddev); if (err) { pr_err("md: failed to lock array %s\n", name); - goto out_blkdev_put; + goto out_mddev_put; } if (!list_empty(&mddev->disks) || mddev->raid_disks) { @@ -230,8 +221,8 @@ static void __init md_setup_drive(struct md_setup_args *args) pr_warn("md: starting %s failed\n", name); out_unlock: mddev_unlock(mddev); -out_blkdev_put: - blkdev_put(bdev, FMODE_READ); +out_mddev_put: + mddev_put(mddev); } static int __init raid_setup(char *str) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 37cbcce3cc66..742b2349fea3 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -40,7 +40,7 @@ struct resync_info { /* Lock the send communication. This is done through * bit manipulation as opposed to a mutex in order to - * accomodate lock and hold. See next comment. + * accommodate lock and hold. See next comment. 
*/ #define MD_CLUSTER_SEND_LOCK 4 /* If cluster operations (such as adding a disk) must lock the @@ -689,7 +689,7 @@ static int lock_comm(struct md_cluster_info *cinfo, bool mddev_locked) /* * If resync thread run after raid1d thread, then process_metadata_update * could not continue if raid1d held reconfig_mutex (and raid1d is blocked - * since another node already got EX on Token and waitting the EX of Ack), + * since another node already got EX on Token and waiting the EX of Ack), * so let resync wake up thread in case flag is set. */ if (mddev_locked && !test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, diff --git a/drivers/md/md.c b/drivers/md/md.c index 4df78e30b76a..729be2c5296c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -368,28 +368,6 @@ EXPORT_SYMBOL_GPL(md_new_event); static LIST_HEAD(all_mddevs); static DEFINE_SPINLOCK(all_mddevs_lock); -/* - * iterates through all used mddevs in the system. - * We take care to grab the all_mddevs_lock whenever navigating - * the list, and to always hold a refcount when unlocked. - * Any code which breaks out of this loop while own - * a reference to the current mddev and must mddev_put it. - */ -#define for_each_mddev(_mddev,_tmp) \ - \ - for (({ spin_lock(&all_mddevs_lock); \ - _tmp = all_mddevs.next; \ - _mddev = NULL;}); \ - ({ if (_tmp != &all_mddevs) \ - mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\ - spin_unlock(&all_mddevs_lock); \ - if (_mddev) mddev_put(_mddev); \ - _mddev = list_entry(_tmp, struct mddev, all_mddevs); \ - _tmp != &all_mddevs;}); \ - ({ spin_lock(&all_mddevs_lock); \ - _tmp = _tmp->next;}) \ - ) - /* Rather than calling directly into the personality make_request function, * IO requests come here first so that we can check if the device is * being suspended pending a reconfiguration. 
@@ -464,7 +442,7 @@ static void md_submit_bio(struct bio *bio) return; } - blk_queue_split(&bio); + bio = bio_split_to_limits(bio); if (mddev->ro == 1 && unlikely(rw == WRITE)) { if (bio_sectors(bio) != 0) @@ -647,13 +625,17 @@ EXPORT_SYMBOL(md_flush_request); static inline struct mddev *mddev_get(struct mddev *mddev) { + lockdep_assert_held(&all_mddevs_lock); + + if (test_bit(MD_DELETED, &mddev->flags)) + return NULL; atomic_inc(&mddev->active); return mddev; } static void mddev_delayed_delete(struct work_struct *ws); -static void mddev_put(struct mddev *mddev) +void mddev_put(struct mddev *mddev) { if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) return; @@ -661,7 +643,7 @@ static void mddev_put(struct mddev *mddev) mddev->ctime == 0 && !mddev->hold_active) { /* Array is not configured at all, and not held active, * so destroy it */ - list_del_init(&mddev->all_mddevs); + set_bit(MD_DELETED, &mddev->flags); /* * Call queue_work inside the spinlock so that @@ -678,7 +660,6 @@ static void md_safemode_timeout(struct timer_list *t); void mddev_init(struct mddev *mddev) { - kobject_init(&mddev->kobj, &md_ktype); mutex_init(&mddev->open_mutex); mutex_init(&mddev->reconfig_mutex); mutex_init(&mddev->bitmap_info.mutex); @@ -733,22 +714,6 @@ static dev_t mddev_alloc_unit(void) return dev; } -static struct mddev *mddev_find(dev_t unit) -{ - struct mddev *mddev; - - if (MAJOR(unit) != MD_MAJOR) - unit &= ~((1 << MdpMinorShift) - 1); - - spin_lock(&all_mddevs_lock); - mddev = mddev_find_locked(unit); - if (mddev) - mddev_get(mddev); - spin_unlock(&all_mddevs_lock); - - return mddev; -} - static struct mddev *mddev_alloc(dev_t unit) { struct mddev *new; @@ -791,6 +756,15 @@ out_free_new: return ERR_PTR(error); } +static void mddev_free(struct mddev *mddev) +{ + spin_lock(&all_mddevs_lock); + list_del(&mddev->all_mddevs); + spin_unlock(&all_mddevs_lock); + + kfree(mddev); +} + static const struct attribute_group md_redundancy_group; void mddev_unlock(struct mddev *mddev) @@ -3335,14 +3309,35 @@ rdev_size_show(struct md_rdev *rdev, char *page) return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2); } -static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2) +static int md_rdevs_overlap(struct md_rdev *a, struct md_rdev *b) { /* check if two start/length pairs overlap */ - if (s1+l1 <= s2) - return 0; - if (s2+l2 <= s1) - return 0; - return 1; + if (a->data_offset + a->sectors <= b->data_offset) + return false; + if (b->data_offset + b->sectors <= a->data_offset) + return false; + return true; +} + +static bool md_rdev_overlaps(struct md_rdev *rdev) +{ + struct mddev *mddev; + struct md_rdev *rdev2; + + spin_lock(&all_mddevs_lock); + list_for_each_entry(mddev, &all_mddevs, all_mddevs) { + if (test_bit(MD_DELETED, &mddev->flags)) + continue; + rdev_for_each(rdev2, mddev) { + if (rdev != rdev2 && rdev->bdev == rdev2->bdev && + md_rdevs_overlap(rdev, rdev2)) { + spin_unlock(&all_mddevs_lock); + return true; + } + } + } + spin_unlock(&all_mddevs_lock); + return false; } static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) @@ -3394,46 +3389,21 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) return -EINVAL; /* component must fit device */ rdev->sectors = sectors; - if (sectors > oldsectors && my_mddev->external) { - /* Need to check that all other rdevs with the same - * ->bdev do not overlap. 'rcu' is sufficient to walk - * the rdev lists safely. 
- * This check does not provide a hard guarantee, it - * just helps avoid dangerous mistakes. - */ - struct mddev *mddev; - int overlap = 0; - struct list_head *tmp; - - rcu_read_lock(); - for_each_mddev(mddev, tmp) { - struct md_rdev *rdev2; - rdev_for_each(rdev2, mddev) - if (rdev->bdev == rdev2->bdev && - rdev != rdev2 && - overlaps(rdev->data_offset, rdev->sectors, - rdev2->data_offset, - rdev2->sectors)) { - overlap = 1; - break; - } - if (overlap) { - mddev_put(mddev); - break; - } - } - rcu_read_unlock(); - if (overlap) { - /* Someone else could have slipped in a size - * change here, but doing so is just silly. - * We put oldsectors back because we *know* it is - * safe, and trust userspace not to race with - * itself - */ - rdev->sectors = oldsectors; - return -EBUSY; - } + /* + * Check that all other rdevs with the same bdev do not overlap. This + * check does not provide a hard guarantee, it just helps avoid + * dangerous mistakes. + */ + if (sectors > oldsectors && my_mddev->external && + md_rdev_overlaps(rdev)) { + /* + * Someone else could have slipped in a size change here, but + * doing so is just silly. We put oldsectors back because we + * know it is safe, and trust userspace not to race with itself. + */ + rdev->sectors = oldsectors; + return -EBUSY; } return len; } @@ -4830,6 +4800,19 @@ action_store(struct mddev *mddev, const char *page, size_t len) if (work_pending(&mddev->del_work)) flush_workqueue(md_misc_wq); if (mddev->sync_thread) { + sector_t save_rp = mddev->reshape_position; + + mddev_unlock(mddev); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_unregister_thread(&mddev->sync_thread); + mddev_lock_nointr(mddev); + /* + * set RECOVERY_INTR again and restore reshape + * position in case others changed them after + * got lock, eg, reshape_position_store and + * md_check_recovery. 
+ */ + mddev->reshape_position = save_rp; set_bit(MD_RECOVERY_INTR, &mddev->recovery); md_reap_sync_thread(mddev); } @@ -5001,7 +4984,7 @@ static ssize_t sync_speed_show(struct mddev *mddev, char *page) { unsigned long resync, dt, db; - if (mddev->curr_resync == 0) + if (mddev->curr_resync == MD_RESYNC_NONE) return sprintf(page, "none\n"); resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); dt = (jiffies - mddev->resync_mark) / HZ; @@ -5020,8 +5003,8 @@ sync_completed_show(struct mddev *mddev, char *page) if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return sprintf(page, "none\n"); - if (mddev->curr_resync == 1 || - mddev->curr_resync == 2) + if (mddev->curr_resync == MD_RESYNC_YIELDED || + mddev->curr_resync == MD_RESYNC_DELAYED) return sprintf(page, "delayed\n"); if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || @@ -5532,11 +5515,10 @@ md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) if (!entry->show) return -EIO; spin_lock(&all_mddevs_lock); - if (list_empty(&mddev->all_mddevs)) { + if (!mddev_get(mddev)) { spin_unlock(&all_mddevs_lock); return -EBUSY; } - mddev_get(mddev); spin_unlock(&all_mddevs_lock); rv = entry->show(mddev, page); @@ -5557,18 +5539,17 @@ md_attr_store(struct kobject *kobj, struct attribute *attr, if (!capable(CAP_SYS_ADMIN)) return -EACCES; spin_lock(&all_mddevs_lock); - if (list_empty(&mddev->all_mddevs)) { + if (!mddev_get(mddev)) { spin_unlock(&all_mddevs_lock); return -EBUSY; } - mddev_get(mddev); spin_unlock(&all_mddevs_lock); rv = entry->store(mddev, page, length); mddev_put(mddev); return rv; } -static void md_free(struct kobject *ko) +static void md_kobj_release(struct kobject *ko) { struct mddev *mddev = container_of(ko, struct mddev, kobj); @@ -5577,15 +5558,8 @@ static void md_free(struct kobject *ko) if (mddev->sysfs_level) sysfs_put(mddev->sysfs_level); - if (mddev->gendisk) { - del_gendisk(mddev->gendisk); - put_disk(mddev->gendisk); - } - percpu_ref_exit(&mddev->writes_pending); - - bioset_exit(&mddev->bio_set); - bioset_exit(&mddev->sync_set); - kfree(mddev); + del_gendisk(mddev->gendisk); + put_disk(mddev->gendisk); } static const struct sysfs_ops md_sysfs_ops = { @@ -5593,7 +5567,7 @@ static const struct sysfs_ops md_sysfs_ops = { .store = md_attr_store, }; static struct kobj_type md_ktype = { - .release = md_free, + .release = md_kobj_release, .sysfs_ops = &md_sysfs_ops, .default_groups = md_attr_groups, }; @@ -5604,7 +5578,6 @@ static void mddev_delayed_delete(struct work_struct *ws) { struct mddev *mddev = container_of(ws, struct mddev, del_work); - kobject_del(&mddev->kobj); kobject_put(&mddev->kobj); } @@ -5623,7 +5596,7 @@ int mddev_init_writes_pending(struct mddev *mddev) } EXPORT_SYMBOL_GPL(mddev_init_writes_pending); -static int md_alloc(dev_t dev, char *name) +struct mddev *md_alloc(dev_t dev, char *name) { /* * If dev is zero, name is the name of a device to allocate with @@ -5647,12 +5620,13 @@ static int md_alloc(dev_t dev, char *name) * removed (mddev_delayed_delete). 
*/ flush_workqueue(md_misc_wq); + flush_workqueue(md_rdev_misc_wq); mutex_lock(&disks_mutex); mddev = mddev_alloc(dev); if (IS_ERR(mddev)) { - mutex_unlock(&disks_mutex); - return PTR_ERR(mddev); + error = PTR_ERR(mddev); + goto out_unlock; } partitioned = (MAJOR(mddev->unit) != MD_MAJOR); @@ -5670,7 +5644,7 @@ static int md_alloc(dev_t dev, char *name) strcmp(mddev2->gendisk->disk_name, name) == 0) { spin_unlock(&all_mddevs_lock); error = -EEXIST; - goto out_unlock_disks_mutex; + goto out_free_mddev; } spin_unlock(&all_mddevs_lock); } @@ -5683,7 +5657,7 @@ static int md_alloc(dev_t dev, char *name) error = -ENOMEM; disk = blk_alloc_disk(NUMA_NO_NODE); if (!disk) - goto out_unlock_disks_mutex; + goto out_free_mddev; disk->major = MAJOR(mddev->unit); disk->first_minor = unit << shift; @@ -5704,25 +5678,45 @@ static int md_alloc(dev_t dev, char *name) mddev->gendisk = disk; error = add_disk(disk); if (error) - goto out_cleanup_disk; + goto out_put_disk; + kobject_init(&mddev->kobj, &md_ktype); error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md"); - if (error) - goto out_del_gendisk; + if (error) { + /* + * The disk is already live at this point. Clear the hold flag + * and let mddev_put take care of the deletion, as it isn't any + * different from a normal close on last release now. + */ + mddev->hold_active = 0; + mutex_unlock(&disks_mutex); + mddev_put(mddev); + return ERR_PTR(error); + } kobject_uevent(&mddev->kobj, KOBJ_ADD); mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level"); - goto out_unlock_disks_mutex; + mutex_unlock(&disks_mutex); + return mddev; -out_del_gendisk: - del_gendisk(disk); -out_cleanup_disk: +out_put_disk: put_disk(disk); -out_unlock_disks_mutex: +out_free_mddev: + mddev_free(mddev); +out_unlock: mutex_unlock(&disks_mutex); + return ERR_PTR(error); +} + +static int md_alloc_and_put(dev_t dev, char *name) +{ + struct mddev *mddev = md_alloc(dev, name); + + if (IS_ERR(mddev)) + return PTR_ERR(mddev); mddev_put(mddev); - return error; + return 0; } static void md_probe(dev_t dev) @@ -5730,7 +5724,7 @@ static void md_probe(dev_t dev) if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512) return; if (create_on_open) - md_alloc(dev, NULL); + md_alloc_and_put(dev, NULL); } static int add_named_array(const char *val, const struct kernel_param *kp) @@ -5752,12 +5746,12 @@ static int add_named_array(const char *val, const struct kernel_param *kp) return -E2BIG; strscpy(buf, val, len+1); if (strncmp(buf, "md_", 3) == 0) - return md_alloc(0, buf); + return md_alloc_and_put(0, buf); if (strncmp(buf, "md", 2) == 0 && isdigit(buf[2]) && kstrtoul(buf+2, 10, &devnum) == 0 && devnum <= MINORMASK) - return md_alloc(MKDEV(MD_MAJOR, devnum), NULL); + return md_alloc_and_put(MKDEV(MD_MAJOR, devnum), NULL); return -EINVAL; } @@ -6197,6 +6191,7 @@ static void __md_stop_writes(struct mddev *mddev) flush_workqueue(md_misc_wq); if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_unregister_thread(&mddev->sync_thread); md_reap_sync_thread(mddev); } @@ -6266,6 +6261,7 @@ void md_stop(struct mddev *mddev) /* stop the array and free an attached data structures. 
* This is called from dm-raid */ + __md_stop_writes(mddev); __md_stop(mddev); bioset_exit(&mddev->bio_set); bioset_exit(&mddev->sync_set); @@ -6497,9 +6493,8 @@ static void autorun_devices(int part) break; } - md_probe(dev); - mddev = mddev_find(dev); - if (!mddev) + mddev = md_alloc(dev, NULL); + if (IS_ERR(mddev)) break; if (mddev_lock(mddev)) @@ -7782,45 +7777,33 @@ out_unlock: static int md_open(struct block_device *bdev, fmode_t mode) { - /* - * Succeed if we can lock the mddev, which confirms that - * it isn't being stopped right now. - */ - struct mddev *mddev = mddev_find(bdev->bd_dev); + struct mddev *mddev; int err; + spin_lock(&all_mddevs_lock); + mddev = mddev_get(bdev->bd_disk->private_data); + spin_unlock(&all_mddevs_lock); if (!mddev) return -ENODEV; - if (mddev->gendisk != bdev->bd_disk) { - /* we are racing with mddev_put which is discarding this - * bd_disk. - */ - mddev_put(mddev); - /* Wait until bdev->bd_disk is definitely gone */ - if (work_pending(&mddev->del_work)) - flush_workqueue(md_misc_wq); - return -EBUSY; - } - BUG_ON(mddev != bdev->bd_disk->private_data); - - if ((err = mutex_lock_interruptible(&mddev->open_mutex))) + err = mutex_lock_interruptible(&mddev->open_mutex); + if (err) goto out; - if (test_bit(MD_CLOSING, &mddev->flags)) { - mutex_unlock(&mddev->open_mutex); - err = -ENODEV; - goto out; - } + err = -ENODEV; + if (test_bit(MD_CLOSING, &mddev->flags)) + goto out_unlock; - err = 0; atomic_inc(&mddev->openers); mutex_unlock(&mddev->open_mutex); bdev_check_media_change(bdev); - out: - if (err) - mddev_put(mddev); + return 0; + +out_unlock: + mutex_unlock(&mddev->open_mutex); +out: + mddev_put(mddev); return err; } @@ -7844,6 +7827,17 @@ static unsigned int md_check_events(struct gendisk *disk, unsigned int clearing) return ret; } +static void md_free_disk(struct gendisk *disk) +{ + struct mddev *mddev = disk->private_data; + + percpu_ref_exit(&mddev->writes_pending); + bioset_exit(&mddev->bio_set); + bioset_exit(&mddev->sync_set); + + mddev_free(mddev); +} + const struct block_device_operations md_fops = { .owner = THIS_MODULE, @@ -7857,6 +7851,7 @@ const struct block_device_operations md_fops = .getgeo = md_getgeo, .check_events = md_check_events, .set_read_only = md_set_read_only, + .free_disk = md_free_disk, }; static int md_thread(void *arg) @@ -8018,16 +8013,26 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev) max_sectors = mddev->dev_sectors; resync = mddev->curr_resync; - if (resync <= 3) { + if (resync < MD_RESYNC_ACTIVE) { if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) /* Still cleaning up */ resync = max_sectors; - } else if (resync > max_sectors) + } else if (resync > max_sectors) { resync = max_sectors; - else + } else { resync -= atomic_read(&mddev->recovery_active); + if (resync < MD_RESYNC_ACTIVE) { + /* + * Resync has started, but the subtraction has + * yielded one of the special values. Force it + * to active to ensure the status reports an + * active resync. 
+ */ + resync = MD_RESYNC_ACTIVE; + } + } - if (resync == 0) { + if (resync == MD_RESYNC_NONE) { if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) { struct md_rdev *rdev; @@ -8051,7 +8056,7 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev) } return 0; } - if (resync < 3) { + if (resync < MD_RESYNC_ACTIVE) { seq_printf(seq, "\tresync=DELAYED"); return 1; } @@ -8152,6 +8157,8 @@ static void *md_seq_start(struct seq_file *seq, loff_t *pos) if (!l--) { mddev = list_entry(tmp, struct mddev, all_mddevs); mddev_get(mddev); + if (!mddev_get(mddev)) + continue; spin_unlock(&all_mddevs_lock); return mddev; } @@ -8165,25 +8172,35 @@ static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct list_head *tmp; struct mddev *next_mddev, *mddev = v; + struct mddev *to_put = NULL; ++*pos; if (v == (void*)2) return NULL; spin_lock(&all_mddevs_lock); - if (v == (void*)1) + if (v == (void*)1) { tmp = all_mddevs.next; - else + } else { + to_put = mddev; + tmp = mddev->all_mddevs.next; + } + + for (;;) { + if (tmp == &all_mddevs) { + next_mddev = (void*)2; + *pos = 0x10000; + break; + } + next_mddev = list_entry(tmp, struct mddev, all_mddevs); + if (mddev_get(next_mddev)) + break; + mddev = next_mddev; tmp = mddev->all_mddevs.next; - if (tmp != &all_mddevs) - next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs)); - else { - next_mddev = (void*)2; - *pos = 0x10000; } spin_unlock(&all_mddevs_lock); - if (v != (void*)1) + if (to_put) mddev_put(mddev); return next_mddev; @@ -8682,7 +8699,6 @@ void md_do_sync(struct md_thread *thread) unsigned long update_time; sector_t mark_cnt[SYNC_MARKS]; int last_mark,m; - struct list_head *tmp; sector_t last_check; int skipped = 0; struct md_rdev *rdev; @@ -8729,13 +8745,7 @@ void md_do_sync(struct md_thread *thread) mddev->last_sync_action = action ?: desc; - /* we overload curr_resync somewhat here. - * 0 == not engaged in resync at all - * 2 == checking that there is no conflict with another sync - * 1 == like 2, but have yielded to allow conflicting resync to - * commence - * other == active in resync - this many blocks - * + /* * Before starting a resync we must have set curr_resync to * 2, and then checked that every "conflicting" array has curr_resync * less than ours. 
When we find one that is the same or higher @@ -8747,24 +8757,29 @@ void md_do_sync(struct md_thread *thread) do { int mddev2_minor = -1; - mddev->curr_resync = 2; + mddev->curr_resync = MD_RESYNC_DELAYED; try_again: if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) goto skip; - for_each_mddev(mddev2, tmp) { + spin_lock(&all_mddevs_lock); + list_for_each_entry(mddev2, &all_mddevs, all_mddevs) { + if (test_bit(MD_DELETED, &mddev2->flags)) + continue; if (mddev2 == mddev) continue; if (!mddev->parallel_resync && mddev2->curr_resync && match_mddev_units(mddev, mddev2)) { DEFINE_WAIT(wq); - if (mddev < mddev2 && mddev->curr_resync == 2) { + if (mddev < mddev2 && + mddev->curr_resync == MD_RESYNC_DELAYED) { /* arbitrarily yield */ - mddev->curr_resync = 1; + mddev->curr_resync = MD_RESYNC_YIELDED; wake_up(&resync_wait); } - if (mddev > mddev2 && mddev->curr_resync == 1) + if (mddev > mddev2 && + mddev->curr_resync == MD_RESYNC_YIELDED) /* no need to wait here, we can wait the next * time 'round when curr_resync == 2 */ @@ -8782,7 +8797,8 @@ void md_do_sync(struct md_thread *thread) desc, mdname(mddev), mdname(mddev2)); } - mddev_put(mddev2); + spin_unlock(&all_mddevs_lock); + if (signal_pending(current)) flush_signals(current); schedule(); @@ -8792,7 +8808,8 @@ void md_do_sync(struct md_thread *thread) finish_wait(&resync_wait, &wq); } } - } while (mddev->curr_resync < 2); + spin_unlock(&all_mddevs_lock); + } while (mddev->curr_resync < MD_RESYNC_DELAYED); j = 0; if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { @@ -8876,7 +8893,7 @@ void md_do_sync(struct md_thread *thread) desc, mdname(mddev)); mddev->curr_resync = j; } else - mddev->curr_resync = 3; /* no longer delayed */ + mddev->curr_resync = MD_RESYNC_ACTIVE; /* no longer delayed */ mddev->curr_resync_completed = j; sysfs_notify_dirent_safe(mddev->sysfs_completed); md_new_event(); @@ -9011,14 +9028,14 @@ void md_do_sync(struct md_thread *thread) if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && - mddev->curr_resync > 3) { + mddev->curr_resync >= MD_RESYNC_ACTIVE) { mddev->curr_resync_completed = mddev->curr_resync; sysfs_notify_dirent_safe(mddev->sysfs_completed); } mddev->pers->sync_request(mddev, max_sectors, &skipped); if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && - mddev->curr_resync > 3) { + mddev->curr_resync >= MD_RESYNC_ACTIVE) { if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { if (mddev->curr_resync >= mddev->recovery_cp) { @@ -9082,7 +9099,7 @@ void md_do_sync(struct md_thread *thread) } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) mddev->resync_min = mddev->curr_resync_completed; set_bit(MD_RECOVERY_DONE, &mddev->recovery); - mddev->curr_resync = 0; + mddev->curr_resync = MD_RESYNC_NONE; spin_unlock(&mddev->lock); wake_up(&resync_wait); @@ -9303,6 +9320,7 @@ void md_check_recovery(struct mddev *mddev) * ->spare_active and clear saved_raid_disk */ set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_unregister_thread(&mddev->sync_thread); md_reap_sync_thread(mddev); clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); @@ -9338,6 +9356,7 @@ void md_check_recovery(struct mddev *mddev) goto unlock; } if (mddev->sync_thread) { + md_unregister_thread(&mddev->sync_thread); md_reap_sync_thread(mddev); goto unlock; } @@ -9417,8 +9436,7 @@ void md_reap_sync_thread(struct mddev *mddev) sector_t old_dev_sectors = mddev->dev_sectors; bool is_reshaped = false; - /* 
resync has finished, collect result */ - md_unregister_thread(&mddev->sync_thread); + /* sync_thread should be unregistered, collect result */ if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && mddev->degraded != mddev->raid_disks) { @@ -9466,6 +9484,7 @@ void md_reap_sync_thread(struct mddev *mddev) wake_up(&resync_wait); /* flag recovery needed just to double check */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + sysfs_notify_dirent_safe(mddev->sysfs_completed); sysfs_notify_dirent_safe(mddev->sysfs_action); md_new_event(); if (mddev->event_work.func) @@ -9544,11 +9563,14 @@ EXPORT_SYMBOL_GPL(rdev_clear_badblocks); static int md_notify_reboot(struct notifier_block *this, unsigned long code, void *x) { - struct list_head *tmp; - struct mddev *mddev; + struct mddev *mddev, *n; int need_delay = 0; - for_each_mddev(mddev, tmp) { + spin_lock(&all_mddevs_lock); + list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) { + if (!mddev_get(mddev)) + continue; + spin_unlock(&all_mddevs_lock); if (mddev_trylock(mddev)) { if (mddev->pers) __md_stop_writes(mddev); @@ -9557,7 +9579,11 @@ static int md_notify_reboot(struct notifier_block *this, mddev_unlock(mddev); } need_delay = 1; + mddev_put(mddev); + spin_lock(&all_mddevs_lock); } + spin_unlock(&all_mddevs_lock); + /* * certain more exotic SCSI devices are known to be * volatile wrt too early system reboots. While the @@ -9876,8 +9902,7 @@ void md_autostart_arrays(int part) static __exit void md_exit(void) { - struct mddev *mddev; - struct list_head *tmp; + struct mddev *mddev, *n; int delay = 1; unregister_blkdev(MD_MAJOR,"md"); @@ -9897,17 +9922,24 @@ static __exit void md_exit(void) } remove_proc_entry("mdstat", NULL); - for_each_mddev(mddev, tmp) { + spin_lock(&all_mddevs_lock); + list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) { + if (!mddev_get(mddev)) + continue; + spin_unlock(&all_mddevs_lock); export_array(mddev); mddev->ctime = 0; mddev->hold_active = 0; /* - * for_each_mddev() will call mddev_put() at the end of each - * iteration. As the mddev is now fully clear, this will - * schedule the mddev for destruction by a workqueue, and the + * As the mddev is now fully clear, mddev_put will schedule + * the mddev for destruction by a workqueue, and the * destroy_workqueue() below will wait for that to complete. */ + mddev_put(mddev); + spin_lock(&all_mddevs_lock); } + spin_unlock(&all_mddevs_lock); + destroy_workqueue(md_rdev_misc_wq); destroy_workqueue(md_misc_wq); destroy_workqueue(md_wq); diff --git a/drivers/md/md.h b/drivers/md/md.h index b4f84b27bdef..b4e2d8b87b61 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -254,6 +254,7 @@ struct md_cluster_info; * @MD_NOT_READY: do_md_run() is active, so 'array_state', ust not report that * array is ready yet. * @MD_BROKEN: This is used to stop writes and mark array as failed. + * @MD_DELETED: This device is being deleted * * change UNSUPPORTED_MDDEV_FLAGS for each array type if new flag is added */ @@ -270,6 +271,7 @@ enum mddev_flags { MD_UPDATING_SB, MD_NOT_READY, MD_BROKEN, + MD_DELETED, }; enum mddev_sb_flags { @@ -288,6 +290,21 @@ struct serial_info { sector_t _subtree_last; /* highest sector in subtree of rb node */ }; +/* + * mddev->curr_resync stores the current sector of the resync but + * also has some overloaded values. 
+ */ +enum { + /* No resync in progress */ + MD_RESYNC_NONE = 0, + /* Yielded to allow another conflicting resync to commence */ + MD_RESYNC_YIELDED = 1, + /* Delayed to check that there is no conflict with another sync */ + MD_RESYNC_DELAYED = 2, + /* Any value greater than or equal to this is in an active resync */ + MD_RESYNC_ACTIVE = 3, +}; + struct mddev { void *private; struct md_personality *pers; @@ -750,6 +767,8 @@ extern int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev); extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); extern void mddev_init(struct mddev *mddev); +struct mddev *md_alloc(dev_t dev, char *name); +void mddev_put(struct mddev *mddev); extern int md_run(struct mddev *mddev); extern int md_start(struct mddev *mddev); extern void md_stop(struct mddev *mddev); diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c index 54c089a50b15..11935864f50f 100644 --- a/drivers/md/persistent-data/dm-block-manager.c +++ b/drivers/md/persistent-data/dm-block-manager.c @@ -391,7 +391,8 @@ struct dm_block_manager *dm_block_manager_create(struct block_device *bdev, bm->bufio = dm_bufio_client_create(bdev, block_size, max_held_per_thread, sizeof(struct buffer_aux), dm_block_manager_alloc_callback, - dm_block_manager_write_callback); + dm_block_manager_write_callback, + 0); if (IS_ERR(bm->bufio)) { r = PTR_ERR(bm->bufio); kfree(bm); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 26545950ca42..64d6e4cd8a3a 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2167,9 +2167,12 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev) int err = 0; int number = rdev->raid_disk; struct md_rdev **rdevp; - struct raid10_info *p = conf->mirrors + number; + struct raid10_info *p; print_conf(conf); + if (unlikely(number >= mddev->raid_disks)) + return 0; + p = conf->mirrors + number; if (rdev == p->rdev) rdevp = &p->rdev; else if (rdev == p->replacement) @@ -2636,18 +2639,18 @@ static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev) } static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector, - int sectors, struct page *page, int rw) + int sectors, struct page *page, enum req_op op) { sector_t first_bad; int bad_sectors; if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors) - && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags))) + && (op == REQ_OP_READ || test_bit(WriteErrorSeen, &rdev->flags))) return -1; - if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) + if (sync_page_io(rdev, sector, sectors << 9, page, op, false)) /* success */ return 1; - if (rw == WRITE) { + if (op == REQ_OP_WRITE) { set_bit(WriteErrorSeen, &rdev->flags); if (!test_and_set_bit(WantReplacement, &rdev->flags)) set_bit(MD_RECOVERY_NEEDED, @@ -2777,7 +2780,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 if (r10_sync_page_io(rdev, r10_bio->devs[sl].addr + sect, - s, conf->tmppage, WRITE) + s, conf->tmppage, REQ_OP_WRITE) == 0) { /* Well, this device is dead */ pr_notice("md/raid10:%s: read correction write failed (%d sectors at %llu on %pg)\n", @@ -2811,8 +2814,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 switch (r10_sync_page_io(rdev, r10_bio->devs[sl].addr + sect, - s, conf->tmppage, - READ)) { + s, conf->tmppage, REQ_OP_READ)) { case 0: /* Well, this device is dead */ pr_notice("md/raid10:%s: unable to read back corrected sectors (%d sectors at 
%llu on %pg)\n", diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 6f2dd73128b0..f4e1cc1ece43 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -1590,18 +1590,13 @@ void r5l_quiesce(struct r5l_log *log, int quiesce) bool r5l_log_disk_error(struct r5conf *conf) { - struct r5l_log *log; - bool ret; - /* don't allow write if journal disk is missing */ - rcu_read_lock(); - log = rcu_dereference(conf->log); + struct r5l_log *log = conf->log; + /* don't allow write if journal disk is missing */ if (!log) - ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags); + return test_bit(MD_HAS_JOURNAL, &conf->mddev->flags); else - ret = test_bit(Faulty, &log->rdev->flags); - rcu_read_unlock(); - return ret; + return test_bit(Faulty, &log->rdev->flags); } #define R5L_RECOVERY_PAGE_POOL_SIZE 256 @@ -2534,12 +2529,13 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page) struct r5conf *conf; int ret; - spin_lock(&mddev->lock); + ret = mddev_lock(mddev); + if (ret) + return ret; + conf = mddev->private; - if (!conf || !conf->log) { - spin_unlock(&mddev->lock); - return 0; - } + if (!conf || !conf->log) + goto out_unlock; switch (conf->log->r5c_journal_mode) { case R5C_JOURNAL_MODE_WRITE_THROUGH: @@ -2557,7 +2553,9 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page) default: ret = 0; } - spin_unlock(&mddev->lock); + +out_unlock: + mddev_unlock(mddev); return ret; } @@ -2639,7 +2637,7 @@ int r5c_try_caching_write(struct r5conf *conf, int i; struct r5dev *dev; int to_cache = 0; - void **pslot; + void __rcu **pslot; sector_t tree_index; int ret; uintptr_t refcount; @@ -2806,7 +2804,7 @@ void r5c_finish_stripe_write_out(struct r5conf *conf, int i; int do_wakeup = 0; sector_t tree_index; - void **pslot; + void __rcu **pslot; uintptr_t refcount; if (!log || !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags)) @@ -3145,7 +3143,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) spin_lock_init(&log->stripe_in_journal_lock); atomic_set(&log->stripe_in_journal_count, 0); - rcu_assign_pointer(conf->log, log); + conf->log = log; set_bit(MD_HAS_JOURNAL, &conf->mddev->flags); return 0; @@ -3167,13 +3165,13 @@ void r5l_exit_log(struct r5conf *conf) { struct r5l_log *log = conf->log; - conf->log = NULL; - synchronize_rcu(); - /* Ensure disable_writeback_work wakes up and exits */ wake_up(&conf->mddev->sb_wait); flush_work(&log->disable_writeback_work); md_unregister_thread(&log->reclaim_thread); + + conf->log = NULL; + mempool_exit(&log->meta_pool); bioset_exit(&log->bs); mempool_exit(&log->io_pool); diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h index 43c714a8798c..c8332502669e 100644 --- a/drivers/md/raid5-log.h +++ b/drivers/md/raid5-log.h @@ -2,49 +2,46 @@ #ifndef _RAID5_LOG_H #define _RAID5_LOG_H -extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev); -extern void r5l_exit_log(struct r5conf *conf); -extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh); -extern void r5l_write_stripe_run(struct r5l_log *log); -extern void r5l_flush_stripe_to_raid(struct r5l_log *log); -extern void r5l_stripe_write_finished(struct stripe_head *sh); -extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio); -extern void r5l_quiesce(struct r5l_log *log, int quiesce); -extern bool r5l_log_disk_error(struct r5conf *conf); -extern bool r5c_is_writeback(struct r5l_log *log); -extern int -r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh, - struct stripe_head_state *s, 
int disks); -extern void -r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh, - struct stripe_head_state *s); -extern void r5c_release_extra_page(struct stripe_head *sh); -extern void r5c_use_extra_page(struct stripe_head *sh); -extern void r5l_wake_reclaim(struct r5l_log *log, sector_t space); -extern void r5c_handle_cached_data_endio(struct r5conf *conf, - struct stripe_head *sh, int disks); -extern int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh); -extern void r5c_make_stripe_write_out(struct stripe_head *sh); -extern void r5c_flush_cache(struct r5conf *conf, int num); -extern void r5c_check_stripe_cache_usage(struct r5conf *conf); -extern void r5c_check_cached_full_stripe(struct r5conf *conf); +int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev); +void r5l_exit_log(struct r5conf *conf); +int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh); +void r5l_write_stripe_run(struct r5l_log *log); +void r5l_flush_stripe_to_raid(struct r5l_log *log); +void r5l_stripe_write_finished(struct stripe_head *sh); +int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio); +void r5l_quiesce(struct r5l_log *log, int quiesce); +bool r5l_log_disk_error(struct r5conf *conf); +bool r5c_is_writeback(struct r5l_log *log); +int r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh, + struct stripe_head_state *s, int disks); +void r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh, + struct stripe_head_state *s); +void r5c_release_extra_page(struct stripe_head *sh); +void r5c_use_extra_page(struct stripe_head *sh); +void r5l_wake_reclaim(struct r5l_log *log, sector_t space); +void r5c_handle_cached_data_endio(struct r5conf *conf, + struct stripe_head *sh, int disks); +int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh); +void r5c_make_stripe_write_out(struct stripe_head *sh); +void r5c_flush_cache(struct r5conf *conf, int num); +void r5c_check_stripe_cache_usage(struct r5conf *conf); +void r5c_check_cached_full_stripe(struct r5conf *conf); extern struct md_sysfs_entry r5c_journal_mode; -extern void r5c_update_on_rdev_error(struct mddev *mddev, - struct md_rdev *rdev); -extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect); -extern int r5l_start(struct r5l_log *log); +void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev); +bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect); +int r5l_start(struct r5l_log *log); -extern struct dma_async_tx_descriptor * +struct dma_async_tx_descriptor * ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu, struct dma_async_tx_descriptor *tx); -extern int ppl_init_log(struct r5conf *conf); -extern void ppl_exit_log(struct r5conf *conf); -extern int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh); -extern void ppl_write_stripe_run(struct r5conf *conf); -extern void ppl_stripe_write_finished(struct stripe_head *sh); -extern int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add); -extern void ppl_quiesce(struct r5conf *conf, int quiesce); -extern int ppl_handle_flush_request(struct r5l_log *log, struct bio *bio); +int ppl_init_log(struct r5conf *conf); +void ppl_exit_log(struct r5conf *conf); +int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh); +void ppl_write_stripe_run(struct r5conf *conf); +void ppl_stripe_write_finished(struct stripe_head *sh); +int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add); +void ppl_quiesce(struct r5conf *conf, 
int quiesce); +int ppl_handle_flush_request(struct bio *bio); extern struct md_sysfs_entry ppl_write_hint; static inline bool raid5_has_log(struct r5conf *conf) @@ -111,7 +108,7 @@ static inline int log_handle_flush_request(struct r5conf *conf, struct bio *bio) if (conf->log) ret = r5l_handle_flush_request(conf->log, bio); else if (raid5_has_ppl(conf)) - ret = ppl_handle_flush_request(conf->log, bio); + ret = ppl_handle_flush_request(bio); return ret; } diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index 98988cb26295..31b9157bc9ae 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -679,7 +679,7 @@ void ppl_quiesce(struct r5conf *conf, int quiesce) } } -int ppl_handle_flush_request(struct r5l_log *log, struct bio *bio) +int ppl_handle_flush_request(struct bio *bio) { if (bio->bi_iter.bi_size == 0) { bio_endio(bio); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 5cabdbbac48b..31a0cbf63384 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -61,6 +61,8 @@ #define cpu_to_group(cpu) cpu_to_node(cpu) #define ANY_GROUP NUMA_NO_NODE +#define RAID5_MAX_REQ_STRIPES 256 + static bool devices_handle_discard_safely = false; module_param(devices_handle_discard_safely, bool, 0644); MODULE_PARM_DESC(devices_handle_discard_safely, @@ -624,6 +626,49 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, return NULL; } +static struct stripe_head *find_get_stripe(struct r5conf *conf, + sector_t sector, short generation, int hash) +{ + int inc_empty_inactive_list_flag; + struct stripe_head *sh; + + sh = __find_stripe(conf, sector, generation); + if (!sh) + return NULL; + + if (atomic_inc_not_zero(&sh->count)) + return sh; + + /* + * Slow path. The reference count is zero which means the stripe must + * be on a list (sh->lru). Must remove the stripe from the list that + * references it with the device_lock held. 
+ */ + + spin_lock(&conf->device_lock); + if (!atomic_read(&sh->count)) { + if (!test_bit(STRIPE_HANDLE, &sh->state)) + atomic_inc(&conf->active_stripes); + BUG_ON(list_empty(&sh->lru) && + !test_bit(STRIPE_EXPANDING, &sh->state)); + inc_empty_inactive_list_flag = 0; + if (!list_empty(conf->inactive_list + hash)) + inc_empty_inactive_list_flag = 1; + list_del_init(&sh->lru); + if (list_empty(conf->inactive_list + hash) && + inc_empty_inactive_list_flag) + atomic_inc(&conf->empty_inactive_list_nr); + if (sh->group) { + sh->group->stripes_cnt--; + sh->group = NULL; + } + } + atomic_inc(&sh->count); + spin_unlock(&conf->device_lock); + + return sh; +} + /* * Need to check if array has failed when deciding whether to: * - start an array @@ -710,80 +755,121 @@ static bool has_failed(struct r5conf *conf) return degraded > conf->max_degraded; } -struct stripe_head * -raid5_get_active_stripe(struct r5conf *conf, sector_t sector, - int previous, int noblock, int noquiesce) +enum stripe_result { + STRIPE_SUCCESS = 0, + STRIPE_RETRY, + STRIPE_SCHEDULE_AND_RETRY, + STRIPE_FAIL, +}; + +struct stripe_request_ctx { + /* a reference to the last stripe_head for batching */ + struct stripe_head *batch_last; + + /* first sector in the request */ + sector_t first_sector; + + /* last sector in the request */ + sector_t last_sector; + + /* + * bitmap to track stripe sectors that have been added to stripes + * add one to account for unaligned requests + */ + DECLARE_BITMAP(sectors_to_do, RAID5_MAX_REQ_STRIPES + 1); + + /* the request had REQ_PREFLUSH, cleared after the first stripe_head */ + bool do_flush; +}; + +/* + * Block until another thread clears R5_INACTIVE_BLOCKED or + * there are fewer than 3/4 the maximum number of active stripes + * and there is an inactive stripe available. + */ +static bool is_inactive_blocked(struct r5conf *conf, int hash) +{ + int active = atomic_read(&conf->active_stripes); + + if (list_empty(conf->inactive_list + hash)) + return false; + + if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) + return true; + + return active < (conf->max_nr_stripes * 3 / 4); +} + +static struct stripe_head *__raid5_get_active_stripe(struct r5conf *conf, + struct stripe_request_ctx *ctx, sector_t sector, + bool previous, bool noblock, bool noquiesce) { struct stripe_head *sh; int hash = stripe_hash_locks_hash(conf, sector); - int inc_empty_inactive_list_flag; pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); spin_lock_irq(conf->hash_locks + hash); - do { - wait_event_lock_irq(conf->wait_for_quiescent, - conf->quiesce == 0 || noquiesce, +retry: + if (!noquiesce && conf->quiesce) { + /* + * Must release the reference to batch_last before waiting, + * on quiesce, otherwise the batch_last will hold a reference + * to a stripe and raid5_quiesce() will deadlock waiting for + * active_stripes to go to zero. 
+ */ + if (ctx && ctx->batch_last) { + raid5_release_stripe(ctx->batch_last); + ctx->batch_last = NULL; + } + + wait_event_lock_irq(conf->wait_for_quiescent, !conf->quiesce, *(conf->hash_locks + hash)); - sh = __find_stripe(conf, sector, conf->generation - previous); - if (!sh) { - if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) { - sh = get_free_stripe(conf, hash); - if (!sh && !test_bit(R5_DID_ALLOC, - &conf->cache_state)) - set_bit(R5_ALLOC_MORE, - &conf->cache_state); - } - if (noblock && sh == NULL) - break; + } - r5c_check_stripe_cache_usage(conf); - if (!sh) { - set_bit(R5_INACTIVE_BLOCKED, - &conf->cache_state); - r5l_wake_reclaim(conf->log, 0); - wait_event_lock_irq( - conf->wait_for_stripe, - !list_empty(conf->inactive_list + hash) && - (atomic_read(&conf->active_stripes) - < (conf->max_nr_stripes * 3 / 4) - || !test_bit(R5_INACTIVE_BLOCKED, - &conf->cache_state)), - *(conf->hash_locks + hash)); - clear_bit(R5_INACTIVE_BLOCKED, - &conf->cache_state); - } else { - init_stripe(sh, sector, previous); - atomic_inc(&sh->count); - } - } else if (!atomic_inc_not_zero(&sh->count)) { - spin_lock(&conf->device_lock); - if (!atomic_read(&sh->count)) { - if (!test_bit(STRIPE_HANDLE, &sh->state)) - atomic_inc(&conf->active_stripes); - BUG_ON(list_empty(&sh->lru) && - !test_bit(STRIPE_EXPANDING, &sh->state)); - inc_empty_inactive_list_flag = 0; - if (!list_empty(conf->inactive_list + hash)) - inc_empty_inactive_list_flag = 1; - list_del_init(&sh->lru); - if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag) - atomic_inc(&conf->empty_inactive_list_nr); - if (sh->group) { - sh->group->stripes_cnt--; - sh->group = NULL; - } - } - atomic_inc(&sh->count); - spin_unlock(&conf->device_lock); - } - } while (sh == NULL); + sh = find_get_stripe(conf, sector, conf->generation - previous, hash); + if (sh) + goto out; + + if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) + goto wait_for_stripe; + + sh = get_free_stripe(conf, hash); + if (sh) { + r5c_check_stripe_cache_usage(conf); + init_stripe(sh, sector, previous); + atomic_inc(&sh->count); + goto out; + } + + if (!test_bit(R5_DID_ALLOC, &conf->cache_state)) + set_bit(R5_ALLOC_MORE, &conf->cache_state); + +wait_for_stripe: + if (noblock) + goto out; + set_bit(R5_INACTIVE_BLOCKED, &conf->cache_state); + r5l_wake_reclaim(conf->log, 0); + wait_event_lock_irq(conf->wait_for_stripe, + is_inactive_blocked(conf, hash), + *(conf->hash_locks + hash)); + clear_bit(R5_INACTIVE_BLOCKED, &conf->cache_state); + goto retry; + +out: spin_unlock_irq(conf->hash_locks + hash); return sh; } +struct stripe_head *raid5_get_active_stripe(struct r5conf *conf, + sector_t sector, bool previous, bool noblock, bool noquiesce) +{ + return __raid5_get_active_stripe(conf, NULL, sector, previous, noblock, + noquiesce); +} + static bool is_full_stripe_write(struct stripe_head *sh) { BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded)); @@ -824,13 +910,13 @@ static bool stripe_can_batch(struct stripe_head *sh) } /* we only do back search */ -static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh) +static void stripe_add_to_batch_list(struct r5conf *conf, + struct stripe_head *sh, struct stripe_head *last_sh) { struct stripe_head *head; sector_t head_sector, tmp_sec; int hash; int dd_idx; - int inc_empty_inactive_list_flag; /* Don't cross chunks, so stripe pd_idx/qd_idx is the same */ tmp_sec = sh->sector; @@ -838,36 +924,20 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh 
return; head_sector = sh->sector - RAID5_STRIPE_SECTORS(conf); - hash = stripe_hash_locks_hash(conf, head_sector); - spin_lock_irq(conf->hash_locks + hash); - head = __find_stripe(conf, head_sector, conf->generation); - if (head && !atomic_inc_not_zero(&head->count)) { - spin_lock(&conf->device_lock); - if (!atomic_read(&head->count)) { - if (!test_bit(STRIPE_HANDLE, &head->state)) - atomic_inc(&conf->active_stripes); - BUG_ON(list_empty(&head->lru) && - !test_bit(STRIPE_EXPANDING, &head->state)); - inc_empty_inactive_list_flag = 0; - if (!list_empty(conf->inactive_list + hash)) - inc_empty_inactive_list_flag = 1; - list_del_init(&head->lru); - if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag) - atomic_inc(&conf->empty_inactive_list_nr); - if (head->group) { - head->group->stripes_cnt--; - head->group = NULL; - } - } + if (last_sh && head_sector == last_sh->sector) { + head = last_sh; atomic_inc(&head->count); - spin_unlock(&conf->device_lock); + } else { + hash = stripe_hash_locks_hash(conf, head_sector); + spin_lock_irq(conf->hash_locks + hash); + head = find_get_stripe(conf, head_sector, conf->generation, + hash); + spin_unlock_irq(conf->hash_locks + hash); + if (!head) + return; + if (!stripe_can_batch(head)) + goto out; } - spin_unlock_irq(conf->hash_locks + hash); - - if (!head) - return; - if (!stripe_can_batch(head)) - goto out; lock_two_stripes(head, sh); /* clear_batch_ready clear the flag */ @@ -2882,10 +2952,10 @@ static void raid5_end_write_request(struct bio *bi) if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); - raid5_release_stripe(sh); if (sh->batch_head && sh != sh->batch_head) raid5_release_stripe(sh->batch_head); + raid5_release_stripe(sh); } static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) @@ -3413,39 +3483,32 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, s->locked, s->ops_request); } -/* - * Each stripe/dev can have one or more bion attached. - * toread/towrite point to the first in a chain. - * The bi_next chain must be in order. 
- */ -static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, - int forwrite, int previous) +static bool stripe_bio_overlaps(struct stripe_head *sh, struct bio *bi, + int dd_idx, int forwrite) { - struct bio **bip; struct r5conf *conf = sh->raid_conf; - int firstwrite=0; + struct bio **bip; - pr_debug("adding bi b#%llu to stripe s#%llu\n", - (unsigned long long)bi->bi_iter.bi_sector, - (unsigned long long)sh->sector); + pr_debug("checking bi b#%llu to stripe s#%llu\n", + bi->bi_iter.bi_sector, sh->sector); - spin_lock_irq(&sh->stripe_lock); /* Don't allow new IO added to stripes in batch list */ if (sh->batch_head) - goto overlap; - if (forwrite) { + return true; + + if (forwrite) bip = &sh->dev[dd_idx].towrite; - if (*bip == NULL) - firstwrite = 1; - } else + else bip = &sh->dev[dd_idx].toread; + while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) { if (bio_end_sector(*bip) > bi->bi_iter.bi_sector) - goto overlap; - bip = & (*bip)->bi_next; + return true; + bip = &(*bip)->bi_next; } + if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi)) - goto overlap; + return true; if (forwrite && raid5_has_ppl(conf)) { /* @@ -3474,9 +3537,30 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, } if (first + conf->chunk_sectors * (count - 1) != last) - goto overlap; + return true; + } + + return false; +} + +static void __add_stripe_bio(struct stripe_head *sh, struct bio *bi, + int dd_idx, int forwrite, int previous) +{ + struct r5conf *conf = sh->raid_conf; + struct bio **bip; + int firstwrite = 0; + + if (forwrite) { + bip = &sh->dev[dd_idx].towrite; + if (!*bip) + firstwrite = 1; + } else { + bip = &sh->dev[dd_idx].toread; } + while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) + bip = &(*bip)->bi_next; + if (!forwrite || previous) clear_bit(STRIPE_BATCH_READY, &sh->state); @@ -3502,9 +3586,9 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, sh->overwrite_disks++; } - pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", - (unsigned long long)(*bip)->bi_iter.bi_sector, - (unsigned long long)sh->sector, dd_idx); + pr_debug("added bi b#%llu to stripe s#%llu, disk %d, logical %llu\n", + (*bip)->bi_iter.bi_sector, sh->sector, dd_idx, + sh->dev[dd_idx].sector); if (conf->mddev->bitmap && firstwrite) { /* Cannot hold spinlock over bitmap_startwrite, @@ -3512,7 +3596,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, * we have added to the bitmap and set bm_seq. * So set STRIPE_BITMAP_PENDING to prevent * batching. - * If multiple add_stripe_bio() calls race here they + * If multiple __add_stripe_bio() calls race here they * much all set STRIPE_BITMAP_PENDING. So only the first one * to complete "bitmap_startwrite" gets to set * STRIPE_BIT_DELAY. This is important as once a stripe @@ -3530,16 +3614,27 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, set_bit(STRIPE_BIT_DELAY, &sh->state); } } - spin_unlock_irq(&sh->stripe_lock); +} - if (stripe_can_batch(sh)) - stripe_add_to_batch_list(conf, sh); - return 1; +/* + * Each stripe/dev can have one or more bios attached. + * toread/towrite point to the first in a chain. + * The bi_next chain must be in order. 
+ */ +static bool add_stripe_bio(struct stripe_head *sh, struct bio *bi, + int dd_idx, int forwrite, int previous) +{ + spin_lock_irq(&sh->stripe_lock); + + if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) { + set_bit(R5_Overlap, &sh->dev[dd_idx].flags); + spin_unlock_irq(&sh->stripe_lock); + return false; + } - overlap: - set_bit(R5_Overlap, &sh->dev[dd_idx].flags); + __add_stripe_bio(sh, bi, dd_idx, forwrite, previous); spin_unlock_irq(&sh->stripe_lock); - return 0; + return true; } static void end_reshape(struct r5conf *conf); @@ -5785,17 +5880,215 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) bio_endio(bi); } -static bool raid5_make_request(struct mddev *mddev, struct bio * bi) +static bool ahead_of_reshape(struct mddev *mddev, sector_t sector, + sector_t reshape_sector) { - struct r5conf *conf = mddev->private; + return mddev->reshape_backwards ? sector < reshape_sector : + sector >= reshape_sector; +} + +static bool range_ahead_of_reshape(struct mddev *mddev, sector_t min, + sector_t max, sector_t reshape_sector) +{ + return mddev->reshape_backwards ? max < reshape_sector : + min >= reshape_sector; +} + +static bool stripe_ahead_of_reshape(struct mddev *mddev, struct r5conf *conf, + struct stripe_head *sh) +{ + sector_t max_sector = 0, min_sector = MaxSector; + bool ret = false; int dd_idx; - sector_t new_sector; - sector_t logical_sector, last_sector; + + for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) { + if (dd_idx == sh->pd_idx) + continue; + + min_sector = min(min_sector, sh->dev[dd_idx].sector); + max_sector = min(max_sector, sh->dev[dd_idx].sector); + } + + spin_lock_irq(&conf->device_lock); + + if (!range_ahead_of_reshape(mddev, min_sector, max_sector, + conf->reshape_progress)) + /* mismatch, need to try again */ + ret = true; + + spin_unlock_irq(&conf->device_lock); + + return ret; +} + +static int add_all_stripe_bios(struct r5conf *conf, + struct stripe_request_ctx *ctx, struct stripe_head *sh, + struct bio *bi, int forwrite, int previous) +{ + int dd_idx; + int ret = 1; + + spin_lock_irq(&sh->stripe_lock); + + for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) { + struct r5dev *dev = &sh->dev[dd_idx]; + + if (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx) + continue; + + if (dev->sector < ctx->first_sector || + dev->sector >= ctx->last_sector) + continue; + + if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) { + set_bit(R5_Overlap, &dev->flags); + ret = 0; + continue; + } + } + + if (!ret) + goto out; + + for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) { + struct r5dev *dev = &sh->dev[dd_idx]; + + if (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx) + continue; + + if (dev->sector < ctx->first_sector || + dev->sector >= ctx->last_sector) + continue; + + __add_stripe_bio(sh, bi, dd_idx, forwrite, previous); + clear_bit((dev->sector - ctx->first_sector) >> + RAID5_STRIPE_SHIFT(conf), ctx->sectors_to_do); + } + +out: + spin_unlock_irq(&sh->stripe_lock); + return ret; +} + +static enum stripe_result make_stripe_request(struct mddev *mddev, + struct r5conf *conf, struct stripe_request_ctx *ctx, + sector_t logical_sector, struct bio *bi) +{ + const int rw = bio_data_dir(bi); + enum stripe_result ret; struct stripe_head *sh; + sector_t new_sector; + int previous = 0; + int seq, dd_idx; + + seq = read_seqcount_begin(&conf->gen_lock); + + if (unlikely(conf->reshape_progress != MaxSector)) { + /* + * Spinlock is needed as reshape_progress may be + * 64bit on a 32bit platform, and so it might be + * possible to see a half-updated value + * Of course 
reshape_progress could change after + * the lock is dropped, so once we get a reference + * to the stripe that we think it is, we will have + * to check again. + */ + spin_lock_irq(&conf->device_lock); + if (ahead_of_reshape(mddev, logical_sector, + conf->reshape_progress)) { + previous = 1; + } else { + if (ahead_of_reshape(mddev, logical_sector, + conf->reshape_safe)) { + spin_unlock_irq(&conf->device_lock); + return STRIPE_SCHEDULE_AND_RETRY; + } + } + spin_unlock_irq(&conf->device_lock); + } + + new_sector = raid5_compute_sector(conf, logical_sector, previous, + &dd_idx, NULL); + pr_debug("raid456: %s, sector %llu logical %llu\n", __func__, + new_sector, logical_sector); + + sh = __raid5_get_active_stripe(conf, ctx, new_sector, previous, + (bi->bi_opf & REQ_RAHEAD), 0); + if (unlikely(!sh)) { + /* cannot get stripe, just give-up */ + bi->bi_status = BLK_STS_IOERR; + return STRIPE_FAIL; + } + + if (unlikely(previous) && + stripe_ahead_of_reshape(mddev, conf, sh)) { + /* + * Expansion moved on while waiting for a stripe. + * Expansion could still move past after this + * test, but as we are holding a reference to + * 'sh', we know that if that happens, + * STRIPE_EXPANDING will get set and the expansion + * won't proceed until we finish with the stripe. + */ + ret = STRIPE_SCHEDULE_AND_RETRY; + goto out_release; + } + + if (read_seqcount_retry(&conf->gen_lock, seq)) { + /* Might have got the wrong stripe_head by accident */ + ret = STRIPE_RETRY; + goto out_release; + } + + if (test_bit(STRIPE_EXPANDING, &sh->state) || + !add_all_stripe_bios(conf, ctx, sh, bi, rw, previous)) { + /* + * Stripe is busy expanding or add failed due to + * overlap. Flush everything and wait a while. + */ + md_wakeup_thread(mddev->thread); + ret = STRIPE_SCHEDULE_AND_RETRY; + goto out_release; + } + + if (stripe_can_batch(sh)) { + stripe_add_to_batch_list(conf, sh, ctx->batch_last); + if (ctx->batch_last) + raid5_release_stripe(ctx->batch_last); + atomic_inc(&sh->count); + ctx->batch_last = sh; + } + + if (ctx->do_flush) { + set_bit(STRIPE_R5C_PREFLUSH, &sh->state); + /* we only need flush for one stripe */ + ctx->do_flush = false; + } + + set_bit(STRIPE_HANDLE, &sh->state); + clear_bit(STRIPE_DELAYED, &sh->state); + if ((!sh->batch_head || sh == sh->batch_head) && + (bi->bi_opf & REQ_SYNC) && + !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + atomic_inc(&conf->preread_active_stripes); + + release_stripe_plug(mddev, sh); + return STRIPE_SUCCESS; + +out_release: + raid5_release_stripe(sh); + return ret; +} + +static bool raid5_make_request(struct mddev *mddev, struct bio * bi) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + struct r5conf *conf = mddev->private; + sector_t logical_sector; + struct stripe_request_ctx ctx = {}; const int rw = bio_data_dir(bi); - DEFINE_WAIT(w); - bool do_prepare; - bool do_flush = false; + enum stripe_result res; + int s, stripe_cnt; if (unlikely(bi->bi_opf & REQ_PREFLUSH)) { int ret = log_handle_flush_request(conf, bi); @@ -5811,7 +6104,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH, * we need to flush journal device */ - do_flush = bi->bi_opf & REQ_PREFLUSH; + ctx.do_flush = bi->bi_opf & REQ_PREFLUSH; } if (!md_write_start(mddev, bi)) @@ -5835,134 +6128,68 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) } logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1); - last_sector = bio_end_sector(bi); + ctx.first_sector = logical_sector; + 
ctx.last_sector = bio_end_sector(bi); bi->bi_next = NULL; + stripe_cnt = DIV_ROUND_UP_SECTOR_T(ctx.last_sector - logical_sector, + RAID5_STRIPE_SECTORS(conf)); + bitmap_set(ctx.sectors_to_do, 0, stripe_cnt); + + pr_debug("raid456: %s, logical %llu to %llu\n", __func__, + bi->bi_iter.bi_sector, ctx.last_sector); + /* Bail out if conflicts with reshape and REQ_NOWAIT is set */ if ((bi->bi_opf & REQ_NOWAIT) && (conf->reshape_progress != MaxSector) && - (mddev->reshape_backwards - ? (logical_sector > conf->reshape_progress && logical_sector <= conf->reshape_safe) - : (logical_sector >= conf->reshape_safe && logical_sector < conf->reshape_progress))) { + !ahead_of_reshape(mddev, logical_sector, conf->reshape_progress) && + ahead_of_reshape(mddev, logical_sector, conf->reshape_safe)) { bio_wouldblock_error(bi); if (rw == WRITE) md_write_end(mddev); return true; } md_account_bio(mddev, &bi); - prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); - for (; logical_sector < last_sector; logical_sector += RAID5_STRIPE_SECTORS(conf)) { - int previous; - int seq; - - do_prepare = false; - retry: - seq = read_seqcount_begin(&conf->gen_lock); - previous = 0; - if (do_prepare) - prepare_to_wait(&conf->wait_for_overlap, &w, - TASK_UNINTERRUPTIBLE); - if (unlikely(conf->reshape_progress != MaxSector)) { - /* spinlock is needed as reshape_progress may be - * 64bit on a 32bit platform, and so it might be - * possible to see a half-updated value - * Of course reshape_progress could change after - * the lock is dropped, so once we get a reference - * to the stripe that we think it is, we will have - * to check again. - */ - spin_lock_irq(&conf->device_lock); - if (mddev->reshape_backwards - ? logical_sector < conf->reshape_progress - : logical_sector >= conf->reshape_progress) { - previous = 1; - } else { - if (mddev->reshape_backwards - ? logical_sector < conf->reshape_safe - : logical_sector >= conf->reshape_safe) { - spin_unlock_irq(&conf->device_lock); - schedule(); - do_prepare = true; - goto retry; - } - } - spin_unlock_irq(&conf->device_lock); - } - new_sector = raid5_compute_sector(conf, logical_sector, - previous, - &dd_idx, NULL); - pr_debug("raid456: raid5_make_request, sector %llu logical %llu\n", - (unsigned long long)new_sector, - (unsigned long long)logical_sector); + add_wait_queue(&conf->wait_for_overlap, &wait); + while (1) { + res = make_stripe_request(mddev, conf, &ctx, logical_sector, + bi); + if (res == STRIPE_FAIL) + break; - sh = raid5_get_active_stripe(conf, new_sector, previous, - (bi->bi_opf & REQ_RAHEAD), 0); - if (sh) { - if (unlikely(previous)) { - /* expansion might have moved on while waiting for a - * stripe, so we must do the range check again. - * Expansion could still move past after this - * test, but as we are holding a reference to - * 'sh', we know that if that happens, - * STRIPE_EXPANDING will get set and the expansion - * won't proceed until we finish with the stripe. - */ - int must_retry = 0; - spin_lock_irq(&conf->device_lock); - if (mddev->reshape_backwards - ? 
logical_sector >= conf->reshape_progress - : logical_sector < conf->reshape_progress) - /* mismatch, need to try again */ - must_retry = 1; - spin_unlock_irq(&conf->device_lock); - if (must_retry) { - raid5_release_stripe(sh); - schedule(); - do_prepare = true; - goto retry; - } - } - if (read_seqcount_retry(&conf->gen_lock, seq)) { - /* Might have got the wrong stripe_head - * by accident - */ - raid5_release_stripe(sh); - goto retry; - } + if (res == STRIPE_RETRY) + continue; - if (test_bit(STRIPE_EXPANDING, &sh->state) || - !add_stripe_bio(sh, bi, dd_idx, rw, previous)) { - /* Stripe is busy expanding or - * add failed due to overlap. Flush everything - * and wait a while - */ - md_wakeup_thread(mddev->thread); - raid5_release_stripe(sh); - schedule(); - do_prepare = true; - goto retry; - } - if (do_flush) { - set_bit(STRIPE_R5C_PREFLUSH, &sh->state); - /* we only need flush for one stripe */ - do_flush = false; + if (res == STRIPE_SCHEDULE_AND_RETRY) { + /* + * Must release the reference to batch_last before + * scheduling and waiting for work to be done, + * otherwise the batch_last stripe head could prevent + * raid5_activate_delayed() from making progress + * and thus deadlocking. + */ + if (ctx.batch_last) { + raid5_release_stripe(ctx.batch_last); + ctx.batch_last = NULL; } - set_bit(STRIPE_HANDLE, &sh->state); - clear_bit(STRIPE_DELAYED, &sh->state); - if ((!sh->batch_head || sh == sh->batch_head) && - (bi->bi_opf & REQ_SYNC) && - !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) - atomic_inc(&conf->preread_active_stripes); - release_stripe_plug(mddev, sh); - } else { - /* cannot get stripe for read-ahead, just give-up */ - bi->bi_status = BLK_STS_IOERR; - break; + wait_woken(&wait, TASK_UNINTERRUPTIBLE, + MAX_SCHEDULE_TIMEOUT); + continue; } + + s = find_first_bit(ctx.sectors_to_do, stripe_cnt); + if (s == stripe_cnt) + break; + + logical_sector = ctx.first_sector + + (s << RAID5_STRIPE_SHIFT(conf)); } - finish_wait(&conf->wait_for_overlap, &w); + remove_wait_queue(&conf->wait_for_overlap, &wait); + + if (ctx.batch_last) + raid5_release_stripe(ctx.batch_last); if (rw == WRITE) md_write_end(mddev); @@ -7417,7 +7644,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) conf->shrinker.count_objects = raid5_cache_count; conf->shrinker.batch = 128; conf->shrinker.flags = 0; - ret = register_shrinker(&conf->shrinker); + ret = register_shrinker(&conf->shrinker, "md-raid5:%s", mdname(mddev)); if (ret) { pr_warn("md/raid:%s: couldn't register shrinker.\n", mdname(mddev)); @@ -7815,7 +8042,15 @@ static int raid5_run(struct mddev *mddev) mddev->queue->limits.discard_granularity < stripe) blk_queue_max_discard_sectors(mddev->queue, 0); - blk_queue_max_hw_sectors(mddev->queue, UINT_MAX); + /* + * Requests require having a bitmap for each stripe. + * Limit the max sectors based on this. + */ + blk_queue_max_hw_sectors(mddev->queue, + RAID5_MAX_REQ_STRIPES << RAID5_STRIPE_SHIFT(conf)); + + /* No restrictions on the number of segments in the request */ + blk_queue_max_segments(mddev->queue, USHRT_MAX); } if (log_init(conf, journal_dev, raid5_has_ppl(conf))) @@ -8066,8 +8301,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) * find the disk ... but prefer rdev->saved_raid_disk * if possible. 
*/ - if (rdev->saved_raid_disk >= 0 && - rdev->saved_raid_disk >= first && + if (rdev->saved_raid_disk >= first && rdev->saved_raid_disk <= last && conf->disks[rdev->saved_raid_disk].rdev == NULL) first = rdev->saved_raid_disk; @@ -8704,8 +8938,11 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf) err = log_init(conf, NULL, true); if (!err) { err = resize_stripes(conf, conf->pool_size); - if (err) + if (err) { + mddev_suspend(mddev); log_exit(conf); + mddev_resume(mddev); + } } } else err = -EINVAL; diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 638d29863503..a5082bed83c8 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -812,7 +812,7 @@ extern sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, struct stripe_head *sh); extern struct stripe_head * raid5_get_active_stripe(struct r5conf *conf, sector_t sector, - int previous, int noblock, int noquiesce); + bool previous, bool noblock, bool noquiesce); extern int raid5_calc_degraded(struct r5conf *conf); extern int r5c_journal_mode_set(struct mddev *mddev, int journal_mode); #endif |
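
The raid5_make_request() rewrite in the hunks above replaces the per-sector retry loop with a per-request context: a bitmap with one bit per stripe-sized chunk (RAID5_MAX_REQ_STRIPES + 1 bits) is filled with bitmap_set(), add_all_stripe_bios() clears the bits it manages to attach to a stripe_head, and the outer loop uses find_first_bit() to jump to the next chunk that still needs work. The short userspace sketch below illustrates only that bookkeeping pattern; it is not kernel code, and request_ctx, set_todo(), clear_todo(), first_todo() and the fixed 8-sector stripe size are made-up stand-ins for struct stripe_request_ctx, bitmap_set(), find_first_bit() and RAID5_STRIPE_SHIFT().

/*
 * Illustrative userspace analogue of the sectors_to_do bookkeeping in the
 * new raid5_make_request(); hypothetical helper names, not kernel code.
 */
#include <stdio.h>
#include <stdint.h>

#define MAX_REQ_STRIPES	256	/* mirrors RAID5_MAX_REQ_STRIPES */
#define STRIPE_SHIFT	3	/* assume 8-sector stripes for the demo */

struct request_ctx {
	uint64_t first_sector;
	uint64_t last_sector;
	/* one bit per stripe-sized chunk, plus one for an unaligned tail */
	uint64_t todo[(MAX_REQ_STRIPES + 1 + 63) / 64];
};

static void set_todo(struct request_ctx *ctx, int bit)
{
	ctx->todo[bit / 64] |= 1ULL << (bit % 64);
}

static void clear_todo(struct request_ctx *ctx, int bit)
{
	ctx->todo[bit / 64] &= ~(1ULL << (bit % 64));
}

/* behaves like find_first_bit(): returns nbits when no bit is set */
static int first_todo(const struct request_ctx *ctx, int nbits)
{
	for (int i = 0; i < nbits; i++)
		if (ctx->todo[i / 64] & (1ULL << (i % 64)))
			return i;
	return nbits;
}

int main(void)
{
	struct request_ctx ctx = { .first_sector = 0, .last_sector = 40 };
	int stripe_cnt = (int)((ctx.last_sector - ctx.first_sector +
				(1 << STRIPE_SHIFT) - 1) >> STRIPE_SHIFT);
	int s;

	/* corresponds to bitmap_set(ctx.sectors_to_do, 0, stripe_cnt) */
	for (s = 0; s < stripe_cnt; s++)
		set_todo(&ctx, s);

	/*
	 * Each pass "attaches" one chunk and clears its bit, then jumps to
	 * the next pending chunk, as the new while loop does after
	 * make_stripe_request() returns STRIPE_SUCCESS.
	 */
	while ((s = first_todo(&ctx, stripe_cnt)) != stripe_cnt) {
		uint64_t logical = ctx.first_sector +
				   ((uint64_t)s << STRIPE_SHIFT);

		printf("handling stripe at sector %llu\n",
		       (unsigned long long)logical);
		clear_todo(&ctx, s);	/* add_all_stripe_bios() clears as it goes */
	}
	return 0;
}

Clearing bits inside add_all_stripe_bios() is what lets a single stripe_head consume several chunks of the request in one pass, so the outer loop only revisits sectors whose bit is still set.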