Diffstat (limited to 'drivers/md')
81 files changed, 1227 insertions, 790 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 4a249ee86364..83b9362be09c 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -178,7 +178,7 @@ config MD_FAULTY config MD_CLUSTER - tristate "Cluster Support for MD (EXPERIMENTAL)" + tristate "Cluster Support for MD" depends on BLK_DEV_MD depends on DLM default n @@ -188,7 +188,8 @@ config MD_CLUSTER nodes in the cluster can access the MD devices simultaneously. This brings the redundancy (and uptime) of RAID levels across the - nodes of the cluster. + nodes of the cluster. Currently, it can work with raid1 and raid10 + (limited support). If unsure, say N. diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 786ec9e86d65..f701bb211783 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for the kernel software RAID and LVM drivers. # @@ -18,9 +19,12 @@ dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o \ dm-cache-smq-y += dm-cache-policy-smq.o dm-era-y += dm-era-target.o dm-verity-y += dm-verity-target.o -md-mod-y += md.o bitmap.o +md-mod-y += md.o md-bitmap.o raid456-y += raid5.o raid5-cache.o raid5-ppl.o dm-zoned-y += dm-zoned-target.o dm-zoned-metadata.o dm-zoned-reclaim.o +linear-y += md-linear.o +multipath-y += md-multipath.o +faulty-y += md-faulty.o # Note: link order is important. All raid personalities # and must come before md.o, as they each initialise diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile index c488b846f831..d26b35195825 100644 --- a/drivers/md/bcache/Makefile +++ b/drivers/md/bcache/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_BCACHE) += bcache.o diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index cacbe2dbd5c3..a27d85232ce1 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Primary bucket allocation code * @@ -406,7 +407,8 @@ long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait) finish_wait(&ca->set->bucket_wait, &w); out: - wake_up_process(ca->alloc_thread); + if (ca->alloc_thread) + wake_up_process(ca->alloc_thread); trace_bcache_alloc(ca, reserve); @@ -441,6 +443,11 @@ out: b->prio = INITIAL_PRIO; } + if (ca->set->avail_nbuckets > 0) { + ca->set->avail_nbuckets--; + bch_update_bucket_in_use(ca->set, &ca->set->gc_stats); + } + return r; } @@ -448,6 +455,11 @@ void __bch_bucket_free(struct cache *ca, struct bucket *b) { SET_GC_MARK(b, 0); SET_GC_SECTORS_USED(b, 0); + + if (ca->set->avail_nbuckets < ca->set->nbuckets) { + ca->set->avail_nbuckets++; + bch_update_bucket_in_use(ca->set, &ca->set->gc_stats); + } } void bch_bucket_free(struct cache_set *c, struct bkey *k) @@ -600,7 +612,7 @@ bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors, /* * If we had to allocate, we might race and not need to allocate the - * second time we call find_data_bucket(). If we allocated a bucket but + * second time we call pick_data_bucket(). 
If we allocated a bucket but * didn't use it, drop the refcount bch_bucket_alloc_set() took: */ if (KEY_PTRS(&alloc.key)) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 2ed9bd231d84..843877e017e1 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHE_H #define _BCACHE_H @@ -184,6 +185,7 @@ #include <linux/mutex.h> #include <linux/rbtree.h> #include <linux/rwsem.h> +#include <linux/refcount.h> #include <linux/types.h> #include <linux/workqueue.h> @@ -265,9 +267,6 @@ struct bcache_device { atomic_t *stripe_sectors_dirty; unsigned long *full_dirty_stripes; - unsigned long sectors_dirty_last; - long sectors_dirty_derivative; - struct bio_set *bio_split; unsigned data_csum:1; @@ -299,7 +298,7 @@ struct cached_dev { struct semaphore sb_write_mutex; /* Refcount on the cache set. Always nonzero when we're caching. */ - atomic_t count; + refcount_t count; struct work_struct detach; /* @@ -362,12 +361,14 @@ struct cached_dev { uint64_t writeback_rate_target; int64_t writeback_rate_proportional; - int64_t writeback_rate_derivative; - int64_t writeback_rate_change; + int64_t writeback_rate_integral; + int64_t writeback_rate_integral_scaled; + int32_t writeback_rate_change; unsigned writeback_rate_update_seconds; - unsigned writeback_rate_d_term; + unsigned writeback_rate_i_term_inverse; unsigned writeback_rate_p_term_inverse; + unsigned writeback_rate_minimum; }; enum alloc_reserve { @@ -581,6 +582,7 @@ struct cache_set { uint8_t need_gc; struct gc_stat gc_stats; size_t nbuckets; + size_t avail_nbuckets; struct task_struct *gc_thread; /* Where in the btree gc currently is */ @@ -806,13 +808,13 @@ do { \ static inline void cached_dev_put(struct cached_dev *dc) { - if (atomic_dec_and_test(&dc->count)) + if (refcount_dec_and_test(&dc->count)) schedule_work(&dc->detach); } static inline bool cached_dev_get(struct cached_dev *dc) { - if (!atomic_inc_not_zero(&dc->count)) + if (!refcount_inc_not_zero(&dc->count)) return false; /* Paired with the mb in cached_dev_attach */ diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 18526d44688d..e56d3ecdbfcb 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Code for working with individual keys, and sorted sets of keys with in a * btree node diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index ae964624efb2..fa506c1aa524 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHE_BSET_H #define _BCACHE_BSET_H diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 866dcf78ff8e..11c5503d31dc 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com> * @@ -1240,6 +1241,11 @@ void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k) __bch_btree_mark_key(c, level, k); } +void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats) +{ + stats->in_use = (c->nbuckets - c->avail_nbuckets) * 100 / c->nbuckets; +} + static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) { uint8_t stale = 0; @@ -1651,9 +1657,8 @@ static void btree_gc_start(struct cache_set *c) mutex_unlock(&c->bucket_lock); } -static size_t bch_btree_gc_finish(struct cache_set *c) +static void 
bch_btree_gc_finish(struct cache_set *c) { - size_t available = 0; struct bucket *b; struct cache *ca; unsigned i; @@ -1690,6 +1695,7 @@ static size_t bch_btree_gc_finish(struct cache_set *c) } rcu_read_unlock(); + c->avail_nbuckets = 0; for_each_cache(ca, c, i) { uint64_t *i; @@ -1711,18 +1717,16 @@ static size_t bch_btree_gc_finish(struct cache_set *c) BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b)); if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE) - available++; + c->avail_nbuckets++; } } mutex_unlock(&c->bucket_lock); - return available; } static void bch_btree_gc(struct cache_set *c) { int ret; - unsigned long available; struct gc_stat stats; struct closure writes; struct btree_op op; @@ -1745,14 +1749,14 @@ static void bch_btree_gc(struct cache_set *c) pr_warn("gc failed!"); } while (ret); - available = bch_btree_gc_finish(c); + bch_btree_gc_finish(c); wake_up_allocators(c); bch_time_stats_update(&c->btree_gc_time, start_time); stats.key_bytes *= sizeof(uint64_t); stats.data <<= 9; - stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; + bch_update_bucket_in_use(c, &stats); memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); trace_bcache_gc_end(c); diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 73da1f5626cb..d211e2c25b6b 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHE_BTREE_H #define _BCACHE_BTREE_H @@ -305,5 +306,5 @@ void bch_keybuf_del(struct keybuf *, struct keybuf_key *); struct keybuf_key *bch_keybuf_next(struct keybuf *); struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, struct keybuf *, struct bkey *, keybuf_pred_fn *); - +void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats); #endif diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c index 7d5286b05036..1841d0359bac 100644 --- a/drivers/md/bcache/closure.c +++ b/drivers/md/bcache/closure.c @@ -64,7 +64,7 @@ EXPORT_SYMBOL(closure_put); void __closure_wake_up(struct closure_waitlist *wait_list) { struct llist_node *list; - struct closure *cl; + struct closure *cl, *t; struct llist_node *reverse = NULL; list = llist_del_all(&wait_list->list); @@ -73,7 +73,7 @@ void __closure_wake_up(struct closure_waitlist *wait_list) reverse = llist_reverse_order(list); /* Then do the wakeups */ - llist_for_each_entry(cl, reverse, list) { + llist_for_each_entry_safe(cl, t, reverse, list) { closure_set_waiting(cl, 0); closure_sub(cl, CLOSURE_WAITING + 1); } diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h index 295b7e43f92c..ccfbea6f9f6b 100644 --- a/drivers/md/bcache/closure.h +++ b/drivers/md/bcache/closure.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_CLOSURE_H #define _LINUX_CLOSURE_H @@ -251,6 +252,12 @@ static inline void set_closure_fn(struct closure *cl, closure_fn *fn, static inline void closure_queue(struct closure *cl) { struct workqueue_struct *wq = cl->wq; + /** + * Changes made to closure, work_struct, or a couple of other structs + * may cause work.func not pointing to the right location. 
+ */ + BUILD_BUG_ON(offsetof(struct closure, fn) + != offsetof(struct work_struct, func)); if (wq) { INIT_WORK(&cl->work, cl->work.func); BUG_ON(!queue_work(wq, &cl->work)); diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 61076eda2e6d..c7a02c4900da 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Assorted bcache debug code * diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h index 1f63c195d247..acc48d3fa274 100644 --- a/drivers/md/bcache/debug.h +++ b/drivers/md/bcache/debug.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHE_DEBUG_H #define _BCACHE_DEBUG_H diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c index 243de0bf15cd..41c238fc3733 100644 --- a/drivers/md/bcache/extents.c +++ b/drivers/md/bcache/extents.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com> * diff --git a/drivers/md/bcache/extents.h b/drivers/md/bcache/extents.h index e2ed54054e7a..0cd3575afa1d 100644 --- a/drivers/md/bcache/extents.h +++ b/drivers/md/bcache/extents.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHE_EXTENTS_H #define _BCACHE_EXTENTS_H diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index 7e871bdc0097..fac97ec2d0e2 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Some low level IO code, and hacks for various block layer limitations * diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 7e1d1c3ba33a..02a98ddb592d 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * bcache journalling code, for btree insertions * diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h index e3c39457afbb..b5788199188f 100644 --- a/drivers/md/bcache/journal.h +++ b/drivers/md/bcache/journal.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHE_JOURNAL_H #define _BCACHE_JOURNAL_H diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index f633b30c962e..d50c1c97da68 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Moving/copying garbage collector * diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 681b4f12b05a..3a7aed7282b2 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Main bcache entry point - handle a read or a write request and decide what to * do with it; the make_request functions are called by the block layer. 
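The BUILD_BUG_ON() added to closure_queue() above guards a layout assumption: the queueing path re-submits the closure by reading cl->work.func, while callers store the callback through the fn member, so the two must overlap at the same offset. The comment added with it warns that reordering struct closure or struct work_struct could break that aliasing; the assertion turns such a mistake into a build failure instead of a garbage function pointer at run time. Below is a minimal user-space sketch of the same compile-time offset-assertion idiom; the names (fake_work, fake_closure, log_fn) are illustrative stand-ins, not the kernel definitions.

/*
 * Illustrative stand-ins only -- not the kernel's struct closure or
 * struct work_struct. Build with e.g.: cc -std=c11 offset_check.c
 */
#include <stddef.h>
#include <stdio.h>

typedef void (work_fn)(void *arg);

struct fake_work {
	void *entry;            /* list linkage used by the "workqueue" */
	work_fn *func;          /* what the queueing side reads */
};

struct fake_closure {
	union {
		struct {
			void *list;     /* mirrors fake_work.entry */
			work_fn *fn;    /* what the submitter writes */
		};
		struct fake_work work;  /* view used when queueing */
	};
	void *wq;                       /* unrelated members may follow */
};

/*
 * Compile-time guard: if a member is ever added or reordered so that fn
 * no longer overlaps work.func, the build fails right here.
 */
_Static_assert(offsetof(struct fake_closure, fn) ==
	       offsetof(struct fake_work, func),
	       "fn must alias work.func");

static void log_fn(void *arg)
{
	(void)arg;
	puts("callback ran");
}

int main(void)
{
	struct fake_closure cl;

	cl.fn = log_fn;          /* submitter stores the callback via fn */
	cl.work.func(&cl);       /* queueing side reads it through work.func */
	return 0;
}

The patch's BUILD_BUG_ON compares offsets across the two kernel structs directly, which only holds if the aliasing union sits at a matching position inside struct closure; the sketch mirrors that shape by placing the union at offset zero.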
@@ -26,12 +27,12 @@ struct kmem_cache *bch_search_cache; static void bch_data_insert_start(struct closure *); -static unsigned cache_mode(struct cached_dev *dc, struct bio *bio) +static unsigned cache_mode(struct cached_dev *dc) { return BDEV_CACHE_MODE(&dc->sb); } -static bool verify(struct cached_dev *dc, struct bio *bio) +static bool verify(struct cached_dev *dc) { return dc->verify; } @@ -369,7 +370,7 @@ static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k) static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) { struct cache_set *c = dc->disk.c; - unsigned mode = cache_mode(dc, bio); + unsigned mode = cache_mode(dc); unsigned sectors, congested = bch_get_congested(c); struct task_struct *task = current; struct io *i; @@ -384,6 +385,14 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) op_is_write(bio_op(bio)))) goto skip; + /* + * Flag for bypass if the IO is for read-ahead or background, + * unless the read-ahead request is for metadata (eg, for gfs2). + */ + if (bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND) && + !(bio->bi_opf & REQ_META)) + goto skip; + if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) || bio_sectors(bio) & (c->sb.block_size - 1)) { pr_debug("skipping unaligned io"); @@ -462,6 +471,7 @@ struct search { unsigned recoverable:1; unsigned write:1; unsigned read_dirty_data:1; + unsigned cache_missed:1; unsigned long start_time; @@ -648,6 +658,7 @@ static inline struct search *search_alloc(struct bio *bio, s->orig_bio = bio; s->cache_miss = NULL; + s->cache_missed = 0; s->d = d; s->recoverable = 1; s->write = op_is_write(bio_op(bio)); @@ -697,8 +708,16 @@ static void cached_dev_read_error(struct closure *cl) { struct search *s = container_of(cl, struct search, cl); struct bio *bio = &s->bio.bio; + struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); - if (s->recoverable) { + /* + * If cache device is dirty (dc->has_dirty is non-zero), then + * recovery a failed read request from cached device may get a + * stale data back. So read failure recovery is only permitted + * when cache device is clean. + */ + if (s->recoverable && + (dc && !atomic_read(&dc->has_dirty))) { /* Retry from the backing device: */ trace_bcache_read_retry(s->orig_bio); @@ -739,7 +758,7 @@ static void cached_dev_read_done(struct closure *cl) s->cache_miss = NULL; } - if (verify(dc, &s->bio.bio) && s->recoverable && !s->read_dirty_data) + if (verify(dc) && s->recoverable && !s->read_dirty_data) bch_data_verify(dc, s->orig_bio); bio_complete(s); @@ -759,12 +778,12 @@ static void cached_dev_read_done_bh(struct closure *cl) struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); bch_mark_cache_accounting(s->iop.c, s->d, - !s->cache_miss, s->iop.bypass); + !s->cache_missed, s->iop.bypass); trace_bcache_read(s->orig_bio, !s->cache_miss, s->iop.bypass); if (s->iop.status) continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq); - else if (s->iop.bio || verify(dc, &s->bio.bio)) + else if (s->iop.bio || verify(dc)) continue_at_nobarrier(cl, cached_dev_read_done, bcache_wq); else continue_at_nobarrier(cl, cached_dev_bio_complete, NULL); @@ -778,6 +797,8 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); struct bio *miss, *cache_bio; + s->cache_missed = 1; + if (s->cache_miss || s->iop.bypass) { miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split); ret = miss == bio ? 
MAP_DONE : MAP_CONTINUE; @@ -891,7 +912,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) s->iop.bypass = true; if (should_writeback(dc, s->orig_bio, - cache_mode(dc, bio), + cache_mode(dc), s->iop.bypass)) { s->iop.bypass = false; s->iop.writeback = true; diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index 7689176951ce..dea0886b81c1 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHE_REQUEST_H_ #define _BCACHE_REQUEST_H_ diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c index 0ca072c20d0d..d0831d5bcc87 100644 --- a/drivers/md/bcache/stats.c +++ b/drivers/md/bcache/stats.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * bcache stats code * diff --git a/drivers/md/bcache/stats.h b/drivers/md/bcache/stats.h index adbff141c887..0b70f9de0c03 100644 --- a/drivers/md/bcache/stats.h +++ b/drivers/md/bcache/stats.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHE_STATS_H_ #define _BCACHE_STATS_H_ diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index fc0a31b13ac4..b4d28928dec5 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -53,12 +53,15 @@ LIST_HEAD(bch_cache_sets); static LIST_HEAD(uncached_devices); static int bcache_major; -static DEFINE_IDA(bcache_minor); +static DEFINE_IDA(bcache_device_idx); static wait_queue_head_t unregister_wait; struct workqueue_struct *bcache_wq; #define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE) -#define BCACHE_MINORS 16 /* partition support */ +/* limitation of partitions number on single bcache device */ +#define BCACHE_MINORS 128 +/* limitation of bcache devices number on single system */ +#define BCACHE_DEVICE_IDX_MAX ((1U << MINORBITS)/BCACHE_MINORS) /* Superblock */ @@ -721,6 +724,16 @@ static void bcache_device_attach(struct bcache_device *d, struct cache_set *c, closure_get(&c->caching); } +static inline int first_minor_to_idx(int first_minor) +{ + return (first_minor/BCACHE_MINORS); +} + +static inline int idx_to_first_minor(int idx) +{ + return (idx * BCACHE_MINORS); +} + static void bcache_device_free(struct bcache_device *d) { lockdep_assert_held(&bch_register_lock); @@ -734,7 +747,8 @@ static void bcache_device_free(struct bcache_device *d) if (d->disk && d->disk->queue) blk_cleanup_queue(d->disk->queue); if (d->disk) { - ida_simple_remove(&bcache_minor, d->disk->first_minor); + ida_simple_remove(&bcache_device_idx, + first_minor_to_idx(d->disk->first_minor)); put_disk(d->disk); } @@ -751,7 +765,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, { struct request_queue *q; size_t n; - int minor; + int idx; if (!d->stripe_size) d->stripe_size = 1 << 31; @@ -776,25 +790,24 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, if (!d->full_dirty_stripes) return -ENOMEM; - minor = ida_simple_get(&bcache_minor, 0, MINORMASK + 1, GFP_KERNEL); - if (minor < 0) - return minor; - - minor *= BCACHE_MINORS; + idx = ida_simple_get(&bcache_device_idx, 0, + BCACHE_DEVICE_IDX_MAX, GFP_KERNEL); + if (idx < 0) + return idx; if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio), BIOSET_NEED_BVECS | BIOSET_NEED_RESCUER)) || !(d->disk = alloc_disk(BCACHE_MINORS))) { - ida_simple_remove(&bcache_minor, minor); + ida_simple_remove(&bcache_device_idx, idx); return -ENOMEM; } set_capacity(d->disk, sectors); - snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", minor); + 
snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx); d->disk->major = bcache_major; - d->disk->first_minor = minor; + d->disk->first_minor = idx_to_first_minor(idx); d->disk->fops = &bcache_ops; d->disk->private_data = d; @@ -889,7 +902,7 @@ static void cached_dev_detach_finish(struct work_struct *w) closure_init_stack(&cl); BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)); - BUG_ON(atomic_read(&dc->count)); + BUG_ON(refcount_read(&dc->count)); mutex_lock(&bch_register_lock); @@ -1016,7 +1029,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) * dc->c must be set before dc->count != 0 - paired with the mb in * cached_dev_get() */ - atomic_set(&dc->count, 1); + refcount_set(&dc->count, 1); /* Block writeback thread, but spawn it */ down_write(&dc->writeback_lock); @@ -1028,7 +1041,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { bch_sectors_dirty_init(&dc->disk); atomic_set(&dc->has_dirty, 1); - atomic_inc(&dc->count); + refcount_inc(&dc->count); bch_writeback_queue(dc); } @@ -1129,9 +1142,6 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) if (ret) return ret; - set_capacity(dc->disk.disk, - dc->bdev->bd_part->nr_sects - dc->sb.data_offset); - dc->disk.disk->queue->backing_dev_info->ra_pages = max(dc->disk.disk->queue->backing_dev_info->ra_pages, q->backing_dev_info->ra_pages); @@ -2085,6 +2095,7 @@ static void bcache_exit(void) if (bcache_major) unregister_blkdev(bcache_major, "bcache"); unregister_reboot_notifier(&reboot); + mutex_destroy(&bch_register_lock); } static int __init bcache_init(void) @@ -2103,14 +2114,15 @@ static int __init bcache_init(void) bcache_major = register_blkdev(0, "bcache"); if (bcache_major < 0) { unregister_reboot_notifier(&reboot); + mutex_destroy(&bch_register_lock); return bcache_major; } if (!(bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0)) || !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) || - sysfs_create_files(bcache_kobj, files) || bch_request_init() || - bch_debug_init(bcache_kobj)) + bch_debug_init(bcache_kobj) || + sysfs_create_files(bcache_kobj, files)) goto err; return 0; diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 104c57cd666c..b4184092c727 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * bcache sysfs interfaces * @@ -81,8 +82,9 @@ rw_attribute(writeback_delay); rw_attribute(writeback_rate); rw_attribute(writeback_rate_update_seconds); -rw_attribute(writeback_rate_d_term); +rw_attribute(writeback_rate_i_term_inverse); rw_attribute(writeback_rate_p_term_inverse); +rw_attribute(writeback_rate_minimum); read_attribute(writeback_rate_debug); read_attribute(stripe_size); @@ -130,15 +132,16 @@ SHOW(__bch_cached_dev) sysfs_hprint(writeback_rate, dc->writeback_rate.rate << 9); var_print(writeback_rate_update_seconds); - var_print(writeback_rate_d_term); + var_print(writeback_rate_i_term_inverse); var_print(writeback_rate_p_term_inverse); + var_print(writeback_rate_minimum); if (attr == &sysfs_writeback_rate_debug) { char rate[20]; char dirty[20]; char target[20]; char proportional[20]; - char derivative[20]; + char integral[20]; char change[20]; s64 next_io; @@ -146,7 +149,7 @@ SHOW(__bch_cached_dev) bch_hprint(dirty, bcache_dev_sectors_dirty(&dc->disk) << 9); bch_hprint(target, dc->writeback_rate_target << 9); bch_hprint(proportional,dc->writeback_rate_proportional << 9); - 
bch_hprint(derivative, dc->writeback_rate_derivative << 9); + bch_hprint(integral, dc->writeback_rate_integral_scaled << 9); bch_hprint(change, dc->writeback_rate_change << 9); next_io = div64_s64(dc->writeback_rate.next - local_clock(), @@ -157,11 +160,11 @@ SHOW(__bch_cached_dev) "dirty:\t\t%s\n" "target:\t\t%s\n" "proportional:\t%s\n" - "derivative:\t%s\n" + "integral:\t%s\n" "change:\t\t%s/sec\n" "next io:\t%llims\n", rate, dirty, target, proportional, - derivative, change, next_io); + integral, change, next_io); } sysfs_hprint(dirty_data, @@ -213,7 +216,7 @@ STORE(__cached_dev) dc->writeback_rate.rate, 1, INT_MAX); d_strtoul_nonzero(writeback_rate_update_seconds); - d_strtoul(writeback_rate_d_term); + d_strtoul(writeback_rate_i_term_inverse); d_strtoul_nonzero(writeback_rate_p_term_inverse); d_strtoi_h(sequential_cutoff); @@ -319,7 +322,7 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_writeback_percent, &sysfs_writeback_rate, &sysfs_writeback_rate_update_seconds, - &sysfs_writeback_rate_d_term, + &sysfs_writeback_rate_i_term_inverse, &sysfs_writeback_rate_p_term_inverse, &sysfs_writeback_rate_debug, &sysfs_dirty_data, @@ -745,6 +748,11 @@ static struct attribute *bch_cache_set_internal_files[] = { }; KTYPE(bch_cache_set_internal); +static int __bch_cache_cmp(const void *l, const void *r) +{ + return *((uint16_t *)r) - *((uint16_t *)l); +} + SHOW(__bch_cache) { struct cache *ca = container_of(kobj, struct cache, kobj); @@ -769,9 +777,6 @@ SHOW(__bch_cache) CACHE_REPLACEMENT(&ca->sb)); if (attr == &sysfs_priority_stats) { - int cmp(const void *l, const void *r) - { return *((uint16_t *) r) - *((uint16_t *) l); } - struct bucket *b; size_t n = ca->sb.nbuckets, i; size_t unused = 0, available = 0, dirty = 0, meta = 0; @@ -800,7 +805,7 @@ SHOW(__bch_cache) p[i] = ca->buckets[i].prio; mutex_unlock(&ca->set->bucket_lock); - sort(p, n, sizeof(uint16_t), cmp, NULL); + sort(p, n, sizeof(uint16_t), __bch_cache_cmp, NULL); while (n && !cached[n - 1]) diff --git a/drivers/md/bcache/sysfs.h b/drivers/md/bcache/sysfs.h index 0526fe92a683..b54fe9602529 100644 --- a/drivers/md/bcache/sysfs.h +++ b/drivers/md/bcache/sysfs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHE_SYSFS_H_ #define _BCACHE_SYSFS_H_ diff --git a/drivers/md/bcache/trace.c b/drivers/md/bcache/trace.c index b7820b0d2621..a9a73f560c04 100644 --- a/drivers/md/bcache/trace.c +++ b/drivers/md/bcache/trace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include "bcache.h" #include "btree.h" diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index 176d3c2ef5f5..e548b8b51322 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -232,8 +232,14 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done) d->next += div_u64(done * NSEC_PER_SEC, d->rate); - if (time_before64(now + NSEC_PER_SEC, d->next)) - d->next = now + NSEC_PER_SEC; + /* Bound the time. Don't let us fall further than 2 seconds behind + * (this prevents unnecessary backlog that would make it impossible + * to catch up). If we're ahead of the desired writeback rate, + * don't let us sleep more than 2.5 seconds (so we can notice/respond + * if the control system tells us to speed up!). 
+ */ + if (time_before64(now + NSEC_PER_SEC * 5LLU / 2LLU, d->next)) + d->next = now + NSEC_PER_SEC * 5LLU / 2LLU; if (time_after64(now - NSEC_PER_SEC * 2, d->next)) d->next = now - NSEC_PER_SEC * 2; diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index cb8d2ccbb6c6..ed5e8a412eb8 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHE_UTIL_H #define _BCACHE_UTIL_H @@ -441,10 +442,10 @@ struct bch_ratelimit { uint64_t next; /* - * Rate at which we want to do work, in units per nanosecond + * Rate at which we want to do work, in units per second * The units here correspond to the units passed to bch_next_delay() */ - unsigned rate; + uint32_t rate; }; static inline void bch_ratelimit_reset(struct bch_ratelimit *d) diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index e663ca082183..56a37884ca8b 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * background writeback - scan btree for dirty data and write it to the backing * device @@ -25,48 +26,63 @@ static void __update_writeback_rate(struct cached_dev *dc) bcache_flash_devs_sectors_dirty(c); uint64_t cache_dirty_target = div_u64(cache_sectors * dc->writeback_percent, 100); - int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev), c->cached_dev_sectors); - /* PD controller */ - + /* + * PI controller: + * Figures out the amount that should be written per second. + * + * First, the error (number of sectors that are dirty beyond our + * target) is calculated. The error is accumulated (numerically + * integrated). + * + * Then, the proportional value and integral value are scaled + * based on configured values. These are stored as inverses to + * avoid fixed point math and to make configuration easy-- e.g. + * the default value of 40 for writeback_rate_p_term_inverse + * attempts to write at a rate that would retire all the dirty + * blocks in 40 seconds. + * + * The writeback_rate_i_inverse value of 10000 means that 1/10000th + * of the error is accumulated in the integral term per second. + * This acts as a slow, long-term average that is not subject to + * variations in usage like the p term. + */ int64_t dirty = bcache_dev_sectors_dirty(&dc->disk); - int64_t derivative = dirty - dc->disk.sectors_dirty_last; - int64_t proportional = dirty - target; - int64_t change; - - dc->disk.sectors_dirty_last = dirty; - - /* Scale to sectors per second */ - - proportional *= dc->writeback_rate_update_seconds; - proportional = div_s64(proportional, dc->writeback_rate_p_term_inverse); - - derivative = div_s64(derivative, dc->writeback_rate_update_seconds); - - derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative, - (dc->writeback_rate_d_term / - dc->writeback_rate_update_seconds) ?: 1, 0); - - derivative *= dc->writeback_rate_d_term; - derivative = div_s64(derivative, dc->writeback_rate_p_term_inverse); - - change = proportional + derivative; + int64_t error = dirty - target; + int64_t proportional_scaled = + div_s64(error, dc->writeback_rate_p_term_inverse); + int64_t integral_scaled; + uint32_t new_rate; + + if ((error < 0 && dc->writeback_rate_integral > 0) || + (error > 0 && time_before64(local_clock(), + dc->writeback_rate.next + NSEC_PER_MSEC))) { + /* + * Only decrease the integral term if it's more than + * zero. Only increase the integral term if the device + * is keeping up. 
(Don't wind up the integral + * ineffectively in either case). + * + * It's necessary to scale this by + * writeback_rate_update_seconds to keep the integral + * term dimensioned properly. + */ + dc->writeback_rate_integral += error * + dc->writeback_rate_update_seconds; + } - /* Don't increase writeback rate if the device isn't keeping up */ - if (change > 0 && - time_after64(local_clock(), - dc->writeback_rate.next + NSEC_PER_MSEC)) - change = 0; + integral_scaled = div_s64(dc->writeback_rate_integral, + dc->writeback_rate_i_term_inverse); - dc->writeback_rate.rate = - clamp_t(int64_t, (int64_t) dc->writeback_rate.rate + change, - 1, NSEC_PER_MSEC); + new_rate = clamp_t(int32_t, (proportional_scaled + integral_scaled), + dc->writeback_rate_minimum, NSEC_PER_SEC); - dc->writeback_rate_proportional = proportional; - dc->writeback_rate_derivative = derivative; - dc->writeback_rate_change = change; + dc->writeback_rate_proportional = proportional_scaled; + dc->writeback_rate_integral_scaled = integral_scaled; + dc->writeback_rate_change = new_rate - dc->writeback_rate.rate; + dc->writeback_rate.rate = new_rate; dc->writeback_rate_target = target; } @@ -179,13 +195,21 @@ static void write_dirty(struct closure *cl) struct dirty_io *io = container_of(cl, struct dirty_io, cl); struct keybuf_key *w = io->bio.bi_private; - dirty_init(w); - bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0); - io->bio.bi_iter.bi_sector = KEY_START(&w->key); - bio_set_dev(&io->bio, io->dc->bdev); - io->bio.bi_end_io = dirty_endio; + /* + * IO errors are signalled using the dirty bit on the key. + * If we failed to read, we should not attempt to write to the + * backing device. Instead, immediately go to write_dirty_finish + * to clean up. + */ + if (KEY_DIRTY(&w->key)) { + dirty_init(w); + bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0); + io->bio.bi_iter.bi_sector = KEY_START(&w->key); + bio_set_dev(&io->bio, io->dc->bdev); + io->bio.bi_end_io = dirty_endio; - closure_bio_submit(&io->bio, cl); + closure_bio_submit(&io->bio, cl); + } continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq); } @@ -417,6 +441,8 @@ static int bch_writeback_thread(void *arg) struct cached_dev *dc = arg; bool searched_full_index; + bch_ratelimit_reset(&dc->writeback_rate); + while (!kthread_should_stop()) { down_write(&dc->writeback_lock); if (!atomic_read(&dc->has_dirty) || @@ -444,7 +470,6 @@ static int bch_writeback_thread(void *arg) up_write(&dc->writeback_lock); - bch_ratelimit_reset(&dc->writeback_rate); read_dirty(dc); if (searched_full_index) { @@ -454,6 +479,8 @@ static int bch_writeback_thread(void *arg) !kthread_should_stop() && !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) delay = schedule_timeout_interruptible(delay); + + bch_ratelimit_reset(&dc->writeback_rate); } } @@ -491,8 +518,6 @@ void bch_sectors_dirty_init(struct bcache_device *d) bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0), sectors_dirty_init_fn, 0); - - d->sectors_dirty_last = bcache_dev_sectors_dirty(d); } void bch_cached_dev_writeback_init(struct cached_dev *dc) @@ -506,10 +531,11 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) dc->writeback_percent = 10; dc->writeback_delay = 30; dc->writeback_rate.rate = 1024; + dc->writeback_rate_minimum = 8; dc->writeback_rate_update_seconds = 5; - dc->writeback_rate_d_term = 30; - dc->writeback_rate_p_term_inverse = 6000; + dc->writeback_rate_p_term_inverse = 40; + dc->writeback_rate_i_term_inverse = 10000; INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); } diff --git 
a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index e35421d20d2e..a9e3ffb4b03c 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHE_WRITEBACK_H #define _BCACHE_WRITEBACK_H @@ -76,7 +77,9 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, if (would_skip) return false; - return op_is_sync(bio->bi_opf) || in_use <= CUTOFF_WRITEBACK; + return (op_is_sync(bio->bi_opf) || + bio->bi_opf & (REQ_META|REQ_PRIO) || + in_use <= CUTOFF_WRITEBACK); } static inline void bch_writeback_queue(struct cached_dev *dc) @@ -89,7 +92,7 @@ static inline void bch_writeback_add(struct cached_dev *dc) { if (!atomic_read(&dc->has_dirty) && !atomic_xchg(&dc->has_dirty, 1)) { - atomic_inc(&dc->count); + refcount_inc(&dc->count); if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index d216a8f7bc22..33bb074d6941 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -347,7 +347,7 @@ static void __cache_size_refresh(void) BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock)); BUG_ON(dm_bufio_client_count < 0); - dm_bufio_cache_size_latch = ACCESS_ONCE(dm_bufio_cache_size); + dm_bufio_cache_size_latch = READ_ONCE(dm_bufio_cache_size); /* * Use default if set to 0 and report the actual cache size used. @@ -960,7 +960,7 @@ static void __get_memory_limit(struct dm_bufio_client *c, { unsigned long buffers; - if (unlikely(ACCESS_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch)) { + if (unlikely(READ_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch)) { if (mutex_trylock(&dm_bufio_clients_lock)) { __cache_size_refresh(); mutex_unlock(&dm_bufio_clients_lock); @@ -1600,7 +1600,7 @@ static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp) static unsigned long get_retain_buffers(struct dm_bufio_client *c) { - unsigned long retain_bytes = ACCESS_ONCE(dm_bufio_retain_bytes); + unsigned long retain_bytes = READ_ONCE(dm_bufio_retain_bytes); return retain_bytes >> (c->sectors_per_block_bits + SECTOR_SHIFT); } @@ -1647,7 +1647,7 @@ dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { struct dm_bufio_client *c = container_of(shrink, struct dm_bufio_client, shrinker); - return ACCESS_ONCE(c->n_buffers[LIST_CLEAN]) + ACCESS_ONCE(c->n_buffers[LIST_DIRTY]); + return READ_ONCE(c->n_buffers[LIST_CLEAN]) + READ_ONCE(c->n_buffers[LIST_DIRTY]); } /* @@ -1818,7 +1818,7 @@ EXPORT_SYMBOL_GPL(dm_bufio_set_sector_offset); static unsigned get_max_age_hz(void) { - unsigned max_age = ACCESS_ONCE(dm_bufio_max_age); + unsigned max_age = READ_ONCE(dm_bufio_max_age); if (max_age > UINT_MAX / HZ) max_age = UINT_MAX / HZ; diff --git a/drivers/md/dm-builtin.c b/drivers/md/dm-builtin.c index f092771878c2..8eb52e425141 100644 --- a/drivers/md/dm-builtin.c +++ b/drivers/md/dm-builtin.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include "dm-core.h" /* diff --git a/drivers/md/dm-cache-background-tracker.c b/drivers/md/dm-cache-background-tracker.c index 707233891291..1d0af0a21fc7 100644 --- a/drivers/md/dm-cache-background-tracker.c +++ b/drivers/md/dm-cache-background-tracker.c @@ -161,8 +161,17 @@ EXPORT_SYMBOL_GPL(btracker_nr_demotions_queued); static bool max_work_reached(struct background_tracker *b) { - // FIXME: finish - return false; + return atomic_read(&b->pending_promotes) + + atomic_read(&b->pending_writebacks) + + atomic_read(&b->pending_demotes) >= 
b->max_work; +} + +struct bt_work *alloc_work(struct background_tracker *b) +{ + if (max_work_reached(b)) + return NULL; + + return kmem_cache_alloc(b->work_cache, GFP_NOWAIT); } int btracker_queue(struct background_tracker *b, @@ -174,10 +183,7 @@ int btracker_queue(struct background_tracker *b, if (pwork) *pwork = NULL; - if (max_work_reached(b)) - return -ENOMEM; - - w = kmem_cache_alloc(b->work_cache, GFP_NOWAIT); + w = alloc_work(b); if (!w) return -ENOMEM; diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 4a4e9c75fc4c..0d7212410e21 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -13,6 +13,7 @@ #include "persistent-data/dm-transaction-manager.h" #include <linux/device-mapper.h> +#include <linux/refcount.h> /*----------------------------------------------------------------*/ @@ -100,7 +101,7 @@ struct cache_disk_superblock { } __packed; struct dm_cache_metadata { - atomic_t ref_count; + refcount_t ref_count; struct list_head list; unsigned version; @@ -753,7 +754,7 @@ static struct dm_cache_metadata *metadata_open(struct block_device *bdev, } cmd->version = metadata_version; - atomic_set(&cmd->ref_count, 1); + refcount_set(&cmd->ref_count, 1); init_rwsem(&cmd->root_lock); cmd->bdev = bdev; cmd->data_block_size = data_block_size; @@ -791,7 +792,7 @@ static struct dm_cache_metadata *lookup(struct block_device *bdev) list_for_each_entry(cmd, &table, list) if (cmd->bdev == bdev) { - atomic_inc(&cmd->ref_count); + refcount_inc(&cmd->ref_count); return cmd; } @@ -862,7 +863,7 @@ struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev, void dm_cache_metadata_close(struct dm_cache_metadata *cmd) { - if (atomic_dec_and_test(&cmd->ref_count)) { + if (refcount_dec_and_test(&cmd->ref_count)) { mutex_lock(&table_lock); list_del(&cmd->list); mutex_unlock(&table_lock); diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c index e5eb9c9b4bc8..4ab23d0075f6 100644 --- a/drivers/md/dm-cache-policy-smq.c +++ b/drivers/md/dm-cache-policy-smq.c @@ -213,6 +213,19 @@ static void l_del(struct entry_space *es, struct ilist *l, struct entry *e) l->nr_elts--; } +static struct entry *l_pop_head(struct entry_space *es, struct ilist *l) +{ + struct entry *e; + + for (e = l_head(es, l); e; e = l_next(es, e)) + if (!e->sentinel) { + l_del(es, l, e); + return e; + } + + return NULL; +} + static struct entry *l_pop_tail(struct entry_space *es, struct ilist *l) { struct entry *e; @@ -719,7 +732,7 @@ static struct entry *alloc_entry(struct entry_alloc *ea) if (l_empty(&ea->free)) return NULL; - e = l_pop_tail(ea->es, &ea->free); + e = l_pop_head(ea->es, &ea->free); init_entry(e); ea->nr_allocated++; @@ -1158,13 +1171,13 @@ static void clear_pending(struct smq_policy *mq, struct entry *e) e->pending_work = false; } -static void queue_writeback(struct smq_policy *mq) +static void queue_writeback(struct smq_policy *mq, bool idle) { int r; struct policy_work work; struct entry *e; - e = q_peek(&mq->dirty, mq->dirty.nr_levels, !mq->migrations_allowed); + e = q_peek(&mq->dirty, mq->dirty.nr_levels, idle); if (e) { mark_pending(mq, e); q_del(&mq->dirty, e); @@ -1174,12 +1187,16 @@ static void queue_writeback(struct smq_policy *mq) work.cblock = infer_cblock(mq, e); r = btracker_queue(mq->bg_work, &work, NULL); - WARN_ON_ONCE(r); // FIXME: finish, I think we have to get rid of this race. 
+ if (r) { + clear_pending(mq, e); + q_push_front(&mq->dirty, e); + } } } static void queue_demotion(struct smq_policy *mq) { + int r; struct policy_work work; struct entry *e; @@ -1189,7 +1206,7 @@ static void queue_demotion(struct smq_policy *mq) e = q_peek(&mq->clean, mq->clean.nr_levels / 2, true); if (!e) { if (!clean_target_met(mq, true)) - queue_writeback(mq); + queue_writeback(mq, false); return; } @@ -1199,12 +1216,17 @@ static void queue_demotion(struct smq_policy *mq) work.op = POLICY_DEMOTE; work.oblock = e->oblock; work.cblock = infer_cblock(mq, e); - btracker_queue(mq->bg_work, &work, NULL); + r = btracker_queue(mq->bg_work, &work, NULL); + if (r) { + clear_pending(mq, e); + q_push_front(&mq->clean, e); + } } static void queue_promotion(struct smq_policy *mq, dm_oblock_t oblock, struct policy_work **workp) { + int r; struct entry *e; struct policy_work work; @@ -1234,7 +1256,9 @@ static void queue_promotion(struct smq_policy *mq, dm_oblock_t oblock, work.op = POLICY_PROMOTE; work.oblock = oblock; work.cblock = infer_cblock(mq, e); - btracker_queue(mq->bg_work, &work, workp); + r = btracker_queue(mq->bg_work, &work, workp); + if (r) + free_entry(&mq->cache_alloc, e); } /*----------------------------------------------------------------*/ @@ -1418,7 +1442,7 @@ static int smq_get_background_work(struct dm_cache_policy *p, bool idle, r = btracker_issue(mq->bg_work, result); if (r == -ENODATA) { if (!clean_target_met(mq, idle)) { - queue_writeback(mq); + queue_writeback(mq, idle); r = btracker_issue(mq->bg_work, result); } } @@ -1778,7 +1802,7 @@ static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size, mq->next_hotspot_period = jiffies; mq->next_cache_period = jiffies; - mq->bg_work = btracker_create(10240); /* FIXME: hard coded value */ + mq->bg_work = btracker_create(4096); /* FIXME: hard coded value */ if (!mq->bg_work) goto bad_btracker; diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 8785134c9f1f..cf23a14f9c6a 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -408,9 +408,7 @@ struct cache { int sectors_per_block_shift; spinlock_t lock; - struct list_head deferred_cells; struct bio_list deferred_bios; - struct bio_list deferred_writethrough_bios; sector_t migration_threshold; wait_queue_head_t migration_wait; atomic_t nr_allocated_migrations; @@ -446,10 +444,10 @@ struct cache { struct dm_kcopyd_client *copier; struct workqueue_struct *wq; struct work_struct deferred_bio_worker; - struct work_struct deferred_writethrough_worker; struct work_struct migration_worker; struct delayed_work waker; struct dm_bio_prison_v2 *prison; + struct bio_set *bs; mempool_t *migration_pool; @@ -490,15 +488,6 @@ struct per_bio_data { struct dm_bio_prison_cell_v2 *cell; struct dm_hook_info hook_info; sector_t len; - - /* - * writethrough fields. These MUST remain at the end of this - * structure and the 'cache' member must be the first as it - * is used to determine the offset of the writethrough fields. 
- */ - struct cache *cache; - dm_cblock_t cblock; - struct dm_bio_details bio_details; }; struct dm_cache_migration { @@ -515,19 +504,19 @@ struct dm_cache_migration { /*----------------------------------------------------------------*/ -static bool writethrough_mode(struct cache_features *f) +static bool writethrough_mode(struct cache *cache) { - return f->io_mode == CM_IO_WRITETHROUGH; + return cache->features.io_mode == CM_IO_WRITETHROUGH; } -static bool writeback_mode(struct cache_features *f) +static bool writeback_mode(struct cache *cache) { - return f->io_mode == CM_IO_WRITEBACK; + return cache->features.io_mode == CM_IO_WRITEBACK; } -static inline bool passthrough_mode(struct cache_features *f) +static inline bool passthrough_mode(struct cache *cache) { - return unlikely(f->io_mode == CM_IO_PASSTHROUGH); + return unlikely(cache->features.io_mode == CM_IO_PASSTHROUGH); } /*----------------------------------------------------------------*/ @@ -537,14 +526,9 @@ static void wake_deferred_bio_worker(struct cache *cache) queue_work(cache->wq, &cache->deferred_bio_worker); } -static void wake_deferred_writethrough_worker(struct cache *cache) -{ - queue_work(cache->wq, &cache->deferred_writethrough_worker); -} - static void wake_migration_worker(struct cache *cache) { - if (passthrough_mode(&cache->features)) + if (passthrough_mode(cache)) return; queue_work(cache->wq, &cache->migration_worker); @@ -567,10 +551,13 @@ static struct dm_cache_migration *alloc_migration(struct cache *cache) struct dm_cache_migration *mg; mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT); - if (mg) { - mg->cache = cache; - atomic_inc(&mg->cache->nr_allocated_migrations); - } + if (!mg) + return NULL; + + memset(mg, 0, sizeof(*mg)); + + mg->cache = cache; + atomic_inc(&cache->nr_allocated_migrations); return mg; } @@ -618,27 +605,16 @@ static unsigned lock_level(struct bio *bio) * Per bio data *--------------------------------------------------------------*/ -/* - * If using writeback, leave out struct per_bio_data's writethrough fields. - */ -#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache)) -#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data)) - -static size_t get_per_bio_data_size(struct cache *cache) +static struct per_bio_data *get_per_bio_data(struct bio *bio) { - return writethrough_mode(&cache->features) ? 
PB_DATA_SIZE_WT : PB_DATA_SIZE_WB; -} - -static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size) -{ - struct per_bio_data *pb = dm_per_bio_data(bio, data_size); + struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); BUG_ON(!pb); return pb; } -static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size) +static struct per_bio_data *init_per_bio_data(struct bio *bio) { - struct per_bio_data *pb = get_per_bio_data(bio, data_size); + struct per_bio_data *pb = get_per_bio_data(bio); pb->tick = false; pb->req_nr = dm_bio_get_target_bio_nr(bio); @@ -678,7 +654,6 @@ static void defer_bios(struct cache *cache, struct bio_list *bios) static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bio *bio) { bool r; - size_t pb_size; struct per_bio_data *pb; struct dm_cell_key_v2 key; dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL); @@ -703,8 +678,7 @@ static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bi if (cell != cell_prealloc) free_prison_cell(cache, cell_prealloc); - pb_size = get_per_bio_data_size(cache); - pb = get_per_bio_data(bio, pb_size); + pb = get_per_bio_data(bio); pb->cell = cell; return r; @@ -856,28 +830,35 @@ static void remap_to_cache(struct cache *cache, struct bio *bio, static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) { unsigned long flags; - size_t pb_data_size = get_per_bio_data_size(cache); - struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); + struct per_bio_data *pb; spin_lock_irqsave(&cache->lock, flags); if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) && bio_op(bio) != REQ_OP_DISCARD) { + pb = get_per_bio_data(bio); pb->tick = true; cache->need_tick_bio = false; } spin_unlock_irqrestore(&cache->lock, flags); } -static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, - dm_oblock_t oblock) +static void __remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, + dm_oblock_t oblock, bool bio_has_pbd) { - // FIXME: this is called way too much. 
- check_if_tick_bio_needed(cache, bio); + if (bio_has_pbd) + check_if_tick_bio_needed(cache, bio); remap_to_origin(cache, bio); if (bio_data_dir(bio) == WRITE) clear_discard(cache, oblock_to_dblock(cache, oblock)); } +static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, + dm_oblock_t oblock) +{ + // FIXME: check_if_tick_bio_needed() is called way too much through this interface + __remap_to_origin_clear_discard(cache, bio, oblock, true); +} + static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, dm_oblock_t oblock, dm_cblock_t cblock) { @@ -908,10 +889,10 @@ static bool accountable_bio(struct cache *cache, struct bio *bio) static void accounted_begin(struct cache *cache, struct bio *bio) { - size_t pb_data_size = get_per_bio_data_size(cache); - struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); + struct per_bio_data *pb; if (accountable_bio(cache, bio)) { + pb = get_per_bio_data(bio); pb->len = bio_sectors(bio); iot_io_begin(&cache->tracker, pb->len); } @@ -919,8 +900,7 @@ static void accounted_begin(struct cache *cache, struct bio *bio) static void accounted_complete(struct cache *cache, struct bio *bio) { - size_t pb_data_size = get_per_bio_data_size(cache); - struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); + struct per_bio_data *pb = get_per_bio_data(bio); iot_io_end(&cache->tracker, pb->len); } @@ -937,57 +917,26 @@ static void issue_op(struct bio *bio, void *context) accounted_request(cache, bio); } -static void defer_writethrough_bio(struct cache *cache, struct bio *bio) -{ - unsigned long flags; - - spin_lock_irqsave(&cache->lock, flags); - bio_list_add(&cache->deferred_writethrough_bios, bio); - spin_unlock_irqrestore(&cache->lock, flags); - - wake_deferred_writethrough_worker(cache); -} - -static void writethrough_endio(struct bio *bio) -{ - struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT); - - dm_unhook_bio(&pb->hook_info, bio); - - if (bio->bi_status) { - bio_endio(bio); - return; - } - - dm_bio_restore(&pb->bio_details, bio); - remap_to_cache(pb->cache, bio, pb->cblock); - - /* - * We can't issue this bio directly, since we're in interrupt - * context. So it gets put on a bio list for processing by the - * worker thread. - */ - defer_writethrough_bio(pb->cache, bio); -} - /* - * FIXME: send in parallel, huge latency as is. * When running in writethrough mode we need to send writes to clean blocks - * to both the cache and origin devices. In future we'd like to clone the - * bio and send them in parallel, but for now we're doing them in - * series as this is easier. + * to both the cache and origin devices. Clone the bio and send them in parallel. 
*/ -static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio, - dm_oblock_t oblock, dm_cblock_t cblock) +static void remap_to_origin_and_cache(struct cache *cache, struct bio *bio, + dm_oblock_t oblock, dm_cblock_t cblock) { - struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT); + struct bio *origin_bio = bio_clone_fast(bio, GFP_NOIO, cache->bs); + + BUG_ON(!origin_bio); - pb->cache = cache; - pb->cblock = cblock; - dm_hook_bio(&pb->hook_info, bio, writethrough_endio, NULL); - dm_bio_record(&pb->bio_details, bio); + bio_chain(origin_bio, bio); + /* + * Passing false to __remap_to_origin_clear_discard() skips + * all code that might use per_bio_data (since clone doesn't have it) + */ + __remap_to_origin_clear_discard(cache, origin_bio, oblock, false); + submit_bio(origin_bio); - remap_to_origin_clear_discard(pb->cache, bio, oblock); + remap_to_cache(cache, bio, cblock); } /*---------------------------------------------------------------- @@ -1201,6 +1150,18 @@ static void background_work_end(struct cache *cache) /*----------------------------------------------------------------*/ +static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) +{ + return (bio_data_dir(bio) == WRITE) && + (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); +} + +static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block) +{ + return writeback_mode(cache) && + (is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio)); +} + static void quiesce(struct dm_cache_migration *mg, void (*continuation)(struct work_struct *)) { @@ -1248,8 +1209,7 @@ static int copy(struct dm_cache_migration *mg, bool promote) static void bio_drop_shared_lock(struct cache *cache, struct bio *bio) { - size_t pb_data_size = get_per_bio_data_size(cache); - struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); + struct per_bio_data *pb = get_per_bio_data(bio); if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell)) free_prison_cell(cache, pb->cell); @@ -1260,23 +1220,21 @@ static void overwrite_endio(struct bio *bio) { struct dm_cache_migration *mg = bio->bi_private; struct cache *cache = mg->cache; - size_t pb_data_size = get_per_bio_data_size(cache); - struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); + struct per_bio_data *pb = get_per_bio_data(bio); dm_unhook_bio(&pb->hook_info, bio); if (bio->bi_status) mg->k.input = bio->bi_status; - queue_continuation(mg->cache->wq, &mg->k); + queue_continuation(cache->wq, &mg->k); } static void overwrite(struct dm_cache_migration *mg, void (*continuation)(struct work_struct *)) { struct bio *bio = mg->overwrite_bio; - size_t pb_data_size = get_per_bio_data_size(mg->cache); - struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); + struct per_bio_data *pb = get_per_bio_data(bio); dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); @@ -1474,13 +1432,51 @@ static void mg_upgrade_lock(struct work_struct *ws) } } +static void mg_full_copy(struct work_struct *ws) +{ + struct dm_cache_migration *mg = ws_to_mg(ws); + struct cache *cache = mg->cache; + struct policy_work *op = mg->op; + bool is_policy_promote = (op->op == POLICY_PROMOTE); + + if ((!is_policy_promote && !is_dirty(cache, op->cblock)) || + is_discarded_oblock(cache, op->oblock)) { + mg_upgrade_lock(ws); + return; + } + + init_continuation(&mg->k, mg_upgrade_lock); + + if (copy(mg, is_policy_promote)) { + DMERR_LIMIT("%s: migration copy failed", cache_device_name(cache)); + mg->k.input = BLK_STS_IOERR; + 
mg_complete(mg, false); + } +} + static void mg_copy(struct work_struct *ws) { - int r; struct dm_cache_migration *mg = ws_to_mg(ws); if (mg->overwrite_bio) { /* + * No exclusive lock was held when we last checked if the bio + * was optimisable. So we have to check again in case things + * have changed (eg, the block may no longer be discarded). + */ + if (!optimisable_bio(mg->cache, mg->overwrite_bio, mg->op->oblock)) { + /* + * Fallback to a real full copy after doing some tidying up. + */ + bool rb = bio_detain_shared(mg->cache, mg->op->oblock, mg->overwrite_bio); + BUG_ON(rb); /* An exclussive lock must _not_ be held for this block */ + mg->overwrite_bio = NULL; + inc_io_migrations(mg->cache); + mg_full_copy(ws); + return; + } + + /* * It's safe to do this here, even though it's new data * because all IO has been locked out of the block. * @@ -1489,26 +1485,8 @@ static void mg_copy(struct work_struct *ws) */ overwrite(mg, mg_update_metadata_after_copy); - } else { - struct cache *cache = mg->cache; - struct policy_work *op = mg->op; - bool is_policy_promote = (op->op == POLICY_PROMOTE); - - if ((!is_policy_promote && !is_dirty(cache, op->cblock)) || - is_discarded_oblock(cache, op->oblock)) { - mg_upgrade_lock(ws); - return; - } - - init_continuation(&mg->k, mg_upgrade_lock); - - r = copy(mg, is_policy_promote); - if (r) { - DMERR_LIMIT("%s: migration copy failed", cache_device_name(cache)); - mg->k.input = BLK_STS_IOERR; - mg_complete(mg, false); - } - } + } else + mg_full_copy(ws); } static int mg_lock_writes(struct dm_cache_migration *mg) @@ -1567,9 +1545,6 @@ static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio return -ENOMEM; } - memset(mg, 0, sizeof(*mg)); - - mg->cache = cache; mg->op = op; mg->overwrite_bio = bio; @@ -1703,9 +1678,6 @@ static int invalidate_start(struct cache *cache, dm_cblock_t cblock, return -ENOMEM; } - memset(mg, 0, sizeof(*mg)); - - mg->cache = cache; mg->overwrite_bio = bio; mg->invalidate_cblock = cblock; mg->invalidate_oblock = oblock; @@ -1748,26 +1720,12 @@ static void inc_miss_counter(struct cache *cache, struct bio *bio) /*----------------------------------------------------------------*/ -static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) -{ - return (bio_data_dir(bio) == WRITE) && - (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); -} - -static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block) -{ - return writeback_mode(&cache->features) && - (is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio)); -} - static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block, bool *commit_needed) { int r, data_dir; bool rb, background_queued; dm_cblock_t cblock; - size_t pb_data_size = get_per_bio_data_size(cache); - struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); *commit_needed = false; @@ -1816,6 +1774,8 @@ static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block, } if (r == -ENOENT) { + struct per_bio_data *pb = get_per_bio_data(bio); + /* * Miss. 
*/ @@ -1823,7 +1783,6 @@ static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block, if (pb->req_nr == 0) { accounted_begin(cache, bio); remap_to_origin_clear_discard(cache, bio, block); - } else { /* * This is a duplicate writethrough io that is no @@ -1842,18 +1801,17 @@ static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block, * Passthrough always maps to the origin, invalidating any * cache blocks that are written to. */ - if (passthrough_mode(&cache->features)) { + if (passthrough_mode(cache)) { if (bio_data_dir(bio) == WRITE) { bio_drop_shared_lock(cache, bio); atomic_inc(&cache->stats.demotion); invalidate_start(cache, cblock, block, bio); } else remap_to_origin_clear_discard(cache, bio, block); - } else { - if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) && + if (bio_data_dir(bio) == WRITE && writethrough_mode(cache) && !is_dirty(cache, cblock)) { - remap_to_origin_then_cache(cache, bio, block, cblock); + remap_to_origin_and_cache(cache, bio, block, cblock); accounted_begin(cache, bio); } else remap_to_cache_dirty(cache, bio, block, cblock); @@ -1922,8 +1880,7 @@ static blk_status_t commit_op(void *context) static bool process_flush_bio(struct cache *cache, struct bio *bio) { - size_t pb_data_size = get_per_bio_data_size(cache); - struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); + struct per_bio_data *pb = get_per_bio_data(bio); if (!pb->req_nr) remap_to_origin(cache, bio); @@ -1983,28 +1940,6 @@ static void process_deferred_bios(struct work_struct *ws) schedule_commit(&cache->committer); } -static void process_deferred_writethrough_bios(struct work_struct *ws) -{ - struct cache *cache = container_of(ws, struct cache, deferred_writethrough_worker); - - unsigned long flags; - struct bio_list bios; - struct bio *bio; - - bio_list_init(&bios); - - spin_lock_irqsave(&cache->lock, flags); - bio_list_merge(&bios, &cache->deferred_writethrough_bios); - bio_list_init(&cache->deferred_writethrough_bios); - spin_unlock_irqrestore(&cache->lock, flags); - - /* - * These bios have already been through accounted_begin() - */ - while ((bio = bio_list_pop(&bios))) - generic_make_request(bio); -} - /*---------------------------------------------------------------- * Main worker loop *--------------------------------------------------------------*/ @@ -2112,6 +2047,9 @@ static void destroy(struct cache *cache) kfree(cache->ctr_args[i]); kfree(cache->ctr_args); + if (cache->bs) + bioset_free(cache->bs); + kfree(cache); } @@ -2555,8 +2493,15 @@ static int cache_create(struct cache_args *ca, struct cache **result) ti->discards_supported = true; ti->split_discard_bios = false; + ti->per_io_data_size = sizeof(struct per_bio_data); + cache->features = ca->features; - ti->per_io_data_size = get_per_bio_data_size(cache); + if (writethrough_mode(cache)) { + /* Create bioset for writethrough bios issued to origin */ + cache->bs = bioset_create(BIO_POOL_SIZE, 0, 0); + if (!cache->bs) + goto bad; + } cache->callbacks.congested_fn = cache_is_congested; dm_table_add_target_callbacks(ti->table, &cache->callbacks); @@ -2618,7 +2563,7 @@ static int cache_create(struct cache_args *ca, struct cache **result) goto bad; } - if (passthrough_mode(&cache->features)) { + if (passthrough_mode(cache)) { bool all_clean; r = dm_cache_metadata_all_clean(cache->cmd, &all_clean); @@ -2637,9 +2582,7 @@ static int cache_create(struct cache_args *ca, struct cache **result) } spin_lock_init(&cache->lock); - INIT_LIST_HEAD(&cache->deferred_cells); 
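
remap_to_origin_and_cache() above replaces the old writethrough endio hook with a clone-and-chain scheme: the write is cloned from a bio_set created at construction time (and released in destroy()), chained to the original bio, submitted to the origin, and the original is then remapped to the cache device. A condensed sketch of that pattern, with hypothetical device arguments:

#include <linux/bio.h>

static void write_to_origin_and_cache(struct bio *bio, struct block_device *origin,
                                      struct block_device *cache_dev, struct bio_set *bs)
{
        /* shares the data pages of 'bio'; GFP_NOIO because we are in the I/O path */
        struct bio *clone = bio_clone_fast(bio, GFP_NOIO, bs);

        BUG_ON(!clone);
        bio_chain(clone, bio);          /* 'bio' completes only after 'clone' does */

        bio_set_dev(clone, origin);
        submit_bio(clone);

        /* the original is remapped to the cache; device-mapper submits it after ->map() */
        bio_set_dev(bio, cache_dev);
}
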
bio_list_init(&cache->deferred_bios); - bio_list_init(&cache->deferred_writethrough_bios); atomic_set(&cache->nr_allocated_migrations, 0); atomic_set(&cache->nr_io_migrations, 0); init_waitqueue_head(&cache->migration_wait); @@ -2678,8 +2621,6 @@ static int cache_create(struct cache_args *ca, struct cache **result) goto bad; } INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios); - INIT_WORK(&cache->deferred_writethrough_worker, - process_deferred_writethrough_bios); INIT_WORK(&cache->migration_worker, check_migrations); INIT_DELAYED_WORK(&cache->waker, do_waker); @@ -2795,9 +2736,8 @@ static int cache_map(struct dm_target *ti, struct bio *bio) int r; bool commit_needed; dm_oblock_t block = get_bio_block(cache, bio); - size_t pb_data_size = get_per_bio_data_size(cache); - init_per_bio_data(bio, pb_data_size); + init_per_bio_data(bio); if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) { /* * This can only occur if the io goes to a partial block at @@ -2821,13 +2761,11 @@ static int cache_map(struct dm_target *ti, struct bio *bio) return r; } -static int cache_end_io(struct dm_target *ti, struct bio *bio, - blk_status_t *error) +static int cache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *error) { struct cache *cache = ti->private; unsigned long flags; - size_t pb_data_size = get_per_bio_data_size(cache); - struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); + struct per_bio_data *pb = get_per_bio_data(bio); if (pb->tick) { policy_tick(cache->policy, false); @@ -3243,13 +3181,13 @@ static void cache_status(struct dm_target *ti, status_type_t type, else DMEMIT("1 "); - if (writethrough_mode(&cache->features)) + if (writethrough_mode(cache)) DMEMIT("writethrough "); - else if (passthrough_mode(&cache->features)) + else if (passthrough_mode(cache)) DMEMIT("passthrough "); - else if (writeback_mode(&cache->features)) + else if (writeback_mode(cache)) DMEMIT("writeback "); else { @@ -3415,7 +3353,7 @@ static int process_invalidate_cblocks_message(struct cache *cache, unsigned coun unsigned i; struct cblock_range range; - if (!passthrough_mode(&cache->features)) { + if (!passthrough_mode(cache)) { DMERR("%s: cache has to be in passthrough mode for invalidation", cache_device_name(cache)); return -EPERM; diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 24eddbdf2ab4..6a14f945783c 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -29,7 +29,6 @@ struct dm_kobject_holder { * DM targets must _not_ deference a mapped_device to directly access its members! */ struct mapped_device { - struct srcu_struct io_barrier; struct mutex suspend_lock; /* @@ -127,6 +126,8 @@ struct mapped_device { struct blk_mq_tag_set *tag_set; bool use_blk_mq:1; bool init_tio_pdu:1; + + struct srcu_struct io_barrier; }; void dm_init_md_queue(struct mapped_device *md); @@ -149,5 +150,6 @@ static inline bool dm_message_test_buffer_overflow(char *result, unsigned maxlen extern atomic_t dm_global_event_nr; extern wait_queue_head_t dm_global_eventq; +void dm_issue_global_event(void); #endif diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index a55ffd4f5933..9fc12f556534 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1075,7 +1075,7 @@ static int crypt_convert_block_aead(struct crypt_config *cc, BUG_ON(cc->integrity_iv_size && cc->integrity_iv_size != cc->iv_size); /* Reject unexpected unaligned bio. 
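
The dm-crypt and dm-integrity hunks above change the per-segment sanity check from bv_offset to bv_len: what matters for multi-sector crypto and integrity processing is that each segment's length is a multiple of the configured sector size, not where the segment starts within its page. A small helper of the same shape (hypothetical, not from the patch):

#include <linux/bio.h>

static bool bio_segments_aligned(struct bio *bio, unsigned int sector_size)
{
        struct bio_vec bv;
        struct bvec_iter iter;

        /* sector_size is a power of two, so the mask test is exact */
        bio_for_each_segment(bv, bio, iter)
                if (bv.bv_len & (sector_size - 1))
                        return false;
        return true;
}
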
*/ - if (unlikely(bv_in.bv_offset & (cc->sector_size - 1))) + if (unlikely(bv_in.bv_len & (cc->sector_size - 1))) return -EIO; dmreq = dmreq_of_req(cc, req); @@ -1168,7 +1168,7 @@ static int crypt_convert_block_skcipher(struct crypt_config *cc, int r = 0; /* Reject unexpected unaligned bio. */ - if (unlikely(bv_in.bv_offset & (cc->sector_size - 1))) + if (unlikely(bv_in.bv_len & (cc->sector_size - 1))) return -EIO; dmreq = dmreq_of_req(cc, req); @@ -2466,6 +2466,7 @@ static int crypt_ctr_cipher_old(struct dm_target *ti, char *cipher_in, char *key kfree(cipher_api); return ret; } + kfree(cipher_api); return 0; bad_mem: @@ -2584,6 +2585,10 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar ti->error = "Invalid feature value for sector_size"; return -EINVAL; } + if (ti->len & ((cc->sector_size >> SECTOR_SHIFT) - 1)) { + ti->error = "Device size is not multiple of sector_size feature"; + return -EINVAL; + } cc->sector_shift = __ffs(cc->sector_size) - SECTOR_SHIFT; } else if (!strcasecmp(opt_string, "iv_large_sectors")) set_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags); diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 096fe9b66c50..61180783ef42 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -6,6 +6,7 @@ * This file is released under the GPL. */ +#include <linux/compiler.h> #include <linux/module.h> #include <linux/device-mapper.h> #include <linux/dm-io.h> @@ -80,13 +81,13 @@ struct journal_entry { #define journal_entry_tag(ic, je) ((__u8 *)&(je)->last_bytes[(ic)->sectors_per_block]) #if BITS_PER_LONG == 64 -#define journal_entry_set_sector(je, x) do { smp_wmb(); ACCESS_ONCE((je)->u.sector) = cpu_to_le64(x); } while (0) +#define journal_entry_set_sector(je, x) do { smp_wmb(); WRITE_ONCE((je)->u.sector, cpu_to_le64(x)); } while (0) #define journal_entry_get_sector(je) le64_to_cpu((je)->u.sector) #elif defined(CONFIG_LBDAF) -#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); ACCESS_ONCE((je)->u.s.sector_hi) = cpu_to_le32((x) >> 32); } while (0) +#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); WRITE_ONCE((je)->u.s.sector_hi, cpu_to_le32((x) >> 32)); } while (0) #define journal_entry_get_sector(je) le64_to_cpu((je)->u.sector) #else -#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); ACCESS_ONCE((je)->u.s.sector_hi) = cpu_to_le32(0); } while (0) +#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); WRITE_ONCE((je)->u.s.sector_hi, cpu_to_le32(0)); } while (0) #define journal_entry_get_sector(je) le32_to_cpu((je)->u.s.sector_lo) #endif #define journal_entry_is_unused(je) ((je)->u.s.sector_hi == cpu_to_le32(-1)) @@ -320,7 +321,7 @@ static void dm_integrity_io_error(struct dm_integrity_c *ic, const char *msg, in static int dm_integrity_failed(struct dm_integrity_c *ic) { - return ACCESS_ONCE(ic->failed); + return READ_ONCE(ic->failed); } static commit_id_t dm_integrity_commit_id(struct dm_integrity_c *ic, unsigned i, @@ -1376,7 +1377,7 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio) struct bvec_iter iter; struct bio_vec bv; bio_for_each_segment(bv, bio, iter) { - if (unlikely((bv.bv_offset | bv.bv_len) & ((ic->sectors_per_block << SECTOR_SHIFT) - 1))) { + if (unlikely(bv.bv_len & ((ic->sectors_per_block << SECTOR_SHIFT) - 1))) { DMERR("Bio vector (%u,%u) is not aligned on %u-sector boundary", bv.bv_offset, bv.bv_len, 
ic->sectors_per_block); return DM_MAPIO_KILL; @@ -1545,7 +1546,7 @@ retry_kmap: smp_mb(); if (unlikely(waitqueue_active(&ic->copy_to_journal_wait))) wake_up(&ic->copy_to_journal_wait); - if (ACCESS_ONCE(ic->free_sectors) <= ic->free_sectors_threshold) { + if (READ_ONCE(ic->free_sectors) <= ic->free_sectors_threshold) { queue_work(ic->commit_wq, &ic->commit_work); } else { schedule_autocommit(ic); @@ -1798,7 +1799,7 @@ static void integrity_commit(struct work_struct *w) ic->n_committed_sections += commit_sections; spin_unlock_irq(&ic->endio_wait.lock); - if (ACCESS_ONCE(ic->free_sectors) <= ic->free_sectors_threshold) + if (READ_ONCE(ic->free_sectors) <= ic->free_sectors_threshold) queue_work(ic->writer_wq, &ic->writer_work); release_flush_bios: @@ -1980,7 +1981,7 @@ static void integrity_writer(struct work_struct *w) unsigned prev_free_sectors; /* the following test is not needed, but it tests the replay code */ - if (ACCESS_ONCE(ic->suspending)) + if (READ_ONCE(ic->suspending)) return; spin_lock_irq(&ic->endio_wait.lock); diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 8756a6850431..e52676fa9832 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -477,9 +477,13 @@ static int remove_all(struct file *filp, struct dm_ioctl *param, size_t param_si * Round up the ptr to an 8-byte boundary. */ #define ALIGN_MASK 7 +static inline size_t align_val(size_t val) +{ + return (val + ALIGN_MASK) & ~ALIGN_MASK; +} static inline void *align_ptr(void *ptr) { - return (void *) (((size_t) (ptr + ALIGN_MASK)) & ~ALIGN_MASK); + return (void *)align_val((size_t)ptr); } /* @@ -505,7 +509,7 @@ static int list_devices(struct file *filp, struct dm_ioctl *param, size_t param_ struct hash_cell *hc; size_t len, needed = 0; struct gendisk *disk; - struct dm_name_list *nl, *old_nl = NULL; + struct dm_name_list *orig_nl, *nl, *old_nl = NULL; uint32_t *event_nr; down_write(&_hash_lock); @@ -516,17 +520,15 @@ static int list_devices(struct file *filp, struct dm_ioctl *param, size_t param_ */ for (i = 0; i < NUM_BUCKETS; i++) { list_for_each_entry (hc, _name_buckets + i, name_list) { - needed += sizeof(struct dm_name_list); - needed += strlen(hc->name) + 1; - needed += ALIGN_MASK; - needed += (sizeof(uint32_t) + ALIGN_MASK) & ~ALIGN_MASK; + needed += align_val(offsetof(struct dm_name_list, name) + strlen(hc->name) + 1); + needed += align_val(sizeof(uint32_t)); } } /* * Grab our output buffer. */ - nl = get_result_buffer(param, param_size, &len); + nl = orig_nl = get_result_buffer(param, param_size, &len); if (len < needed) { param->flags |= DM_BUFFER_FULL_FLAG; goto out; @@ -549,11 +551,16 @@ static int list_devices(struct file *filp, struct dm_ioctl *param, size_t param_ strcpy(nl->name, hc->name); old_nl = nl; - event_nr = align_ptr(((void *) (nl + 1)) + strlen(hc->name) + 1); + event_nr = align_ptr(nl->name + strlen(hc->name) + 1); *event_nr = dm_get_event_nr(hc->md); nl = align_ptr(event_nr + 1); } } + /* + * If mismatch happens, security may be compromised due to buffer + * overflow, so it's better to crash. + */ + BUG_ON((char *)nl - (char *)orig_nl != needed); out: up_write(&_hash_lock); @@ -1621,7 +1628,8 @@ static int target_message(struct file *filp, struct dm_ioctl *param, size_t para * which has a variable size, is not used by the function processing * the ioctl. 
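
list_devices() above now computes the space for each record with offsetof() plus align_val() and cross-checks the final output pointer against the precomputed total, preferring a BUG() over a potential heap overflow. The sizing logic, shown as a stand-alone helper (hypothetical name):

#include <linux/dm-ioctl.h>
#include <linux/string.h>

#define ALIGN_MASK 7
static inline size_t align_val(size_t val)
{
        return (val + ALIGN_MASK) & ~ALIGN_MASK;
}

/* bytes consumed by one dm_name_list record plus its trailing event counter */
static size_t name_list_record_size(const char *name)
{
        return align_val(offsetof(struct dm_name_list, name) + strlen(name) + 1) +
               align_val(sizeof(uint32_t));
}
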
*/ -#define IOCTL_FLAGS_NO_PARAMS 1 +#define IOCTL_FLAGS_NO_PARAMS 1 +#define IOCTL_FLAGS_ISSUE_GLOBAL_EVENT 2 /*----------------------------------------------------------------- * Implementation of open/close/ioctl on the special char @@ -1635,12 +1643,12 @@ static ioctl_fn lookup_ioctl(unsigned int cmd, int *ioctl_flags) ioctl_fn fn; } _ioctls[] = { {DM_VERSION_CMD, 0, NULL}, /* version is dealt with elsewhere */ - {DM_REMOVE_ALL_CMD, IOCTL_FLAGS_NO_PARAMS, remove_all}, + {DM_REMOVE_ALL_CMD, IOCTL_FLAGS_NO_PARAMS | IOCTL_FLAGS_ISSUE_GLOBAL_EVENT, remove_all}, {DM_LIST_DEVICES_CMD, 0, list_devices}, - {DM_DEV_CREATE_CMD, IOCTL_FLAGS_NO_PARAMS, dev_create}, - {DM_DEV_REMOVE_CMD, IOCTL_FLAGS_NO_PARAMS, dev_remove}, - {DM_DEV_RENAME_CMD, 0, dev_rename}, + {DM_DEV_CREATE_CMD, IOCTL_FLAGS_NO_PARAMS | IOCTL_FLAGS_ISSUE_GLOBAL_EVENT, dev_create}, + {DM_DEV_REMOVE_CMD, IOCTL_FLAGS_NO_PARAMS | IOCTL_FLAGS_ISSUE_GLOBAL_EVENT, dev_remove}, + {DM_DEV_RENAME_CMD, IOCTL_FLAGS_ISSUE_GLOBAL_EVENT, dev_rename}, {DM_DEV_SUSPEND_CMD, IOCTL_FLAGS_NO_PARAMS, dev_suspend}, {DM_DEV_STATUS_CMD, IOCTL_FLAGS_NO_PARAMS, dev_status}, {DM_DEV_WAIT_CMD, 0, dev_wait}, @@ -1869,6 +1877,9 @@ static int ctl_ioctl(struct file *file, uint command, struct dm_ioctl __user *us unlikely(ioctl_flags & IOCTL_FLAGS_NO_PARAMS)) DMERR("ioctl %d tried to output some data but has IOCTL_FLAGS_NO_PARAMS set", cmd); + if (!r && ioctl_flags & IOCTL_FLAGS_ISSUE_GLOBAL_EVENT) + dm_issue_global_event(); + /* * Copy the results back to userland. */ diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index cf2c67e35eaf..eb45cc3df31d 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -107,7 +107,7 @@ static void io_job_start(struct dm_kcopyd_throttle *t) try_again: spin_lock_irq(&throttle_spinlock); - throttle = ACCESS_ONCE(t->throttle); + throttle = READ_ONCE(t->throttle); if (likely(throttle >= 100)) goto skip_limit; @@ -157,7 +157,7 @@ static void io_job_finish(struct dm_kcopyd_throttle *t) t->num_io_jobs--; - if (likely(ACCESS_ONCE(t->throttle) >= 100)) + if (likely(READ_ONCE(t->throttle) >= 100)) goto skip_limit; if (!t->num_io_jobs) { diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index 8b80a9ce9ea9..189badbeddaf 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -10,9 +10,11 @@ #include <linux/init.h> #include <linux/blkdev.h> #include <linux/bio.h> +#include <linux/dax.h> #include <linux/slab.h> #include <linux/kthread.h> #include <linux/freezer.h> +#include <linux/uio.h> #define DM_MSG_PREFIX "log-writes" @@ -246,27 +248,108 @@ error: return -1; } +static int write_inline_data(struct log_writes_c *lc, void *entry, + size_t entrylen, void *data, size_t datalen, + sector_t sector) +{ + int num_pages, bio_pages, pg_datalen, pg_sectorlen, i; + struct page *page; + struct bio *bio; + size_t ret; + void *ptr; + + while (datalen) { + num_pages = ALIGN(datalen, PAGE_SIZE) >> PAGE_SHIFT; + bio_pages = min(num_pages, BIO_MAX_PAGES); + + atomic_inc(&lc->io_blocks); + + bio = bio_alloc(GFP_KERNEL, bio_pages); + if (!bio) { + DMERR("Couldn't alloc inline data bio"); + goto error; + } + + bio->bi_iter.bi_size = 0; + bio->bi_iter.bi_sector = sector; + bio_set_dev(bio, lc->logdev->bdev); + bio->bi_end_io = log_end_io; + bio->bi_private = lc; + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + + for (i = 0; i < bio_pages; i++) { + pg_datalen = min_t(int, datalen, PAGE_SIZE); + pg_sectorlen = ALIGN(pg_datalen, lc->sectorsize); + + page = alloc_page(GFP_KERNEL); + if (!page) { + 
DMERR("Couldn't alloc inline data page"); + goto error_bio; + } + + ptr = kmap_atomic(page); + memcpy(ptr, data, pg_datalen); + if (pg_sectorlen > pg_datalen) + memset(ptr + pg_datalen, 0, pg_sectorlen - pg_datalen); + kunmap_atomic(ptr); + + ret = bio_add_page(bio, page, pg_sectorlen, 0); + if (ret != pg_sectorlen) { + DMERR("Couldn't add page of inline data"); + __free_page(page); + goto error_bio; + } + + datalen -= pg_datalen; + data += pg_datalen; + } + submit_bio(bio); + + sector += bio_pages * PAGE_SECTORS; + } + return 0; +error_bio: + bio_free_pages(bio); + bio_put(bio); +error: + put_io_block(lc); + return -1; +} + static int log_one_block(struct log_writes_c *lc, struct pending_block *block, sector_t sector) { struct bio *bio; struct log_write_entry entry; - size_t ret; + size_t metadatalen, ret; int i; entry.sector = cpu_to_le64(block->sector); entry.nr_sectors = cpu_to_le64(block->nr_sectors); entry.flags = cpu_to_le64(block->flags); entry.data_len = cpu_to_le64(block->datalen); + + metadatalen = (block->flags & LOG_MARK_FLAG) ? block->datalen : 0; if (write_metadata(lc, &entry, sizeof(entry), block->data, - block->datalen, sector)) { + metadatalen, sector)) { free_pending_block(lc, block); return -1; } + sector += dev_to_bio_sectors(lc, 1); + + if (block->datalen && metadatalen == 0) { + if (write_inline_data(lc, &entry, sizeof(entry), block->data, + block->datalen, sector)) { + free_pending_block(lc, block); + return -1; + } + /* we don't support both inline data & bio data */ + goto out; + } + if (!block->vec_cnt) goto out; - sector += dev_to_bio_sectors(lc, 1); atomic_inc(&lc->io_blocks); bio = bio_alloc(GFP_KERNEL, min(block->vec_cnt, BIO_MAX_PAGES)); @@ -527,6 +610,51 @@ static int log_mark(struct log_writes_c *lc, char *data) return 0; } +static int log_dax(struct log_writes_c *lc, sector_t sector, size_t bytes, + struct iov_iter *i) +{ + struct pending_block *block; + + if (!bytes) + return 0; + + block = kzalloc(sizeof(struct pending_block), GFP_KERNEL); + if (!block) { + DMERR("Error allocating dax pending block"); + return -ENOMEM; + } + + block->data = kzalloc(bytes, GFP_KERNEL); + if (!block->data) { + DMERR("Error allocating dax data space"); + kfree(block); + return -ENOMEM; + } + + /* write data provided via the iterator */ + if (!copy_from_iter(block->data, bytes, i)) { + DMERR("Error copying dax data"); + kfree(block->data); + kfree(block); + return -EIO; + } + + /* rewind the iterator so that the block driver can use it */ + iov_iter_revert(i, bytes); + + block->datalen = bytes; + block->sector = bio_to_dev_sectors(lc, sector); + block->nr_sectors = ALIGN(bytes, lc->sectorsize) >> lc->sectorshift; + + atomic_inc(&lc->pending_blocks); + spin_lock_irq(&lc->blocks_lock); + list_add_tail(&block->list, &lc->unflushed_blocks); + spin_unlock_irq(&lc->blocks_lock); + wake_up_process(lc->log_kthread); + + return 0; +} + static void log_writes_dtr(struct dm_target *ti) { struct log_writes_c *lc = ti->private; @@ -792,9 +920,46 @@ static void log_writes_io_hints(struct dm_target *ti, struct queue_limits *limit limits->io_min = limits->physical_block_size; } +static long log_writes_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, + long nr_pages, void **kaddr, pfn_t *pfn) +{ + struct log_writes_c *lc = ti->private; + sector_t sector = pgoff * PAGE_SECTORS; + int ret; + + ret = bdev_dax_pgoff(lc->dev->bdev, sector, nr_pages * PAGE_SIZE, &pgoff); + if (ret) + return ret; + return dax_direct_access(lc->dev->dax_dev, pgoff, nr_pages, kaddr, pfn); +} + +static size_t 
log_writes_dax_copy_from_iter(struct dm_target *ti, + pgoff_t pgoff, void *addr, size_t bytes, + struct iov_iter *i) +{ + struct log_writes_c *lc = ti->private; + sector_t sector = pgoff * PAGE_SECTORS; + int err; + + if (bdev_dax_pgoff(lc->dev->bdev, sector, ALIGN(bytes, PAGE_SIZE), &pgoff)) + return 0; + + /* Don't bother doing anything if logging has been disabled */ + if (!lc->logging_enabled) + goto dax_copy; + + err = log_dax(lc, sector, bytes, i); + if (err) { + DMWARN("Error %d logging DAX write", err); + return 0; + } +dax_copy: + return dax_copy_from_iter(lc->dev->dax_dev, pgoff, addr, bytes, i); +} + static struct target_type log_writes_target = { .name = "log-writes", - .version = {1, 0, 0}, + .version = {1, 1, 0}, .module = THIS_MODULE, .ctr = log_writes_ctr, .dtr = log_writes_dtr, @@ -805,6 +970,8 @@ static struct target_type log_writes_target = { .message = log_writes_message, .iterate_devices = log_writes_iterate_devices, .io_hints = log_writes_io_hints, + .direct_access = log_writes_dax_direct_access, + .dax_copy_from_iter = log_writes_dax_copy_from_iter, }; static int __init dm_log_writes_init(void) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 0e211de9bc54..204606bef9e2 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -366,7 +366,7 @@ static struct pgpath *choose_path_in_pg(struct multipath *m, pgpath = path_to_pgpath(path); - if (unlikely(lockless_dereference(m->current_pg) != pg)) { + if (unlikely(READ_ONCE(m->current_pg) != pg)) { /* Only update current_pgpath if pg changed */ spin_lock_irqsave(&m->lock, flags); m->current_pgpath = pgpath; @@ -390,7 +390,7 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) } /* Were we instructed to switch PG? */ - if (lockless_dereference(m->next_pg)) { + if (READ_ONCE(m->next_pg)) { spin_lock_irqsave(&m->lock, flags); pg = m->next_pg; if (!pg) { @@ -406,7 +406,7 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) /* Don't change PG until it has no remaining paths */ check_current_pg: - pg = lockless_dereference(m->current_pg); + pg = READ_ONCE(m->current_pg); if (pg) { pgpath = choose_path_in_pg(m, pg, nr_bytes); if (!IS_ERR_OR_NULL(pgpath)) @@ -473,7 +473,7 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, struct request *clone; /* Do we need to select a new pgpath? */ - pgpath = lockless_dereference(m->current_pgpath); + pgpath = READ_ONCE(m->current_pgpath); if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags)) pgpath = choose_pgpath(m, nr_bytes); @@ -535,7 +535,7 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m bool queue_io; /* Do we need to select a new pgpath? 
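
The new dm-log-writes DAX path above has to log the written data without consuming the caller's iterator, because dax_copy_from_iter() still performs the real copy afterwards. The trick is copy_from_iter() followed by iov_iter_revert(); in isolation it looks like this (hypothetical helper):

#include <linux/uio.h>

static int snapshot_write_payload(void *dst, size_t bytes, struct iov_iter *i)
{
        /* copy_from_iter() returns the number of bytes actually copied */
        if (!copy_from_iter(dst, bytes, i))
                return -EIO;

        /* rewind, so the subsequent dax_copy_from_iter() sees the same data */
        iov_iter_revert(i, bytes);
        return 0;
}
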
*/ - pgpath = lockless_dereference(m->current_pgpath); + pgpath = READ_ONCE(m->current_pgpath); queue_io = test_bit(MPATHF_QUEUE_IO, &m->flags); if (!pgpath || !queue_io) pgpath = choose_pgpath(m, nr_bytes); @@ -1796,7 +1796,7 @@ static int multipath_prepare_ioctl(struct dm_target *ti, struct pgpath *current_pgpath; int r; - current_pgpath = lockless_dereference(m->current_pgpath); + current_pgpath = READ_ONCE(m->current_pgpath); if (!current_pgpath) current_pgpath = choose_pgpath(m, 0); @@ -1818,7 +1818,7 @@ static int multipath_prepare_ioctl(struct dm_target *ti, } if (r == -ENOTCONN) { - if (!lockless_dereference(m->current_pg)) { + if (!READ_ONCE(m->current_pg)) { /* Path status changed, redo selection */ (void) choose_pgpath(m, 0); } @@ -1887,9 +1887,9 @@ static int multipath_busy(struct dm_target *ti) return (m->queue_mode != DM_TYPE_MQ_REQUEST_BASED); /* Guess which priority_group will be used at next mapping time */ - pg = lockless_dereference(m->current_pg); - next_pg = lockless_dereference(m->next_pg); - if (unlikely(!lockless_dereference(m->current_pgpath) && next_pg)) + pg = READ_ONCE(m->current_pg); + next_pg = READ_ONCE(m->next_pg); + if (unlikely(!READ_ONCE(m->current_pgpath) && next_pg)) pg = next_pg; if (!pg) { diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 5bfe285ea9d1..366c625b9591 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -12,7 +12,7 @@ #include "raid1.h" #include "raid5.h" #include "raid10.h" -#include "bitmap.h" +#include "md-bitmap.h" #include <linux/device-mapper.h> @@ -2143,13 +2143,6 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev) struct dm_raid_superblock *refsb; uint64_t events_sb, events_refsb; - rdev->sb_start = 0; - rdev->sb_size = bdev_logical_block_size(rdev->meta_bdev); - if (rdev->sb_size < sizeof(*sb) || rdev->sb_size > PAGE_SIZE) { - DMERR("superblock size of a logical block is no longer valid"); - return -EINVAL; - } - r = read_disk_sb(rdev, rdev->sb_size, false); if (r) return r; @@ -2494,6 +2487,17 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) if (test_bit(Journal, &rdev->flags)) continue; + if (!rdev->meta_bdev) + continue; + + /* Set superblock offset/size for metadata device. 
*/ + rdev->sb_start = 0; + rdev->sb_size = bdev_logical_block_size(rdev->meta_bdev); + if (rdev->sb_size < sizeof(struct dm_raid_superblock) || rdev->sb_size > PAGE_SIZE) { + DMERR("superblock size of a logical block is no longer valid"); + return -EINVAL; + } + /* * Skipping super_load due to CTR_FLAG_SYNC will cause * the array to undergo initialization again as @@ -2506,9 +2510,6 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags)) continue; - if (!rdev->meta_bdev) - continue; - r = super_load(rdev, freshest); switch (r) { @@ -3238,7 +3239,7 @@ static int raid_map(struct dm_target *ti, struct bio *bio) if (unlikely(bio_end_sector(bio) > mddev->array_sectors)) return DM_MAPIO_REQUEUE; - mddev->pers->make_request(mddev, bio); + md_handle_request(mddev, bio); return DM_MAPIO_SUBMITTED; } @@ -3297,11 +3298,10 @@ static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev, static sector_t rs_get_progress(struct raid_set *rs, sector_t resync_max_sectors, bool *array_in_sync) { - sector_t r, recovery_cp, curr_resync_completed; + sector_t r, curr_resync_completed; struct mddev *mddev = &rs->md; curr_resync_completed = mddev->curr_resync_completed ?: mddev->recovery_cp; - recovery_cp = mddev->recovery_cp; *array_in_sync = false; if (rs_is_raid0(rs)) { @@ -3330,9 +3330,11 @@ static sector_t rs_get_progress(struct raid_set *rs, } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) r = curr_resync_completed; else - r = recovery_cp; + r = mddev->recovery_cp; - if (r == MaxSector) { + if ((r == MaxSector) || + (test_bit(MD_RECOVERY_DONE, &mddev->recovery) && + (mddev->curr_resync_completed == resync_max_sectors))) { /* * Sync complete. */ @@ -3628,8 +3630,11 @@ static void raid_postsuspend(struct dm_target *ti) { struct raid_set *rs = ti->private; - if (!test_and_set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) + if (!test_and_set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) { + mddev_lock_nointr(&rs->md); mddev_suspend(&rs->md); + mddev_unlock(&rs->md); + } rs->md.ro = 1; } @@ -3886,13 +3891,16 @@ static void raid_resume(struct dm_target *ti) if (!(rs->ctr_flags & RESUME_STAY_FROZEN_FLAGS)) clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) + if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) { + mddev_lock_nointr(mddev); mddev_resume(mddev); + mddev_unlock(mddev); + } } static struct target_type raid_target = { .name = "raid", - .version = {1, 12, 1}, + .version = {1, 13, 0}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index eadfcfd106ff..9d32f25489c2 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -56,7 +56,7 @@ static unsigned dm_get_blk_mq_queue_depth(void) int dm_request_based(struct mapped_device *md) { - return blk_queue_stackable(md->queue); + return queue_is_rq_based(md->queue); } static void dm_old_start_queue(struct request_queue *q) diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c index 6028d8247f58..29bc51084c82 100644 --- a/drivers/md/dm-stats.c +++ b/drivers/md/dm-stats.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/errno.h> #include <linux/numa.h> #include <linux/slab.h> @@ -431,7 +432,7 @@ do_sync_free: synchronize_rcu_expedited(); dm_stat_free(&s->rcu_head); } else { - ACCESS_ONCE(dm_stat_need_rcu_barrier) = 1; + WRITE_ONCE(dm_stat_need_rcu_barrier, 1); call_rcu(&s->rcu_head, dm_stat_free); } 
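
The dm-stats conversion above is part of the tree-wide ACCESS_ONCE() removal: reads become READ_ONCE() and writes become WRITE_ONCE(). The per-CPU "was this I/O a merge candidate" heuristic is deliberately racy; it only needs each access to be a single load or store. Condensed, with the comparison grouping written out:

struct dm_stats_last_position {
        sector_t last_sector;
        unsigned last_rw;
};

static bool note_io(struct dm_stats_last_position *last, sector_t bi_sector,
                    sector_t end_sector, unsigned long bi_rw)
{
        bool merged = bi_sector == READ_ONCE(last->last_sector) &&
                      (bi_rw == WRITE) == (READ_ONCE(last->last_rw) == WRITE);

        WRITE_ONCE(last->last_sector, end_sector);
        WRITE_ONCE(last->last_rw, bi_rw);
        return merged;
}
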
return 0; @@ -639,12 +640,12 @@ void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw, */ last = raw_cpu_ptr(stats->last); stats_aux->merged = - (bi_sector == (ACCESS_ONCE(last->last_sector) && + (bi_sector == (READ_ONCE(last->last_sector) && ((bi_rw == WRITE) == - (ACCESS_ONCE(last->last_rw) == WRITE)) + (READ_ONCE(last->last_rw) == WRITE)) )); - ACCESS_ONCE(last->last_sector) = end_sector; - ACCESS_ONCE(last->last_rw) = bi_rw; + WRITE_ONCE(last->last_sector, end_sector); + WRITE_ONCE(last->last_rw, bi_rw); } rcu_read_lock(); @@ -693,22 +694,22 @@ static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared for_each_possible_cpu(cpu) { p = &s->stat_percpu[cpu][x]; - shared->tmp.sectors[READ] += ACCESS_ONCE(p->sectors[READ]); - shared->tmp.sectors[WRITE] += ACCESS_ONCE(p->sectors[WRITE]); - shared->tmp.ios[READ] += ACCESS_ONCE(p->ios[READ]); - shared->tmp.ios[WRITE] += ACCESS_ONCE(p->ios[WRITE]); - shared->tmp.merges[READ] += ACCESS_ONCE(p->merges[READ]); - shared->tmp.merges[WRITE] += ACCESS_ONCE(p->merges[WRITE]); - shared->tmp.ticks[READ] += ACCESS_ONCE(p->ticks[READ]); - shared->tmp.ticks[WRITE] += ACCESS_ONCE(p->ticks[WRITE]); - shared->tmp.io_ticks[READ] += ACCESS_ONCE(p->io_ticks[READ]); - shared->tmp.io_ticks[WRITE] += ACCESS_ONCE(p->io_ticks[WRITE]); - shared->tmp.io_ticks_total += ACCESS_ONCE(p->io_ticks_total); - shared->tmp.time_in_queue += ACCESS_ONCE(p->time_in_queue); + shared->tmp.sectors[READ] += READ_ONCE(p->sectors[READ]); + shared->tmp.sectors[WRITE] += READ_ONCE(p->sectors[WRITE]); + shared->tmp.ios[READ] += READ_ONCE(p->ios[READ]); + shared->tmp.ios[WRITE] += READ_ONCE(p->ios[WRITE]); + shared->tmp.merges[READ] += READ_ONCE(p->merges[READ]); + shared->tmp.merges[WRITE] += READ_ONCE(p->merges[WRITE]); + shared->tmp.ticks[READ] += READ_ONCE(p->ticks[READ]); + shared->tmp.ticks[WRITE] += READ_ONCE(p->ticks[WRITE]); + shared->tmp.io_ticks[READ] += READ_ONCE(p->io_ticks[READ]); + shared->tmp.io_ticks[WRITE] += READ_ONCE(p->io_ticks[WRITE]); + shared->tmp.io_ticks_total += READ_ONCE(p->io_ticks_total); + shared->tmp.time_in_queue += READ_ONCE(p->time_in_queue); if (s->n_histogram_entries) { unsigned i; for (i = 0; i < s->n_histogram_entries + 1; i++) - shared->tmp.histogram[i] += ACCESS_ONCE(p->histogram[i]); + shared->tmp.histogram[i] += READ_ONCE(p->histogram[i]); } } } diff --git a/drivers/md/dm-stats.h b/drivers/md/dm-stats.h index f1c0956e3843..2ddfae678f32 100644 --- a/drivers/md/dm-stats.h +++ b/drivers/md/dm-stats.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef DM_STATS_H #define DM_STATS_H diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c index 4c8de1ff78ca..8d0ba879777e 100644 --- a/drivers/md/dm-switch.c +++ b/drivers/md/dm-switch.c @@ -144,7 +144,7 @@ static unsigned switch_region_table_read(struct switch_ctx *sctx, unsigned long switch_get_position(sctx, region_nr, ®ion_index, &bit); - return (ACCESS_ONCE(sctx->region_table[region_index]) >> bit) & + return (READ_ONCE(sctx->region_table[region_index]) >> bit) & ((1 << sctx->region_table_entry_bits) - 1); } diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index ef7b8f201f73..433bd3f3014c 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -451,15 +451,15 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, return r; } - atomic_set(&dd->count, 0); + refcount_set(&dd->count, 1); list_add(&dd->list, &t->devices); } else if (dd->dm_dev->mode != (mode | dd->dm_dev->mode)) { r = upgrade_mode(dd, mode, t->md); 
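
dm_get_device()/dm_put_device() above move dd->count from atomic_t to refcount_t, which saturates and warns on overflow or underflow instead of silently wrapping. Note that the initial reference is now taken with refcount_set(&dd->count, 1) at creation rather than atomic_set(0) followed by an unconditional atomic_inc(). The pattern in isolation (hypothetical wrappers):

#include <linux/refcount.h>

static void dd_init(struct dm_dev_internal *dd)
{
        refcount_set(&dd->count, 1);            /* creator holds the first reference */
}

static void dd_get(struct dm_dev_internal *dd)
{
        refcount_inc(&dd->count);               /* WARNs and saturates instead of overflowing */
}

static bool dd_put(struct dm_dev_internal *dd)
{
        /* true when the last reference is gone and the device can be released */
        return refcount_dec_and_test(&dd->count);
}
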
if (r) return r; + refcount_inc(&dd->count); } - atomic_inc(&dd->count); *result = dd->dm_dev; return 0; @@ -515,7 +515,7 @@ void dm_put_device(struct dm_target *ti, struct dm_dev *d) dm_device_name(ti->table->md), d->name); return; } - if (atomic_dec_and_test(&dd->count)) { + if (refcount_dec_and_test(&dd->count)) { dm_put_table_device(ti->table->md, d); list_del(&dd->list); kfree(dd); @@ -1000,7 +1000,7 @@ verify_rq_based: list_for_each_entry(dd, devices, list) { struct request_queue *q = bdev_get_queue(dd->dm_dev->bdev); - if (!blk_queue_stackable(q)) { + if (!queue_is_rq_based(q)) { DMERR("table load rejected: including" " non-request-stackable devices"); return -EINVAL; @@ -1847,19 +1847,6 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, */ if (blk_queue_add_random(q) && dm_table_all_devices_attribute(t, device_is_not_random)) queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, q); - - /* - * QUEUE_FLAG_STACKABLE must be set after all queue settings are - * visible to other CPUs because, once the flag is set, incoming bios - * are processed by request-based dm, which refers to the queue - * settings. - * Until the flag set, bios are passed to bio-based dm and queued to - * md->deferred where queue settings are not needed yet. - * Those bios are passed to request-based dm at the resume time. - */ - smp_mb(); - if (dm_table_request_based(t)) - queue_flag_set_unlocked(QUEUE_FLAG_STACKABLE, q); } unsigned int dm_table_get_num_targets(struct dm_table *t) diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 1e25705209c2..89e5dff9b4cf 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -2431,7 +2431,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) struct pool_c *pt = pool->ti->private; bool needs_check = dm_pool_metadata_needs_check(pool->pmd); enum pool_mode old_mode = get_pool_mode(pool); - unsigned long no_space_timeout = ACCESS_ONCE(no_space_timeout_secs) * HZ; + unsigned long no_space_timeout = READ_ONCE(no_space_timeout_secs) * HZ; /* * Never allow the pool to transition to PM_WRITE mode if user diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index bda3caca23ca..aedb8222836b 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -92,74 +92,33 @@ static sector_t verity_position_at_level(struct dm_verity *v, sector_t block, return block >> (level * v->hash_per_block_bits); } -/* - * Callback function for asynchrnous crypto API completion notification - */ -static void verity_op_done(struct crypto_async_request *base, int err) -{ - struct verity_result *res = (struct verity_result *)base->data; - - if (err == -EINPROGRESS) - return; - - res->err = err; - complete(&res->completion); -} - -/* - * Wait for async crypto API callback - */ -static inline int verity_complete_op(struct verity_result *res, int ret) -{ - switch (ret) { - case 0: - break; - - case -EINPROGRESS: - case -EBUSY: - ret = wait_for_completion_interruptible(&res->completion); - if (!ret) - ret = res->err; - reinit_completion(&res->completion); - break; - - default: - DMERR("verity_wait_hash: crypto op submission failed: %d", ret); - } - - if (unlikely(ret < 0)) - DMERR("verity_wait_hash: crypto op failed: %d", ret); - - return ret; -} - static int verity_hash_update(struct dm_verity *v, struct ahash_request *req, const u8 *data, size_t len, - struct verity_result *res) + struct crypto_wait *wait) { struct scatterlist sg; sg_init_one(&sg, data, len); ahash_request_set_crypt(req, &sg, NULL, len); - 
return verity_complete_op(res, crypto_ahash_update(req)); + return crypto_wait_req(crypto_ahash_update(req), wait); } /* * Wrapper for crypto_ahash_init, which handles verity salting. */ static int verity_hash_init(struct dm_verity *v, struct ahash_request *req, - struct verity_result *res) + struct crypto_wait *wait) { int r; ahash_request_set_tfm(req, v->tfm); ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP | CRYPTO_TFM_REQ_MAY_BACKLOG, - verity_op_done, (void *)res); - init_completion(&res->completion); + crypto_req_done, (void *)wait); + crypto_init_wait(wait); - r = verity_complete_op(res, crypto_ahash_init(req)); + r = crypto_wait_req(crypto_ahash_init(req), wait); if (unlikely(r < 0)) { DMERR("crypto_ahash_init failed: %d", r); @@ -167,18 +126,18 @@ static int verity_hash_init(struct dm_verity *v, struct ahash_request *req, } if (likely(v->salt_size && (v->version >= 1))) - r = verity_hash_update(v, req, v->salt, v->salt_size, res); + r = verity_hash_update(v, req, v->salt, v->salt_size, wait); return r; } static int verity_hash_final(struct dm_verity *v, struct ahash_request *req, - u8 *digest, struct verity_result *res) + u8 *digest, struct crypto_wait *wait) { int r; if (unlikely(v->salt_size && (!v->version))) { - r = verity_hash_update(v, req, v->salt, v->salt_size, res); + r = verity_hash_update(v, req, v->salt, v->salt_size, wait); if (r < 0) { DMERR("verity_hash_final failed updating salt: %d", r); @@ -187,7 +146,7 @@ static int verity_hash_final(struct dm_verity *v, struct ahash_request *req, } ahash_request_set_crypt(req, NULL, digest, 0); - r = verity_complete_op(res, crypto_ahash_final(req)); + r = crypto_wait_req(crypto_ahash_final(req), wait); out: return r; } @@ -196,17 +155,17 @@ int verity_hash(struct dm_verity *v, struct ahash_request *req, const u8 *data, size_t len, u8 *digest) { int r; - struct verity_result res; + struct crypto_wait wait; - r = verity_hash_init(v, req, &res); + r = verity_hash_init(v, req, &wait); if (unlikely(r < 0)) goto out; - r = verity_hash_update(v, req, data, len, &res); + r = verity_hash_update(v, req, data, len, &wait); if (unlikely(r < 0)) goto out; - r = verity_hash_final(v, req, digest, &res); + r = verity_hash_final(v, req, digest, &wait); out: return r; @@ -389,7 +348,7 @@ out: * Calculates the digest for the given bio */ int verity_for_io_block(struct dm_verity *v, struct dm_verity_io *io, - struct bvec_iter *iter, struct verity_result *res) + struct bvec_iter *iter, struct crypto_wait *wait) { unsigned int todo = 1 << v->data_dev_block_bits; struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); @@ -414,7 +373,7 @@ int verity_for_io_block(struct dm_verity *v, struct dm_verity_io *io, */ sg_set_page(&sg, bv.bv_page, len, bv.bv_offset); ahash_request_set_crypt(req, &sg, NULL, len); - r = verity_complete_op(res, crypto_ahash_update(req)); + r = crypto_wait_req(crypto_ahash_update(req), wait); if (unlikely(r < 0)) { DMERR("verity_for_io_block crypto op failed: %d", r); @@ -482,7 +441,7 @@ static int verity_verify_io(struct dm_verity_io *io) struct dm_verity *v = io->v; struct bvec_iter start; unsigned b; - struct verity_result res; + struct crypto_wait wait; for (b = 0; b < io->n_blocks; b++) { int r; @@ -507,17 +466,17 @@ static int verity_verify_io(struct dm_verity_io *io) continue; } - r = verity_hash_init(v, req, &res); + r = verity_hash_init(v, req, &wait); if (unlikely(r < 0)) return r; start = io->iter; - r = verity_for_io_block(v, io, &io->iter, &res); + r = verity_for_io_block(v, io, &io->iter, 
&wait); if (unlikely(r < 0)) return r; r = verity_hash_final(v, req, verity_io_real_digest(v, io), - &res); + &wait); if (unlikely(r < 0)) return r; @@ -589,7 +548,7 @@ static void verity_prefetch_io(struct work_struct *work) verity_hash_at_level(v, pw->block, i, &hash_block_start, NULL); verity_hash_at_level(v, pw->block + pw->n_blocks - 1, i, &hash_block_end, NULL); if (!i) { - unsigned cluster = ACCESS_ONCE(dm_verity_prefetch_cluster); + unsigned cluster = READ_ONCE(dm_verity_prefetch_cluster); cluster >>= v->data_dev_block_bits; if (unlikely(!cluster)) diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index a59e0ada6fd3..b675bc015512 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -90,11 +90,6 @@ struct dm_verity_io { */ }; -struct verity_result { - struct completion completion; - int err; -}; - static inline struct ahash_request *verity_io_hash_req(struct dm_verity *v, struct dm_verity_io *io) { diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index b87c1741da4b..6d7bda6f8190 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -660,6 +660,7 @@ static int dmz_get_zoned_device(struct dm_target *ti, char *path) struct dmz_target *dmz = ti->private; struct request_queue *q; struct dmz_dev *dev; + sector_t aligned_capacity; int ret; /* Get the target device */ @@ -685,15 +686,17 @@ static int dmz_get_zoned_device(struct dm_target *ti, char *path) goto err; } + q = bdev_get_queue(dev->bdev); dev->capacity = i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; - if (ti->begin || (ti->len != dev->capacity)) { + aligned_capacity = dev->capacity & ~(blk_queue_zone_sectors(q) - 1); + if (ti->begin || + ((ti->len != dev->capacity) && (ti->len != aligned_capacity))) { ti->error = "Partial mapping not supported"; ret = -EINVAL; goto err; } - q = bdev_get_queue(dev->bdev); - dev->zone_nr_sectors = q->limits.chunk_sectors; + dev->zone_nr_sectors = blk_queue_zone_sectors(q); dev->zone_nr_sectors_shift = ilog2(dev->zone_nr_sectors); dev->zone_nr_blocks = dmz_sect2blk(dev->zone_nr_sectors); @@ -929,8 +932,10 @@ static int dmz_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) { struct dmz_target *dmz = ti->private; + struct dmz_dev *dev = dmz->dev; + sector_t capacity = dev->capacity & ~(dev->zone_nr_sectors - 1); - return fn(ti, dmz->ddev, 0, dmz->dev->capacity, data); + return fn(ti, dmz->ddev, 0, capacity, data); } static struct target_type dmz_type = { diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 6e54145969c5..de17b7193299 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -24,6 +24,7 @@ #include <linux/delay.h> #include <linux/wait.h> #include <linux/pr.h> +#include <linux/refcount.h> #define DM_MSG_PREFIX "core" @@ -52,6 +53,12 @@ static struct workqueue_struct *deferred_remove_workqueue; atomic_t dm_global_event_nr = ATOMIC_INIT(0); DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq); +void dm_issue_global_event(void) +{ + atomic_inc(&dm_global_event_nr); + wake_up(&dm_global_eventq); +} + /* * One of these is allocated per bio. 
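
The dm-verity rewrite above drops the driver-private verity_result completion in favour of the generic crypto_wait helpers (crypto_init_wait(), crypto_req_done(), crypto_wait_req()), which handle the -EINPROGRESS/-EBUSY cases internally. A self-contained synchronous digest using the same helpers might look as follows (a sketch, not the verity code; the data buffer must not live on the stack since it is fed through a scatterlist):

#include <crypto/hash.h>
#include <linux/scatterlist.h>

static int hash_buffer(struct crypto_ahash *tfm, const void *data,
                       unsigned int len, u8 *digest)
{
        struct ahash_request *req = ahash_request_alloc(tfm, GFP_KERNEL);
        struct crypto_wait wait;
        struct scatterlist sg;
        int err;

        if (!req)
                return -ENOMEM;

        crypto_init_wait(&wait);
        sg_init_one(&sg, data, len);
        ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
                                   CRYPTO_TFM_REQ_MAY_BACKLOG,
                                   crypto_req_done, &wait);
        /* digest must have room for crypto_ahash_digestsize(tfm) bytes */
        ahash_request_set_crypt(req, &sg, digest, len);

        /* waits for completion if the driver returns -EINPROGRESS or -EBUSY */
        err = crypto_wait_req(crypto_ahash_digest(req), &wait);

        ahash_request_free(req);
        return err;
}
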
*/ @@ -92,7 +99,7 @@ struct dm_md_mempools { struct table_device { struct list_head list; - atomic_t count; + refcount_t count; struct dm_dev dm_dev; }; @@ -108,7 +115,7 @@ static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS; static int __dm_get_module_param_int(int *module_param, int min, int max) { - int param = ACCESS_ONCE(*module_param); + int param = READ_ONCE(*module_param); int modified_param = 0; bool modified = true; @@ -130,7 +137,7 @@ static int __dm_get_module_param_int(int *module_param, int min, int max) unsigned __dm_get_module_param(unsigned *module_param, unsigned def, unsigned max) { - unsigned param = ACCESS_ONCE(*module_param); + unsigned param = READ_ONCE(*module_param); unsigned modified_param = 0; if (!param) @@ -679,10 +686,11 @@ int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, format_dev_t(td->dm_dev.name, dev); - atomic_set(&td->count, 0); + refcount_set(&td->count, 1); list_add(&td->list, &md->table_devices); + } else { + refcount_inc(&td->count); } - atomic_inc(&td->count); mutex_unlock(&md->table_devices_lock); *result = &td->dm_dev; @@ -695,7 +703,7 @@ void dm_put_table_device(struct mapped_device *md, struct dm_dev *d) struct table_device *td = container_of(d, struct table_device, dm_dev); mutex_lock(&md->table_devices_lock); - if (atomic_dec_and_test(&td->count)) { + if (refcount_dec_and_test(&td->count)) { close_table_device(td, md); list_del(&td->list); kfree(td); @@ -712,7 +720,7 @@ static void free_table_devices(struct list_head *devices) struct table_device *td = list_entry(tmp, struct table_device, list); DMWARN("dm_destroy: %s still exists with %d references", - td->dm_dev.name, atomic_read(&td->count)); + td->dm_dev.name, refcount_read(&td->count)); kfree(td); } } @@ -1613,17 +1621,6 @@ static void dm_wq_work(struct work_struct *work); void dm_init_md_queue(struct mapped_device *md) { /* - * Request-based dm devices cannot be stacked on top of bio-based dm - * devices. The type of this dm device may not have been decided yet. - * The type is decided at the first table loading time. - * To prevent problematic device stacking, clear the queue flag - * for request stacking support until then. - * - * This queue is new, so no concurrency on the queue_flags. 
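
dm_issue_global_event() above is just an atomic counter bump plus a wake-up on the global waitqueue; DM_DEV_CREATE/REMOVE/RENAME and table swaps now call it so that pollers of the global event number see every namespace change. The underlying pattern, stripped of the DM specifics:

#include <linux/atomic.h>
#include <linux/wait.h>

static atomic_t event_nr = ATOMIC_INIT(0);
static DECLARE_WAIT_QUEUE_HEAD(eventq);

static void issue_event(void)
{
        atomic_inc(&event_nr);
        wake_up(&eventq);
}

/* block until the counter moves past the value the caller last saw */
static int wait_for_event(int last_seen)
{
        return wait_event_interruptible(eventq,
                                        atomic_read(&event_nr) != last_seen);
}
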
- */ - queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); - - /* * Initialize data that will only be used by a non-blk-mq DM queue * - must do so here (in alloc_dev callchain) before queue is used */ @@ -1689,7 +1686,7 @@ static struct mapped_device *alloc_dev(int minor) struct mapped_device *md; void *old_md; - md = kzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id); + md = kvzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id); if (!md) { DMWARN("unable to allocate device, out of memory."); return NULL; @@ -1789,7 +1786,7 @@ bad_io_barrier: bad_minor: module_put(THIS_MODULE); bad_module_get: - kfree(md); + kvfree(md); return NULL; } @@ -1808,7 +1805,7 @@ static void free_dev(struct mapped_device *md) free_minor(minor); module_put(THIS_MODULE); - kfree(md); + kvfree(md); } static void __bind_mempools(struct mapped_device *md, struct dm_table *t) @@ -1865,9 +1862,8 @@ static void event_callback(void *context) dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj); atomic_inc(&md->event_nr); - atomic_inc(&dm_global_event_nr); wake_up(&md->eventq); - wake_up(&dm_global_eventq); + dm_issue_global_event(); } /* @@ -2067,17 +2063,12 @@ struct mapped_device *dm_get_md(dev_t dev) spin_lock(&_minor_lock); md = idr_find(&_minor_idr, minor); - if (md) { - if ((md == MINOR_ALLOCED || - (MINOR(disk_devt(dm_disk(md))) != minor) || - dm_deleting_md(md) || - test_bit(DMF_FREEING, &md->flags))) { - md = NULL; - goto out; - } - dm_get(md); + if (!md || md == MINOR_ALLOCED || (MINOR(disk_devt(dm_disk(md))) != minor) || + test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) { + md = NULL; + goto out; } - + dm_get(md); out: spin_unlock(&_minor_lock); @@ -2283,6 +2274,7 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) } map = __bind(md, table, &limits); + dm_issue_global_event(); out: mutex_unlock(&md->suspend_lock); @@ -2703,11 +2695,15 @@ struct mapped_device *dm_get_from_kobject(struct kobject *kobj) md = container_of(kobj, struct mapped_device, kobj_holder.kobj); - if (test_bit(DMF_FREEING, &md->flags) || - dm_deleting_md(md)) - return NULL; - + spin_lock(&_minor_lock); + if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) { + md = NULL; + goto out; + } dm_get(md); +out: + spin_unlock(&_minor_lock); + return md; } diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 38c84c0a35d4..36399bb875dd 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -19,6 +19,7 @@ #include <linux/hdreg.h> #include <linux/completion.h> #include <linux/kobject.h> +#include <linux/refcount.h> #include "dm-stats.h" @@ -38,7 +39,7 @@ */ struct dm_dev_internal { struct list_head list; - atomic_t count; + refcount_t count; struct dm_dev *dm_dev; }; diff --git a/drivers/md/bitmap.c b/drivers/md/md-bitmap.c index d2121637b4ab..239c7bb3929b 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/md-bitmap.c @@ -29,7 +29,7 @@ #include <linux/seq_file.h> #include <trace/events/block.h> #include "md.h" -#include "bitmap.h" +#include "md-bitmap.h" static inline char *bmname(struct bitmap *bitmap) { @@ -368,7 +368,7 @@ static int read_page(struct file *file, unsigned long index, pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE, (unsigned long long)index << PAGE_SHIFT); - bh = alloc_page_buffers(page, 1<<inode->i_blkbits, 0); + bh = alloc_page_buffers(page, 1<<inode->i_blkbits, false); if (!bh) { ret = -ENOMEM; goto out; @@ -459,7 +459,11 @@ void bitmap_update_sb(struct bitmap *bitmap) /* rocking back to read-only */ bitmap->events_cleared = bitmap->mddev->events; sb->events_cleared 
= cpu_to_le64(bitmap->events_cleared); - sb->state = cpu_to_le32(bitmap->flags); + /* + * clear BITMAP_WRITE_ERROR bit to protect against the case that + * a bitmap write error occurred but the later writes succeeded. + */ + sb->state = cpu_to_le32(bitmap->flags & ~BIT(BITMAP_WRITE_ERROR)); /* Just in case these have been changed via sysfs: */ sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ); sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind); @@ -625,7 +629,7 @@ re_read: err = read_sb_page(bitmap->mddev, offset, sb_page, - 0, PAGE_SIZE); + 0, sizeof(bitmap_super_t)); } if (err) return err; @@ -1816,6 +1820,12 @@ struct bitmap *bitmap_create(struct mddev *mddev, int slot) BUG_ON(file && mddev->bitmap_info.offset); + if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { + pr_notice("md/raid:%s: array with journal cannot have bitmap\n", + mdname(mddev)); + return ERR_PTR(-EBUSY); + } + bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); if (!bitmap) return ERR_PTR(-ENOMEM); @@ -2123,7 +2133,7 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, if (store.sb_page && bitmap->storage.sb_page) memcpy(page_address(store.sb_page), page_address(bitmap->storage.sb_page), - PAGE_SIZE); + sizeof(bitmap_super_t)); bitmap_file_unmap(&bitmap->storage); bitmap->storage = store; @@ -2152,6 +2162,7 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, for (k = 0; k < page; k++) { kfree(new_bp[k].map); } + kfree(new_bp); /* restore some fields from old_counts */ bitmap->counts.bp = old_counts.bp; @@ -2202,6 +2213,14 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, block += old_blocks; } + if (bitmap->counts.bp != old_counts.bp) { + unsigned long k; + for (k = 0; k < old_counts.pages; k++) + if (!old_counts.bp[k].hijacked) + kfree(old_counts.bp[k].map); + kfree(old_counts.bp); + } + if (!init) { int i; while (block < (chunks << chunkshift)) { diff --git a/drivers/md/bitmap.h b/drivers/md/md-bitmap.h index d15721ac07a6..5df35ca90f58 100644 --- a/drivers/md/bitmap.h +++ b/drivers/md/md-bitmap.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * bitmap.h: Copyright (C) Peter T. 
Breuer (ptb@ot.uc3m.es) 2003 * diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 03082e17c65c..79bfbc840385 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -15,7 +15,7 @@ #include <linux/sched.h> #include <linux/raid/md_p.h> #include "md.h" -#include "bitmap.h" +#include "md-bitmap.h" #include "md-cluster.h" #define LVB_SIZE 64 @@ -442,10 +442,11 @@ static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot) static void remove_suspend_info(struct mddev *mddev, int slot) { struct md_cluster_info *cinfo = mddev->cluster_info; + mddev->pers->quiesce(mddev, 1); spin_lock_irq(&cinfo->suspend_lock); __remove_suspend_info(cinfo, slot); spin_unlock_irq(&cinfo->suspend_lock); - mddev->pers->quiesce(mddev, 2); + mddev->pers->quiesce(mddev, 0); } @@ -492,13 +493,12 @@ static void process_suspend_info(struct mddev *mddev, s->lo = lo; s->hi = hi; mddev->pers->quiesce(mddev, 1); - mddev->pers->quiesce(mddev, 0); spin_lock_irq(&cinfo->suspend_lock); /* Remove existing entry (if exists) before adding */ __remove_suspend_info(cinfo, slot); list_add(&s->list, &cinfo->suspend_list); spin_unlock_irq(&cinfo->suspend_lock); - mddev->pers->quiesce(mddev, 2); + mddev->pers->quiesce(mddev, 0); } static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) @@ -1094,7 +1094,7 @@ static void metadata_update_cancel(struct mddev *mddev) /* * return 0 if all the bitmaps have the same sync_size */ -int cluster_check_sync_size(struct mddev *mddev) +static int cluster_check_sync_size(struct mddev *mddev) { int i, rv; bitmap_super_t *sb; @@ -1478,7 +1478,7 @@ static struct md_cluster_operations cluster_ops = { static int __init cluster_init(void) { - pr_warn("md-cluster: EXPERIMENTAL. Use with caution\n"); + pr_warn("md-cluster: support raid1 and raid10 (limited support)\n"); pr_info("Registering Cluster MD functions\n"); register_md_cluster_operations(&cluster_ops, THIS_MODULE); return 0; diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h index 274016177983..c0240708f443 100644 --- a/drivers/md/md-cluster.h +++ b/drivers/md/md-cluster.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _MD_CLUSTER_H diff --git a/drivers/md/faulty.c b/drivers/md/md-faulty.c index 38264b38420f..38264b38420f 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/md-faulty.c diff --git a/drivers/md/linear.c b/drivers/md/md-linear.c index c464fb48039a..773fc70dced7 100644 --- a/drivers/md/linear.c +++ b/drivers/md/md-linear.c @@ -23,7 +23,7 @@ #include <linux/slab.h> #include <trace/events/block.h> #include "md.h" -#include "linear.h" +#include "md-linear.h" /* * find which device holds a particular offset diff --git a/drivers/md/linear.h b/drivers/md/md-linear.h index 8d392e6098b3..8381d651d4ed 100644 --- a/drivers/md/linear.h +++ b/drivers/md/md-linear.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINEAR_H #define _LINEAR_H diff --git a/drivers/md/multipath.c b/drivers/md/md-multipath.c index b68e0666b9b0..e40065bdbfc8 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/md-multipath.c @@ -25,7 +25,7 @@ #include <linux/seq_file.h> #include <linux/slab.h> #include "md.h" -#include "multipath.h" +#include "md-multipath.h" #define MAX_WORK_PER_DISK 128 @@ -243,7 +243,6 @@ static void print_multipath_conf (struct mpconf *conf) static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev) { struct mpconf *conf = mddev->private; - struct request_queue *q; int err = -EEXIST; int path; struct multipath_info *p; @@ -257,7 
+256,6 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev) for (path = first; path <= last; path++) if ((p=conf->multipaths+path)->rdev == NULL) { - q = rdev->bdev->bd_disk->queue; disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); diff --git a/drivers/md/multipath.h b/drivers/md/md-multipath.h index 717c60f62898..0adb941f485a 100644 --- a/drivers/md/multipath.h +++ b/drivers/md/md-multipath.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _MULTIPATH_H #define _MULTIPATH_H diff --git a/drivers/md/md.c b/drivers/md/md.c index 08fcaebc61bd..09c3af3dcdca 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -69,7 +69,7 @@ #include <trace/events/block.h> #include "md.h" -#include "bitmap.h" +#include "md-bitmap.h" #include "md-cluster.h" #ifndef MODULE @@ -266,6 +266,52 @@ static DEFINE_SPINLOCK(all_mddevs_lock); * call has finished, the bio has been linked into some internal structure * and so is visible to ->quiesce(), so we don't need the refcount any more. */ +static bool is_suspended(struct mddev *mddev, struct bio *bio) +{ + if (mddev->suspended) + return true; + if (bio_data_dir(bio) != WRITE) + return false; + if (mddev->suspend_lo >= mddev->suspend_hi) + return false; + if (bio->bi_iter.bi_sector >= mddev->suspend_hi) + return false; + if (bio_end_sector(bio) < mddev->suspend_lo) + return false; + return true; +} + +void md_handle_request(struct mddev *mddev, struct bio *bio) +{ +check_suspended: + rcu_read_lock(); + if (is_suspended(mddev, bio)) { + DEFINE_WAIT(__wait); + for (;;) { + prepare_to_wait(&mddev->sb_wait, &__wait, + TASK_UNINTERRUPTIBLE); + if (!is_suspended(mddev, bio)) + break; + rcu_read_unlock(); + schedule(); + rcu_read_lock(); + } + finish_wait(&mddev->sb_wait, &__wait); + } + atomic_inc(&mddev->active_io); + rcu_read_unlock(); + + if (!mddev->pers->make_request(mddev, bio)) { + atomic_dec(&mddev->active_io); + wake_up(&mddev->sb_wait); + goto check_suspended; + } + + if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) + wake_up(&mddev->sb_wait); +} +EXPORT_SYMBOL(md_handle_request); + static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) { const int rw = bio_data_dir(bio); @@ -285,23 +331,6 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) bio_endio(bio); return BLK_QC_T_NONE; } -check_suspended: - rcu_read_lock(); - if (mddev->suspended) { - DEFINE_WAIT(__wait); - for (;;) { - prepare_to_wait(&mddev->sb_wait, &__wait, - TASK_UNINTERRUPTIBLE); - if (!mddev->suspended) - break; - rcu_read_unlock(); - schedule(); - rcu_read_lock(); - } - finish_wait(&mddev->sb_wait, &__wait); - } - atomic_inc(&mddev->active_io); - rcu_read_unlock(); /* * save the sectors now since our bio can @@ -310,20 +339,14 @@ check_suspended: sectors = bio_sectors(bio); /* bio could be mergeable after passing to underlayer */ bio->bi_opf &= ~REQ_NOMERGE; - if (!mddev->pers->make_request(mddev, bio)) { - atomic_dec(&mddev->active_io); - wake_up(&mddev->sb_wait); - goto check_suspended; - } + + md_handle_request(mddev, bio); cpu = part_stat_lock(); part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors); part_stat_unlock(); - if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) - wake_up(&mddev->sb_wait); - return BLK_QC_T_NONE; } @@ -336,12 +359,17 @@ check_suspended: void mddev_suspend(struct mddev *mddev) { WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk); + 
lockdep_assert_held(&mddev->reconfig_mutex); if (mddev->suspended++) return; synchronize_rcu(); wake_up(&mddev->sb_wait); + set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags); + smp_mb__after_atomic(); wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); mddev->pers->quiesce(mddev, 1); + clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags); + wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags)); del_timer_sync(&mddev->safemode_timer); } @@ -349,6 +377,7 @@ EXPORT_SYMBOL_GPL(mddev_suspend); void mddev_resume(struct mddev *mddev) { + lockdep_assert_held(&mddev->reconfig_mutex); if (--mddev->suspended) return; wake_up(&mddev->sb_wait); @@ -439,16 +468,22 @@ static void md_submit_flush_data(struct work_struct *ws) struct mddev *mddev = container_of(ws, struct mddev, flush_work); struct bio *bio = mddev->flush_bio; + /* + * must reset flush_bio before calling into md_handle_request to avoid a + * deadlock, because other bios passed md_handle_request suspend check + * could wait for this and below md_handle_request could wait for those + * bios because of suspend check + */ + mddev->flush_bio = NULL; + wake_up(&mddev->sb_wait); + if (bio->bi_iter.bi_size == 0) /* an empty barrier - all done */ bio_endio(bio); else { bio->bi_opf &= ~REQ_PREFLUSH; - mddev->pers->make_request(mddev, bio); + md_handle_request(mddev, bio); } - - mddev->flush_bio = NULL; - wake_up(&mddev->sb_wait); } void md_flush_request(struct mddev *mddev, struct bio *bio) @@ -649,6 +684,7 @@ void mddev_unlock(struct mddev *mddev) */ spin_lock(&pers_lock); md_wakeup_thread(mddev->thread); + wake_up(&mddev->sb_wait); spin_unlock(&pers_lock); } EXPORT_SYMBOL_GPL(mddev_unlock); @@ -2299,7 +2335,7 @@ static void export_array(struct mddev *mddev) static bool set_in_sync(struct mddev *mddev) { - WARN_ON_ONCE(NR_CPUS != 1 && !spin_is_locked(&mddev->lock)); + lockdep_assert_held(&mddev->lock); if (!mddev->in_sync) { mddev->sync_checkers++; spin_unlock(&mddev->lock); @@ -2418,10 +2454,18 @@ repeat: } } - /* First make sure individual recovery_offsets are correct */ + /* + * First make sure individual recovery_offsets are correct + * curr_resync_completed can only be used during recovery. + * During reshape/resync it might use array-addresses rather + * that device addresses. 
+ */ rdev_for_each(rdev, mddev) { if (rdev->raid_disk >= 0 && mddev->delta_disks >= 0 && + test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && + test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && + !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && !test_bit(Journal, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && mddev->curr_resync_completed > rdev->recovery_offset) @@ -2637,7 +2681,7 @@ state_show(struct md_rdev *rdev, char *page) { char *sep = ","; size_t len = 0; - unsigned long flags = ACCESS_ONCE(rdev->flags); + unsigned long flags = READ_ONCE(rdev->flags); if (test_bit(Faulty, &flags) || (!test_bit(ExternalBbl, &flags) && @@ -4810,7 +4854,7 @@ suspend_lo_show(struct mddev *mddev, char *page) static ssize_t suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) { - unsigned long long old, new; + unsigned long long new; int err; err = kstrtoull(buf, 10, &new); @@ -4826,16 +4870,10 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) if (mddev->pers == NULL || mddev->pers->quiesce == NULL) goto unlock; - old = mddev->suspend_lo; + mddev_suspend(mddev); mddev->suspend_lo = new; - if (new >= old) - /* Shrinking suspended region */ - mddev->pers->quiesce(mddev, 2); - else { - /* Expanding suspended region - need to wait */ - mddev->pers->quiesce(mddev, 1); - mddev->pers->quiesce(mddev, 0); - } + mddev_resume(mddev); + err = 0; unlock: mddev_unlock(mddev); @@ -4853,7 +4891,7 @@ suspend_hi_show(struct mddev *mddev, char *page) static ssize_t suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) { - unsigned long long old, new; + unsigned long long new; int err; err = kstrtoull(buf, 10, &new); @@ -4866,19 +4904,13 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) if (err) return err; err = -EINVAL; - if (mddev->pers == NULL || - mddev->pers->quiesce == NULL) + if (mddev->pers == NULL) goto unlock; - old = mddev->suspend_hi; + + mddev_suspend(mddev); mddev->suspend_hi = new; - if (new <= old) - /* Shrinking suspended region */ - mddev->pers->quiesce(mddev, 2); - else { - /* Expanding suspended region - need to wait */ - mddev->pers->quiesce(mddev, 1); - mddev->pers->quiesce(mddev, 0); - } + mddev_resume(mddev); + err = 0; unlock: mddev_unlock(mddev); @@ -5820,8 +5852,14 @@ void md_stop(struct mddev *mddev) * This is called from dm-raid */ __md_stop(mddev); - if (mddev->bio_set) + if (mddev->bio_set) { bioset_free(mddev->bio_set); + mddev->bio_set = NULL; + } + if (mddev->sync_set) { + bioset_free(mddev->sync_set); + mddev->sync_set = NULL; + } } EXPORT_SYMBOL_GPL(md_stop); @@ -6348,7 +6386,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) break; } } - if (has_journal) { + if (has_journal || mddev->bitmap) { export_rdev(rdev); return -EBUSY; } @@ -6604,22 +6642,26 @@ static int set_bitmap_file(struct mddev *mddev, int fd) return -ENOENT; /* cannot remove what isn't there */ err = 0; if (mddev->pers) { - mddev->pers->quiesce(mddev, 1); if (fd >= 0) { struct bitmap *bitmap; bitmap = bitmap_create(mddev, -1); + mddev_suspend(mddev); if (!IS_ERR(bitmap)) { mddev->bitmap = bitmap; err = bitmap_load(mddev); } else err = PTR_ERR(bitmap); - } - if (fd < 0 || err) { + if (err) { + bitmap_destroy(mddev); + fd = -1; + } + mddev_resume(mddev); + } else if (fd < 0) { + mddev_suspend(mddev); bitmap_destroy(mddev); - fd = -1; /* make sure to put the file */ + mddev_resume(mddev); } - mddev->pers->quiesce(mddev, 0); } if (fd < 0) { struct file *f = mddev->bitmap_info.file; @@ -6721,7 +6763,7 @@ static int 
set_array_info(struct mddev *mddev, mdu_array_info_t *info) void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors) { - WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__); + lockdep_assert_held(&mddev->reconfig_mutex); if (mddev->external_size) return; @@ -6903,8 +6945,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->bitmap_info.default_offset; mddev->bitmap_info.space = mddev->bitmap_info.default_space; - mddev->pers->quiesce(mddev, 1); bitmap = bitmap_create(mddev, -1); + mddev_suspend(mddev); if (!IS_ERR(bitmap)) { mddev->bitmap = bitmap; rv = bitmap_load(mddev); @@ -6912,7 +6954,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) rv = PTR_ERR(bitmap); if (rv) bitmap_destroy(mddev); - mddev->pers->quiesce(mddev, 0); + mddev_resume(mddev); } else { /* remove the bitmap */ if (!mddev->bitmap) { @@ -6935,9 +6977,9 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->bitmap_info.nodes = 0; md_cluster_ops->leave(mddev); } - mddev->pers->quiesce(mddev, 1); + mddev_suspend(mddev); bitmap_destroy(mddev); - mddev->pers->quiesce(mddev, 0); + mddev_resume(mddev); mddev->bitmap_info.offset = 0; } } @@ -7454,8 +7496,8 @@ void md_wakeup_thread(struct md_thread *thread) { if (thread) { pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm); - if (!test_and_set_bit(THREAD_WAKEUP, &thread->flags)) - wake_up(&thread->wqueue); + set_bit(THREAD_WAKEUP, &thread->flags); + wake_up(&thread->wqueue); } } EXPORT_SYMBOL(md_wakeup_thread); @@ -8025,7 +8067,8 @@ bool md_write_start(struct mddev *mddev, struct bio *bi) if (did_change) sysfs_notify_dirent_safe(mddev->sysfs_state); wait_event(mddev->sb_wait, - !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) && !mddev->suspended); + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) || + mddev->suspended); if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { percpu_ref_put(&mddev->writes_pending); return false; @@ -8096,7 +8139,6 @@ void md_allow_write(struct mddev *mddev) sysfs_notify_dirent_safe(mddev->sysfs_state); /* wait for the dirty state to be recorded in the metadata */ wait_event(mddev->sb_wait, - !test_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags) && !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); } else spin_unlock(&mddev->lock); @@ -8463,16 +8505,19 @@ void md_do_sync(struct md_thread *thread) } else { if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) mddev->curr_resync = MaxSector; - rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev) - if (rdev->raid_disk >= 0 && - mddev->delta_disks >= 0 && - !test_bit(Journal, &rdev->flags) && - !test_bit(Faulty, &rdev->flags) && - !test_bit(In_sync, &rdev->flags) && - rdev->recovery_offset < mddev->curr_resync) - rdev->recovery_offset = mddev->curr_resync; - rcu_read_unlock(); + if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) { + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) + if (rdev->raid_disk >= 0 && + mddev->delta_disks >= 0 && + !test_bit(Journal, &rdev->flags) && + !test_bit(Faulty, &rdev->flags) && + !test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < mddev->curr_resync) + rdev->recovery_offset = mddev->curr_resync; + rcu_read_unlock(); + } } } skip: @@ -8799,6 +8844,16 @@ void md_check_recovery(struct mddev *mddev) unlock: wake_up(&mddev->sb_wait); mddev_unlock(mddev); + } else if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) { + /* Write superblock - thread that called mddev_suspend() + 
* holds reconfig_mutex for us. + */ + set_bit(MD_UPDATING_SB, &mddev->flags); + smp_mb__after_atomic(); + if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags)) + md_update_sb(mddev, 0); + clear_bit_unlock(MD_UPDATING_SB, &mddev->flags); + wake_up(&mddev->sb_wait); } } EXPORT_SYMBOL(md_check_recovery); diff --git a/drivers/md/md.h b/drivers/md/md.h index 561d22b9a9a8..7d6bcf0eba0c 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -237,6 +237,12 @@ enum mddev_flags { */ MD_HAS_PPL, /* The raid array has PPL feature set */ MD_HAS_MULTIPLE_PPLS, /* The raid array has multiple PPLs feature set */ + MD_ALLOW_SB_UPDATE, /* md_check_recovery is allowed to update + * the metadata without taking reconfig_mutex. + */ + MD_UPDATING_SB, /* md_check_recovery is updating the metadata + * without explicitly holding reconfig_mutex. + */ }; enum mddev_sb_flags { @@ -494,11 +500,6 @@ static inline void mddev_lock_nointr(struct mddev *mddev) mutex_lock(&mddev->reconfig_mutex); } -static inline int mddev_is_locked(struct mddev *mddev) -{ - return mutex_is_locked(&mddev->reconfig_mutex); -} - static inline int mddev_trylock(struct mddev *mddev) { return mutex_trylock(&mddev->reconfig_mutex); @@ -538,12 +539,11 @@ struct md_personality int (*check_reshape) (struct mddev *mddev); int (*start_reshape) (struct mddev *mddev); void (*finish_reshape) (struct mddev *mddev); - /* quiesce moves between quiescence states - * 0 - fully active - * 1 - no new requests allowed - * others - reserved + /* quiesce suspends or resumes internal processing. + * 1 - stop new actions and wait for action io to complete + * 0 - return to normal behaviour */ - void (*quiesce) (struct mddev *mddev, int state); + void (*quiesce) (struct mddev *mddev, int quiesce); /* takeover is used to transition an array from one * personality to another. The new personality must be able * to handle the data in the current layout. @@ -692,6 +692,7 @@ extern void md_stop_writes(struct mddev *mddev); extern int md_rdev_init(struct md_rdev *rdev); extern void md_rdev_clear(struct md_rdev *rdev); +extern void md_handle_request(struct mddev *mddev, struct bio *bio); extern void mddev_suspend(struct mddev *mddev); extern void mddev_resume(struct mddev *mddev); extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, diff --git a/drivers/md/persistent-data/Makefile b/drivers/md/persistent-data/Makefile index ff528792c358..66be7c66479a 100644 --- a/drivers/md/persistent-data/Makefile +++ b/drivers/md/persistent-data/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_DM_PERSISTENT_DATA) += dm-persistent-data.o dm-persistent-data-objs := \ dm-array.o \ diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c index 4aed69d9dd17..aec449243966 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.c +++ b/drivers/md/persistent-data/dm-space-map-metadata.c @@ -11,6 +11,7 @@ #include <linux/list.h> #include <linux/slab.h> #include <linux/device-mapper.h> +#include <linux/kernel.h> #define DM_MSG_PREFIX "space map metadata" @@ -111,7 +112,7 @@ static bool brb_empty(struct bop_ring_buffer *brb) static unsigned brb_next(struct bop_ring_buffer *brb, unsigned old) { unsigned r = old + 1; - return (r >= (sizeof(brb->bops) / sizeof(*brb->bops))) ? 0 : r; + return r >= ARRAY_SIZE(brb->bops) ? 
0 : r; } static int brb_push(struct bop_ring_buffer *brb, diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 5a00fc118470..5ecba9eef441 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -768,7 +768,7 @@ static void *raid0_takeover(struct mddev *mddev) return ERR_PTR(-EINVAL); } -static void raid0_quiesce(struct mddev *mddev, int state) +static void raid0_quiesce(struct mddev *mddev, int quiesce) { } diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h index 7127a623f5da..540e65d92642 100644 --- a/drivers/md/raid0.h +++ b/drivers/md/raid0.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _RAID0_H #define _RAID0_H diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index 9f2670b45f31..400001b815db 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Maximum size of each resync request */ #define RESYNC_BLOCK_SIZE (64*1024) #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index f3f3e40dc9d8..cc9d337a1ed3 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -37,13 +37,12 @@ #include <linux/module.h> #include <linux/seq_file.h> #include <linux/ratelimit.h> -#include <linux/sched/signal.h> #include <trace/events/block.h> #include "md.h" #include "raid1.h" -#include "bitmap.h" +#include "md-bitmap.h" #define UNSUPPORTED_MDDEV_FLAGS \ ((1L << MD_HAS_JOURNAL) | \ @@ -990,14 +989,6 @@ static void wait_barrier(struct r1conf *conf, sector_t sector_nr) _wait_barrier(conf, idx); } -static void wait_all_barriers(struct r1conf *conf) -{ - int idx; - - for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) - _wait_barrier(conf, idx); -} - static void _allow_barrier(struct r1conf *conf, int idx) { atomic_dec(&conf->nr_pending[idx]); @@ -1011,14 +1002,6 @@ static void allow_barrier(struct r1conf *conf, sector_t sector_nr) _allow_barrier(conf, idx); } -static void allow_all_barriers(struct r1conf *conf) -{ - int idx; - - for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) - _allow_barrier(conf, idx); -} - /* conf->resync_lock should be held */ static int get_unqueued_pending(struct r1conf *conf) { @@ -1303,42 +1286,28 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, int first_clone; int max_sectors; - /* - * Register the new request and wait if the reconstruction - * thread has put up a bar for new requests. - * Continue immediately if no resync is active currently. - */ - - - if ((bio_end_sector(bio) > mddev->suspend_lo && - bio->bi_iter.bi_sector < mddev->suspend_hi) || - (mddev_is_clustered(mddev) && + if (mddev_is_clustered(mddev) && md_cluster_ops->area_resyncing(mddev, WRITE, - bio->bi_iter.bi_sector, bio_end_sector(bio)))) { + bio->bi_iter.bi_sector, bio_end_sector(bio))) { - /* - * As the suspend_* range is controlled by userspace, we want - * an interruptible wait. 
- */ DEFINE_WAIT(w); for (;;) { - sigset_t full, old; prepare_to_wait(&conf->wait_barrier, - &w, TASK_INTERRUPTIBLE); - if (bio_end_sector(bio) <= mddev->suspend_lo || - bio->bi_iter.bi_sector >= mddev->suspend_hi || - (mddev_is_clustered(mddev) && - !md_cluster_ops->area_resyncing(mddev, WRITE, - bio->bi_iter.bi_sector, - bio_end_sector(bio)))) + &w, TASK_IDLE); + if (!md_cluster_ops->area_resyncing(mddev, WRITE, + bio->bi_iter.bi_sector, + bio_end_sector(bio))) break; - sigfillset(&full); - sigprocmask(SIG_BLOCK, &full, &old); schedule(); - sigprocmask(SIG_SETMASK, &old, NULL); } finish_wait(&conf->wait_barrier, &w); } + + /* + * Register the new request and wait if the reconstruction + * thread has put up a bar for new requests. + * Continue immediately if no resync is active currently. + */ wait_barrier(conf, bio->bi_iter.bi_sector); r1_bio = alloc_r1bio(mddev, bio); @@ -1654,8 +1623,12 @@ static void print_conf(struct r1conf *conf) static void close_sync(struct r1conf *conf) { - wait_all_barriers(conf); - allow_all_barriers(conf); + int idx; + + for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) { + _wait_barrier(conf, idx); + _allow_barrier(conf, idx); + } mempool_destroy(conf->r1buf_pool); conf->r1buf_pool = NULL; @@ -3277,21 +3250,14 @@ static int raid1_reshape(struct mddev *mddev) return 0; } -static void raid1_quiesce(struct mddev *mddev, int state) +static void raid1_quiesce(struct mddev *mddev, int quiesce) { struct r1conf *conf = mddev->private; - switch(state) { - case 2: /* wake for suspend */ - wake_up(&conf->wait_barrier); - break; - case 1: + if (quiesce) freeze_array(conf, 0); - break; - case 0: + else unfreeze_array(conf); - break; - } } static void *raid1_takeover(struct mddev *mddev) diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index c8894ef1e9d2..c7294e7557e0 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _RAID1_H #define _RAID1_H diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 374df5796649..b9edbc747a95 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -29,7 +29,7 @@ #include "md.h" #include "raid10.h" #include "raid0.h" -#include "bitmap.h" +#include "md-bitmap.h" /* * RAID10 provides a combination of RAID0 and RAID1 functionality. 
@@ -136,10 +136,13 @@ static void r10bio_pool_free(void *r10_bio, void *data) kfree(r10_bio); } +#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) /* amount of memory to reserve for resync requests */ #define RESYNC_WINDOW (1024*1024) /* maximum number of concurrent requests, memory permitting */ #define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE) +#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW) +#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9) /* * When performing a resync, we need to read and compare, so @@ -383,12 +386,11 @@ static void raid10_end_read_request(struct bio *bio) { int uptodate = !bio->bi_status; struct r10bio *r10_bio = bio->bi_private; - int slot, dev; + int slot; struct md_rdev *rdev; struct r10conf *conf = r10_bio->mddev->private; slot = r10_bio->read_slot; - dev = r10_bio->devs[slot].devnum; rdev = r10_bio->devs[slot].rdev; /* * this branch is our 'one mirror IO has finished' event handler: @@ -748,7 +750,6 @@ static struct md_rdev *read_balance(struct r10conf *conf, raid10_find_phys(conf, r10_bio); rcu_read_lock(); - sectors = r10_bio->sectors; best_slot = -1; best_rdev = NULL; best_dist = MaxSector; @@ -761,8 +762,11 @@ static struct md_rdev *read_balance(struct r10conf *conf, * the resync window. We take the first readable disk when * above the resync window. */ - if (conf->mddev->recovery_cp < MaxSector - && (this_sector + sectors >= conf->next_resync)) + if ((conf->mddev->recovery_cp < MaxSector + && (this_sector + sectors >= conf->next_resync)) || + (mddev_is_clustered(conf->mddev) && + md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector, + this_sector + sectors))) do_balance = 0; for (slot = 0; slot < conf->copies ; slot++) { @@ -1293,6 +1297,22 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, sector_t sectors; int max_sectors; + if ((mddev_is_clustered(mddev) && + md_cluster_ops->area_resyncing(mddev, WRITE, + bio->bi_iter.bi_sector, + bio_end_sector(bio)))) { + DEFINE_WAIT(w); + for (;;) { + prepare_to_wait(&conf->wait_barrier, + &w, TASK_IDLE); + if (!md_cluster_ops->area_resyncing(mddev, WRITE, + bio->bi_iter.bi_sector, bio_end_sector(bio))) + break; + schedule(); + } + finish_wait(&conf->wait_barrier, &w); + } + /* * Register the new request and wait if the reconstruction * thread has put up a bar for new requests. @@ -2575,7 +2595,6 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) struct bio *bio; struct r10conf *conf = mddev->private; struct md_rdev *rdev = r10_bio->devs[slot].rdev; - sector_t bio_last_sector; /* we got a read error. Maybe the drive is bad. Maybe just * the block and we can fix it. @@ -2586,7 +2605,6 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) * frozen. */ bio = r10_bio->devs[slot].bio; - bio_last_sector = r10_bio->devs[slot].addr + rdev->data_offset + r10_bio->sectors; bio_put(bio); r10_bio->devs[slot].bio = NULL; @@ -2826,6 +2844,43 @@ static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf) } /* + * Set cluster_sync_high since we need other nodes to add the + * range [cluster_sync_low, cluster_sync_high] to suspend list. + */ +static void raid10_set_cluster_sync_high(struct r10conf *conf) +{ + sector_t window_size; + int extra_chunk, chunks; + + /* + * First, here we define "stripe" as a unit which across + * all member devices one time, so we get chunks by use + * raid_disks / near_copies. 
Otherwise, if near_copies is + * close to raid_disks, then resync window could increases + * linearly with the increase of raid_disks, which means + * we will suspend a really large IO window while it is not + * necessary. If raid_disks is not divisible by near_copies, + * an extra chunk is needed to ensure the whole "stripe" is + * covered. + */ + + chunks = conf->geo.raid_disks / conf->geo.near_copies; + if (conf->geo.raid_disks % conf->geo.near_copies == 0) + extra_chunk = 0; + else + extra_chunk = 1; + window_size = (chunks + extra_chunk) * conf->mddev->chunk_sectors; + + /* + * At least use a 32M window to align with raid1's resync window + */ + window_size = (CLUSTER_RESYNC_WINDOW_SECTORS > window_size) ? + CLUSTER_RESYNC_WINDOW_SECTORS : window_size; + + conf->cluster_sync_high = conf->cluster_sync_low + window_size; +} + +/* * perform a "sync" on one "block" * * We need to make sure that no normal I/O request - particularly write @@ -2897,6 +2952,9 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) max_sector = mddev->resync_max_sectors; if (sector_nr >= max_sector) { + conf->cluster_sync_low = 0; + conf->cluster_sync_high = 0; + /* If we aborted, we need to abort the * sync on the 'current' bitmap chucks (there can * be several when recovering multiple devices). @@ -3251,7 +3309,17 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, /* resync. Schedule a read for every block at this virt offset */ int count = 0; - bitmap_cond_end_sync(mddev->bitmap, sector_nr, 0); + /* + * Since curr_resync_completed could probably not update in + * time, and we will set cluster_sync_low based on it. + * Let's check against "sector_nr + 2 * RESYNC_SECTORS" for + * safety reason, which ensures curr_resync_completed is + * updated in bitmap_cond_end_sync. + */ + bitmap_cond_end_sync(mddev->bitmap, sector_nr, + mddev_is_clustered(mddev) && + (sector_nr + 2 * RESYNC_SECTORS > + conf->cluster_sync_high)); if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, mddev->degraded) && @@ -3385,6 +3453,52 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, } while (++page_idx < RESYNC_PAGES); r10_bio->sectors = nr_sectors; + if (mddev_is_clustered(mddev) && + test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { + /* It is resync not recovery */ + if (conf->cluster_sync_high < sector_nr + nr_sectors) { + conf->cluster_sync_low = mddev->curr_resync_completed; + raid10_set_cluster_sync_high(conf); + /* Send resync message */ + md_cluster_ops->resync_info_update(mddev, + conf->cluster_sync_low, + conf->cluster_sync_high); + } + } else if (mddev_is_clustered(mddev)) { + /* This is recovery not resync */ + sector_t sect_va1, sect_va2; + bool broadcast_msg = false; + + for (i = 0; i < conf->geo.raid_disks; i++) { + /* + * sector_nr is a device address for recovery, so we + * need translate it to array address before compare + * with cluster_sync_high. + */ + sect_va1 = raid10_find_virt(conf, sector_nr, i); + + if (conf->cluster_sync_high < sect_va1 + nr_sectors) { + broadcast_msg = true; + /* + * curr_resync_completed is similar as + * sector_nr, so make the translation too. 
+ */ + sect_va2 = raid10_find_virt(conf, + mddev->curr_resync_completed, i); + + if (conf->cluster_sync_low == 0 || + conf->cluster_sync_low > sect_va2) + conf->cluster_sync_low = sect_va2; + } + } + if (broadcast_msg) { + raid10_set_cluster_sync_high(conf); + md_cluster_ops->resync_info_update(mddev, + conf->cluster_sync_low, + conf->cluster_sync_high); + } + } + while (biolist) { bio = biolist; biolist = biolist->bi_next; @@ -3644,6 +3758,18 @@ static int raid10_run(struct mddev *mddev) if (!conf) goto out; + if (mddev_is_clustered(conf->mddev)) { + int fc, fo; + + fc = (mddev->layout >> 8) & 255; + fo = mddev->layout & (1<<16); + if (fc > 1 || fo > 0) { + pr_err("only near layout is supported by clustered" + " raid10\n"); + goto out; + } + } + mddev->thread = conf->thread; conf->thread = NULL; @@ -3832,18 +3958,14 @@ static void raid10_free(struct mddev *mddev, void *priv) kfree(conf); } -static void raid10_quiesce(struct mddev *mddev, int state) +static void raid10_quiesce(struct mddev *mddev, int quiesce) { struct r10conf *conf = mddev->private; - switch(state) { - case 1: + if (quiesce) raise_barrier(conf, 0); - break; - case 0: + else lower_barrier(conf); - break; - } } static int raid10_resize(struct mddev *mddev, sector_t sectors) @@ -4578,15 +4700,18 @@ static int handle_reshape_read_error(struct mddev *mddev, /* Use sync reads to get the blocks from somewhere else */ int sectors = r10_bio->sectors; struct r10conf *conf = mddev->private; - struct { - struct r10bio r10_bio; - struct r10dev devs[conf->copies]; - } on_stack; - struct r10bio *r10b = &on_stack.r10_bio; + struct r10bio *r10b; int slot = 0; int idx = 0; struct page **pages; + r10b = kmalloc(sizeof(*r10b) + + sizeof(struct r10dev) * conf->copies, GFP_NOIO); + if (!r10b) { + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + return -ENOMEM; + } + /* reshape IOs share pages from .devs[0].bio */ pages = get_resync_pages(r10_bio->devs[0].bio)->pages; @@ -4635,11 +4760,13 @@ static int handle_reshape_read_error(struct mddev *mddev, /* couldn't read this block, must give up */ set_bit(MD_RECOVERY_INTR, &mddev->recovery); + kfree(r10b); return -EIO; } sectors -= s; idx++; } + kfree(r10b); return 0; } diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 735ce1a3d260..db2ac22ac1b4 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _RAID10_H #define _RAID10_H @@ -88,6 +89,12 @@ struct r10conf { * the new thread here until we fully activate the array. */ struct md_thread *thread; + + /* + * Keep track of cluster resync window to send to other nodes. 
+ */ + sector_t cluster_sync_low; + sector_t cluster_sync_high; }; /* diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 0b7406ac8ce1..f1c86d938502 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -23,7 +23,7 @@ #include <linux/types.h> #include "md.h" #include "raid5.h" -#include "bitmap.h" +#include "md-bitmap.h" #include "raid5-log.h" /* @@ -539,7 +539,7 @@ static void r5l_log_run_stripes(struct r5l_log *log) { struct r5l_io_unit *io, *next; - assert_spin_locked(&log->io_list_lock); + lockdep_assert_held(&log->io_list_lock); list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) { /* don't change list order */ @@ -555,7 +555,7 @@ static void r5l_move_to_end_ios(struct r5l_log *log) { struct r5l_io_unit *io, *next; - assert_spin_locked(&log->io_list_lock); + lockdep_assert_held(&log->io_list_lock); list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) { /* don't change list order */ @@ -693,6 +693,8 @@ static void r5c_disable_writeback_async(struct work_struct *work) struct r5l_log *log = container_of(work, struct r5l_log, disable_writeback_work); struct mddev *mddev = log->rdev->mddev; + struct r5conf *conf = mddev->private; + int locked = 0; if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) return; @@ -701,11 +703,15 @@ static void r5c_disable_writeback_async(struct work_struct *work) /* wait superblock change before suspend */ wait_event(mddev->sb_wait, - !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); - - mddev_suspend(mddev); - log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; - mddev_resume(mddev); + conf->log == NULL || + (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) && + (locked = mddev_trylock(mddev)))); + if (locked) { + mddev_suspend(mddev); + log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; + mddev_resume(mddev); + mddev_unlock(mddev); + } } static void r5l_submit_current_io(struct r5l_log *log) @@ -1194,7 +1200,7 @@ static void r5l_run_no_mem_stripe(struct r5l_log *log) { struct stripe_head *sh; - assert_spin_locked(&log->io_list_lock); + lockdep_assert_held(&log->io_list_lock); if (!list_empty(&log->no_mem_stripes)) { sh = list_first_entry(&log->no_mem_stripes, @@ -1210,7 +1216,7 @@ static bool r5l_complete_finished_ios(struct r5l_log *log) struct r5l_io_unit *io, *next; bool found = false; - assert_spin_locked(&log->io_list_lock); + lockdep_assert_held(&log->io_list_lock); list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) { /* don't change list order */ @@ -1382,7 +1388,7 @@ static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh) * raid5_release_stripe() while holding conf->device_lock */ BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state)); - assert_spin_locked(&conf->device_lock); + lockdep_assert_held(&conf->device_lock); list_del_init(&sh->lru); atomic_inc(&sh->count); @@ -1409,7 +1415,7 @@ void r5c_flush_cache(struct r5conf *conf, int num) int count; struct stripe_head *sh, *next; - assert_spin_locked(&conf->device_lock); + lockdep_assert_held(&conf->device_lock); if (!conf->log) return; @@ -1583,21 +1589,21 @@ void r5l_wake_reclaim(struct r5l_log *log, sector_t space) md_wakeup_thread(log->reclaim_thread); } -void r5l_quiesce(struct r5l_log *log, int state) +void r5l_quiesce(struct r5l_log *log, int quiesce) { struct mddev *mddev; - if (!log || state == 2) + if (!log) return; - if (state == 0) - kthread_unpark(log->reclaim_thread->tsk); - else if (state == 1) { + + if (quiesce) { /* make sure r5l_write_super_and_discard_space exits 
*/ mddev = log->rdev->mddev; wake_up(&mddev->sb_wait); kthread_park(log->reclaim_thread->tsk); r5l_wake_reclaim(log, MaxSector); r5l_do_reclaim(log); - } + } else + kthread_unpark(log->reclaim_thread->tsk); } bool r5l_log_disk_error(struct r5conf *conf) @@ -3165,6 +3171,8 @@ void r5l_exit_log(struct r5conf *conf) conf->log = NULL; synchronize_rcu(); + /* Ensure disable_writeback_work wakes up and exits */ + wake_up(&conf->mddev->sb_wait); flush_work(&log->disable_writeback_work); md_unregister_thread(&log->reclaim_thread); mempool_destroy(log->meta_pool); diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h index 328d67aedda4..284578b0a349 100644 --- a/drivers/md/raid5-log.h +++ b/drivers/md/raid5-log.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _RAID5_LOG_H #define _RAID5_LOG_H @@ -8,7 +9,7 @@ extern void r5l_write_stripe_run(struct r5l_log *log); extern void r5l_flush_stripe_to_raid(struct r5l_log *log); extern void r5l_stripe_write_finished(struct stripe_head *sh); extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio); -extern void r5l_quiesce(struct r5l_log *log, int state); +extern void r5l_quiesce(struct r5l_log *log, int quiesce); extern bool r5l_log_disk_error(struct r5conf *conf); extern bool r5c_is_writeback(struct r5l_log *log); extern int diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index cd026c88f7ef..628c0bf7b9fd 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -758,7 +758,8 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e, (unsigned long long)sector); rdev = conf->disks[dd_idx].rdev; - if (!rdev) { + if (!rdev || (!test_bit(In_sync, &rdev->flags) && + sector >= rdev->recovery_offset)) { pr_debug("%s:%*s data member disk %d missing\n", __func__, indent, "", dd_idx); update_parity = false; @@ -1296,8 +1297,7 @@ int ppl_init_log(struct r5conf *conf) if (ret) { goto err; - } else if (!mddev->pers && - mddev->recovery_cp == 0 && !mddev->degraded && + } else if (!mddev->pers && mddev->recovery_cp == 0 && ppl_conf->recovered_entries > 0 && ppl_conf->mismatch_count == 0) { /* diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 4188a4881148..31dc25e2871a 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -55,7 +55,6 @@ #include <linux/ratelimit.h> #include <linux/nodemask.h> #include <linux/flex_array.h> -#include <linux/sched/signal.h> #include <trace/events/block.h> #include <linux/list_sort.h> @@ -63,7 +62,7 @@ #include "md.h" #include "raid5.h" #include "raid0.h" -#include "bitmap.h" +#include "md-bitmap.h" #include "raid5-log.h" #define UNSUPPORTED_MDDEV_FLAGS (1L << MD_FAILFAST_SUPPORTED) @@ -811,6 +810,14 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh spin_unlock(&head->batch_head->batch_lock); goto unlock_out; } + /* + * We must assign batch_head of this stripe within the + * batch_lock, otherwise clear_batch_ready of batch head + * stripe could clear BATCH_READY bit of this stripe and + * this stripe->batch_head doesn't get assigned, which + * could confuse clear_batch_ready for this stripe + */ + sh->batch_head = head->batch_head; /* * at this point, head's BATCH_READY could be cleared, but we @@ -818,8 +825,6 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh */ list_add(&sh->batch_list, &head->batch_list); spin_unlock(&head->batch_head->batch_lock); - - sh->batch_head = head->batch_head; } else { head->batch_head = head; sh->batch_head = head->batch_head; @@ -1812,8 +1817,11 
@@ static void ops_complete_reconstruct(void *stripe_head_ref) struct r5dev *dev = &sh->dev[i]; if (dev->written || i == pd_idx || i == qd_idx) { - if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) + if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) { set_bit(R5_UPTODATE, &dev->flags); + if (test_bit(STRIPE_EXPAND_READY, &sh->state)) + set_bit(R5_Expanded, &dev->flags); + } if (fua) set_bit(R5_WantFUA, &dev->flags); if (sync) @@ -4599,7 +4607,8 @@ static void break_stripe_batch_list(struct stripe_head *head_sh, set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS | (1 << STRIPE_PREREAD_ACTIVE) | - (1 << STRIPE_DEGRADED)), + (1 << STRIPE_DEGRADED) | + (1 << STRIPE_ON_UNPLUG_LIST)), head_sh->state & (1 << STRIPE_INSYNC)); sh->check_state = head_sh->check_state; @@ -5675,28 +5684,6 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) goto retry; } - if (rw == WRITE && - logical_sector >= mddev->suspend_lo && - logical_sector < mddev->suspend_hi) { - raid5_release_stripe(sh); - /* As the suspend_* range is controlled by - * userspace, we want an interruptible - * wait. - */ - prepare_to_wait(&conf->wait_for_overlap, - &w, TASK_INTERRUPTIBLE); - if (logical_sector >= mddev->suspend_lo && - logical_sector < mddev->suspend_hi) { - sigset_t full, old; - sigfillset(&full); - sigprocmask(SIG_BLOCK, &full, &old); - schedule(); - sigprocmask(SIG_SETMASK, &old, NULL); - do_prepare = true; - } - goto retry; - } - if (test_bit(STRIPE_EXPANDING, &sh->state) || !add_stripe_bio(sh, bi, dd_idx, rw, previous)) { /* Stripe is busy expanding or @@ -5751,6 +5738,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk */ struct r5conf *conf = mddev->private; struct stripe_head *sh; + struct md_rdev *rdev; sector_t first_sector, last_sector; int raid_disks = conf->previous_raid_disks; int data_disks = raid_disks - conf->max_degraded; @@ -5873,6 +5861,15 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk return 0; mddev->reshape_position = conf->reshape_progress; mddev->curr_resync_completed = sector_nr; + if (!mddev->reshape_backwards) + /* Can update recovery_offset */ + rdev_for_each(rdev, mddev) + if (rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags) && + !test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < sector_nr) + rdev->recovery_offset = sector_nr; + conf->reshape_checkpoint = jiffies; set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); md_wakeup_thread(mddev->thread); @@ -5971,6 +5968,14 @@ finish: goto ret; mddev->reshape_position = conf->reshape_progress; mddev->curr_resync_completed = sector_nr; + if (!mddev->reshape_backwards) + /* Can update recovery_offset */ + rdev_for_each(rdev, mddev) + if (rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags) && + !test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < sector_nr) + rdev->recovery_offset = sector_nr; conf->reshape_checkpoint = jiffies; set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); md_wakeup_thread(mddev->thread); @@ -6065,7 +6070,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n */ rcu_read_lock(); for (i = 0; i < conf->raid_disks; i++) { - struct md_rdev *rdev = ACCESS_ONCE(conf->disks[i].rdev); + struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev); if (rdev == NULL || test_bit(Faulty, &rdev->flags)) still_degraded = 1; @@ -6568,14 +6573,17 @@ static ssize_t raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) { struct r5conf *conf; - unsigned long new; + unsigned int 
new; int err; struct r5worker_group *new_groups, *old_groups; int group_cnt, worker_cnt_per_group; if (len >= PAGE_SIZE) return -EINVAL; - if (kstrtoul(page, 10, &new)) + if (kstrtouint(page, 10, &new)) + return -EINVAL; + /* 8192 should be big enough */ + if (new > 8192) return -EINVAL; err = mddev_lock(mddev); @@ -7146,6 +7154,13 @@ static int raid5_run(struct mddev *mddev) min_offset_diff = diff; } + if ((test_bit(MD_HAS_JOURNAL, &mddev->flags) || journal_dev) && + (mddev->bitmap_info.offset || mddev->bitmap_info.file)) { + pr_notice("md/raid:%s: array cannot have both journal and bitmap\n", + mdname(mddev)); + return -EINVAL; + } + if (mddev->reshape_position != MaxSector) { /* Check that we can continue the reshape. * Difficulties arise if the stripe we would write to @@ -7948,6 +7963,7 @@ static void end_reshape(struct r5conf *conf) { if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { + struct md_rdev *rdev; spin_lock_irq(&conf->device_lock); conf->previous_raid_disks = conf->raid_disks; @@ -7955,6 +7971,11 @@ static void end_reshape(struct r5conf *conf) smp_wmb(); conf->reshape_progress = MaxSector; conf->mddev->reshape_position = MaxSector; + rdev_for_each(rdev, conf->mddev) + if (rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags) && + !test_bit(In_sync, &rdev->flags)) + rdev->recovery_offset = MaxSector; spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_for_overlap); @@ -8010,16 +8031,12 @@ static void raid5_finish_reshape(struct mddev *mddev) } } -static void raid5_quiesce(struct mddev *mddev, int state) +static void raid5_quiesce(struct mddev *mddev, int quiesce) { struct r5conf *conf = mddev->private; - switch(state) { - case 2: /* resume for a suspend */ - wake_up(&conf->wait_for_overlap); - break; - - case 1: /* stop all writes */ + if (quiesce) { + /* stop all writes */ lock_all_device_hash_locks_irq(conf); /* '2' tells resync/reshape to pause so that all * active stripes can drain @@ -8035,17 +8052,15 @@ static void raid5_quiesce(struct mddev *mddev, int state) unlock_all_device_hash_locks_irq(conf); /* allow reshape to continue */ wake_up(&conf->wait_for_overlap); - break; - - case 0: /* re-enable writes */ + } else { + /* re-enable writes */ lock_all_device_hash_locks_irq(conf); conf->quiesce = 0; wake_up(&conf->wait_for_quiescent); wake_up(&conf->wait_for_overlap); unlock_all_device_hash_locks_irq(conf); - break; } - r5l_quiesce(conf->log, state); + r5l_quiesce(conf->log, quiesce); } static void *raid45_takeover_raid0(struct mddev *mddev, int level) diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index f6536399677a..2e6123825095 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _RAID5_H #define _RAID5_H |
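
Note on the md_handle_request()/is_suspended() hunks in md.c above: suspend_lo/suspend_hi handling moves from the individual personalities into the md core, so a write that overlaps the suspended range now waits on sb_wait inside md_handle_request() (and the whole-array mddev->suspended flag stalls all I/O). That is why the interruptible suspend_lo/hi waits in raid1_write_request() and raid5_make_request() and the old quiesce state '2' disappear elsewhere in this diff. Below is a minimal standalone sketch of just the range test, not kernel code; write_must_wait() and the sector numbers in main() are illustrative assumptions, and the mddev->suspended and read-vs-write checks are not modeled.

/*
 * Standalone sketch of the suspend-range test applied by
 * md_handle_request() in the md.c hunk above.  Only the
 * [suspend_lo, suspend_hi) overlap check is modeled; function name
 * and example sectors are illustrative only.
 */
#include <stdbool.h>
#include <stdio.h>

typedef unsigned long long sector_t;

static bool write_must_wait(sector_t suspend_lo, sector_t suspend_hi,
			    sector_t bi_sector, sector_t sectors)
{
	sector_t end_sector = bi_sector + sectors;

	if (suspend_lo >= suspend_hi)	/* no active suspended region */
		return false;
	if (bi_sector >= suspend_hi)	/* starts at or above the region */
		return false;
	if (end_sector < suspend_lo)	/* ends below the region */
		return false;
	return true;			/* overlaps: the write must wait */
}

int main(void)
{
	/* Suspended region is [1000, 2000) in this example. */
	printf("%d\n", write_must_wait(1000, 2000, 500, 100));	/* 0: below  */
	printf("%d\n", write_must_wait(1000, 2000, 1500, 8));	/* 1: inside */
	printf("%d\n", write_must_wait(1000, 2000, 2000, 8));	/* 0: above  */
	return 0;
}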
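
Note on raid10_set_cluster_sync_high() added in the raid10.c hunks above: the suspend window broadcast to other cluster nodes covers one full "stripe" (raid_disks / near_copies chunks, plus one extra chunk when the division is not exact), but never less than the CLUSTER_RESYNC_WINDOW_SECTORS floor; cluster_sync_high is then cluster_sync_low plus that window. The following is a minimal standalone sketch of the arithmetic, not kernel code; the macro values come from the raid10.c hunk above, while window_sectors() and the geometry in main() (5 devices, near=2 layout, 2048-sector chunks) are illustrative assumptions.

/*
 * Standalone sketch of the clustered raid10 resync-window arithmetic.
 * Macro values are taken from the raid10.c hunk above; everything else
 * (function name, example geometry) is illustrative only.
 */
#include <stdio.h>

#define RESYNC_WINDOW                 (1024 * 1024)
#define CLUSTER_RESYNC_WINDOW         (16 * RESYNC_WINDOW)
#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)

static unsigned long long window_sectors(int raid_disks, int near_copies,
					 unsigned long long chunk_sectors)
{
	/* One "stripe" spans raid_disks / near_copies chunks; add one
	 * extra chunk when raid_disks is not divisible by near_copies. */
	int chunks = raid_disks / near_copies;
	int extra_chunk = (raid_disks % near_copies) ? 1 : 0;
	unsigned long long window = (chunks + extra_chunk) * chunk_sectors;

	/* Never suspend less than the fixed cluster window floor. */
	if (window < CLUSTER_RESYNC_WINDOW_SECTORS)
		window = CLUSTER_RESYNC_WINDOW_SECTORS;
	return window;
}

int main(void)
{
	/* Example: 5 devices, near_copies = 2, 1 MiB (2048-sector) chunks:
	 * 3 chunks = 6144 sectors, so the 32768-sector floor wins. */
	printf("window = %llu sectors\n", window_sectors(5, 2, 2048));
	return 0;
}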