diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-01-27 12:55:48 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-01-27 12:55:48 -0800 |
commit | 22a8f39c520fc577c02b4e5c99f8bb3b6017680b (patch) | |
tree | b3c89646e533a994c87c5ffe12da413d1c4a47f6 /drivers | |
parent | 48b4b4ff1ee044a977929bcf80e79f8212f756b4 (diff) | |
parent | e3de04469a49ee09c89e80bf821508df458ccee6 (diff) |
Merge tag 'for-5.6/drivers-2020-01-27' of git://git.kernel.dk/linux-block
Pull block driver updates from Jens Axboe:
"Like the core side, not a lot of changes here, just two main items:
- Series of patches (via Coly) with fixes for bcache (Coly,
Christoph)
- MD pull request from Song"
* tag 'for-5.6/drivers-2020-01-27' of git://git.kernel.dk/linux-block: (31 commits)
bcache: reap from tail of c->btree_cache in bch_mca_scan()
bcache: reap c->btree_cache_freeable from the tail in bch_mca_scan()
bcache: remove member accessed from struct btree
bcache: print written and keys in trace_bcache_btree_write
bcache: avoid unnecessary btree nodes flushing in btree_flush_write()
bcache: add code comments for state->pool in __btree_sort()
lib: crc64: include <linux/crc64.h> for 'crc64_be'
bcache: use read_cache_page_gfp to read the superblock
bcache: store a pointer to the on-disk sb in the cache and cached_dev structures
bcache: return a pointer to the on-disk sb from read_super
bcache: transfer the sb_page reference to register_{bdev,cache}
bcache: fix use-after-free in register_bcache()
bcache: properly initialize 'path' and 'err' in register_bcache()
bcache: rework error unwinding in register_bcache
bcache: use a separate data structure for the on-disk super block
bcache: cached_dev_free needs to put the sb page
md/raid1: introduce wait_for_serialization
md/raid1: use bucket based mechanism for IO serialization
md: introduce a new struct for IO serialization
md: don't destroy serial_info_pool if serialize_policy is true
...
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/md/bcache/bcache.h | 2 | ||||
-rw-r--r-- | drivers/md/bcache/bset.c | 5 | ||||
-rw-r--r-- | drivers/md/bcache/btree.c | 24 | ||||
-rw-r--r-- | drivers/md/bcache/btree.h | 2 | ||||
-rw-r--r-- | drivers/md/bcache/journal.c | 80 | ||||
-rw-r--r-- | drivers/md/bcache/super.c | 136 | ||||
-rw-r--r-- | drivers/md/md-bitmap.c | 25 | ||||
-rw-r--r-- | drivers/md/md.c | 254 | ||||
-rw-r--r-- | drivers/md/md.h | 45 | ||||
-rw-r--r-- | drivers/md/raid1.c | 111 | ||||
-rw-r--r-- | drivers/md/raid5.c | 21 |
11 files changed, 469 insertions, 236 deletions
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 9198c1b480d9..adf26a21fcd1 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -301,6 +301,7 @@ struct cached_dev { struct block_device *bdev; struct cache_sb sb; + struct cache_sb_disk *sb_disk; struct bio sb_bio; struct bio_vec sb_bv[1]; struct closure sb_write; @@ -403,6 +404,7 @@ enum alloc_reserve { struct cache { struct cache_set *set; struct cache_sb sb; + struct cache_sb_disk *sb_disk; struct bio sb_bio; struct bio_vec sb_bv[1]; diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index cffcdc9feefb..4385303836d8 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -1257,6 +1257,11 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter, * Our temporary buffer is the same size as the btree node's * buffer, we can just swap buffers instead of doing a big * memcpy() + * + * Don't worry event 'out' is allocated from mempool, it can + * still be swapped here. Because state->pool is a page mempool + * creaated by by mempool_init_page_pool(), which allocates + * pages by alloc_pages() indeed. */ out->magic = b->set->data->magic; diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 14d6c33b0957..fa872df4e770 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -734,34 +734,32 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, i = 0; btree_cache_used = c->btree_cache_used; - list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) { + list_for_each_entry_safe_reverse(b, t, &c->btree_cache_freeable, list) { if (nr <= 0) goto out; - if (++i > 3 && - !mca_reap(b, 0, false)) { + if (!mca_reap(b, 0, false)) { mca_data_free(b); rw_unlock(true, b); freed++; } nr--; + i++; } - for (; (nr--) && i < btree_cache_used; i++) { - if (list_empty(&c->btree_cache)) + list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) { + if (nr <= 0 || i >= btree_cache_used) goto out; - b = list_first_entry(&c->btree_cache, struct btree, list); - list_rotate_left(&c->btree_cache); - - if (!b->accessed && - !mca_reap(b, 0, false)) { + if (!mca_reap(b, 0, false)) { mca_bucket_free(b); mca_data_free(b); rw_unlock(true, b); freed++; - } else - b->accessed = 0; + } + + nr--; + i++; } out: mutex_unlock(&c->bucket_lock); @@ -1069,7 +1067,6 @@ retry: BUG_ON(!b->written); b->parent = parent; - b->accessed = 1; for (; i <= b->keys.nsets && b->keys.set[i].size; i++) { prefetch(b->keys.set[i].tree); @@ -1160,7 +1157,6 @@ retry: goto retry; } - b->accessed = 1; b->parent = parent; bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb)); diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 76cfd121a486..f4dcca449391 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -121,8 +121,6 @@ struct btree { /* Key/pointer for this btree node */ BKEY_PADDED(key); - /* Single bit - set when accessed, cleared by shrinker */ - unsigned long accessed; unsigned long seq; struct rw_semaphore lock; struct cache_set *c; diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index be2a2a201603..33ddc5269e8d 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -417,10 +417,14 @@ err: /* Journalling */ +#define nr_to_fifo_front(p, front_p, mask) (((p) - (front_p)) & (mask)) + static void btree_flush_write(struct cache_set *c) { struct btree *b, *t, *btree_nodes[BTREE_FLUSH_NR]; - unsigned int i, n; + unsigned int i, nr, ref_nr; + atomic_t *fifo_front_p, *now_fifo_front_p; + size_t mask; if (c->journal.btree_flushing) return; @@ -433,12 +437,50 @@ static void btree_flush_write(struct cache_set *c) c->journal.btree_flushing = true; spin_unlock(&c->journal.flush_write_lock); + /* get the oldest journal entry and check its refcount */ + spin_lock(&c->journal.lock); + fifo_front_p = &fifo_front(&c->journal.pin); + ref_nr = atomic_read(fifo_front_p); + if (ref_nr <= 0) { + /* + * do nothing if no btree node references + * the oldest journal entry + */ + spin_unlock(&c->journal.lock); + goto out; + } + spin_unlock(&c->journal.lock); + + mask = c->journal.pin.mask; + nr = 0; atomic_long_inc(&c->flush_write); memset(btree_nodes, 0, sizeof(btree_nodes)); - n = 0; mutex_lock(&c->bucket_lock); list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) { + /* + * It is safe to get now_fifo_front_p without holding + * c->journal.lock here, because we don't need to know + * the exactly accurate value, just check whether the + * front pointer of c->journal.pin is changed. + */ + now_fifo_front_p = &fifo_front(&c->journal.pin); + /* + * If the oldest journal entry is reclaimed and front + * pointer of c->journal.pin changes, it is unnecessary + * to scan c->btree_cache anymore, just quit the loop and + * flush out what we have already. + */ + if (now_fifo_front_p != fifo_front_p) + break; + /* + * quit this loop if all matching btree nodes are + * scanned and record in btree_nodes[] already. + */ + ref_nr = atomic_read(fifo_front_p); + if (nr >= ref_nr) + break; + if (btree_node_journal_flush(b)) pr_err("BUG: flush_write bit should not be set here!"); @@ -454,17 +496,44 @@ static void btree_flush_write(struct cache_set *c) continue; } + /* + * Only select the btree node which exactly references + * the oldest journal entry. + * + * If the journal entry pointed by fifo_front_p is + * reclaimed in parallel, don't worry: + * - the list_for_each_xxx loop will quit when checking + * next now_fifo_front_p. + * - If there are matched nodes recorded in btree_nodes[], + * they are clean now (this is why and how the oldest + * journal entry can be reclaimed). These selected nodes + * will be ignored and skipped in the folowing for-loop. + */ + if (nr_to_fifo_front(btree_current_write(b)->journal, + fifo_front_p, + mask) != 0) { + mutex_unlock(&b->write_lock); + continue; + } + set_btree_node_journal_flush(b); mutex_unlock(&b->write_lock); - btree_nodes[n++] = b; - if (n == BTREE_FLUSH_NR) + btree_nodes[nr++] = b; + /* + * To avoid holding c->bucket_lock too long time, + * only scan for BTREE_FLUSH_NR matched btree nodes + * at most. If there are more btree nodes reference + * the oldest journal entry, try to flush them next + * time when btree_flush_write() is called. + */ + if (nr == BTREE_FLUSH_NR) break; } mutex_unlock(&c->bucket_lock); - for (i = 0; i < n; i++) { + for (i = 0; i < nr; i++) { b = btree_nodes[i]; if (!b) { pr_err("BUG: btree_nodes[%d] is NULL", i); @@ -497,6 +566,7 @@ static void btree_flush_write(struct cache_set *c) mutex_unlock(&b->write_lock); } +out: spin_lock(&c->journal.flush_write_lock); c->journal.btree_flushing = false; spin_unlock(&c->journal.flush_write_lock); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 77e9869345e7..3dea1d5acd5c 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -15,7 +15,6 @@ #include "writeback.h" #include <linux/blkdev.h> -#include <linux/buffer_head.h> #include <linux/debugfs.h> #include <linux/genhd.h> #include <linux/idr.h> @@ -60,17 +59,18 @@ struct workqueue_struct *bch_journal_wq; /* Superblock */ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, - struct page **res) + struct cache_sb_disk **res) { const char *err; - struct cache_sb *s; - struct buffer_head *bh = __bread(bdev, 1, SB_SIZE); + struct cache_sb_disk *s; + struct page *page; unsigned int i; - if (!bh) + page = read_cache_page_gfp(bdev->bd_inode->i_mapping, + SB_OFFSET >> PAGE_SHIFT, GFP_KERNEL); + if (IS_ERR(page)) return "IO error"; - - s = (struct cache_sb *) bh->b_data; + s = page_address(page) + offset_in_page(SB_OFFSET); sb->offset = le64_to_cpu(s->offset); sb->version = le64_to_cpu(s->version); @@ -188,12 +188,10 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, } sb->last_mount = (u32)ktime_get_real_seconds(); - err = NULL; - - get_page(bh->b_page); - *res = bh->b_page; + *res = s; + return NULL; err: - put_bh(bh); + put_page(page); return err; } @@ -207,15 +205,15 @@ static void write_bdev_super_endio(struct bio *bio) closure_put(&dc->sb_write); } -static void __write_super(struct cache_sb *sb, struct bio *bio) +static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out, + struct bio *bio) { - struct cache_sb *out = page_address(bio_first_page_all(bio)); unsigned int i; + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META; bio->bi_iter.bi_sector = SB_SECTOR; - bio->bi_iter.bi_size = SB_SIZE; - bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META); - bch_bio_map(bio, NULL); + __bio_add_page(bio, virt_to_page(out), SB_SIZE, + offset_in_page(out)); out->offset = cpu_to_le64(sb->offset); out->version = cpu_to_le64(sb->version); @@ -257,14 +255,14 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) down(&dc->sb_write_mutex); closure_init(cl, parent); - bio_reset(bio); + bio_init(bio, dc->sb_bv, 1); bio_set_dev(bio, dc->bdev); bio->bi_end_io = write_bdev_super_endio; bio->bi_private = dc; closure_get(cl); /* I/O request sent to backing device */ - __write_super(&dc->sb, bio); + __write_super(&dc->sb, dc->sb_disk, bio); closure_return_with_destructor(cl, bch_write_bdev_super_unlock); } @@ -306,13 +304,13 @@ void bcache_write_super(struct cache_set *c) SET_CACHE_SYNC(&ca->sb, CACHE_SYNC(&c->sb)); - bio_reset(bio); + bio_init(bio, ca->sb_bv, 1); bio_set_dev(bio, ca->bdev); bio->bi_end_io = write_super_endio; bio->bi_private = ca; closure_get(cl); - __write_super(&ca->sb, bio); + __write_super(&ca->sb, ca->sb_disk, bio); } closure_return_with_destructor(cl, bcache_write_super_unlock); @@ -1275,6 +1273,9 @@ static void cached_dev_free(struct closure *cl) mutex_unlock(&bch_register_lock); + if (dc->sb_disk) + put_page(virt_to_page(dc->sb_disk)); + if (!IS_ERR_OR_NULL(dc->bdev)) blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); @@ -1350,7 +1351,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) /* Cached device - bcache superblock */ -static int register_bdev(struct cache_sb *sb, struct page *sb_page, +static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk, struct block_device *bdev, struct cached_dev *dc) { @@ -1362,11 +1363,7 @@ static int register_bdev(struct cache_sb *sb, struct page *sb_page, memcpy(&dc->sb, sb, sizeof(struct cache_sb)); dc->bdev = bdev; dc->bdev->bd_holder = dc; - - bio_init(&dc->sb_bio, dc->sb_bio.bi_inline_vecs, 1); - bio_first_bvec_all(&dc->sb_bio)->bv_page = sb_page; - get_page(sb_page); - + dc->sb_disk = sb_disk; if (cached_dev_init(dc, sb->block_size << 9)) goto err; @@ -2136,8 +2133,8 @@ void bch_cache_release(struct kobject *kobj) for (i = 0; i < RESERVE_NR; i++) free_fifo(&ca->free[i]); - if (ca->sb_bio.bi_inline_vecs[0].bv_page) - put_page(bio_first_page_all(&ca->sb_bio)); + if (ca->sb_disk) + put_page(virt_to_page(ca->sb_disk)); if (!IS_ERR_OR_NULL(ca->bdev)) blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); @@ -2259,7 +2256,7 @@ err_free: return ret; } -static int register_cache(struct cache_sb *sb, struct page *sb_page, +static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk, struct block_device *bdev, struct cache *ca) { const char *err = NULL; /* must be set for any error case */ @@ -2269,10 +2266,7 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page, memcpy(&ca->sb, sb, sizeof(struct cache_sb)); ca->bdev = bdev; ca->bdev->bd_holder = ca; - - bio_init(&ca->sb_bio, ca->sb_bio.bi_inline_vecs, 1); - bio_first_bvec_all(&ca->sb_bio)->bv_page = sb_page; - get_page(sb_page); + ca->sb_disk = sb_disk; if (blk_queue_discard(bdev_get_queue(bdev))) ca->discard = CACHE_DISCARD(&ca->sb); @@ -2372,29 +2366,35 @@ static bool bch_is_open(struct block_device *bdev) static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, const char *buffer, size_t size) { - ssize_t ret = -EINVAL; - const char *err = "cannot allocate memory"; + const char *err; char *path = NULL; - struct cache_sb *sb = NULL; - struct block_device *bdev = NULL; - struct page *sb_page = NULL; + struct cache_sb *sb; + struct cache_sb_disk *sb_disk; + struct block_device *bdev; + ssize_t ret; + ret = -EBUSY; + err = "failed to reference bcache module"; if (!try_module_get(THIS_MODULE)) - return -EBUSY; + goto out; /* For latest state of bcache_is_reboot */ smp_mb(); + err = "bcache is in reboot"; if (bcache_is_reboot) - return -EBUSY; + goto out_module_put; + ret = -ENOMEM; + err = "cannot allocate memory"; path = kstrndup(buffer, size, GFP_KERNEL); if (!path) - goto err; + goto out_module_put; sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL); if (!sb) - goto err; + goto out_free_path; + ret = -EINVAL; err = "failed to open device"; bdev = blkdev_get_by_path(strim(path), FMODE_READ|FMODE_WRITE|FMODE_EXCL, @@ -2411,57 +2411,63 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (!IS_ERR(bdev)) bdput(bdev); if (attr == &ksysfs_register_quiet) - goto quiet_out; + goto done; } - goto err; + goto out_free_sb; } err = "failed to set blocksize"; if (set_blocksize(bdev, 4096)) - goto err_close; + goto out_blkdev_put; - err = read_super(sb, bdev, &sb_page); + err = read_super(sb, bdev, &sb_disk); if (err) - goto err_close; + goto out_blkdev_put; err = "failed to register device"; if (SB_IS_BDEV(sb)) { struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL); if (!dc) - goto err_close; + goto out_put_sb_page; mutex_lock(&bch_register_lock); - ret = register_bdev(sb, sb_page, bdev, dc); + ret = register_bdev(sb, sb_disk, bdev, dc); mutex_unlock(&bch_register_lock); /* blkdev_put() will be called in cached_dev_free() */ if (ret < 0) - goto err; + goto out_free_sb; } else { struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL); if (!ca) - goto err_close; + goto out_put_sb_page; /* blkdev_put() will be called in bch_cache_release() */ - if (register_cache(sb, sb_page, bdev, ca) != 0) - goto err; + if (register_cache(sb, sb_disk, bdev, ca) != 0) + goto out_free_sb; } -quiet_out: - ret = size; -out: - if (sb_page) - put_page(sb_page); + +done: kfree(sb); kfree(path); module_put(THIS_MODULE); - return ret; + return size; -err_close: - blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); -err: - pr_info("error %s: %s", path, err); - goto out; +out_put_sb_page: + put_page(virt_to_page(sb_disk)); +out_blkdev_put: + blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); +out_free_sb: + kfree(sb); +out_free_path: + kfree(path); + path = NULL; +out_module_put: + module_put(THIS_MODULE); +out: + pr_info("error %s: %s", path?path:"", err); + return ret; } diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 3ad18246fcb3..e230052c2107 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1019,8 +1019,6 @@ void md_bitmap_unplug(struct bitmap *bitmap) /* look at each page to see if there are any set bits that need to be * flushed out to disk */ for (i = 0; i < bitmap->storage.file_pages; i++) { - if (!bitmap->storage.filemap) - return; dirty = test_and_clear_page_attr(bitmap, i, BITMAP_PAGE_DIRTY); need_write = test_and_clear_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE); @@ -1338,7 +1336,8 @@ void md_bitmap_daemon_work(struct mddev *mddev) BITMAP_PAGE_DIRTY)) /* bitmap_unplug will handle the rest */ break; - if (test_and_clear_page_attr(bitmap, j, + if (bitmap->storage.filemap && + test_and_clear_page_attr(bitmap, j, BITMAP_PAGE_NEEDWRITE)) { write_page(bitmap, bitmap->storage.filemap[j], 0); } @@ -1790,8 +1789,8 @@ void md_bitmap_destroy(struct mddev *mddev) return; md_bitmap_wait_behind_writes(mddev); - mempool_destroy(mddev->wb_info_pool); - mddev->wb_info_pool = NULL; + if (!mddev->serialize_policy) + mddev_destroy_serial_pool(mddev, NULL, true); mutex_lock(&mddev->bitmap_info.mutex); spin_lock(&mddev->lock); @@ -1908,7 +1907,7 @@ int md_bitmap_load(struct mddev *mddev) goto out; rdev_for_each(rdev, mddev) - mddev_create_wb_pool(mddev, rdev, true); + mddev_create_serial_pool(mddev, rdev, true); if (mddev_is_clustered(mddev)) md_cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes); @@ -2475,16 +2474,16 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len) if (backlog > COUNTER_MAX) return -EINVAL; mddev->bitmap_info.max_write_behind = backlog; - if (!backlog && mddev->wb_info_pool) { - /* wb_info_pool is not needed if backlog is zero */ - mempool_destroy(mddev->wb_info_pool); - mddev->wb_info_pool = NULL; - } else if (backlog && !mddev->wb_info_pool) { - /* wb_info_pool is needed since backlog is not zero */ + if (!backlog && mddev->serial_info_pool) { + /* serial_info_pool is not needed if backlog is zero */ + if (!mddev->serialize_policy) + mddev_destroy_serial_pool(mddev, NULL, false); + } else if (backlog && !mddev->serial_info_pool) { + /* serial_info_pool is needed since backlog is not zero */ struct md_rdev *rdev; rdev_for_each(rdev, mddev) - mddev_create_wb_pool(mddev, rdev, false); + mddev_create_serial_pool(mddev, rdev, false); } if (old_mwb != backlog) md_bitmap_update_sb(mddev->bitmap); diff --git a/drivers/md/md.c b/drivers/md/md.c index 4e7c9f398bc6..4824d50526fa 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -125,74 +125,165 @@ static inline int speed_max(struct mddev *mddev) mddev->sync_speed_max : sysctl_speed_limit_max; } -static int rdev_init_wb(struct md_rdev *rdev) +static void rdev_uninit_serial(struct md_rdev *rdev) { - if (rdev->bdev->bd_queue->nr_hw_queues == 1) + if (!test_and_clear_bit(CollisionCheck, &rdev->flags)) + return; + + kvfree(rdev->serial); + rdev->serial = NULL; +} + +static void rdevs_uninit_serial(struct mddev *mddev) +{ + struct md_rdev *rdev; + + rdev_for_each(rdev, mddev) + rdev_uninit_serial(rdev); +} + +static int rdev_init_serial(struct md_rdev *rdev) +{ + /* serial_nums equals with BARRIER_BUCKETS_NR */ + int i, serial_nums = 1 << ((PAGE_SHIFT - ilog2(sizeof(atomic_t)))); + struct serial_in_rdev *serial = NULL; + + if (test_bit(CollisionCheck, &rdev->flags)) return 0; - spin_lock_init(&rdev->wb_list_lock); - INIT_LIST_HEAD(&rdev->wb_list); - init_waitqueue_head(&rdev->wb_io_wait); - set_bit(WBCollisionCheck, &rdev->flags); + serial = kvmalloc(sizeof(struct serial_in_rdev) * serial_nums, + GFP_KERNEL); + if (!serial) + return -ENOMEM; - return 1; + for (i = 0; i < serial_nums; i++) { + struct serial_in_rdev *serial_tmp = &serial[i]; + + spin_lock_init(&serial_tmp->serial_lock); + serial_tmp->serial_rb = RB_ROOT_CACHED; + init_waitqueue_head(&serial_tmp->serial_io_wait); + } + + rdev->serial = serial; + set_bit(CollisionCheck, &rdev->flags); + + return 0; +} + +static int rdevs_init_serial(struct mddev *mddev) +{ + struct md_rdev *rdev; + int ret = 0; + + rdev_for_each(rdev, mddev) { + ret = rdev_init_serial(rdev); + if (ret) + break; + } + + /* Free all resources if pool is not existed */ + if (ret && !mddev->serial_info_pool) + rdevs_uninit_serial(mddev); + + return ret; } /* - * Create wb_info_pool if rdev is the first multi-queue device flaged - * with writemostly, also write-behind mode is enabled. + * rdev needs to enable serial stuffs if it meets the conditions: + * 1. it is multi-queue device flaged with writemostly. + * 2. the write-behind mode is enabled. */ -void mddev_create_wb_pool(struct mddev *mddev, struct md_rdev *rdev, - bool is_suspend) +static int rdev_need_serial(struct md_rdev *rdev) { - if (mddev->bitmap_info.max_write_behind == 0) - return; + return (rdev && rdev->mddev->bitmap_info.max_write_behind > 0 && + rdev->bdev->bd_queue->nr_hw_queues != 1 && + test_bit(WriteMostly, &rdev->flags)); +} + +/* + * Init resource for rdev(s), then create serial_info_pool if: + * 1. rdev is the first device which return true from rdev_enable_serial. + * 2. rdev is NULL, means we want to enable serialization for all rdevs. + */ +void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, + bool is_suspend) +{ + int ret = 0; - if (!test_bit(WriteMostly, &rdev->flags) || !rdev_init_wb(rdev)) + if (rdev && !rdev_need_serial(rdev) && + !test_bit(CollisionCheck, &rdev->flags)) return; - if (mddev->wb_info_pool == NULL) { + if (!is_suspend) + mddev_suspend(mddev); + + if (!rdev) + ret = rdevs_init_serial(mddev); + else + ret = rdev_init_serial(rdev); + if (ret) + goto abort; + + if (mddev->serial_info_pool == NULL) { unsigned int noio_flag; - if (!is_suspend) - mddev_suspend(mddev); noio_flag = memalloc_noio_save(); - mddev->wb_info_pool = mempool_create_kmalloc_pool(NR_WB_INFOS, - sizeof(struct wb_info)); + mddev->serial_info_pool = + mempool_create_kmalloc_pool(NR_SERIAL_INFOS, + sizeof(struct serial_info)); memalloc_noio_restore(noio_flag); - if (!mddev->wb_info_pool) - pr_err("can't alloc memory pool for writemostly\n"); - if (!is_suspend) - mddev_resume(mddev); + if (!mddev->serial_info_pool) { + rdevs_uninit_serial(mddev); + pr_err("can't alloc memory pool for serialization\n"); + } } + +abort: + if (!is_suspend) + mddev_resume(mddev); } -EXPORT_SYMBOL_GPL(mddev_create_wb_pool); /* - * destroy wb_info_pool if rdev is the last device flaged with WBCollisionCheck. + * Free resource from rdev(s), and destroy serial_info_pool under conditions: + * 1. rdev is the last device flaged with CollisionCheck. + * 2. when bitmap is destroyed while policy is not enabled. + * 3. for disable policy, the pool is destroyed only when no rdev needs it. */ -static void mddev_destroy_wb_pool(struct mddev *mddev, struct md_rdev *rdev) +void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, + bool is_suspend) { - if (!test_and_clear_bit(WBCollisionCheck, &rdev->flags)) + if (rdev && !test_bit(CollisionCheck, &rdev->flags)) return; - if (mddev->wb_info_pool) { + if (mddev->serial_info_pool) { struct md_rdev *temp; - int num = 0; + int num = 0; /* used to track if other rdevs need the pool */ - /* - * Check if other rdevs need wb_info_pool. - */ - rdev_for_each(temp, mddev) - if (temp != rdev && - test_bit(WBCollisionCheck, &temp->flags)) + if (!is_suspend) + mddev_suspend(mddev); + rdev_for_each(temp, mddev) { + if (!rdev) { + if (!mddev->serialize_policy || + !rdev_need_serial(temp)) + rdev_uninit_serial(temp); + else + num++; + } else if (temp != rdev && + test_bit(CollisionCheck, &temp->flags)) num++; - if (!num) { - mddev_suspend(rdev->mddev); - mempool_destroy(mddev->wb_info_pool); - mddev->wb_info_pool = NULL; - mddev_resume(rdev->mddev); } + + if (rdev) + rdev_uninit_serial(rdev); + + if (num) + pr_info("The mempool could be used by other devices\n"); + else { + mempool_destroy(mddev->serial_info_pool); + mddev->serial_info_pool = NULL; + } + if (!is_suspend) + mddev_resume(mddev); } } @@ -2337,7 +2428,7 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) pr_debug("md: bind<%s>\n", b); if (mddev->raid_disks) - mddev_create_wb_pool(mddev, rdev, false); + mddev_create_serial_pool(mddev, rdev, false); if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) goto fail; @@ -2375,7 +2466,7 @@ static void unbind_rdev_from_array(struct md_rdev *rdev) bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); list_del_rcu(&rdev->same_set); pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b)); - mddev_destroy_wb_pool(rdev->mddev, rdev); + mddev_destroy_serial_pool(rdev->mddev, rdev, false); rdev->mddev = NULL; sysfs_remove_link(&rdev->kobj, "block"); sysfs_put(rdev->sysfs_state); @@ -2888,10 +2979,10 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) } } else if (cmd_match(buf, "writemostly")) { set_bit(WriteMostly, &rdev->flags); - mddev_create_wb_pool(rdev->mddev, rdev, false); + mddev_create_serial_pool(rdev->mddev, rdev, false); err = 0; } else if (cmd_match(buf, "-writemostly")) { - mddev_destroy_wb_pool(rdev->mddev, rdev); + mddev_destroy_serial_pool(rdev->mddev, rdev, false); clear_bit(WriteMostly, &rdev->flags); err = 0; } else if (cmd_match(buf, "blocked")) { @@ -5277,6 +5368,57 @@ static struct md_sysfs_entry md_fail_last_dev = __ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show, fail_last_dev_store); +static ssize_t serialize_policy_show(struct mddev *mddev, char *page) +{ + if (mddev->pers == NULL || (mddev->pers->level != 1)) + return sprintf(page, "n/a\n"); + else + return sprintf(page, "%d\n", mddev->serialize_policy); +} + +/* + * Setting serialize_policy to true to enforce write IO is not reordered + * for raid1. + */ +static ssize_t +serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) +{ + int err; + bool value; + + err = kstrtobool(buf, &value); + if (err) + return err; + + if (value == mddev->serialize_policy) + return len; + + err = mddev_lock(mddev); + if (err) + return err; + if (mddev->pers == NULL || (mddev->pers->level != 1)) { + pr_err("md: serialize_policy is only effective for raid1\n"); + err = -EINVAL; + goto unlock; + } + + mddev_suspend(mddev); + if (value) + mddev_create_serial_pool(mddev, NULL, true); + else + mddev_destroy_serial_pool(mddev, NULL, true); + mddev->serialize_policy = value; + mddev_resume(mddev); +unlock: + mddev_unlock(mddev); + return err ?: len; +} + +static struct md_sysfs_entry md_serialize_policy = +__ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show, + serialize_policy_store); + + static struct attribute *md_default_attrs[] = { &md_level.attr, &md_layout.attr, @@ -5294,6 +5436,7 @@ static struct attribute *md_default_attrs[] = { &max_corr_read_errors.attr, &md_consistency_policy.attr, &md_fail_last_dev.attr, + &md_serialize_policy.attr, NULL, }; @@ -5769,18 +5912,18 @@ int md_run(struct mddev *mddev) goto bitmap_abort; if (mddev->bitmap_info.max_write_behind > 0) { - bool creat_pool = false; + bool create_pool = false; rdev_for_each(rdev, mddev) { if (test_bit(WriteMostly, &rdev->flags) && - rdev_init_wb(rdev)) - creat_pool = true; - } - if (creat_pool && mddev->wb_info_pool == NULL) { - mddev->wb_info_pool = - mempool_create_kmalloc_pool(NR_WB_INFOS, - sizeof(struct wb_info)); - if (!mddev->wb_info_pool) { + rdev_init_serial(rdev)) + create_pool = true; + } + if (create_pool && mddev->serial_info_pool == NULL) { + mddev->serial_info_pool = + mempool_create_kmalloc_pool(NR_SERIAL_INFOS, + sizeof(struct serial_info)); + if (!mddev->serial_info_pool) { err = -ENOMEM; goto bitmap_abort; } @@ -6025,8 +6168,9 @@ static void __md_stop_writes(struct mddev *mddev) mddev->in_sync = 1; md_update_sb(mddev, 1); } - mempool_destroy(mddev->wb_info_pool); - mddev->wb_info_pool = NULL; + /* disable policy to guarantee rdevs free resources for serialization */ + mddev->serialize_policy = 0; + mddev_destroy_serial_pool(mddev, NULL, true); } void md_stop_writes(struct mddev *mddev) diff --git a/drivers/md/md.h b/drivers/md/md.h index 5f86f8adb0a4..acd681939112 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -32,6 +32,16 @@ * be retried. */ #define MD_FAILFAST (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT) + +/* + * The struct embedded in rdev is used to serialize IO. + */ +struct serial_in_rdev { + struct rb_root_cached serial_rb; + spinlock_t serial_lock; + wait_queue_head_t serial_io_wait; +}; + /* * MD's 'extended' device */ @@ -110,12 +120,7 @@ struct md_rdev { * in superblock. */ - /* - * The members for check collision of write behind IOs. - */ - struct list_head wb_list; - spinlock_t wb_list_lock; - wait_queue_head_t wb_io_wait; + struct serial_in_rdev *serial; /* used for raid1 io serialization */ struct work_struct del_work; /* used for delayed sysfs removal */ @@ -201,9 +206,9 @@ enum flag_bits { * it didn't fail, so don't use FailFast * any more for metadata */ - WBCollisionCheck, /* - * multiqueue device should check if there - * is collision between write behind bios. + CollisionCheck, /* + * check if there is collision between raid1 + * serial bios. */ }; @@ -263,12 +268,13 @@ enum mddev_sb_flags { MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */ }; -#define NR_WB_INFOS 8 -/* record current range of write behind IOs */ -struct wb_info { - sector_t lo; - sector_t hi; - struct list_head list; +#define NR_SERIAL_INFOS 8 +/* record current range of serialize IOs */ +struct serial_info { + struct rb_node node; + sector_t start; /* start sector of rb node */ + sector_t last; /* end sector of rb node */ + sector_t _subtree_last; /* highest sector in subtree of rb node */ }; struct mddev { @@ -487,13 +493,14 @@ struct mddev { */ struct work_struct flush_work; struct work_struct event_work; /* used by dm to report failure event */ - mempool_t *wb_info_pool; + mempool_t *serial_info_pool; void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); struct md_cluster_info *cluster_info; unsigned int good_device_nr; /* good device num within cluster raid */ bool has_superblocks:1; bool fail_last_dev:1; + bool serialize_policy:1; }; enum recovery_flags { @@ -737,8 +744,10 @@ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, extern void md_reload_sb(struct mddev *mddev, int raid_disk); extern void md_update_sb(struct mddev *mddev, int force); extern void md_kick_rdev_from_array(struct md_rdev * rdev); -extern void mddev_create_wb_pool(struct mddev *mddev, struct md_rdev *rdev, - bool is_suspend); +extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, + bool is_suspend); +extern void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, + bool is_suspend); struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr); struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 201fd8aec59a..cd810e195086 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -29,6 +29,7 @@ #include <linux/module.h> #include <linux/seq_file.h> #include <linux/ratelimit.h> +#include <linux/interval_tree_generic.h> #include <trace/events/block.h> @@ -50,55 +51,71 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr); #include "raid1-10.c" -static int check_and_add_wb(struct md_rdev *rdev, sector_t lo, sector_t hi) +#define START(node) ((node)->start) +#define LAST(node) ((node)->last) +INTERVAL_TREE_DEFINE(struct serial_info, node, sector_t, _subtree_last, + START, LAST, static inline, raid1_rb); + +static int check_and_add_serial(struct md_rdev *rdev, struct r1bio *r1_bio, + struct serial_info *si, int idx) { - struct wb_info *wi, *temp_wi; unsigned long flags; int ret = 0; - struct mddev *mddev = rdev->mddev; - - wi = mempool_alloc(mddev->wb_info_pool, GFP_NOIO); - - spin_lock_irqsave(&rdev->wb_list_lock, flags); - list_for_each_entry(temp_wi, &rdev->wb_list, list) { - /* collision happened */ - if (hi > temp_wi->lo && lo < temp_wi->hi) { - ret = -EBUSY; - break; - } + sector_t lo = r1_bio->sector; + sector_t hi = lo + r1_bio->sectors; + struct serial_in_rdev *serial = &rdev->serial[idx]; + + spin_lock_irqsave(&serial->serial_lock, flags); + /* collision happened */ + if (raid1_rb_iter_first(&serial->serial_rb, lo, hi)) + ret = -EBUSY; + else { + si->start = lo; + si->last = hi; + raid1_rb_insert(si, &serial->serial_rb); } - - if (!ret) { - wi->lo = lo; - wi->hi = hi; - list_add(&wi->list, &rdev->wb_list); - } else - mempool_free(wi, mddev->wb_info_pool); - spin_unlock_irqrestore(&rdev->wb_list_lock, flags); + spin_unlock_irqrestore(&serial->serial_lock, flags); return ret; } -static void remove_wb(struct md_rdev *rdev, sector_t lo, sector_t hi) +static void wait_for_serialization(struct md_rdev *rdev, struct r1bio *r1_bio) +{ + struct mddev *mddev = rdev->mddev; + struct serial_info *si; + int idx = sector_to_idx(r1_bio->sector); + struct serial_in_rdev *serial = &rdev->serial[idx]; + + if (WARN_ON(!mddev->serial_info_pool)) + return; + si = mempool_alloc(mddev->serial_info_pool, GFP_NOIO); + wait_event(serial->serial_io_wait, + check_and_add_serial(rdev, r1_bio, si, idx) == 0); +} + +static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi) { - struct wb_info *wi; + struct serial_info *si; unsigned long flags; int found = 0; struct mddev *mddev = rdev->mddev; - - spin_lock_irqsave(&rdev->wb_list_lock, flags); - list_for_each_entry(wi, &rdev->wb_list, list) - if (hi == wi->hi && lo == wi->lo) { - list_del(&wi->list); - mempool_free(wi, mddev->wb_info_pool); + int idx = sector_to_idx(lo); + struct serial_in_rdev *serial = &rdev->serial[idx]; + + spin_lock_irqsave(&serial->serial_lock, flags); + for (si = raid1_rb_iter_first(&serial->serial_rb, lo, hi); + si; si = raid1_rb_iter_next(si, lo, hi)) { + if (si->start == lo && si->last == hi) { + raid1_rb_remove(si, &serial->serial_rb); + mempool_free(si, mddev->serial_info_pool); found = 1; break; } - + } if (!found) - WARN(1, "The write behind IO is not recorded\n"); - spin_unlock_irqrestore(&rdev->wb_list_lock, flags); - wake_up(&rdev->wb_io_wait); + WARN(1, "The write IO is not recorded for serialization\n"); + spin_unlock_irqrestore(&serial->serial_lock, flags); + wake_up(&serial->serial_io_wait); } /* @@ -430,6 +447,8 @@ static void raid1_end_write_request(struct bio *bio) int mirror = find_bio_disk(r1_bio, bio); struct md_rdev *rdev = conf->mirrors[mirror].rdev; bool discard_error; + sector_t lo = r1_bio->sector; + sector_t hi = r1_bio->sector + r1_bio->sectors; discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD; @@ -499,12 +518,8 @@ static void raid1_end_write_request(struct bio *bio) } if (behind) { - if (test_bit(WBCollisionCheck, &rdev->flags)) { - sector_t lo = r1_bio->sector; - sector_t hi = r1_bio->sector + r1_bio->sectors; - - remove_wb(rdev, lo, hi); - } + if (test_bit(CollisionCheck, &rdev->flags)) + remove_serial(rdev, lo, hi); if (test_bit(WriteMostly, &rdev->flags)) atomic_dec(&r1_bio->behind_remaining); @@ -527,7 +542,8 @@ static void raid1_end_write_request(struct bio *bio) call_bio_endio(r1_bio); } } - } + } else if (rdev->mddev->serialize_policy) + remove_serial(rdev, lo, hi); if (r1_bio->bios[mirror] == NULL) rdev_dec_pending(rdev, conf->mddev); @@ -1479,6 +1495,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, for (i = 0; i < disks; i++) { struct bio *mbio = NULL; + struct md_rdev *rdev = conf->mirrors[i].rdev; if (!r1_bio->bios[i]) continue; @@ -1506,18 +1523,12 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); if (r1_bio->behind_master_bio) { - struct md_rdev *rdev = conf->mirrors[i].rdev; - - if (test_bit(WBCollisionCheck, &rdev->flags)) { - sector_t lo = r1_bio->sector; - sector_t hi = r1_bio->sector + r1_bio->sectors; - - wait_event(rdev->wb_io_wait, - check_and_add_wb(rdev, lo, hi) == 0); - } + if (test_bit(CollisionCheck, &rdev->flags)) + wait_for_serialization(rdev, r1_bio); if (test_bit(WriteMostly, &rdev->flags)) atomic_inc(&r1_bio->behind_remaining); - } + } else if (mddev->serialize_policy) + wait_for_serialization(rdev, r1_bio); r1_bio->bios[i] = mbio; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d4d3b67ffbba..ba00e9877f02 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6598,7 +6598,6 @@ raid5_show_group_thread_cnt(struct mddev *mddev, char *page) static int alloc_thread_groups(struct r5conf *conf, int cnt, int *group_cnt, - int *worker_cnt_per_group, struct r5worker_group **worker_groups); static ssize_t raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) @@ -6607,7 +6606,7 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) unsigned int new; int err; struct r5worker_group *new_groups, *old_groups; - int group_cnt, worker_cnt_per_group; + int group_cnt; if (len >= PAGE_SIZE) return -EINVAL; @@ -6630,13 +6629,11 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) if (old_groups) flush_workqueue(raid5_wq); - err = alloc_thread_groups(conf, new, - &group_cnt, &worker_cnt_per_group, - &new_groups); + err = alloc_thread_groups(conf, new, &group_cnt, &new_groups); if (!err) { spin_lock_irq(&conf->device_lock); conf->group_cnt = group_cnt; - conf->worker_cnt_per_group = worker_cnt_per_group; + conf->worker_cnt_per_group = new; conf->worker_groups = new_groups; spin_unlock_irq(&conf->device_lock); @@ -6672,16 +6669,13 @@ static struct attribute_group raid5_attrs_group = { .attrs = raid5_attrs, }; -static int alloc_thread_groups(struct r5conf *conf, int cnt, - int *group_cnt, - int *worker_cnt_per_group, +static int alloc_thread_groups(struct r5conf *conf, int cnt, int *group_cnt, struct r5worker_group **worker_groups) { int i, j, k; ssize_t size; struct r5worker *workers; - *worker_cnt_per_group = cnt; if (cnt == 0) { *group_cnt = 0; *worker_groups = NULL; @@ -6882,7 +6876,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) struct disk_info *disk; char pers_name[6]; int i; - int group_cnt, worker_cnt_per_group; + int group_cnt; struct r5worker_group *new_group; int ret; @@ -6928,10 +6922,9 @@ static struct r5conf *setup_conf(struct mddev *mddev) for (i = 0; i < PENDING_IO_MAX; i++) list_add(&conf->pending_data[i].sibling, &conf->free_list); /* Don't enable multi-threading by default*/ - if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group, - &new_group)) { + if (!alloc_thread_groups(conf, 0, &group_cnt, &new_group)) { conf->group_cnt = group_cnt; - conf->worker_cnt_per_group = worker_cnt_per_group; + conf->worker_cnt_per_group = 0; conf->worker_groups = new_group; } else goto abort; |