diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-10-13 13:04:41 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-10-13 13:04:41 -0700 |
commit | 7cd4ecd9177b94af783b8e21de7c65b41a871342 (patch) | |
tree | 3ca393f3eaeeaad56d4ab60f87e28d7197b0ba21 /drivers/md | |
parent | 79ec6d9cac46d59db9b006bc9cde2811ef365292 (diff) | |
parent | 79cd16681acccffcf5521f6e3d8c7c50aaffca0a (diff) |
Merge tag 'drivers-5.10-2020-10-12' of git://git.kernel.dk/linux-block
Pull block driver updates from Jens Axboe:
"Here are the driver updates for 5.10.
A few SCSI updates in here too, in coordination with Martin as they
depend on core block changes for the shared tag bitmap.
This contains:
- NVMe pull requests via Christoph:
- fix keep alive timer modification (Amit Engel)
- order the PCI ID list more sensibly (Andy Shevchenko)
- cleanup the open by controller helper (Chaitanya Kulkarni)
- use an xarray for the CSE log lookup (Chaitanya Kulkarni)
- support ZNS in nvmet passthrough mode (Chaitanya Kulkarni)
- fix nvme_ns_report_zones (Christoph Hellwig)
- add a sanity check to nvmet-fc (James Smart)
- fix interrupt allocation when too many polled queues are
specified (Jeffle Xu)
- small nvmet-tcp optimization (Mark Wunderlich)
- fix a controller refcount leak on init failure (Chaitanya
Kulkarni)
- misc cleanups (Chaitanya Kulkarni)
- major refactoring of the scanning code (Christoph Hellwig)
- MD updates via Song:
- Bug fixes in bitmap code, from Zhao Heming
- Fix a work queue check, from Guoqing Jiang
- Fix raid5 oops with reshape, from Song Liu
- Clean up unused code, from Jason Yan
- Discard improvements, from Xiao Ni
- raid5/6 page offset support, from Yufen Yu
- Shared tag bitmap for SCSI/hisi_sas/null_blk (John, Kashyap,
Hannes)
- null_blk open/active zone limit support (Niklas)
- Set of bcache updates (Coly, Dongsheng, Qinglang)"
* tag 'drivers-5.10-2020-10-12' of git://git.kernel.dk/linux-block: (78 commits)
md/raid5: fix oops during stripe resizing
md/bitmap: fix memory leak of temporary bitmap
md: fix the checking of wrong work queue
md/bitmap: md_bitmap_get_counter returns wrong blocks
md/bitmap: md_bitmap_read_sb uses wrong bitmap blocks
md/raid0: remove unused function is_io_in_chunk_boundary()
nvme-core: remove extra condition for vwc
nvme-core: remove extra variable
nvme: remove nvme_identify_ns_list
nvme: refactor nvme_validate_ns
nvme: move nvme_validate_ns
nvme: query namespace identifiers before adding the namespace
nvme: revalidate zone bitmaps in nvme_update_ns_info
nvme: remove nvme_update_formats
nvme: update the known admin effects
nvme: set the queue limits in nvme_update_ns_info
nvme: remove the 0 lba_shift check in nvme_update_ns_info
nvme: clean up the check for too large logic block sizes
nvme: freeze the queue over ->lba_shift updates
nvme: factor out a nvme_configure_metadata helper
...
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/bcache/alloc.c | 60 | ||||
-rw-r--r-- | drivers/md/bcache/bcache.h | 29 | ||||
-rw-r--r-- | drivers/md/bcache/btree.c | 146 | ||||
-rw-r--r-- | drivers/md/bcache/btree.h | 2 | ||||
-rw-r--r-- | drivers/md/bcache/closure.c | 16 | ||||
-rw-r--r-- | drivers/md/bcache/debug.c | 10 | ||||
-rw-r--r-- | drivers/md/bcache/extents.c | 6 | ||||
-rw-r--r-- | drivers/md/bcache/features.c | 4 | ||||
-rw-r--r-- | drivers/md/bcache/io.c | 2 | ||||
-rw-r--r-- | drivers/md/bcache/journal.c | 246 | ||||
-rw-r--r-- | drivers/md/bcache/movinggc.c | 58 | ||||
-rw-r--r-- | drivers/md/bcache/request.c | 6 | ||||
-rw-r--r-- | drivers/md/bcache/super.c | 244 | ||||
-rw-r--r-- | drivers/md/bcache/sysfs.c | 10 | ||||
-rw-r--r-- | drivers/md/bcache/writeback.c | 2 | ||||
-rw-r--r-- | drivers/md/md-bitmap.c | 16 | ||||
-rw-r--r-- | drivers/md/md-cluster.c | 1 | ||||
-rw-r--r-- | drivers/md/md.c | 22 | ||||
-rw-r--r-- | drivers/md/md.h | 2 | ||||
-rw-r--r-- | drivers/md/raid0.c | 31 | ||||
-rw-r--r-- | drivers/md/raid10.c | 431 | ||||
-rw-r--r-- | drivers/md/raid10.h | 1 | ||||
-rw-r--r-- | drivers/md/raid5.c | 278 | ||||
-rw-r--r-- | drivers/md/raid5.h | 29 |
24 files changed, 1032 insertions, 620 deletions
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 52035a78d836..8c371d5eef8e 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -49,7 +49,7 @@ * * bch_bucket_alloc() allocates a single bucket from a specific cache. * - * bch_bucket_alloc_set() allocates one or more buckets from different caches + * bch_bucket_alloc_set() allocates one bucket from different caches * out of a cache set. * * free_some_buckets() drives all the processes described above. It's called @@ -87,8 +87,7 @@ void bch_rescale_priorities(struct cache_set *c, int sectors) { struct cache *ca; struct bucket *b; - unsigned long next = c->nbuckets * c->sb.bucket_size / 1024; - unsigned int i; + unsigned long next = c->nbuckets * c->cache->sb.bucket_size / 1024; int r; atomic_sub(sectors, &c->rescale); @@ -104,14 +103,14 @@ void bch_rescale_priorities(struct cache_set *c, int sectors) c->min_prio = USHRT_MAX; - for_each_cache(ca, c, i) - for_each_bucket(b, ca) - if (b->prio && - b->prio != BTREE_PRIO && - !atomic_read(&b->pin)) { - b->prio--; - c->min_prio = min(c->min_prio, b->prio); - } + ca = c->cache; + for_each_bucket(b, ca) + if (b->prio && + b->prio != BTREE_PRIO && + !atomic_read(&b->pin)) { + b->prio--; + c->min_prio = min(c->min_prio, b->prio); + } mutex_unlock(&c->bucket_lock); } @@ -362,7 +361,7 @@ retry_invalidate: * new stuff to them: */ allocator_wait(ca, !atomic_read(&ca->set->prio_blocked)); - if (CACHE_SYNC(&ca->set->sb)) { + if (CACHE_SYNC(&ca->sb)) { /* * This could deadlock if an allocation with a btree * node locked ever blocked - having the btree node @@ -488,34 +487,29 @@ void bch_bucket_free(struct cache_set *c, struct bkey *k) } int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, - struct bkey *k, int n, bool wait) + struct bkey *k, bool wait) { - int i; + struct cache *ca; + long b; /* No allocation if CACHE_SET_IO_DISABLE bit is set */ if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags))) return -1; lockdep_assert_held(&c->bucket_lock); - BUG_ON(!n || n > c->caches_loaded || n > MAX_CACHES_PER_SET); bkey_init(k); - /* sort by free space/prio of oldest data in caches */ - - for (i = 0; i < n; i++) { - struct cache *ca = c->cache_by_alloc[i]; - long b = bch_bucket_alloc(ca, reserve, wait); + ca = c->cache; + b = bch_bucket_alloc(ca, reserve, wait); + if (b == -1) + goto err; - if (b == -1) - goto err; + k->ptr[0] = MAKE_PTR(ca->buckets[b].gen, + bucket_to_sector(c, b), + ca->sb.nr_this_dev); - k->ptr[i] = MAKE_PTR(ca->buckets[b].gen, - bucket_to_sector(c, b), - ca->sb.nr_this_dev); - - SET_KEY_PTRS(k, i + 1); - } + SET_KEY_PTRS(k, 1); return 0; err: @@ -525,12 +519,12 @@ err: } int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, - struct bkey *k, int n, bool wait) + struct bkey *k, bool wait) { int ret; mutex_lock(&c->bucket_lock); - ret = __bch_bucket_alloc_set(c, reserve, k, n, wait); + ret = __bch_bucket_alloc_set(c, reserve, k, wait); mutex_unlock(&c->bucket_lock); return ret; } @@ -589,7 +583,7 @@ static struct open_bucket *pick_data_bucket(struct cache_set *c, struct open_bucket, list); found: if (!ret->sectors_free && KEY_PTRS(alloc)) { - ret->sectors_free = c->sb.bucket_size; + ret->sectors_free = c->cache->sb.bucket_size; bkey_copy(&ret->key, alloc); bkey_init(alloc); } @@ -638,7 +632,7 @@ bool bch_alloc_sectors(struct cache_set *c, spin_unlock(&c->data_bucket_lock); - if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, wait)) + if (bch_bucket_alloc_set(c, watermark, &alloc.key, wait)) return false; spin_lock(&c->data_bucket_lock); @@ -683,7 +677,7 @@ bool bch_alloc_sectors(struct cache_set *c, &PTR_CACHE(c, &b->key, i)->sectors_written); } - if (b->sectors_free < c->sb.block_size) + if (b->sectors_free < c->cache->sb.block_size) b->sectors_free = 0; /* diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 4fd03d2496d8..1d57f48307e6 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -517,11 +517,7 @@ struct cache_set { atomic_t idle_counter; atomic_t at_max_writeback_rate; - struct cache_sb sb; - - struct cache *cache[MAX_CACHES_PER_SET]; - struct cache *cache_by_alloc[MAX_CACHES_PER_SET]; - int caches_loaded; + struct cache *cache; struct bcache_device **devices; unsigned int devices_max_used; @@ -670,6 +666,7 @@ struct cache_set { struct mutex verify_lock; #endif + uint8_t set_uuid[16]; unsigned int nr_uuids; struct uuid_entry *uuids; BKEY_PADDED(uuid_bucket); @@ -758,9 +755,8 @@ struct bbio { #define btree_default_blocks(c) \ ((unsigned int) ((PAGE_SECTORS * (c)->btree_pages) >> (c)->block_bits)) -#define bucket_pages(c) ((c)->sb.bucket_size / PAGE_SECTORS) -#define bucket_bytes(c) ((c)->sb.bucket_size << 9) -#define block_bytes(c) ((c)->sb.block_size << 9) +#define bucket_bytes(ca) ((ca)->sb.bucket_size << 9) +#define block_bytes(ca) ((ca)->sb.block_size << 9) static inline unsigned int meta_bucket_pages(struct cache_sb *sb) { @@ -801,14 +797,14 @@ static inline sector_t bucket_to_sector(struct cache_set *c, size_t b) static inline sector_t bucket_remainder(struct cache_set *c, sector_t s) { - return s & (c->sb.bucket_size - 1); + return s & (c->cache->sb.bucket_size - 1); } static inline struct cache *PTR_CACHE(struct cache_set *c, const struct bkey *k, unsigned int ptr) { - return c->cache[PTR_DEV(k, ptr)]; + return c->cache; } static inline size_t PTR_BUCKET_NR(struct cache_set *c, @@ -889,9 +885,6 @@ do { \ /* Looping macros */ -#define for_each_cache(ca, cs, iter) \ - for (iter = 0; ca = cs->cache[iter], iter < (cs)->sb.nr_in_set; iter++) - #define for_each_bucket(b, ca) \ for (b = (ca)->buckets + (ca)->sb.first_bucket; \ b < (ca)->buckets + (ca)->sb.nbuckets; b++) @@ -933,11 +926,9 @@ static inline uint8_t bucket_gc_gen(struct bucket *b) static inline void wake_up_allocators(struct cache_set *c) { - struct cache *ca; - unsigned int i; + struct cache *ca = c->cache; - for_each_cache(ca, c, i) - wake_up_process(ca->alloc_thread); + wake_up_process(ca->alloc_thread); } static inline void closure_bio_submit(struct cache_set *c, @@ -994,9 +985,9 @@ void bch_bucket_free(struct cache_set *c, struct bkey *k); long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait); int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, - struct bkey *k, int n, bool wait); + struct bkey *k, bool wait); int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, - struct bkey *k, int n, bool wait); + struct bkey *k, bool wait); bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned int sectors, unsigned int write_point, unsigned int write_prio, bool wait); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 3d8bd0692af3..910df242c83d 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -104,7 +104,7 @@ static inline struct bset *write_block(struct btree *b) { - return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c); + return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c->cache); } static void bch_btree_init_next(struct btree *b) @@ -117,7 +117,7 @@ static void bch_btree_init_next(struct btree *b) if (b->written < btree_blocks(b)) bch_bset_init_next(&b->keys, write_block(b), - bset_magic(&b->c->sb)); + bset_magic(&b->c->cache->sb)); } @@ -155,7 +155,7 @@ void bch_btree_node_read_done(struct btree *b) * See the comment arount cache_set->fill_iter. */ iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO); - iter->size = b->c->sb.bucket_size / b->c->sb.block_size; + iter->size = b->c->cache->sb.bucket_size / b->c->cache->sb.block_size; iter->used = 0; #ifdef CONFIG_BCACHE_DEBUG @@ -173,12 +173,12 @@ void bch_btree_node_read_done(struct btree *b) goto err; err = "bad btree header"; - if (b->written + set_blocks(i, block_bytes(b->c)) > + if (b->written + set_blocks(i, block_bytes(b->c->cache)) > btree_blocks(b)) goto err; err = "bad magic"; - if (i->magic != bset_magic(&b->c->sb)) + if (i->magic != bset_magic(&b->c->cache->sb)) goto err; err = "bad checksum"; @@ -199,13 +199,13 @@ void bch_btree_node_read_done(struct btree *b) bch_btree_iter_push(iter, i->start, bset_bkey_last(i)); - b->written += set_blocks(i, block_bytes(b->c)); + b->written += set_blocks(i, block_bytes(b->c->cache)); } err = "corrupted btree"; for (i = write_block(b); bset_sector_offset(&b->keys, i) < KEY_SIZE(&b->key); - i = ((void *) i) + block_bytes(b->c)) + i = ((void *) i) + block_bytes(b->c->cache)) if (i->seq == b->keys.set[0].data->seq) goto err; @@ -219,7 +219,7 @@ void bch_btree_node_read_done(struct btree *b) if (b->written < btree_blocks(b)) bch_bset_init_next(&b->keys, write_block(b), - bset_magic(&b->c->sb)); + bset_magic(&b->c->cache->sb)); out: mempool_free(iter, &b->c->fill_iter); return; @@ -347,7 +347,7 @@ static void do_btree_node_write(struct btree *b) b->bio->bi_end_io = btree_node_write_endio; b->bio->bi_private = cl; - b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c)); + b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c->cache)); b->bio->bi_opf = REQ_OP_WRITE | REQ_META | REQ_FUA; bch_bio_map(b->bio, i); @@ -423,10 +423,10 @@ void __bch_btree_node_write(struct btree *b, struct closure *parent) do_btree_node_write(b); - atomic_long_add(set_blocks(i, block_bytes(b->c)) * b->c->sb.block_size, + atomic_long_add(set_blocks(i, block_bytes(b->c->cache)) * b->c->cache->sb.block_size, &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); - b->written += set_blocks(i, block_bytes(b->c)); + b->written += set_blocks(i, block_bytes(b->c->cache)); } void bch_btree_node_write(struct btree *b, struct closure *parent) @@ -514,7 +514,7 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) * mca -> memory cache */ -#define mca_reserve(c) (((c->root && c->root->level) \ +#define mca_reserve(c) (((!IS_ERR_OR_NULL(c->root) && c->root->level) \ ? c->root->level : 1) * 8 + 16) #define mca_can_free(c) \ max_t(int, 0, c->btree_cache_used - mca_reserve(c)) @@ -738,7 +738,7 @@ void bch_btree_cache_free(struct cache_set *c) if (c->verify_data) list_move(&c->verify_data->list, &c->btree_cache); - free_pages((unsigned long) c->verify_ondisk, ilog2(meta_bucket_pages(&c->sb))); + free_pages((unsigned long) c->verify_ondisk, ilog2(meta_bucket_pages(&c->cache->sb))); #endif list_splice(&c->btree_cache_freeable, @@ -785,7 +785,8 @@ int bch_btree_cache_alloc(struct cache_set *c) mutex_init(&c->verify_lock); c->verify_ondisk = (void *) - __get_free_pages(GFP_KERNEL|__GFP_COMP, ilog2(meta_bucket_pages(&c->sb))); + __get_free_pages(GFP_KERNEL|__GFP_COMP, + ilog2(meta_bucket_pages(&c->cache->sb))); if (!c->verify_ondisk) { /* * Don't worry about the mca_rereserve buckets @@ -1091,7 +1092,7 @@ struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op, mutex_lock(&c->bucket_lock); retry: - if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait)) + if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, wait)) goto err; bkey_put(c, &k.key); @@ -1108,7 +1109,7 @@ retry: } b->parent = parent; - bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb)); + bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->cache->sb)); mutex_unlock(&c->bucket_lock); @@ -1167,19 +1168,18 @@ static void make_btree_freeing_key(struct btree *b, struct bkey *k) static int btree_check_reserve(struct btree *b, struct btree_op *op) { struct cache_set *c = b->c; - struct cache *ca; - unsigned int i, reserve = (c->root->level - b->level) * 2 + 1; + struct cache *ca = c->cache; + unsigned int reserve = (c->root->level - b->level) * 2 + 1; mutex_lock(&c->bucket_lock); - for_each_cache(ca, c, i) - if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) { - if (op) - prepare_to_wait(&c->btree_cache_wait, &op->wait, - TASK_UNINTERRUPTIBLE); - mutex_unlock(&c->bucket_lock); - return -EINTR; - } + if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) { + if (op) + prepare_to_wait(&c->btree_cache_wait, &op->wait, + TASK_UNINTERRUPTIBLE); + mutex_unlock(&c->bucket_lock); + return -EINTR; + } mutex_unlock(&c->bucket_lock); @@ -1345,7 +1345,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, if (nodes < 2 || __set_blocks(b->keys.set[0].data, keys, - block_bytes(b->c)) > blocks * (nodes - 1)) + block_bytes(b->c->cache)) > blocks * (nodes - 1)) return 0; for (i = 0; i < nodes; i++) { @@ -1379,7 +1379,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, k = bkey_next(k)) { if (__set_blocks(n1, n1->keys + keys + bkey_u64s(k), - block_bytes(b->c)) > blocks) + block_bytes(b->c->cache)) > blocks) break; last = k; @@ -1395,7 +1395,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, * though) */ if (__set_blocks(n1, n1->keys + n2->keys, - block_bytes(b->c)) > + block_bytes(b->c->cache)) > btree_blocks(new_nodes[i])) goto out_unlock_nocoalesce; @@ -1404,7 +1404,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, last = &r->b->key; } - BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c)) > + BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c->cache)) > btree_blocks(new_nodes[i])); if (last) @@ -1695,7 +1695,6 @@ static void btree_gc_start(struct cache_set *c) { struct cache *ca; struct bucket *b; - unsigned int i; if (!c->gc_mark_valid) return; @@ -1705,14 +1704,14 @@ static void btree_gc_start(struct cache_set *c) c->gc_mark_valid = 0; c->gc_done = ZERO_KEY; - for_each_cache(ca, c, i) - for_each_bucket(b, ca) { - b->last_gc = b->gen; - if (!atomic_read(&b->pin)) { - SET_GC_MARK(b, 0); - SET_GC_SECTORS_USED(b, 0); - } + ca = c->cache; + for_each_bucket(b, ca) { + b->last_gc = b->gen; + if (!atomic_read(&b->pin)) { + SET_GC_MARK(b, 0); + SET_GC_SECTORS_USED(b, 0); } + } mutex_unlock(&c->bucket_lock); } @@ -1721,7 +1720,8 @@ static void bch_btree_gc_finish(struct cache_set *c) { struct bucket *b; struct cache *ca; - unsigned int i; + unsigned int i, j; + uint64_t *k; mutex_lock(&c->bucket_lock); @@ -1739,7 +1739,6 @@ static void bch_btree_gc_finish(struct cache_set *c) struct bcache_device *d = c->devices[i]; struct cached_dev *dc; struct keybuf_key *w, *n; - unsigned int j; if (!d || UUID_FLASH_ONLY(&c->uuids[i])) continue; @@ -1756,29 +1755,27 @@ static void bch_btree_gc_finish(struct cache_set *c) rcu_read_unlock(); c->avail_nbuckets = 0; - for_each_cache(ca, c, i) { - uint64_t *i; - ca->invalidate_needs_gc = 0; + ca = c->cache; + ca->invalidate_needs_gc = 0; - for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++) - SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); + for (k = ca->sb.d; k < ca->sb.d + ca->sb.keys; k++) + SET_GC_MARK(ca->buckets + *k, GC_MARK_METADATA); - for (i = ca->prio_buckets; - i < ca->prio_buckets + prio_buckets(ca) * 2; i++) - SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); + for (k = ca->prio_buckets; + k < ca->prio_buckets + prio_buckets(ca) * 2; k++) + SET_GC_MARK(ca->buckets + *k, GC_MARK_METADATA); - for_each_bucket(b, ca) { - c->need_gc = max(c->need_gc, bucket_gc_gen(b)); + for_each_bucket(b, ca) { + c->need_gc = max(c->need_gc, bucket_gc_gen(b)); - if (atomic_read(&b->pin)) - continue; + if (atomic_read(&b->pin)) + continue; - BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b)); + BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b)); - if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE) - c->avail_nbuckets++; - } + if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE) + c->avail_nbuckets++; } mutex_unlock(&c->bucket_lock); @@ -1830,12 +1827,10 @@ static void bch_btree_gc(struct cache_set *c) static bool gc_should_run(struct cache_set *c) { - struct cache *ca; - unsigned int i; + struct cache *ca = c->cache; - for_each_cache(ca, c, i) - if (ca->invalidate_needs_gc) - return true; + if (ca->invalidate_needs_gc) + return true; if (atomic_read(&c->sectors_to_gc) < 0) return true; @@ -2081,9 +2076,8 @@ out: void bch_initial_gc_finish(struct cache_set *c) { - struct cache *ca; + struct cache *ca = c->cache; struct bucket *b; - unsigned int i; bch_btree_gc_finish(c); @@ -2098,20 +2092,18 @@ void bch_initial_gc_finish(struct cache_set *c) * This is only safe for buckets that have no live data in them, which * there should always be some of. */ - for_each_cache(ca, c, i) { - for_each_bucket(b, ca) { - if (fifo_full(&ca->free[RESERVE_PRIO]) && - fifo_full(&ca->free[RESERVE_BTREE])) - break; + for_each_bucket(b, ca) { + if (fifo_full(&ca->free[RESERVE_PRIO]) && + fifo_full(&ca->free[RESERVE_BTREE])) + break; - if (bch_can_invalidate_bucket(ca, b) && - !GC_MARK(b)) { - __bch_invalidate_one_bucket(ca, b); - if (!fifo_push(&ca->free[RESERVE_PRIO], - b - ca->buckets)) - fifo_push(&ca->free[RESERVE_BTREE], - b - ca->buckets); - } + if (bch_can_invalidate_bucket(ca, b) && + !GC_MARK(b)) { + __bch_invalidate_one_bucket(ca, b); + if (!fifo_push(&ca->free[RESERVE_PRIO], + b - ca->buckets)) + fifo_push(&ca->free[RESERVE_BTREE], + b - ca->buckets); } } @@ -2219,7 +2211,7 @@ static int btree_split(struct btree *b, struct btree_op *op, goto err; split = set_blocks(btree_bset_first(n1), - block_bytes(n1->c)) > (btree_blocks(b) * 4) / 5; + block_bytes(n1->c->cache)) > (btree_blocks(b) * 4) / 5; if (split) { unsigned int keys = 0; diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 257969980c49..50482107134f 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -194,7 +194,7 @@ static inline unsigned int bset_block_offset(struct btree *b, struct bset *i) static inline void set_gc_sectors(struct cache_set *c) { - atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16); + atomic_set(&c->sectors_to_gc, c->cache->sb.bucket_size * c->nbuckets / 16); } void bkey_put(struct cache_set *c, struct bkey *k); diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c index 0164a1fe94a9..d8d9394a6beb 100644 --- a/drivers/md/bcache/closure.c +++ b/drivers/md/bcache/closure.c @@ -159,7 +159,7 @@ void closure_debug_destroy(struct closure *cl) static struct dentry *closure_debug; -static int debug_seq_show(struct seq_file *f, void *data) +static int debug_show(struct seq_file *f, void *data) { struct closure *cl; @@ -188,17 +188,7 @@ static int debug_seq_show(struct seq_file *f, void *data) return 0; } -static int debug_seq_open(struct inode *inode, struct file *file) -{ - return single_open(file, debug_seq_show, NULL); -} - -static const struct file_operations debug_ops = { - .owner = THIS_MODULE, - .open = debug_seq_open, - .read = seq_read, - .release = single_release -}; +DEFINE_SHOW_ATTRIBUTE(debug); void __init closure_debug_init(void) { @@ -209,7 +199,7 @@ void __init closure_debug_init(void) * about this. */ closure_debug = debugfs_create_file( - "closures", 0400, bcache_debug, NULL, &debug_ops); + "closures", 0400, bcache_debug, NULL, &debug_fops); } #endif diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 336f43910383..b00fd08d696b 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -25,8 +25,8 @@ struct dentry *bcache_debug; for (i = (start); \ (void *) i < (void *) (start) + (KEY_SIZE(&b->key) << 9) &&\ i->seq == (start)->seq; \ - i = (void *) i + set_blocks(i, block_bytes(b->c)) * \ - block_bytes(b->c)) + i = (void *) i + set_blocks(i, block_bytes(b->c->cache)) * \ + block_bytes(b->c->cache)) void bch_btree_verify(struct btree *b) { @@ -82,14 +82,14 @@ void bch_btree_verify(struct btree *b) for_each_written_bset(b, ondisk, i) { unsigned int block = ((void *) i - (void *) ondisk) / - block_bytes(b->c); + block_bytes(b->c->cache); pr_err("*** on disk block %u:\n", block); bch_dump_bset(&b->keys, i, block); } pr_err("*** block %zu not written\n", - ((void *) i - (void *) ondisk) / block_bytes(b->c)); + ((void *) i - (void *) ondisk) / block_bytes(b->c->cache)); for (j = 0; j < inmemory->keys; j++) if (inmemory->d[j] != sorted->d[j]) @@ -238,7 +238,7 @@ void bch_debug_init_cache_set(struct cache_set *c) if (!IS_ERR_OR_NULL(bcache_debug)) { char name[50]; - snprintf(name, 50, "bcache-%pU", c->sb.set_uuid); + snprintf(name, 50, "bcache-%pU", c->set_uuid); c->debug = debugfs_create_file(name, 0400, bcache_debug, c, &cache_set_debug_ops); } diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c index 9162af5bb6ec..f4658a1f37b8 100644 --- a/drivers/md/bcache/extents.c +++ b/drivers/md/bcache/extents.c @@ -54,7 +54,7 @@ static bool __ptr_invalid(struct cache_set *c, const struct bkey *k) size_t bucket = PTR_BUCKET_NR(c, k, i); size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); - if (KEY_SIZE(k) + r > c->sb.bucket_size || + if (KEY_SIZE(k) + r > c->cache->sb.bucket_size || bucket < ca->sb.first_bucket || bucket >= ca->sb.nbuckets) return true; @@ -75,7 +75,7 @@ static const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) size_t bucket = PTR_BUCKET_NR(c, k, i); size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); - if (KEY_SIZE(k) + r > c->sb.bucket_size) + if (KEY_SIZE(k) + r > c->cache->sb.bucket_size) return "bad, length too big"; if (bucket < ca->sb.first_bucket) return "bad, short offset"; @@ -136,7 +136,7 @@ static void bch_bkey_dump(struct btree_keys *keys, const struct bkey *k) size_t n = PTR_BUCKET_NR(b->c, k, j); pr_cont(" bucket %zu", n); - if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets) + if (n >= b->c->cache->sb.first_bucket && n < b->c->cache->sb.nbuckets) pr_cont(" prio %i", PTR_BUCKET(b->c, k, j)->prio); } diff --git a/drivers/md/bcache/features.c b/drivers/md/bcache/features.c index 4442df48d28c..6469223f0b77 100644 --- a/drivers/md/bcache/features.c +++ b/drivers/md/bcache/features.c @@ -30,7 +30,7 @@ static struct feature feature_list[] = { for (f = &feature_list[0]; f->compat != 0; f++) { \ if (f->compat != BCH_FEATURE_ ## type) \ continue; \ - if (BCH_HAS_ ## type ## _FEATURE(&c->sb, f->mask)) { \ + if (BCH_HAS_ ## type ## _FEATURE(&c->cache->sb, f->mask)) { \ if (first) { \ out += snprintf(out, buf + size - out, \ "["); \ @@ -44,7 +44,7 @@ static struct feature feature_list[] = { \ out += snprintf(out, buf + size - out, "%s", f->string);\ \ - if (BCH_HAS_ ## type ## _FEATURE(&c->sb, f->mask)) \ + if (BCH_HAS_ ## type ## _FEATURE(&c->cache->sb, f->mask)) \ out += snprintf(out, buf + size - out, "]"); \ \ first = false; \ diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index a14a445618b4..dad71a6b7889 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -26,7 +26,7 @@ struct bio *bch_bbio_alloc(struct cache_set *c) struct bbio *b = mempool_alloc(&c->bio_meta, GFP_NOIO); struct bio *bio = &b->bio; - bio_init(bio, bio->bi_inline_vecs, meta_bucket_pages(&c->sb)); + bio_init(bio, bio->bi_inline_vecs, meta_bucket_pages(&c->cache->sb)); return bio; } diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index c1227bdb57e7..aefbdb7e003b 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -98,7 +98,7 @@ reread: left = ca->sb.bucket_size - offset; return ret; } - blocks = set_blocks(j, block_bytes(ca->set)); + blocks = set_blocks(j, block_bytes(ca)); /* * Nodes in 'list' are in linear increasing order of @@ -179,112 +179,109 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) ret; \ }) - struct cache *ca; - unsigned int iter; + struct cache *ca = c->cache; int ret = 0; + struct journal_device *ja = &ca->journal; + DECLARE_BITMAP(bitmap, SB_JOURNAL_BUCKETS); + unsigned int i, l, r, m; + uint64_t seq; - for_each_cache(ca, c, iter) { - struct journal_device *ja = &ca->journal; - DECLARE_BITMAP(bitmap, SB_JOURNAL_BUCKETS); - unsigned int i, l, r, m; - uint64_t seq; - - bitmap_zero(bitmap, SB_JOURNAL_BUCKETS); - pr_debug("%u journal buckets\n", ca->sb.njournal_buckets); + bitmap_zero(bitmap, SB_JOURNAL_BUCKETS); + pr_debug("%u journal buckets\n", ca->sb.njournal_buckets); + /* + * Read journal buckets ordered by golden ratio hash to quickly + * find a sequence of buckets with valid journal entries + */ + for (i = 0; i < ca->sb.njournal_buckets; i++) { /* - * Read journal buckets ordered by golden ratio hash to quickly - * find a sequence of buckets with valid journal entries + * We must try the index l with ZERO first for + * correctness due to the scenario that the journal + * bucket is circular buffer which might have wrapped */ - for (i = 0; i < ca->sb.njournal_buckets; i++) { - /* - * We must try the index l with ZERO first for - * correctness due to the scenario that the journal - * bucket is circular buffer which might have wrapped - */ - l = (i * 2654435769U) % ca->sb.njournal_buckets; + l = (i * 2654435769U) % ca->sb.njournal_buckets; - if (test_bit(l, bitmap)) - break; + if (test_bit(l, bitmap)) + break; - if (read_bucket(l)) - goto bsearch; - } + if (read_bucket(l)) + goto bsearch; + } - /* - * If that fails, check all the buckets we haven't checked - * already - */ - pr_debug("falling back to linear search\n"); + /* + * If that fails, check all the buckets we haven't checked + * already + */ + pr_debug("falling back to linear search\n"); - for_each_clear_bit(l, bitmap, ca->sb.njournal_buckets) - if (read_bucket(l)) - goto bsearch; + for_each_clear_bit(l, bitmap, ca->sb.njournal_buckets) + if (read_bucket(l)) + goto bsearch; - /* no journal entries on this device? */ - if (l == ca->sb.njournal_buckets) - continue; + /* no journal entries on this device? */ + if (l == ca->sb.njournal_buckets) + goto out; bsearch: - BUG_ON(list_empty(list)); + BUG_ON(list_empty(list)); - /* Binary search */ - m = l; - r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1); - pr_debug("starting binary search, l %u r %u\n", l, r); + /* Binary search */ + m = l; + r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1); + pr_debug("starting binary search, l %u r %u\n", l, r); - while (l + 1 < r) { - seq = list_entry(list->prev, struct journal_replay, - list)->j.seq; + while (l + 1 < r) { + seq = list_entry(list->prev, struct journal_replay, + list)->j.seq; - m = (l + r) >> 1; - read_bucket(m); + m = (l + r) >> 1; + read_bucket(m); - if (seq != list_entry(list->prev, struct journal_replay, - list)->j.seq) - l = m; - else - r = m; - } + if (seq != list_entry(list->prev, struct journal_replay, + list)->j.seq) + l = m; + else + r = m; + } - /* - * Read buckets in reverse order until we stop finding more - * journal entries - */ - pr_debug("finishing up: m %u njournal_buckets %u\n", - m, ca->sb.njournal_buckets); - l = m; + /* + * Read buckets in reverse order until we stop finding more + * journal entries + */ + pr_debug("finishing up: m %u njournal_buckets %u\n", + m, ca->sb.njournal_buckets); + l = m; - while (1) { - if (!l--) - l = ca->sb.njournal_buckets - 1; + while (1) { + if (!l--) + l = ca->sb.njournal_buckets - 1; - if (l == m) - break; + if (l == m) + break; - if (test_bit(l, bitmap)) - continue; + if (test_bit(l, bitmap)) + continue; - if (!read_bucket(l)) - break; - } + if (!read_bucket(l)) + break; + } - seq = 0; + seq = 0; - for (i = 0; i < ca->sb.njournal_buckets; i++) - if (ja->seq[i] > seq) { - seq = ja->seq[i]; - /* - * When journal_reclaim() goes to allocate for - * the first time, it'll use the bucket after - * ja->cur_idx - */ - ja->cur_idx = i; - ja->last_idx = ja->discard_idx = (i + 1) % - ca->sb.njournal_buckets; + for (i = 0; i < ca->sb.njournal_buckets; i++) + if (ja->seq[i] > seq) { + seq = ja->seq[i]; + /* + * When journal_reclaim() goes to allocate for + * the first time, it'll use the bucket after + * ja->cur_idx + */ + ja->cur_idx = i; + ja->last_idx = ja->discard_idx = (i + 1) % + ca->sb.njournal_buckets; - } - } + } +out: if (!list_empty(list)) c->journal.seq = list_entry(list->prev, struct journal_replay, @@ -342,12 +339,10 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list) static bool is_discard_enabled(struct cache_set *s) { - struct cache *ca; - unsigned int i; + struct cache *ca = s->cache; - for_each_cache(ca, s, i) - if (ca->discard) - return true; + if (ca->discard) + return true; return false; } @@ -633,9 +628,10 @@ static void do_journal_discard(struct cache *ca) static void journal_reclaim(struct cache_set *c) { struct bkey *k = &c->journal.key; - struct cache *ca; + struct cache *ca = c->cache; uint64_t last_seq; - unsigned int iter, n = 0; + unsigned int next; + struct journal_device *ja = &ca->journal; atomic_t p __maybe_unused; atomic_long_inc(&c->reclaim); @@ -647,46 +643,31 @@ static void journal_reclaim(struct cache_set *c) /* Update last_idx */ - for_each_cache(ca, c, iter) { - struct journal_device *ja = &ca->journal; - - while (ja->last_idx != ja->cur_idx && - ja->seq[ja->last_idx] < last_seq) - ja->last_idx = (ja->last_idx + 1) % - ca->sb.njournal_buckets; - } + while (ja->last_idx != ja->cur_idx && + ja->seq[ja->last_idx] < last_seq) + ja->last_idx = (ja->last_idx + 1) % + ca->sb.njournal_buckets; - for_each_cache(ca, c, iter) - do_journal_discard(ca); + do_journal_discard(ca); if (c->journal.blocks_free) goto out; - /* - * Allocate: - * XXX: Sort by free journal space - */ - - for_each_cache(ca, c, iter) { - struct journal_device *ja = &ca->journal; - unsigned int next = (ja->cur_idx + 1) % ca->sb.njournal_buckets; + next = (ja->cur_idx + 1) % ca->sb.njournal_buckets; + /* No space available on this device */ + if (next == ja->discard_idx) + goto out; - /* No space available on this device */ - if (next == ja->discard_idx) - continue; + ja->cur_idx = next; + k->ptr[0] = MAKE_PTR(0, + bucket_to_sector(c, ca->sb.d[ja->cur_idx]), + ca->sb.nr_this_dev); + atomic_long_inc(&c->reclaimed_journal_buckets); - ja->cur_idx = next; - k->ptr[n++] = MAKE_PTR(0, - bucket_to_sector(c, ca->sb.d[ja->cur_idx]), - ca->sb.nr_this_dev); - atomic_long_inc(&c->reclaimed_journal_buckets); - } + bkey_init(k); + SET_KEY_PTRS(k, 1); + c->journal.blocks_free = ca->sb.bucket_size >> c->block_bits; - if (n) { - bkey_init(k); - SET_KEY_PTRS(k, n); - c->journal.blocks_free = c->sb.bucket_size >> c->block_bits; - } out: if (!journal_full(&c->journal)) __closure_wake_up(&c->journal.wait); @@ -750,11 +731,11 @@ static void journal_write_unlocked(struct closure *cl) __releases(c->journal.lock) { struct cache_set *c = container_of(cl, struct cache_set, journal.io); - struct cache *ca; + struct cache *ca = c->cache; struct journal_write *w = c->journal.cur; struct bkey *k = &c->journal.key; - unsigned int i, sectors = set_blocks(w->data, block_bytes(c)) * - c->sb.block_size; + unsigned int i, sectors = set_blocks(w->data, block_bytes(ca)) * + ca->sb.block_size; struct bio *bio; struct bio_list list; @@ -773,17 +754,15 @@ static void journal_write_unlocked(struct closure *cl) return; } - c->journal.blocks_free -= set_blocks(w->data, block_bytes(c)); + c->journal.blocks_free -= set_blocks(w->data, block_bytes(ca)); w->data->btree_level = c->root->level; bkey_copy(&w->data->btree_root, &c->root->key); bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket); - for_each_cache(ca, c, i) - w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0]; - - w->data->magic = jset_magic(&c->sb); + w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0]; + w->data->magic = jset_magic(&ca->sb); w->data->version = BCACHE_JSET_VERSION; w->data->last_seq = last_seq(&c->journal); w->data->csum = csum_set(w->data); @@ -859,6 +838,7 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c, size_t sectors; struct closure cl; bool wait = false; + struct cache *ca = c->cache; closure_init_stack(&cl); @@ -868,10 +848,10 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c, struct journal_write *w = c->journal.cur; sectors = __set_blocks(w->data, w->data->keys + nkeys, - block_bytes(c)) * c->sb.block_size; + block_bytes(ca)) * ca->sb.block_size; if (sectors <= min_t(size_t, - c->journal.blocks_free * c->sb.block_size, + c->journal.blocks_free * ca->sb.block_size, PAGE_SECTORS << JSET_BITS)) return w; @@ -936,7 +916,7 @@ atomic_t *bch_journal(struct cache_set *c, if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags))) return NULL; - if (!CACHE_SYNC(&c->sb)) + if (!CACHE_SYNC(&c->cache->sb)) return NULL; w = journal_wait_for_write(c, bch_keylist_nkeys(keys)); diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 5872d6470470..b9c3d27ec093 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -196,50 +196,48 @@ static unsigned int bucket_heap_top(struct cache *ca) void bch_moving_gc(struct cache_set *c) { - struct cache *ca; + struct cache *ca = c->cache; struct bucket *b; - unsigned int i; + unsigned long sectors_to_move, reserve_sectors; if (!c->copy_gc_enabled) return; mutex_lock(&c->bucket_lock); - for_each_cache(ca, c, i) { - unsigned long sectors_to_move = 0; - unsigned long reserve_sectors = ca->sb.bucket_size * + sectors_to_move = 0; + reserve_sectors = ca->sb.bucket_size * fifo_used(&ca->free[RESERVE_MOVINGGC]); - ca->heap.used = 0; - - for_each_bucket(b, ca) { - if (GC_MARK(b) == GC_MARK_METADATA || - !GC_SECTORS_USED(b) || - GC_SECTORS_USED(b) == ca->sb.bucket_size || - atomic_read(&b->pin)) - continue; - - if (!heap_full(&ca->heap)) { - sectors_to_move += GC_SECTORS_USED(b); - heap_add(&ca->heap, b, bucket_cmp); - } else if (bucket_cmp(b, heap_peek(&ca->heap))) { - sectors_to_move -= bucket_heap_top(ca); - sectors_to_move += GC_SECTORS_USED(b); - - ca->heap.data[0] = b; - heap_sift(&ca->heap, 0, bucket_cmp); - } - } + ca->heap.used = 0; + + for_each_bucket(b, ca) { + if (GC_MARK(b) == GC_MARK_METADATA || + !GC_SECTORS_USED(b) || + GC_SECTORS_USED(b) == ca->sb.bucket_size || + atomic_read(&b->pin)) + continue; - while (sectors_to_move > reserve_sectors) { - heap_pop(&ca->heap, b, bucket_cmp); - sectors_to_move -= GC_SECTORS_USED(b); + if (!heap_full(&ca->heap)) { + sectors_to_move += GC_SECTORS_USED(b); + heap_add(&ca->heap, b, bucket_cmp); + } else if (bucket_cmp(b, heap_peek(&ca->heap))) { + sectors_to_move -= bucket_heap_top(ca); + sectors_to_move += GC_SECTORS_USED(b); + + ca->heap.data[0] = b; + heap_sift(&ca->heap, 0, bucket_cmp); } + } - while (heap_pop(&ca->heap, b, bucket_cmp)) - SET_GC_MOVE(b, 1); + while (sectors_to_move > reserve_sectors) { + heap_pop(&ca->heap, b, bucket_cmp); + sectors_to_move -= GC_SECTORS_USED(b); } + while (heap_pop(&ca->heap, b, bucket_cmp)) + SET_GC_MOVE(b, 1); + mutex_unlock(&c->bucket_lock); c->moving_gc_keys.last_scanned = ZERO_KEY; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 7f54ae223644..214326383145 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -99,7 +99,7 @@ static int bch_keylist_realloc(struct keylist *l, unsigned int u64s, * bch_data_insert_keys() will insert the keys created so far * and finish the rest when the keylist is empty. */ - if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset)) + if (newsize * sizeof(uint64_t) > block_bytes(c->cache) - sizeof(struct jset)) return -ENOMEM; return __bch_keylist_realloc(l, u64s); @@ -394,8 +394,8 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) goto skip; } - if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) || - bio_sectors(bio) & (c->sb.block_size - 1)) { + if (bio->bi_iter.bi_sector & (c->cache->sb.block_size - 1) || + bio_sectors(bio) & (c->cache->sb.block_size - 1)) { pr_debug("skipping unaligned io\n"); goto skip; } diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 6bfa77167362..46a00134a36a 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -343,34 +343,25 @@ static void bcache_write_super_unlock(struct closure *cl) void bcache_write_super(struct cache_set *c) { struct closure *cl = &c->sb_write; - struct cache *ca; - unsigned int i, version = BCACHE_SB_VERSION_CDEV_WITH_UUID; + struct cache *ca = c->cache; + struct bio *bio = &ca->sb_bio; + unsigned int version = BCACHE_SB_VERSION_CDEV_WITH_UUID; down(&c->sb_write_mutex); closure_init(cl, &c->cl); - c->sb.seq++; - - if (c->sb.version > version) - version = c->sb.version; - - for_each_cache(ca, c, i) { - struct bio *bio = &ca->sb_bio; - - ca->sb.version = version; - ca->sb.seq = c->sb.seq; - ca->sb.last_mount = c->sb.last_mount; + ca->sb.seq++; - SET_CACHE_SYNC(&ca->sb, CACHE_SYNC(&c->sb)); + if (ca->sb.version < version) + ca->sb.version = version; - bio_init(bio, ca->sb_bv, 1); - bio_set_dev(bio, ca->bdev); - bio->bi_end_io = write_super_endio; - bio->bi_private = ca; + bio_init(bio, ca->sb_bv, 1); + bio_set_dev(bio, ca->bdev); + bio->bi_end_io = write_super_endio; + bio->bi_private = ca; - closure_get(cl); - __write_super(&ca->sb, ca->sb_disk, bio); - } + closure_get(cl); + __write_super(&ca->sb, ca->sb_disk, bio); closure_return_with_destructor(cl, bcache_write_super_unlock); } @@ -480,22 +471,21 @@ static int __uuid_write(struct cache_set *c) { BKEY_PADDED(key) k; struct closure cl; - struct cache *ca; + struct cache *ca = c->cache; unsigned int size; closure_init_stack(&cl); lockdep_assert_held(&bch_register_lock); - if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true)) + if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, true)) return 1; - size = meta_bucket_pages(&c->sb) * PAGE_SECTORS; + size = meta_bucket_pages(&ca->sb) * PAGE_SECTORS; SET_KEY_SIZE(&k.key, size); uuid_io(c, REQ_OP_WRITE, 0, &k.key, &cl); closure_sync(&cl); /* Only one bucket used for uuid write */ - ca = PTR_CACHE(c, &k.key, 0); atomic_long_add(ca->sb.bucket_size, &ca->meta_sectors_written); bkey_copy(&c->uuid_bucket, &k.key); @@ -772,26 +762,22 @@ static void bcache_device_unlink(struct bcache_device *d) lockdep_assert_held(&bch_register_lock); if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) { - unsigned int i; - struct cache *ca; + struct cache *ca = d->c->cache; sysfs_remove_link(&d->c->kobj, d->name); sysfs_remove_link(&d->kobj, "cache"); - for_each_cache(ca, d->c, i) - bd_unlink_disk_holder(ca->bdev, d->disk); + bd_unlink_disk_holder(ca->bdev, d->disk); } } static void bcache_device_link(struct bcache_device *d, struct cache_set *c, const char *name) { - unsigned int i; - struct cache *ca; + struct cache *ca = c->cache; int ret; - for_each_cache(ca, d->c, i) - bd_link_disk_holder(ca->bdev, d->disk); + bd_link_disk_holder(ca->bdev, d->disk); snprintf(d->name, BCACHEDEVNAME_SIZE, "%s%u", name, d->id); @@ -1196,8 +1182,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, struct cached_dev *exist_dc, *t; int ret = 0; - if ((set_uuid && memcmp(set_uuid, c->sb.set_uuid, 16)) || - (!set_uuid && memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16))) + if ((set_uuid && memcmp(set_uuid, c->set_uuid, 16)) || + (!set_uuid && memcmp(dc->sb.set_uuid, c->set_uuid, 16))) return -ENOENT; if (dc->disk.c) { @@ -1212,7 +1198,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, return -EINVAL; } - if (dc->sb.block_size < c->sb.block_size) { + if (dc->sb.block_size < c->cache->sb.block_size) { /* Will die */ pr_err("Couldn't attach %s: block size less than set's block size\n", dc->backing_dev_name); @@ -1269,7 +1255,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, u->first_reg = u->last_reg = rtime; bch_uuid_write(c); - memcpy(dc->sb.set_uuid, c->sb.set_uuid, 16); + memcpy(dc->sb.set_uuid, c->set_uuid, 16); SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); bch_write_bdev_super(dc, &cl); @@ -1331,7 +1317,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, pr_info("Caching %s as %s on set %pU\n", dc->backing_dev_name, dc->disk.disk->disk_name, - dc->disk.c->sb.set_uuid); + dc->disk.c->set_uuid); return 0; } @@ -1534,7 +1520,7 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u) kobject_init(&d->kobj, &bch_flash_dev_ktype); - if (bcache_device_init(d, block_bytes(c), u->sectors, + if (bcache_device_init(d, block_bytes(c->cache), u->sectors, NULL, &bcache_flash_ops)) goto err; @@ -1638,7 +1624,7 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...) vaf.va = &args; pr_err("error on %pU: %pV, disabling caching\n", - c->sb.set_uuid, &vaf); + c->set_uuid, &vaf); va_end(args); @@ -1662,7 +1648,6 @@ static void cache_set_free(struct closure *cl) { struct cache_set *c = container_of(cl, struct cache_set, cl); struct cache *ca; - unsigned int i; debugfs_remove(c->debug); @@ -1671,15 +1656,16 @@ static void cache_set_free(struct closure *cl) bch_journal_free(c); mutex_lock(&bch_register_lock); - for_each_cache(ca, c, i) - if (ca) { - ca->set = NULL; - c->cache[ca->sb.nr_this_dev] = NULL; - kobject_put(&ca->kobj); - } - bch_bset_sort_state_free(&c->sort); - free_pages((unsigned long) c->uuids, ilog2(meta_bucket_pages(&c->sb))); + free_pages((unsigned long) c->uuids, ilog2(meta_bucket_pages(&c->cache->sb))); + + ca = c->cache; + if (ca) { + ca->set = NULL; + c->cache = NULL; + kobject_put(&ca->kobj); + } + if (c->moving_gc_wq) destroy_workqueue(c->moving_gc_wq); @@ -1692,7 +1678,7 @@ static void cache_set_free(struct closure *cl) list_del(&c->list); mutex_unlock(&bch_register_lock); - pr_info("Cache set %pU unregistered\n", c->sb.set_uuid); + pr_info("Cache set %pU unregistered\n", c->set_uuid); wake_up(&unregister_wait); closure_debug_destroy(&c->cl); @@ -1702,9 +1688,8 @@ static void cache_set_free(struct closure *cl) static void cache_set_flush(struct closure *cl) { struct cache_set *c = container_of(cl, struct cache_set, caching); - struct cache *ca; + struct cache *ca = c->cache; struct btree *b; - unsigned int i; bch_cache_accounting_destroy(&c->accounting); @@ -1729,9 +1714,8 @@ static void cache_set_flush(struct closure *cl) mutex_unlock(&b->write_lock); } - for_each_cache(ca, c, i) - if (ca->alloc_thread) - kthread_stop(ca->alloc_thread); + if (ca->alloc_thread) + kthread_stop(ca->alloc_thread); if (c->journal.cur) { cancel_delayed_work_sync(&c->journal.work); @@ -1764,7 +1748,7 @@ static void conditional_stop_bcache_device(struct cache_set *c, { if (dc->stop_when_cache_set_failed == BCH_CACHED_DEV_STOP_ALWAYS) { pr_warn("stop_when_cache_set_failed of %s is \"always\", stop it for failed cache set %pU.\n", - d->disk->disk_name, c->sb.set_uuid); + d->disk->disk_name, c->set_uuid); bcache_device_stop(d); } else if (atomic_read(&dc->has_dirty)) { /* @@ -1841,15 +1825,13 @@ void bch_cache_set_unregister(struct cache_set *c) bch_cache_set_stop(c); } -#define alloc_bucket_pages(gfp, c) \ - ((void *) __get_free_pages(__GFP_ZERO|__GFP_COMP|gfp, ilog2(bucket_pages(c)))) - #define alloc_meta_bucket_pages(gfp, sb) \ ((void *) __get_free_pages(__GFP_ZERO|__GFP_COMP|gfp, ilog2(meta_bucket_pages(sb)))) struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) { int iter_size; + struct cache *ca = container_of(sb, struct cache, sb); struct cache_set *c = kzalloc(sizeof(struct cache_set), GFP_KERNEL); if (!c) @@ -1871,24 +1853,16 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) bch_cache_accounting_init(&c->accounting, &c->cl); - memcpy(c->sb.set_uuid, sb->set_uuid, 16); - c->sb.block_size = sb->block_size; - c->sb.bucket_size = sb->bucket_size; - c->sb.nr_in_set = sb->nr_in_set; - c->sb.last_mount = sb->last_mount; - c->sb.version = sb->version; - if (c->sb.version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES) { - c->sb.feature_compat = sb->feature_compat; - c->sb.feature_ro_compat = sb->feature_ro_compat; - c->sb.feature_incompat = sb->feature_incompat; - } + memcpy(c->set_uuid, sb->set_uuid, 16); + c->cache = ca; + c->cache->set = c; c->bucket_bits = ilog2(sb->bucket_size); c->block_bits = ilog2(sb->block_size); - c->nr_uuids = meta_bucket_bytes(&c->sb) / sizeof(struct uuid_entry); + c->nr_uuids = meta_bucket_bytes(sb) / sizeof(struct uuid_entry); c->devices_max_used = 0; atomic_set(&c->attached_dev_nr, 0); - c->btree_pages = meta_bucket_pages(&c->sb); + c->btree_pages = meta_bucket_pages(sb); if (c->btree_pages > BTREE_MAX_PAGES) c->btree_pages = max_t(int, c->btree_pages / 4, BTREE_MAX_PAGES); @@ -1926,7 +1900,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) if (mempool_init_kmalloc_pool(&c->bio_meta, 2, sizeof(struct bbio) + - sizeof(struct bio_vec) * meta_bucket_pages(&c->sb))) + sizeof(struct bio_vec) * meta_bucket_pages(sb))) goto err; if (mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size)) @@ -1936,7 +1910,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER)) goto err; - c->uuids = alloc_meta_bucket_pages(GFP_KERNEL, &c->sb); + c->uuids = alloc_meta_bucket_pages(GFP_KERNEL, sb); if (!c->uuids) goto err; @@ -1972,19 +1946,17 @@ static int run_cache_set(struct cache_set *c) { const char *err = "cannot allocate memory"; struct cached_dev *dc, *t; - struct cache *ca; + struct cache *ca = c->cache; struct closure cl; - unsigned int i; LIST_HEAD(journal); struct journal_replay *l; closure_init_stack(&cl); - for_each_cache(ca, c, i) - c->nbuckets += ca->sb.nbuckets; + c->nbuckets = ca->sb.nbuckets; set_gc_sectors(c); - if (CACHE_SYNC(&c->sb)) { + if (CACHE_SYNC(&c->cache->sb)) { struct bkey *k; struct jset *j; @@ -2001,10 +1973,8 @@ static int run_cache_set(struct cache_set *c) j = &list_entry(journal.prev, struct journal_replay, list)->j; err = "IO error reading priorities"; - for_each_cache(ca, c, i) { - if (prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev])) - goto err; - } + if (prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev])) + goto err; /* * If prio_read() fails it'll call cache_set_error and we'll @@ -2048,9 +2018,8 @@ static int run_cache_set(struct cache_set *c) bch_journal_next(&c->journal); err = "error starting allocator thread"; - for_each_cache(ca, c, i) - if (bch_cache_allocator_start(ca)) - goto err; + if (bch_cache_allocator_start(ca)) + goto err; /* * First place it's safe to allocate: btree_check() and @@ -2069,28 +2038,23 @@ static int run_cache_set(struct cache_set *c) if (bch_journal_replay(c, &journal)) goto err; } else { - pr_notice("invalidating existing data\n"); + unsigned int j; - for_each_cache(ca, c, i) { - unsigned int j; + pr_notice("invalidating existing data\n"); + ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7, + 2, SB_JOURNAL_BUCKETS); - ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7, - 2, SB_JOURNAL_BUCKETS); - - for (j = 0; j < ca->sb.keys; j++) - ca->sb.d[j] = ca->sb.first_bucket + j; - } + for (j = 0; j < ca->sb.keys; j++) + ca->sb.d[j] = ca->sb.first_bucket + j; bch_initial_gc_finish(c); err = "error starting allocator thread"; - for_each_cache(ca, c, i) - if (bch_cache_allocator_start(ca)) - goto err; + if (bch_cache_allocator_start(ca)) + goto err; mutex_lock(&c->bucket_lock); - for_each_cache(ca, c, i) - bch_prio_write(ca, true); + bch_prio_write(ca, true); mutex_unlock(&c->bucket_lock); err = "cannot allocate new UUID bucket"; @@ -2115,7 +2079,7 @@ static int run_cache_set(struct cache_set *c) * everything is set up - fortunately journal entries won't be * written until the SET_CACHE_SYNC() here: */ - SET_CACHE_SYNC(&c->sb, true); + SET_CACHE_SYNC(&c->cache->sb, true); bch_journal_next(&c->journal); bch_journal_meta(c, &cl); @@ -2126,7 +2090,7 @@ static int run_cache_set(struct cache_set *c) goto err; closure_sync(&cl); - c->sb.last_mount = (u32)ktime_get_real_seconds(); + c->cache->sb.last_mount = (u32)ktime_get_real_seconds(); bcache_write_super(c); list_for_each_entry_safe(dc, t, &uncached_devices, list) @@ -2150,13 +2114,6 @@ err: return -EIO; } -static bool can_attach_cache(struct cache *ca, struct cache_set *c) -{ - return ca->sb.block_size == c->sb.block_size && - ca->sb.bucket_size == c->sb.bucket_size && - ca->sb.nr_in_set == c->sb.nr_in_set; -} - static const char *register_cache_set(struct cache *ca) { char buf[12]; @@ -2164,16 +2121,10 @@ static const char *register_cache_set(struct cache *ca) struct cache_set *c; list_for_each_entry(c, &bch_cache_sets, list) - if (!memcmp(c->sb.set_uuid, ca->sb.set_uuid, 16)) { - if (c->cache[ca->sb.nr_this_dev]) + if (!memcmp(c->set_uuid, ca->sb.set_uuid, 16)) { + if (c->cache) return "duplicate cache set member"; - if (!can_attach_cache(ca, c)) - return "cache sb does not match set"; - - if (!CACHE_SYNC(&ca->sb)) - SET_CACHE_SYNC(&c->sb, false); - goto found; } @@ -2182,7 +2133,7 @@ static const char *register_cache_set(struct cache *ca) return err; err = "error creating kobject"; - if (kobject_add(&c->kobj, bcache_kobj, "%pU", c->sb.set_uuid) || + if (kobject_add(&c->kobj, bcache_kobj, "%pU", c->set_uuid) || kobject_add(&c->internal, &c->kobj, "internal")) goto err; @@ -2198,31 +2149,13 @@ found: sysfs_create_link(&c->kobj, &ca->kobj, buf)) goto err; - /* - * A special case is both ca->sb.seq and c->sb.seq are 0, - * such condition happens on a new created cache device whose - * super block is never flushed yet. In this case c->sb.version - * and other members should be updated too, otherwise we will - * have a mistaken super block version in cache set. - */ - if (ca->sb.seq > c->sb.seq || c->sb.seq == 0) { - c->sb.version = ca->sb.version; - memcpy(c->sb.set_uuid, ca->sb.set_uuid, 16); - c->sb.flags = ca->sb.flags; - c->sb.seq = ca->sb.seq; - pr_debug("set version = %llu\n", c->sb.version); - } - kobject_get(&ca->kobj); ca->set = c; - ca->set->cache[ca->sb.nr_this_dev] = ca; - c->cache_by_alloc[c->caches_loaded++] = ca; + ca->set->cache = ca; - if (c->caches_loaded == c->sb.nr_in_set) { - err = "failed to run cache set"; - if (run_cache_set(c) < 0) - goto err; - } + err = "failed to run cache set"; + if (run_cache_set(c) < 0) + goto err; return NULL; err: @@ -2239,8 +2172,8 @@ void bch_cache_release(struct kobject *kobj) unsigned int i; if (ca->set) { - BUG_ON(ca->set->cache[ca->sb.nr_this_dev] != ca); - ca->set->cache[ca->sb.nr_this_dev] = NULL; + BUG_ON(ca->set->cache != ca); + ca->set->cache = NULL; } free_pages((unsigned long) ca->disk_buckets, ilog2(meta_bucket_pages(&ca->sb))); @@ -2448,7 +2381,6 @@ static ssize_t bch_pending_bdevs_cleanup(struct kobject *k, kobj_attribute_write(register, register_bcache); kobj_attribute_write(register_quiet, register_bcache); -kobj_attribute_write(register_async, register_bcache); kobj_attribute_write(pendings_cleanup, bch_pending_bdevs_cleanup); static bool bch_is_open_backing(struct block_device *bdev) @@ -2469,13 +2401,14 @@ static bool bch_is_open_backing(struct block_device *bdev) static bool bch_is_open_cache(struct block_device *bdev) { struct cache_set *c, *tc; - struct cache *ca; - unsigned int i; - list_for_each_entry_safe(c, tc, &bch_cache_sets, list) - for_each_cache(ca, c, i) - if (ca->bdev == bdev) - return true; + list_for_each_entry_safe(c, tc, &bch_cache_sets, list) { + struct cache *ca = c->cache; + + if (ca->bdev == bdev) + return true; + } + return false; } @@ -2571,6 +2504,11 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, struct cache_sb_disk *sb_disk; struct block_device *bdev; ssize_t ret; + bool async_registration = false; + +#ifdef CONFIG_BCACHE_ASYNC_REGISTRATION + async_registration = true; +#endif ret = -EBUSY; err = "failed to reference bcache module"; @@ -2624,7 +2562,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, goto out_blkdev_put; err = "failed to register device"; - if (attr == &ksysfs_register_async) { + + if (async_registration) { /* register in asynchronous way */ struct async_reg_args *args = kzalloc(sizeof(struct async_reg_args), GFP_KERNEL); @@ -2719,7 +2658,7 @@ static ssize_t bch_pending_bdevs_cleanup(struct kobject *k, list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) { list_for_each_entry_safe(c, tc, &bch_cache_sets, list) { char *pdev_set_uuid = pdev->dc->sb.set_uuid; - char *set_uuid = c->sb.uuid; + char *set_uuid = c->set_uuid; if (!memcmp(pdev_set_uuid, set_uuid, 16)) { list_del(&pdev->list); @@ -2887,9 +2826,6 @@ static int __init bcache_init(void) static const struct attribute *files[] = { &ksysfs_register.attr, &ksysfs_register_quiet.attr, -#ifdef CONFIG_BCACHE_ASYNC_REGISTRATION - &ksysfs_register_async.attr, -#endif &ksysfs_pendings_cleanup.attr, NULL }; diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index ac06c0bc3c0a..554e3afc9b68 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -711,10 +711,10 @@ SHOW(__bch_cache_set) { struct cache_set *c = container_of(kobj, struct cache_set, kobj); - sysfs_print(synchronous, CACHE_SYNC(&c->sb)); + sysfs_print(synchronous, CACHE_SYNC(&c->cache->sb)); sysfs_print(journal_delay_ms, c->journal_delay_ms); - sysfs_hprint(bucket_size, bucket_bytes(c)); - sysfs_hprint(block_size, block_bytes(c)); + sysfs_hprint(bucket_size, bucket_bytes(c->cache)); + sysfs_hprint(block_size, block_bytes(c->cache)); sysfs_print(tree_depth, c->root->level); sysfs_print(root_usage_percent, bch_root_usage(c)); @@ -812,8 +812,8 @@ STORE(__bch_cache_set) if (attr == &sysfs_synchronous) { bool sync = strtoul_or_return(buf); - if (sync != CACHE_SYNC(&c->sb)) { - SET_CACHE_SYNC(&c->sb, sync); + if (sync != CACHE_SYNC(&c->cache->sb)) { + SET_CACHE_SYNC(&c->cache->sb, sync); bcache_write_super(c); } } diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 4f4ad6b3d43a..3c74996978da 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -35,7 +35,7 @@ static uint64_t __calc_target_rate(struct cached_dev *dc) * This is the size of the cache, minus the amount used for * flash-only devices */ - uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size - + uint64_t cache_sectors = c->nbuckets * c->cache->sb.bucket_size - atomic_long_read(&c->flash_dev_dirty_sectors); /* diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index b10c51988c8e..200c5d0f08bf 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -357,11 +357,12 @@ static int read_page(struct file *file, unsigned long index, struct inode *inode = file_inode(file); struct buffer_head *bh; sector_t block, blk_cur; + unsigned long blocksize = i_blocksize(inode); pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE, (unsigned long long)index << PAGE_SHIFT); - bh = alloc_page_buffers(page, 1<<inode->i_blkbits, false); + bh = alloc_page_buffers(page, blocksize, false); if (!bh) { ret = -ENOMEM; goto out; @@ -383,10 +384,10 @@ static int read_page(struct file *file, unsigned long index, bh->b_blocknr = block; bh->b_bdev = inode->i_sb->s_bdev; - if (count < (1<<inode->i_blkbits)) + if (count < blocksize) count = 0; else - count -= (1<<inode->i_blkbits); + count -= blocksize; bh->b_end_io = end_bitmap_write; bh->b_private = bitmap; @@ -605,8 +606,8 @@ re_read: if (bitmap->cluster_slot >= 0) { sector_t bm_blocks = bitmap->mddev->resync_max_sectors; - sector_div(bm_blocks, - bitmap->mddev->bitmap_info.chunksize >> 9); + bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, + (bitmap->mddev->bitmap_info.chunksize >> 9)); /* bits to bytes */ bm_blocks = ((bm_blocks+7) >> 3) + sizeof(bitmap_super_t); /* to 4k blocks */ @@ -1367,7 +1368,7 @@ __acquires(bitmap->lock) if (bitmap->bp[page].hijacked || bitmap->bp[page].map == NULL) csize = ((sector_t)1) << (bitmap->chunkshift + - PAGE_COUNTER_SHIFT - 1); + PAGE_COUNTER_SHIFT); else csize = ((sector_t)1) << bitmap->chunkshift; *blocks = csize - (offset & (csize - 1)); @@ -1949,6 +1950,7 @@ out: } EXPORT_SYMBOL_GPL(md_bitmap_load); +/* caller need to free returned bitmap with md_bitmap_free() */ struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot) { int rv = 0; @@ -2012,6 +2014,7 @@ int md_bitmap_copy_from_slot(struct mddev *mddev, int slot, md_bitmap_unplug(mddev->bitmap); *low = lo; *high = hi; + md_bitmap_free(bitmap); return rv; } @@ -2615,4 +2618,3 @@ struct attribute_group md_bitmap_group = { .name = "bitmap", .attrs = md_bitmap_attrs, }; - diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 0580b51a156a..4aaf4820b6f6 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -1166,6 +1166,7 @@ static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsiz * can't resize bitmap */ goto out; + md_bitmap_free(bitmap); } return 0; diff --git a/drivers/md/md.c b/drivers/md/md.c index de8419b7ae98..98bac4f304ae 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8582,6 +8582,26 @@ void md_write_end(struct mddev *mddev) EXPORT_SYMBOL(md_write_end); +/* This is used by raid0 and raid10 */ +void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, + struct bio *bio, sector_t start, sector_t size) +{ + struct bio *discard_bio = NULL; + + if (__blkdev_issue_discard(rdev->bdev, start, size, + GFP_NOIO, 0, &discard_bio) || !discard_bio) + return; + + bio_chain(discard_bio, bio); + bio_clone_blkg_association(discard_bio, bio); + if (mddev->gendisk) + trace_block_bio_remap(bdev_get_queue(rdev->bdev), + discard_bio, disk_devt(mddev->gendisk), + bio->bi_iter.bi_sector); + submit_bio_noacct(discard_bio); +} +EXPORT_SYMBOL(md_submit_discard_bio); + /* md_allow_write(mddev) * Calling this ensures that the array is marked 'active' so that writes * may proceed without blocking. It is important to call this before @@ -9544,7 +9564,7 @@ static int __init md_init(void) goto err_misc_wq; md_rdev_misc_wq = alloc_workqueue("md_rdev_misc", 0, 0); - if (!md_misc_wq) + if (!md_rdev_misc_wq) goto err_rdev_misc_wq; if ((ret = register_blkdev(MD_MAJOR, "md")) < 0) diff --git a/drivers/md/md.h b/drivers/md/md.h index 2175a5ac4f7c..ccfb69868c2e 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -713,6 +713,8 @@ extern void md_write_end(struct mddev *mddev); extern void md_done_sync(struct mddev *mddev, int blocks, int ok); extern void md_error(struct mddev *mddev, struct md_rdev *rdev); extern void md_finish_reshape(struct mddev *mddev); +extern void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, + struct bio *bio, sector_t start, sector_t size); extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio); extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index aa2d72791768..6f44177593a5 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -426,23 +426,6 @@ static void raid0_free(struct mddev *mddev, void *priv) kfree(conf); } -/* - * Is io distribute over 1 or more chunks ? -*/ -static inline int is_io_in_chunk_boundary(struct mddev *mddev, - unsigned int chunk_sects, struct bio *bio) -{ - if (likely(is_power_of_2(chunk_sects))) { - return chunk_sects >= - ((bio->bi_iter.bi_sector & (chunk_sects-1)) - + bio_sectors(bio)); - } else{ - sector_t sector = bio->bi_iter.bi_sector; - return chunk_sects >= (sector_div(sector, chunk_sects) - + bio_sectors(bio)); - } -} - static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) { struct r0conf *conf = mddev->private; @@ -494,7 +477,6 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) for (disk = 0; disk < zone->nb_dev; disk++) { sector_t dev_start, dev_end; - struct bio *discard_bio = NULL; struct md_rdev *rdev; if (disk < start_disk_index) @@ -517,18 +499,9 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) rdev = conf->devlist[(zone - conf->strip_zone) * conf->strip_zone[0].nb_dev + disk]; - if (__blkdev_issue_discard(rdev->bdev, + md_submit_discard_bio(mddev, rdev, bio, dev_start + zone->dev_start + rdev->data_offset, - dev_end - dev_start, GFP_NOIO, 0, &discard_bio) || - !discard_bio) - continue; - bio_chain(discard_bio, bio); - bio_clone_blkg_association(discard_bio, bio); - if (mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(rdev->bdev), - discard_bio, disk_devt(mddev->gendisk), - bio->bi_iter.bi_sector); - submit_bio_noacct(discard_bio); + dev_end - dev_start); } bio_endio(bio); } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 5d1bdee313ec..b7bca6703df8 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -91,7 +91,7 @@ static inline struct r10bio *get_resync_r10bio(struct bio *bio) static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) { struct r10conf *conf = data; - int size = offsetof(struct r10bio, devs[conf->copies]); + int size = offsetof(struct r10bio, devs[conf->geo.raid_disks]); /* allocate a r10bio with room for raid_disks entries in the * bios array */ @@ -238,7 +238,7 @@ static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio) { int i; - for (i = 0; i < conf->copies; i++) { + for (i = 0; i < conf->geo.raid_disks; i++) { struct bio **bio = & r10_bio->devs[i].bio; if (!BIO_SPECIAL(*bio)) bio_put(*bio); @@ -327,7 +327,7 @@ static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio, int slot; int repl = 0; - for (slot = 0; slot < conf->copies; slot++) { + for (slot = 0; slot < conf->geo.raid_disks; slot++) { if (r10_bio->devs[slot].bio == bio) break; if (r10_bio->devs[slot].repl_bio == bio) { @@ -336,7 +336,6 @@ static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio, } } - BUG_ON(slot == conf->copies); update_head_pos(slot, r10_bio); if (slotp) @@ -1276,12 +1275,75 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, } } +static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio) +{ + int i; + struct r10conf *conf = mddev->private; + struct md_rdev *blocked_rdev; + +retry_wait: + blocked_rdev = NULL; + rcu_read_lock(); + for (i = 0; i < conf->copies; i++) { + struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); + struct md_rdev *rrdev = rcu_dereference( + conf->mirrors[i].replacement); + if (rdev == rrdev) + rrdev = NULL; + if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { + atomic_inc(&rdev->nr_pending); + blocked_rdev = rdev; + break; + } + if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) { + atomic_inc(&rrdev->nr_pending); + blocked_rdev = rrdev; + break; + } + + if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { + sector_t first_bad; + sector_t dev_sector = r10_bio->devs[i].addr; + int bad_sectors; + int is_bad; + + /* Discard request doesn't care the write result + * so it doesn't need to wait blocked disk here. + */ + if (!r10_bio->sectors) + continue; + + is_bad = is_badblock(rdev, dev_sector, r10_bio->sectors, + &first_bad, &bad_sectors); + if (is_bad < 0) { + /* Mustn't write here until the bad block + * is acknowledged + */ + atomic_inc(&rdev->nr_pending); + set_bit(BlockedBadBlocks, &rdev->flags); + blocked_rdev = rdev; + break; + } + } + } + rcu_read_unlock(); + + if (unlikely(blocked_rdev)) { + /* Have to wait for this device to get unblocked, then retry */ + allow_barrier(conf); + raid10_log(conf->mddev, "%s wait rdev %d blocked", + __func__, blocked_rdev->raid_disk); + md_wait_for_blocked_rdev(blocked_rdev, mddev); + wait_barrier(conf); + goto retry_wait; + } +} + static void raid10_write_request(struct mddev *mddev, struct bio *bio, struct r10bio *r10_bio) { struct r10conf *conf = mddev->private; int i; - struct md_rdev *blocked_rdev; sector_t sectors; int max_sectors; @@ -1339,8 +1401,9 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, r10_bio->read_slot = -1; /* make sure repl_bio gets freed */ raid10_find_phys(conf, r10_bio); -retry_write: - blocked_rdev = NULL; + + wait_blocked_dev(mddev, r10_bio); + rcu_read_lock(); max_sectors = r10_bio->sectors; @@ -1351,16 +1414,6 @@ retry_write: conf->mirrors[d].replacement); if (rdev == rrdev) rrdev = NULL; - if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { - atomic_inc(&rdev->nr_pending); - blocked_rdev = rdev; - break; - } - if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) { - atomic_inc(&rrdev->nr_pending); - blocked_rdev = rrdev; - break; - } if (rdev && (test_bit(Faulty, &rdev->flags))) rdev = NULL; if (rrdev && (test_bit(Faulty, &rrdev->flags))) @@ -1381,15 +1434,6 @@ retry_write: is_bad = is_badblock(rdev, dev_sector, max_sectors, &first_bad, &bad_sectors); - if (is_bad < 0) { - /* Mustn't write here until the bad block - * is acknowledged - */ - atomic_inc(&rdev->nr_pending); - set_bit(BlockedBadBlocks, &rdev->flags); - blocked_rdev = rdev; - break; - } if (is_bad && first_bad <= dev_sector) { /* Cannot write here at all */ bad_sectors -= (dev_sector - first_bad); @@ -1425,35 +1469,6 @@ retry_write: } rcu_read_unlock(); - if (unlikely(blocked_rdev)) { - /* Have to wait for this device to get unblocked, then retry */ - int j; - int d; - - for (j = 0; j < i; j++) { - if (r10_bio->devs[j].bio) { - d = r10_bio->devs[j].devnum; - rdev_dec_pending(conf->mirrors[d].rdev, mddev); - } - if (r10_bio->devs[j].repl_bio) { - struct md_rdev *rdev; - d = r10_bio->devs[j].devnum; - rdev = conf->mirrors[d].replacement; - if (!rdev) { - /* Race with remove_disk */ - smp_mb(); - rdev = conf->mirrors[d].rdev; - } - rdev_dec_pending(rdev, mddev); - } - } - allow_barrier(conf); - raid10_log(conf->mddev, "wait rdev %d blocked", blocked_rdev->raid_disk); - md_wait_for_blocked_rdev(blocked_rdev, mddev); - wait_barrier(conf); - goto retry_write; - } - if (max_sectors < r10_bio->sectors) r10_bio->sectors = max_sectors; @@ -1493,7 +1508,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors) r10_bio->mddev = mddev; r10_bio->sector = bio->bi_iter.bi_sector; r10_bio->state = 0; - memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->copies); + memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->geo.raid_disks); if (bio_data_dir(bio) == READ) raid10_read_request(mddev, bio, r10_bio); @@ -1501,6 +1516,296 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors) raid10_write_request(mddev, bio, r10_bio); } +static struct bio *raid10_split_bio(struct r10conf *conf, + struct bio *bio, sector_t sectors, bool want_first) +{ + struct bio *split; + + split = bio_split(bio, sectors, GFP_NOIO, &conf->bio_split); + bio_chain(split, bio); + allow_barrier(conf); + if (want_first) { + submit_bio_noacct(bio); + bio = split; + } else + submit_bio_noacct(split); + wait_barrier(conf); + + return bio; +} + +static void raid_end_discard_bio(struct r10bio *r10bio) +{ + struct r10conf *conf = r10bio->mddev->private; + struct r10bio *first_r10bio; + + while (atomic_dec_and_test(&r10bio->remaining)) { + + allow_barrier(conf); + + if (!test_bit(R10BIO_Discard, &r10bio->state)) { + first_r10bio = (struct r10bio *)r10bio->master_bio; + free_r10bio(r10bio); + r10bio = first_r10bio; + } else { + md_write_end(r10bio->mddev); + bio_endio(r10bio->master_bio); + free_r10bio(r10bio); + break; + } + } +} + +static void raid10_end_discard_request(struct bio *bio) +{ + struct r10bio *r10_bio = bio->bi_private; + struct r10conf *conf = r10_bio->mddev->private; + struct md_rdev *rdev = NULL; + int dev; + int slot, repl; + + /* + * We don't care the return value of discard bio + */ + if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) + set_bit(R10BIO_Uptodate, &r10_bio->state); + + dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl); + if (repl) + rdev = conf->mirrors[dev].replacement; + if (!rdev) { + /* raid10_remove_disk uses smp_mb to make sure rdev is set to + * replacement before setting replacement to NULL. It can read + * rdev first without barrier protect even replacment is NULL + */ + smp_rmb(); + rdev = conf->mirrors[dev].rdev; + } + + raid_end_discard_bio(r10_bio); + rdev_dec_pending(rdev, conf->mddev); +} + +/* There are some limitations to handle discard bio + * 1st, the discard size is bigger than stripe_size*2. + * 2st, if the discard bio spans reshape progress, we use the old way to + * handle discard bio + */ +static int raid10_handle_discard(struct mddev *mddev, struct bio *bio) +{ + struct r10conf *conf = mddev->private; + struct geom *geo = &conf->geo; + struct r10bio *r10_bio, *first_r10bio; + int far_copies = geo->far_copies; + bool first_copy = true; + + int disk; + sector_t chunk; + unsigned int stripe_size; + sector_t split_size; + + sector_t bio_start, bio_end; + sector_t first_stripe_index, last_stripe_index; + sector_t start_disk_offset; + unsigned int start_disk_index; + sector_t end_disk_offset; + unsigned int end_disk_index; + unsigned int remainder; + + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + return -EAGAIN; + + wait_barrier(conf); + + /* Check reshape again to avoid reshape happens after checking + * MD_RECOVERY_RESHAPE and before wait_barrier + */ + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + goto out; + + stripe_size = geo->raid_disks << geo->chunk_shift; + bio_start = bio->bi_iter.bi_sector; + bio_end = bio_end_sector(bio); + + /* Maybe one discard bio is smaller than strip size or across one stripe + * and discard region is larger than one stripe size. For far offset layout, + * if the discard region is not aligned with stripe size, there is hole + * when we submit discard bio to member disk. For simplicity, we only + * handle discard bio which discard region is bigger than stripe_size*2 + */ + if (bio_sectors(bio) < stripe_size*2) + goto out; + + /* For far and far offset layout, if bio is not aligned with stripe size, + * it splits the part that is not aligned with strip size. + */ + div_u64_rem(bio_start, stripe_size, &remainder); + if ((far_copies > 1) && remainder) { + split_size = stripe_size - remainder; + bio = raid10_split_bio(conf, bio, split_size, false); + } + div_u64_rem(bio_end, stripe_size, &remainder); + if ((far_copies > 1) && remainder) { + split_size = bio_sectors(bio) - remainder; + bio = raid10_split_bio(conf, bio, split_size, true); + } + + bio_start = bio->bi_iter.bi_sector; + bio_end = bio_end_sector(bio); + + /* raid10 uses chunk as the unit to store data. It's similar like raid0. + * One stripe contains the chunks from all member disk (one chunk from + * one disk at the same HBA address). For layout detail, see 'man md 4' + */ + chunk = bio_start >> geo->chunk_shift; + chunk *= geo->near_copies; + first_stripe_index = chunk; + start_disk_index = sector_div(first_stripe_index, geo->raid_disks); + if (geo->far_offset) + first_stripe_index *= geo->far_copies; + start_disk_offset = (bio_start & geo->chunk_mask) + + (first_stripe_index << geo->chunk_shift); + + chunk = bio_end >> geo->chunk_shift; + chunk *= geo->near_copies; + last_stripe_index = chunk; + end_disk_index = sector_div(last_stripe_index, geo->raid_disks); + if (geo->far_offset) + last_stripe_index *= geo->far_copies; + end_disk_offset = (bio_end & geo->chunk_mask) + + (last_stripe_index << geo->chunk_shift); + +retry_discard: + r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO); + r10_bio->mddev = mddev; + r10_bio->state = 0; + r10_bio->sectors = 0; + memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * geo->raid_disks); + wait_blocked_dev(mddev, r10_bio); + + /* For far layout it needs more than one r10bio to cover all regions. + * Inspired by raid10_sync_request, we can use the first r10bio->master_bio + * to record the discard bio. Other r10bio->master_bio record the first + * r10bio. The first r10bio only release after all other r10bios finish. + * The discard bio returns only first r10bio finishes + */ + if (first_copy) { + r10_bio->master_bio = bio; + set_bit(R10BIO_Discard, &r10_bio->state); + first_copy = false; + first_r10bio = r10_bio; + } else + r10_bio->master_bio = (struct bio *)first_r10bio; + + rcu_read_lock(); + for (disk = 0; disk < geo->raid_disks; disk++) { + struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev); + struct md_rdev *rrdev = rcu_dereference( + conf->mirrors[disk].replacement); + + r10_bio->devs[disk].bio = NULL; + r10_bio->devs[disk].repl_bio = NULL; + + if (rdev && (test_bit(Faulty, &rdev->flags))) + rdev = NULL; + if (rrdev && (test_bit(Faulty, &rrdev->flags))) + rrdev = NULL; + if (!rdev && !rrdev) + continue; + + if (rdev) { + r10_bio->devs[disk].bio = bio; + atomic_inc(&rdev->nr_pending); + } + if (rrdev) { + r10_bio->devs[disk].repl_bio = bio; + atomic_inc(&rrdev->nr_pending); + } + } + rcu_read_unlock(); + + atomic_set(&r10_bio->remaining, 1); + for (disk = 0; disk < geo->raid_disks; disk++) { + sector_t dev_start, dev_end; + struct bio *mbio, *rbio = NULL; + struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev); + struct md_rdev *rrdev = rcu_dereference( + conf->mirrors[disk].replacement); + + /* + * Now start to calculate the start and end address for each disk. + * The space between dev_start and dev_end is the discard region. + * + * For dev_start, it needs to consider three conditions: + * 1st, the disk is before start_disk, you can imagine the disk in + * the next stripe. So the dev_start is the start address of next + * stripe. + * 2st, the disk is after start_disk, it means the disk is at the + * same stripe of first disk + * 3st, the first disk itself, we can use start_disk_offset directly + */ + if (disk < start_disk_index) + dev_start = (first_stripe_index + 1) * mddev->chunk_sectors; + else if (disk > start_disk_index) + dev_start = first_stripe_index * mddev->chunk_sectors; + else + dev_start = start_disk_offset; + + if (disk < end_disk_index) + dev_end = (last_stripe_index + 1) * mddev->chunk_sectors; + else if (disk > end_disk_index) + dev_end = last_stripe_index * mddev->chunk_sectors; + else + dev_end = end_disk_offset; + + /* It only handles discard bio which size is >= stripe size, so + * dev_end > dev_start all the time + */ + if (r10_bio->devs[disk].bio) { + mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); + mbio->bi_end_io = raid10_end_discard_request; + mbio->bi_private = r10_bio; + r10_bio->devs[disk].bio = mbio; + r10_bio->devs[disk].devnum = disk; + atomic_inc(&r10_bio->remaining); + md_submit_discard_bio(mddev, rdev, mbio, + dev_start + choose_data_offset(r10_bio, rdev), + dev_end - dev_start); + bio_endio(mbio); + } + if (r10_bio->devs[disk].repl_bio) { + rbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); + rbio->bi_end_io = raid10_end_discard_request; + rbio->bi_private = r10_bio; + r10_bio->devs[disk].repl_bio = rbio; + r10_bio->devs[disk].devnum = disk; + atomic_inc(&r10_bio->remaining); + md_submit_discard_bio(mddev, rrdev, rbio, + dev_start + choose_data_offset(r10_bio, rrdev), + dev_end - dev_start); + bio_endio(rbio); + } + } + + if (!geo->far_offset && --far_copies) { + first_stripe_index += geo->stride >> geo->chunk_shift; + start_disk_offset += geo->stride; + last_stripe_index += geo->stride >> geo->chunk_shift; + end_disk_offset += geo->stride; + atomic_inc(&first_r10bio->remaining); + raid_end_discard_bio(r10_bio); + wait_barrier(conf); + goto retry_discard; + } + + raid_end_discard_bio(r10_bio); + + return 0; +out: + allow_barrier(conf); + return -EAGAIN; +} + static bool raid10_make_request(struct mddev *mddev, struct bio *bio) { struct r10conf *conf = mddev->private; @@ -1515,6 +1820,10 @@ static bool raid10_make_request(struct mddev *mddev, struct bio *bio) if (!md_write_start(mddev, bio)) return false; + if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) + if (!raid10_handle_discard(mddev, bio)) + return true; + /* * If this request crosses a chunk boundary, we need to split * it. @@ -3754,7 +4063,7 @@ static int raid10_run(struct mddev *mddev) if (mddev->queue) { blk_queue_max_discard_sectors(mddev->queue, - mddev->chunk_sectors); + UINT_MAX); blk_queue_max_write_same_sectors(mddev->queue, 0); blk_queue_max_write_zeroes_sectors(mddev->queue, 0); blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); @@ -4458,8 +4767,8 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, last = conf->reshape_progress - 1; sector_nr = last & ~(sector_t)(conf->geo.chunk_mask & conf->prev.chunk_mask); - if (sector_nr + RESYNC_BLOCK_SIZE/512 < last) - sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512; + if (sector_nr + RESYNC_SECTORS < last) + sector_nr = last + 1 - RESYNC_SECTORS; } else { /* 'next' is after the last device address that we * might write to for this chunk in the new layout @@ -4481,8 +4790,8 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, last = sector_nr | (conf->geo.chunk_mask & conf->prev.chunk_mask); - if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last) - last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1; + if (sector_nr + RESYNC_SECTORS <= last) + last = sector_nr + RESYNC_SECTORS - 1; } if (need_flush || diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 79cd2b7d3128..1461fd55311b 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -179,5 +179,6 @@ enum r10bio_state { R10BIO_Previous, /* failfast devices did receive failfast requests. */ R10BIO_FailFast, + R10BIO_Discard, }; #endif diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d589d26c86ea..39343479ac2a 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -448,13 +448,74 @@ out: return sh; } -static void shrink_buffers(struct stripe_head *sh) +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE +static void free_stripe_pages(struct stripe_head *sh) +{ + int i; + struct page *p; + + /* Have not allocate page pool */ + if (!sh->pages) + return; + + for (i = 0; i < sh->nr_pages; i++) { + p = sh->pages[i]; + if (p) + put_page(p); + sh->pages[i] = NULL; + } +} + +static int alloc_stripe_pages(struct stripe_head *sh, gfp_t gfp) { + int i; struct page *p; + + for (i = 0; i < sh->nr_pages; i++) { + /* The page have allocated. */ + if (sh->pages[i]) + continue; + + p = alloc_page(gfp); + if (!p) { + free_stripe_pages(sh); + return -ENOMEM; + } + sh->pages[i] = p; + } + return 0; +} + +static int +init_stripe_shared_pages(struct stripe_head *sh, struct r5conf *conf, int disks) +{ + int nr_pages, cnt; + + if (sh->pages) + return 0; + + /* Each of the sh->dev[i] need one conf->stripe_size */ + cnt = PAGE_SIZE / conf->stripe_size; + nr_pages = (disks + cnt - 1) / cnt; + + sh->pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); + if (!sh->pages) + return -ENOMEM; + sh->nr_pages = nr_pages; + sh->stripes_per_page = cnt; + return 0; +} +#endif + +static void shrink_buffers(struct stripe_head *sh) +{ int i; int num = sh->raid_conf->pool_size; +#if PAGE_SIZE == DEFAULT_STRIPE_SIZE for (i = 0; i < num ; i++) { + struct page *p; + WARN_ON(sh->dev[i].page != sh->dev[i].orig_page); p = sh->dev[i].page; if (!p) @@ -462,6 +523,11 @@ static void shrink_buffers(struct stripe_head *sh) sh->dev[i].page = NULL; put_page(p); } +#else + for (i = 0; i < num; i++) + sh->dev[i].page = NULL; + free_stripe_pages(sh); /* Free pages */ +#endif } static int grow_buffers(struct stripe_head *sh, gfp_t gfp) @@ -469,6 +535,7 @@ static int grow_buffers(struct stripe_head *sh, gfp_t gfp) int i; int num = sh->raid_conf->pool_size; +#if PAGE_SIZE == DEFAULT_STRIPE_SIZE for (i = 0; i < num; i++) { struct page *page; @@ -477,8 +544,18 @@ static int grow_buffers(struct stripe_head *sh, gfp_t gfp) } sh->dev[i].page = page; sh->dev[i].orig_page = page; + sh->dev[i].offset = 0; } +#else + if (alloc_stripe_pages(sh, gfp)) + return -ENOMEM; + for (i = 0; i < num; i++) { + sh->dev[i].page = raid5_get_dev_page(sh, i); + sh->dev[i].orig_page = sh->dev[i].page; + sh->dev[i].offset = raid5_get_page_offset(sh, i); + } +#endif return 0; } @@ -1130,7 +1207,7 @@ again: sh->dev[i].vec.bv_page = sh->dev[i].page; bi->bi_vcnt = 1; bi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf); - bi->bi_io_vec[0].bv_offset = 0; + bi->bi_io_vec[0].bv_offset = sh->dev[i].offset; bi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf); bi->bi_write_hint = sh->dev[i].write_hint; if (!rrdev) @@ -1184,7 +1261,7 @@ again: sh->dev[i].rvec.bv_page = sh->dev[i].page; rbi->bi_vcnt = 1; rbi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf); - rbi->bi_io_vec[0].bv_offset = 0; + rbi->bi_io_vec[0].bv_offset = sh->dev[i].offset; rbi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf); rbi->bi_write_hint = sh->dev[i].write_hint; sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET; @@ -1226,7 +1303,7 @@ again: static struct dma_async_tx_descriptor * async_copy_data(int frombio, struct bio *bio, struct page **page, - sector_t sector, struct dma_async_tx_descriptor *tx, + unsigned int poff, sector_t sector, struct dma_async_tx_descriptor *tx, struct stripe_head *sh, int no_skipcopy) { struct bio_vec bvl; @@ -1272,11 +1349,11 @@ async_copy_data(int frombio, struct bio *bio, struct page **page, !no_skipcopy) *page = bio_page; else - tx = async_memcpy(*page, bio_page, page_offset, + tx = async_memcpy(*page, bio_page, page_offset + poff, b_offset, clen, &submit); } else tx = async_memcpy(bio_page, *page, b_offset, - page_offset, clen, &submit); + page_offset + poff, clen, &submit); } /* chain the operations */ submit.depend_tx = tx; @@ -1349,6 +1426,7 @@ static void ops_run_biofill(struct stripe_head *sh) while (rbi && rbi->bi_iter.bi_sector < dev->sector + RAID5_STRIPE_SECTORS(conf)) { tx = async_copy_data(0, rbi, &dev->page, + dev->offset, dev->sector, tx, sh, 0); rbi = r5_next_bio(conf, rbi, dev->sector); } @@ -1404,14 +1482,25 @@ static addr_conv_t *to_addr_conv(struct stripe_head *sh, return (void *) (to_addr_page(percpu, i) + sh->disks + 2); } +/* + * Return a pointer to record offset address. + */ +static unsigned int * +to_addr_offs(struct stripe_head *sh, struct raid5_percpu *percpu) +{ + return (unsigned int *) (to_addr_conv(sh, percpu, 0) + sh->disks + 2); +} + static struct dma_async_tx_descriptor * ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) { int disks = sh->disks; struct page **xor_srcs = to_addr_page(percpu, 0); + unsigned int *off_srcs = to_addr_offs(sh, percpu); int target = sh->ops.target; struct r5dev *tgt = &sh->dev[target]; struct page *xor_dest = tgt->page; + unsigned int off_dest = tgt->offset; int count = 0; struct dma_async_tx_descriptor *tx; struct async_submit_ctl submit; @@ -1423,19 +1512,22 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) __func__, (unsigned long long)sh->sector, target); BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); - for (i = disks; i--; ) - if (i != target) + for (i = disks; i--; ) { + if (i != target) { + off_srcs[count] = sh->dev[i].offset; xor_srcs[count++] = sh->dev[i].page; + } + } atomic_inc(&sh->count); init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); if (unlikely(count == 1)) - tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, + tx = async_memcpy(xor_dest, xor_srcs[0], off_dest, off_srcs[0], RAID5_STRIPE_SIZE(sh->raid_conf), &submit); else - tx = async_xor(xor_dest, xor_srcs, 0, count, + tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count, RAID5_STRIPE_SIZE(sh->raid_conf), &submit); return tx; @@ -1443,6 +1535,7 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) /* set_syndrome_sources - populate source buffers for gen_syndrome * @srcs - (struct page *) array of size sh->disks + * @offs - (unsigned int) array of offset for each page * @sh - stripe_head to parse * * Populates srcs in proper layout order for the stripe and returns the @@ -1451,6 +1544,7 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) * is recorded in srcs[count+1]]. */ static int set_syndrome_sources(struct page **srcs, + unsigned int *offs, struct stripe_head *sh, int srctype) { @@ -1481,6 +1575,12 @@ static int set_syndrome_sources(struct page **srcs, srcs[slot] = sh->dev[i].orig_page; else srcs[slot] = sh->dev[i].page; + /* + * For R5_InJournal, PAGE_SIZE must be 4KB and will + * not shared page. In that case, dev[i].offset + * is 0. + */ + offs[slot] = sh->dev[i].offset; } i = raid6_next_disk(i, disks); } while (i != d0_idx); @@ -1493,12 +1593,14 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) { int disks = sh->disks; struct page **blocks = to_addr_page(percpu, 0); + unsigned int *offs = to_addr_offs(sh, percpu); int target; int qd_idx = sh->qd_idx; struct dma_async_tx_descriptor *tx; struct async_submit_ctl submit; struct r5dev *tgt; struct page *dest; + unsigned int dest_off; int i; int count; @@ -1517,17 +1619,18 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) tgt = &sh->dev[target]; BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); dest = tgt->page; + dest_off = tgt->offset; atomic_inc(&sh->count); if (target == qd_idx) { - count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL); + count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_ALL); blocks[count] = NULL; /* regenerating p is not necessary */ BUG_ON(blocks[count+1] != dest); /* q should already be set */ init_async_submit(&submit, ASYNC_TX_FENCE, NULL, ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); - tx = async_gen_syndrome(blocks, 0, count+2, + tx = async_gen_syndrome(blocks, offs, count+2, RAID5_STRIPE_SIZE(sh->raid_conf), &submit); } else { /* Compute any data- or p-drive using XOR */ @@ -1535,13 +1638,14 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) for (i = disks; i-- ; ) { if (i == target || i == qd_idx) continue; + offs[count] = sh->dev[i].offset; blocks[count++] = sh->dev[i].page; } init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); - tx = async_xor(dest, blocks, 0, count, + tx = async_xor_offs(dest, dest_off, blocks, offs, count, RAID5_STRIPE_SIZE(sh->raid_conf), &submit); } @@ -1561,6 +1665,7 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) struct r5dev *tgt2 = &sh->dev[target2]; struct dma_async_tx_descriptor *tx; struct page **blocks = to_addr_page(percpu, 0); + unsigned int *offs = to_addr_offs(sh, percpu); struct async_submit_ctl submit; BUG_ON(sh->batch_head); @@ -1573,13 +1678,16 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) /* we need to open-code set_syndrome_sources to handle the * slot number conversion for 'faila' and 'failb' */ - for (i = 0; i < disks ; i++) + for (i = 0; i < disks ; i++) { + offs[i] = 0; blocks[i] = NULL; + } count = 0; i = d0_idx; do { int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); + offs[slot] = sh->dev[i].offset; blocks[slot] = sh->dev[i].page; if (i == target) @@ -1604,11 +1712,12 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) init_async_submit(&submit, ASYNC_TX_FENCE, NULL, ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); - return async_gen_syndrome(blocks, 0, syndrome_disks+2, + return async_gen_syndrome(blocks, offs, syndrome_disks+2, RAID5_STRIPE_SIZE(sh->raid_conf), &submit); } else { struct page *dest; + unsigned int dest_off; int data_target; int qd_idx = sh->qd_idx; @@ -1622,22 +1731,24 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) for (i = disks; i-- ; ) { if (i == data_target || i == qd_idx) continue; + offs[count] = sh->dev[i].offset; blocks[count++] = sh->dev[i].page; } dest = sh->dev[data_target].page; + dest_off = sh->dev[data_target].offset; init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, NULL, NULL, to_addr_conv(sh, percpu, 0)); - tx = async_xor(dest, blocks, 0, count, + tx = async_xor_offs(dest, dest_off, blocks, offs, count, RAID5_STRIPE_SIZE(sh->raid_conf), &submit); - count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL); + count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_ALL); init_async_submit(&submit, ASYNC_TX_FENCE, tx, ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); - return async_gen_syndrome(blocks, 0, count+2, + return async_gen_syndrome(blocks, offs, count+2, RAID5_STRIPE_SIZE(sh->raid_conf), &submit); } @@ -1650,13 +1761,13 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) return async_raid6_datap_recov(syndrome_disks+2, RAID5_STRIPE_SIZE(sh->raid_conf), faila, - blocks, &submit); + blocks, offs, &submit); } else { /* We're missing D+D. */ return async_raid6_2data_recov(syndrome_disks+2, RAID5_STRIPE_SIZE(sh->raid_conf), faila, failb, - blocks, &submit); + blocks, offs, &submit); } } } @@ -1682,10 +1793,12 @@ ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu, { int disks = sh->disks; struct page **xor_srcs = to_addr_page(percpu, 0); + unsigned int *off_srcs = to_addr_offs(sh, percpu); int count = 0, pd_idx = sh->pd_idx, i; struct async_submit_ctl submit; /* existing parity data subtracted */ + unsigned int off_dest = off_srcs[count] = sh->dev[pd_idx].offset; struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; BUG_ON(sh->batch_head); @@ -1695,15 +1808,22 @@ ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu, for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; /* Only process blocks that are known to be uptodate */ - if (test_bit(R5_InJournal, &dev->flags)) + if (test_bit(R5_InJournal, &dev->flags)) { + /* + * For this case, PAGE_SIZE must be equal to 4KB and + * page offset is zero. + */ + off_srcs[count] = dev->offset; xor_srcs[count++] = dev->orig_page; - else if (test_bit(R5_Wantdrain, &dev->flags)) + } else if (test_bit(R5_Wantdrain, &dev->flags)) { + off_srcs[count] = dev->offset; xor_srcs[count++] = dev->page; + } } init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); - tx = async_xor(xor_dest, xor_srcs, 0, count, + tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count, RAID5_STRIPE_SIZE(sh->raid_conf), &submit); return tx; @@ -1714,17 +1834,18 @@ ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu, struct dma_async_tx_descriptor *tx) { struct page **blocks = to_addr_page(percpu, 0); + unsigned int *offs = to_addr_offs(sh, percpu); int count; struct async_submit_ctl submit; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); - count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN); + count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_WANT_DRAIN); init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx, ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); - tx = async_gen_syndrome(blocks, 0, count+2, + tx = async_gen_syndrome(blocks, offs, count+2, RAID5_STRIPE_SIZE(sh->raid_conf), &submit); return tx; @@ -1775,6 +1896,7 @@ again: set_bit(R5_Discard, &dev->flags); else { tx = async_copy_data(1, wbi, &dev->page, + dev->offset, dev->sector, tx, sh, r5c_is_writeback(conf->log)); if (dev->page != dev->orig_page && @@ -1854,9 +1976,11 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, { int disks = sh->disks; struct page **xor_srcs; + unsigned int *off_srcs; struct async_submit_ctl submit; int count, pd_idx = sh->pd_idx, i; struct page *xor_dest; + unsigned int off_dest; int prexor = 0; unsigned long flags; int j = 0; @@ -1881,24 +2005,31 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, again: count = 0; xor_srcs = to_addr_page(percpu, j); + off_srcs = to_addr_offs(sh, percpu); /* check if prexor is active which means only process blocks * that are part of a read-modify-write (written) */ if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) { prexor = 1; + off_dest = off_srcs[count] = sh->dev[pd_idx].offset; xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (head_sh->dev[i].written || - test_bit(R5_InJournal, &head_sh->dev[i].flags)) + test_bit(R5_InJournal, &head_sh->dev[i].flags)) { + off_srcs[count] = dev->offset; xor_srcs[count++] = dev->page; + } } } else { xor_dest = sh->dev[pd_idx].page; + off_dest = sh->dev[pd_idx].offset; for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - if (i != pd_idx) + if (i != pd_idx) { + off_srcs[count] = dev->offset; xor_srcs[count++] = dev->page; + } } } @@ -1924,10 +2055,10 @@ again: } if (unlikely(count == 1)) - tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, + tx = async_memcpy(xor_dest, xor_srcs[0], off_dest, off_srcs[0], RAID5_STRIPE_SIZE(sh->raid_conf), &submit); else - tx = async_xor(xor_dest, xor_srcs, 0, count, + tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count, RAID5_STRIPE_SIZE(sh->raid_conf), &submit); if (!last_stripe) { j++; @@ -1943,6 +2074,7 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, { struct async_submit_ctl submit; struct page **blocks; + unsigned int *offs; int count, i, j = 0; struct stripe_head *head_sh = sh; int last_stripe; @@ -1967,6 +2099,7 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, again: blocks = to_addr_page(percpu, j); + offs = to_addr_offs(sh, percpu); if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { synflags = SYNDROME_SRC_WRITTEN; @@ -1976,7 +2109,7 @@ again: txflags = ASYNC_TX_ACK; } - count = set_syndrome_sources(blocks, sh, synflags); + count = set_syndrome_sources(blocks, offs, sh, synflags); last_stripe = !head_sh->batch_head || list_first_entry(&sh->batch_list, struct stripe_head, batch_list) == head_sh; @@ -1988,7 +2121,7 @@ again: } else init_async_submit(&submit, 0, tx, NULL, NULL, to_addr_conv(sh, percpu, j)); - tx = async_gen_syndrome(blocks, 0, count+2, + tx = async_gen_syndrome(blocks, offs, count+2, RAID5_STRIPE_SIZE(sh->raid_conf), &submit); if (!last_stripe) { j++; @@ -2016,7 +2149,9 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) int pd_idx = sh->pd_idx; int qd_idx = sh->qd_idx; struct page *xor_dest; + unsigned int off_dest; struct page **xor_srcs = to_addr_page(percpu, 0); + unsigned int *off_srcs = to_addr_offs(sh, percpu); struct dma_async_tx_descriptor *tx; struct async_submit_ctl submit; int count; @@ -2028,16 +2163,19 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) BUG_ON(sh->batch_head); count = 0; xor_dest = sh->dev[pd_idx].page; + off_dest = sh->dev[pd_idx].offset; + off_srcs[count] = off_dest; xor_srcs[count++] = xor_dest; for (i = disks; i--; ) { if (i == pd_idx || i == qd_idx) continue; + off_srcs[count] = sh->dev[i].offset; xor_srcs[count++] = sh->dev[i].page; } init_async_submit(&submit, 0, NULL, NULL, NULL, to_addr_conv(sh, percpu, 0)); - tx = async_xor_val(xor_dest, xor_srcs, 0, count, + tx = async_xor_val_offs(xor_dest, off_dest, xor_srcs, off_srcs, count, RAID5_STRIPE_SIZE(sh->raid_conf), &sh->ops.zero_sum_result, &submit); @@ -2049,6 +2187,7 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) { struct page **srcs = to_addr_page(percpu, 0); + unsigned int *offs = to_addr_offs(sh, percpu); struct async_submit_ctl submit; int count; @@ -2056,16 +2195,16 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu (unsigned long long)sh->sector, checkp); BUG_ON(sh->batch_head); - count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL); + count = set_syndrome_sources(srcs, offs, sh, SYNDROME_SRC_ALL); if (!checkp) srcs[count] = NULL; atomic_inc(&sh->count); init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, sh, to_addr_conv(sh, percpu, 0)); - async_syndrome_val(srcs, 0, count+2, + async_syndrome_val(srcs, offs, count+2, RAID5_STRIPE_SIZE(sh->raid_conf), - &sh->ops.zero_sum_result, percpu->spare_page, &submit); + &sh->ops.zero_sum_result, percpu->spare_page, 0, &submit); } static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) @@ -2142,6 +2281,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh) { +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE + kfree(sh->pages); +#endif if (sh->ppl_page) __free_page(sh->ppl_page); kmem_cache_free(sc, sh); @@ -2175,9 +2317,15 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp, sh->ppl_page = alloc_page(gfp); if (!sh->ppl_page) { free_stripe(sc, sh); - sh = NULL; + return NULL; } } +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE + if (init_stripe_shared_pages(sh, conf, disks)) { + free_stripe(sc, sh); + return NULL; + } +#endif } return sh; } @@ -2253,8 +2401,9 @@ static int scribble_alloc(struct raid5_percpu *percpu, int num, int cnt) { size_t obj_size = - sizeof(struct page *) * (num+2) + - sizeof(addr_conv_t) * (num+2); + sizeof(struct page *) * (num + 2) + + sizeof(addr_conv_t) * (num + 2) + + sizeof(unsigned int) * (num + 2); void *scribble; /* @@ -2386,9 +2535,16 @@ static int resize_stripes(struct r5conf *conf, int newsize) osh = get_free_stripe(conf, hash); unlock_device_hash_lock(conf, hash); +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE + for (i = 0; i < osh->nr_pages; i++) { + nsh->pages[i] = osh->pages[i]; + osh->pages[i] = NULL; + } +#endif for(i=0; i<conf->pool_size; i++) { nsh->dev[i].page = osh->dev[i].page; nsh->dev[i].orig_page = osh->dev[i].page; + nsh->dev[i].offset = osh->dev[i].offset; } nsh->hash_lock_index = hash; free_stripe(conf->slab_cache, osh); @@ -2429,8 +2585,6 @@ static int resize_stripes(struct r5conf *conf, int newsize) } else err = -ENOMEM; - mutex_unlock(&conf->cache_size_mutex); - conf->slab_cache = sc; conf->active_name = 1-conf->active_name; @@ -2439,20 +2593,41 @@ static int resize_stripes(struct r5conf *conf, int newsize) nsh = list_entry(newstripes.next, struct stripe_head, lru); list_del_init(&nsh->lru); +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE + for (i = 0; i < nsh->nr_pages; i++) { + if (nsh->pages[i]) + continue; + nsh->pages[i] = alloc_page(GFP_NOIO); + if (!nsh->pages[i]) + err = -ENOMEM; + } + + for (i = conf->raid_disks; i < newsize; i++) { + if (nsh->dev[i].page) + continue; + nsh->dev[i].page = raid5_get_dev_page(nsh, i); + nsh->dev[i].orig_page = nsh->dev[i].page; + nsh->dev[i].offset = raid5_get_page_offset(nsh, i); + } +#else for (i=conf->raid_disks; i < newsize; i++) if (nsh->dev[i].page == NULL) { struct page *p = alloc_page(GFP_NOIO); nsh->dev[i].page = p; nsh->dev[i].orig_page = p; + nsh->dev[i].offset = 0; if (!p) err = -ENOMEM; } +#endif raid5_release_stripe(nsh); } /* critical section pass, GFP_NOIO no longer needed */ if (!err) conf->pool_size = newsize; + mutex_unlock(&conf->cache_size_mutex); + return err; } @@ -4369,7 +4544,8 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) /* place all the copies on one channel */ init_async_submit(&submit, 0, tx, NULL, NULL, NULL); tx = async_memcpy(sh2->dev[dd_idx].page, - sh->dev[i].page, 0, 0, RAID5_STRIPE_SIZE(conf), + sh->dev[i].page, sh2->dev[dd_idx].offset, + sh->dev[i].offset, RAID5_STRIPE_SIZE(conf), &submit); set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); @@ -6506,6 +6682,7 @@ raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len) struct r5conf *conf; unsigned long new; int err; + int size; if (len >= PAGE_SIZE) return -EINVAL; @@ -6538,10 +6715,29 @@ raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len) pr_debug("md/raid: change stripe_size from %lu to %lu\n", conf->stripe_size, new); + if (mddev->sync_thread || + test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || + mddev->reshape_position != MaxSector || + mddev->sysfs_active) { + err = -EBUSY; + goto out_unlock; + } + mddev_suspend(mddev); + mutex_lock(&conf->cache_size_mutex); + size = conf->max_nr_stripes; + + shrink_stripes(conf); + conf->stripe_size = new; conf->stripe_shift = ilog2(new) - 9; conf->stripe_sectors = new >> 9; + if (grow_stripes(conf, size)) { + pr_warn("md/raid:%s: couldn't allocate buffers\n", + mdname(mddev)); + err = -ENOMEM; + } + mutex_unlock(&conf->cache_size_mutex); mddev_resume(mddev); out_unlock: diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 16fc29472f5c..5c05acf20e1f 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -195,6 +195,7 @@ enum reconstruct_states { reconstruct_state_result, }; +#define DEFAULT_STRIPE_SIZE 4096 struct stripe_head { struct hlist_node hash; struct list_head lru; /* inactive_list or handle_list */ @@ -246,6 +247,13 @@ struct stripe_head { int target, target2; enum sum_check_flags zero_sum_result; } ops; + +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE + /* These pages will be used by bios in dev[i] */ + struct page **pages; + int nr_pages; /* page array size */ + int stripes_per_page; +#endif struct r5dev { /* rreq and rvec are used for the replacement device when * writing data to both devices. @@ -253,6 +261,7 @@ struct stripe_head { struct bio req, rreq; struct bio_vec vec, rvec; struct page *page, *orig_page; + unsigned int offset; /* offset of the page */ struct bio *toread, *read, *towrite, *written; sector_t sector; /* sector of this page */ unsigned long flags; @@ -472,7 +481,6 @@ struct disk_info { */ #define NR_STRIPES 256 -#define DEFAULT_STRIPE_SIZE 4096 #if PAGE_SIZE == DEFAULT_STRIPE_SIZE #define STRIPE_SIZE PAGE_SIZE @@ -771,6 +779,25 @@ static inline int algorithm_is_DDF(int layout) return layout >= 8 && layout <= 10; } +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE +/* + * Return offset of the corresponding page for r5dev. + */ +static inline int raid5_get_page_offset(struct stripe_head *sh, int disk_idx) +{ + return (disk_idx % sh->stripes_per_page) * RAID5_STRIPE_SIZE(sh->raid_conf); +} + +/* + * Return corresponding page address for r5dev. + */ +static inline struct page * +raid5_get_dev_page(struct stripe_head *sh, int disk_idx) +{ + return sh->pages[disk_idx / sh->stripes_per_page]; +} +#endif + extern void md_raid5_kick_device(struct r5conf *conf); extern int raid5_set_cache_size(struct mddev *mddev, int size); extern sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous); |