diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-10-13 13:04:41 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-10-13 13:04:41 -0700 |
commit | 7cd4ecd9177b94af783b8e21de7c65b41a871342 (patch) | |
tree | 3ca393f3eaeeaad56d4ab60f87e28d7197b0ba21 /drivers | |
parent | 79ec6d9cac46d59db9b006bc9cde2811ef365292 (diff) | |
parent | 79cd16681acccffcf5521f6e3d8c7c50aaffca0a (diff) |
Merge tag 'drivers-5.10-2020-10-12' of git://git.kernel.dk/linux-block
Pull block driver updates from Jens Axboe:
"Here are the driver updates for 5.10.
A few SCSI updates in here too, in coordination with Martin as they
depend on core block changes for the shared tag bitmap.
This contains:
- NVMe pull requests via Christoph:
- fix keep alive timer modification (Amit Engel)
- order the PCI ID list more sensibly (Andy Shevchenko)
- cleanup the open by controller helper (Chaitanya Kulkarni)
- use an xarray for the CSE log lookup (Chaitanya Kulkarni)
- support ZNS in nvmet passthrough mode (Chaitanya Kulkarni)
- fix nvme_ns_report_zones (Christoph Hellwig)
- add a sanity check to nvmet-fc (James Smart)
- fix interrupt allocation when too many polled queues are
specified (Jeffle Xu)
- small nvmet-tcp optimization (Mark Wunderlich)
- fix a controller refcount leak on init failure (Chaitanya
Kulkarni)
- misc cleanups (Chaitanya Kulkarni)
- major refactoring of the scanning code (Christoph Hellwig)
- MD updates via Song:
- Bug fixes in bitmap code, from Zhao Heming
- Fix a work queue check, from Guoqing Jiang
- Fix raid5 oops with reshape, from Song Liu
- Clean up unused code, from Jason Yan
- Discard improvements, from Xiao Ni
- raid5/6 page offset support, from Yufen Yu
- Shared tag bitmap for SCSI/hisi_sas/null_blk (John, Kashyap,
Hannes)
- null_blk open/active zone limit support (Niklas)
- Set of bcache updates (Coly, Dongsheng, Qinglang)"
* tag 'drivers-5.10-2020-10-12' of git://git.kernel.dk/linux-block: (78 commits)
md/raid5: fix oops during stripe resizing
md/bitmap: fix memory leak of temporary bitmap
md: fix the checking of wrong work queue
md/bitmap: md_bitmap_get_counter returns wrong blocks
md/bitmap: md_bitmap_read_sb uses wrong bitmap blocks
md/raid0: remove unused function is_io_in_chunk_boundary()
nvme-core: remove extra condition for vwc
nvme-core: remove extra variable
nvme: remove nvme_identify_ns_list
nvme: refactor nvme_validate_ns
nvme: move nvme_validate_ns
nvme: query namespace identifiers before adding the namespace
nvme: revalidate zone bitmaps in nvme_update_ns_info
nvme: remove nvme_update_formats
nvme: update the known admin effects
nvme: set the queue limits in nvme_update_ns_info
nvme: remove the 0 lba_shift check in nvme_update_ns_info
nvme: clean up the check for too large logic block sizes
nvme: freeze the queue over ->lba_shift updates
nvme: factor out a nvme_configure_metadata helper
...
Diffstat (limited to 'drivers')
48 files changed, 1766 insertions, 1164 deletions
diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h index daed4a9c3436..d2e7db43a52a 100644 --- a/drivers/block/null_blk.h +++ b/drivers/block/null_blk.h @@ -42,6 +42,9 @@ struct nullb_device { struct badblocks badblocks; unsigned int nr_zones; + unsigned int nr_zones_imp_open; + unsigned int nr_zones_exp_open; + unsigned int nr_zones_closed; struct blk_zone *zones; sector_t zone_size_sects; @@ -51,6 +54,8 @@ struct nullb_device { unsigned long zone_size; /* zone size in MB if device is zoned */ unsigned long zone_capacity; /* zone capacity in MB if device is zoned */ unsigned int zone_nr_conv; /* number of conventional zones */ + unsigned int zone_max_open; /* max number of open zones */ + unsigned int zone_max_active; /* max number of active zones */ unsigned int submit_queues; /* number of submission queues */ unsigned int home_node; /* home node for the device */ unsigned int queue_mode; /* block interface */ diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c index d74443a9c8fa..4685ea401d5b 100644 --- a/drivers/block/null_blk_main.c +++ b/drivers/block/null_blk_main.c @@ -164,6 +164,10 @@ static bool shared_tags; module_param(shared_tags, bool, 0444); MODULE_PARM_DESC(shared_tags, "Share tag set between devices for blk-mq"); +static bool g_shared_tag_bitmap; +module_param_named(shared_tag_bitmap, g_shared_tag_bitmap, bool, 0444); +MODULE_PARM_DESC(shared_tag_bitmap, "Use shared tag bitmap for all submission queues for blk-mq"); + static int g_irqmode = NULL_IRQ_SOFTIRQ; static int null_set_irqmode(const char *str, const struct kernel_param *kp) @@ -208,6 +212,14 @@ static unsigned int g_zone_nr_conv; module_param_named(zone_nr_conv, g_zone_nr_conv, uint, 0444); MODULE_PARM_DESC(zone_nr_conv, "Number of conventional zones when block device is zoned. Default: 0"); +static unsigned int g_zone_max_open; +module_param_named(zone_max_open, g_zone_max_open, uint, 0444); +MODULE_PARM_DESC(zone_max_open, "Maximum number of open zones when block device is zoned. Default: 0 (no limit)"); + +static unsigned int g_zone_max_active; +module_param_named(zone_max_active, g_zone_max_active, uint, 0444); +MODULE_PARM_DESC(zone_max_active, "Maximum number of active zones when block device is zoned. Default: 0 (no limit)"); + static struct nullb_device *null_alloc_dev(void); static void null_free_dev(struct nullb_device *dev); static void null_del_dev(struct nullb *nullb); @@ -347,6 +359,8 @@ NULLB_DEVICE_ATTR(zoned, bool, NULL); NULLB_DEVICE_ATTR(zone_size, ulong, NULL); NULLB_DEVICE_ATTR(zone_capacity, ulong, NULL); NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL); +NULLB_DEVICE_ATTR(zone_max_open, uint, NULL); +NULLB_DEVICE_ATTR(zone_max_active, uint, NULL); static ssize_t nullb_device_power_show(struct config_item *item, char *page) { @@ -464,6 +478,8 @@ static struct configfs_attribute *nullb_device_attrs[] = { &nullb_device_attr_zone_size, &nullb_device_attr_zone_capacity, &nullb_device_attr_zone_nr_conv, + &nullb_device_attr_zone_max_open, + &nullb_device_attr_zone_max_active, NULL, }; @@ -517,7 +533,7 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item) static ssize_t memb_group_features_show(struct config_item *item, char *page) { return snprintf(page, PAGE_SIZE, - "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv\n"); + "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv,zone_max_open,zone_max_active\n"); } CONFIGFS_ATTR_RO(memb_group_, features); @@ -580,6 +596,8 @@ static struct nullb_device *null_alloc_dev(void) dev->zone_size = g_zone_size; dev->zone_capacity = g_zone_capacity; dev->zone_nr_conv = g_zone_nr_conv; + dev->zone_max_open = g_zone_max_open; + dev->zone_max_active = g_zone_max_active; return dev; } @@ -1692,6 +1710,8 @@ static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set) set->flags = BLK_MQ_F_SHOULD_MERGE; if (g_no_sched) set->flags |= BLK_MQ_F_NO_SCHED; + if (g_shared_tag_bitmap) + set->flags |= BLK_MQ_F_TAG_HCTX_SHARED; set->driver_data = NULL; if ((nullb && nullb->dev->blocking) || g_blocking) diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c index 3d25c9ad2383..fa0cc70f05e6 100644 --- a/drivers/block/null_blk_zoned.c +++ b/drivers/block/null_blk_zoned.c @@ -51,6 +51,22 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) dev->zone_nr_conv); } + /* Max active zones has to be < nbr of seq zones in order to be enforceable */ + if (dev->zone_max_active >= dev->nr_zones - dev->zone_nr_conv) { + dev->zone_max_active = 0; + pr_info("zone_max_active limit disabled, limit >= zone count\n"); + } + + /* Max open zones has to be <= max active zones */ + if (dev->zone_max_active && dev->zone_max_open > dev->zone_max_active) { + dev->zone_max_open = dev->zone_max_active; + pr_info("changed the maximum number of open zones to %u\n", + dev->nr_zones); + } else if (dev->zone_max_open >= dev->nr_zones - dev->zone_nr_conv) { + dev->zone_max_open = 0; + pr_info("zone_max_open limit disabled, limit >= zone count\n"); + } + for (i = 0; i < dev->zone_nr_conv; i++) { struct blk_zone *zone = &dev->zones[i]; @@ -99,6 +115,8 @@ int null_register_zoned_dev(struct nullb *nullb) } blk_queue_max_zone_append_sectors(q, dev->zone_size_sects); + blk_queue_max_open_zones(q, dev->zone_max_open); + blk_queue_max_active_zones(q, dev->zone_max_active); return 0; } @@ -159,6 +177,103 @@ size_t null_zone_valid_read_len(struct nullb *nullb, return (zone->wp - sector) << SECTOR_SHIFT; } +static blk_status_t null_close_zone(struct nullb_device *dev, struct blk_zone *zone) +{ + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) + return BLK_STS_IOERR; + + switch (zone->cond) { + case BLK_ZONE_COND_CLOSED: + /* close operation on closed is not an error */ + return BLK_STS_OK; + case BLK_ZONE_COND_IMP_OPEN: + dev->nr_zones_imp_open--; + break; + case BLK_ZONE_COND_EXP_OPEN: + dev->nr_zones_exp_open--; + break; + case BLK_ZONE_COND_EMPTY: + case BLK_ZONE_COND_FULL: + default: + return BLK_STS_IOERR; + } + + if (zone->wp == zone->start) { + zone->cond = BLK_ZONE_COND_EMPTY; + } else { + zone->cond = BLK_ZONE_COND_CLOSED; + dev->nr_zones_closed++; + } + + return BLK_STS_OK; +} + +static void null_close_first_imp_zone(struct nullb_device *dev) +{ + unsigned int i; + + for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) { + if (dev->zones[i].cond == BLK_ZONE_COND_IMP_OPEN) { + null_close_zone(dev, &dev->zones[i]); + return; + } + } +} + +static bool null_can_set_active(struct nullb_device *dev) +{ + if (!dev->zone_max_active) + return true; + + return dev->nr_zones_exp_open + dev->nr_zones_imp_open + + dev->nr_zones_closed < dev->zone_max_active; +} + +static bool null_can_open(struct nullb_device *dev) +{ + if (!dev->zone_max_open) + return true; + + if (dev->nr_zones_exp_open + dev->nr_zones_imp_open < dev->zone_max_open) + return true; + + if (dev->nr_zones_imp_open && null_can_set_active(dev)) { + null_close_first_imp_zone(dev); + return true; + } + + return false; +} + +/* + * This function matches the manage open zone resources function in the ZBC standard, + * with the addition of max active zones support (added in the ZNS standard). + * + * The function determines if a zone can transition to implicit open or explicit open, + * while maintaining the max open zone (and max active zone) limit(s). It may close an + * implicit open zone in order to make additional zone resources available. + * + * ZBC states that an implicit open zone shall be closed only if there is not + * room within the open limit. However, with the addition of an active limit, + * it is not certain that closing an implicit open zone will allow a new zone + * to be opened, since we might already be at the active limit capacity. + */ +static bool null_has_zone_resources(struct nullb_device *dev, struct blk_zone *zone) +{ + switch (zone->cond) { + case BLK_ZONE_COND_EMPTY: + if (!null_can_set_active(dev)) + return false; + fallthrough; + case BLK_ZONE_COND_CLOSED: + return null_can_open(dev); + default: + /* Should never be called for other states */ + WARN_ON(1); + return false; + } +} + static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, unsigned int nr_sectors, bool append) { @@ -177,43 +292,155 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, /* Cannot write to a full zone */ return BLK_STS_IOERR; case BLK_ZONE_COND_EMPTY: + case BLK_ZONE_COND_CLOSED: + if (!null_has_zone_resources(dev, zone)) + return BLK_STS_IOERR; + break; case BLK_ZONE_COND_IMP_OPEN: case BLK_ZONE_COND_EXP_OPEN: + break; + default: + /* Invalid zone condition */ + return BLK_STS_IOERR; + } + + /* + * Regular writes must be at the write pointer position. + * Zone append writes are automatically issued at the write + * pointer and the position returned using the request or BIO + * sector. + */ + if (append) { + sector = zone->wp; + if (cmd->bio) + cmd->bio->bi_iter.bi_sector = sector; + else + cmd->rq->__sector = sector; + } else if (sector != zone->wp) { + return BLK_STS_IOERR; + } + + if (zone->wp + nr_sectors > zone->start + zone->capacity) + return BLK_STS_IOERR; + + if (zone->cond == BLK_ZONE_COND_CLOSED) { + dev->nr_zones_closed--; + dev->nr_zones_imp_open++; + } else if (zone->cond == BLK_ZONE_COND_EMPTY) { + dev->nr_zones_imp_open++; + } + if (zone->cond != BLK_ZONE_COND_EXP_OPEN) + zone->cond = BLK_ZONE_COND_IMP_OPEN; + + ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); + if (ret != BLK_STS_OK) + return ret; + + zone->wp += nr_sectors; + if (zone->wp == zone->start + zone->capacity) { + if (zone->cond == BLK_ZONE_COND_EXP_OPEN) + dev->nr_zones_exp_open--; + else if (zone->cond == BLK_ZONE_COND_IMP_OPEN) + dev->nr_zones_imp_open--; + zone->cond = BLK_ZONE_COND_FULL; + } + return BLK_STS_OK; +} + +static blk_status_t null_open_zone(struct nullb_device *dev, struct blk_zone *zone) +{ + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) + return BLK_STS_IOERR; + + switch (zone->cond) { + case BLK_ZONE_COND_EXP_OPEN: + /* open operation on exp open is not an error */ + return BLK_STS_OK; + case BLK_ZONE_COND_EMPTY: + if (!null_has_zone_resources(dev, zone)) + return BLK_STS_IOERR; + break; + case BLK_ZONE_COND_IMP_OPEN: + dev->nr_zones_imp_open--; + break; case BLK_ZONE_COND_CLOSED: - /* - * Regular writes must be at the write pointer position. - * Zone append writes are automatically issued at the write - * pointer and the position returned using the request or BIO - * sector. - */ - if (append) { - sector = zone->wp; - if (cmd->bio) - cmd->bio->bi_iter.bi_sector = sector; - else - cmd->rq->__sector = sector; - } else if (sector != zone->wp) { + if (!null_has_zone_resources(dev, zone)) return BLK_STS_IOERR; - } + dev->nr_zones_closed--; + break; + case BLK_ZONE_COND_FULL: + default: + return BLK_STS_IOERR; + } + + zone->cond = BLK_ZONE_COND_EXP_OPEN; + dev->nr_zones_exp_open++; - if (zone->wp + nr_sectors > zone->start + zone->capacity) + return BLK_STS_OK; +} + +static blk_status_t null_finish_zone(struct nullb_device *dev, struct blk_zone *zone) +{ + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) + return BLK_STS_IOERR; + + switch (zone->cond) { + case BLK_ZONE_COND_FULL: + /* finish operation on full is not an error */ + return BLK_STS_OK; + case BLK_ZONE_COND_EMPTY: + if (!null_has_zone_resources(dev, zone)) return BLK_STS_IOERR; + break; + case BLK_ZONE_COND_IMP_OPEN: + dev->nr_zones_imp_open--; + break; + case BLK_ZONE_COND_EXP_OPEN: + dev->nr_zones_exp_open--; + break; + case BLK_ZONE_COND_CLOSED: + if (!null_has_zone_resources(dev, zone)) + return BLK_STS_IOERR; + dev->nr_zones_closed--; + break; + default: + return BLK_STS_IOERR; + } - if (zone->cond != BLK_ZONE_COND_EXP_OPEN) - zone->cond = BLK_ZONE_COND_IMP_OPEN; + zone->cond = BLK_ZONE_COND_FULL; + zone->wp = zone->start + zone->len; - ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); - if (ret != BLK_STS_OK) - return ret; + return BLK_STS_OK; +} + +static blk_status_t null_reset_zone(struct nullb_device *dev, struct blk_zone *zone) +{ + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) + return BLK_STS_IOERR; - zone->wp += nr_sectors; - if (zone->wp == zone->start + zone->capacity) - zone->cond = BLK_ZONE_COND_FULL; + switch (zone->cond) { + case BLK_ZONE_COND_EMPTY: + /* reset operation on empty is not an error */ return BLK_STS_OK; + case BLK_ZONE_COND_IMP_OPEN: + dev->nr_zones_imp_open--; + break; + case BLK_ZONE_COND_EXP_OPEN: + dev->nr_zones_exp_open--; + break; + case BLK_ZONE_COND_CLOSED: + dev->nr_zones_closed--; + break; + case BLK_ZONE_COND_FULL: + break; default: - /* Invalid zone condition */ return BLK_STS_IOERR; } + + zone->cond = BLK_ZONE_COND_EMPTY; + zone->wp = zone->start; + + return BLK_STS_OK; } static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op, @@ -222,56 +449,34 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op, struct nullb_device *dev = cmd->nq->dev; unsigned int zone_no = null_zone_no(dev, sector); struct blk_zone *zone = &dev->zones[zone_no]; + blk_status_t ret = BLK_STS_OK; size_t i; switch (op) { case REQ_OP_ZONE_RESET_ALL: - for (i = 0; i < dev->nr_zones; i++) { - if (zone[i].type == BLK_ZONE_TYPE_CONVENTIONAL) - continue; - zone[i].cond = BLK_ZONE_COND_EMPTY; - zone[i].wp = zone[i].start; - } + for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) + null_reset_zone(dev, &dev->zones[i]); break; case REQ_OP_ZONE_RESET: - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) - return BLK_STS_IOERR; - - zone->cond = BLK_ZONE_COND_EMPTY; - zone->wp = zone->start; + ret = null_reset_zone(dev, zone); break; case REQ_OP_ZONE_OPEN: - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) - return BLK_STS_IOERR; - if (zone->cond == BLK_ZONE_COND_FULL) - return BLK_STS_IOERR; - - zone->cond = BLK_ZONE_COND_EXP_OPEN; + ret = null_open_zone(dev, zone); break; case REQ_OP_ZONE_CLOSE: - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) - return BLK_STS_IOERR; - if (zone->cond == BLK_ZONE_COND_FULL) - return BLK_STS_IOERR; - - if (zone->wp == zone->start) - zone->cond = BLK_ZONE_COND_EMPTY; - else - zone->cond = BLK_ZONE_COND_CLOSED; + ret = null_close_zone(dev, zone); break; case REQ_OP_ZONE_FINISH: - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) - return BLK_STS_IOERR; - - zone->cond = BLK_ZONE_COND_FULL; - zone->wp = zone->start + zone->len; + ret = null_finish_zone(dev, zone); break; default: return BLK_STS_NOTSUPP; } - trace_nullb_zone_op(cmd, zone_no, zone->cond); - return BLK_STS_OK; + if (ret == BLK_STS_OK) + trace_nullb_zone_op(cmd, zone_no, zone->cond); + + return ret; } blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_opf op, diff --git a/drivers/block/rsxx/core.c b/drivers/block/rsxx/core.c index 8799e3bab067..63f549889f87 100644 --- a/drivers/block/rsxx/core.c +++ b/drivers/block/rsxx/core.c @@ -439,7 +439,7 @@ static void card_state_change(struct rsxx_cardinfo *card, case CARD_STATE_FAULT: dev_crit(CARD_TO_DEV(card), "Hardware Fault reported!\n"); - /* Fall through. */ + fallthrough; /* Everything else, detach DMA interface if it's attached. */ case CARD_STATE_SHUTDOWN: diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 52035a78d836..8c371d5eef8e 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -49,7 +49,7 @@ * * bch_bucket_alloc() allocates a single bucket from a specific cache. * - * bch_bucket_alloc_set() allocates one or more buckets from different caches + * bch_bucket_alloc_set() allocates one bucket from different caches * out of a cache set. * * free_some_buckets() drives all the processes described above. It's called @@ -87,8 +87,7 @@ void bch_rescale_priorities(struct cache_set *c, int sectors) { struct cache *ca; struct bucket *b; - unsigned long next = c->nbuckets * c->sb.bucket_size / 1024; - unsigned int i; + unsigned long next = c->nbuckets * c->cache->sb.bucket_size / 1024; int r; atomic_sub(sectors, &c->rescale); @@ -104,14 +103,14 @@ void bch_rescale_priorities(struct cache_set *c, int sectors) c->min_prio = USHRT_MAX; - for_each_cache(ca, c, i) - for_each_bucket(b, ca) - if (b->prio && - b->prio != BTREE_PRIO && - !atomic_read(&b->pin)) { - b->prio--; - c->min_prio = min(c->min_prio, b->prio); - } + ca = c->cache; + for_each_bucket(b, ca) + if (b->prio && + b->prio != BTREE_PRIO && + !atomic_read(&b->pin)) { + b->prio--; + c->min_prio = min(c->min_prio, b->prio); + } mutex_unlock(&c->bucket_lock); } @@ -362,7 +361,7 @@ retry_invalidate: * new stuff to them: */ allocator_wait(ca, !atomic_read(&ca->set->prio_blocked)); - if (CACHE_SYNC(&ca->set->sb)) { + if (CACHE_SYNC(&ca->sb)) { /* * This could deadlock if an allocation with a btree * node locked ever blocked - having the btree node @@ -488,34 +487,29 @@ void bch_bucket_free(struct cache_set *c, struct bkey *k) } int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, - struct bkey *k, int n, bool wait) + struct bkey *k, bool wait) { - int i; + struct cache *ca; + long b; /* No allocation if CACHE_SET_IO_DISABLE bit is set */ if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags))) return -1; lockdep_assert_held(&c->bucket_lock); - BUG_ON(!n || n > c->caches_loaded || n > MAX_CACHES_PER_SET); bkey_init(k); - /* sort by free space/prio of oldest data in caches */ - - for (i = 0; i < n; i++) { - struct cache *ca = c->cache_by_alloc[i]; - long b = bch_bucket_alloc(ca, reserve, wait); + ca = c->cache; + b = bch_bucket_alloc(ca, reserve, wait); + if (b == -1) + goto err; - if (b == -1) - goto err; + k->ptr[0] = MAKE_PTR(ca->buckets[b].gen, + bucket_to_sector(c, b), + ca->sb.nr_this_dev); - k->ptr[i] = MAKE_PTR(ca->buckets[b].gen, - bucket_to_sector(c, b), - ca->sb.nr_this_dev); - - SET_KEY_PTRS(k, i + 1); - } + SET_KEY_PTRS(k, 1); return 0; err: @@ -525,12 +519,12 @@ err: } int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, - struct bkey *k, int n, bool wait) + struct bkey *k, bool wait) { int ret; mutex_lock(&c->bucket_lock); - ret = __bch_bucket_alloc_set(c, reserve, k, n, wait); + ret = __bch_bucket_alloc_set(c, reserve, k, wait); mutex_unlock(&c->bucket_lock); return ret; } @@ -589,7 +583,7 @@ static struct open_bucket *pick_data_bucket(struct cache_set *c, struct open_bucket, list); found: if (!ret->sectors_free && KEY_PTRS(alloc)) { - ret->sectors_free = c->sb.bucket_size; + ret->sectors_free = c->cache->sb.bucket_size; bkey_copy(&ret->key, alloc); bkey_init(alloc); } @@ -638,7 +632,7 @@ bool bch_alloc_sectors(struct cache_set *c, spin_unlock(&c->data_bucket_lock); - if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, wait)) + if (bch_bucket_alloc_set(c, watermark, &alloc.key, wait)) return false; spin_lock(&c->data_bucket_lock); @@ -683,7 +677,7 @@ bool bch_alloc_sectors(struct cache_set *c, &PTR_CACHE(c, &b->key, i)->sectors_written); } - if (b->sectors_free < c->sb.block_size) + if (b->sectors_free < c->cache->sb.block_size) b->sectors_free = 0; /* diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 4fd03d2496d8..1d57f48307e6 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -517,11 +517,7 @@ struct cache_set { atomic_t idle_counter; atomic_t at_max_writeback_rate; - struct cache_sb sb; - - struct cache *cache[MAX_CACHES_PER_SET]; - struct cache *cache_by_alloc[MAX_CACHES_PER_SET]; - int caches_loaded; + struct cache *cache; struct bcache_device **devices; unsigned int devices_max_used; @@ -670,6 +666,7 @@ struct cache_set { struct mutex verify_lock; #endif + uint8_t set_uuid[16]; unsigned int nr_uuids; struct uuid_entry *uuids; BKEY_PADDED(uuid_bucket); @@ -758,9 +755,8 @@ struct bbio { #define btree_default_blocks(c) \ ((unsigned int) ((PAGE_SECTORS * (c)->btree_pages) >> (c)->block_bits)) -#define bucket_pages(c) ((c)->sb.bucket_size / PAGE_SECTORS) -#define bucket_bytes(c) ((c)->sb.bucket_size << 9) -#define block_bytes(c) ((c)->sb.block_size << 9) +#define bucket_bytes(ca) ((ca)->sb.bucket_size << 9) +#define block_bytes(ca) ((ca)->sb.block_size << 9) static inline unsigned int meta_bucket_pages(struct cache_sb *sb) { @@ -801,14 +797,14 @@ static inline sector_t bucket_to_sector(struct cache_set *c, size_t b) static inline sector_t bucket_remainder(struct cache_set *c, sector_t s) { - return s & (c->sb.bucket_size - 1); + return s & (c->cache->sb.bucket_size - 1); } static inline struct cache *PTR_CACHE(struct cache_set *c, const struct bkey *k, unsigned int ptr) { - return c->cache[PTR_DEV(k, ptr)]; + return c->cache; } static inline size_t PTR_BUCKET_NR(struct cache_set *c, @@ -889,9 +885,6 @@ do { \ /* Looping macros */ -#define for_each_cache(ca, cs, iter) \ - for (iter = 0; ca = cs->cache[iter], iter < (cs)->sb.nr_in_set; iter++) - #define for_each_bucket(b, ca) \ for (b = (ca)->buckets + (ca)->sb.first_bucket; \ b < (ca)->buckets + (ca)->sb.nbuckets; b++) @@ -933,11 +926,9 @@ static inline uint8_t bucket_gc_gen(struct bucket *b) static inline void wake_up_allocators(struct cache_set *c) { - struct cache *ca; - unsigned int i; + struct cache *ca = c->cache; - for_each_cache(ca, c, i) - wake_up_process(ca->alloc_thread); + wake_up_process(ca->alloc_thread); } static inline void closure_bio_submit(struct cache_set *c, @@ -994,9 +985,9 @@ void bch_bucket_free(struct cache_set *c, struct bkey *k); long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait); int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, - struct bkey *k, int n, bool wait); + struct bkey *k, bool wait); int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, - struct bkey *k, int n, bool wait); + struct bkey *k, bool wait); bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned int sectors, unsigned int write_point, unsigned int write_prio, bool wait); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 3d8bd0692af3..910df242c83d 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -104,7 +104,7 @@ static inline struct bset *write_block(struct btree *b) { - return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c); + return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c->cache); } static void bch_btree_init_next(struct btree *b) @@ -117,7 +117,7 @@ static void bch_btree_init_next(struct btree *b) if (b->written < btree_blocks(b)) bch_bset_init_next(&b->keys, write_block(b), - bset_magic(&b->c->sb)); + bset_magic(&b->c->cache->sb)); } @@ -155,7 +155,7 @@ void bch_btree_node_read_done(struct btree *b) * See the comment arount cache_set->fill_iter. */ iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO); - iter->size = b->c->sb.bucket_size / b->c->sb.block_size; + iter->size = b->c->cache->sb.bucket_size / b->c->cache->sb.block_size; iter->used = 0; #ifdef CONFIG_BCACHE_DEBUG @@ -173,12 +173,12 @@ void bch_btree_node_read_done(struct btree *b) goto err; err = "bad btree header"; - if (b->written + set_blocks(i, block_bytes(b->c)) > + if (b->written + set_blocks(i, block_bytes(b->c->cache)) > btree_blocks(b)) goto err; err = "bad magic"; - if (i->magic != bset_magic(&b->c->sb)) + if (i->magic != bset_magic(&b->c->cache->sb)) goto err; err = "bad checksum"; @@ -199,13 +199,13 @@ void bch_btree_node_read_done(struct btree *b) bch_btree_iter_push(iter, i->start, bset_bkey_last(i)); - b->written += set_blocks(i, block_bytes(b->c)); + b->written += set_blocks(i, block_bytes(b->c->cache)); } err = "corrupted btree"; for (i = write_block(b); bset_sector_offset(&b->keys, i) < KEY_SIZE(&b->key); - i = ((void *) i) + block_bytes(b->c)) + i = ((void *) i) + block_bytes(b->c->cache)) if (i->seq == b->keys.set[0].data->seq) goto err; @@ -219,7 +219,7 @@ void bch_btree_node_read_done(struct btree *b) if (b->written < btree_blocks(b)) bch_bset_init_next(&b->keys, write_block(b), - bset_magic(&b->c->sb)); + bset_magic(&b->c->cache->sb)); out: mempool_free(iter, &b->c->fill_iter); return; @@ -347,7 +347,7 @@ static void do_btree_node_write(struct btree *b) b->bio->bi_end_io = btree_node_write_endio; b->bio->bi_private = cl; - b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c)); + b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c->cache)); b->bio->bi_opf = REQ_OP_WRITE | REQ_META | REQ_FUA; bch_bio_map(b->bio, i); @@ -423,10 +423,10 @@ void __bch_btree_node_write(struct btree *b, struct closure *parent) do_btree_node_write(b); - atomic_long_add(set_blocks(i, block_bytes(b->c)) * b->c->sb.block_size, + atomic_long_add(set_blocks(i, block_bytes(b->c->cache)) * b->c->cache->sb.block_size, &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); - b->written += set_blocks(i, block_bytes(b->c)); + b->written += set_blocks(i, block_bytes(b->c->cache)); } void bch_btree_node_write(struct btree *b, struct closure *parent) @@ -514,7 +514,7 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) * mca -> memory cache */ -#define mca_reserve(c) (((c->root && c->root->level) \ +#define mca_reserve(c) (((!IS_ERR_OR_NULL(c->root) && c->root->level) \ ? c->root->level : 1) * 8 + 16) #define mca_can_free(c) \ max_t(int, 0, c->btree_cache_used - mca_reserve(c)) @@ -738,7 +738,7 @@ void bch_btree_cache_free(struct cache_set *c) if (c->verify_data) list_move(&c->verify_data->list, &c->btree_cache); - free_pages((unsigned long) c->verify_ondisk, ilog2(meta_bucket_pages(&c->sb))); + free_pages((unsigned long) c->verify_ondisk, ilog2(meta_bucket_pages(&c->cache->sb))); #endif list_splice(&c->btree_cache_freeable, @@ -785,7 +785,8 @@ int bch_btree_cache_alloc(struct cache_set *c) mutex_init(&c->verify_lock); c->verify_ondisk = (void *) - __get_free_pages(GFP_KERNEL|__GFP_COMP, ilog2(meta_bucket_pages(&c->sb))); + __get_free_pages(GFP_KERNEL|__GFP_COMP, + ilog2(meta_bucket_pages(&c->cache->sb))); if (!c->verify_ondisk) { /* * Don't worry about the mca_rereserve buckets @@ -1091,7 +1092,7 @@ struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op, mutex_lock(&c->bucket_lock); retry: - if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait)) + if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, wait)) goto err; bkey_put(c, &k.key); @@ -1108,7 +1109,7 @@ retry: } b->parent = parent; - bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb)); + bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->cache->sb)); mutex_unlock(&c->bucket_lock); @@ -1167,19 +1168,18 @@ static void make_btree_freeing_key(struct btree *b, struct bkey *k) static int btree_check_reserve(struct btree *b, struct btree_op *op) { struct cache_set *c = b->c; - struct cache *ca; - unsigned int i, reserve = (c->root->level - b->level) * 2 + 1; + struct cache *ca = c->cache; + unsigned int reserve = (c->root->level - b->level) * 2 + 1; mutex_lock(&c->bucket_lock); - for_each_cache(ca, c, i) - if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) { - if (op) - prepare_to_wait(&c->btree_cache_wait, &op->wait, - TASK_UNINTERRUPTIBLE); - mutex_unlock(&c->bucket_lock); - return -EINTR; - } + if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) { + if (op) + prepare_to_wait(&c->btree_cache_wait, &op->wait, + TASK_UNINTERRUPTIBLE); + mutex_unlock(&c->bucket_lock); + return -EINTR; + } mutex_unlock(&c->bucket_lock); @@ -1345,7 +1345,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, if (nodes < 2 || __set_blocks(b->keys.set[0].data, keys, - block_bytes(b->c)) > blocks * (nodes - 1)) + block_bytes(b->c->cache)) > blocks * (nodes - 1)) return 0; for (i = 0; i < nodes; i++) { @@ -1379,7 +1379,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, k = bkey_next(k)) { if (__set_blocks(n1, n1->keys + keys + bkey_u64s(k), - block_bytes(b->c)) > blocks) + block_bytes(b->c->cache)) > blocks) break; last = k; @@ -1395,7 +1395,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, * though) */ if (__set_blocks(n1, n1->keys + n2->keys, - block_bytes(b->c)) > + block_bytes(b->c->cache)) > btree_blocks(new_nodes[i])) goto out_unlock_nocoalesce; @@ -1404,7 +1404,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, last = &r->b->key; } - BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c)) > + BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c->cache)) > btree_blocks(new_nodes[i])); if (last) @@ -1695,7 +1695,6 @@ static void btree_gc_start(struct cache_set *c) { struct cache *ca; struct bucket *b; - unsigned int i; if (!c->gc_mark_valid) return; @@ -1705,14 +1704,14 @@ static void btree_gc_start(struct cache_set *c) c->gc_mark_valid = 0; c->gc_done = ZERO_KEY; - for_each_cache(ca, c, i) - for_each_bucket(b, ca) { - b->last_gc = b->gen; - if (!atomic_read(&b->pin)) { - SET_GC_MARK(b, 0); - SET_GC_SECTORS_USED(b, 0); - } + ca = c->cache; + for_each_bucket(b, ca) { + b->last_gc = b->gen; + if (!atomic_read(&b->pin)) { + SET_GC_MARK(b, 0); + SET_GC_SECTORS_USED(b, 0); } + } mutex_unlock(&c->bucket_lock); } @@ -1721,7 +1720,8 @@ static void bch_btree_gc_finish(struct cache_set *c) { struct bucket *b; struct cache *ca; - unsigned int i; + unsigned int i, j; + uint64_t *k; mutex_lock(&c->bucket_lock); @@ -1739,7 +1739,6 @@ static void bch_btree_gc_finish(struct cache_set *c) struct bcache_device *d = c->devices[i]; struct cached_dev *dc; struct keybuf_key *w, *n; - unsigned int j; if (!d || UUID_FLASH_ONLY(&c->uuids[i])) continue; @@ -1756,29 +1755,27 @@ static void bch_btree_gc_finish(struct cache_set *c) rcu_read_unlock(); c->avail_nbuckets = 0; - for_each_cache(ca, c, i) { - uint64_t *i; - ca->invalidate_needs_gc = 0; + ca = c->cache; + ca->invalidate_needs_gc = 0; - for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++) - SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); + for (k = ca->sb.d; k < ca->sb.d + ca->sb.keys; k++) + SET_GC_MARK(ca->buckets + *k, GC_MARK_METADATA); - for (i = ca->prio_buckets; - i < ca->prio_buckets + prio_buckets(ca) * 2; i++) - SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); + for (k = ca->prio_buckets; + k < ca->prio_buckets + prio_buckets(ca) * 2; k++) + SET_GC_MARK(ca->buckets + *k, GC_MARK_METADATA); - for_each_bucket(b, ca) { - c->need_gc = max(c->need_gc, bucket_gc_gen(b)); + for_each_bucket(b, ca) { + c->need_gc = max(c->need_gc, bucket_gc_gen(b)); - if (atomic_read(&b->pin)) - continue; + if (atomic_read(&b->pin)) + continue; - BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b)); + BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b)); - if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE) - c->avail_nbuckets++; - } + if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE) + c->avail_nbuckets++; } mutex_unlock(&c->bucket_lock); @@ -1830,12 +1827,10 @@ static void bch_btree_gc(struct cache_set *c) static bool gc_should_run(struct cache_set *c) { - struct cache *ca; - unsigned int i; + struct cache *ca = c->cache; - for_each_cache(ca, c, i) - if (ca->invalidate_needs_gc) - return true; + if (ca->invalidate_needs_gc) + return true; if (atomic_read(&c->sectors_to_gc) < 0) return true; @@ -2081,9 +2076,8 @@ out: void bch_initial_gc_finish(struct cache_set *c) { - struct cache *ca; + struct cache *ca = c->cache; struct bucket *b; - unsigned int i; bch_btree_gc_finish(c); @@ -2098,20 +2092,18 @@ void bch_initial_gc_finish(struct cache_set *c) * This is only safe for buckets that have no live data in them, which * there should always be some of. */ - for_each_cache(ca, c, i) { - for_each_bucket(b, ca) { - if (fifo_full(&ca->free[RESERVE_PRIO]) && - fifo_full(&ca->free[RESERVE_BTREE])) - break; + for_each_bucket(b, ca) { + if (fifo_full(&ca->free[RESERVE_PRIO]) && + fifo_full(&ca->free[RESERVE_BTREE])) + break; - if (bch_can_invalidate_bucket(ca, b) && - !GC_MARK(b)) { - __bch_invalidate_one_bucket(ca, b); - if (!fifo_push(&ca->free[RESERVE_PRIO], - b - ca->buckets)) - fifo_push(&ca->free[RESERVE_BTREE], - b - ca->buckets); - } + if (bch_can_invalidate_bucket(ca, b) && + !GC_MARK(b)) { + __bch_invalidate_one_bucket(ca, b); + if (!fifo_push(&ca->free[RESERVE_PRIO], + b - ca->buckets)) + fifo_push(&ca->free[RESERVE_BTREE], + b - ca->buckets); } } @@ -2219,7 +2211,7 @@ static int btree_split(struct btree *b, struct btree_op *op, goto err; split = set_blocks(btree_bset_first(n1), - block_bytes(n1->c)) > (btree_blocks(b) * 4) / 5; + block_bytes(n1->c->cache)) > (btree_blocks(b) * 4) / 5; if (split) { unsigned int keys = 0; diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 257969980c49..50482107134f 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -194,7 +194,7 @@ static inline unsigned int bset_block_offset(struct btree *b, struct bset *i) static inline void set_gc_sectors(struct cache_set *c) { - atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16); + atomic_set(&c->sectors_to_gc, c->cache->sb.bucket_size * c->nbuckets / 16); } void bkey_put(struct cache_set *c, struct bkey *k); diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c index 0164a1fe94a9..d8d9394a6beb 100644 --- a/drivers/md/bcache/closure.c +++ b/drivers/md/bcache/closure.c @@ -159,7 +159,7 @@ void closure_debug_destroy(struct closure *cl) static struct dentry *closure_debug; -static int debug_seq_show(struct seq_file *f, void *data) +static int debug_show(struct seq_file *f, void *data) { struct closure *cl; @@ -188,17 +188,7 @@ static int debug_seq_show(struct seq_file *f, void *data) return 0; } -static int debug_seq_open(struct inode *inode, struct file *file) -{ - return single_open(file, debug_seq_show, NULL); -} - -static const struct file_operations debug_ops = { - .owner = THIS_MODULE, - .open = debug_seq_open, - .read = seq_read, - .release = single_release -}; +DEFINE_SHOW_ATTRIBUTE(debug); void __init closure_debug_init(void) { @@ -209,7 +199,7 @@ void __init closure_debug_init(void) * about this. */ closure_debug = debugfs_create_file( - "closures", 0400, bcache_debug, NULL, &debug_ops); + "closures", 0400, bcache_debug, NULL, &debug_fops); } #endif diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 336f43910383..b00fd08d696b 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -25,8 +25,8 @@ struct dentry *bcache_debug; for (i = (start); \ (void *) i < (void *) (start) + (KEY_SIZE(&b->key) << 9) &&\ i->seq == (start)->seq; \ - i = (void *) i + set_blocks(i, block_bytes(b->c)) * \ - block_bytes(b->c)) + i = (void *) i + set_blocks(i, block_bytes(b->c->cache)) * \ + block_bytes(b->c->cache)) void bch_btree_verify(struct btree *b) { @@ -82,14 +82,14 @@ void bch_btree_verify(struct btree *b) for_each_written_bset(b, ondisk, i) { unsigned int block = ((void *) i - (void *) ondisk) / - block_bytes(b->c); + block_bytes(b->c->cache); pr_err("*** on disk block %u:\n", block); bch_dump_bset(&b->keys, i, block); } pr_err("*** block %zu not written\n", - ((void *) i - (void *) ondisk) / block_bytes(b->c)); + ((void *) i - (void *) ondisk) / block_bytes(b->c->cache)); for (j = 0; j < inmemory->keys; j++) if (inmemory->d[j] != sorted->d[j]) @@ -238,7 +238,7 @@ void bch_debug_init_cache_set(struct cache_set *c) if (!IS_ERR_OR_NULL(bcache_debug)) { char name[50]; - snprintf(name, 50, "bcache-%pU", c->sb.set_uuid); + snprintf(name, 50, "bcache-%pU", c->set_uuid); c->debug = debugfs_create_file(name, 0400, bcache_debug, c, &cache_set_debug_ops); } diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c index 9162af5bb6ec..f4658a1f37b8 100644 --- a/drivers/md/bcache/extents.c +++ b/drivers/md/bcache/extents.c @@ -54,7 +54,7 @@ static bool __ptr_invalid(struct cache_set *c, const struct bkey *k) size_t bucket = PTR_BUCKET_NR(c, k, i); size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); - if (KEY_SIZE(k) + r > c->sb.bucket_size || + if (KEY_SIZE(k) + r > c->cache->sb.bucket_size || bucket < ca->sb.first_bucket || bucket >= ca->sb.nbuckets) return true; @@ -75,7 +75,7 @@ static const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) size_t bucket = PTR_BUCKET_NR(c, k, i); size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); - if (KEY_SIZE(k) + r > c->sb.bucket_size) + if (KEY_SIZE(k) + r > c->cache->sb.bucket_size) return "bad, length too big"; if (bucket < ca->sb.first_bucket) return "bad, short offset"; @@ -136,7 +136,7 @@ static void bch_bkey_dump(struct btree_keys *keys, const struct bkey *k) size_t n = PTR_BUCKET_NR(b->c, k, j); pr_cont(" bucket %zu", n); - if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets) + if (n >= b->c->cache->sb.first_bucket && n < b->c->cache->sb.nbuckets) pr_cont(" prio %i", PTR_BUCKET(b->c, k, j)->prio); } diff --git a/drivers/md/bcache/features.c b/drivers/md/bcache/features.c index 4442df48d28c..6469223f0b77 100644 --- a/drivers/md/bcache/features.c +++ b/drivers/md/bcache/features.c @@ -30,7 +30,7 @@ static struct feature feature_list[] = { for (f = &feature_list[0]; f->compat != 0; f++) { \ if (f->compat != BCH_FEATURE_ ## type) \ continue; \ - if (BCH_HAS_ ## type ## _FEATURE(&c->sb, f->mask)) { \ + if (BCH_HAS_ ## type ## _FEATURE(&c->cache->sb, f->mask)) { \ if (first) { \ out += snprintf(out, buf + size - out, \ "["); \ @@ -44,7 +44,7 @@ static struct feature feature_list[] = { \ out += snprintf(out, buf + size - out, "%s", f->string);\ \ - if (BCH_HAS_ ## type ## _FEATURE(&c->sb, f->mask)) \ + if (BCH_HAS_ ## type ## _FEATURE(&c->cache->sb, f->mask)) \ out += snprintf(out, buf + size - out, "]"); \ \ first = false; \ diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index a14a445618b4..dad71a6b7889 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -26,7 +26,7 @@ struct bio *bch_bbio_alloc(struct cache_set *c) struct bbio *b = mempool_alloc(&c->bio_meta, GFP_NOIO); struct bio *bio = &b->bio; - bio_init(bio, bio->bi_inline_vecs, meta_bucket_pages(&c->sb)); + bio_init(bio, bio->bi_inline_vecs, meta_bucket_pages(&c->cache->sb)); return bio; } diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index c1227bdb57e7..aefbdb7e003b 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -98,7 +98,7 @@ reread: left = ca->sb.bucket_size - offset; return ret; } - blocks = set_blocks(j, block_bytes(ca->set)); + blocks = set_blocks(j, block_bytes(ca)); /* * Nodes in 'list' are in linear increasing order of @@ -179,112 +179,109 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) ret; \ }) - struct cache *ca; - unsigned int iter; + struct cache *ca = c->cache; int ret = 0; + struct journal_device *ja = &ca->journal; + DECLARE_BITMAP(bitmap, SB_JOURNAL_BUCKETS); + unsigned int i, l, r, m; + uint64_t seq; - for_each_cache(ca, c, iter) { - struct journal_device *ja = &ca->journal; - DECLARE_BITMAP(bitmap, SB_JOURNAL_BUCKETS); - unsigned int i, l, r, m; - uint64_t seq; - - bitmap_zero(bitmap, SB_JOURNAL_BUCKETS); - pr_debug("%u journal buckets\n", ca->sb.njournal_buckets); + bitmap_zero(bitmap, SB_JOURNAL_BUCKETS); + pr_debug("%u journal buckets\n", ca->sb.njournal_buckets); + /* + * Read journal buckets ordered by golden ratio hash to quickly + * find a sequence of buckets with valid journal entries + */ + for (i = 0; i < ca->sb.njournal_buckets; i++) { /* - * Read journal buckets ordered by golden ratio hash to quickly - * find a sequence of buckets with valid journal entries + * We must try the index l with ZERO first for + * correctness due to the scenario that the journal + * bucket is circular buffer which might have wrapped */ - for (i = 0; i < ca->sb.njournal_buckets; i++) { - /* - * We must try the index l with ZERO first for - * correctness due to the scenario that the journal - * bucket is circular buffer which might have wrapped - */ - l = (i * 2654435769U) % ca->sb.njournal_buckets; + l = (i * 2654435769U) % ca->sb.njournal_buckets; - if (test_bit(l, bitmap)) - break; + if (test_bit(l, bitmap)) + break; - if (read_bucket(l)) - goto bsearch; - } + if (read_bucket(l)) + goto bsearch; + } - /* - * If that fails, check all the buckets we haven't checked - * already - */ - pr_debug("falling back to linear search\n"); + /* + * If that fails, check all the buckets we haven't checked + * already + */ + pr_debug("falling back to linear search\n"); - for_each_clear_bit(l, bitmap, ca->sb.njournal_buckets) - if (read_bucket(l)) - goto bsearch; + for_each_clear_bit(l, bitmap, ca->sb.njournal_buckets) + if (read_bucket(l)) + goto bsearch; - /* no journal entries on this device? */ - if (l == ca->sb.njournal_buckets) - continue; + /* no journal entries on this device? */ + if (l == ca->sb.njournal_buckets) + goto out; bsearch: - BUG_ON(list_empty(list)); + BUG_ON(list_empty(list)); - /* Binary search */ - m = l; - r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1); - pr_debug("starting binary search, l %u r %u\n", l, r); + /* Binary search */ + m = l; + r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1); + pr_debug("starting binary search, l %u r %u\n", l, r); - while (l + 1 < r) { - seq = list_entry(list->prev, struct journal_replay, - list)->j.seq; + while (l + 1 < r) { + seq = list_entry(list->prev, struct journal_replay, + list)->j.seq; - m = (l + r) >> 1; - read_bucket(m); + m = (l + r) >> 1; + read_bucket(m); - if (seq != list_entry(list->prev, struct journal_replay, - list)->j.seq) - l = m; - else - r = m; - } + if (seq != list_entry(list->prev, struct journal_replay, + list)->j.seq) + l = m; + else + r = m; + } - /* - * Read buckets in reverse order until we stop finding more - * journal entries - */ - pr_debug("finishing up: m %u njournal_buckets %u\n", - m, ca->sb.njournal_buckets); - l = m; + /* + * Read buckets in reverse order until we stop finding more + * journal entries + */ + pr_debug("finishing up: m %u njournal_buckets %u\n", + m, ca->sb.njournal_buckets); + l = m; - while (1) { - if (!l--) - l = ca->sb.njournal_buckets - 1; + while (1) { + if (!l--) + l = ca->sb.njournal_buckets - 1; - if (l == m) - break; + if (l == m) + break; - if (test_bit(l, bitmap)) - continue; + if (test_bit(l, bitmap)) + continue; - if (!read_bucket(l)) - break; - } + if (!read_bucket(l)) + break; + } - seq = 0; + seq = 0; - for (i = 0; i < ca->sb.njournal_buckets; i++) - if (ja->seq[i] > seq) { - seq = ja->seq[i]; - /* - * When journal_reclaim() goes to allocate for - * the first time, it'll use the bucket after - * ja->cur_idx - */ - ja->cur_idx = i; - ja->last_idx = ja->discard_idx = (i + 1) % - ca->sb.njournal_buckets; + for (i = 0; i < ca->sb.njournal_buckets; i++) + if (ja->seq[i] > seq) { + seq = ja->seq[i]; + /* + * When journal_reclaim() goes to allocate for + * the first time, it'll use the bucket after + * ja->cur_idx + */ + ja->cur_idx = i; + ja->last_idx = ja->discard_idx = (i + 1) % + ca->sb.njournal_buckets; - } - } + } +out: if (!list_empty(list)) c->journal.seq = list_entry(list->prev, struct journal_replay, @@ -342,12 +339,10 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list) static bool is_discard_enabled(struct cache_set *s) { - struct cache *ca; - unsigned int i; + struct cache *ca = s->cache; - for_each_cache(ca, s, i) - if (ca->discard) - return true; + if (ca->discard) + return true; return false; } @@ -633,9 +628,10 @@ static void do_journal_discard(struct cache *ca) static void journal_reclaim(struct cache_set *c) { struct bkey *k = &c->journal.key; - struct cache *ca; + struct cache *ca = c->cache; uint64_t last_seq; - unsigned int iter, n = 0; + unsigned int next; + struct journal_device *ja = &ca->journal; atomic_t p __maybe_unused; atomic_long_inc(&c->reclaim); @@ -647,46 +643,31 @@ static void journal_reclaim(struct cache_set *c) /* Update last_idx */ - for_each_cache(ca, c, iter) { - struct journal_device *ja = &ca->journal; - - while (ja->last_idx != ja->cur_idx && - ja->seq[ja->last_idx] < last_seq) - ja->last_idx = (ja->last_idx + 1) % - ca->sb.njournal_buckets; - } + while (ja->last_idx != ja->cur_idx && + ja->seq[ja->last_idx] < last_seq) + ja->last_idx = (ja->last_idx + 1) % + ca->sb.njournal_buckets; - for_each_cache(ca, c, iter) - do_journal_discard(ca); + do_journal_discard(ca); if (c->journal.blocks_free) goto out; - /* - * Allocate: - * XXX: Sort by free journal space - */ - - for_each_cache(ca, c, iter) { - struct journal_device *ja = &ca->journal; - unsigned int next = (ja->cur_idx + 1) % ca->sb.njournal_buckets; + next = (ja->cur_idx + 1) % ca->sb.njournal_buckets; + /* No space available on this device */ + if (next == ja->discard_idx) + goto out; - /* No space available on this device */ - if (next == ja->discard_idx) - continue; + ja->cur_idx = next; + k->ptr[0] = MAKE_PTR(0, + bucket_to_sector(c, ca->sb.d[ja->cur_idx]), + ca->sb.nr_this_dev); + atomic_long_inc(&c->reclaimed_journal_buckets); - ja->cur_idx = next; - k->ptr[n++] = MAKE_PTR(0, - bucket_to_sector(c, ca->sb.d[ja->cur_idx]), - ca->sb.nr_this_dev); - atomic_long_inc(&c->reclaimed_journal_buckets); - } + bkey_init(k); + SET_KEY_PTRS(k, 1); + c->journal.blocks_free = ca->sb.bucket_size >> c->block_bits; - if (n) { - bkey_init(k); - SET_KEY_PTRS(k, n); - c->journal.blocks_free = c->sb.bucket_size >> c->block_bits; - } out: if (!journal_full(&c->journal)) __closure_wake_up(&c->journal.wait); @@ -750,11 +731,11 @@ static void journal_write_unlocked(struct closure *cl) __releases(c->journal.lock) { struct cache_set *c = container_of(cl, struct cache_set, journal.io); - struct cache *ca; + struct cache *ca = c->cache; struct journal_write *w = c->journal.cur; struct bkey *k = &c->journal.key; - unsigned int i, sectors = set_blocks(w->data, block_bytes(c)) * - c->sb.block_size; + unsigned int i, sectors = set_blocks(w->data, block_bytes(ca)) * + ca->sb.block_size; struct bio *bio; struct bio_list list; @@ -773,17 +754,15 @@ static void journal_write_unlocked(struct closure *cl) return; } - c->journal.blocks_free -= set_blocks(w->data, block_bytes(c)); + c->journal.blocks_free -= set_blocks(w->data, block_bytes(ca)); w->data->btree_level = c->root->level; bkey_copy(&w->data->btree_root, &c->root->key); bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket); - for_each_cache(ca, c, i) - w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0]; - - w->data->magic = jset_magic(&c->sb); + w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0]; + w->data->magic = jset_magic(&ca->sb); w->data->version = BCACHE_JSET_VERSION; w->data->last_seq = last_seq(&c->journal); w->data->csum = csum_set(w->data); @@ -859,6 +838,7 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c, size_t sectors; struct closure cl; bool wait = false; + struct cache *ca = c->cache; closure_init_stack(&cl); @@ -868,10 +848,10 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c, struct journal_write *w = c->journal.cur; sectors = __set_blocks(w->data, w->data->keys + nkeys, - block_bytes(c)) * c->sb.block_size; + block_bytes(ca)) * ca->sb.block_size; if (sectors <= min_t(size_t, - c->journal.blocks_free * c->sb.block_size, + c->journal.blocks_free * ca->sb.block_size, PAGE_SECTORS << JSET_BITS)) return w; @@ -936,7 +916,7 @@ atomic_t *bch_journal(struct cache_set *c, if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags))) return NULL; - if (!CACHE_SYNC(&c->sb)) + if (!CACHE_SYNC(&c->cache->sb)) return NULL; w = journal_wait_for_write(c, bch_keylist_nkeys(keys)); diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 5872d6470470..b9c3d27ec093 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -196,50 +196,48 @@ static unsigned int bucket_heap_top(struct cache *ca) void bch_moving_gc(struct cache_set *c) { - struct cache *ca; + struct cache *ca = c->cache; struct bucket *b; - unsigned int i; + unsigned long sectors_to_move, reserve_sectors; if (!c->copy_gc_enabled) return; mutex_lock(&c->bucket_lock); - for_each_cache(ca, c, i) { - unsigned long sectors_to_move = 0; - unsigned long reserve_sectors = ca->sb.bucket_size * + sectors_to_move = 0; + reserve_sectors = ca->sb.bucket_size * fifo_used(&ca->free[RESERVE_MOVINGGC]); - ca->heap.used = 0; - - for_each_bucket(b, ca) { - if (GC_MARK(b) == GC_MARK_METADATA || - !GC_SECTORS_USED(b) || - GC_SECTORS_USED(b) == ca->sb.bucket_size || - atomic_read(&b->pin)) - continue; - - if (!heap_full(&ca->heap)) { - sectors_to_move += GC_SECTORS_USED(b); - heap_add(&ca->heap, b, bucket_cmp); - } else if (bucket_cmp(b, heap_peek(&ca->heap))) { - sectors_to_move -= bucket_heap_top(ca); - sectors_to_move += GC_SECTORS_USED(b); - - ca->heap.data[0] = b; - heap_sift(&ca->heap, 0, bucket_cmp); - } - } + ca->heap.used = 0; + + for_each_bucket(b, ca) { + if (GC_MARK(b) == GC_MARK_METADATA || + !GC_SECTORS_USED(b) || + GC_SECTORS_USED(b) == ca->sb.bucket_size || + atomic_read(&b->pin)) + continue; - while (sectors_to_move > reserve_sectors) { - heap_pop(&ca->heap, b, bucket_cmp); - sectors_to_move -= GC_SECTORS_USED(b); + if (!heap_full(&ca->heap)) { + sectors_to_move += GC_SECTORS_USED(b); + heap_add(&ca->heap, b, bucket_cmp); + } else if (bucket_cmp(b, heap_peek(&ca->heap))) { + sectors_to_move -= bucket_heap_top(ca); + sectors_to_move += GC_SECTORS_USED(b); + + ca->heap.data[0] = b; + heap_sift(&ca->heap, 0, bucket_cmp); } + } - while (heap_pop(&ca->heap, b, bucket_cmp)) - SET_GC_MOVE(b, 1); + while (sectors_to_move > reserve_sectors) { + heap_pop(&ca->heap, b, bucket_cmp); + sectors_to_move -= GC_SECTORS_USED(b); } + while (heap_pop(&ca->heap, b, bucket_cmp)) + SET_GC_MOVE(b, 1); + mutex_unlock(&c->bucket_lock); c->moving_gc_keys.last_scanned = ZERO_KEY; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 7f54ae223644..214326383145 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -99,7 +99,7 @@ static int bch_keylist_realloc(struct keylist *l, unsigned int u64s, * bch_data_insert_keys() will insert the keys created so far * and finish the rest when the keylist is empty. */ - if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset)) + if (newsize * sizeof(uint64_t) > block_bytes(c->cache) - sizeof(struct jset)) return -ENOMEM; return __bch_keylist_realloc(l, u64s); @@ -394,8 +394,8 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) goto skip; } - if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) || - bio_sectors(bio) & (c->sb.block_size - 1)) { + if (bio->bi_iter.bi_sector & (c->cache->sb.block_size - 1) || + bio_sectors(bio) & (c->cache->sb.block_size - 1)) { pr_debug("skipping unaligned io\n"); goto skip; } diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 6bfa77167362..46a00134a36a 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -343,34 +343,25 @@ static void bcache_write_super_unlock(struct closure *cl) void bcache_write_super(struct cache_set *c) { struct closure *cl = &c->sb_write; - struct cache *ca; - unsigned int i, version = BCACHE_SB_VERSION_CDEV_WITH_UUID; + struct cache *ca = c->cache; + struct bio *bio = &ca->sb_bio; + unsigned int version = BCACHE_SB_VERSION_CDEV_WITH_UUID; down(&c->sb_write_mutex); closure_init(cl, &c->cl); - c->sb.seq++; - - if (c->sb.version > version) - version = c->sb.version; - - for_each_cache(ca, c, i) { - struct bio *bio = &ca->sb_bio; - - ca->sb.version = version; - ca->sb.seq = c->sb.seq; - ca->sb.last_mount = c->sb.last_mount; + ca->sb.seq++; - SET_CACHE_SYNC(&ca->sb, CACHE_SYNC(&c->sb)); + if (ca->sb.version < version) + ca->sb.version = version; - bio_init(bio, ca->sb_bv, 1); - bio_set_dev(bio, ca->bdev); - bio->bi_end_io = write_super_endio; - bio->bi_private = ca; + bio_init(bio, ca->sb_bv, 1); + bio_set_dev(bio, ca->bdev); + bio->bi_end_io = write_super_endio; + bio->bi_private = ca; - closure_get(cl); - __write_super(&ca->sb, ca->sb_disk, bio); - } + closure_get(cl); + __write_super(&ca->sb, ca->sb_disk, bio); closure_return_with_destructor(cl, bcache_write_super_unlock); } @@ -480,22 +471,21 @@ static int __uuid_write(struct cache_set *c) { BKEY_PADDED(key) k; struct closure cl; - struct cache *ca; + struct cache *ca = c->cache; unsigned int size; closure_init_stack(&cl); lockdep_assert_held(&bch_register_lock); - if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true)) + if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, true)) return 1; - size = meta_bucket_pages(&c->sb) * PAGE_SECTORS; + size = meta_bucket_pages(&ca->sb) * PAGE_SECTORS; SET_KEY_SIZE(&k.key, size); uuid_io(c, REQ_OP_WRITE, 0, &k.key, &cl); closure_sync(&cl); /* Only one bucket used for uuid write */ - ca = PTR_CACHE(c, &k.key, 0); atomic_long_add(ca->sb.bucket_size, &ca->meta_sectors_written); bkey_copy(&c->uuid_bucket, &k.key); @@ -772,26 +762,22 @@ static void bcache_device_unlink(struct bcache_device *d) lockdep_assert_held(&bch_register_lock); if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) { - unsigned int i; - struct cache *ca; + struct cache *ca = d->c->cache; sysfs_remove_link(&d->c->kobj, d->name); sysfs_remove_link(&d->kobj, "cache"); - for_each_cache(ca, d->c, i) - bd_unlink_disk_holder(ca->bdev, d->disk); + bd_unlink_disk_holder(ca->bdev, d->disk); } } static void bcache_device_link(struct bcache_device *d, struct cache_set *c, const char *name) { - unsigned int i; - struct cache *ca; + struct cache *ca = c->cache; int ret; - for_each_cache(ca, d->c, i) - bd_link_disk_holder(ca->bdev, d->disk); + bd_link_disk_holder(ca->bdev, d->disk); snprintf(d->name, BCACHEDEVNAME_SIZE, "%s%u", name, d->id); @@ -1196,8 +1182,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, struct cached_dev *exist_dc, *t; int ret = 0; - if ((set_uuid && memcmp(set_uuid, c->sb.set_uuid, 16)) || - (!set_uuid && memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16))) + if ((set_uuid && memcmp(set_uuid, c->set_uuid, 16)) || + (!set_uuid && memcmp(dc->sb.set_uuid, c->set_uuid, 16))) return -ENOENT; if (dc->disk.c) { @@ -1212,7 +1198,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, return -EINVAL; } - if (dc->sb.block_size < c->sb.block_size) { + if (dc->sb.block_size < c->cache->sb.block_size) { /* Will die */ pr_err("Couldn't attach %s: block size less than set's block size\n", dc->backing_dev_name); @@ -1269,7 +1255,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, u->first_reg = u->last_reg = rtime; bch_uuid_write(c); - memcpy(dc->sb.set_uuid, c->sb.set_uuid, 16); + memcpy(dc->sb.set_uuid, c->set_uuid, 16); SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); bch_write_bdev_super(dc, &cl); @@ -1331,7 +1317,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, pr_info("Caching %s as %s on set %pU\n", dc->backing_dev_name, dc->disk.disk->disk_name, - dc->disk.c->sb.set_uuid); + dc->disk.c->set_uuid); return 0; } @@ -1534,7 +1520,7 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u) kobject_init(&d->kobj, &bch_flash_dev_ktype); - if (bcache_device_init(d, block_bytes(c), u->sectors, + if (bcache_device_init(d, block_bytes(c->cache), u->sectors, NULL, &bcache_flash_ops)) goto err; @@ -1638,7 +1624,7 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...) vaf.va = &args; pr_err("error on %pU: %pV, disabling caching\n", - c->sb.set_uuid, &vaf); + c->set_uuid, &vaf); va_end(args); @@ -1662,7 +1648,6 @@ static void cache_set_free(struct closure *cl) { struct cache_set *c = container_of(cl, struct cache_set, cl); struct cache *ca; - unsigned int i; debugfs_remove(c->debug); @@ -1671,15 +1656,16 @@ static void cache_set_free(struct closure *cl) bch_journal_free(c); mutex_lock(&bch_register_lock); - for_each_cache(ca, c, i) - if (ca) { - ca->set = NULL; - c->cache[ca->sb.nr_this_dev] = NULL; - kobject_put(&ca->kobj); - } - bch_bset_sort_state_free(&c->sort); - free_pages((unsigned long) c->uuids, ilog2(meta_bucket_pages(&c->sb))); + free_pages((unsigned long) c->uuids, ilog2(meta_bucket_pages(&c->cache->sb))); + + ca = c->cache; + if (ca) { + ca->set = NULL; + c->cache = NULL; + kobject_put(&ca->kobj); + } + if (c->moving_gc_wq) destroy_workqueue(c->moving_gc_wq); @@ -1692,7 +1678,7 @@ static void cache_set_free(struct closure *cl) list_del(&c->list); mutex_unlock(&bch_register_lock); - pr_info("Cache set %pU unregistered\n", c->sb.set_uuid); + pr_info("Cache set %pU unregistered\n", c->set_uuid); wake_up(&unregister_wait); closure_debug_destroy(&c->cl); @@ -1702,9 +1688,8 @@ static void cache_set_free(struct closure *cl) static void cache_set_flush(struct closure *cl) { struct cache_set *c = container_of(cl, struct cache_set, caching); - struct cache *ca; + struct cache *ca = c->cache; struct btree *b; - unsigned int i; bch_cache_accounting_destroy(&c->accounting); @@ -1729,9 +1714,8 @@ static void cache_set_flush(struct closure *cl) mutex_unlock(&b->write_lock); } - for_each_cache(ca, c, i) - if (ca->alloc_thread) - kthread_stop(ca->alloc_thread); + if (ca->alloc_thread) + kthread_stop(ca->alloc_thread); if (c->journal.cur) { cancel_delayed_work_sync(&c->journal.work); @@ -1764,7 +1748,7 @@ static void conditional_stop_bcache_device(struct cache_set *c, { if (dc->stop_when_cache_set_failed == BCH_CACHED_DEV_STOP_ALWAYS) { pr_warn("stop_when_cache_set_failed of %s is \"always\", stop it for failed cache set %pU.\n", - d->disk->disk_name, c->sb.set_uuid); + d->disk->disk_name, c->set_uuid); bcache_device_stop(d); } else if (atomic_read(&dc->has_dirty)) { /* @@ -1841,15 +1825,13 @@ void bch_cache_set_unregister(struct cache_set *c) bch_cache_set_stop(c); } -#define alloc_bucket_pages(gfp, c) \ - ((void *) __get_free_pages(__GFP_ZERO|__GFP_COMP|gfp, ilog2(bucket_pages(c)))) - #define alloc_meta_bucket_pages(gfp, sb) \ ((void *) __get_free_pages(__GFP_ZERO|__GFP_COMP|gfp, ilog2(meta_bucket_pages(sb)))) struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) { int iter_size; + struct cache *ca = container_of(sb, struct cache, sb); struct cache_set *c = kzalloc(sizeof(struct cache_set), GFP_KERNEL); if (!c) @@ -1871,24 +1853,16 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) bch_cache_accounting_init(&c->accounting, &c->cl); - memcpy(c->sb.set_uuid, sb->set_uuid, 16); - c->sb.block_size = sb->block_size; - c->sb.bucket_size = sb->bucket_size; - c->sb.nr_in_set = sb->nr_in_set; - c->sb.last_mount = sb->last_mount; - c->sb.version = sb->version; - if (c->sb.version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES) { - c->sb.feature_compat = sb->feature_compat; - c->sb.feature_ro_compat = sb->feature_ro_compat; - c->sb.feature_incompat = sb->feature_incompat; - } + memcpy(c->set_uuid, sb->set_uuid, 16); + c->cache = ca; + c->cache->set = c; c->bucket_bits = ilog2(sb->bucket_size); c->block_bits = ilog2(sb->block_size); - c->nr_uuids = meta_bucket_bytes(&c->sb) / sizeof(struct uuid_entry); + c->nr_uuids = meta_bucket_bytes(sb) / sizeof(struct uuid_entry); c->devices_max_used = 0; atomic_set(&c->attached_dev_nr, 0); - c->btree_pages = meta_bucket_pages(&c->sb); + c->btree_pages = meta_bucket_pages(sb); if (c->btree_pages > BTREE_MAX_PAGES) c->btree_pages = max_t(int, c->btree_pages / 4, BTREE_MAX_PAGES); @@ -1926,7 +1900,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) if (mempool_init_kmalloc_pool(&c->bio_meta, 2, sizeof(struct bbio) + - sizeof(struct bio_vec) * meta_bucket_pages(&c->sb))) + sizeof(struct bio_vec) * meta_bucket_pages(sb))) goto err; if (mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size)) @@ -1936,7 +1910,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER)) goto err; - c->uuids = alloc_meta_bucket_pages(GFP_KERNEL, &c->sb); + c->uuids = alloc_meta_bucket_pages(GFP_KERNEL, sb); if (!c->uuids) goto err; @@ -1972,19 +1946,17 @@ static int run_cache_set(struct cache_set *c) { const char *err = "cannot allocate memory"; struct cached_dev *dc, *t; - struct cache *ca; + struct cache *ca = c->cache; struct closure cl; - unsigned int i; LIST_HEAD(journal); struct journal_replay *l; closure_init_stack(&cl); - for_each_cache(ca, c, i) - c->nbuckets += ca->sb.nbuckets; + c->nbuckets = ca->sb.nbuckets; set_gc_sectors(c); - if (CACHE_SYNC(&c->sb)) { + if (CACHE_SYNC(&c->cache->sb)) { struct bkey *k; struct jset *j; @@ -2001,10 +1973,8 @@ static int run_cache_set(struct cache_set *c) j = &list_entry(journal.prev, struct journal_replay, list)->j; err = "IO error reading priorities"; - for_each_cache(ca, c, i) { - if (prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev])) - goto err; - } + if (prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev])) + goto err; /* * If prio_read() fails it'll call cache_set_error and we'll @@ -2048,9 +2018,8 @@ static int run_cache_set(struct cache_set *c) bch_journal_next(&c->journal); err = "error starting allocator thread"; - for_each_cache(ca, c, i) - if (bch_cache_allocator_start(ca)) - goto err; + if (bch_cache_allocator_start(ca)) + goto err; /* * First place it's safe to allocate: btree_check() and @@ -2069,28 +2038,23 @@ static int run_cache_set(struct cache_set *c) if (bch_journal_replay(c, &journal)) goto err; } else { - pr_notice("invalidating existing data\n"); + unsigned int j; - for_each_cache(ca, c, i) { - unsigned int j; + pr_notice("invalidating existing data\n"); + ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7, + 2, SB_JOURNAL_BUCKETS); - ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7, - 2, SB_JOURNAL_BUCKETS); - - for (j = 0; j < ca->sb.keys; j++) - ca->sb.d[j] = ca->sb.first_bucket + j; - } + for (j = 0; j < ca->sb.keys; j++) + ca->sb.d[j] = ca->sb.first_bucket + j; bch_initial_gc_finish(c); err = "error starting allocator thread"; - for_each_cache(ca, c, i) - if (bch_cache_allocator_start(ca)) - goto err; + if (bch_cache_allocator_start(ca)) + goto err; mutex_lock(&c->bucket_lock); - for_each_cache(ca, c, i) - bch_prio_write(ca, true); + bch_prio_write(ca, true); mutex_unlock(&c->bucket_lock); err = "cannot allocate new UUID bucket"; @@ -2115,7 +2079,7 @@ static int run_cache_set(struct cache_set *c) * everything is set up - fortunately journal entries won't be * written until the SET_CACHE_SYNC() here: */ - SET_CACHE_SYNC(&c->sb, true); + SET_CACHE_SYNC(&c->cache->sb, true); bch_journal_next(&c->journal); bch_journal_meta(c, &cl); @@ -2126,7 +2090,7 @@ static int run_cache_set(struct cache_set *c) goto err; closure_sync(&cl); - c->sb.last_mount = (u32)ktime_get_real_seconds(); + c->cache->sb.last_mount = (u32)ktime_get_real_seconds(); bcache_write_super(c); list_for_each_entry_safe(dc, t, &uncached_devices, list) @@ -2150,13 +2114,6 @@ err: return -EIO; } -static bool can_attach_cache(struct cache *ca, struct cache_set *c) -{ - return ca->sb.block_size == c->sb.block_size && - ca->sb.bucket_size == c->sb.bucket_size && - ca->sb.nr_in_set == c->sb.nr_in_set; -} - static const char *register_cache_set(struct cache *ca) { char buf[12]; @@ -2164,16 +2121,10 @@ static const char *register_cache_set(struct cache *ca) struct cache_set *c; list_for_each_entry(c, &bch_cache_sets, list) - if (!memcmp(c->sb.set_uuid, ca->sb.set_uuid, 16)) { - if (c->cache[ca->sb.nr_this_dev]) + if (!memcmp(c->set_uuid, ca->sb.set_uuid, 16)) { + if (c->cache) return "duplicate cache set member"; - if (!can_attach_cache(ca, c)) - return "cache sb does not match set"; - - if (!CACHE_SYNC(&ca->sb)) - SET_CACHE_SYNC(&c->sb, false); - goto found; } @@ -2182,7 +2133,7 @@ static const char *register_cache_set(struct cache *ca) return err; err = "error creating kobject"; - if (kobject_add(&c->kobj, bcache_kobj, "%pU", c->sb.set_uuid) || + if (kobject_add(&c->kobj, bcache_kobj, "%pU", c->set_uuid) || kobject_add(&c->internal, &c->kobj, "internal")) goto err; @@ -2198,31 +2149,13 @@ found: sysfs_create_link(&c->kobj, &ca->kobj, buf)) goto err; - /* - * A special case is both ca->sb.seq and c->sb.seq are 0, - * such condition happens on a new created cache device whose - * super block is never flushed yet. In this case c->sb.version - * and other members should be updated too, otherwise we will - * have a mistaken super block version in cache set. - */ - if (ca->sb.seq > c->sb.seq || c->sb.seq == 0) { - c->sb.version = ca->sb.version; - memcpy(c->sb.set_uuid, ca->sb.set_uuid, 16); - c->sb.flags = ca->sb.flags; - c->sb.seq = ca->sb.seq; - pr_debug("set version = %llu\n", c->sb.version); - } - kobject_get(&ca->kobj); ca->set = c; - ca->set->cache[ca->sb.nr_this_dev] = ca; - c->cache_by_alloc[c->caches_loaded++] = ca; + ca->set->cache = ca; - if (c->caches_loaded == c->sb.nr_in_set) { - err = "failed to run cache set"; - if (run_cache_set(c) < 0) - goto err; - } + err = "failed to run cache set"; + if (run_cache_set(c) < 0) + goto err; return NULL; err: @@ -2239,8 +2172,8 @@ void bch_cache_release(struct kobject *kobj) unsigned int i; if (ca->set) { - BUG_ON(ca->set->cache[ca->sb.nr_this_dev] != ca); - ca->set->cache[ca->sb.nr_this_dev] = NULL; + BUG_ON(ca->set->cache != ca); + ca->set->cache = NULL; } free_pages((unsigned long) ca->disk_buckets, ilog2(meta_bucket_pages(&ca->sb))); @@ -2448,7 +2381,6 @@ static ssize_t bch_pending_bdevs_cleanup(struct kobject *k, kobj_attribute_write(register, register_bcache); kobj_attribute_write(register_quiet, register_bcache); -kobj_attribute_write(register_async, register_bcache); kobj_attribute_write(pendings_cleanup, bch_pending_bdevs_cleanup); static bool bch_is_open_backing(struct block_device *bdev) @@ -2469,13 +2401,14 @@ static bool bch_is_open_backing(struct block_device *bdev) static bool bch_is_open_cache(struct block_device *bdev) { struct cache_set *c, *tc; - struct cache *ca; - unsigned int i; - list_for_each_entry_safe(c, tc, &bch_cache_sets, list) - for_each_cache(ca, c, i) - if (ca->bdev == bdev) - return true; + list_for_each_entry_safe(c, tc, &bch_cache_sets, list) { + struct cache *ca = c->cache; + + if (ca->bdev == bdev) + return true; + } + return false; } @@ -2571,6 +2504,11 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, struct cache_sb_disk *sb_disk; struct block_device *bdev; ssize_t ret; + bool async_registration = false; + +#ifdef CONFIG_BCACHE_ASYNC_REGISTRATION + async_registration = true; +#endif ret = -EBUSY; err = "failed to reference bcache module"; @@ -2624,7 +2562,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, goto out_blkdev_put; err = "failed to register device"; - if (attr == &ksysfs_register_async) { + + if (async_registration) { /* register in asynchronous way */ struct async_reg_args *args = kzalloc(sizeof(struct async_reg_args), GFP_KERNEL); @@ -2719,7 +2658,7 @@ static ssize_t bch_pending_bdevs_cleanup(struct kobject *k, list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) { list_for_each_entry_safe(c, tc, &bch_cache_sets, list) { char *pdev_set_uuid = pdev->dc->sb.set_uuid; - char *set_uuid = c->sb.uuid; + char *set_uuid = c->set_uuid; if (!memcmp(pdev_set_uuid, set_uuid, 16)) { list_del(&pdev->list); @@ -2887,9 +2826,6 @@ static int __init bcache_init(void) static const struct attribute *files[] = { &ksysfs_register.attr, &ksysfs_register_quiet.attr, -#ifdef CONFIG_BCACHE_ASYNC_REGISTRATION - &ksysfs_register_async.attr, -#endif &ksysfs_pendings_cleanup.attr, NULL }; diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index ac06c0bc3c0a..554e3afc9b68 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -711,10 +711,10 @@ SHOW(__bch_cache_set) { struct cache_set *c = container_of(kobj, struct cache_set, kobj); - sysfs_print(synchronous, CACHE_SYNC(&c->sb)); + sysfs_print(synchronous, CACHE_SYNC(&c->cache->sb)); sysfs_print(journal_delay_ms, c->journal_delay_ms); - sysfs_hprint(bucket_size, bucket_bytes(c)); - sysfs_hprint(block_size, block_bytes(c)); + sysfs_hprint(bucket_size, bucket_bytes(c->cache)); + sysfs_hprint(block_size, block_bytes(c->cache)); sysfs_print(tree_depth, c->root->level); sysfs_print(root_usage_percent, bch_root_usage(c)); @@ -812,8 +812,8 @@ STORE(__bch_cache_set) if (attr == &sysfs_synchronous) { bool sync = strtoul_or_return(buf); - if (sync != CACHE_SYNC(&c->sb)) { - SET_CACHE_SYNC(&c->sb, sync); + if (sync != CACHE_SYNC(&c->cache->sb)) { + SET_CACHE_SYNC(&c->cache->sb, sync); bcache_write_super(c); } } diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 4f4ad6b3d43a..3c74996978da 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -35,7 +35,7 @@ static uint64_t __calc_target_rate(struct cached_dev *dc) * This is the size of the cache, minus the amount used for * flash-only devices */ - uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size - + uint64_t cache_sectors = c->nbuckets * c->cache->sb.bucket_size - atomic_long_read(&c->flash_dev_dirty_sectors); /* diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index b10c51988c8e..200c5d0f08bf 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -357,11 +357,12 @@ static int read_page(struct file *file, unsigned long index, struct inode *inode = file_inode(file); struct buffer_head *bh; sector_t block, blk_cur; + unsigned long blocksize = i_blocksize(inode); pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE, (unsigned long long)index << PAGE_SHIFT); - bh = alloc_page_buffers(page, 1<<inode->i_blkbits, false); + bh = alloc_page_buffers(page, blocksize, false); if (!bh) { ret = -ENOMEM; goto out; @@ -383,10 +384,10 @@ static int read_page(struct file *file, unsigned long index, bh->b_blocknr = block; bh->b_bdev = inode->i_sb->s_bdev; - if (count < (1<<inode->i_blkbits)) + if (count < blocksize) count = 0; else - count -= (1<<inode->i_blkbits); + count -= blocksize; bh->b_end_io = end_bitmap_write; bh->b_private = bitmap; @@ -605,8 +606,8 @@ re_read: if (bitmap->cluster_slot >= 0) { sector_t bm_blocks = bitmap->mddev->resync_max_sectors; - sector_div(bm_blocks, - bitmap->mddev->bitmap_info.chunksize >> 9); + bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, + (bitmap->mddev->bitmap_info.chunksize >> 9)); /* bits to bytes */ bm_blocks = ((bm_blocks+7) >> 3) + sizeof(bitmap_super_t); /* to 4k blocks */ @@ -1367,7 +1368,7 @@ __acquires(bitmap->lock) if (bitmap->bp[page].hijacked || bitmap->bp[page].map == NULL) csize = ((sector_t)1) << (bitmap->chunkshift + - PAGE_COUNTER_SHIFT - 1); + PAGE_COUNTER_SHIFT); else csize = ((sector_t)1) << bitmap->chunkshift; *blocks = csize - (offset & (csize - 1)); @@ -1949,6 +1950,7 @@ out: } EXPORT_SYMBOL_GPL(md_bitmap_load); +/* caller need to free returned bitmap with md_bitmap_free() */ struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot) { int rv = 0; @@ -2012,6 +2014,7 @@ int md_bitmap_copy_from_slot(struct mddev *mddev, int slot, md_bitmap_unplug(mddev->bitmap); *low = lo; *high = hi; + md_bitmap_free(bitmap); return rv; } @@ -2615,4 +2618,3 @@ struct attribute_group md_bitmap_group = { .name = "bitmap", .attrs = md_bitmap_attrs, }; - diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 0580b51a156a..4aaf4820b6f6 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -1166,6 +1166,7 @@ static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsiz * can't resize bitmap */ goto out; + md_bitmap_free(bitmap); } return 0; diff --git a/drivers/md/md.c b/drivers/md/md.c index de8419b7ae98..98bac4f304ae 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8582,6 +8582,26 @@ void md_write_end(struct mddev *mddev) EXPORT_SYMBOL(md_write_end); +/* This is used by raid0 and raid10 */ +void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, + struct bio *bio, sector_t start, sector_t size) +{ + struct bio *discard_bio = NULL; + + if (__blkdev_issue_discard(rdev->bdev, start, size, + GFP_NOIO, 0, &discard_bio) || !discard_bio) + return; + + bio_chain(discard_bio, bio); + bio_clone_blkg_association(discard_bio, bio); + if (mddev->gendisk) + trace_block_bio_remap(bdev_get_queue(rdev->bdev), + discard_bio, disk_devt(mddev->gendisk), + bio->bi_iter.bi_sector); + submit_bio_noacct(discard_bio); +} +EXPORT_SYMBOL(md_submit_discard_bio); + /* md_allow_write(mddev) * Calling this ensures that the array is marked 'active' so that writes * may proceed without blocking. It is important to call this before @@ -9544,7 +9564,7 @@ static int __init md_init(void) goto err_misc_wq; md_rdev_misc_wq = alloc_workqueue("md_rdev_misc", 0, 0); - if (!md_misc_wq) + if (!md_rdev_misc_wq) goto err_rdev_misc_wq; if ((ret = register_blkdev(MD_MAJOR, "md")) < 0) diff --git a/drivers/md/md.h b/drivers/md/md.h index 2175a5ac4f7c..ccfb69868c2e 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -713,6 +713,8 @@ extern void md_write_end(struct mddev *mddev); extern void md_done_sync(struct mddev *mddev, int blocks, int ok); extern void md_error(struct mddev *mddev, struct md_rdev *rdev); extern void md_finish_reshape(struct mddev *mddev); +extern void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, + struct bio *bio, sector_t start, sector_t size); extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio); extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index aa2d72791768..6f44177593a5 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -426,23 +426,6 @@ static void raid0_free(struct mddev *mddev, void *priv) kfree(conf); } -/* - * Is io distribute over 1 or more chunks ? -*/ -static inline int is_io_in_chunk_boundary(struct mddev *mddev, - unsigned int chunk_sects, struct bio *bio) -{ - if (likely(is_power_of_2(chunk_sects))) { - return chunk_sects >= - ((bio->bi_iter.bi_sector & (chunk_sects-1)) - + bio_sectors(bio)); - } else{ - sector_t sector = bio->bi_iter.bi_sector; - return chunk_sects >= (sector_div(sector, chunk_sects) - + bio_sectors(bio)); - } -} - static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) { struct r0conf *conf = mddev->private; @@ -494,7 +477,6 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) for (disk = 0; disk < zone->nb_dev; disk++) { sector_t dev_start, dev_end; - struct bio *discard_bio = NULL; struct md_rdev *rdev; if (disk < start_disk_index) @@ -517,18 +499,9 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) rdev = conf->devlist[(zone - conf->strip_zone) * conf->strip_zone[0].nb_dev + disk]; - if (__blkdev_issue_discard(rdev->bdev, + md_submit_discard_bio(mddev, rdev, bio, dev_start + zone->dev_start + rdev->data_offset, - dev_end - dev_start, GFP_NOIO, 0, &discard_bio) || - !discard_bio) - continue; - bio_chain(discard_bio, bio); - bio_clone_blkg_association(discard_bio, bio); - if (mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(rdev->bdev), - discard_bio, disk_devt(mddev->gendisk), - bio->bi_iter.bi_sector); - submit_bio_noacct(discard_bio); + dev_end - dev_start); } bio_endio(bio); } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 5d1bdee313ec..b7bca6703df8 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -91,7 +91,7 @@ static inline struct r10bio *get_resync_r10bio(struct bio *bio) static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) { struct r10conf *conf = data; - int size = offsetof(struct r10bio, devs[conf->copies]); + int size = offsetof(struct r10bio, devs[conf->geo.raid_disks]); /* allocate a r10bio with room for raid_disks entries in the * bios array */ @@ -238,7 +238,7 @@ static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio) { int i; - for (i = 0; i < conf->copies; i++) { + for (i = 0; i < conf->geo.raid_disks; i++) { struct bio **bio = & r10_bio->devs[i].bio; if (!BIO_SPECIAL(*bio)) bio_put(*bio); @@ -327,7 +327,7 @@ static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio, int slot; int repl = 0; - for (slot = 0; slot < conf->copies; slot++) { + for (slot = 0; slot < conf->geo.raid_disks; slot++) { if (r10_bio->devs[slot].bio == bio) break; if (r10_bio->devs[slot].repl_bio == bio) { @@ -336,7 +336,6 @@ static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio, } } - BUG_ON(slot == conf->copies); update_head_pos(slot, r10_bio); if (slotp) @@ -1276,12 +1275,75 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, } } +static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio) +{ + int i; + struct r10conf *conf = mddev->private; + struct md_rdev *blocked_rdev; + +retry_wait: + blocked_rdev = NULL; + rcu_read_lock(); + for (i = 0; i < conf->copies; i++) { + struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); + struct md_rdev *rrdev = rcu_dereference( + conf->mirrors[i].replacement); + if (rdev == rrdev) + rrdev = NULL; + if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { + atomic_inc(&rdev->nr_pending); + blocked_rdev = rdev; + break; + } + if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) { + atomic_inc(&rrdev->nr_pending); + blocked_rdev = rrdev; + break; + } + + if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { + sector_t first_bad; + sector_t dev_sector = r10_bio->devs[i].addr; + int bad_sectors; + int is_bad; + + /* Discard request doesn't care the write result + * so it doesn't need to wait blocked disk here. + */ + if (!r10_bio->sectors) + continue; + + is_bad = is_badblock(rdev, dev_sector, r10_bio->sectors, + &first_bad, &bad_sectors); + if (is_bad < 0) { + /* Mustn't write here until the bad block + * is acknowledged + */ + atomic_inc(&rdev->nr_pending); + set_bit(BlockedBadBlocks, &rdev->flags); + blocked_rdev = rdev; + break; + } + } + } + rcu_read_unlock(); + + if (unlikely(blocked_rdev)) { + /* Have to wait for this device to get unblocked, then retry */ + allow_barrier(conf); + raid10_log(conf->mddev, "%s wait rdev %d blocked", + __func__, blocked_rdev->raid_disk); + md_wait_for_blocked_rdev(blocked_rdev, mddev); + wait_barrier(conf); + goto retry_wait; + } +} + static void raid10_write_request(struct mddev *mddev, struct bio *bio, struct r10bio *r10_bio) { struct r10conf *conf = mddev->private; int i; - struct md_rdev *blocked_rdev; sector_t sectors; int max_sectors; @@ -1339,8 +1401,9 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, r10_bio->read_slot = -1; /* make sure repl_bio gets freed */ raid10_find_phys(conf, r10_bio); -retry_write: - blocked_rdev = NULL; + + wait_blocked_dev(mddev, r10_bio); + rcu_read_lock(); max_sectors = r10_bio->sectors; @@ -1351,16 +1414,6 @@ retry_write: conf->mirrors[d].replacement); if (rdev == rrdev) rrdev = NULL; - if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { - atomic_inc(&rdev->nr_pending); - blocked_rdev = rdev; - break; - } - if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) { - atomic_inc(&rrdev->nr_pending); - blocked_rdev = rrdev; - break; - } if (rdev && (test_bit(Faulty, &rdev->flags))) rdev = NULL; if (rrdev && (test_bit(Faulty, &rrdev->flags))) @@ -1381,15 +1434,6 @@ retry_write: is_bad = is_badblock(rdev, dev_sector, max_sectors, &first_bad, &bad_sectors); - if (is_bad < 0) { - /* Mustn't write here until the bad block - * is acknowledged - */ - atomic_inc(&rdev->nr_pending); - set_bit(BlockedBadBlocks, &rdev->flags); - blocked_rdev = rdev; - break; - } if (is_bad && first_bad <= dev_sector) { /* Cannot write here at all */ bad_sectors -= (dev_sector - first_bad); @@ -1425,35 +1469,6 @@ retry_write: } rcu_read_unlock(); - if (unlikely(blocked_rdev)) { - /* Have to wait for this device to get unblocked, then retry */ - int j; - int d; - - for (j = 0; j < i; j++) { - if (r10_bio->devs[j].bio) { - d = r10_bio->devs[j].devnum; - rdev_dec_pending(conf->mirrors[d].rdev, mddev); - } - if (r10_bio->devs[j].repl_bio) { - struct md_rdev *rdev; - d = r10_bio->devs[j].devnum; - rdev = conf->mirrors[d].replacement; - if (!rdev) { - /* Race with remove_disk */ - smp_mb(); - rdev = conf->mirrors[d].rdev; - } - rdev_dec_pending(rdev, mddev); - } - } - allow_barrier(conf); - raid10_log(conf->mddev, "wait rdev %d blocked", blocked_rdev->raid_disk); - md_wait_for_blocked_rdev(blocked_rdev, mddev); - wait_barrier(conf); - goto retry_write; - } - if (max_sectors < r10_bio->sectors) r10_bio->sectors = max_sectors; @@ -1493,7 +1508,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors) r10_bio->mddev = mddev; r10_bio->sector = bio->bi_iter.bi_sector; r10_bio->state = 0; - memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->copies); + memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->geo.raid_disks); if (bio_data_dir(bio) == READ) raid10_read_request(mddev, bio, r10_bio); @@ -1501,6 +1516,296 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors) raid10_write_request(mddev, bio, r10_bio); } +static struct bio *raid10_split_bio(struct r10conf *conf, + struct bio *bio, sector_t sectors, bool want_first) +{ + struct bio *split; + + split = bio_split(bio, sectors, GFP_NOIO, &conf->bio_split); + bio_chain(split, bio); + allow_barrier(conf); + if (want_first) { + submit_bio_noacct(bio); + bio = split; + } else + submit_bio_noacct(split); + wait_barrier(conf); + + return bio; +} + +static void raid_end_discard_bio(struct r10bio *r10bio) +{ + struct r10conf *conf = r10bio->mddev->private; + struct r10bio *first_r10bio; + + while (atomic_dec_and_test(&r10bio->remaining)) { + + allow_barrier(conf); + + if (!test_bit(R10BIO_Discard, &r10bio->state)) { + first_r10bio = (struct r10bio *)r10bio->master_bio; + free_r10bio(r10bio); + r10bio = first_r10bio; + } else { + md_write_end(r10bio->mddev); + bio_endio(r10bio->master_bio); + free_r10bio(r10bio); + break; + } + } +} + +static void raid10_end_discard_request(struct bio *bio) +{ + struct r10bio *r10_bio = bio->bi_private; + struct r10conf *conf = r10_bio->mddev->private; + struct md_rdev *rdev = NULL; + int dev; + int slot, repl; + + /* + * We don't care the return value of discard bio + */ + if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) + set_bit(R10BIO_Uptodate, &r10_bio->state); + + dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl); + if (repl) + rdev = conf->mirrors[dev].replacement; + if (!rdev) { + /* raid10_remove_disk uses smp_mb to make sure rdev is set to + * replacement before setting replacement to NULL. It can read + * rdev first without barrier protect even replacment is NULL + */ + smp_rmb(); + rdev = conf->mirrors[dev].rdev; + } + + raid_end_discard_bio(r10_bio); + rdev_dec_pending(rdev, conf->mddev); +} + +/* There are some limitations to handle discard bio + * 1st, the discard size is bigger than stripe_size*2. + * 2st, if the discard bio spans reshape progress, we use the old way to + * handle discard bio + */ +static int raid10_handle_discard(struct mddev *mddev, struct bio *bio) +{ + struct r10conf *conf = mddev->private; + struct geom *geo = &conf->geo; + struct r10bio *r10_bio, *first_r10bio; + int far_copies = geo->far_copies; + bool first_copy = true; + + int disk; + sector_t chunk; + unsigned int stripe_size; + sector_t split_size; + + sector_t bio_start, bio_end; + sector_t first_stripe_index, last_stripe_index; + sector_t start_disk_offset; + unsigned int start_disk_index; + sector_t end_disk_offset; + unsigned int end_disk_index; + unsigned int remainder; + + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + return -EAGAIN; + + wait_barrier(conf); + + /* Check reshape again to avoid reshape happens after checking + * MD_RECOVERY_RESHAPE and before wait_barrier + */ + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + goto out; + + stripe_size = geo->raid_disks << geo->chunk_shift; + bio_start = bio->bi_iter.bi_sector; + bio_end = bio_end_sector(bio); + + /* Maybe one discard bio is smaller than strip size or across one stripe + * and discard region is larger than one stripe size. For far offset layout, + * if the discard region is not aligned with stripe size, there is hole + * when we submit discard bio to member disk. For simplicity, we only + * handle discard bio which discard region is bigger than stripe_size*2 + */ + if (bio_sectors(bio) < stripe_size*2) + goto out; + + /* For far and far offset layout, if bio is not aligned with stripe size, + * it splits the part that is not aligned with strip size. + */ + div_u64_rem(bio_start, stripe_size, &remainder); + if ((far_copies > 1) && remainder) { + split_size = stripe_size - remainder; + bio = raid10_split_bio(conf, bio, split_size, false); + } + div_u64_rem(bio_end, stripe_size, &remainder); + if ((far_copies > 1) && remainder) { + split_size = bio_sectors(bio) - remainder; + bio = raid10_split_bio(conf, bio, split_size, true); + } + + bio_start = bio->bi_iter.bi_sector; + bio_end = bio_end_sector(bio); + + /* raid10 uses chunk as the unit to store data. It's similar like raid0. + * One stripe contains the chunks from all member disk (one chunk from + * one disk at the same HBA address). For layout detail, see 'man md 4' + */ + chunk = bio_start >> geo->chunk_shift; + chunk *= geo->near_copies; + first_stripe_index = chunk; + start_disk_index = sector_div(first_stripe_index, geo->raid_disks); + if (geo->far_offset) + first_stripe_index *= geo->far_copies; + start_disk_offset = (bio_start & geo->chunk_mask) + + (first_stripe_index << geo->chunk_shift); + + chunk = bio_end >> geo->chunk_shift; + chunk *= geo->near_copies; + last_stripe_index = chunk; + end_disk_index = sector_div(last_stripe_index, geo->raid_disks); + if (geo->far_offset) + last_stripe_index *= geo->far_copies; + end_disk_offset = (bio_end & geo->chunk_mask) + + (last_stripe_index << geo->chunk_shift); + +retry_discard: + r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO); + r10_bio->mddev = mddev; + r10_bio->state = 0; + r10_bio->sectors = 0; + memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * geo->raid_disks); + wait_blocked_dev(mddev, r10_bio); + + /* For far layout it needs more than one r10bio to cover all regions. + * Inspired by raid10_sync_request, we can use the first r10bio->master_bio + * to record the discard bio. Other r10bio->master_bio record the first + * r10bio. The first r10bio only release after all other r10bios finish. + * The discard bio returns only first r10bio finishes + */ + if (first_copy) { + r10_bio->master_bio = bio; + set_bit(R10BIO_Discard, &r10_bio->state); + first_copy = false; + first_r10bio = r10_bio; + } else + r10_bio->master_bio = (struct bio *)first_r10bio; + + rcu_read_lock(); + for (disk = 0; disk < geo->raid_disks; disk++) { + struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev); + struct md_rdev *rrdev = rcu_dereference( + conf->mirrors[disk].replacement); + + r10_bio->devs[disk].bio = NULL; + r10_bio->devs[disk].repl_bio = NULL; + + if (rdev && (test_bit(Faulty, &rdev->flags))) + rdev = NULL; + if (rrdev && (test_bit(Faulty, &rrdev->flags))) + rrdev = NULL; + if (!rdev && !rrdev) + continue; + + if (rdev) { + r10_bio->devs[disk].bio = bio; + atomic_inc(&rdev->nr_pending); + } + if (rrdev) { + r10_bio->devs[disk].repl_bio = bio; + atomic_inc(&rrdev->nr_pending); + } + } + rcu_read_unlock(); + + atomic_set(&r10_bio->remaining, 1); + for (disk = 0; disk < geo->raid_disks; disk++) { + sector_t dev_start, dev_end; + struct bio *mbio, *rbio = NULL; + struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev); + struct md_rdev *rrdev = rcu_dereference( + conf->mirrors[disk].replacement); + + /* + * Now start to calculate the start and end address for each disk. + * The space between dev_start and dev_end is the discard region. + * + * For dev_start, it needs to consider three conditions: + * 1st, the disk is before start_disk, you can imagine the disk in + * the next stripe. So the dev_start is the start address of next + * stripe. + * 2st, the disk is after start_disk, it means the disk is at the + * same stripe of first disk + * 3st, the first disk itself, we can use start_disk_offset directly + */ + if (disk < start_disk_index) + dev_start = (first_stripe_index + 1) * mddev->chunk_sectors; + else if (disk > start_disk_index) + dev_start = first_stripe_index * mddev->chunk_sectors; + else + dev_start = start_disk_offset; + + if (disk < end_disk_index) + dev_end = (last_stripe_index + 1) * mddev->chunk_sectors; + else if (disk > end_disk_index) + dev_end = last_stripe_index * mddev->chunk_sectors; + else + dev_end = end_disk_offset; + + /* It only handles discard bio which size is >= stripe size, so + * dev_end > dev_start all the time + */ + if (r10_bio->devs[disk].bio) { + mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); + mbio->bi_end_io = raid10_end_discard_request; + mbio->bi_private = r10_bio; + r10_bio->devs[disk].bio = mbio; + r10_bio->devs[disk].devnum = disk; + atomic_inc(&r10_bio->remaining); + md_submit_discard_bio(mddev, rdev, mbio, + dev_start + choose_data_offset(r10_bio, rdev), + dev_end - dev_start); + bio_endio(mbio); + } + if (r10_bio->devs[disk].repl_bio) { + rbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); + rbio->bi_end_io = raid10_end_discard_request; + rbio->bi_private = r10_bio; + r10_bio->devs[disk].repl_bio = rbio; + r10_bio->devs[disk].devnum = disk; + atomic_inc(&r10_bio->remaining); + md_submit_discard_bio(mddev, rrdev, rbio, + dev_start + choose_data_offset(r10_bio, rrdev), + dev_end - dev_start); + bio_endio(rbio); + } + } + + if (!geo->far_offset && --far_copies) { + first_stripe_index += geo->stride >> geo->chunk_shift; + start_disk_offset += geo->stride; + last_stripe_index += geo->stride >> geo->chunk_shift; + end_disk_offset += geo->stride; + atomic_inc(&first_r10bio->remaining); + raid_end_discard_bio(r10_bio); + wait_barrier(conf); + goto retry_discard; + } + + raid_end_discard_bio(r10_bio); + + return 0; +out: + allow_barrier(conf); + return -EAGAIN; +} + static bool raid10_make_request(struct mddev *mddev, struct bio *bio) { struct r10conf *conf = mddev->private; @@ -1515,6 +1820,10 @@ static bool raid10_make_request(struct mddev *mddev, struct bio *bio) if (!md_write_start(mddev, bio)) return false; + if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) + if (!raid10_handle_discard(mddev, bio)) + return true; + /* * If this request crosses a chunk boundary, we need to split * it. @@ -3754,7 +4063,7 @@ static int raid10_run(struct mddev *mddev) if (mddev->queue) { blk_queue_max_discard_sectors(mddev->queue, - mddev->chunk_sectors); + UINT_MAX); blk_queue_max_write_same_sectors(mddev->queue, 0); blk_queue_max_write_zeroes_sectors(mddev->queue, 0); blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); @@ -4458,8 +4767,8 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, last = conf->reshape_progress - 1; sector_nr = last & ~(sector_t)(conf->geo.chunk_mask & conf->prev.chunk_mask); - if (sector_nr + RESYNC_BLOCK_SIZE/512 < last) - sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512; + if (sector_nr + RESYNC_SECTORS < last) + sector_nr = last + 1 - RESYNC_SECTORS; } else { /* 'next' is after the last device address that we * might write to for this chunk in the new layout @@ -4481,8 +4790,8 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, last = sector_nr | (conf->geo.chunk_mask & conf->prev.chunk_mask); - if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last) - last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1; + if (sector_nr + RESYNC_SECTORS <= last) + last = sector_nr + RESYNC_SECTORS - 1; } if (need_flush || diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 79cd2b7d3128..1461fd55311b 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -179,5 +179,6 @@ enum r10bio_state { R10BIO_Previous, /* failfast devices did receive failfast requests. */ R10BIO_FailFast, + R10BIO_Discard, }; #endif diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d589d26c86ea..39343479ac2a 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -448,13 +448,74 @@ out: return sh; } -static void shrink_buffers(struct stripe_head *sh) +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE +static void free_stripe_pages(struct stripe_head *sh) +{ + int i; + struct page *p; + + /* Have not allocate page pool */ + if (!sh->pages) + return; + + for (i = 0; i < sh->nr_pages; i++) { + p = sh->pages[i]; + if (p) + put_page(p); + sh->pages[i] = NULL; + } +} + +static int alloc_stripe_pages(struct stripe_head *sh, gfp_t gfp) { + int i; struct page *p; + + for (i = 0; i < sh->nr_pages; i++) { + /* The page have allocated. */ + if (sh->pages[i]) + continue; + + p = alloc_page(gfp); + if (!p) { + free_stripe_pages(sh); + return -ENOMEM; + } + sh->pages[i] = p; + } + return 0; +} + +static int +init_stripe_shared_pages(struct stripe_head *sh, struct r5conf *conf, int disks) +{ + int nr_pages, cnt; + + if (sh->pages) + return 0; + + /* Each of the sh->dev[i] need one conf->stripe_size */ + cnt = PAGE_SIZE / conf->stripe_size; + nr_pages = (disks + cnt - 1) / cnt; + + sh->pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); + if (!sh->pages) + return -ENOMEM; + sh->nr_pages = nr_pages; + sh->stripes_per_page = cnt; + return 0; +} +#endif + +static void shrink_buffers(struct stripe_head *sh) +{ int i; int num = sh->raid_conf->pool_size; +#if PAGE_SIZE == DEFAULT_STRIPE_SIZE for (i = 0; i < num ; i++) { + struct page *p; + WARN_ON(sh->dev[i].page != sh->dev[i].orig_page); p = sh->dev[i].page; if (!p) @@ -462,6 +523,11 @@ static void shrink_buffers(struct stripe_head *sh) sh->dev[i].page = NULL; put_page(p); } +#else + for (i = 0; i < num; i++) + sh->dev[i].page = NULL; + free_stripe_pages(sh); /* Free pages */ +#endif } static int grow_buffers(struct stripe_head *sh, gfp_t gfp) @@ -469,6 +535,7 @@ static int grow_buffers(struct stripe_head *sh, gfp_t gfp) int i; int num = sh->raid_conf->pool_size; +#if PAGE_SIZE == DEFAULT_STRIPE_SIZE for (i = 0; i < num; i++) { struct page *page; @@ -477,8 +544,18 @@ static int grow_buffers(struct stripe_head *sh, gfp_t gfp) } sh->dev[i].page = page; sh->dev[i].orig_page = page; + sh->dev[i].offset = 0; } +#else + if (alloc_stripe_pages(sh, gfp)) + return -ENOMEM; + for (i = 0; i < num; i++) { + sh->dev[i].page = raid5_get_dev_page(sh, i); + sh->dev[i].orig_page = sh->dev[i].page; + sh->dev[i].offset = raid5_get_page_offset(sh, i); + } +#endif return 0; } @@ -1130,7 +1207,7 @@ again: sh->dev[i].vec.bv_page = sh->dev[i].page; bi->bi_vcnt = 1; bi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf); - bi->bi_io_vec[0].bv_offset = 0; + bi->bi_io_vec[0].bv_offset = sh->dev[i].offset; bi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf); bi->bi_write_hint = sh->dev[i].write_hint; if (!rrdev) @@ -1184,7 +1261,7 @@ again: sh->dev[i].rvec.bv_page = sh->dev[i].page; rbi->bi_vcnt = 1; rbi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf); - rbi->bi_io_vec[0].bv_offset = 0; + rbi->bi_io_vec[0].bv_offset = sh->dev[i].offset; rbi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf); rbi->bi_write_hint = sh->dev[i].write_hint; sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET; @@ -1226,7 +1303,7 @@ again: static struct dma_async_tx_descriptor * async_copy_data(int frombio, struct bio *bio, struct page **page, - sector_t sector, struct dma_async_tx_descriptor *tx, + unsigned int poff, sector_t sector, struct dma_async_tx_descriptor *tx, struct stripe_head *sh, int no_skipcopy) { struct bio_vec bvl; @@ -1272,11 +1349,11 @@ async_copy_data(int frombio, struct bio *bio, struct page **page, !no_skipcopy) *page = bio_page; else - tx = async_memcpy(*page, bio_page, page_offset, + tx = async_memcpy(*page, bio_page, page_offset + poff, b_offset, clen, &submit); } else tx = async_memcpy(bio_page, *page, b_offset, - page_offset, clen, &submit); + page_offset + poff, clen, &submit); } /* chain the operations */ submit.depend_tx = tx; @@ -1349,6 +1426,7 @@ static void ops_run_biofill(struct stripe_head *sh) while (rbi && rbi->bi_iter.bi_sector < dev->sector + RAID5_STRIPE_SECTORS(conf)) { tx = async_copy_data(0, rbi, &dev->page, + dev->offset, dev->sector, tx, sh, 0); rbi = r5_next_bio(conf, rbi, dev->sector); } @@ -1404,14 +1482,25 @@ static addr_conv_t *to_addr_conv(struct stripe_head *sh, return (void *) (to_addr_page(percpu, i) + sh->disks + 2); } +/* + * Return a pointer to record offset address. + */ +static unsigned int * +to_addr_offs(struct stripe_head *sh, struct raid5_percpu *percpu) +{ + return (unsigned int *) (to_addr_conv(sh, percpu, 0) + sh->disks + 2); +} + static struct dma_async_tx_descriptor * ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) { int disks = sh->disks; struct page **xor_srcs = to_addr_page(percpu, 0); + unsigned int *off_srcs = to_addr_offs(sh, percpu); int target = sh->ops.target; struct r5dev *tgt = &sh->dev[target]; struct page *xor_dest = tgt->page; + unsigned int off_dest = tgt->offset; int count = 0; struct dma_async_tx_descriptor *tx; struct async_submit_ctl submit; @@ -1423,19 +1512,22 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) __func__, (unsigned long long)sh->sector, target); BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); - for (i = disks; i--; ) - if (i != target) + for (i = disks; i--; ) { + if (i != target) { + off_srcs[count] = sh->dev[i].offset; xor_srcs[count++] = sh->dev[i].page; + } + } atomic_inc(&sh->count); init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); if (unlikely(count == 1)) - tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, + tx = async_memcpy(xor_dest, xor_srcs[0], off_dest, off_srcs[0], RAID5_STRIPE_SIZE(sh->raid_conf), &submit); else - tx = async_xor(xor_dest, xor_srcs, 0, count, + tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count, RAID5_STRIPE_SIZE(sh->raid_conf), &submit); return tx; @@ -1443,6 +1535,7 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) /* set_syndrome_sources - populate source buffers for gen_syndrome * @srcs - (struct page *) array of size sh->disks + * @offs - (unsigned int) array of offset for each page * @sh - stripe_head to parse * * Populates srcs in proper layout order for the stripe and returns the @@ -1451,6 +1544,7 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) * is recorded in srcs[count+1]]. */ static int set_syndrome_sources(struct page **srcs, + unsigned int *offs, struct stripe_head *sh, int srctype) { @@ -1481,6 +1575,12 @@ static int set_syndrome_sources(struct page **srcs, srcs[slot] = sh->dev[i].orig_page; else srcs[slot] = sh->dev[i].page; + /* + * For R5_InJournal, PAGE_SIZE must be 4KB and will + * not shared page. In that case, dev[i].offset + * is 0. + */ + offs[slot] = sh->dev[i].offset; } i = raid6_next_disk(i, disks); } while (i != d0_idx); @@ -1493,12 +1593,14 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) { int disks = sh->disks; struct page **blocks = to_addr_page(percpu, 0); + unsigned int *offs = to_addr_offs(sh, percpu); int target; int qd_idx = sh->qd_idx; struct dma_async_tx_descriptor *tx; struct async_submit_ctl submit; struct r5dev *tgt; struct page *dest; + unsigned int dest_off; int i; int count; @@ -1517,17 +1619,18 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) tgt = &sh->dev[target]; BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); dest = tgt->page; + dest_off = tgt->offset; atomic_inc(&sh->count); if (target == qd_idx) { - count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL); + count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_ALL); blocks[count] = NULL; /* regenerating p is not necessary */ BUG_ON(blocks[count+1] != dest); /* q should already be set */ init_async_submit(&submit, ASYNC_TX_FENCE, NULL, ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); - tx = async_gen_syndrome(blocks, 0, count+2, + tx = async_gen_syndrome(blocks, offs, count+2, RAID5_STRIPE_SIZE(sh->raid_conf), &submit); } else { /* Compute any data- or p-drive using XOR */ @@ -1535,13 +1638,14 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) for (i = disks; i-- ; ) { if (i == target || i == qd_idx) continue; + offs[count] = sh->dev[i].offset; blocks[count++] = sh->dev[i].page; } init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); - tx = async_xor(dest, blocks, 0, count, + tx = async_xor_offs(dest, dest_off, blocks, offs, count, RAID5_STRIPE_SIZE(sh->raid_conf), &submit); } @@ -1561,6 +1665,7 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) struct r5dev *tgt2 = &sh->dev[target2]; struct dma_async_tx_descriptor *tx; struct page **blocks = to_addr_page(percpu, 0); + unsigned int *offs = to_addr_offs(sh, percpu); struct async_submit_ctl submit; BUG_ON(sh->batch_head); @@ -1573,13 +1678,16 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) /* we need to open-code set_syndrome_sources to handle the * slot number conversion for 'faila' and 'failb' */ - for (i = 0; i < disks ; i++) + for (i = 0; i < disks ; i++) { + offs[i] = 0; blocks[i] = NULL; + } count = 0; i = d0_idx; do { int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); + offs[slot] = sh->dev[i].offset; blocks[slot] = sh->dev[i].page; if (i == target) @@ -1604,11 +1712,12 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) init_async_submit(&submit, ASYNC_TX_FENCE, NULL, ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); - return async_gen_syndrome(blocks, 0, syndrome_disks+2, + return async_gen_syndrome(blocks, offs, syndrome_disks+2, RAID5_STRIPE_SIZE(sh->raid_conf), &submit); } else { struct page *dest; + unsigned int dest_off; int data_target; int qd_idx = sh->qd_idx; @@ -1622,22 +1731,24 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) for (i = disks; i-- ; ) { if (i == data_target || i == qd_idx) continue; + offs[count] = sh->dev[i].offset; blocks[count++] = sh->dev[i].page; } dest = sh->dev[data_target].page; + dest_off = sh->dev[data_target].offset; init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, NULL, NULL, to_addr_conv(sh, percpu, 0)); - tx = async_xor(dest, blocks, 0, count, + tx = async_xor_offs(dest, dest_off, blocks, offs, count, RAID5_STRIPE_SIZE(sh->raid_conf), &submit); - count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL); + count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_ALL); init_async_submit(&submit, ASYNC_TX_FENCE, tx, ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); - return async_gen_syndrome(blocks, 0, count+2, + return async_gen_syndrome(blocks, offs, count+2, RAID5_STRIPE_SIZE(sh->raid_conf), &submit); } @@ -1650,13 +1761,13 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) return async_raid6_datap_recov(syndrome_disks+2, RAID5_STRIPE_SIZE(sh->raid_conf), faila, - blocks, &submit); + blocks, offs, &submit); } else { /* We're missing D+D. */ return async_raid6_2data_recov(syndrome_disks+2, RAID5_STRIPE_SIZE(sh->raid_conf), faila, failb, - blocks, &submit); + blocks, offs, &submit); } } } @@ -1682,10 +1793,12 @@ ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu, { int disks = sh->disks; struct page **xor_srcs = to_addr_page(percpu, 0); + unsigned int *off_srcs = to_addr_offs(sh, percpu); int count = 0, pd_idx = sh->pd_idx, i; struct async_submit_ctl submit; /* existing parity data subtracted */ + unsigned int off_dest = off_srcs[count] = sh->dev[pd_idx].offset; struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; BUG_ON(sh->batch_head); @@ -1695,15 +1808,22 @@ ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu, for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; /* Only process blocks that are known to be uptodate */ - if (test_bit(R5_InJournal, &dev->flags)) + if (test_bit(R5_InJournal, &dev->flags)) { + /* + * For this case, PAGE_SIZE must be equal to 4KB and + * page offset is zero. + */ + off_srcs[count] = dev->offset; xor_srcs[count++] = dev->orig_page; - else if (test_bit(R5_Wantdrain, &dev->flags)) + } else if (test_bit(R5_Wantdrain, &dev->flags)) { + off_srcs[count] = dev->offset; xor_srcs[count++] = dev->page; + } } init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); - tx = async_xor(xor_dest, xor_srcs, 0, count, + tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count, RAID5_STRIPE_SIZE(sh->raid_conf), &submit); return tx; @@ -1714,17 +1834,18 @@ ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu, struct dma_async_tx_descriptor *tx) { struct page **blocks = to_addr_page(percpu, 0); + unsigned int *offs = to_addr_offs(sh, percpu); int count; struct async_submit_ctl submit; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); - count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN); + count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_WANT_DRAIN); init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx, ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); - tx = async_gen_syndrome(blocks, 0, count+2, + tx = async_gen_syndrome(blocks, offs, count+2, RAID5_STRIPE_SIZE(sh->raid_conf), &submit); return tx; @@ -1775,6 +1896,7 @@ again: set_bit(R5_Discard, &dev->flags); else { tx = async_copy_data(1, wbi, &dev->page, + dev->offset, dev->sector, tx, sh, r5c_is_writeback(conf->log)); if (dev->page != dev->orig_page && @@ -1854,9 +1976,11 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, { int disks = sh->disks; struct page **xor_srcs; + unsigned int *off_srcs; struct async_submit_ctl submit; int count, pd_idx = sh->pd_idx, i; struct page *xor_dest; + unsigned int off_dest; int prexor = 0; unsigned long flags; int j = 0; @@ -1881,24 +2005,31 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, again: count = 0; xor_srcs = to_addr_page(percpu, j); + off_srcs = to_addr_offs(sh, percpu); /* check if prexor is active which means only process blocks * that are part of a read-modify-write (written) */ if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) { prexor = 1; + off_dest = off_srcs[count] = sh->dev[pd_idx].offset; xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (head_sh->dev[i].written || - test_bit(R5_InJournal, &head_sh->dev[i].flags)) + test_bit(R5_InJournal, &head_sh->dev[i].flags)) { + off_srcs[count] = dev->offset; xor_srcs[count++] = dev->page; + } } } else { xor_dest = sh->dev[pd_idx].page; + off_dest = sh->dev[pd_idx].offset; for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - if (i != pd_idx) + if (i != pd_idx) { + off_srcs[count] = dev->offset; xor_srcs[count++] = dev->page; + } } } @@ -1924,10 +2055,10 @@ again: } if (unlikely(count == 1)) - tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, + tx = async_memcpy(xor_dest, xor_srcs[0], off_dest, off_srcs[0], RAID5_STRIPE_SIZE(sh->raid_conf), &submit); else - tx = async_xor(xor_dest, xor_srcs, 0, count, + tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count, RAID5_STRIPE_SIZE(sh->raid_conf), &submit); if (!last_stripe) { j++; @@ -1943,6 +2074,7 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, { struct async_submit_ctl submit; struct page **blocks; + unsigned int *offs; int count, i, j = 0; struct stripe_head *head_sh = sh; int last_stripe; @@ -1967,6 +2099,7 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, again: blocks = to_addr_page(percpu, j); + offs = to_addr_offs(sh, percpu); if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { synflags = SYNDROME_SRC_WRITTEN; @@ -1976,7 +2109,7 @@ again: txflags = ASYNC_TX_ACK; } - count = set_syndrome_sources(blocks, sh, synflags); + count = set_syndrome_sources(blocks, offs, sh, synflags); last_stripe = !head_sh->batch_head || list_first_entry(&sh->batch_list, struct stripe_head, batch_list) == head_sh; @@ -1988,7 +2121,7 @@ again: } else init_async_submit(&submit, 0, tx, NULL, NULL, to_addr_conv(sh, percpu, j)); - tx = async_gen_syndrome(blocks, 0, count+2, + tx = async_gen_syndrome(blocks, offs, count+2, RAID5_STRIPE_SIZE(sh->raid_conf), &submit); if (!last_stripe) { j++; @@ -2016,7 +2149,9 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) int pd_idx = sh->pd_idx; int qd_idx = sh->qd_idx; struct page *xor_dest; + unsigned int off_dest; struct page **xor_srcs = to_addr_page(percpu, 0); + unsigned int *off_srcs = to_addr_offs(sh, percpu); struct dma_async_tx_descriptor *tx; struct async_submit_ctl submit; int count; @@ -2028,16 +2163,19 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) BUG_ON(sh->batch_head); count = 0; xor_dest = sh->dev[pd_idx].page; + off_dest = sh->dev[pd_idx].offset; + off_srcs[count] = off_dest; xor_srcs[count++] = xor_dest; for (i = disks; i--; ) { if (i == pd_idx || i == qd_idx) continue; + off_srcs[count] = sh->dev[i].offset; xor_srcs[count++] = sh->dev[i].page; } init_async_submit(&submit, 0, NULL, NULL, NULL, to_addr_conv(sh, percpu, 0)); - tx = async_xor_val(xor_dest, xor_srcs, 0, count, + tx = async_xor_val_offs(xor_dest, off_dest, xor_srcs, off_srcs, count, RAID5_STRIPE_SIZE(sh->raid_conf), &sh->ops.zero_sum_result, &submit); @@ -2049,6 +2187,7 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) { struct page **srcs = to_addr_page(percpu, 0); + unsigned int *offs = to_addr_offs(sh, percpu); struct async_submit_ctl submit; int count; @@ -2056,16 +2195,16 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu (unsigned long long)sh->sector, checkp); BUG_ON(sh->batch_head); - count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL); + count = set_syndrome_sources(srcs, offs, sh, SYNDROME_SRC_ALL); if (!checkp) srcs[count] = NULL; atomic_inc(&sh->count); init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, sh, to_addr_conv(sh, percpu, 0)); - async_syndrome_val(srcs, 0, count+2, + async_syndrome_val(srcs, offs, count+2, RAID5_STRIPE_SIZE(sh->raid_conf), - &sh->ops.zero_sum_result, percpu->spare_page, &submit); + &sh->ops.zero_sum_result, percpu->spare_page, 0, &submit); } static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) @@ -2142,6 +2281,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh) { +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE + kfree(sh->pages); +#endif if (sh->ppl_page) __free_page(sh->ppl_page); kmem_cache_free(sc, sh); @@ -2175,9 +2317,15 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp, sh->ppl_page = alloc_page(gfp); if (!sh->ppl_page) { free_stripe(sc, sh); - sh = NULL; + return NULL; } } +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE + if (init_stripe_shared_pages(sh, conf, disks)) { + free_stripe(sc, sh); + return NULL; + } +#endif } return sh; } @@ -2253,8 +2401,9 @@ static int scribble_alloc(struct raid5_percpu *percpu, int num, int cnt) { size_t obj_size = - sizeof(struct page *) * (num+2) + - sizeof(addr_conv_t) * (num+2); + sizeof(struct page *) * (num + 2) + + sizeof(addr_conv_t) * (num + 2) + + sizeof(unsigned int) * (num + 2); void *scribble; /* @@ -2386,9 +2535,16 @@ static int resize_stripes(struct r5conf *conf, int newsize) osh = get_free_stripe(conf, hash); unlock_device_hash_lock(conf, hash); +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE + for (i = 0; i < osh->nr_pages; i++) { + nsh->pages[i] = osh->pages[i]; + osh->pages[i] = NULL; + } +#endif for(i=0; i<conf->pool_size; i++) { nsh->dev[i].page = osh->dev[i].page; nsh->dev[i].orig_page = osh->dev[i].page; + nsh->dev[i].offset = osh->dev[i].offset; } nsh->hash_lock_index = hash; free_stripe(conf->slab_cache, osh); @@ -2429,8 +2585,6 @@ static int resize_stripes(struct r5conf *conf, int newsize) } else err = -ENOMEM; - mutex_unlock(&conf->cache_size_mutex); - conf->slab_cache = sc; conf->active_name = 1-conf->active_name; @@ -2439,20 +2593,41 @@ static int resize_stripes(struct r5conf *conf, int newsize) nsh = list_entry(newstripes.next, struct stripe_head, lru); list_del_init(&nsh->lru); +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE + for (i = 0; i < nsh->nr_pages; i++) { + if (nsh->pages[i]) + continue; + nsh->pages[i] = alloc_page(GFP_NOIO); + if (!nsh->pages[i]) + err = -ENOMEM; + } + + for (i = conf->raid_disks; i < newsize; i++) { + if (nsh->dev[i].page) + continue; + nsh->dev[i].page = raid5_get_dev_page(nsh, i); + nsh->dev[i].orig_page = nsh->dev[i].page; + nsh->dev[i].offset = raid5_get_page_offset(nsh, i); + } +#else for (i=conf->raid_disks; i < newsize; i++) if (nsh->dev[i].page == NULL) { struct page *p = alloc_page(GFP_NOIO); nsh->dev[i].page = p; nsh->dev[i].orig_page = p; + nsh->dev[i].offset = 0; if (!p) err = -ENOMEM; } +#endif raid5_release_stripe(nsh); } /* critical section pass, GFP_NOIO no longer needed */ if (!err) conf->pool_size = newsize; + mutex_unlock(&conf->cache_size_mutex); + return err; } @@ -4369,7 +4544,8 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) /* place all the copies on one channel */ init_async_submit(&submit, 0, tx, NULL, NULL, NULL); tx = async_memcpy(sh2->dev[dd_idx].page, - sh->dev[i].page, 0, 0, RAID5_STRIPE_SIZE(conf), + sh->dev[i].page, sh2->dev[dd_idx].offset, + sh->dev[i].offset, RAID5_STRIPE_SIZE(conf), &submit); set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); @@ -6506,6 +6682,7 @@ raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len) struct r5conf *conf; unsigned long new; int err; + int size; if (len >= PAGE_SIZE) return -EINVAL; @@ -6538,10 +6715,29 @@ raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len) pr_debug("md/raid: change stripe_size from %lu to %lu\n", conf->stripe_size, new); + if (mddev->sync_thread || + test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || + mddev->reshape_position != MaxSector || + mddev->sysfs_active) { + err = -EBUSY; + goto out_unlock; + } + mddev_suspend(mddev); + mutex_lock(&conf->cache_size_mutex); + size = conf->max_nr_stripes; + + shrink_stripes(conf); + conf->stripe_size = new; conf->stripe_shift = ilog2(new) - 9; conf->stripe_sectors = new >> 9; + if (grow_stripes(conf, size)) { + pr_warn("md/raid:%s: couldn't allocate buffers\n", + mdname(mddev)); + err = -ENOMEM; + } + mutex_unlock(&conf->cache_size_mutex); mddev_resume(mddev); out_unlock: diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 16fc29472f5c..5c05acf20e1f 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -195,6 +195,7 @@ enum reconstruct_states { reconstruct_state_result, }; +#define DEFAULT_STRIPE_SIZE 4096 struct stripe_head { struct hlist_node hash; struct list_head lru; /* inactive_list or handle_list */ @@ -246,6 +247,13 @@ struct stripe_head { int target, target2; enum sum_check_flags zero_sum_result; } ops; + +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE + /* These pages will be used by bios in dev[i] */ + struct page **pages; + int nr_pages; /* page array size */ + int stripes_per_page; +#endif struct r5dev { /* rreq and rvec are used for the replacement device when * writing data to both devices. @@ -253,6 +261,7 @@ struct stripe_head { struct bio req, rreq; struct bio_vec vec, rvec; struct page *page, *orig_page; + unsigned int offset; /* offset of the page */ struct bio *toread, *read, *towrite, *written; sector_t sector; /* sector of this page */ unsigned long flags; @@ -472,7 +481,6 @@ struct disk_info { */ #define NR_STRIPES 256 -#define DEFAULT_STRIPE_SIZE 4096 #if PAGE_SIZE == DEFAULT_STRIPE_SIZE #define STRIPE_SIZE PAGE_SIZE @@ -771,6 +779,25 @@ static inline int algorithm_is_DDF(int layout) return layout >= 8 && layout <= 10; } +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE +/* + * Return offset of the corresponding page for r5dev. + */ +static inline int raid5_get_page_offset(struct stripe_head *sh, int disk_idx) +{ + return (disk_idx % sh->stripes_per_page) * RAID5_STRIPE_SIZE(sh->raid_conf); +} + +/* + * Return corresponding page address for r5dev. + */ +static inline struct page * +raid5_get_dev_page(struct stripe_head *sh, int disk_idx) +{ + return sh->pages[disk_idx / sh->stripes_per_page]; +} +#endif + extern void md_raid5_kick_device(struct r5conf *conf); extern int raid5_set_cache_size(struct mddev *mddev, int size); extern sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index d2397cfe178f..56e2a22e8a02 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -89,7 +89,6 @@ static dev_t nvme_chr_devt; static struct class *nvme_class; static struct class *nvme_subsys_class; -static int _nvme_revalidate_disk(struct gendisk *disk); static void nvme_put_subsystem(struct nvme_subsystem *subsys); static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, unsigned nsid); @@ -968,10 +967,10 @@ static u32 nvme_known_admin_effects(u8 opcode) { switch (opcode) { case nvme_admin_format_nvm: - return NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC | + return NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_NCC | NVME_CMD_EFFECTS_CSE_MASK; case nvme_admin_sanitize_nvm: - return NVME_CMD_EFFECTS_CSE_MASK; + return NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK; default: break; } @@ -1009,7 +1008,7 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, * For simplicity, IO to all namespaces is quiesced even if the command * effects say only one namespace is affected. */ - if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) { + if (effects & NVME_CMD_EFFECTS_CSE_MASK) { mutex_lock(&ctrl->scan_lock); mutex_lock(&ctrl->subsys->lock); nvme_mpath_start_freeze(ctrl->subsys); @@ -1020,36 +1019,9 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, return effects; } -static void nvme_update_formats(struct nvme_ctrl *ctrl, u32 *effects) -{ - struct nvme_ns *ns; - - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) - if (_nvme_revalidate_disk(ns->disk)) - nvme_set_queue_dying(ns); - else if (blk_queue_is_zoned(ns->disk->queue)) { - /* - * IO commands are required to fully revalidate a zoned - * device. Force the command effects to trigger rescan - * work so report zones can run in a context with - * unfrozen IO queues. - */ - *effects |= NVME_CMD_EFFECTS_NCC; - } - up_read(&ctrl->namespaces_rwsem); -} - static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects) { - /* - * Revalidate LBA changes prior to unfreezing. This is necessary to - * prevent memory corruption if a logical block size was changed by - * this command. - */ - if (effects & NVME_CMD_EFFECTS_LBCC) - nvme_update_formats(ctrl, &effects); - if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) { + if (effects & NVME_CMD_EFFECTS_CSE_MASK) { nvme_unfreeze(ctrl); nvme_mpath_unfreeze(ctrl->subsys); mutex_unlock(&ctrl->subsys->lock); @@ -1309,6 +1281,8 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, int status, pos, len; void *data; + if (ctrl->vs < NVME_VS(1, 3, 0) && !nvme_multi_css(ctrl)) + return 0; if (ctrl->quirks & NVME_QUIRK_NO_NS_DESC_LIST) return 0; @@ -1352,19 +1326,8 @@ free_data: return status; } -static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *ns_list) -{ - struct nvme_command c = { }; - - c.identify.opcode = nvme_admin_identify; - c.identify.cns = NVME_ID_CNS_NS_ACTIVE_LIST; - c.identify.nsid = cpu_to_le32(nsid); - return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list, - NVME_IDENTIFY_DATA_SIZE); -} - -static int nvme_identify_ns(struct nvme_ctrl *ctrl, - unsigned nsid, struct nvme_id_ns **id) +static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid, + struct nvme_ns_ids *ids, struct nvme_id_ns **id) { struct nvme_command c = { }; int error; @@ -1381,9 +1344,24 @@ static int nvme_identify_ns(struct nvme_ctrl *ctrl, error = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id)); if (error) { dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error); - kfree(*id); + goto out_free_id; } + error = -ENODEV; + if ((*id)->ncap == 0) /* namespace not allocated or attached */ + goto out_free_id; + + if (ctrl->vs >= NVME_VS(1, 1, 0) && + !memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) + memcpy(ids->eui64, (*id)->eui64, sizeof(ids->eui64)); + if (ctrl->vs >= NVME_VS(1, 2, 0) && + !memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) + memcpy(ids->nguid, (*id)->nguid, sizeof(ids->nguid)); + + return 0; + +out_free_id: + kfree(*id); return error; } @@ -1905,20 +1883,6 @@ static void nvme_config_write_zeroes(struct gendisk *disk, struct nvme_ns *ns) nvme_lba_to_sect(ns, max_blocks)); } -static int nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, - struct nvme_id_ns *id, struct nvme_ns_ids *ids) -{ - memset(ids, 0, sizeof(*ids)); - - if (ctrl->vs >= NVME_VS(1, 1, 0)) - memcpy(ids->eui64, id->eui64, sizeof(id->eui64)); - if (ctrl->vs >= NVME_VS(1, 2, 0)) - memcpy(ids->nguid, id->nguid, sizeof(id->nguid)); - if (ctrl->vs >= NVME_VS(1, 3, 0) || nvme_multi_css(ctrl)) - return nvme_identify_ns_descs(ctrl, nsid, ids); - return 0; -} - static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids) { return !uuid_is_null(&ids->uuid) || @@ -1959,6 +1923,68 @@ static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns, return 0; } +static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id) +{ + struct nvme_ctrl *ctrl = ns->ctrl; + + /* + * The PI implementation requires the metadata size to be equal to the + * t10 pi tuple size. + */ + ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms); + if (ns->ms == sizeof(struct t10_pi_tuple)) + ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK; + else + ns->pi_type = 0; + + ns->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS); + if (!ns->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) + return 0; + if (ctrl->ops->flags & NVME_F_FABRICS) { + /* + * The NVMe over Fabrics specification only supports metadata as + * part of the extended data LBA. We rely on HCA/HBA support to + * remap the separate metadata buffer from the block layer. + */ + if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT))) + return -EINVAL; + if (ctrl->max_integrity_segments) + ns->features |= + (NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS); + } else { + /* + * For PCIe controllers, we can't easily remap the separate + * metadata buffer from the block layer and thus require a + * separate metadata buffer for block layer metadata/PI support. + * We allow extended LBAs for the passthrough interface, though. + */ + if (id->flbas & NVME_NS_FLBAS_META_EXT) + ns->features |= NVME_NS_EXT_LBAS; + else + ns->features |= NVME_NS_METADATA_SUPPORTED; + } + + return 0; +} + +static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, + struct request_queue *q) +{ + bool vwc = ctrl->vwc & NVME_CTRL_VWC_PRESENT; + + if (ctrl->max_hw_sectors) { + u32 max_segments = + (ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1; + + max_segments = min_not_zero(max_segments, ctrl->max_segments); + blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors); + blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX)); + } + blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1); + blk_queue_dma_alignment(q, 7); + blk_queue_write_cache(q, vwc, vwc); +} + static void nvme_update_disk_info(struct gendisk *disk, struct nvme_ns *ns, struct nvme_id_ns *id) { @@ -1966,11 +1992,15 @@ static void nvme_update_disk_info(struct gendisk *disk, unsigned short bs = 1 << ns->lba_shift; u32 atomic_bs, phys_bs, io_opt = 0; + /* + * The block layer can't support LBA sizes larger than the page size + * yet, so catch this early and don't allow block I/O. + */ if (ns->lba_shift > PAGE_SHIFT) { - /* unsupported block size, set capacity to 0 later */ + capacity = 0; bs = (1 << 9); } - blk_mq_freeze_queue(disk->queue); + blk_integrity_unregister(disk); atomic_bs = phys_bs = bs; @@ -2005,13 +2035,6 @@ static void nvme_update_disk_info(struct gendisk *disk, blk_queue_io_opt(disk->queue, io_opt); /* - * The block layer can't support LBA sizes larger than the page size - * yet, so catch this early and don't allow block I/O. - */ - if (ns->lba_shift > PAGE_SHIFT) - capacity = 0; - - /* * Register a metadata profile for PI, or the plain non-integrity NVMe * metadata masquerading as Type 0 if supported, otherwise reject block * I/O to namespaces with metadata except when the namespace supports @@ -2035,8 +2058,6 @@ static void nvme_update_disk_info(struct gendisk *disk, set_disk_ro(disk, true); else set_disk_ro(disk, false); - - blk_mq_unfreeze_queue(disk->queue); } static inline bool nvme_first_scan(struct gendisk *disk) @@ -2076,151 +2097,49 @@ static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id) blk_queue_chunk_sectors(ns->queue, iob); } -static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) +static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id) { unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK; - struct nvme_ns *ns = disk->private_data; - struct nvme_ctrl *ctrl = ns->ctrl; int ret; - /* - * If identify namespace failed, use default 512 byte block size so - * block layer can use before failing read/write for 0 capacity. - */ + blk_mq_freeze_queue(ns->disk->queue); ns->lba_shift = id->lbaf[lbaf].ds; - if (ns->lba_shift == 0) - ns->lba_shift = 9; + nvme_set_queue_limits(ns->ctrl, ns->queue); - switch (ns->head->ids.csi) { - case NVME_CSI_NVM: - break; - case NVME_CSI_ZNS: - ret = nvme_update_zone_info(disk, ns, lbaf); - if (ret) { - dev_warn(ctrl->device, - "failed to add zoned namespace:%u ret:%d\n", - ns->head->ns_id, ret); - return ret; - } - break; - default: - dev_warn(ctrl->device, "unknown csi:%u ns:%u\n", - ns->head->ids.csi, ns->head->ns_id); - return -ENODEV; + if (ns->head->ids.csi == NVME_CSI_ZNS) { + ret = nvme_update_zone_info(ns, lbaf); + if (ret) + goto out_unfreeze; } - ns->features = 0; - ns->ms = le16_to_cpu(id->lbaf[lbaf].ms); - /* the PI implementation requires metadata equal t10 pi tuple size */ - if (ns->ms == sizeof(struct t10_pi_tuple)) - ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK; - else - ns->pi_type = 0; + ret = nvme_configure_metadata(ns, id); + if (ret) + goto out_unfreeze; + nvme_set_chunk_sectors(ns, id); + nvme_update_disk_info(ns->disk, ns, id); + blk_mq_unfreeze_queue(ns->disk->queue); - if (ns->ms) { - /* - * For PCIe only the separate metadata pointer is supported, - * as the block layer supplies metadata in a separate bio_vec - * chain. For Fabrics, only metadata as part of extended data - * LBA is supported on the wire per the Fabrics specification, - * but the HBA/HCA will do the remapping from the separate - * metadata buffers for us. - */ - if (id->flbas & NVME_NS_FLBAS_META_EXT) { - ns->features |= NVME_NS_EXT_LBAS; - if ((ctrl->ops->flags & NVME_F_FABRICS) && - (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED) && - ctrl->max_integrity_segments) - ns->features |= NVME_NS_METADATA_SUPPORTED; - } else { - if (WARN_ON_ONCE(ctrl->ops->flags & NVME_F_FABRICS)) - return -EINVAL; - if (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED) - ns->features |= NVME_NS_METADATA_SUPPORTED; - } + if (blk_queue_is_zoned(ns->queue)) { + ret = nvme_revalidate_zones(ns); + if (ret) + return ret; } - nvme_set_chunk_sectors(ns, id); - nvme_update_disk_info(disk, ns, id); #ifdef CONFIG_NVME_MULTIPATH if (ns->head->disk) { + blk_mq_freeze_queue(ns->head->disk->queue); nvme_update_disk_info(ns->head->disk, ns, id); blk_stack_limits(&ns->head->disk->queue->limits, &ns->queue->limits, 0); blk_queue_update_readahead(ns->head->disk->queue); nvme_update_bdev_size(ns->head->disk); + blk_mq_unfreeze_queue(ns->head->disk->queue); } #endif return 0; -} - -static int _nvme_revalidate_disk(struct gendisk *disk) -{ - struct nvme_ns *ns = disk->private_data; - struct nvme_ctrl *ctrl = ns->ctrl; - struct nvme_id_ns *id; - struct nvme_ns_ids ids; - int ret = 0; - - if (test_bit(NVME_NS_DEAD, &ns->flags)) { - set_capacity(disk, 0); - return -ENODEV; - } - ret = nvme_identify_ns(ctrl, ns->head->ns_id, &id); - if (ret) - goto out; - - if (id->ncap == 0) { - ret = -ENODEV; - goto free_id; - } - - ret = nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids); - if (ret) - goto free_id; - - if (!nvme_ns_ids_equal(&ns->head->ids, &ids)) { - dev_err(ctrl->device, - "identifiers changed for nsid %d\n", ns->head->ns_id); - ret = -ENODEV; - goto free_id; - } - - ret = __nvme_revalidate_disk(disk, id); -free_id: - kfree(id); -out: - /* - * Only fail the function if we got a fatal error back from the - * device, otherwise ignore the error and just move on. - */ - if (ret == -ENOMEM || (ret > 0 && !(ret & NVME_SC_DNR))) - ret = 0; - else if (ret > 0) - ret = blk_status_to_errno(nvme_error_status(ret)); - return ret; -} - -static int nvme_revalidate_disk(struct gendisk *disk) -{ - int ret; - - ret = _nvme_revalidate_disk(disk); - if (ret) - return ret; - -#ifdef CONFIG_BLK_DEV_ZONED - if (blk_queue_is_zoned(disk->queue)) { - struct nvme_ns *ns = disk->private_data; - struct nvme_ctrl *ctrl = ns->ctrl; - - ret = blk_revalidate_disk_zones(disk, NULL); - if (!ret) - blk_queue_max_zone_append_sectors(disk->queue, - ctrl->max_zone_append); - } -#endif +out_unfreeze: + blk_mq_unfreeze_queue(ns->disk->queue); return ret; } @@ -2502,26 +2421,6 @@ int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl); -static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, - struct request_queue *q) -{ - bool vwc = false; - - if (ctrl->max_hw_sectors) { - u32 max_segments = - (ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1; - - max_segments = min_not_zero(max_segments, ctrl->max_segments); - blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors); - blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX)); - } - blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1); - blk_queue_dma_alignment(q, 7); - if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) - vwc = true; - blk_queue_write_cache(q, vwc, vwc); -} - static int nvme_configure_timestamp(struct nvme_ctrl *ctrl) { __le64 ts; @@ -3025,26 +2924,10 @@ int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi, return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size); } -static struct nvme_cel *nvme_find_cel(struct nvme_ctrl *ctrl, u8 csi) -{ - struct nvme_cel *cel, *ret = NULL; - - spin_lock_irq(&ctrl->lock); - list_for_each_entry(cel, &ctrl->cels, entry) { - if (cel->csi == csi) { - ret = cel; - break; - } - } - spin_unlock_irq(&ctrl->lock); - - return ret; -} - static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi, struct nvme_effects_log **log) { - struct nvme_cel *cel = nvme_find_cel(ctrl, csi); + struct nvme_cel *cel = xa_load(&ctrl->cels, csi); int ret; if (cel) @@ -3062,10 +2945,7 @@ static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi, } cel->csi = csi; - - spin_lock_irq(&ctrl->lock); - list_add_tail(&cel->entry, &ctrl->cels); - spin_unlock_irq(&ctrl->lock); + xa_store(&ctrl->cels, cel->csi, cel, GFP_KERNEL); out: *log = &cel->log; return 0; @@ -3846,25 +3726,16 @@ out: } static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, - struct nvme_id_ns *id) + struct nvme_ns_ids *ids, bool is_shared) { struct nvme_ctrl *ctrl = ns->ctrl; - bool is_shared = id->nmic & NVME_NS_NMIC_SHARED; struct nvme_ns_head *head = NULL; - struct nvme_ns_ids ids; int ret = 0; - ret = nvme_report_ns_ids(ctrl, nsid, id, &ids); - if (ret) { - if (ret < 0) - return ret; - return blk_status_to_errno(nvme_error_status(ret)); - } - mutex_lock(&ctrl->subsys->lock); head = nvme_find_ns_head(ctrl->subsys, nsid); if (!head) { - head = nvme_alloc_ns_head(ctrl, nsid, &ids); + head = nvme_alloc_ns_head(ctrl, nsid, ids); if (IS_ERR(head)) { ret = PTR_ERR(head); goto out_unlock; @@ -3877,7 +3748,7 @@ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, "Duplicate unshared namespace %d\n", nsid); goto out_put_ns_head; } - if (!nvme_ns_ids_equal(&head->ids, &ids)) { + if (!nvme_ns_ids_equal(&head->ids, ids)) { dev_err(ctrl->device, "IDs don't match for shared namespace %d\n", nsid); @@ -3925,7 +3796,8 @@ struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) } EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU); -static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) +static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, + struct nvme_ns_ids *ids) { struct nvme_ns *ns; struct gendisk *disk; @@ -3933,9 +3805,12 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) char disk_name[DISK_NAME_LEN]; int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT, ret; + if (nvme_identify_ns(ctrl, nsid, ids, &id)) + return; + ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); if (!ns) - return; + goto out_free_id; ns->queue = blk_mq_init_queue(ctrl->tagset); if (IS_ERR(ns->queue)) @@ -3950,23 +3825,11 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) ns->queue->queuedata = ns; ns->ctrl = ctrl; - kref_init(&ns->kref); - ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */ - - blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); - nvme_set_queue_limits(ctrl, ns->queue); - ret = nvme_identify_ns(ctrl, nsid, &id); + ret = nvme_init_ns_head(ns, nsid, ids, id->nmic & NVME_NS_NMIC_SHARED); if (ret) goto out_free_queue; - - if (id->ncap == 0) /* no namespace (legacy quirk) */ - goto out_free_id; - - ret = nvme_init_ns_head(ns, nsid, id); - if (ret) - goto out_free_id; nvme_set_disk_name(disk_name, ns, ctrl, &flags); disk = alloc_disk_node(0, node); @@ -3980,7 +3843,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) memcpy(disk->disk_name, disk_name, DISK_NAME_LEN); ns->disk = disk; - if (__nvme_revalidate_disk(disk, id)) + if (nvme_update_ns_info(ns, id)) goto out_put_disk; if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) { @@ -4015,12 +3878,12 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) list_del_init(&ns->head->entry); mutex_unlock(&ctrl->subsys->lock); nvme_put_ns_head(ns->head); - out_free_id: - kfree(id); out_free_queue: blk_cleanup_queue(ns->queue); out_free_ns: kfree(ns); + out_free_id: + kfree(id); } static void nvme_ns_remove(struct nvme_ns *ns) @@ -4028,6 +3891,7 @@ static void nvme_ns_remove(struct nvme_ns *ns) if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags)) return; + set_capacity(ns->disk, 0); nvme_fault_inject_fini(&ns->fault_inject); mutex_lock(&ns->ctrl->subsys->lock); @@ -4065,22 +3929,75 @@ static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid) } } -static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid) +static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_ids *ids) +{ + struct nvme_id_ns *id; + int ret = -ENODEV; + + if (test_bit(NVME_NS_DEAD, &ns->flags)) + goto out; + + ret = nvme_identify_ns(ns->ctrl, ns->head->ns_id, ids, &id); + if (ret) + goto out; + + ret = -ENODEV; + if (!nvme_ns_ids_equal(&ns->head->ids, ids)) { + dev_err(ns->ctrl->device, + "identifiers changed for nsid %d\n", ns->head->ns_id); + goto out_free_id; + } + + ret = nvme_update_ns_info(ns, id); + +out_free_id: + kfree(id); +out: + /* + * Only remove the namespace if we got a fatal error back from the + * device, otherwise ignore the error and just move on. + * + * TODO: we should probably schedule a delayed retry here. + */ + if (ret && ret != -ENOMEM && !(ret > 0 && !(ret & NVME_SC_DNR))) + nvme_ns_remove(ns); + else + revalidate_disk_size(ns->disk, true); +} + +static void nvme_validate_or_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) { + struct nvme_ns_ids ids = { }; struct nvme_ns *ns; - int ret; + + if (nvme_identify_ns_descs(ctrl, nsid, &ids)) + return; ns = nvme_find_get_ns(ctrl, nsid); - if (!ns) { - nvme_alloc_ns(ctrl, nsid); + if (ns) { + nvme_validate_ns(ns, &ids); + nvme_put_ns(ns); return; } - ret = nvme_revalidate_disk(ns->disk); - revalidate_disk_size(ns->disk, ret == 0); - if (ret) - nvme_ns_remove(ns); - nvme_put_ns(ns); + switch (ids.csi) { + case NVME_CSI_NVM: + nvme_alloc_ns(ctrl, nsid, &ids); + break; + case NVME_CSI_ZNS: + if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) { + dev_warn(ctrl->device, + "nsid %u not supported without CONFIG_BLK_DEV_ZONED\n", + nsid); + break; + } + nvme_alloc_ns(ctrl, nsid, &ids); + break; + default: + dev_warn(ctrl->device, "unknown csi %u for nsid %u\n", + ids.csi, nsid); + break; + } } static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, @@ -4116,7 +4033,14 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl) return -ENOMEM; for (;;) { - ret = nvme_identify_ns_list(ctrl, prev, ns_list); + struct nvme_command cmd = { + .identify.opcode = nvme_admin_identify, + .identify.cns = NVME_ID_CNS_NS_ACTIVE_LIST, + .identify.nsid = cpu_to_le32(prev), + }; + + ret = nvme_submit_sync_cmd(ctrl->admin_q, &cmd, ns_list, + NVME_IDENTIFY_DATA_SIZE); if (ret) goto free; @@ -4125,7 +4049,7 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl) if (!nsid) /* end of the list? */ goto out; - nvme_validate_ns(ctrl, nsid); + nvme_validate_or_alloc_ns(ctrl, nsid); while (++prev < nsid) nvme_ns_remove_by_nsid(ctrl, prev); } @@ -4148,7 +4072,7 @@ static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl) kfree(id); for (i = 1; i <= nn; i++) - nvme_validate_ns(ctrl, i); + nvme_validate_or_alloc_ns(ctrl, i); nvme_remove_invalid_namespaces(ctrl, nn); } @@ -4453,15 +4377,11 @@ static void nvme_free_ctrl(struct device *dev) struct nvme_ctrl *ctrl = container_of(dev, struct nvme_ctrl, ctrl_device); struct nvme_subsystem *subsys = ctrl->subsys; - struct nvme_cel *cel, *next; if (!subsys || ctrl->instance != subsys->instance) ida_simple_remove(&nvme_instance_ida, ctrl->instance); - list_for_each_entry_safe(cel, next, &ctrl->cels, entry) { - list_del(&cel->entry); - kfree(cel); - } + xa_destroy(&ctrl->cels); nvme_mpath_uninit(ctrl); __free_page(ctrl->discard_page); @@ -4493,7 +4413,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, spin_lock_init(&ctrl->lock); mutex_init(&ctrl->scan_lock); INIT_LIST_HEAD(&ctrl->namespaces); - INIT_LIST_HEAD(&ctrl->cels); + xa_init(&ctrl->cels); init_rwsem(&ctrl->namespaces_rwsem); ctrl->dev = dev; ctrl->ops = ops; @@ -4673,28 +4593,13 @@ void nvme_sync_queues(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_sync_queues); -struct nvme_ctrl *nvme_ctrl_get_by_path(const char *path) +struct nvme_ctrl *nvme_ctrl_from_file(struct file *file) { - struct nvme_ctrl *ctrl; - struct file *f; - - f = filp_open(path, O_RDWR, 0); - if (IS_ERR(f)) - return ERR_CAST(f); - - if (f->f_op != &nvme_dev_fops) { - ctrl = ERR_PTR(-EINVAL); - goto out_close; - } - - ctrl = f->private_data; - nvme_get_ctrl(ctrl); - -out_close: - filp_close(f, NULL); - return ctrl; + if (file->f_op != &nvme_dev_fops) + return NULL; + return file->private_data; } -EXPORT_SYMBOL_NS_GPL(nvme_ctrl_get_by_path, NVME_TARGET_PASSTHRU); +EXPORT_SYMBOL_NS_GPL(nvme_ctrl_from_file, NVME_TARGET_PASSTHRU); /* * Check we didn't inadvertently grow the command structure sizes: diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 8ac37430347c..e7c88b40f5bb 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -300,7 +300,7 @@ struct nvme_ctrl { unsigned long quirks; struct nvme_id_power_state psd[32]; struct nvme_effects_log *effects; - struct list_head cels; + struct xarray cels; struct work_struct scan_work; struct work_struct async_event_work; struct delayed_work ka_work; @@ -758,10 +758,9 @@ static inline void nvme_mpath_start_freeze(struct nvme_subsystem *subsys) } #endif /* CONFIG_NVME_MULTIPATH */ +int nvme_revalidate_zones(struct nvme_ns *ns); #ifdef CONFIG_BLK_DEV_ZONED -int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns, - unsigned lbaf); - +int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf); int nvme_report_zones(struct gendisk *disk, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); @@ -778,9 +777,7 @@ static inline blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, return BLK_STS_NOTSUPP; } -static inline int nvme_update_zone_info(struct gendisk *disk, - struct nvme_ns *ns, - unsigned lbaf) +static inline int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf) { dev_warn(ns->ctrl->device, "Please enable CONFIG_BLK_DEV_ZONED to support ZNS devices\n"); @@ -825,7 +822,7 @@ static inline int nvme_hwmon_init(struct nvme_ctrl *ctrl) u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode); void nvme_execute_passthru_rq(struct request *rq); -struct nvme_ctrl *nvme_ctrl_get_by_path(const char *path); +struct nvme_ctrl *nvme_ctrl_from_file(struct file *file); struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid); void nvme_put_ns(struct nvme_ns *ns); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 8984796db0c8..e5b02242f3ca 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2038,32 +2038,30 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues) .calc_sets = nvme_calc_irq_sets, .priv = dev, }; - unsigned int irq_queues, this_p_queues; + unsigned int irq_queues, poll_queues; /* - * Poll queues don't need interrupts, but we need at least one IO - * queue left over for non-polled IO. + * Poll queues don't need interrupts, but we need at least one I/O queue + * left over for non-polled I/O. */ - this_p_queues = dev->nr_poll_queues; - if (this_p_queues >= nr_io_queues) { - this_p_queues = nr_io_queues - 1; - irq_queues = 1; - } else { - irq_queues = nr_io_queues - this_p_queues + 1; - } - dev->io_queues[HCTX_TYPE_POLL] = this_p_queues; + poll_queues = min(dev->nr_poll_queues, nr_io_queues - 1); + dev->io_queues[HCTX_TYPE_POLL] = poll_queues; - /* Initialize for the single interrupt case */ + /* + * Initialize for the single interrupt case, will be updated in + * nvme_calc_irq_sets(). + */ dev->io_queues[HCTX_TYPE_DEFAULT] = 1; dev->io_queues[HCTX_TYPE_READ] = 0; /* - * Some Apple controllers require all queues to use the - * first vector. + * We need interrupts for the admin queue and each non-polled I/O queue, + * but some Apple controllers require all queues to use the first + * vector. */ - if (dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR) - irq_queues = 1; - + irq_queues = 1; + if (!(dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR)) + irq_queues += (nr_io_queues - poll_queues); return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues, PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd); } @@ -3187,7 +3185,6 @@ static const struct pci_device_id nvme_id_table[] = { NVME_QUIRK_IGNORE_DEV_SUBNQN, }, { PCI_DEVICE(0x1c5c, 0x1504), /* SK Hynix PC400 */ .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, - { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001), .driver_data = NVME_QUIRK_SINGLE_VECTOR }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) }, @@ -3195,6 +3192,8 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_SINGLE_VECTOR | NVME_QUIRK_128_BYTES_SQES | NVME_QUIRK_SHARED_TAGS }, + + { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, { 0, } }; MODULE_DEVICE_TABLE(pci, nvme_id_table); diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c index 57cfd78731fb..67e87e9f306f 100644 --- a/drivers/nvme/host/zns.c +++ b/drivers/nvme/host/zns.c @@ -7,6 +7,17 @@ #include <linux/vmalloc.h> #include "nvme.h" +int nvme_revalidate_zones(struct nvme_ns *ns) +{ + struct request_queue *q = ns->queue; + int ret; + + ret = blk_revalidate_disk_zones(ns->disk, NULL); + if (!ret) + blk_queue_max_zone_append_sectors(q, ns->ctrl->max_zone_append); + return ret; +} + static int nvme_set_max_append(struct nvme_ctrl *ctrl) { struct nvme_command c = { }; @@ -35,11 +46,10 @@ static int nvme_set_max_append(struct nvme_ctrl *ctrl) return 0; } -int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns, - unsigned lbaf) +int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf) { struct nvme_effects_log *log = ns->head->effects; - struct request_queue *q = disk->queue; + struct request_queue *q = ns->queue; struct nvme_command c = { }; struct nvme_id_ns_zns *id; int status; @@ -133,28 +143,6 @@ static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns, return NULL; } -static int __nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector, - struct nvme_zone_report *report, - size_t buflen) -{ - struct nvme_command c = { }; - int ret; - - c.zmr.opcode = nvme_cmd_zone_mgmt_recv; - c.zmr.nsid = cpu_to_le32(ns->head->ns_id); - c.zmr.slba = cpu_to_le64(nvme_sect_to_lba(ns, sector)); - c.zmr.numd = cpu_to_le32(nvme_bytes_to_numd(buflen)); - c.zmr.zra = NVME_ZRA_ZONE_REPORT; - c.zmr.zrasf = NVME_ZRASF_ZONE_REPORT_ALL; - c.zmr.pr = NVME_REPORT_ZONE_PARTIAL; - - ret = nvme_submit_sync_cmd(ns->queue, &c, report, buflen); - if (ret) - return ret; - - return le64_to_cpu(report->nr_zones); -} - static int nvme_zone_parse_entry(struct nvme_ns *ns, struct nvme_zone_descriptor *entry, unsigned int idx, report_zones_cb cb, @@ -182,6 +170,7 @@ static int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data) { struct nvme_zone_report *report; + struct nvme_command c = { }; int ret, zone_idx = 0; unsigned int nz, i; size_t buflen; @@ -190,14 +179,26 @@ static int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector, if (!report) return -ENOMEM; + c.zmr.opcode = nvme_cmd_zone_mgmt_recv; + c.zmr.nsid = cpu_to_le32(ns->head->ns_id); + c.zmr.numd = cpu_to_le32(nvme_bytes_to_numd(buflen)); + c.zmr.zra = NVME_ZRA_ZONE_REPORT; + c.zmr.zrasf = NVME_ZRASF_ZONE_REPORT_ALL; + c.zmr.pr = NVME_REPORT_ZONE_PARTIAL; + sector &= ~(ns->zsze - 1); while (zone_idx < nr_zones && sector < get_capacity(ns->disk)) { memset(report, 0, buflen); - ret = __nvme_ns_report_zones(ns, sector, report, buflen); - if (ret < 0) + + c.zmr.slba = cpu_to_le64(nvme_sect_to_lba(ns, sector)); + ret = nvme_submit_sync_cmd(ns->queue, &c, report, buflen); + if (ret) { + if (ret > 0) + ret = -EIO; goto out_free; + } - nz = min_t(unsigned int, ret, nr_zones); + nz = min((unsigned int)le64_to_cpu(report->nr_zones), nr_zones); if (!nz) break; diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index e9fe91786bbb..dca34489a1dc 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -727,7 +727,9 @@ u16 nvmet_set_feat_kato(struct nvmet_req *req) { u32 val32 = le32_to_cpu(req->cmd->common.cdw11); + nvmet_stop_keep_alive_timer(req->sq->ctrl); req->sq->ctrl->kato = DIV_ROUND_UP(val32, 1000); + nvmet_start_keep_alive_timer(req->sq->ctrl); nvmet_set_result(req, req->sq->ctrl->kato); diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index b7b63330b5ef..25d62d867563 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -395,7 +395,7 @@ static void nvmet_keep_alive_timer(struct work_struct *work) nvmet_ctrl_fatal_error(ctrl); } -static void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl) +void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl) { if (unlikely(ctrl->kato == 0)) return; @@ -407,7 +407,7 @@ static void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl) schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); } -static void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl) +void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl) { if (unlikely(ctrl->kato == 0)) return; diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index e6861cc10e7d..cd4e73aa9807 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -1019,7 +1019,7 @@ static void nvmet_fc_free_hostport(struct nvmet_fc_hostport *hostport) { /* if LLDD not implemented, leave as NULL */ - if (!hostport->hosthandle) + if (!hostport || !hostport->hosthandle) return; nvmet_fc_hostport_put(hostport); diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 0d6008cf66a2..f6d81239be21 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -579,7 +579,7 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev, ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_loop_ctrl_ops, 0 /* no quirks, we're perfect! */); if (ret) - goto out_put_ctrl; + goto out; if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) WARN_ON_ONCE(1); @@ -635,8 +635,8 @@ out_free_queues: kfree(ctrl->queues); out_uninit_ctrl: nvme_uninit_ctrl(&ctrl->ctrl); -out_put_ctrl: nvme_put_ctrl(&ctrl->ctrl); +out: if (ret > 0) ret = -EIO; return ERR_PTR(ret); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 47ee3fb193bd..559a15ccc322 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -395,6 +395,8 @@ void nvmet_get_feat_async_event(struct nvmet_req *req); u16 nvmet_set_feat_kato(struct nvmet_req *req); u16 nvmet_set_feat_async_event(struct nvmet_req *req, u32 mask); void nvmet_execute_async_event(struct nvmet_req *req); +void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl); +void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl); u16 nvmet_parse_connect_cmd(struct nvmet_req *req); void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id); diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index dacfa7435d0b..56c571052216 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -456,10 +456,26 @@ u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req) req->execute = nvmet_passthru_execute_cmd; req->p.use_workqueue = true; return NVME_SC_SUCCESS; + case NVME_ID_CNS_CS_CTRL: + switch (req->cmd->identify.csi) { + case NVME_CSI_ZNS: + req->execute = nvmet_passthru_execute_cmd; + req->p.use_workqueue = true; + return NVME_SC_SUCCESS; + } + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; case NVME_ID_CNS_NS: req->execute = nvmet_passthru_execute_cmd; req->p.use_workqueue = true; return NVME_SC_SUCCESS; + case NVME_ID_CNS_CS_NS: + switch (req->cmd->identify.csi) { + case NVME_CSI_ZNS: + req->execute = nvmet_passthru_execute_cmd; + req->p.use_workqueue = true; + return NVME_SC_SUCCESS; + } + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; default: return nvmet_setup_passthru_command(req); } @@ -474,6 +490,7 @@ u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req) int nvmet_passthru_ctrl_enable(struct nvmet_subsys *subsys) { struct nvme_ctrl *ctrl; + struct file *file; int ret = -EINVAL; void *old; @@ -488,24 +505,29 @@ int nvmet_passthru_ctrl_enable(struct nvmet_subsys *subsys) goto out_unlock; } - ctrl = nvme_ctrl_get_by_path(subsys->passthru_ctrl_path); - if (IS_ERR(ctrl)) { - ret = PTR_ERR(ctrl); + file = filp_open(subsys->passthru_ctrl_path, O_RDWR, 0); + if (IS_ERR(file)) { + ret = PTR_ERR(file); + goto out_unlock; + } + + ctrl = nvme_ctrl_from_file(file); + if (!ctrl) { pr_err("failed to open nvme controller %s\n", subsys->passthru_ctrl_path); - goto out_unlock; + goto out_put_file; } old = xa_cmpxchg(&passthru_subsystems, ctrl->cntlid, NULL, subsys, GFP_KERNEL); if (xa_is_err(old)) { ret = xa_err(old); - goto out_put_ctrl; + goto out_put_file; } if (old) - goto out_put_ctrl; + goto out_put_file; subsys->passthru_ctrl = ctrl; subsys->ver = ctrl->vs; @@ -516,13 +538,12 @@ int nvmet_passthru_ctrl_enable(struct nvmet_subsys *subsys) NVME_TERTIARY(subsys->ver)); subsys->ver = NVME_VS(1, 2, 1); } - + nvme_get_ctrl(ctrl); __module_get(subsys->passthru_ctrl->ops->module); - mutex_unlock(&subsys->lock); - return 0; + ret = 0; -out_put_ctrl: - nvme_put_ctrl(ctrl); +out_put_file: + filp_close(file, NULL); out_unlock: mutex_unlock(&subsys->lock); return ret; diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 8e0d766d2722..dc1f0f647189 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -94,7 +94,6 @@ struct nvmet_tcp_queue { struct socket *sock; struct nvmet_tcp_port *port; struct work_struct io_work; - int cpu; struct nvmet_cq nvme_cq; struct nvmet_sq nvme_sq; @@ -144,7 +143,6 @@ struct nvmet_tcp_port { struct work_struct accept_work; struct nvmet_port *nport; struct sockaddr_storage addr; - int last_cpu; void (*data_ready)(struct sock *); }; @@ -219,6 +217,11 @@ static inline void nvmet_tcp_put_cmd(struct nvmet_tcp_cmd *cmd) list_add_tail(&cmd->entry, &cmd->queue->free_list); } +static inline int queue_cpu(struct nvmet_tcp_queue *queue) +{ + return queue->sock->sk->sk_incoming_cpu; +} + static inline u8 nvmet_tcp_hdgst_len(struct nvmet_tcp_queue *queue) { return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0; @@ -506,7 +509,7 @@ static void nvmet_tcp_queue_response(struct nvmet_req *req) struct nvmet_tcp_queue *queue = cmd->queue; llist_add(&cmd->lentry, &queue->resp_list); - queue_work_on(cmd->queue->cpu, nvmet_tcp_wq, &cmd->queue->io_work); + queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &cmd->queue->io_work); } static int nvmet_try_send_data_pdu(struct nvmet_tcp_cmd *cmd) @@ -1223,7 +1226,7 @@ static void nvmet_tcp_io_work(struct work_struct *w) * We exahusted our budget, requeue our selves */ if (pending) - queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work); + queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work); } static int nvmet_tcp_alloc_cmd(struct nvmet_tcp_queue *queue, @@ -1383,7 +1386,7 @@ static void nvmet_tcp_data_ready(struct sock *sk) read_lock_bh(&sk->sk_callback_lock); queue = sk->sk_user_data; if (likely(queue)) - queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work); + queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work); read_unlock_bh(&sk->sk_callback_lock); } @@ -1403,7 +1406,7 @@ static void nvmet_tcp_write_space(struct sock *sk) if (sk_stream_is_writeable(sk)) { clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work); + queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work); } out: read_unlock_bh(&sk->sk_callback_lock); @@ -1512,9 +1515,6 @@ static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port, if (ret) goto out_free_connect; - port->last_cpu = cpumask_next_wrap(port->last_cpu, - cpu_online_mask, -1, false); - queue->cpu = port->last_cpu; nvmet_prepare_receive_pdu(queue); mutex_lock(&nvmet_tcp_queue_mutex); @@ -1525,7 +1525,7 @@ static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port, if (ret) goto out_destroy_sq; - queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work); + queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work); return 0; out_destroy_sq: @@ -1612,7 +1612,6 @@ static int nvmet_tcp_add_port(struct nvmet_port *nport) } port->nport = nport; - port->last_cpu = -1; INIT_WORK(&port->accept_work, nvmet_tcp_accept_work); if (port->nport->inline_data_size < 0) port->nport->inline_data_size = NVMET_TCP_DEF_INLINE_DATA_SIZE; diff --git a/drivers/scsi/hisi_sas/hisi_sas.h b/drivers/scsi/hisi_sas/hisi_sas.h index 2bdd64648ef0..e6acbf940712 100644 --- a/drivers/scsi/hisi_sas/hisi_sas.h +++ b/drivers/scsi/hisi_sas/hisi_sas.h @@ -8,6 +8,8 @@ #define _HISI_SAS_H_ #include <linux/acpi.h> +#include <linux/blk-mq.h> +#include <linux/blk-mq-pci.h> #include <linux/clk.h> #include <linux/debugfs.h> #include <linux/dmapool.h> @@ -431,7 +433,6 @@ struct hisi_hba { u32 intr_coal_count; /* Interrupt count to coalesce */ int cq_nvecs; - unsigned int *reply_map; /* bist */ enum sas_linkrate debugfs_bist_linkrate; diff --git a/drivers/scsi/hisi_sas/hisi_sas_main.c b/drivers/scsi/hisi_sas/hisi_sas_main.c index d9d21d23372e..a994c7b8d26f 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_main.c +++ b/drivers/scsi/hisi_sas/hisi_sas_main.c @@ -417,6 +417,7 @@ static int hisi_sas_task_prep(struct sas_task *task, struct device *dev = hisi_hba->dev; int dlvry_queue_slot, dlvry_queue, rc, slot_idx; int n_elem = 0, n_elem_dif = 0, n_elem_req = 0; + struct scsi_cmnd *scmd = NULL; struct hisi_sas_dq *dq; unsigned long flags; int wr_q_index; @@ -432,10 +433,23 @@ static int hisi_sas_task_prep(struct sas_task *task, return -ECOMM; } - if (hisi_hba->reply_map) { - int cpu = raw_smp_processor_id(); - unsigned int dq_index = hisi_hba->reply_map[cpu]; + if (task->uldd_task) { + struct ata_queued_cmd *qc; + if (dev_is_sata(device)) { + qc = task->uldd_task; + scmd = qc->scsicmd; + } else { + scmd = task->uldd_task; + } + } + + if (scmd) { + unsigned int dq_index; + u32 blk_tag; + + blk_tag = blk_mq_unique_tag(scmd->request); + dq_index = blk_mq_unique_tag_to_hwq(blk_tag); *dq_pointer = dq = &hisi_hba->dq[dq_index]; } else { *dq_pointer = dq = sas_dev->dq; @@ -464,21 +478,9 @@ static int hisi_sas_task_prep(struct sas_task *task, if (hisi_hba->hw->slot_index_alloc) rc = hisi_hba->hw->slot_index_alloc(hisi_hba, device); - else { - struct scsi_cmnd *scsi_cmnd = NULL; - - if (task->uldd_task) { - struct ata_queued_cmd *qc; + else + rc = hisi_sas_slot_index_alloc(hisi_hba, scmd); - if (dev_is_sata(device)) { - qc = task->uldd_task; - scsi_cmnd = qc->scsicmd; - } else { - scsi_cmnd = task->uldd_task; - } - } - rc = hisi_sas_slot_index_alloc(hisi_hba, scsi_cmnd); - } if (rc < 0) goto err_out_dif_dma_unmap; diff --git a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c index 60adf5c32143..b6f75a1764df 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c @@ -2362,68 +2362,36 @@ static irqreturn_t cq_interrupt_v3_hw(int irq_no, void *p) return IRQ_WAKE_THREAD; } -static void setup_reply_map_v3_hw(struct hisi_hba *hisi_hba, int nvecs) +static int interrupt_preinit_v3_hw(struct hisi_hba *hisi_hba) { - const struct cpumask *mask; - int queue, cpu; + int vectors; + int max_msi = HISI_SAS_MSI_COUNT_V3_HW, min_msi; + struct Scsi_Host *shost = hisi_hba->shost; + struct irq_affinity desc = { + .pre_vectors = BASE_VECTORS_V3_HW, + }; - for (queue = 0; queue < nvecs; queue++) { - struct hisi_sas_cq *cq = &hisi_hba->cq[queue]; + min_msi = MIN_AFFINE_VECTORS_V3_HW; + vectors = pci_alloc_irq_vectors_affinity(hisi_hba->pci_dev, + min_msi, max_msi, + PCI_IRQ_MSI | + PCI_IRQ_AFFINITY, + &desc); + if (vectors < 0) + return -ENOENT; - mask = pci_irq_get_affinity(hisi_hba->pci_dev, queue + - BASE_VECTORS_V3_HW); - if (!mask) - goto fallback; - cq->irq_mask = mask; - for_each_cpu(cpu, mask) - hisi_hba->reply_map[cpu] = queue; - } - return; -fallback: - for_each_possible_cpu(cpu) - hisi_hba->reply_map[cpu] = cpu % hisi_hba->queue_count; - /* Don't clean all CQ masks */ + hisi_hba->cq_nvecs = vectors - BASE_VECTORS_V3_HW; + shost->nr_hw_queues = hisi_hba->cq_nvecs; + + return 0; } static int interrupt_init_v3_hw(struct hisi_hba *hisi_hba) { struct device *dev = hisi_hba->dev; struct pci_dev *pdev = hisi_hba->pci_dev; - int vectors, rc, i; - int max_msi = HISI_SAS_MSI_COUNT_V3_HW, min_msi; - - if (auto_affine_msi_experimental) { - struct irq_affinity desc = { - .pre_vectors = BASE_VECTORS_V3_HW, - }; - - dev_info(dev, "Enable MSI auto-affinity\n"); - - min_msi = MIN_AFFINE_VECTORS_V3_HW; - - hisi_hba->reply_map = devm_kcalloc(dev, nr_cpu_ids, - sizeof(unsigned int), - GFP_KERNEL); - if (!hisi_hba->reply_map) - return -ENOMEM; - vectors = pci_alloc_irq_vectors_affinity(hisi_hba->pci_dev, - min_msi, max_msi, - PCI_IRQ_MSI | - PCI_IRQ_AFFINITY, - &desc); - if (vectors < 0) - return -ENOENT; - setup_reply_map_v3_hw(hisi_hba, vectors - BASE_VECTORS_V3_HW); - } else { - min_msi = max_msi; - vectors = pci_alloc_irq_vectors(hisi_hba->pci_dev, min_msi, - max_msi, PCI_IRQ_MSI); - if (vectors < 0) - return vectors; - } - - hisi_hba->cq_nvecs = vectors - BASE_VECTORS_V3_HW; + int rc, i; rc = devm_request_irq(dev, pci_irq_vector(pdev, 1), int_phy_up_down_bcast_v3_hw, 0, @@ -3072,6 +3040,15 @@ static int debugfs_set_bist_v3_hw(struct hisi_hba *hisi_hba, bool enable) return 0; } +static int hisi_sas_map_queues(struct Scsi_Host *shost) +{ + struct hisi_hba *hisi_hba = shost_priv(shost); + struct blk_mq_queue_map *qmap = &shost->tag_set.map[HCTX_TYPE_DEFAULT]; + + return blk_mq_pci_map_queues(qmap, hisi_hba->pci_dev, + BASE_VECTORS_V3_HW); +} + static struct scsi_host_template sht_v3_hw = { .name = DRV_NAME, .proc_name = DRV_NAME, @@ -3082,6 +3059,7 @@ static struct scsi_host_template sht_v3_hw = { .slave_configure = hisi_sas_slave_configure, .scan_finished = hisi_sas_scan_finished, .scan_start = hisi_sas_scan_start, + .map_queues = hisi_sas_map_queues, .change_queue_depth = sas_change_queue_depth, .bios_param = sas_bios_param, .this_id = -1, @@ -3098,6 +3076,7 @@ static struct scsi_host_template sht_v3_hw = { .shost_attrs = host_attrs_v3_hw, .tag_alloc_policy = BLK_TAG_ALLOC_RR, .host_reset = hisi_sas_host_reset, + .host_tagset = 1, }; static const struct hisi_sas_hw hisi_sas_v3_hw = { @@ -3269,6 +3248,10 @@ hisi_sas_v3_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (hisi_sas_debugfs_enable) hisi_sas_debugfs_init(hisi_hba); + rc = interrupt_preinit_v3_hw(hisi_hba); + if (rc) + goto err_out_ha; + dev_err(dev, "%d hw queues\n", shost->nr_hw_queues); rc = scsi_add_host(shost, dev); if (rc) goto err_out_ha; diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c index 37d1c5565d90..2f162603876f 100644 --- a/drivers/scsi/hosts.c +++ b/drivers/scsi/hosts.c @@ -421,6 +421,7 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize) shost->cmd_per_lun = sht->cmd_per_lun; shost->unchecked_isa_dma = sht->unchecked_isa_dma; shost->no_write_same = sht->no_write_same; + shost->host_tagset = sht->host_tagset; if (shost_eh_deadline == -1 || !sht->eh_host_reset_handler) shost->eh_deadline = -1; diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index 2b7e7b5f38ed..020270ce790b 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -37,6 +37,7 @@ #include <linux/poll.h> #include <linux/vmalloc.h> #include <linux/irq_poll.h> +#include <linux/blk-mq-pci.h> #include <scsi/scsi.h> #include <scsi/scsi_cmnd.h> @@ -113,6 +114,10 @@ unsigned int enable_sdev_max_qd; module_param(enable_sdev_max_qd, int, 0444); MODULE_PARM_DESC(enable_sdev_max_qd, "Enable sdev max qd as can_queue. Default: 0"); +int host_tagset_enable = 1; +module_param(host_tagset_enable, int, 0444); +MODULE_PARM_DESC(host_tagset_enable, "Shared host tagset enable/disable Default: enable(1)"); + MODULE_LICENSE("GPL"); MODULE_VERSION(MEGASAS_VERSION); MODULE_AUTHOR("megaraidlinux.pdl@broadcom.com"); @@ -3119,6 +3124,19 @@ megasas_bios_param(struct scsi_device *sdev, struct block_device *bdev, return 0; } +static int megasas_map_queues(struct Scsi_Host *shost) +{ + struct megasas_instance *instance; + + instance = (struct megasas_instance *)shost->hostdata; + + if (shost->nr_hw_queues == 1) + return 0; + + return blk_mq_pci_map_queues(&shost->tag_set.map[HCTX_TYPE_DEFAULT], + instance->pdev, instance->low_latency_index_start); +} + static void megasas_aen_polling(struct work_struct *work); /** @@ -3427,6 +3445,7 @@ static struct scsi_host_template megasas_template = { .eh_timed_out = megasas_reset_timer, .shost_attrs = megaraid_host_attrs, .bios_param = megasas_bios_param, + .map_queues = megasas_map_queues, .change_queue_depth = scsi_change_queue_depth, .max_segment_size = 0xffffffff, }; @@ -6808,6 +6827,26 @@ static int megasas_io_attach(struct megasas_instance *instance) host->max_lun = MEGASAS_MAX_LUN; host->max_cmd_len = 16; + /* Use shared host tagset only for fusion adaptors + * if there are managed interrupts (smp affinity enabled case). + * Single msix_vectors in kdump, so shared host tag is also disabled. + */ + + host->host_tagset = 0; + host->nr_hw_queues = 1; + + if ((instance->adapter_type != MFI_SERIES) && + (instance->msix_vectors > instance->low_latency_index_start) && + host_tagset_enable && + instance->smp_affinity_enable) { + host->host_tagset = 1; + host->nr_hw_queues = instance->msix_vectors - + instance->low_latency_index_start; + } + + dev_info(&instance->pdev->dev, + "Max firmware commands: %d shared with nr_hw_queues = %d\n", + instance->max_fw_cmds, host->nr_hw_queues); /* * Notify the mid-layer about the new controller */ diff --git a/drivers/scsi/megaraid/megaraid_sas_fusion.c b/drivers/scsi/megaraid/megaraid_sas_fusion.c index b0c01cf0428f..fd607287608e 100644 --- a/drivers/scsi/megaraid/megaraid_sas_fusion.c +++ b/drivers/scsi/megaraid/megaraid_sas_fusion.c @@ -359,24 +359,29 @@ megasas_get_msix_index(struct megasas_instance *instance, { int sdev_busy; - /* nr_hw_queue = 1 for MegaRAID */ - struct blk_mq_hw_ctx *hctx = - scmd->device->request_queue->queue_hw_ctx[0]; - - sdev_busy = atomic_read(&hctx->nr_active); + /* TBD - if sml remove device_busy in future, driver + * should track counter in internal structure. + */ + sdev_busy = atomic_read(&scmd->device->device_busy); if (instance->perf_mode == MR_BALANCED_PERF_MODE && - sdev_busy > (data_arms * MR_DEVICE_HIGH_IOPS_DEPTH)) + sdev_busy > (data_arms * MR_DEVICE_HIGH_IOPS_DEPTH)) { cmd->request_desc->SCSIIO.MSIxIndex = mega_mod64((atomic64_add_return(1, &instance->high_iops_outstanding) / MR_HIGH_IOPS_BATCH_COUNT), instance->low_latency_index_start); - else if (instance->msix_load_balance) + } else if (instance->msix_load_balance) { cmd->request_desc->SCSIIO.MSIxIndex = (mega_mod64(atomic64_add_return(1, &instance->total_io_count), instance->msix_vectors)); - else + } else if (instance->host->nr_hw_queues > 1) { + u32 tag = blk_mq_unique_tag(scmd->request); + + cmd->request_desc->SCSIIO.MSIxIndex = blk_mq_unique_tag_to_hwq(tag) + + instance->low_latency_index_start; + } else { cmd->request_desc->SCSIIO.MSIxIndex = instance->reply_map[raw_smp_processor_id()]; + } } /** @@ -956,9 +961,6 @@ megasas_alloc_cmds_fusion(struct megasas_instance *instance) if (megasas_alloc_cmdlist_fusion(instance)) goto fail_exit; - dev_info(&instance->pdev->dev, "Configured max firmware commands: %d\n", - instance->max_fw_cmds); - /* The first 256 bytes (SMID 0) is not used. Don't add to the cmd list */ io_req_base = fusion->io_request_frames + MEGA_MPI2_RAID_DEFAULT_IO_FRAME_SIZE; io_req_base_phys = fusion->io_request_frames_phys + MEGA_MPI2_RAID_DEFAULT_IO_FRAME_SIZE; @@ -1102,8 +1104,9 @@ megasas_ioc_init_fusion(struct megasas_instance *instance) MR_HIGH_IOPS_QUEUE_COUNT) && cur_intr_coalescing) instance->perf_mode = MR_BALANCED_PERF_MODE; - dev_info(&instance->pdev->dev, "Performance mode :%s\n", - MEGASAS_PERF_MODE_2STR(instance->perf_mode)); + dev_info(&instance->pdev->dev, "Performance mode :%s (latency index = %d)\n", + MEGASAS_PERF_MODE_2STR(instance->perf_mode), + instance->low_latency_index_start); instance->fw_sync_cache_support = (scratch_pad_1 & MR_CAN_HANDLE_SYNC_CACHE_OFFSET) ? 1 : 0; diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index 1ad7260d4758..a87e40aec11f 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -4696,19 +4696,14 @@ fini: static struct sdebug_queue *get_queue(struct scsi_cmnd *cmnd) { u16 hwq; + u32 tag = blk_mq_unique_tag(cmnd->request); - if (sdebug_host_max_queue) { - /* Provide a simple method to choose the hwq */ - hwq = smp_processor_id() % submit_queues; - } else { - u32 tag = blk_mq_unique_tag(cmnd->request); + hwq = blk_mq_unique_tag_to_hwq(tag); - hwq = blk_mq_unique_tag_to_hwq(tag); + pr_debug("tag=%#x, hwq=%d\n", tag, hwq); + if (WARN_ON_ONCE(hwq >= submit_queues)) + hwq = 0; - pr_debug("tag=%#x, hwq=%d\n", tag, hwq); - if (WARN_ON_ONCE(hwq >= submit_queues)) - hwq = 0; - } return sdebug_q_arr + hwq; } @@ -7347,10 +7342,7 @@ static int sdebug_driver_probe(struct device *dev) sdbg_host = to_sdebug_host(dev); - if (sdebug_host_max_queue) - sdebug_driver_template.can_queue = sdebug_host_max_queue; - else - sdebug_driver_template.can_queue = sdebug_max_queue; + sdebug_driver_template.can_queue = sdebug_max_queue; if (!sdebug_clustering) sdebug_driver_template.dma_boundary = PAGE_SIZE - 1; @@ -7367,11 +7359,11 @@ static int sdebug_driver_probe(struct device *dev) } /* * Decide whether to tell scsi subsystem that we want mq. The - * following should give the same answer for each host. If the host - * has a limit of hostwide max commands, then do not set. + * following should give the same answer for each host. */ - if (!sdebug_host_max_queue) - hpnt->nr_hw_queues = submit_queues; + hpnt->nr_hw_queues = submit_queues; + if (sdebug_host_max_queue) + hpnt->host_tagset = 1; sdbg_host->shost = hpnt; *((struct sdebug_host_info **)hpnt->hostdata) = sdbg_host; diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 7affaaf8b98e..a89478a0c588 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1891,6 +1891,8 @@ int scsi_mq_setup_tags(struct Scsi_Host *shost) tag_set->flags |= BLK_ALLOC_POLICY_TO_MQ_FLAG(shost->hostt->tag_alloc_policy); tag_set->driver_data = shost; + if (shost->host_tagset) + tag_set->flags |= BLK_MQ_F_TAG_HCTX_SHARED; return blk_mq_alloc_tag_set(tag_set); } diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c index 163dbcb741c1..d6e344fa33ad 100644 --- a/drivers/scsi/scsi_sysfs.c +++ b/drivers/scsi/scsi_sysfs.c @@ -393,6 +393,16 @@ show_use_blk_mq(struct device *dev, struct device_attribute *attr, char *buf) } static DEVICE_ATTR(use_blk_mq, S_IRUGO, show_use_blk_mq, NULL); +static ssize_t +show_nr_hw_queues(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct Scsi_Host *shost = class_to_shost(dev); + struct blk_mq_tag_set *tag_set = &shost->tag_set; + + return snprintf(buf, 20, "%d\n", tag_set->nr_hw_queues); +} +static DEVICE_ATTR(nr_hw_queues, S_IRUGO, show_nr_hw_queues, NULL); + static struct attribute *scsi_sysfs_shost_attrs[] = { &dev_attr_use_blk_mq.attr, &dev_attr_unique_id.attr, @@ -411,6 +421,7 @@ static struct attribute *scsi_sysfs_shost_attrs[] = { &dev_attr_prot_guard_type.attr, &dev_attr_host_reset.attr, &dev_attr_eh_deadline.attr, + &dev_attr_nr_hw_queues.attr, NULL }; |