diff options
Diffstat (limited to 'drivers')
69 files changed, 1841 insertions, 4283 deletions
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index a41145d52de9..a2184b428493 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -285,49 +285,6 @@ config BLK_DEV_RAM_SIZE The default value is 4096 kilobytes. Only change this if you know what you are doing. -config CDROM_PKTCDVD - tristate "Packet writing on CD/DVD media (DEPRECATED)" - depends on !UML - depends on SCSI - select CDROM - help - Note: This driver is deprecated and will be removed from the - kernel in the near future! - - If you have a CDROM/DVD drive that supports packet writing, say - Y to include support. It should work with any MMC/Mt Fuji - compliant ATAPI or SCSI drive, which is just about any newer - DVD/CD writer. - - Currently only writing to CD-RW, DVD-RW, DVD+RW and DVDRAM discs - is possible. - DVD-RW disks must be in restricted overwrite mode. - - See the file <file:Documentation/cdrom/packet-writing.rst> - for further information on the use of this driver. - - To compile this driver as a module, choose M here: the - module will be called pktcdvd. - -config CDROM_PKTCDVD_BUFFERS - int "Free buffers for data gathering" - depends on CDROM_PKTCDVD - default "8" - help - This controls the maximum number of active concurrent packets. More - concurrent packets can increase write performance, but also require - more memory. Each concurrent packet will require approximately 64Kb - of non-swappable kernel memory, memory which will be allocated when - a disc is opened for writing. - -config CDROM_PKTCDVD_WCACHE - bool "Enable write caching" - depends on CDROM_PKTCDVD - help - If enabled, write caching will be set for the CD-R/W device. For now - this option is dangerous unless the CD-RW media is known good, as we - don't do deferred write error handling yet. - config ATA_OVER_ETH tristate "ATA over Ethernet support" depends on NET diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 101612cba303..962ee65d8ca3 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -20,7 +20,6 @@ obj-$(CONFIG_AMIGA_Z2RAM) += z2ram.o obj-$(CONFIG_N64CART) += n64cart.o obj-$(CONFIG_BLK_DEV_RAM) += brd.o obj-$(CONFIG_BLK_DEV_LOOP) += loop.o -obj-$(CONFIG_CDROM_PKTCDVD) += pktcdvd.o obj-$(CONFIG_SUNVDC) += sunvdc.o obj-$(CONFIG_BLK_DEV_NBD) += nbd.o diff --git a/drivers/block/drbd/Kconfig b/drivers/block/drbd/Kconfig index cbacddc55a1d..6fb4e38fca88 100644 --- a/drivers/block/drbd/Kconfig +++ b/drivers/block/drbd/Kconfig @@ -1,4 +1,4 @@ -# SPDX-License-Identifier: GPL-2.0 +# SPDX-License-Identifier: GPL-2.0-only # # DRBD device driver configuration # diff --git a/drivers/block/drbd/Makefile b/drivers/block/drbd/Makefile index 8bd534697d1b..c93e462130ff 100644 --- a/drivers/block/drbd/Makefile +++ b/drivers/block/drbd/Makefile @@ -1,4 +1,4 @@ -# SPDX-License-Identifier: GPL-2.0 +# SPDX-License-Identifier: GPL-2.0-only drbd-y := drbd_bitmap.o drbd_proc.o drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o drbd-y += drbd_main.o drbd_strings.o drbd_nl.o diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index e27478ae579c..429255876800 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0-or-later +// SPDX-License-Identifier: GPL-2.0-only /* drbd_actlog.c @@ -868,9 +868,9 @@ int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size, nr_sectors = get_capacity(device->vdisk); esector = sector + (size >> 9) - 1; - if (!expect(sector < nr_sectors)) + if (!expect(device, sector < nr_sectors)) goto out; - if (!expect(esector < nr_sectors)) + if (!expect(device, esector < nr_sectors)) esector = nr_sectors - 1; lbnr = BM_SECT_TO_BIT(nr_sectors-1); @@ -1143,7 +1143,7 @@ void drbd_rs_complete_io(struct drbd_device *device, sector_t sector) bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; if (!bm_ext) { spin_unlock_irqrestore(&device->al_lock, flags); - if (__ratelimit(&drbd_ratelimit_state)) + if (drbd_ratelimit()) drbd_err(device, "drbd_rs_complete_io() called, but extent not found\n"); return; } diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 7d9db33363de..289876ffbc31 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0-or-later +// SPDX-License-Identifier: GPL-2.0-only /* drbd_bitmap.c @@ -113,7 +113,7 @@ struct drbd_bitmap { static void __bm_print_lock_info(struct drbd_device *device, const char *func) { struct drbd_bitmap *b = device->bitmap; - if (!__ratelimit(&drbd_ratelimit_state)) + if (!drbd_ratelimit()) return; drbd_err(device, "FIXME %s[%d] in %s, bitmap locked for '%s' by %s[%d]\n", current->comm, task_pid_nr(current), @@ -448,7 +448,7 @@ int drbd_bm_init(struct drbd_device *device) sector_t drbd_bm_capacity(struct drbd_device *device) { - if (!expect(device->bitmap)) + if (!expect(device, device->bitmap)) return 0; return device->bitmap->bm_dev_capacity; } @@ -457,7 +457,7 @@ sector_t drbd_bm_capacity(struct drbd_device *device) */ void drbd_bm_cleanup(struct drbd_device *device) { - if (!expect(device->bitmap)) + if (!expect(device, device->bitmap)) return; bm_free_pages(device->bitmap->bm_pages, device->bitmap->bm_number_of_pages); bm_vk_free(device->bitmap->bm_pages); @@ -636,7 +636,7 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi int err = 0; bool growing; - if (!expect(b)) + if (!expect(device, b)) return -ENOMEM; drbd_bm_lock(device, "resize", BM_LOCKED_MASK); @@ -757,9 +757,9 @@ unsigned long _drbd_bm_total_weight(struct drbd_device *device) unsigned long s; unsigned long flags; - if (!expect(b)) + if (!expect(device, b)) return 0; - if (!expect(b->bm_pages)) + if (!expect(device, b->bm_pages)) return 0; spin_lock_irqsave(&b->bm_lock, flags); @@ -783,9 +783,9 @@ unsigned long drbd_bm_total_weight(struct drbd_device *device) size_t drbd_bm_words(struct drbd_device *device) { struct drbd_bitmap *b = device->bitmap; - if (!expect(b)) + if (!expect(device, b)) return 0; - if (!expect(b->bm_pages)) + if (!expect(device, b->bm_pages)) return 0; return b->bm_words; @@ -794,7 +794,7 @@ size_t drbd_bm_words(struct drbd_device *device) unsigned long drbd_bm_bits(struct drbd_device *device) { struct drbd_bitmap *b = device->bitmap; - if (!expect(b)) + if (!expect(device, b)) return 0; return b->bm_bits; @@ -816,9 +816,9 @@ void drbd_bm_merge_lel(struct drbd_device *device, size_t offset, size_t number, end = offset + number; - if (!expect(b)) + if (!expect(device, b)) return; - if (!expect(b->bm_pages)) + if (!expect(device, b->bm_pages)) return; if (number == 0) return; @@ -863,9 +863,9 @@ void drbd_bm_get_lel(struct drbd_device *device, size_t offset, size_t number, end = offset + number; - if (!expect(b)) + if (!expect(device, b)) return; - if (!expect(b->bm_pages)) + if (!expect(device, b->bm_pages)) return; spin_lock_irq(&b->bm_lock); @@ -894,9 +894,9 @@ void drbd_bm_get_lel(struct drbd_device *device, size_t offset, size_t number, void drbd_bm_set_all(struct drbd_device *device) { struct drbd_bitmap *b = device->bitmap; - if (!expect(b)) + if (!expect(device, b)) return; - if (!expect(b->bm_pages)) + if (!expect(device, b->bm_pages)) return; spin_lock_irq(&b->bm_lock); @@ -910,9 +910,9 @@ void drbd_bm_set_all(struct drbd_device *device) void drbd_bm_clear_all(struct drbd_device *device) { struct drbd_bitmap *b = device->bitmap; - if (!expect(b)) + if (!expect(device, b)) return; - if (!expect(b->bm_pages)) + if (!expect(device, b->bm_pages)) return; spin_lock_irq(&b->bm_lock); @@ -952,7 +952,7 @@ static void drbd_bm_endio(struct bio *bio) bm_set_page_io_err(b->bm_pages[idx]); /* Not identical to on disk version of it. * Is BM_PAGE_IO_ERROR enough? */ - if (__ratelimit(&drbd_ratelimit_state)) + if (drbd_ratelimit()) drbd_err(device, "IO ERROR %d on bitmap page idx %u\n", bio->bi_status, idx); } else { @@ -1013,7 +1013,7 @@ static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_ho else len = PAGE_SIZE; } else { - if (__ratelimit(&drbd_ratelimit_state)) { + if (drbd_ratelimit()) { drbd_err(device, "Invalid offset during on-disk bitmap access: " "page idx %u, sector %llu\n", page_nr, on_disk_sector); } @@ -1332,9 +1332,9 @@ static unsigned long bm_find_next(struct drbd_device *device, struct drbd_bitmap *b = device->bitmap; unsigned long i = DRBD_END_OF_BITMAP; - if (!expect(b)) + if (!expect(device, b)) return i; - if (!expect(b->bm_pages)) + if (!expect(device, b->bm_pages)) return i; spin_lock_irq(&b->bm_lock); @@ -1436,9 +1436,9 @@ static int bm_change_bits_to(struct drbd_device *device, const unsigned long s, struct drbd_bitmap *b = device->bitmap; int c = 0; - if (!expect(b)) + if (!expect(device, b)) return 1; - if (!expect(b->bm_pages)) + if (!expect(device, b->bm_pages)) return 0; spin_lock_irqsave(&b->bm_lock, flags); @@ -1582,9 +1582,9 @@ int drbd_bm_test_bit(struct drbd_device *device, const unsigned long bitnr) unsigned long *p_addr; int i; - if (!expect(b)) + if (!expect(device, b)) return 0; - if (!expect(b->bm_pages)) + if (!expect(device, b->bm_pages)) return 0; spin_lock_irqsave(&b->bm_lock, flags); @@ -1619,9 +1619,9 @@ int drbd_bm_count_bits(struct drbd_device *device, const unsigned long s, const * robust in case we screwed up elsewhere, in that case pretend there * was one dirty bit in the requested area, so we won't try to do a * local read there (no bitmap probably implies no disk) */ - if (!expect(b)) + if (!expect(device, b)) return 1; - if (!expect(b->bm_pages)) + if (!expect(device, b->bm_pages)) return 1; spin_lock_irqsave(&b->bm_lock, flags); @@ -1635,7 +1635,7 @@ int drbd_bm_count_bits(struct drbd_device *device, const unsigned long s, const bm_unmap(p_addr); p_addr = bm_map_pidx(b, idx); } - if (expect(bitnr < b->bm_bits)) + if (expect(device, bitnr < b->bm_bits)) c += (0 != test_bit_le(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr)); else drbd_err(device, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits); @@ -1668,9 +1668,9 @@ int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr) unsigned long flags; unsigned long *p_addr, *bm; - if (!expect(b)) + if (!expect(device, b)) return 0; - if (!expect(b->bm_pages)) + if (!expect(device, b->bm_pages)) return 0; spin_lock_irqsave(&b->bm_lock, flags); diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c index b3b9cd5628fd..a72c096aa5b1 100644 --- a/drivers/block/drbd/drbd_debugfs.c +++ b/drivers/block/drbd/drbd_debugfs.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) "drbd debugfs: " fmt #include <linux/kernel.h> #include <linux/module.h> diff --git a/drivers/block/drbd/drbd_debugfs.h b/drivers/block/drbd/drbd_debugfs.h index 58e31cef0844..ee3d66eb40c6 100644 --- a/drivers/block/drbd/drbd_debugfs.h +++ b/drivers/block/drbd/drbd_debugfs.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0-only */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/debugfs.h> diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 4d661282ff41..ae713338aa46 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* SPDX-License-Identifier: GPL-2.0-only */ /* drbd_int.h @@ -37,6 +37,7 @@ #include "drbd_strings.h" #include "drbd_state.h" #include "drbd_protocol.h" +#include "drbd_polymorph_printk.h" #ifdef __CHECKER__ # define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr"))) @@ -75,71 +76,6 @@ extern int drbd_proc_details; struct drbd_device; struct drbd_connection; -#define __drbd_printk_device(level, device, fmt, args...) \ - dev_printk(level, disk_to_dev((device)->vdisk), fmt, ## args) -#define __drbd_printk_peer_device(level, peer_device, fmt, args...) \ - dev_printk(level, disk_to_dev((peer_device)->device->vdisk), fmt, ## args) -#define __drbd_printk_resource(level, resource, fmt, args...) \ - printk(level "drbd %s: " fmt, (resource)->name, ## args) -#define __drbd_printk_connection(level, connection, fmt, args...) \ - printk(level "drbd %s: " fmt, (connection)->resource->name, ## args) - -void drbd_printk_with_wrong_object_type(void); - -#define __drbd_printk_if_same_type(obj, type, func, level, fmt, args...) \ - (__builtin_types_compatible_p(typeof(obj), type) || \ - __builtin_types_compatible_p(typeof(obj), const type)), \ - func(level, (const type)(obj), fmt, ## args) - -#define drbd_printk(level, obj, fmt, args...) \ - __builtin_choose_expr( \ - __drbd_printk_if_same_type(obj, struct drbd_device *, \ - __drbd_printk_device, level, fmt, ## args), \ - __builtin_choose_expr( \ - __drbd_printk_if_same_type(obj, struct drbd_resource *, \ - __drbd_printk_resource, level, fmt, ## args), \ - __builtin_choose_expr( \ - __drbd_printk_if_same_type(obj, struct drbd_connection *, \ - __drbd_printk_connection, level, fmt, ## args), \ - __builtin_choose_expr( \ - __drbd_printk_if_same_type(obj, struct drbd_peer_device *, \ - __drbd_printk_peer_device, level, fmt, ## args), \ - drbd_printk_with_wrong_object_type())))) - -#define drbd_dbg(obj, fmt, args...) \ - drbd_printk(KERN_DEBUG, obj, fmt, ## args) -#define drbd_alert(obj, fmt, args...) \ - drbd_printk(KERN_ALERT, obj, fmt, ## args) -#define drbd_err(obj, fmt, args...) \ - drbd_printk(KERN_ERR, obj, fmt, ## args) -#define drbd_warn(obj, fmt, args...) \ - drbd_printk(KERN_WARNING, obj, fmt, ## args) -#define drbd_info(obj, fmt, args...) \ - drbd_printk(KERN_INFO, obj, fmt, ## args) -#define drbd_emerg(obj, fmt, args...) \ - drbd_printk(KERN_EMERG, obj, fmt, ## args) - -#define dynamic_drbd_dbg(device, fmt, args...) \ - dynamic_dev_dbg(disk_to_dev(device->vdisk), fmt, ## args) - -#define D_ASSERT(device, exp) do { \ - if (!(exp)) \ - drbd_err(device, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__); \ - } while (0) - -/** - * expect - Make an assertion - * - * Unlike the assert macro, this macro returns a boolean result. - */ -#define expect(exp) ({ \ - bool _bool = (exp); \ - if (!_bool) \ - drbd_err(device, "ASSERTION %s FAILED in %s\n", \ - #exp, __func__); \ - _bool; \ - }) - /* Defines to control fault insertion */ enum { DRBD_FAULT_MD_WR = 0, /* meta data write */ @@ -395,6 +331,7 @@ struct drbd_peer_request { struct drbd_peer_device *peer_device; struct drbd_epoch *epoch; /* for writes */ struct page *pages; + blk_opf_t opf; atomic_t pending_bios; struct drbd_interval i; /* see comments on ee flag bits below */ @@ -406,6 +343,10 @@ struct drbd_peer_request { }; }; +/* Equivalent to bio_op and req_op. */ +#define peer_req_op(peer_req) \ + ((peer_req)->opf & REQ_OP_MASK) + /* ee flag bits. * While corresponding bios are in flight, the only modification will be * set_bit WAS_ERROR, which has to be atomic. @@ -1545,8 +1486,7 @@ extern void drbd_send_acks_wf(struct work_struct *ws); extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device); extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector, bool throttle_if_app_is_waiting); -extern int drbd_submit_peer_request(struct drbd_device *, - struct drbd_peer_request *, blk_opf_t, int); +extern int drbd_submit_peer_request(struct drbd_peer_request *peer_req); extern int drbd_free_peer_reqs(struct drbd_device *, struct list_head *); extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_peer_device *, u64, sector_t, unsigned int, @@ -1718,7 +1658,7 @@ static inline void __drbd_chk_io_error_(struct drbd_device *device, switch (ep) { case EP_PASS_ON: /* FIXME would this be better named "Ignore"? */ if (df == DRBD_READ_ERROR || df == DRBD_WRITE_ERROR) { - if (__ratelimit(&drbd_ratelimit_state)) + if (drbd_ratelimit()) drbd_err(device, "Local IO failed in %s.\n", where); if (device->state.disk > D_INCONSISTENT) _drbd_set_state(_NS(device, disk, D_INCONSISTENT), CS_HARD, NULL); diff --git a/drivers/block/drbd/drbd_interval.c b/drivers/block/drbd/drbd_interval.c index f07b4378388b..5024ffd6143d 100644 --- a/drivers/block/drbd/drbd_interval.c +++ b/drivers/block/drbd/drbd_interval.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: GPL-2.0-only #include <asm/bug.h> #include <linux/rbtree_augmented.h> #include "drbd_interval.h" diff --git a/drivers/block/drbd/drbd_interval.h b/drivers/block/drbd/drbd_interval.h index b8c2dee5edc8..366489b72fe9 100644 --- a/drivers/block/drbd/drbd_interval.h +++ b/drivers/block/drbd/drbd_interval.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0-only */ #ifndef __DRBD_INTERVAL_H #define __DRBD_INTERVAL_H diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 0aa1dde07a98..2f16e1bfb6e7 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0-or-later +// SPDX-License-Identifier: GPL-2.0-only /* drbd.c @@ -1259,7 +1259,7 @@ static int _drbd_send_bitmap(struct drbd_device *device) struct bm_xfer_ctx c; int err; - if (!expect(device->bitmap)) + if (!expect(device, device->bitmap)) return false; if (get_ldev(device)) { @@ -2217,7 +2217,8 @@ void drbd_destroy_device(struct kref *kref) kref_put(&peer_device->connection->kref, drbd_destroy_connection); kfree(peer_device); } - memset(device, 0xfd, sizeof(*device)); + if (device->submit.wq) + destroy_workqueue(device->submit.wq); kfree(device); kref_put(&resource->kref, drbd_destroy_resource); } @@ -2249,9 +2250,9 @@ static void do_retry(struct work_struct *ws) bool expected; expected = - expect(atomic_read(&req->completion_ref) == 0) && - expect(req->rq_state & RQ_POSTPONED) && - expect((req->rq_state & RQ_LOCAL_PENDING) == 0 || + expect(device, atomic_read(&req->completion_ref) == 0) && + expect(device, req->rq_state & RQ_POSTPONED) && + expect(device, (req->rq_state & RQ_LOCAL_PENDING) == 0 || (req->rq_state & RQ_LOCAL_ABORTED) != 0); if (!expected) @@ -2309,7 +2310,6 @@ void drbd_destroy_resource(struct kref *kref) idr_destroy(&resource->devices); free_cpumask_var(resource->cpu_mask); kfree(resource->name); - memset(resource, 0xf2, sizeof(*resource)); kfree(resource); } @@ -2650,7 +2650,6 @@ void drbd_destroy_connection(struct kref *kref) drbd_free_socket(&connection->data); kfree(connection->int_dig_in); kfree(connection->int_dig_vv); - memset(connection, 0xfc, sizeof(*connection)); kfree(connection); kref_put(&resource->kref, drbd_destroy_resource); } @@ -2774,7 +2773,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig err = add_disk(disk); if (err) - goto out_idr_remove_from_resource; + goto out_destroy_workqueue; /* inherit the connection state */ device->state.conn = first_connection(resource)->cstate; @@ -2788,6 +2787,8 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig drbd_debugfs_device_add(device); return NO_ERROR; +out_destroy_workqueue: + destroy_workqueue(device->submit.wq); out_idr_remove_from_resource: for_each_connection_safe(connection, n, resource) { peer_device = idr_remove(&connection->peer_devices, vnr); @@ -3766,7 +3767,7 @@ _drbd_insert_fault(struct drbd_device *device, unsigned int type) if (ret) { drbd_fault_count++; - if (__ratelimit(&drbd_ratelimit_state)) + if (drbd_ratelimit()) drbd_warn(device, "***Simulating %s failure\n", _drbd_fault_str(type)); } diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 864c98e74875..60757ac31701 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0-or-later +// SPDX-License-Identifier: GPL-2.0-only /* drbd_nl.c @@ -1210,6 +1210,7 @@ static void decide_on_discard_support(struct drbd_device *device, struct drbd_connection *connection = first_peer_device(device)->connection; struct request_queue *q = device->rq_queue; + unsigned int max_discard_sectors; if (bdev && !bdev_max_discard_sectors(bdev->backing_bdev)) goto not_supported; @@ -1230,15 +1231,14 @@ static void decide_on_discard_support(struct drbd_device *device, * topology on all peers. */ blk_queue_discard_granularity(q, 512); - q->limits.max_discard_sectors = drbd_max_discard_sectors(connection); - q->limits.max_write_zeroes_sectors = - drbd_max_discard_sectors(connection); + max_discard_sectors = drbd_max_discard_sectors(connection); + blk_queue_max_discard_sectors(q, max_discard_sectors); + blk_queue_max_write_zeroes_sectors(q, max_discard_sectors); return; not_supported: blk_queue_discard_granularity(q, 0); - q->limits.max_discard_sectors = 0; - q->limits.max_write_zeroes_sectors = 0; + blk_queue_max_discard_sectors(q, 0); } static void fixup_write_zeroes(struct drbd_device *device, struct request_queue *q) @@ -1256,6 +1256,18 @@ static void fixup_write_zeroes(struct drbd_device *device, struct request_queue q->limits.max_write_zeroes_sectors = 0; } +static void fixup_discard_support(struct drbd_device *device, struct request_queue *q) +{ + unsigned int max_discard = device->rq_queue->limits.max_discard_sectors; + unsigned int discard_granularity = + device->rq_queue->limits.discard_granularity >> SECTOR_SHIFT; + + if (discard_granularity > max_discard) { + blk_queue_discard_granularity(q, 0); + blk_queue_max_discard_sectors(q, 0); + } +} + static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backing_dev *bdev, unsigned int max_bio_size, struct o_qlim *o) { @@ -1288,6 +1300,7 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi disk_update_readahead(device->vdisk); } fixup_write_zeroes(device, q); + fixup_discard_support(device, q); } void drbd_reconsider_queue_parameters(struct drbd_device *device, struct drbd_backing_dev *bdev, struct o_qlim *o) @@ -1530,7 +1543,7 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) goto fail_unlock; } - if (!expect(new_disk_conf->resync_rate >= 1)) + if (!expect(device, new_disk_conf->resync_rate >= 1)) new_disk_conf->resync_rate = 1; sanitize_disk_conf(device, new_disk_conf, device->ldev); diff --git a/drivers/block/drbd/drbd_nla.c b/drivers/block/drbd/drbd_nla.c index 6a09b0b98018..df0d241d3f6a 100644 --- a/drivers/block/drbd/drbd_nla.c +++ b/drivers/block/drbd/drbd_nla.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: GPL-2.0-only #include <linux/kernel.h> #include <net/netlink.h> #include <linux/drbd_genl_api.h> diff --git a/drivers/block/drbd/drbd_nla.h b/drivers/block/drbd/drbd_nla.h index f5eaffb6474e..d3555df0d353 100644 --- a/drivers/block/drbd/drbd_nla.h +++ b/drivers/block/drbd/drbd_nla.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0-only */ #ifndef __DRBD_NLA_H #define __DRBD_NLA_H diff --git a/drivers/block/drbd/drbd_polymorph_printk.h b/drivers/block/drbd/drbd_polymorph_printk.h new file mode 100644 index 000000000000..8e0082d139ba --- /dev/null +++ b/drivers/block/drbd/drbd_polymorph_printk.h @@ -0,0 +1,141 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef DRBD_POLYMORPH_PRINTK_H +#define DRBD_POLYMORPH_PRINTK_H + +#if !defined(CONFIG_DYNAMIC_DEBUG) +#undef DEFINE_DYNAMIC_DEBUG_METADATA +#undef __dynamic_pr_debug +#undef DYNAMIC_DEBUG_BRANCH +#define DEFINE_DYNAMIC_DEBUG_METADATA(D, F) const char *D = F; ((void)D) +#define __dynamic_pr_debug(D, F, args...) do { (void)(D); if (0) printk(F, ## args); } while (0) +#define DYNAMIC_DEBUG_BRANCH(D) false +#endif + + +#define __drbd_printk_drbd_device_prep(device) \ + const struct drbd_device *__d = (device); \ + const struct drbd_resource *__r = __d->resource +#define __drbd_printk_drbd_device_fmt(fmt) "drbd %s/%u drbd%u: " fmt +#define __drbd_printk_drbd_device_args() __r->name, __d->vnr, __d->minor +#define __drbd_printk_drbd_device_unprep() + +#define __drbd_printk_drbd_peer_device_prep(peer_device) \ + const struct drbd_device *__d; \ + const struct drbd_resource *__r; \ + __d = (peer_device)->device; \ + __r = __d->resource +#define __drbd_printk_drbd_peer_device_fmt(fmt) \ + "drbd %s/%u drbd%u: " fmt +#define __drbd_printk_drbd_peer_device_args() \ + __r->name, __d->vnr, __d->minor +#define __drbd_printk_drbd_peer_device_unprep() + +#define __drbd_printk_drbd_resource_prep(resource) \ + const struct drbd_resource *__r = resource +#define __drbd_printk_drbd_resource_fmt(fmt) "drbd %s: " fmt +#define __drbd_printk_drbd_resource_args() __r->name +#define __drbd_printk_drbd_resource_unprep(resource) + +#define __drbd_printk_drbd_connection_prep(connection) \ + const struct drbd_connection *__c = (connection); \ + const struct drbd_resource *__r = __c->resource +#define __drbd_printk_drbd_connection_fmt(fmt) \ + "drbd %s: " fmt +#define __drbd_printk_drbd_connection_args() \ + __r->name +#define __drbd_printk_drbd_connection_unprep() + +void drbd_printk_with_wrong_object_type(void); +void drbd_dyn_dbg_with_wrong_object_type(void); + +#define __drbd_printk_choose_cond(obj, struct_name) \ + (__builtin_types_compatible_p(typeof(obj), struct struct_name *) || \ + __builtin_types_compatible_p(typeof(obj), const struct struct_name *)) +#define __drbd_printk_if_same_type(obj, struct_name, level, fmt, args...) \ + __drbd_printk_choose_cond(obj, struct_name), \ +({ \ + __drbd_printk_ ## struct_name ## _prep((const struct struct_name *)(obj)); \ + printk(level __drbd_printk_ ## struct_name ## _fmt(fmt), \ + __drbd_printk_ ## struct_name ## _args(), ## args); \ + __drbd_printk_ ## struct_name ## _unprep(); \ +}) + +#define drbd_printk(level, obj, fmt, args...) \ + __builtin_choose_expr( \ + __drbd_printk_if_same_type(obj, drbd_device, level, fmt, ## args), \ + __builtin_choose_expr( \ + __drbd_printk_if_same_type(obj, drbd_resource, level, fmt, ## args), \ + __builtin_choose_expr( \ + __drbd_printk_if_same_type(obj, drbd_connection, level, fmt, ## args), \ + __builtin_choose_expr( \ + __drbd_printk_if_same_type(obj, drbd_peer_device, level, fmt, ## args), \ + drbd_printk_with_wrong_object_type())))) + +#define __drbd_dyn_dbg_if_same_type(obj, struct_name, fmt, args...) \ + __drbd_printk_choose_cond(obj, struct_name), \ +({ \ + DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \ + if (DYNAMIC_DEBUG_BRANCH(descriptor)) { \ + __drbd_printk_ ## struct_name ## _prep((const struct struct_name *)(obj)); \ + __dynamic_pr_debug(&descriptor, __drbd_printk_ ## struct_name ## _fmt(fmt), \ + __drbd_printk_ ## struct_name ## _args(), ## args); \ + __drbd_printk_ ## struct_name ## _unprep(); \ + } \ +}) + +#define dynamic_drbd_dbg(obj, fmt, args...) \ + __builtin_choose_expr( \ + __drbd_dyn_dbg_if_same_type(obj, drbd_device, fmt, ## args), \ + __builtin_choose_expr( \ + __drbd_dyn_dbg_if_same_type(obj, drbd_resource, fmt, ## args), \ + __builtin_choose_expr( \ + __drbd_dyn_dbg_if_same_type(obj, drbd_connection, fmt, ## args), \ + __builtin_choose_expr( \ + __drbd_dyn_dbg_if_same_type(obj, drbd_peer_device, fmt, ## args), \ + drbd_dyn_dbg_with_wrong_object_type())))) + +#define drbd_emerg(device, fmt, args...) \ + drbd_printk(KERN_EMERG, device, fmt, ## args) +#define drbd_alert(device, fmt, args...) \ + drbd_printk(KERN_ALERT, device, fmt, ## args) +#define drbd_crit(device, fmt, args...) \ + drbd_printk(KERN_CRIT, device, fmt, ## args) +#define drbd_err(device, fmt, args...) \ + drbd_printk(KERN_ERR, device, fmt, ## args) +#define drbd_warn(device, fmt, args...) \ + drbd_printk(KERN_WARNING, device, fmt, ## args) +#define drbd_notice(device, fmt, args...) \ + drbd_printk(KERN_NOTICE, device, fmt, ## args) +#define drbd_info(device, fmt, args...) \ + drbd_printk(KERN_INFO, device, fmt, ## args) + + +#define drbd_ratelimit() \ +({ \ + static DEFINE_RATELIMIT_STATE(_rs, \ + DEFAULT_RATELIMIT_INTERVAL, \ + DEFAULT_RATELIMIT_BURST); \ + __ratelimit(&_rs); \ +}) + +#define D_ASSERT(x, exp) \ + do { \ + if (!(exp)) \ + drbd_err(x, "ASSERTION %s FAILED in %s\n", \ + #exp, __func__); \ + } while (0) + +/** + * expect - Make an assertion + * + * Unlike the assert macro, this macro returns a boolean result. + */ +#define expect(x, exp) ({ \ + bool _bool = (exp); \ + if (!_bool && drbd_ratelimit()) \ + drbd_err(x, "ASSERTION %s FAILED in %s\n", \ + #exp, __func__); \ + _bool; \ + }) + +#endif diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c index 3c0193de2498..2227fb0db1ce 100644 --- a/drivers/block/drbd/drbd_proc.c +++ b/drivers/block/drbd/drbd_proc.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0-or-later +// SPDX-License-Identifier: GPL-2.0-only /* drbd_proc.c diff --git a/drivers/block/drbd/drbd_protocol.h b/drivers/block/drbd/drbd_protocol.h index a882b65ab5d2..56bbca9d7700 100644 --- a/drivers/block/drbd/drbd_protocol.h +++ b/drivers/block/drbd/drbd_protocol.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0-only */ #ifndef __DRBD_PROTOCOL_H #define __DRBD_PROTOCOL_H diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 9ace76156e4b..0e58a3187345 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0-or-later +// SPDX-License-Identifier: GPL-2.0-only /* drbd_receiver.c @@ -413,7 +413,7 @@ void __drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request * drbd_free_pages(device, peer_req->pages, is_net); D_ASSERT(device, atomic_read(&peer_req->pending_bios) == 0); D_ASSERT(device, drbd_interval_empty(&peer_req->i)); - if (!expect(!(peer_req->flags & EE_CALL_AL_COMPLETE_IO))) { + if (!expect(device, !(peer_req->flags & EE_CALL_AL_COMPLETE_IO))) { peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO; drbd_al_complete_io(device, &peer_req->i); } @@ -1603,9 +1603,19 @@ static void drbd_issue_peer_discard_or_zero_out(struct drbd_device *device, stru drbd_endio_write_sec_final(peer_req); } +static int peer_request_fault_type(struct drbd_peer_request *peer_req) +{ + if (peer_req_op(peer_req) == REQ_OP_READ) { + return peer_req->flags & EE_APPLICATION ? + DRBD_FAULT_DT_RD : DRBD_FAULT_RS_RD; + } else { + return peer_req->flags & EE_APPLICATION ? + DRBD_FAULT_DT_WR : DRBD_FAULT_RS_WR; + } +} + /** * drbd_submit_peer_request() - * @device: DRBD device. * @peer_req: peer request * * May spread the pages to multiple bios, @@ -1619,10 +1629,9 @@ static void drbd_issue_peer_discard_or_zero_out(struct drbd_device *device, stru * on certain Xen deployments. */ /* TODO allocate from our own bio_set. */ -int drbd_submit_peer_request(struct drbd_device *device, - struct drbd_peer_request *peer_req, - const blk_opf_t opf, const int fault_type) +int drbd_submit_peer_request(struct drbd_peer_request *peer_req) { + struct drbd_device *device = peer_req->peer_device->device; struct bio *bios = NULL; struct bio *bio; struct page *page = peer_req->pages; @@ -1667,7 +1676,18 @@ int drbd_submit_peer_request(struct drbd_device *device, * generated bio, but a bio allocated on behalf of the peer. */ next_bio: - bio = bio_alloc(device->ldev->backing_bdev, nr_pages, opf, GFP_NOIO); + /* _DISCARD, _WRITE_ZEROES handled above. + * REQ_OP_FLUSH (empty flush) not expected, + * should have been mapped to a "drbd protocol barrier". + * REQ_OP_SECURE_ERASE: I don't see how we could ever support that. + */ + if (!(peer_req_op(peer_req) == REQ_OP_WRITE || + peer_req_op(peer_req) == REQ_OP_READ)) { + drbd_err(device, "Invalid bio op received: 0x%x\n", peer_req->opf); + return -EINVAL; + } + + bio = bio_alloc(device->ldev->backing_bdev, nr_pages, peer_req->opf, GFP_NOIO); /* > peer_req->i.sector, unless this is the first bio */ bio->bi_iter.bi_sector = sector; bio->bi_private = peer_req; @@ -1697,7 +1717,7 @@ next_bio: bios = bios->bi_next; bio->bi_next = NULL; - drbd_submit_bio_noacct(device, fault_type, bio); + drbd_submit_bio_noacct(device, peer_request_fault_type(peer_req), bio); } while (bios); return 0; } @@ -1853,21 +1873,21 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, /* assume request_size == data_size, but special case trim. */ ds = data_size; if (trim) { - if (!expect(data_size == 0)) + if (!expect(peer_device, data_size == 0)) return NULL; ds = be32_to_cpu(trim->size); } else if (zeroes) { - if (!expect(data_size == 0)) + if (!expect(peer_device, data_size == 0)) return NULL; ds = be32_to_cpu(zeroes->size); } - if (!expect(IS_ALIGNED(ds, 512))) + if (!expect(peer_device, IS_ALIGNED(ds, 512))) return NULL; if (trim || zeroes) { - if (!expect(ds <= (DRBD_MAX_BBIO_SECTORS << 9))) + if (!expect(peer_device, ds <= (DRBD_MAX_BBIO_SECTORS << 9))) return NULL; - } else if (!expect(ds <= DRBD_MAX_BIO_SIZE)) + } else if (!expect(peer_device, ds <= DRBD_MAX_BIO_SIZE)) return NULL; /* even though we trust out peer, @@ -2051,6 +2071,7 @@ static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t secto * respective _drbd_clear_done_ee */ peer_req->w.cb = e_end_resync_block; + peer_req->opf = REQ_OP_WRITE; peer_req->submit_jif = jiffies; spin_lock_irq(&device->resource->req_lock); @@ -2058,8 +2079,7 @@ static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t secto spin_unlock_irq(&device->resource->req_lock); atomic_add(pi->size >> 9, &device->rs_sect_ev); - if (drbd_submit_peer_request(device, peer_req, REQ_OP_WRITE, - DRBD_FAULT_RS_WR) == 0) + if (drbd_submit_peer_request(peer_req) == 0) return 0; /* don't care for the reason here */ @@ -2145,7 +2165,7 @@ static int receive_RSDataReply(struct drbd_connection *connection, struct packet * or in drbd_peer_request_endio. */ err = recv_resync_read(peer_device, sector, pi); } else { - if (__ratelimit(&drbd_ratelimit_state)) + if (drbd_ratelimit()) drbd_err(device, "Can not write resync data to local disk.\n"); err = drbd_drain_block(peer_device, pi->size); @@ -2375,16 +2395,6 @@ static int wait_for_and_update_peer_seq(struct drbd_peer_device *peer_device, co return ret; } -/* see also bio_flags_to_wire() - * DRBD_REQ_*, because we need to semantically map the flags to data packet - * flags and back. We may replicate to other kernel versions. */ -static blk_opf_t wire_flags_to_bio_flags(u32 dpf) -{ - return (dpf & DP_RW_SYNC ? REQ_SYNC : 0) | - (dpf & DP_FUA ? REQ_FUA : 0) | - (dpf & DP_FLUSH ? REQ_PREFLUSH : 0); -} - static enum req_op wire_flags_to_bio_op(u32 dpf) { if (dpf & DP_ZEROES) @@ -2395,6 +2405,15 @@ static enum req_op wire_flags_to_bio_op(u32 dpf) return REQ_OP_WRITE; } +/* see also bio_flags_to_wire() */ +static blk_opf_t wire_flags_to_bio(struct drbd_connection *connection, u32 dpf) +{ + return wire_flags_to_bio_op(dpf) | + (dpf & DP_RW_SYNC ? REQ_SYNC : 0) | + (dpf & DP_FUA ? REQ_FUA : 0) | + (dpf & DP_FLUSH ? REQ_PREFLUSH : 0); +} + static void fail_postponed_requests(struct drbd_device *device, sector_t sector, unsigned int size) { @@ -2538,8 +2557,6 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * struct drbd_peer_request *peer_req; struct p_data *p = pi->data; u32 peer_seq = be32_to_cpu(p->seq_num); - enum req_op op; - blk_opf_t op_flags; u32 dp_flags; int err, tp; @@ -2578,11 +2595,10 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * peer_req->flags |= EE_APPLICATION; dp_flags = be32_to_cpu(p->dp_flags); - op = wire_flags_to_bio_op(dp_flags); - op_flags = wire_flags_to_bio_flags(dp_flags); + peer_req->opf = wire_flags_to_bio(connection, dp_flags); if (pi->cmd == P_TRIM) { D_ASSERT(peer_device, peer_req->i.size > 0); - D_ASSERT(peer_device, op == REQ_OP_DISCARD); + D_ASSERT(peer_device, peer_req_op(peer_req) == REQ_OP_DISCARD); D_ASSERT(peer_device, peer_req->pages == NULL); /* need to play safe: an older DRBD sender * may mean zero-out while sending P_TRIM. */ @@ -2590,7 +2606,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * peer_req->flags |= EE_ZEROOUT; } else if (pi->cmd == P_ZEROES) { D_ASSERT(peer_device, peer_req->i.size > 0); - D_ASSERT(peer_device, op == REQ_OP_WRITE_ZEROES); + D_ASSERT(peer_device, peer_req_op(peer_req) == REQ_OP_WRITE_ZEROES); D_ASSERT(peer_device, peer_req->pages == NULL); /* Do (not) pass down BLKDEV_ZERO_NOUNMAP? */ if (dp_flags & DP_DISCARD) @@ -2677,8 +2693,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * peer_req->flags |= EE_CALL_AL_COMPLETE_IO; } - err = drbd_submit_peer_request(device, peer_req, op | op_flags, - DRBD_FAULT_DT_WR); + err = drbd_submit_peer_request(peer_req); if (!err) return 0; @@ -2789,7 +2804,6 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet struct drbd_peer_request *peer_req; struct digest_info *di = NULL; int size, verb; - unsigned int fault_type; struct p_block_req *p = pi->data; peer_device = conn_peer_device(connection, pi->vnr); @@ -2832,7 +2846,7 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet default: BUG(); } - if (verb && __ratelimit(&drbd_ratelimit_state)) + if (verb && drbd_ratelimit()) drbd_err(device, "Can not satisfy peer's read request, " "no local data.\n"); @@ -2849,11 +2863,11 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet put_ldev(device); return -ENOMEM; } + peer_req->opf = REQ_OP_READ; switch (pi->cmd) { case P_DATA_REQUEST: peer_req->w.cb = w_e_end_data_req; - fault_type = DRBD_FAULT_DT_RD; /* application IO, don't drbd_rs_begin_io */ peer_req->flags |= EE_APPLICATION; goto submit; @@ -2867,14 +2881,12 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet fallthrough; case P_RS_DATA_REQUEST: peer_req->w.cb = w_e_end_rsdata_req; - fault_type = DRBD_FAULT_RS_RD; /* used in the sector offset progress display */ device->bm_resync_fo = BM_SECT_TO_BIT(sector); break; case P_OV_REPLY: case P_CSUM_RS_REQUEST: - fault_type = DRBD_FAULT_RS_RD; di = kmalloc(sizeof(*di) + pi->size, GFP_NOIO); if (!di) goto out_free_e; @@ -2923,7 +2935,6 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet (unsigned long long)sector); } peer_req->w.cb = w_e_end_ov_req; - fault_type = DRBD_FAULT_RS_RD; break; default: @@ -2975,8 +2986,7 @@ submit_for_resync: submit: update_receiver_timing_details(connection, drbd_submit_peer_request); inc_unacked(device); - if (drbd_submit_peer_request(device, peer_req, REQ_OP_READ, - fault_type) == 0) + if (drbd_submit_peer_request(peer_req) == 0) return 0; /* don't care for the reason here */ @@ -4947,7 +4957,6 @@ static int receive_rs_deallocated(struct drbd_connection *connection, struct pac if (get_ldev(device)) { struct drbd_peer_request *peer_req; - const enum req_op op = REQ_OP_WRITE_ZEROES; peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER, sector, size, 0, GFP_NOIO); @@ -4957,6 +4966,7 @@ static int receive_rs_deallocated(struct drbd_connection *connection, struct pac } peer_req->w.cb = e_end_resync_block; + peer_req->opf = REQ_OP_DISCARD; peer_req->submit_jif = jiffies; peer_req->flags |= EE_TRIM; @@ -4965,8 +4975,7 @@ static int receive_rs_deallocated(struct drbd_connection *connection, struct pac spin_unlock_irq(&device->resource->req_lock); atomic_add(pi->size >> 9, &device->rs_sect_ev); - err = drbd_submit_peer_request(device, peer_req, op, - DRBD_FAULT_RS_WR); + err = drbd_submit_peer_request(peer_req); if (err) { spin_lock_irq(&device->resource->req_lock); diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 7f9bcc82fc9c..eb14ec8ec04c 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0-or-later +// SPDX-License-Identifier: GPL-2.0-only /* drbd_req.c @@ -144,7 +144,7 @@ void drbd_req_destroy(struct kref *kref) if (get_ldev_if_state(device, D_FAILED)) { drbd_al_complete_io(device, &req->i); put_ldev(device); - } else if (__ratelimit(&drbd_ratelimit_state)) { + } else if (drbd_ratelimit()) { drbd_warn(device, "Should have called drbd_al_complete_io(, %llu, %u), " "but my Disk seems to have failed :(\n", (unsigned long long) req->i.sector, req->i.size); @@ -518,7 +518,7 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m, static void drbd_report_io_error(struct drbd_device *device, struct drbd_request *req) { - if (!__ratelimit(&drbd_ratelimit_state)) + if (!drbd_ratelimit()) return; drbd_warn(device, "local %s IO error sector %llu+%u on %pg\n", @@ -1402,7 +1402,7 @@ static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request submit_private_bio = true; } else if (no_remote) { nodata: - if (__ratelimit(&drbd_ratelimit_state)) + if (drbd_ratelimit()) drbd_err(device, "IO ERROR: neither local nor remote data, sector %llu+%u\n", (unsigned long long)req->i.sector, req->i.size >> 9); /* A write may have been queued for send_oos, however. diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index 6237fa1dcb0e..b4017b5c3fbc 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* SPDX-License-Identifier: GPL-2.0-only */ /* drbd_req.h diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index 3f7bf9f2d874..75d13ea0024f 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0-or-later +// SPDX-License-Identifier: GPL-2.0-only /* drbd_state.c diff --git a/drivers/block/drbd/drbd_state.h b/drivers/block/drbd/drbd_state.h index f87371e55e68..cbaeb8018dbf 100644 --- a/drivers/block/drbd/drbd_state.h +++ b/drivers/block/drbd/drbd_state.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0-only */ #ifndef DRBD_STATE_H #define DRBD_STATE_H diff --git a/drivers/block/drbd/drbd_state_change.h b/drivers/block/drbd/drbd_state_change.h index d5b0479bc9a6..9d78d8e3912e 100644 --- a/drivers/block/drbd/drbd_state_change.h +++ b/drivers/block/drbd/drbd_state_change.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0-only */ #ifndef DRBD_STATE_CHANGE_H #define DRBD_STATE_CHANGE_H diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c index fc01307607ea..0a06f744b096 100644 --- a/drivers/block/drbd/drbd_strings.c +++ b/drivers/block/drbd/drbd_strings.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0-or-later +// SPDX-License-Identifier: GPL-2.0-only /* drbd.h diff --git a/drivers/block/drbd/drbd_strings.h b/drivers/block/drbd/drbd_strings.h index 87b94a27358a..0201f6590f6a 100644 --- a/drivers/block/drbd/drbd_strings.h +++ b/drivers/block/drbd/drbd_strings.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0-only */ #ifndef __DRBD_STRINGS_H #define __DRBD_STRINGS_H diff --git a/drivers/block/drbd/drbd_vli.h b/drivers/block/drbd/drbd_vli.h index 01e3babc5277..1ee81e3c2152 100644 --- a/drivers/block/drbd/drbd_vli.h +++ b/drivers/block/drbd/drbd_vli.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* SPDX-License-Identifier: GPL-2.0-only */ /* -*- linux-c -*- drbd_receiver.c diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 0bb1a900c2d5..f46738040d6b 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0-or-later +// SPDX-License-Identifier: GPL-2.0-only /* drbd_worker.c @@ -176,7 +176,7 @@ void drbd_peer_request_endio(struct bio *bio) bool is_discard = bio_op(bio) == REQ_OP_WRITE_ZEROES || bio_op(bio) == REQ_OP_DISCARD; - if (bio->bi_status && __ratelimit(&drbd_ratelimit_state)) + if (bio->bi_status && drbd_ratelimit()) drbd_warn(device, "%s: error=%d s=%llus\n", is_write ? (is_discard ? "discard" : "write") : "read", bio->bi_status, @@ -240,7 +240,7 @@ void drbd_request_endio(struct bio *bio) * though we still will complain noisily about it. */ if (unlikely(req->rq_state & RQ_LOCAL_ABORTED)) { - if (__ratelimit(&drbd_ratelimit_state)) + if (drbd_ratelimit()) drbd_emerg(device, "delayed completion of aborted local request; disk-timeout may be too aggressive\n"); if (!bio->bi_status) @@ -400,13 +400,13 @@ static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector, goto defer; peer_req->w.cb = w_e_send_csum; + peer_req->opf = REQ_OP_READ; spin_lock_irq(&device->resource->req_lock); list_add_tail(&peer_req->w.list, &device->read_ee); spin_unlock_irq(&device->resource->req_lock); atomic_add(size >> 9, &device->rs_sect_ev); - if (drbd_submit_peer_request(device, peer_req, REQ_OP_READ, - DRBD_FAULT_RS_RD) == 0) + if (drbd_submit_peer_request(peer_req) == 0) return 0; /* If it failed because of ENOMEM, retry should help. If it failed @@ -1062,7 +1062,7 @@ int w_e_end_data_req(struct drbd_work *w, int cancel) if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { err = drbd_send_block(peer_device, P_DATA_REPLY, peer_req); } else { - if (__ratelimit(&drbd_ratelimit_state)) + if (drbd_ratelimit()) drbd_err(device, "Sending NegDReply. sector=%llus.\n", (unsigned long long)peer_req->i.sector); @@ -1135,13 +1135,13 @@ int w_e_end_rsdata_req(struct drbd_work *w, int cancel) else err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req); } else { - if (__ratelimit(&drbd_ratelimit_state)) + if (drbd_ratelimit()) drbd_err(device, "Not sending RSDataReply, " "partner DISKLESS!\n"); err = 0; } } else { - if (__ratelimit(&drbd_ratelimit_state)) + if (drbd_ratelimit()) drbd_err(device, "Sending NegRSDReply. sector %llus.\n", (unsigned long long)peer_req->i.sector); @@ -1212,7 +1212,7 @@ int w_e_end_csum_rs_req(struct drbd_work *w, int cancel) } } else { err = drbd_send_ack(peer_device, P_NEG_RS_DREPLY, peer_req); - if (__ratelimit(&drbd_ratelimit_state)) + if (drbd_ratelimit()) drbd_err(device, "Sending NegDReply. I guess it gets messy.\n"); } diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index ccad3d7b3ddd..487840e3564d 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -4593,8 +4593,10 @@ static int __init do_floppy_init(void) goto out_put_disk; err = floppy_alloc_disk(drive, 0); - if (err) + if (err) { + blk_mq_free_tag_set(&tag_sets[drive]); goto out_put_disk; + } timer_setup(&motor_off_timer[drive], motor_off_callback, 0); } diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 1f154f92f4c2..7d28e3aa406c 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -523,6 +523,24 @@ out: } CONFIGFS_ATTR(nullb_device_, badblocks); +static ssize_t nullb_device_zone_readonly_store(struct config_item *item, + const char *page, size_t count) +{ + struct nullb_device *dev = to_nullb_device(item); + + return zone_cond_store(dev, page, count, BLK_ZONE_COND_READONLY); +} +CONFIGFS_ATTR_WO(nullb_device_, zone_readonly); + +static ssize_t nullb_device_zone_offline_store(struct config_item *item, + const char *page, size_t count) +{ + struct nullb_device *dev = to_nullb_device(item); + + return zone_cond_store(dev, page, count, BLK_ZONE_COND_OFFLINE); +} +CONFIGFS_ATTR_WO(nullb_device_, zone_offline); + static struct configfs_attribute *nullb_device_attrs[] = { &nullb_device_attr_size, &nullb_device_attr_completion_nsec, @@ -549,6 +567,8 @@ static struct configfs_attribute *nullb_device_attrs[] = { &nullb_device_attr_zone_nr_conv, &nullb_device_attr_zone_max_open, &nullb_device_attr_zone_max_active, + &nullb_device_attr_zone_readonly, + &nullb_device_attr_zone_offline, &nullb_device_attr_virt_boundary, &nullb_device_attr_no_sched, &nullb_device_attr_shared_tag_bitmap, @@ -614,7 +634,7 @@ static ssize_t memb_group_features_show(struct config_item *item, char *page) "poll_queues,power,queue_mode,shared_tag_bitmap,size," "submit_queues,use_per_node_hctx,virt_boundary,zoned," "zone_capacity,zone_max_active,zone_max_open," - "zone_nr_conv,zone_size\n"); + "zone_nr_conv,zone_offline,zone_readonly,zone_size\n"); } CONFIGFS_ATTR_RO(memb_group_, features); diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h index 94ff68052b1e..eb5972c50be8 100644 --- a/drivers/block/null_blk/null_blk.h +++ b/drivers/block/null_blk/null_blk.h @@ -151,6 +151,8 @@ blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_op op, sector_t sector, sector_t nr_sectors); size_t null_zone_valid_read_len(struct nullb *nullb, sector_t sector, unsigned int len); +ssize_t zone_cond_store(struct nullb_device *dev, const char *page, + size_t count, enum blk_zone_cond cond); #else static inline int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) @@ -174,6 +176,12 @@ static inline size_t null_zone_valid_read_len(struct nullb *nullb, { return len; } +static inline ssize_t zone_cond_store(struct nullb_device *dev, + const char *page, size_t count, + enum blk_zone_cond cond) +{ + return -EOPNOTSUPP; +} #define null_report_zones NULL #endif /* CONFIG_BLK_DEV_ZONED */ #endif /* __NULL_BLK_H */ diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c index 55a69e48ef8b..635ce0648133 100644 --- a/drivers/block/null_blk/zoned.c +++ b/drivers/block/null_blk/zoned.c @@ -384,8 +384,10 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, null_lock_zone(dev, zone); - if (zone->cond == BLK_ZONE_COND_FULL) { - /* Cannot write to a full zone */ + if (zone->cond == BLK_ZONE_COND_FULL || + zone->cond == BLK_ZONE_COND_READONLY || + zone->cond == BLK_ZONE_COND_OFFLINE) { + /* Cannot write to the zone */ ret = BLK_STS_IOERR; goto unlock; } @@ -613,7 +615,9 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_op op, for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) { zone = &dev->zones[i]; null_lock_zone(dev, zone); - if (zone->cond != BLK_ZONE_COND_EMPTY) { + if (zone->cond != BLK_ZONE_COND_EMPTY && + zone->cond != BLK_ZONE_COND_READONLY && + zone->cond != BLK_ZONE_COND_OFFLINE) { null_reset_zone(dev, zone); trace_nullb_zone_op(cmd, i, zone->cond); } @@ -627,6 +631,12 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_op op, null_lock_zone(dev, zone); + if (zone->cond == BLK_ZONE_COND_READONLY || + zone->cond == BLK_ZONE_COND_OFFLINE) { + ret = BLK_STS_IOERR; + goto unlock; + } + switch (op) { case REQ_OP_ZONE_RESET: ret = null_reset_zone(dev, zone); @@ -648,6 +658,7 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_op op, if (ret == BLK_STS_OK) trace_nullb_zone_op(cmd, zone_no, zone->cond); +unlock: null_unlock_zone(dev, zone); return ret; @@ -674,6 +685,8 @@ blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_op op, default: dev = cmd->nq->dev; zone = &dev->zones[null_zone_no(dev, sector)]; + if (zone->cond == BLK_ZONE_COND_OFFLINE) + return BLK_STS_IOERR; null_lock_zone(dev, zone); sts = null_process_cmd(cmd, op, sector, nr_sectors); @@ -681,3 +694,79 @@ blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_op op, return sts; } } + +/* + * Set a zone in the read-only or offline condition. + */ +static void null_set_zone_cond(struct nullb_device *dev, + struct nullb_zone *zone, enum blk_zone_cond cond) +{ + if (WARN_ON_ONCE(cond != BLK_ZONE_COND_READONLY && + cond != BLK_ZONE_COND_OFFLINE)) + return; + + null_lock_zone(dev, zone); + + /* + * If the read-only condition is requested again to zones already in + * read-only condition, restore back normal empty condition. Do the same + * if the offline condition is requested for offline zones. Otherwise, + * set the specified zone condition to the zones. Finish the zones + * beforehand to free up zone resources. + */ + if (zone->cond == cond) { + zone->cond = BLK_ZONE_COND_EMPTY; + zone->wp = zone->start; + if (dev->memory_backed) + null_handle_discard(dev, zone->start, zone->len); + } else { + if (zone->cond != BLK_ZONE_COND_READONLY && + zone->cond != BLK_ZONE_COND_OFFLINE) + null_finish_zone(dev, zone); + zone->cond = cond; + zone->wp = (sector_t)-1; + } + + null_unlock_zone(dev, zone); +} + +/* + * Identify a zone from the sector written to configfs file. Then set zone + * condition to the zone. + */ +ssize_t zone_cond_store(struct nullb_device *dev, const char *page, + size_t count, enum blk_zone_cond cond) +{ + unsigned long long sector; + unsigned int zone_no; + int ret; + + if (!dev->zoned) { + pr_err("null_blk device is not zoned\n"); + return -EINVAL; + } + + if (!dev->zones) { + pr_err("null_blk device is not yet powered\n"); + return -EINVAL; + } + + ret = kstrtoull(page, 0, §or); + if (ret < 0) + return ret; + + zone_no = null_zone_no(dev, sector); + if (zone_no >= dev->nr_zones) { + pr_err("Sector out of range\n"); + return -EINVAL; + } + + if (dev->zones[zone_no].type == BLK_ZONE_TYPE_CONVENTIONAL) { + pr_err("Can not change condition of conventional zones\n"); + return -EINVAL; + } + + null_set_zone_cond(dev, &dev->zones[zone_no], cond); + + return count; +} diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c deleted file mode 100644 index 4cea3b08087e..000000000000 --- a/drivers/block/pktcdvd.c +++ /dev/null @@ -1,2944 +0,0 @@ -/* - * Copyright (C) 2000 Jens Axboe <axboe@suse.de> - * Copyright (C) 2001-2004 Peter Osterlund <petero2@telia.com> - * Copyright (C) 2006 Thomas Maier <balagi@justmail.de> - * - * May be copied or modified under the terms of the GNU General Public - * License. See linux/COPYING for more information. - * - * Packet writing layer for ATAPI and SCSI CD-RW, DVD+RW, DVD-RW and - * DVD-RAM devices. - * - * Theory of operation: - * - * At the lowest level, there is the standard driver for the CD/DVD device, - * such as drivers/scsi/sr.c. This driver can handle read and write requests, - * but it doesn't know anything about the special restrictions that apply to - * packet writing. One restriction is that write requests must be aligned to - * packet boundaries on the physical media, and the size of a write request - * must be equal to the packet size. Another restriction is that a - * GPCMD_FLUSH_CACHE command has to be issued to the drive before a read - * command, if the previous command was a write. - * - * The purpose of the packet writing driver is to hide these restrictions from - * higher layers, such as file systems, and present a block device that can be - * randomly read and written using 2kB-sized blocks. - * - * The lowest layer in the packet writing driver is the packet I/O scheduler. - * Its data is defined by the struct packet_iosched and includes two bio - * queues with pending read and write requests. These queues are processed - * by the pkt_iosched_process_queue() function. The write requests in this - * queue are already properly aligned and sized. This layer is responsible for - * issuing the flush cache commands and scheduling the I/O in a good order. - * - * The next layer transforms unaligned write requests to aligned writes. This - * transformation requires reading missing pieces of data from the underlying - * block device, assembling the pieces to full packets and queuing them to the - * packet I/O scheduler. - * - * At the top layer there is a custom ->submit_bio function that forwards - * read requests directly to the iosched queue and puts write requests in the - * unaligned write queue. A kernel thread performs the necessary read - * gathering to convert the unaligned writes to aligned writes and then feeds - * them to the packet I/O scheduler. - * - *************************************************************************/ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/pktcdvd.h> -#include <linux/module.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/compat.h> -#include <linux/kthread.h> -#include <linux/errno.h> -#include <linux/spinlock.h> -#include <linux/file.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include <linux/miscdevice.h> -#include <linux/freezer.h> -#include <linux/mutex.h> -#include <linux/slab.h> -#include <linux/backing-dev.h> -#include <scsi/scsi_cmnd.h> -#include <scsi/scsi_ioctl.h> -#include <scsi/scsi.h> -#include <linux/debugfs.h> -#include <linux/device.h> -#include <linux/nospec.h> -#include <linux/uaccess.h> - -#define DRIVER_NAME "pktcdvd" - -#define pkt_err(pd, fmt, ...) \ - pr_err("%s: " fmt, pd->name, ##__VA_ARGS__) -#define pkt_notice(pd, fmt, ...) \ - pr_notice("%s: " fmt, pd->name, ##__VA_ARGS__) -#define pkt_info(pd, fmt, ...) \ - pr_info("%s: " fmt, pd->name, ##__VA_ARGS__) - -#define pkt_dbg(level, pd, fmt, ...) \ -do { \ - if (level == 2 && PACKET_DEBUG >= 2) \ - pr_notice("%s: %s():" fmt, \ - pd->name, __func__, ##__VA_ARGS__); \ - else if (level == 1 && PACKET_DEBUG >= 1) \ - pr_notice("%s: " fmt, pd->name, ##__VA_ARGS__); \ -} while (0) - -#define MAX_SPEED 0xffff - -static DEFINE_MUTEX(pktcdvd_mutex); -static struct pktcdvd_device *pkt_devs[MAX_WRITERS]; -static struct proc_dir_entry *pkt_proc; -static int pktdev_major; -static int write_congestion_on = PKT_WRITE_CONGESTION_ON; -static int write_congestion_off = PKT_WRITE_CONGESTION_OFF; -static struct mutex ctl_mutex; /* Serialize open/close/setup/teardown */ -static mempool_t psd_pool; -static struct bio_set pkt_bio_set; - -static struct class *class_pktcdvd = NULL; /* /sys/class/pktcdvd */ -static struct dentry *pkt_debugfs_root = NULL; /* /sys/kernel/debug/pktcdvd */ - -/* forward declaration */ -static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev); -static int pkt_remove_dev(dev_t pkt_dev); -static int pkt_seq_show(struct seq_file *m, void *p); - -static sector_t get_zone(sector_t sector, struct pktcdvd_device *pd) -{ - return (sector + pd->offset) & ~(sector_t)(pd->settings.size - 1); -} - -/********************************************************** - * sysfs interface for pktcdvd - * by (C) 2006 Thomas Maier <balagi@justmail.de> - - /sys/class/pktcdvd/pktcdvd[0-7]/ - stat/reset - stat/packets_started - stat/packets_finished - stat/kb_written - stat/kb_read - stat/kb_read_gather - write_queue/size - write_queue/congestion_off - write_queue/congestion_on - **********************************************************/ - -static ssize_t packets_started_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct pktcdvd_device *pd = dev_get_drvdata(dev); - - return sysfs_emit(buf, "%lu\n", pd->stats.pkt_started); -} -static DEVICE_ATTR_RO(packets_started); - -static ssize_t packets_finished_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct pktcdvd_device *pd = dev_get_drvdata(dev); - - return sysfs_emit(buf, "%lu\n", pd->stats.pkt_ended); -} -static DEVICE_ATTR_RO(packets_finished); - -static ssize_t kb_written_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct pktcdvd_device *pd = dev_get_drvdata(dev); - - return sysfs_emit(buf, "%lu\n", pd->stats.secs_w >> 1); -} -static DEVICE_ATTR_RO(kb_written); - -static ssize_t kb_read_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct pktcdvd_device *pd = dev_get_drvdata(dev); - - return sysfs_emit(buf, "%lu\n", pd->stats.secs_r >> 1); -} -static DEVICE_ATTR_RO(kb_read); - -static ssize_t kb_read_gather_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct pktcdvd_device *pd = dev_get_drvdata(dev); - - return sysfs_emit(buf, "%lu\n", pd->stats.secs_rg >> 1); -} -static DEVICE_ATTR_RO(kb_read_gather); - -static ssize_t reset_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t len) -{ - struct pktcdvd_device *pd = dev_get_drvdata(dev); - - if (len > 0) { - pd->stats.pkt_started = 0; - pd->stats.pkt_ended = 0; - pd->stats.secs_w = 0; - pd->stats.secs_rg = 0; - pd->stats.secs_r = 0; - } - return len; -} -static DEVICE_ATTR_WO(reset); - -static struct attribute *pkt_stat_attrs[] = { - &dev_attr_packets_finished.attr, - &dev_attr_packets_started.attr, - &dev_attr_kb_read.attr, - &dev_attr_kb_written.attr, - &dev_attr_kb_read_gather.attr, - &dev_attr_reset.attr, - NULL, -}; - -static const struct attribute_group pkt_stat_group = { - .name = "stat", - .attrs = pkt_stat_attrs, -}; - -static ssize_t size_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct pktcdvd_device *pd = dev_get_drvdata(dev); - int n; - - spin_lock(&pd->lock); - n = sysfs_emit(buf, "%d\n", pd->bio_queue_size); - spin_unlock(&pd->lock); - return n; -} -static DEVICE_ATTR_RO(size); - -static void init_write_congestion_marks(int* lo, int* hi) -{ - if (*hi > 0) { - *hi = max(*hi, 500); - *hi = min(*hi, 1000000); - if (*lo <= 0) - *lo = *hi - 100; - else { - *lo = min(*lo, *hi - 100); - *lo = max(*lo, 100); - } - } else { - *hi = -1; - *lo = -1; - } -} - -static ssize_t congestion_off_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct pktcdvd_device *pd = dev_get_drvdata(dev); - int n; - - spin_lock(&pd->lock); - n = sysfs_emit(buf, "%d\n", pd->write_congestion_off); - spin_unlock(&pd->lock); - return n; -} - -static ssize_t congestion_off_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t len) -{ - struct pktcdvd_device *pd = dev_get_drvdata(dev); - int val; - - if (sscanf(buf, "%d", &val) == 1) { - spin_lock(&pd->lock); - pd->write_congestion_off = val; - init_write_congestion_marks(&pd->write_congestion_off, - &pd->write_congestion_on); - spin_unlock(&pd->lock); - } - return len; -} -static DEVICE_ATTR_RW(congestion_off); - -static ssize_t congestion_on_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct pktcdvd_device *pd = dev_get_drvdata(dev); - int n; - - spin_lock(&pd->lock); - n = sysfs_emit(buf, "%d\n", pd->write_congestion_on); - spin_unlock(&pd->lock); - return n; -} - -static ssize_t congestion_on_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t len) -{ - struct pktcdvd_device *pd = dev_get_drvdata(dev); - int val; - - if (sscanf(buf, "%d", &val) == 1) { - spin_lock(&pd->lock); - pd->write_congestion_on = val; - init_write_congestion_marks(&pd->write_congestion_off, - &pd->write_congestion_on); - spin_unlock(&pd->lock); - } - return len; -} -static DEVICE_ATTR_RW(congestion_on); - -static struct attribute *pkt_wq_attrs[] = { - &dev_attr_congestion_on.attr, - &dev_attr_congestion_off.attr, - &dev_attr_size.attr, - NULL, -}; - -static const struct attribute_group pkt_wq_group = { - .name = "write_queue", - .attrs = pkt_wq_attrs, -}; - -static const struct attribute_group *pkt_groups[] = { - &pkt_stat_group, - &pkt_wq_group, - NULL, -}; - -static void pkt_sysfs_dev_new(struct pktcdvd_device *pd) -{ - if (class_pktcdvd) { - pd->dev = device_create_with_groups(class_pktcdvd, NULL, - MKDEV(0, 0), pd, pkt_groups, - "%s", pd->name); - if (IS_ERR(pd->dev)) - pd->dev = NULL; - } -} - -static void pkt_sysfs_dev_remove(struct pktcdvd_device *pd) -{ - if (class_pktcdvd) - device_unregister(pd->dev); -} - - -/******************************************************************** - /sys/class/pktcdvd/ - add map block device - remove unmap packet dev - device_map show mappings - *******************************************************************/ - -static void class_pktcdvd_release(struct class *cls) -{ - kfree(cls); -} - -static ssize_t device_map_show(struct class *c, struct class_attribute *attr, - char *data) -{ - int n = 0; - int idx; - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - for (idx = 0; idx < MAX_WRITERS; idx++) { - struct pktcdvd_device *pd = pkt_devs[idx]; - if (!pd) - continue; - n += sprintf(data+n, "%s %u:%u %u:%u\n", - pd->name, - MAJOR(pd->pkt_dev), MINOR(pd->pkt_dev), - MAJOR(pd->bdev->bd_dev), - MINOR(pd->bdev->bd_dev)); - } - mutex_unlock(&ctl_mutex); - return n; -} -static CLASS_ATTR_RO(device_map); - -static ssize_t add_store(struct class *c, struct class_attribute *attr, - const char *buf, size_t count) -{ - unsigned int major, minor; - - if (sscanf(buf, "%u:%u", &major, &minor) == 2) { - /* pkt_setup_dev() expects caller to hold reference to self */ - if (!try_module_get(THIS_MODULE)) - return -ENODEV; - - pkt_setup_dev(MKDEV(major, minor), NULL); - - module_put(THIS_MODULE); - - return count; - } - - return -EINVAL; -} -static CLASS_ATTR_WO(add); - -static ssize_t remove_store(struct class *c, struct class_attribute *attr, - const char *buf, size_t count) -{ - unsigned int major, minor; - if (sscanf(buf, "%u:%u", &major, &minor) == 2) { - pkt_remove_dev(MKDEV(major, minor)); - return count; - } - return -EINVAL; -} -static CLASS_ATTR_WO(remove); - -static struct attribute *class_pktcdvd_attrs[] = { - &class_attr_add.attr, - &class_attr_remove.attr, - &class_attr_device_map.attr, - NULL, -}; -ATTRIBUTE_GROUPS(class_pktcdvd); - -static int pkt_sysfs_init(void) -{ - int ret = 0; - - /* - * create control files in sysfs - * /sys/class/pktcdvd/... - */ - class_pktcdvd = kzalloc(sizeof(*class_pktcdvd), GFP_KERNEL); - if (!class_pktcdvd) - return -ENOMEM; - class_pktcdvd->name = DRIVER_NAME; - class_pktcdvd->owner = THIS_MODULE; - class_pktcdvd->class_release = class_pktcdvd_release; - class_pktcdvd->class_groups = class_pktcdvd_groups; - ret = class_register(class_pktcdvd); - if (ret) { - kfree(class_pktcdvd); - class_pktcdvd = NULL; - pr_err("failed to create class pktcdvd\n"); - return ret; - } - return 0; -} - -static void pkt_sysfs_cleanup(void) -{ - if (class_pktcdvd) - class_destroy(class_pktcdvd); - class_pktcdvd = NULL; -} - -/******************************************************************** - entries in debugfs - - /sys/kernel/debug/pktcdvd[0-7]/ - info - - *******************************************************************/ - -static int pkt_debugfs_seq_show(struct seq_file *m, void *p) -{ - return pkt_seq_show(m, p); -} - -static int pkt_debugfs_fops_open(struct inode *inode, struct file *file) -{ - return single_open(file, pkt_debugfs_seq_show, inode->i_private); -} - -static const struct file_operations debug_fops = { - .open = pkt_debugfs_fops_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .owner = THIS_MODULE, -}; - -static void pkt_debugfs_dev_new(struct pktcdvd_device *pd) -{ - if (!pkt_debugfs_root) - return; - pd->dfs_d_root = debugfs_create_dir(pd->name, pkt_debugfs_root); - if (!pd->dfs_d_root) - return; - - pd->dfs_f_info = debugfs_create_file("info", 0444, - pd->dfs_d_root, pd, &debug_fops); -} - -static void pkt_debugfs_dev_remove(struct pktcdvd_device *pd) -{ - if (!pkt_debugfs_root) - return; - debugfs_remove(pd->dfs_f_info); - debugfs_remove(pd->dfs_d_root); - pd->dfs_f_info = NULL; - pd->dfs_d_root = NULL; -} - -static void pkt_debugfs_init(void) -{ - pkt_debugfs_root = debugfs_create_dir(DRIVER_NAME, NULL); -} - -static void pkt_debugfs_cleanup(void) -{ - debugfs_remove(pkt_debugfs_root); - pkt_debugfs_root = NULL; -} - -/* ----------------------------------------------------------*/ - - -static void pkt_bio_finished(struct pktcdvd_device *pd) -{ - BUG_ON(atomic_read(&pd->cdrw.pending_bios) <= 0); - if (atomic_dec_and_test(&pd->cdrw.pending_bios)) { - pkt_dbg(2, pd, "queue empty\n"); - atomic_set(&pd->iosched.attention, 1); - wake_up(&pd->wqueue); - } -} - -/* - * Allocate a packet_data struct - */ -static struct packet_data *pkt_alloc_packet_data(int frames) -{ - int i; - struct packet_data *pkt; - - pkt = kzalloc(sizeof(struct packet_data), GFP_KERNEL); - if (!pkt) - goto no_pkt; - - pkt->frames = frames; - pkt->w_bio = bio_kmalloc(frames, GFP_KERNEL); - if (!pkt->w_bio) - goto no_bio; - - for (i = 0; i < frames / FRAMES_PER_PAGE; i++) { - pkt->pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO); - if (!pkt->pages[i]) - goto no_page; - } - - spin_lock_init(&pkt->lock); - bio_list_init(&pkt->orig_bios); - - for (i = 0; i < frames; i++) { - pkt->r_bios[i] = bio_kmalloc(1, GFP_KERNEL); - if (!pkt->r_bios[i]) - goto no_rd_bio; - } - - return pkt; - -no_rd_bio: - for (i = 0; i < frames; i++) - kfree(pkt->r_bios[i]); -no_page: - for (i = 0; i < frames / FRAMES_PER_PAGE; i++) - if (pkt->pages[i]) - __free_page(pkt->pages[i]); - kfree(pkt->w_bio); -no_bio: - kfree(pkt); -no_pkt: - return NULL; -} - -/* - * Free a packet_data struct - */ -static void pkt_free_packet_data(struct packet_data *pkt) -{ - int i; - - for (i = 0; i < pkt->frames; i++) - kfree(pkt->r_bios[i]); - for (i = 0; i < pkt->frames / FRAMES_PER_PAGE; i++) - __free_page(pkt->pages[i]); - kfree(pkt->w_bio); - kfree(pkt); -} - -static void pkt_shrink_pktlist(struct pktcdvd_device *pd) -{ - struct packet_data *pkt, *next; - - BUG_ON(!list_empty(&pd->cdrw.pkt_active_list)); - - list_for_each_entry_safe(pkt, next, &pd->cdrw.pkt_free_list, list) { - pkt_free_packet_data(pkt); - } - INIT_LIST_HEAD(&pd->cdrw.pkt_free_list); -} - -static int pkt_grow_pktlist(struct pktcdvd_device *pd, int nr_packets) -{ - struct packet_data *pkt; - - BUG_ON(!list_empty(&pd->cdrw.pkt_free_list)); - - while (nr_packets > 0) { - pkt = pkt_alloc_packet_data(pd->settings.size >> 2); - if (!pkt) { - pkt_shrink_pktlist(pd); - return 0; - } - pkt->id = nr_packets; - pkt->pd = pd; - list_add(&pkt->list, &pd->cdrw.pkt_free_list); - nr_packets--; - } - return 1; -} - -static inline struct pkt_rb_node *pkt_rbtree_next(struct pkt_rb_node *node) -{ - struct rb_node *n = rb_next(&node->rb_node); - if (!n) - return NULL; - return rb_entry(n, struct pkt_rb_node, rb_node); -} - -static void pkt_rbtree_erase(struct pktcdvd_device *pd, struct pkt_rb_node *node) -{ - rb_erase(&node->rb_node, &pd->bio_queue); - mempool_free(node, &pd->rb_pool); - pd->bio_queue_size--; - BUG_ON(pd->bio_queue_size < 0); -} - -/* - * Find the first node in the pd->bio_queue rb tree with a starting sector >= s. - */ -static struct pkt_rb_node *pkt_rbtree_find(struct pktcdvd_device *pd, sector_t s) -{ - struct rb_node *n = pd->bio_queue.rb_node; - struct rb_node *next; - struct pkt_rb_node *tmp; - - if (!n) { - BUG_ON(pd->bio_queue_size > 0); - return NULL; - } - - for (;;) { - tmp = rb_entry(n, struct pkt_rb_node, rb_node); - if (s <= tmp->bio->bi_iter.bi_sector) - next = n->rb_left; - else - next = n->rb_right; - if (!next) - break; - n = next; - } - - if (s > tmp->bio->bi_iter.bi_sector) { - tmp = pkt_rbtree_next(tmp); - if (!tmp) - return NULL; - } - BUG_ON(s > tmp->bio->bi_iter.bi_sector); - return tmp; -} - -/* - * Insert a node into the pd->bio_queue rb tree. - */ -static void pkt_rbtree_insert(struct pktcdvd_device *pd, struct pkt_rb_node *node) -{ - struct rb_node **p = &pd->bio_queue.rb_node; - struct rb_node *parent = NULL; - sector_t s = node->bio->bi_iter.bi_sector; - struct pkt_rb_node *tmp; - - while (*p) { - parent = *p; - tmp = rb_entry(parent, struct pkt_rb_node, rb_node); - if (s < tmp->bio->bi_iter.bi_sector) - p = &(*p)->rb_left; - else - p = &(*p)->rb_right; - } - rb_link_node(&node->rb_node, parent, p); - rb_insert_color(&node->rb_node, &pd->bio_queue); - pd->bio_queue_size++; -} - -/* - * Send a packet_command to the underlying block device and - * wait for completion. - */ -static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *cgc) -{ - struct request_queue *q = bdev_get_queue(pd->bdev); - struct scsi_cmnd *scmd; - struct request *rq; - int ret = 0; - - rq = scsi_alloc_request(q, (cgc->data_direction == CGC_DATA_WRITE) ? - REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0); - if (IS_ERR(rq)) - return PTR_ERR(rq); - scmd = blk_mq_rq_to_pdu(rq); - - if (cgc->buflen) { - ret = blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen, - GFP_NOIO); - if (ret) - goto out; - } - - scmd->cmd_len = COMMAND_SIZE(cgc->cmd[0]); - memcpy(scmd->cmnd, cgc->cmd, CDROM_PACKET_SIZE); - - rq->timeout = 60*HZ; - if (cgc->quiet) - rq->rq_flags |= RQF_QUIET; - - blk_execute_rq(rq, false); - if (scmd->result) - ret = -EIO; -out: - blk_mq_free_request(rq); - return ret; -} - -static const char *sense_key_string(__u8 index) -{ - static const char * const info[] = { - "No sense", "Recovered error", "Not ready", - "Medium error", "Hardware error", "Illegal request", - "Unit attention", "Data protect", "Blank check", - }; - - return index < ARRAY_SIZE(info) ? info[index] : "INVALID"; -} - -/* - * A generic sense dump / resolve mechanism should be implemented across - * all ATAPI + SCSI devices. - */ -static void pkt_dump_sense(struct pktcdvd_device *pd, - struct packet_command *cgc) -{ - struct scsi_sense_hdr *sshdr = cgc->sshdr; - - if (sshdr) - pkt_err(pd, "%*ph - sense %02x.%02x.%02x (%s)\n", - CDROM_PACKET_SIZE, cgc->cmd, - sshdr->sense_key, sshdr->asc, sshdr->ascq, - sense_key_string(sshdr->sense_key)); - else - pkt_err(pd, "%*ph - no sense\n", CDROM_PACKET_SIZE, cgc->cmd); -} - -/* - * flush the drive cache to media - */ -static int pkt_flush_cache(struct pktcdvd_device *pd) -{ - struct packet_command cgc; - - init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); - cgc.cmd[0] = GPCMD_FLUSH_CACHE; - cgc.quiet = 1; - - /* - * the IMMED bit -- we default to not setting it, although that - * would allow a much faster close, this is safer - */ -#if 0 - cgc.cmd[1] = 1 << 1; -#endif - return pkt_generic_packet(pd, &cgc); -} - -/* - * speed is given as the normal factor, e.g. 4 for 4x - */ -static noinline_for_stack int pkt_set_speed(struct pktcdvd_device *pd, - unsigned write_speed, unsigned read_speed) -{ - struct packet_command cgc; - struct scsi_sense_hdr sshdr; - int ret; - - init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); - cgc.sshdr = &sshdr; - cgc.cmd[0] = GPCMD_SET_SPEED; - cgc.cmd[2] = (read_speed >> 8) & 0xff; - cgc.cmd[3] = read_speed & 0xff; - cgc.cmd[4] = (write_speed >> 8) & 0xff; - cgc.cmd[5] = write_speed & 0xff; - - ret = pkt_generic_packet(pd, &cgc); - if (ret) - pkt_dump_sense(pd, &cgc); - - return ret; -} - -/* - * Queue a bio for processing by the low-level CD device. Must be called - * from process context. - */ -static void pkt_queue_bio(struct pktcdvd_device *pd, struct bio *bio) -{ - spin_lock(&pd->iosched.lock); - if (bio_data_dir(bio) == READ) - bio_list_add(&pd->iosched.read_queue, bio); - else - bio_list_add(&pd->iosched.write_queue, bio); - spin_unlock(&pd->iosched.lock); - - atomic_set(&pd->iosched.attention, 1); - wake_up(&pd->wqueue); -} - -/* - * Process the queued read/write requests. This function handles special - * requirements for CDRW drives: - * - A cache flush command must be inserted before a read request if the - * previous request was a write. - * - Switching between reading and writing is slow, so don't do it more often - * than necessary. - * - Optimize for throughput at the expense of latency. This means that streaming - * writes will never be interrupted by a read, but if the drive has to seek - * before the next write, switch to reading instead if there are any pending - * read requests. - * - Set the read speed according to current usage pattern. When only reading - * from the device, it's best to use the highest possible read speed, but - * when switching often between reading and writing, it's better to have the - * same read and write speeds. - */ -static void pkt_iosched_process_queue(struct pktcdvd_device *pd) -{ - - if (atomic_read(&pd->iosched.attention) == 0) - return; - atomic_set(&pd->iosched.attention, 0); - - for (;;) { - struct bio *bio; - int reads_queued, writes_queued; - - spin_lock(&pd->iosched.lock); - reads_queued = !bio_list_empty(&pd->iosched.read_queue); - writes_queued = !bio_list_empty(&pd->iosched.write_queue); - spin_unlock(&pd->iosched.lock); - - if (!reads_queued && !writes_queued) - break; - - if (pd->iosched.writing) { - int need_write_seek = 1; - spin_lock(&pd->iosched.lock); - bio = bio_list_peek(&pd->iosched.write_queue); - spin_unlock(&pd->iosched.lock); - if (bio && (bio->bi_iter.bi_sector == - pd->iosched.last_write)) - need_write_seek = 0; - if (need_write_seek && reads_queued) { - if (atomic_read(&pd->cdrw.pending_bios) > 0) { - pkt_dbg(2, pd, "write, waiting\n"); - break; - } - pkt_flush_cache(pd); - pd->iosched.writing = 0; - } - } else { - if (!reads_queued && writes_queued) { - if (atomic_read(&pd->cdrw.pending_bios) > 0) { - pkt_dbg(2, pd, "read, waiting\n"); - break; - } - pd->iosched.writing = 1; - } - } - - spin_lock(&pd->iosched.lock); - if (pd->iosched.writing) - bio = bio_list_pop(&pd->iosched.write_queue); - else - bio = bio_list_pop(&pd->iosched.read_queue); - spin_unlock(&pd->iosched.lock); - - if (!bio) - continue; - - if (bio_data_dir(bio) == READ) - pd->iosched.successive_reads += - bio->bi_iter.bi_size >> 10; - else { - pd->iosched.successive_reads = 0; - pd->iosched.last_write = bio_end_sector(bio); - } - if (pd->iosched.successive_reads >= HI_SPEED_SWITCH) { - if (pd->read_speed == pd->write_speed) { - pd->read_speed = MAX_SPEED; - pkt_set_speed(pd, pd->write_speed, pd->read_speed); - } - } else { - if (pd->read_speed != pd->write_speed) { - pd->read_speed = pd->write_speed; - pkt_set_speed(pd, pd->write_speed, pd->read_speed); - } - } - - atomic_inc(&pd->cdrw.pending_bios); - submit_bio_noacct(bio); - } -} - -/* - * Special care is needed if the underlying block device has a small - * max_phys_segments value. - */ -static int pkt_set_segment_merging(struct pktcdvd_device *pd, struct request_queue *q) -{ - if ((pd->settings.size << 9) / CD_FRAMESIZE - <= queue_max_segments(q)) { - /* - * The cdrom device can handle one segment/frame - */ - clear_bit(PACKET_MERGE_SEGS, &pd->flags); - return 0; - } else if ((pd->settings.size << 9) / PAGE_SIZE - <= queue_max_segments(q)) { - /* - * We can handle this case at the expense of some extra memory - * copies during write operations - */ - set_bit(PACKET_MERGE_SEGS, &pd->flags); - return 0; - } else { - pkt_err(pd, "cdrom max_phys_segments too small\n"); - return -EIO; - } -} - -static void pkt_end_io_read(struct bio *bio) -{ - struct packet_data *pkt = bio->bi_private; - struct pktcdvd_device *pd = pkt->pd; - BUG_ON(!pd); - - pkt_dbg(2, pd, "bio=%p sec0=%llx sec=%llx err=%d\n", - bio, (unsigned long long)pkt->sector, - (unsigned long long)bio->bi_iter.bi_sector, bio->bi_status); - - if (bio->bi_status) - atomic_inc(&pkt->io_errors); - bio_uninit(bio); - if (atomic_dec_and_test(&pkt->io_wait)) { - atomic_inc(&pkt->run_sm); - wake_up(&pd->wqueue); - } - pkt_bio_finished(pd); -} - -static void pkt_end_io_packet_write(struct bio *bio) -{ - struct packet_data *pkt = bio->bi_private; - struct pktcdvd_device *pd = pkt->pd; - BUG_ON(!pd); - - pkt_dbg(2, pd, "id=%d, err=%d\n", pkt->id, bio->bi_status); - - pd->stats.pkt_ended++; - - bio_uninit(bio); - pkt_bio_finished(pd); - atomic_dec(&pkt->io_wait); - atomic_inc(&pkt->run_sm); - wake_up(&pd->wqueue); -} - -/* - * Schedule reads for the holes in a packet - */ -static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) -{ - int frames_read = 0; - struct bio *bio; - int f; - char written[PACKET_MAX_SIZE]; - - BUG_ON(bio_list_empty(&pkt->orig_bios)); - - atomic_set(&pkt->io_wait, 0); - atomic_set(&pkt->io_errors, 0); - - /* - * Figure out which frames we need to read before we can write. - */ - memset(written, 0, sizeof(written)); - spin_lock(&pkt->lock); - bio_list_for_each(bio, &pkt->orig_bios) { - int first_frame = (bio->bi_iter.bi_sector - pkt->sector) / - (CD_FRAMESIZE >> 9); - int num_frames = bio->bi_iter.bi_size / CD_FRAMESIZE; - pd->stats.secs_w += num_frames * (CD_FRAMESIZE >> 9); - BUG_ON(first_frame < 0); - BUG_ON(first_frame + num_frames > pkt->frames); - for (f = first_frame; f < first_frame + num_frames; f++) - written[f] = 1; - } - spin_unlock(&pkt->lock); - - if (pkt->cache_valid) { - pkt_dbg(2, pd, "zone %llx cached\n", - (unsigned long long)pkt->sector); - goto out_account; - } - - /* - * Schedule reads for missing parts of the packet. - */ - for (f = 0; f < pkt->frames; f++) { - int p, offset; - - if (written[f]) - continue; - - bio = pkt->r_bios[f]; - bio_init(bio, pd->bdev, bio->bi_inline_vecs, 1, REQ_OP_READ); - bio->bi_iter.bi_sector = pkt->sector + f * (CD_FRAMESIZE >> 9); - bio->bi_end_io = pkt_end_io_read; - bio->bi_private = pkt; - - p = (f * CD_FRAMESIZE) / PAGE_SIZE; - offset = (f * CD_FRAMESIZE) % PAGE_SIZE; - pkt_dbg(2, pd, "Adding frame %d, page:%p offs:%d\n", - f, pkt->pages[p], offset); - if (!bio_add_page(bio, pkt->pages[p], CD_FRAMESIZE, offset)) - BUG(); - - atomic_inc(&pkt->io_wait); - pkt_queue_bio(pd, bio); - frames_read++; - } - -out_account: - pkt_dbg(2, pd, "need %d frames for zone %llx\n", - frames_read, (unsigned long long)pkt->sector); - pd->stats.pkt_started++; - pd->stats.secs_rg += frames_read * (CD_FRAMESIZE >> 9); -} - -/* - * Find a packet matching zone, or the least recently used packet if - * there is no match. - */ -static struct packet_data *pkt_get_packet_data(struct pktcdvd_device *pd, int zone) -{ - struct packet_data *pkt; - - list_for_each_entry(pkt, &pd->cdrw.pkt_free_list, list) { - if (pkt->sector == zone || pkt->list.next == &pd->cdrw.pkt_free_list) { - list_del_init(&pkt->list); - if (pkt->sector != zone) - pkt->cache_valid = 0; - return pkt; - } - } - BUG(); - return NULL; -} - -static void pkt_put_packet_data(struct pktcdvd_device *pd, struct packet_data *pkt) -{ - if (pkt->cache_valid) { - list_add(&pkt->list, &pd->cdrw.pkt_free_list); - } else { - list_add_tail(&pkt->list, &pd->cdrw.pkt_free_list); - } -} - -static inline void pkt_set_state(struct packet_data *pkt, enum packet_data_state state) -{ -#if PACKET_DEBUG > 1 - static const char *state_name[] = { - "IDLE", "WAITING", "READ_WAIT", "WRITE_WAIT", "RECOVERY", "FINISHED" - }; - enum packet_data_state old_state = pkt->state; - pkt_dbg(2, pd, "pkt %2d : s=%6llx %s -> %s\n", - pkt->id, (unsigned long long)pkt->sector, - state_name[old_state], state_name[state]); -#endif - pkt->state = state; -} - -/* - * Scan the work queue to see if we can start a new packet. - * returns non-zero if any work was done. - */ -static int pkt_handle_queue(struct pktcdvd_device *pd) -{ - struct packet_data *pkt, *p; - struct bio *bio = NULL; - sector_t zone = 0; /* Suppress gcc warning */ - struct pkt_rb_node *node, *first_node; - struct rb_node *n; - - atomic_set(&pd->scan_queue, 0); - - if (list_empty(&pd->cdrw.pkt_free_list)) { - pkt_dbg(2, pd, "no pkt\n"); - return 0; - } - - /* - * Try to find a zone we are not already working on. - */ - spin_lock(&pd->lock); - first_node = pkt_rbtree_find(pd, pd->current_sector); - if (!first_node) { - n = rb_first(&pd->bio_queue); - if (n) - first_node = rb_entry(n, struct pkt_rb_node, rb_node); - } - node = first_node; - while (node) { - bio = node->bio; - zone = get_zone(bio->bi_iter.bi_sector, pd); - list_for_each_entry(p, &pd->cdrw.pkt_active_list, list) { - if (p->sector == zone) { - bio = NULL; - goto try_next_bio; - } - } - break; -try_next_bio: - node = pkt_rbtree_next(node); - if (!node) { - n = rb_first(&pd->bio_queue); - if (n) - node = rb_entry(n, struct pkt_rb_node, rb_node); - } - if (node == first_node) - node = NULL; - } - spin_unlock(&pd->lock); - if (!bio) { - pkt_dbg(2, pd, "no bio\n"); - return 0; - } - - pkt = pkt_get_packet_data(pd, zone); - - pd->current_sector = zone + pd->settings.size; - pkt->sector = zone; - BUG_ON(pkt->frames != pd->settings.size >> 2); - pkt->write_size = 0; - - /* - * Scan work queue for bios in the same zone and link them - * to this packet. - */ - spin_lock(&pd->lock); - pkt_dbg(2, pd, "looking for zone %llx\n", (unsigned long long)zone); - while ((node = pkt_rbtree_find(pd, zone)) != NULL) { - bio = node->bio; - pkt_dbg(2, pd, "found zone=%llx\n", (unsigned long long) - get_zone(bio->bi_iter.bi_sector, pd)); - if (get_zone(bio->bi_iter.bi_sector, pd) != zone) - break; - pkt_rbtree_erase(pd, node); - spin_lock(&pkt->lock); - bio_list_add(&pkt->orig_bios, bio); - pkt->write_size += bio->bi_iter.bi_size / CD_FRAMESIZE; - spin_unlock(&pkt->lock); - } - /* check write congestion marks, and if bio_queue_size is - * below, wake up any waiters - */ - if (pd->congested && - pd->bio_queue_size <= pd->write_congestion_off) { - pd->congested = false; - wake_up_var(&pd->congested); - } - spin_unlock(&pd->lock); - - pkt->sleep_time = max(PACKET_WAIT_TIME, 1); - pkt_set_state(pkt, PACKET_WAITING_STATE); - atomic_set(&pkt->run_sm, 1); - - spin_lock(&pd->cdrw.active_list_lock); - list_add(&pkt->list, &pd->cdrw.pkt_active_list); - spin_unlock(&pd->cdrw.active_list_lock); - - return 1; -} - -/** - * bio_list_copy_data - copy contents of data buffers from one chain of bios to - * another - * @src: source bio list - * @dst: destination bio list - * - * Stops when it reaches the end of either the @src list or @dst list - that is, - * copies min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of - * bios). - */ -static void bio_list_copy_data(struct bio *dst, struct bio *src) -{ - struct bvec_iter src_iter = src->bi_iter; - struct bvec_iter dst_iter = dst->bi_iter; - - while (1) { - if (!src_iter.bi_size) { - src = src->bi_next; - if (!src) - break; - - src_iter = src->bi_iter; - } - - if (!dst_iter.bi_size) { - dst = dst->bi_next; - if (!dst) - break; - - dst_iter = dst->bi_iter; - } - - bio_copy_data_iter(dst, &dst_iter, src, &src_iter); - } -} - -/* - * Assemble a bio to write one packet and queue the bio for processing - * by the underlying block device. - */ -static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) -{ - int f; - - bio_init(pkt->w_bio, pd->bdev, pkt->w_bio->bi_inline_vecs, pkt->frames, - REQ_OP_WRITE); - pkt->w_bio->bi_iter.bi_sector = pkt->sector; - pkt->w_bio->bi_end_io = pkt_end_io_packet_write; - pkt->w_bio->bi_private = pkt; - - /* XXX: locking? */ - for (f = 0; f < pkt->frames; f++) { - struct page *page = pkt->pages[(f * CD_FRAMESIZE) / PAGE_SIZE]; - unsigned offset = (f * CD_FRAMESIZE) % PAGE_SIZE; - - if (!bio_add_page(pkt->w_bio, page, CD_FRAMESIZE, offset)) - BUG(); - } - pkt_dbg(2, pd, "vcnt=%d\n", pkt->w_bio->bi_vcnt); - - /* - * Fill-in bvec with data from orig_bios. - */ - spin_lock(&pkt->lock); - bio_list_copy_data(pkt->w_bio, pkt->orig_bios.head); - - pkt_set_state(pkt, PACKET_WRITE_WAIT_STATE); - spin_unlock(&pkt->lock); - - pkt_dbg(2, pd, "Writing %d frames for zone %llx\n", - pkt->write_size, (unsigned long long)pkt->sector); - - if (test_bit(PACKET_MERGE_SEGS, &pd->flags) || (pkt->write_size < pkt->frames)) - pkt->cache_valid = 1; - else - pkt->cache_valid = 0; - - /* Start the write request */ - atomic_set(&pkt->io_wait, 1); - pkt_queue_bio(pd, pkt->w_bio); -} - -static void pkt_finish_packet(struct packet_data *pkt, blk_status_t status) -{ - struct bio *bio; - - if (status) - pkt->cache_valid = 0; - - /* Finish all bios corresponding to this packet */ - while ((bio = bio_list_pop(&pkt->orig_bios))) { - bio->bi_status = status; - bio_endio(bio); - } -} - -static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data *pkt) -{ - pkt_dbg(2, pd, "pkt %d\n", pkt->id); - - for (;;) { - switch (pkt->state) { - case PACKET_WAITING_STATE: - if ((pkt->write_size < pkt->frames) && (pkt->sleep_time > 0)) - return; - - pkt->sleep_time = 0; - pkt_gather_data(pd, pkt); - pkt_set_state(pkt, PACKET_READ_WAIT_STATE); - break; - - case PACKET_READ_WAIT_STATE: - if (atomic_read(&pkt->io_wait) > 0) - return; - - if (atomic_read(&pkt->io_errors) > 0) { - pkt_set_state(pkt, PACKET_RECOVERY_STATE); - } else { - pkt_start_write(pd, pkt); - } - break; - - case PACKET_WRITE_WAIT_STATE: - if (atomic_read(&pkt->io_wait) > 0) - return; - - if (!pkt->w_bio->bi_status) { - pkt_set_state(pkt, PACKET_FINISHED_STATE); - } else { - pkt_set_state(pkt, PACKET_RECOVERY_STATE); - } - break; - - case PACKET_RECOVERY_STATE: - pkt_dbg(2, pd, "No recovery possible\n"); - pkt_set_state(pkt, PACKET_FINISHED_STATE); - break; - - case PACKET_FINISHED_STATE: - pkt_finish_packet(pkt, pkt->w_bio->bi_status); - return; - - default: - BUG(); - break; - } - } -} - -static void pkt_handle_packets(struct pktcdvd_device *pd) -{ - struct packet_data *pkt, *next; - - /* - * Run state machine for active packets - */ - list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) { - if (atomic_read(&pkt->run_sm) > 0) { - atomic_set(&pkt->run_sm, 0); - pkt_run_state_machine(pd, pkt); - } - } - - /* - * Move no longer active packets to the free list - */ - spin_lock(&pd->cdrw.active_list_lock); - list_for_each_entry_safe(pkt, next, &pd->cdrw.pkt_active_list, list) { - if (pkt->state == PACKET_FINISHED_STATE) { - list_del(&pkt->list); - pkt_put_packet_data(pd, pkt); - pkt_set_state(pkt, PACKET_IDLE_STATE); - atomic_set(&pd->scan_queue, 1); - } - } - spin_unlock(&pd->cdrw.active_list_lock); -} - -static void pkt_count_states(struct pktcdvd_device *pd, int *states) -{ - struct packet_data *pkt; - int i; - - for (i = 0; i < PACKET_NUM_STATES; i++) - states[i] = 0; - - spin_lock(&pd->cdrw.active_list_lock); - list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) { - states[pkt->state]++; - } - spin_unlock(&pd->cdrw.active_list_lock); -} - -/* - * kcdrwd is woken up when writes have been queued for one of our - * registered devices - */ -static int kcdrwd(void *foobar) -{ - struct pktcdvd_device *pd = foobar; - struct packet_data *pkt; - long min_sleep_time, residue; - - set_user_nice(current, MIN_NICE); - set_freezable(); - - for (;;) { - DECLARE_WAITQUEUE(wait, current); - - /* - * Wait until there is something to do - */ - add_wait_queue(&pd->wqueue, &wait); - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - - /* Check if we need to run pkt_handle_queue */ - if (atomic_read(&pd->scan_queue) > 0) - goto work_to_do; - - /* Check if we need to run the state machine for some packet */ - list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) { - if (atomic_read(&pkt->run_sm) > 0) - goto work_to_do; - } - - /* Check if we need to process the iosched queues */ - if (atomic_read(&pd->iosched.attention) != 0) - goto work_to_do; - - /* Otherwise, go to sleep */ - if (PACKET_DEBUG > 1) { - int states[PACKET_NUM_STATES]; - pkt_count_states(pd, states); - pkt_dbg(2, pd, "i:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n", - states[0], states[1], states[2], - states[3], states[4], states[5]); - } - - min_sleep_time = MAX_SCHEDULE_TIMEOUT; - list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) { - if (pkt->sleep_time && pkt->sleep_time < min_sleep_time) - min_sleep_time = pkt->sleep_time; - } - - pkt_dbg(2, pd, "sleeping\n"); - residue = schedule_timeout(min_sleep_time); - pkt_dbg(2, pd, "wake up\n"); - - /* make swsusp happy with our thread */ - try_to_freeze(); - - list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) { - if (!pkt->sleep_time) - continue; - pkt->sleep_time -= min_sleep_time - residue; - if (pkt->sleep_time <= 0) { - pkt->sleep_time = 0; - atomic_inc(&pkt->run_sm); - } - } - - if (kthread_should_stop()) - break; - } -work_to_do: - set_current_state(TASK_RUNNING); - remove_wait_queue(&pd->wqueue, &wait); - - if (kthread_should_stop()) - break; - - /* - * if pkt_handle_queue returns true, we can queue - * another request. - */ - while (pkt_handle_queue(pd)) - ; - - /* - * Handle packet state machine - */ - pkt_handle_packets(pd); - - /* - * Handle iosched queues - */ - pkt_iosched_process_queue(pd); - } - - return 0; -} - -static void pkt_print_settings(struct pktcdvd_device *pd) -{ - pkt_info(pd, "%s packets, %u blocks, Mode-%c disc\n", - pd->settings.fp ? "Fixed" : "Variable", - pd->settings.size >> 2, - pd->settings.block_mode == 8 ? '1' : '2'); -} - -static int pkt_mode_sense(struct pktcdvd_device *pd, struct packet_command *cgc, int page_code, int page_control) -{ - memset(cgc->cmd, 0, sizeof(cgc->cmd)); - - cgc->cmd[0] = GPCMD_MODE_SENSE_10; - cgc->cmd[2] = page_code | (page_control << 6); - cgc->cmd[7] = cgc->buflen >> 8; - cgc->cmd[8] = cgc->buflen & 0xff; - cgc->data_direction = CGC_DATA_READ; - return pkt_generic_packet(pd, cgc); -} - -static int pkt_mode_select(struct pktcdvd_device *pd, struct packet_command *cgc) -{ - memset(cgc->cmd, 0, sizeof(cgc->cmd)); - memset(cgc->buffer, 0, 2); - cgc->cmd[0] = GPCMD_MODE_SELECT_10; - cgc->cmd[1] = 0x10; /* PF */ - cgc->cmd[7] = cgc->buflen >> 8; - cgc->cmd[8] = cgc->buflen & 0xff; - cgc->data_direction = CGC_DATA_WRITE; - return pkt_generic_packet(pd, cgc); -} - -static int pkt_get_disc_info(struct pktcdvd_device *pd, disc_information *di) -{ - struct packet_command cgc; - int ret; - - /* set up command and get the disc info */ - init_cdrom_command(&cgc, di, sizeof(*di), CGC_DATA_READ); - cgc.cmd[0] = GPCMD_READ_DISC_INFO; - cgc.cmd[8] = cgc.buflen = 2; - cgc.quiet = 1; - - ret = pkt_generic_packet(pd, &cgc); - if (ret) - return ret; - - /* not all drives have the same disc_info length, so requeue - * packet with the length the drive tells us it can supply - */ - cgc.buflen = be16_to_cpu(di->disc_information_length) + - sizeof(di->disc_information_length); - - if (cgc.buflen > sizeof(disc_information)) - cgc.buflen = sizeof(disc_information); - - cgc.cmd[8] = cgc.buflen; - return pkt_generic_packet(pd, &cgc); -} - -static int pkt_get_track_info(struct pktcdvd_device *pd, __u16 track, __u8 type, track_information *ti) -{ - struct packet_command cgc; - int ret; - - init_cdrom_command(&cgc, ti, 8, CGC_DATA_READ); - cgc.cmd[0] = GPCMD_READ_TRACK_RZONE_INFO; - cgc.cmd[1] = type & 3; - cgc.cmd[4] = (track & 0xff00) >> 8; - cgc.cmd[5] = track & 0xff; - cgc.cmd[8] = 8; - cgc.quiet = 1; - - ret = pkt_generic_packet(pd, &cgc); - if (ret) - return ret; - - cgc.buflen = be16_to_cpu(ti->track_information_length) + - sizeof(ti->track_information_length); - - if (cgc.buflen > sizeof(track_information)) - cgc.buflen = sizeof(track_information); - - cgc.cmd[8] = cgc.buflen; - return pkt_generic_packet(pd, &cgc); -} - -static noinline_for_stack int pkt_get_last_written(struct pktcdvd_device *pd, - long *last_written) -{ - disc_information di; - track_information ti; - __u32 last_track; - int ret; - - ret = pkt_get_disc_info(pd, &di); - if (ret) - return ret; - - last_track = (di.last_track_msb << 8) | di.last_track_lsb; - ret = pkt_get_track_info(pd, last_track, 1, &ti); - if (ret) - return ret; - - /* if this track is blank, try the previous. */ - if (ti.blank) { - last_track--; - ret = pkt_get_track_info(pd, last_track, 1, &ti); - if (ret) - return ret; - } - - /* if last recorded field is valid, return it. */ - if (ti.lra_v) { - *last_written = be32_to_cpu(ti.last_rec_address); - } else { - /* make it up instead */ - *last_written = be32_to_cpu(ti.track_start) + - be32_to_cpu(ti.track_size); - if (ti.free_blocks) - *last_written -= (be32_to_cpu(ti.free_blocks) + 7); - } - return 0; -} - -/* - * write mode select package based on pd->settings - */ -static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd) -{ - struct packet_command cgc; - struct scsi_sense_hdr sshdr; - write_param_page *wp; - char buffer[128]; - int ret, size; - - /* doesn't apply to DVD+RW or DVD-RAM */ - if ((pd->mmc3_profile == 0x1a) || (pd->mmc3_profile == 0x12)) - return 0; - - memset(buffer, 0, sizeof(buffer)); - init_cdrom_command(&cgc, buffer, sizeof(*wp), CGC_DATA_READ); - cgc.sshdr = &sshdr; - ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0); - if (ret) { - pkt_dump_sense(pd, &cgc); - return ret; - } - - size = 2 + ((buffer[0] << 8) | (buffer[1] & 0xff)); - pd->mode_offset = (buffer[6] << 8) | (buffer[7] & 0xff); - if (size > sizeof(buffer)) - size = sizeof(buffer); - - /* - * now get it all - */ - init_cdrom_command(&cgc, buffer, size, CGC_DATA_READ); - cgc.sshdr = &sshdr; - ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0); - if (ret) { - pkt_dump_sense(pd, &cgc); - return ret; - } - - /* - * write page is offset header + block descriptor length - */ - wp = (write_param_page *) &buffer[sizeof(struct mode_page_header) + pd->mode_offset]; - - wp->fp = pd->settings.fp; - wp->track_mode = pd->settings.track_mode; - wp->write_type = pd->settings.write_type; - wp->data_block_type = pd->settings.block_mode; - - wp->multi_session = 0; - -#ifdef PACKET_USE_LS - wp->link_size = 7; - wp->ls_v = 1; -#endif - - if (wp->data_block_type == PACKET_BLOCK_MODE1) { - wp->session_format = 0; - wp->subhdr2 = 0x20; - } else if (wp->data_block_type == PACKET_BLOCK_MODE2) { - wp->session_format = 0x20; - wp->subhdr2 = 8; -#if 0 - wp->mcn[0] = 0x80; - memcpy(&wp->mcn[1], PACKET_MCN, sizeof(wp->mcn) - 1); -#endif - } else { - /* - * paranoia - */ - pkt_err(pd, "write mode wrong %d\n", wp->data_block_type); - return 1; - } - wp->packet_size = cpu_to_be32(pd->settings.size >> 2); - - cgc.buflen = cgc.cmd[8] = size; - ret = pkt_mode_select(pd, &cgc); - if (ret) { - pkt_dump_sense(pd, &cgc); - return ret; - } - - pkt_print_settings(pd); - return 0; -} - -/* - * 1 -- we can write to this track, 0 -- we can't - */ -static int pkt_writable_track(struct pktcdvd_device *pd, track_information *ti) -{ - switch (pd->mmc3_profile) { - case 0x1a: /* DVD+RW */ - case 0x12: /* DVD-RAM */ - /* The track is always writable on DVD+RW/DVD-RAM */ - return 1; - default: - break; - } - - if (!ti->packet || !ti->fp) - return 0; - - /* - * "good" settings as per Mt Fuji. - */ - if (ti->rt == 0 && ti->blank == 0) - return 1; - - if (ti->rt == 0 && ti->blank == 1) - return 1; - - if (ti->rt == 1 && ti->blank == 0) - return 1; - - pkt_err(pd, "bad state %d-%d-%d\n", ti->rt, ti->blank, ti->packet); - return 0; -} - -/* - * 1 -- we can write to this disc, 0 -- we can't - */ -static int pkt_writable_disc(struct pktcdvd_device *pd, disc_information *di) -{ - switch (pd->mmc3_profile) { - case 0x0a: /* CD-RW */ - case 0xffff: /* MMC3 not supported */ - break; - case 0x1a: /* DVD+RW */ - case 0x13: /* DVD-RW */ - case 0x12: /* DVD-RAM */ - return 1; - default: - pkt_dbg(2, pd, "Wrong disc profile (%x)\n", - pd->mmc3_profile); - return 0; - } - - /* - * for disc type 0xff we should probably reserve a new track. - * but i'm not sure, should we leave this to user apps? probably. - */ - if (di->disc_type == 0xff) { - pkt_notice(pd, "unknown disc - no track?\n"); - return 0; - } - - if (di->disc_type != 0x20 && di->disc_type != 0) { - pkt_err(pd, "wrong disc type (%x)\n", di->disc_type); - return 0; - } - - if (di->erasable == 0) { - pkt_notice(pd, "disc not erasable\n"); - return 0; - } - - if (di->border_status == PACKET_SESSION_RESERVED) { - pkt_err(pd, "can't write to last track (reserved)\n"); - return 0; - } - - return 1; -} - -static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) -{ - struct packet_command cgc; - unsigned char buf[12]; - disc_information di; - track_information ti; - int ret, track; - - init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_READ); - cgc.cmd[0] = GPCMD_GET_CONFIGURATION; - cgc.cmd[8] = 8; - ret = pkt_generic_packet(pd, &cgc); - pd->mmc3_profile = ret ? 0xffff : buf[6] << 8 | buf[7]; - - memset(&di, 0, sizeof(disc_information)); - memset(&ti, 0, sizeof(track_information)); - - ret = pkt_get_disc_info(pd, &di); - if (ret) { - pkt_err(pd, "failed get_disc\n"); - return ret; - } - - if (!pkt_writable_disc(pd, &di)) - return -EROFS; - - pd->type = di.erasable ? PACKET_CDRW : PACKET_CDR; - - track = 1; /* (di.last_track_msb << 8) | di.last_track_lsb; */ - ret = pkt_get_track_info(pd, track, 1, &ti); - if (ret) { - pkt_err(pd, "failed get_track\n"); - return ret; - } - - if (!pkt_writable_track(pd, &ti)) { - pkt_err(pd, "can't write to this track\n"); - return -EROFS; - } - - /* - * we keep packet size in 512 byte units, makes it easier to - * deal with request calculations. - */ - pd->settings.size = be32_to_cpu(ti.fixed_packet_size) << 2; - if (pd->settings.size == 0) { - pkt_notice(pd, "detected zero packet size!\n"); - return -ENXIO; - } - if (pd->settings.size > PACKET_MAX_SECTORS) { - pkt_err(pd, "packet size is too big\n"); - return -EROFS; - } - pd->settings.fp = ti.fp; - pd->offset = (be32_to_cpu(ti.track_start) << 2) & (pd->settings.size - 1); - - if (ti.nwa_v) { - pd->nwa = be32_to_cpu(ti.next_writable); - set_bit(PACKET_NWA_VALID, &pd->flags); - } - - /* - * in theory we could use lra on -RW media as well and just zero - * blocks that haven't been written yet, but in practice that - * is just a no-go. we'll use that for -R, naturally. - */ - if (ti.lra_v) { - pd->lra = be32_to_cpu(ti.last_rec_address); - set_bit(PACKET_LRA_VALID, &pd->flags); - } else { - pd->lra = 0xffffffff; - set_bit(PACKET_LRA_VALID, &pd->flags); - } - - /* - * fine for now - */ - pd->settings.link_loss = 7; - pd->settings.write_type = 0; /* packet */ - pd->settings.track_mode = ti.track_mode; - - /* - * mode1 or mode2 disc - */ - switch (ti.data_mode) { - case PACKET_MODE1: - pd->settings.block_mode = PACKET_BLOCK_MODE1; - break; - case PACKET_MODE2: - pd->settings.block_mode = PACKET_BLOCK_MODE2; - break; - default: - pkt_err(pd, "unknown data mode\n"); - return -EROFS; - } - return 0; -} - -/* - * enable/disable write caching on drive - */ -static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd, - int set) -{ - struct packet_command cgc; - struct scsi_sense_hdr sshdr; - unsigned char buf[64]; - int ret; - - init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_READ); - cgc.sshdr = &sshdr; - cgc.buflen = pd->mode_offset + 12; - - /* - * caching mode page might not be there, so quiet this command - */ - cgc.quiet = 1; - - ret = pkt_mode_sense(pd, &cgc, GPMODE_WCACHING_PAGE, 0); - if (ret) - return ret; - - buf[pd->mode_offset + 10] |= (!!set << 2); - - cgc.buflen = cgc.cmd[8] = 2 + ((buf[0] << 8) | (buf[1] & 0xff)); - ret = pkt_mode_select(pd, &cgc); - if (ret) { - pkt_err(pd, "write caching control failed\n"); - pkt_dump_sense(pd, &cgc); - } else if (!ret && set) - pkt_notice(pd, "enabled write caching\n"); - return ret; -} - -static int pkt_lock_door(struct pktcdvd_device *pd, int lockflag) -{ - struct packet_command cgc; - - init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); - cgc.cmd[0] = GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL; - cgc.cmd[4] = lockflag ? 1 : 0; - return pkt_generic_packet(pd, &cgc); -} - -/* - * Returns drive maximum write speed - */ -static noinline_for_stack int pkt_get_max_speed(struct pktcdvd_device *pd, - unsigned *write_speed) -{ - struct packet_command cgc; - struct scsi_sense_hdr sshdr; - unsigned char buf[256+18]; - unsigned char *cap_buf; - int ret, offset; - - cap_buf = &buf[sizeof(struct mode_page_header) + pd->mode_offset]; - init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_UNKNOWN); - cgc.sshdr = &sshdr; - - ret = pkt_mode_sense(pd, &cgc, GPMODE_CAPABILITIES_PAGE, 0); - if (ret) { - cgc.buflen = pd->mode_offset + cap_buf[1] + 2 + - sizeof(struct mode_page_header); - ret = pkt_mode_sense(pd, &cgc, GPMODE_CAPABILITIES_PAGE, 0); - if (ret) { - pkt_dump_sense(pd, &cgc); - return ret; - } - } - - offset = 20; /* Obsoleted field, used by older drives */ - if (cap_buf[1] >= 28) - offset = 28; /* Current write speed selected */ - if (cap_buf[1] >= 30) { - /* If the drive reports at least one "Logical Unit Write - * Speed Performance Descriptor Block", use the information - * in the first block. (contains the highest speed) - */ - int num_spdb = (cap_buf[30] << 8) + cap_buf[31]; - if (num_spdb > 0) - offset = 34; - } - - *write_speed = (cap_buf[offset] << 8) | cap_buf[offset + 1]; - return 0; -} - -/* These tables from cdrecord - I don't have orange book */ -/* standard speed CD-RW (1-4x) */ -static char clv_to_speed[16] = { - /* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 */ - 0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -}; -/* high speed CD-RW (-10x) */ -static char hs_clv_to_speed[16] = { - /* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 */ - 0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -}; -/* ultra high speed CD-RW */ -static char us_clv_to_speed[16] = { - /* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 */ - 0, 2, 4, 8, 0, 0,16, 0,24,32,40,48, 0, 0, 0, 0 -}; - -/* - * reads the maximum media speed from ATIP - */ -static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd, - unsigned *speed) -{ - struct packet_command cgc; - struct scsi_sense_hdr sshdr; - unsigned char buf[64]; - unsigned int size, st, sp; - int ret; - - init_cdrom_command(&cgc, buf, 2, CGC_DATA_READ); - cgc.sshdr = &sshdr; - cgc.cmd[0] = GPCMD_READ_TOC_PMA_ATIP; - cgc.cmd[1] = 2; - cgc.cmd[2] = 4; /* READ ATIP */ - cgc.cmd[8] = 2; - ret = pkt_generic_packet(pd, &cgc); - if (ret) { - pkt_dump_sense(pd, &cgc); - return ret; - } - size = ((unsigned int) buf[0]<<8) + buf[1] + 2; - if (size > sizeof(buf)) - size = sizeof(buf); - - init_cdrom_command(&cgc, buf, size, CGC_DATA_READ); - cgc.sshdr = &sshdr; - cgc.cmd[0] = GPCMD_READ_TOC_PMA_ATIP; - cgc.cmd[1] = 2; - cgc.cmd[2] = 4; - cgc.cmd[8] = size; - ret = pkt_generic_packet(pd, &cgc); - if (ret) { - pkt_dump_sense(pd, &cgc); - return ret; - } - - if (!(buf[6] & 0x40)) { - pkt_notice(pd, "disc type is not CD-RW\n"); - return 1; - } - if (!(buf[6] & 0x4)) { - pkt_notice(pd, "A1 values on media are not valid, maybe not CDRW?\n"); - return 1; - } - - st = (buf[6] >> 3) & 0x7; /* disc sub-type */ - - sp = buf[16] & 0xf; /* max speed from ATIP A1 field */ - - /* Info from cdrecord */ - switch (st) { - case 0: /* standard speed */ - *speed = clv_to_speed[sp]; - break; - case 1: /* high speed */ - *speed = hs_clv_to_speed[sp]; - break; - case 2: /* ultra high speed */ - *speed = us_clv_to_speed[sp]; - break; - default: - pkt_notice(pd, "unknown disc sub-type %d\n", st); - return 1; - } - if (*speed) { - pkt_info(pd, "maximum media speed: %d\n", *speed); - return 0; - } else { - pkt_notice(pd, "unknown speed %d for sub-type %d\n", sp, st); - return 1; - } -} - -static noinline_for_stack int pkt_perform_opc(struct pktcdvd_device *pd) -{ - struct packet_command cgc; - struct scsi_sense_hdr sshdr; - int ret; - - pkt_dbg(2, pd, "Performing OPC\n"); - - init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); - cgc.sshdr = &sshdr; - cgc.timeout = 60*HZ; - cgc.cmd[0] = GPCMD_SEND_OPC; - cgc.cmd[1] = 1; - ret = pkt_generic_packet(pd, &cgc); - if (ret) - pkt_dump_sense(pd, &cgc); - return ret; -} - -static int pkt_open_write(struct pktcdvd_device *pd) -{ - int ret; - unsigned int write_speed, media_write_speed, read_speed; - - ret = pkt_probe_settings(pd); - if (ret) { - pkt_dbg(2, pd, "failed probe\n"); - return ret; - } - - ret = pkt_set_write_settings(pd); - if (ret) { - pkt_dbg(1, pd, "failed saving write settings\n"); - return -EIO; - } - - pkt_write_caching(pd, USE_WCACHING); - - ret = pkt_get_max_speed(pd, &write_speed); - if (ret) - write_speed = 16 * 177; - switch (pd->mmc3_profile) { - case 0x13: /* DVD-RW */ - case 0x1a: /* DVD+RW */ - case 0x12: /* DVD-RAM */ - pkt_dbg(1, pd, "write speed %ukB/s\n", write_speed); - break; - default: - ret = pkt_media_speed(pd, &media_write_speed); - if (ret) - media_write_speed = 16; - write_speed = min(write_speed, media_write_speed * 177); - pkt_dbg(1, pd, "write speed %ux\n", write_speed / 176); - break; - } - read_speed = write_speed; - - ret = pkt_set_speed(pd, write_speed, read_speed); - if (ret) { - pkt_dbg(1, pd, "couldn't set write speed\n"); - return -EIO; - } - pd->write_speed = write_speed; - pd->read_speed = read_speed; - - ret = pkt_perform_opc(pd); - if (ret) { - pkt_dbg(1, pd, "Optimum Power Calibration failed\n"); - } - - return 0; -} - -/* - * called at open time. - */ -static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) -{ - int ret; - long lba; - struct request_queue *q; - struct block_device *bdev; - - /* - * We need to re-open the cdrom device without O_NONBLOCK to be able - * to read/write from/to it. It is already opened in O_NONBLOCK mode - * so open should not fail. - */ - bdev = blkdev_get_by_dev(pd->bdev->bd_dev, FMODE_READ | FMODE_EXCL, pd); - if (IS_ERR(bdev)) { - ret = PTR_ERR(bdev); - goto out; - } - - ret = pkt_get_last_written(pd, &lba); - if (ret) { - pkt_err(pd, "pkt_get_last_written failed\n"); - goto out_putdev; - } - - set_capacity(pd->disk, lba << 2); - set_capacity_and_notify(pd->bdev->bd_disk, lba << 2); - - q = bdev_get_queue(pd->bdev); - if (write) { - ret = pkt_open_write(pd); - if (ret) - goto out_putdev; - /* - * Some CDRW drives can not handle writes larger than one packet, - * even if the size is a multiple of the packet size. - */ - blk_queue_max_hw_sectors(q, pd->settings.size); - set_bit(PACKET_WRITABLE, &pd->flags); - } else { - pkt_set_speed(pd, MAX_SPEED, MAX_SPEED); - clear_bit(PACKET_WRITABLE, &pd->flags); - } - - ret = pkt_set_segment_merging(pd, q); - if (ret) - goto out_putdev; - - if (write) { - if (!pkt_grow_pktlist(pd, CONFIG_CDROM_PKTCDVD_BUFFERS)) { - pkt_err(pd, "not enough memory for buffers\n"); - ret = -ENOMEM; - goto out_putdev; - } - pkt_info(pd, "%lukB available on disc\n", lba << 1); - } - - return 0; - -out_putdev: - blkdev_put(bdev, FMODE_READ | FMODE_EXCL); -out: - return ret; -} - -/* - * called when the device is closed. makes sure that the device flushes - * the internal cache before we close. - */ -static void pkt_release_dev(struct pktcdvd_device *pd, int flush) -{ - if (flush && pkt_flush_cache(pd)) - pkt_dbg(1, pd, "not flushing cache\n"); - - pkt_lock_door(pd, 0); - - pkt_set_speed(pd, MAX_SPEED, MAX_SPEED); - blkdev_put(pd->bdev, FMODE_READ | FMODE_EXCL); - - pkt_shrink_pktlist(pd); -} - -static struct pktcdvd_device *pkt_find_dev_from_minor(unsigned int dev_minor) -{ - if (dev_minor >= MAX_WRITERS) - return NULL; - - dev_minor = array_index_nospec(dev_minor, MAX_WRITERS); - return pkt_devs[dev_minor]; -} - -static int pkt_open(struct block_device *bdev, fmode_t mode) -{ - struct pktcdvd_device *pd = NULL; - int ret; - - mutex_lock(&pktcdvd_mutex); - mutex_lock(&ctl_mutex); - pd = pkt_find_dev_from_minor(MINOR(bdev->bd_dev)); - if (!pd) { - ret = -ENODEV; - goto out; - } - BUG_ON(pd->refcnt < 0); - - pd->refcnt++; - if (pd->refcnt > 1) { - if ((mode & FMODE_WRITE) && - !test_bit(PACKET_WRITABLE, &pd->flags)) { - ret = -EBUSY; - goto out_dec; - } - } else { - ret = pkt_open_dev(pd, mode & FMODE_WRITE); - if (ret) - goto out_dec; - /* - * needed here as well, since ext2 (among others) may change - * the blocksize at mount time - */ - set_blocksize(bdev, CD_FRAMESIZE); - } - - mutex_unlock(&ctl_mutex); - mutex_unlock(&pktcdvd_mutex); - return 0; - -out_dec: - pd->refcnt--; -out: - mutex_unlock(&ctl_mutex); - mutex_unlock(&pktcdvd_mutex); - return ret; -} - -static void pkt_close(struct gendisk *disk, fmode_t mode) -{ - struct pktcdvd_device *pd = disk->private_data; - - mutex_lock(&pktcdvd_mutex); - mutex_lock(&ctl_mutex); - pd->refcnt--; - BUG_ON(pd->refcnt < 0); - if (pd->refcnt == 0) { - int flush = test_bit(PACKET_WRITABLE, &pd->flags); - pkt_release_dev(pd, flush); - } - mutex_unlock(&ctl_mutex); - mutex_unlock(&pktcdvd_mutex); -} - - -static void pkt_end_io_read_cloned(struct bio *bio) -{ - struct packet_stacked_data *psd = bio->bi_private; - struct pktcdvd_device *pd = psd->pd; - - psd->bio->bi_status = bio->bi_status; - bio_put(bio); - bio_endio(psd->bio); - mempool_free(psd, &psd_pool); - pkt_bio_finished(pd); -} - -static void pkt_make_request_read(struct pktcdvd_device *pd, struct bio *bio) -{ - struct bio *cloned_bio = - bio_alloc_clone(pd->bdev, bio, GFP_NOIO, &pkt_bio_set); - struct packet_stacked_data *psd = mempool_alloc(&psd_pool, GFP_NOIO); - - psd->pd = pd; - psd->bio = bio; - cloned_bio->bi_private = psd; - cloned_bio->bi_end_io = pkt_end_io_read_cloned; - pd->stats.secs_r += bio_sectors(bio); - pkt_queue_bio(pd, cloned_bio); -} - -static void pkt_make_request_write(struct request_queue *q, struct bio *bio) -{ - struct pktcdvd_device *pd = q->queuedata; - sector_t zone; - struct packet_data *pkt; - int was_empty, blocked_bio; - struct pkt_rb_node *node; - - zone = get_zone(bio->bi_iter.bi_sector, pd); - - /* - * If we find a matching packet in state WAITING or READ_WAIT, we can - * just append this bio to that packet. - */ - spin_lock(&pd->cdrw.active_list_lock); - blocked_bio = 0; - list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) { - if (pkt->sector == zone) { - spin_lock(&pkt->lock); - if ((pkt->state == PACKET_WAITING_STATE) || - (pkt->state == PACKET_READ_WAIT_STATE)) { - bio_list_add(&pkt->orig_bios, bio); - pkt->write_size += - bio->bi_iter.bi_size / CD_FRAMESIZE; - if ((pkt->write_size >= pkt->frames) && - (pkt->state == PACKET_WAITING_STATE)) { - atomic_inc(&pkt->run_sm); - wake_up(&pd->wqueue); - } - spin_unlock(&pkt->lock); - spin_unlock(&pd->cdrw.active_list_lock); - return; - } else { - blocked_bio = 1; - } - spin_unlock(&pkt->lock); - } - } - spin_unlock(&pd->cdrw.active_list_lock); - - /* - * Test if there is enough room left in the bio work queue - * (queue size >= congestion on mark). - * If not, wait till the work queue size is below the congestion off mark. - */ - spin_lock(&pd->lock); - if (pd->write_congestion_on > 0 - && pd->bio_queue_size >= pd->write_congestion_on) { - struct wait_bit_queue_entry wqe; - - init_wait_var_entry(&wqe, &pd->congested, 0); - for (;;) { - prepare_to_wait_event(__var_waitqueue(&pd->congested), - &wqe.wq_entry, - TASK_UNINTERRUPTIBLE); - if (pd->bio_queue_size <= pd->write_congestion_off) - break; - pd->congested = true; - spin_unlock(&pd->lock); - schedule(); - spin_lock(&pd->lock); - } - } - spin_unlock(&pd->lock); - - /* - * No matching packet found. Store the bio in the work queue. - */ - node = mempool_alloc(&pd->rb_pool, GFP_NOIO); - node->bio = bio; - spin_lock(&pd->lock); - BUG_ON(pd->bio_queue_size < 0); - was_empty = (pd->bio_queue_size == 0); - pkt_rbtree_insert(pd, node); - spin_unlock(&pd->lock); - - /* - * Wake up the worker thread. - */ - atomic_set(&pd->scan_queue, 1); - if (was_empty) { - /* This wake_up is required for correct operation */ - wake_up(&pd->wqueue); - } else if (!list_empty(&pd->cdrw.pkt_free_list) && !blocked_bio) { - /* - * This wake up is not required for correct operation, - * but improves performance in some cases. - */ - wake_up(&pd->wqueue); - } -} - -static void pkt_submit_bio(struct bio *bio) -{ - struct pktcdvd_device *pd = bio->bi_bdev->bd_disk->queue->queuedata; - struct bio *split; - - bio = bio_split_to_limits(bio); - - pkt_dbg(2, pd, "start = %6llx stop = %6llx\n", - (unsigned long long)bio->bi_iter.bi_sector, - (unsigned long long)bio_end_sector(bio)); - - /* - * Clone READ bios so we can have our own bi_end_io callback. - */ - if (bio_data_dir(bio) == READ) { - pkt_make_request_read(pd, bio); - return; - } - - if (!test_bit(PACKET_WRITABLE, &pd->flags)) { - pkt_notice(pd, "WRITE for ro device (%llu)\n", - (unsigned long long)bio->bi_iter.bi_sector); - goto end_io; - } - - if (!bio->bi_iter.bi_size || (bio->bi_iter.bi_size % CD_FRAMESIZE)) { - pkt_err(pd, "wrong bio size\n"); - goto end_io; - } - - do { - sector_t zone = get_zone(bio->bi_iter.bi_sector, pd); - sector_t last_zone = get_zone(bio_end_sector(bio) - 1, pd); - - if (last_zone != zone) { - BUG_ON(last_zone != zone + pd->settings.size); - - split = bio_split(bio, last_zone - - bio->bi_iter.bi_sector, - GFP_NOIO, &pkt_bio_set); - bio_chain(split, bio); - } else { - split = bio; - } - - pkt_make_request_write(bio->bi_bdev->bd_disk->queue, split); - } while (split != bio); - - return; -end_io: - bio_io_error(bio); -} - -static void pkt_init_queue(struct pktcdvd_device *pd) -{ - struct request_queue *q = pd->disk->queue; - - blk_queue_logical_block_size(q, CD_FRAMESIZE); - blk_queue_max_hw_sectors(q, PACKET_MAX_SECTORS); - q->queuedata = pd; -} - -static int pkt_seq_show(struct seq_file *m, void *p) -{ - struct pktcdvd_device *pd = m->private; - char *msg; - int states[PACKET_NUM_STATES]; - - seq_printf(m, "Writer %s mapped to %pg:\n", pd->name, pd->bdev); - - seq_printf(m, "\nSettings:\n"); - seq_printf(m, "\tpacket size:\t\t%dkB\n", pd->settings.size / 2); - - if (pd->settings.write_type == 0) - msg = "Packet"; - else - msg = "Unknown"; - seq_printf(m, "\twrite type:\t\t%s\n", msg); - - seq_printf(m, "\tpacket type:\t\t%s\n", pd->settings.fp ? "Fixed" : "Variable"); - seq_printf(m, "\tlink loss:\t\t%d\n", pd->settings.link_loss); - - seq_printf(m, "\ttrack mode:\t\t%d\n", pd->settings.track_mode); - - if (pd->settings.block_mode == PACKET_BLOCK_MODE1) - msg = "Mode 1"; - else if (pd->settings.block_mode == PACKET_BLOCK_MODE2) - msg = "Mode 2"; - else - msg = "Unknown"; - seq_printf(m, "\tblock mode:\t\t%s\n", msg); - - seq_printf(m, "\nStatistics:\n"); - seq_printf(m, "\tpackets started:\t%lu\n", pd->stats.pkt_started); - seq_printf(m, "\tpackets ended:\t\t%lu\n", pd->stats.pkt_ended); - seq_printf(m, "\twritten:\t\t%lukB\n", pd->stats.secs_w >> 1); - seq_printf(m, "\tread gather:\t\t%lukB\n", pd->stats.secs_rg >> 1); - seq_printf(m, "\tread:\t\t\t%lukB\n", pd->stats.secs_r >> 1); - - seq_printf(m, "\nMisc:\n"); - seq_printf(m, "\treference count:\t%d\n", pd->refcnt); - seq_printf(m, "\tflags:\t\t\t0x%lx\n", pd->flags); - seq_printf(m, "\tread speed:\t\t%ukB/s\n", pd->read_speed); - seq_printf(m, "\twrite speed:\t\t%ukB/s\n", pd->write_speed); - seq_printf(m, "\tstart offset:\t\t%lu\n", pd->offset); - seq_printf(m, "\tmode page offset:\t%u\n", pd->mode_offset); - - seq_printf(m, "\nQueue state:\n"); - seq_printf(m, "\tbios queued:\t\t%d\n", pd->bio_queue_size); - seq_printf(m, "\tbios pending:\t\t%d\n", atomic_read(&pd->cdrw.pending_bios)); - seq_printf(m, "\tcurrent sector:\t\t0x%llx\n", (unsigned long long)pd->current_sector); - - pkt_count_states(pd, states); - seq_printf(m, "\tstate:\t\t\ti:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n", - states[0], states[1], states[2], states[3], states[4], states[5]); - - seq_printf(m, "\twrite congestion marks:\toff=%d on=%d\n", - pd->write_congestion_off, - pd->write_congestion_on); - return 0; -} - -static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) -{ - int i; - struct block_device *bdev; - struct scsi_device *sdev; - - if (pd->pkt_dev == dev) { - pkt_err(pd, "recursive setup not allowed\n"); - return -EBUSY; - } - for (i = 0; i < MAX_WRITERS; i++) { - struct pktcdvd_device *pd2 = pkt_devs[i]; - if (!pd2) - continue; - if (pd2->bdev->bd_dev == dev) { - pkt_err(pd, "%pg already setup\n", pd2->bdev); - return -EBUSY; - } - if (pd2->pkt_dev == dev) { - pkt_err(pd, "can't chain pktcdvd devices\n"); - return -EBUSY; - } - } - - bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_NDELAY, NULL); - if (IS_ERR(bdev)) - return PTR_ERR(bdev); - sdev = scsi_device_from_queue(bdev->bd_disk->queue); - if (!sdev) { - blkdev_put(bdev, FMODE_READ | FMODE_NDELAY); - return -EINVAL; - } - put_device(&sdev->sdev_gendev); - - /* This is safe, since we have a reference from open(). */ - __module_get(THIS_MODULE); - - pd->bdev = bdev; - set_blocksize(bdev, CD_FRAMESIZE); - - pkt_init_queue(pd); - - atomic_set(&pd->cdrw.pending_bios, 0); - pd->cdrw.thread = kthread_run(kcdrwd, pd, "%s", pd->name); - if (IS_ERR(pd->cdrw.thread)) { - pkt_err(pd, "can't start kernel thread\n"); - goto out_mem; - } - - proc_create_single_data(pd->name, 0, pkt_proc, pkt_seq_show, pd); - pkt_dbg(1, pd, "writer mapped to %pg\n", bdev); - return 0; - -out_mem: - blkdev_put(bdev, FMODE_READ | FMODE_NDELAY); - /* This is safe: open() is still holding a reference. */ - module_put(THIS_MODULE); - return -ENOMEM; -} - -static int pkt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) -{ - struct pktcdvd_device *pd = bdev->bd_disk->private_data; - int ret; - - pkt_dbg(2, pd, "cmd %x, dev %d:%d\n", - cmd, MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev)); - - mutex_lock(&pktcdvd_mutex); - switch (cmd) { - case CDROMEJECT: - /* - * The door gets locked when the device is opened, so we - * have to unlock it or else the eject command fails. - */ - if (pd->refcnt == 1) - pkt_lock_door(pd, 0); - fallthrough; - /* - * forward selected CDROM ioctls to CD-ROM, for UDF - */ - case CDROMMULTISESSION: - case CDROMREADTOCENTRY: - case CDROM_LAST_WRITTEN: - case CDROM_SEND_PACKET: - case SCSI_IOCTL_SEND_COMMAND: - if (!bdev->bd_disk->fops->ioctl) - ret = -ENOTTY; - else - ret = bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg); - break; - default: - pkt_dbg(2, pd, "Unknown ioctl (%x)\n", cmd); - ret = -ENOTTY; - } - mutex_unlock(&pktcdvd_mutex); - - return ret; -} - -static unsigned int pkt_check_events(struct gendisk *disk, - unsigned int clearing) -{ - struct pktcdvd_device *pd = disk->private_data; - struct gendisk *attached_disk; - - if (!pd) - return 0; - if (!pd->bdev) - return 0; - attached_disk = pd->bdev->bd_disk; - if (!attached_disk || !attached_disk->fops->check_events) - return 0; - return attached_disk->fops->check_events(attached_disk, clearing); -} - -static char *pkt_devnode(struct gendisk *disk, umode_t *mode) -{ - return kasprintf(GFP_KERNEL, "pktcdvd/%s", disk->disk_name); -} - -static const struct block_device_operations pktcdvd_ops = { - .owner = THIS_MODULE, - .submit_bio = pkt_submit_bio, - .open = pkt_open, - .release = pkt_close, - .ioctl = pkt_ioctl, - .compat_ioctl = blkdev_compat_ptr_ioctl, - .check_events = pkt_check_events, - .devnode = pkt_devnode, -}; - -/* - * Set up mapping from pktcdvd device to CD-ROM device. - */ -static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) -{ - int idx; - int ret = -ENOMEM; - struct pktcdvd_device *pd; - struct gendisk *disk; - - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - - for (idx = 0; idx < MAX_WRITERS; idx++) - if (!pkt_devs[idx]) - break; - if (idx == MAX_WRITERS) { - pr_err("max %d writers supported\n", MAX_WRITERS); - ret = -EBUSY; - goto out_mutex; - } - - pd = kzalloc(sizeof(struct pktcdvd_device), GFP_KERNEL); - if (!pd) - goto out_mutex; - - ret = mempool_init_kmalloc_pool(&pd->rb_pool, PKT_RB_POOL_SIZE, - sizeof(struct pkt_rb_node)); - if (ret) - goto out_mem; - - INIT_LIST_HEAD(&pd->cdrw.pkt_free_list); - INIT_LIST_HEAD(&pd->cdrw.pkt_active_list); - spin_lock_init(&pd->cdrw.active_list_lock); - - spin_lock_init(&pd->lock); - spin_lock_init(&pd->iosched.lock); - bio_list_init(&pd->iosched.read_queue); - bio_list_init(&pd->iosched.write_queue); - sprintf(pd->name, DRIVER_NAME"%d", idx); - init_waitqueue_head(&pd->wqueue); - pd->bio_queue = RB_ROOT; - - pd->write_congestion_on = write_congestion_on; - pd->write_congestion_off = write_congestion_off; - - ret = -ENOMEM; - disk = blk_alloc_disk(NUMA_NO_NODE); - if (!disk) - goto out_mem; - pd->disk = disk; - disk->major = pktdev_major; - disk->first_minor = idx; - disk->minors = 1; - disk->fops = &pktcdvd_ops; - disk->flags = GENHD_FL_REMOVABLE | GENHD_FL_NO_PART; - strcpy(disk->disk_name, pd->name); - disk->private_data = pd; - - pd->pkt_dev = MKDEV(pktdev_major, idx); - ret = pkt_new_dev(pd, dev); - if (ret) - goto out_mem2; - - /* inherit events of the host device */ - disk->events = pd->bdev->bd_disk->events; - - ret = add_disk(disk); - if (ret) - goto out_mem2; - - pkt_sysfs_dev_new(pd); - pkt_debugfs_dev_new(pd); - - pkt_devs[idx] = pd; - if (pkt_dev) - *pkt_dev = pd->pkt_dev; - - mutex_unlock(&ctl_mutex); - return 0; - -out_mem2: - put_disk(disk); -out_mem: - mempool_exit(&pd->rb_pool); - kfree(pd); -out_mutex: - mutex_unlock(&ctl_mutex); - pr_err("setup of pktcdvd device failed\n"); - return ret; -} - -/* - * Tear down mapping from pktcdvd device to CD-ROM device. - */ -static int pkt_remove_dev(dev_t pkt_dev) -{ - struct pktcdvd_device *pd; - int idx; - int ret = 0; - - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - - for (idx = 0; idx < MAX_WRITERS; idx++) { - pd = pkt_devs[idx]; - if (pd && (pd->pkt_dev == pkt_dev)) - break; - } - if (idx == MAX_WRITERS) { - pr_debug("dev not setup\n"); - ret = -ENXIO; - goto out; - } - - if (pd->refcnt > 0) { - ret = -EBUSY; - goto out; - } - if (!IS_ERR(pd->cdrw.thread)) - kthread_stop(pd->cdrw.thread); - - pkt_devs[idx] = NULL; - - pkt_debugfs_dev_remove(pd); - pkt_sysfs_dev_remove(pd); - - blkdev_put(pd->bdev, FMODE_READ | FMODE_NDELAY); - - remove_proc_entry(pd->name, pkt_proc); - pkt_dbg(1, pd, "writer unmapped\n"); - - del_gendisk(pd->disk); - put_disk(pd->disk); - - mempool_exit(&pd->rb_pool); - kfree(pd); - - /* This is safe: open() is still holding a reference. */ - module_put(THIS_MODULE); - -out: - mutex_unlock(&ctl_mutex); - return ret; -} - -static void pkt_get_status(struct pkt_ctrl_command *ctrl_cmd) -{ - struct pktcdvd_device *pd; - - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - - pd = pkt_find_dev_from_minor(ctrl_cmd->dev_index); - if (pd) { - ctrl_cmd->dev = new_encode_dev(pd->bdev->bd_dev); - ctrl_cmd->pkt_dev = new_encode_dev(pd->pkt_dev); - } else { - ctrl_cmd->dev = 0; - ctrl_cmd->pkt_dev = 0; - } - ctrl_cmd->num_devices = MAX_WRITERS; - - mutex_unlock(&ctl_mutex); -} - -static long pkt_ctl_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - void __user *argp = (void __user *)arg; - struct pkt_ctrl_command ctrl_cmd; - int ret = 0; - dev_t pkt_dev = 0; - - if (cmd != PACKET_CTRL_CMD) - return -ENOTTY; - - if (copy_from_user(&ctrl_cmd, argp, sizeof(struct pkt_ctrl_command))) - return -EFAULT; - - switch (ctrl_cmd.command) { - case PKT_CTRL_CMD_SETUP: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - ret = pkt_setup_dev(new_decode_dev(ctrl_cmd.dev), &pkt_dev); - ctrl_cmd.pkt_dev = new_encode_dev(pkt_dev); - break; - case PKT_CTRL_CMD_TEARDOWN: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - ret = pkt_remove_dev(new_decode_dev(ctrl_cmd.pkt_dev)); - break; - case PKT_CTRL_CMD_STATUS: - pkt_get_status(&ctrl_cmd); - break; - default: - return -ENOTTY; - } - - if (copy_to_user(argp, &ctrl_cmd, sizeof(struct pkt_ctrl_command))) - return -EFAULT; - return ret; -} - -#ifdef CONFIG_COMPAT -static long pkt_ctl_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - return pkt_ctl_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); -} -#endif - -static const struct file_operations pkt_ctl_fops = { - .open = nonseekable_open, - .unlocked_ioctl = pkt_ctl_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = pkt_ctl_compat_ioctl, -#endif - .owner = THIS_MODULE, - .llseek = no_llseek, -}; - -static struct miscdevice pkt_misc = { - .minor = MISC_DYNAMIC_MINOR, - .name = DRIVER_NAME, - .nodename = "pktcdvd/control", - .fops = &pkt_ctl_fops -}; - -static int __init pkt_init(void) -{ - int ret; - - mutex_init(&ctl_mutex); - - ret = mempool_init_kmalloc_pool(&psd_pool, PSD_POOL_SIZE, - sizeof(struct packet_stacked_data)); - if (ret) - return ret; - ret = bioset_init(&pkt_bio_set, BIO_POOL_SIZE, 0, 0); - if (ret) { - mempool_exit(&psd_pool); - return ret; - } - - ret = register_blkdev(pktdev_major, DRIVER_NAME); - if (ret < 0) { - pr_err("unable to register block device\n"); - goto out2; - } - if (!pktdev_major) - pktdev_major = ret; - - ret = pkt_sysfs_init(); - if (ret) - goto out; - - pkt_debugfs_init(); - - ret = misc_register(&pkt_misc); - if (ret) { - pr_err("unable to register misc device\n"); - goto out_misc; - } - - pkt_proc = proc_mkdir("driver/"DRIVER_NAME, NULL); - - return 0; - -out_misc: - pkt_debugfs_cleanup(); - pkt_sysfs_cleanup(); -out: - unregister_blkdev(pktdev_major, DRIVER_NAME); -out2: - mempool_exit(&psd_pool); - bioset_exit(&pkt_bio_set); - return ret; -} - -static void __exit pkt_exit(void) -{ - remove_proc_entry("driver/"DRIVER_NAME, NULL); - misc_deregister(&pkt_misc); - - pkt_debugfs_cleanup(); - pkt_sysfs_cleanup(); - - unregister_blkdev(pktdev_major, DRIVER_NAME); - mempool_exit(&psd_pool); - bioset_exit(&pkt_bio_set); -} - -MODULE_DESCRIPTION("Packet writing layer for CD/DVD drives"); -MODULE_AUTHOR("Jens Axboe <axboe@suse.de>"); -MODULE_LICENSE("GPL"); - -module_init(pkt_init); -module_exit(pkt_exit); diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 19da5defd734..68bd2f7961b3 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -512,7 +512,7 @@ static void virtblk_free_disk(struct gendisk *disk) { struct virtio_blk *vblk = disk->private_data; - ida_simple_remove(&vd_index_ida, vblk->index); + ida_free(&vd_index_ida, vblk->index); mutex_destroy(&vblk->vdev_mutex); kfree(vblk); } @@ -902,8 +902,8 @@ static int virtblk_probe(struct virtio_device *vdev) return -EINVAL; } - err = ida_simple_get(&vd_index_ida, 0, minor_to_index(1 << MINORBITS), - GFP_KERNEL); + err = ida_alloc_range(&vd_index_ida, 0, + minor_to_index(1 << MINORBITS) - 1, GFP_KERNEL); if (err < 0) goto out; index = err; @@ -1163,7 +1163,7 @@ out_free_vq: out_free_vblk: kfree(vblk); out_free_index: - ida_simple_remove(&vd_index_ida, index); + ida_free(&vd_index_ida, index); out: return err; } diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 35b9bcad9db9..b28489290323 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -2129,7 +2129,6 @@ static void blkfront_closing(struct blkfront_info *info) if (info->rq && info->gd) { blk_mq_stop_hw_queues(info->rq); blk_mark_disk_dead(info->gd); - set_capacity(info->gd, 0); } for_each_rinfo(info, rinfo, i) { diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 99499d1f6e66..9f32901fdad1 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -160,7 +160,7 @@ static void read_moving(struct cache_set *c) moving_init(io); bio = &io->bio.bio; - bio_set_op_attrs(bio, REQ_OP_READ, 0); + bio->bi_opf = REQ_OP_READ; bio->bi_end_io = read_moving_endio; if (bch_bio_alloc_pages(bio, GFP_KERNEL)) diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 32e21ba64357..67a2e29e0b40 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -244,7 +244,7 @@ static void bch_data_insert_start(struct closure *cl) trace_bcache_cache_insert(k); bch_keylist_push(&op->insert_keys); - bio_set_op_attrs(n, REQ_OP_WRITE, 0); + n->bi_opf = REQ_OP_WRITE; bch_submit_bbio(n, op->c, k, 0); } while (n != bio); diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 0285b676e983..d4a5fc0650bb 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -434,7 +434,7 @@ static void write_dirty(struct closure *cl) */ if (KEY_DIRTY(&w->key)) { dirty_init(w); - bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0); + io->bio.bi_opf = REQ_OP_WRITE; io->bio.bi_iter.bi_sector = KEY_START(&w->key); bio_set_dev(&io->bio, io->dc->bdev); io->bio.bi_end_io = dirty_endio; @@ -547,7 +547,7 @@ static void read_dirty(struct cached_dev *dc) io->sequence = sequence++; dirty_init(w); - bio_set_op_attrs(&io->bio, REQ_OP_READ, 0); + io->bio.bi_opf = REQ_OP_READ; io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0); bio_set_dev(&io->bio, dc->disk.c->cache->bdev); io->bio.bi_end_io = read_dirty_endio; diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 078da18bb86d..8541d5688f3a 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1215,7 +1215,7 @@ static int dm_keyslot_evict_callback(struct dm_target *ti, struct dm_dev *dev, struct dm_keyslot_evict_args *args = data; int err; - err = blk_crypto_evict_key(bdev_get_queue(dev->bdev), args->key); + err = blk_crypto_evict_key(dev->bdev, args->key); if (!args->err) args->err = err; /* Always try to evict the key from all devices. */ diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index e76c96c760a9..c2b5a537f5b8 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -410,7 +410,7 @@ static void end_discard(struct discard_op *op, int r) * need to wait for the chain to complete. */ bio_chain(op->bio, op->parent_bio); - bio_set_op_attrs(op->bio, REQ_OP_DISCARD, 0); + op->bio->bi_opf = REQ_OP_DISCARD; submit_bio(op->bio); } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 95a1ee3d314e..e1ea3a7bd9d9 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -732,28 +732,48 @@ static char *_dm_claim_ptr = "I belong to device-mapper"; /* * Open a table device so we can use it as a map destination. */ -static int open_table_device(struct table_device *td, dev_t dev, - struct mapped_device *md) +static struct table_device *open_table_device(struct mapped_device *md, + dev_t dev, fmode_t mode) { + struct table_device *td; struct block_device *bdev; u64 part_off; int r; - BUG_ON(td->dm_dev.bdev); + td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id); + if (!td) + return ERR_PTR(-ENOMEM); + refcount_set(&td->count, 1); - bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr); - if (IS_ERR(bdev)) - return PTR_ERR(bdev); + bdev = blkdev_get_by_dev(dev, mode | FMODE_EXCL, _dm_claim_ptr); + if (IS_ERR(bdev)) { + r = PTR_ERR(bdev); + goto out_free_td; + } - r = bd_link_disk_holder(bdev, dm_disk(md)); - if (r) { - blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL); - return r; + /* + * We can be called before the dm disk is added. In that case we can't + * register the holder relation here. It will be done once add_disk was + * called. + */ + if (md->disk->slave_dir) { + r = bd_link_disk_holder(bdev, md->disk); + if (r) + goto out_blkdev_put; } + td->dm_dev.mode = mode; td->dm_dev.bdev = bdev; td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off, NULL, NULL); - return 0; + format_dev_t(td->dm_dev.name, dev); + list_add(&td->list, &md->table_devices); + return td; + +out_blkdev_put: + blkdev_put(bdev, mode | FMODE_EXCL); +out_free_td: + kfree(td); + return ERR_PTR(r); } /* @@ -761,14 +781,12 @@ static int open_table_device(struct table_device *td, dev_t dev, */ static void close_table_device(struct table_device *td, struct mapped_device *md) { - if (!td->dm_dev.bdev) - return; - - bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md)); + if (md->disk->slave_dir) + bd_unlink_disk_holder(td->dm_dev.bdev, md->disk); blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL); put_dax(td->dm_dev.dax_dev); - td->dm_dev.bdev = NULL; - td->dm_dev.dax_dev = NULL; + list_del(&td->list); + kfree(td); } static struct table_device *find_table_device(struct list_head *l, dev_t dev, @@ -786,31 +804,16 @@ static struct table_device *find_table_device(struct list_head *l, dev_t dev, int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, struct dm_dev **result) { - int r; struct table_device *td; mutex_lock(&md->table_devices_lock); td = find_table_device(&md->table_devices, dev, mode); if (!td) { - td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id); - if (!td) { + td = open_table_device(md, dev, mode); + if (IS_ERR(td)) { mutex_unlock(&md->table_devices_lock); - return -ENOMEM; + return PTR_ERR(td); } - - td->dm_dev.mode = mode; - td->dm_dev.bdev = NULL; - - if ((r = open_table_device(td, dev, md))) { - mutex_unlock(&md->table_devices_lock); - kfree(td); - return r; - } - - format_dev_t(td->dm_dev.name, dev); - - refcount_set(&td->count, 1); - list_add(&td->list, &md->table_devices); } else { refcount_inc(&td->count); } @@ -825,27 +828,11 @@ void dm_put_table_device(struct mapped_device *md, struct dm_dev *d) struct table_device *td = container_of(d, struct table_device, dm_dev); mutex_lock(&md->table_devices_lock); - if (refcount_dec_and_test(&td->count)) { + if (refcount_dec_and_test(&td->count)) close_table_device(td, md); - list_del(&td->list); - kfree(td); - } mutex_unlock(&md->table_devices_lock); } -static void free_table_devices(struct list_head *devices) -{ - struct list_head *tmp, *next; - - list_for_each_safe(tmp, next, devices) { - struct table_device *td = list_entry(tmp, struct table_device, list); - - DMWARN("dm_destroy: %s still exists with %d references", - td->dm_dev.name, refcount_read(&td->count)); - kfree(td); - } -} - /* * Get the geometry associated with a dm device */ @@ -1972,8 +1959,21 @@ static void cleanup_mapped_device(struct mapped_device *md) md->disk->private_data = NULL; spin_unlock(&_minor_lock); if (dm_get_md_type(md) != DM_TYPE_NONE) { + struct table_device *td; + dm_sysfs_exit(md); + list_for_each_entry(td, &md->table_devices, list) { + bd_unlink_disk_holder(td->dm_dev.bdev, + md->disk); + } + + /* + * Hold lock to make sure del_gendisk() won't concurrent + * with open/close_table_device(). + */ + mutex_lock(&md->table_devices_lock); del_gendisk(md->disk); + mutex_unlock(&md->table_devices_lock); } dm_queue_destroy_crypto_profile(md->queue); put_disk(md->disk); @@ -2122,7 +2122,7 @@ static void free_dev(struct mapped_device *md) cleanup_mapped_device(md); - free_table_devices(&md->table_devices); + WARN_ON_ONCE(!list_empty(&md->table_devices)); dm_stats_cleanup(&md->stats); free_minor(minor); @@ -2305,6 +2305,7 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) { enum dm_queue_mode type = dm_table_get_type(t); struct queue_limits limits; + struct table_device *td; int r; switch (type) { @@ -2333,17 +2334,40 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) if (r) return r; + /* + * Hold lock to make sure add_disk() and del_gendisk() won't concurrent + * with open_table_device() and close_table_device(). + */ + mutex_lock(&md->table_devices_lock); r = add_disk(md->disk); + mutex_unlock(&md->table_devices_lock); if (r) return r; - r = dm_sysfs_init(md); - if (r) { - del_gendisk(md->disk); - return r; + /* + * Register the holder relationship for devices added before the disk + * was live. + */ + list_for_each_entry(td, &md->table_devices, list) { + r = bd_link_disk_holder(td->dm_dev.bdev, md->disk); + if (r) + goto out_undo_holders; } + + r = dm_sysfs_init(md); + if (r) + goto out_undo_holders; + md->type = type; return 0; + +out_undo_holders: + list_for_each_entry_continue_reverse(td, &md->table_devices, list) + bd_unlink_disk_holder(td->dm_dev.bdev, md->disk); + mutex_lock(&md->table_devices_lock); + del_gendisk(md->disk); + mutex_unlock(&md->table_devices_lock); + return r; } struct mapped_device *dm_get_md(dev_t dev) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index bf6dffadbe6f..e7cc6ba1b657 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -486,7 +486,7 @@ void md_bitmap_print_sb(struct bitmap *bitmap) sb = kmap_atomic(bitmap->storage.sb_page); pr_debug("%s: bitmap file superblock:\n", bmname(bitmap)); pr_debug(" magic: %08x\n", le32_to_cpu(sb->magic)); - pr_debug(" version: %d\n", le32_to_cpu(sb->version)); + pr_debug(" version: %u\n", le32_to_cpu(sb->version)); pr_debug(" uuid: %08x.%08x.%08x.%08x\n", le32_to_cpu(*(__le32 *)(sb->uuid+0)), le32_to_cpu(*(__le32 *)(sb->uuid+4)), @@ -497,11 +497,11 @@ void md_bitmap_print_sb(struct bitmap *bitmap) pr_debug("events cleared: %llu\n", (unsigned long long) le64_to_cpu(sb->events_cleared)); pr_debug(" state: %08x\n", le32_to_cpu(sb->state)); - pr_debug(" chunksize: %d B\n", le32_to_cpu(sb->chunksize)); - pr_debug(" daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep)); + pr_debug(" chunksize: %u B\n", le32_to_cpu(sb->chunksize)); + pr_debug(" daemon sleep: %us\n", le32_to_cpu(sb->daemon_sleep)); pr_debug(" sync size: %llu KB\n", (unsigned long long)le64_to_cpu(sb->sync_size)/2); - pr_debug("max write behind: %d\n", le32_to_cpu(sb->write_behind)); + pr_debug("max write behind: %u\n", le32_to_cpu(sb->write_behind)); kunmap_atomic(sb); } @@ -2105,7 +2105,8 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, bytes = DIV_ROUND_UP(chunks, 8); if (!bitmap->mddev->bitmap_info.external) bytes += sizeof(bitmap_super_t); - } while (bytes > (space << 9)); + } while (bytes > (space << 9) && (chunkshift + BITMAP_BLOCK_SHIFT) < + (BITS_PER_BYTE * sizeof(((bitmap_super_t *)0)->chunksize) - 1)); } else chunkshift = ffz(~chunksize) - BITMAP_BLOCK_SHIFT; @@ -2150,7 +2151,7 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, bitmap->counts.missing_pages = pages; bitmap->counts.chunkshift = chunkshift; bitmap->counts.chunks = chunks; - bitmap->mddev->bitmap_info.chunksize = 1 << (chunkshift + + bitmap->mddev->bitmap_info.chunksize = 1UL << (chunkshift + BITMAP_BLOCK_SHIFT); blocks = min(old_counts.chunks << old_counts.chunkshift, @@ -2176,8 +2177,8 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, bitmap->counts.missing_pages = old_counts.pages; bitmap->counts.chunkshift = old_counts.chunkshift; bitmap->counts.chunks = old_counts.chunks; - bitmap->mddev->bitmap_info.chunksize = 1 << (old_counts.chunkshift + - BITMAP_BLOCK_SHIFT); + bitmap->mddev->bitmap_info.chunksize = + 1UL << (old_counts.chunkshift + BITMAP_BLOCK_SHIFT); blocks = old_counts.chunks << old_counts.chunkshift; pr_warn("Could not pre-allocate in-memory bitmap for cluster raid\n"); break; @@ -2195,20 +2196,23 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, if (set) { bmc_new = md_bitmap_get_counter(&bitmap->counts, block, &new_blocks, 1); - if (*bmc_new == 0) { - /* need to set on-disk bits too. */ - sector_t end = block + new_blocks; - sector_t start = block >> chunkshift; - start <<= chunkshift; - while (start < end) { - md_bitmap_file_set_bit(bitmap, block); - start += 1 << chunkshift; + if (bmc_new) { + if (*bmc_new == 0) { + /* need to set on-disk bits too. */ + sector_t end = block + new_blocks; + sector_t start = block >> chunkshift; + + start <<= chunkshift; + while (start < end) { + md_bitmap_file_set_bit(bitmap, block); + start += 1 << chunkshift; + } + *bmc_new = 2; + md_bitmap_count_page(&bitmap->counts, block, 1); + md_bitmap_set_pending(&bitmap->counts, block); } - *bmc_new = 2; - md_bitmap_count_page(&bitmap->counts, block, 1); - md_bitmap_set_pending(&bitmap->counts, block); + *bmc_new |= NEEDED_MASK; } - *bmc_new |= NEEDED_MASK; if (new_blocks < old_blocks) old_blocks = new_blocks; } @@ -2534,6 +2538,9 @@ chunksize_store(struct mddev *mddev, const char *buf, size_t len) if (csize < 512 || !is_power_of_2(csize)) return -EINVAL; + if (BITS_PER_LONG > 32 && csize >= (1ULL << (BITS_PER_BYTE * + sizeof(((bitmap_super_t *)0)->chunksize)))) + return -EOVERFLOW; mddev->bitmap_info.chunksize = csize; return len; } diff --git a/drivers/md/md.c b/drivers/md/md.c index a467b492d4ad..775f1dde190a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -93,6 +93,18 @@ static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *this); static void mddev_detach(struct mddev *mddev); +enum md_ro_state { + MD_RDWR, + MD_RDONLY, + MD_AUTO_READ, + MD_MAX_STATE +}; + +static bool md_is_rdwr(struct mddev *mddev) +{ + return (mddev->ro == MD_RDWR); +} + /* * Default number of read corrections we'll attempt on an rdev * before ejecting it from the array. We divide the read error @@ -444,7 +456,7 @@ static void md_submit_bio(struct bio *bio) bio = bio_split_to_limits(bio); - if (mddev->ro == 1 && unlikely(rw == WRITE)) { + if (mddev->ro == MD_RDONLY && unlikely(rw == WRITE)) { if (bio_sectors(bio) != 0) bio->bi_status = BLK_STS_IOERR; bio_endio(bio); @@ -509,13 +521,14 @@ static void md_end_flush(struct bio *bio) struct md_rdev *rdev = bio->bi_private; struct mddev *mddev = rdev->mddev; + bio_put(bio); + rdev_dec_pending(rdev, mddev); if (atomic_dec_and_test(&mddev->flush_pending)) { /* The pre-request flush has finished */ queue_work(md_wq, &mddev->flush_work); } - bio_put(bio); } static void md_submit_flush_data(struct work_struct *ws); @@ -913,10 +926,12 @@ static void super_written(struct bio *bio) } else clear_bit(LastDev, &rdev->flags); + bio_put(bio); + + rdev_dec_pending(rdev, mddev); + if (atomic_dec_and_test(&mddev->pending_writes)) wake_up(&mddev->sb_wait); - rdev_dec_pending(rdev, mddev); - bio_put(bio); } void md_super_write(struct mddev *mddev, struct md_rdev *rdev, @@ -2453,7 +2468,22 @@ static void rdev_delayed_delete(struct work_struct *ws) kobject_put(&rdev->kobj); } -static void unbind_rdev_from_array(struct md_rdev *rdev) +void md_autodetect_dev(dev_t dev); + +static void export_rdev(struct md_rdev *rdev) +{ + pr_debug("md: export_rdev(%pg)\n", rdev->bdev); + md_rdev_clear(rdev); +#ifndef MODULE + if (test_bit(AutoDetected, &rdev->flags)) + md_autodetect_dev(rdev->bdev->bd_dev); +#endif + blkdev_put(rdev->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + rdev->bdev = NULL; + kobject_put(&rdev->kobj); +} + +static void md_kick_rdev_from_array(struct md_rdev *rdev) { bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); list_del_rcu(&rdev->same_set); @@ -2476,56 +2506,8 @@ static void unbind_rdev_from_array(struct md_rdev *rdev) INIT_WORK(&rdev->del_work, rdev_delayed_delete); kobject_get(&rdev->kobj); queue_work(md_rdev_misc_wq, &rdev->del_work); -} - -/* - * prevent the device from being mounted, repartitioned or - * otherwise reused by a RAID array (or any other kernel - * subsystem), by bd_claiming the device. - */ -static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared) -{ - int err = 0; - struct block_device *bdev; - - bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, - shared ? (struct md_rdev *)lock_rdev : rdev); - if (IS_ERR(bdev)) { - pr_warn("md: could not open device unknown-block(%u,%u).\n", - MAJOR(dev), MINOR(dev)); - return PTR_ERR(bdev); - } - rdev->bdev = bdev; - return err; -} - -static void unlock_rdev(struct md_rdev *rdev) -{ - struct block_device *bdev = rdev->bdev; - rdev->bdev = NULL; - blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); -} - -void md_autodetect_dev(dev_t dev); - -static void export_rdev(struct md_rdev *rdev) -{ - pr_debug("md: export_rdev(%pg)\n", rdev->bdev); - md_rdev_clear(rdev); -#ifndef MODULE - if (test_bit(AutoDetected, &rdev->flags)) - md_autodetect_dev(rdev->bdev->bd_dev); -#endif - unlock_rdev(rdev); - kobject_put(&rdev->kobj); -} - -void md_kick_rdev_from_array(struct md_rdev *rdev) -{ - unbind_rdev_from_array(rdev); export_rdev(rdev); } -EXPORT_SYMBOL_GPL(md_kick_rdev_from_array); static void export_array(struct mddev *mddev) { @@ -2639,7 +2621,7 @@ void md_update_sb(struct mddev *mddev, int force_change) int any_badblocks_changed = 0; int ret = -1; - if (mddev->ro) { + if (!md_is_rdwr(mddev)) { if (force_change) set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); return; @@ -3660,9 +3642,10 @@ EXPORT_SYMBOL_GPL(md_rdev_init); */ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) { - int err; + static struct md_rdev *claim_rdev; /* just for claiming the bdev */ struct md_rdev *rdev; sector_t size; + int err; rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); if (!rdev) @@ -3670,14 +3653,20 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe err = md_rdev_init(rdev); if (err) - goto abort_free; + goto out_free_rdev; err = alloc_disk_sb(rdev); if (err) - goto abort_free; + goto out_clear_rdev; - err = lock_rdev(rdev, newdev, super_format == -2); - if (err) - goto abort_free; + rdev->bdev = blkdev_get_by_dev(newdev, + FMODE_READ | FMODE_WRITE | FMODE_EXCL, + super_format == -2 ? claim_rdev : rdev); + if (IS_ERR(rdev->bdev)) { + pr_warn("md: could not open device unknown-block(%u,%u).\n", + MAJOR(newdev), MINOR(newdev)); + err = PTR_ERR(rdev->bdev); + goto out_clear_rdev; + } kobject_init(&rdev->kobj, &rdev_ktype); @@ -3686,7 +3675,7 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe pr_warn("md: %pg has zero or unknown size, marking faulty!\n", rdev->bdev); err = -EINVAL; - goto abort_free; + goto out_blkdev_put; } if (super_format >= 0) { @@ -3696,21 +3685,22 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe pr_warn("md: %pg does not have a valid v%d.%d superblock, not importing!\n", rdev->bdev, super_format, super_minor); - goto abort_free; + goto out_blkdev_put; } if (err < 0) { pr_warn("md: could not read %pg's sb, not importing!\n", rdev->bdev); - goto abort_free; + goto out_blkdev_put; } } return rdev; -abort_free: - if (rdev->bdev) - unlock_rdev(rdev); +out_blkdev_put: + blkdev_put(rdev->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); +out_clear_rdev: md_rdev_clear(rdev); +out_free_rdev: kfree(rdev); return ERR_PTR(err); } @@ -3901,7 +3891,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) goto out_unlock; } rv = -EROFS; - if (mddev->ro) + if (!md_is_rdwr(mddev)) goto out_unlock; /* request to change the personality. Need to ensure: @@ -4107,7 +4097,7 @@ layout_store(struct mddev *mddev, const char *buf, size_t len) if (mddev->pers) { if (mddev->pers->check_reshape == NULL) err = -EBUSY; - else if (mddev->ro) + else if (!md_is_rdwr(mddev)) err = -EROFS; else { mddev->new_layout = n; @@ -4216,7 +4206,7 @@ chunk_size_store(struct mddev *mddev, const char *buf, size_t len) if (mddev->pers) { if (mddev->pers->check_reshape == NULL) err = -EBUSY; - else if (mddev->ro) + else if (!md_is_rdwr(mddev)) err = -EROFS; else { mddev->new_chunk_sectors = n >> 9; @@ -4339,13 +4329,13 @@ array_state_show(struct mddev *mddev, char *page) if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) { switch(mddev->ro) { - case 1: + case MD_RDONLY: st = readonly; break; - case 2: + case MD_AUTO_READ: st = read_auto; break; - case 0: + case MD_RDWR: spin_lock(&mddev->lock); if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) st = write_pending; @@ -4381,7 +4371,8 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) int err = 0; enum array_state st = match_word(buf, array_states); - if (mddev->pers && (st == active || st == clean) && mddev->ro != 1) { + if (mddev->pers && (st == active || st == clean) && + mddev->ro != MD_RDONLY) { /* don't take reconfig_mutex when toggling between * clean and active */ @@ -4425,23 +4416,23 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) if (mddev->pers) err = md_set_readonly(mddev, NULL); else { - mddev->ro = 1; + mddev->ro = MD_RDONLY; set_disk_ro(mddev->gendisk, 1); err = do_md_run(mddev); } break; case read_auto: if (mddev->pers) { - if (mddev->ro == 0) + if (md_is_rdwr(mddev)) err = md_set_readonly(mddev, NULL); - else if (mddev->ro == 1) + else if (mddev->ro == MD_RDONLY) err = restart_array(mddev); if (err == 0) { - mddev->ro = 2; + mddev->ro = MD_AUTO_READ; set_disk_ro(mddev->gendisk, 0); } } else { - mddev->ro = 2; + mddev->ro = MD_AUTO_READ; err = do_md_run(mddev); } break; @@ -4466,7 +4457,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) wake_up(&mddev->sb_wait); err = 0; } else { - mddev->ro = 0; + mddev->ro = MD_RDWR; set_disk_ro(mddev->gendisk, 0); err = do_md_run(mddev); } @@ -4765,7 +4756,7 @@ action_show(struct mddev *mddev, char *page) if (test_bit(MD_RECOVERY_FROZEN, &recovery)) type = "frozen"; else if (test_bit(MD_RECOVERY_RUNNING, &recovery) || - (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery))) { + (md_is_rdwr(mddev) && test_bit(MD_RECOVERY_NEEDED, &recovery))) { if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) type = "reshape"; else if (test_bit(MD_RECOVERY_SYNC, &recovery)) { @@ -4851,11 +4842,11 @@ action_store(struct mddev *mddev, const char *page, size_t len) set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); set_bit(MD_RECOVERY_SYNC, &mddev->recovery); } - if (mddev->ro == 2) { + if (mddev->ro == MD_AUTO_READ) { /* A write to sync_action is enough to justify * canceling read-auto mode */ - mddev->ro = 0; + mddev->ro = MD_RDWR; md_wakeup_thread(mddev->sync_thread); } set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); @@ -5083,8 +5074,7 @@ max_sync_store(struct mddev *mddev, const char *buf, size_t len) goto out_unlock; err = -EBUSY; - if (max < mddev->resync_max && - mddev->ro == 0 && + if (max < mddev->resync_max && md_is_rdwr(mddev) && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) goto out_unlock; @@ -5813,8 +5803,8 @@ int md_run(struct mddev *mddev) continue; sync_blockdev(rdev->bdev); invalidate_bdev(rdev->bdev); - if (mddev->ro != 1 && rdev_read_only(rdev)) { - mddev->ro = 1; + if (mddev->ro != MD_RDONLY && rdev_read_only(rdev)) { + mddev->ro = MD_RDONLY; if (mddev->gendisk) set_disk_ro(mddev->gendisk, 1); } @@ -5917,8 +5907,8 @@ int md_run(struct mddev *mddev) mddev->ok_start_degraded = start_dirty_degraded; - if (start_readonly && mddev->ro == 0) - mddev->ro = 2; /* read-only, but switch on first write */ + if (start_readonly && md_is_rdwr(mddev)) + mddev->ro = MD_AUTO_READ; /* read-only, but switch on first write */ err = pers->run(mddev); if (err) @@ -5996,8 +5986,8 @@ int md_run(struct mddev *mddev) mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); - } else if (mddev->ro == 2) /* auto-readonly not meaningful */ - mddev->ro = 0; + } else if (mddev->ro == MD_AUTO_READ) + mddev->ro = MD_RDWR; atomic_set(&mddev->max_corr_read_errors, MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); @@ -6015,7 +6005,7 @@ int md_run(struct mddev *mddev) if (rdev->raid_disk >= 0) sysfs_link_rdev(mddev, rdev); /* failure here is OK */ - if (mddev->degraded && !mddev->ro) + if (mddev->degraded && md_is_rdwr(mddev)) /* This ensures that recovering status is reported immediately * via sysfs - until a lack of spares is confirmed. */ @@ -6105,7 +6095,7 @@ static int restart_array(struct mddev *mddev) return -ENXIO; if (!mddev->pers) return -EINVAL; - if (!mddev->ro) + if (md_is_rdwr(mddev)) return -EBUSY; rcu_read_lock(); @@ -6124,7 +6114,7 @@ static int restart_array(struct mddev *mddev) return -EROFS; mddev->safemode = 0; - mddev->ro = 0; + mddev->ro = MD_RDWR; set_disk_ro(disk, 0); pr_debug("md: %s switched to read-write mode.\n", mdname(mddev)); /* Kick recovery or resync if necessary */ @@ -6151,7 +6141,7 @@ static void md_clean(struct mddev *mddev) mddev->clevel[0] = 0; mddev->flags = 0; mddev->sb_flags = 0; - mddev->ro = 0; + mddev->ro = MD_RDWR; mddev->metadata_type[0] = 0; mddev->chunk_sectors = 0; mddev->ctime = mddev->utime = 0; @@ -6203,7 +6193,7 @@ static void __md_stop_writes(struct mddev *mddev) } md_bitmap_flush(mddev); - if (mddev->ro == 0 && + if (md_is_rdwr(mddev) && ((!mddev->in_sync && !mddev_is_clustered(mddev)) || mddev->sb_flags)) { /* mark array as shutdown cleanly */ @@ -6312,9 +6302,9 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) __md_stop_writes(mddev); err = -ENXIO; - if (mddev->ro==1) + if (mddev->ro == MD_RDONLY) goto out; - mddev->ro = 1; + mddev->ro = MD_RDONLY; set_disk_ro(mddev->gendisk, 1); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); @@ -6371,7 +6361,7 @@ static int do_md_stop(struct mddev *mddev, int mode, return -EBUSY; } if (mddev->pers) { - if (mddev->ro) + if (!md_is_rdwr(mddev)) set_disk_ro(disk, 0); __md_stop_writes(mddev); @@ -6388,8 +6378,8 @@ static int do_md_stop(struct mddev *mddev, int mode, mutex_unlock(&mddev->open_mutex); mddev->changed = 1; - if (mddev->ro) - mddev->ro = 0; + if (!md_is_rdwr(mddev)) + mddev->ro = MD_RDWR; } else mutex_unlock(&mddev->open_mutex); /* @@ -7204,7 +7194,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors) if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || mddev->sync_thread) return -EBUSY; - if (mddev->ro) + if (!md_is_rdwr(mddev)) return -EROFS; rdev_for_each(rdev, mddev) { @@ -7234,7 +7224,7 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks) /* change the number of raid disks */ if (mddev->pers->check_reshape == NULL) return -EINVAL; - if (mddev->ro) + if (!md_is_rdwr(mddev)) return -EROFS; if (raid_disks <= 0 || (mddev->max_disks && raid_disks >= mddev->max_disks)) @@ -7464,6 +7454,40 @@ static inline bool md_ioctl_valid(unsigned int cmd) } } +static int __md_set_array_info(struct mddev *mddev, void __user *argp) +{ + mdu_array_info_t info; + int err; + + if (!argp) + memset(&info, 0, sizeof(info)); + else if (copy_from_user(&info, argp, sizeof(info))) + return -EFAULT; + + if (mddev->pers) { + err = update_array_info(mddev, &info); + if (err) + pr_warn("md: couldn't update array info. %d\n", err); + return err; + } + + if (!list_empty(&mddev->disks)) { + pr_warn("md: array %s already has disks!\n", mdname(mddev)); + return -EBUSY; + } + + if (mddev->raid_disks) { + pr_warn("md: array %s already initialised!\n", mdname(mddev)); + return -EBUSY; + } + + err = md_set_array_info(mddev, &info); + if (err) + pr_warn("md: couldn't set array info. %d\n", err); + + return err; +} + static int md_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { @@ -7569,36 +7593,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, } if (cmd == SET_ARRAY_INFO) { - mdu_array_info_t info; - if (!arg) - memset(&info, 0, sizeof(info)); - else if (copy_from_user(&info, argp, sizeof(info))) { - err = -EFAULT; - goto unlock; - } - if (mddev->pers) { - err = update_array_info(mddev, &info); - if (err) { - pr_warn("md: couldn't update array info. %d\n", err); - goto unlock; - } - goto unlock; - } - if (!list_empty(&mddev->disks)) { - pr_warn("md: array %s already has disks!\n", mdname(mddev)); - err = -EBUSY; - goto unlock; - } - if (mddev->raid_disks) { - pr_warn("md: array %s already initialised!\n", mdname(mddev)); - err = -EBUSY; - goto unlock; - } - err = md_set_array_info(mddev, &info); - if (err) { - pr_warn("md: couldn't set array info. %d\n", err); - goto unlock; - } + err = __md_set_array_info(mddev, argp); goto unlock; } @@ -7658,26 +7653,25 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, * The remaining ioctls are changing the state of the * superblock, so we do not allow them on read-only arrays. */ - if (mddev->ro && mddev->pers) { - if (mddev->ro == 2) { - mddev->ro = 0; - sysfs_notify_dirent_safe(mddev->sysfs_state); - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - /* mddev_unlock will wake thread */ - /* If a device failed while we were read-only, we - * need to make sure the metadata is updated now. - */ - if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) { - mddev_unlock(mddev); - wait_event(mddev->sb_wait, - !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) && - !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); - mddev_lock_nointr(mddev); - } - } else { + if (!md_is_rdwr(mddev) && mddev->pers) { + if (mddev->ro != MD_AUTO_READ) { err = -EROFS; goto unlock; } + mddev->ro = MD_RDWR; + sysfs_notify_dirent_safe(mddev->sysfs_state); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + /* mddev_unlock will wake thread */ + /* If a device failed while we were read-only, we + * need to make sure the metadata is updated now. + */ + if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) { + mddev_unlock(mddev); + wait_event(mddev->sb_wait, + !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) && + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); + mddev_lock_nointr(mddev); + } } switch (cmd) { @@ -7763,11 +7757,11 @@ static int md_set_read_only(struct block_device *bdev, bool ro) * Transitioning to read-auto need only happen for arrays that call * md_write_start and which are not ready for writes yet. */ - if (!ro && mddev->ro == 1 && mddev->pers) { + if (!ro && mddev->ro == MD_RDONLY && mddev->pers) { err = restart_array(mddev); if (err) goto out_unlock; - mddev->ro = 2; + mddev->ro = MD_AUTO_READ; } out_unlock: @@ -8241,9 +8235,9 @@ static int md_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%s : %sactive", mdname(mddev), mddev->pers ? "" : "in"); if (mddev->pers) { - if (mddev->ro==1) + if (mddev->ro == MD_RDONLY) seq_printf(seq, " (read-only)"); - if (mddev->ro==2) + if (mddev->ro == MD_AUTO_READ) seq_printf(seq, " (auto-read-only)"); seq_printf(seq, " %s", mddev->pers->name); } @@ -8502,10 +8496,10 @@ bool md_write_start(struct mddev *mddev, struct bio *bi) if (bio_data_dir(bi) != WRITE) return true; - BUG_ON(mddev->ro == 1); - if (mddev->ro == 2) { + BUG_ON(mddev->ro == MD_RDONLY); + if (mddev->ro == MD_AUTO_READ) { /* need to switch to read/write */ - mddev->ro = 0; + mddev->ro = MD_RDWR; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); @@ -8556,7 +8550,7 @@ void md_write_inc(struct mddev *mddev, struct bio *bi) { if (bio_data_dir(bi) != WRITE) return; - WARN_ON_ONCE(mddev->in_sync || mddev->ro); + WARN_ON_ONCE(mddev->in_sync || !md_is_rdwr(mddev)); percpu_ref_get(&mddev->writes_pending); } EXPORT_SYMBOL(md_write_inc); @@ -8661,7 +8655,7 @@ void md_allow_write(struct mddev *mddev) { if (!mddev->pers) return; - if (mddev->ro) + if (!md_is_rdwr(mddev)) return; if (!mddev->pers->sync_request) return; @@ -8709,7 +8703,7 @@ void md_do_sync(struct md_thread *thread) if (test_bit(MD_RECOVERY_DONE, &mddev->recovery) || test_bit(MD_RECOVERY_WAIT, &mddev->recovery)) return; - if (mddev->ro) {/* never try to sync a read-only array */ + if (!md_is_rdwr(mddev)) {/* never try to sync a read-only array */ set_bit(MD_RECOVERY_INTR, &mddev->recovery); return; } @@ -9178,9 +9172,9 @@ static int remove_and_add_spares(struct mddev *mddev, if (test_bit(Faulty, &rdev->flags)) continue; if (!test_bit(Journal, &rdev->flags)) { - if (mddev->ro && - ! (rdev->saved_raid_disk >= 0 && - !test_bit(Bitmap_sync, &rdev->flags))) + if (!md_is_rdwr(mddev) && + !(rdev->saved_raid_disk >= 0 && + !test_bit(Bitmap_sync, &rdev->flags))) continue; rdev->recovery_offset = 0; @@ -9278,7 +9272,8 @@ void md_check_recovery(struct mddev *mddev) flush_signals(current); } - if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) + if (!md_is_rdwr(mddev) && + !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) return; if ( ! ( (mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) || @@ -9297,7 +9292,7 @@ void md_check_recovery(struct mddev *mddev) if (!mddev->external && mddev->safemode == 1) mddev->safemode = 0; - if (mddev->ro) { + if (!md_is_rdwr(mddev)) { struct md_rdev *rdev; if (!mddev->external && mddev->in_sync) /* 'Blocked' flag not needed as failed devices diff --git a/drivers/md/md.h b/drivers/md/md.h index b4e2d8b87b61..554a9026669a 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -782,7 +782,6 @@ extern void mddev_resume(struct mddev *mddev); extern void md_reload_sb(struct mddev *mddev, int raid_disk); extern void md_update_sb(struct mddev *mddev, int force); -extern void md_kick_rdev_from_array(struct md_rdev * rdev); extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, bool is_suspend); extern void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 857c49399c28..b536befd8898 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -398,7 +398,6 @@ static int raid0_run(struct mddev *mddev) blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); blk_queue_max_write_zeroes_sectors(mddev->queue, mddev->chunk_sectors); - blk_queue_max_discard_sectors(mddev->queue, UINT_MAX); blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); blk_queue_io_opt(mddev->queue, diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 05d8438cfec8..68a9e2d9985b 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1321,7 +1321,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, read_bio->bi_iter.bi_sector = r1_bio->sector + mirror->rdev->data_offset; read_bio->bi_end_io = raid1_end_read_request; - bio_set_op_attrs(read_bio, op, do_sync); + read_bio->bi_opf = op | do_sync; if (test_bit(FailFast, &mirror->rdev->flags) && test_bit(R1BIO_FailFast, &r1_bio->state)) read_bio->bi_opf |= MD_FAILFAST; @@ -2254,7 +2254,7 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) continue; } - bio_set_op_attrs(wbio, REQ_OP_WRITE, 0); + wbio->bi_opf = REQ_OP_WRITE; if (test_bit(FailFast, &conf->mirrors[i].rdev->flags)) wbio->bi_opf |= MD_FAILFAST; @@ -2419,7 +2419,7 @@ static int narrow_write_error(struct r1bio *r1_bio, int i) GFP_NOIO, &mddev->bio_set); } - bio_set_op_attrs(wbio, REQ_OP_WRITE, 0); + wbio->bi_opf = REQ_OP_WRITE; wbio->bi_iter.bi_sector = r1_bio->sector; wbio->bi_iter.bi_size = r1_bio->sectors << 9; @@ -2770,7 +2770,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, if (i < conf->raid_disks) still_degraded = 1; } else if (!test_bit(In_sync, &rdev->flags)) { - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + bio->bi_opf = REQ_OP_WRITE; bio->bi_end_io = end_sync_write; write_targets ++; } else { @@ -2797,7 +2797,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, if (disk < 0) disk = i; } - bio_set_op_attrs(bio, REQ_OP_READ, 0); + bio->bi_opf = REQ_OP_READ; bio->bi_end_io = end_sync_read; read_targets++; } else if (!test_bit(WriteErrorSeen, &rdev->flags) && @@ -2809,7 +2809,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, * if we are doing resync or repair. Otherwise, leave * this device alone for this sync request. */ - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + bio->bi_opf = REQ_OP_WRITE; bio->bi_end_io = end_sync_write; write_targets++; } @@ -3159,6 +3159,7 @@ static int raid1_run(struct mddev *mddev) * RAID1 needs at least one disk in active */ if (conf->raid_disks - mddev->degraded < 1) { + md_unregister_thread(&conf->thread); ret = -EINVAL; goto abort; } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 3aa8b6e11d58..6c66357f92f5 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1254,7 +1254,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr + choose_data_offset(r10_bio, rdev); read_bio->bi_end_io = raid10_end_read_request; - bio_set_op_attrs(read_bio, op, do_sync); + read_bio->bi_opf = op | do_sync; if (test_bit(FailFast, &rdev->flags) && test_bit(R10BIO_FailFast, &r10_bio->state)) read_bio->bi_opf |= MD_FAILFAST; @@ -1301,7 +1301,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr + choose_data_offset(r10_bio, rdev)); mbio->bi_end_io = raid10_end_write_request; - bio_set_op_attrs(mbio, op, do_sync | do_fua); + mbio->bi_opf = op | do_sync | do_fua; if (!replacement && test_bit(FailFast, &conf->mirrors[devnum].rdev->flags) && enough(conf, devnum)) @@ -2933,7 +2933,7 @@ static int narrow_write_error(struct r10bio *r10_bio, int i) wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector); wbio->bi_iter.bi_sector = wsector + choose_data_offset(r10_bio, rdev); - bio_set_op_attrs(wbio, REQ_OP_WRITE, 0); + wbio->bi_opf = REQ_OP_WRITE; if (submit_bio_wait(wbio) < 0) /* Failure! */ @@ -3542,7 +3542,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, bio->bi_next = biolist; biolist = bio; bio->bi_end_io = end_sync_read; - bio_set_op_attrs(bio, REQ_OP_READ, 0); + bio->bi_opf = REQ_OP_READ; if (test_bit(FailFast, &rdev->flags)) bio->bi_opf |= MD_FAILFAST; from_addr = r10_bio->devs[j].addr; @@ -3567,7 +3567,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, bio->bi_next = biolist; biolist = bio; bio->bi_end_io = end_sync_write; - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + bio->bi_opf = REQ_OP_WRITE; bio->bi_iter.bi_sector = to_addr + mrdev->data_offset; bio_set_dev(bio, mrdev->bdev); @@ -3588,7 +3588,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, bio->bi_next = biolist; biolist = bio; bio->bi_end_io = end_sync_write; - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + bio->bi_opf = REQ_OP_WRITE; bio->bi_iter.bi_sector = to_addr + mreplace->data_offset; bio_set_dev(bio, mreplace->bdev); @@ -3742,7 +3742,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, bio->bi_next = biolist; biolist = bio; bio->bi_end_io = end_sync_read; - bio_set_op_attrs(bio, REQ_OP_READ, 0); + bio->bi_opf = REQ_OP_READ; if (test_bit(FailFast, &rdev->flags)) bio->bi_opf |= MD_FAILFAST; bio->bi_iter.bi_sector = sector + rdev->data_offset; @@ -3764,7 +3764,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, bio->bi_next = biolist; biolist = bio; bio->bi_end_io = end_sync_write; - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + bio->bi_opf = REQ_OP_WRITE; if (test_bit(FailFast, &rdev->flags)) bio->bi_opf |= MD_FAILFAST; bio->bi_iter.bi_sector = sector + rdev->data_offset; @@ -4145,8 +4145,6 @@ static int raid10_run(struct mddev *mddev) conf->thread = NULL; if (mddev->queue) { - blk_queue_max_discard_sectors(mddev->queue, - UINT_MAX); blk_queue_max_write_zeroes_sectors(mddev->queue, 0); blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); raid10_set_io_opt(conf); @@ -4972,7 +4970,7 @@ read_more: b->bi_iter.bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset; b->bi_end_io = end_reshape_write; - bio_set_op_attrs(b, REQ_OP_WRITE, 0); + b->bi_opf = REQ_OP_WRITE; b->bi_next = blist; blist = b; } diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 832d8566e165..46182b955aef 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -1565,11 +1565,12 @@ void r5l_wake_reclaim(struct r5l_log *log, sector_t space) if (!log) return; + + target = READ_ONCE(log->reclaim_target); do { - target = log->reclaim_target; if (new < target) return; - } while (cmpxchg(&log->reclaim_target, target, new) != target); + } while (!try_cmpxchg(&log->reclaim_target, &target, new)); md_wakeup_thread(log->reclaim_thread); } @@ -3061,7 +3062,6 @@ void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev) int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) { - struct request_queue *q = bdev_get_queue(rdev->bdev); struct r5l_log *log; int ret; @@ -3090,9 +3090,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) if (!log) return -ENOMEM; log->rdev = rdev; - - log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0; - + log->need_cache_flush = bdev_write_cache(rdev->bdev); log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid, sizeof(rdev->mddev->uuid)); diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index 31b9157bc9ae..e495939bb3e0 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -1301,8 +1301,6 @@ static int ppl_validate_rdev(struct md_rdev *rdev) static void ppl_init_child_log(struct ppl_log *log, struct md_rdev *rdev) { - struct request_queue *q; - if ((rdev->ppl.size << 9) >= (PPL_SPACE_SIZE + PPL_HEADER_SIZE) * 2) { log->use_multippl = true; @@ -1316,8 +1314,7 @@ static void ppl_init_child_log(struct ppl_log *log, struct md_rdev *rdev) } log->next_io_sector = rdev->ppl.sector; - q = bdev_get_queue(rdev->bdev); - if (test_bit(QUEUE_FLAG_WC, &q->queue_flags)) + if (bdev_write_cache(rdev->bdev)) log->wb_cache_on = true; } diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index ff8b083dc5c6..e36aeb50b4ed 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -763,7 +763,7 @@ static blk_status_t apple_nvme_queue_rq(struct blk_mq_hw_ctx *hctx, goto out_free_cmd; } - blk_mq_start_request(req); + nvme_start_request(req); apple_nvme_submit_cmd(q, cmnd); return BLK_STS_OK; @@ -821,7 +821,7 @@ static void apple_nvme_disable(struct apple_nvme *anv, bool shutdown) if (!dead && shutdown && freeze) nvme_wait_freeze_timeout(&anv->ctrl, NVME_IO_TIMEOUT); - nvme_stop_queues(&anv->ctrl); + nvme_quiesce_io_queues(&anv->ctrl); if (!dead) { if (READ_ONCE(anv->ioq.enabled)) { @@ -829,15 +829,13 @@ static void apple_nvme_disable(struct apple_nvme *anv, bool shutdown) apple_nvme_remove_cq(anv); } - if (shutdown) - nvme_shutdown_ctrl(&anv->ctrl); - nvme_disable_ctrl(&anv->ctrl); + nvme_disable_ctrl(&anv->ctrl, shutdown); } WRITE_ONCE(anv->ioq.enabled, false); WRITE_ONCE(anv->adminq.enabled, false); mb(); /* ensure that nvme_queue_rq() sees that enabled is cleared */ - nvme_stop_admin_queue(&anv->ctrl); + nvme_quiesce_admin_queue(&anv->ctrl); /* last chance to complete any requests before nvme_cancel_request */ spin_lock_irqsave(&anv->lock, flags); @@ -854,8 +852,8 @@ static void apple_nvme_disable(struct apple_nvme *anv, bool shutdown) * deadlocking blk-mq hot-cpu notifier. */ if (shutdown) { - nvme_start_queues(&anv->ctrl); - nvme_start_admin_queue(&anv->ctrl); + nvme_unquiesce_io_queues(&anv->ctrl); + nvme_unquiesce_admin_queue(&anv->ctrl); } } @@ -1093,7 +1091,7 @@ static void apple_nvme_reset_work(struct work_struct *work) dev_dbg(anv->dev, "Starting admin queue"); apple_nvme_init_queue(&anv->adminq); - nvme_start_admin_queue(&anv->ctrl); + nvme_unquiesce_admin_queue(&anv->ctrl); if (!nvme_change_ctrl_state(&anv->ctrl, NVME_CTRL_CONNECTING)) { dev_warn(anv->ctrl.device, @@ -1102,7 +1100,7 @@ static void apple_nvme_reset_work(struct work_struct *work) goto out; } - ret = nvme_init_ctrl_finish(&anv->ctrl); + ret = nvme_init_ctrl_finish(&anv->ctrl, false); if (ret) goto out; @@ -1127,7 +1125,7 @@ static void apple_nvme_reset_work(struct work_struct *work) anv->ctrl.queue_count = nr_io_queues + 1; - nvme_start_queues(&anv->ctrl); + nvme_unquiesce_io_queues(&anv->ctrl); nvme_wait_freeze(&anv->ctrl); blk_mq_update_nr_hw_queues(&anv->tagset, 1); nvme_unfreeze(&anv->ctrl); @@ -1153,7 +1151,7 @@ out: nvme_change_ctrl_state(&anv->ctrl, NVME_CTRL_DELETING); nvme_get_ctrl(&anv->ctrl); apple_nvme_disable(anv, false); - nvme_kill_queues(&anv->ctrl); + nvme_mark_namespaces_dead(&anv->ctrl); if (!queue_work(nvme_wq, &anv->remove_work)) nvme_put_ctrl(&anv->ctrl); } @@ -1507,14 +1505,6 @@ static int apple_nvme_probe(struct platform_device *pdev) goto put_dev; } - if (!blk_get_queue(anv->ctrl.admin_q)) { - nvme_start_admin_queue(&anv->ctrl); - blk_mq_destroy_queue(anv->ctrl.admin_q); - anv->ctrl.admin_q = NULL; - ret = -ENODEV; - goto put_dev; - } - nvme_reset_ctrl(&anv->ctrl); async_schedule(apple_nvme_async_probe, anv); diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index c8a6db7c4498..bb0abbe4491c 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -13,6 +13,10 @@ #include "fabrics.h" #include <linux/nvme-auth.h> +#define CHAP_BUF_SIZE 4096 +static struct kmem_cache *nvme_chap_buf_cache; +static mempool_t *nvme_chap_buf_pool; + struct nvme_dhchap_queue_context { struct list_head entry; struct work_struct auth_work; @@ -20,7 +24,6 @@ struct nvme_dhchap_queue_context { struct crypto_shash *shash_tfm; struct crypto_kpp *dh_tfm; void *buf; - size_t buf_size; int qid; int error; u32 s1; @@ -47,6 +50,12 @@ struct nvme_dhchap_queue_context { #define nvme_auth_queue_from_qid(ctrl, qid) \ (qid == 0) ? (ctrl)->fabrics_q : (ctrl)->connect_q +static inline int ctrl_max_dhchaps(struct nvme_ctrl *ctrl) +{ + return ctrl->opts->nr_io_queues + ctrl->opts->nr_write_queues + + ctrl->opts->nr_poll_queues + 1; +} + static int nvme_auth_submit(struct nvme_ctrl *ctrl, int qid, void *data, size_t data_len, bool auth_send) { @@ -112,7 +121,7 @@ static int nvme_auth_set_dhchap_negotiate_data(struct nvme_ctrl *ctrl, struct nvmf_auth_dhchap_negotiate_data *data = chap->buf; size_t size = sizeof(*data) + sizeof(union nvmf_auth_protocol); - if (chap->buf_size < size) { + if (size > CHAP_BUF_SIZE) { chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; return -EINVAL; } @@ -147,7 +156,7 @@ static int nvme_auth_process_dhchap_challenge(struct nvme_ctrl *ctrl, const char *gid_name = nvme_auth_dhgroup_name(data->dhgid); const char *hmac_name, *kpp_name; - if (chap->buf_size < size) { + if (size > CHAP_BUF_SIZE) { chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; return NVME_SC_INVALID_FIELD; } @@ -197,12 +206,6 @@ static int nvme_auth_process_dhchap_challenge(struct nvme_ctrl *ctrl, return NVME_SC_AUTH_REQUIRED; } - /* Reset host response if the hash had been changed */ - if (chap->hash_id != data->hashid) { - kfree(chap->host_response); - chap->host_response = NULL; - } - chap->hash_id = data->hashid; chap->hash_len = data->hl; dev_dbg(ctrl->device, "qid %d: selected hash %s\n", @@ -219,14 +222,6 @@ select_kpp: return NVME_SC_AUTH_REQUIRED; } - /* Clear host and controller key to avoid accidental reuse */ - kfree_sensitive(chap->host_key); - chap->host_key = NULL; - chap->host_key_len = 0; - kfree_sensitive(chap->ctrl_key); - chap->ctrl_key = NULL; - chap->ctrl_key_len = 0; - if (chap->dhgroup_id == data->dhgid && (data->dhgid == NVME_AUTH_DHGROUP_NULL || chap->dh_tfm)) { dev_dbg(ctrl->device, @@ -302,7 +297,7 @@ static int nvme_auth_set_dhchap_reply_data(struct nvme_ctrl *ctrl, if (chap->host_key_len) size += chap->host_key_len; - if (chap->buf_size < size) { + if (size > CHAP_BUF_SIZE) { chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; return -EINVAL; } @@ -344,10 +339,10 @@ static int nvme_auth_process_dhchap_success1(struct nvme_ctrl *ctrl, struct nvmf_auth_dhchap_success1_data *data = chap->buf; size_t size = sizeof(*data); - if (ctrl->ctrl_key) + if (chap->ctrl_key) size += chap->hash_len; - if (chap->buf_size < size) { + if (size > CHAP_BUF_SIZE) { chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; return NVME_SC_INVALID_FIELD; } @@ -521,6 +516,7 @@ static int nvme_auth_dhchap_setup_ctrl_response(struct nvme_ctrl *ctrl, ret = PTR_ERR(ctrl_response); return ret; } + ret = crypto_shash_setkey(chap->shash_tfm, ctrl_response, ctrl->ctrl_key->len); if (ret) { @@ -621,9 +617,6 @@ static int nvme_auth_dhchap_exponential(struct nvme_ctrl *ctrl, if (ret) { dev_dbg(ctrl->device, "failed to generate public key, error %d\n", ret); - kfree(chap->host_key); - chap->host_key = NULL; - chap->host_key_len = 0; chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; return ret; } @@ -643,9 +636,6 @@ gen_sesskey: if (ret) { dev_dbg(ctrl->device, "failed to generate shared secret, error %d\n", ret); - kfree_sensitive(chap->sess_key); - chap->sess_key = NULL; - chap->sess_key_len = 0; chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; return ret; } @@ -654,7 +644,7 @@ gen_sesskey: return 0; } -static void __nvme_auth_reset(struct nvme_dhchap_queue_context *chap) +static void nvme_auth_reset_dhchap(struct nvme_dhchap_queue_context *chap) { kfree_sensitive(chap->host_response); chap->host_response = NULL; @@ -674,24 +664,20 @@ static void __nvme_auth_reset(struct nvme_dhchap_queue_context *chap) chap->transaction = 0; memset(chap->c1, 0, sizeof(chap->c1)); memset(chap->c2, 0, sizeof(chap->c2)); + mempool_free(chap->buf, nvme_chap_buf_pool); + chap->buf = NULL; } -static void __nvme_auth_free(struct nvme_dhchap_queue_context *chap) +static void nvme_auth_free_dhchap(struct nvme_dhchap_queue_context *chap) { - __nvme_auth_reset(chap); + nvme_auth_reset_dhchap(chap); if (chap->shash_tfm) crypto_free_shash(chap->shash_tfm); if (chap->dh_tfm) crypto_free_kpp(chap->dh_tfm); - kfree_sensitive(chap->ctrl_key); - kfree_sensitive(chap->host_key); - kfree_sensitive(chap->sess_key); - kfree_sensitive(chap->host_response); - kfree(chap->buf); - kfree(chap); } -static void __nvme_auth_work(struct work_struct *work) +static void nvme_queue_auth_work(struct work_struct *work) { struct nvme_dhchap_queue_context *chap = container_of(work, struct nvme_dhchap_queue_context, auth_work); @@ -699,6 +685,16 @@ static void __nvme_auth_work(struct work_struct *work) size_t tl; int ret = 0; + /* + * Allocate a large enough buffer for the entire negotiation: + * 4k is enough to ffdhe8192. + */ + chap->buf = mempool_alloc(nvme_chap_buf_pool, GFP_KERNEL); + if (!chap->buf) { + chap->error = -ENOMEM; + return; + } + chap->transaction = ctrl->transaction++; /* DH-HMAC-CHAP Step 1: send negotiate */ @@ -720,8 +716,9 @@ static void __nvme_auth_work(struct work_struct *work) dev_dbg(ctrl->device, "%s: qid %d receive challenge\n", __func__, chap->qid); - memset(chap->buf, 0, chap->buf_size); - ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, chap->buf_size, false); + memset(chap->buf, 0, CHAP_BUF_SIZE); + ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, CHAP_BUF_SIZE, + false); if (ret) { dev_warn(ctrl->device, "qid %d failed to receive challenge, %s %d\n", @@ -757,11 +754,14 @@ static void __nvme_auth_work(struct work_struct *work) dev_dbg(ctrl->device, "%s: qid %d host response\n", __func__, chap->qid); + mutex_lock(&ctrl->dhchap_auth_mutex); ret = nvme_auth_dhchap_setup_host_response(ctrl, chap); if (ret) { + mutex_unlock(&ctrl->dhchap_auth_mutex); chap->error = ret; goto fail2; } + mutex_unlock(&ctrl->dhchap_auth_mutex); /* DH-HMAC-CHAP Step 3: send reply */ dev_dbg(ctrl->device, "%s: qid %d send reply\n", @@ -783,8 +783,9 @@ static void __nvme_auth_work(struct work_struct *work) dev_dbg(ctrl->device, "%s: qid %d receive success1\n", __func__, chap->qid); - memset(chap->buf, 0, chap->buf_size); - ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, chap->buf_size, false); + memset(chap->buf, 0, CHAP_BUF_SIZE); + ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, CHAP_BUF_SIZE, + false); if (ret) { dev_warn(ctrl->device, "qid %d failed to receive success1, %s %d\n", @@ -801,16 +802,19 @@ static void __nvme_auth_work(struct work_struct *work) return; } + mutex_lock(&ctrl->dhchap_auth_mutex); if (ctrl->ctrl_key) { dev_dbg(ctrl->device, "%s: qid %d controller response\n", __func__, chap->qid); ret = nvme_auth_dhchap_setup_ctrl_response(ctrl, chap); if (ret) { + mutex_unlock(&ctrl->dhchap_auth_mutex); chap->error = ret; goto fail2; } } + mutex_unlock(&ctrl->dhchap_auth_mutex); ret = nvme_auth_process_dhchap_success1(ctrl, chap); if (ret) { @@ -819,7 +823,7 @@ static void __nvme_auth_work(struct work_struct *work) goto fail2; } - if (ctrl->ctrl_key) { + if (chap->ctrl_key) { /* DH-HMAC-CHAP Step 5: send success2 */ dev_dbg(ctrl->device, "%s: qid %d send success2\n", __func__, chap->qid); @@ -860,42 +864,8 @@ int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid) return -ENOKEY; } - mutex_lock(&ctrl->dhchap_auth_mutex); - /* Check if the context is already queued */ - list_for_each_entry(chap, &ctrl->dhchap_auth_list, entry) { - WARN_ON(!chap->buf); - if (chap->qid == qid) { - dev_dbg(ctrl->device, "qid %d: re-using context\n", qid); - mutex_unlock(&ctrl->dhchap_auth_mutex); - flush_work(&chap->auth_work); - __nvme_auth_reset(chap); - queue_work(nvme_wq, &chap->auth_work); - return 0; - } - } - chap = kzalloc(sizeof(*chap), GFP_KERNEL); - if (!chap) { - mutex_unlock(&ctrl->dhchap_auth_mutex); - return -ENOMEM; - } - chap->qid = (qid == NVME_QID_ANY) ? 0 : qid; - chap->ctrl = ctrl; - - /* - * Allocate a large enough buffer for the entire negotiation: - * 4k should be enough to ffdhe8192. - */ - chap->buf_size = 4096; - chap->buf = kzalloc(chap->buf_size, GFP_KERNEL); - if (!chap->buf) { - mutex_unlock(&ctrl->dhchap_auth_mutex); - kfree(chap); - return -ENOMEM; - } - - INIT_WORK(&chap->auth_work, __nvme_auth_work); - list_add(&chap->entry, &ctrl->dhchap_auth_list); - mutex_unlock(&ctrl->dhchap_auth_mutex); + chap = &ctrl->dhchap_ctxs[qid]; + cancel_work_sync(&chap->auth_work); queue_work(nvme_wq, &chap->auth_work); return 0; } @@ -906,40 +876,28 @@ int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid) struct nvme_dhchap_queue_context *chap; int ret; - mutex_lock(&ctrl->dhchap_auth_mutex); - list_for_each_entry(chap, &ctrl->dhchap_auth_list, entry) { - if (chap->qid != qid) - continue; - mutex_unlock(&ctrl->dhchap_auth_mutex); - flush_work(&chap->auth_work); - ret = chap->error; - return ret; - } - mutex_unlock(&ctrl->dhchap_auth_mutex); - return -ENXIO; + chap = &ctrl->dhchap_ctxs[qid]; + flush_work(&chap->auth_work); + ret = chap->error; + /* clear sensitive info */ + nvme_auth_reset_dhchap(chap); + return ret; } EXPORT_SYMBOL_GPL(nvme_auth_wait); -void nvme_auth_reset(struct nvme_ctrl *ctrl) -{ - struct nvme_dhchap_queue_context *chap; - - mutex_lock(&ctrl->dhchap_auth_mutex); - list_for_each_entry(chap, &ctrl->dhchap_auth_list, entry) { - mutex_unlock(&ctrl->dhchap_auth_mutex); - flush_work(&chap->auth_work); - __nvme_auth_reset(chap); - } - mutex_unlock(&ctrl->dhchap_auth_mutex); -} -EXPORT_SYMBOL_GPL(nvme_auth_reset); - -static void nvme_dhchap_auth_work(struct work_struct *work) +static void nvme_ctrl_auth_work(struct work_struct *work) { struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, dhchap_auth_work); int ret, q; + /* + * If the ctrl is no connected, bail as reconnect will handle + * authentication. + */ + if (ctrl->state != NVME_CTRL_LIVE) + return; + /* Authenticate admin queue first */ ret = nvme_auth_negotiate(ctrl, 0); if (ret) { @@ -968,43 +926,75 @@ static void nvme_dhchap_auth_work(struct work_struct *work) * Failure is a soft-state; credentials remain valid until * the controller terminates the connection. */ + for (q = 1; q < ctrl->queue_count; q++) { + ret = nvme_auth_wait(ctrl, q); + if (ret) + dev_warn(ctrl->device, + "qid %d: authentication failed\n", q); + } } -void nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) +int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) { - INIT_LIST_HEAD(&ctrl->dhchap_auth_list); - INIT_WORK(&ctrl->dhchap_auth_work, nvme_dhchap_auth_work); + struct nvme_dhchap_queue_context *chap; + int i, ret; + mutex_init(&ctrl->dhchap_auth_mutex); + INIT_WORK(&ctrl->dhchap_auth_work, nvme_ctrl_auth_work); if (!ctrl->opts) - return; - nvme_auth_generate_key(ctrl->opts->dhchap_secret, &ctrl->host_key); - nvme_auth_generate_key(ctrl->opts->dhchap_ctrl_secret, &ctrl->ctrl_key); + return 0; + ret = nvme_auth_generate_key(ctrl->opts->dhchap_secret, + &ctrl->host_key); + if (ret) + return ret; + ret = nvme_auth_generate_key(ctrl->opts->dhchap_ctrl_secret, + &ctrl->ctrl_key); + if (ret) + goto err_free_dhchap_secret; + + if (!ctrl->opts->dhchap_secret && !ctrl->opts->dhchap_ctrl_secret) + return ret; + + ctrl->dhchap_ctxs = kvcalloc(ctrl_max_dhchaps(ctrl), + sizeof(*chap), GFP_KERNEL); + if (!ctrl->dhchap_ctxs) { + ret = -ENOMEM; + goto err_free_dhchap_ctrl_secret; + } + + for (i = 0; i < ctrl_max_dhchaps(ctrl); i++) { + chap = &ctrl->dhchap_ctxs[i]; + chap->qid = i; + chap->ctrl = ctrl; + INIT_WORK(&chap->auth_work, nvme_queue_auth_work); + } + + return 0; +err_free_dhchap_ctrl_secret: + nvme_auth_free_key(ctrl->ctrl_key); + ctrl->ctrl_key = NULL; +err_free_dhchap_secret: + nvme_auth_free_key(ctrl->host_key); + ctrl->host_key = NULL; + return ret; } EXPORT_SYMBOL_GPL(nvme_auth_init_ctrl); void nvme_auth_stop(struct nvme_ctrl *ctrl) { - struct nvme_dhchap_queue_context *chap = NULL, *tmp; - cancel_work_sync(&ctrl->dhchap_auth_work); - mutex_lock(&ctrl->dhchap_auth_mutex); - list_for_each_entry_safe(chap, tmp, &ctrl->dhchap_auth_list, entry) - cancel_work_sync(&chap->auth_work); - mutex_unlock(&ctrl->dhchap_auth_mutex); } EXPORT_SYMBOL_GPL(nvme_auth_stop); void nvme_auth_free(struct nvme_ctrl *ctrl) { - struct nvme_dhchap_queue_context *chap = NULL, *tmp; + int i; - mutex_lock(&ctrl->dhchap_auth_mutex); - list_for_each_entry_safe(chap, tmp, &ctrl->dhchap_auth_list, entry) { - list_del_init(&chap->entry); - flush_work(&chap->auth_work); - __nvme_auth_free(chap); + if (ctrl->dhchap_ctxs) { + for (i = 0; i < ctrl_max_dhchaps(ctrl); i++) + nvme_auth_free_dhchap(&ctrl->dhchap_ctxs[i]); + kfree(ctrl->dhchap_ctxs); } - mutex_unlock(&ctrl->dhchap_auth_mutex); if (ctrl->host_key) { nvme_auth_free_key(ctrl->host_key); ctrl->host_key = NULL; @@ -1015,3 +1005,27 @@ void nvme_auth_free(struct nvme_ctrl *ctrl) } } EXPORT_SYMBOL_GPL(nvme_auth_free); + +int __init nvme_init_auth(void) +{ + nvme_chap_buf_cache = kmem_cache_create("nvme-chap-buf-cache", + CHAP_BUF_SIZE, 0, SLAB_HWCACHE_ALIGN, NULL); + if (!nvme_chap_buf_cache) + return -ENOMEM; + + nvme_chap_buf_pool = mempool_create(16, mempool_alloc_slab, + mempool_free_slab, nvme_chap_buf_cache); + if (!nvme_chap_buf_pool) + goto err_destroy_chap_buf_cache; + + return 0; +err_destroy_chap_buf_cache: + kmem_cache_destroy(nvme_chap_buf_cache); + return -ENOMEM; +} + +void __exit nvme_exit_auth(void) +{ + mempool_destroy(nvme_chap_buf_pool); + kmem_cache_destroy(nvme_chap_buf_cache); +} diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 7e3893d06bab..e26b085a007a 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -384,6 +384,8 @@ static inline void nvme_end_req(struct request *req) nvme_log_error(req); nvme_end_req_zoned(req); nvme_trace_bio_complete(req); + if (req->cmd_flags & REQ_NVME_MPATH) + nvme_mpath_end_request(req); blk_mq_end_request(req, status); } @@ -851,8 +853,11 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns, cmnd->write_zeroes.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); + if (!(req->cmd_flags & REQ_NOUNMAP) && (ns->features & NVME_NS_DEAC)) + cmnd->write_zeroes.control |= cpu_to_le16(NVME_WZ_DEAC); + if (nvme_ns_has_pi(ns)) { - cmnd->write_zeroes.control = cpu_to_le16(NVME_RW_PRINFO_PRACT); + cmnd->write_zeroes.control |= cpu_to_le16(NVME_RW_PRINFO_PRACT); switch (ns->pi_type) { case NVME_NS_DPS_PI_TYPE1: @@ -1118,11 +1123,12 @@ void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects, nvme_unfreeze(ctrl); nvme_mpath_unfreeze(ctrl->subsys); mutex_unlock(&ctrl->subsys->lock); - nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL); mutex_unlock(&ctrl->scan_lock); } - if (effects & NVME_CMD_EFFECTS_CCC) - nvme_init_ctrl_finish(ctrl); + if (effects & NVME_CMD_EFFECTS_CCC) { + dev_info(ctrl->device, +"controller capabilities changed, reset may be required to take effect.\n"); + } if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) { nvme_queue_scan(ctrl); flush_work(&ctrl->scan_work); @@ -2003,6 +2009,14 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, } } + /* + * Only set the DEAC bit if the device guarantees that reads from + * deallocated data return zeroes. While the DEAC bit does not + * require that, it must be a no-op if reads from deallocated data + * do not return zeroes. + */ + if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3))) + ns->features |= NVME_NS_DEAC; set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info)); set_bit(NVME_NS_READY, &ns->flags); blk_mq_unfreeze_queue(ns->disk->queue); @@ -2179,7 +2193,7 @@ const struct pr_ops nvme_pr_ops = { }; #ifdef CONFIG_BLK_SED_OPAL -int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, +static int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, bool send) { struct nvme_ctrl *ctrl = data; @@ -2196,7 +2210,23 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len, NVME_QID_ANY, 1, 0); } -EXPORT_SYMBOL_GPL(nvme_sec_submit); + +static void nvme_configure_opal(struct nvme_ctrl *ctrl, bool was_suspended) +{ + if (ctrl->oacs & NVME_CTRL_OACS_SEC_SUPP) { + if (!ctrl->opal_dev) + ctrl->opal_dev = init_opal_dev(ctrl, &nvme_sec_submit); + else if (was_suspended) + opal_unlock_from_suspend(ctrl->opal_dev); + } else { + free_opal_dev(ctrl->opal_dev); + ctrl->opal_dev = NULL; + } +} +#else +static void nvme_configure_opal(struct nvme_ctrl *ctrl, bool was_suspended) +{ +} #endif /* CONFIG_BLK_SED_OPAL */ #ifdef CONFIG_BLK_DEV_ZONED @@ -2221,16 +2251,17 @@ static const struct block_device_operations nvme_bdev_ops = { .pr_ops = &nvme_pr_ops, }; -static int nvme_wait_ready(struct nvme_ctrl *ctrl, u32 timeout, bool enabled) +static int nvme_wait_ready(struct nvme_ctrl *ctrl, u32 mask, u32 val, + u32 timeout, const char *op) { - unsigned long timeout_jiffies = ((timeout + 1) * HZ / 2) + jiffies; - u32 csts, bit = enabled ? NVME_CSTS_RDY : 0; + unsigned long timeout_jiffies = jiffies + timeout * HZ; + u32 csts; int ret; while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) { if (csts == ~0) return -ENODEV; - if ((csts & NVME_CSTS_RDY) == bit) + if ((csts & mask) == val) break; usleep_range(1000, 2000); @@ -2239,7 +2270,7 @@ static int nvme_wait_ready(struct nvme_ctrl *ctrl, u32 timeout, bool enabled) if (time_after(jiffies, timeout_jiffies)) { dev_err(ctrl->device, "Device not ready; aborting %s, CSTS=0x%x\n", - enabled ? "initialisation" : "reset", csts); + op, csts); return -ENODEV; } } @@ -2247,27 +2278,29 @@ static int nvme_wait_ready(struct nvme_ctrl *ctrl, u32 timeout, bool enabled) return ret; } -/* - * If the device has been passed off to us in an enabled state, just clear - * the enabled bit. The spec says we should set the 'shutdown notification - * bits', but doing so may cause the device to complete commands to the - * admin queue ... and we don't know what memory that might be pointing at! - */ -int nvme_disable_ctrl(struct nvme_ctrl *ctrl) +int nvme_disable_ctrl(struct nvme_ctrl *ctrl, bool shutdown) { int ret; ctrl->ctrl_config &= ~NVME_CC_SHN_MASK; - ctrl->ctrl_config &= ~NVME_CC_ENABLE; + if (shutdown) + ctrl->ctrl_config |= NVME_CC_SHN_NORMAL; + else + ctrl->ctrl_config &= ~NVME_CC_ENABLE; ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); if (ret) return ret; + if (shutdown) { + return nvme_wait_ready(ctrl, NVME_CSTS_SHST_MASK, + NVME_CSTS_SHST_CMPLT, + ctrl->shutdown_timeout, "shutdown"); + } if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY) msleep(NVME_QUIRK_DELAY_AMOUNT); - - return nvme_wait_ready(ctrl, NVME_CAP_TIMEOUT(ctrl->cap), false); + return nvme_wait_ready(ctrl, NVME_CSTS_RDY, 0, + (NVME_CAP_TIMEOUT(ctrl->cap) + 1) / 2, "reset"); } EXPORT_SYMBOL_GPL(nvme_disable_ctrl); @@ -2332,41 +2365,11 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl) ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); if (ret) return ret; - return nvme_wait_ready(ctrl, timeout, true); + return nvme_wait_ready(ctrl, NVME_CSTS_RDY, NVME_CSTS_RDY, + (timeout + 1) / 2, "initialisation"); } EXPORT_SYMBOL_GPL(nvme_enable_ctrl); -int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl) -{ - unsigned long timeout = jiffies + (ctrl->shutdown_timeout * HZ); - u32 csts; - int ret; - - ctrl->ctrl_config &= ~NVME_CC_SHN_MASK; - ctrl->ctrl_config |= NVME_CC_SHN_NORMAL; - - ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); - if (ret) - return ret; - - while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) { - if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT) - break; - - msleep(100); - if (fatal_signal_pending(current)) - return -EINTR; - if (time_after(jiffies, timeout)) { - dev_err(ctrl->device, - "Device shutdown incomplete; abort shutdown\n"); - return -ENODEV; - } - } - - return ret; -} -EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl); - static int nvme_configure_timestamp(struct nvme_ctrl *ctrl) { __le64 ts; @@ -3049,7 +3052,7 @@ static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl) id = kzalloc(sizeof(*id), GFP_KERNEL); if (!id) - return 0; + return -ENOMEM; c.identify.opcode = nvme_admin_identify; c.identify.cns = NVME_ID_CNS_CS_CTRL; @@ -3229,7 +3232,7 @@ out_free: * register in our nvme_ctrl structure. This should be called as soon as * the admin queue is fully up and running. */ -int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl) +int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl, bool was_suspended) { int ret; @@ -3260,6 +3263,8 @@ int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl) if (ret < 0) return ret; + nvme_configure_opal(ctrl, was_suspended); + if (!ctrl->identified && !nvme_discovery_ctrl(ctrl)) { /* * Do not return errors unless we are in a controller reset, @@ -3745,15 +3750,19 @@ static ssize_t nvme_ctrl_dhchap_secret_store(struct device *dev, memcpy(dhchap_secret, buf, count); nvme_auth_stop(ctrl); if (strcmp(dhchap_secret, opts->dhchap_secret)) { + struct nvme_dhchap_key *key, *host_key; int ret; - ret = nvme_auth_generate_key(dhchap_secret, &ctrl->host_key); + ret = nvme_auth_generate_key(dhchap_secret, &key); if (ret) return ret; kfree(opts->dhchap_secret); opts->dhchap_secret = dhchap_secret; - /* Key has changed; re-authentication with new key */ - nvme_auth_reset(ctrl); + host_key = ctrl->host_key; + mutex_lock(&ctrl->dhchap_auth_mutex); + ctrl->host_key = key; + mutex_unlock(&ctrl->dhchap_auth_mutex); + nvme_auth_free_key(host_key); } /* Start re-authentication */ dev_info(ctrl->device, "re-authenticating controller\n"); @@ -3795,15 +3804,19 @@ static ssize_t nvme_ctrl_dhchap_ctrl_secret_store(struct device *dev, memcpy(dhchap_secret, buf, count); nvme_auth_stop(ctrl); if (strcmp(dhchap_secret, opts->dhchap_ctrl_secret)) { + struct nvme_dhchap_key *key, *ctrl_key; int ret; - ret = nvme_auth_generate_key(dhchap_secret, &ctrl->ctrl_key); + ret = nvme_auth_generate_key(dhchap_secret, &key); if (ret) return ret; kfree(opts->dhchap_ctrl_secret); opts->dhchap_ctrl_secret = dhchap_secret; - /* Key has changed; re-authentication with new key */ - nvme_auth_reset(ctrl); + ctrl_key = ctrl->ctrl_key; + mutex_lock(&ctrl->dhchap_auth_mutex); + ctrl->ctrl_key = key; + mutex_unlock(&ctrl->dhchap_auth_mutex); + nvme_auth_free_key(ctrl_key); } /* Start re-authentication */ dev_info(ctrl->device, "re-authenticating controller\n"); @@ -3875,10 +3888,11 @@ static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj, return a->mode; } -static const struct attribute_group nvme_dev_attrs_group = { +const struct attribute_group nvme_dev_attrs_group = { .attrs = nvme_dev_attrs, .is_visible = nvme_dev_attrs_are_visible, }; +EXPORT_SYMBOL_GPL(nvme_dev_attrs_group); static const struct attribute_group *nvme_dev_attr_groups[] = { &nvme_dev_attrs_group, @@ -4333,10 +4347,6 @@ static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_info *info) { int ret = NVME_SC_INVALID_NS | NVME_SC_DNR; - if (test_bit(NVME_NS_DEAD, &ns->flags)) - goto out; - - ret = NVME_SC_INVALID_NS | NVME_SC_DNR; if (!nvme_ns_ids_equal(&ns->head->ids, &info->ids)) { dev_err(ns->ctrl->device, "identifiers changed for nsid %d\n", ns->head->ns_id); @@ -4407,7 +4417,7 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, down_write(&ctrl->namespaces_rwsem); list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) { - if (ns->head->ns_id > nsid || test_bit(NVME_NS_DEAD, &ns->flags)) + if (ns->head->ns_id > nsid) list_move_tail(&ns->list, &rm_list); } up_write(&ctrl->namespaces_rwsem); @@ -4424,9 +4434,6 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl) u32 prev = 0; int ret = 0, i; - if (nvme_ctrl_limited_cns(ctrl)) - return -EOPNOTSUPP; - ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL); if (!ns_list) return -ENOMEM; @@ -4534,8 +4541,18 @@ static void nvme_scan_work(struct work_struct *work) } mutex_lock(&ctrl->scan_lock); - if (nvme_scan_ns_list(ctrl) != 0) + if (nvme_ctrl_limited_cns(ctrl)) { nvme_scan_ns_sequential(ctrl); + } else { + /* + * Fall back to sequential scan if DNR is set to handle broken + * devices which should support Identify NS List (as per the VS + * they report) but don't actually support it. + */ + ret = nvme_scan_ns_list(ctrl); + if (ret > 0 && ret & NVME_SC_DNR) + nvme_scan_ns_sequential(ctrl); + } mutex_unlock(&ctrl->scan_lock); } @@ -4565,8 +4582,10 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl) * removing the namespaces' disks; fail all the queues now to avoid * potentially having to clean up the failed sync later. */ - if (ctrl->state == NVME_CTRL_DEAD) - nvme_kill_queues(ctrl); + if (ctrl->state == NVME_CTRL_DEAD) { + nvme_mark_namespaces_dead(ctrl); + nvme_unquiesce_io_queues(ctrl); + } /* this is a no-op when called from the controller reset handler */ nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO); @@ -4692,7 +4711,7 @@ static void nvme_fw_act_work(struct work_struct *work) fw_act_timeout = jiffies + msecs_to_jiffies(admin_timeout * 1000); - nvme_stop_queues(ctrl); + nvme_quiesce_io_queues(ctrl); while (nvme_ctrl_pp_status(ctrl)) { if (time_after(jiffies, fw_act_timeout)) { dev_warn(ctrl->device, @@ -4706,7 +4725,7 @@ static void nvme_fw_act_work(struct work_struct *work) if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) return; - nvme_start_queues(ctrl); + nvme_unquiesce_io_queues(ctrl); /* read FW slot information to clear the AER */ nvme_get_fw_slot_info(ctrl); @@ -4811,8 +4830,7 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, EXPORT_SYMBOL_GPL(nvme_complete_async_event); int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, - const struct blk_mq_ops *ops, unsigned int flags, - unsigned int cmd_size) + const struct blk_mq_ops *ops, unsigned int cmd_size) { int ret; @@ -4822,7 +4840,9 @@ int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, if (ctrl->ops->flags & NVME_F_FABRICS) set->reserved_tags = NVMF_RESERVED_TAGS; set->numa_node = ctrl->numa_node; - set->flags = flags; + set->flags = BLK_MQ_F_NO_SCHED; + if (ctrl->ops->flags & NVME_F_BLOCKING) + set->flags |= BLK_MQ_F_BLOCKING; set->cmd_size = cmd_size; set->driver_data = ctrl; set->nr_hw_queues = 1; @@ -4850,6 +4870,7 @@ int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, out_cleanup_admin_q: blk_mq_destroy_queue(ctrl->admin_q); + blk_put_queue(ctrl->admin_q); out_free_tagset: blk_mq_free_tag_set(ctrl->admin_tagset); return ret; @@ -4859,14 +4880,17 @@ EXPORT_SYMBOL_GPL(nvme_alloc_admin_tag_set); void nvme_remove_admin_tag_set(struct nvme_ctrl *ctrl) { blk_mq_destroy_queue(ctrl->admin_q); - if (ctrl->ops->flags & NVME_F_FABRICS) + blk_put_queue(ctrl->admin_q); + if (ctrl->ops->flags & NVME_F_FABRICS) { blk_mq_destroy_queue(ctrl->fabrics_q); + blk_put_queue(ctrl->fabrics_q); + } blk_mq_free_tag_set(ctrl->admin_tagset); } EXPORT_SYMBOL_GPL(nvme_remove_admin_tag_set); int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, - const struct blk_mq_ops *ops, unsigned int flags, + const struct blk_mq_ops *ops, unsigned int nr_maps, unsigned int cmd_size) { int ret; @@ -4874,15 +4898,23 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, memset(set, 0, sizeof(*set)); set->ops = ops; set->queue_depth = ctrl->sqsize + 1; - set->reserved_tags = NVMF_RESERVED_TAGS; + /* + * Some Apple controllers requires tags to be unique across admin and + * the (only) I/O queue, so reserve the first 32 tags of the I/O queue. + */ + if (ctrl->quirks & NVME_QUIRK_SHARED_TAGS) + set->reserved_tags = NVME_AQ_DEPTH; + else if (ctrl->ops->flags & NVME_F_FABRICS) + set->reserved_tags = NVMF_RESERVED_TAGS; set->numa_node = ctrl->numa_node; - set->flags = flags; + set->flags = BLK_MQ_F_SHOULD_MERGE; + if (ctrl->ops->flags & NVME_F_BLOCKING) + set->flags |= BLK_MQ_F_BLOCKING; set->cmd_size = cmd_size, set->driver_data = ctrl; set->nr_hw_queues = ctrl->queue_count - 1; set->timeout = NVME_IO_TIMEOUT; - if (ops->map_queues) - set->nr_maps = ctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2; + set->nr_maps = nr_maps; ret = blk_mq_alloc_tag_set(set); if (ret) return ret; @@ -4893,6 +4925,8 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, ret = PTR_ERR(ctrl->connect_q); goto out_free_tag_set; } + blk_queue_flag_set(QUEUE_FLAG_SKIP_TAGSET_QUIESCE, + ctrl->connect_q); } ctrl->tagset = set; @@ -4906,8 +4940,10 @@ EXPORT_SYMBOL_GPL(nvme_alloc_io_tag_set); void nvme_remove_io_tag_set(struct nvme_ctrl *ctrl) { - if (ctrl->ops->flags & NVME_F_FABRICS) + if (ctrl->ops->flags & NVME_F_FABRICS) { blk_mq_destroy_queue(ctrl->connect_q); + blk_put_queue(ctrl->connect_q); + } blk_mq_free_tag_set(ctrl->tagset); } EXPORT_SYMBOL_GPL(nvme_remove_io_tag_set); @@ -4943,7 +4979,7 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl) if (ctrl->queue_count > 1) { nvme_queue_scan(ctrl); - nvme_start_queues(ctrl); + nvme_unquiesce_io_queues(ctrl); nvme_mpath_update(ctrl); } @@ -4988,6 +5024,7 @@ static void nvme_free_ctrl(struct device *dev) nvme_auth_stop(ctrl); nvme_auth_free(ctrl); __free_page(ctrl->discard_page); + free_opal_dev(ctrl->opal_dev); if (subsys) { mutex_lock(&nvme_subsystems_lock); @@ -5053,7 +5090,10 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, ctrl->instance); ctrl->device->class = nvme_class; ctrl->device->parent = ctrl->dev; - ctrl->device->groups = nvme_dev_attr_groups; + if (ops->dev_attr_groups) + ctrl->device->groups = ops->dev_attr_groups; + else + ctrl->device->groups = nvme_dev_attr_groups; ctrl->device->release = nvme_free_ctrl; dev_set_drvdata(ctrl->device, ctrl); ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance); @@ -5077,9 +5117,13 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device)); nvme_mpath_init_ctrl(ctrl); - nvme_auth_init_ctrl(ctrl); + ret = nvme_auth_init_ctrl(ctrl); + if (ret) + goto out_free_cdev; return 0; +out_free_cdev: + cdev_device_del(&ctrl->cdev, ctrl->device); out_free_name: nvme_put_ctrl(ctrl); kfree_const(ctrl->device->kobj.name); @@ -5092,62 +5136,17 @@ out: } EXPORT_SYMBOL_GPL(nvme_init_ctrl); -static void nvme_start_ns_queue(struct nvme_ns *ns) -{ - if (test_and_clear_bit(NVME_NS_STOPPED, &ns->flags)) - blk_mq_unquiesce_queue(ns->queue); -} - -static void nvme_stop_ns_queue(struct nvme_ns *ns) -{ - if (!test_and_set_bit(NVME_NS_STOPPED, &ns->flags)) - blk_mq_quiesce_queue(ns->queue); - else - blk_mq_wait_quiesce_done(ns->queue); -} - -/* - * Prepare a queue for teardown. - * - * This must forcibly unquiesce queues to avoid blocking dispatch, and only set - * the capacity to 0 after that to avoid blocking dispatchers that may be - * holding bd_butex. This will end buffered writers dirtying pages that can't - * be synced. - */ -static void nvme_set_queue_dying(struct nvme_ns *ns) -{ - if (test_and_set_bit(NVME_NS_DEAD, &ns->flags)) - return; - - blk_mark_disk_dead(ns->disk); - nvme_start_ns_queue(ns); - - set_capacity_and_notify(ns->disk, 0); -} - -/** - * nvme_kill_queues(): Ends all namespace queues - * @ctrl: the dead controller that needs to end - * - * Call this function when the driver determines it is unable to get the - * controller in a state capable of servicing IO. - */ -void nvme_kill_queues(struct nvme_ctrl *ctrl) +/* let I/O to all namespaces fail in preparation for surprise removal */ +void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; down_read(&ctrl->namespaces_rwsem); - - /* Forcibly unquiesce queues to avoid blocking dispatch */ - if (ctrl->admin_q && !blk_queue_dying(ctrl->admin_q)) - nvme_start_admin_queue(ctrl); - list_for_each_entry(ns, &ctrl->namespaces, list) - nvme_set_queue_dying(ns); - + blk_mark_disk_dead(ns->disk); up_read(&ctrl->namespaces_rwsem); } -EXPORT_SYMBOL_GPL(nvme_kill_queues); +EXPORT_SYMBOL_GPL(nvme_mark_namespaces_dead); void nvme_unfreeze(struct nvme_ctrl *ctrl) { @@ -5197,43 +5196,41 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_start_freeze); -void nvme_stop_queues(struct nvme_ctrl *ctrl) +void nvme_quiesce_io_queues(struct nvme_ctrl *ctrl) { - struct nvme_ns *ns; - - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) - nvme_stop_ns_queue(ns); - up_read(&ctrl->namespaces_rwsem); + if (!ctrl->tagset) + return; + if (!test_and_set_bit(NVME_CTRL_STOPPED, &ctrl->flags)) + blk_mq_quiesce_tagset(ctrl->tagset); + else + blk_mq_wait_quiesce_done(ctrl->tagset); } -EXPORT_SYMBOL_GPL(nvme_stop_queues); +EXPORT_SYMBOL_GPL(nvme_quiesce_io_queues); -void nvme_start_queues(struct nvme_ctrl *ctrl) +void nvme_unquiesce_io_queues(struct nvme_ctrl *ctrl) { - struct nvme_ns *ns; - - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) - nvme_start_ns_queue(ns); - up_read(&ctrl->namespaces_rwsem); + if (!ctrl->tagset) + return; + if (test_and_clear_bit(NVME_CTRL_STOPPED, &ctrl->flags)) + blk_mq_unquiesce_tagset(ctrl->tagset); } -EXPORT_SYMBOL_GPL(nvme_start_queues); +EXPORT_SYMBOL_GPL(nvme_unquiesce_io_queues); -void nvme_stop_admin_queue(struct nvme_ctrl *ctrl) +void nvme_quiesce_admin_queue(struct nvme_ctrl *ctrl) { if (!test_and_set_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags)) blk_mq_quiesce_queue(ctrl->admin_q); else - blk_mq_wait_quiesce_done(ctrl->admin_q); + blk_mq_wait_quiesce_done(ctrl->admin_q->tag_set); } -EXPORT_SYMBOL_GPL(nvme_stop_admin_queue); +EXPORT_SYMBOL_GPL(nvme_quiesce_admin_queue); -void nvme_start_admin_queue(struct nvme_ctrl *ctrl) +void nvme_unquiesce_admin_queue(struct nvme_ctrl *ctrl) { if (test_and_clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags)) blk_mq_unquiesce_queue(ctrl->admin_q); } -EXPORT_SYMBOL_GPL(nvme_start_admin_queue); +EXPORT_SYMBOL_GPL(nvme_unquiesce_admin_queue); void nvme_sync_io_queues(struct nvme_ctrl *ctrl) { @@ -5344,8 +5341,13 @@ static int __init nvme_core_init(void) goto unregister_generic_ns; } + result = nvme_init_auth(); + if (result) + goto destroy_ns_chr; return 0; +destroy_ns_chr: + class_destroy(nvme_ns_chr_class); unregister_generic_ns: unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS); destroy_subsys_class: @@ -5366,6 +5368,7 @@ out: static void __exit nvme_core_exit(void) { + nvme_exit_auth(); class_destroy(nvme_ns_chr_class); class_destroy(nvme_subsys_class); class_destroy(nvme_class); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 5d57a042dbca..4564f16a0b20 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1475,6 +1475,8 @@ nvme_fc_xmt_ls_rsp_done(struct nvmefc_ls_rsp *lsrsp) fc_dma_unmap_single(lport->dev, lsop->rspdma, sizeof(*lsop->rspbuf), DMA_TO_DEVICE); + kfree(lsop->rspbuf); + kfree(lsop->rqstbuf); kfree(lsop); nvme_fc_rport_put(rport); @@ -1699,6 +1701,15 @@ restart: spin_unlock_irqrestore(&rport->lock, flags); } +static +void nvme_fc_rcv_ls_req_err_msg(struct nvme_fc_lport *lport, + struct fcnvme_ls_rqst_w0 *w0) +{ + dev_info(lport->dev, "RCV %s LS failed: No memory\n", + (w0->ls_cmd <= NVME_FC_LAST_LS_CMD_VALUE) ? + nvmefc_ls_names[w0->ls_cmd] : ""); +} + /** * nvme_fc_rcv_ls_req - transport entry point called by an LLDD * upon the reception of a NVME LS request. @@ -1751,20 +1762,20 @@ nvme_fc_rcv_ls_req(struct nvme_fc_remote_port *portptr, goto out_put; } - lsop = kzalloc(sizeof(*lsop) + - sizeof(union nvmefc_ls_requests) + - sizeof(union nvmefc_ls_responses), - GFP_KERNEL); + lsop = kzalloc(sizeof(*lsop), GFP_KERNEL); if (!lsop) { - dev_info(lport->dev, - "RCV %s LS failed: No memory\n", - (w0->ls_cmd <= NVME_FC_LAST_LS_CMD_VALUE) ? - nvmefc_ls_names[w0->ls_cmd] : ""); + nvme_fc_rcv_ls_req_err_msg(lport, w0); ret = -ENOMEM; goto out_put; } - lsop->rqstbuf = (union nvmefc_ls_requests *)&lsop[1]; - lsop->rspbuf = (union nvmefc_ls_responses *)&lsop->rqstbuf[1]; + + lsop->rqstbuf = kzalloc(sizeof(*lsop->rqstbuf), GFP_KERNEL); + lsop->rspbuf = kzalloc(sizeof(*lsop->rspbuf), GFP_KERNEL); + if (!lsop->rqstbuf || !lsop->rspbuf) { + nvme_fc_rcv_ls_req_err_msg(lport, w0); + ret = -ENOMEM; + goto out_free; + } lsop->rspdma = fc_dma_map_single(lport->dev, lsop->rspbuf, sizeof(*lsop->rspbuf), @@ -1801,6 +1812,8 @@ out_unmap: fc_dma_unmap_single(lport->dev, lsop->rspdma, sizeof(*lsop->rspbuf), DMA_TO_DEVICE); out_free: + kfree(lsop->rspbuf); + kfree(lsop->rqstbuf); kfree(lsop); out_put: nvme_fc_rport_put(rport); @@ -2391,7 +2404,7 @@ nvme_fc_ctrl_free(struct kref *ref) list_del(&ctrl->ctrl_list); spin_unlock_irqrestore(&ctrl->rport->lock, flags); - nvme_start_admin_queue(&ctrl->ctrl); + nvme_unquiesce_admin_queue(&ctrl->ctrl); nvme_remove_admin_tag_set(&ctrl->ctrl); kfree(ctrl->queues); @@ -2492,20 +2505,20 @@ __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues) * (but with error status). */ if (ctrl->ctrl.queue_count > 1) { - nvme_stop_queues(&ctrl->ctrl); + nvme_quiesce_io_queues(&ctrl->ctrl); nvme_sync_io_queues(&ctrl->ctrl); blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_fc_terminate_exchange, &ctrl->ctrl); blk_mq_tagset_wait_completed_request(&ctrl->tag_set); if (start_queues) - nvme_start_queues(&ctrl->ctrl); + nvme_unquiesce_io_queues(&ctrl->ctrl); } /* * Other transports, which don't have link-level contexts bound * to sqe's, would try to gracefully shutdown the controller by * writing the registers for shutdown and polling (call - * nvme_shutdown_ctrl()). Given a bunch of i/o was potentially + * nvme_disable_ctrl()). Given a bunch of i/o was potentially * just aborted and we will wait on those contexts, and given * there was no indication of how live the controlelr is on the * link, don't send more io to create more contexts for the @@ -2516,13 +2529,13 @@ __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues) /* * clean up the admin queue. Same thing as above. */ - nvme_stop_admin_queue(&ctrl->ctrl); + nvme_quiesce_admin_queue(&ctrl->ctrl); blk_sync_queue(ctrl->ctrl.admin_q); blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_fc_terminate_exchange, &ctrl->ctrl); blk_mq_tagset_wait_completed_request(&ctrl->admin_tag_set); if (start_queues) - nvme_start_admin_queue(&ctrl->ctrl); + nvme_unquiesce_admin_queue(&ctrl->ctrl); } static void @@ -2732,7 +2745,7 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue, atomic_set(&op->state, FCPOP_STATE_ACTIVE); if (!(op->flags & FCOP_FLAGS_AEN)) - blk_mq_start_request(op->rq); + nvme_start_request(op->rq); cmdiu->csn = cpu_to_be32(atomic_inc_return(&queue->csn)); ret = ctrl->lport->ops->fcp_io(&ctrl->lport->localport, @@ -2903,7 +2916,7 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl) nvme_fc_init_io_queues(ctrl); ret = nvme_alloc_io_tag_set(&ctrl->ctrl, &ctrl->tag_set, - &nvme_fc_mq_ops, BLK_MQ_F_SHOULD_MERGE, + &nvme_fc_mq_ops, 1, struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv, ctrl->lport->ops->fcprqst_priv_sz)); if (ret) @@ -3104,9 +3117,9 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) ctrl->ctrl.max_hw_sectors = ctrl->ctrl.max_segments << (ilog2(SZ_4K) - 9); - nvme_start_admin_queue(&ctrl->ctrl); + nvme_unquiesce_admin_queue(&ctrl->ctrl); - ret = nvme_init_ctrl_finish(&ctrl->ctrl); + ret = nvme_init_ctrl_finish(&ctrl->ctrl, false); if (ret || test_bit(ASSOC_FAILED, &ctrl->flags)) goto out_disconnect_admin_queue; @@ -3250,10 +3263,10 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) nvme_fc_free_queue(&ctrl->queues[0]); /* re-enable the admin_q so anything new can fast fail */ - nvme_start_admin_queue(&ctrl->ctrl); + nvme_unquiesce_admin_queue(&ctrl->ctrl); /* resume the io queues so that things will fast fail */ - nvme_start_queues(&ctrl->ctrl); + nvme_unquiesce_io_queues(&ctrl->ctrl); nvme_fc_ctlr_inactive_on_rport(ctrl); } @@ -3509,7 +3522,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, nvme_fc_init_queue(ctrl, 0); ret = nvme_alloc_admin_tag_set(&ctrl->ctrl, &ctrl->admin_tag_set, - &nvme_fc_admin_mq_ops, BLK_MQ_F_NO_SCHED, + &nvme_fc_admin_mq_ops, struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv, ctrl->lport->ops->fcprqst_priv_sz)); if (ret) diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 81f5550b670d..9ddda571f046 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -8,6 +8,50 @@ #include <linux/io_uring.h> #include "nvme.h" +static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c, + fmode_t mode) +{ + if (capable(CAP_SYS_ADMIN)) + return true; + + /* + * Do not allow unprivileged processes to send vendor specific or fabrics + * commands as we can't be sure about their effects. + */ + if (c->common.opcode >= nvme_cmd_vendor_start || + c->common.opcode == nvme_fabrics_command) + return false; + + /* + * Do not allow unprivileged passthrough of admin commands except + * for a subset of identify commands that contain information required + * to form proper I/O commands in userspace and do not expose any + * potentially sensitive information. + */ + if (!ns) { + if (c->common.opcode == nvme_admin_identify) { + switch (c->identify.cns) { + case NVME_ID_CNS_NS: + case NVME_ID_CNS_CS_NS: + case NVME_ID_CNS_NS_CS_INDEP: + case NVME_ID_CNS_CS_CTRL: + case NVME_ID_CNS_CTRL: + return true; + } + } + return false; + } + + /* + * Only allow I/O commands that transfer data to the controller if the + * special file is open for writing, but always allow I/O commands that + * transfer data from the controller. + */ + if (nvme_is_write(c)) + return mode & FMODE_WRITE; + return true; +} + /* * Convert integer values from ioctl structures to user pointers, silently * ignoring the upper bits in the compat case to match behaviour of 32-bit @@ -261,7 +305,7 @@ static bool nvme_validate_passthru_nsid(struct nvme_ctrl *ctrl, } static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - struct nvme_passthru_cmd __user *ucmd) + struct nvme_passthru_cmd __user *ucmd, fmode_t mode) { struct nvme_passthru_cmd cmd; struct nvme_command c; @@ -269,8 +313,6 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u64 result; int status; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; if (copy_from_user(&cmd, ucmd, sizeof(cmd))) return -EFAULT; if (cmd.flags) @@ -291,6 +333,9 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, c.common.cdw14 = cpu_to_le32(cmd.cdw14); c.common.cdw15 = cpu_to_le32(cmd.cdw15); + if (!nvme_cmd_allowed(ns, &c, mode)) + return -EACCES; + if (cmd.timeout_ms) timeout = msecs_to_jiffies(cmd.timeout_ms); @@ -308,15 +353,14 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, } static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - struct nvme_passthru_cmd64 __user *ucmd, bool vec) + struct nvme_passthru_cmd64 __user *ucmd, bool vec, + fmode_t mode) { struct nvme_passthru_cmd64 cmd; struct nvme_command c; unsigned timeout = 0; int status; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; if (copy_from_user(&cmd, ucmd, sizeof(cmd))) return -EFAULT; if (cmd.flags) @@ -337,6 +381,9 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, c.common.cdw14 = cpu_to_le32(cmd.cdw14); c.common.cdw15 = cpu_to_le32(cmd.cdw15); + if (!nvme_cmd_allowed(ns, &c, mode)) + return -EACCES; + if (cmd.timeout_ms) timeout = msecs_to_jiffies(cmd.timeout_ms); @@ -483,9 +530,6 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, void *meta = NULL; int ret; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - c.common.opcode = READ_ONCE(cmd->opcode); c.common.flags = READ_ONCE(cmd->flags); if (c.common.flags) @@ -507,6 +551,9 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, c.common.cdw14 = cpu_to_le32(READ_ONCE(cmd->cdw14)); c.common.cdw15 = cpu_to_le32(READ_ONCE(cmd->cdw15)); + if (!nvme_cmd_allowed(ns, &c, ioucmd->file->f_mode)) + return -EACCES; + d.metadata = READ_ONCE(cmd->metadata); d.addr = READ_ONCE(cmd->addr); d.data_len = READ_ONCE(cmd->data_len); @@ -570,13 +617,13 @@ static bool is_ctrl_ioctl(unsigned int cmd) } static int nvme_ctrl_ioctl(struct nvme_ctrl *ctrl, unsigned int cmd, - void __user *argp) + void __user *argp, fmode_t mode) { switch (cmd) { case NVME_IOCTL_ADMIN_CMD: - return nvme_user_cmd(ctrl, NULL, argp); + return nvme_user_cmd(ctrl, NULL, argp, mode); case NVME_IOCTL_ADMIN64_CMD: - return nvme_user_cmd64(ctrl, NULL, argp, false); + return nvme_user_cmd64(ctrl, NULL, argp, false, mode); default: return sed_ioctl(ctrl->opal_dev, cmd, argp); } @@ -601,14 +648,14 @@ struct nvme_user_io32 { #endif /* COMPAT_FOR_U64_ALIGNMENT */ static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd, - void __user *argp) + void __user *argp, fmode_t mode) { switch (cmd) { case NVME_IOCTL_ID: force_successful_syscall_return(); return ns->head->ns_id; case NVME_IOCTL_IO_CMD: - return nvme_user_cmd(ns->ctrl, ns, argp); + return nvme_user_cmd(ns->ctrl, ns, argp, mode); /* * struct nvme_user_io can have different padding on some 32-bit ABIs. * Just accept the compat version as all fields that are used are the @@ -620,19 +667,20 @@ static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd, case NVME_IOCTL_SUBMIT_IO: return nvme_submit_io(ns, argp); case NVME_IOCTL_IO64_CMD: - return nvme_user_cmd64(ns->ctrl, ns, argp, false); + return nvme_user_cmd64(ns->ctrl, ns, argp, false, mode); case NVME_IOCTL_IO64_CMD_VEC: - return nvme_user_cmd64(ns->ctrl, ns, argp, true); + return nvme_user_cmd64(ns->ctrl, ns, argp, true, mode); default: return -ENOTTY; } } -static int __nvme_ioctl(struct nvme_ns *ns, unsigned int cmd, void __user *arg) +static int __nvme_ioctl(struct nvme_ns *ns, unsigned int cmd, void __user *arg, + fmode_t mode) { - if (is_ctrl_ioctl(cmd)) - return nvme_ctrl_ioctl(ns->ctrl, cmd, arg); - return nvme_ns_ioctl(ns, cmd, arg); + if (is_ctrl_ioctl(cmd)) + return nvme_ctrl_ioctl(ns->ctrl, cmd, arg, mode); + return nvme_ns_ioctl(ns, cmd, arg, mode); } int nvme_ioctl(struct block_device *bdev, fmode_t mode, @@ -640,7 +688,7 @@ int nvme_ioctl(struct block_device *bdev, fmode_t mode, { struct nvme_ns *ns = bdev->bd_disk->private_data; - return __nvme_ioctl(ns, cmd, (void __user *)arg); + return __nvme_ioctl(ns, cmd, (void __user *)arg, mode); } long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) @@ -648,7 +696,7 @@ long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct nvme_ns *ns = container_of(file_inode(file)->i_cdev, struct nvme_ns, cdev); - return __nvme_ioctl(ns, cmd, (void __user *)arg); + return __nvme_ioctl(ns, cmd, (void __user *)arg, file->f_mode); } static int nvme_uring_cmd_checks(unsigned int issue_flags) @@ -716,7 +764,8 @@ int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd, } #ifdef CONFIG_NVME_MULTIPATH static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, - void __user *argp, struct nvme_ns_head *head, int srcu_idx) + void __user *argp, struct nvme_ns_head *head, int srcu_idx, + fmode_t mode) __releases(&head->srcu) { struct nvme_ctrl *ctrl = ns->ctrl; @@ -724,7 +773,7 @@ static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, nvme_get_ctrl(ns->ctrl); srcu_read_unlock(&head->srcu, srcu_idx); - ret = nvme_ctrl_ioctl(ns->ctrl, cmd, argp); + ret = nvme_ctrl_ioctl(ns->ctrl, cmd, argp, mode); nvme_put_ctrl(ctrl); return ret; @@ -749,9 +798,10 @@ int nvme_ns_head_ioctl(struct block_device *bdev, fmode_t mode, * deadlock when deleting namespaces using the passthrough interface. */ if (is_ctrl_ioctl(cmd)) - return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx); + return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx, + mode); - ret = nvme_ns_ioctl(ns, cmd, argp); + ret = nvme_ns_ioctl(ns, cmd, argp, mode); out_unlock: srcu_read_unlock(&head->srcu, srcu_idx); return ret; @@ -773,9 +823,10 @@ long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd, goto out_unlock; if (is_ctrl_ioctl(cmd)) - return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx); + return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx, + file->f_mode); - ret = nvme_ns_ioctl(ns, cmd, argp); + ret = nvme_ns_ioctl(ns, cmd, argp, file->f_mode); out_unlock: srcu_read_unlock(&head->srcu, srcu_idx); return ret; @@ -849,7 +900,8 @@ int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags) return ret; } -static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp) +static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp, + fmode_t mode) { struct nvme_ns *ns; int ret; @@ -873,7 +925,7 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp) kref_get(&ns->kref); up_read(&ctrl->namespaces_rwsem); - ret = nvme_user_cmd(ctrl, ns, argp); + ret = nvme_user_cmd(ctrl, ns, argp, mode); nvme_put_ns(ns); return ret; @@ -890,11 +942,11 @@ long nvme_dev_ioctl(struct file *file, unsigned int cmd, switch (cmd) { case NVME_IOCTL_ADMIN_CMD: - return nvme_user_cmd(ctrl, NULL, argp); + return nvme_user_cmd(ctrl, NULL, argp, file->f_mode); case NVME_IOCTL_ADMIN64_CMD: - return nvme_user_cmd64(ctrl, NULL, argp, false); + return nvme_user_cmd64(ctrl, NULL, argp, false, file->f_mode); case NVME_IOCTL_IO_CMD: - return nvme_dev_user_cmd(ctrl, argp); + return nvme_dev_user_cmd(ctrl, argp, file->f_mode); case NVME_IOCTL_RESET: if (!capable(CAP_SYS_ADMIN)) return -EACCES; diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 7e025b8948cb..c03093b6813c 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -114,6 +114,31 @@ void nvme_failover_req(struct request *req) kblockd_schedule_work(&ns->head->requeue_work); } +void nvme_mpath_start_request(struct request *rq) +{ + struct nvme_ns *ns = rq->q->queuedata; + struct gendisk *disk = ns->head->disk; + + if (!blk_queue_io_stat(disk->queue) || blk_rq_is_passthrough(rq)) + return; + + nvme_req(rq)->flags |= NVME_MPATH_IO_STATS; + nvme_req(rq)->start_time = bdev_start_io_acct(disk->part0, + blk_rq_bytes(rq) >> SECTOR_SHIFT, + req_op(rq), jiffies); +} +EXPORT_SYMBOL_GPL(nvme_mpath_start_request); + +void nvme_mpath_end_request(struct request *rq) +{ + struct nvme_ns *ns = rq->q->queuedata; + + if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS)) + return; + bdev_end_io_acct(ns->head->disk->part0, req_op(rq), + nvme_req(rq)->start_time); +} + void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; @@ -506,6 +531,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) blk_queue_flag_set(QUEUE_FLAG_NONROT, head->disk->queue); blk_queue_flag_set(QUEUE_FLAG_NOWAIT, head->disk->queue); + blk_queue_flag_set(QUEUE_FLAG_IO_STAT, head->disk->queue); /* * This assumes all controllers that refer to a namespace either * support poll queues or not. That is not a strict guarantee, diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index a29877217ee6..6bbb73ef8b25 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -162,6 +162,9 @@ struct nvme_request { u8 retries; u8 flags; u16 status; +#ifdef CONFIG_NVME_MULTIPATH + unsigned long start_time; +#endif struct nvme_ctrl *ctrl; }; @@ -173,6 +176,7 @@ struct nvme_request { enum { NVME_REQ_CANCELLED = (1 << 0), NVME_REQ_USERCMD = (1 << 1), + NVME_MPATH_IO_STATS = (1 << 2), }; static inline struct nvme_request *nvme_req(struct request *req) @@ -237,6 +241,7 @@ enum nvme_ctrl_flags { NVME_CTRL_FAILFAST_EXPIRED = 0, NVME_CTRL_ADMIN_Q_STOPPED = 1, NVME_CTRL_STARTED_ONCE = 2, + NVME_CTRL_STOPPED = 3, }; struct nvme_ctrl { @@ -336,8 +341,8 @@ struct nvme_ctrl { #ifdef CONFIG_NVME_AUTH struct work_struct dhchap_auth_work; - struct list_head dhchap_auth_list; struct mutex dhchap_auth_mutex; + struct nvme_dhchap_queue_context *dhchap_ctxs; struct nvme_dhchap_key *host_key; struct nvme_dhchap_key *ctrl_key; u16 transaction; @@ -454,6 +459,7 @@ static inline bool nvme_ns_head_multipath(struct nvme_ns_head *head) enum nvme_ns_features { NVME_NS_EXT_LBAS = 1 << 0, /* support extended LBA format */ NVME_NS_METADATA_SUPPORTED = 1 << 1, /* support getting generated md */ + NVME_NS_DEAC, /* DEAC bit in Write Zeores supported */ }; struct nvme_ns { @@ -483,11 +489,9 @@ struct nvme_ns { unsigned long features; unsigned long flags; #define NVME_NS_REMOVING 0 -#define NVME_NS_DEAD 1 #define NVME_NS_ANA_PENDING 2 #define NVME_NS_FORCE_RO 3 #define NVME_NS_READY 4 -#define NVME_NS_STOPPED 5 struct cdev cdev; struct device cdev_device; @@ -508,6 +512,9 @@ struct nvme_ctrl_ops { unsigned int flags; #define NVME_F_FABRICS (1 << 0) #define NVME_F_METADATA_SUPPORTED (1 << 1) +#define NVME_F_BLOCKING (1 << 2) + + const struct attribute_group **dev_attr_groups; int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val); int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val); int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val); @@ -728,37 +735,32 @@ void nvme_cancel_tagset(struct nvme_ctrl *ctrl); void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl); bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, enum nvme_ctrl_state new_state); -int nvme_disable_ctrl(struct nvme_ctrl *ctrl); +int nvme_disable_ctrl(struct nvme_ctrl *ctrl, bool shutdown); int nvme_enable_ctrl(struct nvme_ctrl *ctrl); -int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl); int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, const struct nvme_ctrl_ops *ops, unsigned long quirks); void nvme_uninit_ctrl(struct nvme_ctrl *ctrl); void nvme_start_ctrl(struct nvme_ctrl *ctrl); void nvme_stop_ctrl(struct nvme_ctrl *ctrl); -int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl); +int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl, bool was_suspended); int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, - const struct blk_mq_ops *ops, unsigned int flags, - unsigned int cmd_size); + const struct blk_mq_ops *ops, unsigned int cmd_size); void nvme_remove_admin_tag_set(struct nvme_ctrl *ctrl); int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, - const struct blk_mq_ops *ops, unsigned int flags, + const struct blk_mq_ops *ops, unsigned int nr_maps, unsigned int cmd_size); void nvme_remove_io_tag_set(struct nvme_ctrl *ctrl); void nvme_remove_namespaces(struct nvme_ctrl *ctrl); -int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, - bool send); - void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, volatile union nvme_result *res); -void nvme_stop_queues(struct nvme_ctrl *ctrl); -void nvme_start_queues(struct nvme_ctrl *ctrl); -void nvme_stop_admin_queue(struct nvme_ctrl *ctrl); -void nvme_start_admin_queue(struct nvme_ctrl *ctrl); -void nvme_kill_queues(struct nvme_ctrl *ctrl); +void nvme_quiesce_io_queues(struct nvme_ctrl *ctrl); +void nvme_unquiesce_io_queues(struct nvme_ctrl *ctrl); +void nvme_quiesce_admin_queue(struct nvme_ctrl *ctrl); +void nvme_unquiesce_admin_queue(struct nvme_ctrl *ctrl); +void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl); void nvme_sync_queues(struct nvme_ctrl *ctrl); void nvme_sync_io_queues(struct nvme_ctrl *ctrl); void nvme_unfreeze(struct nvme_ctrl *ctrl); @@ -857,6 +859,7 @@ int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags); extern const struct attribute_group *nvme_ns_id_attr_groups[]; extern const struct pr_ops nvme_pr_ops; extern const struct block_device_operations nvme_ns_head_ops; +extern const struct attribute_group nvme_dev_attrs_group; struct nvme_ns *nvme_find_path(struct nvme_ns_head *head); #ifdef CONFIG_NVME_MULTIPATH @@ -883,6 +886,8 @@ bool nvme_mpath_clear_current_path(struct nvme_ns *ns); void nvme_mpath_revalidate_paths(struct nvme_ns *ns); void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl); void nvme_mpath_shutdown_disk(struct nvme_ns_head *head); +void nvme_mpath_start_request(struct request *rq); +void nvme_mpath_end_request(struct request *rq); static inline void nvme_trace_bio_complete(struct request *req) { @@ -968,6 +973,12 @@ static inline void nvme_mpath_start_freeze(struct nvme_subsystem *subsys) static inline void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys) { } +static inline void nvme_mpath_start_request(struct request *rq) +{ +} +static inline void nvme_mpath_end_request(struct request *rq) +{ +} #endif /* CONFIG_NVME_MULTIPATH */ int nvme_revalidate_zones(struct nvme_ns *ns); @@ -1013,20 +1024,38 @@ static inline void nvme_hwmon_exit(struct nvme_ctrl *ctrl) } #endif +static inline void nvme_start_request(struct request *rq) +{ + if (rq->cmd_flags & REQ_NVME_MPATH) + nvme_mpath_start_request(rq); + blk_mq_start_request(rq); +} + static inline bool nvme_ctrl_sgl_supported(struct nvme_ctrl *ctrl) { return ctrl->sgls & ((1 << 0) | (1 << 1)); } #ifdef CONFIG_NVME_AUTH -void nvme_auth_init_ctrl(struct nvme_ctrl *ctrl); +int __init nvme_init_auth(void); +void __exit nvme_exit_auth(void); +int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl); void nvme_auth_stop(struct nvme_ctrl *ctrl); int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid); int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid); -void nvme_auth_reset(struct nvme_ctrl *ctrl); void nvme_auth_free(struct nvme_ctrl *ctrl); #else -static inline void nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) {}; +static inline int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) +{ + return 0; +} +static inline int __init nvme_init_auth(void) +{ + return 0; +} +static inline void __exit nvme_exit_auth(void) +{ +} static inline void nvme_auth_stop(struct nvme_ctrl *ctrl) {}; static inline int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid) { diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 488ad7dabeb8..f0f8027644bb 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -15,6 +15,7 @@ #include <linux/init.h> #include <linux/interrupt.h> #include <linux/io.h> +#include <linux/kstrtox.h> #include <linux/memremap.h> #include <linux/mm.h> #include <linux/module.h> @@ -108,7 +109,7 @@ struct nvme_dev; struct nvme_queue; static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); -static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode); +static void nvme_delete_io_queues(struct nvme_dev *dev); /* * Represents an NVM Express device. Each nvme_dev is a PCI function. @@ -130,7 +131,6 @@ struct nvme_dev { u32 db_stride; void __iomem *bar; unsigned long bar_mapped_size; - struct work_struct remove_work; struct mutex shutdown_lock; bool subsystem; u64 cmb_size; @@ -158,8 +158,6 @@ struct nvme_dev { unsigned int nr_allocated_queues; unsigned int nr_write_queues; unsigned int nr_poll_queues; - - bool attrs_added; }; static int io_queue_depth_set(const char *val, const struct kernel_param *kp) @@ -241,10 +239,13 @@ static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev) return dev->nr_allocated_queues * 8 * dev->db_stride; } -static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev) +static void nvme_dbbuf_dma_alloc(struct nvme_dev *dev) { unsigned int mem_size = nvme_dbbuf_size(dev); + if (!(dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP)) + return; + if (dev->dbbuf_dbs) { /* * Clear the dbbuf memory so the driver doesn't observe stale @@ -252,25 +253,27 @@ static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev) */ memset(dev->dbbuf_dbs, 0, mem_size); memset(dev->dbbuf_eis, 0, mem_size); - return 0; + return; } dev->dbbuf_dbs = dma_alloc_coherent(dev->dev, mem_size, &dev->dbbuf_dbs_dma_addr, GFP_KERNEL); if (!dev->dbbuf_dbs) - return -ENOMEM; + goto fail; dev->dbbuf_eis = dma_alloc_coherent(dev->dev, mem_size, &dev->dbbuf_eis_dma_addr, GFP_KERNEL); - if (!dev->dbbuf_eis) { - dma_free_coherent(dev->dev, mem_size, - dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr); - dev->dbbuf_dbs = NULL; - return -ENOMEM; - } + if (!dev->dbbuf_eis) + goto fail_free_dbbuf_dbs; + return; - return 0; +fail_free_dbbuf_dbs: + dma_free_coherent(dev->dev, mem_size, dev->dbbuf_dbs, + dev->dbbuf_dbs_dma_addr); + dev->dbbuf_dbs = NULL; +fail: + dev_warn(dev->dev, "unable to allocate dma for dbbuf\n"); } static void nvme_dbbuf_dma_free(struct nvme_dev *dev) @@ -392,18 +395,10 @@ static int nvme_pci_npages_sgl(void) PAGE_SIZE); } -static size_t nvme_pci_iod_alloc_size(void) -{ - size_t npages = max(nvme_pci_npages_prp(), nvme_pci_npages_sgl()); - - return sizeof(__le64 *) * npages + - sizeof(struct scatterlist) * NVME_MAX_SEGS; -} - static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { - struct nvme_dev *dev = data; + struct nvme_dev *dev = to_nvme_dev(data); struct nvme_queue *nvmeq = &dev->queues[0]; WARN_ON(hctx_idx != 0); @@ -416,7 +411,7 @@ static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { - struct nvme_dev *dev = data; + struct nvme_dev *dev = to_nvme_dev(data); struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1]; WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags); @@ -428,7 +423,7 @@ static int nvme_pci_init_request(struct blk_mq_tag_set *set, struct request *req, unsigned int hctx_idx, unsigned int numa_node) { - struct nvme_dev *dev = set->driver_data; + struct nvme_dev *dev = to_nvme_dev(set->driver_data); struct nvme_iod *iod = blk_mq_rq_to_pdu(req); nvme_req(req)->ctrl = &dev->ctrl; @@ -447,7 +442,7 @@ static int queue_irq_offset(struct nvme_dev *dev) static void nvme_pci_map_queues(struct blk_mq_tag_set *set) { - struct nvme_dev *dev = set->driver_data; + struct nvme_dev *dev = to_nvme_dev(set->driver_data); int i, qoff, offset; offset = queue_irq_offset(dev); @@ -914,7 +909,7 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req) goto out_unmap_data; } - blk_mq_start_request(req); + nvme_start_request(req); return BLK_STS_OK; out_unmap_data: nvme_unmap_data(dev, req); @@ -1474,24 +1469,21 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest) } } -/** - * nvme_suspend_queue - put queue into suspended state - * @nvmeq: queue to suspend - */ -static int nvme_suspend_queue(struct nvme_queue *nvmeq) +static void nvme_suspend_queue(struct nvme_dev *dev, unsigned int qid) { + struct nvme_queue *nvmeq = &dev->queues[qid]; + if (!test_and_clear_bit(NVMEQ_ENABLED, &nvmeq->flags)) - return 1; + return; /* ensure that nvme_queue_rq() sees NVMEQ_ENABLED cleared */ mb(); nvmeq->dev->online_queues--; if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q) - nvme_stop_admin_queue(&nvmeq->dev->ctrl); + nvme_quiesce_admin_queue(&nvmeq->dev->ctrl); if (!test_and_clear_bit(NVMEQ_POLLED, &nvmeq->flags)) - pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq); - return 0; + pci_free_irq(to_pci_dev(dev->dev), nvmeq->cq_vector, nvmeq); } static void nvme_suspend_io_queues(struct nvme_dev *dev) @@ -1499,19 +1491,7 @@ static void nvme_suspend_io_queues(struct nvme_dev *dev) int i; for (i = dev->ctrl.queue_count - 1; i > 0; i--) - nvme_suspend_queue(&dev->queues[i]); -} - -static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown) -{ - struct nvme_queue *nvmeq = &dev->queues[0]; - - if (shutdown) - nvme_shutdown_ctrl(&dev->ctrl); - else - nvme_disable_ctrl(&dev->ctrl); - - nvme_poll_irqdisable(nvmeq); + nvme_suspend_queue(dev, i); } /* @@ -1748,44 +1728,11 @@ static void nvme_dev_remove_admin(struct nvme_dev *dev) * user requests may be waiting on a stopped queue. Start the * queue to flush these to completion. */ - nvme_start_admin_queue(&dev->ctrl); - blk_mq_destroy_queue(dev->ctrl.admin_q); - blk_mq_free_tag_set(&dev->admin_tagset); + nvme_unquiesce_admin_queue(&dev->ctrl); + nvme_remove_admin_tag_set(&dev->ctrl); } } -static int nvme_pci_alloc_admin_tag_set(struct nvme_dev *dev) -{ - struct blk_mq_tag_set *set = &dev->admin_tagset; - - set->ops = &nvme_mq_admin_ops; - set->nr_hw_queues = 1; - - set->queue_depth = NVME_AQ_MQ_TAG_DEPTH; - set->timeout = NVME_ADMIN_TIMEOUT; - set->numa_node = dev->ctrl.numa_node; - set->cmd_size = sizeof(struct nvme_iod); - set->flags = BLK_MQ_F_NO_SCHED; - set->driver_data = dev; - - if (blk_mq_alloc_tag_set(set)) - return -ENOMEM; - dev->ctrl.admin_tagset = set; - - dev->ctrl.admin_q = blk_mq_init_queue(set); - if (IS_ERR(dev->ctrl.admin_q)) { - blk_mq_free_tag_set(set); - dev->ctrl.admin_q = NULL; - return -ENOMEM; - } - if (!blk_get_queue(dev->ctrl.admin_q)) { - nvme_dev_remove_admin(dev); - dev->ctrl.admin_q = NULL; - return -ENODEV; - } - return 0; -} - static unsigned long db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues) { return NVME_REG_DBS + ((nr_io_queues + 1) * 8 * dev->db_stride); @@ -1829,7 +1776,14 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev) (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO)) writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS); - result = nvme_disable_ctrl(&dev->ctrl); + /* + * If the device has been passed off to us in an enabled state, just + * clear the enabled bit. The spec says we should set the 'shutdown + * notification bits', but doing so may cause the device to complete + * commands to the admin queue ... and we don't know what memory that + * might be pointing at! + */ + result = nvme_disable_ctrl(&dev->ctrl, false); if (result < 0) return result; @@ -2112,6 +2066,9 @@ static int nvme_setup_host_mem(struct nvme_dev *dev) u32 enable_bits = NVME_HOST_MEM_ENABLE; int ret; + if (!dev->ctrl.hmpre) + return 0; + preferred = min(preferred, max); if (min > max) { dev_warn(dev->ctrl.device, @@ -2192,7 +2149,7 @@ static ssize_t hmb_store(struct device *dev, struct device_attribute *attr, bool new; int ret; - if (strtobool(buf, &new) < 0) + if (kstrtobool(buf, &new) < 0) return -EINVAL; if (new == ndev->hmb) @@ -2240,11 +2197,17 @@ static struct attribute *nvme_pci_attrs[] = { NULL, }; -static const struct attribute_group nvme_pci_attr_group = { +static const struct attribute_group nvme_pci_dev_attrs_group = { .attrs = nvme_pci_attrs, .is_visible = nvme_pci_attrs_are_visible, }; +static const struct attribute_group *nvme_pci_dev_attr_groups[] = { + &nvme_dev_attrs_group, + &nvme_pci_dev_attrs_group, + NULL, +}; + /* * nirqs is the number of interrupts available for write and read * queues. The core already reserved an interrupt for the admin queue. @@ -2319,12 +2282,6 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues) PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd); } -static void nvme_disable_io_queues(struct nvme_dev *dev) -{ - if (__nvme_disable_io_queues(dev, nvme_admin_delete_sq)) - __nvme_disable_io_queues(dev, nvme_admin_delete_cq); -} - static unsigned int nvme_max_io_queues(struct nvme_dev *dev) { /* @@ -2432,7 +2389,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) if (dev->online_queues - 1 < dev->max_qid) { nr_io_queues = dev->online_queues - 1; - nvme_disable_io_queues(dev); + nvme_delete_io_queues(dev); result = nvme_setup_io_queues_trylock(dev); if (result) return result; @@ -2495,7 +2452,7 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) return 0; } -static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode) +static bool __nvme_delete_io_queues(struct nvme_dev *dev, u8 opcode) { int nr_queues = dev->online_queues - 1, sent = 0; unsigned long timeout; @@ -2523,40 +2480,19 @@ static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode) return true; } -static void nvme_pci_alloc_tag_set(struct nvme_dev *dev) +static void nvme_delete_io_queues(struct nvme_dev *dev) { - struct blk_mq_tag_set * set = &dev->tagset; - int ret; + if (__nvme_delete_io_queues(dev, nvme_admin_delete_sq)) + __nvme_delete_io_queues(dev, nvme_admin_delete_cq); +} - set->ops = &nvme_mq_ops; - set->nr_hw_queues = dev->online_queues - 1; - set->nr_maps = 1; - if (dev->io_queues[HCTX_TYPE_READ]) - set->nr_maps = 2; +static unsigned int nvme_pci_nr_maps(struct nvme_dev *dev) +{ if (dev->io_queues[HCTX_TYPE_POLL]) - set->nr_maps = 3; - set->timeout = NVME_IO_TIMEOUT; - set->numa_node = dev->ctrl.numa_node; - set->queue_depth = min_t(unsigned, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1; - set->cmd_size = sizeof(struct nvme_iod); - set->flags = BLK_MQ_F_SHOULD_MERGE; - set->driver_data = dev; - - /* - * Some Apple controllers requires tags to be unique - * across admin and IO queue, so reserve the first 32 - * tags of the IO queue. - */ - if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) - set->reserved_tags = NVME_AQ_DEPTH; - - ret = blk_mq_alloc_tag_set(set); - if (ret) { - dev_warn(dev->ctrl.device, - "IO queues tagset allocation failed %d\n", ret); - return; - } - dev->ctrl.tagset = set; + return 3; + if (dev->io_queues[HCTX_TYPE_READ]) + return 2; + return 1; } static void nvme_pci_update_nr_queues(struct nvme_dev *dev) @@ -2647,7 +2583,8 @@ static int nvme_pci_enable(struct nvme_dev *dev) pci_enable_pcie_error_reporting(pdev); pci_save_state(pdev); - return 0; + + return nvme_pci_configure_admin_queue(dev); disable: pci_disable_device(pdev); @@ -2661,57 +2598,53 @@ static void nvme_dev_unmap(struct nvme_dev *dev) pci_release_mem_regions(to_pci_dev(dev->dev)); } -static void nvme_pci_disable(struct nvme_dev *dev) +static bool nvme_pci_ctrl_is_dead(struct nvme_dev *dev) { struct pci_dev *pdev = to_pci_dev(dev->dev); + u32 csts; - pci_free_irq_vectors(pdev); + if (!pci_is_enabled(pdev) || !pci_device_is_present(pdev)) + return true; + if (pdev->error_state != pci_channel_io_normal) + return true; - if (pci_is_enabled(pdev)) { - pci_disable_pcie_error_reporting(pdev); - pci_disable_device(pdev); - } + csts = readl(dev->bar + NVME_REG_CSTS); + return (csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY); } static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) { - bool dead = true, freeze = false; struct pci_dev *pdev = to_pci_dev(dev->dev); + bool dead; mutex_lock(&dev->shutdown_lock); - if (pci_is_enabled(pdev)) { - u32 csts; - - if (pci_device_is_present(pdev)) - csts = readl(dev->bar + NVME_REG_CSTS); - else - csts = ~0; - - if (dev->ctrl.state == NVME_CTRL_LIVE || - dev->ctrl.state == NVME_CTRL_RESETTING) { - freeze = true; + dead = nvme_pci_ctrl_is_dead(dev); + if (dev->ctrl.state == NVME_CTRL_LIVE || + dev->ctrl.state == NVME_CTRL_RESETTING) { + if (pci_is_enabled(pdev)) nvme_start_freeze(&dev->ctrl); - } - dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) || - pdev->error_state != pci_channel_io_normal); + /* + * Give the controller a chance to complete all entered requests + * if doing a safe shutdown. + */ + if (!dead && shutdown) + nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT); } - /* - * Give the controller a chance to complete all entered requests if - * doing a safe shutdown. - */ - if (!dead && shutdown && freeze) - nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT); - - nvme_stop_queues(&dev->ctrl); + nvme_quiesce_io_queues(&dev->ctrl); if (!dead && dev->ctrl.queue_count > 0) { - nvme_disable_io_queues(dev); - nvme_disable_admin_queue(dev, shutdown); + nvme_delete_io_queues(dev); + nvme_disable_ctrl(&dev->ctrl, shutdown); + nvme_poll_irqdisable(&dev->queues[0]); } nvme_suspend_io_queues(dev); - nvme_suspend_queue(&dev->queues[0]); - nvme_pci_disable(dev); + nvme_suspend_queue(dev, 0); + pci_free_irq_vectors(pdev); + if (pci_is_enabled(pdev)) { + pci_disable_pcie_error_reporting(pdev); + pci_disable_device(pdev); + } nvme_reap_pending_cqes(dev); nvme_cancel_tagset(&dev->ctrl); @@ -2723,9 +2656,9 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) * deadlocking blk-mq hot-cpu notifier. */ if (shutdown) { - nvme_start_queues(&dev->ctrl); + nvme_unquiesce_io_queues(&dev->ctrl); if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) - nvme_start_admin_queue(&dev->ctrl); + nvme_unquiesce_admin_queue(&dev->ctrl); } mutex_unlock(&dev->shutdown_lock); } @@ -2762,42 +2695,40 @@ static void nvme_release_prp_pools(struct nvme_dev *dev) dma_pool_destroy(dev->prp_small_pool); } +static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev) +{ + size_t npages = max(nvme_pci_npages_prp(), nvme_pci_npages_sgl()); + size_t alloc_size = sizeof(__le64 *) * npages + + sizeof(struct scatterlist) * NVME_MAX_SEGS; + + WARN_ON_ONCE(alloc_size > PAGE_SIZE); + dev->iod_mempool = mempool_create_node(1, + mempool_kmalloc, mempool_kfree, + (void *)alloc_size, GFP_KERNEL, + dev_to_node(dev->dev)); + if (!dev->iod_mempool) + return -ENOMEM; + return 0; +} + static void nvme_free_tagset(struct nvme_dev *dev) { if (dev->tagset.tags) - blk_mq_free_tag_set(&dev->tagset); + nvme_remove_io_tag_set(&dev->ctrl); dev->ctrl.tagset = NULL; } +/* pairs with nvme_pci_alloc_dev */ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) { struct nvme_dev *dev = to_nvme_dev(ctrl); - nvme_dbbuf_dma_free(dev); nvme_free_tagset(dev); - if (dev->ctrl.admin_q) - blk_put_queue(dev->ctrl.admin_q); - free_opal_dev(dev->ctrl.opal_dev); - mempool_destroy(dev->iod_mempool); put_device(dev->dev); kfree(dev->queues); kfree(dev); } -static void nvme_remove_dead_ctrl(struct nvme_dev *dev) -{ - /* - * Set state to deleting now to avoid blocking nvme_wait_reset(), which - * may be holding this pci_dev's device lock. - */ - nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); - nvme_get_ctrl(&dev->ctrl); - nvme_dev_disable(dev, false); - nvme_kill_queues(&dev->ctrl); - if (!queue_work(nvme_wq, &dev->remove_work)) - nvme_put_ctrl(&dev->ctrl); -} - static void nvme_reset_work(struct work_struct *work) { struct nvme_dev *dev = @@ -2808,8 +2739,7 @@ static void nvme_reset_work(struct work_struct *work) if (dev->ctrl.state != NVME_CTRL_RESETTING) { dev_warn(dev->ctrl.device, "ctrl state %d is not RESETTING\n", dev->ctrl.state); - result = -ENODEV; - goto out; + return; } /* @@ -2824,34 +2754,7 @@ static void nvme_reset_work(struct work_struct *work) result = nvme_pci_enable(dev); if (result) goto out_unlock; - - result = nvme_pci_configure_admin_queue(dev); - if (result) - goto out_unlock; - - if (!dev->ctrl.admin_q) { - result = nvme_pci_alloc_admin_tag_set(dev); - if (result) - goto out_unlock; - } else { - nvme_start_admin_queue(&dev->ctrl); - } - - dma_set_min_align_mask(dev->dev, NVME_CTRL_PAGE_SIZE - 1); - - /* - * Limit the max command size to prevent iod->sg allocations going - * over a single page. - */ - dev->ctrl.max_hw_sectors = min_t(u32, - NVME_MAX_KB_SZ << 1, dma_max_mapping_size(dev->dev) >> 9); - dev->ctrl.max_segments = NVME_MAX_SEGS; - - /* - * Don't limit the IOMMU merged segment size. - */ - dma_set_max_seg_size(dev->dev, 0xffffffff); - + nvme_unquiesce_admin_queue(&dev->ctrl); mutex_unlock(&dev->shutdown_lock); /* @@ -2865,62 +2768,37 @@ static void nvme_reset_work(struct work_struct *work) goto out; } - /* - * We do not support an SGL for metadata (yet), so we are limited to a - * single integrity segment for the separate metadata pointer. - */ - dev->ctrl.max_integrity_segments = 1; - - result = nvme_init_ctrl_finish(&dev->ctrl); + result = nvme_init_ctrl_finish(&dev->ctrl, was_suspend); if (result) goto out; - if (dev->ctrl.oacs & NVME_CTRL_OACS_SEC_SUPP) { - if (!dev->ctrl.opal_dev) - dev->ctrl.opal_dev = - init_opal_dev(&dev->ctrl, &nvme_sec_submit); - else if (was_suspend) - opal_unlock_from_suspend(dev->ctrl.opal_dev); - } else { - free_opal_dev(dev->ctrl.opal_dev); - dev->ctrl.opal_dev = NULL; - } - - if (dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP) { - result = nvme_dbbuf_dma_alloc(dev); - if (result) - dev_warn(dev->dev, - "unable to allocate dma for dbbuf\n"); - } + nvme_dbbuf_dma_alloc(dev); - if (dev->ctrl.hmpre) { - result = nvme_setup_host_mem(dev); - if (result < 0) - goto out; - } + result = nvme_setup_host_mem(dev); + if (result < 0) + goto out; result = nvme_setup_io_queues(dev); if (result) goto out; /* - * Keep the controller around but remove all namespaces if we don't have - * any working I/O queue. + * Freeze and update the number of I/O queues as thos might have + * changed. If there are no I/O queues left after this reset, keep the + * controller around but remove all namespaces. */ - if (dev->online_queues < 2) { - dev_warn(dev->ctrl.device, "IO queues not created\n"); - nvme_kill_queues(&dev->ctrl); - nvme_remove_namespaces(&dev->ctrl); - nvme_free_tagset(dev); - } else { - nvme_start_queues(&dev->ctrl); + if (dev->online_queues > 1) { + nvme_unquiesce_io_queues(&dev->ctrl); nvme_wait_freeze(&dev->ctrl); - if (!dev->ctrl.tagset) - nvme_pci_alloc_tag_set(dev); - else - nvme_pci_update_nr_queues(dev); + nvme_pci_update_nr_queues(dev); nvme_dbbuf_set(dev); nvme_unfreeze(&dev->ctrl); + } else { + dev_warn(dev->ctrl.device, "IO queues lost\n"); + nvme_mark_namespaces_dead(&dev->ctrl); + nvme_unquiesce_io_queues(&dev->ctrl); + nvme_remove_namespaces(&dev->ctrl); + nvme_free_tagset(dev); } /* @@ -2934,30 +2812,22 @@ static void nvme_reset_work(struct work_struct *work) goto out; } - if (!dev->attrs_added && !sysfs_create_group(&dev->ctrl.device->kobj, - &nvme_pci_attr_group)) - dev->attrs_added = true; - nvme_start_ctrl(&dev->ctrl); return; out_unlock: mutex_unlock(&dev->shutdown_lock); out: - if (result) - dev_warn(dev->ctrl.device, - "Removing after probe failure status: %d\n", result); - nvme_remove_dead_ctrl(dev); -} - -static void nvme_remove_dead_ctrl_work(struct work_struct *work) -{ - struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work); - struct pci_dev *pdev = to_pci_dev(dev->dev); - - if (pci_get_drvdata(pdev)) - device_release_driver(&pdev->dev); - nvme_put_ctrl(&dev->ctrl); + /* + * Set state to deleting now to avoid blocking nvme_wait_reset(), which + * may be holding this pci_dev's device lock. + */ + dev_warn(dev->ctrl.device, "Disabling device after reset failure: %d\n", + result); + nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); + nvme_dev_disable(dev, true); + nvme_mark_namespaces_dead(&dev->ctrl); + nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD); } static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val) @@ -3010,6 +2880,7 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { .name = "pcie", .module = THIS_MODULE, .flags = NVME_F_METADATA_SUPPORTED, + .dev_attr_groups = nvme_pci_dev_attr_groups, .reg_read32 = nvme_pci_reg_read32, .reg_write32 = nvme_pci_reg_write32, .reg_read64 = nvme_pci_reg_read64, @@ -3079,29 +2950,22 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev) return 0; } -static void nvme_async_probe(void *data, async_cookie_t cookie) +static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev, + const struct pci_device_id *id) { - struct nvme_dev *dev = data; - - flush_work(&dev->ctrl.reset_work); - flush_work(&dev->ctrl.scan_work); - nvme_put_ctrl(&dev->ctrl); -} - -static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) -{ - int node, result = -ENOMEM; - struct nvme_dev *dev; unsigned long quirks = id->driver_data; - size_t alloc_size; + int node = dev_to_node(&pdev->dev); + struct nvme_dev *dev; + int ret = -ENOMEM; - node = dev_to_node(&pdev->dev); if (node == NUMA_NO_NODE) set_dev_node(&pdev->dev, first_memory_node); dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node); if (!dev) - return -ENOMEM; + return NULL; + INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work); + mutex_init(&dev->shutdown_lock); dev->nr_write_queues = write_queues; dev->nr_poll_queues = poll_queues; @@ -3109,25 +2973,11 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) dev->queues = kcalloc_node(dev->nr_allocated_queues, sizeof(struct nvme_queue), GFP_KERNEL, node); if (!dev->queues) - goto free; + goto out_free_dev; dev->dev = get_device(&pdev->dev); - pci_set_drvdata(pdev, dev); - - result = nvme_dev_map(dev); - if (result) - goto put_pci; - - INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work); - INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work); - mutex_init(&dev->shutdown_lock); - - result = nvme_setup_prp_pools(dev); - if (result) - goto unmap; quirks |= check_vendor_combination_bug(pdev); - if (!noacpi && acpi_storage_d3(&pdev->dev)) { /* * Some systems use a bios work around to ask for D3 on @@ -3137,46 +2987,131 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) "platform quirk: setting simple suspend\n"); quirks |= NVME_QUIRK_SIMPLE_SUSPEND; } + ret = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops, + quirks); + if (ret) + goto out_put_device; + + dma_set_min_align_mask(&pdev->dev, NVME_CTRL_PAGE_SIZE - 1); + dma_set_max_seg_size(&pdev->dev, 0xffffffff); /* - * Double check that our mempool alloc size will cover the biggest - * command we support. + * Limit the max command size to prevent iod->sg allocations going + * over a single page. */ - alloc_size = nvme_pci_iod_alloc_size(); - WARN_ON_ONCE(alloc_size > PAGE_SIZE); + dev->ctrl.max_hw_sectors = min_t(u32, + NVME_MAX_KB_SZ << 1, dma_max_mapping_size(&pdev->dev) >> 9); + dev->ctrl.max_segments = NVME_MAX_SEGS; - dev->iod_mempool = mempool_create_node(1, mempool_kmalloc, - mempool_kfree, - (void *) alloc_size, - GFP_KERNEL, node); - if (!dev->iod_mempool) { - result = -ENOMEM; - goto release_pools; - } + /* + * There is no support for SGLs for metadata (yet), so we are limited to + * a single integrity segment for the separate metadata pointer. + */ + dev->ctrl.max_integrity_segments = 1; + return dev; + +out_put_device: + put_device(dev->dev); + kfree(dev->queues); +out_free_dev: + kfree(dev); + return ERR_PTR(ret); +} - result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops, - quirks); +static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct nvme_dev *dev; + int result = -ENOMEM; + + dev = nvme_pci_alloc_dev(pdev, id); + if (!dev) + return -ENOMEM; + + result = nvme_dev_map(dev); if (result) - goto release_mempool; + goto out_uninit_ctrl; + + result = nvme_setup_prp_pools(dev); + if (result) + goto out_dev_unmap; + + result = nvme_pci_alloc_iod_mempool(dev); + if (result) + goto out_release_prp_pools; dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev)); - nvme_reset_ctrl(&dev->ctrl); - async_schedule(nvme_async_probe, dev); + result = nvme_pci_enable(dev); + if (result) + goto out_release_iod_mempool; + + result = nvme_alloc_admin_tag_set(&dev->ctrl, &dev->admin_tagset, + &nvme_mq_admin_ops, sizeof(struct nvme_iod)); + if (result) + goto out_disable; + + /* + * Mark the controller as connecting before sending admin commands to + * allow the timeout handler to do the right thing. + */ + if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) { + dev_warn(dev->ctrl.device, + "failed to mark controller CONNECTING\n"); + result = -EBUSY; + goto out_disable; + } + + result = nvme_init_ctrl_finish(&dev->ctrl, false); + if (result) + goto out_disable; + + nvme_dbbuf_dma_alloc(dev); + + result = nvme_setup_host_mem(dev); + if (result < 0) + goto out_disable; + + result = nvme_setup_io_queues(dev); + if (result) + goto out_disable; + if (dev->online_queues > 1) { + nvme_alloc_io_tag_set(&dev->ctrl, &dev->tagset, &nvme_mq_ops, + nvme_pci_nr_maps(dev), sizeof(struct nvme_iod)); + nvme_dbbuf_set(dev); + } + + if (!dev->ctrl.tagset) + dev_warn(dev->ctrl.device, "IO queues not created\n"); + + if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) { + dev_warn(dev->ctrl.device, + "failed to mark controller live state\n"); + result = -ENODEV; + goto out_disable; + } + + pci_set_drvdata(pdev, dev); + + nvme_start_ctrl(&dev->ctrl); + nvme_put_ctrl(&dev->ctrl); return 0; - release_mempool: +out_disable: + nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); + nvme_dev_disable(dev, true); + nvme_free_host_mem(dev); + nvme_dev_remove_admin(dev); + nvme_dbbuf_dma_free(dev); + nvme_free_queues(dev, 0); +out_release_iod_mempool: mempool_destroy(dev->iod_mempool); - release_pools: +out_release_prp_pools: nvme_release_prp_pools(dev); - unmap: +out_dev_unmap: nvme_dev_unmap(dev); - put_pci: - put_device(dev->dev); - free: - kfree(dev->queues); - kfree(dev); +out_uninit_ctrl: + nvme_uninit_ctrl(&dev->ctrl); return result; } @@ -3208,13 +3143,6 @@ static void nvme_shutdown(struct pci_dev *pdev) nvme_disable_prepare_reset(dev, true); } -static void nvme_remove_attrs(struct nvme_dev *dev) -{ - if (dev->attrs_added) - sysfs_remove_group(&dev->ctrl.device->kobj, - &nvme_pci_attr_group); -} - /* * The driver's remove may be called on a device in a partially initialized * state. This function must not have any dependencies on the device state in @@ -3236,10 +3164,11 @@ static void nvme_remove(struct pci_dev *pdev) nvme_stop_ctrl(&dev->ctrl); nvme_remove_namespaces(&dev->ctrl); nvme_dev_disable(dev, true); - nvme_remove_attrs(dev); nvme_free_host_mem(dev); nvme_dev_remove_admin(dev); + nvme_dbbuf_dma_free(dev); nvme_free_queues(dev, 0); + mempool_destroy(dev->iod_mempool); nvme_release_prp_pools(dev); nvme_dev_unmap(dev); nvme_uninit_ctrl(&dev->ctrl); @@ -3576,11 +3505,12 @@ static struct pci_driver nvme_driver = { .probe = nvme_probe, .remove = nvme_remove, .shutdown = nvme_shutdown, -#ifdef CONFIG_PM_SLEEP .driver = { - .pm = &nvme_dev_pm_ops, - }, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, +#ifdef CONFIG_PM_SLEEP + .pm = &nvme_dev_pm_ops, #endif + }, .sriov_configure = pci_sriov_configure_simple, .err_handler = &nvme_err_handler, }; diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 6e079abb22ee..bbad26b82b56 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -798,7 +798,9 @@ static int nvme_rdma_alloc_tag_set(struct nvme_ctrl *ctrl) NVME_RDMA_METADATA_SGL_SIZE; return nvme_alloc_io_tag_set(ctrl, &to_rdma_ctrl(ctrl)->tag_set, - &nvme_rdma_mq_ops, BLK_MQ_F_SHOULD_MERGE, cmd_size); + &nvme_rdma_mq_ops, + ctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2, + cmd_size); } static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl) @@ -846,7 +848,6 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, if (new) { error = nvme_alloc_admin_tag_set(&ctrl->ctrl, &ctrl->admin_tag_set, &nvme_rdma_admin_mq_ops, - BLK_MQ_F_NO_SCHED, sizeof(struct nvme_rdma_request) + NVME_RDMA_DATA_SGL_SIZE); if (error) @@ -869,16 +870,16 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, else ctrl->ctrl.max_integrity_segments = 0; - nvme_start_admin_queue(&ctrl->ctrl); + nvme_unquiesce_admin_queue(&ctrl->ctrl); - error = nvme_init_ctrl_finish(&ctrl->ctrl); + error = nvme_init_ctrl_finish(&ctrl->ctrl, false); if (error) goto out_quiesce_queue; return 0; out_quiesce_queue: - nvme_stop_admin_queue(&ctrl->ctrl); + nvme_quiesce_admin_queue(&ctrl->ctrl); blk_sync_queue(ctrl->ctrl.admin_q); out_stop_queue: nvme_rdma_stop_queue(&ctrl->queues[0]); @@ -922,7 +923,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) goto out_cleanup_tagset; if (!new) { - nvme_start_queues(&ctrl->ctrl); + nvme_unquiesce_io_queues(&ctrl->ctrl); if (!nvme_wait_freeze_timeout(&ctrl->ctrl, NVME_IO_TIMEOUT)) { /* * If we timed out waiting for freeze we are likely to @@ -949,7 +950,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) return 0; out_wait_freeze_timed_out: - nvme_stop_queues(&ctrl->ctrl); + nvme_quiesce_io_queues(&ctrl->ctrl); nvme_sync_io_queues(&ctrl->ctrl); nvme_rdma_stop_io_queues(ctrl); out_cleanup_tagset: @@ -964,12 +965,12 @@ out_free_io_queues: static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl, bool remove) { - nvme_stop_admin_queue(&ctrl->ctrl); + nvme_quiesce_admin_queue(&ctrl->ctrl); blk_sync_queue(ctrl->ctrl.admin_q); nvme_rdma_stop_queue(&ctrl->queues[0]); nvme_cancel_admin_tagset(&ctrl->ctrl); if (remove) { - nvme_start_admin_queue(&ctrl->ctrl); + nvme_unquiesce_admin_queue(&ctrl->ctrl); nvme_remove_admin_tag_set(&ctrl->ctrl); } nvme_rdma_destroy_admin_queue(ctrl); @@ -980,12 +981,12 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl, { if (ctrl->ctrl.queue_count > 1) { nvme_start_freeze(&ctrl->ctrl); - nvme_stop_queues(&ctrl->ctrl); + nvme_quiesce_io_queues(&ctrl->ctrl); nvme_sync_io_queues(&ctrl->ctrl); nvme_rdma_stop_io_queues(ctrl); nvme_cancel_tagset(&ctrl->ctrl); if (remove) { - nvme_start_queues(&ctrl->ctrl); + nvme_unquiesce_io_queues(&ctrl->ctrl); nvme_remove_io_tag_set(&ctrl->ctrl); } nvme_rdma_free_io_queues(ctrl); @@ -1106,7 +1107,7 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new) destroy_io: if (ctrl->ctrl.queue_count > 1) { - nvme_stop_queues(&ctrl->ctrl); + nvme_quiesce_io_queues(&ctrl->ctrl); nvme_sync_io_queues(&ctrl->ctrl); nvme_rdma_stop_io_queues(ctrl); nvme_cancel_tagset(&ctrl->ctrl); @@ -1115,7 +1116,7 @@ destroy_io: nvme_rdma_free_io_queues(ctrl); } destroy_admin: - nvme_stop_admin_queue(&ctrl->ctrl); + nvme_quiesce_admin_queue(&ctrl->ctrl); blk_sync_queue(ctrl->ctrl.admin_q); nvme_rdma_stop_queue(&ctrl->queues[0]); nvme_cancel_admin_tagset(&ctrl->ctrl); @@ -1153,13 +1154,13 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) struct nvme_rdma_ctrl *ctrl = container_of(work, struct nvme_rdma_ctrl, err_work); - nvme_auth_stop(&ctrl->ctrl); nvme_stop_keep_alive(&ctrl->ctrl); flush_work(&ctrl->ctrl.async_event_work); nvme_rdma_teardown_io_queues(ctrl, false); - nvme_start_queues(&ctrl->ctrl); + nvme_unquiesce_io_queues(&ctrl->ctrl); nvme_rdma_teardown_admin_queue(ctrl, false); - nvme_start_admin_queue(&ctrl->ctrl); + nvme_unquiesce_admin_queue(&ctrl->ctrl); + nvme_auth_stop(&ctrl->ctrl); if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { /* state change failure is ok if we started ctrl delete */ @@ -2040,7 +2041,7 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, if (ret) goto unmap_qe; - blk_mq_start_request(rq); + nvme_start_request(rq); if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) && queue->pi_support && @@ -2207,11 +2208,8 @@ static const struct blk_mq_ops nvme_rdma_admin_mq_ops = { static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) { nvme_rdma_teardown_io_queues(ctrl, shutdown); - nvme_stop_admin_queue(&ctrl->ctrl); - if (shutdown) - nvme_shutdown_ctrl(&ctrl->ctrl); - else - nvme_disable_ctrl(&ctrl->ctrl); + nvme_quiesce_admin_queue(&ctrl->ctrl); + nvme_disable_ctrl(&ctrl->ctrl, shutdown); nvme_rdma_teardown_admin_queue(ctrl, shutdown); } diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 3d13f6f08388..b69b89166b6b 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1867,7 +1867,7 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) if (new) { ret = nvme_alloc_io_tag_set(ctrl, &to_tcp_ctrl(ctrl)->tag_set, &nvme_tcp_mq_ops, - BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING, + ctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2, sizeof(struct nvme_tcp_request)); if (ret) goto out_free_io_queues; @@ -1884,7 +1884,7 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) goto out_cleanup_connect_q; if (!new) { - nvme_start_queues(ctrl); + nvme_unquiesce_io_queues(ctrl); if (!nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT)) { /* * If we timed out waiting for freeze we are likely to @@ -1911,7 +1911,7 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) return 0; out_wait_freeze_timed_out: - nvme_stop_queues(ctrl); + nvme_quiesce_io_queues(ctrl); nvme_sync_io_queues(ctrl); nvme_tcp_stop_io_queues(ctrl); out_cleanup_connect_q: @@ -1942,7 +1942,7 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) if (new) { error = nvme_alloc_admin_tag_set(ctrl, &to_tcp_ctrl(ctrl)->admin_tag_set, - &nvme_tcp_admin_mq_ops, BLK_MQ_F_BLOCKING, + &nvme_tcp_admin_mq_ops, sizeof(struct nvme_tcp_request)); if (error) goto out_free_queue; @@ -1956,16 +1956,16 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) if (error) goto out_stop_queue; - nvme_start_admin_queue(ctrl); + nvme_unquiesce_admin_queue(ctrl); - error = nvme_init_ctrl_finish(ctrl); + error = nvme_init_ctrl_finish(ctrl, false); if (error) goto out_quiesce_queue; return 0; out_quiesce_queue: - nvme_stop_admin_queue(ctrl); + nvme_quiesce_admin_queue(ctrl); blk_sync_queue(ctrl->admin_q); out_stop_queue: nvme_tcp_stop_queue(ctrl, 0); @@ -1981,12 +1981,12 @@ out_free_queue: static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl, bool remove) { - nvme_stop_admin_queue(ctrl); + nvme_quiesce_admin_queue(ctrl); blk_sync_queue(ctrl->admin_q); nvme_tcp_stop_queue(ctrl, 0); nvme_cancel_admin_tagset(ctrl); if (remove) - nvme_start_admin_queue(ctrl); + nvme_unquiesce_admin_queue(ctrl); nvme_tcp_destroy_admin_queue(ctrl, remove); } @@ -1995,14 +1995,14 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl, { if (ctrl->queue_count <= 1) return; - nvme_stop_admin_queue(ctrl); + nvme_quiesce_admin_queue(ctrl); nvme_start_freeze(ctrl); - nvme_stop_queues(ctrl); + nvme_quiesce_io_queues(ctrl); nvme_sync_io_queues(ctrl); nvme_tcp_stop_io_queues(ctrl); nvme_cancel_tagset(ctrl); if (remove) - nvme_start_queues(ctrl); + nvme_unquiesce_io_queues(ctrl); nvme_tcp_destroy_io_queues(ctrl, remove); } @@ -2083,14 +2083,14 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new) destroy_io: if (ctrl->queue_count > 1) { - nvme_stop_queues(ctrl); + nvme_quiesce_io_queues(ctrl); nvme_sync_io_queues(ctrl); nvme_tcp_stop_io_queues(ctrl); nvme_cancel_tagset(ctrl); nvme_tcp_destroy_io_queues(ctrl, new); } destroy_admin: - nvme_stop_admin_queue(ctrl); + nvme_quiesce_admin_queue(ctrl); blk_sync_queue(ctrl->admin_q); nvme_tcp_stop_queue(ctrl, 0); nvme_cancel_admin_tagset(ctrl); @@ -2128,14 +2128,14 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work) struct nvme_tcp_ctrl, err_work); struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl; - nvme_auth_stop(ctrl); nvme_stop_keep_alive(ctrl); flush_work(&ctrl->async_event_work); nvme_tcp_teardown_io_queues(ctrl, false); /* unquiesce to fail fast pending requests */ - nvme_start_queues(ctrl); + nvme_unquiesce_io_queues(ctrl); nvme_tcp_teardown_admin_queue(ctrl, false); - nvme_start_admin_queue(ctrl); + nvme_unquiesce_admin_queue(ctrl); + nvme_auth_stop(ctrl); if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) { /* state change failure is ok if we started ctrl delete */ @@ -2150,11 +2150,8 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work) static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown) { nvme_tcp_teardown_io_queues(ctrl, shutdown); - nvme_stop_admin_queue(ctrl); - if (shutdown) - nvme_shutdown_ctrl(ctrl); - else - nvme_disable_ctrl(ctrl); + nvme_quiesce_admin_queue(ctrl); + nvme_disable_ctrl(ctrl, shutdown); nvme_tcp_teardown_admin_queue(ctrl, shutdown); } @@ -2414,7 +2411,7 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx, if (unlikely(ret)) return ret; - blk_mq_start_request(rq); + nvme_start_request(rq); nvme_tcp_queue_request(req, true, bd->last); @@ -2523,7 +2520,7 @@ static const struct blk_mq_ops nvme_tcp_admin_mq_ops = { static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = { .name = "tcp", .module = THIS_MODULE, - .flags = NVME_F_FABRICS, + .flags = NVME_F_FABRICS | NVME_F_BLOCKING, .reg_read32 = nvmf_reg_read32, .reg_read64 = nvmf_reg_read64, .reg_write32 = nvmf_reg_write32, diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index c8a061ce3ee5..53a004ea320c 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -370,7 +370,9 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) memcpy_and_pad(id->mn, sizeof(id->mn), subsys->model_number, strlen(subsys->model_number), ' '); memcpy_and_pad(id->fr, sizeof(id->fr), - UTS_RELEASE, strlen(UTS_RELEASE), ' '); + subsys->firmware_rev, strlen(subsys->firmware_rev), ' '); + + put_unaligned_le24(subsys->ieee_oui, id->ieee); id->rab = 6; @@ -379,11 +381,6 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) else id->cntrltype = NVME_CTRL_IO; - /* - * XXX: figure out how we can assign a IEEE OUI, but until then - * the safest is to leave it as zeroes. - */ - /* we support multiple ports, multiples hosts and ANA: */ id->cmic = NVME_CTRL_CMIC_MULTI_PORT | NVME_CTRL_CMIC_MULTI_CTRL | NVME_CTRL_CMIC_ANA; @@ -564,7 +561,7 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req) } if (req->ns->readonly) - id->nsattr |= (1 << 0); + id->nsattr |= NVME_NS_ATTR_RO; done: if (!status) status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 6a2816f3b4e8..907143870da5 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -4,6 +4,7 @@ * Copyright (c) 2015-2016 HGST, a Western Digital Company. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/kstrtox.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/slab.h> @@ -267,7 +268,7 @@ static ssize_t nvmet_param_pi_enable_store(struct config_item *item, struct nvmet_port *port = to_nvmet_port(item); bool val; - if (strtobool(page, &val)) + if (kstrtobool(page, &val)) return -EINVAL; if (nvmet_is_port_enabled(port, __func__)) @@ -532,7 +533,7 @@ static ssize_t nvmet_ns_enable_store(struct config_item *item, bool enable; int ret = 0; - if (strtobool(page, &enable)) + if (kstrtobool(page, &enable)) return -EINVAL; if (enable) @@ -556,7 +557,7 @@ static ssize_t nvmet_ns_buffered_io_store(struct config_item *item, struct nvmet_ns *ns = to_nvmet_ns(item); bool val; - if (strtobool(page, &val)) + if (kstrtobool(page, &val)) return -EINVAL; mutex_lock(&ns->subsys->lock); @@ -579,7 +580,7 @@ static ssize_t nvmet_ns_revalidate_size_store(struct config_item *item, struct nvmet_ns *ns = to_nvmet_ns(item); bool val; - if (strtobool(page, &val)) + if (kstrtobool(page, &val)) return -EINVAL; if (!val) @@ -728,7 +729,7 @@ static ssize_t nvmet_passthru_enable_store(struct config_item *item, bool enable; int ret = 0; - if (strtobool(page, &enable)) + if (kstrtobool(page, &enable)) return -EINVAL; if (enable) @@ -995,7 +996,7 @@ static ssize_t nvmet_subsys_attr_allow_any_host_store(struct config_item *item, bool allow_any_host; int ret = 0; - if (strtobool(page, &allow_any_host)) + if (kstrtobool(page, &allow_any_host)) return -EINVAL; down_write(&nvmet_config_sem); @@ -1262,6 +1263,116 @@ static ssize_t nvmet_subsys_attr_model_store(struct config_item *item, } CONFIGFS_ATTR(nvmet_subsys_, attr_model); +static ssize_t nvmet_subsys_attr_ieee_oui_show(struct config_item *item, + char *page) +{ + struct nvmet_subsys *subsys = to_subsys(item); + + return sysfs_emit(page, "0x%06x\n", subsys->ieee_oui); +} + +static ssize_t nvmet_subsys_attr_ieee_oui_store_locked(struct nvmet_subsys *subsys, + const char *page, size_t count) +{ + uint32_t val = 0; + int ret; + + if (subsys->subsys_discovered) { + pr_err("Can't set IEEE OUI. 0x%06x is already assigned\n", + subsys->ieee_oui); + return -EINVAL; + } + + ret = kstrtou32(page, 0, &val); + if (ret < 0) + return ret; + + if (val >= 0x1000000) + return -EINVAL; + + subsys->ieee_oui = val; + + return count; +} + +static ssize_t nvmet_subsys_attr_ieee_oui_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_subsys *subsys = to_subsys(item); + ssize_t ret; + + down_write(&nvmet_config_sem); + mutex_lock(&subsys->lock); + ret = nvmet_subsys_attr_ieee_oui_store_locked(subsys, page, count); + mutex_unlock(&subsys->lock); + up_write(&nvmet_config_sem); + + return ret; +} +CONFIGFS_ATTR(nvmet_subsys_, attr_ieee_oui); + +static ssize_t nvmet_subsys_attr_firmware_show(struct config_item *item, + char *page) +{ + struct nvmet_subsys *subsys = to_subsys(item); + + return sysfs_emit(page, "%s\n", subsys->firmware_rev); +} + +static ssize_t nvmet_subsys_attr_firmware_store_locked(struct nvmet_subsys *subsys, + const char *page, size_t count) +{ + int pos = 0, len; + char *val; + + if (subsys->subsys_discovered) { + pr_err("Can't set firmware revision. %s is already assigned\n", + subsys->firmware_rev); + return -EINVAL; + } + + len = strcspn(page, "\n"); + if (!len) + return -EINVAL; + + if (len > NVMET_FR_MAX_SIZE) { + pr_err("Firmware revision size can not exceed %d Bytes\n", + NVMET_FR_MAX_SIZE); + return -EINVAL; + } + + for (pos = 0; pos < len; pos++) { + if (!nvmet_is_ascii(page[pos])) + return -EINVAL; + } + + val = kmemdup_nul(page, len, GFP_KERNEL); + if (!val) + return -ENOMEM; + + kfree(subsys->firmware_rev); + + subsys->firmware_rev = val; + + return count; +} + +static ssize_t nvmet_subsys_attr_firmware_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_subsys *subsys = to_subsys(item); + ssize_t ret; + + down_write(&nvmet_config_sem); + mutex_lock(&subsys->lock); + ret = nvmet_subsys_attr_firmware_store_locked(subsys, page, count); + mutex_unlock(&subsys->lock); + up_write(&nvmet_config_sem); + + return ret; +} +CONFIGFS_ATTR(nvmet_subsys_, attr_firmware); + #ifdef CONFIG_BLK_DEV_INTEGRITY static ssize_t nvmet_subsys_attr_pi_enable_show(struct config_item *item, char *page) @@ -1275,7 +1386,7 @@ static ssize_t nvmet_subsys_attr_pi_enable_store(struct config_item *item, struct nvmet_subsys *subsys = to_subsys(item); bool pi_enable; - if (strtobool(page, &pi_enable)) + if (kstrtobool(page, &pi_enable)) return -EINVAL; subsys->pi_support = pi_enable; @@ -1293,6 +1404,8 @@ static ssize_t nvmet_subsys_attr_qid_max_show(struct config_item *item, static ssize_t nvmet_subsys_attr_qid_max_store(struct config_item *item, const char *page, size_t cnt) { + struct nvmet_subsys *subsys = to_subsys(item); + struct nvmet_ctrl *ctrl; u16 qid_max; if (sscanf(page, "%hu\n", &qid_max) != 1) @@ -1302,8 +1415,13 @@ static ssize_t nvmet_subsys_attr_qid_max_store(struct config_item *item, return -EINVAL; down_write(&nvmet_config_sem); - to_subsys(item)->max_qid = qid_max; + subsys->max_qid = qid_max; + + /* Force reconnect */ + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) + ctrl->ops->delete_ctrl(ctrl); up_write(&nvmet_config_sem); + return cnt; } CONFIGFS_ATTR(nvmet_subsys_, attr_qid_max); @@ -1316,6 +1434,8 @@ static struct configfs_attribute *nvmet_subsys_attrs[] = { &nvmet_subsys_attr_attr_cntlid_max, &nvmet_subsys_attr_attr_model, &nvmet_subsys_attr_attr_qid_max, + &nvmet_subsys_attr_attr_ieee_oui, + &nvmet_subsys_attr_attr_firmware, #ifdef CONFIG_BLK_DEV_INTEGRITY &nvmet_subsys_attr_attr_pi_enable, #endif @@ -1395,7 +1515,7 @@ static ssize_t nvmet_referral_enable_store(struct config_item *item, struct nvmet_port *port = to_nvmet_port(item); bool enable; - if (strtobool(page, &enable)) + if (kstrtobool(page, &enable)) goto inval; if (enable) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index aecb5853f8da..f66ed13d7c11 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -10,11 +10,14 @@ #include <linux/pci-p2pdma.h> #include <linux/scatterlist.h> +#include <generated/utsrelease.h> + #define CREATE_TRACE_POINTS #include "trace.h" #include "nvmet.h" +struct kmem_cache *nvmet_bvec_cache; struct workqueue_struct *buffered_io_wq; struct workqueue_struct *zbd_wq; static const struct nvmet_fabrics_ops *nvmet_transports[NVMF_TRTYPE_MAX]; @@ -695,11 +698,10 @@ static void nvmet_update_sq_head(struct nvmet_req *req) if (req->sq->size) { u32 old_sqhd, new_sqhd; + old_sqhd = READ_ONCE(req->sq->sqhd); do { - old_sqhd = req->sq->sqhd; new_sqhd = (old_sqhd + 1) % req->sq->size; - } while (cmpxchg(&req->sq->sqhd, old_sqhd, new_sqhd) != - old_sqhd); + } while (!try_cmpxchg(&req->sq->sqhd, &old_sqhd, new_sqhd)); } req->cqe->sq_head = cpu_to_le16(req->sq->sqhd & 0x0000FFFF); } @@ -1561,6 +1563,14 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn, goto free_subsys; } + subsys->ieee_oui = 0; + + subsys->firmware_rev = kstrndup(UTS_RELEASE, NVMET_FR_MAX_SIZE, GFP_KERNEL); + if (!subsys->firmware_rev) { + ret = -ENOMEM; + goto free_mn; + } + switch (type) { case NVME_NQN_NVME: subsys->max_qid = NVMET_NR_QUEUES; @@ -1572,14 +1582,14 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn, default: pr_err("%s: Unknown Subsystem type - %d\n", __func__, type); ret = -EINVAL; - goto free_mn; + goto free_fr; } subsys->type = type; subsys->subsysnqn = kstrndup(subsysnqn, NVMF_NQN_SIZE, GFP_KERNEL); if (!subsys->subsysnqn) { ret = -ENOMEM; - goto free_mn; + goto free_fr; } subsys->cntlid_min = NVME_CNTLID_MIN; subsys->cntlid_max = NVME_CNTLID_MAX; @@ -1592,6 +1602,8 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn, return subsys; +free_fr: + kfree(subsys->firmware_rev); free_mn: kfree(subsys->model_number); free_subsys: @@ -1611,6 +1623,7 @@ static void nvmet_subsys_free(struct kref *ref) kfree(subsys->subsysnqn); kfree(subsys->model_number); + kfree(subsys->firmware_rev); kfree(subsys); } @@ -1631,26 +1644,28 @@ void nvmet_subsys_put(struct nvmet_subsys *subsys) static int __init nvmet_init(void) { - int error; + int error = -ENOMEM; nvmet_ana_group_enabled[NVMET_DEFAULT_ANA_GRPID] = 1; + nvmet_bvec_cache = kmem_cache_create("nvmet-bvec", + NVMET_MAX_MPOOL_BVEC * sizeof(struct bio_vec), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!nvmet_bvec_cache) + return -ENOMEM; + zbd_wq = alloc_workqueue("nvmet-zbd-wq", WQ_MEM_RECLAIM, 0); if (!zbd_wq) - return -ENOMEM; + goto out_destroy_bvec_cache; buffered_io_wq = alloc_workqueue("nvmet-buffered-io-wq", WQ_MEM_RECLAIM, 0); - if (!buffered_io_wq) { - error = -ENOMEM; + if (!buffered_io_wq) goto out_free_zbd_work_queue; - } nvmet_wq = alloc_workqueue("nvmet-wq", WQ_MEM_RECLAIM, 0); - if (!nvmet_wq) { - error = -ENOMEM; + if (!nvmet_wq) goto out_free_buffered_work_queue; - } error = nvmet_init_discovery(); if (error) @@ -1669,6 +1684,8 @@ out_free_buffered_work_queue: destroy_workqueue(buffered_io_wq); out_free_zbd_work_queue: destroy_workqueue(zbd_wq); +out_destroy_bvec_cache: + kmem_cache_destroy(nvmet_bvec_cache); return error; } @@ -1680,6 +1697,7 @@ static void __exit nvmet_exit(void) destroy_workqueue(nvmet_wq); destroy_workqueue(buffered_io_wq); destroy_workqueue(zbd_wq); + kmem_cache_destroy(nvmet_bvec_cache); BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_entry) != 1024); BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_hdr) != 1024); diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c index 946ad0240ee5..871c4f32f443 100644 --- a/drivers/nvme/target/io-cmd-file.c +++ b/drivers/nvme/target/io-cmd-file.c @@ -11,7 +11,6 @@ #include <linux/fs.h> #include "nvmet.h" -#define NVMET_MAX_MPOOL_BVEC 16 #define NVMET_MIN_MPOOL_OBJ 16 void nvmet_file_ns_revalidate(struct nvmet_ns *ns) @@ -26,8 +25,6 @@ void nvmet_file_ns_disable(struct nvmet_ns *ns) flush_workqueue(buffered_io_wq); mempool_destroy(ns->bvec_pool); ns->bvec_pool = NULL; - kmem_cache_destroy(ns->bvec_cache); - ns->bvec_cache = NULL; fput(ns->file); ns->file = NULL; } @@ -59,16 +56,8 @@ int nvmet_file_ns_enable(struct nvmet_ns *ns) ns->blksize_shift = min_t(u8, file_inode(ns->file)->i_blkbits, 12); - ns->bvec_cache = kmem_cache_create("nvmet-bvec", - NVMET_MAX_MPOOL_BVEC * sizeof(struct bio_vec), - 0, SLAB_HWCACHE_ALIGN, NULL); - if (!ns->bvec_cache) { - ret = -ENOMEM; - goto err; - } - ns->bvec_pool = mempool_create(NVMET_MIN_MPOOL_OBJ, mempool_alloc_slab, - mempool_free_slab, ns->bvec_cache); + mempool_free_slab, nvmet_bvec_cache); if (!ns->bvec_pool) { ret = -ENOMEM; @@ -77,9 +66,10 @@ int nvmet_file_ns_enable(struct nvmet_ns *ns) return ret; err: + fput(ns->file); + ns->file = NULL; ns->size = 0; ns->blksize_shift = 0; - nvmet_file_ns_disable(ns); return ret; } diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index b45fe3adf015..f2d24b2d992f 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -145,7 +145,7 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, if (ret) return ret; - blk_mq_start_request(req); + nvme_start_request(req); iod->cmd.common.flags |= NVME_CMD_SGL_METABUF; iod->req.port = queue->ctrl->port; if (!nvmet_req_init(&iod->req, &queue->nvme_cq, @@ -353,7 +353,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) ctrl->ctrl.queue_count = 1; error = nvme_alloc_admin_tag_set(&ctrl->ctrl, &ctrl->admin_tag_set, - &nvme_loop_admin_mq_ops, BLK_MQ_F_NO_SCHED, + &nvme_loop_admin_mq_ops, sizeof(struct nvme_loop_iod) + NVME_INLINE_SG_CNT * sizeof(struct scatterlist)); if (error) @@ -375,9 +375,9 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) ctrl->ctrl.max_hw_sectors = (NVME_LOOP_MAX_SEGMENTS - 1) << (PAGE_SHIFT - 9); - nvme_start_admin_queue(&ctrl->ctrl); + nvme_unquiesce_admin_queue(&ctrl->ctrl); - error = nvme_init_ctrl_finish(&ctrl->ctrl); + error = nvme_init_ctrl_finish(&ctrl->ctrl, false); if (error) goto out_cleanup_tagset; @@ -394,14 +394,14 @@ out_free_sq: static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl) { if (ctrl->ctrl.queue_count > 1) { - nvme_stop_queues(&ctrl->ctrl); + nvme_quiesce_io_queues(&ctrl->ctrl); nvme_cancel_tagset(&ctrl->ctrl); nvme_loop_destroy_io_queues(ctrl); } - nvme_stop_admin_queue(&ctrl->ctrl); + nvme_quiesce_admin_queue(&ctrl->ctrl); if (ctrl->ctrl.state == NVME_CTRL_LIVE) - nvme_shutdown_ctrl(&ctrl->ctrl); + nvme_disable_ctrl(&ctrl->ctrl, true); nvme_cancel_admin_tagset(&ctrl->ctrl); nvme_loop_destroy_admin_queue(ctrl); @@ -494,7 +494,7 @@ static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl) return ret; ret = nvme_alloc_io_tag_set(&ctrl->ctrl, &ctrl->tag_set, - &nvme_loop_mq_ops, BLK_MQ_F_SHOULD_MERGE, + &nvme_loop_mq_ops, 1, sizeof(struct nvme_loop_iod) + NVME_INLINE_SG_CNT * sizeof(struct scatterlist)); if (ret) diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index dfe3894205aa..89bedfcd974c 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -29,6 +29,7 @@ #define NVMET_DEFAULT_CTRL_MODEL "Linux" #define NVMET_MN_MAX_SIZE 40 #define NVMET_SN_MAX_SIZE 20 +#define NVMET_FR_MAX_SIZE 8 /* * Supported optional AENs: @@ -77,7 +78,6 @@ struct nvmet_ns { struct completion disable_done; mempool_t *bvec_pool; - struct kmem_cache *bvec_cache; int use_p2pmem; struct pci_dev *p2p_dev; @@ -264,6 +264,8 @@ struct nvmet_subsys { struct config_group allowed_hosts_group; char *model_number; + u32 ieee_oui; + char *firmware_rev; #ifdef CONFIG_NVME_TARGET_PASSTHRU struct nvme_ctrl *passthru_ctrl; @@ -393,6 +395,8 @@ struct nvmet_req { u64 error_slba; }; +#define NVMET_MAX_MPOOL_BVEC 16 +extern struct kmem_cache *nvmet_bvec_cache; extern struct workqueue_struct *buffered_io_wq; extern struct workqueue_struct *zbd_wq; extern struct workqueue_struct *nvmet_wq; diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 5565f67d6537..86812d2073ea 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -89,6 +89,90 @@ static ssize_t published_show(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR_RO(published); +static int p2pmem_alloc_mmap(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, struct vm_area_struct *vma) +{ + struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj)); + size_t len = vma->vm_end - vma->vm_start; + struct pci_p2pdma *p2pdma; + struct percpu_ref *ref; + unsigned long vaddr; + void *kaddr; + int ret; + + /* prevent private mappings from being established */ + if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) { + pci_info_ratelimited(pdev, + "%s: fail, attempted private mapping\n", + current->comm); + return -EINVAL; + } + + if (vma->vm_pgoff) { + pci_info_ratelimited(pdev, + "%s: fail, attempted mapping with non-zero offset\n", + current->comm); + return -EINVAL; + } + + rcu_read_lock(); + p2pdma = rcu_dereference(pdev->p2pdma); + if (!p2pdma) { + ret = -ENODEV; + goto out; + } + + kaddr = (void *)gen_pool_alloc_owner(p2pdma->pool, len, (void **)&ref); + if (!kaddr) { + ret = -ENOMEM; + goto out; + } + + /* + * vm_insert_page() can sleep, so a reference is taken to mapping + * such that rcu_read_unlock() can be done before inserting the + * pages + */ + if (unlikely(!percpu_ref_tryget_live_rcu(ref))) { + ret = -ENODEV; + goto out_free_mem; + } + rcu_read_unlock(); + + for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) { + ret = vm_insert_page(vma, vaddr, virt_to_page(kaddr)); + if (ret) { + gen_pool_free(p2pdma->pool, (uintptr_t)kaddr, len); + return ret; + } + percpu_ref_get(ref); + put_page(virt_to_page(kaddr)); + kaddr += PAGE_SIZE; + len -= PAGE_SIZE; + } + + percpu_ref_put(ref); + + return 0; +out_free_mem: + gen_pool_free(p2pdma->pool, (uintptr_t)kaddr, len); +out: + rcu_read_unlock(); + return ret; +} + +static struct bin_attribute p2pmem_alloc_attr = { + .attr = { .name = "allocate", .mode = 0660 }, + .mmap = p2pmem_alloc_mmap, + /* + * Some places where we want to call mmap (ie. python) will check + * that the file size is greater than the mmap size before allowing + * the mmap to continue. To work around this, just set the size + * to be very large. + */ + .size = SZ_1T, +}; + static struct attribute *p2pmem_attrs[] = { &dev_attr_size.attr, &dev_attr_available.attr, @@ -96,11 +180,32 @@ static struct attribute *p2pmem_attrs[] = { NULL, }; +static struct bin_attribute *p2pmem_bin_attrs[] = { + &p2pmem_alloc_attr, + NULL, +}; + static const struct attribute_group p2pmem_group = { .attrs = p2pmem_attrs, + .bin_attrs = p2pmem_bin_attrs, .name = "p2pmem", }; +static void p2pdma_page_free(struct page *page) +{ + struct pci_p2pdma_pagemap *pgmap = to_p2p_pgmap(page->pgmap); + struct percpu_ref *ref; + + gen_pool_free_owner(pgmap->provider->p2pdma->pool, + (uintptr_t)page_to_virt(page), PAGE_SIZE, + (void **)&ref); + percpu_ref_put(ref); +} + +static const struct dev_pagemap_ops p2pdma_pgmap_ops = { + .page_free = p2pdma_page_free, +}; + static void pci_p2pdma_release(void *data) { struct pci_dev *pdev = data; @@ -152,6 +257,19 @@ out: return error; } +static void pci_p2pdma_unmap_mappings(void *data) +{ + struct pci_dev *pdev = data; + + /* + * Removing the alloc attribute from sysfs will call + * unmap_mapping_range() on the inode, teardown any existing userspace + * mappings and prevent new ones from being created. + */ + sysfs_remove_file_from_group(&pdev->dev.kobj, &p2pmem_alloc_attr.attr, + p2pmem_group.name); +} + /** * pci_p2pdma_add_resource - add memory for use as p2p memory * @pdev: the device to add the memory to @@ -198,6 +316,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, pgmap->range.end = pgmap->range.start + size - 1; pgmap->nr_range = 1; pgmap->type = MEMORY_DEVICE_PCI_P2PDMA; + pgmap->ops = &p2pdma_pgmap_ops; p2p_pgmap->provider = pdev; p2p_pgmap->bus_offset = pci_bus_address(pdev, bar) - @@ -209,6 +328,11 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, goto pgmap_free; } + error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_unmap_mappings, + pdev); + if (error) + goto pages_free; + p2pdma = rcu_dereference_protected(pdev->p2pdma, 1); error = gen_pool_add_owner(p2pdma->pool, (unsigned long)addr, pci_bus_address(pdev, bar) + offset, diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 8b89fab7c420..249757ddd8fe 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -2735,7 +2735,7 @@ static void scsi_stop_queue(struct scsi_device *sdev, bool nowait) blk_mq_quiesce_queue(sdev->request_queue); } else { if (!nowait) - blk_mq_wait_quiesce_done(sdev->request_queue); + blk_mq_wait_quiesce_done(sdev->request_queue->tag_set); } } diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c index 5d27f5196de6..0a95fa787fdf 100644 --- a/drivers/scsi/scsi_scan.c +++ b/drivers/scsi/scsi_scan.c @@ -344,7 +344,6 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget, sdev->request_queue = q; q->queuedata = sdev; __scsi_init_queue(sdev->host, q); - WARN_ON_ONCE(!blk_get_queue(q)); depth = sdev->host->cmd_per_lun ?: 1; diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index b1f59a5fe632..d2b11d5b91ce 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -9544,6 +9544,7 @@ void ufshcd_remove(struct ufs_hba *hba) ufshpb_remove(hba); ufs_sysfs_remove_nodes(hba->dev); blk_mq_destroy_queue(hba->tmf_queue); + blk_put_queue(hba->tmf_queue); blk_mq_free_tag_set(&hba->tmf_tag_set); scsi_remove_host(hba->host); /* disable interrupts */ @@ -9840,6 +9841,7 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) free_tmf_queue: blk_mq_destroy_queue(hba->tmf_queue); + blk_put_queue(hba->tmf_queue); free_tmf_tag_set: blk_mq_free_tag_set(&hba->tmf_tag_set); out_remove_scsi_host: |