From 6df87e65dbe4528ef07b917af89913abb8caaaba Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Sat, 28 May 2011 14:44:46 +0200 Subject: block: improve the bio_add_page() and bio_add_pc_page() descriptions The descriptions of bio_add_page() and bio_add_pc_page() are slightly inconsistent; improve them. Signed-off-by: Andreas Gruenbacher Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- fs/bio.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/bio.c b/fs/bio.c index 840a0d755248..9bfade8a609b 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -638,10 +638,11 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page * @offset: vec entry offset * * Attempt to add a page to the bio_vec maplist. This can fail for a - * number of reasons, such as the bio being full or target block - * device limitations. The target block device must allow bio's - * smaller than PAGE_SIZE, so it is always possible to add a single - * page to an empty bio. This should only be used by REQ_PC bios. + * number of reasons, such as the bio being full or target block device + * limitations. The target block device must allow bio's up to PAGE_SIZE, + * so it is always possible to add a single page to an empty bio. + * + * This should only be used by REQ_PC bios. */ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset) @@ -659,10 +660,9 @@ EXPORT_SYMBOL(bio_add_pc_page); * @offset: vec entry offset * * Attempt to add a page to the bio_vec maplist. This can fail for a - * number of reasons, such as the bio being full or target block - * device limitations. The target block device must allow bio's - * smaller than PAGE_SIZE, so it is always possible to add a single - * page to an empty bio. + * number of reasons, such as the bio being full or target block device + * limitations. The target block device must allow bio's up to PAGE_SIZE, + * so it is always possible to add a single page to an empty bio. */ int bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) -- cgit v1.2.3-70-g09d2 From 35fbf5bcf497d6ddbe7b6478141e7526d1474ff5 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sat, 28 May 2011 14:44:46 +0200 Subject: nbd: pass MSG_* flags to kernel_recvmsg() Unlike kernel_sendmsg(), kernel_recvmsg() requires passing flags explicitly via last parameter instead of struct msghdr.msg_flags. Therefore calls to sock_xmit(lo, 0, ..., MSG_WAITALL) have not been processed properly by tcp layer wrt. the flag. Fix it. Signed-off-by: Namhyung Kim Cc: Paul Clements Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index e6fc716aca45..1df3bfe5225b 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -192,7 +192,8 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size, if (lo->xmit_timeout) del_timer_sync(&ti); } else - result = kernel_recvmsg(sock, &msg, &iov, 1, size, 0); + result = kernel_recvmsg(sock, &msg, &iov, 1, size, + msg.msg_flags); if (signal_pending(current)) { siginfo_t info; -- cgit v1.2.3-70-g09d2 From 3b2710824e00d238554c13b5add347e6c701ab1a Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sat, 28 May 2011 14:44:46 +0200 Subject: nbd: limit module parameters to a sane value The 'max_part' parameter controls the number of maximum partition a nbd device can have. However if a user specifies very large value it would exceed the limitation of device minor number and can cause a kernel oops (or, at least, produce invalid device nodes in some cases). In addition, specifying large 'nbds_max' value causes same problem for the same reason. On my desktop, following command results to the kernel bug: $ sudo modprobe nbd max_part=100000 kernel BUG at /media/Linux_Data/project/linux/fs/sysfs/group.c:65! invalid opcode: 0000 [#1] SMP last sysfs file: /sys/devices/virtual/block/nbd4/range CPU 1 Modules linked in: nbd(+) bridge stp llc kvm_intel kvm asus_atk0110 sg sr_mod cdrom Pid: 2522, comm: modprobe Tainted: G W 2.6.39-leonard+ #159 System manufacturer System Product Name/P5G41TD-M PRO RIP: 0010:[] [] internal_create_group+0x2f/0x166 RSP: 0018:ffff8801009f1de8 EFLAGS: 00010246 RAX: 00000000ffffffef RBX: ffff880103920478 RCX: 00000000000a7bd3 RDX: ffffffff81a2dbe0 RSI: 0000000000000000 RDI: ffff880103920478 RBP: ffff8801009f1e38 R08: ffff880103920468 R09: ffff880103920478 R10: ffff8801009f1de8 R11: ffff88011eccbb68 R12: ffffffff81a2dbe0 R13: ffff880103920468 R14: 0000000000000000 R15: ffff880103920400 FS: 00007f3c49de9700(0000) GS:ffff88011f800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00007f3b7fe7c000 CR3: 00000000cd58d000 CR4: 00000000000406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process modprobe (pid: 2522, threadinfo ffff8801009f0000, task ffff8801009a93a0) Stack: ffff8801009f1e58 ffffffff812e8f6e ffff8801009f1e58 ffffffff812e7a80 ffff880000000010 ffff880103920400 ffff8801002fd0c0 ffff880103920468 0000000000000011 ffff880103920400 ffff8801009f1e48 ffffffff8115ab6a Call Trace: [] ? device_add+0x4f1/0x5e4 [] ? dev_set_name+0x41/0x43 [] sysfs_create_group+0x13/0x15 [] blk_trace_init_sysfs+0x14/0x16 [] blk_register_queue+0x4c/0xfd [] add_disk+0xe4/0x29c [] nbd_init+0x2ab/0x30d [nbd] [] ? 0xffffffffa007dfff [] do_one_initcall+0x7f/0x13e [] sys_init_module+0xa1/0x1e3 [] system_call_fastpath+0x16/0x1b Code: 41 57 41 56 41 55 41 54 53 48 83 ec 28 0f 1f 44 00 00 48 89 fb 41 89 f6 49 89 d4 48 85 ff 74 0b 85 f6 75 0b 48 83 7f 30 00 75 14 <0f> 0b eb fe b9 ea ff ff ff 48 83 7f 30 00 0f 84 09 01 00 00 49 RIP [] internal_create_group+0x2f/0x166 RSP ---[ end trace 753285ffbf72c57c ]--- Signed-off-by: Namhyung Kim Cc: Laurent Vivier Cc: Paul Clements Cc: stable@kernel.org Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 1df3bfe5225b..fdee7567fd15 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -757,6 +757,12 @@ static int __init nbd_init(void) if (max_part > 0) part_shift = fls(max_part); + if ((1UL << part_shift) > DISK_MAX_PARTS) + return -EINVAL; + + if (nbds_max > 1UL << (MINORBITS - part_shift)) + return -EINVAL; + for (i = 0; i < nbds_max; i++) { struct gendisk *disk = alloc_disk(1 << part_shift); if (!disk) -- cgit v1.2.3-70-g09d2 From 5988ce239682854d4e632fb58bff000700830394 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sat, 28 May 2011 14:44:46 +0200 Subject: nbd: adjust 'max_part' according to part_shift The 'max_part' parameter determines how many partitions are supported on each nbd device. However the actual number can be changed to the power of 2 minus 1 form during the module initialization as alloc_disk() is called with (1 << part_shift) for some reason. So adjust 'max_part' also at least for consistency with loop and brd. It is exported via sysfs already, and a user should check this value after module loading if [s]he wants to use that number correctly (i.e. fdisk or something). Signed-off-by: Namhyung Kim Cc: Laurent Vivier Cc: Paul Clements Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index fdee7567fd15..f533f3375e24 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -754,9 +754,20 @@ static int __init nbd_init(void) return -ENOMEM; part_shift = 0; - if (max_part > 0) + if (max_part > 0) { part_shift = fls(max_part); + /* + * Adjust max_part according to part_shift as it is exported + * to user space so that user can know the max number of + * partition kernel should be able to manage. + * + * Note that -1 is required because partition 0 is reserved + * for the whole disk. + */ + max_part = (1UL << part_shift) - 1; + } + if ((1UL << part_shift) > DISK_MAX_PARTS) return -EINVAL; -- cgit v1.2.3-70-g09d2 From a1706ac4c0201ea0143dc0db0659001b26ceeabb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 30 May 2011 07:42:51 +0200 Subject: Revert "block: Remove extra discard_alignment from hd_struct." It was not a good idea to start dereferencing disk->queue from the fs sysfs strategy for displaying discard alignment. We ran into first a NULL pointer deref, and after fixing that we sometimes see unvalid disk->queue pointer values. Since discard is the only one of the bunch actually looking into the queue, just revert the change. This reverts commit 23ceb5b7719e9276d4fa72a3ecf94dd396755276. Conflicts: fs/partitions/check.c --- fs/partitions/check.c | 10 +++------- include/linux/genhd.h | 1 + 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/partitions/check.c b/fs/partitions/check.c index f82e762eeca2..d545e97d99c3 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -255,13 +255,7 @@ ssize_t part_discard_alignment_show(struct device *dev, struct device_attribute *attr, char *buf) { struct hd_struct *p = dev_to_part(dev); - struct gendisk *disk = dev_to_disk(dev); - unsigned int alignment = 0; - - if (disk->queue) - alignment = queue_limit_discard_alignment(&disk->queue->limits, - p->start_sect); - return sprintf(buf, "%u\n", alignment); + return sprintf(buf, "%u\n", p->discard_alignment); } ssize_t part_stat_show(struct device *dev, @@ -455,6 +449,8 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, p->start_sect = start; p->alignment_offset = queue_limit_alignment_offset(&disk->queue->limits, start); + p->discard_alignment = + queue_limit_discard_alignment(&disk->queue->limits, start); p->nr_sects = len; p->partno = partno; p->policy = get_disk_ro(disk); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index b78956b3c2e7..300d7582006e 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -100,6 +100,7 @@ struct hd_struct { sector_t start_sect; sector_t nr_sects; sector_t alignment_offset; + unsigned int discard_alignment; struct device __dev; struct kobject *holder_dir; int policy, partno; -- cgit v1.2.3-70-g09d2 From ea9d6553b3b3044e7374774cc33bb1b2eee19dd3 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 31 May 2011 13:45:53 +0200 Subject: block: remove unwanted semicolons Since those defined functions require additional semicolon from the caller, they could cause potential syntax errors when used in if-else statements. Signed-off-by: Namhyung Kim Acked-by: Martin K. Petersen Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ae9091a68480..1a23722e8878 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1282,8 +1282,8 @@ queue_max_integrity_segments(struct request_queue *q) #define blk_get_integrity(a) (0) #define blk_integrity_compare(a, b) (0) #define blk_integrity_register(a, b) (0) -#define blk_integrity_unregister(a) do { } while (0); -#define blk_queue_max_integrity_segments(a, b) do { } while (0); +#define blk_integrity_unregister(a) do { } while (0) +#define blk_queue_max_integrity_segments(a, b) do { } while (0) #define queue_max_integrity_segments(a) (0) #define blk_integrity_merge_rq(a, b, c) (0) #define blk_integrity_merge_bio(a, b, c) (0) -- cgit v1.2.3-70-g09d2 From 4495a7d41dbda03841c2a1c2a5ce7135a45131ba Mon Sep 17 00:00:00 2001 From: Kyungmin Park Date: Tue, 31 May 2011 10:04:09 +0200 Subject: CFQ: Fix typo and remove unnecessary semicolon Fix comment typo and remove unnecessary semicolon at macro Signed-off-by: Kyungmin Park Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 7c52d6888924..8a02c955c610 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -185,7 +185,7 @@ struct cfq_group { int nr_cfqq; /* - * Per group busy queus average. Useful for workload slice calc. We + * Per group busy queues average. Useful for workload slice calc. We * create the array for each prio class but at run time it is used * only for RT and BE class and slot for IDLE class remains unused. * This is primarily done to avoid confusion and a gcc warning. @@ -369,16 +369,16 @@ CFQ_CFQQ_FNS(wait_busy); #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ - blkg_path(&(cfqq)->cfqg->blkg), ##args); + blkg_path(&(cfqq)->cfqg->blkg), ##args) #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \ blk_add_trace_msg((cfqd)->queue, "%s " fmt, \ - blkg_path(&(cfqg)->blkg), ##args); \ + blkg_path(&(cfqg)->blkg), ##args) \ #else #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) -#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0); +#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0) #endif #define cfq_log(cfqd, fmt, args...) \ blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) -- cgit v1.2.3-70-g09d2 From 4c49ff3fe128ca68dabd07537415c419ad7f82f9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 1 Jun 2011 08:27:41 +0200 Subject: block: blkdev_get() should access ->bd_disk only after success d4dc210f69 (block: don't block events on excl write for non-optical devices) added dereferencing of bdev->bd_disk to test GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE; however, bdev->bd_disk can be %NULL if open failed which can lead to an oops. Test the flag after testing open was successful, not before. Signed-off-by: Tejun Heo Reported-by: David Miller Tested-by: David Miller Cc: stable@kernel.org Signed-off-by: Jens Axboe --- fs/block_dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 1f2b19978333..1a2421f908f0 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1272,8 +1272,8 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) * individual writeable reference is too fragile given the * way @mode is used in blkdev_get/put(). */ - if ((disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE) && - !res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) { + if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder && + (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) { bdev->bd_write_holder = true; disk_block_events(disk); } -- cgit v1.2.3-70-g09d2 From 6464920a6e30604cb71d0ecbaa20e35009bd76fb Mon Sep 17 00:00:00 2001 From: Laszlo Ersek Date: Wed, 25 May 2011 12:24:25 +0200 Subject: xen/blkback: don't call vbd_size() if bd_disk is NULL ...because vbd_size() dereferences bd_disk if bd_part is NULL. Signed-off-by: Laszlo Ersek Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/xenbus.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 34570823355b..6cc0db1bf522 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -357,14 +357,13 @@ static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle, } vbd->bdev = bdev; - vbd->size = vbd_sz(vbd); - if (vbd->bdev->bd_disk == NULL) { DPRINTK("xen_vbd_create: device %08x doesn't exist.\n", vbd->pdevice); xen_vbd_free(vbd); return -ENOENT; } + vbd->size = vbd_sz(vbd); if (vbd->bdev->bd_disk->flags & GENHD_FL_CD || cdrom) vbd->type |= VDISK_CDROM; -- cgit v1.2.3-70-g09d2 From 9b83c771214cf6a256ee875050e6eaf320cf7983 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 27 May 2011 09:27:16 +0300 Subject: xen/blkback: potential null dereference in error handling blkbk->pending_pages can be NULL here so I added a check for it. Signed-off-by: Dan Carpenter [v1: Redid the loop a bit] Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/blkback.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index c73910cc28c9..5cf2993a8338 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -809,11 +809,13 @@ static int __init xen_blkif_init(void) failed_init: kfree(blkbk->pending_reqs); kfree(blkbk->pending_grant_handles); - for (i = 0; i < mmap_pages; i++) { - if (blkbk->pending_pages[i]) - __free_page(blkbk->pending_pages[i]); + if (blkbk->pending_pages) { + for (i = 0; i < mmap_pages; i++) { + if (blkbk->pending_pages[i]) + __free_page(blkbk->pending_pages[i]); + } + kfree(blkbk->pending_pages); } - kfree(blkbk->pending_pages); kfree(blkbk); blkbk = NULL; return rc; -- cgit v1.2.3-70-g09d2 From 28304f485c3627cc5e1665b92e26eb7fcfe98088 Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Thu, 2 Jun 2011 13:05:02 +0200 Subject: cfq-iosched: Remove bogus check in queue_fail path queue_fail can only be reached if cic is NULL, so its check for cic must be bogus. Signed-off-by: Paul Bolle Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 8a02c955c610..3c7b537bf908 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3786,9 +3786,6 @@ new_queue: return 0; queue_fail: - if (cic) - put_io_context(cic->ioc); - cfq_schedule_dispatch(cfqd); spin_unlock_irqrestore(q->queue_lock, flags); cfq_log(cfqd, "set_request fail"); -- cgit v1.2.3-70-g09d2 From e2bd9678fc0085acf540dc4cb48ff961cd4d88c0 Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Thu, 2 Jun 2011 13:05:02 +0200 Subject: block: Use hlist_entry() for io_context.cic_list.first list_entry() and hlist_entry() are both simply aliases for container_of(), but since io_context.cic_list.first is an hlist_node one should at least use the correct alias. Signed-off-by: Paul Bolle Signed-off-by: Jens Axboe --- block/blk-ioc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index c898049dafd5..342eae9b0d3c 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -21,7 +21,7 @@ static void cfq_dtor(struct io_context *ioc) if (!hlist_empty(&ioc->cic_list)) { struct cfq_io_context *cic; - cic = list_entry(ioc->cic_list.first, struct cfq_io_context, + cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context, cic_list); cic->dtor(ioc); } @@ -57,7 +57,7 @@ static void cfq_exit(struct io_context *ioc) if (!hlist_empty(&ioc->cic_list)) { struct cfq_io_context *cic; - cic = list_entry(ioc->cic_list.first, struct cfq_io_context, + cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context, cic_list); cic->exit(ioc); } -- cgit v1.2.3-70-g09d2