From b80d5ccca3a012e91ca64a2a0b13049163a6a698 Mon Sep 17 00:00:00 2001 From: Haiyan Hu Date: Tue, 10 Sep 2013 11:25:37 +0800 Subject: NVMe: Avoid shift operation when writing cq head doorbell Changes the type of dev->db_stride to unsigned and changes the value stored there to be 1 << the current value. Then there is less calculation to be done at completion time. Signed-off-by: Haiyan Hu Signed-off-by: Matthew Wilcox --- include/linux/nvme.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 26ebcf41c213..8119a476cb29 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -80,7 +80,7 @@ struct nvme_dev { struct dma_pool *prp_small_pool; int instance; int queue_count; - int db_stride; + u32 db_stride; u32 ctrl_config; struct msix_entry *entry; struct nvme_bar __iomem *bar; -- cgit v1.2.3-70-g09d2 From 320a382746e0ab1304476ea7e986a8d416ab99db Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 23 Oct 2013 13:07:34 -0600 Subject: NVMe: compat SG_IO ioctl For 32-bit versions of sg3-utils running on a 64-bit system. This is mostly a copy from the relevent portions of fs/compat_ioctl.c, with slight modifications for going through block_device_operations. Signed-off-by: Keith Busch Reviewed-by: Vishal Verma [fixed up CONFIG_COMPAT=n build problems] Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 18 +++++- drivers/block/nvme-scsi.c | 147 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/nvme.h | 1 + 3 files changed, 165 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index df3daeb6227f..e44350de8a21 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1568,10 +1568,26 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, } } +#ifdef CONFIG_COMPAT +static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct nvme_ns *ns = bdev->bd_disk->private_data; + + switch (cmd) { + case SG_IO: + return nvme_sg_io32(ns, arg); + } + return nvme_ioctl(bdev, mode, cmd, arg); +} +#else +#define nvme_compat_ioctl NULL +#endif + static const struct block_device_operations nvme_fops = { .owner = THIS_MODULE, .ioctl = nvme_ioctl, - .compat_ioctl = nvme_ioctl, + .compat_ioctl = nvme_compat_ioctl, }; static void nvme_resubmit_bios(struct nvme_queue *nvmeq) diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c index 4a4ff4eb8e23..4a0ceb64e269 100644 --- a/drivers/block/nvme-scsi.c +++ b/drivers/block/nvme-scsi.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -3038,6 +3039,152 @@ int nvme_sg_io(struct nvme_ns *ns, struct sg_io_hdr __user *u_hdr) return retcode; } +#ifdef CONFIG_COMPAT +typedef struct sg_io_hdr32 { + compat_int_t interface_id; /* [i] 'S' for SCSI generic (required) */ + compat_int_t dxfer_direction; /* [i] data transfer direction */ + unsigned char cmd_len; /* [i] SCSI command length ( <= 16 bytes) */ + unsigned char mx_sb_len; /* [i] max length to write to sbp */ + unsigned short iovec_count; /* [i] 0 implies no scatter gather */ + compat_uint_t dxfer_len; /* [i] byte count of data transfer */ + compat_uint_t dxferp; /* [i], [*io] points to data transfer memory + or scatter gather list */ + compat_uptr_t cmdp; /* [i], [*i] points to command to perform */ + compat_uptr_t sbp; /* [i], [*o] points to sense_buffer memory */ + compat_uint_t timeout; /* [i] MAX_UINT->no timeout (unit: millisec) */ + compat_uint_t flags; /* [i] 0 -> default, see SG_FLAG... */ + compat_int_t pack_id; /* [i->o] unused internally (normally) */ + compat_uptr_t usr_ptr; /* [i->o] unused internally */ + unsigned char status; /* [o] scsi status */ + unsigned char masked_status; /* [o] shifted, masked scsi status */ + unsigned char msg_status; /* [o] messaging level data (optional) */ + unsigned char sb_len_wr; /* [o] byte count actually written to sbp */ + unsigned short host_status; /* [o] errors from host adapter */ + unsigned short driver_status; /* [o] errors from software driver */ + compat_int_t resid; /* [o] dxfer_len - actual_transferred */ + compat_uint_t duration; /* [o] time taken by cmd (unit: millisec) */ + compat_uint_t info; /* [o] auxiliary information */ +} sg_io_hdr32_t; /* 64 bytes long (on sparc32) */ + +typedef struct sg_iovec32 { + compat_uint_t iov_base; + compat_uint_t iov_len; +} sg_iovec32_t; + +static int sg_build_iovec(sg_io_hdr_t __user *sgio, void __user *dxferp, u16 iovec_count) +{ + sg_iovec_t __user *iov = (sg_iovec_t __user *) (sgio + 1); + sg_iovec32_t __user *iov32 = dxferp; + int i; + + for (i = 0; i < iovec_count; i++) { + u32 base, len; + + if (get_user(base, &iov32[i].iov_base) || + get_user(len, &iov32[i].iov_len) || + put_user(compat_ptr(base), &iov[i].iov_base) || + put_user(len, &iov[i].iov_len)) + return -EFAULT; + } + + if (put_user(iov, &sgio->dxferp)) + return -EFAULT; + return 0; +} + +int nvme_sg_io32(struct nvme_ns *ns, unsigned long arg) +{ + sg_io_hdr32_t __user *sgio32 = (sg_io_hdr32_t __user *)arg; + sg_io_hdr_t __user *sgio; + u16 iovec_count; + u32 data; + void __user *dxferp; + int err; + int interface_id; + + if (get_user(interface_id, &sgio32->interface_id)) + return -EFAULT; + if (interface_id != 'S') + return -EINVAL; + + if (get_user(iovec_count, &sgio32->iovec_count)) + return -EFAULT; + + { + void __user *top = compat_alloc_user_space(0); + void __user *new = compat_alloc_user_space(sizeof(sg_io_hdr_t) + + (iovec_count * sizeof(sg_iovec_t))); + if (new > top) + return -EINVAL; + + sgio = new; + } + + /* Ok, now construct. */ + if (copy_in_user(&sgio->interface_id, &sgio32->interface_id, + (2 * sizeof(int)) + + (2 * sizeof(unsigned char)) + + (1 * sizeof(unsigned short)) + + (1 * sizeof(unsigned int)))) + return -EFAULT; + + if (get_user(data, &sgio32->dxferp)) + return -EFAULT; + dxferp = compat_ptr(data); + if (iovec_count) { + if (sg_build_iovec(sgio, dxferp, iovec_count)) + return -EFAULT; + } else { + if (put_user(dxferp, &sgio->dxferp)) + return -EFAULT; + } + + { + unsigned char __user *cmdp; + unsigned char __user *sbp; + + if (get_user(data, &sgio32->cmdp)) + return -EFAULT; + cmdp = compat_ptr(data); + + if (get_user(data, &sgio32->sbp)) + return -EFAULT; + sbp = compat_ptr(data); + + if (put_user(cmdp, &sgio->cmdp) || + put_user(sbp, &sgio->sbp)) + return -EFAULT; + } + + if (copy_in_user(&sgio->timeout, &sgio32->timeout, + 3 * sizeof(int))) + return -EFAULT; + + if (get_user(data, &sgio32->usr_ptr)) + return -EFAULT; + if (put_user(compat_ptr(data), &sgio->usr_ptr)) + return -EFAULT; + + err = nvme_sg_io(ns, sgio); + if (err >= 0) { + void __user *datap; + + if (copy_in_user(&sgio32->pack_id, &sgio->pack_id, + sizeof(int)) || + get_user(datap, &sgio->usr_ptr) || + put_user((u32)(unsigned long)datap, + &sgio32->usr_ptr) || + copy_in_user(&sgio32->status, &sgio->status, + (4 * sizeof(unsigned char)) + + (2 * sizeof(unsigned short)) + + (3 * sizeof(int)))) + err = -EFAULT; + } + + return err; +} +#endif + int nvme_sg_get_version_num(int __user *ip) { return put_user(sg_version_num, ip); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 8119a476cb29..5bc29197d90e 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -165,6 +165,7 @@ int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11, struct sg_io_hdr; int nvme_sg_io(struct nvme_ns *ns, struct sg_io_hdr __user *u_hdr); +int nvme_sg_io32(struct nvme_ns *ns, unsigned long arg); int nvme_sg_get_version_num(int __user *ip); #endif /* _LINUX_NVME_H */ -- cgit v1.2.3-70-g09d2 From 9a6b94584de1a0467d85b435df9c744c5c45a270 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 10 Dec 2013 13:10:36 -0700 Subject: NVMe: Device resume error handling Adds controller error handling on resume power management. If the device fails to initialize, the device is queued for a reset. If the reset fails, a thread is spawned to remove the pci device. If the device resumes as "busy", the device is responding to admin commands but will not create IO queues. In this case, we need to remove the gendisks and free the IO queues since they can't be used and may be holding bios in their lists. From testing, the dma pools require a pci device so this had to change the pci driver 'remove' to release the dma resources in line with that call instead of after all references to the device are released. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 108 +++++++++++++++++++++++++++++++++++++++------- include/linux/nvme.h | 1 + 2 files changed, 94 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 300766973d76..000bca43c23b 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -58,6 +58,7 @@ module_param(use_threaded_interrupts, int, 0); static DEFINE_SPINLOCK(dev_list_lock); static LIST_HEAD(dev_list); static struct task_struct *nvme_thread; +static struct workqueue_struct *nvme_workq; /* * An NVM Express queue. Each device has at least two (one for admin @@ -1968,7 +1969,6 @@ static int nvme_dev_map(struct nvme_dev *dev) dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32))) goto disable; - pci_set_drvdata(pdev, dev); dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); if (!dev->bar) goto disable; @@ -1995,9 +1995,9 @@ static void nvme_dev_unmap(struct nvme_dev *dev) if (dev->bar) { iounmap(dev->bar); dev->bar = NULL; + pci_release_regions(dev->pci_dev); } - pci_release_regions(dev->pci_dev); if (pci_is_enabled(dev->pci_dev)) pci_disable_device(dev->pci_dev); } @@ -2085,11 +2085,6 @@ static void nvme_release_instance(struct nvme_dev *dev) static void nvme_free_dev(struct kref *kref) { struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref); - nvme_dev_remove(dev); - nvme_dev_shutdown(dev); - nvme_free_queues(dev); - nvme_release_instance(dev); - nvme_release_prp_pools(dev); kfree(dev->queues); kfree(dev->entry); kfree(dev); @@ -2161,6 +2156,70 @@ static int nvme_dev_start(struct nvme_dev *dev) return result; } +static int nvme_remove_dead_ctrl(void *arg) +{ + struct nvme_dev *dev = (struct nvme_dev *)arg; + struct pci_dev *pdev = dev->pci_dev; + + if (pci_get_drvdata(pdev)) + pci_stop_and_remove_bus_device(pdev); + kref_put(&dev->kref, nvme_free_dev); + return 0; +} + +static void nvme_remove_disks(struct work_struct *ws) +{ + int i; + struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work); + + nvme_dev_remove(dev); + spin_lock(&dev_list_lock); + for (i = dev->queue_count - 1; i > 0; i--) { + BUG_ON(!dev->queues[i] || !dev->queues[i]->q_suspended); + nvme_free_queue(dev->queues[i]); + dev->queue_count--; + dev->queues[i] = NULL; + } + spin_unlock(&dev_list_lock); +} + +static int nvme_dev_resume(struct nvme_dev *dev) +{ + int ret; + + ret = nvme_dev_start(dev); + if (ret && ret != -EBUSY) + return ret; + if (ret == -EBUSY) { + spin_lock(&dev_list_lock); + INIT_WORK(&dev->reset_work, nvme_remove_disks); + queue_work(nvme_workq, &dev->reset_work); + spin_unlock(&dev_list_lock); + } + return 0; +} + +static void nvme_dev_reset(struct nvme_dev *dev) +{ + nvme_dev_shutdown(dev); + if (nvme_dev_resume(dev)) { + dev_err(&dev->pci_dev->dev, "Device failed to resume\n"); + kref_get(&dev->kref); + if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d", + dev->instance))) { + dev_err(&dev->pci_dev->dev, + "Failed to start controller remove task\n"); + kref_put(&dev->kref, nvme_free_dev); + } + } +} + +static void nvme_reset_failed_dev(struct work_struct *ws) +{ + struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work); + nvme_dev_reset(dev); +} + static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) { int result = -ENOMEM; @@ -2180,7 +2239,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) INIT_LIST_HEAD(&dev->namespaces); dev->pci_dev = pdev; - + pci_set_drvdata(pdev, dev); result = nvme_set_instance(dev); if (result) goto free; @@ -2232,7 +2291,19 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) static void nvme_remove(struct pci_dev *pdev) { struct nvme_dev *dev = pci_get_drvdata(pdev); + + spin_lock(&dev_list_lock); + list_del_init(&dev->node); + spin_unlock(&dev_list_lock); + + pci_set_drvdata(pdev, NULL); + flush_work(&dev->reset_work); misc_deregister(&dev->miscdev); + nvme_dev_remove(dev); + nvme_dev_shutdown(dev); + nvme_free_queues(dev); + nvme_release_instance(dev); + nvme_release_prp_pools(dev); kref_put(&dev->kref, nvme_free_dev); } @@ -2256,13 +2327,12 @@ static int nvme_resume(struct device *dev) { struct pci_dev *pdev = to_pci_dev(dev); struct nvme_dev *ndev = pci_get_drvdata(pdev); - int ret; - ret = nvme_dev_start(ndev); - /* XXX: should remove gendisks if resume fails */ - if (ret) - nvme_free_queues(ndev); - return ret; + if (nvme_dev_resume(ndev) && !work_busy(&ndev->reset_work)) { + INIT_WORK(&ndev->reset_work, nvme_reset_failed_dev); + queue_work(nvme_workq, &ndev->reset_work); + } + return 0; } static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume); @@ -2303,9 +2373,14 @@ static int __init nvme_init(void) if (IS_ERR(nvme_thread)) return PTR_ERR(nvme_thread); + result = -ENOMEM; + nvme_workq = create_singlethread_workqueue("nvme"); + if (!nvme_workq) + goto kill_kthread; + result = register_blkdev(nvme_major, "nvme"); if (result < 0) - goto kill_kthread; + goto kill_workq; else if (result > 0) nvme_major = result; @@ -2316,6 +2391,8 @@ static int __init nvme_init(void) unregister_blkdev: unregister_blkdev(nvme_major, "nvme"); + kill_workq: + destroy_workqueue(nvme_workq); kill_kthread: kthread_stop(nvme_thread); return result; @@ -2325,6 +2402,7 @@ static void __exit nvme_exit(void) { pci_unregister_driver(&nvme_driver); unregister_blkdev(nvme_major, "nvme"); + destroy_workqueue(nvme_workq); kthread_stop(nvme_thread); } diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 5bc29197d90e..eed81cc56d7e 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -87,6 +87,7 @@ struct nvme_dev { struct list_head namespaces; struct kref kref; struct miscdevice miscdev; + struct work_struct reset_work; char name[12]; char serial[20]; char model[40]; -- cgit v1.2.3-70-g09d2 From ec65993443736a5091b68e80ff1734548944a4b8 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 21 Jan 2014 14:33:16 -0800 Subject: mm, x86: Account for TLB flushes only when debugging Bisection between 3.11 and 3.12 fingered commit 9824cf97 ("mm: vmstats: tlb flush counters") to cause overhead problems. The counters are undeniably useful but how often do we really need to debug TLB flush related issues? It does not justify taking the penalty everywhere so make it a debugging option. Signed-off-by: Mel Gorman Tested-by: Davidlohr Bueso Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Cc: Hugh Dickins Cc: Alex Shi Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-XzxjntugxuwpxXhcrxqqh53b@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/tlbflush.h | 6 +++--- arch/x86/kernel/cpu/mtrr/generic.c | 4 ++-- arch/x86/mm/tlb.c | 14 +++++++------- include/linux/vm_event_item.h | 4 +++- include/linux/vmstat.h | 8 ++++++++ mm/vmstat.c | 4 +++- 6 files changed, 26 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index e6d90babc245..04905bfc508b 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -62,7 +62,7 @@ static inline void __flush_tlb_all(void) static inline void __flush_tlb_one(unsigned long addr) { - count_vm_event(NR_TLB_LOCAL_FLUSH_ONE); + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); __flush_tlb_single(addr); } @@ -93,13 +93,13 @@ static inline void __flush_tlb_one(unsigned long addr) */ static inline void __flush_tlb_up(void) { - count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); __flush_tlb(); } static inline void flush_tlb_all(void) { - count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); __flush_tlb_all(); } diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index ce2d0a2c3e4f..0e25a1bc5ab5 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -683,7 +683,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) } /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */ - count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); __flush_tlb(); /* Save MTRR state */ @@ -697,7 +697,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) static void post_set(void) __releases(set_atomicity_lock) { /* Flush TLBs (no need to flush caches - they are disabled) */ - count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); __flush_tlb(); /* Intel (P6) standard MTRRs */ diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index ae699b3bbac8..05446c1cccfe 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -103,7 +103,7 @@ static void flush_tlb_func(void *info) if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) return; - count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED); + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { if (f->flush_end == TLB_FLUSH_ALL) local_flush_tlb(); @@ -131,7 +131,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask, info.flush_start = start; info.flush_end = end; - count_vm_event(NR_TLB_REMOTE_FLUSH); + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); if (is_uv_system()) { unsigned int cpu; @@ -151,7 +151,7 @@ void flush_tlb_current_task(void) preempt_disable(); - count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); local_flush_tlb(); if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); @@ -215,7 +215,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, /* tlb_flushall_shift is on balance point, details in commit log */ if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) { - count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); local_flush_tlb(); } else { if (has_large_page(mm, start, end)) { @@ -224,7 +224,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, } /* flush range by one by one 'invlpg' */ for (addr = start; addr < end; addr += PAGE_SIZE) { - count_vm_event(NR_TLB_LOCAL_FLUSH_ONE); + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); __flush_tlb_single(addr); } @@ -262,7 +262,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start) static void do_flush_tlb_all(void *info) { - count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED); + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); __flush_tlb_all(); if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) leave_mm(smp_processor_id()); @@ -270,7 +270,7 @@ static void do_flush_tlb_all(void *info) void flush_tlb_all(void) { - count_vm_event(NR_TLB_REMOTE_FLUSH); + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); on_each_cpu(do_flush_tlb_all, NULL, 1); } diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index c557c6d096de..3a712e2e7d76 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -71,12 +71,14 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, THP_ZERO_PAGE_ALLOC, THP_ZERO_PAGE_ALLOC_FAILED, #endif +#ifdef CONFIG_DEBUG_TLBFLUSH #ifdef CONFIG_SMP NR_TLB_REMOTE_FLUSH, /* cpu tried to flush others' tlbs */ NR_TLB_REMOTE_FLUSH_RECEIVED,/* cpu received ipi for flush */ -#endif +#endif /* CONFIG_SMP */ NR_TLB_LOCAL_FLUSH_ALL, NR_TLB_LOCAL_FLUSH_ONE, +#endif /* CONFIG_DEBUG_TLBFLUSH */ NR_VM_EVENT_ITEMS }; diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index e4b948080d20..80ebba9c2e87 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -83,6 +83,14 @@ static inline void vm_events_fold_cpu(int cpu) #define count_vm_numa_events(x, y) do { (void)(y); } while (0) #endif /* CONFIG_NUMA_BALANCING */ +#ifdef CONFIG_DEBUG_TLBFLUSH +#define count_vm_tlb_event(x) count_vm_event(x) +#define count_vm_tlb_events(x, y) count_vm_events(x, y) +#else +#define count_vm_tlb_event(x) do {} while (0) +#define count_vm_tlb_events(x, y) do { (void)(y); } while (0) +#endif + #define __count_zone_vm_events(item, zone, delta) \ __count_vm_events(item##_NORMAL - ZONE_NORMAL + \ zone_idx(zone), delta) diff --git a/mm/vmstat.c b/mm/vmstat.c index 72496140ac08..def5dd2fbe61 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -851,12 +851,14 @@ const char * const vmstat_text[] = { "thp_zero_page_alloc", "thp_zero_page_alloc_failed", #endif +#ifdef CONFIG_DEBUG_TLBFLUSH #ifdef CONFIG_SMP "nr_tlb_remote_flush", "nr_tlb_remote_flush_received", -#endif +#endif /* CONFIG_SMP */ "nr_tlb_local_flush_all", "nr_tlb_local_flush_one", +#endif /* CONFIG_DEBUG_TLBFLUSH */ #endif /* CONFIG_VM_EVENTS_COUNTERS */ }; -- cgit v1.2.3-70-g09d2 From 6e5f52674ff0756e61a8879f6232b9ac33735cba Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Sat, 25 Jan 2014 22:36:13 +0200 Subject: spi: spi.h: clarify the documentation of transfer_one Explicitly note the transfer_one and transfer_one_message are mutually exclusive, to make the text a little more newcomers friendly. Signed-off-by: Baruch Siach Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index a1d4ca290862..ad050f0dd39d 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -287,7 +287,10 @@ static inline void spi_unregister_driver(struct spi_driver *sdrv) * - return 1 if the transfer is still in progress. When * the driver is finished with this transfer it must * call spi_finalize_current_transfer() so the subsystem - * can issue the next transfer + * can issue the next transfer. Note: transfer_one and + * transfer_one_message are mutually exclusive; when both + * are set, the generic subsystem does not call your + * transfer_one callback. * @unprepare_message: undo any work done by prepare_message(). * @cs_gpios: Array of GPIOs to use as chip select lines; one per CS * number. Any individual value may be -ENOENT for CS lines that -- cgit v1.2.3-70-g09d2 From e9305331f64ab9e5210baa4307355635c5e849f4 Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Sat, 25 Jan 2014 22:36:15 +0200 Subject: spi: correct the transfer_one_message documentation wording The transfer_one_message callback handles messages, not transfers. Signed-off-by: Baruch Siach Signed-off-by: Mark Brown --- Documentation/spi/spi-summary | 2 +- include/linux/spi/spi.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/Documentation/spi/spi-summary b/Documentation/spi/spi-summary index dcaad476789c..7982bcc4d151 100644 --- a/Documentation/spi/spi-summary +++ b/Documentation/spi/spi-summary @@ -543,7 +543,7 @@ SPI MASTER METHODS queuing transfers that arrive in the meantime. When the driver is finished with this message, it must call spi_finalize_current_message() so the subsystem can issue the next - transfer. This may sleep. + message. This may sleep. master->transfer_one(struct spi_master *master, struct spi_device *spi, struct spi_transfer *transfer) diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index ad050f0dd39d..4203c66d8803 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -273,7 +273,7 @@ static inline void spi_unregister_driver(struct spi_driver *sdrv) * message while queuing transfers that arrive in the meantime. When the * driver is finished with this message, it must call * spi_finalize_current_message() so the subsystem can issue the next - * transfer + * message * @unprepare_transfer_hardware: there are currently no more messages on the * queue so the subsystem notifies the driver that it may relax the * hardware by issuing this call -- cgit v1.2.3-70-g09d2 From d4b4ff8e28b474fac0fbfa9cfc40f88b9e41e380 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 10 Dec 2013 13:10:37 -0700 Subject: NVMe: Schedule reset for failed controllers Schedules a controller reset when it indicates it has a failed status. If the device does not become ready after a reset, the pci device will be scheduled for removal. Signed-off-by: Keith Busch [fixed checkpatch issue] Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 21 +++++++++++++++++++-- include/linux/nvme.h | 1 + 2 files changed, 20 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 000bca43c23b..2f5b9f5f5a21 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -60,6 +60,8 @@ static LIST_HEAD(dev_list); static struct task_struct *nvme_thread; static struct workqueue_struct *nvme_workq; +static void nvme_reset_failed_dev(struct work_struct *ws); + /* * An NVM Express queue. Each device has at least two (one for admin * commands and one for I/O commands). @@ -1612,13 +1614,25 @@ static void nvme_resubmit_bios(struct nvme_queue *nvmeq) static int nvme_kthread(void *data) { - struct nvme_dev *dev; + struct nvme_dev *dev, *next; while (!kthread_should_stop()) { set_current_state(TASK_INTERRUPTIBLE); spin_lock(&dev_list_lock); - list_for_each_entry(dev, &dev_list, node) { + list_for_each_entry_safe(dev, next, &dev_list, node) { int i; + if (readl(&dev->bar->csts) & NVME_CSTS_CFS && + dev->initialized) { + if (work_busy(&dev->reset_work)) + continue; + list_del_init(&dev->node); + dev_warn(&dev->pci_dev->dev, + "Failed status, reset controller\n"); + INIT_WORK(&dev->reset_work, + nvme_reset_failed_dev); + queue_work(nvme_workq, &dev->reset_work); + continue; + } for (i = 0; i < dev->queue_count; i++) { struct nvme_queue *nvmeq = dev->queues[i]; if (!nvmeq) @@ -2006,6 +2020,7 @@ static void nvme_dev_shutdown(struct nvme_dev *dev) { int i; + dev->initialized = 0; for (i = dev->queue_count - 1; i >= 0; i--) nvme_disable_queue(dev, i); @@ -2196,6 +2211,7 @@ static int nvme_dev_resume(struct nvme_dev *dev) queue_work(nvme_workq, &dev->reset_work); spin_unlock(&dev_list_lock); } + dev->initialized = 1; return 0; } @@ -2269,6 +2285,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (result) goto remove; + dev->initialized = 1; kref_init(&dev->kref); return 0; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index eed81cc56d7e..117d877e8be5 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -95,6 +95,7 @@ struct nvme_dev { u32 max_hw_sectors; u32 stripe_size; u16 oncs; + u8 initialized; }; /* -- cgit v1.2.3-70-g09d2 From c30341dc3c436cf43508cd44cdfbb3810c38c195 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 10 Dec 2013 13:10:38 -0700 Subject: NVMe: Abort timed out commands Send nvme abort command to io requests that have timed out on an initialized device. If the command is not returned after another timeout, schedule the controller for reset. Signed-off-by: Keith Busch [fix endianness issues] Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++- include/linux/nvme.h | 1 + include/uapi/linux/nvme.h | 11 ++++++++ 3 files changed, 79 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 2f5b9f5f5a21..75049532df96 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -83,6 +83,7 @@ struct nvme_queue { u16 sq_head; u16 sq_tail; u16 cq_head; + u16 qid; u8 cq_phase; u8 cqe_seen; u8 q_suspended; @@ -100,6 +101,7 @@ static inline void _nvme_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64); BUILD_BUG_ON(sizeof(struct nvme_features) != 64); BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64); + BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64); BUILD_BUG_ON(sizeof(struct nvme_command) != 64); BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096); BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096); @@ -114,6 +116,7 @@ struct nvme_cmd_info { nvme_completion_fn fn; void *ctx; unsigned long timeout; + int aborted; }; static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq) @@ -157,6 +160,7 @@ static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx, info[cmdid].fn = handler; info[cmdid].ctx = ctx; info[cmdid].timeout = jiffies + timeout; + info[cmdid].aborted = 0; return cmdid; } @@ -175,6 +179,7 @@ static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx, #define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE) #define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE) #define CMD_CTX_FLUSH (0x318 + CMD_CTX_BASE) +#define CMD_CTX_ABORT (0x31C + CMD_CTX_BASE) static void special_completion(struct nvme_dev *dev, void *ctx, struct nvme_completion *cqe) @@ -183,6 +188,10 @@ static void special_completion(struct nvme_dev *dev, void *ctx, return; if (ctx == CMD_CTX_FLUSH) return; + if (ctx == CMD_CTX_ABORT) { + ++dev->abort_limit; + return; + } if (ctx == CMD_CTX_COMPLETED) { dev_warn(&dev->pci_dev->dev, "completed id %d twice on queue %d\n", @@ -1004,6 +1013,56 @@ int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11, return nvme_submit_admin_cmd(dev, &c, result); } +/** + * nvme_abort_cmd - Attempt aborting a command + * @cmdid: Command id of a timed out IO + * @queue: The queue with timed out IO + * + * Schedule controller reset if the command was already aborted once before and + * still hasn't been returned to the driver, or if this is the admin queue. + */ +static void nvme_abort_cmd(int cmdid, struct nvme_queue *nvmeq) +{ + int a_cmdid; + struct nvme_command cmd; + struct nvme_dev *dev = nvmeq->dev; + struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); + + if (!nvmeq->qid || info[cmdid].aborted) { + if (work_busy(&dev->reset_work)) + return; + list_del_init(&dev->node); + dev_warn(&dev->pci_dev->dev, + "I/O %d QID %d timeout, reset controller\n", cmdid, + nvmeq->qid); + INIT_WORK(&dev->reset_work, nvme_reset_failed_dev); + queue_work(nvme_workq, &dev->reset_work); + return; + } + + if (!dev->abort_limit) + return; + + a_cmdid = alloc_cmdid(dev->queues[0], CMD_CTX_ABORT, special_completion, + ADMIN_TIMEOUT); + if (a_cmdid < 0) + return; + + memset(&cmd, 0, sizeof(cmd)); + cmd.abort.opcode = nvme_admin_abort_cmd; + cmd.abort.cid = cmdid; + cmd.abort.sqid = cpu_to_le16(nvmeq->qid); + cmd.abort.command_id = a_cmdid; + + --dev->abort_limit; + info[cmdid].aborted = 1; + info[cmdid].timeout = jiffies + ADMIN_TIMEOUT; + + dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", cmdid, + nvmeq->qid); + nvme_submit_cmd(dev->queues[0], &cmd); +} + /** * nvme_cancel_ios - Cancel outstanding I/Os * @queue: The queue to cancel I/Os on @@ -1027,7 +1086,12 @@ static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout) continue; if (info[cmdid].ctx == CMD_CTX_CANCELLED) continue; - dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d\n", cmdid); + if (timeout && nvmeq->dev->initialized) { + nvme_abort_cmd(cmdid, nvmeq); + continue; + } + dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n", cmdid, + nvmeq->qid); ctx = cancel_cmdid(nvmeq, cmdid, &fn); fn(nvmeq->dev, ctx, &cqe); } @@ -1119,6 +1183,7 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; nvmeq->q_depth = depth; nvmeq->cq_vector = vector; + nvmeq->qid = qid; nvmeq->q_suspended = 1; dev->queue_count++; @@ -1929,6 +1994,7 @@ static int nvme_dev_add(struct nvme_dev *dev) ctrl = mem; nn = le32_to_cpup(&ctrl->nn); dev->oncs = le16_to_cpup(&ctrl->oncs); + dev->abort_limit = ctrl->acl + 1; memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn)); memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn)); memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr)); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 117d877e8be5..69ae03f6eb15 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -95,6 +95,7 @@ struct nvme_dev { u32 max_hw_sectors; u32 stripe_size; u16 oncs; + u16 abort_limit; u8 initialized; }; diff --git a/include/uapi/linux/nvme.h b/include/uapi/linux/nvme.h index 989c04e0c563..e5ab62201119 100644 --- a/include/uapi/linux/nvme.h +++ b/include/uapi/linux/nvme.h @@ -350,6 +350,16 @@ struct nvme_delete_queue { __u32 rsvd11[5]; }; +struct nvme_abort_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[9]; + __le16 sqid; + __u16 cid; + __u32 rsvd11[5]; +}; + struct nvme_download_firmware { __u8 opcode; __u8 flags; @@ -384,6 +394,7 @@ struct nvme_command { struct nvme_download_firmware dlfw; struct nvme_format_cmd format; struct nvme_dsm_cmd dsm; + struct nvme_abort_cmd abort; }; }; -- cgit v1.2.3-70-g09d2 From f0276924fa35a3607920a58cf5d878212824b951 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 31 Dec 2013 11:38:50 +0800 Subject: blk-mq: Don't reserve a tag for flush request Reserving a tag (request) for flush to avoid dead lock is a overkill. A tag is valuable resource. We can track the number of flush requests and disallow having too many pending flush requests allocated. With this patch, blk_mq_alloc_request_pinned() could do a busy nop (but not a dead loop) if too many pending requests are allocated and new flush request is allocated. But this should not be a problem, too many pending flush requests are very rare case. I verified this can fix the deadlock caused by too many pending flush requests. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-flush.c | 8 +++++--- block/blk-mq.c | 46 ++++++++++++++++++++++++++++++---------------- include/linux/blk-mq.h | 3 +++ 3 files changed, 38 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/block/blk-flush.c b/block/blk-flush.c index 9288aaf35c21..9143e85226c7 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -284,9 +284,8 @@ static void mq_flush_work(struct work_struct *work) q = container_of(work, struct request_queue, mq_flush_work); - /* We don't need set REQ_FLUSH_SEQ, it's for consistency */ rq = blk_mq_alloc_request(q, WRITE_FLUSH|REQ_FLUSH_SEQ, - __GFP_WAIT|GFP_ATOMIC, true); + __GFP_WAIT|GFP_ATOMIC, false); rq->cmd_type = REQ_TYPE_FS; rq->end_io = flush_end_io; @@ -408,8 +407,11 @@ void blk_insert_flush(struct request *rq) /* * @policy now records what operations need to be done. Adjust * REQ_FLUSH and FUA for the driver. + * We keep REQ_FLUSH for mq to track flush requests. For !FUA, + * we never dispatch the request directly. */ - rq->cmd_flags &= ~REQ_FLUSH; + if (rq->cmd_flags & REQ_FUA) + rq->cmd_flags &= ~REQ_FLUSH; if (!(fflags & REQ_FUA)) rq->cmd_flags &= ~REQ_FUA; diff --git a/block/blk-mq.c b/block/blk-mq.c index 57039fcd9c93..9072d0ab184f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -194,9 +194,27 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, } static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx, - gfp_t gfp, bool reserved) + gfp_t gfp, bool reserved, + int rw) { - return blk_mq_alloc_rq(hctx, gfp, reserved); + struct request *req; + bool is_flush = false; + /* + * flush need allocate a request, leave at least one request for + * non-flush IO to avoid deadlock + */ + if ((rw & REQ_FLUSH) && !(rw & REQ_FLUSH_SEQ)) { + if (atomic_inc_return(&hctx->pending_flush) >= + hctx->queue_depth - hctx->reserved_tags - 1) { + atomic_dec(&hctx->pending_flush); + return NULL; + } + is_flush = true; + } + req = blk_mq_alloc_rq(hctx, gfp, reserved); + if (!req && is_flush) + atomic_dec(&hctx->pending_flush); + return req; } static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, @@ -209,7 +227,7 @@ static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu); - rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved); + rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved, rw); if (rq) { blk_mq_rq_ctx_init(q, ctx, rq, rw); break; @@ -272,6 +290,9 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, const int tag = rq->tag; struct request_queue *q = rq->q; + if ((rq->cmd_flags & REQ_FLUSH) && !(rq->cmd_flags & REQ_FLUSH_SEQ)) + atomic_dec(&hctx->pending_flush); + blk_mq_rq_init(hctx, rq); blk_mq_put_tag(hctx->tags, tag); @@ -900,14 +921,14 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) hctx = q->mq_ops->map_queue(q, ctx->cpu); trace_block_getrq(q, bio, rw); - rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false); + rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false, bio->bi_rw); if (likely(rq)) - blk_mq_rq_ctx_init(q, ctx, rq, rw); + blk_mq_rq_ctx_init(q, ctx, rq, bio->bi_rw); else { blk_mq_put_ctx(ctx); trace_block_sleeprq(q, bio, rw); - rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT|GFP_ATOMIC, - false); + rq = blk_mq_alloc_request_pinned(q, bio->bi_rw, + __GFP_WAIT|GFP_ATOMIC, false); ctx = rq->mq_ctx; hctx = q->mq_ops->map_queue(q, ctx->cpu); } @@ -1184,7 +1205,9 @@ static int blk_mq_init_hw_queues(struct request_queue *q, hctx->queue_num = i; hctx->flags = reg->flags; hctx->queue_depth = reg->queue_depth; + hctx->reserved_tags = reg->reserved_tags; hctx->cmd_size = reg->cmd_size; + atomic_set(&hctx->pending_flush, 0); blk_mq_init_cpu_notifier(&hctx->cpu_notifier, blk_mq_hctx_notify, hctx); @@ -1309,15 +1332,6 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg, reg->queue_depth = BLK_MQ_MAX_DEPTH; } - /* - * Set aside a tag for flush requests. It will only be used while - * another flush request is in progress but outside the driver. - * - * TODO: only allocate if flushes are supported - */ - reg->queue_depth++; - reg->reserved_tags++; - if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN)) return ERR_PTR(-EINVAL); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 161b23105b1e..1e8f16f65af4 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -36,12 +36,15 @@ struct blk_mq_hw_ctx { struct list_head page_list; struct blk_mq_tags *tags; + atomic_t pending_flush; + unsigned long queued; unsigned long run; #define BLK_MQ_MAX_DISPATCH_ORDER 10 unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER]; unsigned int queue_depth; + unsigned int reserved_tags; unsigned int numa_node; unsigned int cmd_size; /* per-request extra data */ -- cgit v1.2.3-70-g09d2 From 0ae89beb283a0db5980d1d4781c7d7be2f2810d6 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Thu, 30 Jan 2014 10:11:28 +0100 Subject: can: add destructor for self generated skbs Self generated skbuffs in net/can/bcm.c are setting a skb->sk reference but no explicit destructor which is enforced since Linux 3.11 with commit 376c7311bdb6 (net: add a temporary sanity check in skb_orphan()). This patch adds some helper functions to make sure that a destructor is properly defined when a sock reference is assigned to a CAN related skb. To create an unshared skb owned by the original sock a common helper function has been introduced to replace open coded functions to create CAN echo skbs. Signed-off-by: Oliver Hartkopp Tested-by: Andre Naujoks Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/can/dev.c | 15 +++------------ drivers/net/can/janz-ican3.c | 18 ++++-------------- drivers/net/can/vcan.c | 9 ++++----- include/linux/can/skb.h | 38 ++++++++++++++++++++++++++++++++++++++ net/can/af_can.c | 3 ++- net/can/bcm.c | 4 ++-- 6 files changed, 53 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c index 13a909822e25..fc59bc6f040b 100644 --- a/drivers/net/can/dev.c +++ b/drivers/net/can/dev.c @@ -323,19 +323,10 @@ void can_put_echo_skb(struct sk_buff *skb, struct net_device *dev, } if (!priv->echo_skb[idx]) { - struct sock *srcsk = skb->sk; - if (atomic_read(&skb->users) != 1) { - struct sk_buff *old_skb = skb; - - skb = skb_clone(old_skb, GFP_ATOMIC); - kfree_skb(old_skb); - if (!skb) - return; - } else - skb_orphan(skb); - - skb->sk = srcsk; + skb = can_create_echo_skb(skb); + if (!skb) + return; /* make settings for echo to reduce code in irq context */ skb->protocol = htons(ETH_P_CAN); diff --git a/drivers/net/can/janz-ican3.c b/drivers/net/can/janz-ican3.c index e24e6690d672..2124c6790687 100644 --- a/drivers/net/can/janz-ican3.c +++ b/drivers/net/can/janz-ican3.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -1133,20 +1134,9 @@ static void ican3_handle_message(struct ican3_dev *mod, struct ican3_msg *msg) */ static void ican3_put_echo_skb(struct ican3_dev *mod, struct sk_buff *skb) { - struct sock *srcsk = skb->sk; - - if (atomic_read(&skb->users) != 1) { - struct sk_buff *old_skb = skb; - - skb = skb_clone(old_skb, GFP_ATOMIC); - kfree_skb(old_skb); - if (!skb) - return; - } else { - skb_orphan(skb); - } - - skb->sk = srcsk; + skb = can_create_echo_skb(skb); + if (!skb) + return; /* save this skb for tx interrupt echo handling */ skb_queue_tail(&mod->echoq, skb); diff --git a/drivers/net/can/vcan.c b/drivers/net/can/vcan.c index 0a2a5ee79a17..4e94057ef5cf 100644 --- a/drivers/net/can/vcan.c +++ b/drivers/net/can/vcan.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -109,25 +110,23 @@ static netdev_tx_t vcan_tx(struct sk_buff *skb, struct net_device *dev) stats->rx_packets++; stats->rx_bytes += cfd->len; } - kfree_skb(skb); + consume_skb(skb); return NETDEV_TX_OK; } /* perform standard echo handling for CAN network interfaces */ if (loop) { - struct sock *srcsk = skb->sk; - skb = skb_share_check(skb, GFP_ATOMIC); + skb = can_create_echo_skb(skb); if (!skb) return NETDEV_TX_OK; /* receive with packet counting */ - skb->sk = srcsk; vcan_rx(skb, dev); } else { /* no looped packets => no counting */ - kfree_skb(skb); + consume_skb(skb); } return NETDEV_TX_OK; } diff --git a/include/linux/can/skb.h b/include/linux/can/skb.h index 2f0543f7510c..f9bbbb472663 100644 --- a/include/linux/can/skb.h +++ b/include/linux/can/skb.h @@ -11,7 +11,9 @@ #define CAN_SKB_H #include +#include #include +#include /* * The struct can_skb_priv is used to transport additional information along @@ -42,4 +44,40 @@ static inline void can_skb_reserve(struct sk_buff *skb) skb_reserve(skb, sizeof(struct can_skb_priv)); } +static inline void can_skb_destructor(struct sk_buff *skb) +{ + sock_put(skb->sk); +} + +static inline void can_skb_set_owner(struct sk_buff *skb, struct sock *sk) +{ + if (sk) { + sock_hold(sk); + skb->destructor = can_skb_destructor; + skb->sk = sk; + } +} + +/* + * returns an unshared skb owned by the original sock to be echo'ed back + */ +static inline struct sk_buff *can_create_echo_skb(struct sk_buff *skb) +{ + if (skb_shared(skb)) { + struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); + + if (likely(nskb)) { + can_skb_set_owner(nskb, skb->sk); + consume_skb(skb); + return nskb; + } else { + kfree_skb(skb); + return NULL; + } + } + + /* we can assume to have an unshared skb with proper owner */ + return skb; +} + #endif /* CAN_SKB_H */ diff --git a/net/can/af_can.c b/net/can/af_can.c index d249874a366d..a27f8aad9e99 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include #include @@ -290,7 +291,7 @@ int can_send(struct sk_buff *skb, int loop) return -ENOMEM; } - newskb->sk = skb->sk; + can_skb_set_owner(newskb, skb->sk); newskb->ip_summed = CHECKSUM_UNNECESSARY; newskb->pkt_type = PACKET_BROADCAST; } diff --git a/net/can/bcm.c b/net/can/bcm.c index 3fc737b214c7..dcb75c0e66c1 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -268,7 +268,7 @@ static void bcm_can_tx(struct bcm_op *op) /* send with loopback */ skb->dev = dev; - skb->sk = op->sk; + can_skb_set_owner(skb, op->sk); can_send(skb, 1); /* update statistics */ @@ -1223,7 +1223,7 @@ static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk) can_skb_prv(skb)->ifindex = dev->ifindex; skb->dev = dev; - skb->sk = sk; + can_skb_set_owner(skb, sk); err = can_send(skb, 1); /* send with loopback */ dev_put(dev); -- cgit v1.2.3-70-g09d2 From 17ead6c85c3d0ef57a14d1373f1f1cee2ce60ea8 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 1 Feb 2014 14:53:23 -0500 Subject: NFSv4: Fix memory corruption in nfs4_proc_open_confirm nfs41_wake_and_assign_slot() relies on the task->tk_msg.rpc_argp and task->tk_msg.rpc_resp always pointing to the session sequence arguments. nfs4_proc_open_confirm tries to pull a fast one by reusing the open sequence structure, thus causing corruption of the NFSv4 slot table. Cc: stable@vger.kernel.org # 3.12+ Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 8 ++++---- include/linux/nfs_xdr.h | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 42da6af77587..2da6a698b8f7 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1620,15 +1620,15 @@ static void nfs4_open_confirm_prepare(struct rpc_task *task, void *calldata) { struct nfs4_opendata *data = calldata; - nfs40_setup_sequence(data->o_arg.server, &data->o_arg.seq_args, - &data->o_res.seq_res, task); + nfs40_setup_sequence(data->o_arg.server, &data->c_arg.seq_args, + &data->c_res.seq_res, task); } static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata) { struct nfs4_opendata *data = calldata; - nfs40_sequence_done(task, &data->o_res.seq_res); + nfs40_sequence_done(task, &data->c_res.seq_res); data->rpc_status = task->tk_status; if (data->rpc_status == 0) { @@ -1686,7 +1686,7 @@ static int _nfs4_proc_open_confirm(struct nfs4_opendata *data) }; int status; - nfs4_init_sequence(&data->o_arg.seq_args, &data->o_res.seq_res, 1); + nfs4_init_sequence(&data->c_arg.seq_args, &data->c_res.seq_res, 1); kref_get(&data->kref); data->rpc_done = 0; data->rpc_status = 0; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 3ccfcecf8999..b2fb167b2e6d 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -379,12 +379,14 @@ struct nfs_openres { * Arguments to the open_confirm call. */ struct nfs_open_confirmargs { + struct nfs4_sequence_args seq_args; const struct nfs_fh * fh; nfs4_stateid * stateid; struct nfs_seqid * seqid; }; struct nfs_open_confirmres { + struct nfs4_sequence_res seq_res; nfs4_stateid stateid; struct nfs_seqid * seqid; }; -- cgit v1.2.3-70-g09d2 From e85fc9805591a17ca8af50023ee8e2b61d9a123b Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 3 Feb 2014 06:43:59 -0500 Subject: Revert "xen/grant-table: Avoid m2p_override during mapping" This reverts commit 08ece5bb2312b4510b161a6ef6682f37f4eac8a1. As it breaks ARM builds and needs more attention on the ARM side. Acked-by: David Vrabel Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/xen/page.h | 5 +-- arch/x86/xen/p2m.c | 17 ++++++- drivers/block/xen-blkback/blkback.c | 15 ++++--- drivers/xen/gntdev.c | 13 +++--- drivers/xen/grant-table.c | 89 ++++++------------------------------- include/xen/grant_table.h | 8 +--- 6 files changed, 46 insertions(+), 101 deletions(-) (limited to 'include') diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 787e1bb5aafc..3e276eb23d1b 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -52,8 +52,7 @@ extern unsigned long set_phys_range_identity(unsigned long pfn_s, extern int m2p_add_override(unsigned long mfn, struct page *page, struct gnttab_map_grant_ref *kmap_op); extern int m2p_remove_override(struct page *page, - struct gnttab_map_grant_ref *kmap_op, - unsigned long mfn); + struct gnttab_map_grant_ref *kmap_op); extern struct page *m2p_find_override(unsigned long mfn); extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn); @@ -122,7 +121,7 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn) pfn = m2p_find_override_pfn(mfn, ~0); } - /* + /* * pfn is ~0 if there are no entries in the m2p for mfn or if the * entry doesn't map back to the mfn and m2p_override doesn't have a * valid entry for it. diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 8009acbe41e4..696c694986d0 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -899,6 +899,13 @@ int m2p_add_override(unsigned long mfn, struct page *page, "m2p_add_override: pfn %lx not mapped", pfn)) return -EINVAL; } + WARN_ON(PagePrivate(page)); + SetPagePrivate(page); + set_page_private(page, mfn); + page->index = pfn_to_mfn(pfn); + + if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) + return -ENOMEM; if (kmap_op != NULL) { if (!PageHighMem(page)) { @@ -937,16 +944,19 @@ int m2p_add_override(unsigned long mfn, struct page *page, } EXPORT_SYMBOL_GPL(m2p_add_override); int m2p_remove_override(struct page *page, - struct gnttab_map_grant_ref *kmap_op, - unsigned long mfn) + struct gnttab_map_grant_ref *kmap_op) { unsigned long flags; + unsigned long mfn; unsigned long pfn; unsigned long uninitialized_var(address); unsigned level; pte_t *ptep = NULL; pfn = page_to_pfn(page); + mfn = get_phys_to_machine(pfn); + if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT)) + return -EINVAL; if (!PageHighMem(page)) { address = (unsigned long)__va(pfn << PAGE_SHIFT); @@ -960,7 +970,10 @@ int m2p_remove_override(struct page *page, spin_lock_irqsave(&m2p_override_lock, flags); list_del(&page->lru); spin_unlock_irqrestore(&m2p_override_lock, flags); + WARN_ON(!PagePrivate(page)); + ClearPagePrivate(page); + set_phys_to_machine(pfn, page->index); if (kmap_op != NULL) { if (!PageHighMem(page)) { struct multicall_space mcs; diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 875025f299b6..6620b73d0490 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -285,7 +285,8 @@ static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root, if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST || !rb_next(&persistent_gnt->node)) { - ret = gnttab_unmap_refs(unmap, pages, segs_to_unmap); + ret = gnttab_unmap_refs(unmap, NULL, pages, + segs_to_unmap); BUG_ON(ret); put_free_pages(blkif, pages, segs_to_unmap); segs_to_unmap = 0; @@ -320,7 +321,8 @@ static void unmap_purged_grants(struct work_struct *work) pages[segs_to_unmap] = persistent_gnt->page; if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) { - ret = gnttab_unmap_refs(unmap, pages, segs_to_unmap); + ret = gnttab_unmap_refs(unmap, NULL, pages, + segs_to_unmap); BUG_ON(ret); put_free_pages(blkif, pages, segs_to_unmap); segs_to_unmap = 0; @@ -328,7 +330,7 @@ static void unmap_purged_grants(struct work_struct *work) kfree(persistent_gnt); } if (segs_to_unmap > 0) { - ret = gnttab_unmap_refs(unmap, pages, segs_to_unmap); + ret = gnttab_unmap_refs(unmap, NULL, pages, segs_to_unmap); BUG_ON(ret); put_free_pages(blkif, pages, segs_to_unmap); } @@ -668,14 +670,15 @@ static void xen_blkbk_unmap(struct xen_blkif *blkif, GNTMAP_host_map, pages[i]->handle); pages[i]->handle = BLKBACK_INVALID_HANDLE; if (++invcount == BLKIF_MAX_SEGMENTS_PER_REQUEST) { - ret = gnttab_unmap_refs(unmap, unmap_pages, invcount); + ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, + invcount); BUG_ON(ret); put_free_pages(blkif, unmap_pages, invcount); invcount = 0; } } if (invcount) { - ret = gnttab_unmap_refs(unmap, unmap_pages, invcount); + ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount); BUG_ON(ret); put_free_pages(blkif, unmap_pages, invcount); } @@ -737,7 +740,7 @@ again: } if (segs_to_map) { - ret = gnttab_map_refs(map, pages_to_gnt, segs_to_map); + ret = gnttab_map_refs(map, NULL, pages_to_gnt, segs_to_map); BUG_ON(ret); } diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 34a2704fbc88..073b4a19a8b0 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -284,10 +284,8 @@ static int map_grant_pages(struct grant_map *map) } pr_debug("map %d+%d\n", map->index, map->count); - err = gnttab_map_refs_userspace(map->map_ops, - use_ptemod ? map->kmap_ops : NULL, - map->pages, - map->count); + err = gnttab_map_refs(map->map_ops, use_ptemod ? map->kmap_ops : NULL, + map->pages, map->count); if (err) return err; @@ -317,10 +315,9 @@ static int __unmap_grant_pages(struct grant_map *map, int offset, int pages) } } - err = gnttab_unmap_refs_userspace(map->unmap_ops + offset, - use_ptemod ? map->kmap_ops + offset : NULL, - map->pages + offset, - pages); + err = gnttab_unmap_refs(map->unmap_ops + offset, + use_ptemod ? map->kmap_ops + offset : NULL, map->pages + offset, + pages); if (err) return err; diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 8ee13e2e45e2..b84e3ab839aa 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -928,17 +928,15 @@ void gnttab_batch_copy(struct gnttab_copy *batch, unsigned count) } EXPORT_SYMBOL_GPL(gnttab_batch_copy); -int __gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, +int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, struct gnttab_map_grant_ref *kmap_ops, - struct page **pages, unsigned int count, - bool m2p_override) + struct page **pages, unsigned int count) { int i, ret; bool lazy = false; pte_t *pte; - unsigned long mfn, pfn; + unsigned long mfn; - BUG_ON(kmap_ops && !m2p_override); ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map_ops, count); if (ret) return ret; @@ -957,12 +955,10 @@ int __gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, set_phys_to_machine(map_ops[i].host_addr >> PAGE_SHIFT, map_ops[i].dev_bus_addr >> PAGE_SHIFT); } - return 0; + return ret; } - if (m2p_override && - !in_interrupt() && - paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { + if (!in_interrupt() && paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { arch_enter_lazy_mmu_mode(); lazy = true; } @@ -979,20 +975,8 @@ int __gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, } else { mfn = PFN_DOWN(map_ops[i].dev_bus_addr); } - pfn = page_to_pfn(pages[i]); - - WARN_ON(PagePrivate(pages[i])); - SetPagePrivate(pages[i]); - set_page_private(pages[i], mfn); - - pages[i]->index = pfn_to_mfn(pfn); - if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) { - ret = -ENOMEM; - goto out; - } - if (m2p_override) - ret = m2p_add_override(mfn, pages[i], kmap_ops ? - &kmap_ops[i] : NULL); + ret = m2p_add_override(mfn, pages[i], kmap_ops ? + &kmap_ops[i] : NULL); if (ret) goto out; } @@ -1003,32 +987,15 @@ int __gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, return ret; } - -int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, - struct page **pages, unsigned int count) -{ - return __gnttab_map_refs(map_ops, NULL, pages, count, false); -} EXPORT_SYMBOL_GPL(gnttab_map_refs); -int gnttab_map_refs_userspace(struct gnttab_map_grant_ref *map_ops, - struct gnttab_map_grant_ref *kmap_ops, - struct page **pages, unsigned int count) -{ - return __gnttab_map_refs(map_ops, kmap_ops, pages, count, true); -} -EXPORT_SYMBOL_GPL(gnttab_map_refs_userspace); - -int __gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, +int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, struct gnttab_map_grant_ref *kmap_ops, - struct page **pages, unsigned int count, - bool m2p_override) + struct page **pages, unsigned int count) { int i, ret; bool lazy = false; - unsigned long pfn, mfn; - BUG_ON(kmap_ops && !m2p_override); ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap_ops, count); if (ret) return ret; @@ -1039,33 +1006,17 @@ int __gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, set_phys_to_machine(unmap_ops[i].host_addr >> PAGE_SHIFT, INVALID_P2M_ENTRY); } - return 0; + return ret; } - if (m2p_override && - !in_interrupt() && - paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { + if (!in_interrupt() && paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { arch_enter_lazy_mmu_mode(); lazy = true; } for (i = 0; i < count; i++) { - pfn = page_to_pfn(pages[i]); - mfn = get_phys_to_machine(pfn); - if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT)) { - ret = -EINVAL; - goto out; - } - - set_page_private(pages[i], INVALID_P2M_ENTRY); - WARN_ON(!PagePrivate(pages[i])); - ClearPagePrivate(pages[i]); - set_phys_to_machine(pfn, pages[i]->index); - if (m2p_override) - ret = m2p_remove_override(pages[i], - kmap_ops ? - &kmap_ops[i] : NULL, - mfn); + ret = m2p_remove_override(pages[i], kmap_ops ? + &kmap_ops[i] : NULL); if (ret) goto out; } @@ -1076,22 +1027,8 @@ int __gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, return ret; } - -int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *map_ops, - struct page **pages, unsigned int count) -{ - return __gnttab_unmap_refs(map_ops, NULL, pages, count, false); -} EXPORT_SYMBOL_GPL(gnttab_unmap_refs); -int gnttab_unmap_refs_userspace(struct gnttab_unmap_grant_ref *map_ops, - struct gnttab_map_grant_ref *kmap_ops, - struct page **pages, unsigned int count) -{ - return __gnttab_unmap_refs(map_ops, kmap_ops, pages, count, true); -} -EXPORT_SYMBOL_GPL(gnttab_unmap_refs_userspace); - static unsigned nr_status_frames(unsigned nr_grant_frames) { BUG_ON(grefs_per_grant_frame == 0); diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index 7ad033dbc845..a5af2a26d94f 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -191,15 +191,11 @@ void gnttab_free_auto_xlat_frames(void); #define gnttab_map_vaddr(map) ((void *)(map.host_virt_addr)) int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, + struct gnttab_map_grant_ref *kmap_ops, struct page **pages, unsigned int count); -int gnttab_map_refs_userspace(struct gnttab_map_grant_ref *map_ops, - struct gnttab_map_grant_ref *kmap_ops, - struct page **pages, unsigned int count); int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, + struct gnttab_map_grant_ref *kunmap_ops, struct page **pages, unsigned int count); -int gnttab_unmap_refs_userspace(struct gnttab_unmap_grant_ref *unmap_ops, - struct gnttab_map_grant_ref *kunmap_ops, - struct page **pages, unsigned int count); /* Perform a batch of grant map/copy operations. Retry every batch slot * for which the hypervisor returns GNTST_eagain. This is typically due -- cgit v1.2.3-70-g09d2 From 48b8f6315a6a230f859fcae8ef4a0ccf4f48e802 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Fri, 31 Jan 2014 15:49:07 +0200 Subject: drm: add DRM_INFO_ONCE() to print a one-time DRM_INFO() message Just like DRM_INFO(), but only do it once. Signed-off-by: Jani Nikula Reviewed-by: Chris Wilson Acked-by: Dave Airlie Signed-off-by: Daniel Vetter --- include/drm/drmP.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/drm/drmP.h b/include/drm/drmP.h index 04086c5be930..04a7f31301f8 100644 --- a/include/drm/drmP.h +++ b/include/drm/drmP.h @@ -199,6 +199,9 @@ int drm_err(const char *func, const char *format, ...); #define DRM_INFO(fmt, ...) \ printk(KERN_INFO "[" DRM_NAME "] " fmt, ##__VA_ARGS__) +#define DRM_INFO_ONCE(fmt, ...) \ + printk_once(KERN_INFO "[" DRM_NAME "] " fmt, ##__VA_ARGS__) + /** * Debug output. * -- cgit v1.2.3-70-g09d2 From a13aff0641a92dc0b95136e32526e2ce81ffc4ef Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Tue, 4 Feb 2014 03:33:10 +0400 Subject: net: ethoc: set up MII management bus clock MII management bus clock is derived from the MAC clock by dividing it by MIIMODER register CLKDIV field value. This value may need to be set up in case it is undefined or its default value is too high (and communication with PHY is too slow) or too low (and communication with PHY is impossible). The value of CLKDIV is not specified directly, but is derived from the MAC clock for the default MII management bus frequency of 2.5MHz. The MAC clock may be specified in the platform data, or in the 'clocks' device tree attribute. Signed-off-by: Max Filippov Signed-off-by: David S. Miller --- drivers/net/ethernet/ethoc.c | 32 ++++++++++++++++++++++++++++++-- include/net/ethoc.h | 1 + 2 files changed, 31 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/ethoc.c b/drivers/net/ethernet/ethoc.c index 0a8533c0c01a..55e0fa03dc90 100644 --- a/drivers/net/ethernet/ethoc.c +++ b/drivers/net/ethernet/ethoc.c @@ -13,6 +13,7 @@ #include #include +#include #include #include #include @@ -219,6 +220,7 @@ struct ethoc { struct phy_device *phy; struct mii_bus *mdio; + struct clk *clk; s8 phy_id; }; @@ -1021,6 +1023,8 @@ static int ethoc_probe(struct platform_device *pdev) int num_bd; int ret = 0; bool random_mac = false; + struct ethoc_platform_data *pdata = dev_get_platdata(&pdev->dev); + u32 eth_clkfreq = pdata ? pdata->eth_clkfreq : 0; /* allocate networking device */ netdev = alloc_etherdev(sizeof(struct ethoc)); @@ -1135,8 +1139,7 @@ static int ethoc_probe(struct platform_device *pdev) } /* Allow the platform setup code to pass in a MAC address. */ - if (dev_get_platdata(&pdev->dev)) { - struct ethoc_platform_data *pdata = dev_get_platdata(&pdev->dev); + if (pdata) { memcpy(netdev->dev_addr, pdata->hwaddr, IFHWADDRLEN); priv->phy_id = pdata->phy_id; } else { @@ -1174,6 +1177,27 @@ static int ethoc_probe(struct platform_device *pdev) if (random_mac) netdev->addr_assign_type = NET_ADDR_RANDOM; + /* Allow the platform setup code to adjust MII management bus clock. */ + if (!eth_clkfreq) { + struct clk *clk = devm_clk_get(&pdev->dev, NULL); + + if (!IS_ERR(clk)) { + priv->clk = clk; + clk_prepare_enable(clk); + eth_clkfreq = clk_get_rate(clk); + } + } + if (eth_clkfreq) { + u32 clkdiv = MIIMODER_CLKDIV(eth_clkfreq / 2500000 + 1); + + if (!clkdiv) + clkdiv = 2; + dev_dbg(&pdev->dev, "setting MII clkdiv to %u\n", clkdiv); + ethoc_write(priv, MIIMODER, + (ethoc_read(priv, MIIMODER) & MIIMODER_NOPRE) | + clkdiv); + } + /* register MII bus */ priv->mdio = mdiobus_alloc(); if (!priv->mdio) { @@ -1239,6 +1263,8 @@ free_mdio: kfree(priv->mdio->irq); mdiobus_free(priv->mdio); free: + if (priv->clk) + clk_disable_unprepare(priv->clk); free_netdev(netdev); out: return ret; @@ -1263,6 +1289,8 @@ static int ethoc_remove(struct platform_device *pdev) kfree(priv->mdio->irq); mdiobus_free(priv->mdio); } + if (priv->clk) + clk_disable_unprepare(priv->clk); unregister_netdev(netdev); free_netdev(netdev); } diff --git a/include/net/ethoc.h b/include/net/ethoc.h index 96f3789b27bc..2a2d6bb34eb8 100644 --- a/include/net/ethoc.h +++ b/include/net/ethoc.h @@ -16,6 +16,7 @@ struct ethoc_platform_data { u8 hwaddr[IFHWADDRLEN]; s8 phy_id; + u32 eth_clkfreq; }; #endif /* !LINUX_NET_ETHOC_H */ -- cgit v1.2.3-70-g09d2 From 788a4d56ff378bff0b8e685d03a962b36903a149 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Tue, 4 Feb 2014 18:33:12 +0100 Subject: drivers: phy: Add support for optional phys Add devm_phy_optional_get and phy_optional_get, which should be used when the phy is optional. They does not return an error when the phy does not exist, rather they returns NULL, which is considered as a valid phy, but results in NOPs when used with the consumer API. Signed-off-by: Andrew Lunn Tested-by: Gregory CLEMENT Acked-by: Kishon Vijay Abraham I Signed-off-by: Jason Cooper --- Documentation/phy.txt | 20 +++++++++++++------- drivers/phy/phy-core.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/phy/phy.h | 14 ++++++++++++++ 3 files changed, 72 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/Documentation/phy.txt b/Documentation/phy.txt index 2e24b993e95f..ebff6ee52441 100644 --- a/Documentation/phy.txt +++ b/Documentation/phy.txt @@ -75,14 +75,20 @@ Before the controller can make use of the PHY, it has to get a reference to it. This framework provides the following APIs to get a reference to the PHY. struct phy *phy_get(struct device *dev, const char *string); +struct phy *phy_optional_get(struct device *dev, const char *string); struct phy *devm_phy_get(struct device *dev, const char *string); - -phy_get and devm_phy_get can be used to get the PHY. In the case of dt boot, -the string arguments should contain the phy name as given in the dt data and -in the case of non-dt boot, it should contain the label of the PHY. -The only difference between the two APIs is that devm_phy_get associates the -device with the PHY using devres on successful PHY get. On driver detach, -release function is invoked on the the devres data and devres data is freed. +struct phy *devm_phy_optional_get(struct device *dev, const char *string); + +phy_get, phy_optional_get, devm_phy_get and devm_phy_optional_get can +be used to get the PHY. In the case of dt boot, the string arguments +should contain the phy name as given in the dt data and in the case of +non-dt boot, it should contain the label of the PHY. The two +devm_phy_get associates the device with the PHY using devres on +successful PHY get. On driver detach, release function is invoked on +the the devres data and devres data is freed. phy_optional_get and +devm_phy_optional_get should be used when the phy is optional. These +two functions will never return -ENODEV, but instead returns NULL when +the phy cannot be found. It should be noted that NULL is a valid phy reference. All phy consumer calls on the NULL phy become NOPs. That is the release calls, diff --git a/drivers/phy/phy-core.c b/drivers/phy/phy-core.c index a9cdeee20d91..5f5b0f4be5be 100644 --- a/drivers/phy/phy-core.c +++ b/drivers/phy/phy-core.c @@ -425,6 +425,27 @@ struct phy *phy_get(struct device *dev, const char *string) } EXPORT_SYMBOL_GPL(phy_get); +/** + * phy_optional_get() - lookup and obtain a reference to an optional phy. + * @dev: device that requests this phy + * @string: the phy name as given in the dt data or the name of the controller + * port for non-dt case + * + * Returns the phy driver, after getting a refcount to it; or + * NULL if there is no such phy. The caller is responsible for + * calling phy_put() to release that count. + */ +struct phy *phy_optional_get(struct device *dev, const char *string) +{ + struct phy *phy = phy_get(dev, string); + + if (PTR_ERR(phy) == -ENODEV) + phy = NULL; + + return phy; +} +EXPORT_SYMBOL_GPL(phy_optional_get); + /** * devm_phy_get() - lookup and obtain a reference to a phy. * @dev: device that requests this phy @@ -455,6 +476,30 @@ struct phy *devm_phy_get(struct device *dev, const char *string) } EXPORT_SYMBOL_GPL(devm_phy_get); +/** + * devm_phy_optional_get() - lookup and obtain a reference to an optional phy. + * @dev: device that requests this phy + * @string: the phy name as given in the dt data or phy device name + * for non-dt case + * + * Gets the phy using phy_get(), and associates a device with it using + * devres. On driver detach, release function is invoked on the devres + * data, then, devres data is freed. This differs to devm_phy_get() in + * that if the phy does not exist, it is not considered an error and + * -ENODEV will not be returned. Instead the NULL phy is returned, + * which can be passed to all other phy consumer calls. + */ +struct phy *devm_phy_optional_get(struct device *dev, const char *string) +{ + struct phy *phy = devm_phy_get(dev, string); + + if (PTR_ERR(phy) == -ENODEV) + phy = NULL; + + return phy; +} +EXPORT_SYMBOL_GPL(devm_phy_optional_get); + /** * phy_create() - create a new phy * @dev: device that is creating the new phy diff --git a/include/linux/phy/phy.h b/include/linux/phy/phy.h index e273e5ac19c9..3f83459dbb20 100644 --- a/include/linux/phy/phy.h +++ b/include/linux/phy/phy.h @@ -146,7 +146,9 @@ static inline void phy_set_bus_width(struct phy *phy, int bus_width) phy->attrs.bus_width = bus_width; } struct phy *phy_get(struct device *dev, const char *string); +struct phy *phy_optional_get(struct device *dev, const char *string); struct phy *devm_phy_get(struct device *dev, const char *string); +struct phy *devm_phy_optional_get(struct device *dev, const char *string); void phy_put(struct phy *phy); void devm_phy_put(struct device *dev, struct phy *phy); struct phy *of_phy_simple_xlate(struct device *dev, @@ -232,11 +234,23 @@ static inline struct phy *phy_get(struct device *dev, const char *string) return ERR_PTR(-ENOSYS); } +static inline struct phy *phy_optional_get(struct device *dev, + const char *string) +{ + return ERR_PTR(-ENOSYS); +} + static inline struct phy *devm_phy_get(struct device *dev, const char *string) { return ERR_PTR(-ENOSYS); } +static inline struct phy *devm_phy_optional_get(struct device *dev, + const char *string) +{ + return ERR_PTR(-ENOSYS); +} + static inline void phy_put(struct phy *phy) { } -- cgit v1.2.3-70-g09d2 From 662372e42e46d9bbfcb83e1cce81f6b33cebaddd Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Mon, 3 Feb 2014 08:53:44 -0600 Subject: of: restructure for_each macros to fix compile warnings Commit 00b2c76a6a "include/linux/of.h: make for_each_child_of_node() reference its args when CONFIG_OF=n" fixed warnings for unused variables, but introduced variable "used uninitialized" warnings. Simply initializing the variables would result in "set but not used" warnings with W=1. Fix both types of warnings by making all the for_each macros unconditional and rely on the dummy static inline functions to initialize and reference any variables. Reported-by: Geert Uytterhoeven Cc: David Howells Signed-off-by: Rob Herring Acked-by: Grant Likely --- include/linux/of.h | 153 +++++++++++++++++++++++++++++------------------------ 1 file changed, 84 insertions(+), 69 deletions(-) (limited to 'include') diff --git a/include/linux/of.h b/include/linux/of.h index 70c64ba17fa5..435cb995904d 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -169,35 +169,15 @@ static inline const char *of_node_full_name(const struct device_node *np) extern struct device_node *of_find_node_by_name(struct device_node *from, const char *name); -#define for_each_node_by_name(dn, name) \ - for (dn = of_find_node_by_name(NULL, name); dn; \ - dn = of_find_node_by_name(dn, name)) extern struct device_node *of_find_node_by_type(struct device_node *from, const char *type); -#define for_each_node_by_type(dn, type) \ - for (dn = of_find_node_by_type(NULL, type); dn; \ - dn = of_find_node_by_type(dn, type)) extern struct device_node *of_find_compatible_node(struct device_node *from, const char *type, const char *compat); -#define for_each_compatible_node(dn, type, compatible) \ - for (dn = of_find_compatible_node(NULL, type, compatible); dn; \ - dn = of_find_compatible_node(dn, type, compatible)) extern struct device_node *of_find_matching_node_and_match( struct device_node *from, const struct of_device_id *matches, const struct of_device_id **match); -static inline struct device_node *of_find_matching_node( - struct device_node *from, - const struct of_device_id *matches) -{ - return of_find_matching_node_and_match(from, matches, NULL); -} -#define for_each_matching_node(dn, matches) \ - for (dn = of_find_matching_node(NULL, matches); dn; \ - dn = of_find_matching_node(dn, matches)) -#define for_each_matching_node_and_match(dn, matches, match) \ - for (dn = of_find_matching_node_and_match(NULL, matches, match); \ - dn; dn = of_find_matching_node_and_match(dn, matches, match)) + extern struct device_node *of_find_node_by_path(const char *path); extern struct device_node *of_find_node_by_phandle(phandle handle); extern struct device_node *of_get_parent(const struct device_node *node); @@ -209,43 +189,11 @@ extern struct device_node *of_get_next_available_child( extern struct device_node *of_get_child_by_name(const struct device_node *node, const char *name); -#define for_each_child_of_node(parent, child) \ - for (child = of_get_next_child(parent, NULL); child != NULL; \ - child = of_get_next_child(parent, child)) - -#define for_each_available_child_of_node(parent, child) \ - for (child = of_get_next_available_child(parent, NULL); child != NULL; \ - child = of_get_next_available_child(parent, child)) - -static inline int of_get_child_count(const struct device_node *np) -{ - struct device_node *child; - int num = 0; - - for_each_child_of_node(np, child) - num++; - - return num; -} - -static inline int of_get_available_child_count(const struct device_node *np) -{ - struct device_node *child; - int num = 0; - - for_each_available_child_of_node(np, child) - num++; - - return num; -} /* cache lookup */ extern struct device_node *of_find_next_cache_node(const struct device_node *); extern struct device_node *of_find_node_with_property( struct device_node *from, const char *prop_name); -#define for_each_node_with_property(dn, prop_name) \ - for (dn = of_find_node_with_property(NULL, prop_name); dn; \ - dn = of_find_node_with_property(dn, prop_name)) extern struct property *of_find_property(const struct device_node *np, const char *name, @@ -367,42 +315,53 @@ static inline struct device_node *of_find_node_by_name(struct device_node *from, return NULL; } -static inline struct device_node *of_get_parent(const struct device_node *node) +static inline struct device_node *of_find_node_by_type(struct device_node *from, + const char *type) { return NULL; } -static inline bool of_have_populated_dt(void) +static inline struct device_node *of_find_matching_node_and_match( + struct device_node *from, + const struct of_device_id *matches, + const struct of_device_id **match) { - return false; + return NULL; } -/* Kill an unused variable warning on a device_node pointer */ -static inline void __of_use_dn(const struct device_node *np) +static inline struct device_node *of_get_parent(const struct device_node *node) { + return NULL; } -#define for_each_child_of_node(parent, child) \ - while (__of_use_dn(parent), __of_use_dn(child), 0) +static inline struct device_node *of_get_next_child( + const struct device_node *node, struct device_node *prev) +{ + return NULL; +} -#define for_each_available_child_of_node(parent, child) \ - while (0) +static inline struct device_node *of_get_next_available_child( + const struct device_node *node, struct device_node *prev) +{ + return NULL; +} -static inline struct device_node *of_get_child_by_name( - const struct device_node *node, - const char *name) +static inline struct device_node *of_find_node_with_property( + struct device_node *from, const char *prop_name) { return NULL; } -static inline int of_get_child_count(const struct device_node *np) +static inline bool of_have_populated_dt(void) { - return 0; + return false; } -static inline int of_get_available_child_count(const struct device_node *np) +static inline struct device_node *of_get_child_by_name( + const struct device_node *node, + const char *name) { - return 0; + return NULL; } static inline int of_device_is_compatible(const struct device_node *device, @@ -569,6 +528,13 @@ extern int of_node_to_nid(struct device_node *np); static inline int of_node_to_nid(struct device_node *device) { return 0; } #endif +static inline struct device_node *of_find_matching_node( + struct device_node *from, + const struct of_device_id *matches) +{ + return of_find_matching_node_and_match(from, matches, NULL); +} + /** * of_property_read_bool - Findfrom a property * @np: device node from which the property value is to be read. @@ -618,6 +584,55 @@ static inline int of_property_read_u32(const struct device_node *np, s; \ s = of_prop_next_string(prop, s)) +#define for_each_node_by_name(dn, name) \ + for (dn = of_find_node_by_name(NULL, name); dn; \ + dn = of_find_node_by_name(dn, name)) +#define for_each_node_by_type(dn, type) \ + for (dn = of_find_node_by_type(NULL, type); dn; \ + dn = of_find_node_by_type(dn, type)) +#define for_each_compatible_node(dn, type, compatible) \ + for (dn = of_find_compatible_node(NULL, type, compatible); dn; \ + dn = of_find_compatible_node(dn, type, compatible)) +#define for_each_matching_node(dn, matches) \ + for (dn = of_find_matching_node(NULL, matches); dn; \ + dn = of_find_matching_node(dn, matches)) +#define for_each_matching_node_and_match(dn, matches, match) \ + for (dn = of_find_matching_node_and_match(NULL, matches, match); \ + dn; dn = of_find_matching_node_and_match(dn, matches, match)) + +#define for_each_child_of_node(parent, child) \ + for (child = of_get_next_child(parent, NULL); child != NULL; \ + child = of_get_next_child(parent, child)) +#define for_each_available_child_of_node(parent, child) \ + for (child = of_get_next_available_child(parent, NULL); child != NULL; \ + child = of_get_next_available_child(parent, child)) + +#define for_each_node_with_property(dn, prop_name) \ + for (dn = of_find_node_with_property(NULL, prop_name); dn; \ + dn = of_find_node_with_property(dn, prop_name)) + +static inline int of_get_child_count(const struct device_node *np) +{ + struct device_node *child; + int num = 0; + + for_each_child_of_node(np, child) + num++; + + return num; +} + +static inline int of_get_available_child_count(const struct device_node *np) +{ + struct device_node *child; + int num = 0; + + for_each_available_child_of_node(np, child) + num++; + + return num; +} + #if defined(CONFIG_PROC_FS) && defined(CONFIG_PROC_DEVICETREE) extern void proc_device_tree_add_node(struct device_node *, struct proc_dir_entry *); extern void proc_device_tree_add_prop(struct proc_dir_entry *pde, struct property *prop); -- cgit v1.2.3-70-g09d2 From 1db73ae39a97797b7d929eca2af469f82fbb337a Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 3 Feb 2014 09:27:40 +0100 Subject: of/device: Nullify match table in of_match_device() for CONFIG_OF=n If the of_device_id table inside a device driver is protected by #ifdef CONFIG_OF, the driver still has to provide a dummy declaration of the table, or wrap it inside of_match_ptr(), when calling of_match_device() in the CONFIG_OF=n case, else the driver fails to compile with e.g. drivers/spi/spi-rspi.c: In function 'rspi_probe': drivers/spi/spi-rspi.c:1203:26: error: 'rspi_of_match' undeclared (first use in this function) drivers/spi/spi-rspi.c:1203:26: note: each undeclared identifier is reported only once for each function it appears in Make of_match_device() nullify the table pointer if CONFIG_OF=n to fix this. Reported-by: Yoshihiro Shimoda Signed-off-by: Geert Uytterhoeven Signed-off-by: Rob Herring --- include/linux/of_device.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/of_device.h b/include/linux/of_device.h index 8d7dd6768cb7..ef370210ffb2 100644 --- a/include/linux/of_device.h +++ b/include/linux/of_device.h @@ -78,11 +78,13 @@ static inline int of_device_uevent_modalias(struct device *dev, static inline void of_device_node_put(struct device *dev) { } -static inline const struct of_device_id *of_match_device( +static inline const struct of_device_id *__of_match_device( const struct of_device_id *matches, const struct device *dev) { return NULL; } +#define of_match_device(matches, dev) \ + __of_match_device(of_match_ptr(matches), (dev)) static inline struct device_node *of_cpu_device_node_get(int cpu) { -- cgit v1.2.3-70-g09d2 From e53376bef2cd97d3e3f61fdc677fb8da7d03d0da Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 3 Feb 2014 20:01:53 +0100 Subject: netfilter: nf_conntrack: don't release a conntrack with non-zero refcnt With this patch, the conntrack refcount is initially set to zero and it is bumped once it is added to any of the list, so we fulfill Eric's golden rule which is that all released objects always have a refcount that equals zero. Andrey Vagin reports that nf_conntrack_free can't be called for a conntrack with non-zero ref-counter, because it can race with nf_conntrack_find_get(). A conntrack slab is created with SLAB_DESTROY_BY_RCU. Non-zero ref-counter says that this conntrack is used. So when we release a conntrack with non-zero counter, we break this assumption. CPU1 CPU2 ____nf_conntrack_find() nf_ct_put() destroy_conntrack() ... init_conntrack __nf_conntrack_alloc (set use = 1) atomic_inc_not_zero(&ct->use) (use = 2) if (!l4proto->new(ct, skb, dataoff, timeouts)) nf_conntrack_free(ct); (use = 2 !!!) ... __nf_conntrack_alloc (set use = 1) if (!nf_ct_key_equal(h, tuple, zone)) nf_ct_put(ct); (use = 0) destroy_conntrack() /* continue to work with CT */ After applying the path "[PATCH] netfilter: nf_conntrack: fix RCU race in nf_conntrack_find_get" another bug was triggered in destroy_conntrack(): <4>[67096.759334] ------------[ cut here ]------------ <2>[67096.759353] kernel BUG at net/netfilter/nf_conntrack_core.c:211! ... <4>[67096.759837] Pid: 498649, comm: atdd veid: 666 Tainted: G C --------------- 2.6.32-042stab084.18 #1 042stab084_18 /DQ45CB <4>[67096.759932] RIP: 0010:[] [] destroy_conntrack+0x15c/0x190 [nf_conntrack] <4>[67096.760255] Call Trace: <4>[67096.760255] [] nf_conntrack_destroy+0x17/0x30 <4>[67096.760255] [] nf_conntrack_find_get+0x85/0x130 [nf_conntrack] <4>[67096.760255] [] nf_conntrack_in+0x352/0xb60 [nf_conntrack] <4>[67096.760255] [] ipv4_conntrack_local+0x51/0x60 [nf_conntrack_ipv4] <4>[67096.760255] [] nf_iterate+0x69/0xb0 <4>[67096.760255] [] ? dst_output+0x0/0x20 <4>[67096.760255] [] nf_hook_slow+0x74/0x110 <4>[67096.760255] [] ? dst_output+0x0/0x20 <4>[67096.760255] [] raw_sendmsg+0x775/0x910 <4>[67096.760255] [] ? flush_tlb_others_ipi+0x128/0x130 <4>[67096.760255] [] ? apic_timer_interrupt+0xe/0x20 <4>[67096.760255] [] ? apic_timer_interrupt+0xe/0x20 <4>[67096.760255] [] inet_sendmsg+0x4a/0xb0 <4>[67096.760255] [] ? sock_sendmsg+0x13/0x140 <4>[67096.760255] [] sock_sendmsg+0x117/0x140 <4>[67096.760255] [] ? native_smp_send_reschedule+0x49/0x60 <4>[67096.760255] [] ? _spin_unlock_bh+0x1b/0x20 <4>[67096.760255] [] ? autoremove_wake_function+0x0/0x40 <4>[67096.760255] [] ? do_ip_setsockopt+0x90/0xd80 <4>[67096.760255] [] ? apic_timer_interrupt+0xe/0x20 <4>[67096.760255] [] ? apic_timer_interrupt+0xe/0x20 <4>[67096.760255] [] sys_sendto+0x139/0x190 <4>[67096.760255] [] ? audit_syscall_entry+0x1d7/0x200 <4>[67096.760255] [] ? __audit_syscall_exit+0x265/0x290 <4>[67096.760255] [] compat_sys_socketcall+0x13f/0x210 <4>[67096.760255] [] ia32_sysret+0x0/0x5 I have reused the original title for the RFC patch that Andrey posted and most of the original patch description. Cc: Eric Dumazet Cc: Andrew Vagin Cc: Florian Westphal Reported-by: Andrew Vagin Signed-off-by: Pablo Neira Ayuso Reviewed-by: Eric Dumazet Acked-by: Andrew Vagin --- include/net/netfilter/nf_conntrack.h | 2 ++ net/netfilter/nf_conntrack_core.c | 34 +++++++++++++++++++++++++++++----- net/netfilter/nf_synproxy_core.c | 5 ++--- net/netfilter/xt_CT.c | 7 +------ 4 files changed, 34 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 01ea6eed1bb1..b2ac6246b7e0 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -284,6 +284,8 @@ extern unsigned int nf_conntrack_max; extern unsigned int nf_conntrack_hash_rnd; void init_nf_conntrack_hash_rnd(void); +void nf_conntrack_tmpl_insert(struct net *net, struct nf_conn *tmpl); + #define NF_CT_STAT_INC(net, count) __this_cpu_inc((net)->ct.stat->count) #define NF_CT_STAT_INC_ATOMIC(net, count) this_cpu_inc((net)->ct.stat->count) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 4d1fb5d094c3..356bef519fe5 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -448,7 +448,9 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) goto out; add_timer(&ct->timeout); - nf_conntrack_get(&ct->ct_general); + smp_wmb(); + /* The caller holds a reference to this object */ + atomic_set(&ct->ct_general.use, 2); __nf_conntrack_hash_insert(ct, hash, repl_hash); NF_CT_STAT_INC(net, insert); spin_unlock_bh(&nf_conntrack_lock); @@ -462,6 +464,21 @@ out: } EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); +/* deletion from this larval template list happens via nf_ct_put() */ +void nf_conntrack_tmpl_insert(struct net *net, struct nf_conn *tmpl) +{ + __set_bit(IPS_TEMPLATE_BIT, &tmpl->status); + __set_bit(IPS_CONFIRMED_BIT, &tmpl->status); + nf_conntrack_get(&tmpl->ct_general); + + spin_lock_bh(&nf_conntrack_lock); + /* Overload tuple linked list to put us in template list. */ + hlist_nulls_add_head_rcu(&tmpl->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, + &net->ct.tmpl); + spin_unlock_bh(&nf_conntrack_lock); +} +EXPORT_SYMBOL_GPL(nf_conntrack_tmpl_insert); + /* Confirm a connection given skb; places it in hash table */ int __nf_conntrack_confirm(struct sk_buff *skb) @@ -733,11 +750,10 @@ __nf_conntrack_alloc(struct net *net, u16 zone, nf_ct_zone->id = zone; } #endif - /* - * changes to lookup keys must be done before setting refcnt to 1 + /* Because we use RCU lookups, we set ct_general.use to zero before + * this is inserted in any list. */ - smp_wmb(); - atomic_set(&ct->ct_general.use, 1); + atomic_set(&ct->ct_general.use, 0); return ct; #ifdef CONFIG_NF_CONNTRACK_ZONES @@ -761,6 +777,11 @@ void nf_conntrack_free(struct nf_conn *ct) { struct net *net = nf_ct_net(ct); + /* A freed object has refcnt == 0, that's + * the golden rule for SLAB_DESTROY_BY_RCU + */ + NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 0); + nf_ct_ext_destroy(ct); nf_ct_ext_free(ct); kmem_cache_free(net->ct.nf_conntrack_cachep, ct); @@ -856,6 +877,9 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, NF_CT_STAT_INC(net, new); } + /* Now it is inserted into the unconfirmed list, bump refcount */ + nf_conntrack_get(&ct->ct_general); + /* Overload tuple linked list to put us in unconfirmed list. */ hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, &net->ct.unconfirmed); diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 9858e3e51a3a..52e20c9a46a5 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -363,9 +363,8 @@ static int __net_init synproxy_net_init(struct net *net) goto err2; if (!nfct_synproxy_ext_add(ct)) goto err2; - __set_bit(IPS_TEMPLATE_BIT, &ct->status); - __set_bit(IPS_CONFIRMED_BIT, &ct->status); + nf_conntrack_tmpl_insert(net, ct); snet->tmpl = ct; snet->stats = alloc_percpu(struct synproxy_stats); @@ -390,7 +389,7 @@ static void __net_exit synproxy_net_exit(struct net *net) { struct synproxy_net *snet = synproxy_pernet(net); - nf_conntrack_free(snet->tmpl); + nf_ct_put(snet->tmpl); synproxy_proc_exit(net); free_percpu(snet->stats); } diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 5929be622c5c..75747aecdebe 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -228,12 +228,7 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, goto err3; } - __set_bit(IPS_TEMPLATE_BIT, &ct->status); - __set_bit(IPS_CONFIRMED_BIT, &ct->status); - - /* Overload tuple linked list to put us in template list. */ - hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, - &par->net->ct.tmpl); + nf_conntrack_tmpl_insert(par->net, ct); out: info->ct = ct; return 0; -- cgit v1.2.3-70-g09d2 From c4ad8f98bef77c7356aa6a9ad9188a6acc6b849d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 5 Feb 2014 12:54:53 -0800 Subject: execve: use 'struct filename *' for executable name passing This changes 'do_execve()' to get the executable name as a 'struct filename', and to free it when it is done. This is what the normal users want, and it simplifies and streamlines their error handling. The controlled lifetime of the executable name also fixes a use-after-free problem with the trace_sched_process_exec tracepoint: the lifetime of the passed-in string for kernel users was not at all obvious, and the user-mode helper code used UMH_WAIT_EXEC to serialize the pathname allocation lifetime with the execve() having finished, which in turn meant that the trace point that happened after mm_release() of the old process VM ended up using already free'd memory. To solve the kernel string lifetime issue, this simply introduces "getname_kernel()" that works like the normal user-space getname() function, except with the source coming from kernel memory. As Oleg points out, this also means that we could drop the tcomm[] array from 'struct linux_binprm', since the pathname lifetime now covers setup_new_exec(). That would be a separate cleanup. Reported-by: Igor Zhbanov Tested-by: Steven Rostedt Cc: Oleg Nesterov Cc: Al Viro Signed-off-by: Linus Torvalds --- arch/parisc/hpux/fs.c | 15 +-------------- fs/exec.c | 45 +++++++++++++++++++++------------------------ fs/namei.c | 30 ++++++++++++++++++++++++++++++ include/linux/binfmts.h | 1 - include/linux/fs.h | 1 + include/linux/sched.h | 3 ++- init/main.c | 2 +- kernel/auditsc.c | 2 +- kernel/kmod.c | 2 +- 9 files changed, 58 insertions(+), 43 deletions(-) (limited to 'include') diff --git a/arch/parisc/hpux/fs.c b/arch/parisc/hpux/fs.c index 88d0962de65a..2bedafea3d94 100644 --- a/arch/parisc/hpux/fs.c +++ b/arch/parisc/hpux/fs.c @@ -33,22 +33,9 @@ int hpux_execve(struct pt_regs *regs) { - int error; - struct filename *filename; - - filename = getname((const char __user *) regs->gr[26]); - error = PTR_ERR(filename); - if (IS_ERR(filename)) - goto out; - - error = do_execve(filename->name, + return do_execve(getname((const char __user *) regs->gr[26]), (const char __user *const __user *) regs->gr[25], (const char __user *const __user *) regs->gr[24]); - - putname(filename); - -out: - return error; } struct hpux_dirent { diff --git a/fs/exec.c b/fs/exec.c index e1529b4c79b1..3d78fccdd723 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -748,11 +748,10 @@ EXPORT_SYMBOL(setup_arg_pages); #endif /* CONFIG_MMU */ -struct file *open_exec(const char *name) +static struct file *do_open_exec(struct filename *name) { struct file *file; int err; - struct filename tmp = { .name = name }; static const struct open_flags open_exec_flags = { .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, .acc_mode = MAY_EXEC | MAY_OPEN, @@ -760,7 +759,7 @@ struct file *open_exec(const char *name) .lookup_flags = LOOKUP_FOLLOW, }; - file = do_filp_open(AT_FDCWD, &tmp, &open_exec_flags); + file = do_filp_open(AT_FDCWD, name, &open_exec_flags); if (IS_ERR(file)) goto out; @@ -784,6 +783,12 @@ exit: fput(file); return ERR_PTR(err); } + +struct file *open_exec(const char *name) +{ + struct filename tmp = { .name = name }; + return do_open_exec(&tmp); +} EXPORT_SYMBOL(open_exec); int kernel_read(struct file *file, loff_t offset, @@ -1162,7 +1167,7 @@ int prepare_bprm_creds(struct linux_binprm *bprm) return -ENOMEM; } -void free_bprm(struct linux_binprm *bprm) +static void free_bprm(struct linux_binprm *bprm) { free_arg_pages(bprm); if (bprm->cred) { @@ -1432,7 +1437,7 @@ static int exec_binprm(struct linux_binprm *bprm) /* * sys_execve() executes a new program. */ -static int do_execve_common(const char *filename, +static int do_execve_common(struct filename *filename, struct user_arg_ptr argv, struct user_arg_ptr envp) { @@ -1441,6 +1446,9 @@ static int do_execve_common(const char *filename, struct files_struct *displaced; int retval; + if (IS_ERR(filename)) + return PTR_ERR(filename); + /* * We move the actual failure in case of RLIMIT_NPROC excess from * set*uid() to execve() because too many poorly written programs @@ -1473,7 +1481,7 @@ static int do_execve_common(const char *filename, check_unsafe_exec(bprm); current->in_execve = 1; - file = open_exec(filename); + file = do_open_exec(filename); retval = PTR_ERR(file); if (IS_ERR(file)) goto out_unmark; @@ -1481,8 +1489,7 @@ static int do_execve_common(const char *filename, sched_exec(); bprm->file = file; - bprm->filename = filename; - bprm->interp = filename; + bprm->filename = bprm->interp = filename->name; retval = bprm_mm_init(bprm); if (retval) @@ -1523,6 +1530,7 @@ static int do_execve_common(const char *filename, acct_update_integrals(current); task_numa_free(current); free_bprm(bprm); + putname(filename); if (displaced) put_files_struct(displaced); return retval; @@ -1544,10 +1552,11 @@ out_files: if (displaced) reset_files_struct(displaced); out_ret: + putname(filename); return retval; } -int do_execve(const char *filename, +int do_execve(struct filename *filename, const char __user *const __user *__argv, const char __user *const __user *__envp) { @@ -1557,7 +1566,7 @@ int do_execve(const char *filename, } #ifdef CONFIG_COMPAT -static int compat_do_execve(const char *filename, +static int compat_do_execve(struct filename *filename, const compat_uptr_t __user *__argv, const compat_uptr_t __user *__envp) { @@ -1607,25 +1616,13 @@ SYSCALL_DEFINE3(execve, const char __user *const __user *, argv, const char __user *const __user *, envp) { - struct filename *path = getname(filename); - int error = PTR_ERR(path); - if (!IS_ERR(path)) { - error = do_execve(path->name, argv, envp); - putname(path); - } - return error; + return do_execve(getname(filename), argv, envp); } #ifdef CONFIG_COMPAT asmlinkage long compat_sys_execve(const char __user * filename, const compat_uptr_t __user * argv, const compat_uptr_t __user * envp) { - struct filename *path = getname(filename); - int error = PTR_ERR(path); - if (!IS_ERR(path)) { - error = compat_do_execve(path->name, argv, envp); - putname(path); - } - return error; + return compat_do_execve(getname(filename), argv, envp); } #endif diff --git a/fs/namei.c b/fs/namei.c index d580df2e6804..385f7817bfcc 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -196,6 +196,7 @@ recopy: goto error; result->uptr = filename; + result->aname = NULL; audit_getname(result); return result; @@ -210,6 +211,35 @@ getname(const char __user * filename) return getname_flags(filename, 0, NULL); } +/* + * The "getname_kernel()" interface doesn't do pathnames longer + * than EMBEDDED_NAME_MAX. Deal with it - you're a kernel user. + */ +struct filename * +getname_kernel(const char * filename) +{ + struct filename *result; + char *kname; + int len; + + len = strlen(filename); + if (len >= EMBEDDED_NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + + result = __getname(); + if (unlikely(!result)) + return ERR_PTR(-ENOMEM); + + kname = (char *)result + sizeof(*result); + result->name = kname; + result->uptr = NULL; + result->aname = NULL; + result->separate = false; + + strlcpy(kname, filename, EMBEDDED_NAME_MAX); + return result; +} + #ifdef CONFIG_AUDITSYSCALL void putname(struct filename *name) { diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index fd8bf3219ef7..b4a745d7d9a9 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -115,7 +115,6 @@ extern int copy_strings_kernel(int argc, const char *const *argv, extern int prepare_bprm_creds(struct linux_binprm *bprm); extern void install_exec_creds(struct linux_binprm *bprm); extern void set_binfmt(struct linux_binfmt *new); -extern void free_bprm(struct linux_binprm *); extern ssize_t read_code(struct file *, unsigned long, loff_t, size_t); #endif /* _LINUX_BINFMTS_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 09f553c59813..d79678c188ad 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2079,6 +2079,7 @@ extern struct file * dentry_open(const struct path *, int, const struct cred *); extern int filp_close(struct file *, fl_owner_t id); extern struct filename *getname(const char __user *); +extern struct filename *getname_kernel(const char *); enum { FILE_CREATED = 1, diff --git a/include/linux/sched.h b/include/linux/sched.h index 68a0e84463a0..a781dec1cd0b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -128,6 +128,7 @@ struct bio_list; struct fs_struct; struct perf_event_context; struct blk_plug; +struct filename; /* * List of flags we want to share for kernel threads, @@ -2311,7 +2312,7 @@ extern void do_group_exit(int); extern int allow_signal(int); extern int disallow_signal(int); -extern int do_execve(const char *, +extern int do_execve(struct filename *, const char __user * const __user *, const char __user * const __user *); extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *); diff --git a/init/main.c b/init/main.c index 2fd9cef70ee8..eb03090cdced 100644 --- a/init/main.c +++ b/init/main.c @@ -812,7 +812,7 @@ void __init load_default_modules(void) static int run_init_process(const char *init_filename) { argv_init[0] = init_filename; - return do_execve(init_filename, + return do_execve(getname_kernel(init_filename), (const char __user *const __user *)argv_init, (const char __user *const __user *)envp_init); } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 10176cd5956a..7aef2f4b6c64 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1719,7 +1719,7 @@ void audit_putname(struct filename *name) struct audit_context *context = current->audit_context; BUG_ON(!context); - if (!context->in_syscall) { + if (!name->aname || !context->in_syscall) { #if AUDIT_DEBUG == 2 printk(KERN_ERR "%s:%d(:%d): final_putname(%p)\n", __FILE__, __LINE__, context->serial, name); diff --git a/kernel/kmod.c b/kernel/kmod.c index b086006c59e7..6b375af4958d 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -239,7 +239,7 @@ static int ____call_usermodehelper(void *data) commit_creds(new); - retval = do_execve(sub_info->path, + retval = do_execve(getname_kernel(sub_info->path), (const char __user *const __user *)sub_info->argv, (const char __user *const __user *)sub_info->envp); if (!retval) -- cgit v1.2.3-70-g09d2 From 64d46806b6218c97f68742c5663a8ae3a5fbe838 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 5 Feb 2014 15:03:37 +0000 Subject: netfilter: nf_tables: add AF specific expression support For the reject module, we need to add AF-specific implementations to get rid of incorrect module dependencies. Try to load an AF-specific module first and fall back to generic modules. Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 5 +++++ net/netfilter/nf_tables_api.c | 22 ++++++++++++++++------ 2 files changed, 21 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 57c8ff7955df..0f68e47d3e5e 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -252,6 +252,7 @@ void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, * @owner: module reference * @policy: netlink attribute policy * @maxattr: highest netlink attribute number + * @family: address family for AF-specific types */ struct nft_expr_type { const struct nft_expr_ops *(*select_ops)(const struct nft_ctx *, @@ -262,6 +263,7 @@ struct nft_expr_type { struct module *owner; const struct nla_policy *policy; unsigned int maxattr; + u8 family; }; /** @@ -529,6 +531,9 @@ void nft_unregister_expr(struct nft_expr_type *); #define MODULE_ALIAS_NFT_CHAIN(family, name) \ MODULE_ALIAS("nft-chain-" __stringify(family) "-" name) +#define MODULE_ALIAS_NFT_AF_EXPR(family, name) \ + MODULE_ALIAS("nft-expr-" __stringify(family) "-" name) + #define MODULE_ALIAS_NFT_EXPR(name) \ MODULE_ALIAS("nft-expr-" name) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 3c5a219f4242..113c469c7579 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1114,35 +1114,45 @@ void nft_unregister_expr(struct nft_expr_type *type) } EXPORT_SYMBOL_GPL(nft_unregister_expr); -static const struct nft_expr_type *__nft_expr_type_get(struct nlattr *nla) +static const struct nft_expr_type *__nft_expr_type_get(u8 family, + struct nlattr *nla) { const struct nft_expr_type *type; list_for_each_entry(type, &nf_tables_expressions, list) { - if (!nla_strcmp(nla, type->name)) + if (!nla_strcmp(nla, type->name) && + (!type->family || type->family == family)) return type; } return NULL; } -static const struct nft_expr_type *nft_expr_type_get(struct nlattr *nla) +static const struct nft_expr_type *nft_expr_type_get(u8 family, + struct nlattr *nla) { const struct nft_expr_type *type; if (nla == NULL) return ERR_PTR(-EINVAL); - type = __nft_expr_type_get(nla); + type = __nft_expr_type_get(family, nla); if (type != NULL && try_module_get(type->owner)) return type; #ifdef CONFIG_MODULES if (type == NULL) { + nfnl_unlock(NFNL_SUBSYS_NFTABLES); + request_module("nft-expr-%u-%.*s", family, + nla_len(nla), (char *)nla_data(nla)); + nfnl_lock(NFNL_SUBSYS_NFTABLES); + if (__nft_expr_type_get(family, nla)) + return ERR_PTR(-EAGAIN); + nfnl_unlock(NFNL_SUBSYS_NFTABLES); request_module("nft-expr-%.*s", nla_len(nla), (char *)nla_data(nla)); nfnl_lock(NFNL_SUBSYS_NFTABLES); - if (__nft_expr_type_get(nla)) + if (__nft_expr_type_get(family, nla)) return ERR_PTR(-EAGAIN); } #endif @@ -1193,7 +1203,7 @@ static int nf_tables_expr_parse(const struct nft_ctx *ctx, if (err < 0) return err; - type = nft_expr_type_get(tb[NFTA_EXPR_NAME]); + type = nft_expr_type_get(ctx->afi->family, tb[NFTA_EXPR_NAME]); if (IS_ERR(type)) return PTR_ERR(type); -- cgit v1.2.3-70-g09d2 From cc4723ca316742891954efa346298e7c747c0d17 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 5 Feb 2014 15:03:38 +0000 Subject: netfilter: nft_reject: split up reject module into IPv4 and IPv6 specifc parts Currently the nft_reject module depends on symbols from ipv6. This is wrong since no generic module should force IPv6 support to be loaded. Split up the module into AF-specific and a generic part. Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nft_reject.h | 17 +++++++ net/ipv4/netfilter/Kconfig | 5 ++ net/ipv4/netfilter/Makefile | 1 + net/ipv4/netfilter/nft_reject_ipv4.c | 74 ++++++++++++++++++++++++++++++ net/ipv6/netfilter/Kconfig | 5 ++ net/ipv6/netfilter/Makefile | 1 + net/ipv6/netfilter/nft_reject_ipv6.c | 75 ++++++++++++++++++++++++++++++ net/netfilter/Kconfig | 1 - net/netfilter/nft_reject.c | 89 ++++-------------------------------- 9 files changed, 187 insertions(+), 81 deletions(-) create mode 100644 include/net/netfilter/nft_reject.h create mode 100644 net/ipv4/netfilter/nft_reject_ipv4.c create mode 100644 net/ipv6/netfilter/nft_reject_ipv6.c (limited to 'include') diff --git a/include/net/netfilter/nft_reject.h b/include/net/netfilter/nft_reject.h new file mode 100644 index 000000000000..ecda75945e77 --- /dev/null +++ b/include/net/netfilter/nft_reject.h @@ -0,0 +1,17 @@ +#ifndef _NFT_REJECT_H_ +#define _NFT_REJECT_H_ + +struct nft_reject { + enum nft_reject_types type:8; + u8 icmp_code; +}; + +extern const struct nla_policy nft_reject_policy[]; + +int nft_reject_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]); + +int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr); + +#endif diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 81c6910cfa92..a26ce035e3fa 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -61,6 +61,11 @@ config NFT_CHAIN_NAT_IPV4 packet transformations such as the source, destination address and source and destination ports. +config NFT_REJECT_IPV4 + depends on NF_TABLES_IPV4 + default NFT_REJECT + tristate + config NF_TABLES_ARP depends on NF_TABLES tristate "ARP nf_tables support" diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index c16be9d58420..90b82405331e 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -30,6 +30,7 @@ obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o obj-$(CONFIG_NF_TABLES_IPV4) += nf_tables_ipv4.o obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o +obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o # generic IP tables diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c new file mode 100644 index 000000000000..e935d8de1182 --- /dev/null +++ b/net/ipv4/netfilter/nft_reject_ipv4.c @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy + * Copyright (c) 2013 Eric Leblond + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void nft_reject_ipv4_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + struct nft_reject *priv = nft_expr_priv(expr); + + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + nf_send_unreach(pkt->skb, priv->icmp_code); + break; + case NFT_REJECT_TCP_RST: + nf_send_reset(pkt->skb, pkt->ops->hooknum); + break; + } + + data[NFT_REG_VERDICT].verdict = NF_DROP; +} + +static struct nft_expr_type nft_reject_ipv4_type; +static const struct nft_expr_ops nft_reject_ipv4_ops = { + .type = &nft_reject_ipv4_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)), + .eval = nft_reject_ipv4_eval, + .init = nft_reject_init, + .dump = nft_reject_dump, +}; + +static struct nft_expr_type nft_reject_ipv4_type __read_mostly = { + .family = NFPROTO_IPV4, + .name = "reject", + .ops = &nft_reject_ipv4_ops, + .policy = nft_reject_policy, + .maxattr = NFTA_REJECT_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_reject_ipv4_module_init(void) +{ + return nft_register_expr(&nft_reject_ipv4_type); +} + +static void __exit nft_reject_ipv4_module_exit(void) +{ + nft_unregister_expr(&nft_reject_ipv4_type); +} + +module_init(nft_reject_ipv4_module_init); +module_exit(nft_reject_ipv4_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy "); +MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "reject"); diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 35750df744dc..4bff1f297e39 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -50,6 +50,11 @@ config NFT_CHAIN_NAT_IPV6 packet transformations such as the source, destination address and source and destination ports. +config NFT_REJECT_IPV6 + depends on NF_TABLES_IPV6 + default NFT_REJECT + tristate + config IP6_NF_IPTABLES tristate "IP6 tables support (required for filtering)" depends on INET && IPV6 diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index d1b4928f34f7..70d3dd66f2cd 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -27,6 +27,7 @@ obj-$(CONFIG_NF_DEFRAG_IPV6) += nf_defrag_ipv6.o obj-$(CONFIG_NF_TABLES_IPV6) += nf_tables_ipv6.o obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV6) += nft_chain_route_ipv6.o obj-$(CONFIG_NFT_CHAIN_NAT_IPV6) += nft_chain_nat_ipv6.o +obj-$(CONFIG_NFT_REJECT_IPV6) += nft_reject_ipv6.o # matches obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o diff --git a/net/ipv6/netfilter/nft_reject_ipv6.c b/net/ipv6/netfilter/nft_reject_ipv6.c new file mode 100644 index 000000000000..f73285924144 --- /dev/null +++ b/net/ipv6/netfilter/nft_reject_ipv6.c @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy + * Copyright (c) 2013 Eric Leblond + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void nft_reject_ipv6_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + struct nft_reject *priv = nft_expr_priv(expr); + struct net *net = dev_net((pkt->in != NULL) ? pkt->in : pkt->out); + + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + nf_send_unreach6(net, pkt->skb, priv->icmp_code, + pkt->ops->hooknum); + break; + case NFT_REJECT_TCP_RST: + nf_send_reset6(net, pkt->skb, pkt->ops->hooknum); + break; + } + + data[NFT_REG_VERDICT].verdict = NF_DROP; +} + +static struct nft_expr_type nft_reject_ipv6_type; +static const struct nft_expr_ops nft_reject_ipv6_ops = { + .type = &nft_reject_ipv6_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)), + .eval = nft_reject_ipv6_eval, + .init = nft_reject_init, + .dump = nft_reject_dump, +}; + +static struct nft_expr_type nft_reject_ipv6_type __read_mostly = { + .family = NFPROTO_IPV6, + .name = "reject", + .ops = &nft_reject_ipv6_ops, + .policy = nft_reject_policy, + .maxattr = NFTA_REJECT_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_reject_ipv6_module_init(void) +{ + return nft_register_expr(&nft_reject_ipv6_type); +} + +static void __exit nft_reject_ipv6_module_exit(void) +{ + nft_unregister_expr(&nft_reject_ipv6_type); +} + +module_init(nft_reject_ipv6_module_init); +module_exit(nft_reject_ipv6_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy "); +MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "reject"); diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index c37467562fd0..ed8b50e62276 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -513,7 +513,6 @@ config NFT_QUEUE config NFT_REJECT depends on NF_TABLES - depends on NF_TABLES_IPV6 || !NF_TABLES_IPV6 default m if NETFILTER_ADVANCED=n tristate "Netfilter nf_tables reject support" help diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c index 5e204711d704..f3448c296446 100644 --- a/net/netfilter/nft_reject.c +++ b/net/netfilter/nft_reject.c @@ -16,65 +16,23 @@ #include #include #include -#include -#include +#include -#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) -#include -#endif - -struct nft_reject { - enum nft_reject_types type:8; - u8 icmp_code; - u8 family; -}; - -static void nft_reject_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], - const struct nft_pktinfo *pkt) -{ - struct nft_reject *priv = nft_expr_priv(expr); -#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) - struct net *net = dev_net((pkt->in != NULL) ? pkt->in : pkt->out); -#endif - switch (priv->type) { - case NFT_REJECT_ICMP_UNREACH: - if (priv->family == NFPROTO_IPV4) - nf_send_unreach(pkt->skb, priv->icmp_code); -#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) - else if (priv->family == NFPROTO_IPV6) - nf_send_unreach6(net, pkt->skb, priv->icmp_code, - pkt->ops->hooknum); -#endif - break; - case NFT_REJECT_TCP_RST: - if (priv->family == NFPROTO_IPV4) - nf_send_reset(pkt->skb, pkt->ops->hooknum); -#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) - else if (priv->family == NFPROTO_IPV6) - nf_send_reset6(net, pkt->skb, pkt->ops->hooknum); -#endif - break; - } - - data[NFT_REG_VERDICT].verdict = NF_DROP; -} - -static const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = { +const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = { [NFTA_REJECT_TYPE] = { .type = NLA_U32 }, [NFTA_REJECT_ICMP_CODE] = { .type = NLA_U8 }, }; +EXPORT_SYMBOL_GPL(nft_reject_policy); -static int nft_reject_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) +int nft_reject_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) { struct nft_reject *priv = nft_expr_priv(expr); if (tb[NFTA_REJECT_TYPE] == NULL) return -EINVAL; - priv->family = ctx->afi->family; priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE])); switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: @@ -89,8 +47,9 @@ static int nft_reject_init(const struct nft_ctx *ctx, return 0; } +EXPORT_SYMBOL_GPL(nft_reject_init); -static int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr) +int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_reject *priv = nft_expr_priv(expr); @@ -109,37 +68,7 @@ static int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr) nla_put_failure: return -1; } - -static struct nft_expr_type nft_reject_type; -static const struct nft_expr_ops nft_reject_ops = { - .type = &nft_reject_type, - .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)), - .eval = nft_reject_eval, - .init = nft_reject_init, - .dump = nft_reject_dump, -}; - -static struct nft_expr_type nft_reject_type __read_mostly = { - .name = "reject", - .ops = &nft_reject_ops, - .policy = nft_reject_policy, - .maxattr = NFTA_REJECT_MAX, - .owner = THIS_MODULE, -}; - -static int __init nft_reject_module_init(void) -{ - return nft_register_expr(&nft_reject_type); -} - -static void __exit nft_reject_module_exit(void) -{ - nft_unregister_expr(&nft_reject_type); -} - -module_init(nft_reject_module_init); -module_exit(nft_reject_module_exit); +EXPORT_SYMBOL_GPL(nft_reject_dump); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy "); -MODULE_ALIAS_NFT_EXPR("reject"); -- cgit v1.2.3-70-g09d2 From 05513e9e33dbded8124567466a444d32173eecc6 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 5 Feb 2014 15:03:39 +0000 Subject: netfilter: nf_tables: add reject module for NFPROTO_INET Add a reject module for NFPROTO_INET. It does nothing but dispatch to the AF-specific modules based on the hook family. Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nft_reject.h | 8 +++++ net/ipv4/netfilter/nft_reject_ipv4.c | 7 ++-- net/ipv6/netfilter/nft_reject_ipv6.c | 7 ++-- net/netfilter/Kconfig | 5 +++ net/netfilter/Makefile | 1 + net/netfilter/nft_reject_inet.c | 63 ++++++++++++++++++++++++++++++++++++ 6 files changed, 85 insertions(+), 6 deletions(-) create mode 100644 net/netfilter/nft_reject_inet.c (limited to 'include') diff --git a/include/net/netfilter/nft_reject.h b/include/net/netfilter/nft_reject.h index ecda75945e77..36b0da2d55bb 100644 --- a/include/net/netfilter/nft_reject.h +++ b/include/net/netfilter/nft_reject.h @@ -14,4 +14,12 @@ int nft_reject_init(const struct nft_ctx *ctx, int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr); +void nft_reject_ipv4_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt); + +void nft_reject_ipv6_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt); + #endif diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c index e935d8de1182..e79718a382f2 100644 --- a/net/ipv4/netfilter/nft_reject_ipv4.c +++ b/net/ipv4/netfilter/nft_reject_ipv4.c @@ -20,9 +20,9 @@ #include #include -static void nft_reject_ipv4_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], - const struct nft_pktinfo *pkt) +void nft_reject_ipv4_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) { struct nft_reject *priv = nft_expr_priv(expr); @@ -37,6 +37,7 @@ static void nft_reject_ipv4_eval(const struct nft_expr *expr, data[NFT_REG_VERDICT].verdict = NF_DROP; } +EXPORT_SYMBOL_GPL(nft_reject_ipv4_eval); static struct nft_expr_type nft_reject_ipv4_type; static const struct nft_expr_ops nft_reject_ipv4_ops = { diff --git a/net/ipv6/netfilter/nft_reject_ipv6.c b/net/ipv6/netfilter/nft_reject_ipv6.c index f73285924144..0bc19fa87821 100644 --- a/net/ipv6/netfilter/nft_reject_ipv6.c +++ b/net/ipv6/netfilter/nft_reject_ipv6.c @@ -19,9 +19,9 @@ #include #include -static void nft_reject_ipv6_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], - const struct nft_pktinfo *pkt) +void nft_reject_ipv6_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) { struct nft_reject *priv = nft_expr_priv(expr); struct net *net = dev_net((pkt->in != NULL) ? pkt->in : pkt->out); @@ -38,6 +38,7 @@ static void nft_reject_ipv6_eval(const struct nft_expr *expr, data[NFT_REG_VERDICT].verdict = NF_DROP; } +EXPORT_SYMBOL_GPL(nft_reject_ipv6_eval); static struct nft_expr_type nft_reject_ipv6_type; static const struct nft_expr_ops nft_reject_ipv6_ops = { diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index ed8b50e62276..e9410d17619d 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -520,6 +520,11 @@ config NFT_REJECT explicitly deny and notify via TCP reset/ICMP informational errors unallowed traffic. +config NFT_REJECT_INET + depends on NF_TABLES_INET + default NFT_REJECT + tristate + config NFT_COMPAT depends on NF_TABLES depends on NETFILTER_XTABLES diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index ee9c4de5f8ed..bffdad774da7 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -79,6 +79,7 @@ obj-$(CONFIG_NFT_LIMIT) += nft_limit.o obj-$(CONFIG_NFT_NAT) += nft_nat.o obj-$(CONFIG_NFT_QUEUE) += nft_queue.o obj-$(CONFIG_NFT_REJECT) += nft_reject.o +obj-$(CONFIG_NFT_REJECT_INET) += nft_reject_inet.o obj-$(CONFIG_NFT_RBTREE) += nft_rbtree.o obj-$(CONFIG_NFT_HASH) += nft_hash.o obj-$(CONFIG_NFT_COUNTER) += nft_counter.o diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c new file mode 100644 index 000000000000..8a310f239c93 --- /dev/null +++ b/net/netfilter/nft_reject_inet.c @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2014 Patrick McHardy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static void nft_reject_inet_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + switch (pkt->ops->pf) { + case NFPROTO_IPV4: + nft_reject_ipv4_eval(expr, data, pkt); + case NFPROTO_IPV6: + nft_reject_ipv6_eval(expr, data, pkt); + } +} + +static struct nft_expr_type nft_reject_inet_type; +static const struct nft_expr_ops nft_reject_inet_ops = { + .type = &nft_reject_inet_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)), + .eval = nft_reject_inet_eval, + .init = nft_reject_init, + .dump = nft_reject_dump, +}; + +static struct nft_expr_type nft_reject_inet_type __read_mostly = { + .family = NFPROTO_INET, + .name = "reject", + .ops = &nft_reject_inet_ops, + .policy = nft_reject_policy, + .maxattr = NFTA_REJECT_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_reject_inet_module_init(void) +{ + return nft_register_expr(&nft_reject_inet_type); +} + +static void __exit nft_reject_inet_module_exit(void) +{ + nft_unregister_expr(&nft_reject_inet_type); +} + +module_init(nft_reject_inet_module_init); +module_exit(nft_reject_inet_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy "); +MODULE_ALIAS_NFT_AF_EXPR(1, "reject"); -- cgit v1.2.3-70-g09d2 From a3485d088516be4c2ae2890e8ad64321481f2857 Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Tue, 4 Feb 2014 13:42:10 +0100 Subject: gpio: consumer.h: Move forward declarations outside #ifdef Make sure that the forward declared structs in gpio/consumer.h are also visible on the else branch of the CONFIG_GPIOLIB #ifdef. Fixes the following warnings and their associated errors when CONFIG_GPIOLIB is not selected: include/linux/gpio/consumer.h:67:14: warning: 'struct device' declared inside parameter list include/linux/gpio/consumer.h:67:14: warning: its scope is only this definition or declaration, which is probably not what you want [...] Signed-off-by: Lars-Peter Clausen Reviewed-by: Alexandre Courbot Signed-off-by: Linus Walleij --- include/linux/gpio/consumer.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index 4d34dbbbad4d..7a8144fef406 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -4,8 +4,6 @@ #include #include -#ifdef CONFIG_GPIOLIB - struct device; struct gpio_chip; @@ -18,6 +16,8 @@ struct gpio_chip; */ struct gpio_desc; +#ifdef CONFIG_GPIOLIB + /* Acquire and dispose GPIOs */ struct gpio_desc *__must_check gpiod_get(struct device *dev, const char *con_id); -- cgit v1.2.3-70-g09d2 From 0165d9325d6a3cf856e2cbbe64a0f4635ac75893 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 25 Jan 2014 14:03:51 +0100 Subject: netfilter: nf_tables: fix racy rule deletion We may lost race if we flush the rule-set (which happens asynchronously via call_rcu) and we try to remove the table (that userspace assumes to be empty). Fix this by recovering synchronous rule and chain deletion. This was introduced time ago before we had no batch support, and synchronous rule deletion performance was not good. Now that we have the batch support, we can just postpone the purge of old rule in a second step in the commit phase. All object deletions are synchronous after this patch. As a side effect, we save memory as we don't need rcu_head per rule anymore. Cc: Patrick McHardy Reported-by: Arturo Borrero Gonzalez Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 4 ---- net/netfilter/nf_tables_api.c | 40 ++++++++++++++++++++++----------------- 2 files changed, 23 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 0f68e47d3e5e..e7e14ffe0f6a 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -322,7 +322,6 @@ static inline void *nft_expr_priv(const struct nft_expr *expr) * struct nft_rule - nf_tables rule * * @list: used internally - * @rcu_head: used internally for rcu * @handle: rule handle * @genmask: generation mask * @dlen: length of expression data @@ -330,7 +329,6 @@ static inline void *nft_expr_priv(const struct nft_expr *expr) */ struct nft_rule { struct list_head list; - struct rcu_head rcu_head; u64 handle:46, genmask:2, dlen:16; @@ -391,7 +389,6 @@ enum nft_chain_flags { * * @rules: list of rules in the chain * @list: used internally - * @rcu_head: used internally * @net: net namespace that this chain belongs to * @table: table that this chain belongs to * @handle: chain handle @@ -403,7 +400,6 @@ enum nft_chain_flags { struct nft_chain { struct list_head rules; struct list_head list; - struct rcu_head rcu_head; struct net *net; struct nft_table *table; u64 handle; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 113c469c7579..3a2e4800b415 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1008,10 +1008,8 @@ notify: return 0; } -static void nf_tables_rcu_chain_destroy(struct rcu_head *head) +static void nf_tables_chain_destroy(struct nft_chain *chain) { - struct nft_chain *chain = container_of(head, struct nft_chain, rcu_head); - BUG_ON(chain->use > 0); if (chain->flags & NFT_BASE_CHAIN) { @@ -1059,7 +1057,9 @@ static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb, family); /* Make sure all rule references are gone before this is released */ - call_rcu(&chain->rcu_head, nf_tables_rcu_chain_destroy); + synchronize_rcu(); + + nf_tables_chain_destroy(chain); return 0; } @@ -1531,9 +1531,8 @@ err: return err; } -static void nf_tables_rcu_rule_destroy(struct rcu_head *head) +static void nf_tables_rule_destroy(struct nft_rule *rule) { - struct nft_rule *rule = container_of(head, struct nft_rule, rcu_head); struct nft_expr *expr; /* @@ -1548,11 +1547,6 @@ static void nf_tables_rcu_rule_destroy(struct rcu_head *head) kfree(rule); } -static void nf_tables_rule_destroy(struct nft_rule *rule) -{ - call_rcu(&rule->rcu_head, nf_tables_rcu_rule_destroy); -} - #define NFT_RULE_MAXEXPRS 128 static struct nft_expr_info *info; @@ -1819,9 +1813,6 @@ static int nf_tables_commit(struct sk_buff *skb) synchronize_rcu(); list_for_each_entry_safe(rupd, tmp, &net->nft.commit_list, list) { - /* Delete this rule from the dirty list */ - list_del(&rupd->list); - /* This rule was inactive in the past and just became active. * Clear the next bit of the genmask since its meaning has * changed, now it is the future. @@ -1832,6 +1823,7 @@ static int nf_tables_commit(struct sk_buff *skb) rupd->chain, rupd->rule, NFT_MSG_NEWRULE, 0, rupd->family); + list_del(&rupd->list); kfree(rupd); continue; } @@ -1841,7 +1833,15 @@ static int nf_tables_commit(struct sk_buff *skb) nf_tables_rule_notify(skb, rupd->nlh, rupd->table, rupd->chain, rupd->rule, NFT_MSG_DELRULE, 0, rupd->family); + } + + /* Make sure we don't see any packet traversing old rules */ + synchronize_rcu(); + + /* Now we can safely release unused old rules */ + list_for_each_entry_safe(rupd, tmp, &net->nft.commit_list, list) { nf_tables_rule_destroy(rupd->rule); + list_del(&rupd->list); kfree(rupd); } @@ -1854,20 +1854,26 @@ static int nf_tables_abort(struct sk_buff *skb) struct nft_rule_trans *rupd, *tmp; list_for_each_entry_safe(rupd, tmp, &net->nft.commit_list, list) { - /* Delete all rules from the dirty list */ - list_del(&rupd->list); - if (!nft_rule_is_active_next(net, rupd->rule)) { nft_rule_clear(net, rupd->rule); + list_del(&rupd->list); kfree(rupd); continue; } /* This rule is inactive, get rid of it */ list_del_rcu(&rupd->rule->list); + } + + /* Make sure we don't see any packet accessing aborted rules */ + synchronize_rcu(); + + list_for_each_entry_safe(rupd, tmp, &net->nft.commit_list, list) { nf_tables_rule_destroy(rupd->rule); + list_del(&rupd->list); kfree(rupd); } + return 0; } -- cgit v1.2.3-70-g09d2 From 579f82901f6f41256642936d7e632f3979ad76d4 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 6 Feb 2014 12:04:21 -0800 Subject: swap: add a simple detector for inappropriate swapin readahead This is a patch to improve swap readahead algorithm. It's from Hugh and I slightly changed it. Hugh's original changelog: swapin readahead does a blind readahead, whether or not the swapin is sequential. This may be ok on harddisk, because large reads have relatively small costs, and if the readahead pages are unneeded they can be reclaimed easily - though, what if their allocation forced reclaim of useful pages? But on SSD devices large reads are more expensive than small ones: if the readahead pages are unneeded, reading them in caused significant overhead. This patch adds very simplistic random read detection. Stealing the PageReadahead technique from Konstantin Khlebnikov's patch, avoiding the vma/anon_vma sophistications of Shaohua Li's patch, swapin_nr_pages() simply looks at readahead's current success rate, and narrows or widens its readahead window accordingly. There is little science to its heuristic: it's about as stupid as can be whilst remaining effective. The table below shows elapsed times (in centiseconds) when running a single repetitive swapping load across a 1000MB mapping in 900MB ram with 1GB swap (the harddisk tests had taken painfully too long when I used mem=500M, but SSD shows similar results for that). Vanilla is the 3.6-rc7 kernel on which I started; Shaohua denotes his Sep 3 patch in mmotm and linux-next; HughOld denotes my Oct 1 patch which Shaohua showed to be defective; HughNew this Nov 14 patch, with page_cluster as usual at default of 3 (8-page reads); HughPC4 this same patch with page_cluster 4 (16-page reads); HughPC0 with page_cluster 0 (1-page reads: no readahead). HDD for swapping to harddisk, SSD for swapping to VertexII SSD. Seq for sequential access to the mapping, cycling five times around; Rand for the same number of random touches. Anon for a MAP_PRIVATE anon mapping; Shmem for a MAP_SHARED anon mapping, equivalent to tmpfs. One weakness of Shaohua's vma/anon_vma approach was that it did not optimize Shmem: seen below. Konstantin's approach was perhaps mistuned, 50% slower on Seq: did not compete and is not shown below. HDD Vanilla Shaohua HughOld HughNew HughPC4 HughPC0 Seq Anon 73921 76210 75611 76904 78191 121542 Seq Shmem 73601 73176 73855 72947 74543 118322 Rand Anon 895392 831243 871569 845197 846496 841680 Rand Shmem 1058375 1053486 827935 764955 764376 756489 SSD Vanilla Shaohua HughOld HughNew HughPC4 HughPC0 Seq Anon 24634 24198 24673 25107 21614 70018 Seq Shmem 24959 24932 25052 25703 22030 69678 Rand Anon 43014 26146 28075 25989 26935 25901 Rand Shmem 45349 45215 28249 24268 24138 24332 These tests are, of course, two extremes of a very simple case: under heavier mixed loads I've not yet observed any consistent improvement or degradation, and wider testing would be welcome. Shaohua Li: Test shows Vanilla is slightly better in sequential workload than Hugh's patch. I observed with Hugh's patch sometimes the readahead size is shrinked too fast (from 8 to 1 immediately) in sequential workload if there is no hit. And in such case, continuing doing readahead is good actually. I don't prepare a sophisticated algorithm for the sequential workload because so far we can't guarantee sequential accessed pages are swap out sequentially. So I slightly change Hugh's heuristic - don't shrink readahead size too fast. Here is my test result (unit second, 3 runs average): Vanilla Hugh New Seq 356 370 360 Random 4525 2447 2444 Attached graph is the swapin/swapout throughput I collected with 'vmstat 2'. The first part is running a random workload (till around 1200 of the x-axis) and the second part is running a sequential workload. swapin and swapout throughput are almost identical in steady state in both workloads. These are expected behavior. while in Vanilla, swapin is much bigger than swapout especially in random workload (because wrong readahead). Original patches by: Shaohua Li and Konstantin Khlebnikov. [fengguang.wu@intel.com: swapin_nr_pages() can be static] Signed-off-by: Hugh Dickins Signed-off-by: Shaohua Li Signed-off-by: Fengguang Wu Cc: Rik van Riel Cc: Wu Fengguang Cc: Minchan Kim Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 4 +-- mm/swap_state.c | 63 +++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 62 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index e464b4e987e8..d1fe1a761047 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -228,9 +228,9 @@ PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1) TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback) PAGEFLAG(MappedToDisk, mappedtodisk) -/* PG_readahead is only used for file reads; PG_reclaim is only for writes */ +/* PG_readahead is only used for reads; PG_reclaim is only for writes */ PAGEFLAG(Reclaim, reclaim) TESTCLEARFLAG(Reclaim, reclaim) -PAGEFLAG(Readahead, reclaim) /* Reminder to do async read-ahead */ +PAGEFLAG(Readahead, reclaim) TESTCLEARFLAG(Readahead, reclaim) #ifdef CONFIG_HIGHMEM /* diff --git a/mm/swap_state.c b/mm/swap_state.c index 98e85e9c2b2d..e76ace30d436 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -63,6 +63,8 @@ unsigned long total_swapcache_pages(void) return ret; } +static atomic_t swapin_readahead_hits = ATOMIC_INIT(4); + void show_swap_cache_info(void) { printk("%lu pages in swap cache\n", total_swapcache_pages()); @@ -286,8 +288,11 @@ struct page * lookup_swap_cache(swp_entry_t entry) page = find_get_page(swap_address_space(entry), entry.val); - if (page) + if (page) { INC_CACHE_INFO(find_success); + if (TestClearPageReadahead(page)) + atomic_inc(&swapin_readahead_hits); + } INC_CACHE_INFO(find_total); return page; @@ -389,6 +394,50 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, return found_page; } +static unsigned long swapin_nr_pages(unsigned long offset) +{ + static unsigned long prev_offset; + unsigned int pages, max_pages, last_ra; + static atomic_t last_readahead_pages; + + max_pages = 1 << ACCESS_ONCE(page_cluster); + if (max_pages <= 1) + return 1; + + /* + * This heuristic has been found to work well on both sequential and + * random loads, swapping to hard disk or to SSD: please don't ask + * what the "+ 2" means, it just happens to work well, that's all. + */ + pages = atomic_xchg(&swapin_readahead_hits, 0) + 2; + if (pages == 2) { + /* + * We can have no readahead hits to judge by: but must not get + * stuck here forever, so check for an adjacent offset instead + * (and don't even bother to check whether swap type is same). + */ + if (offset != prev_offset + 1 && offset != prev_offset - 1) + pages = 1; + prev_offset = offset; + } else { + unsigned int roundup = 4; + while (roundup < pages) + roundup <<= 1; + pages = roundup; + } + + if (pages > max_pages) + pages = max_pages; + + /* Don't shrink readahead too fast */ + last_ra = atomic_read(&last_readahead_pages) / 2; + if (pages < last_ra) + pages = last_ra; + atomic_set(&last_readahead_pages, pages); + + return pages; +} + /** * swapin_readahead - swap in pages in hope we need them soon * @entry: swap entry of this memory @@ -412,11 +461,16 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr) { struct page *page; - unsigned long offset = swp_offset(entry); + unsigned long entry_offset = swp_offset(entry); + unsigned long offset = entry_offset; unsigned long start_offset, end_offset; - unsigned long mask = (1UL << page_cluster) - 1; + unsigned long mask; struct blk_plug plug; + mask = swapin_nr_pages(offset) - 1; + if (!mask) + goto skip; + /* Read a page_cluster sized and aligned cluster around offset. */ start_offset = offset & ~mask; end_offset = offset | mask; @@ -430,10 +484,13 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, gfp_mask, vma, addr); if (!page) continue; + if (offset != entry_offset) + SetPageReadahead(page); page_cache_release(page); } blk_finish_plug(&plug); lru_add_drain(); /* Push any new pages onto the LRU now */ +skip: return read_swap_cache_async(entry, gfp_mask, vma, addr); } -- cgit v1.2.3-70-g09d2 From ee262ad827f89e2dc7851ec2986953b5b125c6bc Mon Sep 17 00:00:00 2001 From: Jan Moskyto Matejka Date: Thu, 6 Feb 2014 12:10:00 +0100 Subject: inet: defines IPPROTO_* needed for module alias generation Commit cfd280c91253 ("net: sync some IP headers with glibc") changed a set of define's to an enum (with no explanation why) which introduced a bug in module mip6 where aliases are generated using the IPPROTO_* defines; mip6 doesn't load if require_module called with the aliases from xfrm_get_type(). Reverting this change back to define's to fix the aliases. modinfo mip6 (before this change) alias: xfrm-type-10-IPPROTO_DSTOPTS alias: xfrm-type-10-IPPROTO_ROUTING modinfo mip6 (after this change) alias: xfrm-type-10-43 alias: xfrm-type-10-60 Signed-off-by: Jan Moskyto Matejka Signed-off-by: David S. Miller --- include/uapi/linux/in6.h | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h index 633b93cac1ed..e9a1d2d973b6 100644 --- a/include/uapi/linux/in6.h +++ b/include/uapi/linux/in6.h @@ -128,22 +128,13 @@ struct in6_flowlabel_req { * IPV6 extension headers */ #if __UAPI_DEF_IPPROTO_V6 -enum { - IPPROTO_HOPOPTS = 0, /* IPv6 hop-by-hop options */ -#define IPPROTO_HOPOPTS IPPROTO_HOPOPTS - IPPROTO_ROUTING = 43, /* IPv6 routing header */ -#define IPPROTO_ROUTING IPPROTO_ROUTING - IPPROTO_FRAGMENT = 44, /* IPv6 fragmentation header */ -#define IPPROTO_FRAGMENT IPPROTO_FRAGMENT - IPPROTO_ICMPV6 = 58, /* ICMPv6 */ -#define IPPROTO_ICMPV6 IPPROTO_ICMPV6 - IPPROTO_NONE = 59, /* IPv6 no next header */ -#define IPPROTO_NONE IPPROTO_NONE - IPPROTO_DSTOPTS = 60, /* IPv6 destination options */ -#define IPPROTO_DSTOPTS IPPROTO_DSTOPTS - IPPROTO_MH = 135, /* IPv6 mobility header */ -#define IPPROTO_MH IPPROTO_MH -}; +#define IPPROTO_HOPOPTS 0 /* IPv6 hop-by-hop options */ +#define IPPROTO_ROUTING 43 /* IPv6 routing header */ +#define IPPROTO_FRAGMENT 44 /* IPv6 fragmentation header */ +#define IPPROTO_ICMPV6 58 /* ICMPv6 */ +#define IPPROTO_NONE 59 /* IPv6 no next header */ +#define IPPROTO_DSTOPTS 60 /* IPv6 destination options */ +#define IPPROTO_MH 135 /* IPv6 mobility header */ #endif /* __UAPI_DEF_IPPROTO_V6 */ /* -- cgit v1.2.3-70-g09d2 From 78c0f98cc9dd46824fa66f35f14ea24ba733d145 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 30 Jan 2014 13:49:48 +0200 Subject: IB/mlx5: Fix binary compatibility with libmlx5 Commit c1be5232d21d ("Fix micro UAR allocator") broke binary compatibility between libmlx5 and mlx5_ib since it defines a different value to the number of micro UARs per page, leading to wrong calculation in libmlx5. This patch defines struct mlx5_ib_alloc_ucontext_req_v2 as an extension to struct mlx5_ib_alloc_ucontext_req. The extended size is determined in mlx5_ib_alloc_ucontext() and in case of old library we use uuarn 0 which works fine -- this is acheived due to create_user_qp() falling back from high to medium then to low class where low class will return 0. For new libraries we use the more sophisticated allocation algorithm. Signed-off-by: Eli Cohen Reviewed-by: Yann Droneaud Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx5/main.c | 19 +++++++++++++++++-- drivers/infiniband/hw/mlx5/qp.c | 10 ++++++++-- drivers/infiniband/hw/mlx5/user.h | 7 +++++++ include/linux/mlx5/driver.h | 1 + 4 files changed, 33 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 9660d093f8cf..f4ef4a24d410 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -536,24 +536,38 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(ibdev); - struct mlx5_ib_alloc_ucontext_req req; + struct mlx5_ib_alloc_ucontext_req_v2 req; struct mlx5_ib_alloc_ucontext_resp resp; struct mlx5_ib_ucontext *context; struct mlx5_uuar_info *uuari; struct mlx5_uar *uars; int gross_uuars; int num_uars; + int ver; int uuarn; int err; int i; + int reqlen; if (!dev->ib_active) return ERR_PTR(-EAGAIN); - err = ib_copy_from_udata(&req, udata, sizeof(req)); + memset(&req, 0, sizeof(req)); + reqlen = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr); + if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req)) + ver = 0; + else if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req_v2)) + ver = 2; + else + return ERR_PTR(-EINVAL); + + err = ib_copy_from_udata(&req, udata, reqlen); if (err) return ERR_PTR(err); + if (req.flags || req.reserved) + return ERR_PTR(-EINVAL); + if (req.total_num_uuars > MLX5_MAX_UUARS) return ERR_PTR(-ENOMEM); @@ -626,6 +640,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, if (err) goto out_uars; + uuari->ver = ver; uuari->num_low_latency_uuars = req.num_low_latency_uuars; uuari->uars = uars; uuari->num_uars = num_uars; diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 492dc330e907..091576a777e9 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -430,11 +430,17 @@ static int alloc_uuar(struct mlx5_uuar_info *uuari, break; case MLX5_IB_LATENCY_CLASS_MEDIUM: - uuarn = alloc_med_class_uuar(uuari); + if (uuari->ver < 2) + uuarn = -ENOMEM; + else + uuarn = alloc_med_class_uuar(uuari); break; case MLX5_IB_LATENCY_CLASS_HIGH: - uuarn = alloc_high_class_uuar(uuari); + if (uuari->ver < 2) + uuarn = -ENOMEM; + else + uuarn = alloc_high_class_uuar(uuari); break; case MLX5_IB_LATENCY_CLASS_FAST_PATH: diff --git a/drivers/infiniband/hw/mlx5/user.h b/drivers/infiniband/hw/mlx5/user.h index 32a2a5dfc523..0f4f8e42a17f 100644 --- a/drivers/infiniband/hw/mlx5/user.h +++ b/drivers/infiniband/hw/mlx5/user.h @@ -62,6 +62,13 @@ struct mlx5_ib_alloc_ucontext_req { __u32 num_low_latency_uuars; }; +struct mlx5_ib_alloc_ucontext_req_v2 { + __u32 total_num_uuars; + __u32 num_low_latency_uuars; + __u32 flags; + __u32 reserved; +}; + struct mlx5_ib_alloc_ucontext_resp { __u32 qp_tab_size; __u32 bf_reg_size; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 554548cd3dd4..32cb18c399c2 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -227,6 +227,7 @@ struct mlx5_uuar_info { * protect uuar allocation data structs */ struct mutex lock; + u32 ver; }; struct mlx5_bf { -- cgit v1.2.3-70-g09d2 From e28bab4828354583bb66ac09021ca69b341a7db4 Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Wed, 15 Jan 2014 17:12:58 -0800 Subject: Drivers: hv: vmbus: Specify the target CPU that should receive notification During the initial VMBUS connect phase, starting with WS2012 R2, we should specify the VPCU in the guest that should receive the notification. Fix this issue. This fix is required to properly connect to the host in the kexeced kernel. Signed-off-by: K. Y. Srinivasan Cc: [3.9+] Signed-off-by: Greg Kroah-Hartman --- drivers/hv/connection.c | 2 ++ include/linux/hyperv.h | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c index af6edf9b1936..855bbda9964d 100644 --- a/drivers/hv/connection.c +++ b/drivers/hv/connection.c @@ -78,6 +78,8 @@ static int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, msg->interrupt_page = virt_to_phys(vmbus_connection.int_page); msg->monitor_page1 = virt_to_phys(vmbus_connection.monitor_pages[0]); msg->monitor_page2 = virt_to_phys(vmbus_connection.monitor_pages[1]); + if (version == VERSION_WIN8) + msg->target_vcpu = hv_context.vp_index[smp_processor_id()]; /* * Add to list before we send the request since we may diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 15da677478dd..344883dce584 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -875,7 +875,7 @@ struct vmbus_channel_relid_released { struct vmbus_channel_initiate_contact { struct vmbus_channel_message_header header; u32 vmbus_version_requested; - u32 padding2; + u32 target_vcpu; /* The VCPU the host should respond to */ u64 interrupt_page; u64 monitor_page1; u64 monitor_page2; -- cgit v1.2.3-70-g09d2 From 80bfa2f6e2e81049fc6cd3bfaeedcb64db3a9ba6 Mon Sep 17 00:00:00 2001 From: Roger Pau Monne Date: Tue, 4 Feb 2014 11:26:15 +0100 Subject: xen-blkif: drop struct blkif_request_segment_aligned MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This was wrongly introduced in commit 402b27f9, the only difference between blkif_request_segment_aligned and blkif_request_segment is that the former has a named padding, while both share the same memory layout. Also correct a few minor glitches in the description, including for it to no longer assume PAGE_SIZE == 4096. Signed-off-by: Roger Pau Monné [Description fix by Jan Beulich] Signed-off-by: Jan Beulich Reported-by: Jan Beulich Cc: Konrad Rzeszutek Wilk Cc: David Vrabel Cc: Boris Ostrovsky Tested-by: Matt Rushton Cc: Matt Wilson Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/blkback.c | 2 +- drivers/block/xen-blkback/common.h | 2 +- drivers/block/xen-blkfront.c | 6 +++--- include/xen/interface/io/blkif.h | 34 ++++++++++++++-------------------- 4 files changed, 19 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 394fa2eabf87..e612627ae981 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -847,7 +847,7 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req, struct grant_page **pages = pending_req->indirect_pages; struct xen_blkif *blkif = pending_req->blkif; int indirect_grefs, rc, n, nseg, i; - struct blkif_request_segment_aligned *segments = NULL; + struct blkif_request_segment *segments = NULL; nseg = pending_req->nr_pages; indirect_grefs = INDIRECT_PAGES(nseg); diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index e40326a7f707..9eb34e24b4fe 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -57,7 +57,7 @@ #define MAX_INDIRECT_SEGMENTS 256 #define SEGS_PER_INDIRECT_FRAME \ - (PAGE_SIZE/sizeof(struct blkif_request_segment_aligned)) + (PAGE_SIZE/sizeof(struct blkif_request_segment)) #define MAX_INDIRECT_PAGES \ ((MAX_INDIRECT_SEGMENTS + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME) #define INDIRECT_PAGES(_segs) \ diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index c4a4c9006288..7d09dfc9735c 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -162,7 +162,7 @@ static DEFINE_SPINLOCK(minor_lock); #define DEV_NAME "xvd" /* name in /dev */ #define SEGS_PER_INDIRECT_FRAME \ - (PAGE_SIZE/sizeof(struct blkif_request_segment_aligned)) + (PAGE_SIZE/sizeof(struct blkif_request_segment)) #define INDIRECT_GREFS(_segs) \ ((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME) @@ -393,7 +393,7 @@ static int blkif_queue_request(struct request *req) unsigned long id; unsigned int fsect, lsect; int i, ref, n; - struct blkif_request_segment_aligned *segments = NULL; + struct blkif_request_segment *segments = NULL; /* * Used to store if we are able to queue the request by just using @@ -550,7 +550,7 @@ static int blkif_queue_request(struct request *req) } else { n = i % SEGS_PER_INDIRECT_FRAME; segments[n] = - (struct blkif_request_segment_aligned) { + (struct blkif_request_segment) { .gref = ref, .first_sect = fsect, .last_sect = lsect }; diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h index ae665ac59c36..32ec05a6572f 100644 --- a/include/xen/interface/io/blkif.h +++ b/include/xen/interface/io/blkif.h @@ -113,13 +113,13 @@ typedef uint64_t blkif_sector_t; * it's less than the number provided by the backend. The indirect_grefs field * in blkif_request_indirect should be filled by the frontend with the * grant references of the pages that are holding the indirect segments. - * This pages are filled with an array of blkif_request_segment_aligned - * that hold the information about the segments. The number of indirect - * pages to use is determined by the maximum number of segments - * a indirect request contains. Every indirect page can contain a maximum - * of 512 segments (PAGE_SIZE/sizeof(blkif_request_segment_aligned)), - * so to calculate the number of indirect pages to use we have to do - * ceil(indirect_segments/512). + * These pages are filled with an array of blkif_request_segment that hold the + * information about the segments. The number of indirect pages to use is + * determined by the number of segments an indirect request contains. Every + * indirect page can contain a maximum of + * (PAGE_SIZE / sizeof(struct blkif_request_segment)) segments, so to + * calculate the number of indirect pages to use we have to do + * ceil(indirect_segments / (PAGE_SIZE / sizeof(struct blkif_request_segment))). * * If a backend does not recognize BLKIF_OP_INDIRECT, it should *not* * create the "feature-max-indirect-segments" node! @@ -135,13 +135,12 @@ typedef uint64_t blkif_sector_t; #define BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST 8 -struct blkif_request_segment_aligned { - grant_ref_t gref; /* reference to I/O buffer frame */ - /* @first_sect: first sector in frame to transfer (inclusive). */ - /* @last_sect: last sector in frame to transfer (inclusive). */ - uint8_t first_sect, last_sect; - uint16_t _pad; /* padding to make it 8 bytes, so it's cache-aligned */ -} __attribute__((__packed__)); +struct blkif_request_segment { + grant_ref_t gref; /* reference to I/O buffer frame */ + /* @first_sect: first sector in frame to transfer (inclusive). */ + /* @last_sect: last sector in frame to transfer (inclusive). */ + uint8_t first_sect, last_sect; +}; struct blkif_request_rw { uint8_t nr_segments; /* number of segments */ @@ -151,12 +150,7 @@ struct blkif_request_rw { #endif uint64_t id; /* private guest value, echoed in resp */ blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ - struct blkif_request_segment { - grant_ref_t gref; /* reference to I/O buffer frame */ - /* @first_sect: first sector in frame to transfer (inclusive). */ - /* @last_sect: last sector in frame to transfer (inclusive). */ - uint8_t first_sect, last_sect; - } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; } __attribute__((__packed__)); struct blkif_request_discard { -- cgit v1.2.3-70-g09d2 From 72a0a36e2854a6eadb4cf2561858f613f9cd4639 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 7 Feb 2014 10:22:36 -0800 Subject: blk-mq: support at_head inserations for blk_execute_rq This is neede for proper SG_IO operation as well as various uses of blk_execute_rq from the SCSI midlayer. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-exec.c | 2 +- block/blk-mq.c | 17 ++++++++++------- include/linux/blk-mq.h | 3 ++- 3 files changed, 13 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/block/blk-exec.c b/block/blk-exec.c index bbfc072a79c2..c68613bb4c79 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -65,7 +65,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, * be resued after dying flag is set */ if (q->mq_ops) { - blk_mq_insert_request(q, rq, true); + blk_mq_insert_request(q, rq, at_head, true); return; } diff --git a/block/blk-mq.c b/block/blk-mq.c index 9072d0ab184f..c9306e3403fe 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -714,13 +714,16 @@ static void blk_mq_work_fn(struct work_struct *work) } static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, - struct request *rq) + struct request *rq, bool at_head) { struct blk_mq_ctx *ctx = rq->mq_ctx; trace_block_rq_insert(hctx->queue, rq); - list_add_tail(&rq->queuelist, &ctx->rq_list); + if (at_head) + list_add(&rq->queuelist, &ctx->rq_list); + else + list_add_tail(&rq->queuelist, &ctx->rq_list); blk_mq_hctx_mark_pending(hctx, ctx); /* @@ -730,7 +733,7 @@ static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, } void blk_mq_insert_request(struct request_queue *q, struct request *rq, - bool run_queue) + bool at_head, bool run_queue) { struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx, *current_ctx; @@ -749,7 +752,7 @@ void blk_mq_insert_request(struct request_queue *q, struct request *rq, rq->mq_ctx = ctx; } spin_lock(&ctx->lock); - __blk_mq_insert_request(hctx, rq); + __blk_mq_insert_request(hctx, rq, at_head); spin_unlock(&ctx->lock); blk_mq_put_ctx(current_ctx); @@ -781,7 +784,7 @@ void blk_mq_run_request(struct request *rq, bool run_queue, bool async) /* ctx->cpu might be offline */ spin_lock(&ctx->lock); - __blk_mq_insert_request(hctx, rq); + __blk_mq_insert_request(hctx, rq, false); spin_unlock(&ctx->lock); blk_mq_put_ctx(current_ctx); @@ -819,7 +822,7 @@ static void blk_mq_insert_requests(struct request_queue *q, rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); rq->mq_ctx = ctx; - __blk_mq_insert_request(hctx, rq); + __blk_mq_insert_request(hctx, rq, false); } spin_unlock(&ctx->lock); @@ -971,7 +974,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) __blk_mq_free_request(hctx, ctx, rq); else { blk_mq_bio_to_request(rq, bio); - __blk_mq_insert_request(hctx, rq); + __blk_mq_insert_request(hctx, rq, false); } spin_unlock(&ctx->lock); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 1e8f16f65af4..b7638be58599 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -122,7 +122,8 @@ void blk_mq_init_commands(struct request_queue *, void (*init)(void *data, struc void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); -void blk_mq_insert_request(struct request_queue *, struct request *, bool); +void blk_mq_insert_request(struct request_queue *, struct request *, + bool, bool); void blk_mq_run_queues(struct request_queue *q, bool async); void blk_mq_free_request(struct request *rq); bool blk_mq_can_queue(struct blk_mq_hw_ctx *); -- cgit v1.2.3-70-g09d2 From 3d4b81eda2211f32886e2978daf6f39885042fc4 Mon Sep 17 00:00:00 2001 From: Sarah Sharp Date: Fri, 31 Jan 2014 11:52:57 -0800 Subject: Revert "usb: xhci: Link TRB must not occur within a USB payload burst" This reverts commit 35773dac5f862cb1c82ea151eba3e2f6de51ec3e. It's a hack that caused regressions in the usb-storage and userspace USB drivers that use usbfs and libusb. Commit 70cabb7d992f "xhci 1.0: Limit arbitrarily-aligned scatter gather." should fix the issues seen with the ax88179_178a driver on xHCI 1.0 hosts, without causing regressions. Signed-off-by: Sarah Sharp Cc: stable@vger.kernel.org # 3.12 --- drivers/usb/host/xhci-ring.c | 54 ++------------------------------------------ include/linux/usb.h | 2 -- 2 files changed, 2 insertions(+), 54 deletions(-) (limited to 'include') diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c index 0272450a1667..0ed64eb68e48 100644 --- a/drivers/usb/host/xhci-ring.c +++ b/drivers/usb/host/xhci-ring.c @@ -2967,58 +2967,8 @@ static int prepare_ring(struct xhci_hcd *xhci, struct xhci_ring *ep_ring, } while (1) { - if (room_on_ring(xhci, ep_ring, num_trbs)) { - union xhci_trb *trb = ep_ring->enqueue; - unsigned int usable = ep_ring->enq_seg->trbs + - TRBS_PER_SEGMENT - 1 - trb; - u32 nop_cmd; - - /* - * Section 4.11.7.1 TD Fragments states that a link - * TRB must only occur at the boundary between - * data bursts (eg 512 bytes for 480M). - * While it is possible to split a large fragment - * we don't know the size yet. - * Simplest solution is to fill the trb before the - * LINK with nop commands. - */ - if (num_trbs == 1 || num_trbs <= usable || usable == 0) - break; - - if (ep_ring->type != TYPE_BULK) - /* - * While isoc transfers might have a buffer that - * crosses a 64k boundary it is unlikely. - * Since we can't add NOPs without generating - * gaps in the traffic just hope it never - * happens at the end of the ring. - * This could be fixed by writing a LINK TRB - * instead of the first NOP - however the - * TRB_TYPE_LINK_LE32() calls would all need - * changing to check the ring length. - */ - break; - - if (num_trbs >= TRBS_PER_SEGMENT) { - xhci_err(xhci, "Too many fragments %d, max %d\n", - num_trbs, TRBS_PER_SEGMENT - 1); - return -ENOMEM; - } - - nop_cmd = cpu_to_le32(TRB_TYPE(TRB_TR_NOOP) | - ep_ring->cycle_state); - ep_ring->num_trbs_free -= usable; - do { - trb->generic.field[0] = 0; - trb->generic.field[1] = 0; - trb->generic.field[2] = 0; - trb->generic.field[3] = nop_cmd; - trb++; - } while (--usable); - ep_ring->enqueue = trb; - if (room_on_ring(xhci, ep_ring, num_trbs)) - break; - } + if (room_on_ring(xhci, ep_ring, num_trbs)) + break; if (ep_ring == xhci->cmd_ring) { xhci_err(xhci, "Do not support expand command ring\n"); diff --git a/include/linux/usb.h b/include/linux/usb.h index c716da18c668..7f6eb859873e 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -1265,8 +1265,6 @@ typedef void (*usb_complete_t)(struct urb *); * @sg: scatter gather buffer list, the buffer size of each element in * the list (except the last) must be divisible by the endpoint's * max packet size if no_sg_constraint isn't set in 'struct usb_bus' - * (FIXME: scatter-gather under xHCI is broken for periodic transfers. - * Do not use urb->sg for interrupt endpoints for now, only bulk.) * @num_mapped_sgs: (internal) number of mapped sg entries * @num_sgs: number of entries in the sg list * @transfer_buffer_length: How big is transfer_buffer. The transfer may -- cgit v1.2.3-70-g09d2 From 3b1cc9b9622a022208ec95b1259b05bbdf712eb7 Mon Sep 17 00:00:00 2001 From: Sudeep Dutt Date: Mon, 3 Feb 2014 14:53:19 -0800 Subject: misc: mic: fix possible signed underflow (undefined behavior) in userspace API iovcnt is declared as a signed integer in both the userspace API and as a local variable in mic_virtio.c. The while() loop in mic_virtio.c iterates until the local variable iovcnt reaches the value 0. If userspace passes e.g. INT_MIN as iovcnt field, this loop then appears to depend on an undefined behavior (signed underflow) to complete. The fix is to use unsigned integers in both the userspace API and the local variable. This issue was reported @ https://lkml.org/lkml/2014/1/10/10 Reported-by: Mathieu Desnoyers Reviewed-by: Ashutosh Dixit Signed-off-by: Sudeep Dutt Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mic/host/mic_virtio.c | 3 ++- include/uapi/linux/mic_ioctl.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/misc/mic/host/mic_virtio.c b/drivers/misc/mic/host/mic_virtio.c index 752ff873f891..7e1ef0ebbb80 100644 --- a/drivers/misc/mic/host/mic_virtio.c +++ b/drivers/misc/mic/host/mic_virtio.c @@ -156,7 +156,8 @@ static int mic_vringh_copy(struct mic_vdev *mvdev, struct vringh_kiov *iov, static int _mic_virtio_copy(struct mic_vdev *mvdev, struct mic_copy_desc *copy) { - int ret = 0, iovcnt = copy->iovcnt; + int ret = 0; + u32 iovcnt = copy->iovcnt; struct iovec iov; struct iovec __user *u_iov = copy->iov; void __user *ubuf = NULL; diff --git a/include/uapi/linux/mic_ioctl.h b/include/uapi/linux/mic_ioctl.h index 7fabba5059cf..feb0b4c0814c 100644 --- a/include/uapi/linux/mic_ioctl.h +++ b/include/uapi/linux/mic_ioctl.h @@ -39,7 +39,7 @@ struct mic_copy_desc { #else struct iovec *iov; #endif - int iovcnt; + __u32 iovcnt; __u8 vr_idx; __u8 update_used; __u32 out_len; -- cgit v1.2.3-70-g09d2 From 0668d3065128d39449c097e62dbdb5707820137d Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 2 Jan 2014 16:37:32 -0800 Subject: genirq: Add devm_request_any_context_irq() Some drivers use request_any_context_irq() but there isn't a devm_* function for it. Add one so that these drivers don't need to explicitly free the irq on driver detach. Signed-off-by: Stephen Boyd Cc: linux-arm-kernel@lists.infradead.org Cc: Dmitry Torokhov Link: http://lkml.kernel.org/r/1388709460-19222-3-git-send-email-sboyd@codeaurora.org Signed-off-by: Thomas Gleixner --- include/linux/interrupt.h | 5 +++++ kernel/irq/devres.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) (limited to 'include') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 0053adde0ed9..a2678d35b5a2 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -158,6 +158,11 @@ devm_request_irq(struct device *dev, unsigned int irq, irq_handler_t handler, devname, dev_id); } +extern int __must_check +devm_request_any_context_irq(struct device *dev, unsigned int irq, + irq_handler_t handler, unsigned long irqflags, + const char *devname, void *dev_id); + extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id); /* diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index bd8e788d71e0..1ef0606797c9 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -72,6 +72,51 @@ int devm_request_threaded_irq(struct device *dev, unsigned int irq, } EXPORT_SYMBOL(devm_request_threaded_irq); +/** + * devm_request_any_context_irq - allocate an interrupt line for a managed device + * @dev: device to request interrupt for + * @irq: Interrupt line to allocate + * @handler: Function to be called when the IRQ occurs + * @thread_fn: function to be called in a threaded interrupt context. NULL + * for devices which handle everything in @handler + * @irqflags: Interrupt type flags + * @devname: An ascii name for the claiming device + * @dev_id: A cookie passed back to the handler function + * + * Except for the extra @dev argument, this function takes the + * same arguments and performs the same function as + * request_any_context_irq(). IRQs requested with this function will be + * automatically freed on driver detach. + * + * If an IRQ allocated with this function needs to be freed + * separately, devm_free_irq() must be used. + */ +int devm_request_any_context_irq(struct device *dev, unsigned int irq, + irq_handler_t handler, unsigned long irqflags, + const char *devname, void *dev_id) +{ + struct irq_devres *dr; + int rc; + + dr = devres_alloc(devm_irq_release, sizeof(struct irq_devres), + GFP_KERNEL); + if (!dr) + return -ENOMEM; + + rc = request_any_context_irq(irq, handler, irqflags, devname, dev_id); + if (rc) { + devres_free(dr); + return rc; + } + + dr->irq = irq; + dr->dev_id = dev_id; + devres_add(dev, dr); + + return 0; +} +EXPORT_SYMBOL(devm_request_any_context_irq); + /** * devm_free_irq - free an interrupt * @dev: device to free interrupt for -- cgit v1.2.3-70-g09d2 From d311d79de305f1ada47cadd672e6ed1b28a949eb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 9 Feb 2014 15:18:09 -0500 Subject: fix O_SYNC|O_APPEND syncing the wrong range on write() It actually goes back to 2004 ([PATCH] Concurrent O_SYNC write support) when sync_page_range() had been introduced; generic_file_write{,v}() correctly synced pos_after_write - written .. pos_after_write - 1 but generic_file_aio_write() synced pos_before_write .. pos_before_write + written - 1 instead. Which is not the same thing with O_APPEND, obviously. A couple of years later correct variant had been killed off when everything switched to use of generic_file_aio_write(). All users of generic_file_aio_write() are affected, and the same bug has been copied into other instances of ->aio_write(). The fix is trivial; the only subtle point is that generic_write_sync() ought to be inlined to avoid calculations useless for the majority of calls. Signed-off-by: Al Viro --- fs/cifs/file.c | 4 ++-- fs/ext4/file.c | 2 +- fs/ntfs/file.c | 2 +- fs/sync.c | 17 ----------------- fs/xfs/xfs_file.c | 2 +- include/linux/fs.h | 8 +++++++- mm/filemap.c | 4 ++-- 7 files changed, 14 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 853d6d1cc822..a7eda8ebfacc 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2559,8 +2559,8 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov, if (rc > 0) { ssize_t err; - err = generic_write_sync(file, pos, rc); - if (err < 0 && rc > 0) + err = generic_write_sync(file, iocb->ki_pos - rc, rc); + if (err < 0) rc = err; } diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 43e64f6022eb..1a5073959f32 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -152,7 +152,7 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov, if (ret > 0) { ssize_t err; - err = generic_write_sync(file, pos, ret); + err = generic_write_sync(file, iocb->ki_pos - ret, ret); if (err < 0 && ret > 0) ret = err; } diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index ea4ba9daeb47..db9bd8a31725 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -2134,7 +2134,7 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos); mutex_unlock(&inode->i_mutex); if (ret > 0) { - int err = generic_write_sync(file, pos, ret); + int err = generic_write_sync(file, iocb->ki_pos - ret, ret); if (err < 0) ret = err; } diff --git a/fs/sync.c b/fs/sync.c index f15537452231..e8ba024a055b 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -222,23 +222,6 @@ SYSCALL_DEFINE1(fdatasync, unsigned int, fd) return do_fsync(fd, 1); } -/** - * generic_write_sync - perform syncing after a write if file / inode is sync - * @file: file to which the write happened - * @pos: offset where the write started - * @count: length of the write - * - * This is just a simple wrapper about our general syncing function. - */ -int generic_write_sync(struct file *file, loff_t pos, loff_t count) -{ - if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host)) - return 0; - return vfs_fsync_range(file, pos, pos + count - 1, - (file->f_flags & __O_SYNC) ? 0 : 1); -} -EXPORT_SYMBOL(generic_write_sync); - /* * sys_sync_file_range() permits finely controlled syncing over a segment of * a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 2e7989e3a2d6..64b48eade91d 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -799,7 +799,7 @@ xfs_file_aio_write( XFS_STATS_ADD(xs_write_bytes, ret); /* Handle various SYNC-type writes */ - err = generic_write_sync(file, pos, ret); + err = generic_write_sync(file, iocb->ki_pos - ret, ret); if (err < 0) ret = err; } diff --git a/include/linux/fs.h b/include/linux/fs.h index 09f553c59813..75ff961be051 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2273,7 +2273,13 @@ extern int filemap_fdatawrite_range(struct address_space *mapping, extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync); extern int vfs_fsync(struct file *file, int datasync); -extern int generic_write_sync(struct file *file, loff_t pos, loff_t count); +static inline int generic_write_sync(struct file *file, loff_t pos, loff_t count) +{ + if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host)) + return 0; + return vfs_fsync_range(file, pos, pos + count - 1, + (file->f_flags & __O_SYNC) ? 0 : 1); +} extern void emergency_sync(void); extern void emergency_remount(void); #ifdef CONFIG_BLOCK diff --git a/mm/filemap.c b/mm/filemap.c index d56d3c145b9f..7a13f6ac5421 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2553,8 +2553,8 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (ret > 0) { ssize_t err; - err = generic_write_sync(file, pos, ret); - if (err < 0 && ret > 0) + err = generic_write_sync(file, iocb->ki_pos - ret, ret); + if (err < 0) ret = err; } return ret; -- cgit v1.2.3-70-g09d2 From c4540a7d8c1e595560e53acedf88901daf15a2b5 Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Sun, 9 Feb 2014 18:30:39 +0530 Subject: fs: Add prototype declaration to appropriate header file include/linux/bio.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add prototype declaration to header file include/linux/bio.h because it is used by more than one file. This eliminates the following warning in bio-integrity.c: fs/bio-integrity.c:214:14: warning: no previous prototype for ‘bio_integrity_tag_size’ [-Wmissing-prototypes] Signed-off-by: Rashika Kheria Reviewed-by: Josh Triplett Signed-off-by: Jens Axboe --- include/linux/bio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index 70654521dab6..d6791bba8264 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -332,6 +332,7 @@ extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *); extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs); extern struct bio_set *fs_bio_set; +unsigned int bio_integrity_tag_size(struct bio *bio); static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) { -- cgit v1.2.3-70-g09d2 From f56b8bf6e445b22bb6207dbcf30b2c7f5ddfdf96 Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Sun, 9 Feb 2014 22:20:09 +0530 Subject: net: Move prototype declaration to appropriate header file from decnet/af_decnet.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move prototype declaration of functions to header file include/net/dn_route.h from net/decnet/af_decnet.c because it is used by more than one file. This eliminates the following warning in net/decnet/dn_route.c: net/decnet/dn_route.c:629:5: warning: no previous prototype for ‘dn_route_rcv’ [-Wmissing-prototypes] Signed-off-by: Rashika Kheria Reviewed-by: Josh Triplett Signed-off-by: David S. Miller --- include/net/dn_route.h | 2 ++ net/decnet/af_decnet.c | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/dn_route.h b/include/net/dn_route.h index b409ad6b8d7a..55df9939bca2 100644 --- a/include/net/dn_route.h +++ b/include/net/dn_route.h @@ -20,6 +20,8 @@ int dn_route_output_sock(struct dst_entry __rcu **pprt, struct flowidn *, struct sock *sk, int flags); int dn_cache_dump(struct sk_buff *skb, struct netlink_callback *cb); void dn_rt_cache_flush(int delay); +int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev); /* Masks for flags field */ #define DN_RT_F_PID 0x07 /* Mask for packet type */ diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 2954dcbca832..24d9193240e3 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -2104,8 +2104,6 @@ static struct notifier_block dn_dev_notifier = { .notifier_call = dn_device_event, }; -extern int dn_route_rcv(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *); - static struct packet_type dn_dix_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_DNA_RT), .func = dn_route_rcv, -- cgit v1.2.3-70-g09d2 From ab3301bd96c024ec9c2e1c35c90327dc5c8012a4 Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Sun, 9 Feb 2014 22:22:53 +0530 Subject: net: Move prototype declaration to header file include/net/dn.h from net/decnet/af_decnet.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move prototype declaration of functions to header file include/net/dn.h from net/decnet/af_decnet.c because they are used by more than one file. This eliminates the following warning in net/decnet/af_decnet.c: net/decnet/sysctl_net_decnet.c:354:6: warning: no previous prototype for ‘dn_register_sysctl’ [-Wmissing-prototypes] net/decnet/sysctl_net_decnet.c:359:6: warning: no previous prototype for ‘dn_unregister_sysctl’ [-Wmissing-prototypes] Signed-off-by: Rashika Kheria Reviewed-by: Josh Triplett Signed-off-by: David S. Miller --- include/net/dn.h | 2 ++ net/decnet/af_decnet.c | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/dn.h b/include/net/dn.h index ccc15588d108..913b73d239f5 100644 --- a/include/net/dn.h +++ b/include/net/dn.h @@ -200,6 +200,8 @@ static inline void dn_sk_ports_copy(struct flowidn *fld, struct dn_scp *scp) } unsigned int dn_mss_from_pmtu(struct net_device *dev, int mtu); +void dn_register_sysctl(void); +void dn_unregister_sysctl(void); #define DN_MENUVER_ACC 0x01 #define DN_MENUVER_USR 0x02 diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 24d9193240e3..4c04848953bd 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -2351,9 +2351,6 @@ static const struct proto_ops dn_proto_ops = { .sendpage = sock_no_sendpage, }; -void dn_register_sysctl(void); -void dn_unregister_sysctl(void); - MODULE_DESCRIPTION("The Linux DECnet Network Protocol"); MODULE_AUTHOR("Linux DECnet Project Team"); MODULE_LICENSE("GPL"); -- cgit v1.2.3-70-g09d2 From 493cc5e5ba450356560cc12a8f71ff6a8574c91c Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Sun, 9 Feb 2014 22:24:33 +0530 Subject: net: Move prototype declaration to include/net/ipx.h from net/ipx/ipx_route.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move prototype definition of function to header file include/net/ipx.h from net/ipx/ipx_route.c because they are used by more than one file. This eliminates the following warning from net/ipx/af_ipx.c: net/ipx/af_ipx.c:193:23: warning: no previous prototype for ‘ipxitf_find_using_net’ [-Wmissing-prototypes] net/ipx/af_ipx.c:577:5: warning: no previous prototype for ‘ipxitf_send’ [-Wmissing-prototypes] net/ipx/af_ipx.c:1219:8: warning: no previous prototype for ‘ipx_cksum’ [-Wmissing-prototypes] Signed-off-by: Rashika Kheria Reviewed-by: Josh Triplett Signed-off-by: David S. Miller --- include/net/ipx.h | 3 +++ net/ipx/ipx_route.c | 4 ---- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/ipx.h b/include/net/ipx.h index 9e9e35465baf..75466acdce21 100644 --- a/include/net/ipx.h +++ b/include/net/ipx.h @@ -140,6 +140,9 @@ static __inline__ void ipxitf_hold(struct ipx_interface *intrfc) } void ipxitf_down(struct ipx_interface *intrfc); +struct ipx_interface *ipxitf_find_using_net(__be32 net); +int ipxitf_send(struct ipx_interface *intrfc, struct sk_buff *skb, char *node); +__be16 ipx_cksum(struct ipxhdr *packet, int length); static __inline__ void ipxitf_put(struct ipx_interface *intrfc) { diff --git a/net/ipx/ipx_route.c b/net/ipx/ipx_route.c index 30f4519b092f..c1f03185c5e1 100644 --- a/net/ipx/ipx_route.c +++ b/net/ipx/ipx_route.c @@ -20,15 +20,11 @@ DEFINE_RWLOCK(ipx_routes_lock); extern struct ipx_interface *ipx_internal_net; -extern __be16 ipx_cksum(struct ipxhdr *packet, int length); extern struct ipx_interface *ipxitf_find_using_net(__be32 net); extern int ipxitf_demux_socket(struct ipx_interface *intrfc, struct sk_buff *skb, int copy); extern int ipxitf_demux_socket(struct ipx_interface *intrfc, struct sk_buff *skb, int copy); -extern int ipxitf_send(struct ipx_interface *intrfc, struct sk_buff *skb, - char *node); -extern struct ipx_interface *ipxitf_find_using_net(__be32 net); struct ipx_route *ipxrtr_lookup(__be32 net) { -- cgit v1.2.3-70-g09d2 From 578efbc19f468686ad7eb1e505bab56d04e92784 Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Sun, 9 Feb 2014 22:26:32 +0530 Subject: net: Move prototype declaration to header file include/net/ipx.h from net/ipx/af_ipx.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move prototype declaration of functions to header file include/net/ipx.h from net/ipx/af_ipx.c because they are used by more than one file. This eliminates the following warning in net/ipx/ipx_route.c:33:19: warning: no previous prototype for ‘ipxrtr_lookup’ [-Wmissing-prototypes] net/ipx/ipx_route.c:52:5: warning: no previous prototype for ‘ipxrtr_add_route’ [-Wmissing-prototypes] net/ipx/ipx_route.c:94:6: warning: no previous prototype for ‘ipxrtr_del_routes’ [-Wmissing-prototypes] net/ipx/ipx_route.c:149:5: warning: no previous prototype for ‘ipxrtr_route_skb’ [-Wmissing-prototypes] net/ipx/ipx_route.c:171:5: warning: no previous prototype for ‘ipxrtr_route_packet’ [-Wmissing-prototypes] net/ipx/ipx_route.c:261:5: warning: no previous prototype for ‘ipxrtr_ioctl’ [-Wmissing-prototypes] Signed-off-by: Rashika Kheria Reviewed-by: Josh Triplett Signed-off-by: David S. Miller --- include/net/ipx.h | 8 ++++++++ net/ipx/af_ipx.c | 9 --------- 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/ipx.h b/include/net/ipx.h index 75466acdce21..0143180fecc9 100644 --- a/include/net/ipx.h +++ b/include/net/ipx.h @@ -143,6 +143,14 @@ void ipxitf_down(struct ipx_interface *intrfc); struct ipx_interface *ipxitf_find_using_net(__be32 net); int ipxitf_send(struct ipx_interface *intrfc, struct sk_buff *skb, char *node); __be16 ipx_cksum(struct ipxhdr *packet, int length); +int ipxrtr_add_route(__be32 network, struct ipx_interface *intrfc, + unsigned char *node); +void ipxrtr_del_routes(struct ipx_interface *intrfc); +int ipxrtr_route_packet(struct sock *sk, struct sockaddr_ipx *usipx, + struct iovec *iov, size_t len, int noblock); +int ipxrtr_route_skb(struct sk_buff *skb); +struct ipx_route *ipxrtr_lookup(__be32 net); +int ipxrtr_ioctl(unsigned int cmd, void __user *arg); static __inline__ void ipxitf_put(struct ipx_interface *intrfc) { diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index 994e28bfb32e..e5a00a9b52f3 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -84,15 +84,6 @@ DEFINE_SPINLOCK(ipx_interfaces_lock); struct ipx_interface *ipx_primary_net; struct ipx_interface *ipx_internal_net; -extern int ipxrtr_add_route(__be32 network, struct ipx_interface *intrfc, - unsigned char *node); -extern void ipxrtr_del_routes(struct ipx_interface *intrfc); -extern int ipxrtr_route_packet(struct sock *sk, struct sockaddr_ipx *usipx, - struct iovec *iov, size_t len, int noblock); -extern int ipxrtr_route_skb(struct sk_buff *skb); -extern struct ipx_route *ipxrtr_lookup(__be32 net); -extern int ipxrtr_ioctl(unsigned int cmd, void __user *arg); - struct ipx_interface *ipx_interfaces_head(void) { struct ipx_interface *rc = NULL; -- cgit v1.2.3-70-g09d2 From 7780d8ae4ae1960ef7c0570de0a1ecd7b60c8152 Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Sun, 9 Feb 2014 22:27:41 +0530 Subject: net: Move prototype declaration to header file include/net/datalink.h from net/ipx/af_ipx.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move prototype declarations of function to header file include/net/datalink.h from net/ipx/af_ipx.c because they are used by more than one file. This eliminates the following warning in net/ipx/pe2.c: net/ipx/pe2.c:20:24: warning: no previous prototype for ‘make_EII_client’ [-Wmissing-prototypes] net/ipx/pe2.c:32:6: warning: no previous prototype for ‘destroy_EII_client’ [-Wmissing-prototypes] Signed-off-by: Rashika Kheria Reviewed-by: Josh Triplett Signed-off-by: David S. Miller --- include/net/datalink.h | 2 ++ net/ipx/af_ipx.c | 4 +--- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/datalink.h b/include/net/datalink.h index deb7ca75db48..93cb18f729b5 100644 --- a/include/net/datalink.h +++ b/include/net/datalink.h @@ -15,4 +15,6 @@ struct datalink_proto { struct list_head node; }; +struct datalink_proto *make_EII_client(void); +void destroy_EII_client(struct datalink_proto *dl); #endif diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index e5a00a9b52f3..224d05856b85 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include @@ -1977,9 +1978,6 @@ static struct notifier_block ipx_dev_notifier = { .notifier_call = ipxitf_device_event, }; -extern struct datalink_proto *make_EII_client(void); -extern void destroy_EII_client(struct datalink_proto *); - static const unsigned char ipx_8022_type = 0xE0; static const unsigned char ipx_snap_id[5] = { 0x0, 0x0, 0x0, 0x81, 0x37 }; static const char ipx_EII_err_msg[] __initconst = -- cgit v1.2.3-70-g09d2 From 535d3ae9c808e6a5248f0524dbc7a9e997cf3288 Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Sun, 9 Feb 2014 22:29:14 +0530 Subject: net: Move prototype declaration to header file include/net/net_namespace.h from net/ipx/af_ipx.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move prototype declaration of function to header file include/net/net_namespace.h from net/ipx/af_ipx.c because they are used by more than one file. This eliminates the following warning in net/ipx/sysctl_net_ipx.c: net/ipx/sysctl_net_ipx.c:33:6: warning: no previous prototype for ‘ipx_register_sysctl’ [-Wmissing-prototypes] net/ipx/sysctl_net_ipx.c:38:6: warning: no previous prototype for ‘ipx_unregister_sysctl’ [-Wmissing-prototypes] Signed-off-by: Rashika Kheria Signed-off-by: David S. Miller --- include/net/net_namespace.h | 8 ++++++++ net/ipx/af_ipx.c | 9 +-------- 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index da68c9a90ac5..991dcd94cbbf 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -162,6 +162,14 @@ extern struct list_head net_namespace_list; struct net *get_net_ns_by_pid(pid_t pid); struct net *get_net_ns_by_fd(int pid); +#ifdef CONFIG_SYSCTL +void ipx_register_sysctl(void); +void ipx_unregister_sysctl(void); +#else +#define ipx_register_sysctl() +#define ipx_unregister_sysctl() +#endif + #ifdef CONFIG_NET_NS void __put_net(struct net *net); diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index 224d05856b85..00b2a6d1c009 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -54,17 +54,10 @@ #include #include #include +#include #include -#ifdef CONFIG_SYSCTL -extern void ipx_register_sysctl(void); -extern void ipx_unregister_sysctl(void); -#else -#define ipx_register_sysctl() -#define ipx_unregister_sysctl() -#endif - /* Configuration Variables */ static unsigned char ipxcfg_max_hops = 16; static char ipxcfg_auto_select_primary; -- cgit v1.2.3-70-g09d2 From 30a91cb4ef385fe1b260df204ef314d86fff2850 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Feb 2014 03:24:38 -0800 Subject: blk-mq: rework I/O completions Rework I/O completions to work more like the old code path. blk_mq_end_io now stays out of the business of deferring completions to others CPUs and calling blk_mark_rq_complete. The latter is very important to allow completing requests that have timed out and thus are already marked completed, the former allows using the IPI callout even for driver specific completions instead of having to reimplement them. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 52 ++++++++++++++++++++++++++++++-------------------- block/blk-mq.h | 3 +-- block/blk-timeout.c | 2 +- include/linux/blk-mq.h | 4 ++++ 4 files changed, 37 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/block/blk-mq.c b/block/blk-mq.c index cee96234bf58..14c8f35946e1 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -326,7 +326,7 @@ static void blk_mq_bio_endio(struct request *rq, struct bio *bio, int error) bio_endio(bio, error); } -void blk_mq_complete_request(struct request *rq, int error) +void blk_mq_end_io(struct request *rq, int error) { struct bio *bio = rq->bio; unsigned int bytes = 0; @@ -351,46 +351,53 @@ void blk_mq_complete_request(struct request *rq, int error) else blk_mq_free_request(rq); } +EXPORT_SYMBOL(blk_mq_end_io); -void __blk_mq_end_io(struct request *rq, int error) -{ - if (!blk_mark_rq_complete(rq)) - blk_mq_complete_request(rq, error); -} - -static void blk_mq_end_io_remote(void *data) +static void __blk_mq_complete_request_remote(void *data) { struct request *rq = data; - __blk_mq_end_io(rq, rq->errors); + rq->q->softirq_done_fn(rq); } -/* - * End IO on this request on a multiqueue enabled driver. We'll either do - * it directly inline, or punt to a local IPI handler on the matching - * remote CPU. - */ -void blk_mq_end_io(struct request *rq, int error) +void __blk_mq_complete_request(struct request *rq) { struct blk_mq_ctx *ctx = rq->mq_ctx; int cpu; - if (!ctx->ipi_redirect) - return __blk_mq_end_io(rq, error); + if (!ctx->ipi_redirect) { + rq->q->softirq_done_fn(rq); + return; + } cpu = get_cpu(); if (cpu != ctx->cpu && cpu_online(ctx->cpu)) { - rq->errors = error; - rq->csd.func = blk_mq_end_io_remote; + rq->csd.func = __blk_mq_complete_request_remote; rq->csd.info = rq; rq->csd.flags = 0; __smp_call_function_single(ctx->cpu, &rq->csd, 0); } else { - __blk_mq_end_io(rq, error); + rq->q->softirq_done_fn(rq); } put_cpu(); } -EXPORT_SYMBOL(blk_mq_end_io); + +/** + * blk_mq_complete_request - end I/O on a request + * @rq: the request being processed + * + * Description: + * Ends all I/O on a request. It does not handle partial completions. + * The actual completion happens out-of-order, through a IPI handler. + **/ +void blk_mq_complete_request(struct request *rq) +{ + if (unlikely(blk_should_fake_timeout(rq->q))) + return; + if (!blk_mark_rq_complete(rq)) + __blk_mq_complete_request(rq); +} +EXPORT_SYMBOL(blk_mq_complete_request); static void blk_mq_start_request(struct request *rq) { @@ -1399,6 +1406,9 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg, if (reg->timeout) blk_queue_rq_timeout(q, reg->timeout); + if (reg->ops->complete) + blk_queue_softirq_done(q, reg->ops->complete); + blk_mq_init_flush(q); blk_mq_init_cpu_queues(q, reg->nr_hw_queues); diff --git a/block/blk-mq.h b/block/blk-mq.h index 5c3917984b00..f29b645f0e1c 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -22,8 +22,7 @@ struct blk_mq_ctx { struct kobject kobj; }; -void __blk_mq_end_io(struct request *rq, int error); -void blk_mq_complete_request(struct request *rq, int error); +void __blk_mq_complete_request(struct request *rq); void blk_mq_run_request(struct request *rq, bool run_queue, bool async); void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_init_flush(struct request_queue *q); diff --git a/block/blk-timeout.c b/block/blk-timeout.c index bba81c9348e1..d96f7061c6fd 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -91,7 +91,7 @@ static void blk_rq_timed_out(struct request *req) case BLK_EH_HANDLED: /* Can we use req->errors here? */ if (q->mq_ops) - blk_mq_complete_request(req, req->errors); + __blk_mq_complete_request(req); else __blk_complete_request(req); break; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b7638be58599..468be242db90 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -86,6 +86,8 @@ struct blk_mq_ops { */ rq_timed_out_fn *timeout; + softirq_done_fn *complete; + /* * Override for hctx allocations (should probably go) */ @@ -137,6 +139,8 @@ void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *, unsigned int); void blk_mq_end_io(struct request *rq, int error); +void blk_mq_complete_request(struct request *rq); + void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx); void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx); void blk_mq_stop_hw_queues(struct request_queue *q); -- cgit v1.2.3-70-g09d2 From 18741986a4b1dc4b1f171634c4191abc3b0fa023 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Feb 2014 09:29:00 -0700 Subject: blk-mq: rework flush sequencing logic Witch to using a preallocated flush_rq for blk-mq similar to what's done with the old request path. This allows us to set up the request properly with a tag from the actually allowed range and ->rq_disk as needed by some drivers. To make life easier we also switch to dynamic allocation of ->flush_rq for the old path. This effectively reverts most of "blk-mq: fix for flush deadlock" and "blk-mq: Don't reserve a tag for flush request" Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 15 +++++-- block/blk-flush.c | 105 ++++++++++++++++++------------------------------- block/blk-mq.c | 54 +++++++++---------------- block/blk-mq.h | 1 + block/blk-sysfs.c | 2 + include/linux/blk-mq.h | 5 +-- include/linux/blkdev.h | 11 ++---- 7 files changed, 76 insertions(+), 117 deletions(-) (limited to 'include') diff --git a/block/blk-core.c b/block/blk-core.c index 06636f3ad424..853f92749202 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -693,11 +693,20 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) if (!uninit_q) return NULL; + uninit_q->flush_rq = kzalloc(sizeof(struct request), GFP_KERNEL); + if (!uninit_q->flush_rq) + goto out_cleanup_queue; + q = blk_init_allocated_queue(uninit_q, rfn, lock); if (!q) - blk_cleanup_queue(uninit_q); - + goto out_free_flush_rq; return q; + +out_free_flush_rq: + kfree(uninit_q->flush_rq); +out_cleanup_queue: + blk_cleanup_queue(uninit_q); + return NULL; } EXPORT_SYMBOL(blk_init_queue_node); @@ -1127,7 +1136,7 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw, struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) { if (q->mq_ops) - return blk_mq_alloc_request(q, rw, gfp_mask, false); + return blk_mq_alloc_request(q, rw, gfp_mask); else return blk_old_get_request(q, rw, gfp_mask); } diff --git a/block/blk-flush.c b/block/blk-flush.c index 9143e85226c7..66e2b697f5db 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -130,20 +130,26 @@ static void blk_flush_restore_request(struct request *rq) blk_clear_rq_complete(rq); } -static void mq_flush_data_run(struct work_struct *work) +static void mq_flush_run(struct work_struct *work) { struct request *rq; - rq = container_of(work, struct request, mq_flush_data); + rq = container_of(work, struct request, mq_flush_work); memset(&rq->csd, 0, sizeof(rq->csd)); blk_mq_run_request(rq, true, false); } -static void blk_mq_flush_data_insert(struct request *rq) +static bool blk_flush_queue_rq(struct request *rq) { - INIT_WORK(&rq->mq_flush_data, mq_flush_data_run); - kblockd_schedule_work(rq->q, &rq->mq_flush_data); + if (rq->q->mq_ops) { + INIT_WORK(&rq->mq_flush_work, mq_flush_run); + kblockd_schedule_work(rq->q, &rq->mq_flush_work); + return false; + } else { + list_add_tail(&rq->queuelist, &rq->q->queue_head); + return true; + } } /** @@ -187,12 +193,7 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq, case REQ_FSEQ_DATA: list_move_tail(&rq->flush.list, &q->flush_data_in_flight); - if (q->mq_ops) - blk_mq_flush_data_insert(rq); - else { - list_add(&rq->queuelist, &q->queue_head); - queued = true; - } + queued = blk_flush_queue_rq(rq); break; case REQ_FSEQ_DONE: @@ -216,9 +217,6 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq, } kicked = blk_kick_flush(q); - /* blk_mq_run_flush will run queue */ - if (q->mq_ops) - return queued; return kicked | queued; } @@ -230,10 +228,9 @@ static void flush_end_io(struct request *flush_rq, int error) struct request *rq, *n; unsigned long flags = 0; - if (q->mq_ops) { - blk_mq_free_request(flush_rq); + if (q->mq_ops) spin_lock_irqsave(&q->mq_flush_lock, flags); - } + running = &q->flush_queue[q->flush_running_idx]; BUG_ON(q->flush_pending_idx == q->flush_running_idx); @@ -263,48 +260,14 @@ static void flush_end_io(struct request *flush_rq, int error) * kblockd. */ if (queued || q->flush_queue_delayed) { - if (!q->mq_ops) - blk_run_queue_async(q); - else - /* - * This can be optimized to only run queues with requests - * queued if necessary. - */ - blk_mq_run_queues(q, true); + WARN_ON(q->mq_ops); + blk_run_queue_async(q); } q->flush_queue_delayed = 0; if (q->mq_ops) spin_unlock_irqrestore(&q->mq_flush_lock, flags); } -static void mq_flush_work(struct work_struct *work) -{ - struct request_queue *q; - struct request *rq; - - q = container_of(work, struct request_queue, mq_flush_work); - - rq = blk_mq_alloc_request(q, WRITE_FLUSH|REQ_FLUSH_SEQ, - __GFP_WAIT|GFP_ATOMIC, false); - rq->cmd_type = REQ_TYPE_FS; - rq->end_io = flush_end_io; - - blk_mq_run_request(rq, true, false); -} - -/* - * We can't directly use q->flush_rq, because it doesn't have tag and is not in - * hctx->rqs[]. so we must allocate a new request, since we can't sleep here, - * so offload the work to workqueue. - * - * Note: we assume a flush request finished in any hardware queue will flush - * the whole disk cache. - */ -static void mq_run_flush(struct request_queue *q) -{ - kblockd_schedule_work(q, &q->mq_flush_work); -} - /** * blk_kick_flush - consider issuing flush request * @q: request_queue being kicked @@ -339,19 +302,31 @@ static bool blk_kick_flush(struct request_queue *q) * different from running_idx, which means flush is in flight. */ q->flush_pending_idx ^= 1; + if (q->mq_ops) { - mq_run_flush(q); - return true; + struct blk_mq_ctx *ctx = first_rq->mq_ctx; + struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu); + + blk_mq_rq_init(hctx, q->flush_rq); + q->flush_rq->mq_ctx = ctx; + + /* + * Reuse the tag value from the fist waiting request, + * with blk-mq the tag is generated during request + * allocation and drivers can rely on it being inside + * the range they asked for. + */ + q->flush_rq->tag = first_rq->tag; + } else { + blk_rq_init(q, q->flush_rq); } - blk_rq_init(q, &q->flush_rq); - q->flush_rq.cmd_type = REQ_TYPE_FS; - q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ; - q->flush_rq.rq_disk = first_rq->rq_disk; - q->flush_rq.end_io = flush_end_io; + q->flush_rq->cmd_type = REQ_TYPE_FS; + q->flush_rq->cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ; + q->flush_rq->rq_disk = first_rq->rq_disk; + q->flush_rq->end_io = flush_end_io; - list_add_tail(&q->flush_rq.queuelist, &q->queue_head); - return true; + return blk_flush_queue_rq(q->flush_rq); } static void flush_data_end_io(struct request *rq, int error) @@ -407,11 +382,8 @@ void blk_insert_flush(struct request *rq) /* * @policy now records what operations need to be done. Adjust * REQ_FLUSH and FUA for the driver. - * We keep REQ_FLUSH for mq to track flush requests. For !FUA, - * we never dispatch the request directly. */ - if (rq->cmd_flags & REQ_FUA) - rq->cmd_flags &= ~REQ_FLUSH; + rq->cmd_flags &= ~REQ_FLUSH; if (!(fflags & REQ_FUA)) rq->cmd_flags &= ~REQ_FUA; @@ -560,5 +532,4 @@ EXPORT_SYMBOL(blkdev_issue_flush); void blk_mq_init_flush(struct request_queue *q) { spin_lock_init(&q->mq_flush_lock); - INIT_WORK(&q->mq_flush_work, mq_flush_work); } diff --git a/block/blk-mq.c b/block/blk-mq.c index 14c8f35946e1..a59b0565e940 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -194,27 +194,9 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, } static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx, - gfp_t gfp, bool reserved, - int rw) + gfp_t gfp, bool reserved) { - struct request *req; - bool is_flush = false; - /* - * flush need allocate a request, leave at least one request for - * non-flush IO to avoid deadlock - */ - if ((rw & REQ_FLUSH) && !(rw & REQ_FLUSH_SEQ)) { - if (atomic_inc_return(&hctx->pending_flush) >= - hctx->queue_depth - hctx->reserved_tags - 1) { - atomic_dec(&hctx->pending_flush); - return NULL; - } - is_flush = true; - } - req = blk_mq_alloc_rq(hctx, gfp, reserved); - if (!req && is_flush) - atomic_dec(&hctx->pending_flush); - return req; + return blk_mq_alloc_rq(hctx, gfp, reserved); } static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, @@ -227,7 +209,7 @@ static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu); - rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved, rw); + rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved); if (rq) { blk_mq_rq_ctx_init(q, ctx, rq, rw); break; @@ -244,15 +226,14 @@ static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, return rq; } -struct request *blk_mq_alloc_request(struct request_queue *q, int rw, - gfp_t gfp, bool reserved) +struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp) { struct request *rq; if (blk_mq_queue_enter(q)) return NULL; - rq = blk_mq_alloc_request_pinned(q, rw, gfp, reserved); + rq = blk_mq_alloc_request_pinned(q, rw, gfp, false); if (rq) blk_mq_put_ctx(rq->mq_ctx); return rq; @@ -276,7 +257,7 @@ EXPORT_SYMBOL(blk_mq_alloc_reserved_request); /* * Re-init and set pdu, if we have it */ -static void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq) +void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq) { blk_rq_init(hctx->queue, rq); @@ -290,9 +271,6 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, const int tag = rq->tag; struct request_queue *q = rq->q; - if ((rq->cmd_flags & REQ_FLUSH) && !(rq->cmd_flags & REQ_FLUSH_SEQ)) - atomic_dec(&hctx->pending_flush); - blk_mq_rq_init(hctx, rq); blk_mq_put_tag(hctx->tags, tag); @@ -946,14 +924,14 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) hctx = q->mq_ops->map_queue(q, ctx->cpu); trace_block_getrq(q, bio, rw); - rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false, bio->bi_rw); + rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false); if (likely(rq)) - blk_mq_rq_ctx_init(q, ctx, rq, bio->bi_rw); + blk_mq_rq_ctx_init(q, ctx, rq, rw); else { blk_mq_put_ctx(ctx); trace_block_sleeprq(q, bio, rw); - rq = blk_mq_alloc_request_pinned(q, bio->bi_rw, - __GFP_WAIT|GFP_ATOMIC, false); + rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT|GFP_ATOMIC, + false); ctx = rq->mq_ctx; hctx = q->mq_ops->map_queue(q, ctx->cpu); } @@ -1230,9 +1208,7 @@ static int blk_mq_init_hw_queues(struct request_queue *q, hctx->queue_num = i; hctx->flags = reg->flags; hctx->queue_depth = reg->queue_depth; - hctx->reserved_tags = reg->reserved_tags; hctx->cmd_size = reg->cmd_size; - atomic_set(&hctx->pending_flush, 0); blk_mq_init_cpu_notifier(&hctx->cpu_notifier, blk_mq_hctx_notify, hctx); @@ -1412,9 +1388,14 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg, blk_mq_init_flush(q); blk_mq_init_cpu_queues(q, reg->nr_hw_queues); - if (blk_mq_init_hw_queues(q, reg, driver_data)) + q->flush_rq = kzalloc(round_up(sizeof(struct request) + reg->cmd_size, + cache_line_size()), GFP_KERNEL); + if (!q->flush_rq) goto err_hw; + if (blk_mq_init_hw_queues(q, reg, driver_data)) + goto err_flush_rq; + blk_mq_map_swqueue(q); mutex_lock(&all_q_mutex); @@ -1422,6 +1403,9 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg, mutex_unlock(&all_q_mutex); return q; + +err_flush_rq: + kfree(q->flush_rq); err_hw: kfree(q->mq_map); err_map: diff --git a/block/blk-mq.h b/block/blk-mq.h index f29b645f0e1c..ed0035cd458e 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -28,6 +28,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_init_flush(struct request_queue *q); void blk_mq_drain_queue(struct request_queue *q); void blk_mq_free_queue(struct request_queue *q); +void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq); /* * CPU hotplug helpers diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 8095c4a21fc0..7500f876dae4 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -549,6 +549,8 @@ static void blk_release_queue(struct kobject *kobj) if (q->mq_ops) blk_mq_free_queue(q); + kfree(q->flush_rq); + blk_trace_shutdown(q); bdi_destroy(&q->backing_dev_info); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 468be242db90..18ba8a627f46 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -36,15 +36,12 @@ struct blk_mq_hw_ctx { struct list_head page_list; struct blk_mq_tags *tags; - atomic_t pending_flush; - unsigned long queued; unsigned long run; #define BLK_MQ_MAX_DISPATCH_ORDER 10 unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER]; unsigned int queue_depth; - unsigned int reserved_tags; unsigned int numa_node; unsigned int cmd_size; /* per-request extra data */ @@ -129,7 +126,7 @@ void blk_mq_insert_request(struct request_queue *, struct request *, void blk_mq_run_queues(struct request_queue *q, bool async); void blk_mq_free_request(struct request *rq); bool blk_mq_can_queue(struct blk_mq_hw_ctx *); -struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, bool reserved); +struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp); struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, gfp_t gfp); struct request *blk_mq_rq_from_tag(struct request_queue *q, unsigned int tag); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0375654adb28..b2d25ecbcbc1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -101,7 +101,7 @@ struct request { }; union { struct call_single_data csd; - struct work_struct mq_flush_data; + struct work_struct mq_flush_work; }; struct request_queue *q; @@ -451,13 +451,8 @@ struct request_queue { unsigned long flush_pending_since; struct list_head flush_queue[2]; struct list_head flush_data_in_flight; - union { - struct request flush_rq; - struct { - spinlock_t mq_flush_lock; - struct work_struct mq_flush_work; - }; - }; + struct request *flush_rq; + spinlock_t mq_flush_lock; struct mutex sysfs_lock; -- cgit v1.2.3-70-g09d2 From fb37bb04d6c8d6c44e61d9da189dcfa6aa0f135e Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Mon, 10 Feb 2014 14:25:49 -0800 Subject: smp.h: fix x86+cpu.c sparse warnings about arch nonboot CPU calls Use what we already do for arch_disable_smp_support() to fix these: arch/x86/kernel/smpboot.c:1155:6: warning: symbol 'arch_enable_nonboot_cpus_begin' was not declared. Should it be static? arch/x86/kernel/smpboot.c:1160:6: warning: symbol 'arch_enable_nonboot_cpus_end' was not declared. Should it be static? kernel/cpu.c:512:13: warning: symbol 'arch_enable_nonboot_cpus_begin' was not declared. Should it be static? kernel/cpu.c:516:13: warning: symbol 'arch_enable_nonboot_cpus_end' was not declared. Should it be static? Signed-off-by: Paul Gortmaker Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/smp.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/smp.h b/include/linux/smp.h index 3834f43f9993..6ae004e437ea 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -188,6 +188,9 @@ static inline void kick_all_cpus_sync(void) { } */ extern void arch_disable_smp_support(void); +extern void arch_enable_nonboot_cpus_begin(void); +extern void arch_enable_nonboot_cpus_end(void); + void smp_setup_processor_id(void); #endif /* __LINUX_SMP_H */ -- cgit v1.2.3-70-g09d2 From 564eb714f5f09ac733c26860d5f0831f213fbdf1 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Mon, 10 Feb 2014 13:54:02 +0000 Subject: xen: install xen/gntdev.h and xen/gntalloc.h xen/gntdev.h and xen/gntalloc.h both provide userspace ABIs so they should be installed. CC: stable@vger.kernel.org Signed-off-by: David Vrabel Signed-off-by: Konrad Rzeszutek Wilk --- include/uapi/xen/Kbuild | 2 + include/uapi/xen/gntalloc.h | 82 ++++++++++++++++++++++++ include/uapi/xen/gntdev.h | 150 ++++++++++++++++++++++++++++++++++++++++++++ include/xen/gntalloc.h | 82 ------------------------ include/xen/gntdev.h | 150 -------------------------------------------- 5 files changed, 234 insertions(+), 232 deletions(-) create mode 100644 include/uapi/xen/gntalloc.h create mode 100644 include/uapi/xen/gntdev.h delete mode 100644 include/xen/gntalloc.h delete mode 100644 include/xen/gntdev.h (limited to 'include') diff --git a/include/uapi/xen/Kbuild b/include/uapi/xen/Kbuild index 61257cb14653..5c459628e8c7 100644 --- a/include/uapi/xen/Kbuild +++ b/include/uapi/xen/Kbuild @@ -1,3 +1,5 @@ # UAPI Header export list header-y += evtchn.h +header-y += gntalloc.h +header-y += gntdev.h header-y += privcmd.h diff --git a/include/uapi/xen/gntalloc.h b/include/uapi/xen/gntalloc.h new file mode 100644 index 000000000000..76bd58065f4f --- /dev/null +++ b/include/uapi/xen/gntalloc.h @@ -0,0 +1,82 @@ +/****************************************************************************** + * gntalloc.h + * + * Interface to /dev/xen/gntalloc. + * + * Author: Daniel De Graaf + * + * This file is in the public domain. + */ + +#ifndef __LINUX_PUBLIC_GNTALLOC_H__ +#define __LINUX_PUBLIC_GNTALLOC_H__ + +/* + * Allocates a new page and creates a new grant reference. + */ +#define IOCTL_GNTALLOC_ALLOC_GREF \ +_IOC(_IOC_NONE, 'G', 5, sizeof(struct ioctl_gntalloc_alloc_gref)) +struct ioctl_gntalloc_alloc_gref { + /* IN parameters */ + /* The ID of the domain to be given access to the grants. */ + uint16_t domid; + /* Flags for this mapping */ + uint16_t flags; + /* Number of pages to map */ + uint32_t count; + /* OUT parameters */ + /* The offset to be used on a subsequent call to mmap(). */ + uint64_t index; + /* The grant references of the newly created grant, one per page */ + /* Variable size, depending on count */ + uint32_t gref_ids[1]; +}; + +#define GNTALLOC_FLAG_WRITABLE 1 + +/* + * Deallocates the grant reference, allowing the associated page to be freed if + * no other domains are using it. + */ +#define IOCTL_GNTALLOC_DEALLOC_GREF \ +_IOC(_IOC_NONE, 'G', 6, sizeof(struct ioctl_gntalloc_dealloc_gref)) +struct ioctl_gntalloc_dealloc_gref { + /* IN parameters */ + /* The offset returned in the map operation */ + uint64_t index; + /* Number of references to unmap */ + uint32_t count; +}; + +/* + * Sets up an unmap notification within the page, so that the other side can do + * cleanup if this side crashes. Required to implement cross-domain robust + * mutexes or close notification on communication channels. + * + * Each mapped page only supports one notification; multiple calls referring to + * the same page overwrite the previous notification. You must clear the + * notification prior to the IOCTL_GNTALLOC_DEALLOC_GREF if you do not want it + * to occur. + */ +#define IOCTL_GNTALLOC_SET_UNMAP_NOTIFY \ +_IOC(_IOC_NONE, 'G', 7, sizeof(struct ioctl_gntalloc_unmap_notify)) +struct ioctl_gntalloc_unmap_notify { + /* IN parameters */ + /* Offset in the file descriptor for a byte within the page (same as + * used in mmap). If using UNMAP_NOTIFY_CLEAR_BYTE, this is the byte to + * be cleared. Otherwise, it can be any byte in the page whose + * notification we are adjusting. + */ + uint64_t index; + /* Action(s) to take on unmap */ + uint32_t action; + /* Event channel to notify */ + uint32_t event_channel_port; +}; + +/* Clear (set to zero) the byte specified by index */ +#define UNMAP_NOTIFY_CLEAR_BYTE 0x1 +/* Send an interrupt on the indicated event channel */ +#define UNMAP_NOTIFY_SEND_EVENT 0x2 + +#endif /* __LINUX_PUBLIC_GNTALLOC_H__ */ diff --git a/include/uapi/xen/gntdev.h b/include/uapi/xen/gntdev.h new file mode 100644 index 000000000000..5304bd3c84c5 --- /dev/null +++ b/include/uapi/xen/gntdev.h @@ -0,0 +1,150 @@ +/****************************************************************************** + * gntdev.h + * + * Interface to /dev/xen/gntdev. + * + * Copyright (c) 2007, D G Murray + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __LINUX_PUBLIC_GNTDEV_H__ +#define __LINUX_PUBLIC_GNTDEV_H__ + +struct ioctl_gntdev_grant_ref { + /* The domain ID of the grant to be mapped. */ + uint32_t domid; + /* The grant reference of the grant to be mapped. */ + uint32_t ref; +}; + +/* + * Inserts the grant references into the mapping table of an instance + * of gntdev. N.B. This does not perform the mapping, which is deferred + * until mmap() is called with @index as the offset. + */ +#define IOCTL_GNTDEV_MAP_GRANT_REF \ +_IOC(_IOC_NONE, 'G', 0, sizeof(struct ioctl_gntdev_map_grant_ref)) +struct ioctl_gntdev_map_grant_ref { + /* IN parameters */ + /* The number of grants to be mapped. */ + uint32_t count; + uint32_t pad; + /* OUT parameters */ + /* The offset to be used on a subsequent call to mmap(). */ + uint64_t index; + /* Variable IN parameter. */ + /* Array of grant references, of size @count. */ + struct ioctl_gntdev_grant_ref refs[1]; +}; + +/* + * Removes the grant references from the mapping table of an instance of + * of gntdev. N.B. munmap() must be called on the relevant virtual address(es) + * before this ioctl is called, or an error will result. + */ +#define IOCTL_GNTDEV_UNMAP_GRANT_REF \ +_IOC(_IOC_NONE, 'G', 1, sizeof(struct ioctl_gntdev_unmap_grant_ref)) +struct ioctl_gntdev_unmap_grant_ref { + /* IN parameters */ + /* The offset was returned by the corresponding map operation. */ + uint64_t index; + /* The number of pages to be unmapped. */ + uint32_t count; + uint32_t pad; +}; + +/* + * Returns the offset in the driver's address space that corresponds + * to @vaddr. This can be used to perform a munmap(), followed by an + * UNMAP_GRANT_REF ioctl, where no state about the offset is retained by + * the caller. The number of pages that were allocated at the same time as + * @vaddr is returned in @count. + * + * N.B. Where more than one page has been mapped into a contiguous range, the + * supplied @vaddr must correspond to the start of the range; otherwise + * an error will result. It is only possible to munmap() the entire + * contiguously-allocated range at once, and not any subrange thereof. + */ +#define IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR \ +_IOC(_IOC_NONE, 'G', 2, sizeof(struct ioctl_gntdev_get_offset_for_vaddr)) +struct ioctl_gntdev_get_offset_for_vaddr { + /* IN parameters */ + /* The virtual address of the first mapped page in a range. */ + uint64_t vaddr; + /* OUT parameters */ + /* The offset that was used in the initial mmap() operation. */ + uint64_t offset; + /* The number of pages mapped in the VM area that begins at @vaddr. */ + uint32_t count; + uint32_t pad; +}; + +/* + * Sets the maximum number of grants that may mapped at once by this gntdev + * instance. + * + * N.B. This must be called before any other ioctl is performed on the device. + */ +#define IOCTL_GNTDEV_SET_MAX_GRANTS \ +_IOC(_IOC_NONE, 'G', 3, sizeof(struct ioctl_gntdev_set_max_grants)) +struct ioctl_gntdev_set_max_grants { + /* IN parameter */ + /* The maximum number of grants that may be mapped at once. */ + uint32_t count; +}; + +/* + * Sets up an unmap notification within the page, so that the other side can do + * cleanup if this side crashes. Required to implement cross-domain robust + * mutexes or close notification on communication channels. + * + * Each mapped page only supports one notification; multiple calls referring to + * the same page overwrite the previous notification. You must clear the + * notification prior to the IOCTL_GNTALLOC_DEALLOC_GREF if you do not want it + * to occur. + */ +#define IOCTL_GNTDEV_SET_UNMAP_NOTIFY \ +_IOC(_IOC_NONE, 'G', 7, sizeof(struct ioctl_gntdev_unmap_notify)) +struct ioctl_gntdev_unmap_notify { + /* IN parameters */ + /* Offset in the file descriptor for a byte within the page (same as + * used in mmap). If using UNMAP_NOTIFY_CLEAR_BYTE, this is the byte to + * be cleared. Otherwise, it can be any byte in the page whose + * notification we are adjusting. + */ + uint64_t index; + /* Action(s) to take on unmap */ + uint32_t action; + /* Event channel to notify */ + uint32_t event_channel_port; +}; + +/* Clear (set to zero) the byte specified by index */ +#define UNMAP_NOTIFY_CLEAR_BYTE 0x1 +/* Send an interrupt on the indicated event channel */ +#define UNMAP_NOTIFY_SEND_EVENT 0x2 + +#endif /* __LINUX_PUBLIC_GNTDEV_H__ */ diff --git a/include/xen/gntalloc.h b/include/xen/gntalloc.h deleted file mode 100644 index 76bd58065f4f..000000000000 --- a/include/xen/gntalloc.h +++ /dev/null @@ -1,82 +0,0 @@ -/****************************************************************************** - * gntalloc.h - * - * Interface to /dev/xen/gntalloc. - * - * Author: Daniel De Graaf - * - * This file is in the public domain. - */ - -#ifndef __LINUX_PUBLIC_GNTALLOC_H__ -#define __LINUX_PUBLIC_GNTALLOC_H__ - -/* - * Allocates a new page and creates a new grant reference. - */ -#define IOCTL_GNTALLOC_ALLOC_GREF \ -_IOC(_IOC_NONE, 'G', 5, sizeof(struct ioctl_gntalloc_alloc_gref)) -struct ioctl_gntalloc_alloc_gref { - /* IN parameters */ - /* The ID of the domain to be given access to the grants. */ - uint16_t domid; - /* Flags for this mapping */ - uint16_t flags; - /* Number of pages to map */ - uint32_t count; - /* OUT parameters */ - /* The offset to be used on a subsequent call to mmap(). */ - uint64_t index; - /* The grant references of the newly created grant, one per page */ - /* Variable size, depending on count */ - uint32_t gref_ids[1]; -}; - -#define GNTALLOC_FLAG_WRITABLE 1 - -/* - * Deallocates the grant reference, allowing the associated page to be freed if - * no other domains are using it. - */ -#define IOCTL_GNTALLOC_DEALLOC_GREF \ -_IOC(_IOC_NONE, 'G', 6, sizeof(struct ioctl_gntalloc_dealloc_gref)) -struct ioctl_gntalloc_dealloc_gref { - /* IN parameters */ - /* The offset returned in the map operation */ - uint64_t index; - /* Number of references to unmap */ - uint32_t count; -}; - -/* - * Sets up an unmap notification within the page, so that the other side can do - * cleanup if this side crashes. Required to implement cross-domain robust - * mutexes or close notification on communication channels. - * - * Each mapped page only supports one notification; multiple calls referring to - * the same page overwrite the previous notification. You must clear the - * notification prior to the IOCTL_GNTALLOC_DEALLOC_GREF if you do not want it - * to occur. - */ -#define IOCTL_GNTALLOC_SET_UNMAP_NOTIFY \ -_IOC(_IOC_NONE, 'G', 7, sizeof(struct ioctl_gntalloc_unmap_notify)) -struct ioctl_gntalloc_unmap_notify { - /* IN parameters */ - /* Offset in the file descriptor for a byte within the page (same as - * used in mmap). If using UNMAP_NOTIFY_CLEAR_BYTE, this is the byte to - * be cleared. Otherwise, it can be any byte in the page whose - * notification we are adjusting. - */ - uint64_t index; - /* Action(s) to take on unmap */ - uint32_t action; - /* Event channel to notify */ - uint32_t event_channel_port; -}; - -/* Clear (set to zero) the byte specified by index */ -#define UNMAP_NOTIFY_CLEAR_BYTE 0x1 -/* Send an interrupt on the indicated event channel */ -#define UNMAP_NOTIFY_SEND_EVENT 0x2 - -#endif /* __LINUX_PUBLIC_GNTALLOC_H__ */ diff --git a/include/xen/gntdev.h b/include/xen/gntdev.h deleted file mode 100644 index 5304bd3c84c5..000000000000 --- a/include/xen/gntdev.h +++ /dev/null @@ -1,150 +0,0 @@ -/****************************************************************************** - * gntdev.h - * - * Interface to /dev/xen/gntdev. - * - * Copyright (c) 2007, D G Murray - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version 2 - * as published by the Free Software Foundation; or, when distributed - * separately from the Linux kernel or incorporated into other - * software packages, subject to the following license: - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this source file (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, modify, - * merge, publish, distribute, sublicense, and/or sell copies of the Software, - * and to permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#ifndef __LINUX_PUBLIC_GNTDEV_H__ -#define __LINUX_PUBLIC_GNTDEV_H__ - -struct ioctl_gntdev_grant_ref { - /* The domain ID of the grant to be mapped. */ - uint32_t domid; - /* The grant reference of the grant to be mapped. */ - uint32_t ref; -}; - -/* - * Inserts the grant references into the mapping table of an instance - * of gntdev. N.B. This does not perform the mapping, which is deferred - * until mmap() is called with @index as the offset. - */ -#define IOCTL_GNTDEV_MAP_GRANT_REF \ -_IOC(_IOC_NONE, 'G', 0, sizeof(struct ioctl_gntdev_map_grant_ref)) -struct ioctl_gntdev_map_grant_ref { - /* IN parameters */ - /* The number of grants to be mapped. */ - uint32_t count; - uint32_t pad; - /* OUT parameters */ - /* The offset to be used on a subsequent call to mmap(). */ - uint64_t index; - /* Variable IN parameter. */ - /* Array of grant references, of size @count. */ - struct ioctl_gntdev_grant_ref refs[1]; -}; - -/* - * Removes the grant references from the mapping table of an instance of - * of gntdev. N.B. munmap() must be called on the relevant virtual address(es) - * before this ioctl is called, or an error will result. - */ -#define IOCTL_GNTDEV_UNMAP_GRANT_REF \ -_IOC(_IOC_NONE, 'G', 1, sizeof(struct ioctl_gntdev_unmap_grant_ref)) -struct ioctl_gntdev_unmap_grant_ref { - /* IN parameters */ - /* The offset was returned by the corresponding map operation. */ - uint64_t index; - /* The number of pages to be unmapped. */ - uint32_t count; - uint32_t pad; -}; - -/* - * Returns the offset in the driver's address space that corresponds - * to @vaddr. This can be used to perform a munmap(), followed by an - * UNMAP_GRANT_REF ioctl, where no state about the offset is retained by - * the caller. The number of pages that were allocated at the same time as - * @vaddr is returned in @count. - * - * N.B. Where more than one page has been mapped into a contiguous range, the - * supplied @vaddr must correspond to the start of the range; otherwise - * an error will result. It is only possible to munmap() the entire - * contiguously-allocated range at once, and not any subrange thereof. - */ -#define IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR \ -_IOC(_IOC_NONE, 'G', 2, sizeof(struct ioctl_gntdev_get_offset_for_vaddr)) -struct ioctl_gntdev_get_offset_for_vaddr { - /* IN parameters */ - /* The virtual address of the first mapped page in a range. */ - uint64_t vaddr; - /* OUT parameters */ - /* The offset that was used in the initial mmap() operation. */ - uint64_t offset; - /* The number of pages mapped in the VM area that begins at @vaddr. */ - uint32_t count; - uint32_t pad; -}; - -/* - * Sets the maximum number of grants that may mapped at once by this gntdev - * instance. - * - * N.B. This must be called before any other ioctl is performed on the device. - */ -#define IOCTL_GNTDEV_SET_MAX_GRANTS \ -_IOC(_IOC_NONE, 'G', 3, sizeof(struct ioctl_gntdev_set_max_grants)) -struct ioctl_gntdev_set_max_grants { - /* IN parameter */ - /* The maximum number of grants that may be mapped at once. */ - uint32_t count; -}; - -/* - * Sets up an unmap notification within the page, so that the other side can do - * cleanup if this side crashes. Required to implement cross-domain robust - * mutexes or close notification on communication channels. - * - * Each mapped page only supports one notification; multiple calls referring to - * the same page overwrite the previous notification. You must clear the - * notification prior to the IOCTL_GNTALLOC_DEALLOC_GREF if you do not want it - * to occur. - */ -#define IOCTL_GNTDEV_SET_UNMAP_NOTIFY \ -_IOC(_IOC_NONE, 'G', 7, sizeof(struct ioctl_gntdev_unmap_notify)) -struct ioctl_gntdev_unmap_notify { - /* IN parameters */ - /* Offset in the file descriptor for a byte within the page (same as - * used in mmap). If using UNMAP_NOTIFY_CLEAR_BYTE, this is the byte to - * be cleared. Otherwise, it can be any byte in the page whose - * notification we are adjusting. - */ - uint64_t index; - /* Action(s) to take on unmap */ - uint32_t action; - /* Event channel to notify */ - uint32_t event_channel_port; -}; - -/* Clear (set to zero) the byte specified by index */ -#define UNMAP_NOTIFY_CLEAR_BYTE 0x1 -/* Send an interrupt on the indicated event channel */ -#define UNMAP_NOTIFY_SEND_EVENT 0x2 - -#endif /* __LINUX_PUBLIC_GNTDEV_H__ */ -- cgit v1.2.3-70-g09d2 From d8320b2d2e7d1cda8216d496c7c685c5fdf74ff0 Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Sat, 8 Feb 2014 23:35:43 +0100 Subject: ia64/xen: Remove Xen support for ia64 even more Commit d52eefb47d4e ("ia64/xen: Remove Xen support for ia64") removed the Kconfig symbol XEN_XENCOMM. But it didn't remove the code depending on that symbol. Remove that code now. Signed-off-by: Paul Bolle Acked-by: David Vrabel Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/Makefile | 1 - drivers/xen/xencomm.c | 219 ---------------------------------------- include/xen/interface/xencomm.h | 41 -------- include/xen/xencomm.h | 77 -------------- 4 files changed, 338 deletions(-) delete mode 100644 drivers/xen/xencomm.c delete mode 100644 include/xen/interface/xencomm.h delete mode 100644 include/xen/xencomm.h (limited to 'include') diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index d75c811bfa56..45e00afa7f2d 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -16,7 +16,6 @@ xen-pad-$(CONFIG_X86) += xen-acpi-pad.o dom0-$(CONFIG_X86) += pcpu.o obj-$(CONFIG_XEN_DOM0) += $(dom0-y) obj-$(CONFIG_BLOCK) += biomerge.o -obj-$(CONFIG_XEN_XENCOMM) += xencomm.o obj-$(CONFIG_XEN_BALLOON) += xen-balloon.o obj-$(CONFIG_XEN_SELFBALLOONING) += xen-selfballoon.o obj-$(CONFIG_XEN_DEV_EVTCHN) += xen-evtchn.o diff --git a/drivers/xen/xencomm.c b/drivers/xen/xencomm.c deleted file mode 100644 index 4793fc594549..000000000000 --- a/drivers/xen/xencomm.c +++ /dev/null @@ -1,219 +0,0 @@ -/* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * Copyright (C) IBM Corp. 2006 - * - * Authors: Hollis Blanchard - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include /* for xencomm_is_phys_contiguous() */ - -static int xencomm_init(struct xencomm_desc *desc, - void *buffer, unsigned long bytes) -{ - unsigned long recorded = 0; - int i = 0; - - while ((recorded < bytes) && (i < desc->nr_addrs)) { - unsigned long vaddr = (unsigned long)buffer + recorded; - unsigned long paddr; - int offset; - int chunksz; - - offset = vaddr % PAGE_SIZE; /* handle partial pages */ - chunksz = min(PAGE_SIZE - offset, bytes - recorded); - - paddr = xencomm_vtop(vaddr); - if (paddr == ~0UL) { - printk(KERN_DEBUG "%s: couldn't translate vaddr %lx\n", - __func__, vaddr); - return -EINVAL; - } - - desc->address[i++] = paddr; - recorded += chunksz; - } - - if (recorded < bytes) { - printk(KERN_DEBUG - "%s: could only translate %ld of %ld bytes\n", - __func__, recorded, bytes); - return -ENOSPC; - } - - /* mark remaining addresses invalid (just for safety) */ - while (i < desc->nr_addrs) - desc->address[i++] = XENCOMM_INVALID; - - desc->magic = XENCOMM_MAGIC; - - return 0; -} - -static struct xencomm_desc *xencomm_alloc(gfp_t gfp_mask, - void *buffer, unsigned long bytes) -{ - struct xencomm_desc *desc; - unsigned long buffer_ulong = (unsigned long)buffer; - unsigned long start = buffer_ulong & PAGE_MASK; - unsigned long end = (buffer_ulong + bytes) | ~PAGE_MASK; - unsigned long nr_addrs = (end - start + 1) >> PAGE_SHIFT; - unsigned long size = sizeof(*desc) + - sizeof(desc->address[0]) * nr_addrs; - - /* - * slab allocator returns at least sizeof(void*) aligned pointer. - * When sizeof(*desc) > sizeof(void*), struct xencomm_desc might - * cross page boundary. - */ - if (sizeof(*desc) > sizeof(void *)) { - unsigned long order = get_order(size); - desc = (struct xencomm_desc *)__get_free_pages(gfp_mask, - order); - if (desc == NULL) - return NULL; - - desc->nr_addrs = - ((PAGE_SIZE << order) - sizeof(struct xencomm_desc)) / - sizeof(*desc->address); - } else { - desc = kmalloc(size, gfp_mask); - if (desc == NULL) - return NULL; - - desc->nr_addrs = nr_addrs; - } - return desc; -} - -void xencomm_free(struct xencomm_handle *desc) -{ - if (desc && !((ulong)desc & XENCOMM_INLINE_FLAG)) { - struct xencomm_desc *desc__ = (struct xencomm_desc *)desc; - if (sizeof(*desc__) > sizeof(void *)) { - unsigned long size = sizeof(*desc__) + - sizeof(desc__->address[0]) * desc__->nr_addrs; - unsigned long order = get_order(size); - free_pages((unsigned long)__va(desc), order); - } else - kfree(__va(desc)); - } -} - -static int xencomm_create(void *buffer, unsigned long bytes, - struct xencomm_desc **ret, gfp_t gfp_mask) -{ - struct xencomm_desc *desc; - int rc; - - pr_debug("%s: %p[%ld]\n", __func__, buffer, bytes); - - if (bytes == 0) { - /* don't create a descriptor; Xen recognizes NULL. */ - BUG_ON(buffer != NULL); - *ret = NULL; - return 0; - } - - BUG_ON(buffer == NULL); /* 'bytes' is non-zero */ - - desc = xencomm_alloc(gfp_mask, buffer, bytes); - if (!desc) { - printk(KERN_DEBUG "%s failure\n", "xencomm_alloc"); - return -ENOMEM; - } - - rc = xencomm_init(desc, buffer, bytes); - if (rc) { - printk(KERN_DEBUG "%s failure: %d\n", "xencomm_init", rc); - xencomm_free((struct xencomm_handle *)__pa(desc)); - return rc; - } - - *ret = desc; - return 0; -} - -static struct xencomm_handle *xencomm_create_inline(void *ptr) -{ - unsigned long paddr; - - BUG_ON(!xencomm_is_phys_contiguous((unsigned long)ptr)); - - paddr = (unsigned long)xencomm_pa(ptr); - BUG_ON(paddr & XENCOMM_INLINE_FLAG); - return (struct xencomm_handle *)(paddr | XENCOMM_INLINE_FLAG); -} - -/* "mini" routine, for stack-based communications: */ -static int xencomm_create_mini(void *buffer, - unsigned long bytes, struct xencomm_mini *xc_desc, - struct xencomm_desc **ret) -{ - int rc = 0; - struct xencomm_desc *desc; - BUG_ON(((unsigned long)xc_desc) % sizeof(*xc_desc) != 0); - - desc = (void *)xc_desc; - - desc->nr_addrs = XENCOMM_MINI_ADDRS; - - rc = xencomm_init(desc, buffer, bytes); - if (!rc) - *ret = desc; - - return rc; -} - -struct xencomm_handle *xencomm_map(void *ptr, unsigned long bytes) -{ - int rc; - struct xencomm_desc *desc; - - if (xencomm_is_phys_contiguous((unsigned long)ptr)) - return xencomm_create_inline(ptr); - - rc = xencomm_create(ptr, bytes, &desc, GFP_KERNEL); - - if (rc || desc == NULL) - return NULL; - - return xencomm_pa(desc); -} - -struct xencomm_handle *__xencomm_map_no_alloc(void *ptr, unsigned long bytes, - struct xencomm_mini *xc_desc) -{ - int rc; - struct xencomm_desc *desc = NULL; - - if (xencomm_is_phys_contiguous((unsigned long)ptr)) - return xencomm_create_inline(ptr); - - rc = xencomm_create_mini(ptr, bytes, xc_desc, - &desc); - - if (rc) - return NULL; - - return xencomm_pa(desc); -} diff --git a/include/xen/interface/xencomm.h b/include/xen/interface/xencomm.h deleted file mode 100644 index ac45e0712afa..000000000000 --- a/include/xen/interface/xencomm.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS IN THE SOFTWARE. - * - * Copyright (C) IBM Corp. 2006 - */ - -#ifndef _XEN_XENCOMM_H_ -#define _XEN_XENCOMM_H_ - -/* A xencomm descriptor is a scatter/gather list containing physical - * addresses corresponding to a virtually contiguous memory area. The - * hypervisor translates these physical addresses to machine addresses to copy - * to and from the virtually contiguous area. - */ - -#define XENCOMM_MAGIC 0x58434F4D /* 'XCOM' */ -#define XENCOMM_INVALID (~0UL) - -struct xencomm_desc { - uint32_t magic; - uint32_t nr_addrs; /* the number of entries in address[] */ - uint64_t address[0]; -}; - -#endif /* _XEN_XENCOMM_H_ */ diff --git a/include/xen/xencomm.h b/include/xen/xencomm.h deleted file mode 100644 index e43b039be112..000000000000 --- a/include/xen/xencomm.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * Copyright (C) IBM Corp. 2006 - * - * Authors: Hollis Blanchard - * Jerone Young - */ - -#ifndef _LINUX_XENCOMM_H_ -#define _LINUX_XENCOMM_H_ - -#include - -#define XENCOMM_MINI_ADDRS 3 -struct xencomm_mini { - struct xencomm_desc _desc; - uint64_t address[XENCOMM_MINI_ADDRS]; -}; - -/* To avoid additionnal virt to phys conversion, an opaque structure is - presented. */ -struct xencomm_handle; - -extern void xencomm_free(struct xencomm_handle *desc); -extern struct xencomm_handle *xencomm_map(void *ptr, unsigned long bytes); -extern struct xencomm_handle *__xencomm_map_no_alloc(void *ptr, - unsigned long bytes, struct xencomm_mini *xc_area); - -#if 0 -#define XENCOMM_MINI_ALIGNED(xc_desc, n) \ - struct xencomm_mini xc_desc ## _base[(n)] \ - __attribute__((__aligned__(sizeof(struct xencomm_mini)))); \ - struct xencomm_mini *xc_desc = &xc_desc ## _base[0]; -#else -/* - * gcc bug workaround: - * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16660 - * gcc doesn't handle properly stack variable with - * __attribute__((__align__(sizeof(struct xencomm_mini)))) - */ -#define XENCOMM_MINI_ALIGNED(xc_desc, n) \ - unsigned char xc_desc ## _base[((n) + 1 ) * \ - sizeof(struct xencomm_mini)]; \ - struct xencomm_mini *xc_desc = (struct xencomm_mini *) \ - ((unsigned long)xc_desc ## _base + \ - (sizeof(struct xencomm_mini) - \ - ((unsigned long)xc_desc ## _base) % \ - sizeof(struct xencomm_mini))); -#endif -#define xencomm_map_no_alloc(ptr, bytes) \ - ({ XENCOMM_MINI_ALIGNED(xc_desc, 1); \ - __xencomm_map_no_alloc(ptr, bytes, xc_desc); }) - -/* provided by architecture code: */ -extern unsigned long xencomm_vtop(unsigned long vaddr); - -static inline void *xencomm_pa(void *ptr) -{ - return (void *)xencomm_vtop((unsigned long)ptr); -} - -#define xen_guest_handle(hnd) ((hnd).p) - -#endif /* _LINUX_XENCOMM_H_ */ -- cgit v1.2.3-70-g09d2 From 8423ae3d7a3cfe084865262cfaeba1359d405182 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Feb 2014 17:45:50 -0800 Subject: block: Fix cloning of discard/write same bios Immutable biovecs changed the way bio segments are treated in such a way that bio_for_each_segment() cannot now do what we want for discard/write same bios, since bi_size means something completely different for them. Fortunately discard and write same bios never have more than a single biovec, so bio_for_each_segment() is unnecessary and not terribly meaningful for them, but we still have to special case them in a few places. Signed-off-by: Kent Overstreet Tested-by: Richard W.M. Jones Signed-off-by: Jens Axboe --- fs/bio.c | 15 ++++++++++----- include/linux/bio.h | 11 +++++++++++ 2 files changed, 21 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/fs/bio.c b/fs/bio.c index 75c49a382239..8754e7b6eb49 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -611,7 +611,6 @@ EXPORT_SYMBOL(bio_clone_fast); struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, struct bio_set *bs) { - unsigned nr_iovecs = 0; struct bvec_iter iter; struct bio_vec bv; struct bio *bio; @@ -638,10 +637,7 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, * __bio_clone_fast() anyways. */ - bio_for_each_segment(bv, bio_src, iter) - nr_iovecs++; - - bio = bio_alloc_bioset(gfp_mask, nr_iovecs, bs); + bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs); if (!bio) return NULL; @@ -650,9 +646,18 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; + if (bio->bi_rw & REQ_DISCARD) + goto integrity_clone; + + if (bio->bi_rw & REQ_WRITE_SAME) { + bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0]; + goto integrity_clone; + } + bio_for_each_segment(bv, bio_src, iter) bio->bi_io_vec[bio->bi_vcnt++] = bv; +integrity_clone: if (bio_integrity(bio_src)) { int ret; diff --git a/include/linux/bio.h b/include/linux/bio.h index d6791bba8264..5a4d39b4686b 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -250,6 +250,17 @@ static inline unsigned bio_segments(struct bio *bio) struct bio_vec bv; struct bvec_iter iter; + /* + * We special case discard/write same, because they interpret bi_size + * differently: + */ + + if (bio->bi_rw & REQ_DISCARD) + return 1; + + if (bio->bi_rw & REQ_WRITE_SAME) + return 1; + bio_for_each_segment(bv, bio, iter) segs++; -- cgit v1.2.3-70-g09d2 From 3dca1471993f9b89f3184468f8bbab2b1e024451 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Mon, 3 Feb 2014 14:08:26 -0800 Subject: target: Simplify command completion by removing CMD_T_FAILED flag The CMD_T_FAILED flag is set used in one place to record the result of a trivial test, and it is only tested once, few lines later. We might as well make the code simpler and easier to read by directly doing the test of "success" where we want to use it. Signed-off-by: Roland Dreier Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_transport.c | 5 +---- include/target/target_core_base.h | 1 - 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index c50fd9f11aab..24b4f65d8777 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -669,9 +669,6 @@ void target_complete_cmd(struct se_cmd *cmd, u8 scsi_status) return; } - if (!success) - cmd->transport_state |= CMD_T_FAILED; - /* * Check for case where an explicit ABORT_TASK has been received * and transport_wait_for_tasks() will be waiting for completion.. @@ -681,7 +678,7 @@ void target_complete_cmd(struct se_cmd *cmd, u8 scsi_status) spin_unlock_irqrestore(&cmd->t_state_lock, flags); complete(&cmd->t_transport_stop_comp); return; - } else if (cmd->transport_state & CMD_T_FAILED) { + } else if (!success) { INIT_WORK(&cmd->work, target_complete_failure_work); } else { INIT_WORK(&cmd->work, target_complete_ok_work); diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h index c9c791209cd1..1772fadcff62 100644 --- a/include/target/target_core_base.h +++ b/include/target/target_core_base.h @@ -525,7 +525,6 @@ struct se_cmd { #define CMD_T_COMPLETE (1 << 2) #define CMD_T_SENT (1 << 4) #define CMD_T_STOP (1 << 5) -#define CMD_T_FAILED (1 << 6) #define CMD_T_DEV_ACTIVE (1 << 7) #define CMD_T_REQUEST_STOP (1 << 8) #define CMD_T_BUSY (1 << 9) -- cgit v1.2.3-70-g09d2 From 709c078e176bd47227e89bb34de7c64b57aaaeab Mon Sep 17 00:00:00 2001 From: Dirk Brandewie Date: Wed, 12 Feb 2014 10:01:03 -0800 Subject: intel_pstate: Remove energy reporting from pstate_sample tracepoint Remove the reporting of energy since it does not provide any useful information about the state of the driver and will be a maintainance headache going forward since the RAPL energy units register is not architectural and subject to change between micro-architectures References: https://bugzilla.kernel.org/show_bug.cgi?id=69831 Fixes: b69880f9ccf7 (intel_pstate: Add trace point to report internal state.) Signed-off-by: Dirk Brandewie Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 9 --------- include/trace/events/power.h | 7 +------ 2 files changed, 1 insertion(+), 15 deletions(-) (limited to 'include') diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 79606f473f48..c788abf1c457 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -51,8 +51,6 @@ static inline int32_t div_fp(int32_t x, int32_t y) return div_s64((int64_t)x << FRAC_BITS, (int64_t)y); } -static u64 energy_divisor; - struct sample { int32_t core_pct_busy; u64 aperf; @@ -630,12 +628,10 @@ static void intel_pstate_timer_func(unsigned long __data) { struct cpudata *cpu = (struct cpudata *) __data; struct sample *sample; - u64 energy; intel_pstate_sample(cpu); sample = &cpu->samples[cpu->sample_ptr]; - rdmsrl(MSR_PKG_ENERGY_STATUS, energy); intel_pstate_adjust_busy_pstate(cpu); @@ -644,7 +640,6 @@ static void intel_pstate_timer_func(unsigned long __data) cpu->pstate.current_pstate, sample->mperf, sample->aperf, - div64_u64(energy, energy_divisor), sample->freq); intel_pstate_set_sample_time(cpu); @@ -926,7 +921,6 @@ static int __init intel_pstate_init(void) int cpu, rc = 0; const struct x86_cpu_id *id; struct cpu_defaults *cpu_info; - u64 units; if (no_load) return -ENODEV; @@ -960,9 +954,6 @@ static int __init intel_pstate_init(void) if (rc) goto out; - rdmsrl(MSR_RAPL_POWER_UNIT, units); - energy_divisor = 1 << ((units >> 8) & 0x1f); /* bits{12:8} */ - intel_pstate_debug_expose_params(); intel_pstate_sysfs_expose_params(); diff --git a/include/trace/events/power.h b/include/trace/events/power.h index 9e9475c85de5..e5bf9a76f169 100644 --- a/include/trace/events/power.h +++ b/include/trace/events/power.h @@ -42,7 +42,6 @@ TRACE_EVENT(pstate_sample, u32 state, u64 mperf, u64 aperf, - u32 energy, u32 freq ), @@ -51,7 +50,6 @@ TRACE_EVENT(pstate_sample, state, mperf, aperf, - energy, freq ), @@ -61,7 +59,6 @@ TRACE_EVENT(pstate_sample, __field(u32, state) __field(u64, mperf) __field(u64, aperf) - __field(u32, energy) __field(u32, freq) ), @@ -72,17 +69,15 @@ TRACE_EVENT(pstate_sample, __entry->state = state; __entry->mperf = mperf; __entry->aperf = aperf; - __entry->energy = energy; __entry->freq = freq; ), - TP_printk("core_busy=%lu scaled=%lu state=%lu mperf=%llu aperf=%llu energy=%lu freq=%lu ", + TP_printk("core_busy=%lu scaled=%lu state=%lu mperf=%llu aperf=%llu freq=%lu ", (unsigned long)__entry->core_busy, (unsigned long)__entry->scaled_busy, (unsigned long)__entry->state, (unsigned long long)__entry->mperf, (unsigned long long)__entry->aperf, - (unsigned long)__entry->energy, (unsigned long)__entry->freq ) -- cgit v1.2.3-70-g09d2 From a9f180345f5378ac87d80ed0bea55ba421d83859 Mon Sep 17 00:00:00 2001 From: Steven Noonan Date: Wed, 12 Feb 2014 23:01:07 -0800 Subject: compiler/gcc4: Make quirk for asm_volatile_goto() unconditional I started noticing problems with KVM guest destruction on Linux 3.12+, where guest memory wasn't being cleaned up. I bisected it down to the commit introducing the new 'asm goto'-based atomics, and found this quirk was later applied to those. Unfortunately, even with GCC 4.8.2 (which ostensibly fixed the known 'asm goto' bug) I am still getting some kind of miscompilation. If I enable the asm_volatile_goto quirk for my compiler, KVM guests are destroyed correctly and the memory is cleaned up. So make the quirk unconditional for now, until bug is found and fixed. Suggested-by: Linus Torvalds Signed-off-by: Steven Noonan Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Jakub Jelinek Cc: Richard Henderson Cc: Andrew Morton Cc: Oleg Nesterov Cc: Link: http://lkml.kernel.org/r/1392274867-15236-1-git-send-email-steven@uplinklabs.net Link: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670 Signed-off-by: Ingo Molnar --- include/linux/compiler-gcc4.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/compiler-gcc4.h b/include/linux/compiler-gcc4.h index ded429966c1f..2507fd2a1eb4 100644 --- a/include/linux/compiler-gcc4.h +++ b/include/linux/compiler-gcc4.h @@ -75,11 +75,7 @@ * * (asm goto is automatically volatile - the naming reflects this.) */ -#if GCC_VERSION <= 40801 -# define asm_volatile_goto(x...) do { asm goto(x); asm (""); } while (0) -#else -# define asm_volatile_goto(x...) do { asm goto(x); } while (0) -#endif +#define asm_volatile_goto(x...) do { asm goto(x); asm (""); } while (0) #ifdef CONFIG_ARCH_USE_BUILTIN_BSWAP #if GCC_VERSION >= 40400 -- cgit v1.2.3-70-g09d2 From b4a26a27287a7f81933ba016aeed6c69dd155323 Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Sun, 9 Feb 2014 11:54:34 +0200 Subject: IB: Report using RoCE IP based gids in port caps For userspace RoCE UD QPs we need to know the GID format that the kernel uses, e.g when working over older kernels. For that end, add a new port capability IB_PORT_IP_BASED_GIDS and report it when query port is issued. Signed-off-by: Moni Shoua Signed-off-by: Matan Barak Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/main.c | 2 +- drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 2 +- include/rdma/ib_verbs.h | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index c2702f549f10..5d4088ad6bd5 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -347,7 +347,7 @@ static int eth_link_query_port(struct ib_device *ibdev, u8 port, props->active_width = (((u8 *)mailbox->buf)[5] == 0x40) ? IB_WIDTH_4X : IB_WIDTH_1X; props->active_speed = IB_SPEED_QDR; - props->port_cap_flags = IB_PORT_CM_SUP; + props->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_IP_BASED_GIDS; props->gid_tbl_len = mdev->dev->caps.gid_table_len[port]; props->max_msg_sz = mdev->dev->caps.max_msg_sz; props->pkey_tbl_len = 1; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index aa92f40c9d50..7890e7b14d81 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -176,7 +176,7 @@ int ocrdma_query_port(struct ib_device *ibdev, props->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_REINIT_SUP | - IB_PORT_DEVICE_MGMT_SUP | IB_PORT_VENDOR_CLASS_SUP; + IB_PORT_DEVICE_MGMT_SUP | IB_PORT_VENDOR_CLASS_SUP | IB_PORT_IP_BASED_GIDS; props->gid_tbl_len = OCRDMA_MAX_SGID; props->pkey_tbl_len = 1; props->bad_pkey_cntr = 0; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 8d4a1c06f7e4..6793f32ccb58 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -226,7 +226,8 @@ enum ib_port_cap_flags { IB_PORT_CAP_MASK_NOTICE_SUP = 1 << 22, IB_PORT_BOOT_MGMT_SUP = 1 << 23, IB_PORT_LINK_LATENCY_SUP = 1 << 24, - IB_PORT_CLIENT_REG_SUP = 1 << 25 + IB_PORT_CLIENT_REG_SUP = 1 << 25, + IB_PORT_IP_BASED_GIDS = 1 << 26 }; enum ib_port_width { -- cgit v1.2.3-70-g09d2 From 6ecde51dd7894ffe2f959cca1fea3ea2b9ee2394 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Thu, 13 Feb 2014 20:45:17 -0800 Subject: mlx5: Add include of because of kzalloc()/kfree() use On some architectures (for example, arm), we don't end up indirectly pulling in the declaration of kzalloc() and kfree(), and so building anything that includes breaks. Fix this by adding an explicit include to get the declaration. Reported-by: kbuild test robot Signed-off-by: Roland Dreier --- include/linux/mlx5/driver.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 32cb18c399c2..130bc8d77fa5 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -38,8 +38,10 @@ #include #include #include +#include #include #include + #include #include -- cgit v1.2.3-70-g09d2 From 11bcac89c0d73dea42f1cb8646b532035796a5d6 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 14 Feb 2014 13:42:13 -0800 Subject: Revert "btrfs: add ioctl to export size of global metadata reservation" This reverts commit 01e219e8069516cdb98594d417b8bb8d906ed30d. David Sterba found a different way to provide these features without adding a new ioctl. We haven't released any progs with this ioctl yet, so I'm taking this out for now until we finalize things. Signed-off-by: Chris Mason Signed-off-by: David Sterba CC: Jeff Mahoney --- fs/btrfs/ioctl.c | 16 ---------------- include/uapi/linux/btrfs.h | 1 - 2 files changed, 17 deletions(-) (limited to 'include') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ebdd866d4cfd..9a9044585da7 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3559,20 +3559,6 @@ out: return ret; } -static long btrfs_ioctl_global_rsv(struct btrfs_root *root, void __user *arg) -{ - struct btrfs_block_rsv *block_rsv = &root->fs_info->global_block_rsv; - u64 reserved; - - spin_lock(&block_rsv->lock); - reserved = block_rsv->reserved; - spin_unlock(&block_rsv->lock); - - if (arg && copy_to_user(arg, &reserved, sizeof(reserved))) - return -EFAULT; - return 0; -} - /* * there are many ways the trans_start and trans_end ioctls can lead * to deadlocks. They should only be used by applications that @@ -4779,8 +4765,6 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_logical_to_ino(root, argp); case BTRFS_IOC_SPACE_INFO: return btrfs_ioctl_space_info(root, argp); - case BTRFS_IOC_GLOBAL_RSV: - return btrfs_ioctl_global_rsv(root, argp); case BTRFS_IOC_SYNC: { int ret; diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 1b8a0f4c9590..b4d69092fbdb 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -558,7 +558,6 @@ static inline char *btrfs_err_str(enum btrfs_err_code err_code) #define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, __u64) #define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \ struct btrfs_ioctl_space_args) -#define BTRFS_IOC_GLOBAL_RSV _IOR(BTRFS_IOCTL_MAGIC, 20, __u64) #define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64) #define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64) #define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \ -- cgit v1.2.3-70-g09d2