From f550d94cf9f86bc54e31dae2aee1a03d678c6e7f Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Thu, 10 May 2007 12:50:28 +0900 Subject: net: Trivial MLX4_DEBUG dependency fix. CONFIG_MLX4_DEBUG works out to a def_bool y for those that have CONFIG_EMBEDDED set. Make it depend on MLX4_CORE. Signed-off-by: Paul Mundt Signed-off-by: Roland Dreier --- drivers/net/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index fa489b10c38c..b3f4ffa89d02 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -2500,6 +2500,7 @@ config MLX4_CORE config MLX4_DEBUG bool "Verbose debugging output" if (MLX4_CORE && EMBEDDED) + depends on MLX4_CORE default y ---help--- This option causes debugging code to be compiled into the -- cgit v1.2.3-70-g09d2 From 20eebcf09c2d329e4dcdd765634c0a524195e16d Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Sun, 13 May 2007 08:54:18 -0700 Subject: mlx4_core: Remove unused doorbell_lock struct mlx4_priv.doorbell_lock is never used, so delete it. Signed-off-by: Roland Dreier --- drivers/net/mlx4/main.c | 2 -- drivers/net/mlx4/mlx4.h | 1 - 2 files changed, 3 deletions(-) diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c index 4debb024eaf9..20b8c0d3ced4 100644 --- a/drivers/net/mlx4/main.c +++ b/drivers/net/mlx4/main.c @@ -542,8 +542,6 @@ static int __devinit mlx4_setup_hca(struct mlx4_dev *dev) struct mlx4_priv *priv = mlx4_priv(dev); int err; - MLX4_INIT_DOORBELL_LOCK(&priv->doorbell_lock); - err = mlx4_init_uar_table(dev); if (err) { mlx4_err(dev, "Failed to initialize " diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h index 9befbae3d196..3d3b6d24d8d3 100644 --- a/drivers/net/mlx4/mlx4.h +++ b/drivers/net/mlx4/mlx4.h @@ -275,7 +275,6 @@ struct mlx4_priv { struct mlx4_uar driver_uar; void __iomem *kar; - MLX4_DECLARE_DOORBELL_LOCK(doorbell_lock) u32 rev_id; char board_id[MLX4_BOARD_ID_LEN]; -- cgit v1.2.3-70-g09d2 From 30046e5885848fe5c2c66177dca6b277323e31ab Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Sun, 13 May 2007 11:55:14 -0400 Subject: [CPUFREQ] Support rev H AMD64s in powernow-k8 Reported-by: Calvin Dodge Signed-off-by: Dave Jones --- arch/i386/kernel/cpu/cpufreq/powernow-k8.c | 2 +- arch/i386/kernel/cpu/cpufreq/powernow-k8.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/i386/kernel/cpu/cpufreq/powernow-k8.c b/arch/i386/kernel/cpu/cpufreq/powernow-k8.c index 7cf3d207b6b3..5c715f17a679 100644 --- a/arch/i386/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/i386/kernel/cpu/cpufreq/powernow-k8.c @@ -521,7 +521,7 @@ static int check_supported_cpu(unsigned int cpu) if ((eax & CPUID_XFAM) == CPUID_XFAM_K8) { if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) || - ((eax & CPUID_XMOD) > CPUID_XMOD_REV_G)) { + ((eax & CPUID_XMOD) > CPUID_XMOD_REV_H)) { printk(KERN_INFO PFX "Processor cpuid %x not supported\n", eax); goto out; } diff --git a/arch/i386/kernel/cpu/cpufreq/powernow-k8.h b/arch/i386/kernel/cpu/cpufreq/powernow-k8.h index 95be5013c984..ae2d9da084dd 100644 --- a/arch/i386/kernel/cpu/cpufreq/powernow-k8.h +++ b/arch/i386/kernel/cpu/cpufreq/powernow-k8.h @@ -46,8 +46,8 @@ struct powernow_k8_data { #define CPUID_XFAM 0x0ff00000 /* extended family */ #define CPUID_XFAM_K8 0 #define CPUID_XMOD 0x000f0000 /* extended model */ -#define CPUID_XMOD_REV_G 0x00060000 -#define CPUID_XFAM_10H 0x00100000 /* family 0x10 */ +#define CPUID_XMOD_REV_H 0x00070000 +#define CPUID_XFAM_10H 0x00100000 /* family 0x10 */ #define CPUID_USE_XFAM_XMOD 0x00000f00 #define 
CPUID_GET_MAX_CAPABILITIES 0x80000000 #define CPUID_FREQ_VOLT_CAPABILITIES 0x80000007 -- cgit v1.2.3-70-g09d2 From dc2585eb478cfeb45b3d6e235ac7b5918fb859f7 Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Wed, 2 May 2007 23:19:05 +0100 Subject: [CPUFREQ] powernow-k7: fix MHz rounding issue with perflib When the PST tables are broken, powernow-k7 uses ACPI's processor_perflib to deduce the available frequency multipliers from the _PSS tables. Upon frequency change, processor_perflib performs some verification on the frequency (checks that it's within allowable bounds). powernow-k7 deals with absolute frequencies in KHz, whereas perflib only deals with MHz values. When performing the above verification, perflib multiplies the MHz values by 1000 to obtain the KHz value. We then end up with situations like the following: - powernow-k7 multiplies the multiplier by the FSB, and obtains a value such as 1266768 KHz - perflib belives the same state has frequency of 1266 MHz - acpi_processor_ppc_notifier calls cpufreq_verify_within_limits to verify that 1266768 is in the allowable range of 0 to 1266000 (i.e. 1266 * 1000) - it's not, so that frequency is rejected - the maximum CPU frequency is not reachable This patch solves the problem by rounding up the MHz values stored in perflib's tables. Additionally it corrects a broken URL. It also fixes http://bugzilla.kernel.org/show_bug.cgi?id=8255 although this case is a bit different: the frequencies in the _PSS tables are wildly wrong, but we get better results if we force ACPI to respect the fsb * multiplier calculations (even though it seems that the multiplier values aren't entirely correct either). Signed-off-by: Daniel Drake Signed-off-by: Dave Jones --- arch/i386/kernel/cpu/cpufreq/powernow-k7.c | 36 +++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/arch/i386/kernel/cpu/cpufreq/powernow-k7.c b/arch/i386/kernel/cpu/cpufreq/powernow-k7.c index 837b04166a47..ca3e1d341889 100644 --- a/arch/i386/kernel/cpu/cpufreq/powernow-k7.c +++ b/arch/i386/kernel/cpu/cpufreq/powernow-k7.c @@ -341,15 +341,17 @@ static int powernow_acpi_init(void) pc.val = (unsigned long) acpi_processor_perf->states[0].control; for (i = 0; i < number_scales; i++) { u8 fid, vid; - unsigned int speed; + struct acpi_processor_px *state = + &acpi_processor_perf->states[i]; + unsigned int speed, speed_mhz; - pc.val = (unsigned long) acpi_processor_perf->states[i].control; + pc.val = (unsigned long) state->control; dprintk ("acpi: P%d: %d MHz %d mW %d uS control %08x SGTC %d\n", i, - (u32) acpi_processor_perf->states[i].core_frequency, - (u32) acpi_processor_perf->states[i].power, - (u32) acpi_processor_perf->states[i].transition_latency, - (u32) acpi_processor_perf->states[i].control, + (u32) state->core_frequency, + (u32) state->power, + (u32) state->transition_latency, + (u32) state->control, pc.bits.sgtc); vid = pc.bits.vid; @@ -360,6 +362,18 @@ static int powernow_acpi_init(void) powernow_table[i].index |= (vid << 8); /* upper 8 bits */ speed = powernow_table[i].frequency; + speed_mhz = speed / 1000; + + /* processor_perflib will multiply the MHz value by 1000 to + * get a KHz value (e.g. 1266000). However, powernow-k7 works + * with true KHz values (e.g. 1266768). To ensure that all + * powernow frequencies are available, we must ensure that + * ACPI doesn't restrict them, so we round up the MHz value + * to ensure that perflib's computed KHz value is greater than + * or equal to powernow's KHz value. 
+ */ + if (speed % 1000 > 0) + speed_mhz++; if ((fid_codes[fid] % 10)==5) { if (have_a0 == 1) @@ -368,10 +382,16 @@ static int powernow_acpi_init(void) dprintk (" FID: 0x%x (%d.%dx [%dMHz]) " "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10, - fid_codes[fid] % 10, speed/1000, vid, + fid_codes[fid] % 10, speed_mhz, vid, mobile_vid_table[vid]/1000, mobile_vid_table[vid]%1000); + if (state->core_frequency != speed_mhz) { + state->core_frequency = speed_mhz; + dprintk(" Corrected ACPI frequency to %d\n", + speed_mhz); + } + if (latency < pc.bits.sgtc) latency = pc.bits.sgtc; @@ -602,7 +622,7 @@ static int __init powernow_cpu_init (struct cpufreq_policy *policy) result = powernow_acpi_init(); if (result) { printk (KERN_INFO PFX "ACPI and legacy methods failed\n"); - printk (KERN_INFO PFX "See http://www.codemonkey.org.uk/projects/cpufreq/powernow-k7.shtml\n"); + printk (KERN_INFO PFX "See http://www.codemonkey.org.uk/projects/cpufreq/powernow-k7.html\n"); } } else { /* SGTC use the bus clock as timer */ -- cgit v1.2.3-70-g09d2 From bbdfff86a8f0c91ad8b6dedf74bc14de4ba39679 Mon Sep 17 00:00:00 2001 From: Gabriel Mansi Date: Mon, 7 May 2007 18:55:13 -0300 Subject: [AGPGART] Fix wrong ID in via-agp.c there is a wrong id in drivers/char/agp/via-agp.c #define PCI_DEVICE_ID_VIA_CX700 0x8324 It must be 0x0324 Notice that PCI_DEVICE_ID_VIA_CX700 is also used in drivers/i2c/busses/i2c-viapro.c and drivers/ide/pci/via82cxxx.c So, I think that constant must be renamed to avoid conflicting. I attached a proposed patch. Signed-off-by: Dave Jones --- drivers/char/agp/via-agp.c | 6 +++--- include/linux/pci_ids.h | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/char/agp/via-agp.c b/drivers/char/agp/via-agp.c index a2bb4eccaab4..9aaf401a8975 100644 --- a/drivers/char/agp/via-agp.c +++ b/drivers/char/agp/via-agp.c @@ -384,9 +384,9 @@ static struct agp_device_ids via_agp_device_ids[] __devinitdata = .device_id = PCI_DEVICE_ID_VIA_P4M800CE, .chipset_name = "VT3314", }, - /* CX700 */ + /* VT3324 / CX700 */ { - .device_id = PCI_DEVICE_ID_VIA_CX700, + .device_id = PCI_DEVICE_ID_VIA_VT3324, .chipset_name = "CX700", }, /* VT3336 */ @@ -540,7 +540,7 @@ static const struct pci_device_id agp_via_pci_table[] = { ID(PCI_DEVICE_ID_VIA_83_87XX_1), ID(PCI_DEVICE_ID_VIA_3296_0), ID(PCI_DEVICE_ID_VIA_P4M800CE), - ID(PCI_DEVICE_ID_VIA_CX700), + ID(PCI_DEVICE_ID_VIA_VT3324), ID(PCI_DEVICE_ID_VIA_VT3336), ID(PCI_DEVICE_ID_VIA_P4M890), { } diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index ccd85e4d3b8f..3b1fbf49fa7d 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -1288,6 +1288,7 @@ #define PCI_DEVICE_ID_VIA_8363_0 0x0305 #define PCI_DEVICE_ID_VIA_P4M800CE 0x0314 #define PCI_DEVICE_ID_VIA_P4M890 0x0327 +#define PCI_DEVICE_ID_VIA_VT3324 0x0324 #define PCI_DEVICE_ID_VIA_VT3336 0x0336 #define PCI_DEVICE_ID_VIA_8371_0 0x0391 #define PCI_DEVICE_ID_VIA_8501_0 0x0501 -- cgit v1.2.3-70-g09d2 From 26c6bc7b812b4157ba929035e467c0f4dd165916 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Sun, 13 May 2007 17:18:23 +0300 Subject: IB/mlx4: Fix uninitialized spinlock for 32-bit archs uar_lock spinlock was used in mlx4_ib_cq_arm without being initialized (this only affects 32-bit archs, because uar_lock is not used on 64-bit archs and MLX4_INIT_DOORBELL_LOCK() is a NOP). 
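As a quick illustration of why this matters, here is a minimal, hypothetical sketch (made-up names, not the mlx4 source) of a doorbell lock like uar_lock: on 32-bit a 64-bit doorbell has to be written as two 32-bit MMIO stores under a spinlock, and that spinlock must be initialized before the first CQ arm can take it, whereas on 64-bit the whole lock compiles away.

	#include <linux/spinlock.h>
	#include <linux/io.h>
	#include <linux/types.h>

	struct demo_uar {
		void __iomem *db_reg;		/* 64-bit wide doorbell register */
	#if BITS_PER_LONG == 32
		spinlock_t db_lock;		/* keeps the two 32-bit halves together */
	#endif
	};

	static void demo_uar_init(struct demo_uar *uar)
	{
	#if BITS_PER_LONG == 32
		spin_lock_init(&uar->db_lock);	/* the step this patch adds for mlx4 */
	#endif
	}

	static void demo_ring_doorbell(struct demo_uar *uar, u64 val)
	{
	#if BITS_PER_LONG == 32
		unsigned long flags;

		/* two MMIO stores must not interleave with another CPU's doorbell;
		 * the high/low write order here is purely illustrative */
		spin_lock_irqsave(&uar->db_lock, flags);
		writel((u32)(val >> 32), uar->db_reg);
		writel((u32)val, uar->db_reg + 4);
		spin_unlock_irqrestore(&uar->db_lock, flags);
	#else
		writeq(val, uar->db_reg);	/* single atomic 64-bit store, no lock */
	#endif
	}

On 64-bit builds the lock declaration and init macros expand to nothing, which is exactly why a missing initialization goes unnoticed there.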
Signed-off-by: Jack Morgenstein Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 688ecb4c39f3..402f3a20ec0a 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -489,6 +489,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->uar_map = ioremap(ibdev->priv_uar.pfn << PAGE_SHIFT, PAGE_SIZE); if (!ibdev->uar_map) goto err_uar; + MLX4_INIT_DOORBELL_LOCK(&ibdev->uar_lock); INIT_LIST_HEAD(&ibdev->pgdir_list); mutex_init(&ibdev->pgdir_mutex); -- cgit v1.2.3-70-g09d2 From 8f140b407f3be04e7202be9aa0cfef3006d14c9f Mon Sep 17 00:00:00 2001 From: Arthur Jones Date: Thu, 10 May 2007 12:10:49 -0700 Subject: IB/ipath: Shadow the gpio_mask register Once upon a time, GPIO interrupts were rare. But then a chip bug in the waldo series forced the use of a GPIO interrupt to signal packet reception. This greatly increased the frequency of GPIO interrupts which have the gpio_mask bits set on the waldo chips. Other bits in the gpio_status register are used for I2C clock and data lines, these bits are usually on. An "unlikely" annotation leftover from the old days was improperly applied to these bits, and an unnecessary chip mmio read was being accessed in the interrupt fast path on waldo. Remove the stagnant unlikely annotation in the interrupt handler and keep a shadow copy of the gpio_mask register to avoid the slow mmio read when testing for interruptable GPIO bits. Signed-off-by: Arthur Jones Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ipath/ipath_iba6120.c | 7 +++---- drivers/infiniband/hw/ipath/ipath_intr.c | 7 +++---- drivers/infiniband/hw/ipath/ipath_kernel.h | 2 ++ drivers/infiniband/hw/ipath/ipath_verbs.c | 12 ++++++------ 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/hw/ipath/ipath_iba6120.c b/drivers/infiniband/hw/ipath/ipath_iba6120.c index 1b9c30857754..4e2e3dfeb2c8 100644 --- a/drivers/infiniband/hw/ipath/ipath_iba6120.c +++ b/drivers/infiniband/hw/ipath/ipath_iba6120.c @@ -747,7 +747,6 @@ static void ipath_pe_quiet_serdes(struct ipath_devdata *dd) static int ipath_pe_intconfig(struct ipath_devdata *dd) { - u64 val; u32 chiprev; /* @@ -760,9 +759,9 @@ static int ipath_pe_intconfig(struct ipath_devdata *dd) if ((chiprev & INFINIPATH_R_CHIPREVMINOR_MASK) > 1) { /* Rev2+ reports extra errors via internal GPIO pins */ dd->ipath_flags |= IPATH_GPIO_ERRINTRS; - val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_gpio_mask); - val |= IPATH_GPIO_ERRINTR_MASK; - ipath_write_kreg( dd, dd->ipath_kregs->kr_gpio_mask, val); + dd->ipath_gpio_mask |= IPATH_GPIO_ERRINTR_MASK; + ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask, + dd->ipath_gpio_mask); } return 0; } diff --git a/drivers/infiniband/hw/ipath/ipath_intr.c b/drivers/infiniband/hw/ipath/ipath_intr.c index 45d033169c6e..a90d3b5699c4 100644 --- a/drivers/infiniband/hw/ipath/ipath_intr.c +++ b/drivers/infiniband/hw/ipath/ipath_intr.c @@ -1056,7 +1056,7 @@ irqreturn_t ipath_intr(int irq, void *data) gpiostatus &= ~(1 << IPATH_GPIO_PORT0_BIT); chk0rcv = 1; } - if (unlikely(gpiostatus)) { + if (gpiostatus) { /* * Some unexpected bits remain. If they could have * caused the interrupt, complain and clear. @@ -1065,9 +1065,8 @@ irqreturn_t ipath_intr(int irq, void *data) * GPIO interrupts, possibly on a "three strikes" * basis. 
*/ - u32 mask; - mask = ipath_read_kreg32( - dd, dd->ipath_kregs->kr_gpio_mask); + const u32 mask = (u32) dd->ipath_gpio_mask; + if (mask & gpiostatus) { ipath_dbg("Unexpected GPIO IRQ bits %x\n", gpiostatus & mask); diff --git a/drivers/infiniband/hw/ipath/ipath_kernel.h b/drivers/infiniband/hw/ipath/ipath_kernel.h index e900c2593f44..12194f3dd8cc 100644 --- a/drivers/infiniband/hw/ipath/ipath_kernel.h +++ b/drivers/infiniband/hw/ipath/ipath_kernel.h @@ -397,6 +397,8 @@ struct ipath_devdata { unsigned long ipath_pioavailshadow[8]; /* shadow of kr_gpio_out, for rmw ops */ u64 ipath_gpio_out; + /* shadow the gpio mask register */ + u64 ipath_gpio_mask; /* kr_revision shadow */ u64 ipath_revision; /* diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c index 12933e77c7e9..bb70845279b8 100644 --- a/drivers/infiniband/hw/ipath/ipath_verbs.c +++ b/drivers/infiniband/hw/ipath/ipath_verbs.c @@ -1387,13 +1387,12 @@ static int enable_timer(struct ipath_devdata *dd) * processing. */ if (dd->ipath_flags & IPATH_GPIO_INTR) { - u64 val; ipath_write_kreg(dd, dd->ipath_kregs->kr_debugportselect, 0x2074076542310ULL); /* Enable GPIO bit 2 interrupt */ - val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_gpio_mask); - val |= (u64) (1 << IPATH_GPIO_PORT0_BIT); - ipath_write_kreg( dd, dd->ipath_kregs->kr_gpio_mask, val); + dd->ipath_gpio_mask |= (u64) (1 << IPATH_GPIO_PORT0_BIT); + ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask, + dd->ipath_gpio_mask); } init_timer(&dd->verbs_timer); @@ -1412,8 +1411,9 @@ static int disable_timer(struct ipath_devdata *dd) u64 val; /* Disable GPIO bit 2 interrupt */ val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_gpio_mask); - val &= ~((u64) (1 << IPATH_GPIO_PORT0_BIT)); - ipath_write_kreg( dd, dd->ipath_kregs->kr_gpio_mask, val); + dd->ipath_gpio_mask &= ~((u64) (1 << IPATH_GPIO_PORT0_BIT)); + ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask, + dd->ipath_gpio_mask); /* * We might want to undo changes to debugportselect, * but how? -- cgit v1.2.3-70-g09d2 From 5d88278e3bdb6f2e4ed43306659e930ecd715f0c Mon Sep 17 00:00:00 2001 From: Stefan Roscher Date: Wed, 9 May 2007 13:47:56 +0200 Subject: IB/ehca: Serialize hypervisor calls in ehca_register_mr() Some pSeries hypervisor versions show a race condition in the allocate MR hCall. Serialize this call per adapter to circumvent this problem. 
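Purely for illustration, a compact sketch of the pattern this patch applies (hypothetical names, not the ehca source): wrap only the one hypervisor call that is affected in a driver-global spinlock and leave every other call unserialized. The "resource type 5 means MR allocation" detail is an assumption taken from the arg2 == 5 test in the real change.

	#include <linux/spinlock.h>
	#include <linux/types.h>
	#include <asm/hvcall.h>

	static DEFINE_SPINLOCK(demo_hcall_lock);

	static long demo_hcall(unsigned long opcode, unsigned long resource_type)
	{
		/* assumption: only MR allocation (resource type 5) needs serializing */
		bool serialize = (opcode == H_ALLOC_RESOURCE && resource_type == 5);
		unsigned long flags = 0;
		long ret;

		if (serialize)
			spin_lock_irqsave(&demo_hcall_lock, flags);

		ret = plpar_hcall_norets(opcode, resource_type);

		if (serialize)
			spin_unlock_irqrestore(&demo_hcall_lock, flags);

		return ret;
	}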
Signed-off-by: Joachim Fenkes Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ehca/ehca_classes.h | 1 + drivers/infiniband/hw/ehca/ehca_main.c | 2 ++ drivers/infiniband/hw/ehca/hcp_if.c | 13 +++++++++++-- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/ehca/ehca_classes.h b/drivers/infiniband/hw/ehca/ehca_classes.h index f64d42b08674..1d286d3cc2d5 100644 --- a/drivers/infiniband/hw/ehca/ehca_classes.h +++ b/drivers/infiniband/hw/ehca/ehca_classes.h @@ -277,6 +277,7 @@ void ehca_cleanup_mrmw_cache(void); extern spinlock_t ehca_qp_idr_lock; extern spinlock_t ehca_cq_idr_lock; +extern spinlock_t hcall_lock; extern struct idr ehca_qp_idr; extern struct idr ehca_cq_idr; diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c index fe90e7454560..b917cc130e48 100644 --- a/drivers/infiniband/hw/ehca/ehca_main.c +++ b/drivers/infiniband/hw/ehca/ehca_main.c @@ -98,6 +98,7 @@ MODULE_PARM_DESC(scaling_code, spinlock_t ehca_qp_idr_lock; spinlock_t ehca_cq_idr_lock; +spinlock_t hcall_lock; DEFINE_IDR(ehca_qp_idr); DEFINE_IDR(ehca_cq_idr); @@ -817,6 +818,7 @@ int __init ehca_module_init(void) idr_init(&ehca_cq_idr); spin_lock_init(&ehca_qp_idr_lock); spin_lock_init(&ehca_cq_idr_lock); + spin_lock_init(&hcall_lock); INIT_LIST_HEAD(&shca_list); spin_lock_init(&shca_list_lock); diff --git a/drivers/infiniband/hw/ehca/hcp_if.c b/drivers/infiniband/hw/ehca/hcp_if.c index b564fcd3b282..7f0beec74f70 100644 --- a/drivers/infiniband/hw/ehca/hcp_if.c +++ b/drivers/infiniband/hw/ehca/hcp_if.c @@ -154,7 +154,8 @@ static long ehca_plpar_hcall9(unsigned long opcode, unsigned long arg9) { long ret; - int i, sleep_msecs; + int i, sleep_msecs, lock_is_set = 0; + unsigned long flags; ehca_gen_dbg("opcode=%lx arg1=%lx arg2=%lx arg3=%lx arg4=%lx " "arg5=%lx arg6=%lx arg7=%lx arg8=%lx arg9=%lx", @@ -162,10 +163,18 @@ static long ehca_plpar_hcall9(unsigned long opcode, arg8, arg9); for (i = 0; i < 5; i++) { + if ((opcode == H_ALLOC_RESOURCE) && (arg2 == 5)) { + spin_lock_irqsave(&hcall_lock, flags); + lock_is_set = 1; + } + ret = plpar_hcall9(opcode, outs, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9); + if (lock_is_set) + spin_unlock_irqrestore(&hcall_lock, flags); + if (H_IS_LONG_BUSY(ret)) { sleep_msecs = get_longbusy_msecs(ret); msleep_interruptible(sleep_msecs); @@ -193,11 +202,11 @@ static long ehca_plpar_hcall9(unsigned long opcode, opcode, ret, outs[0], outs[1], outs[2], outs[3], outs[4], outs[5], outs[6], outs[7], outs[8]); return ret; - } return H_BUSY; } + u64 hipz_h_alloc_resource_eq(const struct ipz_adapter_handle adapter_handle, struct ehca_pfeq *pfeq, const u32 neq_control, -- cgit v1.2.3-70-g09d2 From 92761cdaf215599a1bd81d383facb32adabfa620 Mon Sep 17 00:00:00 2001 From: Joachim Fenkes Date: Wed, 9 May 2007 13:48:01 +0200 Subject: IB/ehca: Correctly set GRH mask bit in ehca_modify_qp() The driver needs to always supply the "GRH present" flag to the hypervisor, whether it's true or false. Not supplying it (i.e. not setting the corresponding mask bit) amounts to a "perhaps", which we don't want. 
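To make the "perhaps" point concrete, a tiny generic sketch (invented field names, not ehca's control block): in a mask/value style interface, a clear mask bit means "leave unchanged", so an explicit "no GRH" still requires the mask bit to be set while the value is 0.

	#include <linux/types.h>

	#define DEMO_MASK_GRH_PRESENT	(1u << 0)

	struct demo_modify_cb {
		u32 valid_mask;		/* which fields the firmware may act on */
		u32 send_grh_flag;
	};

	static void demo_set_grh(struct demo_modify_cb *cb, bool use_grh)
	{
		/* always claim the field, so "no" is a real answer, not "unchanged" */
		cb->valid_mask |= DEMO_MASK_GRH_PRESENT;
		cb->send_grh_flag = use_grh ? 1 : 0;
	}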
Signed-off-by: Joachim Fenkes Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ehca/ehca_qp.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/ehca/ehca_qp.c b/drivers/infiniband/hw/ehca/ehca_qp.c index df0516f24379..e21d79684057 100644 --- a/drivers/infiniband/hw/ehca/ehca_qp.c +++ b/drivers/infiniband/hw/ehca/ehca_qp.c @@ -968,17 +968,21 @@ static int internal_modify_qp(struct ib_qp *ibqp, ((ehca_mult - 1) / ah_mult) : 0; else mqpcb->max_static_rate = 0; - update_mask |= EHCA_BMASK_SET(MQPCB_MASK_MAX_STATIC_RATE, 1); + /* + * Always supply the GRH flag, even if it's zero, to give the + * hypervisor a clear "yes" or "no" instead of a "perhaps" + */ + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_GRH_FLAG, 1); + /* * only if GRH is TRUE we might consider SOURCE_GID_IDX * and DEST_GID otherwise phype will return H_ATTR_PARM!!! */ if (attr->ah_attr.ah_flags == IB_AH_GRH) { - mqpcb->send_grh_flag = 1 << 31; - update_mask |= - EHCA_BMASK_SET(MQPCB_MASK_SEND_GRH_FLAG, 1); + mqpcb->send_grh_flag = 1; + mqpcb->source_gid_idx = attr->ah_attr.grh.sgid_index; update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SOURCE_GID_IDX, 1); -- cgit v1.2.3-70-g09d2 From c55a0ddd8ebdd657224449c2fbfcd427e054c8cc Mon Sep 17 00:00:00 2001 From: Hoang-Nam Nguyen Date: Wed, 9 May 2007 13:48:11 +0200 Subject: IB/ehca: Fix AQP0/1 QP number AQP0/1 should report qp_num={0|1} and the actual QP# should be stored in struct ehca_qp, not the other way round. Signed-off-by: Joachim Fenkes Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ehca/ehca_qp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/ehca/ehca_qp.c b/drivers/infiniband/hw/ehca/ehca_qp.c index e21d79684057..b5bc787c77b6 100644 --- a/drivers/infiniband/hw/ehca/ehca_qp.c +++ b/drivers/infiniband/hw/ehca/ehca_qp.c @@ -523,6 +523,8 @@ struct ib_qp *ehca_create_qp(struct ib_pd *pd, goto create_qp_exit1; } + my_qp->ib_qp.qp_num = my_qp->real_qp_num; + switch (init_attr->qp_type) { case IB_QPT_RC: if (isdaqp == 0) { @@ -568,7 +570,7 @@ struct ib_qp *ehca_create_qp(struct ib_pd *pd, parms.act_nr_recv_wqes = init_attr->cap.max_recv_wr; parms.act_nr_send_sges = init_attr->cap.max_send_sge; parms.act_nr_recv_sges = init_attr->cap.max_recv_sge; - my_qp->real_qp_num = + my_qp->ib_qp.qp_num = (init_attr->qp_type == IB_QPT_SMI) ? 0 : 1; } @@ -595,7 +597,6 @@ struct ib_qp *ehca_create_qp(struct ib_pd *pd, my_qp->ib_qp.recv_cq = init_attr->recv_cq; my_qp->ib_qp.send_cq = init_attr->send_cq; - my_qp->ib_qp.qp_num = my_qp->real_qp_num; my_qp->ib_qp.qp_type = init_attr->qp_type; my_qp->qp_type = init_attr->qp_type; -- cgit v1.2.3-70-g09d2 From c7a14939e78e75dd90b54cb0df371019bc6d3e89 Mon Sep 17 00:00:00 2001 From: Joachim Fenkes Date: Wed, 9 May 2007 13:48:20 +0200 Subject: IB/ehca: Remove _irqsave, move #ifdef - In ehca_process_eq(), we're IRQ safe throughout the whole function, so we don't need another _irqsave in the middle of flight. - take_over_work() is only called by comp_pool_callback(), so it can move into the same #ifdef block. 
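As a side note on the first point, a minimal generic sketch (not ehca code): when a function is only ever entered with local interrupts already disabled, for example from a hard IRQ handler, plain spin_lock() is enough; the _irqsave variant would only save and restore flags that cannot change anyway. Any caller that runs with interrupts enabled must of course still take the same lock with _irqsave.

	#include <linux/spinlock.h>

	/* Called only from hard-IRQ context, i.e. with local interrupts off. */
	static void demo_count_event(spinlock_t *lock, unsigned int *nr_events)
	{
		spin_lock(lock);	/* no _irqsave needed: IRQs are already off here */
		(*nr_events)++;
		spin_unlock(lock);
	}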
Signed-off-by: Joachim Fenkes Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ehca/ehca_irq.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/ehca/ehca_irq.c b/drivers/infiniband/hw/ehca/ehca_irq.c index 82dda2faf4d0..100329ba3343 100644 --- a/drivers/infiniband/hw/ehca/ehca_irq.c +++ b/drivers/infiniband/hw/ehca/ehca_irq.c @@ -517,12 +517,11 @@ void ehca_process_eq(struct ehca_shca *shca, int is_irq) else { struct ehca_cq *cq = eq->eqe_cache[i].cq; comp_event_callback(cq); - spin_lock_irqsave(&ehca_cq_idr_lock, flags); + spin_lock(&ehca_cq_idr_lock); cq->nr_events--; if (!cq->nr_events) wake_up(&cq->wait_completion); - spin_unlock_irqrestore(&ehca_cq_idr_lock, - flags); + spin_unlock(&ehca_cq_idr_lock); } } else { ehca_dbg(&shca->ib_device, "Got non completion event"); @@ -711,6 +710,7 @@ static void destroy_comp_task(struct ehca_comp_pool *pool, kthread_stop(task); } +#ifdef CONFIG_HOTPLUG_CPU static void take_over_work(struct ehca_comp_pool *pool, int cpu) { @@ -735,7 +735,6 @@ static void take_over_work(struct ehca_comp_pool *pool, } -#ifdef CONFIG_HOTPLUG_CPU static int comp_pool_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) -- cgit v1.2.3-70-g09d2 From bba9b6013e604fadb298191c058149acf1cdfced Mon Sep 17 00:00:00 2001 From: Joachim Fenkes Date: Wed, 9 May 2007 13:48:25 +0200 Subject: IB/ehca: Beautify sysfs attribute code and fix compiler warnings eHCA's sysfs attributes are now being created via sysfs_create_group(), making the process neatly table-driven. The return value is checked, thus fixing a few compiler warnings. Signed-off-by: Joachim Fenkes Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ehca/ehca_main.c | 86 +++++++++++++++------------------- 1 file changed, 37 insertions(+), 49 deletions(-) diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c index b917cc130e48..7d276e034e47 100644 --- a/drivers/infiniband/hw/ehca/ehca_main.c +++ b/drivers/infiniband/hw/ehca/ehca_main.c @@ -454,15 +454,14 @@ static ssize_t ehca_store_debug_level(struct device_driver *ddp, DRIVER_ATTR(debug_level, S_IRUSR | S_IWUSR, ehca_show_debug_level, ehca_store_debug_level); -void ehca_create_driver_sysfs(struct ibmebus_driver *drv) -{ - driver_create_file(&drv->driver, &driver_attr_debug_level); -} +static struct attribute *ehca_drv_attrs[] = { + &driver_attr_debug_level.attr, + NULL +}; -void ehca_remove_driver_sysfs(struct ibmebus_driver *drv) -{ - driver_remove_file(&drv->driver, &driver_attr_debug_level); -} +static struct attribute_group ehca_drv_attr_grp = { + .attrs = ehca_drv_attrs +}; #define EHCA_RESOURCE_ATTR(name) \ static ssize_t ehca_show_##name(struct device *dev, \ @@ -524,44 +523,28 @@ static ssize_t ehca_show_adapter_handle(struct device *dev, } static DEVICE_ATTR(adapter_handle, S_IRUGO, ehca_show_adapter_handle, NULL); +static struct attribute *ehca_dev_attrs[] = { + &dev_attr_adapter_handle.attr, + &dev_attr_num_ports.attr, + &dev_attr_hw_ver.attr, + &dev_attr_max_eq.attr, + &dev_attr_cur_eq.attr, + &dev_attr_max_cq.attr, + &dev_attr_cur_cq.attr, + &dev_attr_max_qp.attr, + &dev_attr_cur_qp.attr, + &dev_attr_max_mr.attr, + &dev_attr_cur_mr.attr, + &dev_attr_max_mw.attr, + &dev_attr_cur_mw.attr, + &dev_attr_max_pd.attr, + &dev_attr_max_ah.attr, + NULL +}; -void ehca_create_device_sysfs(struct ibmebus_dev *dev) -{ - device_create_file(&dev->ofdev.dev, &dev_attr_adapter_handle); - device_create_file(&dev->ofdev.dev, &dev_attr_num_ports); - 
device_create_file(&dev->ofdev.dev, &dev_attr_hw_ver); - device_create_file(&dev->ofdev.dev, &dev_attr_max_eq); - device_create_file(&dev->ofdev.dev, &dev_attr_cur_eq); - device_create_file(&dev->ofdev.dev, &dev_attr_max_cq); - device_create_file(&dev->ofdev.dev, &dev_attr_cur_cq); - device_create_file(&dev->ofdev.dev, &dev_attr_max_qp); - device_create_file(&dev->ofdev.dev, &dev_attr_cur_qp); - device_create_file(&dev->ofdev.dev, &dev_attr_max_mr); - device_create_file(&dev->ofdev.dev, &dev_attr_cur_mr); - device_create_file(&dev->ofdev.dev, &dev_attr_max_mw); - device_create_file(&dev->ofdev.dev, &dev_attr_cur_mw); - device_create_file(&dev->ofdev.dev, &dev_attr_max_pd); - device_create_file(&dev->ofdev.dev, &dev_attr_max_ah); -} - -void ehca_remove_device_sysfs(struct ibmebus_dev *dev) -{ - device_remove_file(&dev->ofdev.dev, &dev_attr_adapter_handle); - device_remove_file(&dev->ofdev.dev, &dev_attr_num_ports); - device_remove_file(&dev->ofdev.dev, &dev_attr_hw_ver); - device_remove_file(&dev->ofdev.dev, &dev_attr_max_eq); - device_remove_file(&dev->ofdev.dev, &dev_attr_cur_eq); - device_remove_file(&dev->ofdev.dev, &dev_attr_max_cq); - device_remove_file(&dev->ofdev.dev, &dev_attr_cur_cq); - device_remove_file(&dev->ofdev.dev, &dev_attr_max_qp); - device_remove_file(&dev->ofdev.dev, &dev_attr_cur_qp); - device_remove_file(&dev->ofdev.dev, &dev_attr_max_mr); - device_remove_file(&dev->ofdev.dev, &dev_attr_cur_mr); - device_remove_file(&dev->ofdev.dev, &dev_attr_max_mw); - device_remove_file(&dev->ofdev.dev, &dev_attr_cur_mw); - device_remove_file(&dev->ofdev.dev, &dev_attr_max_pd); - device_remove_file(&dev->ofdev.dev, &dev_attr_max_ah); -} +static struct attribute_group ehca_dev_attr_grp = { + .attrs = ehca_dev_attrs +}; static int __devinit ehca_probe(struct ibmebus_dev *dev, const struct of_device_id *id) @@ -669,7 +652,10 @@ static int __devinit ehca_probe(struct ibmebus_dev *dev, } } - ehca_create_device_sysfs(dev); + ret = sysfs_create_group(&dev->ofdev.dev.kobj, &ehca_dev_attr_grp); + if (ret) /* only complain; we can live without attributes */ + ehca_err(&shca->ib_device, + "Cannot create device attributes ret=%d", ret); spin_lock(&shca_list_lock); list_add(&shca->shca_list, &shca_list); @@ -721,7 +707,7 @@ static int __devexit ehca_remove(struct ibmebus_dev *dev) struct ehca_shca *shca = dev->ofdev.dev.driver_data; int ret; - ehca_remove_device_sysfs(dev); + sysfs_remove_group(&dev->ofdev.dev.kobj, &ehca_dev_attr_grp); if (ehca_open_aqp1 == 1) { int i; @@ -840,7 +826,9 @@ int __init ehca_module_init(void) goto module_init2; } - ehca_create_driver_sysfs(&ehca_driver); + ret = sysfs_create_group(&ehca_driver.driver.kobj, &ehca_drv_attr_grp); + if (ret) /* only complain; we can live without attributes */ + ehca_gen_err("Cannot create driver attributes ret=%d", ret); if (ehca_poll_all_eqs != 1) { ehca_gen_err("WARNING!!!"); @@ -867,7 +855,7 @@ void __exit ehca_module_exit(void) if (ehca_poll_all_eqs == 1) del_timer_sync(&poll_eqs_timer); - ehca_remove_driver_sysfs(&ehca_driver); + sysfs_remove_group(&ehca_driver.driver.kobj, &ehca_drv_attr_grp); ibmebus_unregister_driver(&ehca_driver); ehca_destroy_slab_caches(); -- cgit v1.2.3-70-g09d2 From 4e430dcb7b132a4076e533a9d69907acecbe71be Mon Sep 17 00:00:00 2001 From: Joachim Fenkes Date: Wed, 9 May 2007 13:48:31 +0200 Subject: IB/ehca: Disable scaling code by default, bump version number - Scaling code is still considered experimental, so disable it by default - Increase version to SVNEHCA_0023 Signed-off-by: Joachim Fenkes 
Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ehca/ehca_main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c index 7d276e034e47..c3f99f33b49c 100644 --- a/drivers/infiniband/hw/ehca/ehca_main.c +++ b/drivers/infiniband/hw/ehca/ehca_main.c @@ -52,7 +52,7 @@ MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Christoph Raisch "); MODULE_DESCRIPTION("IBM eServer HCA InfiniBand Device Driver"); -MODULE_VERSION("SVNEHCA_0022"); +MODULE_VERSION("SVNEHCA_0023"); int ehca_open_aqp1 = 0; int ehca_debug_level = 0; @@ -62,7 +62,7 @@ int ehca_use_hp_mr = 0; int ehca_port_act_time = 30; int ehca_poll_all_eqs = 1; int ehca_static_rate = -1; -int ehca_scaling_code = 1; +int ehca_scaling_code = 0; module_param_named(open_aqp1, ehca_open_aqp1, int, 0); module_param_named(debug_level, ehca_debug_level, int, 0); @@ -799,7 +799,7 @@ int __init ehca_module_init(void) int ret; printk(KERN_INFO "eHCA Infiniband Device Driver " - "(Rel.: SVNEHCA_0022)\n"); + "(Rel.: SVNEHCA_0023)\n"); idr_init(&ehca_qp_idr); idr_init(&ehca_cq_idr); spin_lock_init(&ehca_qp_idr_lock); -- cgit v1.2.3-70-g09d2 From 8aa08602bdd617a9cdd147f19076a8c8a70e03ef Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Mon, 7 May 2007 11:49:00 -0700 Subject: RDMA/cma: Simplify device removal handling code Add a new routine and rename another to encapsulate common code for synchronizing with device removal. Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/cma.c | 89 +++++++++++++++++++++++-------------------- 1 file changed, 48 insertions(+), 41 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index fde92ce45153..d026764c7e9c 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -346,7 +346,23 @@ static void cma_deref_id(struct rdma_id_private *id_priv) complete(&id_priv->comp); } -static void cma_release_remove(struct rdma_id_private *id_priv) +static int cma_disable_remove(struct rdma_id_private *id_priv, + enum cma_state state) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&id_priv->lock, flags); + if (id_priv->state == state) { + atomic_inc(&id_priv->dev_remove); + ret = 0; + } else + ret = -EINVAL; + spin_unlock_irqrestore(&id_priv->lock, flags); + return ret; +} + +static void cma_enable_remove(struct rdma_id_private *id_priv) { if (atomic_dec_and_test(&id_priv->dev_remove)) wake_up(&id_priv->wait_remove); @@ -884,9 +900,8 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) struct rdma_cm_event event; int ret = 0; - atomic_inc(&id_priv->dev_remove); - if (!cma_comp(id_priv, CMA_CONNECT)) - goto out; + if (cma_disable_remove(id_priv, CMA_CONNECT)) + return 0; memset(&event, 0, sizeof event); switch (ib_event->event) { @@ -942,12 +957,12 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) /* Destroy the CM ID by returning a non-zero value. 
*/ id_priv->cm_id.ib = NULL; cma_exch(id_priv, CMA_DESTROYING); - cma_release_remove(id_priv); + cma_enable_remove(id_priv); rdma_destroy_id(&id_priv->id); return ret; } out: - cma_release_remove(id_priv); + cma_enable_remove(id_priv); return ret; } @@ -1057,11 +1072,8 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) int offset, ret; listen_id = cm_id->context; - atomic_inc(&listen_id->dev_remove); - if (!cma_comp(listen_id, CMA_LISTEN)) { - ret = -ECONNABORTED; - goto out; - } + if (cma_disable_remove(listen_id, CMA_LISTEN)) + return -ECONNABORTED; memset(&event, 0, sizeof event); offset = cma_user_data_offset(listen_id->id.ps); @@ -1101,11 +1113,11 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) release_conn_id: cma_exch(conn_id, CMA_DESTROYING); - cma_release_remove(conn_id); + cma_enable_remove(conn_id); rdma_destroy_id(&conn_id->id); out: - cma_release_remove(listen_id); + cma_enable_remove(listen_id); return ret; } @@ -1214,12 +1226,12 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.iw = NULL; cma_exch(id_priv, CMA_DESTROYING); - cma_release_remove(id_priv); + cma_enable_remove(id_priv); rdma_destroy_id(&id_priv->id); return ret; } - cma_release_remove(id_priv); + cma_enable_remove(id_priv); return ret; } @@ -1234,11 +1246,8 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, int ret; listen_id = cm_id->context; - atomic_inc(&listen_id->dev_remove); - if (!cma_comp(listen_id, CMA_LISTEN)) { - ret = -ECONNABORTED; - goto out; - } + if (cma_disable_remove(listen_id, CMA_LISTEN)) + return -ECONNABORTED; /* Create a new RDMA id for the new IW CM ID */ new_cm_id = rdma_create_id(listen_id->id.event_handler, @@ -1255,13 +1264,13 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, dev = ip_dev_find(iw_event->local_addr.sin_addr.s_addr); if (!dev) { ret = -EADDRNOTAVAIL; - cma_release_remove(conn_id); + cma_enable_remove(conn_id); rdma_destroy_id(new_cm_id); goto out; } ret = rdma_copy_addr(&conn_id->id.route.addr.dev_addr, dev, NULL); if (ret) { - cma_release_remove(conn_id); + cma_enable_remove(conn_id); rdma_destroy_id(new_cm_id); goto out; } @@ -1270,7 +1279,7 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, ret = cma_acquire_dev(conn_id); mutex_unlock(&lock); if (ret) { - cma_release_remove(conn_id); + cma_enable_remove(conn_id); rdma_destroy_id(new_cm_id); goto out; } @@ -1293,14 +1302,14 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, /* User wants to destroy the CM ID */ conn_id->cm_id.iw = NULL; cma_exch(conn_id, CMA_DESTROYING); - cma_release_remove(conn_id); + cma_enable_remove(conn_id); rdma_destroy_id(&conn_id->id); } out: if (dev) dev_put(dev); - cma_release_remove(listen_id); + cma_enable_remove(listen_id); return ret; } @@ -1519,7 +1528,7 @@ static void cma_work_handler(struct work_struct *_work) destroy = 1; } out: - cma_release_remove(id_priv); + cma_enable_remove(id_priv); cma_deref_id(id_priv); if (destroy) rdma_destroy_id(&id_priv->id); @@ -1711,13 +1720,13 @@ static void addr_handler(int status, struct sockaddr *src_addr, if (id_priv->id.event_handler(&id_priv->id, &event)) { cma_exch(id_priv, CMA_DESTROYING); - cma_release_remove(id_priv); + cma_enable_remove(id_priv); cma_deref_id(id_priv); rdma_destroy_id(&id_priv->id); return; } out: - cma_release_remove(id_priv); + cma_enable_remove(id_priv); cma_deref_id(id_priv); } @@ -2042,11 +2051,10 @@ static int 
cma_sidr_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_sidr_rep_event_param *rep = &ib_event->param.sidr_rep_rcvd; int ret = 0; - memset(&event, 0, sizeof event); - atomic_inc(&id_priv->dev_remove); - if (!cma_comp(id_priv, CMA_CONNECT)) - goto out; + if (cma_disable_remove(id_priv, CMA_CONNECT)) + return 0; + memset(&event, 0, sizeof event); switch (ib_event->event) { case IB_CM_SIDR_REQ_ERROR: event.event = RDMA_CM_EVENT_UNREACHABLE; @@ -2084,12 +2092,12 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.ib = NULL; cma_exch(id_priv, CMA_DESTROYING); - cma_release_remove(id_priv); + cma_enable_remove(id_priv); rdma_destroy_id(&id_priv->id); return ret; } out: - cma_release_remove(id_priv); + cma_enable_remove(id_priv); return ret; } @@ -2499,10 +2507,9 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) int ret; id_priv = mc->id_priv; - atomic_inc(&id_priv->dev_remove); - if (!cma_comp(id_priv, CMA_ADDR_BOUND) && - !cma_comp(id_priv, CMA_ADDR_RESOLVED)) - goto out; + if (cma_disable_remove(id_priv, CMA_ADDR_BOUND) && + cma_disable_remove(id_priv, CMA_ADDR_RESOLVED)) + return 0; if (!status && id_priv->id.qp) status = ib_attach_mcast(id_priv->id.qp, &multicast->rec.mgid, @@ -2524,12 +2531,12 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) ret = id_priv->id.event_handler(&id_priv->id, &event); if (ret) { cma_exch(id_priv, CMA_DESTROYING); - cma_release_remove(id_priv); + cma_enable_remove(id_priv); rdma_destroy_id(&id_priv->id); return 0; } -out: - cma_release_remove(id_priv); + + cma_enable_remove(id_priv); return 0; } -- cgit v1.2.3-70-g09d2 From be65f086f2a50c478b2f5ecf4c55a52a4e95059a Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Mon, 7 May 2007 11:49:12 -0700 Subject: RDMA/cma: Fix synchronization with device removal in cma_iw_handler The cma_iw_handler needs to validate the state of the rdma_cm_id before processing a new connection request to ensure that a device removal is not already being processed for the same rdma_cm_id. Without the state check, the user can receive simultaneous callbacks for the same cm_id, or a callback after they've destroyed the cm_id. Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/cma.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index d026764c7e9c..cfd57b4abd03 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -1183,9 +1183,10 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) struct sockaddr_in *sin; int ret = 0; - memset(&event, 0, sizeof event); - atomic_inc(&id_priv->dev_remove); + if (cma_disable_remove(id_priv, CMA_CONNECT)) + return 0; + memset(&event, 0, sizeof event); switch (iw_event->event) { case IW_CM_EVENT_CLOSE: event.event = RDMA_CM_EVENT_DISCONNECTED; -- cgit v1.2.3-70-g09d2 From 6c719f5c6c823901fac2d46b83db5a69ba7e9152 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Mon, 7 May 2007 11:49:27 -0700 Subject: RDMA/cma: Add check to validate that cm_id is bound to a device Several checks in the rdma_cm check against the state of the cm_id, but only to validate that the cm_id is bound to an underlying transport specific CM and an RDMA device. Make the check explicit in what we're trying to check for, since we're not synchronizing against the cm_id state. 
This will allow a user to disconnect a cm_id or reject a connection after receiving a device removal event. Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/cma.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index cfd57b4abd03..2eb52b7a71da 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -368,6 +368,11 @@ static void cma_enable_remove(struct rdma_id_private *id_priv) wake_up(&id_priv->wait_remove); } +static int cma_has_cm_dev(struct rdma_id_private *id_priv) +{ + return (id_priv->id.device && id_priv->cm_id.ib); +} + struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler, void *context, enum rdma_port_space ps) { @@ -2422,7 +2427,7 @@ int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event) int ret; id_priv = container_of(id, struct rdma_id_private, id); - if (!cma_comp(id_priv, CMA_CONNECT)) + if (!cma_has_cm_dev(id_priv)) return -EINVAL; switch (id->device->node_type) { @@ -2444,7 +2449,7 @@ int rdma_reject(struct rdma_cm_id *id, const void *private_data, int ret; id_priv = container_of(id, struct rdma_id_private, id); - if (!cma_comp(id_priv, CMA_CONNECT)) + if (!cma_has_cm_dev(id_priv)) return -EINVAL; switch (rdma_node_get_transport(id->device->node_type)) { @@ -2475,8 +2480,7 @@ int rdma_disconnect(struct rdma_cm_id *id) int ret; id_priv = container_of(id, struct rdma_id_private, id); - if (!cma_comp(id_priv, CMA_CONNECT) && - !cma_comp(id_priv, CMA_DISCONNECT)) + if (!cma_has_cm_dev(id_priv)) return -EINVAL; switch (rdma_node_get_transport(id->device->node_type)) { -- cgit v1.2.3-70-g09d2 From 3e28c56b9b67347b42ba06f9a9373b408902beee Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 14 May 2007 07:26:51 +0300 Subject: IB/mthca: Fix posting >255 recv WRs for Tavor Fix posting lists of > 255 receive WRs for Tavor: rq.next_ind must be updated each doorbell, otherwise the next doorbell will use an incorrect index. Found by Ronni Zimmermann at Mellanox. Signed-off-by: Michael S. Tsirkin Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_qp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c index fee60c852d14..72fabb822f1c 100644 --- a/drivers/infiniband/hw/mthca/mthca_qp.c +++ b/drivers/infiniband/hw/mthca/mthca_qp.c @@ -1862,6 +1862,7 @@ int mthca_tavor_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, dev->kar + MTHCA_RECEIVE_DOORBELL, MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); + qp->rq.next_ind = ind; qp->rq.head += MTHCA_TAVOR_MAX_WQES_PER_RECV_DB; size0 = 0; } -- cgit v1.2.3-70-g09d2 From bd18c112774db5bb887adb981ffbe9bfe00b2f3a Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 14 May 2007 17:14:50 +0300 Subject: IB/mthca: Set cleaned CQEs back to HW ownership when cleaning CQ mthca_cq_clean() updates the CQ consumer index without moving CQEs back to HW ownership. As a result, the same WRID might get reported twice, resulting in a use-after-free. This was observed in IPoIB CM. Fix by moving all freed CQEs to HW ownership. This fixes Signed-off-by: Michael S. 
Tsirkin Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_cq.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mthca/mthca_cq.c b/drivers/infiniband/hw/mthca/mthca_cq.c index cf0868f6e965..ca224d018af2 100644 --- a/drivers/infiniband/hw/mthca/mthca_cq.c +++ b/drivers/infiniband/hw/mthca/mthca_cq.c @@ -284,7 +284,7 @@ void mthca_cq_clean(struct mthca_dev *dev, struct mthca_cq *cq, u32 qpn, { struct mthca_cqe *cqe; u32 prod_index; - int nfreed = 0; + int i, nfreed = 0; spin_lock_irq(&cq->lock); @@ -321,6 +321,8 @@ void mthca_cq_clean(struct mthca_dev *dev, struct mthca_cq *cq, u32 qpn, } if (nfreed) { + for (i = 0; i < nfreed; ++i) + set_cqe_hw(get_cqe(cq, (cq->cons_index + i) & cq->ibcq.cqe)); wmb(); cq->cons_index += nfreed; update_cons_index(dev, cq, nfreed); -- cgit v1.2.3-70-g09d2 From 7c5b9ef8577bfa7b74ea58fc9ff2934ffce13532 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 14 May 2007 07:26:51 +0300 Subject: IPoIB/cm: Optimize stale connection detection In the presence of some running RX connections, we repeat queue_delayed_work calls each 4 RX WRs, which is a waste. It's enough to start stale task when a first passive connection is added, and rerun it every IPOIB_CM_RX_DELAY as long as there are outstanding passive connections. This removes some code from RX data path. Signed-off-by: Michael S. Tsirkin Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib_cm.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 785bc8505f2a..eec833b81e9b 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -257,10 +257,11 @@ static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even cm_id->context = p; p->jiffies = jiffies; spin_lock_irq(&priv->lock); + if (list_empty(&priv->cm.passive_ids)) + queue_delayed_work(ipoib_workqueue, + &priv->cm.stale_task, IPOIB_CM_RX_DELAY); list_add(&p->list, &priv->cm.passive_ids); spin_unlock_irq(&priv->lock); - queue_delayed_work(ipoib_workqueue, - &priv->cm.stale_task, IPOIB_CM_RX_DELAY); return 0; err_rep: @@ -378,8 +379,6 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) if (!list_empty(&p->list)) list_move(&p->list, &priv->cm.passive_ids); spin_unlock_irqrestore(&priv->lock, flags); - queue_delayed_work(ipoib_workqueue, - &priv->cm.stale_task, IPOIB_CM_RX_DELAY); } } @@ -1100,6 +1099,10 @@ static void ipoib_cm_stale_task(struct work_struct *work) kfree(p); spin_lock_irq(&priv->lock); } + + if (!list_empty(&priv->cm.passive_ids)) + queue_delayed_work(ipoib_workqueue, + &priv->cm.stale_task, IPOIB_CM_RX_DELAY); spin_unlock_irq(&priv->lock); } -- cgit v1.2.3-70-g09d2 From 99fbe1ac217e8b9d4141504e879327cb4e42d4ff Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Mon, 14 May 2007 18:27:29 -0400 Subject: [CPUFREQ] Correct revision mask for powernow-k8 Mark Langsdorf points out that the correct define for this revision bump is 0x80000. Also to save us having to keep renaming the #define, give it a more meaningful name. 
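To make the bit layout concrete, a small stand-alone sketch of how the extended family/model fields are extracted from CPUID function 1 EAX and compared against the new mask (the sample EAX values are invented, and the real check_supported_cpu() also tests CPUID_USE_XFAM_XMOD):

	#include <stdio.h>
	#include <stdint.h>

	#define CPUID_XFAM		0x0ff00000	/* extended family, EAX bits 27:20 */
	#define CPUID_XFAM_K8		0
	#define CPUID_XMOD		0x000f0000	/* extended model, EAX bits 19:16 */
	#define CPUID_XMOD_REV_MASK	0x00080000	/* newest supported extended model */

	static int k8_revision_supported(uint32_t eax)
	{
		if ((eax & CPUID_XFAM) != CPUID_XFAM_K8)
			return 0;			/* extended family != 0: not a K8 */
		return (eax & CPUID_XMOD) <= CPUID_XMOD_REV_MASK;
	}

	int main(void)
	{
		printf("0x00040f12 -> %d\n", k8_revision_supported(0x00040f12)); /* ext. model 4: supported */
		printf("0x000c0f12 -> %d\n", k8_revision_supported(0x000c0f12)); /* ext. model 12: too new */
		return 0;
	}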
Signed-off-by: Dave Jones --- arch/i386/kernel/cpu/cpufreq/powernow-k8.c | 2 +- arch/i386/kernel/cpu/cpufreq/powernow-k8.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/i386/kernel/cpu/cpufreq/powernow-k8.c b/arch/i386/kernel/cpu/cpufreq/powernow-k8.c index 5c715f17a679..4ade55c5f333 100644 --- a/arch/i386/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/i386/kernel/cpu/cpufreq/powernow-k8.c @@ -521,7 +521,7 @@ static int check_supported_cpu(unsigned int cpu) if ((eax & CPUID_XFAM) == CPUID_XFAM_K8) { if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) || - ((eax & CPUID_XMOD) > CPUID_XMOD_REV_H)) { + ((eax & CPUID_XMOD) > CPUID_XMOD_REV_MASK)) { printk(KERN_INFO PFX "Processor cpuid %x not supported\n", eax); goto out; } diff --git a/arch/i386/kernel/cpu/cpufreq/powernow-k8.h b/arch/i386/kernel/cpu/cpufreq/powernow-k8.h index ae2d9da084dd..b06c812208ca 100644 --- a/arch/i386/kernel/cpu/cpufreq/powernow-k8.h +++ b/arch/i386/kernel/cpu/cpufreq/powernow-k8.h @@ -46,7 +46,7 @@ struct powernow_k8_data { #define CPUID_XFAM 0x0ff00000 /* extended family */ #define CPUID_XFAM_K8 0 #define CPUID_XMOD 0x000f0000 /* extended model */ -#define CPUID_XMOD_REV_H 0x00070000 +#define CPUID_XMOD_REV_MASK 0x00080000 #define CPUID_XFAM_10H 0x00100000 /* family 0x10 */ #define CPUID_USE_XFAM_XMOD 0x00000f00 #define CPUID_GET_MAX_CAPABILITIES 0x80000000 -- cgit v1.2.3-70-g09d2 From d47de16c7221968d3eab899d7540efa5ba77af5a Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Tue, 15 May 2007 01:40:41 -0700 Subject: fix epoll single pass code and add wait-exclusive flag Fixes the epoll single pass code. During the unlocked event delivery (to userspace) code, the poll callback can re-issue new events, and we must receive them correctly. Since we loop in a lockless fashion, we want to be O(nready), and we don't want to flash on/off the spinlock for every event, we have the poll callback to use a secondary list to queue events while we're inside the event delivery loop. The rw_semaphore has been turned into a mutex. This patch also adds the wait-exclusive flag, as suggested by Davi Arnaut. Signed-off-by: Davide Libenzi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 322 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 166 insertions(+), 156 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 1aad34ea61a4..1dbedc71a28c 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include @@ -39,14 +38,13 @@ #include #include #include -#include /* * LOCKING: * There are three level of locking required by epoll : * * 1) epmutex (mutex) - * 2) ep->sem (rw_semaphore) + * 2) ep->mtx (mutes) * 3) ep->lock (rw_lock) * * The acquire order is the one listed above, from 1 to 3. @@ -57,20 +55,20 @@ * a spinlock. During the event transfer loop (from kernel to * user space) we could end up sleeping due a copy_to_user(), so * we need a lock that will allow us to sleep. This lock is a - * read-write semaphore (ep->sem). It is acquired on read during - * the event transfer loop and in write during epoll_ctl(EPOLL_CTL_DEL) - * and during eventpoll_release_file(). Then we also need a global - * semaphore to serialize eventpoll_release_file() and ep_free(). - * This semaphore is acquired by ep_free() during the epoll file + * mutex (ep->mtx). It is acquired during the event transfer loop, + * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file(). 
+ * Then we also need a global mutex to serialize eventpoll_release_file() + * and ep_free(). + * This mutex is acquired by ep_free() during the epoll file * cleanup path and it is also acquired by eventpoll_release_file() * if a file has been pushed inside an epoll set and it is then * close()d without a previous call toepoll_ctl(EPOLL_CTL_DEL). - * It is possible to drop the "ep->sem" and to use the global - * semaphore "epmutex" (together with "ep->lock") to have it working, - * but having "ep->sem" will make the interface more scalable. + * It is possible to drop the "ep->mtx" and to use the global + * mutex "epmutex" (together with "ep->lock") to have it working, + * but having "ep->mtx" will make the interface more scalable. * Events that require holding "epmutex" are very rare, while for - * normal operations the epoll private "ep->sem" will guarantee - * a greater scalability. + * normal operations the epoll private "ep->mtx" will guarantee + * a better scalability. */ #define DEBUG_EPOLL 0 @@ -102,6 +100,8 @@ #define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event)) +#define EP_UNACTIVE_PTR ((void *) -1L) + struct epoll_filefd { struct file *file; int fd; @@ -111,7 +111,7 @@ struct epoll_filefd { * Node that is linked into the "wake_task_list" member of the "struct poll_safewake". * It is used to keep track on all tasks that are currently inside the wake_up() code * to 1) short-circuit the one coming from the same task and same wait queue head - * ( loop ) 2) allow a maximum number of epoll descriptors inclusion nesting + * (loop) 2) allow a maximum number of epoll descriptors inclusion nesting * 3) let go the ones coming from other tasks. */ struct wake_task_node { @@ -129,6 +129,48 @@ struct poll_safewake { spinlock_t lock; }; +/* + * Each file descriptor added to the eventpoll interface will + * have an entry of this type linked to the "rbr" RB tree. + */ +struct epitem { + /* RB-Tree node used to link this structure to the eventpoll rb-tree */ + struct rb_node rbn; + + /* List header used to link this structure to the eventpoll ready list */ + struct list_head rdllink; + + /* The file descriptor information this item refers to */ + struct epoll_filefd ffd; + + /* Number of active wait queue attached to poll operations */ + int nwait; + + /* List containing poll wait queues */ + struct list_head pwqlist; + + /* The "container" of this item */ + struct eventpoll *ep; + + /* The structure that describe the interested events and the source fd */ + struct epoll_event event; + + /* + * Used to keep track of the usage count of the structure. This avoids + * that the structure will desappear from underneath our processing. + */ + atomic_t usecnt; + + /* List header used to link this item to the "struct file" items list */ + struct list_head fllink; + + /* + * Works together "struct eventpoll"->ovflist in keeping the + * single linked chain of items. + */ + struct epitem *next; +}; + /* * This structure is stored inside the "private_data" member of the file * structure and rapresent the main data sructure for the eventpoll @@ -139,12 +181,12 @@ struct eventpoll { rwlock_t lock; /* - * This semaphore is used to ensure that files are not removed - * while epoll is using them. This is read-held during the event - * collection loop and it is write-held during the file cleanup - * path, the epoll file exit code and the ctl operations. + * This mutex is used to ensure that files are not removed + * while epoll is using them. 
This is held during the event + * collection loop, the file cleanup path, the epoll file exit + * code and the ctl operations. */ - struct rw_semaphore sem; + struct mutex mtx; /* Wait queue used by sys_epoll_wait() */ wait_queue_head_t wq; @@ -157,6 +199,13 @@ struct eventpoll { /* RB-Tree root used to store monitored fd structs */ struct rb_root rbr; + + /* + * This is a single linked list that chains all the "struct epitem" that + * happened while transfering ready events to userspace w/out + * holding ->lock. + */ + struct epitem *ovflist; }; /* Wait structure used by the poll hooks */ @@ -177,42 +226,6 @@ struct eppoll_entry { wait_queue_head_t *whead; }; -/* - * Each file descriptor added to the eventpoll interface will - * have an entry of this type linked to the "rbr" RB tree. - */ -struct epitem { - /* RB-Tree node used to link this structure to the eventpoll rb-tree */ - struct rb_node rbn; - - /* List header used to link this structure to the eventpoll ready list */ - struct list_head rdllink; - - /* The file descriptor information this item refers to */ - struct epoll_filefd ffd; - - /* Number of active wait queue attached to poll operations */ - int nwait; - - /* List containing poll wait queues */ - struct list_head pwqlist; - - /* The "container" of this item */ - struct eventpoll *ep; - - /* The structure that describe the interested events and the source fd */ - struct epoll_event event; - - /* - * Used to keep track of the usage count of the structure. This avoids - * that the structure will desappear from underneath our processing. - */ - atomic_t usecnt; - - /* List header used to link this item to the "struct file" items list */ - struct list_head fllink; -}; - /* Wrapper struct used by poll queueing */ struct ep_pqueue { poll_table pt; @@ -220,7 +233,7 @@ struct ep_pqueue { }; /* - * This semaphore is used to serialize ep_free() and eventpoll_release_file(). + * This mutex is used to serialize ep_free() and eventpoll_release_file(). */ static struct mutex epmutex; @@ -506,7 +519,7 @@ static void ep_free(struct eventpoll *ep) /* * We need to lock this because we could be hit by * eventpoll_release_file() while we're freeing the "struct eventpoll". - * We do not need to hold "ep->sem" here because the epoll file + * We do not need to hold "ep->mtx" here because the epoll file * is on the way to be removed and no one has references to it * anymore. The only hit might come from eventpoll_release_file() but * holding "epmutex" is sufficent here. @@ -525,7 +538,7 @@ static void ep_free(struct eventpoll *ep) /* * Walks through the whole tree by freeing each "struct epitem". At this * point we are sure no poll callbacks will be lingering around, and also by - * write-holding "sem" we can be sure that no file cleanup code will hit + * holding "epmutex" we can be sure that no file cleanup code will hit * us during this operation. So we can avoid the lock on "ep->lock". */ while ((rbp = rb_first(&ep->rbr)) != 0) { @@ -534,6 +547,8 @@ static void ep_free(struct eventpoll *ep) } mutex_unlock(&epmutex); + + mutex_destroy(&ep->mtx); } static int ep_eventpoll_release(struct inode *inode, struct file *file) @@ -594,9 +609,9 @@ void eventpoll_release_file(struct file *file) * We don't want to get "file->f_ep_lock" because it is not * necessary. It is not necessary because we're in the "struct file" * cleanup path, and this means that noone is using this file anymore. 
- * The only hit might come from ep_free() but by holding the semaphore + * The only hit might come from ep_free() but by holding the mutex * will correctly serialize the operation. We do need to acquire - * "ep->sem" after "epmutex" because ep_remove() requires it when called + * "ep->mtx" after "epmutex" because ep_remove() requires it when called * from anywhere but ep_free(). */ mutex_lock(&epmutex); @@ -606,9 +621,9 @@ void eventpoll_release_file(struct file *file) ep = epi->ep; list_del_init(&epi->fllink); - down_write(&ep->sem); + mutex_lock(&ep->mtx); ep_remove(ep, epi); - up_write(&ep->sem); + mutex_unlock(&ep->mtx); } mutex_unlock(&epmutex); @@ -622,11 +637,12 @@ static int ep_alloc(struct eventpoll **pep) return -ENOMEM; rwlock_init(&ep->lock); - init_rwsem(&ep->sem); + mutex_init(&ep->mtx); init_waitqueue_head(&ep->wq); init_waitqueue_head(&ep->poll_wait); INIT_LIST_HEAD(&ep->rdllist); ep->rbr = RB_ROOT; + ep->ovflist = EP_UNACTIVE_PTR; *pep = ep; @@ -695,7 +711,21 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k * until the next EPOLL_CTL_MOD will be issued. */ if (!(epi->event.events & ~EP_PRIVATE_BITS)) - goto is_disabled; + goto out_unlock; + + /* + * If we are trasfering events to userspace, we can hold no locks + * (because we're accessing user memory, and because of linux f_op->poll() + * semantics). All the events that happens during that period of time are + * chained in ep->ovflist and requeued later on. + */ + if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) { + if (epi->next == EP_UNACTIVE_PTR) { + epi->next = ep->ovflist; + ep->ovflist = epi; + } + goto out_unlock; + } /* If this file is already in the ready list we exit soon */ if (ep_is_linked(&epi->rdllink)) @@ -714,7 +744,7 @@ is_linked: if (waitqueue_active(&ep->poll_wait)) pwake++; -is_disabled: +out_unlock: write_unlock_irqrestore(&ep->lock, flags); /* We have to call this outside the lock */ @@ -788,6 +818,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, epi->event = *event; atomic_set(&epi->usecnt, 1); epi->nwait = 0; + epi->next = EP_UNACTIVE_PTR; /* Initialize the poll table using the queue callback */ epq.epi = epi; @@ -920,36 +951,50 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even return 0; } -/* - * This function is called without holding the "ep->lock" since the call to - * __copy_to_user() might sleep, and also f_op->poll() might reenable the IRQ - * because of the way poll() is traditionally implemented in Linux. - */ -static int ep_send_events(struct eventpoll *ep, struct list_head *txlist, - struct epoll_event __user *events, int maxevents) +static int ep_send_events(struct eventpoll *ep, struct epoll_event __user *events, + int maxevents) { int eventcnt, error = -EFAULT, pwake = 0; unsigned int revents; unsigned long flags; - struct epitem *epi; - struct list_head injlist; + struct epitem *epi, *nepi; + struct list_head txlist; + + INIT_LIST_HEAD(&txlist); - INIT_LIST_HEAD(&injlist); + /* + * We need to lock this because we could be hit by + * eventpoll_release_file() and epoll_ctl(EPOLL_CTL_DEL). + */ + mutex_lock(&ep->mtx); + + /* + * Steal the ready list, and re-init the original one to the + * empty list. Also, set ep->ovflist to NULL so that events + * happening while looping w/out locks, are not lost. We cannot + * have the poll callback to queue directly on ep->rdllist, + * because we are doing it in the loop below, in a lockless way. 
+ */ + write_lock_irqsave(&ep->lock, flags); + list_splice(&ep->rdllist, &txlist); + INIT_LIST_HEAD(&ep->rdllist); + ep->ovflist = NULL; + write_unlock_irqrestore(&ep->lock, flags); /* * We can loop without lock because this is a task private list. * We just splice'd out the ep->rdllist in ep_collect_ready_items(). - * Items cannot vanish during the loop because we are holding "sem" in - * read. + * Items cannot vanish during the loop because we are holding "mtx". */ - for (eventcnt = 0; !list_empty(txlist) && eventcnt < maxevents;) { - epi = list_first_entry(txlist, struct epitem, rdllink); - prefetch(epi->rdllink.next); + for (eventcnt = 0; !list_empty(&txlist) && eventcnt < maxevents;) { + epi = list_first_entry(&txlist, struct epitem, rdllink); + + list_del_init(&epi->rdllink); /* * Get the ready file event set. We can safely use the file - * because we are holding the "sem" in read and this will - * guarantee that both the file and the item will not vanish. + * because we are holding the "mtx" and this will guarantee + * that both the file and the item will not vanish. */ revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL); revents &= epi->event.events; @@ -957,8 +1002,8 @@ static int ep_send_events(struct eventpoll *ep, struct list_head *txlist, /* * Is the event mask intersect the caller-requested one, * deliver the event to userspace. Again, we are holding - * "sem" in read, so no operations coming from userspace - * can change the item. + * "mtx", so no operations coming from userspace can change + * the item. */ if (revents) { if (__put_user(revents, @@ -970,49 +1015,47 @@ static int ep_send_events(struct eventpoll *ep, struct list_head *txlist, epi->event.events &= EP_PRIVATE_BITS; eventcnt++; } - /* - * This is tricky. We are holding the "sem" in read, and this - * means that the operations that can change the "linked" status - * of the epoll item (epi->rbn and epi->rdllink), cannot touch - * them. Also, since we are "linked" from a epi->rdllink POV - * (the item is linked to our transmission list we just - * spliced), the ep_poll_callback() cannot touch us either, - * because of the check present in there. Another parallel - * epoll_wait() will not get the same result set, since we - * spliced the ready list before. Note that list_del() still - * shows the item as linked to the test in ep_poll_callback(). + * At this point, noone can insert into ep->rdllist besides + * us. The epoll_ctl() callers are locked out by us holding + * "mtx" and the poll callback will queue them in ep->ovflist. */ - list_del(&epi->rdllink); if (!(epi->event.events & EPOLLET) && - (revents & epi->event.events)) - list_add_tail(&epi->rdllink, &injlist); - else { - /* - * Be sure the item is totally detached before re-init - * the list_head. After INIT_LIST_HEAD() is committed, - * the ep_poll_callback() can requeue the item again, - * but we don't care since we are already past it. - */ - smp_mb(); - INIT_LIST_HEAD(&epi->rdllink); - } + (revents & epi->event.events)) + list_add_tail(&epi->rdllink, &ep->rdllist); } error = 0; - errxit: +errxit: + write_lock_irqsave(&ep->lock, flags); + /* + * During the time we spent in the loop above, some other events + * might have been queued by the poll callback. We re-insert them + * here (in case they are not already queued, or they're one-shot). 
+ */ + for (nepi = ep->ovflist; (epi = nepi) != NULL; + nepi = epi->next, epi->next = EP_UNACTIVE_PTR) { + if (!ep_is_linked(&epi->rdllink) && + (epi->event.events & ~EP_PRIVATE_BITS)) + list_add_tail(&epi->rdllink, &ep->rdllist); + } /* - * If the re-injection list or the txlist are not empty, re-splice - * them to the ready list and do proper wakeups. + * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after + * releasing the lock, events will be queued in the normal way inside + * ep->rdllist. */ - if (!list_empty(&injlist) || !list_empty(txlist)) { - write_lock_irqsave(&ep->lock, flags); + ep->ovflist = EP_UNACTIVE_PTR; - list_splice(txlist, &ep->rdllist); - list_splice(&injlist, &ep->rdllist); + /* + * In case of error in the event-send loop, we might still have items + * inside the "txlist". We need to splice them back inside ep->rdllist. + */ + list_splice(&txlist, &ep->rdllist); + + if (!list_empty(&ep->rdllist)) { /* - * Wake up ( if active ) both the eventpoll wait list and the ->poll() + * Wake up (if active) both the eventpoll wait list and the ->poll() * wait list. */ if (waitqueue_active(&ep->wq)) @@ -1020,9 +1063,10 @@ static int ep_send_events(struct eventpoll *ep, struct list_head *txlist, TASK_INTERRUPTIBLE); if (waitqueue_active(&ep->poll_wait)) pwake++; - - write_unlock_irqrestore(&ep->lock, flags); } + write_unlock_irqrestore(&ep->lock, flags); + + mutex_unlock(&ep->mtx); /* We have to call this outside the lock */ if (pwake) @@ -1031,41 +1075,6 @@ static int ep_send_events(struct eventpoll *ep, struct list_head *txlist, return eventcnt == 0 ? error: eventcnt; } -/* - * Perform the transfer of events to user space. - */ -static int ep_events_transfer(struct eventpoll *ep, - struct epoll_event __user *events, int maxevents) -{ - int eventcnt; - unsigned long flags; - struct list_head txlist; - - INIT_LIST_HEAD(&txlist); - - /* - * We need to lock this because we could be hit by - * eventpoll_release_file() and epoll_ctl(EPOLL_CTL_DEL). - */ - down_read(&ep->sem); - - /* - * Steal the ready list, and re-init the original one to the - * empty list. - */ - write_lock_irqsave(&ep->lock, flags); - list_splice(&ep->rdllist, &txlist); - INIT_LIST_HEAD(&ep->rdllist); - write_unlock_irqrestore(&ep->lock, flags); - - /* Build result set in userspace */ - eventcnt = ep_send_events(ep, &txlist, events, maxevents); - - up_read(&ep->sem); - - return eventcnt; -} - static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, int maxevents, long timeout) { @@ -1093,6 +1102,7 @@ retry: * ep_poll_callback() when events will become available. */ init_waitqueue_entry(&wait, current); + wait.flags |= WQ_FLAG_EXCLUSIVE; __add_wait_queue(&ep->wq, &wait); for (;;) { @@ -1129,7 +1139,7 @@ retry: * more luck. 
*/ if (!res && eavail && - !(res = ep_events_transfer(ep, events, maxevents)) && jtimeout) + !(res = ep_send_events(ep, events, maxevents)) && jtimeout) goto retry; return res; @@ -1237,7 +1247,7 @@ asmlinkage long sys_epoll_ctl(int epfd, int op, int fd, */ ep = file->private_data; - down_write(&ep->sem); + mutex_lock(&ep->mtx); /* Try to lookup the file inside our RB tree */ epi = ep_find(ep, tfile, fd); @@ -1272,7 +1282,7 @@ asmlinkage long sys_epoll_ctl(int epfd, int op, int fd, */ if (epi) ep_release_epitem(epi); - up_write(&ep->sem); + mutex_unlock(&ep->mtx); error_tgt_fput: fput(tfile); -- cgit v1.2.3-70-g09d2 From c7ea76302547f81e4583d0d7c52a1c37c6747f5d Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Tue, 15 May 2007 01:40:47 -0700 Subject: epoll locks changes and cleanups Changes the rwlock to a spinlock, and drops the use-count variable. Operations are always bound by the mutex now, so the use-count is no more needed. For the same reason, the rwlock can become a simple spinlock. Signed-off-by: Davide Libenzi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 234 ++++++++++++++++++--------------------------------------- 1 file changed, 73 insertions(+), 161 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 1dbedc71a28c..4c16127c96be 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1,6 +1,6 @@ /* - * fs/eventpoll.c ( Efficent event polling implementation ) - * Copyright (C) 2001,...,2006 Davide Libenzi + * fs/eventpoll.c (Efficent event polling implementation) + * Copyright (C) 2001,...,2007 Davide Libenzi * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -44,8 +44,8 @@ * There are three level of locking required by epoll : * * 1) epmutex (mutex) - * 2) ep->mtx (mutes) - * 3) ep->lock (rw_lock) + * 2) ep->mtx (mutex) + * 3) ep->lock (spinlock) * * The acquire order is the one listed above, from 1 to 3. * We need a spinlock (ep->lock) because we manipulate objects @@ -140,6 +140,12 @@ struct epitem { /* List header used to link this structure to the eventpoll ready list */ struct list_head rdllink; + /* + * Works together "struct eventpoll"->ovflist in keeping the + * single linked chain of items. + */ + struct epitem *next; + /* The file descriptor information this item refers to */ struct epoll_filefd ffd; @@ -152,23 +158,11 @@ struct epitem { /* The "container" of this item */ struct eventpoll *ep; - /* The structure that describe the interested events and the source fd */ - struct epoll_event event; - - /* - * Used to keep track of the usage count of the structure. This avoids - * that the structure will desappear from underneath our processing. - */ - atomic_t usecnt; - /* List header used to link this item to the "struct file" items list */ struct list_head fllink; - /* - * Works together "struct eventpoll"->ovflist in keeping the - * single linked chain of items. - */ - struct epitem *next; + /* The structure that describe the interested events and the source fd */ + struct epoll_event event; }; /* @@ -178,7 +172,7 @@ struct epitem { */ struct eventpoll { /* Protect the this structure access */ - rwlock_t lock; + spinlock_t lock; /* * This mutex is used to ensure that files are not removed @@ -393,79 +387,12 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) } } -/* - * Unlink the "struct epitem" from all places it might have been hooked up. 
- * This function must be called with write IRQ lock on "ep->lock". - */ -static int ep_unlink(struct eventpoll *ep, struct epitem *epi) -{ - int error; - - /* - * It can happen that this one is called for an item already unlinked. - * The check protect us from doing a double unlink ( crash ). - */ - error = -ENOENT; - if (!ep_rb_linked(&epi->rbn)) - goto error_return; - - /* - * Clear the event mask for the unlinked item. This will avoid item - * notifications to be sent after the unlink operation from inside - * the kernel->userspace event transfer loop. - */ - epi->event.events = 0; - - /* - * At this point is safe to do the job, unlink the item from our rb-tree. - * This operation togheter with the above check closes the door to - * double unlinks. - */ - ep_rb_erase(&epi->rbn, &ep->rbr); - - /* - * If the item we are going to remove is inside the ready file descriptors - * we want to remove it from this list to avoid stale events. - */ - if (ep_is_linked(&epi->rdllink)) - list_del_init(&epi->rdllink); - - error = 0; -error_return: - - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_unlink(%p, %p) = %d\n", - current, ep, epi->ffd.file, error)); - - return error; -} - -/* - * Increment the usage count of the "struct epitem" making it sure - * that the user will have a valid pointer to reference. - */ -static void ep_use_epitem(struct epitem *epi) -{ - atomic_inc(&epi->usecnt); -} - -/* - * Decrement ( release ) the usage count by signaling that the user - * has finished using the structure. It might lead to freeing the - * structure itself if the count goes to zero. - */ -static void ep_release_epitem(struct epitem *epi) -{ - if (atomic_dec_and_test(&epi->usecnt)) - kmem_cache_free(epi_cache, epi); -} - /* * Removes a "struct epitem" from the eventpoll RB tree and deallocates - * all the associated resources. + * all the associated resources. Must be called with "mtx" held. 
*/ static int ep_remove(struct eventpoll *ep, struct epitem *epi) { - int error; unsigned long flags; struct file *file = epi->ffd.file; @@ -485,26 +412,21 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) list_del_init(&epi->fllink); spin_unlock(&file->f_ep_lock); - /* We need to acquire the write IRQ lock before calling ep_unlink() */ - write_lock_irqsave(&ep->lock, flags); - - /* Really unlink the item from the RB tree */ - error = ep_unlink(ep, epi); - - write_unlock_irqrestore(&ep->lock, flags); + if (ep_rb_linked(&epi->rbn)) + ep_rb_erase(&epi->rbn, &ep->rbr); - if (error) - goto error_return; + spin_lock_irqsave(&ep->lock, flags); + if (ep_is_linked(&epi->rdllink)) + list_del_init(&epi->rdllink); + spin_unlock_irqrestore(&ep->lock, flags); /* At this point it is safe to free the eventpoll item */ - ep_release_epitem(epi); + kmem_cache_free(epi_cache, epi); - error = 0; -error_return: - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p) = %d\n", - current, ep, file, error)); + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p)\n", + current, ep, file)); - return error; + return 0; } static void ep_free(struct eventpoll *ep) @@ -574,10 +496,10 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) poll_wait(file, &ep->poll_wait, wait); /* Check our condition */ - read_lock_irqsave(&ep->lock, flags); + spin_lock_irqsave(&ep->lock, flags); if (!list_empty(&ep->rdllist)) pollflags = POLLIN | POLLRDNORM; - read_unlock_irqrestore(&ep->lock, flags); + spin_unlock_irqrestore(&ep->lock, flags); return pollflags; } @@ -636,7 +558,7 @@ static int ep_alloc(struct eventpoll **pep) if (!ep) return -ENOMEM; - rwlock_init(&ep->lock); + spin_lock_init(&ep->lock); mutex_init(&ep->mtx); init_waitqueue_head(&ep->wq); init_waitqueue_head(&ep->poll_wait); @@ -652,20 +574,18 @@ static int ep_alloc(struct eventpoll **pep) } /* - * Search the file inside the eventpoll tree. It add usage count to - * the returned item, so the caller must call ep_release_epitem() - * after finished using the "struct epitem". + * Search the file inside the eventpoll tree. The RB tree operations + * are protected by the "mtx" mutex, and ep_find() must be called with + * "mtx" held. 
*/ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) { int kcmp; - unsigned long flags; struct rb_node *rbp; struct epitem *epi, *epir = NULL; struct epoll_filefd ffd; ep_set_ffd(&ffd, file, fd); - read_lock_irqsave(&ep->lock, flags); for (rbp = ep->rbr.rb_node; rbp; ) { epi = rb_entry(rbp, struct epitem, rbn); kcmp = ep_cmp_ffd(&ffd, &epi->ffd); @@ -674,12 +594,10 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) else if (kcmp < 0) rbp = rbp->rb_left; else { - ep_use_epitem(epi); epir = epi; break; } } - read_unlock_irqrestore(&ep->lock, flags); DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_find(%p) -> %p\n", current, file, epir)); @@ -702,7 +620,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n", current, epi->ffd.file, epi, ep)); - write_lock_irqsave(&ep->lock, flags); + spin_lock_irqsave(&ep->lock, flags); /* * If the event mask does not contain any poll(2) event, we consider the @@ -745,7 +663,7 @@ is_linked: pwake++; out_unlock: - write_unlock_irqrestore(&ep->lock, flags); + spin_unlock_irqrestore(&ep->lock, flags); /* We have to call this outside the lock */ if (pwake) @@ -796,6 +714,9 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) rb_insert_color(&epi->rbn, &ep->rbr); } +/* + * Must be called with "mtx" held. + */ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, struct file *tfile, int fd) { @@ -816,7 +737,6 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, epi->ep = ep; ep_set_ffd(&epi->ffd, tfile, fd); epi->event = *event; - atomic_set(&epi->usecnt, 1); epi->nwait = 0; epi->next = EP_UNACTIVE_PTR; @@ -827,7 +747,9 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, /* * Attach the item to the poll hooks and get current event bits. * We can safely use the file* here because its usage count has - * been increased by the caller of this function. + * been increased by the caller of this function. Note that after + * this operation completes, the poll callback can start hitting + * the new item. */ revents = tfile->f_op->poll(tfile, &epq.pt); @@ -844,12 +766,15 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, list_add_tail(&epi->fllink, &tfile->f_ep_links); spin_unlock(&tfile->f_ep_lock); - /* We have to drop the new item inside our item list to keep track of it */ - write_lock_irqsave(&ep->lock, flags); - - /* Add the current item to the rb-tree */ + /* + * Add the current item to the RB tree. All RB tree operations are + * protected by "mtx", and ep_insert() is called with "mtx" held. + */ ep_rbtree_insert(ep, epi); + /* We have to drop the new item inside our item list to keep track of it */ + spin_lock_irqsave(&ep->lock, flags); + /* If the file is already "ready" we drop it inside the ready list */ if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); @@ -861,7 +786,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, pwake++; } - write_unlock_irqrestore(&ep->lock, flags); + spin_unlock_irqrestore(&ep->lock, flags); /* We have to call this outside the lock */ if (pwake) @@ -879,10 +804,10 @@ error_unregister: * We need to do this because an event could have been arrived on some * allocated wait queue. 
*/ - write_lock_irqsave(&ep->lock, flags); + spin_lock_irqsave(&ep->lock, flags); if (ep_is_linked(&epi->rdllink)) list_del_init(&epi->rdllink); - write_unlock_irqrestore(&ep->lock, flags); + spin_unlock_irqrestore(&ep->lock, flags); kmem_cache_free(epi_cache, epi); error_return: @@ -891,7 +816,7 @@ error_return: /* * Modify the interest event mask by dropping an event if the new mask - * has a match in the current file status. + * has a match in the current file status. Must be called with "mtx" held. */ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_event *event) { @@ -913,36 +838,29 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even */ revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL); - write_lock_irqsave(&ep->lock, flags); + spin_lock_irqsave(&ep->lock, flags); /* Copy the data member from inside the lock */ epi->event.data = event->data; /* - * If the item is not linked to the RB tree it means that it's on its - * way toward the removal. Do nothing in this case. + * If the item is "hot" and it is not registered inside the ready + * list, push it inside. If the item is not "hot" and it is currently + * registered inside the ready list, unlink it. */ - if (ep_rb_linked(&epi->rbn)) { - /* - * If the item is "hot" and it is not registered inside the ready - * list, push it inside. If the item is not "hot" and it is currently - * registered inside the ready list, unlink it. - */ - if (revents & event->events) { - if (!ep_is_linked(&epi->rdllink)) { - list_add_tail(&epi->rdllink, &ep->rdllist); - - /* Notify waiting tasks that events are available */ - if (waitqueue_active(&ep->wq)) - __wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE | - TASK_INTERRUPTIBLE); - if (waitqueue_active(&ep->poll_wait)) - pwake++; - } + if (revents & event->events) { + if (!ep_is_linked(&epi->rdllink)) { + list_add_tail(&epi->rdllink, &ep->rdllist); + + /* Notify waiting tasks that events are available */ + if (waitqueue_active(&ep->wq)) + __wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE | + TASK_INTERRUPTIBLE); + if (waitqueue_active(&ep->poll_wait)) + pwake++; } } - - write_unlock_irqrestore(&ep->lock, flags); + spin_unlock_irqrestore(&ep->lock, flags); /* We have to call this outside the lock */ if (pwake) @@ -975,11 +893,11 @@ static int ep_send_events(struct eventpoll *ep, struct epoll_event __user *event * have the poll callback to queue directly on ep->rdllist, * because we are doing it in the loop below, in a lockless way. */ - write_lock_irqsave(&ep->lock, flags); + spin_lock_irqsave(&ep->lock, flags); list_splice(&ep->rdllist, &txlist); INIT_LIST_HEAD(&ep->rdllist); ep->ovflist = NULL; - write_unlock_irqrestore(&ep->lock, flags); + spin_unlock_irqrestore(&ep->lock, flags); /* * We can loop without lock because this is a task private list. @@ -1028,7 +946,7 @@ static int ep_send_events(struct eventpoll *ep, struct epoll_event __user *event errxit: - write_lock_irqsave(&ep->lock, flags); + spin_lock_irqsave(&ep->lock, flags); /* * During the time we spent in the loop above, some other events * might have been queued by the poll callback. 
We re-insert them @@ -1064,7 +982,7 @@ errxit: if (waitqueue_active(&ep->poll_wait)) pwake++; } - write_unlock_irqrestore(&ep->lock, flags); + spin_unlock_irqrestore(&ep->lock, flags); mutex_unlock(&ep->mtx); @@ -1092,7 +1010,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, MAX_SCHEDULE_TIMEOUT : (timeout * HZ + 999) / 1000; retry: - write_lock_irqsave(&ep->lock, flags); + spin_lock_irqsave(&ep->lock, flags); res = 0; if (list_empty(&ep->rdllist)) { @@ -1119,9 +1037,9 @@ retry: break; } - write_unlock_irqrestore(&ep->lock, flags); + spin_unlock_irqrestore(&ep->lock, flags); jtimeout = schedule_timeout(jtimeout); - write_lock_irqsave(&ep->lock, flags); + spin_lock_irqsave(&ep->lock, flags); } __remove_wait_queue(&ep->wq, &wait); @@ -1131,7 +1049,7 @@ retry: /* Is it worth to try to dig for events ? */ eavail = !list_empty(&ep->rdllist); - write_unlock_irqrestore(&ep->lock, flags); + spin_unlock_irqrestore(&ep->lock, flags); /* * Try to transfer events to user space. In case we get 0 events and @@ -1276,12 +1194,6 @@ asmlinkage long sys_epoll_ctl(int epfd, int op, int fd, error = -ENOENT; break; } - /* - * The function ep_find() increments the usage count of the structure - * so, if this is not NULL, we need to release it. - */ - if (epi) - ep_release_epitem(epi); mutex_unlock(&ep->mtx); error_tgt_fput: @@ -1388,7 +1300,7 @@ asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events, if (sigmask) { if (error == -EINTR) { memcpy(¤t->saved_sigmask, &sigsaved, - sizeof(sigsaved)); + sizeof(sigsaved)); set_thread_flag(TIF_RESTORE_SIGMASK); } else sigprocmask(SIG_SETMASK, &sigsaved, NULL); -- cgit v1.2.3-70-g09d2 From 67647d0fb8bc03609d045a9cce85f7ef6d763036 Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Tue, 15 May 2007 01:40:52 -0700 Subject: epoll: fix some comments Fixes some epoll code comments. Signed-off-by: Davide Libenzi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 48 +++++++++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 4c16127c96be..d4255be29bc6 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -134,7 +134,7 @@ struct poll_safewake { * have an entry of this type linked to the "rbr" RB tree. */ struct epitem { - /* RB-Tree node used to link this structure to the eventpoll rb-tree */ + /* RB tree node used to link this structure to the eventpoll RB tree */ struct rb_node rbn; /* List header used to link this structure to the eventpoll ready list */ @@ -191,7 +191,7 @@ struct eventpoll { /* List of ready file descriptors */ struct list_head rdllist; - /* RB-Tree root used to store monitored fd structs */ + /* RB tree root used to store monitored fd structs */ struct rb_root rbr; /* @@ -241,7 +241,7 @@ static struct kmem_cache *epi_cache __read_mostly; static struct kmem_cache *pwq_cache __read_mostly; -/* Setup the structure that is used as key for the rb-tree */ +/* Setup the structure that is used as key for the RB tree */ static inline void ep_set_ffd(struct epoll_filefd *ffd, struct file *file, int fd) { @@ -249,7 +249,7 @@ static inline void ep_set_ffd(struct epoll_filefd *ffd, ffd->fd = fd; } -/* Compare rb-tree keys */ +/* Compare RB tree keys */ static inline int ep_cmp_ffd(struct epoll_filefd *p1, struct epoll_filefd *p2) { @@ -257,20 +257,20 @@ static inline int ep_cmp_ffd(struct epoll_filefd *p1, (p1->file < p2->file ? 
-1 : p1->fd - p2->fd)); } -/* Special initialization for the rb-tree node to detect linkage */ +/* Special initialization for the RB tree node to detect linkage */ static inline void ep_rb_initnode(struct rb_node *n) { rb_set_parent(n, n); } -/* Removes a node from the rb-tree and marks it for a fast is-linked check */ +/* Removes a node from the RB tree and marks it for a fast is-linked check */ static inline void ep_rb_erase(struct rb_node *n, struct rb_root *r) { rb_erase(n, r); rb_set_parent(n, n); } -/* Fast check to verify that the item is linked to the main rb-tree */ +/* Fast check to verify that the item is linked to the main RB tree */ static inline int ep_rb_linked(struct rb_node *n) { return rb_parent(n) != n; @@ -531,6 +531,8 @@ void eventpoll_release_file(struct file *file) * We don't want to get "file->f_ep_lock" because it is not * necessary. It is not necessary because we're in the "struct file" * cleanup path, and this means that noone is using this file anymore. + * So, for example, epoll_ctl() cannot hit here sicne if we reach this + * point, the file counter already went to zero and fget() would fail. * The only hit might come from ep_free() but by holding the mutex * will correctly serialize the operation. We do need to acquire * "ep->mtx" after "epmutex" because ep_remove() requires it when called @@ -802,7 +804,9 @@ error_unregister: /* * We need to do this because an event could have been arrived on some - * allocated wait queue. + * allocated wait queue. Note that we don't care about the ep->ovflist + * list, since that is used/cleaned only inside a section bound by "mtx". + * And ep_insert() is called with "mtx" held. */ spin_lock_irqsave(&ep->lock, flags); if (ep_is_linked(&epi->rdllink)) @@ -845,8 +849,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even /* * If the item is "hot" and it is not registered inside the ready - * list, push it inside. If the item is not "hot" and it is currently - * registered inside the ready list, unlink it. + * list, push it inside. */ if (revents & event->events) { if (!ep_is_linked(&epi->rdllink)) { @@ -966,15 +969,16 @@ errxit: ep->ovflist = EP_UNACTIVE_PTR; /* - * In case of error in the event-send loop, we might still have items - * inside the "txlist". We need to splice them back inside ep->rdllist. + * In case of error in the event-send loop, or in case the number of + * ready events exceeds the userspace limit, we need to splice the + * "txlist" back inside ep->rdllist. */ list_splice(&txlist, &ep->rdllist); if (!list_empty(&ep->rdllist)) { /* * Wake up (if active) both the eventpoll wait list and the ->poll() - * wait list. + * wait list (delayed after we release the lock). */ if (waitqueue_active(&ep->wq)) __wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE | @@ -1064,11 +1068,10 @@ retry: } /* - * It opens an eventpoll file descriptor by suggesting a storage of "size" - * file descriptors. The size parameter is just an hint about how to size - * data structures. It won't prevent the user to store more than "size" - * file descriptors inside the epoll interface. It is the kernel part of - * the userspace epoll_create(2). + * It opens an eventpoll file descriptor. The "size" parameter is there + * for historical reasons, when epoll was using an hash instead of an + * RB tree. With the current implementation, the "size" parameter is ignored + * (besides sanity checks). 
*/ asmlinkage long sys_epoll_create(int size) { @@ -1114,8 +1117,7 @@ error_return: /* * The following function implements the controller interface for * the eventpoll file that enables the insertion/removal/change of - * file descriptors inside the interest set. It represents - * the kernel part of the user space epoll_ctl(2). + * file descriptors inside the interest set. */ asmlinkage long sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event) @@ -1167,7 +1169,11 @@ asmlinkage long sys_epoll_ctl(int epfd, int op, int fd, mutex_lock(&ep->mtx); - /* Try to lookup the file inside our RB tree */ + /* + * Try to lookup the file inside our RB tree, Since we grabbed "mtx" + * above, we can be sure to be able to use the item looked up by + * ep_find() till we release the mutex. + */ epi = ep_find(ep, tfile, fd); error = -EINVAL; -- cgit v1.2.3-70-g09d2 From f0ee9aabb0520adea5937855a9575c08a97b16e7 Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Tue, 15 May 2007 01:40:57 -0700 Subject: epoll: move kfree inside ep_free Move the kfree() call inside the ep_free() function. Signed-off-by: Davide Libenzi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index d4255be29bc6..0b73cd45a06d 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -469,18 +469,16 @@ static void ep_free(struct eventpoll *ep) } mutex_unlock(&epmutex); - mutex_destroy(&ep->mtx); + kfree(ep); } static int ep_eventpoll_release(struct inode *inode, struct file *file) { struct eventpoll *ep = file->private_data; - if (ep) { + if (ep) ep_free(ep); - kfree(ep); - } DNPRINTK(3, (KERN_INFO "[%p] eventpoll: close() ep=%p\n", current, ep)); return 0; @@ -1107,7 +1105,6 @@ asmlinkage long sys_epoll_create(int size) error_free: ep_free(ep); - kfree(ep); error_return: DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n", current, size, error)); -- cgit v1.2.3-70-g09d2 From 218f0aaee8a6b0e5772b95b154dea5b7701b33aa Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Tue, 15 May 2007 01:41:02 -0700 Subject: nommu: add ioremap_page_range() lib/ioremap.c is presently only built in if CONFIG_MMU is set. While this is reasonable, platforms that support both CONFIG_MMU=y or n need to be able to call in to this regardless. As none of the current nommu platforms do anything special with ioremap(), we assume that it's always successful. This fixes the SH-4 build with CONFIG_MMU=n. Signed-off-by: Paul Mundt Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/io.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/linux/io.h b/include/linux/io.h index 09d351236379..8423dd376514 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -27,8 +27,16 @@ struct device; void __iowrite32_copy(void __iomem *to, const void *from, size_t count); void __iowrite64_copy(void __iomem *to, const void *from, size_t count); +#ifdef CONFIG_MMU int ioremap_page_range(unsigned long addr, unsigned long end, unsigned long phys_addr, pgprot_t prot); +#else +static inline int ioremap_page_range(unsigned long addr, unsigned long end, + unsigned long phys_addr, pgprot_t prot) +{ + return 0; +} +#endif /* * Managed iomap interface -- cgit v1.2.3-70-g09d2 From b67405bbbba6bbd28dfd5337b29d5bc5a1140afb Mon Sep 17 00:00:00 2001 From: Yoshinori Sato Date: Tue, 15 May 2007 01:41:07 -0700 Subject: h8300 atomic.h update add atomic_sub_and_test define. 
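[Editorial note, not part of the patch] For context, the macro being added behaves as follows: atomic_sub_and_test(i, v) subtracts i from the atomic counter v and returns true only when the result reaches zero. A minimal usage sketch, assuming the usual <asm/atomic.h>/<linux/slab.h> includes; the structure and field names below are hypothetical and not taken from this patch:

struct my_obj {
	atomic_t refcount;
	/* ... payload ... */
};

static void put_obj(struct my_obj *obj)
{
	/* atomic_sub_and_test() returns true only when the count drops to zero */
	if (atomic_sub_and_test(1, &obj->refcount))
		kfree(obj);
}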
Signed-off-by: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-h8300/atomic.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/asm-h8300/atomic.h b/include/asm-h8300/atomic.h index 21f54428c86b..b4cf0ea97ede 100644 --- a/include/asm-h8300/atomic.h +++ b/include/asm-h8300/atomic.h @@ -37,6 +37,7 @@ static __inline__ int atomic_sub_return(int i, atomic_t *v) } #define atomic_sub(i, v) atomic_sub_return(i, v) +#define atomic_sub_and_test(i,v) (atomic_sub_return(i, v) == 0) static __inline__ int atomic_inc_return(atomic_t *v) { -- cgit v1.2.3-70-g09d2 From 0fcdf96ca95f81a0e1fd91a2de16dc67c641c958 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Tue, 15 May 2007 01:41:15 -0700 Subject: alpha: fix hard_smp_processor_id compile error "Remove hardcoding of hard_smp_processor_id on UP systems", 2f4dfe206a2fc07099dfad77a8ea2f4b4ae2140f in Linus' tree, moved the definition of hard_smp_processor_id linux/smp.h to asm/smp.h for UP systems. This causes a regression on Alpha. cc1: warnings being treated as errors arch/alpha/kernel/setup.c: In function 'setup_arch': arch/alpha/kernel/setup.c:506: warning: implicit declaration of function 'hard_smp_processor_id' make[1]: *** [arch/alpha/kernel/setup.o] error 1 make: *** [arch/alpha/kernel] error 2 By including asm/smp.h non-conditionally in asm/mmu_context.h the problem appears to be resolved. Cc: Fernando Luis Vazquez Cao Signed-off-by: Simon Horman Cc: Richard Henderson Cc: Ivan Kokshaysky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-alpha/mmu_context.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-alpha/mmu_context.h b/include/asm-alpha/mmu_context.h index 0bd7bd2ccb90..6a5be1f7debf 100644 --- a/include/asm-alpha/mmu_context.h +++ b/include/asm-alpha/mmu_context.h @@ -85,8 +85,8 @@ __reload_thread(struct pcb_struct *pcb) * +-------------+----------------+--------------+ */ -#ifdef CONFIG_SMP #include +#ifdef CONFIG_SMP #define cpu_last_asn(cpuid) (cpu_data[cpuid].last_asn) #else extern unsigned long last_asn; -- cgit v1.2.3-70-g09d2 From 3c46bdcaec53eda069a8a9cd60621c7431aa7842 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 15 May 2007 01:41:29 -0700 Subject: m68k: implement __clear_user() m68k: implement __clear_user(), which is needed by fs/signalfd.c Since we always let the MMU do all checking, clear_user() and __clear_user() are identical. The old clear_user() is renamed to __clear_user() for consistency. 
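[Editorial note, not part of the patch] As a rough illustration of the interface being implemented: clear_user()/__clear_user() zero a range of user memory and return the number of bytes that could not be cleared (nonzero on fault). The helper below is hypothetical, shown only to make the calling convention concrete:

/* Zero-pad the unused tail of a user buffer; 0 on success, -EFAULT on fault. */
static int zero_tail(char __user *buf, size_t used, size_t total)
{
	if (clear_user(buf + used, total - used))
		return -EFAULT;
	return 0;
}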
Signed-off-by: Geert Uytterhoeven Cc: Davide Libenzi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/m68k/lib/uaccess.c | 4 ++-- include/asm-m68k/uaccess.h | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/m68k/lib/uaccess.c b/arch/m68k/lib/uaccess.c index 865f9fb9e686..13854ed8cd9a 100644 --- a/arch/m68k/lib/uaccess.c +++ b/arch/m68k/lib/uaccess.c @@ -181,7 +181,7 @@ EXPORT_SYMBOL(strnlen_user); * Zero Userspace */ -unsigned long clear_user(void __user *to, unsigned long n) +unsigned long __clear_user(void __user *to, unsigned long n) { unsigned long res; @@ -219,4 +219,4 @@ unsigned long clear_user(void __user *to, unsigned long n) return res; } -EXPORT_SYMBOL(clear_user); +EXPORT_SYMBOL(__clear_user); diff --git a/include/asm-m68k/uaccess.h b/include/asm-m68k/uaccess.h index 6a4cf2081512..5c1264cf0c65 100644 --- a/include/asm-m68k/uaccess.h +++ b/include/asm-m68k/uaccess.h @@ -361,7 +361,9 @@ __constant_copy_to_user(void __user *to, const void *from, unsigned long n) long strncpy_from_user(char *dst, const char __user *src, long count); long strnlen_user(const char __user *src, long n); -unsigned long clear_user(void __user *to, unsigned long n); +unsigned long __clear_user(void __user *to, unsigned long n); + +#define clear_user __clear_user #define strlen_user(str) strnlen_user(str, 32767) -- cgit v1.2.3-70-g09d2 From 8f89441b37536fea92b1ed7004e5e2dda011473d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 15 May 2007 01:41:32 -0700 Subject: clocksource: fix lock order in the resume path lockdep complains about the lock nesting of clocksource and watchdog lock in the resume path. Change the resume marker to a bit operation and remove the lock from this path. Signed-off-by: Thomas Gleixner Cc: john stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/clocksource.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 3db5c3c460d7..51b6a6a6158c 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -74,7 +74,7 @@ static struct clocksource *watchdog; static struct timer_list watchdog_timer; static DEFINE_SPINLOCK(watchdog_lock); static cycle_t watchdog_last; -static int watchdog_resumed; +static unsigned long watchdog_resumed; /* * Interval: 0.5sec Threshold: 0.0625s @@ -104,9 +104,7 @@ static void clocksource_watchdog(unsigned long data) spin_lock(&watchdog_lock); - resumed = watchdog_resumed; - if (unlikely(resumed)) - watchdog_resumed = 0; + resumed = test_and_clear_bit(0, &watchdog_resumed); wdnow = watchdog->read(); wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask); @@ -151,9 +149,7 @@ static void clocksource_watchdog(unsigned long data) } static void clocksource_resume_watchdog(void) { - spin_lock(&watchdog_lock); - watchdog_resumed = 1; - spin_unlock(&watchdog_lock); + set_bit(0, &watchdog_resumed); } static void clocksource_check_watchdog(struct clocksource *cs) -- cgit v1.2.3-70-g09d2 From 3bd2aad2103314a0a09614dc29926a1437db02f7 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Tue, 15 May 2007 01:41:36 -0700 Subject: Revert "MAINTAINERS: remove invalid list address for TPM" This reverts commit b6d1c9a44744224d83125a5a89c1a6cc4db27361. Others tell me that this address has worked for them, so I can only assume that I hit a glitch in the sourceforge mail system. 
Signed-off-by: Stephen Rothwell Cc: Kylene Hall Cc: Marcel Selhorst Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 21f3fff5432f..bbeb5b6b5b05 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3267,6 +3267,7 @@ W: http://tpmdd.sourceforge.net P: Marcel Selhorst M: tpm@selhorst.net W: http://www.prosec.rub.de/tpm/ +L: tpmdd-devel@lists.sourceforge.net S: Maintained Telecom Clock Driver for MCPL0010 -- cgit v1.2.3-70-g09d2 From 838c41184fee5e151c09972f2ba90c16493af614 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Tue, 15 May 2007 01:41:43 -0700 Subject: Remove cpu hotplug defines for __INIT & __INITDATA After examining what was checked in and the code base I discovered that most of 86c0baf123e474b6eb404798926ecf62b426bf3a wasn't necessary anymore.... So here's a patch that reverts the last part of that changeset: Revert part of 86c0baf123e474b6eb404798926ecf62b426bf3a. The kernel has moved forward to a state where the original change is not necessary. After porting forward, this final version of the patch was applied and broke non-x86 architectures. Signed-off-by: Prarit Bhargava Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/init.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/include/linux/init.h b/include/linux/init.h index 8bc32bb2fce2..e007ae4dc41e 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -52,14 +52,9 @@ #endif /* For assembly routines */ -#ifdef CONFIG_HOTPLUG_CPU -#define __INIT .section ".text","ax" -#define __INITDATA .section ".data","aw" -#else #define __INIT .section ".init.text","ax" -#define __INITDATA .section ".init.data","aw" -#endif #define __FINIT .previous +#define __INITDATA .section ".init.data","aw" #ifndef __ASSEMBLY__ /* -- cgit v1.2.3-70-g09d2 From 297d9c035edd04327fedc0d1da27c2b112b66fcc Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 15 May 2007 01:41:48 -0700 Subject: i386: move common parts of smp into their own file Several parts of kernel/smp.c and smpboot.c are generally useful for other subarchitectures and paravirt_ops implementations, so make them available for reuse. Signed-off-by: Jeremy Fitzhardinge Acked-by: Chris Wright Cc: James Bottomley Cc: Eric W. 
Biederman Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/Makefile | 1 + arch/i386/kernel/smp.c | 65 ++++-------------------------------- arch/i386/kernel/smpboot.c | 22 ------------ arch/i386/kernel/smpcommon.c | 79 ++++++++++++++++++++++++++++++++++++++++++++ include/asm-i386/processor.h | 4 +++ 5 files changed, 90 insertions(+), 81 deletions(-) create mode 100644 arch/i386/kernel/smpcommon.c diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index 91cff8dc9e1a..06da59f6f837 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -19,6 +19,7 @@ obj-$(CONFIG_X86_CPUID) += cpuid.o obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_APM) += apm.o obj-$(CONFIG_X86_SMP) += smp.o smpboot.o tsc_sync.o +obj-$(CONFIG_SMP) += smpcommon.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c index 706bda72dc60..c9a7c9835aba 100644 --- a/arch/i386/kernel/smp.c +++ b/arch/i386/kernel/smp.c @@ -467,7 +467,7 @@ void flush_tlb_all(void) * it goes straight through and wastes no time serializing * anything. Worst case is that we lose a reschedule ... */ -void native_smp_send_reschedule(int cpu) +static void native_smp_send_reschedule(int cpu) { WARN_ON(cpu_is_offline(cpu)); send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); @@ -546,9 +546,10 @@ static void __smp_call_function(void (*func) (void *info), void *info, * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. */ -int native_smp_call_function_mask(cpumask_t mask, - void (*func)(void *), void *info, - int wait) +static int +native_smp_call_function_mask(cpumask_t mask, + void (*func)(void *), void *info, + int wait) { struct call_data_struct data; cpumask_t allbutself; @@ -599,60 +600,6 @@ int native_smp_call_function_mask(cpumask_t mask, return 0; } -/** - * smp_call_function(): Run a function on all other CPUs. - * @func: The function to run. This must be fast and non-blocking. - * @info: An arbitrary pointer to pass to the function. - * @nonatomic: Unused. - * @wait: If true, wait (atomically) until function has completed on other CPUs. - * - * Returns 0 on success, else a negative status code. - * - * If @wait is true, then returns once @func has returned; otherwise - * it returns just before the target cpu calls @func. - * - * You must not call this function with disabled interrupts or from a - * hardware interrupt handler or from a bottom half handler. - */ -int smp_call_function(void (*func) (void *info), void *info, int nonatomic, - int wait) -{ - return smp_call_function_mask(cpu_online_map, func, info, wait); -} -EXPORT_SYMBOL(smp_call_function); - -/** - * smp_call_function_single - Run a function on another CPU - * @cpu: The target CPU. Cannot be the calling CPU. - * @func: The function to run. This must be fast and non-blocking. - * @info: An arbitrary pointer to pass to the function. - * @nonatomic: Unused. - * @wait: If true, wait until function has completed on other CPUs. - * - * Returns 0 on success, else a negative status code. - * - * If @wait is true, then returns once @func has returned; otherwise - * it returns just before the target cpu calls @func. 
- */ -int smp_call_function_single(int cpu, void (*func) (void *info), void *info, - int nonatomic, int wait) -{ - /* prevent preemption and reschedule on another processor */ - int ret; - int me = get_cpu(); - if (cpu == me) { - WARN_ON(1); - put_cpu(); - return -EBUSY; - } - - ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait); - - put_cpu(); - return ret; -} -EXPORT_SYMBOL(smp_call_function_single); - static void stop_this_cpu (void * dummy) { local_irq_disable(); @@ -670,7 +617,7 @@ static void stop_this_cpu (void * dummy) * this function calls the 'stop' function on all other CPUs in the system. */ -void native_smp_send_stop(void) +static void native_smp_send_stop(void) { /* Don't deadlock on the call lock in panic */ int nolock = !spin_trylock(&call_lock); diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index b92cc4e8b3bb..08f07a74a9d3 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -98,9 +98,6 @@ EXPORT_SYMBOL(x86_cpu_to_apicid); u8 apicid_2_node[MAX_APICID]; -DEFINE_PER_CPU(unsigned long, this_cpu_off); -EXPORT_PER_CPU_SYMBOL(this_cpu_off); - /* * Trampoline 80x86 program as an array. */ @@ -763,25 +760,6 @@ static inline struct task_struct * alloc_idle_task(int cpu) #define alloc_idle_task(cpu) fork_idle(cpu) #endif -/* Initialize the CPU's GDT. This is either the boot CPU doing itself - (still using the master per-cpu area), or a CPU doing it for a - secondary which will soon come up. */ -static __cpuinit void init_gdt(int cpu) -{ - struct desc_struct *gdt = get_cpu_gdt_table(cpu); - - pack_descriptor((u32 *)&gdt[GDT_ENTRY_PERCPU].a, - (u32 *)&gdt[GDT_ENTRY_PERCPU].b, - __per_cpu_offset[cpu], 0xFFFFF, - 0x80 | DESCTYPE_S | 0x2, 0x8); - - per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu]; - per_cpu(cpu_number, cpu) = cpu; -} - -/* Defined in head.S */ -extern struct Xgt_desc_struct early_gdt_descr; - static int __cpuinit do_boot_cpu(int apicid, int cpu) /* * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad diff --git a/arch/i386/kernel/smpcommon.c b/arch/i386/kernel/smpcommon.c new file mode 100644 index 000000000000..1868ae18eb4d --- /dev/null +++ b/arch/i386/kernel/smpcommon.c @@ -0,0 +1,79 @@ +/* + * SMP stuff which is common to all sub-architectures. + */ +#include +#include + +DEFINE_PER_CPU(unsigned long, this_cpu_off); +EXPORT_PER_CPU_SYMBOL(this_cpu_off); + +/* Initialize the CPU's GDT. This is either the boot CPU doing itself + (still using the master per-cpu area), or a CPU doing it for a + secondary which will soon come up. */ +__cpuinit void init_gdt(int cpu) +{ + struct desc_struct *gdt = get_cpu_gdt_table(cpu); + + pack_descriptor((u32 *)&gdt[GDT_ENTRY_PERCPU].a, + (u32 *)&gdt[GDT_ENTRY_PERCPU].b, + __per_cpu_offset[cpu], 0xFFFFF, + 0x80 | DESCTYPE_S | 0x2, 0x8); + + per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu]; + per_cpu(cpu_number, cpu) = cpu; +} + + +/** + * smp_call_function(): Run a function on all other CPUs. + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @nonatomic: Unused. + * @wait: If true, wait (atomically) until function has completed on other CPUs. + * + * Returns 0 on success, else a negative status code. + * + * If @wait is true, then returns once @func has returned; otherwise + * it returns just before the target cpu calls @func. + * + * You must not call this function with disabled interrupts or from a + * hardware interrupt handler or from a bottom half handler. 
+ */ +int smp_call_function(void (*func) (void *info), void *info, int nonatomic, + int wait) +{ + return smp_call_function_mask(cpu_online_map, func, info, wait); +} +EXPORT_SYMBOL(smp_call_function); + +/** + * smp_call_function_single - Run a function on another CPU + * @cpu: The target CPU. Cannot be the calling CPU. + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @nonatomic: Unused. + * @wait: If true, wait until function has completed on other CPUs. + * + * Returns 0 on success, else a negative status code. + * + * If @wait is true, then returns once @func has returned; otherwise + * it returns just before the target cpu calls @func. + */ +int smp_call_function_single(int cpu, void (*func) (void *info), void *info, + int nonatomic, int wait) +{ + /* prevent preemption and reschedule on another processor */ + int ret; + int me = get_cpu(); + if (cpu == me) { + WARN_ON(1); + put_cpu(); + return -EBUSY; + } + + ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait); + + put_cpu(); + return ret; +} +EXPORT_SYMBOL(smp_call_function_single); diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h index 70f3515c3db0..338668bfb0a2 100644 --- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -749,9 +749,13 @@ extern unsigned long boot_option_idle_override; extern void enable_sep_cpu(void); extern int sysenter_setup(void); +/* Defined in head.S */ +extern struct Xgt_desc_struct early_gdt_descr; + extern void cpu_set_gdt(int); extern void switch_to_new_gdt(void); extern void cpu_init(void); +extern void init_gdt(int cpu); extern int force_mwait; -- cgit v1.2.3-70-g09d2 From 6a3ee3d5529c5e66aedf91401bfac65c61998639 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 15 May 2007 01:41:59 -0700 Subject: i386: fix voyager build This adds an smp_ops for voyager, and hooks things up appropriately. This is the first baby-step to making subarch runtime switchable. Signed-off-by: Jeremy Fitzhardinge Cc: James Bottomley Cc: Eric W. 
Biederman Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/mach-voyager/voyager_smp.c | 106 ++++++++++++++--------------------- 1 file changed, 41 insertions(+), 65 deletions(-) diff --git a/arch/i386/mach-voyager/voyager_smp.c b/arch/i386/mach-voyager/voyager_smp.c index 50d9c52070b1..b87f8548e75a 100644 --- a/arch/i386/mach-voyager/voyager_smp.c +++ b/arch/i386/mach-voyager/voyager_smp.c @@ -27,7 +27,6 @@ #include #include #include -#include /* TLB state -- visible externally, indexed physically */ DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0 }; @@ -422,7 +421,7 @@ find_smp_config(void) VOYAGER_SUS_IN_CONTROL_PORT); current_thread_info()->cpu = boot_cpu_id; - write_pda(cpu_number, boot_cpu_id); + x86_write_percpu(cpu_number, boot_cpu_id); } /* @@ -435,7 +434,7 @@ smp_store_cpu_info(int id) *c = boot_cpu_data; - identify_cpu(c); + identify_secondary_cpu(c); } /* set up the trampoline and return the physical address of the code */ @@ -459,7 +458,7 @@ start_secondary(void *unused) /* external functions not defined in the headers */ extern void calibrate_delay(void); - secondary_cpu_init(); + cpu_init(); /* OK, we're in the routine */ ack_CPI(VIC_CPU_BOOT_CPI); @@ -572,7 +571,9 @@ do_boot_cpu(__u8 cpu) /* init_tasks (in sched.c) is indexed logically */ stack_start.esp = (void *) idle->thread.esp; - init_gdt(cpu, idle); + init_gdt(cpu); + per_cpu(current_task, cpu) = idle; + early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); irq_ctx_init(cpu); /* Note: Don't modify initial ss override */ @@ -859,8 +860,8 @@ smp_invalidate_interrupt(void) /* This routine is called with a physical cpu mask */ static void -flush_tlb_others (unsigned long cpumask, struct mm_struct *mm, - unsigned long va) +voyager_flush_tlb_others (unsigned long cpumask, struct mm_struct *mm, + unsigned long va) { int stuck = 50000; @@ -912,7 +913,7 @@ flush_tlb_current_task(void) cpu_mask = cpus_addr(mm->cpu_vm_mask)[0] & ~(1 << smp_processor_id()); local_flush_tlb(); if (cpu_mask) - flush_tlb_others(cpu_mask, mm, FLUSH_ALL); + voyager_flush_tlb_others(cpu_mask, mm, FLUSH_ALL); preempt_enable(); } @@ -934,7 +935,7 @@ flush_tlb_mm (struct mm_struct * mm) leave_mm(smp_processor_id()); } if (cpu_mask) - flush_tlb_others(cpu_mask, mm, FLUSH_ALL); + voyager_flush_tlb_others(cpu_mask, mm, FLUSH_ALL); preempt_enable(); } @@ -955,7 +956,7 @@ void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) } if (cpu_mask) - flush_tlb_others(cpu_mask, mm, va); + voyager_flush_tlb_others(cpu_mask, mm, va); preempt_enable(); } @@ -1044,10 +1045,12 @@ smp_call_function_interrupt(void) } static int -__smp_call_function_mask (void (*func) (void *info), void *info, int retry, - int wait, __u32 mask) +voyager_smp_call_function_mask (cpumask_t cpumask, + void (*func) (void *info), void *info, + int wait) { struct call_data_struct data; + u32 mask = cpus_addr(cpumask)[0]; mask &= ~(1< The function to run. This must be fast and non-blocking. - An arbitrary pointer to pass to the function. - If true, keep retrying until ready. - If true, wait until function has completed on other CPUs. - [RETURNS] 0 on success, else a negative status code. Does not return until - remote CPUs are nearly ready to execute <> or are or have executed. 
-*/ -int -smp_call_function(void (*func) (void *info), void *info, int retry, - int wait) -{ - __u32 mask = cpus_addr(cpu_online_map)[0]; - - return __smp_call_function_mask(func, info, retry, wait, mask); -} -EXPORT_SYMBOL(smp_call_function); - -/* - * smp_call_function_single - Run a function on another CPU - * @func: The function to run. This must be fast and non-blocking. - * @info: An arbitrary pointer to pass to the function. - * @nonatomic: Currently unused. - * @wait: If true, wait until function has completed on other CPUs. - * - * Retrurns 0 on success, else a negative status code. - * - * Does not return until the remote CPU is nearly ready to execute - * or is or has executed. - */ - -int -smp_call_function_single(int cpu, void (*func) (void *info), void *info, - int nonatomic, int wait) -{ - __u32 mask = 1 << cpu; - - return __smp_call_function_mask(func, info, nonatomic, wait, mask); -} -EXPORT_SYMBOL(smp_call_function_single); - /* Sorry about the name. In an APIC based system, the APICs * themselves are programmed to send a timer interrupt. This is used * by linux to reschedule the processor. Voyager doesn't have this, @@ -1237,8 +1199,8 @@ smp_alloc_memory(void) } /* send a reschedule CPI to one CPU by physical CPU number*/ -void -smp_send_reschedule(int cpu) +static void +voyager_smp_send_reschedule(int cpu) { send_one_CPI(cpu, VIC_RESCHEDULE_CPI); } @@ -1267,8 +1229,8 @@ safe_smp_processor_id(void) } /* broadcast a halt to all other CPUs */ -void -smp_send_stop(void) +static void +voyager_smp_send_stop(void) { smp_call_function(smp_stop_cpu_function, NULL, 1, 1); } @@ -1930,23 +1892,26 @@ smp_voyager_power_off(void *dummy) smp_stop_cpu_function(NULL); } -void __init -smp_prepare_cpus(unsigned int max_cpus) +static void __init +voyager_smp_prepare_cpus(unsigned int max_cpus) { /* FIXME: ignore max_cpus for now */ smp_boot_cpus(); } -void __devinit smp_prepare_boot_cpu(void) +static void __devinit voyager_smp_prepare_boot_cpu(void) { + init_gdt(smp_processor_id()); + switch_to_new_gdt(); + cpu_set(smp_processor_id(), cpu_online_map); cpu_set(smp_processor_id(), cpu_callout_map); cpu_set(smp_processor_id(), cpu_possible_map); cpu_set(smp_processor_id(), cpu_present_map); } -int __devinit -__cpu_up(unsigned int cpu) +static int __devinit +voyager_cpu_up(unsigned int cpu) { /* This only works at boot for x86. See "rewrite" above. */ if (cpu_isset(cpu, smp_commenced_mask)) @@ -1962,8 +1927,8 @@ __cpu_up(unsigned int cpu) return 0; } -void __init -smp_cpus_done(unsigned int max_cpus) +static void __init +voyager_smp_cpus_done(unsigned int max_cpus) { zap_low_mappings(); } @@ -1972,5 +1937,16 @@ void __init smp_setup_processor_id(void) { current_thread_info()->cpu = hard_smp_processor_id(); - write_pda(cpu_number, hard_smp_processor_id()); + x86_write_percpu(cpu_number, hard_smp_processor_id()); } + +struct smp_ops smp_ops = { + .smp_prepare_boot_cpu = voyager_smp_prepare_boot_cpu, + .smp_prepare_cpus = voyager_smp_prepare_cpus, + .cpu_up = voyager_cpu_up, + .smp_cpus_done = voyager_smp_cpus_done, + + .smp_send_stop = voyager_smp_send_stop, + .smp_send_reschedule = voyager_smp_send_reschedule, + .smp_call_function_mask = voyager_smp_call_function_mask, +}; -- cgit v1.2.3-70-g09d2 From cfbf07f2a80b618c42a42c20d83647ea8fcceca0 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 15 May 2007 01:42:06 -0700 Subject: SLUB: CONFIG_LARGE_ALLOCS must consider MAX_ORDER limit Take MAX_ORDER into consideration when determining KMALLOC_SHIFT_HIGH. 
Otherwise we may run into a situation where we attempt to create general slabs larger than MAX_ORDER. Signed-off-by: Christoph Lameter Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slub_def.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index ea27065e80e6..fd6627e2d115 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -60,7 +60,8 @@ struct kmem_cache { #define KMALLOC_SHIFT_LOW 3 #ifdef CONFIG_LARGE_ALLOCS -#define KMALLOC_SHIFT_HIGH 25 +#define KMALLOC_SHIFT_HIGH ((MAX_ORDER + PAGE_SHIFT) <= 25 ? \ + (MAX_ORDER + PAGE_SHIFT - 1) : 25) #else #if !defined(CONFIG_MMU) || NR_CPUS > 512 || MAX_NUMNODES > 256 #define KMALLOC_SHIFT_HIGH 20 @@ -87,6 +88,9 @@ static inline int kmalloc_index(int size) */ WARN_ON_ONCE(size == 0); + if (size >= (1 << KMALLOC_SHIFT_HIGH)) + return -1; + if (size > 64 && size <= 96) return 1; if (size > 128 && size <= 192) -- cgit v1.2.3-70-g09d2 From f653c34dd3d8bde2c918316fd5ba2e2c4f95afcf Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 15 May 2007 19:30:07 +0200 Subject: ll_rw_blk: fix gcc 4.2 warning on current_io_context() current_io_context() is both static and exported with EXPORT_SYMBOL(). As there are no users outside of ll_rw_blk.c itself, just kill the export. Problem reported by Martin Michlmayr Signed-off-by: Jens Axboe Signed-off-by: Linus Torvalds --- block/ll_rw_blk.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c index 74a567afb830..6b5173ac8131 100644 --- a/block/ll_rw_blk.c +++ b/block/ll_rw_blk.c @@ -3802,7 +3802,6 @@ static struct io_context *current_io_context(gfp_t gfp_flags, int node) return ret; } -EXPORT_SYMBOL(current_io_context); /* * If the current task has no IO context then create one and initialise it. -- cgit v1.2.3-70-g09d2