From 1fb9d6ad2766a1dd70d167552988375049a97f21 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 5 Feb 2010 21:47:04 -0500 Subject: nmi_watchdog: Add new, generic implementation, using perf events This is a new generic nmi_watchdog implementation using the perf events infrastructure as suggested by Ingo. The implementation is simple, just create an in-kernel perf event and register an overflow handler to check for cpu lockups. I created a generic implementation that lives in kernel/ and the hardware specific part that for now lives in arch/x86. This approach has a number of advantages: - It simplifies the x86 PMU implementation in the long run, in that it removes the hardcoded low-level PMU implementation that was the NMI watchdog before. - It allows new NMI watchdog features to be added in a central place. - It allows other architectures to enable the NMI watchdog, as long as they have perf events (that provide NMIs) implemented. - It also allows for more graceful co-existence of existing perf events apps and the NMI watchdog - before these changes the relationship was exclusive. (The NMI watchdog will 'spend' a perf event when enabled. In later iterations we might be able to piggyback from an existing NMI event without having to allocate a hardware event for the NMI watchdog - turning this into a no-hardware-cost feature.) As for compatibility, we'll keep the old NMI watchdog code as well until the new one can 100% replace it on all CPUs, old and new alike. That might take some time as the NMI watchdog has been ported to many CPU models. I have done light testing to make sure the framework works correctly and it does. v2: Set the correct timeout values based on the old nmi watchdog Signed-off-by: Don Zickus Cc: Linus Torvalds Cc: Andrew Morton Cc: gorcunov@gmail.com Cc: aris@redhat.com Cc: peterz@infradead.org LKML-Reference: <1265424425-31562-3-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar --- kernel/nmi_watchdog.c | 191 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 191 insertions(+) create mode 100644 kernel/nmi_watchdog.c (limited to 'kernel') diff --git a/kernel/nmi_watchdog.c b/kernel/nmi_watchdog.c new file mode 100644 index 000000000000..36817b214d69 --- /dev/null +++ b/kernel/nmi_watchdog.c @@ -0,0 +1,191 @@ +/* + * Detect Hard Lockups using the NMI + * + * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. + * + * this code detects hard lockups: incidents in where on a CPU + * the kernel does not respond to anything except NMI. + * + * Note: Most of this code is borrowed heavily from softlockup.c, + * so thanks to Ingo for the initial implementation. + * Some chunks also taken from arch/x86/kernel/apic/nmi.c, thanks + * to those contributors as well. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +static DEFINE_PER_CPU(struct perf_event *, nmi_watchdog_ev); +static DEFINE_PER_CPU(int, nmi_watchdog_touch); +static DEFINE_PER_CPU(long, alert_counter); + +void touch_nmi_watchdog(void) +{ + __raw_get_cpu_var(nmi_watchdog_touch) = 1; + touch_softlockup_watchdog(); +} +EXPORT_SYMBOL(touch_nmi_watchdog); + +void touch_all_nmi_watchdog(void) +{ + int cpu; + + for_each_online_cpu(cpu) + per_cpu(nmi_watchdog_touch, cpu) = 1; + touch_softlockup_watchdog(); +} + +#ifdef CONFIG_SYSCTL +/* + * proc handler for /proc/sys/kernel/nmi_watchdog + */ +int proc_nmi_enabled(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int cpu; + + if (per_cpu(nmi_watchdog_ev, smp_processor_id()) == NULL) + nmi_watchdog_enabled = 0; + else + nmi_watchdog_enabled = 1; + + touch_all_nmi_watchdog(); + proc_dointvec(table, write, buffer, length, ppos); + if (nmi_watchdog_enabled) + for_each_online_cpu(cpu) + perf_event_enable(per_cpu(nmi_watchdog_ev, cpu)); + else + for_each_online_cpu(cpu) + perf_event_disable(per_cpu(nmi_watchdog_ev, cpu)); + return 0; +} + +#endif /* CONFIG_SYSCTL */ + +struct perf_event_attr wd_attr = { + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_CPU_CYCLES, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 1, +}; + +static int panic_on_timeout; + +void wd_overflow(struct perf_event *event, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + int cpu = smp_processor_id(); + int touched = 0; + + if (__get_cpu_var(nmi_watchdog_touch)) { + per_cpu(nmi_watchdog_touch, cpu) = 0; + touched = 1; + } + + /* check to see if the cpu is doing anything */ + if (!touched && hw_nmi_is_cpu_stuck(regs)) { + /* + * Ayiee, looks like this CPU is stuck ... + * wait a few IRQs (5 seconds) before doing the oops ... + */ + per_cpu(alert_counter,cpu) += 1; + if (per_cpu(alert_counter,cpu) == 5) { + /* + * die_nmi will return ONLY if NOTIFY_STOP happens.. + */ + die_nmi("BUG: NMI Watchdog detected LOCKUP", + regs, panic_on_timeout); + } + } else { + per_cpu(alert_counter,cpu) = 0; + } + + return; +} + +/* + * Create/destroy watchdog threads as CPUs come and go: + */ +static int __cpuinit +cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + int hotcpu = (unsigned long)hcpu; + struct perf_event *event; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + per_cpu(nmi_watchdog_touch, hotcpu) = 0; + break; + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + /* originally wanted the below chunk to be in CPU_UP_PREPARE, but caps is unpriv for non-CPU0 */ + wd_attr.sample_period = cpu_khz * 1000; + event = perf_event_create_kernel_counter(&wd_attr, hotcpu, -1, wd_overflow); + if (IS_ERR(event)) { + printk(KERN_ERR "nmi watchdog failed to create perf event on %i: %p\n", hotcpu, event); + return NOTIFY_BAD; + } + per_cpu(nmi_watchdog_ev, hotcpu) = event; + perf_event_enable(per_cpu(nmi_watchdog_ev, hotcpu)); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + perf_event_disable(per_cpu(nmi_watchdog_ev, hotcpu)); + case CPU_DEAD: + case CPU_DEAD_FROZEN: + event = per_cpu(nmi_watchdog_ev, hotcpu); + per_cpu(nmi_watchdog_ev, hotcpu) = NULL; + perf_event_release_kernel(event); + break; +#endif /* CONFIG_HOTPLUG_CPU */ + } + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata cpu_nfb = { + .notifier_call = cpu_callback +}; + +static int __initdata nonmi_watchdog; + +static int __init nonmi_watchdog_setup(char *str) +{ + nonmi_watchdog = 1; + return 1; +} +__setup("nonmi_watchdog", nonmi_watchdog_setup); + +static int __init spawn_nmi_watchdog_task(void) +{ + void *cpu = (void *)(long)smp_processor_id(); + int err; + + if (nonmi_watchdog) + return 0; + + err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + if (err == NOTIFY_BAD) { + BUG(); + return 1; + } + cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); + register_cpu_notifier(&cpu_nfb); + + return 0; +} +early_initcall(spawn_nmi_watchdog_task); -- cgit v1.2.3-70-g09d2 From 84e478c6f1eb9c4bfa1fff2f8108e9a061b46428 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 5 Feb 2010 21:47:05 -0500 Subject: nmi_watchdog: Config option to enable new nmi_watchdog These are the bits that enable the new nmi_watchdog and safely isolate the old nmi_watchdog. Only one or the other can run, not both at the same time. Signed-off-by: Don Zickus Cc: Linus Torvalds Cc: Andrew Morton Cc: gorcunov@gmail.com Cc: aris@redhat.com Cc: peterz@infradead.org LKML-Reference: <1265424425-31562-4-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/Makefile | 7 ++++++- arch/x86/kernel/traps.c | 2 ++ include/linux/nmi.h | 4 ++++ kernel/Makefile | 1 + lib/Kconfig.debug | 13 +++++++++++++ 5 files changed, 26 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 565c1bfc507d..1a4512e48d24 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -2,7 +2,12 @@ # Makefile for local APIC drivers and for the IO-APIC code # -obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o nmi.o +obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o +ifneq ($(CONFIG_NMI_WATCHDOG),y) +obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o +endif +obj-$(CONFIG_NMI_WATCHDOG) += hw_nmi.o + obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_SMP) += ipi.o diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 51ef893ffa65..973cbc4f044f 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -406,6 +406,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) == NOTIFY_STOP) return; +#ifndef CONFIG_NMI_WATCHDOG /* * Ok, so this is none of the documented NMI sources, * so it must be the NMI watchdog. @@ -413,6 +414,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) if (nmi_watchdog_tick(regs, reason)) return; if (!do_nmi_callback(regs, cpu)) +#endif /* !CONFIG_NMI_WATCHDOG */ unknown_nmi_error(reason, regs); #else unknown_nmi_error(reason, regs); diff --git a/include/linux/nmi.h b/include/linux/nmi.h index b752e807adde..a42ff0bef708 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -47,4 +47,8 @@ static inline bool trigger_all_cpu_backtrace(void) } #endif +#ifdef CONFIG_NMI_WATCHDOG +int hw_nmi_is_cpu_stuck(struct pt_regs *); +#endif + #endif diff --git a/kernel/Makefile b/kernel/Makefile index 864ff75d65f2..8a5abe53ebad 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -76,6 +76,7 @@ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o +obj-$(CONFIG_NMI_WATCHDOG) += nmi_watchdog.o obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 25c3ed594c54..f80b67e72aa0 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -170,6 +170,19 @@ config DETECT_SOFTLOCKUP can be detected via the NMI-watchdog, on platforms that support it.) +config NMI_WATCHDOG + bool "Detect Hard Lockups with an NMI Watchdog" + depends on DEBUG_KERNEL && PERF_EVENTS + default y + help + Say Y here to enable the kernel to use the NMI as a watchdog + to detect hard lockups. This is useful when a cpu hangs for no + reason but can still respond to NMIs. A backtrace is displayed + for reviewing and reporting. + + The overhead should be minimal, just an extra NMI every few + seconds. + config BOOTPARAM_SOFTLOCKUP_PANIC bool "Panic (Reboot) On Soft Lockups" depends on DETECT_SOFTLOCKUP -- cgit v1.2.3-70-g09d2 From 504d7cf10ee42bb76b9556859f23d4121dee0a77 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 12 Feb 2010 17:19:19 -0500 Subject: nmi_watchdog: Compile and portability fixes The original patch was x86_64 centric. Changed the code to make it less so. ested by building and running on a powerpc. Signed-off-by: Don Zickus Cc: peterz@infradead.org Cc: gorcunov@gmail.com Cc: aris@redhat.com LKML-Reference: <1266013161-31197-2-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/nmi.h | 2 ++ arch/x86/kernel/apic/hw_nmi.c | 21 ++++++++++++----- include/linux/nmi.h | 9 ++++++++ kernel/nmi_watchdog.c | 52 ++++++++++++++++++++++++++++++++++--------- kernel/sysctl.c | 15 ++++++++++++- 5 files changed, 82 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 93da9c3f3341..5b41b0feb6db 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -17,7 +17,9 @@ int do_nmi_callback(struct pt_regs *regs, int cpu); extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); extern int check_nmi_watchdog(void); +#if !defined(CONFIG_NMI_WATCHDOG) extern int nmi_watchdog_enabled; +#endif extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); extern int reserve_perfctr_nmi(unsigned int); extern void release_perfctr_nmi(unsigned int); diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 8c0e6a410d05..312d772c5c35 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -32,8 +32,13 @@ static DEFINE_PER_CPU(unsigned, last_irq_sum); */ static inline unsigned int get_timer_irqs(int cpu) { - return per_cpu(irq_stat, cpu).apic_timer_irqs + - per_cpu(irq_stat, cpu).irq0_irqs; + unsigned int irqs = per_cpu(irq_stat, cpu).irq0_irqs; + +#if defined(CONFIG_X86_LOCAL_APIC) + irqs += per_cpu(irq_stat, cpu).apic_timer_irqs; +#endif + + return irqs; } static inline int mce_in_progress(void) @@ -82,6 +87,11 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *regs) } } +u64 hw_nmi_get_sample_period(void) +{ + return cpu_khz * 1000; +} + void arch_trigger_all_cpu_backtrace(void) { int i; @@ -100,15 +110,16 @@ void arch_trigger_all_cpu_backtrace(void) } /* STUB calls to mimic old nmi_watchdog behaviour */ +#if defined(CONFIG_X86_LOCAL_APIC) unsigned int nmi_watchdog = NMI_NONE; EXPORT_SYMBOL(nmi_watchdog); +void acpi_nmi_enable(void) { return; } +void acpi_nmi_disable(void) { return; } +#endif atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ EXPORT_SYMBOL(nmi_active); -int nmi_watchdog_enabled; int unknown_nmi_panic; void cpu_nmi_set_wd_enabled(void) { return; } -void acpi_nmi_enable(void) { return; } -void acpi_nmi_disable(void) { return; } void stop_apic_nmi_watchdog(void *unused) { return; } void setup_apic_nmi_watchdog(void *unused) { return; } int __init check_nmi_watchdog(void) { return 0; } diff --git a/include/linux/nmi.h b/include/linux/nmi.h index a42ff0bef708..794e7354c5bf 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -20,10 +20,14 @@ extern void touch_nmi_watchdog(void); extern void acpi_nmi_disable(void); extern void acpi_nmi_enable(void); #else +#ifndef CONFIG_NMI_WATCHDOG static inline void touch_nmi_watchdog(void) { touch_softlockup_watchdog(); } +#else +extern void touch_nmi_watchdog(void); +#endif static inline void acpi_nmi_disable(void) { } static inline void acpi_nmi_enable(void) { } #endif @@ -49,6 +53,11 @@ static inline bool trigger_all_cpu_backtrace(void) #ifdef CONFIG_NMI_WATCHDOG int hw_nmi_is_cpu_stuck(struct pt_regs *); +u64 hw_nmi_get_sample_period(void); +extern int nmi_watchdog_enabled; +struct ctl_table; +extern int proc_nmi_enabled(struct ctl_table *, int , + void __user *, size_t *, loff_t *); #endif #endif diff --git a/kernel/nmi_watchdog.c b/kernel/nmi_watchdog.c index 36817b214d69..73c1954a97bb 100644 --- a/kernel/nmi_watchdog.c +++ b/kernel/nmi_watchdog.c @@ -30,6 +30,8 @@ static DEFINE_PER_CPU(struct perf_event *, nmi_watchdog_ev); static DEFINE_PER_CPU(int, nmi_watchdog_touch); static DEFINE_PER_CPU(long, alert_counter); +static int panic_on_timeout; + void touch_nmi_watchdog(void) { __raw_get_cpu_var(nmi_watchdog_touch) = 1; @@ -46,19 +48,49 @@ void touch_all_nmi_watchdog(void) touch_softlockup_watchdog(); } +static int __init setup_nmi_watchdog(char *str) +{ + if (!strncmp(str, "panic", 5)) { + panic_on_timeout = 1; + str = strchr(str, ','); + if (!str) + return 1; + ++str; + } + return 1; +} +__setup("nmi_watchdog=", setup_nmi_watchdog); + #ifdef CONFIG_SYSCTL /* * proc handler for /proc/sys/kernel/nmi_watchdog */ +int nmi_watchdog_enabled; + int proc_nmi_enabled(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { int cpu; - if (per_cpu(nmi_watchdog_ev, smp_processor_id()) == NULL) + if (!write) { + struct perf_event *event; + for_each_online_cpu(cpu) { + event = per_cpu(nmi_watchdog_ev, cpu); + if (event->state > PERF_EVENT_STATE_OFF) { + nmi_watchdog_enabled = 1; + break; + } + } + proc_dointvec(table, write, buffer, length, ppos); + return 0; + } + + if (per_cpu(nmi_watchdog_ev, smp_processor_id()) == NULL) { nmi_watchdog_enabled = 0; - else - nmi_watchdog_enabled = 1; + proc_dointvec(table, write, buffer, length, ppos); + printk("NMI watchdog failed configuration, can not be enabled\n"); + return 0; + } touch_all_nmi_watchdog(); proc_dointvec(table, write, buffer, length, ppos); @@ -81,8 +113,6 @@ struct perf_event_attr wd_attr = { .disabled = 1, }; -static int panic_on_timeout; - void wd_overflow(struct perf_event *event, int nmi, struct perf_sample_data *data, struct pt_regs *regs) @@ -103,11 +133,11 @@ void wd_overflow(struct perf_event *event, int nmi, */ per_cpu(alert_counter,cpu) += 1; if (per_cpu(alert_counter,cpu) == 5) { - /* - * die_nmi will return ONLY if NOTIFY_STOP happens.. - */ - die_nmi("BUG: NMI Watchdog detected LOCKUP", - regs, panic_on_timeout); + if (panic_on_timeout) { + panic("NMI Watchdog detected LOCKUP on cpu %d", cpu); + } else { + WARN(1, "NMI Watchdog detected LOCKUP on cpu %d", cpu); + } } } else { per_cpu(alert_counter,cpu) = 0; @@ -133,7 +163,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_ONLINE: case CPU_ONLINE_FROZEN: /* originally wanted the below chunk to be in CPU_UP_PREPARE, but caps is unpriv for non-CPU0 */ - wd_attr.sample_period = cpu_khz * 1000; + wd_attr.sample_period = hw_nmi_get_sample_period(); event = perf_event_create_kernel_counter(&wd_attr, hotcpu, -1, wd_overflow); if (IS_ERR(event)) { printk(KERN_ERR "nmi watchdog failed to create perf event on %i: %p\n", hotcpu, event); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8a68b2448468..ac72c9e6bd9b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -60,6 +60,10 @@ #include #endif +#ifdef CONFIG_NMI_WATCHDOG +#include +#endif + #if defined(CONFIG_SYSCTL) @@ -692,7 +696,16 @@ static struct ctl_table kern_table[] = { .mode = 0444, .proc_handler = proc_dointvec, }, -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) +#if defined(CONFIG_NMI_WATCHDOG) + { + .procname = "nmi_watchdog", + .data = &nmi_watchdog_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_nmi_enabled, + }, +#endif +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_NMI_WATCHDOG) { .procname = "unknown_nmi_panic", .data = &unknown_nmi_panic, -- cgit v1.2.3-70-g09d2 From cf454aecb31741a0438ed1201b3dd153c7c7b19a Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 12 Feb 2010 17:19:20 -0500 Subject: nmi_watchdog: Fallback to software events when no hardware pmu detected Not all arches have a PMU or have perf_event support for their PMU. The nmi_watchdog will fail in those cases. Fallback to using software events to generate nmi_watchdog traffic with local apic interrupts. Tested on a Pentium4 and it worked as expected, excepting for detecting cpu lockups. The problem with using software events as a cpu lock up detector is the nmi_watchdog uses the logic that if local apic interrupts stop incrementing then the cpu is probably locked up. But with software events we use the local apic to trigger the nmi_watchdog callback to see if local apic interrupts are still firing, which obviously they are otherwise we wouldn't have been triggered. The algorithm to detect cpu lock ups is the same as the old nmi_watchdog. Perhaps we need to find a better way to detect lock ups? Signed-off-by: Don Zickus Cc: peterz@infradead.org Cc: gorcunov@gmail.com Cc: aris@redhat.com LKML-Reference: <1266013161-31197-3-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar --- kernel/nmi_watchdog.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/nmi_watchdog.c b/kernel/nmi_watchdog.c index 73c1954a97bb..4f23505d887d 100644 --- a/kernel/nmi_watchdog.c +++ b/kernel/nmi_watchdog.c @@ -166,8 +166,12 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) wd_attr.sample_period = hw_nmi_get_sample_period(); event = perf_event_create_kernel_counter(&wd_attr, hotcpu, -1, wd_overflow); if (IS_ERR(event)) { - printk(KERN_ERR "nmi watchdog failed to create perf event on %i: %p\n", hotcpu, event); - return NOTIFY_BAD; + wd_attr.type = PERF_TYPE_SOFTWARE; + event = perf_event_create_kernel_counter(&wd_attr, hotcpu, -1, wd_overflow); + if (IS_ERR(event)) { + printk(KERN_ERR "nmi watchdog failed to create perf event on %i: %p\n", hotcpu, event); + return NOTIFY_BAD; + } } per_cpu(nmi_watchdog_ev, hotcpu) = event; perf_event_enable(per_cpu(nmi_watchdog_ev, hotcpu)); -- cgit v1.2.3-70-g09d2 From 6081b6cd9702967889de34fe5da1f96bb96d0ab8 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Tue, 16 Feb 2010 17:04:52 -0500 Subject: nmi_watchdog: support for oprofile Re-arrange the code so that when someone disables nmi_watchdog with: echo 0 > /proc/sys/kernel/nmi_watchdog it releases the hardware reservation on the PMUs. This allows the oprofile module to grab those PMUs and do its thing. Otherwise oprofile fails to load because the hardware is reserved by the perf_events subsystem. Tested using: oprofile --vm-linux --start and watched it failed when nmi_watchdog is enabled and succeed when: oprofile --deinit && echo 0 > /proc/sys/kernel/nmi_watchdog is run. Note: this has the side quirk of having the nmi_watchdog latch onto the software events instead of hardware events if oprofile has already reserved the hardware first. User beware! :-) Signed-off-by: Don Zickus Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: gorcunov@gmail.com Cc: aris@redhat.com Cc: eranian@google.com LKML-Reference: <1266357892-30504-1-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar --- kernel/nmi_watchdog.c | 144 ++++++++++++++++++++++++++++---------------------- 1 file changed, 82 insertions(+), 62 deletions(-) (limited to 'kernel') diff --git a/kernel/nmi_watchdog.c b/kernel/nmi_watchdog.c index 4f23505d887d..633b230c2319 100644 --- a/kernel/nmi_watchdog.c +++ b/kernel/nmi_watchdog.c @@ -61,50 +61,6 @@ static int __init setup_nmi_watchdog(char *str) } __setup("nmi_watchdog=", setup_nmi_watchdog); -#ifdef CONFIG_SYSCTL -/* - * proc handler for /proc/sys/kernel/nmi_watchdog - */ -int nmi_watchdog_enabled; - -int proc_nmi_enabled(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) -{ - int cpu; - - if (!write) { - struct perf_event *event; - for_each_online_cpu(cpu) { - event = per_cpu(nmi_watchdog_ev, cpu); - if (event->state > PERF_EVENT_STATE_OFF) { - nmi_watchdog_enabled = 1; - break; - } - } - proc_dointvec(table, write, buffer, length, ppos); - return 0; - } - - if (per_cpu(nmi_watchdog_ev, smp_processor_id()) == NULL) { - nmi_watchdog_enabled = 0; - proc_dointvec(table, write, buffer, length, ppos); - printk("NMI watchdog failed configuration, can not be enabled\n"); - return 0; - } - - touch_all_nmi_watchdog(); - proc_dointvec(table, write, buffer, length, ppos); - if (nmi_watchdog_enabled) - for_each_online_cpu(cpu) - perf_event_enable(per_cpu(nmi_watchdog_ev, cpu)); - else - for_each_online_cpu(cpu) - perf_event_disable(per_cpu(nmi_watchdog_ev, cpu)); - return 0; -} - -#endif /* CONFIG_SYSCTL */ - struct perf_event_attr wd_attr = { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES, @@ -146,6 +102,85 @@ void wd_overflow(struct perf_event *event, int nmi, return; } +static int enable_nmi_watchdog(int cpu) +{ + struct perf_event *event; + + event = per_cpu(nmi_watchdog_ev, cpu); + if (event && event->state > PERF_EVENT_STATE_OFF) + return 0; + + if (event == NULL) { + /* Try to register using hardware perf events first */ + wd_attr.sample_period = hw_nmi_get_sample_period(); + event = perf_event_create_kernel_counter(&wd_attr, cpu, -1, wd_overflow); + if (IS_ERR(event)) { + wd_attr.type = PERF_TYPE_SOFTWARE; + event = perf_event_create_kernel_counter(&wd_attr, cpu, -1, wd_overflow); + if (IS_ERR(event)) { + printk(KERN_ERR "nmi watchdog failed to create perf event on %i: %p\n", cpu, event); + return -1; + } + } + per_cpu(nmi_watchdog_ev, cpu) = event; + } + perf_event_enable(per_cpu(nmi_watchdog_ev, cpu)); + return 0; +} + +static void disable_nmi_watchdog(int cpu) +{ + struct perf_event *event; + + event = per_cpu(nmi_watchdog_ev, cpu); + if (event) { + perf_event_disable(per_cpu(nmi_watchdog_ev, cpu)); + per_cpu(nmi_watchdog_ev, cpu) = NULL; + perf_event_release_kernel(event); + } +} + +#ifdef CONFIG_SYSCTL +/* + * proc handler for /proc/sys/kernel/nmi_watchdog + */ +int nmi_watchdog_enabled; + +int proc_nmi_enabled(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int cpu; + + if (!write) { + struct perf_event *event; + for_each_online_cpu(cpu) { + event = per_cpu(nmi_watchdog_ev, cpu); + if (event && event->state > PERF_EVENT_STATE_OFF) { + nmi_watchdog_enabled = 1; + break; + } + } + proc_dointvec(table, write, buffer, length, ppos); + return 0; + } + + touch_all_nmi_watchdog(); + proc_dointvec(table, write, buffer, length, ppos); + if (nmi_watchdog_enabled) { + for_each_online_cpu(cpu) + if (enable_nmi_watchdog(cpu)) { + printk("NMI watchdog failed configuration, " + " can not be enabled\n"); + } + } else { + for_each_online_cpu(cpu) + disable_nmi_watchdog(cpu); + } + return 0; +} + +#endif /* CONFIG_SYSCTL */ + /* * Create/destroy watchdog threads as CPUs come and go: */ @@ -153,7 +188,6 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { int hotcpu = (unsigned long)hcpu; - struct perf_event *event; switch (action) { case CPU_UP_PREPARE: @@ -162,29 +196,15 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) break; case CPU_ONLINE: case CPU_ONLINE_FROZEN: - /* originally wanted the below chunk to be in CPU_UP_PREPARE, but caps is unpriv for non-CPU0 */ - wd_attr.sample_period = hw_nmi_get_sample_period(); - event = perf_event_create_kernel_counter(&wd_attr, hotcpu, -1, wd_overflow); - if (IS_ERR(event)) { - wd_attr.type = PERF_TYPE_SOFTWARE; - event = perf_event_create_kernel_counter(&wd_attr, hotcpu, -1, wd_overflow); - if (IS_ERR(event)) { - printk(KERN_ERR "nmi watchdog failed to create perf event on %i: %p\n", hotcpu, event); - return NOTIFY_BAD; - } - } - per_cpu(nmi_watchdog_ev, hotcpu) = event; - perf_event_enable(per_cpu(nmi_watchdog_ev, hotcpu)); + if (enable_nmi_watchdog(hotcpu)) + return NOTIFY_BAD; break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: - perf_event_disable(per_cpu(nmi_watchdog_ev, hotcpu)); + disable_nmi_watchdog(hotcpu); case CPU_DEAD: case CPU_DEAD_FROZEN: - event = per_cpu(nmi_watchdog_ev, hotcpu); - per_cpu(nmi_watchdog_ev, hotcpu) = NULL; - perf_event_release_kernel(event); break; #endif /* CONFIG_HOTPLUG_CPU */ } -- cgit v1.2.3-70-g09d2 From 96ca4028aca0638d0e11dcbfabc4283054072933 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Tue, 16 Feb 2010 17:02:25 -0500 Subject: nmi_watchdog: Properly configure for software events Paul Mackerras brought up a good point that when fallbacking to software events, I may have been lucky in my configuration. Modified the code to explicit provide a new configuration for software events. Suggested-by: Paul Mackerras Signed-off-by: Don Zickus Cc: gorcunov@gmail.com Cc: aris@redhat.com Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: <1266357745-26671-1-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar --- kernel/nmi_watchdog.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/nmi_watchdog.c b/kernel/nmi_watchdog.c index 633b230c2319..3c75cbf3acb8 100644 --- a/kernel/nmi_watchdog.c +++ b/kernel/nmi_watchdog.c @@ -61,7 +61,7 @@ static int __init setup_nmi_watchdog(char *str) } __setup("nmi_watchdog=", setup_nmi_watchdog); -struct perf_event_attr wd_attr = { +struct perf_event_attr wd_hw_attr = { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES, .size = sizeof(struct perf_event_attr), @@ -69,6 +69,14 @@ struct perf_event_attr wd_attr = { .disabled = 1, }; +struct perf_event_attr wd_sw_attr = { + .type = PERF_TYPE_SOFTWARE, + .config = PERF_COUNT_SW_CPU_CLOCK, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 1, +}; + void wd_overflow(struct perf_event *event, int nmi, struct perf_sample_data *data, struct pt_regs *regs) @@ -105,6 +113,7 @@ void wd_overflow(struct perf_event *event, int nmi, static int enable_nmi_watchdog(int cpu) { struct perf_event *event; + struct perf_event_attr *wd_attr; event = per_cpu(nmi_watchdog_ev, cpu); if (event && event->state > PERF_EVENT_STATE_OFF) @@ -112,11 +121,15 @@ static int enable_nmi_watchdog(int cpu) if (event == NULL) { /* Try to register using hardware perf events first */ - wd_attr.sample_period = hw_nmi_get_sample_period(); - event = perf_event_create_kernel_counter(&wd_attr, cpu, -1, wd_overflow); + wd_attr = &wd_hw_attr; + wd_attr->sample_period = hw_nmi_get_sample_period(); + event = perf_event_create_kernel_counter(wd_attr, cpu, -1, wd_overflow); if (IS_ERR(event)) { - wd_attr.type = PERF_TYPE_SOFTWARE; - event = perf_event_create_kernel_counter(&wd_attr, cpu, -1, wd_overflow); + /* hardware doesn't exist or not supported, fallback to software events */ + printk("nmi_watchdog: hardware not available, trying software events\n"); + wd_attr = &wd_sw_attr; + wd_attr->sample_period = NSEC_PER_SEC; + event = perf_event_create_kernel_counter(wd_attr, cpu, -1, wd_overflow); if (IS_ERR(event)) { printk(KERN_ERR "nmi watchdog failed to create perf event on %i: %p\n", cpu, event); return -1; -- cgit v1.2.3-70-g09d2 From 47195d57636604ff6048b0d7aa3e4ed9643f6073 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Mon, 22 Feb 2010 18:09:03 -0500 Subject: nmi_watchdog: Clean up various small details Mostly copy/paste whitespace damage with a couple of nitpicks by the checkpatch script. Fix the struct definition as requested by Ingo too. Signed-off-by: Don Zickus Cc: peterz@infradead.org Cc: gorcunov@gmail.com Cc: aris@redhat.com LKML-Reference: <1266880143-24943-1-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar -- arch/x86/kernel/apic/hw_nmi.c | 14 +++++------ arch/x86/kernel/traps.c | 6 ++-- include/linux/nmi.h | 2 - kernel/nmi_watchdog.c | 51 ++++++++++++++++++++---------------------- 4 files changed, 36 insertions(+), 37 deletions(-) --- arch/x86/kernel/apic/hw_nmi.c | 14 ++++++------ arch/x86/kernel/traps.c | 6 ++--- include/linux/nmi.h | 2 +- kernel/nmi_watchdog.c | 51 +++++++++++++++++++++---------------------- 4 files changed, 36 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 0b4d205a6b8e..e8b78a0be5de 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -38,15 +38,15 @@ static inline unsigned int get_timer_irqs(int cpu) irqs += per_cpu(irq_stat, cpu).apic_timer_irqs; #endif - return irqs; + return irqs; } static inline int mce_in_progress(void) { #if defined(CONFIG_X86_MCE) - return atomic_read(&mce_entry) > 0; + return atomic_read(&mce_entry) > 0; #endif - return 0; + return 0; } int hw_nmi_is_cpu_stuck(struct pt_regs *regs) @@ -69,9 +69,9 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *regs) } /* if we are doing an mce, just assume the cpu is not stuck */ - /* Could check oops_in_progress here too, but it's safer not to */ - if (mce_in_progress()) - return 0; + /* Could check oops_in_progress here too, but it's safer not to */ + if (mce_in_progress()) + return 0; /* We determine if the cpu is stuck by checking whether any * interrupts have happened since we last checked. Of course @@ -89,7 +89,7 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *regs) u64 hw_nmi_get_sample_period(void) { - return cpu_khz * 1000; + return cpu_khz * 1000; } #ifdef ARCH_HAS_NMI_WATCHDOG diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 973cbc4f044f..bdc7fab3ef3e 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -402,9 +402,9 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) return; #ifdef CONFIG_X86_LOCAL_APIC - if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) - == NOTIFY_STOP) - return; + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) + == NOTIFY_STOP) + return; #ifndef CONFIG_NMI_WATCHDOG /* diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 794e7354c5bf..22cc7960b649 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -57,7 +57,7 @@ u64 hw_nmi_get_sample_period(void); extern int nmi_watchdog_enabled; struct ctl_table; extern int proc_nmi_enabled(struct ctl_table *, int , - void __user *, size_t *, loff_t *); + void __user *, size_t *, loff_t *); #endif #endif diff --git a/kernel/nmi_watchdog.c b/kernel/nmi_watchdog.c index 3c75cbf3acb8..0a6f57f537a7 100644 --- a/kernel/nmi_watchdog.c +++ b/kernel/nmi_watchdog.c @@ -50,31 +50,31 @@ void touch_all_nmi_watchdog(void) static int __init setup_nmi_watchdog(char *str) { - if (!strncmp(str, "panic", 5)) { - panic_on_timeout = 1; - str = strchr(str, ','); - if (!str) - return 1; - ++str; - } - return 1; + if (!strncmp(str, "panic", 5)) { + panic_on_timeout = 1; + str = strchr(str, ','); + if (!str) + return 1; + ++str; + } + return 1; } __setup("nmi_watchdog=", setup_nmi_watchdog); struct perf_event_attr wd_hw_attr = { - .type = PERF_TYPE_HARDWARE, - .config = PERF_COUNT_HW_CPU_CYCLES, - .size = sizeof(struct perf_event_attr), - .pinned = 1, - .disabled = 1, + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_CPU_CYCLES, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 1, }; struct perf_event_attr wd_sw_attr = { - .type = PERF_TYPE_SOFTWARE, - .config = PERF_COUNT_SW_CPU_CLOCK, - .size = sizeof(struct perf_event_attr), - .pinned = 1, - .disabled = 1, + .type = PERF_TYPE_SOFTWARE, + .config = PERF_COUNT_SW_CPU_CLOCK, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 1, }; void wd_overflow(struct perf_event *event, int nmi, @@ -95,16 +95,15 @@ void wd_overflow(struct perf_event *event, int nmi, * Ayiee, looks like this CPU is stuck ... * wait a few IRQs (5 seconds) before doing the oops ... */ - per_cpu(alert_counter,cpu) += 1; - if (per_cpu(alert_counter,cpu) == 5) { - if (panic_on_timeout) { + per_cpu(alert_counter, cpu) += 1; + if (per_cpu(alert_counter, cpu) == 5) { + if (panic_on_timeout) panic("NMI Watchdog detected LOCKUP on cpu %d", cpu); - } else { + else WARN(1, "NMI Watchdog detected LOCKUP on cpu %d", cpu); - } } } else { - per_cpu(alert_counter,cpu) = 0; + per_cpu(alert_counter, cpu) = 0; } return; @@ -126,7 +125,7 @@ static int enable_nmi_watchdog(int cpu) event = perf_event_create_kernel_counter(wd_attr, cpu, -1, wd_overflow); if (IS_ERR(event)) { /* hardware doesn't exist or not supported, fallback to software events */ - printk("nmi_watchdog: hardware not available, trying software events\n"); + printk(KERN_INFO "nmi_watchdog: hardware not available, trying software events\n"); wd_attr = &wd_sw_attr; wd_attr->sample_period = NSEC_PER_SEC; event = perf_event_create_kernel_counter(wd_attr, cpu, -1, wd_overflow); @@ -182,7 +181,7 @@ int proc_nmi_enabled(struct ctl_table *table, int write, if (nmi_watchdog_enabled) { for_each_online_cpu(cpu) if (enable_nmi_watchdog(cpu)) { - printk("NMI watchdog failed configuration, " + printk(KERN_ERR "NMI watchdog failed configuration, " " can not be enabled\n"); } } else { -- cgit v1.2.3-70-g09d2 From 5671a10e2bc7f99d9157c6044faf8be2ef302361 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Mar 2010 14:20:14 +0100 Subject: nmi_watchdog: Tell the world we're active Because I was wondering why perf stat wasn't working as expected.. Signed-off-by: Peter Zijlstra Cc: Don Zickus LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/nmi_watchdog.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/nmi_watchdog.c b/kernel/nmi_watchdog.c index 0a6f57f537a7..a79d211c30df 100644 --- a/kernel/nmi_watchdog.c +++ b/kernel/nmi_watchdog.c @@ -244,6 +244,8 @@ static int __init spawn_nmi_watchdog_task(void) if (nonmi_watchdog) return 0; + printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); + err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); if (err == NOTIFY_BAD) { BUG(); -- cgit v1.2.3-70-g09d2 From 58687acba59266735adb8ccd9b5b9aa2c7cd205b Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 7 May 2010 17:11:44 -0400 Subject: lockup_detector: Combine nmi_watchdog and softlockup detector The new nmi_watchdog (which uses the perf event subsystem) is very similar in structure to the softlockup detector. Using Ingo's suggestion, I combined the two functionalities into one file: kernel/watchdog.c. Now both the nmi_watchdog (or hardlockup detector) and softlockup detector sit on top of the perf event subsystem, which is run every 60 seconds or so to see if there are any lockups. To detect hardlockups, cpus not responding to interrupts, I implemented an hrtimer that runs 5 times for every perf event overflow event. If that stops counting on a cpu, then the cpu is most likely in trouble. To detect softlockups, tasks not yielding to the scheduler, I used the previous kthread idea that now gets kicked every time the hrtimer fires. If the kthread isn't being scheduled neither is anyone else and the warning is printed to the console. I tested this on x86_64 and both the softlockup and hardlockup paths work. V2: - cleaned up the Kconfig and softlockup combination - surrounded hardlockup cases with #ifdef CONFIG_PERF_EVENTS_NMI - seperated out the softlockup case from perf event subsystem - re-arranged the enabling/disabling nmi watchdog from proc space - added cpumasks for hardlockup failure cases - removed fallback to soft events if no PMU exists for hard events V3: - comment cleanups - drop support for older softlockup code - per_cpu cleanups - completely remove software clock base hardlockup detector - use per_cpu masking on hard/soft lockup detection - #ifdef cleanups - rename config option NMI_WATCHDOG to LOCKUP_DETECTOR - documentation additions V4: - documentation fixes - convert per_cpu to __get_cpu_var - powerpc compile fixes V5: - split apart warn flags for hard and soft lockups TODO: - figure out how to make an arch-agnostic clock2cycles call (if possible) to feed into perf events as a sample period [fweisbec: merged conflict patch] Signed-off-by: Don Zickus Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Cyrill Gorcunov Cc: Eric Paris Cc: Randy Dunlap LKML-Reference: <1273266711-18706-2-git-send-email-dzickus@redhat.com> Signed-off-by: Frederic Weisbecker --- Documentation/kernel-parameters.txt | 2 + arch/x86/include/asm/nmi.h | 2 +- arch/x86/kernel/apic/Makefile | 4 +- arch/x86/kernel/apic/hw_nmi.c | 2 +- arch/x86/kernel/traps.c | 4 +- include/linux/nmi.h | 8 +- include/linux/sched.h | 6 + init/Kconfig | 5 +- kernel/Makefile | 3 +- kernel/sysctl.c | 21 +- kernel/watchdog.c | 592 ++++++++++++++++++++++++++++++++++++ lib/Kconfig.debug | 30 +- 12 files changed, 650 insertions(+), 29 deletions(-) create mode 100644 kernel/watchdog.c (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 839b21b0699a..dfe8d1c226c6 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1777,6 +1777,8 @@ and is between 256 and 4096 characters. It is defined in the file nousb [USB] Disable the USB subsystem + nowatchdog [KNL] Disable the lockup detector. + nowb [ARM] nox2apic [X86-64,APIC] Do not enable x2APIC mode. diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 5b41b0feb6db..932f0f86b4b7 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -17,7 +17,7 @@ int do_nmi_callback(struct pt_regs *regs, int cpu); extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); extern int check_nmi_watchdog(void); -#if !defined(CONFIG_NMI_WATCHDOG) +#if !defined(CONFIG_LOCKUP_DETECTOR) extern int nmi_watchdog_enabled; #endif extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 1a4512e48d24..52f32e0ea194 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -3,10 +3,10 @@ # obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o -ifneq ($(CONFIG_NMI_WATCHDOG),y) +ifneq ($(CONFIG_LOCKUP_DETECTOR),y) obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o endif -obj-$(CONFIG_NMI_WATCHDOG) += hw_nmi.o +obj-$(CONFIG_LOCKUP_DETECTOR) += hw_nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_SMP) += ipi.o diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index e8b78a0be5de..79425f96fcee 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -89,7 +89,7 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *regs) u64 hw_nmi_get_sample_period(void) { - return cpu_khz * 1000; + return (u64)(cpu_khz) * 1000 * 60; } #ifdef ARCH_HAS_NMI_WATCHDOG diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index bdc7fab3ef3e..bd347c2b34dc 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -406,7 +406,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) == NOTIFY_STOP) return; -#ifndef CONFIG_NMI_WATCHDOG +#ifndef CONFIG_LOCKUP_DETECTOR /* * Ok, so this is none of the documented NMI sources, * so it must be the NMI watchdog. @@ -414,7 +414,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) if (nmi_watchdog_tick(regs, reason)) return; if (!do_nmi_callback(regs, cpu)) -#endif /* !CONFIG_NMI_WATCHDOG */ +#endif /* !CONFIG_LOCKUP_DETECTOR */ unknown_nmi_error(reason, regs); #else unknown_nmi_error(reason, regs); diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 22cc7960b649..abd48aacaf79 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -20,7 +20,7 @@ extern void touch_nmi_watchdog(void); extern void acpi_nmi_disable(void); extern void acpi_nmi_enable(void); #else -#ifndef CONFIG_NMI_WATCHDOG +#ifndef CONFIG_LOCKUP_DETECTOR static inline void touch_nmi_watchdog(void) { touch_softlockup_watchdog(); @@ -51,12 +51,12 @@ static inline bool trigger_all_cpu_backtrace(void) } #endif -#ifdef CONFIG_NMI_WATCHDOG +#ifdef CONFIG_LOCKUP_DETECTOR int hw_nmi_is_cpu_stuck(struct pt_regs *); u64 hw_nmi_get_sample_period(void); -extern int nmi_watchdog_enabled; +extern int watchdog_enabled; struct ctl_table; -extern int proc_nmi_enabled(struct ctl_table *, int , +extern int proc_dowatchdog_enabled(struct ctl_table *, int , void __user *, size_t *, loff_t *); #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index dad7f668ebf7..37efe8fa5306 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -346,6 +346,12 @@ extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, size_t *lenp, loff_t *ppos); #endif +#ifdef CONFIG_LOCKUP_DETECTOR +extern int proc_dowatchdog_thresh(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos); +#endif + /* Attach to any functions which should be ignored in wchan output. */ #define __sched __attribute__((__section__(".sched.text"))) diff --git a/init/Kconfig b/init/Kconfig index c6c8903cb534..e44e25422f22 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -944,8 +944,11 @@ config PERF_USE_VMALLOC config PERF_EVENTS_NMI bool + depends on PERF_EVENTS help - Arch has support for nmi_watchdog + System hardware can generate an NMI using the perf event + subsystem. Also has support for calculating CPU cycle events + to determine how many clock cycles in a given period. menu "Kernel Performance Events And Counters" diff --git a/kernel/Makefile b/kernel/Makefile index d5c30060ac14..6adeafc3e259 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -76,9 +76,8 @@ obj-$(CONFIG_GCOV_KERNEL) += gcov/ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_KGDB) += kgdb.o -obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o -obj-$(CONFIG_NMI_WATCHDOG) += nmi_watchdog.o obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o +obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o diff --git a/kernel/sysctl.c b/kernel/sysctl.c index a38af430f0d8..0f9adda85f97 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -74,7 +74,7 @@ #include #endif -#ifdef CONFIG_NMI_WATCHDOG +#ifdef CONFIG_LOCKUP_DETECTOR #include #endif @@ -686,16 +686,25 @@ static struct ctl_table kern_table[] = { .mode = 0444, .proc_handler = proc_dointvec, }, -#if defined(CONFIG_NMI_WATCHDOG) +#if defined(CONFIG_LOCKUP_DETECTOR) { - .procname = "nmi_watchdog", - .data = &nmi_watchdog_enabled, + .procname = "watchdog", + .data = &watchdog_enabled, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = proc_nmi_enabled, + .proc_handler = proc_dowatchdog_enabled, + }, + { + .procname = "watchdog_thresh", + .data = &softlockup_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dowatchdog_thresh, + .extra1 = &neg_one, + .extra2 = &sixty, }, #endif -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_NMI_WATCHDOG) +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_LOCKUP_DETECTOR) { .procname = "unknown_nmi_panic", .data = &unknown_nmi_panic, diff --git a/kernel/watchdog.c b/kernel/watchdog.c new file mode 100644 index 000000000000..6b7fad8497af --- /dev/null +++ b/kernel/watchdog.c @@ -0,0 +1,592 @@ +/* + * Detect hard and soft lockups on a system + * + * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. + * + * this code detects hard lockups: incidents in where on a CPU + * the kernel does not respond to anything except NMI. + * + * Note: Most of this code is borrowed heavily from softlockup.c, + * so thanks to Ingo for the initial implementation. + * Some chunks also taken from arch/x86/kernel/apic/nmi.c, thanks + * to those contributors as well. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +int watchdog_enabled; +int __read_mostly softlockup_thresh = 60; + +static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); +static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); +static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); +static DEFINE_PER_CPU(bool, softlockup_touch_sync); +static DEFINE_PER_CPU(bool, hard_watchdog_warn); +static DEFINE_PER_CPU(bool, soft_watchdog_warn); +#ifdef CONFIG_PERF_EVENTS_NMI +static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); +static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); +static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); +#endif + +static int __read_mostly did_panic; +static int __initdata no_watchdog; + + +/* boot commands */ +/* + * Should we panic when a soft-lockup or hard-lockup occurs: + */ +#ifdef CONFIG_PERF_EVENTS_NMI +static int hardlockup_panic; + +static int __init hardlockup_panic_setup(char *str) +{ + if (!strncmp(str, "panic", 5)) + hardlockup_panic = 1; + return 1; +} +__setup("nmi_watchdog=", hardlockup_panic_setup); +#endif + +unsigned int __read_mostly softlockup_panic = + CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; + +static int __init softlockup_panic_setup(char *str) +{ + softlockup_panic = simple_strtoul(str, NULL, 0); + + return 1; +} +__setup("softlockup_panic=", softlockup_panic_setup); + +static int __init nowatchdog_setup(char *str) +{ + no_watchdog = 1; + return 1; +} +__setup("nowatchdog", nowatchdog_setup); + +/* deprecated */ +static int __init nosoftlockup_setup(char *str) +{ + no_watchdog = 1; + return 1; +} +__setup("nosoftlockup", nosoftlockup_setup); +/* */ + + +/* + * Returns seconds, approximately. We don't need nanosecond + * resolution, and we don't need to waste time with a big divide when + * 2^30ns == 1.074s. + */ +static unsigned long get_timestamp(int this_cpu) +{ + return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ +} + +static unsigned long get_sample_period(void) +{ + /* + * convert softlockup_thresh from seconds to ns + * the divide by 5 is to give hrtimer 5 chances to + * increment before the hardlockup detector generates + * a warning + */ + return softlockup_thresh / 5 * NSEC_PER_SEC; +} + +/* Commands for resetting the watchdog */ +static void __touch_watchdog(void) +{ + int this_cpu = raw_smp_processor_id(); + + __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu); +} + +void touch_watchdog(void) +{ + __get_cpu_var(watchdog_touch_ts) = 0; +} +EXPORT_SYMBOL(touch_watchdog); + +void touch_all_watchdog(void) +{ + int cpu; + + /* + * this is done lockless + * do we care if a 0 races with a timestamp? + * all it means is the softlock check starts one cycle later + */ + for_each_online_cpu(cpu) + per_cpu(watchdog_touch_ts, cpu) = 0; +} + +void touch_nmi_watchdog(void) +{ + touch_watchdog(); +} +EXPORT_SYMBOL(touch_nmi_watchdog); + +void touch_all_nmi_watchdog(void) +{ + touch_all_watchdog(); +} + +void touch_softlockup_watchdog(void) +{ + touch_watchdog(); +} + +void touch_all_softlockup_watchdogs(void) +{ + touch_all_watchdog(); +} + +void touch_softlockup_watchdog_sync(void) +{ + __raw_get_cpu_var(softlockup_touch_sync) = true; + __raw_get_cpu_var(watchdog_touch_ts) = 0; +} + +void softlockup_tick(void) +{ +} + +#ifdef CONFIG_PERF_EVENTS_NMI +/* watchdog detector functions */ +static int is_hardlockup(int cpu) +{ + unsigned long hrint = per_cpu(hrtimer_interrupts, cpu); + + if (per_cpu(hrtimer_interrupts_saved, cpu) == hrint) + return 1; + + per_cpu(hrtimer_interrupts_saved, cpu) = hrint; + return 0; +} +#endif + +static int is_softlockup(unsigned long touch_ts, int cpu) +{ + unsigned long now = get_timestamp(cpu); + + /* Warn about unreasonable delays: */ + if (time_after(now, touch_ts + softlockup_thresh)) + return now - touch_ts; + + return 0; +} + +static int +watchdog_panic(struct notifier_block *this, unsigned long event, void *ptr) +{ + did_panic = 1; + + return NOTIFY_DONE; +} + +static struct notifier_block panic_block = { + .notifier_call = watchdog_panic, +}; + +#ifdef CONFIG_PERF_EVENTS_NMI +static struct perf_event_attr wd_hw_attr = { + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_CPU_CYCLES, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 1, +}; + +/* Callback function for perf event subsystem */ +void watchdog_overflow_callback(struct perf_event *event, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + int this_cpu = smp_processor_id(); + unsigned long touch_ts = per_cpu(watchdog_touch_ts, this_cpu); + + if (touch_ts == 0) { + __touch_watchdog(); + return; + } + + /* check for a hardlockup + * This is done by making sure our timer interrupt + * is incrementing. The timer interrupt should have + * fired multiple times before we overflow'd. If it hasn't + * then this is a good indication the cpu is stuck + */ + if (is_hardlockup(this_cpu)) { + /* only print hardlockups once */ + if (__get_cpu_var(hard_watchdog_warn) == true) + return; + + if (hardlockup_panic) + panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu); + else + WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); + + __get_cpu_var(hard_watchdog_warn) = true; + return; + } + + __get_cpu_var(hard_watchdog_warn) = false; + return; +} +static void watchdog_interrupt_count(void) +{ + __get_cpu_var(hrtimer_interrupts)++; +} +#else +static inline void watchdog_interrupt_count(void) { return; } +#endif /* CONFIG_PERF_EVENTS_NMI */ + +/* watchdog kicker functions */ +static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) +{ + int this_cpu = smp_processor_id(); + unsigned long touch_ts = __get_cpu_var(watchdog_touch_ts); + struct pt_regs *regs = get_irq_regs(); + int duration; + + /* kick the hardlockup detector */ + watchdog_interrupt_count(); + + /* kick the softlockup detector */ + wake_up_process(__get_cpu_var(softlockup_watchdog)); + + /* .. and repeat */ + hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); + + if (touch_ts == 0) { + if (unlikely(per_cpu(softlockup_touch_sync, this_cpu))) { + /* + * If the time stamp was touched atomically + * make sure the scheduler tick is up to date. + */ + per_cpu(softlockup_touch_sync, this_cpu) = false; + sched_clock_tick(); + } + __touch_watchdog(); + return HRTIMER_RESTART; + } + + /* check for a softlockup + * This is done by making sure a high priority task is + * being scheduled. The task touches the watchdog to + * indicate it is getting cpu time. If it hasn't then + * this is a good indication some task is hogging the cpu + */ + duration = is_softlockup(touch_ts, this_cpu); + if (unlikely(duration)) { + /* only warn once */ + if (__get_cpu_var(soft_watchdog_warn) == true) + return HRTIMER_RESTART; + + printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", + this_cpu, duration, + current->comm, task_pid_nr(current)); + print_modules(); + print_irqtrace_events(current); + if (regs) + show_regs(regs); + else + dump_stack(); + + if (softlockup_panic) + panic("softlockup: hung tasks"); + __get_cpu_var(soft_watchdog_warn) = true; + } else + __get_cpu_var(soft_watchdog_warn) = false; + + return HRTIMER_RESTART; +} + + +/* + * The watchdog thread - touches the timestamp. + */ +static int watchdog(void *__bind_cpu) +{ + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, (unsigned long)__bind_cpu); + + sched_setscheduler(current, SCHED_FIFO, ¶m); + + /* initialize timestamp */ + __touch_watchdog(); + + /* kick off the timer for the hardlockup detector */ + /* done here because hrtimer_start can only pin to smp_processor_id() */ + hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()), + HRTIMER_MODE_REL_PINNED); + + set_current_state(TASK_INTERRUPTIBLE); + /* + * Run briefly once per second to reset the softlockup timestamp. + * If this gets delayed for more than 60 seconds then the + * debug-printout triggers in softlockup_tick(). + */ + while (!kthread_should_stop()) { + __touch_watchdog(); + schedule(); + + if (kthread_should_stop()) + break; + + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + + return 0; +} + + +#ifdef CONFIG_PERF_EVENTS_NMI +static int watchdog_nmi_enable(int cpu) +{ + struct perf_event_attr *wd_attr; + struct perf_event *event = per_cpu(watchdog_ev, cpu); + + /* is it already setup and enabled? */ + if (event && event->state > PERF_EVENT_STATE_OFF) + goto out; + + /* it is setup but not enabled */ + if (event != NULL) + goto out_enable; + + /* Try to register using hardware perf events */ + wd_attr = &wd_hw_attr; + wd_attr->sample_period = hw_nmi_get_sample_period(); + event = perf_event_create_kernel_counter(wd_attr, cpu, -1, watchdog_overflow_callback); + if (!IS_ERR(event)) { + printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); + goto out_save; + } + + printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event); + return -1; + + /* success path */ +out_save: + per_cpu(watchdog_ev, cpu) = event; +out_enable: + perf_event_enable(per_cpu(watchdog_ev, cpu)); +out: + return 0; +} + +static void watchdog_nmi_disable(int cpu) +{ + struct perf_event *event = per_cpu(watchdog_ev, cpu); + + if (event) { + perf_event_disable(event); + per_cpu(watchdog_ev, cpu) = NULL; + + /* should be in cleanup, but blocks oprofile */ + perf_event_release_kernel(event); + } + return; +} +#else +static int watchdog_nmi_enable(int cpu) { return 0; } +static void watchdog_nmi_disable(int cpu) { return; } +#endif /* CONFIG_PERF_EVENTS_NMI */ + +/* prepare/enable/disable routines */ +static int watchdog_prepare_cpu(int cpu) +{ + struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); + + WARN_ON(per_cpu(softlockup_watchdog, cpu)); + hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer->function = watchdog_timer_fn; + + return 0; +} + +static int watchdog_enable(int cpu) +{ + struct task_struct *p = per_cpu(softlockup_watchdog, cpu); + + /* enable the perf event */ + if (watchdog_nmi_enable(cpu) != 0) + return -1; + + /* create the watchdog thread */ + if (!p) { + p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); + if (IS_ERR(p)) { + printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); + return -1; + } + kthread_bind(p, cpu); + per_cpu(watchdog_touch_ts, cpu) = 0; + per_cpu(softlockup_watchdog, cpu) = p; + wake_up_process(p); + } + + return 0; +} + +static void watchdog_disable(int cpu) +{ + struct task_struct *p = per_cpu(softlockup_watchdog, cpu); + struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); + + /* + * cancel the timer first to stop incrementing the stats + * and waking up the kthread + */ + hrtimer_cancel(hrtimer); + + /* disable the perf event */ + watchdog_nmi_disable(cpu); + + /* stop the watchdog thread */ + if (p) { + per_cpu(softlockup_watchdog, cpu) = NULL; + kthread_stop(p); + } + + /* if any cpu succeeds, watchdog is considered enabled for the system */ + watchdog_enabled = 1; +} + +static void watchdog_enable_all_cpus(void) +{ + int cpu; + int result; + + for_each_online_cpu(cpu) + result += watchdog_enable(cpu); + + if (result) + printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n"); + +} + +static void watchdog_disable_all_cpus(void) +{ + int cpu; + + for_each_online_cpu(cpu) + watchdog_disable(cpu); + + /* if all watchdogs are disabled, then they are disabled for the system */ + watchdog_enabled = 0; +} + + +/* sysctl functions */ +#ifdef CONFIG_SYSCTL +/* + * proc handler for /proc/sys/kernel/nmi_watchdog + */ + +int proc_dowatchdog_enabled(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + proc_dointvec(table, write, buffer, length, ppos); + + if (watchdog_enabled) + watchdog_enable_all_cpus(); + else + watchdog_disable_all_cpus(); + return 0; +} + +int proc_dowatchdog_thresh(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); +} + +/* stub functions */ +int proc_dosoftlockup_thresh(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + return proc_dowatchdog_thresh(table, write, buffer, lenp, ppos); +} +/* end of stub functions */ +#endif /* CONFIG_SYSCTL */ + + +/* + * Create/destroy watchdog threads as CPUs come and go: + */ +static int __cpuinit +cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + int hotcpu = (unsigned long)hcpu; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + if (watchdog_prepare_cpu(hotcpu)) + return NOTIFY_BAD; + break; + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + if (watchdog_enable(hotcpu)) + return NOTIFY_BAD; + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + watchdog_disable(hotcpu); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + watchdog_disable(hotcpu); + break; +#endif /* CONFIG_HOTPLUG_CPU */ + } + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata cpu_nfb = { + .notifier_call = cpu_callback +}; + +static int __init spawn_watchdog_task(void) +{ + void *cpu = (void *)(long)smp_processor_id(); + int err; + + if (no_watchdog) + return 0; + + err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + WARN_ON(err == NOTIFY_BAD); + + cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); + register_cpu_notifier(&cpu_nfb); + + atomic_notifier_chain_register(&panic_notifier_list, &panic_block); + + return 0; +} +early_initcall(spawn_watchdog_task); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 220ae6063b6f..49e285dcaf57 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -153,7 +153,7 @@ config DEBUG_SHIRQ points; some don't and need to be caught. config DETECT_SOFTLOCKUP - bool "Detect Soft Lockups" + bool depends on DEBUG_KERNEL && !S390 default y help @@ -171,17 +171,27 @@ config DETECT_SOFTLOCKUP can be detected via the NMI-watchdog, on platforms that support it.) -config NMI_WATCHDOG - bool "Detect Hard Lockups with an NMI Watchdog" - depends on DEBUG_KERNEL && PERF_EVENTS && PERF_EVENTS_NMI +config LOCKUP_DETECTOR + bool "Detect Hard and Soft Lockups" + depends on DEBUG_KERNEL + default DETECT_SOFTLOCKUP help - Say Y here to enable the kernel to use the NMI as a watchdog - to detect hard lockups. This is useful when a cpu hangs for no - reason but can still respond to NMIs. A backtrace is displayed - for reviewing and reporting. + Say Y here to enable the kernel to act as a watchdog to detect + hard and soft lockups. + + Softlockups are bugs that cause the kernel to loop in kernel + mode for more than 60 seconds, without giving other tasks a + chance to run. The current stack trace is displayed upon + detection and the system will stay locked up. + + Hardlockups are bugs that cause the CPU to loop in kernel mode + for more than 60 seconds, without letting other interrupts have a + chance to run. The current stack trace is displayed upon detection + and the system will stay locked up. - The overhead should be minimal, just an extra NMI every few - seconds. + The overhead should be minimal. A periodic hrtimer runs to + generate interrupts and kick the watchdog task every 10-12 seconds. + An NMI is generated every 60 seconds or so to check for hardlockups. config BOOTPARAM_SOFTLOCKUP_PANIC bool "Panic (Reboot) On Soft Lockups" -- cgit v1.2.3-70-g09d2 From 332fbdbca3f7716c5620970755ae054d213bcc4e Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 7 May 2010 17:11:45 -0400 Subject: lockup_detector: Touch_softlockup cleanups and softlockup_tick removal Just some code cleanup to make touch_softlockup clearer and remove the softlockup_tick function as it is no longer needed. Also remove the /proc softlockup_thres call as it has been changed to watchdog_thres. Signed-off-by: Don Zickus Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Cyrill Gorcunov Cc: Eric Paris Cc: Randy Dunlap LKML-Reference: <1273266711-18706-3-git-send-email-dzickus@redhat.com> Signed-off-by: Frederic Weisbecker --- include/linux/sched.h | 16 +++------------- kernel/sysctl.c | 9 --------- kernel/timer.c | 1 - kernel/watchdog.c | 35 +++-------------------------------- 4 files changed, 6 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 37efe8fa5306..33f9b2ad0bbb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -312,19 +312,15 @@ extern void scheduler_tick(void); extern void sched_show_task(struct task_struct *p); #ifdef CONFIG_DETECT_SOFTLOCKUP -extern void softlockup_tick(void); extern void touch_softlockup_watchdog(void); extern void touch_softlockup_watchdog_sync(void); extern void touch_all_softlockup_watchdogs(void); -extern int proc_dosoftlockup_thresh(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos); +extern int proc_dowatchdog_thresh(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos); extern unsigned int softlockup_panic; extern int softlockup_thresh; #else -static inline void softlockup_tick(void) -{ -} static inline void touch_softlockup_watchdog(void) { } @@ -346,12 +342,6 @@ extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, size_t *lenp, loff_t *ppos); #endif -#ifdef CONFIG_LOCKUP_DETECTOR -extern int proc_dowatchdog_thresh(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos); -#endif - /* Attach to any functions which should be ignored in wchan output. */ #define __sched __attribute__((__section__(".sched.text"))) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0f9adda85f97..999bc3fccf47 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -817,15 +817,6 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, - { - .procname = "softlockup_thresh", - .data = &softlockup_thresh, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dosoftlockup_thresh, - .extra1 = &neg_one, - .extra2 = &sixty, - }, #endif #ifdef CONFIG_DETECT_HUNG_TASK { diff --git a/kernel/timer.c b/kernel/timer.c index aeb6a54f2771..e8de5eb07a02 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1225,7 +1225,6 @@ void run_local_timers(void) { hrtimer_run_queues(); raise_softirq(TIMER_SOFTIRQ); - softlockup_tick(); } /* diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 6b7fad8497af..f1541b7e3244 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -119,13 +119,12 @@ static void __touch_watchdog(void) __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu); } -void touch_watchdog(void) +void touch_softlockup_watchdog(void) { __get_cpu_var(watchdog_touch_ts) = 0; } -EXPORT_SYMBOL(touch_watchdog); -void touch_all_watchdog(void) +void touch_all_softlockup_watchdogs(void) { int cpu; @@ -140,35 +139,16 @@ void touch_all_watchdog(void) void touch_nmi_watchdog(void) { - touch_watchdog(); + touch_softlockup_watchdog(); } EXPORT_SYMBOL(touch_nmi_watchdog); -void touch_all_nmi_watchdog(void) -{ - touch_all_watchdog(); -} - -void touch_softlockup_watchdog(void) -{ - touch_watchdog(); -} - -void touch_all_softlockup_watchdogs(void) -{ - touch_all_watchdog(); -} - void touch_softlockup_watchdog_sync(void) { __raw_get_cpu_var(softlockup_touch_sync) = true; __raw_get_cpu_var(watchdog_touch_ts) = 0; } -void softlockup_tick(void) -{ -} - #ifdef CONFIG_PERF_EVENTS_NMI /* watchdog detector functions */ static int is_hardlockup(int cpu) @@ -522,15 +502,6 @@ int proc_dowatchdog_thresh(struct ctl_table *table, int write, { return proc_dointvec_minmax(table, write, buffer, lenp, ppos); } - -/* stub functions */ -int proc_dosoftlockup_thresh(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - return proc_dowatchdog_thresh(table, write, buffer, lenp, ppos); -} -/* end of stub functions */ #endif /* CONFIG_SYSCTL */ -- cgit v1.2.3-70-g09d2 From 2508ce1845a3b256798532b2c6b7997c2dc6533b Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 7 May 2010 17:11:46 -0400 Subject: lockup_detector: Remove old softlockup code Now that is no longer compiled or used, just remove it. Also move some of the code wrapped with DETECT_SOFTLOCKUP to the LOCKUP_DETECTOR wrappers because that is the code that uses it now. Signed-off-by: Don Zickus Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Cyrill Gorcunov Cc: Eric Paris Cc: Randy Dunlap LKML-Reference: <1273266711-18706-4-git-send-email-dzickus@redhat.com> Signed-off-by: Frederic Weisbecker --- kernel/softlockup.c | 293 ---------------------------------------------------- kernel/sysctl.c | 22 ++-- 2 files changed, 10 insertions(+), 305 deletions(-) delete mode 100644 kernel/softlockup.c (limited to 'kernel') diff --git a/kernel/softlockup.c b/kernel/softlockup.c deleted file mode 100644 index 4b493f67dcb5..000000000000 --- a/kernel/softlockup.c +++ /dev/null @@ -1,293 +0,0 @@ -/* - * Detect Soft Lockups - * - * started by Ingo Molnar, Copyright (C) 2005, 2006 Red Hat, Inc. - * - * this code detects soft lockups: incidents in where on a CPU - * the kernel does not reschedule for 10 seconds or more. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -static DEFINE_SPINLOCK(print_lock); - -static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */ -static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */ -static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); -static DEFINE_PER_CPU(bool, softlock_touch_sync); - -static int __read_mostly did_panic; -int __read_mostly softlockup_thresh = 60; - -/* - * Should we panic (and reboot, if panic_timeout= is set) when a - * soft-lockup occurs: - */ -unsigned int __read_mostly softlockup_panic = - CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; - -static int __init softlockup_panic_setup(char *str) -{ - softlockup_panic = simple_strtoul(str, NULL, 0); - - return 1; -} -__setup("softlockup_panic=", softlockup_panic_setup); - -static int -softlock_panic(struct notifier_block *this, unsigned long event, void *ptr) -{ - did_panic = 1; - - return NOTIFY_DONE; -} - -static struct notifier_block panic_block = { - .notifier_call = softlock_panic, -}; - -/* - * Returns seconds, approximately. We don't need nanosecond - * resolution, and we don't need to waste time with a big divide when - * 2^30ns == 1.074s. - */ -static unsigned long get_timestamp(int this_cpu) -{ - return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ -} - -static void __touch_softlockup_watchdog(void) -{ - int this_cpu = raw_smp_processor_id(); - - __raw_get_cpu_var(softlockup_touch_ts) = get_timestamp(this_cpu); -} - -void touch_softlockup_watchdog(void) -{ - __raw_get_cpu_var(softlockup_touch_ts) = 0; -} -EXPORT_SYMBOL(touch_softlockup_watchdog); - -void touch_softlockup_watchdog_sync(void) -{ - __raw_get_cpu_var(softlock_touch_sync) = true; - __raw_get_cpu_var(softlockup_touch_ts) = 0; -} - -void touch_all_softlockup_watchdogs(void) -{ - int cpu; - - /* Cause each CPU to re-update its timestamp rather than complain */ - for_each_online_cpu(cpu) - per_cpu(softlockup_touch_ts, cpu) = 0; -} -EXPORT_SYMBOL(touch_all_softlockup_watchdogs); - -int proc_dosoftlockup_thresh(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - touch_all_softlockup_watchdogs(); - return proc_dointvec_minmax(table, write, buffer, lenp, ppos); -} - -/* - * This callback runs from the timer interrupt, and checks - * whether the watchdog thread has hung or not: - */ -void softlockup_tick(void) -{ - int this_cpu = smp_processor_id(); - unsigned long touch_ts = per_cpu(softlockup_touch_ts, this_cpu); - unsigned long print_ts; - struct pt_regs *regs = get_irq_regs(); - unsigned long now; - - /* Is detection switched off? */ - if (!per_cpu(softlockup_watchdog, this_cpu) || softlockup_thresh <= 0) { - /* Be sure we don't false trigger if switched back on */ - if (touch_ts) - per_cpu(softlockup_touch_ts, this_cpu) = 0; - return; - } - - if (touch_ts == 0) { - if (unlikely(per_cpu(softlock_touch_sync, this_cpu))) { - /* - * If the time stamp was touched atomically - * make sure the scheduler tick is up to date. - */ - per_cpu(softlock_touch_sync, this_cpu) = false; - sched_clock_tick(); - } - __touch_softlockup_watchdog(); - return; - } - - print_ts = per_cpu(softlockup_print_ts, this_cpu); - - /* report at most once a second */ - if (print_ts == touch_ts || did_panic) - return; - - /* do not print during early bootup: */ - if (unlikely(system_state != SYSTEM_RUNNING)) { - __touch_softlockup_watchdog(); - return; - } - - now = get_timestamp(this_cpu); - - /* - * Wake up the high-prio watchdog task twice per - * threshold timespan. - */ - if (time_after(now - softlockup_thresh/2, touch_ts)) - wake_up_process(per_cpu(softlockup_watchdog, this_cpu)); - - /* Warn about unreasonable delays: */ - if (time_before_eq(now - softlockup_thresh, touch_ts)) - return; - - per_cpu(softlockup_print_ts, this_cpu) = touch_ts; - - spin_lock(&print_lock); - printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n", - this_cpu, now - touch_ts, - current->comm, task_pid_nr(current)); - print_modules(); - print_irqtrace_events(current); - if (regs) - show_regs(regs); - else - dump_stack(); - spin_unlock(&print_lock); - - if (softlockup_panic) - panic("softlockup: hung tasks"); -} - -/* - * The watchdog thread - runs every second and touches the timestamp. - */ -static int watchdog(void *__bind_cpu) -{ - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; - - sched_setscheduler(current, SCHED_FIFO, ¶m); - - /* initialize timestamp */ - __touch_softlockup_watchdog(); - - set_current_state(TASK_INTERRUPTIBLE); - /* - * Run briefly once per second to reset the softlockup timestamp. - * If this gets delayed for more than 60 seconds then the - * debug-printout triggers in softlockup_tick(). - */ - while (!kthread_should_stop()) { - __touch_softlockup_watchdog(); - schedule(); - - if (kthread_should_stop()) - break; - - set_current_state(TASK_INTERRUPTIBLE); - } - __set_current_state(TASK_RUNNING); - - return 0; -} - -/* - * Create/destroy watchdog threads as CPUs come and go: - */ -static int __cpuinit -cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - int hotcpu = (unsigned long)hcpu; - struct task_struct *p; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - BUG_ON(per_cpu(softlockup_watchdog, hotcpu)); - p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu); - if (IS_ERR(p)) { - printk(KERN_ERR "watchdog for %i failed\n", hotcpu); - return NOTIFY_BAD; - } - per_cpu(softlockup_touch_ts, hotcpu) = 0; - per_cpu(softlockup_watchdog, hotcpu) = p; - kthread_bind(p, hotcpu); - break; - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - wake_up_process(per_cpu(softlockup_watchdog, hotcpu)); - break; -#ifdef CONFIG_HOTPLUG_CPU - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - if (!per_cpu(softlockup_watchdog, hotcpu)) - break; - /* Unbind so it can run. Fall thru. */ - kthread_bind(per_cpu(softlockup_watchdog, hotcpu), - cpumask_any(cpu_online_mask)); - case CPU_DEAD: - case CPU_DEAD_FROZEN: - p = per_cpu(softlockup_watchdog, hotcpu); - per_cpu(softlockup_watchdog, hotcpu) = NULL; - kthread_stop(p); - break; -#endif /* CONFIG_HOTPLUG_CPU */ - } - return NOTIFY_OK; -} - -static struct notifier_block __cpuinitdata cpu_nfb = { - .notifier_call = cpu_callback -}; - -static int __initdata nosoftlockup; - -static int __init nosoftlockup_setup(char *str) -{ - nosoftlockup = 1; - return 1; -} -__setup("nosoftlockup", nosoftlockup_setup); - -static int __init spawn_softlockup_task(void) -{ - void *cpu = (void *)(long)smp_processor_id(); - int err; - - if (nosoftlockup) - return 0; - - err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); - if (err == NOTIFY_BAD) { - BUG(); - return 1; - } - cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); - register_cpu_notifier(&cpu_nfb); - - atomic_notifier_chain_register(&panic_notifier_list, &panic_block); - - return 0; -} -early_initcall(spawn_softlockup_task); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 999bc3fccf47..04bcd8abe09c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -108,7 +108,7 @@ extern int blk_iopoll_enabled; #endif /* Constants used for minimum and maximum */ -#ifdef CONFIG_DETECT_SOFTLOCKUP +#ifdef CONFIG_LOCKUP_DETECTOR static int sixty = 60; static int neg_one = -1; #endif @@ -703,6 +703,15 @@ static struct ctl_table kern_table[] = { .extra1 = &neg_one, .extra2 = &sixty, }, + { + .procname = "softlockup_panic", + .data = &softlockup_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, #endif #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_LOCKUP_DETECTOR) { @@ -807,17 +816,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_DETECT_SOFTLOCKUP - { - .procname = "softlockup_panic", - .data = &softlockup_panic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, - }, -#endif #ifdef CONFIG_DETECT_HUNG_TASK { .procname = "hung_task_panic", -- cgit v1.2.3-70-g09d2 From f69bcf60c3f17aa367e16eef7bc6ab001ea6d58a Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 7 May 2010 17:11:47 -0400 Subject: lockup_detector: Remove nmi_watchdog.c file This file migrated to kernel/watchdog.c and then combined with kernel/softlockup.c. As a result kernel/nmi_watchdog.c is no longer needed. Just remove it. Signed-off-by: Don Zickus Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Cyrill Gorcunov Cc: Eric Paris Cc: Randy Dunlap LKML-Reference: <1273266711-18706-5-git-send-email-dzickus@redhat.com> Signed-off-by: Frederic Weisbecker --- kernel/nmi_watchdog.c | 259 -------------------------------------------------- 1 file changed, 259 deletions(-) delete mode 100644 kernel/nmi_watchdog.c (limited to 'kernel') diff --git a/kernel/nmi_watchdog.c b/kernel/nmi_watchdog.c deleted file mode 100644 index a79d211c30df..000000000000 --- a/kernel/nmi_watchdog.c +++ /dev/null @@ -1,259 +0,0 @@ -/* - * Detect Hard Lockups using the NMI - * - * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. - * - * this code detects hard lockups: incidents in where on a CPU - * the kernel does not respond to anything except NMI. - * - * Note: Most of this code is borrowed heavily from softlockup.c, - * so thanks to Ingo for the initial implementation. - * Some chunks also taken from arch/x86/kernel/apic/nmi.c, thanks - * to those contributors as well. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -static DEFINE_PER_CPU(struct perf_event *, nmi_watchdog_ev); -static DEFINE_PER_CPU(int, nmi_watchdog_touch); -static DEFINE_PER_CPU(long, alert_counter); - -static int panic_on_timeout; - -void touch_nmi_watchdog(void) -{ - __raw_get_cpu_var(nmi_watchdog_touch) = 1; - touch_softlockup_watchdog(); -} -EXPORT_SYMBOL(touch_nmi_watchdog); - -void touch_all_nmi_watchdog(void) -{ - int cpu; - - for_each_online_cpu(cpu) - per_cpu(nmi_watchdog_touch, cpu) = 1; - touch_softlockup_watchdog(); -} - -static int __init setup_nmi_watchdog(char *str) -{ - if (!strncmp(str, "panic", 5)) { - panic_on_timeout = 1; - str = strchr(str, ','); - if (!str) - return 1; - ++str; - } - return 1; -} -__setup("nmi_watchdog=", setup_nmi_watchdog); - -struct perf_event_attr wd_hw_attr = { - .type = PERF_TYPE_HARDWARE, - .config = PERF_COUNT_HW_CPU_CYCLES, - .size = sizeof(struct perf_event_attr), - .pinned = 1, - .disabled = 1, -}; - -struct perf_event_attr wd_sw_attr = { - .type = PERF_TYPE_SOFTWARE, - .config = PERF_COUNT_SW_CPU_CLOCK, - .size = sizeof(struct perf_event_attr), - .pinned = 1, - .disabled = 1, -}; - -void wd_overflow(struct perf_event *event, int nmi, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - int cpu = smp_processor_id(); - int touched = 0; - - if (__get_cpu_var(nmi_watchdog_touch)) { - per_cpu(nmi_watchdog_touch, cpu) = 0; - touched = 1; - } - - /* check to see if the cpu is doing anything */ - if (!touched && hw_nmi_is_cpu_stuck(regs)) { - /* - * Ayiee, looks like this CPU is stuck ... - * wait a few IRQs (5 seconds) before doing the oops ... - */ - per_cpu(alert_counter, cpu) += 1; - if (per_cpu(alert_counter, cpu) == 5) { - if (panic_on_timeout) - panic("NMI Watchdog detected LOCKUP on cpu %d", cpu); - else - WARN(1, "NMI Watchdog detected LOCKUP on cpu %d", cpu); - } - } else { - per_cpu(alert_counter, cpu) = 0; - } - - return; -} - -static int enable_nmi_watchdog(int cpu) -{ - struct perf_event *event; - struct perf_event_attr *wd_attr; - - event = per_cpu(nmi_watchdog_ev, cpu); - if (event && event->state > PERF_EVENT_STATE_OFF) - return 0; - - if (event == NULL) { - /* Try to register using hardware perf events first */ - wd_attr = &wd_hw_attr; - wd_attr->sample_period = hw_nmi_get_sample_period(); - event = perf_event_create_kernel_counter(wd_attr, cpu, -1, wd_overflow); - if (IS_ERR(event)) { - /* hardware doesn't exist or not supported, fallback to software events */ - printk(KERN_INFO "nmi_watchdog: hardware not available, trying software events\n"); - wd_attr = &wd_sw_attr; - wd_attr->sample_period = NSEC_PER_SEC; - event = perf_event_create_kernel_counter(wd_attr, cpu, -1, wd_overflow); - if (IS_ERR(event)) { - printk(KERN_ERR "nmi watchdog failed to create perf event on %i: %p\n", cpu, event); - return -1; - } - } - per_cpu(nmi_watchdog_ev, cpu) = event; - } - perf_event_enable(per_cpu(nmi_watchdog_ev, cpu)); - return 0; -} - -static void disable_nmi_watchdog(int cpu) -{ - struct perf_event *event; - - event = per_cpu(nmi_watchdog_ev, cpu); - if (event) { - perf_event_disable(per_cpu(nmi_watchdog_ev, cpu)); - per_cpu(nmi_watchdog_ev, cpu) = NULL; - perf_event_release_kernel(event); - } -} - -#ifdef CONFIG_SYSCTL -/* - * proc handler for /proc/sys/kernel/nmi_watchdog - */ -int nmi_watchdog_enabled; - -int proc_nmi_enabled(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) -{ - int cpu; - - if (!write) { - struct perf_event *event; - for_each_online_cpu(cpu) { - event = per_cpu(nmi_watchdog_ev, cpu); - if (event && event->state > PERF_EVENT_STATE_OFF) { - nmi_watchdog_enabled = 1; - break; - } - } - proc_dointvec(table, write, buffer, length, ppos); - return 0; - } - - touch_all_nmi_watchdog(); - proc_dointvec(table, write, buffer, length, ppos); - if (nmi_watchdog_enabled) { - for_each_online_cpu(cpu) - if (enable_nmi_watchdog(cpu)) { - printk(KERN_ERR "NMI watchdog failed configuration, " - " can not be enabled\n"); - } - } else { - for_each_online_cpu(cpu) - disable_nmi_watchdog(cpu); - } - return 0; -} - -#endif /* CONFIG_SYSCTL */ - -/* - * Create/destroy watchdog threads as CPUs come and go: - */ -static int __cpuinit -cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - int hotcpu = (unsigned long)hcpu; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - per_cpu(nmi_watchdog_touch, hotcpu) = 0; - break; - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - if (enable_nmi_watchdog(hotcpu)) - return NOTIFY_BAD; - break; -#ifdef CONFIG_HOTPLUG_CPU - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - disable_nmi_watchdog(hotcpu); - case CPU_DEAD: - case CPU_DEAD_FROZEN: - break; -#endif /* CONFIG_HOTPLUG_CPU */ - } - return NOTIFY_OK; -} - -static struct notifier_block __cpuinitdata cpu_nfb = { - .notifier_call = cpu_callback -}; - -static int __initdata nonmi_watchdog; - -static int __init nonmi_watchdog_setup(char *str) -{ - nonmi_watchdog = 1; - return 1; -} -__setup("nonmi_watchdog", nonmi_watchdog_setup); - -static int __init spawn_nmi_watchdog_task(void) -{ - void *cpu = (void *)(long)smp_processor_id(); - int err; - - if (nonmi_watchdog) - return 0; - - printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); - - err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); - if (err == NOTIFY_BAD) { - BUG(); - return 1; - } - cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); - register_cpu_notifier(&cpu_nfb); - - return 0; -} -early_initcall(spawn_nmi_watchdog_task); -- cgit v1.2.3-70-g09d2 From d7c547335fa6b0090fa09c46ea0e965ac273a27e Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 7 May 2010 17:11:51 -0400 Subject: lockup_detector: Separate touch_nmi_watchdog code path from touch_watchdog When I combined the nmi_watchdog (hardlockup) and softlockup code, I also combined the paths the touch_watchdog and touch_nmi_watchdog took. This may not be the best idea as pointed out by Frederic W., that the touch_watchdog case probably should not reset the hardlockup count. Therefore the patch below falls back to the previous idea of keeping the touch_nmi_watchdog a superset of the touch_watchdog case. Signed-off-by: Don Zickus Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Cyrill Gorcunov Cc: Eric Paris Cc: Randy Dunlap LKML-Reference: <1273266711-18706-9-git-send-email-dzickus@redhat.com> Signed-off-by: Frederic Weisbecker --- kernel/watchdog.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index f1541b7e3244..57b8e2c25eda 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -31,6 +31,7 @@ int watchdog_enabled; int __read_mostly softlockup_thresh = 60; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); +static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); static DEFINE_PER_CPU(bool, softlockup_touch_sync); @@ -139,6 +140,7 @@ void touch_all_softlockup_watchdogs(void) void touch_nmi_watchdog(void) { + __get_cpu_var(watchdog_nmi_touch) = true; touch_softlockup_watchdog(); } EXPORT_SYMBOL(touch_nmi_watchdog); @@ -201,10 +203,9 @@ void watchdog_overflow_callback(struct perf_event *event, int nmi, struct pt_regs *regs) { int this_cpu = smp_processor_id(); - unsigned long touch_ts = per_cpu(watchdog_touch_ts, this_cpu); - if (touch_ts == 0) { - __touch_watchdog(); + if (__get_cpu_var(watchdog_nmi_touch) == true) { + __get_cpu_var(watchdog_nmi_touch) = false; return; } -- cgit v1.2.3-70-g09d2 From 0167c781907fcdc3e1f144ef5ce31d402c91eb94 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 13 May 2010 08:53:33 +0200 Subject: watchdog: Export touch_softlockup_watchdog There are modules that rely on it: ERROR: "touch_softlockup_watchdog" [drivers/video/nvidia/nvidiafb.ko] undefined! Cc: Frederic Weisbecker Cc: Don Zickus Cc: Peter Zijlstra Cc: Cyrill Gorcunov LKML-Reference: <1273713674-8434-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 57b8e2c25eda..be5e74e62be6 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -124,6 +124,7 @@ void touch_softlockup_watchdog(void) { __get_cpu_var(watchdog_touch_ts) = 0; } +EXPORT_SYMBOL(touch_softlockup_watchdog); void touch_all_softlockup_watchdogs(void) { -- cgit v1.2.3-70-g09d2 From 23637d477c1f53acbb176a02c241d60a25888fae Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 15 May 2010 23:15:20 +0200 Subject: lockup_detector: Introduce CONFIG_HARDLOCKUP_DETECTOR This new config is deemed to simplify even more the lockup detector dependencies and can make it easier to bring a smooth sorting between archs that support the new generic lockup detector and those that still have their own, especially for those that are in the middle of this migration. Instead of checking whether we have CONFIG_LOCKUP_DETECTOR + CONFIG_PERF_EVENTS_NMI each time an arch wants to know if it needs to build its own lockup detector, take a shortcut with this new config. It is enabled only if the hardlockup detection part of the whole lockup detector is on. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Don Zickus Cc: Cyrill Gorcunov --- arch/Kconfig | 4 ++++ init/Kconfig | 7 ------- kernel/watchdog.c | 14 +++++++------- lib/Kconfig.debug | 3 +++ 4 files changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index 89b0efb50948..35084f280087 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -147,5 +147,9 @@ config HAVE_USER_RETURN_NOTIFIER config HAVE_PERF_EVENTS_NMI bool + help + System hardware can generate an NMI using the perf event + subsystem. Also has support for calculating CPU cycle events + to determine how many clock cycles in a given period. source "kernel/gcov/Kconfig" diff --git a/init/Kconfig b/init/Kconfig index ab733c32292c..eb77e8ccde1c 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -942,13 +942,6 @@ config PERF_USE_VMALLOC help See tools/perf/design.txt for details -config PERF_EVENTS_NMI - def_bool PERF_EVENTS && HAVE_PERF_EVENTS_NMI - help - System hardware can generate an NMI using the perf event - subsystem. Also has support for calculating CPU cycle events - to determine how many clock cycles in a given period. - menu "Kernel Performance Events And Counters" config PERF_EVENTS diff --git a/kernel/watchdog.c b/kernel/watchdog.c index be5e74e62be6..83fb63155cbc 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -37,7 +37,7 @@ static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); static DEFINE_PER_CPU(bool, softlockup_touch_sync); static DEFINE_PER_CPU(bool, hard_watchdog_warn); static DEFINE_PER_CPU(bool, soft_watchdog_warn); -#ifdef CONFIG_PERF_EVENTS_NMI +#ifdef CONFIG_HARDLOCKUP_DETECTOR static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); @@ -51,7 +51,7 @@ static int __initdata no_watchdog; /* * Should we panic when a soft-lockup or hard-lockup occurs: */ -#ifdef CONFIG_PERF_EVENTS_NMI +#ifdef CONFIG_HARDLOCKUP_DETECTOR static int hardlockup_panic; static int __init hardlockup_panic_setup(char *str) @@ -152,7 +152,7 @@ void touch_softlockup_watchdog_sync(void) __raw_get_cpu_var(watchdog_touch_ts) = 0; } -#ifdef CONFIG_PERF_EVENTS_NMI +#ifdef CONFIG_HARDLOCKUP_DETECTOR /* watchdog detector functions */ static int is_hardlockup(int cpu) { @@ -189,7 +189,7 @@ static struct notifier_block panic_block = { .notifier_call = watchdog_panic, }; -#ifdef CONFIG_PERF_EVENTS_NMI +#ifdef CONFIG_HARDLOCKUP_DETECTOR static struct perf_event_attr wd_hw_attr = { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES, @@ -239,7 +239,7 @@ static void watchdog_interrupt_count(void) } #else static inline void watchdog_interrupt_count(void) { return; } -#endif /* CONFIG_PERF_EVENTS_NMI */ +#endif /* CONFIG_HARDLOCKUP_DETECTOR */ /* watchdog kicker functions */ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) @@ -342,7 +342,7 @@ static int watchdog(void *__bind_cpu) } -#ifdef CONFIG_PERF_EVENTS_NMI +#ifdef CONFIG_HARDLOCKUP_DETECTOR static int watchdog_nmi_enable(int cpu) { struct perf_event_attr *wd_attr; @@ -393,7 +393,7 @@ static void watchdog_nmi_disable(int cpu) #else static int watchdog_nmi_enable(int cpu) { return 0; } static void watchdog_nmi_disable(int cpu) { return; } -#endif /* CONFIG_PERF_EVENTS_NMI */ +#endif /* CONFIG_HARDLOCKUP_DETECTOR */ /* prepare/enable/disable routines */ static int watchdog_prepare_cpu(int cpu) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 3a18b0b856ce..e65e47d5c5e6 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -174,6 +174,9 @@ config LOCKUP_DETECTOR generate interrupts and kick the watchdog task every 10-12 seconds. An NMI is generated every 60 seconds or so to check for hardlockups. +config HARDLOCKUP_DETECTOR + def_bool LOCKUP_DETECTOR && PERF_EVENTS && HAVE_PERF_EVENTS_NMI + config BOOTPARAM_SOFTLOCKUP_PANIC bool "Panic (Reboot) On Soft Lockups" depends on LOCKUP_DETECTOR -- cgit v1.2.3-70-g09d2 From cafcd80d216bc2136b8edbb794327e495792c666 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 14 May 2010 11:11:21 -0400 Subject: lockup_detector: Cross arch compile fixes Combining the softlockup and hardlockup code causes watchdog.c to build even without the hardlockup detection support. So if an arch, that has the previous and the new nmi watchdog implementations cohabiting, wants to know if the generic one is in use, CONFIG_LOCKUP_DETECTOR is not a reliable check. We need to use CONFIG_HARDLOCKUP_DETECTOR instead. Fixes: kernel/built-in.o: In function `touch_nmi_watchdog': (.text+0x449bc): multiple definition of `touch_nmi_watchdog' arch/sparc/kernel/built-in.o:(.text+0x11b28): first defined here Signed-off-by: Don Zickus Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Don Zickus Cc: Cyrill Gorcunov LKML-Reference: <20100514151121.GR15159@redhat.com> [ use CONFIG_HARDLOCKUP_DETECTOR instead of CONFIG_PERF_EVENTS_NMI] Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/apic/Makefile | 4 ++-- include/linux/nmi.h | 2 +- kernel/watchdog.c | 7 +++++-- 3 files changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 52f32e0ea194..910f20b457c4 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -3,10 +3,10 @@ # obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o -ifneq ($(CONFIG_LOCKUP_DETECTOR),y) +ifneq ($(CONFIG_HARDLOCKUP_DETECTOR),y) obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o endif -obj-$(CONFIG_LOCKUP_DETECTOR) += hw_nmi.o +obj-$(CONFIG_HARDLOCKUP_DETECTOR) += hw_nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_SMP) += ipi.o diff --git a/include/linux/nmi.h b/include/linux/nmi.h index abd48aacaf79..06aab5eee134 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -20,7 +20,7 @@ extern void touch_nmi_watchdog(void); extern void acpi_nmi_disable(void); extern void acpi_nmi_enable(void); #else -#ifndef CONFIG_LOCKUP_DETECTOR +#ifndef CONFIG_HARDLOCKUP_DETECTOR static inline void touch_nmi_watchdog(void) { touch_softlockup_watchdog(); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 83fb63155cbc..e53622c1465e 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -31,13 +31,13 @@ int watchdog_enabled; int __read_mostly softlockup_thresh = 60; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); -static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); static DEFINE_PER_CPU(bool, softlockup_touch_sync); -static DEFINE_PER_CPU(bool, hard_watchdog_warn); static DEFINE_PER_CPU(bool, soft_watchdog_warn); #ifdef CONFIG_HARDLOCKUP_DETECTOR +static DEFINE_PER_CPU(bool, hard_watchdog_warn); +static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); @@ -139,6 +139,7 @@ void touch_all_softlockup_watchdogs(void) per_cpu(watchdog_touch_ts, cpu) = 0; } +#ifdef CONFIG_HARDLOCKUP_DETECTOR void touch_nmi_watchdog(void) { __get_cpu_var(watchdog_nmi_touch) = true; @@ -146,6 +147,8 @@ void touch_nmi_watchdog(void) } EXPORT_SYMBOL(touch_nmi_watchdog); +#endif + void touch_softlockup_watchdog_sync(void) { __raw_get_cpu_var(softlockup_touch_sync) = true; -- cgit v1.2.3-70-g09d2 From 26e09c6eee14f4827b55137ba0eedc4e77cd50ab Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Mon, 17 May 2010 18:06:04 -0400 Subject: lockup_detector: Convert per_cpu to __get_cpu_var for readability Just a bunch of conversions as suggested by Frederic W. __get_cpu_var() provides preemption disabled checks. Plus it gives more readability as it makes it obvious we are dealing locally now with these vars. Signed-off-by: Don Zickus Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Don Zickus Cc: Cyrill Gorcunov LKML-Reference: <1274133966-18415-2-git-send-email-dzickus@redhat.com> Signed-off-by: Frederic Weisbecker --- kernel/watchdog.c | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index e53622c1465e..91b0b26adc67 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -115,7 +115,7 @@ static unsigned long get_sample_period(void) /* Commands for resetting the watchdog */ static void __touch_watchdog(void) { - int this_cpu = raw_smp_processor_id(); + int this_cpu = smp_processor_id(); __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu); } @@ -157,21 +157,21 @@ void touch_softlockup_watchdog_sync(void) #ifdef CONFIG_HARDLOCKUP_DETECTOR /* watchdog detector functions */ -static int is_hardlockup(int cpu) +static int is_hardlockup(void) { - unsigned long hrint = per_cpu(hrtimer_interrupts, cpu); + unsigned long hrint = __get_cpu_var(hrtimer_interrupts); - if (per_cpu(hrtimer_interrupts_saved, cpu) == hrint) + if (__get_cpu_var(hrtimer_interrupts_saved) == hrint) return 1; - per_cpu(hrtimer_interrupts_saved, cpu) = hrint; + __get_cpu_var(hrtimer_interrupts_saved) = hrint; return 0; } #endif -static int is_softlockup(unsigned long touch_ts, int cpu) +static int is_softlockup(unsigned long touch_ts) { - unsigned long now = get_timestamp(cpu); + unsigned long now = get_timestamp(smp_processor_id()); /* Warn about unreasonable delays: */ if (time_after(now, touch_ts + softlockup_thresh)) @@ -206,8 +206,6 @@ void watchdog_overflow_callback(struct perf_event *event, int nmi, struct perf_sample_data *data, struct pt_regs *regs) { - int this_cpu = smp_processor_id(); - if (__get_cpu_var(watchdog_nmi_touch) == true) { __get_cpu_var(watchdog_nmi_touch) = false; return; @@ -219,7 +217,9 @@ void watchdog_overflow_callback(struct perf_event *event, int nmi, * fired multiple times before we overflow'd. If it hasn't * then this is a good indication the cpu is stuck */ - if (is_hardlockup(this_cpu)) { + if (is_hardlockup()) { + int this_cpu = smp_processor_id(); + /* only print hardlockups once */ if (__get_cpu_var(hard_watchdog_warn) == true) return; @@ -247,7 +247,6 @@ static inline void watchdog_interrupt_count(void) { return; } /* watchdog kicker functions */ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) { - int this_cpu = smp_processor_id(); unsigned long touch_ts = __get_cpu_var(watchdog_touch_ts); struct pt_regs *regs = get_irq_regs(); int duration; @@ -262,12 +261,12 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); if (touch_ts == 0) { - if (unlikely(per_cpu(softlockup_touch_sync, this_cpu))) { + if (unlikely(__get_cpu_var(softlockup_touch_sync))) { /* * If the time stamp was touched atomically * make sure the scheduler tick is up to date. */ - per_cpu(softlockup_touch_sync, this_cpu) = false; + __get_cpu_var(softlockup_touch_sync) = false; sched_clock_tick(); } __touch_watchdog(); @@ -280,14 +279,14 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) * indicate it is getting cpu time. If it hasn't then * this is a good indication some task is hogging the cpu */ - duration = is_softlockup(touch_ts, this_cpu); + duration = is_softlockup(touch_ts); if (unlikely(duration)) { /* only warn once */ if (__get_cpu_var(soft_watchdog_warn) == true) return HRTIMER_RESTART; printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", - this_cpu, duration, + smp_processor_id(), duration, current->comm, task_pid_nr(current)); print_modules(); print_irqtrace_events(current); @@ -309,10 +308,10 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) /* * The watchdog thread - touches the timestamp. */ -static int watchdog(void *__bind_cpu) +static int watchdog(void *unused) { struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; - struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, (unsigned long)__bind_cpu); + struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); sched_setscheduler(current, SCHED_FIFO, ¶m); @@ -328,7 +327,7 @@ static int watchdog(void *__bind_cpu) /* * Run briefly once per second to reset the softlockup timestamp. * If this gets delayed for more than 60 seconds then the - * debug-printout triggers in softlockup_tick(). + * debug-printout triggers in watchdog_timer_fn(). */ while (!kthread_should_stop()) { __touch_watchdog(); -- cgit v1.2.3-70-g09d2 From d1f74e20b5b064a130cd0743a256c2d3cfe84010 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 2 Jun 2010 21:52:29 -0400 Subject: tracing/sched: Make preempt_schedule() notrace The function tracer code uses ftrace_preempt_disable() to disable preemption instead of normal preempt_disable(). But there's a slight race condition that may cause it to lose a preemption check. This was made to keep the function tracer from recursing on itself by disabling preemption then having the enable call the function tracer again, causing infinite recursion. The bug was assumed to happen if the call was just in schedule, but this is incorrect. The bug is caused by preempt_schedule() which is called by preempt_enable(). The calling of preempt_enable() when NEED_RESCHED was set would call preempt_schedule() which would call the function tracer again. By making the preempt_schedule() and add_preempt_count() notrace then this will prevent the inifinite recursion. This is because the add_preempt_count() would stop the preempt_enable() in the function tracer from calling preempt_schedule() again. The sub_preempt_count() is also made notrace just to keep it symmetric. Signed-off-by: Steven Rostedt --- kernel/sched.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 15b93f617fd7..cd6787e57174 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3730,7 +3730,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) * off of preempt_enable. Kernel preemptions off return from interrupt * occur there and call schedule directly. */ -asmlinkage void __sched preempt_schedule(void) +asmlinkage void __sched notrace preempt_schedule(void) { struct thread_info *ti = current_thread_info(); @@ -3742,9 +3742,9 @@ asmlinkage void __sched preempt_schedule(void) return; do { - add_preempt_count(PREEMPT_ACTIVE); + add_preempt_count_notrace(PREEMPT_ACTIVE); schedule(); - sub_preempt_count(PREEMPT_ACTIVE); + sub_preempt_count_notrace(PREEMPT_ACTIVE); /* * Check again in case we missed a preemption opportunity -- cgit v1.2.3-70-g09d2 From 5168ae50a66e3ff7184c2b16d661bd6d70367e50 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 3 Jun 2010 09:36:50 -0400 Subject: tracing: Remove ftrace_preempt_disable/enable The ftrace_preempt_disable/enable functions were to address a recursive race caused by the function tracer. The function tracer traces all functions which makes it easily susceptible to recursion. One area was preempt_enable(). This would call the scheduler and the schedulre would call the function tracer and loop. (So was it thought). The ftrace_preempt_disable/enable was made to protect against recursion inside the scheduler by storing the NEED_RESCHED flag. If it was set before the ftrace_preempt_disable() it would not call schedule on ftrace_preempt_enable(), thinking that if it was set before then it would have already scheduled unless it was already in the scheduler. This worked fine except in the case of SMP, where another task would set the NEED_RESCHED flag for a task on another CPU, and then kick off an IPI to trigger it. This could cause the NEED_RESCHED to be saved at ftrace_preempt_disable() but the IPI to arrive in the the preempt disabled section. The ftrace_preempt_enable() would not call the scheduler because the flag was already set before entring the section. This bug would cause a missed preemption check and cause lower latencies. Investigating further, I found that the recusion caused by the function tracer was not due to schedule(), but due to preempt_schedule(). Now that preempt_schedule is completely annotated with notrace, the recusion no longer is an issue. Reported-by: Thomas Gleixner Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 5 ++-- kernel/trace/ring_buffer.c | 38 +++++++------------------------ kernel/trace/trace.c | 5 ++-- kernel/trace/trace.h | 48 --------------------------------------- kernel/trace/trace_clock.c | 5 ++-- kernel/trace/trace_events.c | 5 ++-- kernel/trace/trace_functions.c | 6 ++--- kernel/trace/trace_sched_wakeup.c | 5 ++-- kernel/trace/trace_stack.c | 6 ++--- 9 files changed, 24 insertions(+), 99 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6d2cb14f9449..0d88ce9b9fb8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1883,7 +1883,6 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip) struct hlist_head *hhd; struct hlist_node *n; unsigned long key; - int resched; key = hash_long(ip, FTRACE_HASH_BITS); @@ -1897,12 +1896,12 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip) * period. This syncs the hash iteration and freeing of items * on the hash. rcu_read_lock is too dangerous here. */ - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); hlist_for_each_entry_rcu(entry, n, hhd, node) { if (entry->ip == ip) entry->ops->func(ip, parent_ip, &entry->data); } - ftrace_preempt_enable(resched); + preempt_enable_notrace(); } static struct ftrace_ops trace_probe_ops __read_mostly = diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7f6059c5aa94..c3d3cd9c2a53 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2234,8 +2234,6 @@ static void trace_recursive_unlock(void) #endif -static DEFINE_PER_CPU(int, rb_need_resched); - /** * ring_buffer_lock_reserve - reserve a part of the buffer * @buffer: the ring buffer to reserve from @@ -2256,13 +2254,13 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; - int cpu, resched; + int cpu; if (ring_buffer_flags != RB_BUFFERS_ON) return NULL; /* If we are tracing schedule, we don't want to recurse */ - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); if (atomic_read(&buffer->record_disabled)) goto out_nocheck; @@ -2287,21 +2285,13 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) if (!event) goto out; - /* - * Need to store resched state on this cpu. - * Only the first needs to. - */ - - if (preempt_count() == 1) - per_cpu(rb_need_resched, cpu) = resched; - return event; out: trace_recursive_unlock(); out_nocheck: - ftrace_preempt_enable(resched); + preempt_enable_notrace(); return NULL; } EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); @@ -2347,13 +2337,7 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, trace_recursive_unlock(); - /* - * Only the last preempt count needs to restore preemption. - */ - if (preempt_count() == 1) - ftrace_preempt_enable(per_cpu(rb_need_resched, cpu)); - else - preempt_enable_no_resched_notrace(); + preempt_enable_notrace(); return 0; } @@ -2461,13 +2445,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, trace_recursive_unlock(); - /* - * Only the last preempt count needs to restore preemption. - */ - if (preempt_count() == 1) - ftrace_preempt_enable(per_cpu(rb_need_resched, cpu)); - else - preempt_enable_no_resched_notrace(); + preempt_enable_notrace(); } EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); @@ -2493,12 +2471,12 @@ int ring_buffer_write(struct ring_buffer *buffer, struct ring_buffer_event *event; void *body; int ret = -EBUSY; - int cpu, resched; + int cpu; if (ring_buffer_flags != RB_BUFFERS_ON) return -EBUSY; - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); if (atomic_read(&buffer->record_disabled)) goto out; @@ -2528,7 +2506,7 @@ int ring_buffer_write(struct ring_buffer *buffer, ret = 0; out: - ftrace_preempt_enable(resched); + preempt_enable_notrace(); return ret; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 55e48511d7c8..35727140f4fb 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1404,7 +1404,6 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) struct bprint_entry *entry; unsigned long flags; int disable; - int resched; int cpu, len = 0, size, pc; if (unlikely(tracing_selftest_running || tracing_disabled)) @@ -1414,7 +1413,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) pause_graph_tracing(); pc = preempt_count(); - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); cpu = raw_smp_processor_id(); data = tr->data[cpu]; @@ -1452,7 +1451,7 @@ out_unlock: out: atomic_dec_return(&data->disabled); - ftrace_preempt_enable(resched); + preempt_enable_notrace(); unpause_graph_tracing(); return len; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2cd96399463f..6c45e55097ce 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -628,54 +628,6 @@ enum trace_iterator_flags { extern struct tracer nop_trace; -/** - * ftrace_preempt_disable - disable preemption scheduler safe - * - * When tracing can happen inside the scheduler, there exists - * cases that the tracing might happen before the need_resched - * flag is checked. If this happens and the tracer calls - * preempt_enable (after a disable), a schedule might take place - * causing an infinite recursion. - * - * To prevent this, we read the need_resched flag before - * disabling preemption. When we want to enable preemption we - * check the flag, if it is set, then we call preempt_enable_no_resched. - * Otherwise, we call preempt_enable. - * - * The rational for doing the above is that if need_resched is set - * and we have yet to reschedule, we are either in an atomic location - * (where we do not need to check for scheduling) or we are inside - * the scheduler and do not want to resched. - */ -static inline int ftrace_preempt_disable(void) -{ - int resched; - - resched = need_resched(); - preempt_disable_notrace(); - - return resched; -} - -/** - * ftrace_preempt_enable - enable preemption scheduler safe - * @resched: the return value from ftrace_preempt_disable - * - * This is a scheduler safe way to enable preemption and not miss - * any preemption checks. The disabled saved the state of preemption. - * If resched is set, then we are either inside an atomic or - * are inside the scheduler (we would have already scheduled - * otherwise). In this case, we do not want to call normal - * preempt_enable, but preempt_enable_no_resched instead. - */ -static inline void ftrace_preempt_enable(int resched) -{ - if (resched) - preempt_enable_no_resched_notrace(); - else - preempt_enable_notrace(); -} - #ifdef CONFIG_BRANCH_TRACER extern int enable_branch_tracing(struct trace_array *tr); extern void disable_branch_tracing(void); diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 9d589d8dcd1a..52fda6c04ac3 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -32,16 +32,15 @@ u64 notrace trace_clock_local(void) { u64 clock; - int resched; /* * sched_clock() is an architecture implemented, fast, scalable, * lockless clock. It is not guaranteed to be coherent across * CPUs, nor across CPU idle events. */ - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); clock = sched_clock(); - ftrace_preempt_enable(resched); + preempt_enable_notrace(); return clock; } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 53cffc0b0801..a594f9a7ee3d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1524,12 +1524,11 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) struct ftrace_entry *entry; unsigned long flags; long disabled; - int resched; int cpu; int pc; pc = preempt_count(); - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); cpu = raw_smp_processor_id(); disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); @@ -1551,7 +1550,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) out: atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); - ftrace_preempt_enable(resched); + preempt_enable_notrace(); } static struct ftrace_ops trace_ops __initdata = diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index b3f3776b0cd6..16aee4d44e8f 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -54,14 +54,14 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) struct trace_array_cpu *data; unsigned long flags; long disabled; - int cpu, resched; + int cpu; int pc; if (unlikely(!ftrace_function_enabled)) return; pc = preempt_count(); - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); local_save_flags(flags); cpu = raw_smp_processor_id(); data = tr->data[cpu]; @@ -71,7 +71,7 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) trace_function(tr, ip, parent_ip, flags, pc); atomic_dec(&data->disabled); - ftrace_preempt_enable(resched); + preempt_enable_notrace(); } static void diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 0e73bc2ef8c5..c9fd5bd02036 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -46,7 +46,6 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) struct trace_array_cpu *data; unsigned long flags; long disabled; - int resched; int cpu; int pc; @@ -54,7 +53,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) return; pc = preempt_count(); - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); cpu = raw_smp_processor_id(); if (cpu != wakeup_current_cpu) @@ -74,7 +73,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) out: atomic_dec(&data->disabled); out_enable: - ftrace_preempt_enable(resched); + preempt_enable_notrace(); } static struct ftrace_ops trace_ops __read_mostly = diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index f4bc9b27de5f..056468eae7cf 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -110,12 +110,12 @@ static inline void check_stack(void) static void stack_trace_call(unsigned long ip, unsigned long parent_ip) { - int cpu, resched; + int cpu; if (unlikely(!ftrace_enabled || stack_trace_disabled)) return; - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); cpu = raw_smp_processor_id(); /* no atomic needed, we only modify this variable by this cpu */ @@ -127,7 +127,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip) out: per_cpu(trace_active, cpu)--; /* prevent recursion in schedule */ - ftrace_preempt_enable(resched); + preempt_enable_notrace(); } static struct ftrace_ops trace_ops __read_mostly = -- cgit v1.2.3-70-g09d2 From c9cf4dbb4d9ca715d8fedf13301a53296429abc6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 19 May 2010 21:35:17 +0200 Subject: x86: Unify dumpstack.h and stacktrace.h arch/x86/include/asm/stacktrace.h and arch/x86/kernel/dumpstack.h declare headers of objects that deal with the same topic. Actually most of the files that include stacktrace.h also include dumpstack.h Although dumpstack.h seems more reserved for internals of stack traces, those are quite often needed to define specialized stack trace operations. And perf event arch headers are going to need access to such low level operations anyway. So don't continue to bother with dumpstack.h as it's not anymore about isolated deep internals. v2: fix struct stack_frame definition conflict in sysprof Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Thomas Gleixner Cc: Soeren Sandmann --- arch/x86/include/asm/stacktrace.h | 52 ++++++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/perf_event.c | 2 -- arch/x86/kernel/dumpstack.c | 1 - arch/x86/kernel/dumpstack.h | 56 --------------------------------------- arch/x86/kernel/dumpstack_32.c | 2 -- arch/x86/kernel/dumpstack_64.c | 1 - arch/x86/kernel/stacktrace.c | 7 ++--- kernel/trace/trace_sysprof.c | 7 ++--- 8 files changed, 60 insertions(+), 68 deletions(-) delete mode 100644 arch/x86/kernel/dumpstack.h (limited to 'kernel') diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 4dab78edbad9..a957463d3c7a 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -1,6 +1,13 @@ +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + */ + #ifndef _ASM_X86_STACKTRACE_H #define _ASM_X86_STACKTRACE_H +#include + extern int kstack_depth_to_print; struct thread_info; @@ -42,4 +49,49 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data); +#ifdef CONFIG_X86_32 +#define STACKSLOTS_PER_LINE 8 +#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :) +#else +#define STACKSLOTS_PER_LINE 4 +#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) +#endif + +extern void +show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp, char *log_lvl); + +extern void +show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *sp, unsigned long bp, char *log_lvl); + +extern unsigned int code_bytes; + +/* The form of the top of the frame on the stack */ +struct stack_frame { + struct stack_frame *next_frame; + unsigned long return_address; +}; + +struct stack_frame_ia32 { + u32 next_frame; + u32 return_address; +}; + +static inline unsigned long rewind_frame_pointer(int n) +{ + struct stack_frame *frame; + + get_bp(frame); + +#ifdef CONFIG_FRAME_POINTER + while (n--) { + if (probe_kernel_address(&frame->next_frame, frame)) + break; + } +#endif + + return (unsigned long)frame; +} + #endif /* _ASM_X86_STACKTRACE_H */ diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index c77586061bcb..9632fb61e8f9 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1585,8 +1585,6 @@ static const struct stacktrace_ops backtrace_ops = { .walk_stack = print_context_stack_bp, }; -#include "../dumpstack.h" - static void perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) { diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index c89a386930b7..6e8752c1bd52 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -18,7 +18,6 @@ #include -#include "dumpstack.h" int panic_on_unrecovered_nmi; int panic_on_io_nmi; diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h deleted file mode 100644 index e1a93be4fd44..000000000000 --- a/arch/x86/kernel/dumpstack.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs - */ - -#ifndef DUMPSTACK_H -#define DUMPSTACK_H - -#ifdef CONFIG_X86_32 -#define STACKSLOTS_PER_LINE 8 -#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :) -#else -#define STACKSLOTS_PER_LINE 4 -#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) -#endif - -#include - -extern void -show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, char *log_lvl); - -extern void -show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl); - -extern unsigned int code_bytes; - -/* The form of the top of the frame on the stack */ -struct stack_frame { - struct stack_frame *next_frame; - unsigned long return_address; -}; - -struct stack_frame_ia32 { - u32 next_frame; - u32 return_address; -}; - -static inline unsigned long rewind_frame_pointer(int n) -{ - struct stack_frame *frame; - - get_bp(frame); - -#ifdef CONFIG_FRAME_POINTER - while (n--) { - if (probe_kernel_address(&frame->next_frame, frame)) - break; - } -#endif - - return (unsigned long)frame; -} - -#endif /* DUMPSTACK_H */ diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 11540a189d93..0f6376ffa2d9 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -16,8 +16,6 @@ #include -#include "dumpstack.h" - void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 272c9f1f05f3..57a21f11c791 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -16,7 +16,6 @@ #include -#include "dumpstack.h" #define N_EXCEPTION_STACKS_END \ (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2) diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 922eefbb3f6c..ea54d029fe27 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -96,12 +96,13 @@ EXPORT_SYMBOL_GPL(save_stack_trace_tsk); /* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */ -struct stack_frame { +struct stack_frame_user { const void __user *next_fp; unsigned long ret_addr; }; -static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) +static int +copy_stack_frame(const void __user *fp, struct stack_frame_user *frame) { int ret; @@ -126,7 +127,7 @@ static inline void __save_stack_trace_user(struct stack_trace *trace) trace->entries[trace->nr_entries++] = regs->ip; while (trace->nr_entries < trace->max_entries) { - struct stack_frame frame; + struct stack_frame_user frame; frame.next_fp = NULL; frame.ret_addr = 0; diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index a7974a552ca9..c080956f4d8e 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -33,12 +33,13 @@ static DEFINE_MUTEX(sample_timer_lock); */ static DEFINE_PER_CPU(struct hrtimer, stack_trace_hrtimer); -struct stack_frame { +struct stack_frame_user { const void __user *next_fp; unsigned long return_address; }; -static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) +static int +copy_stack_frame(const void __user *fp, struct stack_frame_user *frame) { int ret; @@ -125,7 +126,7 @@ trace_kernel(struct pt_regs *regs, struct trace_array *tr, static void timer_notify(struct pt_regs *regs, int cpu) { struct trace_array_cpu *data; - struct stack_frame frame; + struct stack_frame_user frame; struct trace_array *tr; const void __user *fp; int is_user; -- cgit v1.2.3-70-g09d2 From b0f82b81fe6bbcf78d478071f33e44554726bc81 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 20 May 2010 07:47:21 +0200 Subject: perf: Drop the skip argument from perf_arch_fetch_regs_caller Drop this argument now that we always want to rewind only to the state of the first caller. It means frame pointers are not necessary anymore to reliably get the source of an event. But this also means we need this helper to be a macro now, as an inline function is not an option since we need to know when to provide a default implentation. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul Mackerras Cc: David Miller Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo --- arch/powerpc/include/asm/perf_event.h | 12 ++++++++++++ arch/powerpc/kernel/misc.S | 26 -------------------------- arch/sparc/include/asm/perf_event.h | 8 ++++++++ arch/sparc/kernel/helpers.S | 6 +++--- arch/x86/include/asm/perf_event.h | 13 +++++++++++++ arch/x86/include/asm/stacktrace.h | 7 ++----- arch/x86/kernel/cpu/perf_event.c | 16 ---------------- include/linux/perf_event.h | 32 +++++++------------------------- include/trace/ftrace.h | 2 +- kernel/perf_event.c | 5 ----- kernel/trace/trace_event_perf.c | 2 -- 11 files changed, 46 insertions(+), 83 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/include/asm/perf_event.h b/arch/powerpc/include/asm/perf_event.h index e6d4ce69b126..5c16b891d501 100644 --- a/arch/powerpc/include/asm/perf_event.h +++ b/arch/powerpc/include/asm/perf_event.h @@ -21,3 +21,15 @@ #ifdef CONFIG_FSL_EMB_PERF_EVENT #include #endif + +#ifdef CONFIG_PERF_EVENTS +#include +#include + +#define perf_arch_fetch_caller_regs(regs, __ip) \ + do { \ + (regs)->nip = __ip; \ + (regs)->gpr[1] = *(unsigned long *)__get_SP(); \ + asm volatile("mfmsr %0" : "=r" ((regs)->msr)); \ + } while (0) +#endif diff --git a/arch/powerpc/kernel/misc.S b/arch/powerpc/kernel/misc.S index 22e507c8a556..2d29752cbe16 100644 --- a/arch/powerpc/kernel/misc.S +++ b/arch/powerpc/kernel/misc.S @@ -127,29 +127,3 @@ _GLOBAL(__setup_cpu_power7) _GLOBAL(__restore_cpu_power7) /* place holder */ blr - -/* - * Get a minimal set of registers for our caller's nth caller. - * r3 = regs pointer, r5 = n. - * - * We only get R1 (stack pointer), NIP (next instruction pointer) - * and LR (link register). These are all we can get in the - * general case without doing complicated stack unwinding, but - * fortunately they are enough to do a stack backtrace, which - * is all we need them for. - */ -_GLOBAL(perf_arch_fetch_caller_regs) - mr r6,r1 - cmpwi r5,0 - mflr r4 - ble 2f - mtctr r5 -1: PPC_LL r6,0(r6) - bdnz 1b - PPC_LL r4,PPC_LR_STKOFF(r6) -2: PPC_LL r7,0(r6) - PPC_LL r7,PPC_LR_STKOFF(r7) - PPC_STL r6,GPR1-STACK_FRAME_OVERHEAD(r3) - PPC_STL r4,_NIP-STACK_FRAME_OVERHEAD(r3) - PPC_STL r7,_LINK-STACK_FRAME_OVERHEAD(r3) - blr diff --git a/arch/sparc/include/asm/perf_event.h b/arch/sparc/include/asm/perf_event.h index 7e2669894ce8..74c4e0cd889c 100644 --- a/arch/sparc/include/asm/perf_event.h +++ b/arch/sparc/include/asm/perf_event.h @@ -6,7 +6,15 @@ extern void set_perf_event_pending(void); #define PERF_EVENT_INDEX_OFFSET 0 #ifdef CONFIG_PERF_EVENTS +#include + extern void init_hw_perf_events(void); + +extern void +__perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip); + +#define perf_arch_fetch_caller_regs(pt_regs, ip) \ + __perf_arch_fetch_caller_regs(pt_regs, ip, 1); #else static inline void init_hw_perf_events(void) { } #endif diff --git a/arch/sparc/kernel/helpers.S b/arch/sparc/kernel/helpers.S index 92090cc9e829..682fee06a16b 100644 --- a/arch/sparc/kernel/helpers.S +++ b/arch/sparc/kernel/helpers.S @@ -47,9 +47,9 @@ stack_trace_flush: .size stack_trace_flush,.-stack_trace_flush #ifdef CONFIG_PERF_EVENTS - .globl perf_arch_fetch_caller_regs - .type perf_arch_fetch_caller_regs,#function -perf_arch_fetch_caller_regs: + .globl __perf_arch_fetch_caller_regs + .type __perf_arch_fetch_caller_regs,#function +__perf_arch_fetch_caller_regs: /* We always read the %pstate into %o5 since we will use * that to construct a fake %tstate to store into the regs. */ diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 254883d0c7e0..02de29830ffe 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -140,6 +140,19 @@ extern unsigned long perf_instruction_pointer(struct pt_regs *regs); extern unsigned long perf_misc_flags(struct pt_regs *regs); #define perf_misc_flags(regs) perf_misc_flags(regs) +#include + +/* + * We abuse bit 3 from flags to pass exact information, see perf_misc_flags + * and the comment with PERF_EFLAGS_EXACT. + */ +#define perf_arch_fetch_caller_regs(regs, __ip) { \ + (regs)->ip = (__ip); \ + (regs)->bp = caller_frame_pointer(); \ + (regs)->cs = __KERNEL_CS; \ + regs->flags = 0; \ +} + #else static inline void init_hw_perf_events(void) { } static inline void perf_events_lapic_init(void) { } diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index a957463d3c7a..2b16a2ad23dc 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -78,17 +78,14 @@ struct stack_frame_ia32 { u32 return_address; }; -static inline unsigned long rewind_frame_pointer(int n) +static inline unsigned long caller_frame_pointer(void) { struct stack_frame *frame; get_bp(frame); #ifdef CONFIG_FRAME_POINTER - while (n--) { - if (probe_kernel_address(&frame->next_frame, frame)) - break; - } + frame = frame->next_frame; #endif return (unsigned long)frame; diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 9632fb61e8f9..2c075fe573d0 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1706,22 +1706,6 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return entry; } -void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) -{ - regs->ip = ip; - /* - * perf_arch_fetch_caller_regs adds another call, we need to increment - * the skip level - */ - regs->bp = rewind_frame_pointer(skip + 1); - regs->cs = __KERNEL_CS; - /* - * We abuse bit 3 to pass exact information, see perf_misc_flags - * and the comment with PERF_EFLAGS_EXACT. - */ - regs->flags = 0; -} - unsigned long perf_instruction_pointer(struct pt_regs *regs) { unsigned long ip; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index fb6c91eac7e3..bea785cef493 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -905,8 +905,10 @@ extern atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; extern void __perf_sw_event(u32, u64, int, struct pt_regs *, u64); -extern void -perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip); +#ifndef perf_arch_fetch_caller_regs +static inline void +perf_arch_fetch_caller_regs(struct regs *regs, unsigned long ip) { } +#endif /* * Take a snapshot of the regs. Skip ip and frame pointer to @@ -916,31 +918,11 @@ perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip); * - bp for callchains * - eflags, for future purposes, just in case */ -static inline void perf_fetch_caller_regs(struct pt_regs *regs, int skip) +static inline void perf_fetch_caller_regs(struct pt_regs *regs) { - unsigned long ip; - memset(regs, 0, sizeof(*regs)); - switch (skip) { - case 1 : - ip = CALLER_ADDR0; - break; - case 2 : - ip = CALLER_ADDR1; - break; - case 3 : - ip = CALLER_ADDR2; - break; - case 4: - ip = CALLER_ADDR3; - break; - /* No need to support further for now */ - default: - ip = 0; - } - - return perf_arch_fetch_caller_regs(regs, ip, skip); + perf_arch_fetch_caller_regs(regs, CALLER_ADDR0); } static inline void @@ -950,7 +932,7 @@ perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) struct pt_regs hot_regs; if (!regs) { - perf_fetch_caller_regs(&hot_regs, 1); + perf_fetch_caller_regs(&hot_regs); regs = &hot_regs; } __perf_sw_event(event_id, nr, nmi, regs, addr); diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 3d685d1f2a03..8ee8b6e6b25e 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -705,7 +705,7 @@ perf_trace_##call(void *__data, proto) \ int __data_size; \ int rctx; \ \ - perf_fetch_caller_regs(&__regs, 1); \ + perf_fetch_caller_regs(&__regs); \ \ __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \ __entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\ diff --git a/kernel/perf_event.c b/kernel/perf_event.c index e099650cd249..9ae4dbcdf469 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2851,11 +2851,6 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return NULL; } -__weak -void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) -{ -} - /* * We assume there is only KVM supporting the callbacks. diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index cb6f365016e4..21db1d3a48d0 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -9,8 +9,6 @@ #include #include "trace.h" -EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs); - static char *perf_trace_buf[4]; /* -- cgit v1.2.3-70-g09d2 From 30dbb20e68e6f7df974b77d2350ebad5eb6f6c9e Mon Sep 17 00:00:00 2001 From: Américo Wang Date: Wed, 26 May 2010 18:57:53 +0800 Subject: tracing: Remove boot tracer The boot tracer is useless. It simply logs the initcalls but in fact these initcalls are also logged through printk while using the initcall_debug kernel parameter. Nobody seem to be using it so far. Then just remove it. Signed-off-by: WANG Cong Cc: Chase Douglas Cc: Steven Rostedt Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Li Zefan LKML-Reference: <20100526105753.GA5677@cr0.nay.redhat.com> [ remove the hooks in main.c, and the headers ] Signed-off-by: Frederic Weisbecker --- include/trace/boot.h | 60 -------------- init/main.c | 27 +++---- kernel/trace/Kconfig | 17 ---- kernel/trace/Makefile | 1 - kernel/trace/trace.c | 3 - kernel/trace/trace.h | 8 -- kernel/trace/trace_boot.c | 185 ------------------------------------------- kernel/trace/trace_entries.h | 27 ------- 8 files changed, 10 insertions(+), 318 deletions(-) delete mode 100644 include/trace/boot.h delete mode 100644 kernel/trace/trace_boot.c (limited to 'kernel') diff --git a/include/trace/boot.h b/include/trace/boot.h deleted file mode 100644 index 088ea089e31d..000000000000 --- a/include/trace/boot.h +++ /dev/null @@ -1,60 +0,0 @@ -#ifndef _LINUX_TRACE_BOOT_H -#define _LINUX_TRACE_BOOT_H - -#include -#include -#include - -/* - * Structure which defines the trace of an initcall - * while it is called. - * You don't have to fill the func field since it is - * only used internally by the tracer. - */ -struct boot_trace_call { - pid_t caller; - char func[KSYM_SYMBOL_LEN]; -}; - -/* - * Structure which defines the trace of an initcall - * while it returns. - */ -struct boot_trace_ret { - char func[KSYM_SYMBOL_LEN]; - int result; - unsigned long long duration; /* nsecs */ -}; - -#ifdef CONFIG_BOOT_TRACER -/* Append the traces on the ring-buffer */ -extern void trace_boot_call(struct boot_trace_call *bt, initcall_t fn); -extern void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn); - -/* Tells the tracer that smp_pre_initcall is finished. - * So we can start the tracing - */ -extern void start_boot_trace(void); - -/* Resume the tracing of other necessary events - * such as sched switches - */ -extern void enable_boot_trace(void); - -/* Suspend this tracing. Actually, only sched_switches tracing have - * to be suspended. Initcalls doesn't need it.) - */ -extern void disable_boot_trace(void); -#else -static inline -void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) { } - -static inline -void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) { } - -static inline void start_boot_trace(void) { } -static inline void enable_boot_trace(void) { } -static inline void disable_boot_trace(void) { } -#endif /* CONFIG_BOOT_TRACER */ - -#endif /* __LINUX_TRACE_BOOT_H */ diff --git a/init/main.c b/init/main.c index 3bdb152f412f..94f65efdc65a 100644 --- a/init/main.c +++ b/init/main.c @@ -70,7 +70,6 @@ #include #include #include -#include #include #include @@ -715,38 +714,33 @@ int initcall_debug; core_param(initcall_debug, initcall_debug, bool, 0644); static char msgbuf[64]; -static struct boot_trace_call call; -static struct boot_trace_ret ret; int do_one_initcall(initcall_t fn) { int count = preempt_count(); ktime_t calltime, delta, rettime; + unsigned long long duration; + int ret; if (initcall_debug) { - call.caller = task_pid_nr(current); - printk("calling %pF @ %i\n", fn, call.caller); + printk("calling %pF @ %i\n", fn, task_pid_nr(current)); calltime = ktime_get(); - trace_boot_call(&call, fn); - enable_boot_trace(); } - ret.result = fn(); + ret = fn(); if (initcall_debug) { - disable_boot_trace(); rettime = ktime_get(); delta = ktime_sub(rettime, calltime); - ret.duration = (unsigned long long) ktime_to_ns(delta) >> 10; - trace_boot_ret(&ret, fn); - printk("initcall %pF returned %d after %Ld usecs\n", fn, - ret.result, ret.duration); + duration = (unsigned long long) ktime_to_ns(delta) >> 10; + printk("initcall %pF returned %d after %lld usecs\n", fn, + ret, duration); } msgbuf[0] = 0; - if (ret.result && ret.result != -ENODEV && initcall_debug) - sprintf(msgbuf, "error code %d ", ret.result); + if (ret && ret != -ENODEV && initcall_debug) + sprintf(msgbuf, "error code %d ", ret); if (preempt_count() != count) { strlcat(msgbuf, "preemption imbalance ", sizeof(msgbuf)); @@ -760,7 +754,7 @@ int do_one_initcall(initcall_t fn) printk("initcall %pF returned with %s\n", fn, msgbuf); } - return ret.result; + return ret; } @@ -880,7 +874,6 @@ static int __init kernel_init(void * unused) smp_prepare_cpus(setup_max_cpus); do_pre_smp_initcalls(); - start_boot_trace(); smp_init(); sched_init_smp(); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8b1797c4545b..572992abc71c 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -229,23 +229,6 @@ config FTRACE_SYSCALLS help Basic tracer to catch the syscall entry and exit events. -config BOOT_TRACER - bool "Trace boot initcalls" - select GENERIC_TRACER - select CONTEXT_SWITCH_TRACER - help - This tracer helps developers to optimize boot times: it records - the timings of the initcalls and traces key events and the identity - of tasks that can cause boot delays, such as context-switches. - - Its aim is to be parsed by the scripts/bootgraph.pl tool to - produce pretty graphics about boot inefficiencies, giving a visual - representation of the delays during initcalls - but the raw - /debug/tracing/trace text output is readable too. - - You must pass in initcall_debug and ftrace=initcall to the kernel - command line to enable this on bootup. - config TRACE_BRANCH_PROFILING bool select GENERIC_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index ffb1a5b0550e..c3aaeba82372 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -38,7 +38,6 @@ obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o obj-$(CONFIG_NOP_TRACER) += trace_nop.o obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o -obj-$(CONFIG_BOOT_TRACER) += trace_boot.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o obj-$(CONFIG_KMEMTRACE) += kmemtrace.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 55e48511d7c8..036fbc22858b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4603,9 +4603,6 @@ __init static int tracer_alloc_buffers(void) register_tracer(&nop_trace); current_trace = &nop_trace; -#ifdef CONFIG_BOOT_TRACER - register_tracer(&boot_tracer); -#endif /* All seems OK, enable tracing */ tracing_disabled = 0; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2cd96399463f..75a5e800a737 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -9,10 +9,8 @@ #include #include #include -#include #include #include - #include #include @@ -29,8 +27,6 @@ enum trace_type { TRACE_MMIO_RW, TRACE_MMIO_MAP, TRACE_BRANCH, - TRACE_BOOT_CALL, - TRACE_BOOT_RET, TRACE_GRAPH_RET, TRACE_GRAPH_ENT, TRACE_USER_STACK, @@ -48,8 +44,6 @@ enum kmemtrace_type_id { KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */ }; -extern struct tracer boot_tracer; - #undef __field #define __field(type, item) type item; @@ -209,8 +203,6 @@ extern void __ftrace_bad_type(void); TRACE_MMIO_RW); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ TRACE_MMIO_MAP); \ - IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\ - IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\ IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \ IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \ TRACE_GRAPH_ENT); \ diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c deleted file mode 100644 index c21d5f3956ad..000000000000 --- a/kernel/trace/trace_boot.c +++ /dev/null @@ -1,185 +0,0 @@ -/* - * ring buffer based initcalls tracer - * - * Copyright (C) 2008 Frederic Weisbecker - * - */ - -#include -#include -#include -#include -#include - -#include "trace.h" -#include "trace_output.h" - -static struct trace_array *boot_trace; -static bool pre_initcalls_finished; - -/* Tells the boot tracer that the pre_smp_initcalls are finished. - * So we are ready . - * It doesn't enable sched events tracing however. - * You have to call enable_boot_trace to do so. - */ -void start_boot_trace(void) -{ - pre_initcalls_finished = true; -} - -void enable_boot_trace(void) -{ - if (boot_trace && pre_initcalls_finished) - tracing_start_sched_switch_record(); -} - -void disable_boot_trace(void) -{ - if (boot_trace && pre_initcalls_finished) - tracing_stop_sched_switch_record(); -} - -static int boot_trace_init(struct trace_array *tr) -{ - boot_trace = tr; - - if (!tr) - return 0; - - tracing_reset_online_cpus(tr); - - tracing_sched_switch_assign_trace(tr); - return 0; -} - -static enum print_line_t -initcall_call_print_line(struct trace_iterator *iter) -{ - struct trace_entry *entry = iter->ent; - struct trace_seq *s = &iter->seq; - struct trace_boot_call *field; - struct boot_trace_call *call; - u64 ts; - unsigned long nsec_rem; - int ret; - - trace_assign_type(field, entry); - call = &field->boot_call; - ts = iter->ts; - nsec_rem = do_div(ts, NSEC_PER_SEC); - - ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n", - (unsigned long)ts, nsec_rem, call->func, call->caller); - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - else - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t -initcall_ret_print_line(struct trace_iterator *iter) -{ - struct trace_entry *entry = iter->ent; - struct trace_seq *s = &iter->seq; - struct trace_boot_ret *field; - struct boot_trace_ret *init_ret; - u64 ts; - unsigned long nsec_rem; - int ret; - - trace_assign_type(field, entry); - init_ret = &field->boot_ret; - ts = iter->ts; - nsec_rem = do_div(ts, NSEC_PER_SEC); - - ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s " - "returned %d after %llu msecs\n", - (unsigned long) ts, - nsec_rem, - init_ret->func, init_ret->result, init_ret->duration); - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - else - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t initcall_print_line(struct trace_iterator *iter) -{ - struct trace_entry *entry = iter->ent; - - switch (entry->type) { - case TRACE_BOOT_CALL: - return initcall_call_print_line(iter); - case TRACE_BOOT_RET: - return initcall_ret_print_line(iter); - default: - return TRACE_TYPE_UNHANDLED; - } -} - -struct tracer boot_tracer __read_mostly = -{ - .name = "initcall", - .init = boot_trace_init, - .reset = tracing_reset_online_cpus, - .print_line = initcall_print_line, -}; - -void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) -{ - struct ftrace_event_call *call = &event_boot_call; - struct ring_buffer_event *event; - struct ring_buffer *buffer; - struct trace_boot_call *entry; - struct trace_array *tr = boot_trace; - - if (!tr || !pre_initcalls_finished) - return; - - /* Get its name now since this function could - * disappear because it is in the .init section. - */ - sprint_symbol(bt->func, (unsigned long)fn); - preempt_disable(); - - buffer = tr->buffer; - event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_CALL, - sizeof(*entry), 0, 0); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->boot_call = *bt; - if (!filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, 0, 0); - out: - preempt_enable(); -} - -void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) -{ - struct ftrace_event_call *call = &event_boot_ret; - struct ring_buffer_event *event; - struct ring_buffer *buffer; - struct trace_boot_ret *entry; - struct trace_array *tr = boot_trace; - - if (!tr || !pre_initcalls_finished) - return; - - sprint_symbol(bt->func, (unsigned long)fn); - preempt_disable(); - - buffer = tr->buffer; - event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_RET, - sizeof(*entry), 0, 0); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->boot_ret = *bt; - if (!filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, 0, 0); - out: - preempt_enable(); -} diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index dc008c1240da..c293364c984f 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -271,33 +271,6 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, __entry->map_id, __entry->opcode) ); -FTRACE_ENTRY(boot_call, trace_boot_call, - - TRACE_BOOT_CALL, - - F_STRUCT( - __field_struct( struct boot_trace_call, boot_call ) - __field_desc( pid_t, boot_call, caller ) - __array_desc( char, boot_call, func, KSYM_SYMBOL_LEN) - ), - - F_printk("%d %s", __entry->caller, __entry->func) -); - -FTRACE_ENTRY(boot_ret, trace_boot_ret, - - TRACE_BOOT_RET, - - F_STRUCT( - __field_struct( struct boot_trace_ret, boot_ret ) - __array_desc( char, boot_ret, func, KSYM_SYMBOL_LEN) - __field_desc( int, boot_ret, result ) - __field_desc( unsigned long, boot_ret, duration ) - ), - - F_printk("%s %d %lx", - __entry->func, __entry->result, __entry->duration) -); #define TRACE_FUNC_SIZE 30 #define TRACE_FILE_SIZE 20 -- cgit v1.2.3-70-g09d2 From ecc55f84b2e9741f29daa787ded93986df6cbe17 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 May 2010 15:11:34 +0200 Subject: perf, trace: Inline perf_swevent_put_recursion_context() Inline perf_swevent_put_recursion_context into perf_tp_event(), this shrinks the per trace template code footprint and saves a function call. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 3 +-- include/linux/perf_event.h | 2 +- kernel/perf_event.c | 8 ++++---- 3 files changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 3167f2df4126..0af31cd335d6 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -257,8 +257,7 @@ static inline void perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr, u64 count, struct pt_regs *regs, void *head) { - perf_tp_event(addr, count, raw_data, size, regs, head); - perf_swevent_put_recursion_context(rctx); + perf_tp_event(addr, count, raw_data, size, regs, head, rctx); } #endif diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 5d0266d94985..c691a0b27bcd 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1001,7 +1001,7 @@ static inline bool perf_paranoid_kernel(void) extern void perf_event_init(void); extern void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, struct pt_regs *regs, - struct hlist_head *head); + struct hlist_head *head, int rctx); extern void perf_bp_event(struct perf_event *event, void *data); #ifndef perf_misc_flags diff --git a/kernel/perf_event.c b/kernel/perf_event.c index ff86c558af4c..4bd3b597bcca 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4213,14 +4213,12 @@ int perf_swevent_get_recursion_context(void) } EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); -void perf_swevent_put_recursion_context(int rctx) +void inline perf_swevent_put_recursion_context(int rctx) { struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); barrier(); cpuctx->recursion[rctx]--; } -EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context); - void __perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) @@ -4601,7 +4599,7 @@ static int perf_tp_event_match(struct perf_event *event, } void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, - struct pt_regs *regs, struct hlist_head *head) + struct pt_regs *regs, struct hlist_head *head, int rctx) { struct perf_sample_data data; struct perf_event *event; @@ -4621,6 +4619,8 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, perf_swevent_add(event, count, 1, &data, regs); } rcu_read_unlock(); + + perf_swevent_put_recursion_context(rctx); } EXPORT_SYMBOL_GPL(perf_tp_event); -- cgit v1.2.3-70-g09d2 From 8ed92280be013180e24c84456ab6babcb07037cc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 May 2010 15:13:59 +0200 Subject: perf, trace: Remove superfluous rcu_read_lock() __DO_TRACE() already calls the callbacks under rcu_read_lock_sched(), which is sufficient for our needs, avoid doing it again. Signed-off-by: Peter Zijlstra Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 4bd3b597bcca..b39bec346e80 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4613,12 +4613,10 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, perf_sample_data_init(&data, addr); data.raw = &raw; - rcu_read_lock(); hlist_for_each_entry_rcu(event, node, head, hlist_entry) { if (perf_tp_event_match(event, &data, regs)) perf_swevent_add(event, count, 1, &data, regs); } - rcu_read_unlock(); perf_swevent_put_recursion_context(rctx); } -- cgit v1.2.3-70-g09d2 From 3af9e859281bda7eb7c20b51879cf43aa788ac2e Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Tue, 18 May 2010 15:30:49 +0100 Subject: perf: Add non-exec mmap() tracking Add the capacility to track data mmap()s. This can be used together with PERF_SAMPLE_ADDR for data profiling. Signed-off-by: Anton Blanchard [Updated code for stable perf ABI] Signed-off-by: Eric B Munson Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Mike Galbraith Cc: Steven Rostedt LKML-Reference: <1274193049-25997-1-git-send-email-ebmunson@us.ibm.com> Signed-off-by: Ingo Molnar --- fs/exec.c | 1 + include/linux/perf_event.h | 12 +++--------- kernel/perf_event.c | 34 +++++++++++++++++++++++----------- mm/mmap.c | 6 +++++- tools/perf/builtin-record.c | 4 +++- 5 files changed, 35 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index e19de6a80339..97d91a03fb13 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -653,6 +653,7 @@ int setup_arg_pages(struct linux_binprm *bprm, else stack_base = vma->vm_start - stack_expand; #endif + current->mm->start_stack = bprm->p; ret = expand_stack(vma, stack_base); if (ret) ret = -EFAULT; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index c691a0b27bcd..36efad90cd43 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -214,8 +214,9 @@ struct perf_event_attr { * See also PERF_RECORD_MISC_EXACT_IP */ precise_ip : 2, /* skid constraint */ + mmap_data : 1, /* non-exec mmap data */ - __reserved_1 : 47; + __reserved_1 : 46; union { __u32 wakeup_events; /* wakeup every n events */ @@ -962,14 +963,7 @@ perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) } } -extern void __perf_event_mmap(struct vm_area_struct *vma); - -static inline void perf_event_mmap(struct vm_area_struct *vma) -{ - if (vma->vm_flags & VM_EXEC) - __perf_event_mmap(vma); -} - +extern void perf_event_mmap(struct vm_area_struct *vma); extern struct perf_guest_info_callbacks *perf_guest_cbs; extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); diff --git a/kernel/perf_event.c b/kernel/perf_event.c index b39bec346e80..227ed9c8ec34 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1891,7 +1891,7 @@ static void free_event(struct perf_event *event) if (!event->parent) { atomic_dec(&nr_events); - if (event->attr.mmap) + if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); if (event->attr.comm) atomic_dec(&nr_comm_events); @@ -3491,7 +3491,7 @@ perf_event_read_event(struct perf_event *event, /* * task tracking -- fork/exit * - * enabled by: attr.comm | attr.mmap | attr.task + * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task */ struct perf_task_event { @@ -3541,7 +3541,8 @@ static int perf_event_task_match(struct perf_event *event) if (event->cpu != -1 && event->cpu != smp_processor_id()) return 0; - if (event->attr.comm || event->attr.mmap || event->attr.task) + if (event->attr.comm || event->attr.mmap || + event->attr.mmap_data || event->attr.task) return 1; return 0; @@ -3766,7 +3767,8 @@ static void perf_event_mmap_output(struct perf_event *event, } static int perf_event_mmap_match(struct perf_event *event, - struct perf_mmap_event *mmap_event) + struct perf_mmap_event *mmap_event, + int executable) { if (event->state < PERF_EVENT_STATE_INACTIVE) return 0; @@ -3774,19 +3776,21 @@ static int perf_event_mmap_match(struct perf_event *event, if (event->cpu != -1 && event->cpu != smp_processor_id()) return 0; - if (event->attr.mmap) + if ((!executable && event->attr.mmap_data) || + (executable && event->attr.mmap)) return 1; return 0; } static void perf_event_mmap_ctx(struct perf_event_context *ctx, - struct perf_mmap_event *mmap_event) + struct perf_mmap_event *mmap_event, + int executable) { struct perf_event *event; list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (perf_event_mmap_match(event, mmap_event)) + if (perf_event_mmap_match(event, mmap_event, executable)) perf_event_mmap_output(event, mmap_event); } } @@ -3830,6 +3834,14 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) if (!vma->vm_mm) { name = strncpy(tmp, "[vdso]", sizeof(tmp)); goto got_name; + } else if (vma->vm_start <= vma->vm_mm->start_brk && + vma->vm_end >= vma->vm_mm->brk) { + name = strncpy(tmp, "[heap]", sizeof(tmp)); + goto got_name; + } else if (vma->vm_start <= vma->vm_mm->start_stack && + vma->vm_end >= vma->vm_mm->start_stack) { + name = strncpy(tmp, "[stack]", sizeof(tmp)); + goto got_name; } name = strncpy(tmp, "//anon", sizeof(tmp)); @@ -3846,17 +3858,17 @@ got_name: rcu_read_lock(); cpuctx = &get_cpu_var(perf_cpu_context); - perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); + perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC); ctx = rcu_dereference(current->perf_event_ctxp); if (ctx) - perf_event_mmap_ctx(ctx, mmap_event); + perf_event_mmap_ctx(ctx, mmap_event, vma->vm_flags & VM_EXEC); put_cpu_var(perf_cpu_context); rcu_read_unlock(); kfree(buf); } -void __perf_event_mmap(struct vm_area_struct *vma) +void perf_event_mmap(struct vm_area_struct *vma) { struct perf_mmap_event mmap_event; @@ -4911,7 +4923,7 @@ done: if (!event->parent) { atomic_inc(&nr_events); - if (event->attr.mmap) + if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); if (event->attr.comm) atomic_inc(&nr_comm_events); diff --git a/mm/mmap.c b/mm/mmap.c index 456ec6f27889..e38e910cb756 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1734,8 +1734,10 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) grow = (address - vma->vm_end) >> PAGE_SHIFT; error = acct_stack_growth(vma, size, grow); - if (!error) + if (!error) { vma->vm_end = address; + perf_event_mmap(vma); + } } anon_vma_unlock(vma); return error; @@ -1781,6 +1783,7 @@ static int expand_downwards(struct vm_area_struct *vma, if (!error) { vma->vm_start = address; vma->vm_pgoff -= grow; + perf_event_mmap(vma); } } anon_vma_unlock(vma); @@ -2208,6 +2211,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) vma->vm_page_prot = vm_get_page_prot(flags); vma_link(mm, vma, prev, rb_link, rb_parent); out: + perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; if (flags & VM_LOCKED) { if (!mlock_vma_pages_range(vma, addr, addr + len)) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 5e5c6403a315..39c7247bc54a 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -268,8 +268,10 @@ static void create_counter(int counter, int cpu) if (inherit_stat) attr->inherit_stat = 1; - if (sample_address) + if (sample_address) { attr->sample_type |= PERF_SAMPLE_ADDR; + attr->mmap_data = track; + } if (call_graph) attr->sample_type |= PERF_SAMPLE_CALLCHAIN; -- cgit v1.2.3-70-g09d2 From 8d2cacbbb8deadfae78aa16e4e1ee619bdd7019e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 25 May 2010 17:49:05 +0200 Subject: perf: Cleanup {start,commit,cancel}_txn details Clarify some of the transactional group scheduling API details and change it so that a successfull ->commit_txn also closes the transaction. Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Mike Galbraith Cc: Steven Rostedt LKML-Reference: <1274803086.5882.1752.camel@twins> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_event.c | 7 ++++--- arch/sparc/kernel/perf_event.c | 7 ++++--- arch/x86/kernel/cpu/perf_event.c | 14 +++++--------- include/linux/perf_event.h | 27 ++++++++++++++++++++++----- kernel/perf_event.c | 9 +-------- 5 files changed, 36 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c index 43b83c35cf54..ac2a8c2554d9 100644 --- a/arch/powerpc/kernel/perf_event.c +++ b/arch/powerpc/kernel/perf_event.c @@ -754,7 +754,7 @@ static int power_pmu_enable(struct perf_event *event) * skip the schedulability test here, it will be peformed * at commit time(->commit_txn) as a whole */ - if (cpuhw->group_flag & PERF_EVENT_TXN_STARTED) + if (cpuhw->group_flag & PERF_EVENT_TXN) goto nocheck; if (check_excludes(cpuhw->event, cpuhw->flags, n0, 1)) @@ -858,7 +858,7 @@ void power_pmu_start_txn(const struct pmu *pmu) { struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); - cpuhw->group_flag |= PERF_EVENT_TXN_STARTED; + cpuhw->group_flag |= PERF_EVENT_TXN; cpuhw->n_txn_start = cpuhw->n_events; } @@ -871,7 +871,7 @@ void power_pmu_cancel_txn(const struct pmu *pmu) { struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); - cpuhw->group_flag &= ~PERF_EVENT_TXN_STARTED; + cpuhw->group_flag &= ~PERF_EVENT_TXN; } /* @@ -897,6 +897,7 @@ int power_pmu_commit_txn(const struct pmu *pmu) for (i = cpuhw->n_txn_start; i < n; ++i) cpuhw->event[i]->hw.config = cpuhw->events[i]; + cpuhw->group_flag &= ~PERF_EVENT_TXN; return 0; } diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index 0ec92c8861dd..beeb92fa3acd 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -1005,7 +1005,7 @@ static int sparc_pmu_enable(struct perf_event *event) * skip the schedulability test here, it will be peformed * at commit time(->commit_txn) as a whole */ - if (cpuc->group_flag & PERF_EVENT_TXN_STARTED) + if (cpuc->group_flag & PERF_EVENT_TXN) goto nocheck; if (check_excludes(cpuc->event, n0, 1)) @@ -1102,7 +1102,7 @@ static void sparc_pmu_start_txn(const struct pmu *pmu) { struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); - cpuhw->group_flag |= PERF_EVENT_TXN_STARTED; + cpuhw->group_flag |= PERF_EVENT_TXN; } /* @@ -1114,7 +1114,7 @@ static void sparc_pmu_cancel_txn(const struct pmu *pmu) { struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); - cpuhw->group_flag &= ~PERF_EVENT_TXN_STARTED; + cpuhw->group_flag &= ~PERF_EVENT_TXN; } /* @@ -1137,6 +1137,7 @@ static int sparc_pmu_commit_txn(const struct pmu *pmu) if (sparc_check_constraints(cpuc->event, cpuc->events, n)) return -EAGAIN; + cpuc->group_flag &= ~PERF_EVENT_TXN; return 0; } diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 5db5b7d65a18..af04c6fa59cb 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -969,7 +969,7 @@ static int x86_pmu_enable(struct perf_event *event) * skip the schedulability test here, it will be peformed * at commit time(->commit_txn) as a whole */ - if (cpuc->group_flag & PERF_EVENT_TXN_STARTED) + if (cpuc->group_flag & PERF_EVENT_TXN) goto out; ret = x86_pmu.schedule_events(cpuc, n, assign); @@ -1096,7 +1096,7 @@ static void x86_pmu_disable(struct perf_event *event) * The events never got scheduled and ->cancel_txn will truncate * the event_list. */ - if (cpuc->group_flag & PERF_EVENT_TXN_STARTED) + if (cpuc->group_flag & PERF_EVENT_TXN) return; x86_pmu_stop(event); @@ -1388,7 +1388,7 @@ static void x86_pmu_start_txn(const struct pmu *pmu) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - cpuc->group_flag |= PERF_EVENT_TXN_STARTED; + cpuc->group_flag |= PERF_EVENT_TXN; cpuc->n_txn = 0; } @@ -1401,7 +1401,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - cpuc->group_flag &= ~PERF_EVENT_TXN_STARTED; + cpuc->group_flag &= ~PERF_EVENT_TXN; /* * Truncate the collected events. */ @@ -1435,11 +1435,7 @@ static int x86_pmu_commit_txn(const struct pmu *pmu) */ memcpy(cpuc->assign, assign, n*sizeof(int)); - /* - * Clear out the txn count so that ->cancel_txn() which gets - * run after ->commit_txn() doesn't undo things. - */ - cpuc->n_txn = 0; + cpuc->group_flag &= ~PERF_EVENT_TXN; return 0; } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 36efad90cd43..f1b6ba0770e0 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -549,7 +549,10 @@ struct hw_perf_event { struct perf_event; -#define PERF_EVENT_TXN_STARTED 1 +/* + * Common implementation detail of pmu::{start,commit,cancel}_txn + */ +#define PERF_EVENT_TXN 0x1 /** * struct pmu - generic performance monitoring unit @@ -563,14 +566,28 @@ struct pmu { void (*unthrottle) (struct perf_event *event); /* - * group events scheduling is treated as a transaction, - * add group events as a whole and perform one schedulability test. - * If test fails, roll back the whole group + * Group events scheduling is treated as a transaction, add group + * events as a whole and perform one schedulability test. If the test + * fails, roll back the whole group */ + /* + * Start the transaction, after this ->enable() doesn't need + * to do schedulability tests. + */ void (*start_txn) (const struct pmu *pmu); - void (*cancel_txn) (const struct pmu *pmu); + /* + * If ->start_txn() disabled the ->enable() schedulability test + * then ->commit_txn() is required to perform one. On success + * the transaction is closed. On error the transaction is kept + * open until ->cancel_txn() is called. + */ int (*commit_txn) (const struct pmu *pmu); + /* + * Will cancel the transaction, assumes ->disable() is called for + * each successfull ->enable() during the transaction. + */ + void (*cancel_txn) (const struct pmu *pmu); }; /** diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 227ed9c8ec34..6f60920772b3 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -675,7 +675,6 @@ group_sched_in(struct perf_event *group_event, struct perf_event *event, *partial_group = NULL; const struct pmu *pmu = group_event->pmu; bool txn = false; - int ret; if (group_event->state == PERF_EVENT_STATE_OFF) return 0; @@ -703,15 +702,9 @@ group_sched_in(struct perf_event *group_event, } } - if (!txn) + if (!txn || !pmu->commit_txn(pmu)) return 0; - ret = pmu->commit_txn(pmu); - if (!ret) { - pmu->cancel_txn(pmu); - return 0; - } - group_error: /* * Groups can be scheduled in as one unit only, so undo any -- cgit v1.2.3-70-g09d2 From ca5135e6b4a3cbc7e187737520fbc4b508f6f7a2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 28 May 2010 19:33:23 +0200 Subject: perf: Rename perf_mmap_data to perf_buffer Rename to clarify code. s/perf_mmap_data/perf_buffer/g and selective s/data/buffer/g Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Mike Galbraith Cc: Steven Rostedt LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 6 +- kernel/perf_event.c | 308 ++++++++++++++++++++++----------------------- 2 files changed, 157 insertions(+), 157 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f1b6ba0770e0..2a0da021c23f 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -602,7 +602,7 @@ enum perf_event_active_state { struct file; -struct perf_mmap_data { +struct perf_buffer { atomic_t refcount; struct rcu_head rcu_head; #ifdef CONFIG_PERF_USE_VMALLOC @@ -727,7 +727,7 @@ struct perf_event { atomic_t mmap_count; int mmap_locked; struct user_struct *mmap_user; - struct perf_mmap_data *data; + struct perf_buffer *buffer; /* poll related */ wait_queue_head_t waitq; @@ -825,7 +825,7 @@ struct perf_cpu_context { struct perf_output_handle { struct perf_event *event; - struct perf_mmap_data *data; + struct perf_buffer *buffer; unsigned long wakeup; unsigned long size; void *addr; diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 6f60920772b3..93d545801e43 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1876,7 +1876,7 @@ static void free_event_rcu(struct rcu_head *head) } static void perf_pending_sync(struct perf_event *event); -static void perf_mmap_data_put(struct perf_mmap_data *data); +static void perf_buffer_put(struct perf_buffer *buffer); static void free_event(struct perf_event *event) { @@ -1892,9 +1892,9 @@ static void free_event(struct perf_event *event) atomic_dec(&nr_task_events); } - if (event->data) { - perf_mmap_data_put(event->data); - event->data = NULL; + if (event->buffer) { + perf_buffer_put(event->buffer); + event->buffer = NULL; } if (event->destroy) @@ -2119,13 +2119,13 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) static unsigned int perf_poll(struct file *file, poll_table *wait) { struct perf_event *event = file->private_data; - struct perf_mmap_data *data; + struct perf_buffer *buffer; unsigned int events = POLL_HUP; rcu_read_lock(); - data = rcu_dereference(event->data); - if (data) - events = atomic_xchg(&data->poll, 0); + buffer = rcu_dereference(event->buffer); + if (buffer) + events = atomic_xchg(&buffer->poll, 0); rcu_read_unlock(); poll_wait(file, &event->waitq, wait); @@ -2335,14 +2335,14 @@ static int perf_event_index(struct perf_event *event) void perf_event_update_userpage(struct perf_event *event) { struct perf_event_mmap_page *userpg; - struct perf_mmap_data *data; + struct perf_buffer *buffer; rcu_read_lock(); - data = rcu_dereference(event->data); - if (!data) + buffer = rcu_dereference(event->buffer); + if (!buffer) goto unlock; - userpg = data->user_page; + userpg = buffer->user_page; /* * Disable preemption so as to not let the corresponding user-space @@ -2376,15 +2376,15 @@ unlock: */ static struct page * -perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) +perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff) { - if (pgoff > data->nr_pages) + if (pgoff > buffer->nr_pages) return NULL; if (pgoff == 0) - return virt_to_page(data->user_page); + return virt_to_page(buffer->user_page); - return virt_to_page(data->data_pages[pgoff - 1]); + return virt_to_page(buffer->data_pages[pgoff - 1]); } static void *perf_mmap_alloc_page(int cpu) @@ -2400,42 +2400,42 @@ static void *perf_mmap_alloc_page(int cpu) return page_address(page); } -static struct perf_mmap_data * -perf_mmap_data_alloc(struct perf_event *event, int nr_pages) +static struct perf_buffer * +perf_buffer_alloc(struct perf_event *event, int nr_pages) { - struct perf_mmap_data *data; + struct perf_buffer *buffer; unsigned long size; int i; - size = sizeof(struct perf_mmap_data); + size = sizeof(struct perf_buffer); size += nr_pages * sizeof(void *); - data = kzalloc(size, GFP_KERNEL); - if (!data) + buffer = kzalloc(size, GFP_KERNEL); + if (!buffer) goto fail; - data->user_page = perf_mmap_alloc_page(event->cpu); - if (!data->user_page) + buffer->user_page = perf_mmap_alloc_page(event->cpu); + if (!buffer->user_page) goto fail_user_page; for (i = 0; i < nr_pages; i++) { - data->data_pages[i] = perf_mmap_alloc_page(event->cpu); - if (!data->data_pages[i]) + buffer->data_pages[i] = perf_mmap_alloc_page(event->cpu); + if (!buffer->data_pages[i]) goto fail_data_pages; } - data->nr_pages = nr_pages; + buffer->nr_pages = nr_pages; - return data; + return buffer; fail_data_pages: for (i--; i >= 0; i--) - free_page((unsigned long)data->data_pages[i]); + free_page((unsigned long)buffer->data_pages[i]); - free_page((unsigned long)data->user_page); + free_page((unsigned long)buffer->user_page); fail_user_page: - kfree(data); + kfree(buffer); fail: return NULL; @@ -2449,17 +2449,17 @@ static void perf_mmap_free_page(unsigned long addr) __free_page(page); } -static void perf_mmap_data_free(struct perf_mmap_data *data) +static void perf_buffer_free(struct perf_buffer *buffer) { int i; - perf_mmap_free_page((unsigned long)data->user_page); - for (i = 0; i < data->nr_pages; i++) - perf_mmap_free_page((unsigned long)data->data_pages[i]); - kfree(data); + perf_mmap_free_page((unsigned long)buffer->user_page); + for (i = 0; i < buffer->nr_pages; i++) + perf_mmap_free_page((unsigned long)buffer->data_pages[i]); + kfree(buffer); } -static inline int page_order(struct perf_mmap_data *data) +static inline int page_order(struct perf_buffer *buffer) { return 0; } @@ -2472,18 +2472,18 @@ static inline int page_order(struct perf_mmap_data *data) * Required for architectures that have d-cache aliasing issues. */ -static inline int page_order(struct perf_mmap_data *data) +static inline int page_order(struct perf_buffer *buffer) { - return data->page_order; + return buffer->page_order; } static struct page * -perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) +perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff) { - if (pgoff > (1UL << page_order(data))) + if (pgoff > (1UL << page_order(buffer))) return NULL; - return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE); + return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE); } static void perf_mmap_unmark_page(void *addr) @@ -2493,57 +2493,57 @@ static void perf_mmap_unmark_page(void *addr) page->mapping = NULL; } -static void perf_mmap_data_free_work(struct work_struct *work) +static void perf_buffer_free_work(struct work_struct *work) { - struct perf_mmap_data *data; + struct perf_buffer *buffer; void *base; int i, nr; - data = container_of(work, struct perf_mmap_data, work); - nr = 1 << page_order(data); + buffer = container_of(work, struct perf_buffer, work); + nr = 1 << page_order(buffer); - base = data->user_page; + base = buffer->user_page; for (i = 0; i < nr + 1; i++) perf_mmap_unmark_page(base + (i * PAGE_SIZE)); vfree(base); - kfree(data); + kfree(buffer); } -static void perf_mmap_data_free(struct perf_mmap_data *data) +static void perf_buffer_free(struct perf_buffer *buffer) { - schedule_work(&data->work); + schedule_work(&buffer->work); } -static struct perf_mmap_data * -perf_mmap_data_alloc(struct perf_event *event, int nr_pages) +static struct perf_buffer * +perf_buffer_alloc(struct perf_event *event, int nr_pages) { - struct perf_mmap_data *data; + struct perf_buffer *buffer; unsigned long size; void *all_buf; - size = sizeof(struct perf_mmap_data); + size = sizeof(struct perf_buffer); size += sizeof(void *); - data = kzalloc(size, GFP_KERNEL); - if (!data) + buffer = kzalloc(size, GFP_KERNEL); + if (!buffer) goto fail; - INIT_WORK(&data->work, perf_mmap_data_free_work); + INIT_WORK(&buffer->work, perf_buffer_free_work); all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); if (!all_buf) goto fail_all_buf; - data->user_page = all_buf; - data->data_pages[0] = all_buf + PAGE_SIZE; - data->page_order = ilog2(nr_pages); - data->nr_pages = 1; + buffer->user_page = all_buf; + buffer->data_pages[0] = all_buf + PAGE_SIZE; + buffer->page_order = ilog2(nr_pages); + buffer->nr_pages = 1; - return data; + return buffer; fail_all_buf: - kfree(data); + kfree(buffer); fail: return NULL; @@ -2551,15 +2551,15 @@ fail: #endif -static unsigned long perf_data_size(struct perf_mmap_data *data) +static unsigned long perf_data_size(struct perf_buffer *buffer) { - return data->nr_pages << (PAGE_SHIFT + page_order(data)); + return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer)); } static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct perf_event *event = vma->vm_file->private_data; - struct perf_mmap_data *data; + struct perf_buffer *buffer; int ret = VM_FAULT_SIGBUS; if (vmf->flags & FAULT_FLAG_MKWRITE) { @@ -2569,14 +2569,14 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } rcu_read_lock(); - data = rcu_dereference(event->data); - if (!data) + buffer = rcu_dereference(event->buffer); + if (!buffer) goto unlock; if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) goto unlock; - vmf->page = perf_mmap_to_page(data, vmf->pgoff); + vmf->page = perf_mmap_to_page(buffer, vmf->pgoff); if (!vmf->page) goto unlock; @@ -2592,51 +2592,51 @@ unlock: } static void -perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data) +perf_buffer_init(struct perf_event *event, struct perf_buffer *buffer) { - long max_size = perf_data_size(data); + long max_size = perf_data_size(buffer); if (event->attr.watermark) { - data->watermark = min_t(long, max_size, + buffer->watermark = min_t(long, max_size, event->attr.wakeup_watermark); } - if (!data->watermark) - data->watermark = max_size / 2; + if (!buffer->watermark) + buffer->watermark = max_size / 2; - atomic_set(&data->refcount, 1); - rcu_assign_pointer(event->data, data); + atomic_set(&buffer->refcount, 1); + rcu_assign_pointer(event->buffer, buffer); } -static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head) +static void perf_buffer_free_rcu(struct rcu_head *rcu_head) { - struct perf_mmap_data *data; + struct perf_buffer *buffer; - data = container_of(rcu_head, struct perf_mmap_data, rcu_head); - perf_mmap_data_free(data); + buffer = container_of(rcu_head, struct perf_buffer, rcu_head); + perf_buffer_free(buffer); } -static struct perf_mmap_data *perf_mmap_data_get(struct perf_event *event) +static struct perf_buffer *perf_buffer_get(struct perf_event *event) { - struct perf_mmap_data *data; + struct perf_buffer *buffer; rcu_read_lock(); - data = rcu_dereference(event->data); - if (data) { - if (!atomic_inc_not_zero(&data->refcount)) - data = NULL; + buffer = rcu_dereference(event->buffer); + if (buffer) { + if (!atomic_inc_not_zero(&buffer->refcount)) + buffer = NULL; } rcu_read_unlock(); - return data; + return buffer; } -static void perf_mmap_data_put(struct perf_mmap_data *data) +static void perf_buffer_put(struct perf_buffer *buffer) { - if (!atomic_dec_and_test(&data->refcount)) + if (!atomic_dec_and_test(&buffer->refcount)) return; - call_rcu(&data->rcu_head, perf_mmap_data_free_rcu); + call_rcu(&buffer->rcu_head, perf_buffer_free_rcu); } static void perf_mmap_open(struct vm_area_struct *vma) @@ -2651,16 +2651,16 @@ static void perf_mmap_close(struct vm_area_struct *vma) struct perf_event *event = vma->vm_file->private_data; if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { - unsigned long size = perf_data_size(event->data); + unsigned long size = perf_data_size(event->buffer); struct user_struct *user = event->mmap_user; - struct perf_mmap_data *data = event->data; + struct perf_buffer *buffer = event->buffer; atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); vma->vm_mm->locked_vm -= event->mmap_locked; - rcu_assign_pointer(event->data, NULL); + rcu_assign_pointer(event->buffer, NULL); mutex_unlock(&event->mmap_mutex); - perf_mmap_data_put(data); + perf_buffer_put(buffer); free_uid(user); } } @@ -2678,7 +2678,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) unsigned long user_locked, user_lock_limit; struct user_struct *user = current_user(); unsigned long locked, lock_limit; - struct perf_mmap_data *data; + struct perf_buffer *buffer; unsigned long vma_size; unsigned long nr_pages; long user_extra, extra; @@ -2699,7 +2699,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) nr_pages = (vma_size / PAGE_SIZE) - 1; /* - * If we have data pages ensure they're a power-of-two number, so we + * If we have buffer pages ensure they're a power-of-two number, so we * can do bitmasks instead of modulo. */ if (nr_pages != 0 && !is_power_of_2(nr_pages)) @@ -2713,9 +2713,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) WARN_ON_ONCE(event->ctx->parent_ctx); mutex_lock(&event->mmap_mutex); - if (event->data) { - if (event->data->nr_pages == nr_pages) - atomic_inc(&event->data->refcount); + if (event->buffer) { + if (event->buffer->nr_pages == nr_pages) + atomic_inc(&event->buffer->refcount); else ret = -EINVAL; goto unlock; @@ -2745,17 +2745,17 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) goto unlock; } - WARN_ON(event->data); + WARN_ON(event->buffer); - data = perf_mmap_data_alloc(event, nr_pages); - if (!data) { + buffer = perf_buffer_alloc(event, nr_pages); + if (!buffer) { ret = -ENOMEM; goto unlock; } - perf_mmap_data_init(event, data); + perf_buffer_init(event, buffer); if (vma->vm_flags & VM_WRITE) - event->data->writable = 1; + event->buffer->writable = 1; atomic_long_add(user_extra, &user->locked_vm); event->mmap_locked = extra; @@ -2964,15 +2964,15 @@ EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); /* * Output */ -static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, +static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail, unsigned long offset, unsigned long head) { unsigned long mask; - if (!data->writable) + if (!buffer->writable) return true; - mask = perf_data_size(data) - 1; + mask = perf_data_size(buffer) - 1; offset = (offset - tail) & mask; head = (head - tail) & mask; @@ -2985,7 +2985,7 @@ static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, static void perf_output_wakeup(struct perf_output_handle *handle) { - atomic_set(&handle->data->poll, POLL_IN); + atomic_set(&handle->buffer->poll, POLL_IN); if (handle->nmi) { handle->event->pending_wakeup = 1; @@ -3005,45 +3005,45 @@ static void perf_output_wakeup(struct perf_output_handle *handle) */ static void perf_output_get_handle(struct perf_output_handle *handle) { - struct perf_mmap_data *data = handle->data; + struct perf_buffer *buffer = handle->buffer; preempt_disable(); - local_inc(&data->nest); - handle->wakeup = local_read(&data->wakeup); + local_inc(&buffer->nest); + handle->wakeup = local_read(&buffer->wakeup); } static void perf_output_put_handle(struct perf_output_handle *handle) { - struct perf_mmap_data *data = handle->data; + struct perf_buffer *buffer = handle->buffer; unsigned long head; again: - head = local_read(&data->head); + head = local_read(&buffer->head); /* * IRQ/NMI can happen here, which means we can miss a head update. */ - if (!local_dec_and_test(&data->nest)) + if (!local_dec_and_test(&buffer->nest)) goto out; /* * Publish the known good head. Rely on the full barrier implied - * by atomic_dec_and_test() order the data->head read and this + * by atomic_dec_and_test() order the buffer->head read and this * write. */ - data->user_page->data_head = head; + buffer->user_page->data_head = head; /* * Now check if we missed an update, rely on the (compiler) - * barrier in atomic_dec_and_test() to re-read data->head. + * barrier in atomic_dec_and_test() to re-read buffer->head. */ - if (unlikely(head != local_read(&data->head))) { - local_inc(&data->nest); + if (unlikely(head != local_read(&buffer->head))) { + local_inc(&buffer->nest); goto again; } - if (handle->wakeup != local_read(&data->wakeup)) + if (handle->wakeup != local_read(&buffer->wakeup)) perf_output_wakeup(handle); out: @@ -3063,12 +3063,12 @@ __always_inline void perf_output_copy(struct perf_output_handle *handle, buf += size; handle->size -= size; if (!handle->size) { - struct perf_mmap_data *data = handle->data; + struct perf_buffer *buffer = handle->buffer; handle->page++; - handle->page &= data->nr_pages - 1; - handle->addr = data->data_pages[handle->page]; - handle->size = PAGE_SIZE << page_order(data); + handle->page &= buffer->nr_pages - 1; + handle->addr = buffer->data_pages[handle->page]; + handle->size = PAGE_SIZE << page_order(buffer); } } while (len); } @@ -3077,7 +3077,7 @@ int perf_output_begin(struct perf_output_handle *handle, struct perf_event *event, unsigned int size, int nmi, int sample) { - struct perf_mmap_data *data; + struct perf_buffer *buffer; unsigned long tail, offset, head; int have_lost; struct { @@ -3093,19 +3093,19 @@ int perf_output_begin(struct perf_output_handle *handle, if (event->parent) event = event->parent; - data = rcu_dereference(event->data); - if (!data) + buffer = rcu_dereference(event->buffer); + if (!buffer) goto out; - handle->data = data; + handle->buffer = buffer; handle->event = event; handle->nmi = nmi; handle->sample = sample; - if (!data->nr_pages) + if (!buffer->nr_pages) goto out; - have_lost = local_read(&data->lost); + have_lost = local_read(&buffer->lost); if (have_lost) size += sizeof(lost_event); @@ -3117,30 +3117,30 @@ int perf_output_begin(struct perf_output_handle *handle, * tail pointer. So that all reads will be completed before the * write is issued. */ - tail = ACCESS_ONCE(data->user_page->data_tail); + tail = ACCESS_ONCE(buffer->user_page->data_tail); smp_rmb(); - offset = head = local_read(&data->head); + offset = head = local_read(&buffer->head); head += size; - if (unlikely(!perf_output_space(data, tail, offset, head))) + if (unlikely(!perf_output_space(buffer, tail, offset, head))) goto fail; - } while (local_cmpxchg(&data->head, offset, head) != offset); + } while (local_cmpxchg(&buffer->head, offset, head) != offset); - if (head - local_read(&data->wakeup) > data->watermark) - local_add(data->watermark, &data->wakeup); + if (head - local_read(&buffer->wakeup) > buffer->watermark) + local_add(buffer->watermark, &buffer->wakeup); - handle->page = offset >> (PAGE_SHIFT + page_order(data)); - handle->page &= data->nr_pages - 1; - handle->size = offset & ((PAGE_SIZE << page_order(data)) - 1); - handle->addr = data->data_pages[handle->page]; + handle->page = offset >> (PAGE_SHIFT + page_order(buffer)); + handle->page &= buffer->nr_pages - 1; + handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1); + handle->addr = buffer->data_pages[handle->page]; handle->addr += handle->size; - handle->size = (PAGE_SIZE << page_order(data)) - handle->size; + handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size; if (have_lost) { lost_event.header.type = PERF_RECORD_LOST; lost_event.header.misc = 0; lost_event.header.size = sizeof(lost_event); lost_event.id = event->id; - lost_event.lost = local_xchg(&data->lost, 0); + lost_event.lost = local_xchg(&buffer->lost, 0); perf_output_put(handle, lost_event); } @@ -3148,7 +3148,7 @@ int perf_output_begin(struct perf_output_handle *handle, return 0; fail: - local_inc(&data->lost); + local_inc(&buffer->lost); perf_output_put_handle(handle); out: rcu_read_unlock(); @@ -3159,15 +3159,15 @@ out: void perf_output_end(struct perf_output_handle *handle) { struct perf_event *event = handle->event; - struct perf_mmap_data *data = handle->data; + struct perf_buffer *buffer = handle->buffer; int wakeup_events = event->attr.wakeup_events; if (handle->sample && wakeup_events) { - int events = local_inc_return(&data->events); + int events = local_inc_return(&buffer->events); if (events >= wakeup_events) { - local_sub(wakeup_events, &data->events); - local_inc(&data->wakeup); + local_sub(wakeup_events, &buffer->events); + local_inc(&buffer->wakeup); } } @@ -5010,7 +5010,7 @@ err_size: static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event) { - struct perf_mmap_data *data = NULL, *old_data = NULL; + struct perf_buffer *buffer = NULL, *old_buffer = NULL; int ret = -EINVAL; if (!output_event) @@ -5040,19 +5040,19 @@ set: if (output_event) { /* get the buffer we want to redirect to */ - data = perf_mmap_data_get(output_event); - if (!data) + buffer = perf_buffer_get(output_event); + if (!buffer) goto unlock; } - old_data = event->data; - rcu_assign_pointer(event->data, data); + old_buffer = event->buffer; + rcu_assign_pointer(event->buffer, buffer); ret = 0; unlock: mutex_unlock(&event->mmap_mutex); - if (old_data) - perf_mmap_data_put(old_data); + if (old_buffer) + perf_buffer_put(old_buffer); out: return ret; } -- cgit v1.2.3-70-g09d2 From d57e34fdd60be7ffd0b1d86bfa1a553df86b7172 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 28 May 2010 19:41:35 +0200 Subject: perf: Simplify the ring-buffer logic: make perf_buffer_alloc() do everything needed Currently there are perf_buffer_alloc() + perf_buffer_init() + some separate bits, fold it all into a single perf_buffer_alloc() and only leave the attachment to the event separate. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 2 ++ kernel/perf_event.c | 61 ++++++++++++++++++++++++++-------------------- 2 files changed, 36 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 2a0da021c23f..441992a9775c 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -602,6 +602,8 @@ enum perf_event_active_state { struct file; +#define PERF_BUFFER_WRITABLE 0x01 + struct perf_buffer { atomic_t refcount; struct rcu_head rcu_head; diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 93d545801e43..f75c9c9c8177 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2369,6 +2369,25 @@ unlock: rcu_read_unlock(); } +static unsigned long perf_data_size(struct perf_buffer *buffer); + +static void +perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags) +{ + long max_size = perf_data_size(buffer); + + if (watermark) + buffer->watermark = min(max_size, watermark); + + if (!buffer->watermark) + buffer->watermark = max_size / 2; + + if (flags & PERF_BUFFER_WRITABLE) + buffer->writable = 1; + + atomic_set(&buffer->refcount, 1); +} + #ifndef CONFIG_PERF_USE_VMALLOC /* @@ -2401,7 +2420,7 @@ static void *perf_mmap_alloc_page(int cpu) } static struct perf_buffer * -perf_buffer_alloc(struct perf_event *event, int nr_pages) +perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags) { struct perf_buffer *buffer; unsigned long size; @@ -2414,18 +2433,20 @@ perf_buffer_alloc(struct perf_event *event, int nr_pages) if (!buffer) goto fail; - buffer->user_page = perf_mmap_alloc_page(event->cpu); + buffer->user_page = perf_mmap_alloc_page(cpu); if (!buffer->user_page) goto fail_user_page; for (i = 0; i < nr_pages; i++) { - buffer->data_pages[i] = perf_mmap_alloc_page(event->cpu); + buffer->data_pages[i] = perf_mmap_alloc_page(cpu); if (!buffer->data_pages[i]) goto fail_data_pages; } buffer->nr_pages = nr_pages; + perf_buffer_init(buffer, watermark, flags); + return buffer; fail_data_pages: @@ -2516,7 +2537,7 @@ static void perf_buffer_free(struct perf_buffer *buffer) } static struct perf_buffer * -perf_buffer_alloc(struct perf_event *event, int nr_pages) +perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags) { struct perf_buffer *buffer; unsigned long size; @@ -2540,6 +2561,8 @@ perf_buffer_alloc(struct perf_event *event, int nr_pages) buffer->page_order = ilog2(nr_pages); buffer->nr_pages = 1; + perf_buffer_init(buffer, watermark, flags); + return buffer; fail_all_buf: @@ -2591,23 +2614,6 @@ unlock: return ret; } -static void -perf_buffer_init(struct perf_event *event, struct perf_buffer *buffer) -{ - long max_size = perf_data_size(buffer); - - if (event->attr.watermark) { - buffer->watermark = min_t(long, max_size, - event->attr.wakeup_watermark); - } - - if (!buffer->watermark) - buffer->watermark = max_size / 2; - - atomic_set(&buffer->refcount, 1); - rcu_assign_pointer(event->buffer, buffer); -} - static void perf_buffer_free_rcu(struct rcu_head *rcu_head) { struct perf_buffer *buffer; @@ -2682,7 +2688,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) unsigned long vma_size; unsigned long nr_pages; long user_extra, extra; - int ret = 0; + int ret = 0, flags = 0; /* * Don't allow mmap() of inherited per-task counters. This would @@ -2747,15 +2753,16 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) WARN_ON(event->buffer); - buffer = perf_buffer_alloc(event, nr_pages); + if (vma->vm_flags & VM_WRITE) + flags |= PERF_BUFFER_WRITABLE; + + buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark, + event->cpu, flags); if (!buffer) { ret = -ENOMEM; goto unlock; } - - perf_buffer_init(event, buffer); - if (vma->vm_flags & VM_WRITE) - event->buffer->writable = 1; + rcu_assign_pointer(event->buffer, buffer); atomic_long_add(user_extra, &user->locked_vm); event->mmap_locked = extra; -- cgit v1.2.3-70-g09d2 From b5e58793c7a8ec35e72ea6ec6c353499dd189809 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 May 2010 14:43:12 +0200 Subject: perf: Add perf_event_count() Create a helper function for those sites that want to read the event count. Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Mike Galbraith Cc: Steven Rostedt LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index f75c9c9c8177..ab4c0ffc271c 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1736,6 +1736,11 @@ static void __perf_event_read(void *info) event->pmu->read(event); } +static inline u64 perf_event_count(struct perf_event *event) +{ + return atomic64_read(&event->count); +} + static u64 perf_event_read(struct perf_event *event) { /* @@ -1755,7 +1760,7 @@ static u64 perf_event_read(struct perf_event *event) raw_spin_unlock_irqrestore(&ctx->lock, flags); } - return atomic64_read(&event->count); + return perf_event_count(event); } /* @@ -2352,7 +2357,7 @@ void perf_event_update_userpage(struct perf_event *event) ++userpg->lock; barrier(); userpg->index = perf_event_index(event); - userpg->offset = atomic64_read(&event->count); + userpg->offset = perf_event_count(event); if (event->state == PERF_EVENT_STATE_ACTIVE) userpg->offset -= atomic64_read(&event->hw.prev_count); @@ -3211,7 +3216,7 @@ static void perf_output_read_one(struct perf_output_handle *handle, u64 values[4]; int n = 0; - values[n++] = atomic64_read(&event->count); + values[n++] = perf_event_count(event); if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { values[n++] = event->total_time_enabled + atomic64_read(&event->child_total_time_enabled); @@ -3248,7 +3253,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, if (leader != event) leader->pmu->read(leader); - values[n++] = atomic64_read(&leader->count); + values[n++] = perf_event_count(leader); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); @@ -3260,7 +3265,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, if (sub != event) sub->pmu->read(sub); - values[n++] = atomic64_read(&sub->count); + values[n++] = perf_event_count(sub); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); @@ -5369,7 +5374,7 @@ static void sync_child_event(struct perf_event *child_event, if (child_event->attr.inherit_stat) perf_event_read_event(child_event, child); - child_val = atomic64_read(&child_event->count); + child_val = perf_event_count(child_event); /* * Add back the child's count to the parent's count: -- cgit v1.2.3-70-g09d2 From a6e6dea68c18f705957573ee5596097c7e82d0e5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 May 2010 14:27:58 +0200 Subject: perf: Add perf_event::child_count Only child counters adding back their values into the parent counter are responsible for cross-cpu updates to event->count. So if we pull that out into a new child_count variable, we get an event->count that is only modified locally. Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Mike Galbraith Cc: Steven Rostedt LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 1 + kernel/perf_event.c | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 441992a9775c..f34dab9b275e 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -671,6 +671,7 @@ struct perf_event { enum perf_event_active_state state; unsigned int attach_state; atomic64_t count; + atomic64_t child_count; /* * These are the total time in nanoseconds that the event diff --git a/kernel/perf_event.c b/kernel/perf_event.c index ab4c0ffc271c..a395fda2d94c 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1738,7 +1738,7 @@ static void __perf_event_read(void *info) static inline u64 perf_event_count(struct perf_event *event) { - return atomic64_read(&event->count); + return atomic64_read(&event->count) + atomic64_read(&event->child_count); } static u64 perf_event_read(struct perf_event *event) @@ -5379,7 +5379,7 @@ static void sync_child_event(struct perf_event *child_event, /* * Add back the child's count to the parent's count: */ - atomic64_add(child_val, &parent_event->count); + atomic64_add(child_val, &parent_event->child_count); atomic64_add(child_event->total_time_enabled, &parent_event->child_total_time_enabled); atomic64_add(child_event->total_time_running, -- cgit v1.2.3-70-g09d2 From e78505958cf123048fb48cb56b79cebb8edd15fb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 May 2010 14:43:08 +0200 Subject: perf: Convert perf_event to local_t Since now all modification to event->count (and ->prev_count and ->period_left) are local to a cpu, change then to local64_t so we avoid the LOCK'ed ops. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- arch/arm/kernel/perf_event.c | 18 ++++++++--------- arch/powerpc/kernel/perf_event.c | 34 ++++++++++++++++---------------- arch/sh/kernel/perf_event.c | 6 +++--- arch/sparc/kernel/perf_event.c | 18 ++++++++--------- arch/x86/kernel/cpu/perf_event.c | 18 ++++++++--------- include/linux/perf_event.h | 7 ++++--- kernel/perf_event.c | 42 ++++++++++++++++++++-------------------- 7 files changed, 72 insertions(+), 71 deletions(-) (limited to 'kernel') diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c index c45768614c8a..5b7cfafc0720 100644 --- a/arch/arm/kernel/perf_event.c +++ b/arch/arm/kernel/perf_event.c @@ -164,20 +164,20 @@ armpmu_event_set_period(struct perf_event *event, struct hw_perf_event *hwc, int idx) { - s64 left = atomic64_read(&hwc->period_left); + s64 left = local64_read(&hwc->period_left); s64 period = hwc->sample_period; int ret = 0; if (unlikely(left <= -period)) { left = period; - atomic64_set(&hwc->period_left, left); + local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } if (unlikely(left <= 0)) { left += period; - atomic64_set(&hwc->period_left, left); + local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } @@ -185,7 +185,7 @@ armpmu_event_set_period(struct perf_event *event, if (left > (s64)armpmu->max_period) left = armpmu->max_period; - atomic64_set(&hwc->prev_count, (u64)-left); + local64_set(&hwc->prev_count, (u64)-left); armpmu->write_counter(idx, (u64)(-left) & 0xffffffff); @@ -204,18 +204,18 @@ armpmu_event_update(struct perf_event *event, s64 delta; again: - prev_raw_count = atomic64_read(&hwc->prev_count); + prev_raw_count = local64_read(&hwc->prev_count); new_raw_count = armpmu->read_counter(idx); - if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, new_raw_count) != prev_raw_count) goto again; delta = (new_raw_count << shift) - (prev_raw_count << shift); delta >>= shift; - atomic64_add(delta, &event->count); - atomic64_sub(delta, &hwc->period_left); + local64_add(delta, &event->count); + local64_sub(delta, &hwc->period_left); return new_raw_count; } @@ -478,7 +478,7 @@ __hw_perf_event_init(struct perf_event *event) if (!hwc->sample_period) { hwc->sample_period = armpmu->max_period; hwc->last_period = hwc->sample_period; - atomic64_set(&hwc->period_left, hwc->sample_period); + local64_set(&hwc->period_left, hwc->sample_period); } err = 0; diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c index ac2a8c2554d9..af1d9a7c65d1 100644 --- a/arch/powerpc/kernel/perf_event.c +++ b/arch/powerpc/kernel/perf_event.c @@ -410,15 +410,15 @@ static void power_pmu_read(struct perf_event *event) * Therefore we treat them like NMIs. */ do { - prev = atomic64_read(&event->hw.prev_count); + prev = local64_read(&event->hw.prev_count); barrier(); val = read_pmc(event->hw.idx); - } while (atomic64_cmpxchg(&event->hw.prev_count, prev, val) != prev); + } while (local64_cmpxchg(&event->hw.prev_count, prev, val) != prev); /* The counters are only 32 bits wide */ delta = (val - prev) & 0xfffffffful; - atomic64_add(delta, &event->count); - atomic64_sub(delta, &event->hw.period_left); + local64_add(delta, &event->count); + local64_sub(delta, &event->hw.period_left); } /* @@ -444,10 +444,10 @@ static void freeze_limited_counters(struct cpu_hw_events *cpuhw, if (!event->hw.idx) continue; val = (event->hw.idx == 5) ? pmc5 : pmc6; - prev = atomic64_read(&event->hw.prev_count); + prev = local64_read(&event->hw.prev_count); event->hw.idx = 0; delta = (val - prev) & 0xfffffffful; - atomic64_add(delta, &event->count); + local64_add(delta, &event->count); } } @@ -462,7 +462,7 @@ static void thaw_limited_counters(struct cpu_hw_events *cpuhw, event = cpuhw->limited_counter[i]; event->hw.idx = cpuhw->limited_hwidx[i]; val = (event->hw.idx == 5) ? pmc5 : pmc6; - atomic64_set(&event->hw.prev_count, val); + local64_set(&event->hw.prev_count, val); perf_event_update_userpage(event); } } @@ -666,11 +666,11 @@ void hw_perf_enable(void) } val = 0; if (event->hw.sample_period) { - left = atomic64_read(&event->hw.period_left); + left = local64_read(&event->hw.period_left); if (left < 0x80000000L) val = 0x80000000L - left; } - atomic64_set(&event->hw.prev_count, val); + local64_set(&event->hw.prev_count, val); event->hw.idx = idx; write_pmc(idx, val); perf_event_update_userpage(event); @@ -842,8 +842,8 @@ static void power_pmu_unthrottle(struct perf_event *event) if (left < 0x80000000L) val = 0x80000000L - left; write_pmc(event->hw.idx, val); - atomic64_set(&event->hw.prev_count, val); - atomic64_set(&event->hw.period_left, left); + local64_set(&event->hw.prev_count, val); + local64_set(&event->hw.period_left, left); perf_event_update_userpage(event); perf_enable(); local_irq_restore(flags); @@ -1109,7 +1109,7 @@ const struct pmu *hw_perf_event_init(struct perf_event *event) event->hw.config = events[n]; event->hw.event_base = cflags[n]; event->hw.last_period = event->hw.sample_period; - atomic64_set(&event->hw.period_left, event->hw.last_period); + local64_set(&event->hw.period_left, event->hw.last_period); /* * See if we need to reserve the PMU. @@ -1147,16 +1147,16 @@ static void record_and_restart(struct perf_event *event, unsigned long val, int record = 0; /* we don't have to worry about interrupts here */ - prev = atomic64_read(&event->hw.prev_count); + prev = local64_read(&event->hw.prev_count); delta = (val - prev) & 0xfffffffful; - atomic64_add(delta, &event->count); + local64_add(delta, &event->count); /* * See if the total period for this event has expired, * and update for the next period. */ val = 0; - left = atomic64_read(&event->hw.period_left) - delta; + left = local64_read(&event->hw.period_left) - delta; if (period) { if (left <= 0) { left += period; @@ -1194,8 +1194,8 @@ static void record_and_restart(struct perf_event *event, unsigned long val, } write_pmc(event->hw.idx, val); - atomic64_set(&event->hw.prev_count, val); - atomic64_set(&event->hw.period_left, left); + local64_set(&event->hw.prev_count, val); + local64_set(&event->hw.period_left, left); perf_event_update_userpage(event); } diff --git a/arch/sh/kernel/perf_event.c b/arch/sh/kernel/perf_event.c index 81b6de41ae5d..7a3dc3567258 100644 --- a/arch/sh/kernel/perf_event.c +++ b/arch/sh/kernel/perf_event.c @@ -185,10 +185,10 @@ static void sh_perf_event_update(struct perf_event *event, * this is the simplest approach for maintaining consistency. */ again: - prev_raw_count = atomic64_read(&hwc->prev_count); + prev_raw_count = local64_read(&hwc->prev_count); new_raw_count = sh_pmu->read(idx); - if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, new_raw_count) != prev_raw_count) goto again; @@ -203,7 +203,7 @@ again: delta = (new_raw_count << shift) - (prev_raw_count << shift); delta >>= shift; - atomic64_add(delta, &event->count); + local64_add(delta, &event->count); } static void sh_pmu_disable(struct perf_event *event) diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index beeb92fa3acd..8a6660da8e08 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -572,18 +572,18 @@ static u64 sparc_perf_event_update(struct perf_event *event, s64 delta; again: - prev_raw_count = atomic64_read(&hwc->prev_count); + prev_raw_count = local64_read(&hwc->prev_count); new_raw_count = read_pmc(idx); - if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, new_raw_count) != prev_raw_count) goto again; delta = (new_raw_count << shift) - (prev_raw_count << shift); delta >>= shift; - atomic64_add(delta, &event->count); - atomic64_sub(delta, &hwc->period_left); + local64_add(delta, &event->count); + local64_sub(delta, &hwc->period_left); return new_raw_count; } @@ -591,27 +591,27 @@ again: static int sparc_perf_event_set_period(struct perf_event *event, struct hw_perf_event *hwc, int idx) { - s64 left = atomic64_read(&hwc->period_left); + s64 left = local64_read(&hwc->period_left); s64 period = hwc->sample_period; int ret = 0; if (unlikely(left <= -period)) { left = period; - atomic64_set(&hwc->period_left, left); + local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } if (unlikely(left <= 0)) { left += period; - atomic64_set(&hwc->period_left, left); + local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } if (left > MAX_PERIOD) left = MAX_PERIOD; - atomic64_set(&hwc->prev_count, (u64)-left); + local64_set(&hwc->prev_count, (u64)-left); write_pmc(idx, (u64)(-left) & 0xffffffff); @@ -1087,7 +1087,7 @@ static int __hw_perf_event_init(struct perf_event *event) if (!hwc->sample_period) { hwc->sample_period = MAX_PERIOD; hwc->last_period = hwc->sample_period; - atomic64_set(&hwc->period_left, hwc->sample_period); + local64_set(&hwc->period_left, hwc->sample_period); } return 0; diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 79e199843db6..2d0d29069275 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -296,10 +296,10 @@ x86_perf_event_update(struct perf_event *event) * count to the generic event atomically: */ again: - prev_raw_count = atomic64_read(&hwc->prev_count); + prev_raw_count = local64_read(&hwc->prev_count); rdmsrl(hwc->event_base + idx, new_raw_count); - if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, new_raw_count) != prev_raw_count) goto again; @@ -314,8 +314,8 @@ again: delta = (new_raw_count << shift) - (prev_raw_count << shift); delta >>= shift; - atomic64_add(delta, &event->count); - atomic64_sub(delta, &hwc->period_left); + local64_add(delta, &event->count); + local64_sub(delta, &hwc->period_left); return new_raw_count; } @@ -439,7 +439,7 @@ static int x86_setup_perfctr(struct perf_event *event) if (!hwc->sample_period) { hwc->sample_period = x86_pmu.max_period; hwc->last_period = hwc->sample_period; - atomic64_set(&hwc->period_left, hwc->sample_period); + local64_set(&hwc->period_left, hwc->sample_period); } else { /* * If we have a PMU initialized but no APIC @@ -886,7 +886,7 @@ static int x86_perf_event_set_period(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; - s64 left = atomic64_read(&hwc->period_left); + s64 left = local64_read(&hwc->period_left); s64 period = hwc->sample_period; int ret = 0, idx = hwc->idx; @@ -898,14 +898,14 @@ x86_perf_event_set_period(struct perf_event *event) */ if (unlikely(left <= -period)) { left = period; - atomic64_set(&hwc->period_left, left); + local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } if (unlikely(left <= 0)) { left += period; - atomic64_set(&hwc->period_left, left); + local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } @@ -924,7 +924,7 @@ x86_perf_event_set_period(struct perf_event *event) * The hw event starts counting from this event offset, * mark it to be able to extra future deltas: */ - atomic64_set(&hwc->prev_count, (u64)-left); + local64_set(&hwc->prev_count, (u64)-left); wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f34dab9b275e..7342979f95f2 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -487,6 +487,7 @@ struct perf_guest_info_callbacks { #include #include #include +#include #define PERF_MAX_STACK_DEPTH 255 @@ -536,10 +537,10 @@ struct hw_perf_event { struct arch_hw_breakpoint info; #endif }; - atomic64_t prev_count; + local64_t prev_count; u64 sample_period; u64 last_period; - atomic64_t period_left; + local64_t period_left; u64 interrupts; u64 freq_time_stamp; @@ -670,7 +671,7 @@ struct perf_event { enum perf_event_active_state state; unsigned int attach_state; - atomic64_t count; + local64_t count; atomic64_t child_count; /* diff --git a/kernel/perf_event.c b/kernel/perf_event.c index a395fda2d94c..97c73018592e 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1148,9 +1148,9 @@ static void __perf_event_sync_stat(struct perf_event *event, * In order to keep per-task stats reliable we need to flip the event * values when we flip the contexts. */ - value = atomic64_read(&next_event->count); - value = atomic64_xchg(&event->count, value); - atomic64_set(&next_event->count, value); + value = local64_read(&next_event->count); + value = local64_xchg(&event->count, value); + local64_set(&next_event->count, value); swap(event->total_time_enabled, next_event->total_time_enabled); swap(event->total_time_running, next_event->total_time_running); @@ -1540,10 +1540,10 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) hwc->sample_period = sample_period; - if (atomic64_read(&hwc->period_left) > 8*sample_period) { + if (local64_read(&hwc->period_left) > 8*sample_period) { perf_disable(); perf_event_stop(event); - atomic64_set(&hwc->period_left, 0); + local64_set(&hwc->period_left, 0); perf_event_start(event); perf_enable(); } @@ -1584,7 +1584,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) perf_disable(); event->pmu->read(event); - now = atomic64_read(&event->count); + now = local64_read(&event->count); delta = now - hwc->freq_count_stamp; hwc->freq_count_stamp = now; @@ -1738,7 +1738,7 @@ static void __perf_event_read(void *info) static inline u64 perf_event_count(struct perf_event *event) { - return atomic64_read(&event->count) + atomic64_read(&event->child_count); + return local64_read(&event->count) + atomic64_read(&event->child_count); } static u64 perf_event_read(struct perf_event *event) @@ -2141,7 +2141,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) static void perf_event_reset(struct perf_event *event) { (void)perf_event_read(event); - atomic64_set(&event->count, 0); + local64_set(&event->count, 0); perf_event_update_userpage(event); } @@ -2359,7 +2359,7 @@ void perf_event_update_userpage(struct perf_event *event) userpg->index = perf_event_index(event); userpg->offset = perf_event_count(event); if (event->state == PERF_EVENT_STATE_ACTIVE) - userpg->offset -= atomic64_read(&event->hw.prev_count); + userpg->offset -= local64_read(&event->hw.prev_count); userpg->time_enabled = event->total_time_enabled + atomic64_read(&event->child_total_time_enabled); @@ -4035,14 +4035,14 @@ static u64 perf_swevent_set_period(struct perf_event *event) hwc->last_period = hwc->sample_period; again: - old = val = atomic64_read(&hwc->period_left); + old = val = local64_read(&hwc->period_left); if (val < 0) return 0; nr = div64_u64(period + val, period); offset = nr * period; val -= offset; - if (atomic64_cmpxchg(&hwc->period_left, old, val) != old) + if (local64_cmpxchg(&hwc->period_left, old, val) != old) goto again; return nr; @@ -4081,7 +4081,7 @@ static void perf_swevent_add(struct perf_event *event, u64 nr, { struct hw_perf_event *hwc = &event->hw; - atomic64_add(nr, &event->count); + local64_add(nr, &event->count); if (!regs) return; @@ -4092,7 +4092,7 @@ static void perf_swevent_add(struct perf_event *event, u64 nr, if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) return perf_swevent_overflow(event, 1, nmi, data, regs); - if (atomic64_add_negative(nr, &hwc->period_left)) + if (local64_add_negative(nr, &hwc->period_left)) return; perf_swevent_overflow(event, 0, nmi, data, regs); @@ -4383,8 +4383,8 @@ static void cpu_clock_perf_event_update(struct perf_event *event) u64 now; now = cpu_clock(cpu); - prev = atomic64_xchg(&event->hw.prev_count, now); - atomic64_add(now - prev, &event->count); + prev = local64_xchg(&event->hw.prev_count, now); + local64_add(now - prev, &event->count); } static int cpu_clock_perf_event_enable(struct perf_event *event) @@ -4392,7 +4392,7 @@ static int cpu_clock_perf_event_enable(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; int cpu = raw_smp_processor_id(); - atomic64_set(&hwc->prev_count, cpu_clock(cpu)); + local64_set(&hwc->prev_count, cpu_clock(cpu)); perf_swevent_start_hrtimer(event); return 0; @@ -4424,9 +4424,9 @@ static void task_clock_perf_event_update(struct perf_event *event, u64 now) u64 prev; s64 delta; - prev = atomic64_xchg(&event->hw.prev_count, now); + prev = local64_xchg(&event->hw.prev_count, now); delta = now - prev; - atomic64_add(delta, &event->count); + local64_add(delta, &event->count); } static int task_clock_perf_event_enable(struct perf_event *event) @@ -4436,7 +4436,7 @@ static int task_clock_perf_event_enable(struct perf_event *event) now = event->ctx->time; - atomic64_set(&hwc->prev_count, now); + local64_set(&hwc->prev_count, now); perf_swevent_start_hrtimer(event); @@ -4879,7 +4879,7 @@ perf_event_alloc(struct perf_event_attr *attr, hwc->sample_period = 1; hwc->last_period = hwc->sample_period; - atomic64_set(&hwc->period_left, hwc->sample_period); + local64_set(&hwc->period_left, hwc->sample_period); /* * we currently do not support PERF_FORMAT_GROUP on inherited events @@ -5313,7 +5313,7 @@ inherit_event(struct perf_event *parent_event, hwc->sample_period = sample_period; hwc->last_period = sample_period; - atomic64_set(&hwc->period_left, sample_period); + local64_set(&hwc->period_left, sample_period); } child_event->overflow_handler = parent_event->overflow_handler; -- cgit v1.2.3-70-g09d2 From 039ca4e74a1cf60bd7487324a564ecf5c981f254 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 26 May 2010 17:22:17 +0800 Subject: tracing: Remove kmemtrace ftrace plugin We have been resisting new ftrace plugins and removing existing ones, and kmemtrace has been superseded by kmem trace events and perf-kmem, so we remove it. Signed-off-by: Li Zefan Acked-by: Pekka Enberg Acked-by: Eduard - Gabriel Munteanu Cc: Ingo Molnar Cc: Steven Rostedt [ remove kmemtrace from the makefile, handle slob too ] Signed-off-by: Frederic Weisbecker --- Documentation/ABI/testing/debugfs-kmemtrace | 71 ---- Documentation/trace/kmemtrace.txt | 126 ------- MAINTAINERS | 7 - include/linux/kmemtrace.h | 25 -- include/linux/slab_def.h | 3 +- include/linux/slub_def.h | 3 +- init/main.c | 2 - kernel/trace/Kconfig | 20 -- kernel/trace/Makefile | 1 - kernel/trace/kmemtrace.c | 529 ---------------------------- kernel/trace/trace.h | 12 - kernel/trace/trace_entries.h | 35 -- mm/slab.c | 1 - mm/slob.c | 4 +- mm/slub.c | 1 - 15 files changed, 7 insertions(+), 833 deletions(-) delete mode 100644 Documentation/ABI/testing/debugfs-kmemtrace delete mode 100644 Documentation/trace/kmemtrace.txt delete mode 100644 include/linux/kmemtrace.h delete mode 100644 kernel/trace/kmemtrace.c (limited to 'kernel') diff --git a/Documentation/ABI/testing/debugfs-kmemtrace b/Documentation/ABI/testing/debugfs-kmemtrace deleted file mode 100644 index 5e6a92a02d85..000000000000 --- a/Documentation/ABI/testing/debugfs-kmemtrace +++ /dev/null @@ -1,71 +0,0 @@ -What: /sys/kernel/debug/kmemtrace/ -Date: July 2008 -Contact: Eduard - Gabriel Munteanu -Description: - -In kmemtrace-enabled kernels, the following files are created: - -/sys/kernel/debug/kmemtrace/ - cpu (0400) Per-CPU tracing data, see below. (binary) - total_overruns (0400) Total number of bytes which were dropped from - cpu files because of full buffer condition, - non-binary. (text) - abi_version (0400) Kernel's kmemtrace ABI version. (text) - -Each per-CPU file should be read according to the relay interface. That is, -the reader should set affinity to that specific CPU and, as currently done by -the userspace application (though there are other methods), use poll() with -an infinite timeout before every read(). Otherwise, erroneous data may be -read. The binary data has the following _core_ format: - - Event ID (1 byte) Unsigned integer, one of: - 0 - represents an allocation (KMEMTRACE_EVENT_ALLOC) - 1 - represents a freeing of previously allocated memory - (KMEMTRACE_EVENT_FREE) - Type ID (1 byte) Unsigned integer, one of: - 0 - this is a kmalloc() / kfree() - 1 - this is a kmem_cache_alloc() / kmem_cache_free() - 2 - this is a __get_free_pages() et al. - Event size (2 bytes) Unsigned integer representing the - size of this event. Used to extend - kmemtrace. Discard the bytes you - don't know about. - Sequence number (4 bytes) Signed integer used to reorder data - logged on SMP machines. Wraparound - must be taken into account, although - it is unlikely. - Caller address (8 bytes) Return address to the caller. - Pointer to mem (8 bytes) Pointer to target memory area. Can be - NULL, but not all such calls might be - recorded. - -In case of KMEMTRACE_EVENT_ALLOC events, the next fields follow: - - Requested bytes (8 bytes) Total number of requested bytes, - unsigned, must not be zero. - Allocated bytes (8 bytes) Total number of actually allocated - bytes, unsigned, must not be lower - than requested bytes. - Requested flags (4 bytes) GFP flags supplied by the caller. - Target CPU (4 bytes) Signed integer, valid for event id 1. - If equal to -1, target CPU is the same - as origin CPU, but the reverse might - not be true. - -The data is made available in the same endianness the machine has. - -Other event ids and type ids may be defined and added. Other fields may be -added by increasing event size, but see below for details. -Every modification to the ABI, including new id definitions, are followed -by bumping the ABI version by one. - -Adding new data to the packet (features) is done at the end of the mandatory -data: - Feature size (2 byte) - Feature ID (1 byte) - Feature data (Feature size - 3 bytes) - - -Users: - kmemtrace-user - git://repo.or.cz/kmemtrace-user.git - diff --git a/Documentation/trace/kmemtrace.txt b/Documentation/trace/kmemtrace.txt deleted file mode 100644 index 6308735e58ca..000000000000 --- a/Documentation/trace/kmemtrace.txt +++ /dev/null @@ -1,126 +0,0 @@ - kmemtrace - Kernel Memory Tracer - - by Eduard - Gabriel Munteanu - - -I. Introduction -=============== - -kmemtrace helps kernel developers figure out two things: -1) how different allocators (SLAB, SLUB etc.) perform -2) how kernel code allocates memory and how much - -To do this, we trace every allocation and export information to the userspace -through the relay interface. We export things such as the number of requested -bytes, the number of bytes actually allocated (i.e. including internal -fragmentation), whether this is a slab allocation or a plain kmalloc() and so -on. - -The actual analysis is performed by a userspace tool (see section III for -details on where to get it from). It logs the data exported by the kernel, -processes it and (as of writing this) can provide the following information: -- the total amount of memory allocated and fragmentation per call-site -- the amount of memory allocated and fragmentation per allocation -- total memory allocated and fragmentation in the collected dataset -- number of cross-CPU allocation and frees (makes sense in NUMA environments) - -Moreover, it can potentially find inconsistent and erroneous behavior in -kernel code, such as using slab free functions on kmalloc'ed memory or -allocating less memory than requested (but not truly failed allocations). - -kmemtrace also makes provisions for tracing on some arch and analysing the -data on another. - -II. Design and goals -==================== - -kmemtrace was designed to handle rather large amounts of data. Thus, it uses -the relay interface to export whatever is logged to userspace, which then -stores it. Analysis and reporting is done asynchronously, that is, after the -data is collected and stored. By design, it allows one to log and analyse -on different machines and different arches. - -As of writing this, the ABI is not considered stable, though it might not -change much. However, no guarantees are made about compatibility yet. When -deemed stable, the ABI should still allow easy extension while maintaining -backward compatibility. This is described further in Documentation/ABI. - -Summary of design goals: - - allow logging and analysis to be done across different machines - - be fast and anticipate usage in high-load environments (*) - - be reasonably extensible - - make it possible for GNU/Linux distributions to have kmemtrace - included in their repositories - -(*) - one of the reasons Pekka Enberg's original userspace data analysis - tool's code was rewritten from Perl to C (although this is more than a - simple conversion) - - -III. Quick usage guide -====================== - -1) Get a kernel that supports kmemtrace and build it accordingly (i.e. enable -CONFIG_KMEMTRACE). - -2) Get the userspace tool and build it: -$ git clone git://repo.or.cz/kmemtrace-user.git # current repository -$ cd kmemtrace-user/ -$ ./autogen.sh -$ ./configure -$ make - -3) Boot the kmemtrace-enabled kernel if you haven't, preferably in the -'single' runlevel (so that relay buffers don't fill up easily), and run -kmemtrace: -# '$' does not mean user, but root here. -$ mount -t debugfs none /sys/kernel/debug -$ mount -t proc none /proc -$ cd path/to/kmemtrace-user/ -$ ./kmemtraced -Wait a bit, then stop it with CTRL+C. -$ cat /sys/kernel/debug/kmemtrace/total_overruns # Check if we didn't - # overrun, should - # be zero. -$ (Optionally) [Run kmemtrace_check separately on each cpu[0-9]*.out file to - check its correctness] -$ ./kmemtrace-report - -Now you should have a nice and short summary of how the allocator performs. - -IV. FAQ and known issues -======================== - -Q: 'cat /sys/kernel/debug/kmemtrace/total_overruns' is non-zero, how do I fix -this? Should I worry? -A: If it's non-zero, this affects kmemtrace's accuracy, depending on how -large the number is. You can fix it by supplying a higher -'kmemtrace.subbufs=N' kernel parameter. ---- - -Q: kmemtrace_check reports errors, how do I fix this? Should I worry? -A: This is a bug and should be reported. It can occur for a variety of -reasons: - - possible bugs in relay code - - possible misuse of relay by kmemtrace - - timestamps being collected unorderly -Or you may fix it yourself and send us a patch. ---- - -Q: kmemtrace_report shows many errors, how do I fix this? Should I worry? -A: This is a known issue and I'm working on it. These might be true errors -in kernel code, which may have inconsistent behavior (e.g. allocating memory -with kmem_cache_alloc() and freeing it with kfree()). Pekka Enberg pointed -out this behavior may work with SLAB, but may fail with other allocators. - -It may also be due to lack of tracing in some unusual allocator functions. - -We don't want bug reports regarding this issue yet. ---- - -V. See also -=========== - -Documentation/kernel-parameters.txt -Documentation/ABI/testing/debugfs-kmemtrace - diff --git a/MAINTAINERS b/MAINTAINERS index 33047a605438..67e6e9d848db 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3361,13 +3361,6 @@ F: include/linux/kmemleak.h F: mm/kmemleak.c F: mm/kmemleak-test.c -KMEMTRACE -M: Eduard - Gabriel Munteanu -S: Maintained -F: Documentation/trace/kmemtrace.txt -F: include/linux/kmemtrace.h -F: kernel/trace/kmemtrace.c - KPROBES M: Ananth N Mavinakayanahalli M: Anil S Keshavamurthy diff --git a/include/linux/kmemtrace.h b/include/linux/kmemtrace.h deleted file mode 100644 index b616d3930c3b..000000000000 --- a/include/linux/kmemtrace.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright (C) 2008 Eduard - Gabriel Munteanu - * - * This file is released under GPL version 2. - */ - -#ifndef _LINUX_KMEMTRACE_H -#define _LINUX_KMEMTRACE_H - -#ifdef __KERNEL__ - -#include - -#ifdef CONFIG_KMEMTRACE -extern void kmemtrace_init(void); -#else -static inline void kmemtrace_init(void) -{ -} -#endif - -#endif /* __KERNEL__ */ - -#endif /* _LINUX_KMEMTRACE_H */ - diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 1812dac8c496..1acfa73ce2ac 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -14,7 +14,8 @@ #include /* kmalloc_sizes.h needs PAGE_SIZE */ #include /* kmalloc_sizes.h needs L1_CACHE_BYTES */ #include -#include + +#include #ifndef ARCH_KMALLOC_MINALIGN /* diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 55695c8d2f8a..2345d3a033e6 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -10,9 +10,10 @@ #include #include #include -#include #include +#include + enum stat_item { ALLOC_FASTPATH, /* Allocation from cpu slab */ ALLOC_SLOWPATH, /* Allocation by getting a new cpu slab */ diff --git a/init/main.c b/init/main.c index 94f65efdc65a..e2a2bf3a169f 100644 --- a/init/main.c +++ b/init/main.c @@ -66,7 +66,6 @@ #include #include #include -#include #include #include #include @@ -652,7 +651,6 @@ asmlinkage void __init start_kernel(void) #endif page_cgroup_init(); enable_debug_pagealloc(); - kmemtrace_init(); kmemleak_init(); debug_objects_mem_init(); idr_init_cache(); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 572992abc71c..f669092fdead 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -354,26 +354,6 @@ config STACK_TRACER Say N if unsure. -config KMEMTRACE - bool "Trace SLAB allocations" - select GENERIC_TRACER - help - kmemtrace provides tracing for slab allocator functions, such as - kmalloc, kfree, kmem_cache_alloc, kmem_cache_free, etc. Collected - data is then fed to the userspace application in order to analyse - allocation hotspots, internal fragmentation and so on, making it - possible to see how well an allocator performs, as well as debug - and profile kernel code. - - This requires an userspace application to use. See - Documentation/trace/kmemtrace.txt for more information. - - Saying Y will make the kernel somewhat larger and slower. However, - if you disable kmemtrace at run-time or boot-time, the performance - impact is minimal (depending on the arch the kernel is built for). - - If unsure, say N. - config WORKQUEUE_TRACER bool "Trace workqueues" select GENERIC_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index c3aaeba82372..469a1c7555a5 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -40,7 +40,6 @@ obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o -obj-$(CONFIG_KMEMTRACE) += kmemtrace.o obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o ifeq ($(CONFIG_BLOCK),y) diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c deleted file mode 100644 index bbfc1bb1660b..000000000000 --- a/kernel/trace/kmemtrace.c +++ /dev/null @@ -1,529 +0,0 @@ -/* - * Memory allocator tracing - * - * Copyright (C) 2008 Eduard - Gabriel Munteanu - * Copyright (C) 2008 Pekka Enberg - * Copyright (C) 2008 Frederic Weisbecker - */ - -#include -#include -#include -#include -#include - -#include - -#include "trace_output.h" -#include "trace.h" - -/* Select an alternative, minimalistic output than the original one */ -#define TRACE_KMEM_OPT_MINIMAL 0x1 - -static struct tracer_opt kmem_opts[] = { - /* Default disable the minimalistic output */ - { TRACER_OPT(kmem_minimalistic, TRACE_KMEM_OPT_MINIMAL) }, - { } -}; - -static struct tracer_flags kmem_tracer_flags = { - .val = 0, - .opts = kmem_opts -}; - -static struct trace_array *kmemtrace_array; - -/* Trace allocations */ -static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node) -{ - struct ftrace_event_call *call = &event_kmem_alloc; - struct trace_array *tr = kmemtrace_array; - struct kmemtrace_alloc_entry *entry; - struct ring_buffer_event *event; - - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); - if (!event) - return; - - entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, 0); - - entry->ent.type = TRACE_KMEM_ALLOC; - entry->type_id = type_id; - entry->call_site = call_site; - entry->ptr = ptr; - entry->bytes_req = bytes_req; - entry->bytes_alloc = bytes_alloc; - entry->gfp_flags = gfp_flags; - entry->node = node; - - if (!filter_check_discard(call, entry, tr->buffer, event)) - ring_buffer_unlock_commit(tr->buffer, event); - - trace_wake_up(); -} - -static inline void kmemtrace_free(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr) -{ - struct ftrace_event_call *call = &event_kmem_free; - struct trace_array *tr = kmemtrace_array; - struct kmemtrace_free_entry *entry; - struct ring_buffer_event *event; - - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); - if (!event) - return; - entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, 0); - - entry->ent.type = TRACE_KMEM_FREE; - entry->type_id = type_id; - entry->call_site = call_site; - entry->ptr = ptr; - - if (!filter_check_discard(call, entry, tr->buffer, event)) - ring_buffer_unlock_commit(tr->buffer, event); - - trace_wake_up(); -} - -static void kmemtrace_kmalloc(void *ignore, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags) -{ - kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr, - bytes_req, bytes_alloc, gfp_flags, -1); -} - -static void kmemtrace_kmem_cache_alloc(void *ignore, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags) -{ - kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr, - bytes_req, bytes_alloc, gfp_flags, -1); -} - -static void kmemtrace_kmalloc_node(void *ignore, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node) -{ - kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr, - bytes_req, bytes_alloc, gfp_flags, node); -} - -static void kmemtrace_kmem_cache_alloc_node(void *ignore, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node) -{ - kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr, - bytes_req, bytes_alloc, gfp_flags, node); -} - -static void -kmemtrace_kfree(void *ignore, unsigned long call_site, const void *ptr) -{ - kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr); -} - -static void kmemtrace_kmem_cache_free(void *ignore, - unsigned long call_site, const void *ptr) -{ - kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr); -} - -static int kmemtrace_start_probes(void) -{ - int err; - - err = register_trace_kmalloc(kmemtrace_kmalloc, NULL); - if (err) - return err; - err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL); - if (err) - return err; - err = register_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL); - if (err) - return err; - err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL); - if (err) - return err; - err = register_trace_kfree(kmemtrace_kfree, NULL); - if (err) - return err; - err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL); - - return err; -} - -static void kmemtrace_stop_probes(void) -{ - unregister_trace_kmalloc(kmemtrace_kmalloc, NULL); - unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL); - unregister_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL); - unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL); - unregister_trace_kfree(kmemtrace_kfree, NULL); - unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL); -} - -static int kmem_trace_init(struct trace_array *tr) -{ - kmemtrace_array = tr; - - tracing_reset_online_cpus(tr); - - kmemtrace_start_probes(); - - return 0; -} - -static void kmem_trace_reset(struct trace_array *tr) -{ - kmemtrace_stop_probes(); -} - -static void kmemtrace_headers(struct seq_file *s) -{ - /* Don't need headers for the original kmemtrace output */ - if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)) - return; - - seq_printf(s, "#\n"); - seq_printf(s, "# ALLOC TYPE REQ GIVEN FLAGS " - " POINTER NODE CALLER\n"); - seq_printf(s, "# FREE | | | | " - " | | | |\n"); - seq_printf(s, "# |\n\n"); -} - -/* - * The following functions give the original output from kmemtrace, - * plus the origin CPU, since reordering occurs in-kernel now. - */ - -#define KMEMTRACE_USER_ALLOC 0 -#define KMEMTRACE_USER_FREE 1 - -struct kmemtrace_user_event { - u8 event_id; - u8 type_id; - u16 event_size; - u32 cpu; - u64 timestamp; - unsigned long call_site; - unsigned long ptr; -}; - -struct kmemtrace_user_event_alloc { - size_t bytes_req; - size_t bytes_alloc; - unsigned gfp_flags; - int node; -}; - -static enum print_line_t -kmemtrace_print_alloc(struct trace_iterator *iter, int flags, - struct trace_event *event) -{ - struct trace_seq *s = &iter->seq; - struct kmemtrace_alloc_entry *entry; - int ret; - - trace_assign_type(entry, iter->ent); - - ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu " - "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n", - entry->type_id, (void *)entry->call_site, (unsigned long)entry->ptr, - (unsigned long)entry->bytes_req, (unsigned long)entry->bytes_alloc, - (unsigned long)entry->gfp_flags, entry->node); - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t -kmemtrace_print_free(struct trace_iterator *iter, int flags, - struct trace_event *event) -{ - struct trace_seq *s = &iter->seq; - struct kmemtrace_free_entry *entry; - int ret; - - trace_assign_type(entry, iter->ent); - - ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu\n", - entry->type_id, (void *)entry->call_site, - (unsigned long)entry->ptr); - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t -kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags, - struct trace_event *event) -{ - struct trace_seq *s = &iter->seq; - struct kmemtrace_alloc_entry *entry; - struct kmemtrace_user_event *ev; - struct kmemtrace_user_event_alloc *ev_alloc; - - trace_assign_type(entry, iter->ent); - - ev = trace_seq_reserve(s, sizeof(*ev)); - if (!ev) - return TRACE_TYPE_PARTIAL_LINE; - - ev->event_id = KMEMTRACE_USER_ALLOC; - ev->type_id = entry->type_id; - ev->event_size = sizeof(*ev) + sizeof(*ev_alloc); - ev->cpu = iter->cpu; - ev->timestamp = iter->ts; - ev->call_site = entry->call_site; - ev->ptr = (unsigned long)entry->ptr; - - ev_alloc = trace_seq_reserve(s, sizeof(*ev_alloc)); - if (!ev_alloc) - return TRACE_TYPE_PARTIAL_LINE; - - ev_alloc->bytes_req = entry->bytes_req; - ev_alloc->bytes_alloc = entry->bytes_alloc; - ev_alloc->gfp_flags = entry->gfp_flags; - ev_alloc->node = entry->node; - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t -kmemtrace_print_free_user(struct trace_iterator *iter, int flags, - struct trace_event *event) -{ - struct trace_seq *s = &iter->seq; - struct kmemtrace_free_entry *entry; - struct kmemtrace_user_event *ev; - - trace_assign_type(entry, iter->ent); - - ev = trace_seq_reserve(s, sizeof(*ev)); - if (!ev) - return TRACE_TYPE_PARTIAL_LINE; - - ev->event_id = KMEMTRACE_USER_FREE; - ev->type_id = entry->type_id; - ev->event_size = sizeof(*ev); - ev->cpu = iter->cpu; - ev->timestamp = iter->ts; - ev->call_site = entry->call_site; - ev->ptr = (unsigned long)entry->ptr; - - return TRACE_TYPE_HANDLED; -} - -/* The two other following provide a more minimalistic output */ -static enum print_line_t -kmemtrace_print_alloc_compress(struct trace_iterator *iter) -{ - struct kmemtrace_alloc_entry *entry; - struct trace_seq *s = &iter->seq; - int ret; - - trace_assign_type(entry, iter->ent); - - /* Alloc entry */ - ret = trace_seq_printf(s, " + "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Type */ - switch (entry->type_id) { - case KMEMTRACE_TYPE_KMALLOC: - ret = trace_seq_printf(s, "K "); - break; - case KMEMTRACE_TYPE_CACHE: - ret = trace_seq_printf(s, "C "); - break; - case KMEMTRACE_TYPE_PAGES: - ret = trace_seq_printf(s, "P "); - break; - default: - ret = trace_seq_printf(s, "? "); - } - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Requested */ - ret = trace_seq_printf(s, "%4zu ", entry->bytes_req); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Allocated */ - ret = trace_seq_printf(s, "%4zu ", entry->bytes_alloc); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Flags - * TODO: would be better to see the name of the GFP flag names - */ - ret = trace_seq_printf(s, "%08x ", entry->gfp_flags); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Pointer to allocated */ - ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Node and call site*/ - ret = trace_seq_printf(s, "%4d %pf\n", entry->node, - (void *)entry->call_site); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t -kmemtrace_print_free_compress(struct trace_iterator *iter) -{ - struct kmemtrace_free_entry *entry; - struct trace_seq *s = &iter->seq; - int ret; - - trace_assign_type(entry, iter->ent); - - /* Free entry */ - ret = trace_seq_printf(s, " - "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Type */ - switch (entry->type_id) { - case KMEMTRACE_TYPE_KMALLOC: - ret = trace_seq_printf(s, "K "); - break; - case KMEMTRACE_TYPE_CACHE: - ret = trace_seq_printf(s, "C "); - break; - case KMEMTRACE_TYPE_PAGES: - ret = trace_seq_printf(s, "P "); - break; - default: - ret = trace_seq_printf(s, "? "); - } - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Skip requested/allocated/flags */ - ret = trace_seq_printf(s, " "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Pointer to allocated */ - ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Skip node and print call site*/ - ret = trace_seq_printf(s, " %pf\n", (void *)entry->call_site); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter) -{ - struct trace_entry *entry = iter->ent; - - if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)) - return TRACE_TYPE_UNHANDLED; - - switch (entry->type) { - case TRACE_KMEM_ALLOC: - return kmemtrace_print_alloc_compress(iter); - case TRACE_KMEM_FREE: - return kmemtrace_print_free_compress(iter); - default: - return TRACE_TYPE_UNHANDLED; - } -} - -static struct trace_event_functions kmem_trace_alloc_funcs = { - .trace = kmemtrace_print_alloc, - .binary = kmemtrace_print_alloc_user, -}; - -static struct trace_event kmem_trace_alloc = { - .type = TRACE_KMEM_ALLOC, - .funcs = &kmem_trace_alloc_funcs, -}; - -static struct trace_event_functions kmem_trace_free_funcs = { - .trace = kmemtrace_print_free, - .binary = kmemtrace_print_free_user, -}; - -static struct trace_event kmem_trace_free = { - .type = TRACE_KMEM_FREE, - .funcs = &kmem_trace_free_funcs, -}; - -static struct tracer kmem_tracer __read_mostly = { - .name = "kmemtrace", - .init = kmem_trace_init, - .reset = kmem_trace_reset, - .print_line = kmemtrace_print_line, - .print_header = kmemtrace_headers, - .flags = &kmem_tracer_flags -}; - -void kmemtrace_init(void) -{ - /* earliest opportunity to start kmem tracing */ -} - -static int __init init_kmem_tracer(void) -{ - if (!register_ftrace_event(&kmem_trace_alloc)) { - pr_warning("Warning: could not register kmem events\n"); - return 1; - } - - if (!register_ftrace_event(&kmem_trace_free)) { - pr_warning("Warning: could not register kmem events\n"); - return 1; - } - - if (register_tracer(&kmem_tracer) != 0) { - pr_warning("Warning: could not register the kmem tracer\n"); - return 1; - } - - return 0; -} -device_initcall(init_kmem_tracer); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 75a5e800a737..075cd2ea84a2 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include @@ -30,19 +29,12 @@ enum trace_type { TRACE_GRAPH_RET, TRACE_GRAPH_ENT, TRACE_USER_STACK, - TRACE_KMEM_ALLOC, - TRACE_KMEM_FREE, TRACE_BLK, TRACE_KSYM, __TRACE_LAST_TYPE, }; -enum kmemtrace_type_id { - KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ - KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ - KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */ -}; #undef __field #define __field(type, item) type item; @@ -208,10 +200,6 @@ extern void __ftrace_bad_type(void); TRACE_GRAPH_ENT); \ IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ TRACE_GRAPH_RET); \ - IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ - TRACE_KMEM_ALLOC); \ - IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ - TRACE_KMEM_FREE); \ IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\ __ftrace_bad_type(); \ } while (0) diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index c293364c984f..13abc157dbaf 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -291,41 +291,6 @@ FTRACE_ENTRY(branch, trace_branch, __entry->func, __entry->file, __entry->correct) ); -FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry, - - TRACE_KMEM_ALLOC, - - F_STRUCT( - __field( enum kmemtrace_type_id, type_id ) - __field( unsigned long, call_site ) - __field( const void *, ptr ) - __field( size_t, bytes_req ) - __field( size_t, bytes_alloc ) - __field( gfp_t, gfp_flags ) - __field( int, node ) - ), - - F_printk("type:%u call_site:%lx ptr:%p req:%zi alloc:%zi" - " flags:%x node:%d", - __entry->type_id, __entry->call_site, __entry->ptr, - __entry->bytes_req, __entry->bytes_alloc, - __entry->gfp_flags, __entry->node) -); - -FTRACE_ENTRY(kmem_free, kmemtrace_free_entry, - - TRACE_KMEM_FREE, - - F_STRUCT( - __field( enum kmemtrace_type_id, type_id ) - __field( unsigned long, call_site ) - __field( const void *, ptr ) - ), - - F_printk("type:%u call_site:%lx ptr:%p", - __entry->type_id, __entry->call_site, __entry->ptr) -); - FTRACE_ENTRY(ksym_trace, ksym_trace_entry, TRACE_KSYM, diff --git a/mm/slab.c b/mm/slab.c index e49f8f46f46d..47360c3e5abd 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -102,7 +102,6 @@ #include #include #include -#include #include #include #include diff --git a/mm/slob.c b/mm/slob.c index 23631e2bb57a..a82ab5811bd9 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -66,8 +66,10 @@ #include #include #include -#include #include + +#include + #include /* diff --git a/mm/slub.c b/mm/slub.c index 26f0cb9cc584..a61f1aad1070 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3-70-g09d2 From 45a73372efe4a63f44aa2e1125d4a777c2fdc8d8 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 23 Jun 2010 23:00:37 +0200 Subject: hw_breakpoints: Fix per task breakpoint tracking Freeing a perf event can happen in several ways. A task calls perf_event_exit_task() right before exiting. This helper will detach all the events from the task context and queue their removal through free_event() if they are child tasks. The task also loses its context reference there. Releasing the breakpoint slot from the constraint table is made from free_event() that calls release_bp_slot(). We count the number of breakpoints this task is running by looking at the task's perf_event_ctxp and iterating through its attached events. But at this time, the reference to this context has been cleaned up already. So looking at the event->ctx instead of task->perf_event_ctxp to count the remaining breakpoints should solve the problem. At least it would for child breakpoints, but not for parent ones. If the parent exits before the child, it will remove all its events from the context but free_event() will be called later, on fd release time. And checking the number of breakpoints the task has attached to its context at this time is unreliable as all events have been removed from the context. To solve this, we keep track of the list of per task breakpoints. On top of it, we maintain our array of numbers of breakpoints used by the tasks. We use the context address as a task id. So, instead of looking at the number of events attached to a context, we walk through our list of per task breakpoints and count the number of breakpoints that use the same ctx than the one to be reserved or released from the constraint table, and update the count on top of this result. In the meantime it solves a bad refcounting, it also solves a warning, reported by Paul. Badness at /home/paulus/kernel/perf/kernel/hw_breakpoint.c:114 NIP: c0000000000cb470 LR: c0000000000cb46c CTR: c00000000032d9b8 REGS: c000000118e7b570 TRAP: 0700 Not tainted (2.6.35-rc3-perf-00008-g76b0f13 ) MSR: 9000000000029032 CR: 44004424 XER: 000fffff TASK = c0000001187dcad0[3143] 'perf' THREAD: c000000118e78000 CPU: 1 GPR00: c0000000000cb46c c000000118e7b7f0 c0000000009866a0 0000000000000020 GPR04: 0000000000000000 000000000000001d 0000000000000000 0000000000000001 GPR08: c0000000009bed68 c00000000086dff8 c000000000a5bf10 0000000000000001 GPR12: 0000000024004422 c00000000ffff200 0000000000000000 0000000000000000 GPR16: 0000000000000000 0000000000000000 0000000000000018 00000000101150f4 GPR20: 0000000010206b40 0000000000000000 0000000000000000 00000000101150f4 GPR24: c0000001199090c0 0000000000000001 0000000000000000 0000000000000001 GPR28: 0000000000000000 0000000000000000 c0000000008ec290 0000000000000000 NIP [c0000000000cb470] .task_bp_pinned+0x5c/0x12c LR [c0000000000cb46c] .task_bp_pinned+0x58/0x12c Call Trace: [c000000118e7b7f0] [c0000000000cb46c] .task_bp_pinned+0x58/0x12c (unreliable) [c000000118e7b8a0] [c0000000000cb584] .toggle_bp_task_slot+0x44/0xe4 [c000000118e7b940] [c0000000000cb6c8] .toggle_bp_slot+0xa4/0x164 [c000000118e7b9f0] [c0000000000cbafc] .release_bp_slot+0x44/0x6c [c000000118e7ba80] [c0000000000c4178] .bp_perf_event_destroy+0x10/0x24 [c000000118e7bb00] [c0000000000c4aec] .free_event+0x180/0x1bc [c000000118e7bbc0] [c0000000000c54c4] .perf_event_release_kernel+0x14c/0x170 Reported-by: Paul Mackerras Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Prasad Cc: Mahesh Salgaonkar Cc: Will Deacon Cc: Jason Wessel --- include/linux/perf_event.h | 6 ++-- kernel/hw_breakpoint.c | 78 ++++++++++++++++++++++++---------------------- 2 files changed, 45 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 63b5aa5dce69..0dd5f8ad77ac 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -533,8 +533,10 @@ struct hw_perf_event { struct hrtimer hrtimer; }; #ifdef CONFIG_HAVE_HW_BREAKPOINT - /* breakpoint */ - struct arch_hw_breakpoint info; + struct { /* breakpoint */ + struct arch_hw_breakpoint info; + struct list_head bp_list; + }; #endif }; local64_t prev_count; diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 7a56b22e0602..e34d94d50924 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -62,6 +63,9 @@ static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]); static int nr_slots[TYPE_MAX]; +/* Keep track of the breakpoints attached to tasks */ +static LIST_HEAD(bp_task_head); + static int constraints_initialized; /* Gather the number of total pinned and un-pinned bp in a cpuset */ @@ -103,33 +107,21 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) return 0; } -static int task_bp_pinned(struct task_struct *tsk, enum bp_type_idx type) +/* + * Count the number of breakpoints of the same type and same task. + * The given event must be not on the list. + */ +static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type) { - struct perf_event_context *ctx = tsk->perf_event_ctxp; - struct list_head *list; - struct perf_event *bp; - unsigned long flags; + struct perf_event_context *ctx = bp->ctx; + struct perf_event *iter; int count = 0; - if (WARN_ONCE(!ctx, "No perf context for this task")) - return 0; - - list = &ctx->event_list; - - raw_spin_lock_irqsave(&ctx->lock, flags); - - /* - * The current breakpoint counter is not included in the list - * at the open() callback time - */ - list_for_each_entry(bp, list, event_entry) { - if (bp->attr.type == PERF_TYPE_BREAKPOINT) - if (find_slot_idx(bp) == type) - count += hw_breakpoint_weight(bp); + list_for_each_entry(iter, &bp_task_head, hw.bp_list) { + if (iter->ctx == ctx && find_slot_idx(iter) == type) + count += hw_breakpoint_weight(iter); } - raw_spin_unlock_irqrestore(&ctx->lock, flags); - return count; } @@ -149,7 +141,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, if (!tsk) slots->pinned += max_task_bp_pinned(cpu, type); else - slots->pinned += task_bp_pinned(tsk, type); + slots->pinned += task_bp_pinned(bp, type); slots->flexible = per_cpu(nr_bp_flexible[type], cpu); return; @@ -162,7 +154,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, if (!tsk) nr += max_task_bp_pinned(cpu, type); else - nr += task_bp_pinned(tsk, type); + nr += task_bp_pinned(bp, type); if (nr > slots->pinned) slots->pinned = nr; @@ -188,7 +180,7 @@ fetch_this_slot(struct bp_busy_slots *slots, int weight) /* * Add a pinned breakpoint for the given task in our constraint table */ -static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable, +static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable, enum bp_type_idx type, int weight) { unsigned int *tsk_pinned; @@ -196,10 +188,11 @@ static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable, int old_idx = 0; int idx = 0; - old_count = task_bp_pinned(tsk, type); + old_count = task_bp_pinned(bp, type); old_idx = old_count - 1; idx = old_idx + weight; + /* tsk_pinned[n] is the number of tasks having n breakpoints */ tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); if (enable) { tsk_pinned[idx]++; @@ -222,23 +215,30 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, int cpu = bp->cpu; struct task_struct *tsk = bp->ctx->task; + /* Pinned counter cpu profiling */ + if (!tsk) { + + if (enable) + per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight; + else + per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight; + return; + } + /* Pinned counter task profiling */ - if (tsk) { - if (cpu >= 0) { - toggle_bp_task_slot(tsk, cpu, enable, type, weight); - return; - } + if (!enable) + list_del(&bp->hw.bp_list); + + if (cpu >= 0) { + toggle_bp_task_slot(bp, cpu, enable, type, weight); + } else { for_each_online_cpu(cpu) - toggle_bp_task_slot(tsk, cpu, enable, type, weight); - return; + toggle_bp_task_slot(bp, cpu, enable, type, weight); } - /* Pinned counter cpu profiling */ if (enable) - per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight; - else - per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight; + list_add_tail(&bp->hw.bp_list, &bp_task_head); } /* @@ -301,6 +301,10 @@ static int __reserve_bp_slot(struct perf_event *bp) weight = hw_breakpoint_weight(bp); fetch_bp_busy_slots(&slots, bp, type); + /* + * Simulate the addition of this breakpoint to the constraints + * and see the result. + */ fetch_this_slot(&slots, weight); /* Flexible counters need to keep at least one slot */ -- cgit v1.2.3-70-g09d2 From c9642c49aae1272d7c24008a40ae614470b957a6 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 24 May 2010 16:22:30 +0800 Subject: tracing: Use a global field list for all syscall exit events All syscall exit events have the same fields. The kernel size drops 2.5K: text data bss dec hex filename 7018612 2034376 7251132 16304120 f8c7f8 vmlinux.o.orig 7018612 2031888 7251132 16301632 f8be40 vmlinux.o Signed-off-by: Li Zefan LKML-Reference: <4BFA3746.8070100@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- include/linux/syscalls.h | 2 -- include/trace/syscall.h | 1 - kernel/trace/trace_syscalls.c | 7 ++++--- 3 files changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 7f614ce274a9..7994bd44eb56 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -165,7 +165,6 @@ extern struct trace_event_functions exit_syscall_print_funcs; .enter_event = &event_enter_##sname, \ .exit_event = &event_exit_##sname, \ .enter_fields = LIST_HEAD_INIT(__syscall_meta_##sname.enter_fields), \ - .exit_fields = LIST_HEAD_INIT(__syscall_meta_##sname.exit_fields), \ }; #define SYSCALL_DEFINE0(sname) \ @@ -180,7 +179,6 @@ extern struct trace_event_functions exit_syscall_print_funcs; .enter_event = &event_enter__##sname, \ .exit_event = &event_exit__##sname, \ .enter_fields = LIST_HEAD_INIT(__syscall_meta__##sname.enter_fields), \ - .exit_fields = LIST_HEAD_INIT(__syscall_meta__##sname.exit_fields), \ }; \ asmlinkage long sys_##sname(void) #else diff --git a/include/trace/syscall.h b/include/trace/syscall.h index 257e08960d7b..31966a4fb8cc 100644 --- a/include/trace/syscall.h +++ b/include/trace/syscall.h @@ -26,7 +26,6 @@ struct syscall_metadata { const char **types; const char **args; struct list_head enter_fields; - struct list_head exit_fields; struct ftrace_event_call *enter_event; struct ftrace_event_call *exit_event; diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 34e35804304b..bac752f0cfb5 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -23,6 +23,9 @@ static int syscall_exit_register(struct ftrace_event_call *event, static int syscall_enter_define_fields(struct ftrace_event_call *call); static int syscall_exit_define_fields(struct ftrace_event_call *call); +/* All syscall exit events have the same fields */ +static LIST_HEAD(syscall_exit_fields); + static struct list_head * syscall_get_enter_fields(struct ftrace_event_call *call) { @@ -34,9 +37,7 @@ syscall_get_enter_fields(struct ftrace_event_call *call) static struct list_head * syscall_get_exit_fields(struct ftrace_event_call *call) { - struct syscall_metadata *entry = call->data; - - return &entry->exit_fields; + return &syscall_exit_fields; } struct trace_event_functions enter_syscall_print_funcs = { -- cgit v1.2.3-70-g09d2 From 8728fe501ed562c1b46dde3c195eadec77bca033 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 24 May 2010 16:22:49 +0800 Subject: tracing: Don't allocate common fields for every trace events Every event has the same common fields, so it's a big waste of memory to have a copy of those fields for every event. Signed-off-by: Li Zefan LKML-Reference: <4BFA3759.30105@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 2 + kernel/trace/trace_events.c | 113 +++++++++++++++++++++---------------- kernel/trace/trace_events_filter.c | 18 +++++- 3 files changed, 81 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 01ce088c1cdf..cc90ccdad469 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -698,6 +698,8 @@ struct filter_pred { int pop_n; }; +extern struct list_head ftrace_common_fields; + extern enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not); extern void print_event_filter(struct ftrace_event_call *call, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index a594f9a7ee3d..d3b4bdf00b39 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -28,6 +28,7 @@ DEFINE_MUTEX(event_mutex); LIST_HEAD(ftrace_events); +LIST_HEAD(ftrace_common_fields); struct list_head * trace_get_fields(struct ftrace_event_call *event_call) @@ -37,15 +38,11 @@ trace_get_fields(struct ftrace_event_call *event_call) return event_call->class->get_fields(event_call); } -int trace_define_field(struct ftrace_event_call *call, const char *type, - const char *name, int offset, int size, int is_signed, - int filter_type) +static int __trace_define_field(struct list_head *head, const char *type, + const char *name, int offset, int size, + int is_signed, int filter_type) { struct ftrace_event_field *field; - struct list_head *head; - - if (WARN_ON(!call->class)) - return 0; field = kzalloc(sizeof(*field), GFP_KERNEL); if (!field) @@ -68,7 +65,6 @@ int trace_define_field(struct ftrace_event_call *call, const char *type, field->size = size; field->is_signed = is_signed; - head = trace_get_fields(call); list_add(&field->link, head); return 0; @@ -80,17 +76,32 @@ err: return -ENOMEM; } + +int trace_define_field(struct ftrace_event_call *call, const char *type, + const char *name, int offset, int size, int is_signed, + int filter_type) +{ + struct list_head *head; + + if (WARN_ON(!call->class)) + return 0; + + head = trace_get_fields(call); + return __trace_define_field(head, type, name, offset, size, + is_signed, filter_type); +} EXPORT_SYMBOL_GPL(trace_define_field); #define __common_field(type, item) \ - ret = trace_define_field(call, #type, "common_" #item, \ - offsetof(typeof(ent), item), \ - sizeof(ent.item), \ - is_signed_type(type), FILTER_OTHER); \ + ret = __trace_define_field(&ftrace_common_fields, #type, \ + "common_" #item, \ + offsetof(typeof(ent), item), \ + sizeof(ent.item), \ + is_signed_type(type), FILTER_OTHER); \ if (ret) \ return ret; -static int trace_define_common_fields(struct ftrace_event_call *call) +static int trace_define_common_fields(void) { int ret; struct trace_entry ent; @@ -544,32 +555,10 @@ out: return ret; } -static ssize_t -event_format_read(struct file *filp, char __user *ubuf, size_t cnt, - loff_t *ppos) +static void print_event_fields(struct trace_seq *s, struct list_head *head) { - struct ftrace_event_call *call = filp->private_data; struct ftrace_event_field *field; - struct list_head *head; - struct trace_seq *s; - int common_field_count = 5; - char *buf; - int r = 0; - - if (*ppos) - return 0; - - s = kmalloc(sizeof(*s), GFP_KERNEL); - if (!s) - return -ENOMEM; - - trace_seq_init(s); - trace_seq_printf(s, "name: %s\n", call->name); - trace_seq_printf(s, "ID: %d\n", call->event.type); - trace_seq_printf(s, "format:\n"); - - head = trace_get_fields(call); list_for_each_entry_reverse(field, head, link) { /* * Smartly shows the array type(except dynamic array). @@ -584,29 +573,54 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt, array_descriptor = NULL; if (!array_descriptor) { - r = trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;" + trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;" "\tsize:%u;\tsigned:%d;\n", field->type, field->name, field->offset, field->size, !!field->is_signed); } else { - r = trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;" + trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;" "\tsize:%u;\tsigned:%d;\n", (int)(array_descriptor - field->type), field->type, field->name, array_descriptor, field->offset, field->size, !!field->is_signed); } + } +} - if (--common_field_count == 0) - r = trace_seq_printf(s, "\n"); +static ssize_t +event_format_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct ftrace_event_call *call = filp->private_data; + struct list_head *head; + struct trace_seq *s; + char *buf; + int r; - if (!r) - break; - } + if (*ppos) + return 0; - if (r) - r = trace_seq_printf(s, "\nprint fmt: %s\n", - call->print_fmt); + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + trace_seq_init(s); + + trace_seq_printf(s, "name: %s\n", call->name); + trace_seq_printf(s, "ID: %d\n", call->event.type); + trace_seq_printf(s, "format:\n"); + + /* print common fields */ + print_event_fields(s, &ftrace_common_fields); + + trace_seq_putc(s, '\n'); + + /* print event specific fields */ + head = trace_get_fields(call); + print_event_fields(s, head); + + r = trace_seq_printf(s, "\nprint fmt: %s\n", call->print_fmt); if (!r) { /* @@ -980,9 +994,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, */ head = trace_get_fields(call); if (list_empty(head)) { - ret = trace_define_common_fields(call); - if (!ret) - ret = call->class->define_fields(call); + ret = call->class->define_fields(call); if (ret < 0) { pr_warning("Could not initialize trace point" " events/%s\n", call->name); @@ -1319,6 +1331,9 @@ static __init int event_trace_init(void) trace_create_file("enable", 0644, d_events, NULL, &ftrace_system_enable_fops); + if (trace_define_common_fields()) + pr_warning("tracing: Failed to allocate common fields"); + for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { /* The linker may leave blanks */ if (!call->name) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 57bb1bb32999..330fefd1de1c 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -497,12 +497,10 @@ void print_subsystem_event_filter(struct event_subsystem *system, } static struct ftrace_event_field * -find_event_field(struct ftrace_event_call *call, char *name) +__find_event_field(struct list_head *head, char *name) { struct ftrace_event_field *field; - struct list_head *head; - head = trace_get_fields(call); list_for_each_entry(field, head, link) { if (!strcmp(field->name, name)) return field; @@ -511,6 +509,20 @@ find_event_field(struct ftrace_event_call *call, char *name) return NULL; } +static struct ftrace_event_field * +find_event_field(struct ftrace_event_call *call, char *name) +{ + struct ftrace_event_field *field; + struct list_head *head; + + field = __find_event_field(&ftrace_common_fields, name); + if (field) + return field; + + head = trace_get_fields(call); + return __find_event_field(head, name); +} + static void filter_free_pred(struct filter_pred *pred) { if (!pred) -- cgit v1.2.3-70-g09d2 From c9d932cf8a1c608b676021aef0189376ba6ef151 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 24 May 2010 16:24:28 +0800 Subject: tracing: Remove test of NULL define_fields callback Every event (or event class) has it's define_fields callback, so the test is redundant. Signed-off-by: Li Zefan LKML-Reference: <4BFA37BC.8080707@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 28 +++++++++++++--------------- kernel/trace/trace_events_filter.c | 9 --------- 2 files changed, 13 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index d3b4bdf00b39..5bad9cbbf974 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -987,23 +987,21 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, id); #endif - if (call->class->define_fields) { - /* - * Other events may have the same class. Only update - * the fields if they are not already defined. - */ - head = trace_get_fields(call); - if (list_empty(head)) { - ret = call->class->define_fields(call); - if (ret < 0) { - pr_warning("Could not initialize trace point" - " events/%s\n", call->name); - return ret; - } + /* + * Other events may have the same class. Only update + * the fields if they are not already defined. + */ + head = trace_get_fields(call); + if (list_empty(head)) { + ret = call->class->define_fields(call); + if (ret < 0) { + pr_warning("Could not initialize trace point" + " events/%s\n", call->name); + return ret; } - trace_create_file("filter", 0644, call->dir, call, - filter); } + trace_create_file("filter", 0644, call->dir, call, + filter); trace_create_file("format", 0444, call->dir, call, format); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 330fefd1de1c..36d40104b17f 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -639,9 +639,6 @@ static int init_subsystem_preds(struct event_subsystem *system) int err; list_for_each_entry(call, &ftrace_events, list) { - if (!call->class || !call->class->define_fields) - continue; - if (strcmp(call->class->system, system->name) != 0) continue; @@ -658,9 +655,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system) struct ftrace_event_call *call; list_for_each_entry(call, &ftrace_events, list) { - if (!call->class || !call->class->define_fields) - continue; - if (strcmp(call->class->system, system->name) != 0) continue; @@ -1263,9 +1257,6 @@ static int replace_system_preds(struct event_subsystem *system, list_for_each_entry(call, &ftrace_events, list) { struct event_filter *filter = call->filter; - if (!call->class || !call->class->define_fields) - continue; - if (strcmp(call->class->system, system->name) != 0) continue; -- cgit v1.2.3-70-g09d2 From ffb9f99528574ab9a55d4c8fd22e9d3ca49efa86 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 24 May 2010 16:24:52 +0800 Subject: tracing: Remove redundant raw_init callbacks raw_init callback is optional. Signed-off-by: Li Zefan LKML-Reference: <4BFA37D4.7070500@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_export.c | 8 +------- kernel/trace/trace_kprobe.c | 10 +--------- 2 files changed, 2 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 8536e2a65969..4ba44deaac25 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -125,12 +125,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ #include "trace_entries.h" -static int ftrace_raw_init_event(struct ftrace_event_call *call) -{ - INIT_LIST_HEAD(&call->class->fields); - return 0; -} - #undef __entry #define __entry REC @@ -158,7 +152,7 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call) struct ftrace_event_class event_class_ftrace_##call = { \ .system = __stringify(TRACE_SYSTEM), \ .define_fields = ftrace_define_fields_##call, \ - .raw_init = ftrace_raw_init_event, \ + .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ }; \ \ struct ftrace_event_call __used \ diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index f52b5f50299d..3b831d8e201e 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1214,11 +1214,6 @@ static void probe_event_disable(struct ftrace_event_call *call) } } -static int probe_event_raw_init(struct ftrace_event_call *event_call) -{ - return 0; -} - #undef DEFINE_FIELD #define DEFINE_FIELD(type, item, name, is_signed) \ do { \ @@ -1486,15 +1481,12 @@ static int register_probe_event(struct trace_probe *tp) int ret; /* Initialize ftrace_event_call */ + INIT_LIST_HEAD(&call->class->fields); if (probe_is_return(tp)) { - INIT_LIST_HEAD(&call->class->fields); call->event.funcs = &kretprobe_funcs; - call->class->raw_init = probe_event_raw_init; call->class->define_fields = kretprobe_event_define_fields; } else { - INIT_LIST_HEAD(&call->class->fields); call->event.funcs = &kprobe_funcs; - call->class->raw_init = probe_event_raw_init; call->class->define_fields = kprobe_event_define_fields; } if (set_print_fmt(tp) < 0) -- cgit v1.2.3-70-g09d2 From 67ead0a6ceb001b4cb891d782e440f0e79493ba2 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 24 May 2010 16:25:13 +0800 Subject: tracing: Remove open-coded __trace_add_event_call() Let trace_module_add_events() and event_trace_init() call __trace_add_event_call(). Signed-off-by: Li Zefan LKML-Reference: <4BFA37E9.1020106@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 70 ++++++++++++--------------------------------- 1 file changed, 19 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 5bad9cbbf974..69bee4cc0e10 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1009,11 +1009,17 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, return 0; } -static int __trace_add_event_call(struct ftrace_event_call *call) +static int +__trace_add_event_call(struct ftrace_event_call *call, struct module *mod, + const struct file_operations *id, + const struct file_operations *enable, + const struct file_operations *filter, + const struct file_operations *format) { struct dentry *d_events; int ret; + /* The linker may leave blanks */ if (!call->name) return -EINVAL; @@ -1021,8 +1027,8 @@ static int __trace_add_event_call(struct ftrace_event_call *call) ret = call->class->raw_init(call); if (ret < 0) { if (ret != -ENOSYS) - pr_warning("Could not initialize trace " - "events/%s\n", call->name); + pr_warning("Could not initialize trace events/%s\n", + call->name); return ret; } } @@ -1031,11 +1037,10 @@ static int __trace_add_event_call(struct ftrace_event_call *call) if (!d_events) return -ENOENT; - ret = event_create_dir(call, d_events, &ftrace_event_id_fops, - &ftrace_enable_fops, &ftrace_event_filter_fops, - &ftrace_event_format_fops); + ret = event_create_dir(call, d_events, id, enable, filter, format); if (!ret) list_add(&call->list, &ftrace_events); + call->mod = mod; return ret; } @@ -1045,7 +1050,10 @@ int trace_add_event_call(struct ftrace_event_call *call) { int ret; mutex_lock(&event_mutex); - ret = __trace_add_event_call(call); + ret = __trace_add_event_call(call, NULL, &ftrace_event_id_fops, + &ftrace_enable_fops, + &ftrace_event_filter_fops, + &ftrace_event_format_fops); mutex_unlock(&event_mutex); return ret; } @@ -1162,8 +1170,6 @@ static void trace_module_add_events(struct module *mod) { struct ftrace_module_file_ops *file_ops = NULL; struct ftrace_event_call *call, *start, *end; - struct dentry *d_events; - int ret; start = mod->trace_events; end = mod->trace_events + mod->num_trace_events; @@ -1171,38 +1177,14 @@ static void trace_module_add_events(struct module *mod) if (start == end) return; - d_events = event_trace_events_dir(); - if (!d_events) + file_ops = trace_create_file_ops(mod); + if (!file_ops) return; for_each_event(call, start, end) { - /* The linker may leave blanks */ - if (!call->name) - continue; - if (call->class->raw_init) { - ret = call->class->raw_init(call); - if (ret < 0) { - if (ret != -ENOSYS) - pr_warning("Could not initialize trace " - "point events/%s\n", call->name); - continue; - } - } - /* - * This module has events, create file ops for this module - * if not already done. - */ - if (!file_ops) { - file_ops = trace_create_file_ops(mod); - if (!file_ops) - return; - } - call->mod = mod; - ret = event_create_dir(call, d_events, + __trace_add_event_call(call, mod, &file_ops->id, &file_ops->enable, &file_ops->filter, &file_ops->format); - if (!ret) - list_add(&call->list, &ftrace_events); } } @@ -1333,24 +1315,10 @@ static __init int event_trace_init(void) pr_warning("tracing: Failed to allocate common fields"); for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { - /* The linker may leave blanks */ - if (!call->name) - continue; - if (call->class->raw_init) { - ret = call->class->raw_init(call); - if (ret < 0) { - if (ret != -ENOSYS) - pr_warning("Could not initialize trace " - "point events/%s\n", call->name); - continue; - } - } - ret = event_create_dir(call, d_events, &ftrace_event_id_fops, + __trace_add_event_call(call, NULL, &ftrace_event_id_fops, &ftrace_enable_fops, &ftrace_event_filter_fops, &ftrace_event_format_fops); - if (!ret) - list_add(&call->list, &ftrace_events); } while (true) { -- cgit v1.2.3-70-g09d2 From d62f85d1e22e537192ce494c89540e1ac0d8bfc7 Mon Sep 17 00:00:00 2001 From: Chase Douglas Date: Tue, 15 Jun 2010 12:29:15 -0400 Subject: tracing/function-graph: Use correct string size for snprintf The nsecs_str string is a local variable defined as: char nsecs_str[5]; It is possible for the snprintf call to use a size value larger than the size of the string. This should not cause a buffer overrun as it is written now due to the value for the string format "%03lu" can not be larger than 1000. However, this change makes it correct. By making the size correct we guard against potential future changes that could actually cause a buffer overrun. Signed-off-by: Chase Douglas LKML-Reference: <1276619355-18116-1-git-send-email-chase.douglas@canonical.com> [ added 'UL' to number 8 to fix gcc warning comparing it to sizeof() ] Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 79f4bac99a94..6bff23625781 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -641,7 +641,8 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) /* Print nsecs (we don't want to exceed 7 numbers) */ if (len < 7) { - snprintf(nsecs_str, 8 - len, "%03lu", nsecs_rem); + snprintf(nsecs_str, min(sizeof(nsecs_str), 8UL - len), "%03lu", + nsecs_rem); ret = trace_seq_printf(s, ".%s", nsecs_str); if (!ret) return TRACE_TYPE_PARTIAL_LINE; -- cgit v1.2.3-70-g09d2 From a1d0ce8213e9ddf4046ef5ba95c55762d075f541 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 8 Jun 2010 11:22:06 -0400 Subject: tracing: Use class->reg() for all registering of events Because kprobes and syscalls need special processing to register events, the class->reg() method was created to handle the differences. But instead of creating a default ->reg for perf and ftrace events, the code was scattered with: if (class->reg) class->reg(); else default_reg(); This is messy and can also lead to bugs. This patch cleans up this code and creates a default reg() entry for the events allowing for the code to directly call the class->reg() without the condition. Reported-by: Peter Zijlstra Acked-by: Peter Zijlstra Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 3 +++ include/trace/ftrace.h | 2 ++ kernel/trace/trace_event_perf.c | 19 +++----------- kernel/trace/trace_events.c | 55 +++++++++++++++++++++++++++-------------- 4 files changed, 44 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 0af31cd335d6..01df7ca4ead7 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -146,6 +146,9 @@ struct ftrace_event_class { int (*raw_init)(struct ftrace_event_call *); }; +extern int ftrace_event_reg(struct ftrace_event_call *event, + enum trace_reg type); + enum { TRACE_EVENT_FL_ENABLED_BIT, TRACE_EVENT_FL_FILTERED_BIT, diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index fc013a8201e9..55c1fd1bbc3d 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -439,6 +439,7 @@ static inline notrace int ftrace_get_offsets_##call( \ * .fields = LIST_HEAD_INIT(event_class_##call.fields), * .raw_init = trace_event_raw_init, * .probe = ftrace_raw_event_##call, + * .reg = ftrace_event_reg, * }; * * static struct ftrace_event_call __used @@ -567,6 +568,7 @@ static struct ftrace_event_class __used event_class_##call = { \ .fields = LIST_HEAD_INIT(event_class_##call.fields),\ .raw_init = trace_event_raw_init, \ .probe = ftrace_raw_event_##call, \ + .reg = ftrace_event_reg, \ _TRACE_PERF_INIT(call) \ }; diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 6053982dc302..23751659582e 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -54,13 +54,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event, } } - if (tp_event->class->reg) - ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER); - else - ret = tracepoint_probe_register(tp_event->name, - tp_event->class->perf_probe, - tp_event); - + ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER); if (ret) goto fail; @@ -94,9 +88,7 @@ int perf_trace_init(struct perf_event *p_event) mutex_lock(&event_mutex); list_for_each_entry(tp_event, &ftrace_events, list) { if (tp_event->event.type == event_id && - tp_event->class && - (tp_event->class->perf_probe || - tp_event->class->reg) && + tp_event->class && tp_event->class->reg && try_module_get(tp_event->mod)) { ret = perf_trace_event_init(tp_event, p_event); break; @@ -136,12 +128,7 @@ void perf_trace_destroy(struct perf_event *p_event) if (--tp_event->perf_refcount > 0) goto out; - if (tp_event->class->reg) - tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER); - else - tracepoint_probe_unregister(tp_event->name, - tp_event->class->perf_probe, - tp_event); + tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER); /* * Ensure our callback won't be called anymore. See diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 69bee4cc0e10..e8e6043f4d29 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -141,6 +141,35 @@ int trace_event_raw_init(struct ftrace_event_call *call) } EXPORT_SYMBOL_GPL(trace_event_raw_init); +int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type) +{ + switch (type) { + case TRACE_REG_REGISTER: + return tracepoint_probe_register(call->name, + call->class->probe, + call); + case TRACE_REG_UNREGISTER: + tracepoint_probe_unregister(call->name, + call->class->probe, + call); + return 0; + +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + return tracepoint_probe_register(call->name, + call->class->perf_probe, + call); + case TRACE_REG_PERF_UNREGISTER: + tracepoint_probe_unregister(call->name, + call->class->perf_probe, + call); + return 0; +#endif + } + return 0; +} +EXPORT_SYMBOL_GPL(ftrace_event_reg); + static int ftrace_event_enable_disable(struct ftrace_event_call *call, int enable) { @@ -151,23 +180,13 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call, if (call->flags & TRACE_EVENT_FL_ENABLED) { call->flags &= ~TRACE_EVENT_FL_ENABLED; tracing_stop_cmdline_record(); - if (call->class->reg) - call->class->reg(call, TRACE_REG_UNREGISTER); - else - tracepoint_probe_unregister(call->name, - call->class->probe, - call); + call->class->reg(call, TRACE_REG_UNREGISTER); } break; case 1: if (!(call->flags & TRACE_EVENT_FL_ENABLED)) { tracing_start_cmdline_record(); - if (call->class->reg) - ret = call->class->reg(call, TRACE_REG_REGISTER); - else - ret = tracepoint_probe_register(call->name, - call->class->probe, - call); + ret = call->class->reg(call, TRACE_REG_REGISTER); if (ret) { tracing_stop_cmdline_record(); pr_info("event trace: Could not enable event " @@ -205,8 +224,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub, mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { - if (!call->name || !call->class || - (!call->class->probe && !call->class->reg)) + if (!call->name || !call->class || !call->class->reg) continue; if (match && @@ -332,7 +350,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) * The ftrace subsystem is for showing formats only. * They can not be enabled or disabled via the event files. */ - if (call->class && (call->class->probe || call->class->reg)) + if (call->class && call->class->reg) return call; } @@ -485,8 +503,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { - if (!call->name || !call->class || - (!call->class->probe && !call->class->reg)) + if (!call->name || !call->class || !call->class->reg) continue; if (system && strcmp(call->class->system, system) != 0) @@ -977,12 +994,12 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, return -1; } - if (call->class->probe || call->class->reg) + if (call->class->reg) trace_create_file("enable", 0644, call->dir, call, enable); #ifdef CONFIG_PERF_EVENTS - if (call->event.type && (call->class->perf_probe || call->class->reg)) + if (call->event.type && call->class->reg) trace_create_file("id", 0444, call->dir, call, id); #endif -- cgit v1.2.3-70-g09d2 From e09c8614b32915c16f68e039ac7040e602d73e35 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 5 Jul 2010 15:54:45 -0300 Subject: tracing/kprobes: Support "string" type Support string type tracing and printing in kprobe-tracer. This allows user to trace string data in kernel including __user data. Note that sometimes __user data may not be accessed if it is paged-out (sorry, but kprobes operation should be done in atomic, we can not wait for page-in). Commiter note: Fixed up conflicts with b7e2ece. Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Frederic Weisbecker LKML-Reference: <20100519195724.2885.18788.stgit@localhost6.localdomain6> Signed-off-by: Masami Hiramatsu Signed-off-by: Arnaldo Carvalho de Melo --- Documentation/trace/kprobetrace.txt | 2 +- kernel/trace/trace_kprobe.c | 370 ++++++++++++++++++++++++++++-------- 2 files changed, 293 insertions(+), 79 deletions(-) (limited to 'kernel') diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt index ec94748ae65b..5f77d94598dd 100644 --- a/Documentation/trace/kprobetrace.txt +++ b/Documentation/trace/kprobetrace.txt @@ -42,7 +42,7 @@ Synopsis of kprobe_events +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(**) NAME=FETCHARG : Set NAME as the argument name of FETCHARG. FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types - (u8/u16/u32/u64/s8/s16/s32/s64) are supported. + (u8/u16/u32/u64/s8/s16/s32/s64) and string are supported. (*) only for return probe. (**) this is useful for fetching a field of data structures. diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 3b831d8e201e..1b79d1c15726 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -30,6 +30,8 @@ #include #include #include +#include +#include #include #include "trace.h" @@ -38,6 +40,7 @@ #define MAX_TRACE_ARGS 128 #define MAX_ARGSTR_LEN 63 #define MAX_EVENT_NAME_LEN 64 +#define MAX_STRING_SIZE PATH_MAX #define KPROBE_EVENT_SYSTEM "kprobes" /* Reserved field names */ @@ -58,14 +61,16 @@ const char *reserved_field_names[] = { }; /* Printing function type */ -typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *); +typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *, + void *); #define PRINT_TYPE_FUNC_NAME(type) print_type_##type #define PRINT_TYPE_FMT_NAME(type) print_type_format_##type /* Printing in basic type function template */ #define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \ static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ - const char *name, void *data)\ + const char *name, \ + void *data, void *ent)\ { \ return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\ } \ @@ -80,6 +85,49 @@ DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int) DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long) DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long) +/* data_rloc: data relative location, compatible with u32 */ +#define make_data_rloc(len, roffs) \ + (((u32)(len) << 16) | ((u32)(roffs) & 0xffff)) +#define get_rloc_len(dl) ((u32)(dl) >> 16) +#define get_rloc_offs(dl) ((u32)(dl) & 0xffff) + +static inline void *get_rloc_data(u32 *dl) +{ + return (u8 *)dl + get_rloc_offs(*dl); +} + +/* For data_loc conversion */ +static inline void *get_loc_data(u32 *dl, void *ent) +{ + return (u8 *)ent + get_rloc_offs(*dl); +} + +/* + * Convert data_rloc to data_loc: + * data_rloc stores the offset from data_rloc itself, but data_loc + * stores the offset from event entry. + */ +#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs)) + +/* For defining macros, define string/string_size types */ +typedef u32 string; +typedef u32 string_size; + +/* Print type function for string type */ +static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, + const char *name, + void *data, void *ent) +{ + int len = *(u32 *)data >> 16; + + if (!len) + return trace_seq_printf(s, " %s=(fault)", name); + else + return trace_seq_printf(s, " %s=\"%s\"", name, + (const char *)get_loc_data(data, ent)); +} +static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; + /* Data fetch function type */ typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); @@ -94,32 +142,38 @@ static __kprobes void call_fetch(struct fetch_param *fprm, return fprm->fn(regs, fprm->data, dest); } -#define FETCH_FUNC_NAME(kind, type) fetch_##kind##_##type +#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type /* * Define macro for basic types - we don't need to define s* types, because * we have to care only about bitwidth at recording time. */ -#define DEFINE_BASIC_FETCH_FUNCS(kind) \ -DEFINE_FETCH_##kind(u8) \ -DEFINE_FETCH_##kind(u16) \ -DEFINE_FETCH_##kind(u32) \ -DEFINE_FETCH_##kind(u64) - -#define CHECK_BASIC_FETCH_FUNCS(kind, fn) \ - ((FETCH_FUNC_NAME(kind, u8) == fn) || \ - (FETCH_FUNC_NAME(kind, u16) == fn) || \ - (FETCH_FUNC_NAME(kind, u32) == fn) || \ - (FETCH_FUNC_NAME(kind, u64) == fn)) +#define DEFINE_BASIC_FETCH_FUNCS(method) \ +DEFINE_FETCH_##method(u8) \ +DEFINE_FETCH_##method(u16) \ +DEFINE_FETCH_##method(u32) \ +DEFINE_FETCH_##method(u64) + +#define CHECK_FETCH_FUNCS(method, fn) \ + (((FETCH_FUNC_NAME(method, u8) == fn) || \ + (FETCH_FUNC_NAME(method, u16) == fn) || \ + (FETCH_FUNC_NAME(method, u32) == fn) || \ + (FETCH_FUNC_NAME(method, u64) == fn) || \ + (FETCH_FUNC_NAME(method, string) == fn) || \ + (FETCH_FUNC_NAME(method, string_size) == fn)) \ + && (fn != NULL)) /* Data fetch function templates */ #define DEFINE_FETCH_reg(type) \ static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ - void *offset, void *dest) \ + void *offset, void *dest) \ { \ *(type *)dest = (type)regs_get_register(regs, \ (unsigned int)((unsigned long)offset)); \ } DEFINE_BASIC_FETCH_FUNCS(reg) +/* No string on the register */ +#define fetch_reg_string NULL +#define fetch_reg_string_size NULL #define DEFINE_FETCH_stack(type) \ static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ @@ -129,6 +183,9 @@ static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ (unsigned int)((unsigned long)offset)); \ } DEFINE_BASIC_FETCH_FUNCS(stack) +/* No string on the stack entry */ +#define fetch_stack_string NULL +#define fetch_stack_string_size NULL #define DEFINE_FETCH_retval(type) \ static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ @@ -137,6 +194,9 @@ static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ *(type *)dest = (type)regs_return_value(regs); \ } DEFINE_BASIC_FETCH_FUNCS(retval) +/* No string on the retval */ +#define fetch_retval_string NULL +#define fetch_retval_string_size NULL #define DEFINE_FETCH_memory(type) \ static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ @@ -149,6 +209,62 @@ static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ *(type *)dest = retval; \ } DEFINE_BASIC_FETCH_FUNCS(memory) +/* + * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max + * length and relative data location. + */ +static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, + void *addr, void *dest) +{ + long ret; + int maxlen = get_rloc_len(*(u32 *)dest); + u8 *dst = get_rloc_data(dest); + u8 *src = addr; + mm_segment_t old_fs = get_fs(); + if (!maxlen) + return; + /* + * Try to get string again, since the string can be changed while + * probing. + */ + set_fs(KERNEL_DS); + pagefault_disable(); + do + ret = __copy_from_user_inatomic(dst++, src++, 1); + while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen); + dst[-1] = '\0'; + pagefault_enable(); + set_fs(old_fs); + + if (ret < 0) { /* Failed to fetch string */ + ((u8 *)get_rloc_data(dest))[0] = '\0'; + *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); + } else + *(u32 *)dest = make_data_rloc(src - (u8 *)addr, + get_rloc_offs(*(u32 *)dest)); +} +/* Return the length of string -- including null terminal byte */ +static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, + void *addr, void *dest) +{ + int ret, len = 0; + u8 c; + mm_segment_t old_fs = get_fs(); + + set_fs(KERNEL_DS); + pagefault_disable(); + do { + ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1); + len++; + } while (c && ret == 0 && len < MAX_STRING_SIZE); + pagefault_enable(); + set_fs(old_fs); + + if (ret < 0) /* Failed to check the length */ + *(u32 *)dest = 0; + else + *(u32 *)dest = len; +} /* Memory fetching by symbol */ struct symbol_cache { @@ -203,6 +319,8 @@ static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\ *(type *)dest = 0; \ } DEFINE_BASIC_FETCH_FUNCS(symbol) +DEFINE_FETCH_symbol(string) +DEFINE_FETCH_symbol(string_size) /* Dereference memory access function */ struct deref_fetch_param { @@ -224,12 +342,14 @@ static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\ *(type *)dest = 0; \ } DEFINE_BASIC_FETCH_FUNCS(deref) +DEFINE_FETCH_deref(string) +DEFINE_FETCH_deref(string_size) static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) { - if (CHECK_BASIC_FETCH_FUNCS(deref, data->orig.fn)) + if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) free_deref_fetch_param(data->orig.data); - else if (CHECK_BASIC_FETCH_FUNCS(symbol, data->orig.fn)) + else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) free_symbol_cache(data->orig.data); kfree(data); } @@ -240,23 +360,43 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) #define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) #define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) -#define ASSIGN_FETCH_FUNC(kind, type) \ - .kind = FETCH_FUNC_NAME(kind, type) - -#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ - {.name = #ptype, \ - .size = sizeof(ftype), \ - .is_signed = sign, \ - .print = PRINT_TYPE_FUNC_NAME(ptype), \ - .fmt = PRINT_TYPE_FMT_NAME(ptype), \ -ASSIGN_FETCH_FUNC(reg, ftype), \ -ASSIGN_FETCH_FUNC(stack, ftype), \ -ASSIGN_FETCH_FUNC(retval, ftype), \ -ASSIGN_FETCH_FUNC(memory, ftype), \ -ASSIGN_FETCH_FUNC(symbol, ftype), \ -ASSIGN_FETCH_FUNC(deref, ftype), \ +/* Fetch types */ +enum { + FETCH_MTD_reg = 0, + FETCH_MTD_stack, + FETCH_MTD_retval, + FETCH_MTD_memory, + FETCH_MTD_symbol, + FETCH_MTD_deref, + FETCH_MTD_END, +}; + +#define ASSIGN_FETCH_FUNC(method, type) \ + [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type) + +#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \ + {.name = _name, \ + .size = _size, \ + .is_signed = sign, \ + .print = PRINT_TYPE_FUNC_NAME(ptype), \ + .fmt = PRINT_TYPE_FMT_NAME(ptype), \ + .fmttype = _fmttype, \ + .fetch = { \ +ASSIGN_FETCH_FUNC(reg, ftype), \ +ASSIGN_FETCH_FUNC(stack, ftype), \ +ASSIGN_FETCH_FUNC(retval, ftype), \ +ASSIGN_FETCH_FUNC(memory, ftype), \ +ASSIGN_FETCH_FUNC(symbol, ftype), \ +ASSIGN_FETCH_FUNC(deref, ftype), \ + } \ } +#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ + __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) + +#define FETCH_TYPE_STRING 0 +#define FETCH_TYPE_STRSIZE 1 + /* Fetch type information table */ static const struct fetch_type { const char *name; /* Name of type */ @@ -264,14 +404,16 @@ static const struct fetch_type { int is_signed; /* Signed flag */ print_type_func_t print; /* Print functions */ const char *fmt; /* Fromat string */ + const char *fmttype; /* Name in format file */ /* Fetch functions */ - fetch_func_t reg; - fetch_func_t stack; - fetch_func_t retval; - fetch_func_t memory; - fetch_func_t symbol; - fetch_func_t deref; + fetch_func_t fetch[FETCH_MTD_END]; } fetch_type_table[] = { + /* Special types */ + [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, + sizeof(u32), 1, "__data_loc char[]"), + [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32, + string_size, sizeof(u32), 0, "u32"), + /* Basic types */ ASSIGN_FETCH_TYPE(u8, u8, 0), ASSIGN_FETCH_TYPE(u16, u16, 0), ASSIGN_FETCH_TYPE(u32, u32, 0), @@ -302,12 +444,28 @@ static __kprobes void fetch_stack_address(struct pt_regs *regs, *(unsigned long *)dest = kernel_stack_pointer(regs); } +static fetch_func_t get_fetch_size_function(const struct fetch_type *type, + fetch_func_t orig_fn) +{ + int i; + + if (type != &fetch_type_table[FETCH_TYPE_STRING]) + return NULL; /* Only string type needs size function */ + for (i = 0; i < FETCH_MTD_END; i++) + if (type->fetch[i] == orig_fn) + return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i]; + + WARN_ON(1); /* This should not happen */ + return NULL; +} + /** * Kprobe event core functions */ struct probe_arg { struct fetch_param fetch; + struct fetch_param fetch_size; unsigned int offset; /* Offset from argument entry */ const char *name; /* Name of this argument */ const char *comm; /* Command of this argument */ @@ -429,9 +587,9 @@ error: static void free_probe_arg(struct probe_arg *arg) { - if (CHECK_BASIC_FETCH_FUNCS(deref, arg->fetch.fn)) + if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) free_deref_fetch_param(arg->fetch.data); - else if (CHECK_BASIC_FETCH_FUNCS(symbol, arg->fetch.fn)) + else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) free_symbol_cache(arg->fetch.data); kfree(arg->name); kfree(arg->comm); @@ -548,7 +706,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, if (strcmp(arg, "retval") == 0) { if (is_return) - f->fn = t->retval; + f->fn = t->fetch[FETCH_MTD_retval]; else ret = -EINVAL; } else if (strncmp(arg, "stack", 5) == 0) { @@ -562,7 +720,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, if (ret || param > PARAM_MAX_STACK) ret = -EINVAL; else { - f->fn = t->stack; + f->fn = t->fetch[FETCH_MTD_stack]; f->data = (void *)param; } } else @@ -588,7 +746,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, case '%': /* named register */ ret = regs_query_register_offset(arg + 1); if (ret >= 0) { - f->fn = t->reg; + f->fn = t->fetch[FETCH_MTD_reg]; f->data = (void *)(unsigned long)ret; ret = 0; } @@ -598,7 +756,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, ret = strict_strtoul(arg + 1, 0, ¶m); if (ret) break; - f->fn = t->memory; + f->fn = t->fetch[FETCH_MTD_memory]; f->data = (void *)param; } else { ret = split_symbol_offset(arg + 1, &offset); @@ -606,7 +764,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, break; f->data = alloc_symbol_cache(arg + 1, offset); if (f->data) - f->fn = t->symbol; + f->fn = t->fetch[FETCH_MTD_symbol]; } break; case '+': /* deref memory */ @@ -636,14 +794,17 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, if (ret) kfree(dprm); else { - f->fn = t->deref; + f->fn = t->fetch[FETCH_MTD_deref]; f->data = (void *)dprm; } } break; } - if (!ret && !f->fn) + if (!ret && !f->fn) { /* Parsed, but do not find fetch method */ + pr_info("%s type has no corresponding fetch method.\n", + t->name); ret = -EINVAL; + } return ret; } @@ -652,6 +813,7 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp, struct probe_arg *parg, int is_return) { const char *t; + int ret; if (strlen(arg) > MAX_ARGSTR_LEN) { pr_info("Argument is too long.: %s\n", arg); @@ -674,7 +836,13 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp, } parg->offset = tp->size; tp->size += parg->type->size; - return __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); + ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); + if (ret >= 0) { + parg->fetch_size.fn = get_fetch_size_function(parg->type, + parg->fetch.fn); + parg->fetch_size.data = parg->fetch.data; + } + return ret; } /* Return 1 if name is reserved or already used by another argument */ @@ -1043,6 +1211,54 @@ static const struct file_operations kprobe_profile_ops = { .release = seq_release, }; +/* Sum up total data length for dynamic arraies (strings) */ +static __kprobes int __get_data_size(struct trace_probe *tp, + struct pt_regs *regs) +{ + int i, ret = 0; + u32 len; + + for (i = 0; i < tp->nr_args; i++) + if (unlikely(tp->args[i].fetch_size.fn)) { + call_fetch(&tp->args[i].fetch_size, regs, &len); + ret += len; + } + + return ret; +} + +/* Store the value of each argument */ +static __kprobes void store_trace_args(int ent_size, struct trace_probe *tp, + struct pt_regs *regs, + u8 *data, int maxlen) +{ + int i; + u32 end = tp->size; + u32 *dl; /* Data (relative) location */ + + for (i = 0; i < tp->nr_args; i++) { + if (unlikely(tp->args[i].fetch_size.fn)) { + /* + * First, we set the relative location and + * maximum data length to *dl + */ + dl = (u32 *)(data + tp->args[i].offset); + *dl = make_data_rloc(maxlen, end - tp->args[i].offset); + /* Then try to fetch string or dynamic array data */ + call_fetch(&tp->args[i].fetch, regs, dl); + /* Reduce maximum length */ + end += get_rloc_len(*dl); + maxlen -= get_rloc_len(*dl); + /* Trick here, convert data_rloc to data_loc */ + *dl = convert_rloc_to_loc(*dl, + ent_size + tp->args[i].offset); + } else + /* Just fetching data normally */ + call_fetch(&tp->args[i].fetch, regs, + data + tp->args[i].offset); + } +} + /* Kprobe handler */ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) { @@ -1050,8 +1266,7 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) struct kprobe_trace_entry_head *entry; struct ring_buffer_event *event; struct ring_buffer *buffer; - u8 *data; - int size, i, pc; + int size, dsize, pc; unsigned long irq_flags; struct ftrace_event_call *call = &tp->call; @@ -1060,7 +1275,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) local_save_flags(irq_flags); pc = preempt_count(); - size = sizeof(*entry) + tp->size; + dsize = __get_data_size(tp, regs); + size = sizeof(*entry) + tp->size + dsize; event = trace_current_buffer_lock_reserve(&buffer, call->event.type, size, irq_flags, pc); @@ -1069,9 +1285,7 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) entry = ring_buffer_event_data(event); entry->ip = (unsigned long)kp->addr; - data = (u8 *)&entry[1]; - for (i = 0; i < tp->nr_args; i++) - call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); + store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); if (!filter_current_check_discard(buffer, call, entry, event)) trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); @@ -1085,15 +1299,15 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, struct kretprobe_trace_entry_head *entry; struct ring_buffer_event *event; struct ring_buffer *buffer; - u8 *data; - int size, i, pc; + int size, pc, dsize; unsigned long irq_flags; struct ftrace_event_call *call = &tp->call; local_save_flags(irq_flags); pc = preempt_count(); - size = sizeof(*entry) + tp->size; + dsize = __get_data_size(tp, regs); + size = sizeof(*entry) + tp->size + dsize; event = trace_current_buffer_lock_reserve(&buffer, call->event.type, size, irq_flags, pc); @@ -1103,9 +1317,7 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, entry = ring_buffer_event_data(event); entry->func = (unsigned long)tp->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; - data = (u8 *)&entry[1]; - for (i = 0; i < tp->nr_args; i++) - call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); + store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); if (!filter_current_check_discard(buffer, call, entry, event)) trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); @@ -1137,7 +1349,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags, data = (u8 *)&field[1]; for (i = 0; i < tp->nr_args; i++) if (!tp->args[i].type->print(s, tp->args[i].name, - data + tp->args[i].offset)) + data + tp->args[i].offset, field)) goto partial; if (!trace_seq_puts(s, "\n")) @@ -1179,7 +1391,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags, data = (u8 *)&field[1]; for (i = 0; i < tp->nr_args; i++) if (!tp->args[i].type->print(s, tp->args[i].name, - data + tp->args[i].offset)) + data + tp->args[i].offset, field)) goto partial; if (!trace_seq_puts(s, "\n")) @@ -1234,7 +1446,7 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call) DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); /* Set argument names as fields */ for (i = 0; i < tp->nr_args; i++) { - ret = trace_define_field(event_call, tp->args[i].type->name, + ret = trace_define_field(event_call, tp->args[i].type->fmttype, tp->args[i].name, sizeof(field) + tp->args[i].offset, tp->args[i].type->size, @@ -1256,7 +1468,7 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); /* Set argument names as fields */ for (i = 0; i < tp->nr_args; i++) { - ret = trace_define_field(event_call, tp->args[i].type->name, + ret = trace_define_field(event_call, tp->args[i].type->fmttype, tp->args[i].name, sizeof(field) + tp->args[i].offset, tp->args[i].type->size, @@ -1296,8 +1508,13 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len) pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); for (i = 0; i < tp->nr_args; i++) { - pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", - tp->args[i].name); + if (strcmp(tp->args[i].type->name, "string") == 0) + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", __get_str(%s)", + tp->args[i].name); + else + pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", + tp->args[i].name); } #undef LEN_OR_ZERO @@ -1334,11 +1551,11 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, struct ftrace_event_call *call = &tp->call; struct kprobe_trace_entry_head *entry; struct hlist_head *head; - u8 *data; - int size, __size, i; + int size, __size, dsize; int rctx; - __size = sizeof(*entry) + tp->size; + dsize = __get_data_size(tp, regs); + __size = sizeof(*entry) + tp->size + dsize; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, @@ -1350,9 +1567,8 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, return; entry->ip = (unsigned long)kp->addr; - data = (u8 *)&entry[1]; - for (i = 0; i < tp->nr_args; i++) - call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); + memset(&entry[1], 0, dsize); + store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); head = this_cpu_ptr(call->perf_events); perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); @@ -1366,11 +1582,11 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, struct ftrace_event_call *call = &tp->call; struct kretprobe_trace_entry_head *entry; struct hlist_head *head; - u8 *data; - int size, __size, i; + int size, __size, dsize; int rctx; - __size = sizeof(*entry) + tp->size; + dsize = __get_data_size(tp, regs); + __size = sizeof(*entry) + tp->size + dsize; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, @@ -1383,9 +1599,7 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, entry->func = (unsigned long)tp->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; - data = (u8 *)&entry[1]; - for (i = 0; i < tp->nr_args; i++) - call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); + store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); head = this_cpu_ptr(call->perf_events); perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); -- cgit v1.2.3-70-g09d2 From eb703f98191a505f78d0066712ad67d5dedc4c90 Mon Sep 17 00:00:00 2001 From: Kulikov Vasiliy Date: Mon, 5 Jul 2010 12:00:54 +0400 Subject: kernel/watchdog: Initialize 'result' Variable on the stack is not initialized to zero, do it explicitly. This bug was found by a compiler warning: kernel/watchdog.c:463: warning: 'result' may be used uninitialized in this function Signed-off-by: Kulikov Vasiliy Acked-by: Don Zickus Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Mike Galbraith Cc: Steven Rostedt LKML-Reference: <1278316854-28442-1-git-send-email-segooon@gmail.com> Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 91b0b26adc67..613bc1f04610 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -460,7 +460,7 @@ static void watchdog_disable(int cpu) static void watchdog_enable_all_cpus(void) { int cpu; - int result; + int result = 0; for_each_online_cpu(cpu) result += watchdog_enable(cpu); -- cgit v1.2.3-70-g09d2 From 5d550467b9770042e9699690907babc32104a8d4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 15 Jul 2010 23:27:35 +0200 Subject: tracing: Remove ksym tracer The ksym (breakpoint) ftrace plugin has been superseded by perf tools that are much more poweful to use the cpu breakpoints. This tracer doesn't bring more feature. It has been deprecated for a while now, lets remove it. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Prasad Cc: Ingo Molnar --- kernel/trace/Kconfig | 22 -- kernel/trace/Makefile | 1 - kernel/trace/trace.h | 6 - kernel/trace/trace_entries.h | 15 -- kernel/trace/trace_ksym.c | 508 ------------------------------------------ kernel/trace/trace_selftest.c | 54 ----- 6 files changed, 606 deletions(-) delete mode 100644 kernel/trace/trace_ksym.c (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index f669092fdead..f5306cb0afb1 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -308,28 +308,6 @@ config BRANCH_TRACER Say N if unsure. -config KSYM_TRACER - bool "Trace read and write access on kernel memory locations" - depends on HAVE_HW_BREAKPOINT - select TRACING - help - This tracer helps find read and write operations on any given kernel - symbol i.e. /proc/kallsyms. - -config PROFILE_KSYM_TRACER - bool "Profile all kernel memory accesses on 'watched' variables" - depends on KSYM_TRACER - help - This tracer profiles kernel accesses on variables watched through the - ksym tracer ftrace plugin. Depending upon the hardware, all read - and write operations on kernel variables can be monitored for - accesses. - - The results will be displayed in: - /debugfs/tracing/profile_ksym - - Say N if unsure. - config STACK_TRACER bool "Trace max stack" depends on HAVE_FUNCTION_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 469a1c7555a5..84b2c9908dae 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -53,7 +53,6 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o -obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o obj-$(CONFIG_EVENT_TRACING) += power-traces.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index cc90ccdad469..84d3f123e86f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -30,7 +30,6 @@ enum trace_type { TRACE_GRAPH_ENT, TRACE_USER_STACK, TRACE_BLK, - TRACE_KSYM, __TRACE_LAST_TYPE, }; @@ -200,7 +199,6 @@ extern void __ftrace_bad_type(void); TRACE_GRAPH_ENT); \ IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ TRACE_GRAPH_RET); \ - IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\ __ftrace_bad_type(); \ } while (0) @@ -361,8 +359,6 @@ int register_tracer(struct tracer *type); void unregister_tracer(struct tracer *type); int is_tracing_stopped(void); -extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr); - extern unsigned long nsecs_to_usecs(unsigned long nsecs); extern unsigned long tracing_thresh; @@ -436,8 +432,6 @@ extern int trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr); -extern int trace_selftest_startup_ksym(struct tracer *trace, - struct trace_array *tr); #endif /* CONFIG_FTRACE_STARTUP_TEST */ extern void *head_page(struct trace_array_cpu *data); diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 13abc157dbaf..84128371f254 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -291,18 +291,3 @@ FTRACE_ENTRY(branch, trace_branch, __entry->func, __entry->file, __entry->correct) ); -FTRACE_ENTRY(ksym_trace, ksym_trace_entry, - - TRACE_KSYM, - - F_STRUCT( - __field( unsigned long, ip ) - __field( unsigned char, type ) - __array( char , cmd, TASK_COMM_LEN ) - __field( unsigned long, addr ) - ), - - F_printk("ip: %pF type: %d ksym_name: %pS cmd: %s", - (void *)__entry->ip, (unsigned int)__entry->type, - (void *)__entry->addr, __entry->cmd) -); diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c deleted file mode 100644 index 8eaf00749b65..000000000000 --- a/kernel/trace/trace_ksym.c +++ /dev/null @@ -1,508 +0,0 @@ -/* - * trace_ksym.c - Kernel Symbol Tracer - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright (C) IBM Corporation, 2009 - */ - -#include -#include -#include -#include -#include -#include -#include - -#include "trace_output.h" -#include "trace.h" - -#include -#include - -#include - -#define KSYM_TRACER_OP_LEN 3 /* rw- */ - -struct trace_ksym { - struct perf_event **ksym_hbp; - struct perf_event_attr attr; -#ifdef CONFIG_PROFILE_KSYM_TRACER - atomic64_t counter; -#endif - struct hlist_node ksym_hlist; -}; - -static struct trace_array *ksym_trace_array; - -static unsigned int ksym_tracing_enabled; - -static HLIST_HEAD(ksym_filter_head); - -static DEFINE_MUTEX(ksym_tracer_mutex); - -#ifdef CONFIG_PROFILE_KSYM_TRACER - -#define MAX_UL_INT 0xffffffff - -void ksym_collect_stats(unsigned long hbp_hit_addr) -{ - struct hlist_node *node; - struct trace_ksym *entry; - - rcu_read_lock(); - hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) { - if (entry->attr.bp_addr == hbp_hit_addr) { - atomic64_inc(&entry->counter); - break; - } - } - rcu_read_unlock(); -} -#endif /* CONFIG_PROFILE_KSYM_TRACER */ - -void ksym_hbp_handler(struct perf_event *hbp, int nmi, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct ring_buffer_event *event; - struct ksym_trace_entry *entry; - struct ring_buffer *buffer; - int pc; - - if (!ksym_tracing_enabled) - return; - - buffer = ksym_trace_array->buffer; - - pc = preempt_count(); - - event = trace_buffer_lock_reserve(buffer, TRACE_KSYM, - sizeof(*entry), 0, pc); - if (!event) - return; - - entry = ring_buffer_event_data(event); - entry->ip = instruction_pointer(regs); - entry->type = hw_breakpoint_type(hbp); - entry->addr = hw_breakpoint_addr(hbp); - strlcpy(entry->cmd, current->comm, TASK_COMM_LEN); - -#ifdef CONFIG_PROFILE_KSYM_TRACER - ksym_collect_stats(hw_breakpoint_addr(hbp)); -#endif /* CONFIG_PROFILE_KSYM_TRACER */ - - trace_buffer_unlock_commit(buffer, event, 0, pc); -} - -/* Valid access types are represented as - * - * rw- : Set Read/Write Access Breakpoint - * -w- : Set Write Access Breakpoint - * --- : Clear Breakpoints - * --x : Set Execution Break points (Not available yet) - * - */ -static int ksym_trace_get_access_type(char *str) -{ - int access = 0; - - if (str[0] == 'r') - access |= HW_BREAKPOINT_R; - - if (str[1] == 'w') - access |= HW_BREAKPOINT_W; - - if (str[2] == 'x') - access |= HW_BREAKPOINT_X; - - switch (access) { - case HW_BREAKPOINT_R: - case HW_BREAKPOINT_W: - case HW_BREAKPOINT_W | HW_BREAKPOINT_R: - return access; - default: - return -EINVAL; - } -} - -/* - * There can be several possible malformed requests and we attempt to capture - * all of them. We enumerate some of the rules - * 1. We will not allow kernel symbols with ':' since it is used as a delimiter. - * i.e. multiple ':' symbols disallowed. Possible uses are of the form - * ::. - * 2. No delimiter symbol ':' in the input string - * 3. Spurious operator symbols or symbols not in their respective positions - * 4. :--- i.e. clear breakpoint request when ksym_name not in file - * 5. Kernel symbol not a part of /proc/kallsyms - * 6. Duplicate requests - */ -static int parse_ksym_trace_str(char *input_string, char **ksymname, - unsigned long *addr) -{ - int ret; - - *ksymname = strsep(&input_string, ":"); - *addr = kallsyms_lookup_name(*ksymname); - - /* Check for malformed request: (2), (1) and (5) */ - if ((!input_string) || - (strlen(input_string) != KSYM_TRACER_OP_LEN) || - (*addr == 0)) - return -EINVAL;; - - ret = ksym_trace_get_access_type(input_string); - - return ret; -} - -int process_new_ksym_entry(char *ksymname, int op, unsigned long addr) -{ - struct trace_ksym *entry; - int ret = -ENOMEM; - - entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL); - if (!entry) - return -ENOMEM; - - hw_breakpoint_init(&entry->attr); - - entry->attr.bp_type = op; - entry->attr.bp_addr = addr; - entry->attr.bp_len = HW_BREAKPOINT_LEN_4; - - entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr, - ksym_hbp_handler); - - if (IS_ERR(entry->ksym_hbp)) { - ret = PTR_ERR(entry->ksym_hbp); - if (ret == -ENOSPC) { - printk(KERN_ERR "ksym_tracer: Maximum limit reached." - " No new requests for tracing can be accepted now.\n"); - } else { - printk(KERN_INFO "ksym_tracer request failed. Try again" - " later!!\n"); - } - goto err; - } - - hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head); - - return 0; - -err: - kfree(entry); - - return ret; -} - -static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf, - size_t count, loff_t *ppos) -{ - struct trace_ksym *entry; - struct hlist_node *node; - struct trace_seq *s; - ssize_t cnt = 0; - int ret; - - s = kmalloc(sizeof(*s), GFP_KERNEL); - if (!s) - return -ENOMEM; - trace_seq_init(s); - - mutex_lock(&ksym_tracer_mutex); - - hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) { - ret = trace_seq_printf(s, "%pS:", - (void *)(unsigned long)entry->attr.bp_addr); - if (entry->attr.bp_type == HW_BREAKPOINT_R) - ret = trace_seq_puts(s, "r--\n"); - else if (entry->attr.bp_type == HW_BREAKPOINT_W) - ret = trace_seq_puts(s, "-w-\n"); - else if (entry->attr.bp_type == (HW_BREAKPOINT_W | HW_BREAKPOINT_R)) - ret = trace_seq_puts(s, "rw-\n"); - WARN_ON_ONCE(!ret); - } - - cnt = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); - - mutex_unlock(&ksym_tracer_mutex); - - kfree(s); - - return cnt; -} - -static void __ksym_trace_reset(void) -{ - struct trace_ksym *entry; - struct hlist_node *node, *node1; - - mutex_lock(&ksym_tracer_mutex); - hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head, - ksym_hlist) { - unregister_wide_hw_breakpoint(entry->ksym_hbp); - hlist_del_rcu(&(entry->ksym_hlist)); - synchronize_rcu(); - kfree(entry); - } - mutex_unlock(&ksym_tracer_mutex); -} - -static ssize_t ksym_trace_filter_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *ppos) -{ - struct trace_ksym *entry; - struct hlist_node *node; - char *buf, *input_string, *ksymname = NULL; - unsigned long ksym_addr = 0; - int ret, op, changed = 0; - - buf = kzalloc(count + 1, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - ret = -EFAULT; - if (copy_from_user(buf, buffer, count)) - goto out; - - buf[count] = '\0'; - input_string = strstrip(buf); - - /* - * Clear all breakpoints if: - * 1: echo > ksym_trace_filter - * 2: echo 0 > ksym_trace_filter - * 3: echo "*:---" > ksym_trace_filter - */ - if (!input_string[0] || !strcmp(input_string, "0") || - !strcmp(input_string, "*:---")) { - __ksym_trace_reset(); - ret = 0; - goto out; - } - - ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr); - if (ret < 0) - goto out; - - mutex_lock(&ksym_tracer_mutex); - - ret = -EINVAL; - hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) { - if (entry->attr.bp_addr == ksym_addr) { - /* Check for malformed request: (6) */ - if (entry->attr.bp_type != op) - changed = 1; - else - goto out_unlock; - break; - } - } - if (changed) { - unregister_wide_hw_breakpoint(entry->ksym_hbp); - entry->attr.bp_type = op; - ret = 0; - if (op > 0) { - entry->ksym_hbp = - register_wide_hw_breakpoint(&entry->attr, - ksym_hbp_handler); - if (IS_ERR(entry->ksym_hbp)) - ret = PTR_ERR(entry->ksym_hbp); - else - goto out_unlock; - } - /* Error or "symbol:---" case: drop it */ - hlist_del_rcu(&(entry->ksym_hlist)); - synchronize_rcu(); - kfree(entry); - goto out_unlock; - } else { - /* Check for malformed request: (4) */ - if (op) - ret = process_new_ksym_entry(ksymname, op, ksym_addr); - } -out_unlock: - mutex_unlock(&ksym_tracer_mutex); -out: - kfree(buf); - return !ret ? count : ret; -} - -static const struct file_operations ksym_tracing_fops = { - .open = tracing_open_generic, - .read = ksym_trace_filter_read, - .write = ksym_trace_filter_write, -}; - -static void ksym_trace_reset(struct trace_array *tr) -{ - ksym_tracing_enabled = 0; - __ksym_trace_reset(); -} - -static int ksym_trace_init(struct trace_array *tr) -{ - int cpu, ret = 0; - - for_each_online_cpu(cpu) - tracing_reset(tr, cpu); - ksym_tracing_enabled = 1; - ksym_trace_array = tr; - - return ret; -} - -static void ksym_trace_print_header(struct seq_file *m) -{ - seq_puts(m, - "# TASK-PID CPU# Symbol " - "Type Function\n"); - seq_puts(m, - "# | | | " - " | |\n"); -} - -static enum print_line_t ksym_trace_output(struct trace_iterator *iter) -{ - struct trace_entry *entry = iter->ent; - struct trace_seq *s = &iter->seq; - struct ksym_trace_entry *field; - char str[KSYM_SYMBOL_LEN]; - int ret; - - if (entry->type != TRACE_KSYM) - return TRACE_TYPE_UNHANDLED; - - trace_assign_type(field, entry); - - ret = trace_seq_printf(s, "%11s-%-5d [%03d] %pS", field->cmd, - entry->pid, iter->cpu, (char *)field->addr); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - switch (field->type) { - case HW_BREAKPOINT_R: - ret = trace_seq_printf(s, " R "); - break; - case HW_BREAKPOINT_W: - ret = trace_seq_printf(s, " W "); - break; - case HW_BREAKPOINT_R | HW_BREAKPOINT_W: - ret = trace_seq_printf(s, " RW "); - break; - default: - return TRACE_TYPE_PARTIAL_LINE; - } - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - sprint_symbol(str, field->ip); - ret = trace_seq_printf(s, "%s\n", str); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -struct tracer ksym_tracer __read_mostly = -{ - .name = "ksym_tracer", - .init = ksym_trace_init, - .reset = ksym_trace_reset, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_ksym, -#endif - .print_header = ksym_trace_print_header, - .print_line = ksym_trace_output -}; - -#ifdef CONFIG_PROFILE_KSYM_TRACER -static int ksym_profile_show(struct seq_file *m, void *v) -{ - struct hlist_node *node; - struct trace_ksym *entry; - int access_type = 0; - char fn_name[KSYM_NAME_LEN]; - - seq_puts(m, " Access Type "); - seq_puts(m, " Symbol Counter\n"); - seq_puts(m, " ----------- "); - seq_puts(m, " ------ -------\n"); - - rcu_read_lock(); - hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) { - - access_type = entry->attr.bp_type; - - switch (access_type) { - case HW_BREAKPOINT_R: - seq_puts(m, " R "); - break; - case HW_BREAKPOINT_W: - seq_puts(m, " W "); - break; - case HW_BREAKPOINT_R | HW_BREAKPOINT_W: - seq_puts(m, " RW "); - break; - default: - seq_puts(m, " NA "); - } - - if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0) - seq_printf(m, " %-36s", fn_name); - else - seq_printf(m, " %-36s", ""); - seq_printf(m, " %15llu\n", - (unsigned long long)atomic64_read(&entry->counter)); - } - rcu_read_unlock(); - - return 0; -} - -static int ksym_profile_open(struct inode *node, struct file *file) -{ - return single_open(file, ksym_profile_show, NULL); -} - -static const struct file_operations ksym_profile_fops = { - .open = ksym_profile_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; -#endif /* CONFIG_PROFILE_KSYM_TRACER */ - -__init static int init_ksym_trace(void) -{ - struct dentry *d_tracer; - - d_tracer = tracing_init_dentry(); - - trace_create_file("ksym_trace_filter", 0644, d_tracer, - NULL, &ksym_tracing_fops); - -#ifdef CONFIG_PROFILE_KSYM_TRACER - trace_create_file("ksym_profile", 0444, d_tracer, - NULL, &ksym_profile_fops); -#endif - - return register_tracer(&ksym_tracer); -} -device_initcall(init_ksym_trace); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 250e7f9bd2f0..39a5ca4cf15b 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -17,7 +17,6 @@ static inline int trace_valid_entry(struct trace_entry *entry) case TRACE_BRANCH: case TRACE_GRAPH_ENT: case TRACE_GRAPH_RET: - case TRACE_KSYM: return 1; } return 0; @@ -755,56 +754,3 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) } #endif /* CONFIG_BRANCH_TRACER */ -#ifdef CONFIG_KSYM_TRACER -static int ksym_selftest_dummy; - -int -trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr) -{ - unsigned long count; - int ret; - - /* start the tracing */ - ret = tracer_init(trace, tr); - if (ret) { - warn_failed_init_tracer(trace, ret); - return ret; - } - - ksym_selftest_dummy = 0; - /* Register the read-write tracing request */ - - ret = process_new_ksym_entry("ksym_selftest_dummy", - HW_BREAKPOINT_R | HW_BREAKPOINT_W, - (unsigned long)(&ksym_selftest_dummy)); - - if (ret < 0) { - printk(KERN_CONT "ksym_trace read-write startup test failed\n"); - goto ret_path; - } - /* Perform a read and a write operation over the dummy variable to - * trigger the tracer - */ - if (ksym_selftest_dummy == 0) - ksym_selftest_dummy++; - - /* stop the tracing. */ - tracing_stop(); - /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); - trace->reset(tr); - tracing_start(); - - /* read & write operations - one each is performed on the dummy variable - * triggering two entries in the trace buffer - */ - if (!ret && count != 2) { - printk(KERN_CONT "Ksym tracer startup test failed"); - ret = -1; - } - -ret_path: - return ret; -} -#endif /* CONFIG_KSYM_TRACER */ - -- cgit v1.2.3-70-g09d2 From f376bf5ffbad863d4bc3b2586b7e34cdf756ad17 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 16 Jul 2010 00:26:26 +0200 Subject: tracing: Remove sysprof ftrace plugin The sysprof ftrace plugin doesn't seem to be seriously used somewhere. There is a branch in the sysprof tree that makes an interface to it, but the real sysprof tool uses either its own module or perf events. Drop the sysprof ftrace plugin then, as it's mostly useless. Signed-off-by: Frederic Weisbecker Acked-by: Soeren Sandmann Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Steven Rostedt Cc: Li Zefan --- kernel/trace/Kconfig | 9 -- kernel/trace/Makefile | 1 - kernel/trace/trace.c | 3 - kernel/trace/trace.h | 3 - kernel/trace/trace_selftest.c | 32 ---- kernel/trace/trace_sysprof.c | 330 ------------------------------------------ 6 files changed, 378 deletions(-) delete mode 100644 kernel/trace/trace_sysprof.c (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index f5306cb0afb1..c7683fd8a03a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -194,15 +194,6 @@ config PREEMPT_TRACER enabled. This option and the irqs-off timing option can be used together or separately.) -config SYSPROF_TRACER - bool "Sysprof Tracer" - depends on X86 - select GENERIC_TRACER - select CONTEXT_SWITCH_TRACER - help - This tracer provides the trace needed by the 'Sysprof' userspace - tool. - config SCHED_TRACER bool "Scheduling Latency Tracer" select GENERIC_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 84b2c9908dae..438e84a56ab3 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -30,7 +30,6 @@ obj-$(CONFIG_TRACING) += trace_output.o obj-$(CONFIG_TRACING) += trace_stat.o obj-$(CONFIG_TRACING) += trace_printk.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o -obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8683dec6946b..78a49e67f7db 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4354,9 +4354,6 @@ static __init int tracer_init_debugfs(void) trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, &ftrace_update_tot_cnt, &tracing_dyn_info_fops); #endif -#ifdef CONFIG_SYSPROF_TRACER - init_tracer_sysprof_debugfs(d_tracer); -#endif create_trace_options_dir(); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 84d3f123e86f..2114b4c1150f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -296,7 +296,6 @@ struct dentry *trace_create_file(const char *name, const struct file_operations *fops); struct dentry *tracing_init_dentry(void); -void init_tracer_sysprof_debugfs(struct dentry *d_tracer); struct ring_buffer_event; @@ -428,8 +427,6 @@ extern int trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr); -extern int trace_selftest_startup_sysprof(struct tracer *trace, - struct trace_array *tr); extern int trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr); #endif /* CONFIG_FTRACE_STARTUP_TEST */ diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 39a5ca4cf15b..6ed05ee6cbc7 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -690,38 +690,6 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr } #endif /* CONFIG_CONTEXT_SWITCH_TRACER */ -#ifdef CONFIG_SYSPROF_TRACER -int -trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr) -{ - unsigned long count; - int ret; - - /* start the tracing */ - ret = tracer_init(trace, tr); - if (ret) { - warn_failed_init_tracer(trace, ret); - return ret; - } - - /* Sleep for a 1/10 of a second */ - msleep(100); - /* stop the tracing. */ - tracing_stop(); - /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); - trace->reset(tr); - tracing_start(); - - if (!ret && !count) { - printk(KERN_CONT ".. no entries found .."); - ret = -1; - } - - return ret; -} -#endif /* CONFIG_SYSPROF_TRACER */ - #ifdef CONFIG_BRANCH_TRACER int trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c deleted file mode 100644 index c080956f4d8e..000000000000 --- a/kernel/trace/trace_sysprof.c +++ /dev/null @@ -1,330 +0,0 @@ -/* - * trace stack traces - * - * Copyright (C) 2004-2008, Soeren Sandmann - * Copyright (C) 2007 Steven Rostedt - * Copyright (C) 2008 Ingo Molnar - */ -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "trace.h" - -static struct trace_array *sysprof_trace; -static int __read_mostly tracer_enabled; - -/* - * 1 msec sample interval by default: - */ -static unsigned long sample_period = 1000000; -static const unsigned int sample_max_depth = 512; - -static DEFINE_MUTEX(sample_timer_lock); -/* - * Per CPU hrtimers that do the profiling: - */ -static DEFINE_PER_CPU(struct hrtimer, stack_trace_hrtimer); - -struct stack_frame_user { - const void __user *next_fp; - unsigned long return_address; -}; - -static int -copy_stack_frame(const void __user *fp, struct stack_frame_user *frame) -{ - int ret; - - if (!access_ok(VERIFY_READ, fp, sizeof(*frame))) - return 0; - - ret = 1; - pagefault_disable(); - if (__copy_from_user_inatomic(frame, fp, sizeof(*frame))) - ret = 0; - pagefault_enable(); - - return ret; -} - -struct backtrace_info { - struct trace_array_cpu *data; - struct trace_array *tr; - int pos; -}; - -static void -backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) -{ - /* Ignore warnings */ -} - -static void backtrace_warning(void *data, char *msg) -{ - /* Ignore warnings */ -} - -static int backtrace_stack(void *data, char *name) -{ - /* Don't bother with IRQ stacks for now */ - return -1; -} - -static void backtrace_address(void *data, unsigned long addr, int reliable) -{ - struct backtrace_info *info = data; - - if (info->pos < sample_max_depth && reliable) { - __trace_special(info->tr, info->data, 1, addr, 0); - - info->pos++; - } -} - -static const struct stacktrace_ops backtrace_ops = { - .warning = backtrace_warning, - .warning_symbol = backtrace_warning_symbol, - .stack = backtrace_stack, - .address = backtrace_address, - .walk_stack = print_context_stack, -}; - -static int -trace_kernel(struct pt_regs *regs, struct trace_array *tr, - struct trace_array_cpu *data) -{ - struct backtrace_info info; - unsigned long bp; - char *stack; - - info.tr = tr; - info.data = data; - info.pos = 1; - - __trace_special(info.tr, info.data, 1, regs->ip, 0); - - stack = ((char *)regs + sizeof(struct pt_regs)); -#ifdef CONFIG_FRAME_POINTER - bp = regs->bp; -#else - bp = 0; -#endif - - dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, &info); - - return info.pos; -} - -static void timer_notify(struct pt_regs *regs, int cpu) -{ - struct trace_array_cpu *data; - struct stack_frame_user frame; - struct trace_array *tr; - const void __user *fp; - int is_user; - int i; - - if (!regs) - return; - - tr = sysprof_trace; - data = tr->data[cpu]; - is_user = user_mode(regs); - - if (!current || current->pid == 0) - return; - - if (is_user && current->state != TASK_RUNNING) - return; - - __trace_special(tr, data, 0, 0, current->pid); - - if (!is_user) - i = trace_kernel(regs, tr, data); - else - i = 0; - - /* - * Trace user stack if we are not a kernel thread - */ - if (current->mm && i < sample_max_depth) { - regs = (struct pt_regs *)current->thread.sp0 - 1; - - fp = (void __user *)regs->bp; - - __trace_special(tr, data, 2, regs->ip, 0); - - while (i < sample_max_depth) { - frame.next_fp = NULL; - frame.return_address = 0; - if (!copy_stack_frame(fp, &frame)) - break; - if ((unsigned long)fp < regs->sp) - break; - - __trace_special(tr, data, 2, frame.return_address, - (unsigned long)fp); - fp = frame.next_fp; - - i++; - } - - } - - /* - * Special trace entry if we overflow the max depth: - */ - if (i == sample_max_depth) - __trace_special(tr, data, -1, -1, -1); - - __trace_special(tr, data, 3, current->pid, i); -} - -static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer) -{ - /* trace here */ - timer_notify(get_irq_regs(), smp_processor_id()); - - hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); - - return HRTIMER_RESTART; -} - -static void start_stack_timer(void *unused) -{ - struct hrtimer *hrtimer = &__get_cpu_var(stack_trace_hrtimer); - - hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hrtimer->function = stack_trace_timer_fn; - - hrtimer_start(hrtimer, ns_to_ktime(sample_period), - HRTIMER_MODE_REL_PINNED); -} - -static void start_stack_timers(void) -{ - on_each_cpu(start_stack_timer, NULL, 1); -} - -static void stop_stack_timer(int cpu) -{ - struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu); - - hrtimer_cancel(hrtimer); -} - -static void stop_stack_timers(void) -{ - int cpu; - - for_each_online_cpu(cpu) - stop_stack_timer(cpu); -} - -static void stop_stack_trace(struct trace_array *tr) -{ - mutex_lock(&sample_timer_lock); - stop_stack_timers(); - tracer_enabled = 0; - mutex_unlock(&sample_timer_lock); -} - -static int stack_trace_init(struct trace_array *tr) -{ - sysprof_trace = tr; - - tracing_start_cmdline_record(); - - mutex_lock(&sample_timer_lock); - start_stack_timers(); - tracer_enabled = 1; - mutex_unlock(&sample_timer_lock); - return 0; -} - -static void stack_trace_reset(struct trace_array *tr) -{ - tracing_stop_cmdline_record(); - stop_stack_trace(tr); -} - -static struct tracer stack_trace __read_mostly = -{ - .name = "sysprof", - .init = stack_trace_init, - .reset = stack_trace_reset, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_sysprof, -#endif -}; - -__init static int init_stack_trace(void) -{ - return register_tracer(&stack_trace); -} -device_initcall(init_stack_trace); - -#define MAX_LONG_DIGITS 22 - -static ssize_t -sysprof_sample_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[MAX_LONG_DIGITS]; - int r; - - r = sprintf(buf, "%ld\n", nsecs_to_usecs(sample_period)); - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - -static ssize_t -sysprof_sample_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[MAX_LONG_DIGITS]; - unsigned long val; - - if (cnt > MAX_LONG_DIGITS-1) - cnt = MAX_LONG_DIGITS-1; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - val = simple_strtoul(buf, NULL, 10); - /* - * Enforce a minimum sample period of 100 usecs: - */ - if (val < 100) - val = 100; - - mutex_lock(&sample_timer_lock); - stop_stack_timers(); - sample_period = val * 1000; - start_stack_timers(); - mutex_unlock(&sample_timer_lock); - - return cnt; -} - -static const struct file_operations sysprof_sample_fops = { - .read = sysprof_sample_read, - .write = sysprof_sample_write, -}; - -void init_tracer_sysprof_debugfs(struct dentry *d_tracer) -{ - - trace_create_file("sysprof_sample_period", 0644, - d_tracer, NULL, &sysprof_sample_fops); -} -- cgit v1.2.3-70-g09d2 From eb7beb5c09af75494234ea6acd09d0a647cf7338 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 16 Jul 2010 00:50:03 +0200 Subject: tracing: Remove special traces Special traces type was only used by sysprof. Lets remove it now that sysprof ftrace plugin has been dropped. Signed-off-by: Frederic Weisbecker Acked-by: Soeren Sandmann Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Steven Rostedt Cc: Li Zefan --- include/linux/kernel.h | 5 ---- include/linux/sched.h | 12 -------- kernel/trace/trace.c | 55 ------------------------------------ kernel/trace/trace.h | 7 ----- kernel/trace/trace_entries.h | 17 ----------- kernel/trace/trace_output.c | 66 ------------------------------------------- kernel/trace/trace_selftest.c | 1 - 7 files changed, 163 deletions(-) (limited to 'kernel') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 8317ec4b9f3b..adee958b5989 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -508,9 +508,6 @@ extern void tracing_start(void); extern void tracing_stop(void); extern void ftrace_off_permanent(void); -extern void -ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); - static inline void __attribute__ ((format (printf, 1, 2))) ____trace_printk_check_format(const char *fmt, ...) { @@ -586,8 +583,6 @@ __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap); extern void ftrace_dump(enum ftrace_dump_mode oops_dump_mode); #else -static inline void -ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { } static inline int trace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 2))); diff --git a/include/linux/sched.h b/include/linux/sched.h index 747fcaedddb7..f751ea9dcb7b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2434,18 +2434,6 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) #endif /* CONFIG_SMP */ -#ifdef CONFIG_TRACING -extern void -__trace_special(void *__tr, void *__data, - unsigned long arg1, unsigned long arg2, unsigned long arg3); -#else -static inline void -__trace_special(void *__tr, void *__data, - unsigned long arg1, unsigned long arg2, unsigned long arg3) -{ -} -#endif - extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask); extern long sched_getaffinity(pid_t pid, struct cpumask *mask); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 78a49e67f7db..d9a4aa02c384 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1331,61 +1331,6 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags) #endif /* CONFIG_STACKTRACE */ -static void -ftrace_trace_special(void *__tr, - unsigned long arg1, unsigned long arg2, unsigned long arg3, - int pc) -{ - struct ftrace_event_call *call = &event_special; - struct ring_buffer_event *event; - struct trace_array *tr = __tr; - struct ring_buffer *buffer = tr->buffer; - struct special_entry *entry; - - event = trace_buffer_lock_reserve(buffer, TRACE_SPECIAL, - sizeof(*entry), 0, pc); - if (!event) - return; - entry = ring_buffer_event_data(event); - entry->arg1 = arg1; - entry->arg2 = arg2; - entry->arg3 = arg3; - - if (!filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, 0, pc); -} - -void -__trace_special(void *__tr, void *__data, - unsigned long arg1, unsigned long arg2, unsigned long arg3) -{ - ftrace_trace_special(__tr, arg1, arg2, arg3, preempt_count()); -} - -void -ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) -{ - struct trace_array *tr = &global_trace; - struct trace_array_cpu *data; - unsigned long flags; - int cpu; - int pc; - - if (tracing_disabled) - return; - - pc = preempt_count(); - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - - if (likely(atomic_inc_return(&data->disabled) == 1)) - ftrace_trace_special(tr, arg1, arg2, arg3, pc); - - atomic_dec(&data->disabled); - local_irq_restore(flags); -} - /** * trace_vbprintk - write binary msg to tracing buffer * diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2114b4c1150f..638a5887e2ec 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -22,7 +22,6 @@ enum trace_type { TRACE_STACK, TRACE_PRINT, TRACE_BPRINT, - TRACE_SPECIAL, TRACE_MMIO_RW, TRACE_MMIO_MAP, TRACE_BRANCH, @@ -189,7 +188,6 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ - IF_ASSIGN(var, ent, struct special_entry, 0); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ TRACE_MMIO_RW); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ @@ -332,11 +330,6 @@ void tracing_sched_wakeup_trace(struct trace_array *tr, struct task_struct *wakee, struct task_struct *cur, unsigned long flags, int pc); -void trace_special(struct trace_array *tr, - struct trace_array_cpu *data, - unsigned long arg1, - unsigned long arg2, - unsigned long arg3, int pc); void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 84128371f254..e3dfecaf13e6 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -150,23 +150,6 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry, ) ); -/* - * Special (free-form) trace entry: - */ -FTRACE_ENTRY(special, special_entry, - - TRACE_SPECIAL, - - F_STRUCT( - __field( unsigned long, arg1 ) - __field( unsigned long, arg2 ) - __field( unsigned long, arg3 ) - ), - - F_printk("(%08lx) (%08lx) (%08lx)", - __entry->arg1, __entry->arg2, __entry->arg3) -); - /* * Stack-trace entry: */ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 57c1b4596470..a46197b80b7f 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1069,65 +1069,6 @@ static struct trace_event trace_wake_event = { .funcs = &trace_wake_funcs, }; -/* TRACE_SPECIAL */ -static enum print_line_t trace_special_print(struct trace_iterator *iter, - int flags, struct trace_event *event) -{ - struct special_entry *field; - - trace_assign_type(field, iter->ent); - - if (!trace_seq_printf(&iter->seq, "# %ld %ld %ld\n", - field->arg1, - field->arg2, - field->arg3)) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t trace_special_hex(struct trace_iterator *iter, - int flags, struct trace_event *event) -{ - struct special_entry *field; - struct trace_seq *s = &iter->seq; - - trace_assign_type(field, iter->ent); - - SEQ_PUT_HEX_FIELD_RET(s, field->arg1); - SEQ_PUT_HEX_FIELD_RET(s, field->arg2); - SEQ_PUT_HEX_FIELD_RET(s, field->arg3); - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t trace_special_bin(struct trace_iterator *iter, - int flags, struct trace_event *event) -{ - struct special_entry *field; - struct trace_seq *s = &iter->seq; - - trace_assign_type(field, iter->ent); - - SEQ_PUT_FIELD_RET(s, field->arg1); - SEQ_PUT_FIELD_RET(s, field->arg2); - SEQ_PUT_FIELD_RET(s, field->arg3); - - return TRACE_TYPE_HANDLED; -} - -static struct trace_event_functions trace_special_funcs = { - .trace = trace_special_print, - .raw = trace_special_print, - .hex = trace_special_hex, - .binary = trace_special_bin, -}; - -static struct trace_event trace_special_event = { - .type = TRACE_SPECIAL, - .funcs = &trace_special_funcs, -}; - /* TRACE_STACK */ static enum print_line_t trace_stack_print(struct trace_iterator *iter, @@ -1161,9 +1102,6 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, static struct trace_event_functions trace_stack_funcs = { .trace = trace_stack_print, - .raw = trace_special_print, - .hex = trace_special_hex, - .binary = trace_special_bin, }; static struct trace_event trace_stack_event = { @@ -1194,9 +1132,6 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, static struct trace_event_functions trace_user_stack_funcs = { .trace = trace_user_stack_print, - .raw = trace_special_print, - .hex = trace_special_hex, - .binary = trace_special_bin, }; static struct trace_event trace_user_stack_event = { @@ -1314,7 +1249,6 @@ static struct trace_event *events[] __initdata = { &trace_fn_event, &trace_ctx_event, &trace_wake_event, - &trace_special_event, &trace_stack_event, &trace_user_stack_event, &trace_bprint_event, diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 6ed05ee6cbc7..155a415b3209 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -13,7 +13,6 @@ static inline int trace_valid_entry(struct trace_entry *entry) case TRACE_WAKE: case TRACE_STACK: case TRACE_PRINT: - case TRACE_SPECIAL: case TRACE_BRANCH: case TRACE_GRAPH_ENT: case TRACE_GRAPH_RET: -- cgit v1.2.3-70-g09d2 From b444786f1a797a7f84e2561346a670649f9c7b3c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 7 Jul 2010 23:40:11 +0200 Subject: tracing: Use generic_file_llseek for debugfs The default for llseek will change to no_llseek, so the tracing debugfs files need to add explicit .llseek assignments. Since we're dealing with regular files from a VFS perspective, use generic_file_llseek. Signed-off-by: Arnd Bergmann Cc: Steven Rostedt Cc: Ingo Molnar Cc: John Kacur Cc: Li Zefan LKML-Reference: <1278538820-1392-10-git-send-email-arnd@arndb.de> Signed-off-by: Frederic Weisbecker --- kernel/trace/trace.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d9a4aa02c384..c1752dac613e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2338,6 +2338,7 @@ static const struct file_operations show_traces_fops = { .open = show_traces_open, .read = seq_read, .release = seq_release, + .llseek = seq_lseek, }; /* @@ -2431,6 +2432,7 @@ static const struct file_operations tracing_cpumask_fops = { .open = tracing_open_generic, .read = tracing_cpumask_read, .write = tracing_cpumask_write, + .llseek = generic_file_llseek, }; static int tracing_trace_options_show(struct seq_file *m, void *v) @@ -2597,6 +2599,7 @@ tracing_readme_read(struct file *filp, char __user *ubuf, static const struct file_operations tracing_readme_fops = { .open = tracing_open_generic, .read = tracing_readme_read, + .llseek = generic_file_llseek, }; static ssize_t @@ -2647,6 +2650,7 @@ tracing_saved_cmdlines_read(struct file *file, char __user *ubuf, static const struct file_operations tracing_saved_cmdlines_fops = { .open = tracing_open_generic, .read = tracing_saved_cmdlines_read, + .llseek = generic_file_llseek, }; static ssize_t @@ -2976,6 +2980,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) if (iter->trace->pipe_open) iter->trace->pipe_open(iter); + nonseekable_open(inode, filp); out: mutex_unlock(&trace_types_lock); return ret; @@ -3534,18 +3539,21 @@ static const struct file_operations tracing_max_lat_fops = { .open = tracing_open_generic, .read = tracing_max_lat_read, .write = tracing_max_lat_write, + .llseek = generic_file_llseek, }; static const struct file_operations tracing_ctrl_fops = { .open = tracing_open_generic, .read = tracing_ctrl_read, .write = tracing_ctrl_write, + .llseek = generic_file_llseek, }; static const struct file_operations set_tracer_fops = { .open = tracing_open_generic, .read = tracing_set_trace_read, .write = tracing_set_trace_write, + .llseek = generic_file_llseek, }; static const struct file_operations tracing_pipe_fops = { @@ -3554,17 +3562,20 @@ static const struct file_operations tracing_pipe_fops = { .read = tracing_read_pipe, .splice_read = tracing_splice_read_pipe, .release = tracing_release_pipe, + .llseek = no_llseek, }; static const struct file_operations tracing_entries_fops = { .open = tracing_open_generic, .read = tracing_entries_read, .write = tracing_entries_write, + .llseek = generic_file_llseek, }; static const struct file_operations tracing_mark_fops = { .open = tracing_open_generic, .write = tracing_mark_write, + .llseek = generic_file_llseek, }; static const struct file_operations trace_clock_fops = { @@ -3870,6 +3881,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf, static const struct file_operations tracing_stats_fops = { .open = tracing_open_generic, .read = tracing_stats_read, + .llseek = generic_file_llseek, }; #ifdef CONFIG_DYNAMIC_FTRACE @@ -3906,6 +3918,7 @@ tracing_read_dyn_info(struct file *filp, char __user *ubuf, static const struct file_operations tracing_dyn_info_fops = { .open = tracing_open_generic, .read = tracing_read_dyn_info, + .llseek = generic_file_llseek, }; #endif @@ -4059,6 +4072,7 @@ static const struct file_operations trace_options_fops = { .open = tracing_open_generic, .read = trace_options_read, .write = trace_options_write, + .llseek = generic_file_llseek, }; static ssize_t @@ -4110,6 +4124,7 @@ static const struct file_operations trace_options_core_fops = { .open = tracing_open_generic, .read = trace_options_core_read, .write = trace_options_core_write, + .llseek = generic_file_llseek, }; struct dentry *trace_create_file(const char *name, -- cgit v1.2.3-70-g09d2 From e870e9a1240bcef1157ffaaf71dac63362e71904 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 2 Jul 2010 11:07:32 +0800 Subject: tracing: Allow to disable cmdline recording We found that even enabling a single trace event that will rarely be triggered can add big overhead to context switch. (lmbench context switch test) ------------------------------------------------- 2p/0K 2p/16K 2p/64K 8p/16K 8p/64K 16p/16K 16p/64K ctxsw ctxsw ctxsw ctxsw ctxsw ctxsw ctxsw ------ ------ ------ ------ ------ ------- ------- 2.19 2.3 2.21 2.56 2.13 2.54 2.07 2.39 2.51 2.35 2.75 2.27 2.81 2.24 The overhead is 6% ~ 11%. It's because when a trace event is enabled 3 tracepoints (sched_switch, sched_wakeup, sched_wakeup_new) will be activated to map pid to cmdname. We'd like to avoid this overhead, so add a trace option '(no)record-cmd' to allow to disable cmdline recording. Signed-off-by: Li Zefan LKML-Reference: <4C2D57F4.2050204@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 7 +++++-- kernel/trace/trace.c | 6 +++++- kernel/trace/trace.h | 3 +++ kernel/trace/trace_events.c | 30 ++++++++++++++++++++++++++++-- 4 files changed, 41 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 01df7ca4ead7..2b7b1395b4d5 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -152,11 +152,13 @@ extern int ftrace_event_reg(struct ftrace_event_call *event, enum { TRACE_EVENT_FL_ENABLED_BIT, TRACE_EVENT_FL_FILTERED_BIT, + TRACE_EVENT_FL_RECORDED_CMD_BIT, }; enum { - TRACE_EVENT_FL_ENABLED = (1 << TRACE_EVENT_FL_ENABLED_BIT), - TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT), + TRACE_EVENT_FL_ENABLED = (1 << TRACE_EVENT_FL_ENABLED_BIT), + TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT), + TRACE_EVENT_FL_RECORDED_CMD = (1 << TRACE_EVENT_FL_RECORDED_CMD_BIT), }; struct ftrace_event_call { @@ -174,6 +176,7 @@ struct ftrace_event_call { * 32 bit flags: * bit 1: enabled * bit 2: filter_active + * bit 3: enabled cmd record * * Changes to flags must hold the event_mutex. * diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8683dec6946b..af9042977c08 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -344,7 +344,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); /* trace_flags holds trace_options default values */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | - TRACE_ITER_GRAPH_TIME; + TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD; static int trace_stop_count; static DEFINE_SPINLOCK(tracing_start_lock); @@ -428,6 +428,7 @@ static const char *trace_options[] = { "latency-format", "sleep-time", "graph-time", + "record-cmd", NULL }; @@ -2561,6 +2562,9 @@ static void set_tracer_flags(unsigned int mask, int enabled) trace_flags |= mask; else trace_flags &= ~mask; + + if (mask == TRACE_ITER_RECORD_CMD) + trace_event_enable_cmd_record(enabled); } static ssize_t diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 84d3f123e86f..7778f067fc8b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -591,6 +591,7 @@ enum trace_iterator_flags { TRACE_ITER_LATENCY_FMT = 0x20000, TRACE_ITER_SLEEP_TIME = 0x40000, TRACE_ITER_GRAPH_TIME = 0x80000, + TRACE_ITER_RECORD_CMD = 0x100000, }; /* @@ -723,6 +724,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, return 0; } +extern void trace_event_enable_cmd_record(bool enable); + extern struct mutex event_mutex; extern struct list_head ftrace_events; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index e8e6043f4d29..09b4fa6e4d3b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -170,6 +170,26 @@ int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type) } EXPORT_SYMBOL_GPL(ftrace_event_reg); +void trace_event_enable_cmd_record(bool enable) +{ + struct ftrace_event_call *call; + + mutex_lock(&event_mutex); + list_for_each_entry(call, &ftrace_events, list) { + if (!(call->flags & TRACE_EVENT_FL_ENABLED)) + continue; + + if (enable) { + tracing_start_cmdline_record(); + call->flags |= TRACE_EVENT_FL_RECORDED_CMD; + } else { + tracing_stop_cmdline_record(); + call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; + } + } + mutex_unlock(&event_mutex); +} + static int ftrace_event_enable_disable(struct ftrace_event_call *call, int enable) { @@ -179,13 +199,19 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call, case 0: if (call->flags & TRACE_EVENT_FL_ENABLED) { call->flags &= ~TRACE_EVENT_FL_ENABLED; - tracing_stop_cmdline_record(); + if (call->flags & TRACE_EVENT_FL_RECORDED_CMD) { + tracing_stop_cmdline_record(); + call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; + } call->class->reg(call, TRACE_REG_UNREGISTER); } break; case 1: if (!(call->flags & TRACE_EVENT_FL_ENABLED)) { - tracing_start_cmdline_record(); + if (trace_flags & TRACE_ITER_RECORD_CMD) { + tracing_start_cmdline_record(); + call->flags |= TRACE_EVENT_FL_RECORDED_CMD; + } ret = call->class->reg(call, TRACE_REG_REGISTER); if (ret) { tracing_stop_cmdline_record(); -- cgit v1.2.3-70-g09d2 From 985023dee6e212493831431ba2e3ce8918f001b2 Mon Sep 17 00:00:00 2001 From: Richard Kennedy Date: Thu, 25 Mar 2010 11:27:36 +0000 Subject: trace: Reorder struct ring_buffer_per_cpu to remove padding on 64bit Reorder structure to remove 8 bytes of padding on 64 bit builds. This shrinks the size to 128 bytes so allowing allocation from a smaller slab & needed one fewer cache lines. Signed-off-by: Richard Kennedy LKML-Reference: <1269516456.2054.8.camel@localhost> Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 28d0615a513f..3632ce87674f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -443,6 +443,7 @@ int ring_buffer_print_page_header(struct trace_seq *s) */ struct ring_buffer_per_cpu { int cpu; + atomic_t record_disabled; struct ring_buffer *buffer; spinlock_t reader_lock; /* serialize readers */ arch_spinlock_t lock; @@ -462,7 +463,6 @@ struct ring_buffer_per_cpu { unsigned long read; u64 write_stamp; u64 read_stamp; - atomic_t record_disabled; }; struct ring_buffer { -- cgit v1.2.3-70-g09d2 From bc289ae98b75d93228d24f521ef02a076e506e94 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 3 Jun 2010 18:26:24 +0800 Subject: tracing: Reduce latency and remove percpu trace_seq __print_flags() and __print_symbolic() use percpu trace_seq: 1) Its memory is allocated at compile time, it wastes memory if we don't use tracing. 2) It is percpu data and it wastes more memory for multi-cpus system. 3) It disables preemption when it executes its core routine "trace_seq_printf(s, "%s: ", #call);" and introduces latency. So we move this trace_seq to struct trace_iterator. Signed-off-by: Lai Jiangshan LKML-Reference: <4C078350.7090106@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 5 +++-- include/trace/ftrace.h | 12 +++--------- kernel/trace/trace_output.c | 3 --- 3 files changed, 6 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 2b7b1395b4d5..02b8b24f8f51 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -11,8 +11,6 @@ struct trace_array; struct tracer; struct dentry; -DECLARE_PER_CPU(struct trace_seq, ftrace_event_seq); - struct trace_print_flags { unsigned long mask; const char *name; @@ -58,6 +56,9 @@ struct trace_iterator { struct ring_buffer_iter *buffer_iter[NR_CPUS]; unsigned long iter_flags; + /* trace_seq for __print_flags() and __print_symbolic() etc. */ + struct trace_seq tmp_seq; + /* The below is zeroed out in pipe_read */ struct trace_seq seq; struct trace_entry *ent; diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 55c1fd1bbc3d..fb783d94fc54 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -145,7 +145,7 @@ * struct trace_seq *s = &iter->seq; * struct ftrace_raw_ *field; <-- defined in stage 1 * struct trace_entry *entry; - * struct trace_seq *p; + * struct trace_seq *p = &iter->tmp_seq; * int ret; * * entry = iter->ent; @@ -157,12 +157,10 @@ * * field = (typeof(field))entry; * - * p = &get_cpu_var(ftrace_event_seq); * trace_seq_init(p); * ret = trace_seq_printf(s, "%s: ", ); * if (ret) * ret = trace_seq_printf(s, "\n"); - * put_cpu(); * if (!ret) * return TRACE_TYPE_PARTIAL_LINE; * @@ -216,7 +214,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags, \ struct trace_seq *s = &iter->seq; \ struct ftrace_raw_##call *field; \ struct trace_entry *entry; \ - struct trace_seq *p; \ + struct trace_seq *p = &iter->tmp_seq; \ int ret; \ \ event = container_of(trace_event, struct ftrace_event_call, \ @@ -231,12 +229,10 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags, \ \ field = (typeof(field))entry; \ \ - p = &get_cpu_var(ftrace_event_seq); \ trace_seq_init(p); \ ret = trace_seq_printf(s, "%s: ", event->name); \ if (ret) \ ret = trace_seq_printf(s, print); \ - put_cpu(); \ if (!ret) \ return TRACE_TYPE_PARTIAL_LINE; \ \ @@ -255,7 +251,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags, \ struct trace_seq *s = &iter->seq; \ struct ftrace_raw_##template *field; \ struct trace_entry *entry; \ - struct trace_seq *p; \ + struct trace_seq *p = &iter->tmp_seq; \ int ret; \ \ entry = iter->ent; \ @@ -267,12 +263,10 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags, \ \ field = (typeof(field))entry; \ \ - p = &get_cpu_var(ftrace_event_seq); \ trace_seq_init(p); \ ret = trace_seq_printf(s, "%s: ", #call); \ if (ret) \ ret = trace_seq_printf(s, print); \ - put_cpu(); \ if (!ret) \ return TRACE_TYPE_PARTIAL_LINE; \ \ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 57c1b4596470..1ba64d3cc567 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -16,9 +16,6 @@ DECLARE_RWSEM(trace_event_mutex); -DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq); -EXPORT_PER_CPU_SYMBOL(ftrace_event_seq); - static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; static int next_event_type = __TRACE_LAST_TYPE + 1; -- cgit v1.2.3-70-g09d2 From ef710e100c1068d3dd5774d2b34c5485219e06ce Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Thu, 1 Jul 2010 14:34:35 +0900 Subject: tracing: Shrink max latency ringbuffer if unnecessary Documentation/trace/ftrace.txt says buffer_size_kb: This sets or displays the number of kilobytes each CPU buffer can hold. The tracer buffers are the same size for each CPU. The displayed number is the size of the CPU buffer and not total size of all buffers. The trace buffers are allocated in pages (blocks of memory that the kernel uses for allocation, usually 4 KB in size). If the last page allocated has room for more bytes than requested, the rest of the page will be used, making the actual allocation bigger than requested. ( Note, the size may not be a multiple of the page size due to buffer management overhead. ) This can only be updated when the current_tracer is set to "nop". But it's incorrect. currently total memory consumption is 'buffer_size_kb x CPUs x 2'. Why two times difference is there? because ftrace implicitly allocate the buffer for max latency too. That makes sad result when admin want to use large buffer. (If admin want full logging and makes detail analysis). example, If admin have 24 CPUs machine and write 200MB to buffer_size_kb, the system consume ~10GB memory (200MB x 24 x 2). umm.. 5GB memory waste is usually unacceptable. Fortunatelly, almost all users don't use max latency feature. The max latency buffer can be disabled easily. This patch shrink buffer size of the max latency buffer if unnecessary. Signed-off-by: KOSAKI Motohiro LKML-Reference: <20100701104554.DA2D.A69D9226@jp.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 38 ++++++++++++++++++++++++++++++++------ kernel/trace/trace.h | 1 + kernel/trace/trace_irqsoff.c | 3 +++ kernel/trace/trace_sched_wakeup.c | 2 ++ 4 files changed, 38 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index af9042977c08..f7488f44d26b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -660,6 +660,10 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) return; WARN_ON_ONCE(!irqs_disabled()); + if (!current_trace->use_max_tr) { + WARN_ON_ONCE(1); + return; + } arch_spin_lock(&ftrace_max_lock); tr->buffer = max_tr.buffer; @@ -686,6 +690,11 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) return; WARN_ON_ONCE(!irqs_disabled()); + if (!current_trace->use_max_tr) { + WARN_ON_ONCE(1); + return; + } + arch_spin_lock(&ftrace_max_lock); ftrace_disable_cpu(); @@ -2801,6 +2810,9 @@ static int tracing_resize_ring_buffer(unsigned long size) if (ret < 0) return ret; + if (!current_trace->use_max_tr) + goto out; + ret = ring_buffer_resize(max_tr.buffer, size); if (ret < 0) { int r; @@ -2828,11 +2840,14 @@ static int tracing_resize_ring_buffer(unsigned long size) return ret; } + max_tr.entries = size; + out: global_trace.entries = size; return ret; } + /** * tracing_update_buffers - used by tracing facility to expand ring buffers * @@ -2893,12 +2908,26 @@ static int tracing_set_tracer(const char *buf) trace_branch_disable(); if (current_trace && current_trace->reset) current_trace->reset(tr); - + if (current_trace && current_trace->use_max_tr) { + /* + * We don't free the ring buffer. instead, resize it because + * The max_tr ring buffer has some state (e.g. ring->clock) and + * we want preserve it. + */ + ring_buffer_resize(max_tr.buffer, 1); + max_tr.entries = 1; + } destroy_trace_option_files(topts); current_trace = t; topts = create_trace_option_files(current_trace); + if (current_trace->use_max_tr) { + ret = ring_buffer_resize(max_tr.buffer, global_trace.entries); + if (ret < 0) + goto out; + max_tr.entries = global_trace.entries; + } if (t->init) { ret = tracer_init(t, tr); @@ -3480,7 +3509,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, } tracing_start(); - max_tr.entries = global_trace.entries; mutex_unlock(&trace_types_lock); return cnt; @@ -4578,16 +4606,14 @@ __init static int tracer_alloc_buffers(void) #ifdef CONFIG_TRACER_MAX_TRACE - max_tr.buffer = ring_buffer_alloc(ring_buf_size, - TRACE_BUFFER_FLAGS); + max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS); if (!max_tr.buffer) { printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); WARN_ON(1); ring_buffer_free(global_trace.buffer); goto out_free_cpumask; } - max_tr.entries = ring_buffer_size(max_tr.buffer); - WARN_ON(max_tr.entries != global_trace.entries); + max_tr.entries = 1; #endif /* Allocate the first page for all buffers */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 7778f067fc8b..cb629b3b108c 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -276,6 +276,7 @@ struct tracer { struct tracer *next; int print_max; struct tracer_flags *flags; + int use_max_tr; }; diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 6fd486e0cef4..73a6b0601f2e 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -649,6 +649,7 @@ static struct tracer irqsoff_tracer __read_mostly = #endif .open = irqsoff_trace_open, .close = irqsoff_trace_close, + .use_max_tr = 1, }; # define register_irqsoff(trace) register_tracer(&trace) #else @@ -681,6 +682,7 @@ static struct tracer preemptoff_tracer __read_mostly = #endif .open = irqsoff_trace_open, .close = irqsoff_trace_close, + .use_max_tr = 1, }; # define register_preemptoff(trace) register_tracer(&trace) #else @@ -715,6 +717,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly = #endif .open = irqsoff_trace_open, .close = irqsoff_trace_close, + .use_max_tr = 1, }; # define register_preemptirqsoff(trace) register_tracer(&trace) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index c9fd5bd02036..4086eae6e81b 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -382,6 +382,7 @@ static struct tracer wakeup_tracer __read_mostly = #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, #endif + .use_max_tr = 1, }; static struct tracer wakeup_rt_tracer __read_mostly = @@ -396,6 +397,7 @@ static struct tracer wakeup_rt_tracer __read_mostly = #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, #endif + .use_max_tr = 1, }; __init static int init_wakeup_tracer(void) -- cgit v1.2.3-70-g09d2 From 24a461d537f49f9da6533d83100999ea08c6c755 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 10 Jul 2010 12:06:44 +0200 Subject: trace: strlen() return doesn't account for the NULL We need to add one to the strlen() return because of the NULL character. The type->name here generally comes from the kernel and I don't think any of them come close to being MAX_TRACER_SIZE (100) characters long so this is basically a cleanup. Signed-off-by: Dan Carpenter LKML-Reference: <20100710100644.GV19184@bicker> Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f7488f44d26b..cacb6f083ecb 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -739,7 +739,7 @@ __acquires(kernel_lock) return -1; } - if (strlen(type->name) > MAX_TRACER_SIZE) { + if (strlen(type->name) >= MAX_TRACER_SIZE) { pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE); return -1; } -- cgit v1.2.3-70-g09d2 From 669336e4cf3e1cb95800f3f5924558a76d723c21 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 20 Jul 2010 17:29:54 +0200 Subject: perf: Use tracepoint_synchronize_unregister() to flush any pending tracepoint call We use synchronize_sched() to ensure a tracepoint won't be called while/after we release the perf buffers it references. But the tracepoint API has its own API for that: tracepoint_synchronize_unregister(). Use it instead as it's self-explanatory and eases maintainance. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Mathieu Desnoyers Cc: Steven Rostedt Cc: Li Zefan --- kernel/trace/trace_event_perf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 23751659582e..000e6e85b445 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -131,10 +131,10 @@ void perf_trace_destroy(struct perf_event *p_event) tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER); /* - * Ensure our callback won't be called anymore. See - * tracepoint_probe_unregister() and __DO_TRACE(). + * Ensure our callback won't be called anymore. The buffers + * will be freed after that. */ - synchronize_sched(); + tracepoint_synchronize_unregister(); free_percpu(tp_event->perf_events); tp_event->perf_events = NULL; -- cgit v1.2.3-70-g09d2 From 9da79ab83ee33ddc1fdd0858fd3d70925a1bde99 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Wed, 30 Jun 2010 14:15:48 +0530 Subject: tracing/kprobes: unregister_trace_probe needs to be called under mutex Comment in unregister_trace_probe() says probe_lock will be held when it gets called. However there is a case where it might called without the probe_lock being held. Also since we are traversing the probe_list and deleting an element from the probe_list, probe_lock should be held. This was first pointed in uprobes traceevent review by Frederic Weisbecker here. (http://lkml.org/lkml/2010/5/12/106) Cc: Ingo Molnar Cc: Masami Hiramatsu Acked-by: Masami Hiramatsu Acked-by: Steven Rostedt LKML-Reference: <20100630084548.GA10325@linux.vnet.ibm.com> Signed-off-by: Srikar Dronamraju Signed-off-by: Arnaldo Carvalho de Melo --- kernel/trace/trace_kprobe.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 1b79d1c15726..8b27c9849b42 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -925,14 +925,17 @@ static int create_trace_probe(int argc, char **argv) pr_info("Delete command needs an event name.\n"); return -EINVAL; } + mutex_lock(&probe_lock); tp = find_probe_event(event, group); if (!tp) { + mutex_unlock(&probe_lock); pr_info("Event %s/%s doesn't exist.\n", group, event); return -ENOENT; } /* delete an event */ unregister_trace_probe(tp); free_trace_probe(tp); + mutex_unlock(&probe_lock); return 0; } -- cgit v1.2.3-70-g09d2