diff options
Diffstat (limited to 'kernel')
64 files changed, 1817 insertions, 475 deletions
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index 94fabd534b03..2a202a846757 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -55,4 +55,4 @@ config HZ default 1000 if HZ_1000 config SCHED_HRTICK - def_bool HIGH_RES_TIMERS && (!SMP || USE_GENERIC_SMP_HELPERS) + def_bool HIGH_RES_TIMERS diff --git a/kernel/Makefile b/kernel/Makefile index a4d1aa8da9bc..09a9c94f42bd 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -7,22 +7,19 @@ obj-y = fork.o exec_domain.o panic.o \ sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ extable.o params.o posix-timers.o \ - kthread.o sys_ni.o posix-cpu-timers.o mutex.o \ - hrtimer.o rwsem.o nsproxy.o semaphore.o \ + kthread.o sys_ni.o posix-cpu-timers.o \ + hrtimer.o nsproxy.o \ notifier.o ksysfs.o cred.o reboot.o \ - async.o range.o groups.o lglock.o smpboot.o + async.o range.o groups.o smpboot.o ifdef CONFIG_FUNCTION_TRACER # Do not trace debug files and internal ftrace files -CFLAGS_REMOVE_lockdep.o = -pg -CFLAGS_REMOVE_lockdep_proc.o = -pg -CFLAGS_REMOVE_mutex-debug.o = -pg -CFLAGS_REMOVE_rtmutex-debug.o = -pg CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_irq_work.o = -pg endif obj-y += sched/ +obj-y += locking/ obj-y += power/ obj-y += printk/ obj-y += cpu/ @@ -34,26 +31,15 @@ obj-$(CONFIG_FREEZER) += freezer.o obj-$(CONFIG_PROFILING) += profile.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ -obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o -obj-$(CONFIG_LOCKDEP) += lockdep.o -ifeq ($(CONFIG_PROC_FS),y) -obj-$(CONFIG_LOCKDEP) += lockdep_proc.o -endif obj-$(CONFIG_FUTEX) += futex.o ifeq ($(CONFIG_COMPAT),y) obj-$(CONFIG_FUTEX) += futex_compat.o endif -obj-$(CONFIG_RT_MUTEXES) += rtmutex.o -obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o -obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += smp.o ifneq ($(CONFIG_SMP),y) obj-y += up.o endif -obj-$(CONFIG_SMP) += spinlock.o -obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o -obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o diff --git a/kernel/bounds.c b/kernel/bounds.c index e8ca97b5c386..5253204afdca 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -11,6 +11,7 @@ #include <linux/kbuild.h> #include <linux/page_cgroup.h> #include <linux/log2.h> +#include <linux/spinlock_types.h> void foo(void) { @@ -21,5 +22,6 @@ void foo(void) #ifdef CONFIG_SMP DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); #endif + DEFINE(BLOATED_SPINLOCKS, sizeof(spinlock_t) > sizeof(int)); /* End of constants */ } diff --git a/kernel/cpu.c b/kernel/cpu.c index 973d034acf84..deff2e693766 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -306,7 +306,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) __func__, cpu); goto out_release; } - smpboot_park_threads(cpu); /* * By now we've cleared cpu_active_mask, wait for all preempt-disabled @@ -315,12 +314,16 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) * * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might * not imply sync_sched(), so explicitly call both. + * + * Do sync before park smpboot threads to take care the rcu boost case. */ #ifdef CONFIG_PREEMPT synchronize_sched(); #endif synchronize_rcu(); + smpboot_park_threads(cpu); + /* * So now all preempt/rcu users must observe !cpu_active(). */ diff --git a/kernel/fork.c b/kernel/fork.c index f6d11fc67f72..728d5be9548c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -532,7 +532,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) mm->flags = (current->mm) ? (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; mm->core_state = NULL; - mm->nr_ptes = 0; + atomic_long_set(&mm->nr_ptes, 0); memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); mm_init_aio(mm); @@ -560,7 +560,7 @@ static void check_mm(struct mm_struct *mm) "mm:%p idx:%d val:%ld\n", mm, i, x); } -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS VM_BUG_ON(mm->pmd_huge_pte); #endif } @@ -814,7 +814,7 @@ struct mm_struct *dup_mm(struct task_struct *tsk) memcpy(mm, oldmm, sizeof(*mm)); mm_init_cpumask(mm); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS mm->pmd_huge_pte = NULL; #endif if (!mm_init(mm, tsk)) diff --git a/kernel/futex.c b/kernel/futex.c index c3a1a55a5214..80ba086f021d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -66,7 +66,7 @@ #include <asm/futex.h> -#include "rtmutex_common.h" +#include "locking/rtmutex_common.h" int __read_mostly futex_cmpxchg_enabled; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 3e97fb126e6b..9328b80eaf14 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -16,11 +16,12 @@ #include <linux/export.h> #include <linux/sysctl.h> #include <linux/utsname.h> +#include <trace/events/sched.h> /* * The number of tasks checked: */ -unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; +int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; /* * Limit number of tasks checked in a batch. @@ -92,6 +93,9 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) t->last_switch_count = switch_count; return; } + + trace_sched_process_hang(t); + if (!sysctl_hung_task_warnings) return; sysctl_hung_task_warnings--; @@ -203,6 +207,14 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, return ret; } +static atomic_t reset_hung_task = ATOMIC_INIT(0); + +void reset_hung_task_detector(void) +{ + atomic_set(&reset_hung_task, 1); +} +EXPORT_SYMBOL_GPL(reset_hung_task_detector); + /* * kthread which checks for tasks stuck in D state */ @@ -216,6 +228,9 @@ static int watchdog(void *dummy) while (schedule_timeout_interruptible(timeout_jiffies(timeout))) timeout = sysctl_hung_task_timeout_secs; + if (atomic_xchg(&reset_hung_task, 0)) + continue; + check_hung_uninterruptible_tasks(timeout); } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index a3bb14fbe5c6..dc04c166c54d 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -214,7 +214,7 @@ void irq_enable(struct irq_desc *desc) } /** - * irq_disable - Mark interupt disabled + * irq_disable - Mark interrupt disabled * @desc: irq descriptor which should be disabled * * If the chip does not implement the irq_disable callback, we diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 3e59f951d42f..481a13c43b17 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -786,7 +786,7 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) } /* - * Interrupts explicitely requested as threaded interupts want to be + * Interrupts explicitly requested as threaded interrupts want to be * preemtible - many of them need to sleep and wait for slow busses to * complete. */ diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 1162f1030f18..3320b84cc60f 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -14,6 +14,7 @@ enum { _IRQ_NO_BALANCING = IRQ_NO_BALANCING, _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD, _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID, + _IRQ_IS_POLLED = IRQ_IS_POLLED, _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, }; @@ -26,6 +27,7 @@ enum { #define IRQ_NOAUTOEN GOT_YOU_MORON #define IRQ_NESTED_THREAD GOT_YOU_MORON #define IRQ_PER_CPU_DEVID GOT_YOU_MORON +#define IRQ_IS_POLLED GOT_YOU_MORON #undef IRQF_MODIFY_MASK #define IRQF_MODIFY_MASK GOT_YOU_MORON @@ -147,3 +149,8 @@ static inline bool irq_settings_is_nested_thread(struct irq_desc *desc) { return desc->status_use_accessors & _IRQ_NESTED_THREAD; } + +static inline bool irq_settings_is_polled(struct irq_desc *desc) +{ + return desc->status_use_accessors & _IRQ_IS_POLLED; +} diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 7b5f012bde9d..a1d8cc63b56e 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -67,8 +67,13 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) raw_spin_lock(&desc->lock); - /* PER_CPU and nested thread interrupts are never polled */ - if (irq_settings_is_per_cpu(desc) || irq_settings_is_nested_thread(desc)) + /* + * PER_CPU, nested thread interrupts and interrupts explicitely + * marked polled are excluded from polling. + */ + if (irq_settings_is_per_cpu(desc) || + irq_settings_is_nested_thread(desc) || + irq_settings_is_polled(desc)) goto out; /* @@ -268,7 +273,8 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc, void note_interrupt(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) { - if (desc->istate & IRQS_POLL_INPROGRESS) + if (desc->istate & IRQS_POLL_INPROGRESS || + irq_settings_is_polled(desc)) return; /* we get here again via the threaded handler */ diff --git a/kernel/kexec.c b/kernel/kexec.c index 2a74f307c5ec..490afc03627e 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -921,7 +921,7 @@ static int kimage_load_segment(struct kimage *image, * reinitialize them. * * - A machine specific part that includes the syscall number - * and the copies the image to it's final destination. And + * and then copies the image to it's final destination. And * jumps into the image at entry. * * kexec does not sync, or unmount filesystems so if you need diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile new file mode 100644 index 000000000000..baab8e5e7f66 --- /dev/null +++ b/kernel/locking/Makefile @@ -0,0 +1,25 @@ + +obj-y += mutex.o semaphore.o rwsem.o lglock.o + +ifdef CONFIG_FUNCTION_TRACER +CFLAGS_REMOVE_lockdep.o = -pg +CFLAGS_REMOVE_lockdep_proc.o = -pg +CFLAGS_REMOVE_mutex-debug.o = -pg +CFLAGS_REMOVE_rtmutex-debug.o = -pg +endif + +obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o +obj-$(CONFIG_LOCKDEP) += lockdep.o +ifeq ($(CONFIG_PROC_FS),y) +obj-$(CONFIG_LOCKDEP) += lockdep_proc.o +endif +obj-$(CONFIG_SMP) += spinlock.o +obj-$(CONFIG_PROVE_LOCKING) += spinlock.o +obj-$(CONFIG_RT_MUTEXES) += rtmutex.o +obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o +obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o +obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o +obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o +obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o +obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o +obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o diff --git a/kernel/lglock.c b/kernel/locking/lglock.c index 86ae2aebf004..86ae2aebf004 100644 --- a/kernel/lglock.c +++ b/kernel/locking/lglock.c diff --git a/kernel/lockdep.c b/kernel/locking/lockdep.c index 4e8e14c34e42..576ba756a32d 100644 --- a/kernel/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1232,7 +1232,7 @@ static int noop_count(struct lock_list *entry, void *data) return 0; } -unsigned long __lockdep_count_forward_deps(struct lock_list *this) +static unsigned long __lockdep_count_forward_deps(struct lock_list *this) { unsigned long count = 0; struct lock_list *uninitialized_var(target_entry); @@ -1258,7 +1258,7 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class) return ret; } -unsigned long __lockdep_count_backward_deps(struct lock_list *this) +static unsigned long __lockdep_count_backward_deps(struct lock_list *this) { unsigned long count = 0; struct lock_list *uninitialized_var(target_entry); diff --git a/kernel/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 4f560cfedc8f..4f560cfedc8f 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h diff --git a/kernel/lockdep_proc.c b/kernel/locking/lockdep_proc.c index b2c71c5873e4..ef43ac4bafb5 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -421,6 +421,7 @@ static void seq_lock_time(struct seq_file *m, struct lock_time *lt) seq_time(m, lt->min); seq_time(m, lt->max); seq_time(m, lt->total); + seq_time(m, lt->nr ? div_s64(lt->total, lt->nr) : 0); } static void seq_stats(struct seq_file *m, struct lock_stat_data *data) @@ -518,20 +519,20 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) } if (i) { seq_puts(m, "\n"); - seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1)); + seq_line(m, '.', 0, 40 + 1 + 12 * (14 + 1)); seq_puts(m, "\n"); } } static void seq_header(struct seq_file *m) { - seq_printf(m, "lock_stat version 0.3\n"); + seq_puts(m, "lock_stat version 0.4\n"); if (unlikely(!debug_locks)) seq_printf(m, "*WARNING* lock debugging disabled!! - possibly due to a lockdep warning\n"); - seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); - seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s " + seq_line(m, '-', 0, 40 + 1 + 12 * (14 + 1)); + seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s %14s %14s " "%14s %14s\n", "class name", "con-bounces", @@ -539,12 +540,14 @@ static void seq_header(struct seq_file *m) "waittime-min", "waittime-max", "waittime-total", + "waittime-avg", "acq-bounces", "acquisitions", "holdtime-min", "holdtime-max", - "holdtime-total"); - seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); + "holdtime-total", + "holdtime-avg"); + seq_line(m, '-', 0, 40 + 1 + 12 * (14 + 1)); seq_printf(m, "\n"); } diff --git a/kernel/lockdep_states.h b/kernel/locking/lockdep_states.h index 995b0cc2b84c..995b0cc2b84c 100644 --- a/kernel/lockdep_states.h +++ b/kernel/locking/lockdep_states.h diff --git a/kernel/mutex-debug.c b/kernel/locking/mutex-debug.c index 7e3443fe1f48..7e3443fe1f48 100644 --- a/kernel/mutex-debug.c +++ b/kernel/locking/mutex-debug.c diff --git a/kernel/mutex-debug.h b/kernel/locking/mutex-debug.h index 0799fd3e4cfa..0799fd3e4cfa 100644 --- a/kernel/mutex-debug.h +++ b/kernel/locking/mutex-debug.h diff --git a/kernel/mutex.c b/kernel/locking/mutex.c index d24105b1b794..4dd6e4c219de 100644 --- a/kernel/mutex.c +++ b/kernel/locking/mutex.c @@ -1,5 +1,5 @@ /* - * kernel/mutex.c + * kernel/locking/mutex.c * * Mutexes: blocking mutual exclusion locks * diff --git a/kernel/mutex.h b/kernel/locking/mutex.h index 4115fbf83b12..4115fbf83b12 100644 --- a/kernel/mutex.h +++ b/kernel/locking/mutex.h diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c new file mode 100644 index 000000000000..652a8ee8efe9 --- /dev/null +++ b/kernel/locking/percpu-rwsem.c @@ -0,0 +1,165 @@ +#include <linux/atomic.h> +#include <linux/rwsem.h> +#include <linux/percpu.h> +#include <linux/wait.h> +#include <linux/lockdep.h> +#include <linux/percpu-rwsem.h> +#include <linux/rcupdate.h> +#include <linux/sched.h> +#include <linux/errno.h> + +int __percpu_init_rwsem(struct percpu_rw_semaphore *brw, + const char *name, struct lock_class_key *rwsem_key) +{ + brw->fast_read_ctr = alloc_percpu(int); + if (unlikely(!brw->fast_read_ctr)) + return -ENOMEM; + + /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ + __init_rwsem(&brw->rw_sem, name, rwsem_key); + atomic_set(&brw->write_ctr, 0); + atomic_set(&brw->slow_read_ctr, 0); + init_waitqueue_head(&brw->write_waitq); + return 0; +} + +void percpu_free_rwsem(struct percpu_rw_semaphore *brw) +{ + free_percpu(brw->fast_read_ctr); + brw->fast_read_ctr = NULL; /* catch use after free bugs */ +} + +/* + * This is the fast-path for down_read/up_read, it only needs to ensure + * there is no pending writer (atomic_read(write_ctr) == 0) and inc/dec the + * fast per-cpu counter. The writer uses synchronize_sched_expedited() to + * serialize with the preempt-disabled section below. + * + * The nontrivial part is that we should guarantee acquire/release semantics + * in case when + * + * R_W: down_write() comes after up_read(), the writer should see all + * changes done by the reader + * or + * W_R: down_read() comes after up_write(), the reader should see all + * changes done by the writer + * + * If this helper fails the callers rely on the normal rw_semaphore and + * atomic_dec_and_test(), so in this case we have the necessary barriers. + * + * But if it succeeds we do not have any barriers, atomic_read(write_ctr) or + * __this_cpu_add() below can be reordered with any LOAD/STORE done by the + * reader inside the critical section. See the comments in down_write and + * up_write below. + */ +static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val) +{ + bool success = false; + + preempt_disable(); + if (likely(!atomic_read(&brw->write_ctr))) { + __this_cpu_add(*brw->fast_read_ctr, val); + success = true; + } + preempt_enable(); + + return success; +} + +/* + * Like the normal down_read() this is not recursive, the writer can + * come after the first percpu_down_read() and create the deadlock. + * + * Note: returns with lock_is_held(brw->rw_sem) == T for lockdep, + * percpu_up_read() does rwsem_release(). This pairs with the usage + * of ->rw_sem in percpu_down/up_write(). + */ +void percpu_down_read(struct percpu_rw_semaphore *brw) +{ + might_sleep(); + if (likely(update_fast_ctr(brw, +1))) { + rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_); + return; + } + + down_read(&brw->rw_sem); + atomic_inc(&brw->slow_read_ctr); + /* avoid up_read()->rwsem_release() */ + __up_read(&brw->rw_sem); +} + +void percpu_up_read(struct percpu_rw_semaphore *brw) +{ + rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_); + + if (likely(update_fast_ctr(brw, -1))) + return; + + /* false-positive is possible but harmless */ + if (atomic_dec_and_test(&brw->slow_read_ctr)) + wake_up_all(&brw->write_waitq); +} + +static int clear_fast_ctr(struct percpu_rw_semaphore *brw) +{ + unsigned int sum = 0; + int cpu; + + for_each_possible_cpu(cpu) { + sum += per_cpu(*brw->fast_read_ctr, cpu); + per_cpu(*brw->fast_read_ctr, cpu) = 0; + } + + return sum; +} + +/* + * A writer increments ->write_ctr to force the readers to switch to the + * slow mode, note the atomic_read() check in update_fast_ctr(). + * + * After that the readers can only inc/dec the slow ->slow_read_ctr counter, + * ->fast_read_ctr is stable. Once the writer moves its sum into the slow + * counter it represents the number of active readers. + * + * Finally the writer takes ->rw_sem for writing and blocks the new readers, + * then waits until the slow counter becomes zero. + */ +void percpu_down_write(struct percpu_rw_semaphore *brw) +{ + /* tell update_fast_ctr() there is a pending writer */ + atomic_inc(&brw->write_ctr); + /* + * 1. Ensures that write_ctr != 0 is visible to any down_read/up_read + * so that update_fast_ctr() can't succeed. + * + * 2. Ensures we see the result of every previous this_cpu_add() in + * update_fast_ctr(). + * + * 3. Ensures that if any reader has exited its critical section via + * fast-path, it executes a full memory barrier before we return. + * See R_W case in the comment above update_fast_ctr(). + */ + synchronize_sched_expedited(); + + /* exclude other writers, and block the new readers completely */ + down_write(&brw->rw_sem); + + /* nobody can use fast_read_ctr, move its sum into slow_read_ctr */ + atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr); + + /* wait for all readers to complete their percpu_up_read() */ + wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr)); +} + +void percpu_up_write(struct percpu_rw_semaphore *brw) +{ + /* release the lock, but the readers can't use the fast-path */ + up_write(&brw->rw_sem); + /* + * Insert the barrier before the next fast-path in down_read, + * see W_R case in the comment above update_fast_ctr(). + */ + synchronize_sched_expedited(); + /* the last writer unblocks update_fast_ctr() */ + atomic_dec(&brw->write_ctr); +} diff --git a/kernel/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index 13b243a323fa..13b243a323fa 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c diff --git a/kernel/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h index 14193d596d78..14193d596d78 100644 --- a/kernel/rtmutex-debug.h +++ b/kernel/locking/rtmutex-debug.h diff --git a/kernel/rtmutex-tester.c b/kernel/locking/rtmutex-tester.c index 1d96dd0d93c1..1d96dd0d93c1 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/locking/rtmutex-tester.c diff --git a/kernel/rtmutex.c b/kernel/locking/rtmutex.c index 0dd6aec1cb6a..0dd6aec1cb6a 100644 --- a/kernel/rtmutex.c +++ b/kernel/locking/rtmutex.c diff --git a/kernel/rtmutex.h b/kernel/locking/rtmutex.h index a1a1dd06421d..a1a1dd06421d 100644 --- a/kernel/rtmutex.h +++ b/kernel/locking/rtmutex.h diff --git a/kernel/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 53a66c85261b..53a66c85261b 100644 --- a/kernel/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c new file mode 100644 index 000000000000..9be8a9144978 --- /dev/null +++ b/kernel/locking/rwsem-spinlock.c @@ -0,0 +1,296 @@ +/* rwsem-spinlock.c: R/W semaphores: contention handling functions for + * generic spinlock implementation + * + * Copyright (c) 2001 David Howells (dhowells@redhat.com). + * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de> + * - Derived also from comments by Linus + */ +#include <linux/rwsem.h> +#include <linux/sched.h> +#include <linux/export.h> + +enum rwsem_waiter_type { + RWSEM_WAITING_FOR_WRITE, + RWSEM_WAITING_FOR_READ +}; + +struct rwsem_waiter { + struct list_head list; + struct task_struct *task; + enum rwsem_waiter_type type; +}; + +int rwsem_is_locked(struct rw_semaphore *sem) +{ + int ret = 1; + unsigned long flags; + + if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) { + ret = (sem->activity != 0); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + } + return ret; +} +EXPORT_SYMBOL(rwsem_is_locked); + +/* + * initialise the semaphore + */ +void __init_rwsem(struct rw_semaphore *sem, const char *name, + struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held semaphore: + */ + debug_check_no_locks_freed((void *)sem, sizeof(*sem)); + lockdep_init_map(&sem->dep_map, name, key, 0); +#endif + sem->activity = 0; + raw_spin_lock_init(&sem->wait_lock); + INIT_LIST_HEAD(&sem->wait_list); +} +EXPORT_SYMBOL(__init_rwsem); + +/* + * handle the lock release when processes blocked on it that can now run + * - if we come here, then: + * - the 'active count' _reached_ zero + * - the 'waiting count' is non-zero + * - the spinlock must be held by the caller + * - woken process blocks are discarded from the list after having task zeroed + * - writers are only woken if wakewrite is non-zero + */ +static inline struct rw_semaphore * +__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) +{ + struct rwsem_waiter *waiter; + struct task_struct *tsk; + int woken; + + waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); + + if (waiter->type == RWSEM_WAITING_FOR_WRITE) { + if (wakewrite) + /* Wake up a writer. Note that we do not grant it the + * lock - it will have to acquire it when it runs. */ + wake_up_process(waiter->task); + goto out; + } + + /* grant an infinite number of read locks to the front of the queue */ + woken = 0; + do { + struct list_head *next = waiter->list.next; + + list_del(&waiter->list); + tsk = waiter->task; + smp_mb(); + waiter->task = NULL; + wake_up_process(tsk); + put_task_struct(tsk); + woken++; + if (next == &sem->wait_list) + break; + waiter = list_entry(next, struct rwsem_waiter, list); + } while (waiter->type != RWSEM_WAITING_FOR_WRITE); + + sem->activity += woken; + + out: + return sem; +} + +/* + * wake a single writer + */ +static inline struct rw_semaphore * +__rwsem_wake_one_writer(struct rw_semaphore *sem) +{ + struct rwsem_waiter *waiter; + + waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); + wake_up_process(waiter->task); + + return sem; +} + +/* + * get a read lock on the semaphore + */ +void __sched __down_read(struct rw_semaphore *sem) +{ + struct rwsem_waiter waiter; + struct task_struct *tsk; + unsigned long flags; + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + if (sem->activity >= 0 && list_empty(&sem->wait_list)) { + /* granted */ + sem->activity++; + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + goto out; + } + + tsk = current; + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + + /* set up my own style of waitqueue */ + waiter.task = tsk; + waiter.type = RWSEM_WAITING_FOR_READ; + get_task_struct(tsk); + + list_add_tail(&waiter.list, &sem->wait_list); + + /* we don't need to touch the semaphore struct anymore */ + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + + /* wait to be given the lock */ + for (;;) { + if (!waiter.task) + break; + schedule(); + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + } + + tsk->state = TASK_RUNNING; + out: + ; +} + +/* + * trylock for reading -- returns 1 if successful, 0 if contention + */ +int __down_read_trylock(struct rw_semaphore *sem) +{ + unsigned long flags; + int ret = 0; + + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + if (sem->activity >= 0 && list_empty(&sem->wait_list)) { + /* granted */ + sem->activity++; + ret = 1; + } + + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + + return ret; +} + +/* + * get a write lock on the semaphore + */ +void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) +{ + struct rwsem_waiter waiter; + struct task_struct *tsk; + unsigned long flags; + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + /* set up my own style of waitqueue */ + tsk = current; + waiter.task = tsk; + waiter.type = RWSEM_WAITING_FOR_WRITE; + list_add_tail(&waiter.list, &sem->wait_list); + + /* wait for someone to release the lock */ + for (;;) { + /* + * That is the key to support write lock stealing: allows the + * task already on CPU to get the lock soon rather than put + * itself into sleep and waiting for system woke it or someone + * else in the head of the wait list up. + */ + if (sem->activity == 0) + break; + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + schedule(); + raw_spin_lock_irqsave(&sem->wait_lock, flags); + } + /* got the lock */ + sem->activity = -1; + list_del(&waiter.list); + + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); +} + +void __sched __down_write(struct rw_semaphore *sem) +{ + __down_write_nested(sem, 0); +} + +/* + * trylock for writing -- returns 1 if successful, 0 if contention + */ +int __down_write_trylock(struct rw_semaphore *sem) +{ + unsigned long flags; + int ret = 0; + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + if (sem->activity == 0) { + /* got the lock */ + sem->activity = -1; + ret = 1; + } + + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + + return ret; +} + +/* + * release a read lock on the semaphore + */ +void __up_read(struct rw_semaphore *sem) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + if (--sem->activity == 0 && !list_empty(&sem->wait_list)) + sem = __rwsem_wake_one_writer(sem); + + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); +} + +/* + * release a write lock on the semaphore + */ +void __up_write(struct rw_semaphore *sem) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + sem->activity = 0; + if (!list_empty(&sem->wait_list)) + sem = __rwsem_do_wake(sem, 1); + + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); +} + +/* + * downgrade a write lock into a read lock + * - just wake up any readers at the front of the queue + */ +void __downgrade_write(struct rw_semaphore *sem) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + sem->activity = 1; + if (!list_empty(&sem->wait_list)) + sem = __rwsem_do_wake(sem, 0); + + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); +} + diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c new file mode 100644 index 000000000000..19c5fa95e0b4 --- /dev/null +++ b/kernel/locking/rwsem-xadd.c @@ -0,0 +1,293 @@ +/* rwsem.c: R/W semaphores: contention handling functions + * + * Written by David Howells (dhowells@redhat.com). + * Derived from arch/i386/kernel/semaphore.c + * + * Writer lock-stealing by Alex Shi <alex.shi@intel.com> + * and Michel Lespinasse <walken@google.com> + */ +#include <linux/rwsem.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <linux/export.h> + +/* + * Initialize an rwsem: + */ +void __init_rwsem(struct rw_semaphore *sem, const char *name, + struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held semaphore: + */ + debug_check_no_locks_freed((void *)sem, sizeof(*sem)); + lockdep_init_map(&sem->dep_map, name, key, 0); +#endif + sem->count = RWSEM_UNLOCKED_VALUE; + raw_spin_lock_init(&sem->wait_lock); + INIT_LIST_HEAD(&sem->wait_list); +} + +EXPORT_SYMBOL(__init_rwsem); + +enum rwsem_waiter_type { + RWSEM_WAITING_FOR_WRITE, + RWSEM_WAITING_FOR_READ +}; + +struct rwsem_waiter { + struct list_head list; + struct task_struct *task; + enum rwsem_waiter_type type; +}; + +enum rwsem_wake_type { + RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */ + RWSEM_WAKE_READERS, /* Wake readers only */ + RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */ +}; + +/* + * handle the lock release when processes blocked on it that can now run + * - if we come here from up_xxxx(), then: + * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed) + * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so) + * - there must be someone on the queue + * - the spinlock must be held by the caller + * - woken process blocks are discarded from the list after having task zeroed + * - writers are only woken if downgrading is false + */ +static struct rw_semaphore * +__rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) +{ + struct rwsem_waiter *waiter; + struct task_struct *tsk; + struct list_head *next; + long oldcount, woken, loop, adjustment; + + waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); + if (waiter->type == RWSEM_WAITING_FOR_WRITE) { + if (wake_type == RWSEM_WAKE_ANY) + /* Wake writer at the front of the queue, but do not + * grant it the lock yet as we want other writers + * to be able to steal it. Readers, on the other hand, + * will block as they will notice the queued writer. + */ + wake_up_process(waiter->task); + goto out; + } + + /* Writers might steal the lock before we grant it to the next reader. + * We prefer to do the first reader grant before counting readers + * so we can bail out early if a writer stole the lock. + */ + adjustment = 0; + if (wake_type != RWSEM_WAKE_READ_OWNED) { + adjustment = RWSEM_ACTIVE_READ_BIAS; + try_reader_grant: + oldcount = rwsem_atomic_update(adjustment, sem) - adjustment; + if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { + /* A writer stole the lock. Undo our reader grant. */ + if (rwsem_atomic_update(-adjustment, sem) & + RWSEM_ACTIVE_MASK) + goto out; + /* Last active locker left. Retry waking readers. */ + goto try_reader_grant; + } + } + + /* Grant an infinite number of read locks to the readers at the front + * of the queue. Note we increment the 'active part' of the count by + * the number of readers before waking any processes up. + */ + woken = 0; + do { + woken++; + + if (waiter->list.next == &sem->wait_list) + break; + + waiter = list_entry(waiter->list.next, + struct rwsem_waiter, list); + + } while (waiter->type != RWSEM_WAITING_FOR_WRITE); + + adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; + if (waiter->type != RWSEM_WAITING_FOR_WRITE) + /* hit end of list above */ + adjustment -= RWSEM_WAITING_BIAS; + + if (adjustment) + rwsem_atomic_add(adjustment, sem); + + next = sem->wait_list.next; + loop = woken; + do { + waiter = list_entry(next, struct rwsem_waiter, list); + next = waiter->list.next; + tsk = waiter->task; + smp_mb(); + waiter->task = NULL; + wake_up_process(tsk); + put_task_struct(tsk); + } while (--loop); + + sem->wait_list.next = next; + next->prev = &sem->wait_list; + + out: + return sem; +} + +/* + * wait for the read lock to be granted + */ +struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) +{ + long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; + struct rwsem_waiter waiter; + struct task_struct *tsk = current; + + /* set up my own style of waitqueue */ + waiter.task = tsk; + waiter.type = RWSEM_WAITING_FOR_READ; + get_task_struct(tsk); + + raw_spin_lock_irq(&sem->wait_lock); + if (list_empty(&sem->wait_list)) + adjustment += RWSEM_WAITING_BIAS; + list_add_tail(&waiter.list, &sem->wait_list); + + /* we're now waiting on the lock, but no longer actively locking */ + count = rwsem_atomic_update(adjustment, sem); + + /* If there are no active locks, wake the front queued process(es). + * + * If there are no writers and we are first in the queue, + * wake our own waiter to join the existing active readers ! + */ + if (count == RWSEM_WAITING_BIAS || + (count > RWSEM_WAITING_BIAS && + adjustment != -RWSEM_ACTIVE_READ_BIAS)) + sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); + + raw_spin_unlock_irq(&sem->wait_lock); + + /* wait to be given the lock */ + while (true) { + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (!waiter.task) + break; + schedule(); + } + + tsk->state = TASK_RUNNING; + + return sem; +} + +/* + * wait until we successfully acquire the write lock + */ +struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) +{ + long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS; + struct rwsem_waiter waiter; + struct task_struct *tsk = current; + + /* set up my own style of waitqueue */ + waiter.task = tsk; + waiter.type = RWSEM_WAITING_FOR_WRITE; + + raw_spin_lock_irq(&sem->wait_lock); + if (list_empty(&sem->wait_list)) + adjustment += RWSEM_WAITING_BIAS; + list_add_tail(&waiter.list, &sem->wait_list); + + /* we're now waiting on the lock, but no longer actively locking */ + count = rwsem_atomic_update(adjustment, sem); + + /* If there were already threads queued before us and there are no + * active writers, the lock must be read owned; so we try to wake + * any read locks that were queued ahead of us. */ + if (count > RWSEM_WAITING_BIAS && + adjustment == -RWSEM_ACTIVE_WRITE_BIAS) + sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); + + /* wait until we successfully acquire the lock */ + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + while (true) { + if (!(count & RWSEM_ACTIVE_MASK)) { + /* Try acquiring the write lock. */ + count = RWSEM_ACTIVE_WRITE_BIAS; + if (!list_is_singular(&sem->wait_list)) + count += RWSEM_WAITING_BIAS; + + if (sem->count == RWSEM_WAITING_BIAS && + cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) == + RWSEM_WAITING_BIAS) + break; + } + + raw_spin_unlock_irq(&sem->wait_lock); + + /* Block until there are no active lockers. */ + do { + schedule(); + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + } while ((count = sem->count) & RWSEM_ACTIVE_MASK); + + raw_spin_lock_irq(&sem->wait_lock); + } + + list_del(&waiter.list); + raw_spin_unlock_irq(&sem->wait_lock); + tsk->state = TASK_RUNNING; + + return sem; +} + +/* + * handle waking up a waiter on the semaphore + * - up_read/up_write has decremented the active part of count if we come here + */ +struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + /* do nothing if list empty */ + if (!list_empty(&sem->wait_list)) + sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); + + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + + return sem; +} + +/* + * downgrade a write lock into a read lock + * - caller incremented waiting part of count and discovered it still negative + * - just wake up any readers at the front of the queue + */ +struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + /* do nothing if list empty */ + if (!list_empty(&sem->wait_list)) + sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); + + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + + return sem; +} + +EXPORT_SYMBOL(rwsem_down_read_failed); +EXPORT_SYMBOL(rwsem_down_write_failed); +EXPORT_SYMBOL(rwsem_wake); +EXPORT_SYMBOL(rwsem_downgrade_wake); diff --git a/kernel/rwsem.c b/kernel/locking/rwsem.c index cfff1435bdfb..cfff1435bdfb 100644 --- a/kernel/rwsem.c +++ b/kernel/locking/rwsem.c diff --git a/kernel/semaphore.c b/kernel/locking/semaphore.c index 6815171a4fff..6815171a4fff 100644 --- a/kernel/semaphore.c +++ b/kernel/locking/semaphore.c diff --git a/kernel/spinlock.c b/kernel/locking/spinlock.c index 4b082b5cac9e..4b082b5cac9e 100644 --- a/kernel/spinlock.c +++ b/kernel/locking/spinlock.c diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c new file mode 100644 index 000000000000..0374a596cffa --- /dev/null +++ b/kernel/locking/spinlock_debug.c @@ -0,0 +1,302 @@ +/* + * Copyright 2005, Red Hat, Inc., Ingo Molnar + * Released under the General Public License (GPL). + * + * This file contains the spinlock/rwlock implementations for + * DEBUG_SPINLOCK. + */ + +#include <linux/spinlock.h> +#include <linux/nmi.h> +#include <linux/interrupt.h> +#include <linux/debug_locks.h> +#include <linux/delay.h> +#include <linux/export.h> + +void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, + struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map(&lock->dep_map, name, key, 0); +#endif + lock->raw_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; + lock->magic = SPINLOCK_MAGIC; + lock->owner = SPINLOCK_OWNER_INIT; + lock->owner_cpu = -1; +} + +EXPORT_SYMBOL(__raw_spin_lock_init); + +void __rwlock_init(rwlock_t *lock, const char *name, + struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map(&lock->dep_map, name, key, 0); +#endif + lock->raw_lock = (arch_rwlock_t) __ARCH_RW_LOCK_UNLOCKED; + lock->magic = RWLOCK_MAGIC; + lock->owner = SPINLOCK_OWNER_INIT; + lock->owner_cpu = -1; +} + +EXPORT_SYMBOL(__rwlock_init); + +static void spin_dump(raw_spinlock_t *lock, const char *msg) +{ + struct task_struct *owner = NULL; + + if (lock->owner && lock->owner != SPINLOCK_OWNER_INIT) + owner = lock->owner; + printk(KERN_EMERG "BUG: spinlock %s on CPU#%d, %s/%d\n", + msg, raw_smp_processor_id(), + current->comm, task_pid_nr(current)); + printk(KERN_EMERG " lock: %pS, .magic: %08x, .owner: %s/%d, " + ".owner_cpu: %d\n", + lock, lock->magic, + owner ? owner->comm : "<none>", + owner ? task_pid_nr(owner) : -1, + lock->owner_cpu); + dump_stack(); +} + +static void spin_bug(raw_spinlock_t *lock, const char *msg) +{ + if (!debug_locks_off()) + return; + + spin_dump(lock, msg); +} + +#define SPIN_BUG_ON(cond, lock, msg) if (unlikely(cond)) spin_bug(lock, msg) + +static inline void +debug_spin_lock_before(raw_spinlock_t *lock) +{ + SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic"); + SPIN_BUG_ON(lock->owner == current, lock, "recursion"); + SPIN_BUG_ON(lock->owner_cpu == raw_smp_processor_id(), + lock, "cpu recursion"); +} + +static inline void debug_spin_lock_after(raw_spinlock_t *lock) +{ + lock->owner_cpu = raw_smp_processor_id(); + lock->owner = current; +} + +static inline void debug_spin_unlock(raw_spinlock_t *lock) +{ + SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic"); + SPIN_BUG_ON(!raw_spin_is_locked(lock), lock, "already unlocked"); + SPIN_BUG_ON(lock->owner != current, lock, "wrong owner"); + SPIN_BUG_ON(lock->owner_cpu != raw_smp_processor_id(), + lock, "wrong CPU"); + lock->owner = SPINLOCK_OWNER_INIT; + lock->owner_cpu = -1; +} + +static void __spin_lock_debug(raw_spinlock_t *lock) +{ + u64 i; + u64 loops = loops_per_jiffy * HZ; + + for (i = 0; i < loops; i++) { + if (arch_spin_trylock(&lock->raw_lock)) + return; + __delay(1); + } + /* lockup suspected: */ + spin_dump(lock, "lockup suspected"); +#ifdef CONFIG_SMP + trigger_all_cpu_backtrace(); +#endif + + /* + * The trylock above was causing a livelock. Give the lower level arch + * specific lock code a chance to acquire the lock. We have already + * printed a warning/backtrace at this point. The non-debug arch + * specific code might actually succeed in acquiring the lock. If it is + * not successful, the end-result is the same - there is no forward + * progress. + */ + arch_spin_lock(&lock->raw_lock); +} + +void do_raw_spin_lock(raw_spinlock_t *lock) +{ + debug_spin_lock_before(lock); + if (unlikely(!arch_spin_trylock(&lock->raw_lock))) + __spin_lock_debug(lock); + debug_spin_lock_after(lock); +} + +int do_raw_spin_trylock(raw_spinlock_t *lock) +{ + int ret = arch_spin_trylock(&lock->raw_lock); + + if (ret) + debug_spin_lock_after(lock); +#ifndef CONFIG_SMP + /* + * Must not happen on UP: + */ + SPIN_BUG_ON(!ret, lock, "trylock failure on UP"); +#endif + return ret; +} + +void do_raw_spin_unlock(raw_spinlock_t *lock) +{ + debug_spin_unlock(lock); + arch_spin_unlock(&lock->raw_lock); +} + +static void rwlock_bug(rwlock_t *lock, const char *msg) +{ + if (!debug_locks_off()) + return; + + printk(KERN_EMERG "BUG: rwlock %s on CPU#%d, %s/%d, %p\n", + msg, raw_smp_processor_id(), current->comm, + task_pid_nr(current), lock); + dump_stack(); +} + +#define RWLOCK_BUG_ON(cond, lock, msg) if (unlikely(cond)) rwlock_bug(lock, msg) + +#if 0 /* __write_lock_debug() can lock up - maybe this can too? */ +static void __read_lock_debug(rwlock_t *lock) +{ + u64 i; + u64 loops = loops_per_jiffy * HZ; + int print_once = 1; + + for (;;) { + for (i = 0; i < loops; i++) { + if (arch_read_trylock(&lock->raw_lock)) + return; + __delay(1); + } + /* lockup suspected: */ + if (print_once) { + print_once = 0; + printk(KERN_EMERG "BUG: read-lock lockup on CPU#%d, " + "%s/%d, %p\n", + raw_smp_processor_id(), current->comm, + current->pid, lock); + dump_stack(); + } + } +} +#endif + +void do_raw_read_lock(rwlock_t *lock) +{ + RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); + arch_read_lock(&lock->raw_lock); +} + +int do_raw_read_trylock(rwlock_t *lock) +{ + int ret = arch_read_trylock(&lock->raw_lock); + +#ifndef CONFIG_SMP + /* + * Must not happen on UP: + */ + RWLOCK_BUG_ON(!ret, lock, "trylock failure on UP"); +#endif + return ret; +} + +void do_raw_read_unlock(rwlock_t *lock) +{ + RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); + arch_read_unlock(&lock->raw_lock); +} + +static inline void debug_write_lock_before(rwlock_t *lock) +{ + RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); + RWLOCK_BUG_ON(lock->owner == current, lock, "recursion"); + RWLOCK_BUG_ON(lock->owner_cpu == raw_smp_processor_id(), + lock, "cpu recursion"); +} + +static inline void debug_write_lock_after(rwlock_t *lock) +{ + lock->owner_cpu = raw_smp_processor_id(); + lock->owner = current; +} + +static inline void debug_write_unlock(rwlock_t *lock) +{ + RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); + RWLOCK_BUG_ON(lock->owner != current, lock, "wrong owner"); + RWLOCK_BUG_ON(lock->owner_cpu != raw_smp_processor_id(), + lock, "wrong CPU"); + lock->owner = SPINLOCK_OWNER_INIT; + lock->owner_cpu = -1; +} + +#if 0 /* This can cause lockups */ +static void __write_lock_debug(rwlock_t *lock) +{ + u64 i; + u64 loops = loops_per_jiffy * HZ; + int print_once = 1; + + for (;;) { + for (i = 0; i < loops; i++) { + if (arch_write_trylock(&lock->raw_lock)) + return; + __delay(1); + } + /* lockup suspected: */ + if (print_once) { + print_once = 0; + printk(KERN_EMERG "BUG: write-lock lockup on CPU#%d, " + "%s/%d, %p\n", + raw_smp_processor_id(), current->comm, + current->pid, lock); + dump_stack(); + } + } +} +#endif + +void do_raw_write_lock(rwlock_t *lock) +{ + debug_write_lock_before(lock); + arch_write_lock(&lock->raw_lock); + debug_write_lock_after(lock); +} + +int do_raw_write_trylock(rwlock_t *lock) +{ + int ret = arch_write_trylock(&lock->raw_lock); + + if (ret) + debug_write_lock_after(lock); +#ifndef CONFIG_SMP + /* + * Must not happen on UP: + */ + RWLOCK_BUG_ON(!ret, lock, "trylock failure on UP"); +#endif + return ret; +} + +void do_raw_write_unlock(rwlock_t *lock) +{ + debug_write_unlock(lock); + arch_write_unlock(&lock->raw_lock); +} diff --git a/kernel/module.c b/kernel/module.c index af5ebd21d77b..f5a3b1e8ec51 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -641,8 +641,6 @@ static int module_unload_init(struct module *mod) /* Hold reference count during initialization. */ __this_cpu_write(mod->refptr->incs, 1); - /* Backwards compatibility macros put refcount during init. */ - mod->waiter = current; return 0; } @@ -768,16 +766,9 @@ static int __try_stop_module(void *_sref) static int try_stop_module(struct module *mod, int flags, int *forced) { - if (flags & O_NONBLOCK) { - struct stopref sref = { mod, flags, forced }; + struct stopref sref = { mod, flags, forced }; - return stop_machine(__try_stop_module, &sref, NULL); - } else { - /* We don't need to stop the machine for this. */ - mod->state = MODULE_STATE_GOING; - synchronize_sched(); - return 0; - } + return stop_machine(__try_stop_module, &sref, NULL); } unsigned long module_refcount(struct module *mod) @@ -810,21 +801,6 @@ EXPORT_SYMBOL(module_refcount); /* This exists whether we can unload or not */ static void free_module(struct module *mod); -static void wait_for_zero_refcount(struct module *mod) -{ - /* Since we might sleep for some time, release the mutex first */ - mutex_unlock(&module_mutex); - for (;;) { - pr_debug("Looking at refcount...\n"); - set_current_state(TASK_UNINTERRUPTIBLE); - if (module_refcount(mod) == 0) - break; - schedule(); - } - current->state = TASK_RUNNING; - mutex_lock(&module_mutex); -} - SYSCALL_DEFINE2(delete_module, const char __user *, name_user, unsigned int, flags) { @@ -839,6 +815,11 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, return -EFAULT; name[MODULE_NAME_LEN-1] = '\0'; + if (!(flags & O_NONBLOCK)) { + printk(KERN_WARNING + "waiting module removal not supported: please upgrade"); + } + if (mutex_lock_interruptible(&module_mutex) != 0) return -EINTR; @@ -856,8 +837,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, /* Doing init or already dying? */ if (mod->state != MODULE_STATE_LIVE) { - /* FIXME: if (force), slam module count and wake up - waiter --RR */ + /* FIXME: if (force), slam module count damn the torpedoes */ pr_debug("%s already dying\n", mod->name); ret = -EBUSY; goto out; @@ -873,18 +853,11 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, } } - /* Set this up before setting mod->state */ - mod->waiter = current; - /* Stop the machine so refcounts can't move and disable module. */ ret = try_stop_module(mod, flags, &forced); if (ret != 0) goto out; - /* Never wait if forced. */ - if (!forced && module_refcount(mod) != 0) - wait_for_zero_refcount(mod); - mutex_unlock(&module_mutex); /* Final destruction now no one is using it. */ if (mod->exit != NULL) @@ -1002,9 +975,6 @@ void module_put(struct module *module) __this_cpu_inc(module->refptr->decs); trace_module_put(module, _RET_IP_); - /* Maybe they're waiting for us to drop reference? */ - if (unlikely(!module_is_live(module))) - wake_up_process(module->waiter); preempt_enable(); } } @@ -2728,7 +2698,7 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) return 0; } -static void find_module_sections(struct module *mod, struct load_info *info) +static int find_module_sections(struct module *mod, struct load_info *info) { mod->kp = section_objs(info, "__param", sizeof(*mod->kp), &mod->num_kp); @@ -2758,6 +2728,18 @@ static void find_module_sections(struct module *mod, struct load_info *info) #ifdef CONFIG_CONSTRUCTORS mod->ctors = section_objs(info, ".ctors", sizeof(*mod->ctors), &mod->num_ctors); + if (!mod->ctors) + mod->ctors = section_objs(info, ".init_array", + sizeof(*mod->ctors), &mod->num_ctors); + else if (find_sec(info, ".init_array")) { + /* + * This shouldn't happen with same compiler and binutils + * building all parts of the module. + */ + printk(KERN_WARNING "%s: has both .ctors and .init_array.\n", + mod->name); + return -EINVAL; + } #endif #ifdef CONFIG_TRACEPOINTS @@ -2795,6 +2777,8 @@ static void find_module_sections(struct module *mod, struct load_info *info) info->debug = section_objs(info, "__verbose", sizeof(*info->debug), &info->num_debug); + + return 0; } static int move_module(struct module *mod, struct load_info *info) @@ -3248,7 +3232,9 @@ static int load_module(struct load_info *info, const char __user *uargs, /* Now we've got everything in the final locations, we can * find optional sections. */ - find_module_sections(mod, info); + err = find_module_sections(mod, info); + if (err) + goto free_unload; err = check_module_license_and_versions(mod); if (err) diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index d444c4e834f4..2fac9cc79b3d 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -178,6 +178,22 @@ config PM_SLEEP_DEBUG def_bool y depends on PM_DEBUG && PM_SLEEP +config DPM_WATCHDOG + bool "Device suspend/resume watchdog" + depends on PM_DEBUG && PSTORE + ---help--- + Sets up a watchdog timer to capture drivers that are + locked up attempting to suspend/resume a device. + A detected lockup causes system panic with message + captured in pstore device for inspection in subsequent + boot session. + +config DPM_WATCHDOG_TIMEOUT + int "Watchdog timeout in seconds" + range 1 120 + default 12 + depends on DPM_WATCHDOG + config PM_TRACE bool help diff --git a/kernel/power/qos.c b/kernel/power/qos.c index a394297f8b2f..8dff9b48075a 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -558,30 +558,12 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, if (count == sizeof(s32)) { if (copy_from_user(&value, buf, sizeof(s32))) return -EFAULT; - } else if (count <= 11) { /* ASCII perhaps? */ - char ascii_value[11]; - unsigned long int ulval; + } else { int ret; - if (copy_from_user(ascii_value, buf, count)) - return -EFAULT; - - if (count > 10) { - if (ascii_value[10] == '\n') - ascii_value[10] = '\0'; - else - return -EINVAL; - } else { - ascii_value[count] = '\0'; - } - ret = kstrtoul(ascii_value, 16, &ulval); - if (ret) { - pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret); - return -EINVAL; - } - value = (s32)lower_32_bits(ulval); - } else { - return -EINVAL; + ret = kstrtos32_from_user(buf, count, 16, &value); + if (ret) + return ret; } req = filp->private_data; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 98c3b34a4cff..10c22cae83a0 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1402,7 +1402,11 @@ int hibernate_preallocate_memory(void) * highmem and non-highmem zones separately. */ pages_highmem = preallocate_image_highmem(highmem / 2); - alloc = (count - max_size) - pages_highmem; + alloc = count - max_size; + if (alloc > pages_highmem) + alloc -= pages_highmem; + else + alloc = 0; pages = preallocate_image_memory(alloc, avail_normal); if (pages < alloc) { /* We have exhausted non-highmem pages, try highmem. */ diff --git a/kernel/power/user.c b/kernel/power/user.c index 957f06164ad1..24850270c802 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -36,9 +36,9 @@ static struct snapshot_data { struct snapshot_handle handle; int swap; int mode; - char frozen; - char ready; - char platform_support; + bool frozen; + bool ready; + bool platform_support; bool free_bitmaps; } snapshot_state; @@ -93,9 +93,9 @@ static int snapshot_open(struct inode *inode, struct file *filp) if (error) atomic_inc(&snapshot_device_available); - data->frozen = 0; - data->ready = 0; - data->platform_support = 0; + data->frozen = false; + data->ready = false; + data->platform_support = false; Unlock: unlock_system_sleep(); @@ -229,7 +229,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, if (error) thaw_processes(); else - data->frozen = 1; + data->frozen = true; break; @@ -240,7 +240,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, free_basic_memory_bitmaps(); data->free_bitmaps = false; thaw_processes(); - data->frozen = 0; + data->frozen = false; break; case SNAPSHOT_CREATE_IMAGE: @@ -270,7 +270,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, case SNAPSHOT_FREE: swsusp_free(); memset(&data->handle, 0, sizeof(struct snapshot_handle)); - data->ready = 0; + data->ready = false; /* * It is necessary to thaw kernel threads here, because * SNAPSHOT_CREATE_IMAGE may be invoked directly after @@ -334,7 +334,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, * PM_HIBERNATION_PREPARE */ error = suspend_devices_and_enter(PM_SUSPEND_MEM); - data->ready = 0; + data->ready = false; break; case SNAPSHOT_PLATFORM_SUPPORT: diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 0c9a934cfec1..1254f312d024 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -181,7 +181,7 @@ EXPORT_SYMBOL_GPL(rcu_irq_enter); /* * Test whether RCU thinks that the current CPU is idle. */ -bool __rcu_is_watching(void) +bool notrace __rcu_is_watching(void) { return rcu_dynticks_nesting; } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4c06ddfea7cd..dd081987a8ec 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -664,7 +664,7 @@ void rcu_nmi_exit(void) * rcu_is_watching(), the caller of __rcu_is_watching() must have at * least disabled preemption. */ -bool __rcu_is_watching(void) +bool notrace __rcu_is_watching(void) { return atomic_read(this_cpu_ptr(&rcu_dynticks.dynticks)) & 0x1; } @@ -675,7 +675,7 @@ bool __rcu_is_watching(void) * If the current CPU is in its idle loop and is neither in an interrupt * or NMI handler, return true. */ -bool rcu_is_watching(void) +bool notrace rcu_is_watching(void) { int ret; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3822ac0c4b27..6abb03dff5c0 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1133,7 +1133,7 @@ void exit_rcu(void) #ifdef CONFIG_RCU_BOOST -#include "../rtmutex_common.h" +#include "../locking/rtmutex_common.h" #ifdef CONFIG_RCU_TRACE diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1deccd78be98..c1808606ee5f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2253,6 +2253,20 @@ unsigned long long task_sched_runtime(struct task_struct *p) struct rq *rq; u64 ns = 0; +#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) + /* + * 64-bit doesn't need locks to atomically read a 64bit value. + * So we have a optimization chance when the task's delta_exec is 0. + * Reading ->on_cpu is racy, but this is ok. + * + * If we race with it leaving cpu, we'll take a lock. So we're correct. + * If we race with it entering cpu, unaccounted time is 0. This is + * indistinguishable from the read occurring a few cycles earlier. + */ + if (!p->on_cpu) + return p->se.sum_exec_runtime; +#endif + rq = task_rq_lock(p, &flags); ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); task_rq_unlock(rq, p, &flags); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index df77c605c7a6..e8b652ebe027 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1000,7 +1000,7 @@ struct numa_stats { */ static void update_numa_stats(struct numa_stats *ns, int nid) { - int cpu; + int cpu, cpus = 0; memset(ns, 0, sizeof(*ns)); for_each_cpu(cpu, cpumask_of_node(nid)) { @@ -1009,8 +1009,21 @@ static void update_numa_stats(struct numa_stats *ns, int nid) ns->nr_running += rq->nr_running; ns->load += weighted_cpuload(cpu); ns->power += power_of(cpu); + + cpus++; } + /* + * If we raced with hotplug and there are no CPUs left in our mask + * the @ns structure is NULL'ed and task_numa_compare() will + * not find this node attractive. + * + * We'll either bail at !has_capacity, or we'll detect a huge imbalance + * and bail there. + */ + if (!cpus) + return; + ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power; ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE); ns->has_capacity = (ns->nr_running < ns->capacity); @@ -1201,9 +1214,21 @@ static int task_numa_migrate(struct task_struct *p) */ rcu_read_lock(); sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu)); - env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2; + if (sd) + env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2; rcu_read_unlock(); + /* + * Cpusets can break the scheduler domain tree into smaller + * balance domains, some of which do not cross NUMA boundaries. + * Tasks that are "trapped" in such domains cannot be migrated + * elsewhere, so there is no point in (re)trying. + */ + if (unlikely(!sd)) { + p->numa_preferred_nid = cpu_to_node(task_cpu(p)); + return -EINVAL; + } + taskweight = task_weight(p, env.src_nid); groupweight = group_weight(p, env.src_nid); update_numa_stats(&env.src_stats, env.src_nid); @@ -2153,7 +2178,7 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa, long contrib; /* The fraction of a cpu used by this cfs_rq */ - contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT, + contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT, sa->runnable_avg_period + 1); contrib -= cfs_rq->tg_runnable_contrib; diff --git a/kernel/smp.c b/kernel/smp.c index f5768b0c816a..bd9f94028838 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -15,9 +15,9 @@ #include "smpboot.h" -#ifdef CONFIG_USE_GENERIC_SMP_HELPERS enum { CSD_FLAG_LOCK = 0x01, + CSD_FLAG_WAIT = 0x02, }; struct call_function_data { @@ -124,7 +124,7 @@ static void csd_lock(struct call_single_data *csd) static void csd_unlock(struct call_single_data *csd) { - WARN_ON(!(csd->flags & CSD_FLAG_LOCK)); + WARN_ON((csd->flags & CSD_FLAG_WAIT) && !(csd->flags & CSD_FLAG_LOCK)); /* * ensure we're all done before releasing data: @@ -139,13 +139,15 @@ static void csd_unlock(struct call_single_data *csd) * for execution on the given CPU. data must already have * ->func, ->info, and ->flags set. */ -static -void generic_exec_single(int cpu, struct call_single_data *csd, int wait) +static void generic_exec_single(int cpu, struct call_single_data *csd, int wait) { struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); unsigned long flags; int ipi; + if (wait) + csd->flags |= CSD_FLAG_WAIT; + raw_spin_lock_irqsave(&dst->lock, flags); ipi = list_empty(&dst->list); list_add_tail(&csd->list, &dst->list); @@ -340,6 +342,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *csd, } put_cpu(); } +EXPORT_SYMBOL_GPL(__smp_call_function_single); /** * smp_call_function_many(): Run a function on a set of other CPUs. @@ -459,7 +462,6 @@ int smp_call_function(smp_call_func_t func, void *info, int wait) return 0; } EXPORT_SYMBOL(smp_call_function); -#endif /* USE_GENERIC_SMP_HELPERS */ /* Setup configured maximum number of CPUs to activate */ unsigned int setup_max_cpus = NR_CPUS; diff --git a/kernel/softirq.c b/kernel/softirq.c index b24988353458..11025ccc06dd 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -6,8 +6,6 @@ * Distribute under GPLv2. * * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) - * - * Remote softirq infrastructure is by Jens Axboe. */ #include <linux/export.h> @@ -627,146 +625,17 @@ void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer, } EXPORT_SYMBOL_GPL(tasklet_hrtimer_init); -/* - * Remote softirq bits - */ - -DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); -EXPORT_PER_CPU_SYMBOL(softirq_work_list); - -static void __local_trigger(struct call_single_data *cp, int softirq) -{ - struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]); - - list_add_tail(&cp->list, head); - - /* Trigger the softirq only if the list was previously empty. */ - if (head->next == &cp->list) - raise_softirq_irqoff(softirq); -} - -#ifdef CONFIG_USE_GENERIC_SMP_HELPERS -static void remote_softirq_receive(void *data) -{ - struct call_single_data *cp = data; - unsigned long flags; - int softirq; - - softirq = *(int *)cp->info; - local_irq_save(flags); - __local_trigger(cp, softirq); - local_irq_restore(flags); -} - -static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) -{ - if (cpu_online(cpu)) { - cp->func = remote_softirq_receive; - cp->info = &softirq; - cp->flags = 0; - - __smp_call_function_single(cpu, cp, 0); - return 0; - } - return 1; -} -#else /* CONFIG_USE_GENERIC_SMP_HELPERS */ -static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) -{ - return 1; -} -#endif - -/** - * __send_remote_softirq - try to schedule softirq work on a remote cpu - * @cp: private SMP call function data area - * @cpu: the remote cpu - * @this_cpu: the currently executing cpu - * @softirq: the softirq for the work - * - * Attempt to schedule softirq work on a remote cpu. If this cannot be - * done, the work is instead queued up on the local cpu. - * - * Interrupts must be disabled. - */ -void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq) -{ - if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq)) - __local_trigger(cp, softirq); -} -EXPORT_SYMBOL(__send_remote_softirq); - -/** - * send_remote_softirq - try to schedule softirq work on a remote cpu - * @cp: private SMP call function data area - * @cpu: the remote cpu - * @softirq: the softirq for the work - * - * Like __send_remote_softirq except that disabling interrupts and - * computing the current cpu is done for the caller. - */ -void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq) -{ - unsigned long flags; - int this_cpu; - - local_irq_save(flags); - this_cpu = smp_processor_id(); - __send_remote_softirq(cp, cpu, this_cpu, softirq); - local_irq_restore(flags); -} -EXPORT_SYMBOL(send_remote_softirq); - -static int remote_softirq_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - /* - * If a CPU goes away, splice its entries to the current CPU - * and trigger a run of the softirq - */ - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { - int cpu = (unsigned long) hcpu; - int i; - - local_irq_disable(); - for (i = 0; i < NR_SOFTIRQS; i++) { - struct list_head *head = &per_cpu(softirq_work_list[i], cpu); - struct list_head *local_head; - - if (list_empty(head)) - continue; - - local_head = &__get_cpu_var(softirq_work_list[i]); - list_splice_init(head, local_head); - raise_softirq_irqoff(i); - } - local_irq_enable(); - } - - return NOTIFY_OK; -} - -static struct notifier_block remote_softirq_cpu_notifier = { - .notifier_call = remote_softirq_cpu_notify, -}; - void __init softirq_init(void) { int cpu; for_each_possible_cpu(cpu) { - int i; - per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; - for (i = 0; i < NR_SOFTIRQS; i++) - INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu)); } - register_hotcpu_notifier(&remote_softirq_cpu_notifier); - open_softirq(TASKLET_SOFTIRQ, tasklet_action); open_softirq(HI_SOFTIRQ, tasklet_hi_action); } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d37d9dd8f463..34a604726d0b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -969,9 +969,10 @@ static struct ctl_table kern_table[] = { { .procname = "hung_task_check_count", .data = &sysctl_hung_task_check_count, - .maxlen = sizeof(unsigned long), + .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_doulongvec_minmax, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, }, { .procname = "hung_task_timeout_secs", diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index b8b8560bfb95..f785aef65799 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -26,6 +26,7 @@ #include <linux/export.h> #include <linux/time.h> #include <linux/uaccess.h> +#include <linux/list.h> #include <trace/events/block.h> @@ -38,6 +39,9 @@ static unsigned int blktrace_seq __read_mostly = 1; static struct trace_array *blk_tr; static bool blk_tracer_enabled __read_mostly; +static LIST_HEAD(running_trace_list); +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock); + /* Select an alternative, minimalistic output than the original one */ #define TRACE_BLK_OPT_CLASSIC 0x1 @@ -107,10 +111,18 @@ record_it: * Send out a notify for this process, if we haven't done so since a trace * started */ -static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk) +static void trace_note_tsk(struct task_struct *tsk) { + unsigned long flags; + struct blk_trace *bt; + tsk->btrace_seq = blktrace_seq; - trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm)); + spin_lock_irqsave(&running_trace_lock, flags); + list_for_each_entry(bt, &running_trace_list, running_list) { + trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, + sizeof(tsk->comm)); + } + spin_unlock_irqrestore(&running_trace_lock, flags); } static void trace_note_time(struct blk_trace *bt) @@ -229,16 +241,15 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, goto record_it; } + if (unlikely(tsk->btrace_seq != blktrace_seq)) + trace_note_tsk(tsk); + /* * A word about the locking here - we disable interrupts to reserve * some space in the relay per-cpu buffer, to prevent an irq * from coming in and stepping on our toes. */ local_irq_save(flags); - - if (unlikely(tsk->btrace_seq != blktrace_seq)) - trace_note_tsk(bt, tsk); - t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len); if (t) { sequence = per_cpu_ptr(bt->sequence, cpu); @@ -477,6 +488,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, bt->dir = dir; bt->dev = dev; atomic_set(&bt->dropped, 0); + INIT_LIST_HEAD(&bt->running_list); ret = -EIO; bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, @@ -567,13 +579,12 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, .end_lba = cbuts.end_lba, .pid = cbuts.pid, }; - memcpy(&buts.name, &cbuts.name, 32); ret = do_blk_trace_setup(q, name, dev, bdev, &buts); if (ret) return ret; - if (copy_to_user(arg, &buts.name, 32)) { + if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) { blk_trace_remove(q); return -EFAULT; } @@ -601,6 +612,9 @@ int blk_trace_startstop(struct request_queue *q, int start) blktrace_seq++; smp_mb(); bt->trace_state = Blktrace_running; + spin_lock_irq(&running_trace_lock); + list_add(&bt->running_list, &running_trace_list); + spin_unlock_irq(&running_trace_lock); trace_note_time(bt); ret = 0; @@ -608,6 +622,9 @@ int blk_trace_startstop(struct request_queue *q, int start) } else { if (bt->trace_state == Blktrace_running) { bt->trace_state = Blktrace_stopped; + spin_lock_irq(&running_trace_lock); + list_del_init(&bt->running_list); + spin_unlock_irq(&running_trace_lock); relay_flush(bt->rchan); ret = 0; } @@ -1472,6 +1489,9 @@ static int blk_trace_remove_queue(struct request_queue *q) if (atomic_dec_and_test(&blk_probes_ref)) blk_unregister_tracepoints(); + spin_lock_irq(&running_trace_lock); + list_del(&bt->running_list); + spin_unlock_irq(&running_trace_lock); blk_trace_free(bt); return 0; } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 03cf44ac54d3..22fa55696760 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3307,7 +3307,11 @@ void unregister_ftrace_function_probe_all(char *glob) static LIST_HEAD(ftrace_commands); static DEFINE_MUTEX(ftrace_cmd_mutex); -int register_ftrace_command(struct ftrace_func_command *cmd) +/* + * Currently we only register ftrace commands from __init, so mark this + * __init too. + */ +__init int register_ftrace_command(struct ftrace_func_command *cmd) { struct ftrace_func_command *p; int ret = 0; @@ -3326,7 +3330,11 @@ int register_ftrace_command(struct ftrace_func_command *cmd) return ret; } -int unregister_ftrace_command(struct ftrace_func_command *cmd) +/* + * Currently we only unregister ftrace commands from __init, so mark + * this __init too. + */ +__init int unregister_ftrace_command(struct ftrace_func_command *cmd) { struct ftrace_func_command *p, *n; int ret = -ENODEV; @@ -3641,7 +3649,7 @@ __setup("ftrace_filter=", set_ftrace_filter); #ifdef CONFIG_FUNCTION_GRAPH_TRACER static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; -static int ftrace_set_func(unsigned long *array, int *idx, char *buffer); +static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer); static int __init set_graph_function(char *str) { @@ -3659,7 +3667,7 @@ static void __init set_ftrace_early_graph(char *buf) func = strsep(&buf, ","); /* we allow only one expression at a time */ ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, - func); + FTRACE_GRAPH_MAX_FUNCS, func); if (ret) printk(KERN_DEBUG "ftrace: function %s not " "traceable\n", func); @@ -3776,15 +3784,25 @@ static const struct file_operations ftrace_notrace_fops = { static DEFINE_MUTEX(graph_lock); int ftrace_graph_count; -int ftrace_graph_filter_enabled; +int ftrace_graph_notrace_count; unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; +unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; + +struct ftrace_graph_data { + unsigned long *table; + size_t size; + int *count; + const struct seq_operations *seq_ops; +}; static void * __g_next(struct seq_file *m, loff_t *pos) { - if (*pos >= ftrace_graph_count) + struct ftrace_graph_data *fgd = m->private; + + if (*pos >= *fgd->count) return NULL; - return &ftrace_graph_funcs[*pos]; + return &fgd->table[*pos]; } static void * @@ -3796,10 +3814,12 @@ g_next(struct seq_file *m, void *v, loff_t *pos) static void *g_start(struct seq_file *m, loff_t *pos) { + struct ftrace_graph_data *fgd = m->private; + mutex_lock(&graph_lock); /* Nothing, tell g_show to print all functions are enabled */ - if (!ftrace_graph_filter_enabled && !*pos) + if (!*fgd->count && !*pos) return (void *)1; return __g_next(m, pos); @@ -3835,38 +3855,88 @@ static const struct seq_operations ftrace_graph_seq_ops = { }; static int -ftrace_graph_open(struct inode *inode, struct file *file) +__ftrace_graph_open(struct inode *inode, struct file *file, + struct ftrace_graph_data *fgd) { int ret = 0; - if (unlikely(ftrace_disabled)) - return -ENODEV; - mutex_lock(&graph_lock); if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { - ftrace_graph_filter_enabled = 0; - ftrace_graph_count = 0; - memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); + *fgd->count = 0; + memset(fgd->table, 0, fgd->size * sizeof(*fgd->table)); } mutex_unlock(&graph_lock); - if (file->f_mode & FMODE_READ) - ret = seq_open(file, &ftrace_graph_seq_ops); + if (file->f_mode & FMODE_READ) { + ret = seq_open(file, fgd->seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = fgd; + } + } else + file->private_data = fgd; return ret; } static int +ftrace_graph_open(struct inode *inode, struct file *file) +{ + struct ftrace_graph_data *fgd; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + fgd = kmalloc(sizeof(*fgd), GFP_KERNEL); + if (fgd == NULL) + return -ENOMEM; + + fgd->table = ftrace_graph_funcs; + fgd->size = FTRACE_GRAPH_MAX_FUNCS; + fgd->count = &ftrace_graph_count; + fgd->seq_ops = &ftrace_graph_seq_ops; + + return __ftrace_graph_open(inode, file, fgd); +} + +static int +ftrace_graph_notrace_open(struct inode *inode, struct file *file) +{ + struct ftrace_graph_data *fgd; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + fgd = kmalloc(sizeof(*fgd), GFP_KERNEL); + if (fgd == NULL) + return -ENOMEM; + + fgd->table = ftrace_graph_notrace_funcs; + fgd->size = FTRACE_GRAPH_MAX_FUNCS; + fgd->count = &ftrace_graph_notrace_count; + fgd->seq_ops = &ftrace_graph_seq_ops; + + return __ftrace_graph_open(inode, file, fgd); +} + +static int ftrace_graph_release(struct inode *inode, struct file *file) { - if (file->f_mode & FMODE_READ) + if (file->f_mode & FMODE_READ) { + struct seq_file *m = file->private_data; + + kfree(m->private); seq_release(inode, file); + } else { + kfree(file->private_data); + } + return 0; } static int -ftrace_set_func(unsigned long *array, int *idx, char *buffer) +ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer) { struct dyn_ftrace *rec; struct ftrace_page *pg; @@ -3879,7 +3949,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) /* decode regex */ type = filter_parse_regex(buffer, strlen(buffer), &search, ¬); - if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS) + if (!not && *idx >= size) return -EBUSY; search_len = strlen(search); @@ -3907,7 +3977,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) fail = 0; if (!exists) { array[(*idx)++] = rec->ip; - if (*idx >= FTRACE_GRAPH_MAX_FUNCS) + if (*idx >= size) goto out; } } else { @@ -3925,8 +3995,6 @@ out: if (fail) return -EINVAL; - ftrace_graph_filter_enabled = !!(*idx); - return 0; } @@ -3935,36 +4003,33 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_parser parser; - ssize_t read, ret; + ssize_t read, ret = 0; + struct ftrace_graph_data *fgd = file->private_data; if (!cnt) return 0; - mutex_lock(&graph_lock); - - if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { - ret = -ENOMEM; - goto out_unlock; - } + if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) + return -ENOMEM; read = trace_get_user(&parser, ubuf, cnt, ppos); if (read >= 0 && trace_parser_loaded((&parser))) { parser.buffer[parser.idx] = 0; + mutex_lock(&graph_lock); + /* we allow only one expression at a time */ - ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, - parser.buffer); - if (ret) - goto out_free; + ret = ftrace_set_func(fgd->table, fgd->count, fgd->size, + parser.buffer); + + mutex_unlock(&graph_lock); } - ret = read; + if (!ret) + ret = read; -out_free: trace_parser_put(&parser); -out_unlock: - mutex_unlock(&graph_lock); return ret; } @@ -3976,6 +4041,14 @@ static const struct file_operations ftrace_graph_fops = { .llseek = ftrace_filter_lseek, .release = ftrace_graph_release, }; + +static const struct file_operations ftrace_graph_notrace_fops = { + .open = ftrace_graph_notrace_open, + .read = seq_read, + .write = ftrace_graph_write, + .llseek = ftrace_filter_lseek, + .release = ftrace_graph_release, +}; #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) @@ -3997,6 +4070,9 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) trace_create_file("set_graph_function", 0444, d_tracer, NULL, &ftrace_graph_fops); + trace_create_file("set_graph_notrace", 0444, d_tracer, + NULL, + &ftrace_graph_notrace_fops); #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ return 0; @@ -4320,12 +4396,21 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, */ preempt_disable_notrace(); trace_recursion_set(TRACE_CONTROL_BIT); + + /* + * Control funcs (perf) uses RCU. Only trace if + * RCU is currently active. + */ + if (!rcu_is_watching()) + goto out; + do_for_each_ftrace_op(op, ftrace_control_list) { if (!(op->flags & FTRACE_OPS_FL_STUB) && !ftrace_function_local_disabled(op) && ftrace_ops_test(op, ip, regs)) op->func(ip, parent_ip, op, regs); } while_for_each_ftrace_op(op); + out: trace_recursion_clear(TRACE_CONTROL_BIT); preempt_enable_notrace(); } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d9fea7dfd5d3..9d20cd9743ef 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -235,13 +235,33 @@ void trace_array_put(struct trace_array *this_tr) mutex_unlock(&trace_types_lock); } -int filter_current_check_discard(struct ring_buffer *buffer, - struct ftrace_event_call *call, void *rec, - struct ring_buffer_event *event) +int filter_check_discard(struct ftrace_event_file *file, void *rec, + struct ring_buffer *buffer, + struct ring_buffer_event *event) { - return filter_check_discard(call, rec, buffer, event); + if (unlikely(file->flags & FTRACE_EVENT_FL_FILTERED) && + !filter_match_preds(file->filter, rec)) { + ring_buffer_discard_commit(buffer, event); + return 1; + } + + return 0; +} +EXPORT_SYMBOL_GPL(filter_check_discard); + +int call_filter_check_discard(struct ftrace_event_call *call, void *rec, + struct ring_buffer *buffer, + struct ring_buffer_event *event) +{ + if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) && + !filter_match_preds(call->filter, rec)) { + ring_buffer_discard_commit(buffer, event); + return 1; + } + + return 0; } -EXPORT_SYMBOL_GPL(filter_current_check_discard); +EXPORT_SYMBOL_GPL(call_filter_check_discard); cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) { @@ -843,9 +863,12 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, if (isspace(ch)) { parser->buffer[parser->idx] = 0; parser->cont = false; - } else { + } else if (parser->idx < parser->size - 1) { parser->cont = true; parser->buffer[parser->idx++] = ch; + } else { + ret = -EINVAL; + goto out; } *ppos += read; @@ -1261,21 +1284,6 @@ int is_tracing_stopped(void) } /** - * ftrace_off_permanent - disable all ftrace code permanently - * - * This should only be called when a serious anomally has - * been detected. This will turn off the function tracing, - * ring buffers, and other tracing utilites. It takes no - * locks and can be called from any context. - */ -void ftrace_off_permanent(void) -{ - tracing_disabled = 1; - ftrace_stop(); - tracing_off_permanent(); -} - -/** * tracing_start - quick start of the tracer * * If tracing is enabled but was stopped by tracing_stop, @@ -1631,7 +1639,7 @@ trace_function(struct trace_array *tr, entry->ip = ip; entry->parent_ip = parent_ip; - if (!filter_check_discard(call, entry, buffer, event)) + if (!call_filter_check_discard(call, entry, buffer, event)) __buffer_unlock_commit(buffer, event); } @@ -1715,7 +1723,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, entry->size = trace.nr_entries; - if (!filter_check_discard(call, entry, buffer, event)) + if (!call_filter_check_discard(call, entry, buffer, event)) __buffer_unlock_commit(buffer, event); out: @@ -1817,7 +1825,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) trace.entries = entry->caller; save_stack_trace_user(&trace); - if (!filter_check_discard(call, entry, buffer, event)) + if (!call_filter_check_discard(call, entry, buffer, event)) __buffer_unlock_commit(buffer, event); out_drop_count: @@ -2009,7 +2017,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) entry->fmt = fmt; memcpy(entry->buf, tbuffer, sizeof(u32) * len); - if (!filter_check_discard(call, entry, buffer, event)) { + if (!call_filter_check_discard(call, entry, buffer, event)) { __buffer_unlock_commit(buffer, event); ftrace_trace_stack(buffer, flags, 6, pc); } @@ -2064,7 +2072,7 @@ __trace_array_vprintk(struct ring_buffer *buffer, memcpy(&entry->buf, tbuffer, len); entry->buf[len] = '\0'; - if (!filter_check_discard(call, entry, buffer, event)) { + if (!call_filter_check_discard(call, entry, buffer, event)) { __buffer_unlock_commit(buffer, event); ftrace_trace_stack(buffer, flags, 6, pc); } @@ -2761,7 +2769,7 @@ static void show_snapshot_main_help(struct seq_file *m) seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"); seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); seq_printf(m, "# Takes a snapshot of the main buffer.\n"); - seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate)\n"); + seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n"); seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); seq_printf(m, "# is not a '0' or '1')\n"); } @@ -2965,6 +2973,11 @@ int tracing_open_generic(struct inode *inode, struct file *filp) return 0; } +bool tracing_is_disabled(void) +{ + return (tracing_disabled) ? true: false; +} + /* * Open and update trace_array ref count. * Must have the current trace_array passed to it. @@ -5455,12 +5468,12 @@ static struct ftrace_func_command ftrace_snapshot_cmd = { .func = ftrace_trace_snapshot_callback, }; -static int register_snapshot_cmd(void) +static __init int register_snapshot_cmd(void) { return register_ftrace_command(&ftrace_snapshot_cmd); } #else -static inline int register_snapshot_cmd(void) { return 0; } +static inline __init int register_snapshot_cmd(void) { return 0; } #endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */ struct dentry *tracing_init_dentry_tr(struct trace_array *tr) @@ -6254,6 +6267,17 @@ void trace_init_global_iter(struct trace_iterator *iter) iter->trace = iter->tr->current_trace; iter->cpu_file = RING_BUFFER_ALL_CPUS; iter->trace_buffer = &global_trace.trace_buffer; + + if (iter->trace && iter->trace->open) + iter->trace->open(iter); + + /* Annotate start of buffers if we had overruns */ + if (ring_buffer_overruns(iter->trace_buffer->buffer)) + iter->iter_flags |= TRACE_FILE_ANNOTATE; + + /* Output in nanoseconds only if we are using a clock in nanoseconds. */ + if (trace_clocks[iter->tr->clock_id].in_ns) + iter->iter_flags |= TRACE_FILE_TIME_IN_NS; } void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 73d08aa25b55..ea189e027b80 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -193,8 +193,8 @@ struct trace_array { #ifdef CONFIG_FTRACE_SYSCALLS int sys_refcount_enter; int sys_refcount_exit; - DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); - DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); + struct ftrace_event_file __rcu *enter_syscall_files[NR_syscalls]; + struct ftrace_event_file __rcu *exit_syscall_files[NR_syscalls]; #endif int stop_count; int clock_id; @@ -515,6 +515,7 @@ void tracing_reset_online_cpus(struct trace_buffer *buf); void tracing_reset_current(int cpu); void tracing_reset_all_online_cpus(void); int tracing_open_generic(struct inode *inode, struct file *filp); +bool tracing_is_disabled(void); struct dentry *trace_create_file(const char *name, umode_t mode, struct dentry *parent, @@ -712,6 +713,8 @@ extern unsigned long trace_flags; #define TRACE_GRAPH_PRINT_PROC 0x8 #define TRACE_GRAPH_PRINT_DURATION 0x10 #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 +#define TRACE_GRAPH_PRINT_FILL_SHIFT 28 +#define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) extern enum print_line_t print_graph_function_flags(struct trace_iterator *iter, u32 flags); @@ -731,15 +734,16 @@ extern void __trace_graph_return(struct trace_array *tr, #ifdef CONFIG_DYNAMIC_FTRACE /* TODO: make this variable */ #define FTRACE_GRAPH_MAX_FUNCS 32 -extern int ftrace_graph_filter_enabled; extern int ftrace_graph_count; extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; +extern int ftrace_graph_notrace_count; +extern unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS]; static inline int ftrace_graph_addr(unsigned long addr) { int i; - if (!ftrace_graph_filter_enabled) + if (!ftrace_graph_count) return 1; for (i = 0; i < ftrace_graph_count; i++) { @@ -759,11 +763,31 @@ static inline int ftrace_graph_addr(unsigned long addr) return 0; } + +static inline int ftrace_graph_notrace_addr(unsigned long addr) +{ + int i; + + if (!ftrace_graph_notrace_count) + return 0; + + for (i = 0; i < ftrace_graph_notrace_count; i++) { + if (addr == ftrace_graph_notrace_funcs[i]) + return 1; + } + + return 0; +} #else static inline int ftrace_graph_addr(unsigned long addr) { return 1; } + +static inline int ftrace_graph_notrace_addr(unsigned long addr) +{ + return 0; +} #endif /* CONFIG_DYNAMIC_FTRACE */ #else /* CONFIG_FUNCTION_GRAPH_TRACER */ static inline enum print_line_t @@ -987,9 +1011,9 @@ struct filter_pred { extern enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not); -extern void print_event_filter(struct ftrace_event_call *call, +extern void print_event_filter(struct ftrace_event_file *file, struct trace_seq *s); -extern int apply_event_filter(struct ftrace_event_call *call, +extern int apply_event_filter(struct ftrace_event_file *file, char *filter_string); extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir, char *filter_string); @@ -1000,20 +1024,6 @@ extern int filter_assign_type(const char *type); struct ftrace_event_field * trace_find_event_field(struct ftrace_event_call *call, char *name); -static inline int -filter_check_discard(struct ftrace_event_call *call, void *rec, - struct ring_buffer *buffer, - struct ring_buffer_event *event) -{ - if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) && - !filter_match_preds(call->filter, rec)) { - ring_buffer_discard_commit(buffer, event); - return 1; - } - - return 0; -} - extern void trace_event_enable_cmd_record(bool enable); extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); extern int event_trace_del_tracer(struct trace_array *tr); diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index d594da0dc03c..697fb9bac8f0 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -78,7 +78,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) entry->line = f->line; entry->correct = val == expect; - if (!filter_check_discard(call, entry, buffer, event)) + if (!call_filter_check_discard(call, entry, buffer, event)) __buffer_unlock_commit(buffer, event); out: diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 368a4d50cc30..f919a2e21bf3 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -989,7 +989,7 @@ static ssize_t event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_event_call *call; + struct ftrace_event_file *file; struct trace_seq *s; int r = -ENODEV; @@ -1004,12 +1004,12 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); mutex_lock(&event_mutex); - call = event_file_data(filp); - if (call) - print_event_filter(call, s); + file = event_file_data(filp); + if (file) + print_event_filter(file, s); mutex_unlock(&event_mutex); - if (call) + if (file) r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); kfree(s); @@ -1021,7 +1021,7 @@ static ssize_t event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_event_call *call; + struct ftrace_event_file *file; char *buf; int err = -ENODEV; @@ -1039,9 +1039,9 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, buf[cnt] = '\0'; mutex_lock(&event_mutex); - call = event_file_data(filp); - if (call) - err = apply_event_filter(call, buf); + file = event_file_data(filp); + if (file) + err = apply_event_filter(file, buf); mutex_unlock(&event_mutex); free_page((unsigned long) buf); @@ -1062,6 +1062,9 @@ static int subsystem_open(struct inode *inode, struct file *filp) struct trace_array *tr; int ret; + if (tracing_is_disabled()) + return -ENODEV; + /* Make sure the system still exists */ mutex_lock(&trace_types_lock); mutex_lock(&event_mutex); @@ -1108,6 +1111,9 @@ static int system_tr_open(struct inode *inode, struct file *filp) struct trace_array *tr = inode->i_private; int ret; + if (tracing_is_disabled()) + return -ENODEV; + if (trace_array_get(tr) < 0) return -ENODEV; @@ -1124,11 +1130,12 @@ static int system_tr_open(struct inode *inode, struct file *filp) if (ret < 0) { trace_array_put(tr); kfree(dir); + return ret; } filp->private_data = dir; - return ret; + return 0; } static int subsystem_release(struct inode *inode, struct file *file) @@ -1539,7 +1546,7 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file) return -1; } } - trace_create_file("filter", 0644, file->dir, call, + trace_create_file("filter", 0644, file->dir, file, &ftrace_event_filter_fops); trace_create_file("format", 0444, file->dir, call, @@ -1577,6 +1584,7 @@ static void event_remove(struct ftrace_event_call *call) if (file->event_call != call) continue; ftrace_event_enable_disable(file, 0); + destroy_preds(file); /* * The do_for_each_event_file() is * a double loop. After finding the call for this @@ -1700,7 +1708,7 @@ static void __trace_remove_event_call(struct ftrace_event_call *call) { event_remove(call); trace_destroy_fields(call); - destroy_preds(call); + destroy_call_preds(call); } static int probe_remove_event_call(struct ftrace_event_call *call) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 97daa8cf958d..2468f56dc5db 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -637,10 +637,18 @@ static void append_filter_err(struct filter_parse_state *ps, free_page((unsigned long) buf); } +static inline struct event_filter *event_filter(struct ftrace_event_file *file) +{ + if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) + return file->event_call->filter; + else + return file->filter; +} + /* caller must hold event_mutex */ -void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) +void print_event_filter(struct ftrace_event_file *file, struct trace_seq *s) { - struct event_filter *filter = call->filter; + struct event_filter *filter = event_filter(file); if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); @@ -766,11 +774,21 @@ static void __free_preds(struct event_filter *filter) filter->n_preds = 0; } -static void filter_disable(struct ftrace_event_call *call) +static void call_filter_disable(struct ftrace_event_call *call) { call->flags &= ~TRACE_EVENT_FL_FILTERED; } +static void filter_disable(struct ftrace_event_file *file) +{ + struct ftrace_event_call *call = file->event_call; + + if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) + call_filter_disable(call); + else + file->flags &= ~FTRACE_EVENT_FL_FILTERED; +} + static void __free_filter(struct event_filter *filter) { if (!filter) @@ -781,16 +799,30 @@ static void __free_filter(struct event_filter *filter) kfree(filter); } +void destroy_call_preds(struct ftrace_event_call *call) +{ + __free_filter(call->filter); + call->filter = NULL; +} + +static void destroy_file_preds(struct ftrace_event_file *file) +{ + __free_filter(file->filter); + file->filter = NULL; +} + /* - * Called when destroying the ftrace_event_call. - * The call is being freed, so we do not need to worry about - * the call being currently used. This is for module code removing + * Called when destroying the ftrace_event_file. + * The file is being freed, so we do not need to worry about + * the file being currently used. This is for module code removing * the tracepoints from within it. */ -void destroy_preds(struct ftrace_event_call *call) +void destroy_preds(struct ftrace_event_file *file) { - __free_filter(call->filter); - call->filter = NULL; + if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) + destroy_call_preds(file->event_call); + else + destroy_file_preds(file); } static struct event_filter *__alloc_filter(void) @@ -825,28 +857,56 @@ static int __alloc_preds(struct event_filter *filter, int n_preds) return 0; } -static void filter_free_subsystem_preds(struct event_subsystem *system) +static inline void __remove_filter(struct ftrace_event_file *file) { + struct ftrace_event_call *call = file->event_call; + + filter_disable(file); + if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) + remove_filter_string(call->filter); + else + remove_filter_string(file->filter); +} + +static void filter_free_subsystem_preds(struct event_subsystem *system, + struct trace_array *tr) +{ + struct ftrace_event_file *file; struct ftrace_event_call *call; - list_for_each_entry(call, &ftrace_events, list) { + list_for_each_entry(file, &tr->events, list) { + call = file->event_call; if (strcmp(call->class->system, system->name) != 0) continue; - filter_disable(call); - remove_filter_string(call->filter); + __remove_filter(file); } } -static void filter_free_subsystem_filters(struct event_subsystem *system) +static inline void __free_subsystem_filter(struct ftrace_event_file *file) { + struct ftrace_event_call *call = file->event_call; + + if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) { + __free_filter(call->filter); + call->filter = NULL; + } else { + __free_filter(file->filter); + file->filter = NULL; + } +} + +static void filter_free_subsystem_filters(struct event_subsystem *system, + struct trace_array *tr) +{ + struct ftrace_event_file *file; struct ftrace_event_call *call; - list_for_each_entry(call, &ftrace_events, list) { + list_for_each_entry(file, &tr->events, list) { + call = file->event_call; if (strcmp(call->class->system, system->name) != 0) continue; - __free_filter(call->filter); - call->filter = NULL; + __free_subsystem_filter(file); } } @@ -1617,15 +1677,85 @@ fail: return err; } +static inline void event_set_filtered_flag(struct ftrace_event_file *file) +{ + struct ftrace_event_call *call = file->event_call; + + if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) + call->flags |= TRACE_EVENT_FL_FILTERED; + else + file->flags |= FTRACE_EVENT_FL_FILTERED; +} + +static inline void event_set_filter(struct ftrace_event_file *file, + struct event_filter *filter) +{ + struct ftrace_event_call *call = file->event_call; + + if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) + rcu_assign_pointer(call->filter, filter); + else + rcu_assign_pointer(file->filter, filter); +} + +static inline void event_clear_filter(struct ftrace_event_file *file) +{ + struct ftrace_event_call *call = file->event_call; + + if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) + RCU_INIT_POINTER(call->filter, NULL); + else + RCU_INIT_POINTER(file->filter, NULL); +} + +static inline void +event_set_no_set_filter_flag(struct ftrace_event_file *file) +{ + struct ftrace_event_call *call = file->event_call; + + if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) + call->flags |= TRACE_EVENT_FL_NO_SET_FILTER; + else + file->flags |= FTRACE_EVENT_FL_NO_SET_FILTER; +} + +static inline void +event_clear_no_set_filter_flag(struct ftrace_event_file *file) +{ + struct ftrace_event_call *call = file->event_call; + + if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) + call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER; + else + file->flags &= ~FTRACE_EVENT_FL_NO_SET_FILTER; +} + +static inline bool +event_no_set_filter_flag(struct ftrace_event_file *file) +{ + struct ftrace_event_call *call = file->event_call; + + if (file->flags & FTRACE_EVENT_FL_NO_SET_FILTER) + return true; + + if ((call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) && + (call->flags & TRACE_EVENT_FL_NO_SET_FILTER)) + return true; + + return false; +} + struct filter_list { struct list_head list; struct event_filter *filter; }; static int replace_system_preds(struct event_subsystem *system, + struct trace_array *tr, struct filter_parse_state *ps, char *filter_string) { + struct ftrace_event_file *file; struct ftrace_event_call *call; struct filter_list *filter_item; struct filter_list *tmp; @@ -1633,8 +1763,8 @@ static int replace_system_preds(struct event_subsystem *system, bool fail = true; int err; - list_for_each_entry(call, &ftrace_events, list) { - + list_for_each_entry(file, &tr->events, list) { + call = file->event_call; if (strcmp(call->class->system, system->name) != 0) continue; @@ -1644,18 +1774,20 @@ static int replace_system_preds(struct event_subsystem *system, */ err = replace_preds(call, NULL, ps, filter_string, true); if (err) - call->flags |= TRACE_EVENT_FL_NO_SET_FILTER; + event_set_no_set_filter_flag(file); else - call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER; + event_clear_no_set_filter_flag(file); } - list_for_each_entry(call, &ftrace_events, list) { + list_for_each_entry(file, &tr->events, list) { struct event_filter *filter; + call = file->event_call; + if (strcmp(call->class->system, system->name) != 0) continue; - if (call->flags & TRACE_EVENT_FL_NO_SET_FILTER) + if (event_no_set_filter_flag(file)) continue; filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); @@ -1676,17 +1808,17 @@ static int replace_system_preds(struct event_subsystem *system, err = replace_preds(call, filter, ps, filter_string, false); if (err) { - filter_disable(call); + filter_disable(file); parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); append_filter_err(ps, filter); } else - call->flags |= TRACE_EVENT_FL_FILTERED; + event_set_filtered_flag(file); /* * Regardless of if this returned an error, we still * replace the filter for the call. */ - filter = call->filter; - rcu_assign_pointer(call->filter, filter_item->filter); + filter = event_filter(file); + event_set_filter(file, filter_item->filter); filter_item->filter = filter; fail = false; @@ -1816,6 +1948,7 @@ static int create_filter(struct ftrace_event_call *call, * and always remembers @filter_str. */ static int create_system_filter(struct event_subsystem *system, + struct trace_array *tr, char *filter_str, struct event_filter **filterp) { struct event_filter *filter = NULL; @@ -1824,7 +1957,7 @@ static int create_system_filter(struct event_subsystem *system, err = create_filter_start(filter_str, true, &ps, &filter); if (!err) { - err = replace_system_preds(system, ps, filter_str); + err = replace_system_preds(system, tr, ps, filter_str); if (!err) { /* System filters just show a default message */ kfree(filter->filter_string); @@ -1840,20 +1973,25 @@ static int create_system_filter(struct event_subsystem *system, } /* caller must hold event_mutex */ -int apply_event_filter(struct ftrace_event_call *call, char *filter_string) +int apply_event_filter(struct ftrace_event_file *file, char *filter_string) { + struct ftrace_event_call *call = file->event_call; struct event_filter *filter; int err; if (!strcmp(strstrip(filter_string), "0")) { - filter_disable(call); - filter = call->filter; + filter_disable(file); + filter = event_filter(file); + if (!filter) return 0; - RCU_INIT_POINTER(call->filter, NULL); + + event_clear_filter(file); + /* Make sure the filter is not being used */ synchronize_sched(); __free_filter(filter); + return 0; } @@ -1866,14 +2004,15 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) * string */ if (filter) { - struct event_filter *tmp = call->filter; + struct event_filter *tmp; + tmp = event_filter(file); if (!err) - call->flags |= TRACE_EVENT_FL_FILTERED; + event_set_filtered_flag(file); else - filter_disable(call); + filter_disable(file); - rcu_assign_pointer(call->filter, filter); + event_set_filter(file, filter); if (tmp) { /* Make sure the call is done with the filter */ @@ -1889,6 +2028,7 @@ int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir, char *filter_string) { struct event_subsystem *system = dir->subsystem; + struct trace_array *tr = dir->tr; struct event_filter *filter; int err = 0; @@ -1901,18 +2041,18 @@ int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir, } if (!strcmp(strstrip(filter_string), "0")) { - filter_free_subsystem_preds(system); + filter_free_subsystem_preds(system, tr); remove_filter_string(system->filter); filter = system->filter; system->filter = NULL; /* Ensure all filters are no longer used */ synchronize_sched(); - filter_free_subsystem_filters(system); + filter_free_subsystem_filters(system, tr); __free_filter(filter); goto out_unlock; } - err = create_system_filter(system, filter_string, &filter); + err = create_system_filter(system, tr, filter_string, &filter); if (filter) { /* * No event actually uses the system filter diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index d21a74670088..7c3e3e72e2b6 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -180,7 +180,7 @@ struct ftrace_event_call __used event_##call = { \ .event.type = etype, \ .class = &event_class_ftrace_##call, \ .print_fmt = print, \ - .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \ + .flags = TRACE_EVENT_FL_IGNORE_ENABLE | TRACE_EVENT_FL_USE_CALL_FILTER, \ }; \ struct ftrace_event_call __used \ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index b5c09242683d..0b99120d395c 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -82,9 +82,9 @@ static struct trace_array *graph_array; * to fill in space into DURATION column. */ enum { - DURATION_FILL_FULL = -1, - DURATION_FILL_START = -2, - DURATION_FILL_END = -3, + FLAGS_FILL_FULL = 1 << TRACE_GRAPH_PRINT_FILL_SHIFT, + FLAGS_FILL_START = 2 << TRACE_GRAPH_PRINT_FILL_SHIFT, + FLAGS_FILL_END = 3 << TRACE_GRAPH_PRINT_FILL_SHIFT, }; static enum print_line_t @@ -114,16 +114,37 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, return -EBUSY; } + /* + * The curr_ret_stack is an index to ftrace return stack of + * current task. Its value should be in [0, FTRACE_RETFUNC_ + * DEPTH) when the function graph tracer is used. To support + * filtering out specific functions, it makes the index + * negative by subtracting huge value (FTRACE_NOTRACE_DEPTH) + * so when it sees a negative index the ftrace will ignore + * the record. And the index gets recovered when returning + * from the filtered function by adding the FTRACE_NOTRACE_ + * DEPTH and then it'll continue to record functions normally. + * + * The curr_ret_stack is initialized to -1 and get increased + * in this function. So it can be less than -1 only if it was + * filtered out via ftrace_graph_notrace_addr() which can be + * set from set_graph_notrace file in debugfs by user. + */ + if (current->curr_ret_stack < -1) + return -EBUSY; + calltime = trace_clock_local(); index = ++current->curr_ret_stack; + if (ftrace_graph_notrace_addr(func)) + current->curr_ret_stack -= FTRACE_NOTRACE_DEPTH; barrier(); current->ret_stack[index].ret = ret; current->ret_stack[index].func = func; current->ret_stack[index].calltime = calltime; current->ret_stack[index].subtime = 0; current->ret_stack[index].fp = frame_pointer; - *depth = index; + *depth = current->curr_ret_stack; return 0; } @@ -137,7 +158,17 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, index = current->curr_ret_stack; - if (unlikely(index < 0)) { + /* + * A negative index here means that it's just returned from a + * notrace'd function. Recover index to get an original + * return address. See ftrace_push_return_trace(). + * + * TODO: Need to check whether the stack gets corrupted. + */ + if (index < 0) + index += FTRACE_NOTRACE_DEPTH; + + if (unlikely(index < 0 || index >= FTRACE_RETFUNC_DEPTH)) { ftrace_graph_stop(); WARN_ON(1); /* Might as well panic, otherwise we have no where to go */ @@ -193,6 +224,15 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) trace.rettime = trace_clock_local(); barrier(); current->curr_ret_stack--; + /* + * The curr_ret_stack can be less than -1 only if it was + * filtered out and it's about to return from the function. + * Recover the index and continue to trace normal functions. + */ + if (current->curr_ret_stack < -1) { + current->curr_ret_stack += FTRACE_NOTRACE_DEPTH; + return ret; + } /* * The trace should run after decrementing the ret counter @@ -230,7 +270,7 @@ int __trace_graph_entry(struct trace_array *tr, return 0; entry = ring_buffer_event_data(event); entry->graph_ent = *trace; - if (!filter_current_check_discard(buffer, call, entry, event)) + if (!call_filter_check_discard(call, entry, buffer, event)) __buffer_unlock_commit(buffer, event); return 1; @@ -259,10 +299,20 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) /* trace it when it is-nested-in or is a function enabled. */ if ((!(trace->depth || ftrace_graph_addr(trace->func)) || - ftrace_graph_ignore_irqs()) || + ftrace_graph_ignore_irqs()) || (trace->depth < 0) || (max_depth && trace->depth >= max_depth)) return 0; + /* + * Do not trace a function if it's filtered by set_graph_notrace. + * Make the index of ret stack negative to indicate that it should + * ignore further functions. But it needs its own ret stack entry + * to recover the original index in order to continue tracing after + * returning from the function. + */ + if (ftrace_graph_notrace_addr(trace->func)) + return 1; + local_irq_save(flags); cpu = raw_smp_processor_id(); data = per_cpu_ptr(tr->trace_buffer.data, cpu); @@ -335,7 +385,7 @@ void __trace_graph_return(struct trace_array *tr, return; entry = ring_buffer_event_data(event); entry->ret = *trace; - if (!filter_current_check_discard(buffer, call, entry, event)) + if (!call_filter_check_discard(call, entry, buffer, event)) __buffer_unlock_commit(buffer, event); } @@ -652,7 +702,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, } /* No overhead */ - ret = print_graph_duration(DURATION_FILL_START, s, flags); + ret = print_graph_duration(0, s, flags | FLAGS_FILL_START); if (ret != TRACE_TYPE_HANDLED) return ret; @@ -664,7 +714,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, if (!ret) return TRACE_TYPE_PARTIAL_LINE; - ret = print_graph_duration(DURATION_FILL_END, s, flags); + ret = print_graph_duration(0, s, flags | FLAGS_FILL_END); if (ret != TRACE_TYPE_HANDLED) return ret; @@ -729,14 +779,14 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s, return TRACE_TYPE_HANDLED; /* No real adata, just filling the column with spaces */ - switch (duration) { - case DURATION_FILL_FULL: + switch (flags & TRACE_GRAPH_PRINT_FILL_MASK) { + case FLAGS_FILL_FULL: ret = trace_seq_puts(s, " | "); return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; - case DURATION_FILL_START: + case FLAGS_FILL_START: ret = trace_seq_puts(s, " "); return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; - case DURATION_FILL_END: + case FLAGS_FILL_END: ret = trace_seq_puts(s, " |"); return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; } @@ -852,7 +902,7 @@ print_graph_entry_nested(struct trace_iterator *iter, } /* No time */ - ret = print_graph_duration(DURATION_FILL_FULL, s, flags); + ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL); if (ret != TRACE_TYPE_HANDLED) return ret; @@ -1172,7 +1222,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, return TRACE_TYPE_PARTIAL_LINE; /* No time */ - ret = print_graph_duration(DURATION_FILL_FULL, s, flags); + ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL); if (ret != TRACE_TYPE_HANDLED) return ret; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 243f6834d026..dae9541ada9e 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -835,7 +835,7 @@ __kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs, entry->ip = (unsigned long)tp->rp.kp.addr; store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); - if (!filter_current_check_discard(buffer, call, entry, event)) + if (!filter_check_discard(ftrace_file, entry, buffer, event)) trace_buffer_unlock_commit_regs(buffer, event, irq_flags, pc, regs); } @@ -884,7 +884,7 @@ __kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, entry->ret_ip = (unsigned long)ri->ret_addr; store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); - if (!filter_current_check_discard(buffer, call, entry, event)) + if (!filter_check_discard(ftrace_file, entry, buffer, event)) trace_buffer_unlock_commit_regs(buffer, event, irq_flags, pc, regs); } diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index b3dcfb2f0fef..0abd9b863474 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -323,7 +323,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->rw = *rw; - if (!filter_check_discard(call, entry, buffer, event)) + if (!call_filter_check_discard(call, entry, buffer, event)) trace_buffer_unlock_commit(buffer, event, 0, pc); } @@ -353,7 +353,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->map = *map; - if (!filter_check_discard(call, entry, buffer, event)) + if (!call_filter_check_discard(call, entry, buffer, event)) trace_buffer_unlock_commit(buffer, event, 0, pc); } diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 4e98e3b257a3..3f34dc9b40f3 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -45,7 +45,7 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->next_state = next->state; entry->next_cpu = task_cpu(next); - if (!filter_check_discard(call, entry, buffer, event)) + if (!call_filter_check_discard(call, entry, buffer, event)) trace_buffer_unlock_commit(buffer, event, flags, pc); } @@ -101,7 +101,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->next_state = wakee->state; entry->next_cpu = task_cpu(wakee); - if (!filter_check_discard(call, entry, buffer, event)) + if (!call_filter_check_discard(call, entry, buffer, event)) trace_buffer_unlock_commit(buffer, event, flags, pc); } diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 847f88a6194b..7af67360b330 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -43,46 +43,15 @@ static DEFINE_MUTEX(all_stat_sessions_mutex); /* The root directory for all stat files */ static struct dentry *stat_dir; -/* - * Iterate through the rbtree using a post order traversal path - * to release the next node. - * It won't necessary release one at each iteration - * but it will at least advance closer to the next one - * to be released. - */ -static struct rb_node *release_next(struct tracer_stat *ts, - struct rb_node *node) +static void __reset_stat_session(struct stat_session *session) { - struct stat_node *snode; - struct rb_node *parent = rb_parent(node); - - if (node->rb_left) - return node->rb_left; - else if (node->rb_right) - return node->rb_right; - else { - if (!parent) - ; - else if (parent->rb_left == node) - parent->rb_left = NULL; - else - parent->rb_right = NULL; + struct stat_node *snode, *n; - snode = container_of(node, struct stat_node, node); - if (ts->stat_release) - ts->stat_release(snode->stat); + rbtree_postorder_for_each_entry_safe(snode, n, &session->stat_root, node) { + if (session->ts->stat_release) + session->ts->stat_release(snode->stat); kfree(snode); - - return parent; } -} - -static void __reset_stat_session(struct stat_session *session) -{ - struct rb_node *node = session->stat_root.rb_node; - - while (node) - node = release_next(session->ts, node); session->stat_root = RB_ROOT; } diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 559329d9bd2f..e4b6d11bdf78 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -302,6 +302,7 @@ static int __init syscall_exit_define_fields(struct ftrace_event_call *call) static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) { struct trace_array *tr = data; + struct ftrace_event_file *ftrace_file; struct syscall_trace_enter *entry; struct syscall_metadata *sys_data; struct ring_buffer_event *event; @@ -314,7 +315,13 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0) return; - if (!test_bit(syscall_nr, tr->enabled_enter_syscalls)) + + /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */ + ftrace_file = rcu_dereference_sched(tr->enter_syscall_files[syscall_nr]); + if (!ftrace_file) + return; + + if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags)) return; sys_data = syscall_nr_to_meta(syscall_nr); @@ -336,8 +343,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) entry->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); - if (!filter_current_check_discard(buffer, sys_data->enter_event, - entry, event)) + if (!filter_check_discard(ftrace_file, entry, buffer, event)) trace_current_buffer_unlock_commit(buffer, event, irq_flags, pc); } @@ -345,6 +351,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) { struct trace_array *tr = data; + struct ftrace_event_file *ftrace_file; struct syscall_trace_exit *entry; struct syscall_metadata *sys_data; struct ring_buffer_event *event; @@ -356,7 +363,13 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0) return; - if (!test_bit(syscall_nr, tr->enabled_exit_syscalls)) + + /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */ + ftrace_file = rcu_dereference_sched(tr->exit_syscall_files[syscall_nr]); + if (!ftrace_file) + return; + + if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags)) return; sys_data = syscall_nr_to_meta(syscall_nr); @@ -377,8 +390,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) entry->nr = syscall_nr; entry->ret = syscall_get_return_value(current, regs); - if (!filter_current_check_discard(buffer, sys_data->exit_event, - entry, event)) + if (!filter_check_discard(ftrace_file, entry, buffer, event)) trace_current_buffer_unlock_commit(buffer, event, irq_flags, pc); } @@ -397,7 +409,7 @@ static int reg_event_syscall_enter(struct ftrace_event_file *file, if (!tr->sys_refcount_enter) ret = register_trace_sys_enter(ftrace_syscall_enter, tr); if (!ret) { - set_bit(num, tr->enabled_enter_syscalls); + rcu_assign_pointer(tr->enter_syscall_files[num], file); tr->sys_refcount_enter++; } mutex_unlock(&syscall_trace_lock); @@ -415,10 +427,15 @@ static void unreg_event_syscall_enter(struct ftrace_event_file *file, return; mutex_lock(&syscall_trace_lock); tr->sys_refcount_enter--; - clear_bit(num, tr->enabled_enter_syscalls); + rcu_assign_pointer(tr->enter_syscall_files[num], NULL); if (!tr->sys_refcount_enter) unregister_trace_sys_enter(ftrace_syscall_enter, tr); mutex_unlock(&syscall_trace_lock); + /* + * Callers expect the event to be completely disabled on + * return, so wait for current handlers to finish. + */ + synchronize_sched(); } static int reg_event_syscall_exit(struct ftrace_event_file *file, @@ -435,7 +452,7 @@ static int reg_event_syscall_exit(struct ftrace_event_file *file, if (!tr->sys_refcount_exit) ret = register_trace_sys_exit(ftrace_syscall_exit, tr); if (!ret) { - set_bit(num, tr->enabled_exit_syscalls); + rcu_assign_pointer(tr->exit_syscall_files[num], file); tr->sys_refcount_exit++; } mutex_unlock(&syscall_trace_lock); @@ -453,10 +470,15 @@ static void unreg_event_syscall_exit(struct ftrace_event_file *file, return; mutex_lock(&syscall_trace_lock); tr->sys_refcount_exit--; - clear_bit(num, tr->enabled_exit_syscalls); + rcu_assign_pointer(tr->exit_syscall_files[num], NULL); if (!tr->sys_refcount_exit) unregister_trace_sys_exit(ftrace_syscall_exit, tr); mutex_unlock(&syscall_trace_lock); + /* + * Callers expect the event to be completely disabled on + * return, so wait for current handlers to finish. + */ + synchronize_sched(); } static int __init init_syscall_trace(struct ftrace_event_call *call) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 272261b5f94f..b6dcc42ef7f5 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -128,6 +128,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) if (is_ret) tu->consumer.ret_handler = uretprobe_dispatcher; init_trace_uprobe_filter(&tu->filter); + tu->call.flags |= TRACE_EVENT_FL_USE_CALL_FILTER; return tu; error: @@ -561,7 +562,7 @@ static void uprobe_trace_print(struct trace_uprobe *tu, for (i = 0; i < tu->nr_args; i++) call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); - if (!filter_current_check_discard(buffer, call, entry, event)) + if (!call_filter_check_discard(call, entry, buffer, event)) trace_buffer_unlock_commit(buffer, event, 0, 0); } diff --git a/kernel/up.c b/kernel/up.c index 630d72bf7e41..509403e3fbc6 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -22,6 +22,17 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, } EXPORT_SYMBOL(smp_call_function_single); +void __smp_call_function_single(int cpu, struct call_single_data *csd, + int wait) +{ + unsigned long flags; + + local_irq_save(flags); + csd->func(csd->info); + local_irq_restore(flags); +} +EXPORT_SYMBOL(__smp_call_function_single); + int on_each_cpu(smp_call_func_t func, void *info, int wait) { unsigned long flags; |